diff --git a/.gitignore b/.gitignore index 5ccdbf046c6..df7cb26de9f 100644 --- a/.gitignore +++ b/.gitignore @@ -101,6 +101,8 @@ GSYMS /tools/openfst-1.6.2/ /tools/openfst-1.6.5.tar.gz /tools/openfst-1.6.5/ +/tools/openfst-1.6.7.tar.gz +/tools/openfst-1.6.7/ /tools/BeamformIt/ /tools/libsndfile-1.0.25.tar.gz /tools/libsndfile-1.0.25/ diff --git a/.travis.yml b/.travis.yml index 2ce8731d6c8..23507297413 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,6 +21,7 @@ addons: - gfortran-4.9 - liblapack-dev - clang-3.8 + - sox branches: only: @@ -47,7 +48,7 @@ script: # http://peter.eisentraut.org/blog/2014/12/01/ccache-and-clang-part-3/ # for the explanation why extra switches needed for clang with ccache. - CXX="ccache clang++-3.8 -Qunused-arguments -fcolor-diagnostics -Wno-tautological-compare" - CFLAGS="-march=native" + CFLAGS="" LDFLAGS="-llapack" INCDIRS="$XROOT/usr/include" LIBDIRS="$XROOT/usr/lib" diff --git a/COPYING b/COPYING index d8804be572c..5a5cab00a29 100644 --- a/COPYING +++ b/COPYING @@ -56,7 +56,7 @@ contributors and original source material as well as the full text of the Apache License v 2.0 are set forth below. Individual Contributors (in alphabetical order) - + Mohit Agarwal Tanel Alumae Gilles Boulianne @@ -123,7 +123,7 @@ Individual Contributors (in alphabetical order) Haihua Xu Hainan Xu Xiaohui Zhang - + Other Source Material This project includes a port and modification of materials from JAMA: A Java @@ -136,9 +136,9 @@ Other Source Material "Signal processing with lapped transforms," Artech House, Inc., 1992. The current copyright holder, Henrique S. Malvar, has given his permission for the release of this modified version under the Apache License 2.0. - - This project includes material from the OpenFST Library v1.2.7 available at - http://www.openfst.org and released under the Apache License v. 2.0. + + This project includes material from the OpenFST Library v1.2.7 available at + http://www.openfst.org and released under the Apache License v. 2.0. [OpenFst COPYING file begins here] @@ -147,7 +147,7 @@ Other Source Material You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/README.md b/README.md index 4496f627006..963b82ed427 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,4 @@ -[![Build Status](https://travis-ci.org/kaldi-asr/kaldi.svg?branch=master)](https://travis-ci.org/kaldi-asr/kaldi) - +[![Build Status](https://travis-ci.com/kaldi-asr/kaldi.svg?branch=master)](https://travis-ci.com/kaldi-asr/kaldi) Kaldi Speech Recognition Toolkit ================================ diff --git a/egs/aishell/s5/local/aishell_train_lms.sh b/egs/aishell/s5/local/aishell_train_lms.sh index ea72614689d..9b6cdad2960 100755 --- a/egs/aishell/s5/local/aishell_train_lms.sh +++ b/egs/aishell/s5/local/aishell_train_lms.sh @@ -23,7 +23,7 @@ kaldi_lm=`which train_lm.sh` if [ -z $kaldi_lm ]; then echo "$0: train_lm.sh is not found. 
That might mean it's not installed"
  echo "$0: or it is not added to PATH"
- echo "$0: Use the script tools/extra/install_kaldi_lm.sh to install it"
+ echo "$0: Use the script tools/extras/install_kaldi_lm.sh to install it"
  exit 1
 fi
diff --git a/egs/aishell2/README.txt b/egs/aishell2/README.txt
new file mode 100644
index 00000000000..e8b4260f2bb
--- /dev/null
+++ b/egs/aishell2/README.txt
@@ -0,0 +1,50 @@
+# AISHELL-2
+
+AISHELL-2 is by far the largest free speech corpus available for Mandarin ASR research.
+## 1. DATA
+### training data
+* 1000 hours of speech data (around 1 million utterances)
+* 1991 speakers (845 male and 1146 female)
+* clean recording environment (studio or quiet living room)
+* read speech
+* reading prompts from various domains: entertainment, finance, technology, sports, control commands, places of interest, etc.
+* near-field recording via 3 parallel channels (iOS, Android, Microphone).
+* iOS data is free for non-commercial research and education use (e.g. universities and colleges)
+
+### evaluation data:
+Currently we release AISHELL2-2018A-EVAL, containing:
+* dev: 2500 utterances from 5 speakers
+* test: 5000 utterances from 10 speakers
+
+You can download the above evaluation set from:
+http://www.aishelltech.com/aishell_eval
+
+We may update and release other evaluation sets on the website later, targeting different applications and scenarios.
+
+## 2. RECIPE
+Based on the Kaldi standard system, AISHELL-2 provides a self-contained Mandarin ASR recipe, with:
+* a word segmentation module, which is a must-have component for Chinese ASR systems
+* an open-sourced Mandarin lexicon (DaCiDian)
+* a simplified GMM training recipe
+* an acoustic channel adaptation recipe (AM fine-tuning)
+
+# CONTACT
+AISHELL foundation is a non-profit online organization, with members from the speech industry and research institutes.
+
+We hope the AISHELL-2 corpus and recipe will be beneficial to the entire speech community.
+
+Depending on your location and internet speed, we distribute the corpus in two ways:
+* hard-disk delivery
+* cloud-disk downloading
+
+To apply for the AISHELL-2 corpus for free, you need to fill in a very simple application form, confirming:
+* your university department / education institute info
+* that the corpus is only for non-commercial research / education use
+
+AISHELL-foundation covers all data distribution fees (including the corpus, hard-disk cost etc.)
+
+Data re-distribution inside your university department is OK for convenience. However, users are not supposed to re-distribute AISHELL-2 to other universities or education institutes. 
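As a quick orientation for the recipe described in section 2 above (added in full under egs/aishell2/s5 later in this patch), here is a minimal, hypothetical launch sequence. The corpus paths are placeholders, and the option names simply mirror the variables defined at the top of s5/run.sh, which utils/parse_options.sh turns into command-line flags.

```bash
# Hypothetical end-to-end launch of the AISHELL-2 recipe; point the three
# corpus options at wherever the iOS-channel data was unpacked.
cd egs/aishell2/s5
./run.sh --trn-set /data/AISHELL-2/iOS/data \
         --dev-set /data/AISHELL-2/iOS/dev \
         --tst-set /data/AISHELL-2/iOS/test \
         --nj 20 --stage 1 --gmm-stage 1
```

Stage 1 prepares the dictionary, data directories and LM; stage 2 runs the GMM pipeline; stage 3 trains and decodes the chain model.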
+ +To get the application form, or you come across any problem with the recipe, contact us via: + +aishell.foundation@gmail.com diff --git a/egs/aishell2/s5/RESULTS b/egs/aishell2/s5/RESULTS new file mode 100644 index 00000000000..67a8ad6a937 --- /dev/null +++ b/egs/aishell2/s5/RESULTS @@ -0,0 +1,6 @@ +%WER 44.78 [ 22176 / 49527, 370 ins, 2179 del, 19627 sub ] exp/mono/decode_test/cer_9_0.0 +%WER 24.78 [ 12271 / 49527, 394 ins, 815 del, 11062 sub ] exp/tri1/decode_test/cer_11_0.0 +%WER 22.54 [ 11165 / 49527, 390 ins, 665 del, 10110 sub ] exp/tri2/decode_test/cer_11_0.0 +%WER 19.78 [ 9795 / 49527, 313 ins, 684 del, 8798 sub ] exp/tri3/decode_test/cer_13_0.0 + +# (Chain model results are at the beginning of corresponding scripts) diff --git a/egs/aishell2/s5/cmd.sh b/egs/aishell2/s5/cmd.sh new file mode 100755 index 00000000000..71dd849a93b --- /dev/null +++ b/egs/aishell2/s5/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/aishell2/s5/conf/decode.conf b/egs/aishell2/s5/conf/decode.conf new file mode 100644 index 00000000000..c8a0ece58bf --- /dev/null +++ b/egs/aishell2/s5/conf/decode.conf @@ -0,0 +1,3 @@ +beam=11.0 # beam for decoding. Was 13.0 in the scripts. +first_beam=8.0 # beam for 1st-pass decoding in SAT. + diff --git a/egs/aishell2/s5/conf/mfcc.conf b/egs/aishell2/s5/conf/mfcc.conf new file mode 100644 index 00000000000..32988403b00 --- /dev/null +++ b/egs/aishell2/s5/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false +--sample-frequency=16000 diff --git a/egs/aishell2/s5/conf/mfcc_hires.conf b/egs/aishell2/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..137d00add94 --- /dev/null +++ b/egs/aishell2/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=16000 # AISHELL-2 is sampled at 16kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. 
+--low-freq=20 # low cutoff frequency for mel bins +--high-freq=-400 # high cutoff frequency, relative to Nyquist of 8000 (=7600) diff --git a/egs/aishell2/s5/conf/pitch.conf b/egs/aishell2/s5/conf/pitch.conf new file mode 100644 index 00000000000..e959a19d5b8 --- /dev/null +++ b/egs/aishell2/s5/conf/pitch.conf @@ -0,0 +1 @@ +--sample-frequency=16000 diff --git a/egs/aishell2/s5/local/chain/compare_wer.sh b/egs/aishell2/s5/local/chain/compare_wer.sh new file mode 100755 index 00000000000..c66a861c3f3 --- /dev/null +++ b/egs/aishell2/s5/local/chain/compare_wer.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +# Copyright 2018 Emotech LTD (Author: Xuechen LIU) +# Apache 2.0 + +# compare wer between diff. models in aishell2 chain directory + +set -e +. ./cmd.sh +. ./path.sh + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_7h_sp" + exit 1 +fi + +echo "# $0 $*" + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) +} + +# print model names +echo -n "# Model " +for x in $*; do + printf "% 10s" " $(basename $x)" +done +echo + +# print number of parameters +echo -n "# Num. of params " +for x in $*; do + set_names $x + params=$(steps/info/chain_dir_info.pl "$x" | grep -o 'num-params=[0-9]*\.[0-9]*M' | cut -d'=' -f2-) + printf "% 10s\n" $params +done + +# print decode WER results +echo -n "# WER(%) " +for x in $*; do + set_names $x + wer=$([ -d $x ] && grep WER $x/decode_test/cer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +# print final log prob for train & validation +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf($8)}' | cut -c1-7) + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf($8)}' | cut -c1-7) + printf "% 10s" $prob +done +echo + +# do the same for xent objective +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/aishell2/s5/local/chain/run_tdnn.sh b/egs/aishell2/s5/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..61f8f499182 --- /dev/null +++ b/egs/aishell2/s5/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1b.sh \ No newline at end of file diff --git a/egs/aishell2/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/aishell2/s5/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..459bd64eeb5 --- /dev/null +++ b/egs/aishell2/s5/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,204 @@ +#!/bin/bash + +# this is the original baseline scripts, which is supposed to be deprecated. 
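For context, the comparison helper added above (local/chain/compare_wer.sh) is meant to be run from egs/aishell2/s5 once one or more chain systems have been trained and decoded; the experiment-directory names below are illustrative.

```bash
# Compare two (hypothetical) chain experiment directories; each must contain
# a decode_test directory and the usual train/valid compute_prob logs.
local/chain/compare_wer.sh exp/chain/tdnn_1a_sp exp/chain/tdnn_1b_all_sp
```

It prints one column per model: the parameter count parsed by steps/info/chain_dir_info.pl, the best test CER, and the final train/valid log-probabilities for both the chain and cross-entropy outputs.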
+ +# results +# local/chain/compare_wer.sh exp/chain/tdnn_1a_sp/ +# Model tdnn_1a_sp +# WER(%) 9.89 +# Final train prob -0.0653 +# Final valid prob -0.0765 +# Final train prob (xent) -0.7340 +# Final valid prob (xent) -0.8030 + +set -e + +# configs for 'chain' +affix= +stage=10 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_1a # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=4 +nj=10 +minibatch_size=128 +frames_per_eg=150,110,90 +remove_egs=true +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.002" + linear_opts="orthonormal-constraint=1.0" + output_opts="l2-regularize=0.0005 bottleneck-dim=256" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1280 + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn3 $opts dim=1280 + linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn5l, tdnn3l) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn7l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 + linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn8l,tdnn6l) dim=1280 + linear-component name=prefinal-l dim=256 $linear_opts + 
+ relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + #if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + # utils/create_split_dir.pl \ + # /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + #fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_sp_lats \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in $test_sets; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 10 --cmd "$decode_cmd" \ + $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1; + done +fi + +echo "local/chain/run_tdnn.sh succeeded" +exit 0; diff --git a/egs/aishell2/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/aishell2/s5/local/chain/tuning/run_tdnn_1b.sh new file mode 100755 index 00000000000..30a19293181 --- /dev/null +++ b/egs/aishell2/s5/local/chain/tuning/run_tdnn_1b.sh @@ -0,0 +1,258 @@ +#!/bin/bash + +# _1b is as _1a, but with pitch feats, i-vector and dropout schedule added, referenced from wsj + +# basic info: +# steps/info/chain_dir_info.pl exp/chain/tdnn_1b_all_sp/ +# exp/chain/tdnn_1b_all_sp/: num-iters=1446 nj=2..2 num-params=19.3M dim=43+100->4456 combine=-0.079->-0.075 (over 9) xent:train/valid[962,1445,final]=(-0.922,-0.795,-0.746/-0.960,-0.840,-0.785) logprob:train/valid[962,1445,final]=(-0.084,-0.072,-0.070/-0.085,-0.075,-0.071) + +# results: +# local/chain/compare_wer.sh exp/chain/tdnn_1d_all_sp/ +# Model tdnn_1d_all_sp +# Num. 
of params 19.3M +# WER(%) 8.84 +# Final train prob -0.0696 +# Final valid prob -0.0714 +# Final train prob (xent) -0.7458 +# Final valid prob (xent) -0.7854 + +set -e + +# configs for 'chain' +affix=all +stage=0 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_1b # Note: _sp will get added to this +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=2 +num_jobs_final=4 +nj=15 +minibatch_size=128 +dropout_schedule='0,0@0.20,0.3@0.50,0' +frames_per_eg=150,110,90 +remove_egs=true +common_egs_dir= +xent_regularize=0.1 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 10 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + feat_dim=$(feat-to-dim scp:data/${train_set}_hires/feats.scp -) + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.002" + linear_opts="orthonormal-constraint=1.0" + output_opts="l2-regularize=0.0005 bottleneck-dim=256" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=$feat_dim name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=1280 + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn2 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=1280 + linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn4 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=1280 input=Append(tdnn5l, tdnn3l) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn7l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 + linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn11l dim=256 
$linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn8l,tdnn6l) dim=1280 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + #if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + # utils/create_split_dir.pl \ + # /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + #fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/chain/ivectors_${train_set}_${affix} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_sp_lats \ + --dir $dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 13 ]; then + for test_set in $test_sets; do + nj=$(wc -l data/${test_set}_hires/spk2utt | awk '{print $1}') + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --online-ivector-dir exp/chain/ivectors_${test_set}_${affix} \ + $graph_dir data/${test_set}_hires $dir/decode_${test_set} || exit 1; + done +fi + +echo "local/chain/run_tdnn.sh succeeded" +exit 0; diff --git a/egs/aishell2/s5/local/nnet3/compare_wer.sh b/egs/aishell2/s5/local/nnet3/compare_wer.sh new file mode 100755 index 00000000000..84dda2fda14 --- /dev/null +++ b/egs/aishell2/s5/local/nnet3/compare_wer.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +# Copyright 2018 Emotech LTD (Author: Xuechen LIU) +# Apache 2.0 + +# compare wer between diff. models in aishell2 nnet3 directory + +set -e +. ./cmd.sh +. ./path.sh + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... 
]" + echo "e.g.: $0 exp/nnet3/tdnn_sp exp/nnet3/tdnn_sp_pr" + exit 1 +fi + +echo "# $0 $*" + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) +} + +# print model names +echo -n "# Model " +for x in $*; do + printf "% 10s" " $(basename $x)" +done +echo + +# print decode WER results +echo -n "# WER(%) " +for x in $*; do + set_names $x + wer=$([ -d $x ] && grep WER $x/decode_test/cer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +# print log for train & validation +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.combined.log | grep log-like | awk '{printf($8)}' | cut -c1-7) + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.combined.log | grep log-like | awk '{printf($8)}' | cut -c1-7) + printf "% 10s" $prob +done +echo diff --git a/egs/aishell2/s5/local/nnet3/finetune_tdnn.sh b/egs/aishell2/s5/local/nnet3/finetune_tdnn.sh new file mode 120000 index 00000000000..e05db4f91f4 --- /dev/null +++ b/egs/aishell2/s5/local/nnet3/finetune_tdnn.sh @@ -0,0 +1 @@ +tuning/finetune_tdnn_1a.sh \ No newline at end of file diff --git a/egs/aishell2/s5/local/nnet3/run_tdnn.sh b/egs/aishell2/s5/local/nnet3/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/aishell2/s5/local/nnet3/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/aishell2/s5/local/nnet3/tuning/finetune_tdnn_1a.sh b/egs/aishell2/s5/local/nnet3/tuning/finetune_tdnn_1a.sh new file mode 100755 index 00000000000..050d8f9d4fc --- /dev/null +++ b/egs/aishell2/s5/local/nnet3/tuning/finetune_tdnn_1a.sh @@ -0,0 +1,64 @@ +# !/bin/bash + +# This script uses weight transfer as a transfer learning method to transfer +# already trained neural net model on aishell2 to a finetune data set. +. ./path.sh +. 
./cmd.sh + +data_set=finetune +data_dir=data/${data_set} +ali_dir=exp/${data_set}_ali +src_dir=exp/nnet3/tdnn_sp +dir=${src_dir}_${data_set} + +num_jobs_initial=1 +num_jobs_final=1 +num_epochs=5 +initial_effective_lrate=0.0005 +final_effective_lrate=0.00002 +minibatch_size=1024 + +stage=1 +train_stage=-10 +nj=4 + +if [ $stage -le 1 ]; then + # align new data(finetune set) with GMM, we probably replace GMM with NN later + steps/make_mfcc.sh \ + --cmd "$train_cmd" --nj $nj --mfcc-config conf/mfcc.conf \ + ${data_dir} exp/make_mfcc/${data_set} mfcc + steps/compute_cmvn_stats.sh ${data_dir} exp/make_mfcc/${data_set} mfcc || exit 1; + + utils/fix_data_dir.sh ${data_dir} || exit 1; + steps/align_si.sh --cmd "$train_cmd" --nj ${nj} ${data_dir} data/lang exp/tri3 ${ali_dir} + + # extract mfcc_hires for AM finetuning + utils/copy_data_dir.sh ${data_dir} ${data_dir}_hires + rm -f ${data_dir}_hires/{cmvn.scp,feats.scp} + #utils/data/perturb_data_dir_volume.sh ${data_dir}_hires || exit 1; + steps/make_mfcc.sh \ + --cmd "$train_cmd" --nj $nj --mfcc-config conf/mfcc_hires.conf \ + ${data_dir}_hires exp/make_mfcc/${data_set}_hires mfcc_hires + steps/compute_cmvn_stats.sh ${data_dir}_hires exp/make_mfcc/${data_set}_hires mfcc_hires +fi + +if [ $stage -le 2 ]; then + $train_cmd $dir/log/generate_input_model.log \ + nnet3-am-copy --raw=true $src_dir/final.mdl $dir/input.raw + +if [ $stage -le 3 ]; then + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.input-model $dir/input.raw \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.optimization.minibatch-size $minibatch_size \ + --feat-dir ${data_dir}_hires \ + --lang data/lang \ + --ali-dir ${ali_dir} \ + --dir $dir || exit 1; +fi diff --git a/egs/aishell2/s5/local/nnet3/tuning/run_tdnn_1a.sh b/egs/aishell2/s5/local/nnet3/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..34ca1f0f224 --- /dev/null +++ b/egs/aishell2/s5/local/nnet3/tuning/run_tdnn_1a.sh @@ -0,0 +1,135 @@ +#!/bin/bash + +# This script is based on swbd/s5c/local/nnet3/run_tdnn.sh + +# this is the standard "tdnn" system, built in nnet3; it's what we use to +# call multi-splice. + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. + +# results +# local/nnet3/compare_wer.sh exp/nnet3/tdnn_sp/ +# Model tdnn_sp +# WER(%) 11.20 +# Final train prob -0.9601 +# Final valid prob -1.0819 + +set -e + +stage=0 +train_stage=-10 +affix= +common_egs_dir= + +# training options +initial_effective_lrate=0.0015 +final_effective_lrate=0.00015 +num_epochs=4 +num_jobs_initial=2 +num_jobs_final=6 +nj=30 +remove_egs=true + +# feature options +use_ivectors=false + +# End configuration section. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=850 + relu-batchnorm-layer name=tdnn2 dim=850 input=Append(-1,0,2) + relu-batchnorm-layer name=tdnn3 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 dim=850 input=Append(-7,0,2) + relu-batchnorm-layer name=tdnn5 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn6 dim=850 + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 8 ]; then + #if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + # utils/create_split_dir.pl \ + # /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + #fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 500 \ + --use-gpu true \ + --feat-dir=data/${train_set}_hires \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 9 ]; then + for decode_set in $test_sets; do + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}/decode_$decode_set + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires $decode_dir || exit 1; + done +fi + +wait; +echo "local/nnet3/run_tdnn.sh succeeded" +exit 0; diff --git a/egs/aishell2/s5/local/nnet3/tuning/run_tdnn_1b.sh b/egs/aishell2/s5/local/nnet3/tuning/run_tdnn_1b.sh new file mode 100755 index 00000000000..ea3a59e90ee --- /dev/null +++ b/egs/aishell2/s5/local/nnet3/tuning/run_tdnn_1b.sh @@ -0,0 +1,137 @@ +#!/bin/bash + +# This script is based on run_tdnn_1a.sh, but with pitch features applied + +# this is the standard "tdnn" system, built in nnet3; it's what we use to +# call multi-splice. + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. 
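Stepping back to the transfer-learning script added above (local/nnet3/tuning/finetune_tdnn_1a.sh): it hard-codes its data-set name and model directories instead of reading options, so an adaptation pass, under those assumptions, reduces to the sketch below.

```bash
# Hypothetical AM-adaptation workflow; names follow the defaults hard-coded in
# finetune_tdnn_1a.sh (data/finetune, exp/tri3, exp/nnet3/tdnn_sp, data/lang).
# 1. Prepare the in-domain set: data/finetune/{wav.scp,text,utt2spk,spk2utt}.
utils/fix_data_dir.sh data/finetune
# 2. Align with the existing GMM, extract hires MFCCs and fine-tune the TDNN.
bash local/nnet3/finetune_tdnn.sh
```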
+ +# results +# local/nnet3/compare_wer.sh exp/nnet3/tdnn_sp/ +# Model tdnn_sp +# WER(%) 11.02 +# Final train prob -1.1265 +# Final valid prob -1.2600 + +set -e + +stage=0 +train_stage=-10 +affix= +common_egs_dir= + +# training options +initial_effective_lrate=0.0015 +final_effective_lrate=0.00015 +num_epochs=4 +num_jobs_initial=2 +num_jobs_final=12 +nj=30 +remove_egs=true + +# feature options +use_ivectors=false + +# End configuration section. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=$input_dim name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=850 + relu-batchnorm-layer name=tdnn2 dim=850 input=Append(-1,0,2) + relu-batchnorm-layer name=tdnn3 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 dim=850 input=Append(-7,0,2) + relu-batchnorm-layer name=tdnn5 dim=850 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn6 dim=850 + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 8 ]; then + #if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + # utils/create_split_dir.pl \ + # /export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + #fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 500 \ + --use-gpu true \ + --feat-dir=data/${train_set}_hires \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 9 ]; then + for decode_set in $test_sets; do + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}/decode_$decode_set + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires $decode_dir || exit 1; + done +fi + +wait; +echo "local/nnet3/run_tdnn.sh succeeded" +exit 0; diff --git a/egs/aishell2/s5/local/prepare_all.sh b/egs/aishell2/s5/local/prepare_all.sh new file mode 100755 index 00000000000..3928eb95ca3 --- /dev/null +++ b/egs/aishell2/s5/local/prepare_all.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG) +# 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU) +# 2018 Emotech LTD (Author: Xuechen LIU) +# Apache 2.0 + +trn_set= +dev_set= +tst_set= + +stage=1 + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +if [ $# -ne 3 ]; then + echo "prepare_all.sh " + echo " e.g prepare_all.sh /data/AISHELL-2/iOS/train /data/AISHELL-2/iOS/dev /data/AISHELL-2/iOS/test" + exit 1; +fi + +trn_set=$1 +dev_set=$2 +tst_set=$3 + +# download DaCiDian raw resources, convert to Kaldi lexicon format +if [ $stage -le 1 ]; then + local/prepare_dict.sh data/local/dict || exit 1; +fi + +# wav.scp, text(word-segmented), utt2spk, spk2utt +if [ $stage -le 2 ]; then + local/prepare_data.sh ${trn_set} data/local/dict data/local/train data/train || exit 1; + local/prepare_data.sh ${dev_set} data/local/dict data/local/dev data/dev || exit 1; + local/prepare_data.sh ${tst_set} data/local/dict data/local/test data/test || exit 1; +fi + +# L +if [ $stage -le 3 ]; then + utils/prepare_lang.sh --position-dependent-phones false \ + data/local/dict "" data/local/lang data/lang || exit 1; +fi + +# arpa LM +if [ $stage -le 4 ]; then + local/train_lms.sh \ + data/local/dict/lexicon.txt \ + data/local/train/text \ + data/local/lm || exit 1; +fi + +# G compilation, check LG composition +if [ $stage -le 5 ]; then + utils/format_lm.sh data/lang data/local/lm/3gram-mincount/lm_unpruned.gz \ + data/local/dict/lexicon.txt data/lang_test || exit 1; +fi + +echo "local/prepare_all.sh succeeded" +exit 0; + diff --git a/egs/aishell2/s5/local/prepare_data.sh b/egs/aishell2/s5/local/prepare_data.sh new file mode 100755 index 00000000000..419d8eddfd1 --- /dev/null +++ b/egs/aishell2/s5/local/prepare_data.sh @@ -0,0 +1,65 @@ +#!/bin/bash +# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG) +# 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU) +# Apache 2.0 + +# transform raw AISHELL-2 data to kaldi format + +. ./path.sh || exit 1; + +tmp= +dir= + +if [ $# != 4 ]; then + echo "Usage: $0 " + echo " $0 /export/AISHELL-2/iOS/train data/local/dict data/local/train data/train" + exit 1; +fi + +corpus=$1 +dict_dir=$2 +tmp=$3 +dir=$4 + +echo "prepare_data.sh: Preparing data in $corpus" + +mkdir -p $tmp +mkdir -p $dir + +# corpus check +if [ ! -d $corpus ] || [ ! -f $corpus/wav.scp ] || [ ! -f $corpus/trans.txt ]; then + echo "Error: $0 requires wav.scp and trans.txt under $corpus directory." + exit 1; +fi + +# validate utt-key list +awk '{print $1}' $corpus/wav.scp > $tmp/wav_utt.list +awk '{print $1}' $corpus/trans.txt > $tmp/trans_utt.list +utils/filter_scp.pl -f 1 $tmp/wav_utt.list $tmp/trans_utt.list > $tmp/utt.list + +# wav.scp +awk -F'\t' -v path_prefix=$corpus '{printf("%s\t%s/%s\n",$1,path_prefix,$2)}' $corpus/wav.scp > $tmp/tmp_wav.scp +utils/filter_scp.pl -f 1 $tmp/utt.list $tmp/tmp_wav.scp | sort -k 1 | uniq > $tmp/wav.scp + +# text +python -c "import jieba" 2>/dev/null || \ + (echo "jieba is not found. Use tools/extra/install_jieba.sh to install it." 
&& exit 1;) +utils/filter_scp.pl -f 1 $tmp/utt.list $corpus/trans.txt | sort -k 1 | uniq > $tmp/trans.txt +awk '{print $1}' $dict_dir/lexicon.txt | sort | uniq | awk 'BEGIN{idx=0}{print $1,idx++}'> $tmp/vocab.txt +python local/word_segmentation.py $tmp/vocab.txt $tmp/trans.txt > $tmp/text + +# utt2spk & spk2utt +awk -F'\t' '{print $2}' $tmp/wav.scp > $tmp/wav.list +sed -e 's:\.wav::g' $tmp/wav.list | \ + awk -F'/' '{i=NF-1;printf("%s\t%s\n",$NF,$i)}' > $tmp/tmp_utt2spk +utils/filter_scp.pl -f 1 $tmp/utt.list $tmp/tmp_utt2spk | sort -k 1 | uniq > $tmp/utt2spk +utils/utt2spk_to_spk2utt.pl $tmp/utt2spk | sort -k 1 | uniq > $tmp/spk2utt + +# copy prepared resources from tmp_dir to target dir +mkdir -p $dir +for f in wav.scp text spk2utt utt2spk; do + cp $tmp/$f $dir/$f || exit 1; +done + +echo "local/prepare_data.sh succeeded" +exit 0; diff --git a/egs/aishell2/s5/local/prepare_dict.sh b/egs/aishell2/s5/local/prepare_dict.sh new file mode 100755 index 00000000000..d59585273a7 --- /dev/null +++ b/egs/aishell2/s5/local/prepare_dict.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG) +# 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU) +# Apache 2.0 + +# This is a shell script, and it download and process DaCiDian for Mandarin ASR. + +. ./path.sh + +download_dir=data/local/DaCiDian +dir=data/local/dict + +if [ $# -ne 1 ]; then + echo "Usage: $0 "; + exit 1; +fi + +dir=$1 + +# download the DaCiDian from github +git clone https://github.com/aishell-foundation/DaCiDian.git $download_dir + +# here we map to the phone spn(spoken noise) +mkdir -p $dir +python $download_dir/DaCiDian.py $download_dir/word_to_pinyin.txt $download_dir/pinyin_to_phone.txt > $dir/lexicon.txt +echo -e "\tspn" >> $dir/lexicon.txt + +# prepare silence_phones.txt, nonsilence_phones.txt, optional_silence.txt, extra_questions.txt +cat $dir/lexicon.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}'| \ + sort -u |\ + perl -e ' + my %ph_cl; + while () { + $phone = $_; + chomp($phone); + chomp($_); + $phone = $_; + next if ($phone eq "sil"); + if (exists $ph_cl{$phone}) { push(@{$ph_cl{$phone}}, $_) } + else { $ph_cl{$phone} = [$_]; } + } + foreach $key ( keys %ph_cl ) { + print "@{ $ph_cl{$key} }\n" + } + ' | sort -k1 > $dir/nonsilence_phones.txt || exit 1; + +echo sil > $dir/silence_phones.txt +echo sil > $dir/optional_silence.txt + +cat $dir/silence_phones.txt | awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1; +cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) { + $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \ + >> $dir/extra_questions.txt || exit 1; + +echo "local/prepare_dict.sh succeeded" +exit 0; + diff --git a/egs/aishell2/s5/local/run_gmm.sh b/egs/aishell2/s5/local/run_gmm.sh new file mode 100755 index 00000000000..569e5ab570a --- /dev/null +++ b/egs/aishell2/s5/local/run_gmm.sh @@ -0,0 +1,115 @@ +#!/bin/bash +# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG) +# 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU) +# 2018 Emotech LTD (Author: Xuechen LIU) +# Apache 2.0 + +set -e + +# number of jobs +nj=20 +stage=1 + +. ./cmd.sh +[ -f ./path.sh ] && . ./path.sh; +. 
./utils/parse_options.sh + +# nj for dev and test +dev_nj=$(wc -l data/dev/spk2utt | awk '{print $1}' || exit 1;) +test_nj=$(wc -l data/test/spk2utt | awk '{print $1}' || exit 1;) + +# Now make MFCC features. +if [ $stage -le 1 ]; then + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. + for x in train dev test; do + steps/make_mfcc_pitch.sh --pitch-config conf/pitch.conf --cmd "$train_cmd" --nj $nj \ + data/$x exp/make_mfcc/$x mfcc || exit 1; + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x mfcc || exit 1; + utils/fix_data_dir.sh data/$x || exit 1; + done + + # subset the training data for fast startup + for x in 100 300; do + utils/subset_data_dir.sh data/train ${x}000 data/train_${x}k + done +fi + +# mono +if [ $stage -le 2 ]; then + # training + steps/train_mono.sh --cmd "$train_cmd" --nj $nj \ + data/train_100k data/lang exp/mono || exit 1; + + # decoding + utils/mkgraph.sh data/lang_test exp/mono exp/mono/graph || exit 1; + steps/decode.sh --cmd "$decode_cmd" --config conf/decode.conf --nj ${dev_nj} \ + exp/mono/graph data/dev exp/mono/decode_dev + steps/decode.sh --cmd "$decode_cmd" --config conf/decode.conf --nj ${test_nj} \ + exp/mono/graph data/test exp/mono/decode_test + + # alignment + steps/align_si.sh --cmd "$train_cmd" --nj $nj \ + data/train_300k data/lang exp/mono exp/mono_ali || exit 1; +fi + +# tri1 +if [ $stage -le 3 ]; then + # training + steps/train_deltas.sh --cmd "$train_cmd" \ + 4000 32000 data/train_300k data/lang exp/mono_ali exp/tri1 || exit 1; + + # decoding + utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1; + steps/decode.sh --cmd "$decode_cmd" --config conf/decode.conf --nj ${dev_nj} \ + exp/tri1/graph data/dev exp/tri1/decode_dev + steps/decode.sh --cmd "$decode_cmd" --config conf/decode.conf --nj ${test_nj} \ + exp/tri1/graph data/test exp/tri1/decode_test + + # alignment + steps/align_si.sh --cmd "$train_cmd" --nj $nj \ + data/train data/lang exp/tri1 exp/tri1_ali || exit 1; +fi + +# tri2 +if [ $stage -le 4 ]; then + # training + steps/train_deltas.sh --cmd "$train_cmd" \ + 7000 56000 data/train data/lang exp/tri1_ali exp/tri2 || exit 1; + + # decoding + utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph + steps/decode.sh --cmd "$decode_cmd" --config conf/decode.conf --nj ${dev_nj} \ + exp/tri2/graph data/dev exp/tri2/decode_dev + steps/decode.sh --cmd "$decode_cmd" --config conf/decode.conf --nj ${test_nj} \ + exp/tri2/graph data/test exp/tri2/decode_test + + # alignment + steps/align_si.sh --cmd "$train_cmd" --nj $nj \ + data/train data/lang exp/tri2 exp/tri2_ali || exit 1; +fi + +# tri3 +if [ $stage -le 5 ]; then + # training [LDA+MLLT] + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 10000 80000 data/train data/lang exp/tri2_ali exp/tri3 || exit 1; + + # decoding + utils/mkgraph.sh data/lang_test exp/tri3 exp/tri3/graph || exit 1; + steps/decode.sh --cmd "$decode_cmd" --nj ${dev_nj} --config conf/decode.conf \ + exp/tri3/graph data/dev exp/tri3/decode_dev + steps/decode.sh --cmd "$decode_cmd" --nj ${test_nj} --config conf/decode.conf \ + exp/tri3/graph data/test exp/tri3/decode_test + + # alignment + steps/align_si.sh --cmd "$train_cmd" --nj $nj \ + data/train data/lang exp/tri3 exp/tri3_ali || exit 1; + + steps/align_si.sh --cmd "$train_cmd" --nj ${nj} \ + data/dev data/lang exp/tri3 exp/tri3_ali_dev || exit 1; +fi + +echo "local/run_gmm.sh succeeded" +exit 0; + diff --git a/egs/aishell2/s5/local/score.sh b/egs/aishell2/s5/local/score.sh new file mode 100755 index 
00000000000..a9786169973 --- /dev/null +++ b/egs/aishell2/s5/local/score.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -e -o pipefail +set -x +steps/score_kaldi.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" + +echo "$0: Done" diff --git a/egs/aishell2/s5/local/show_results.sh b/egs/aishell2/s5/local/show_results.sh new file mode 100644 index 00000000000..714cb0f4358 --- /dev/null +++ b/egs/aishell2/s5/local/show_results.sh @@ -0,0 +1,2 @@ +# !/bin/bash +for x in exp/*/decode_test; do [ -d $x ] && grep WER $x/cer_* | utils/best_wer.sh; done 2>/dev/null diff --git a/egs/aishell2/s5/local/train_lms.sh b/egs/aishell2/s5/local/train_lms.sh new file mode 100755 index 00000000000..179a7b78e14 --- /dev/null +++ b/egs/aishell2/s5/local/train_lms.sh @@ -0,0 +1,91 @@ +#!/bin/bash +# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG) +# 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU) +# Apache 2.0 + +. ./path.sh +. ./utils/parse_options.sh + +if [ $# -ne 3 ]; then + echo "train_lms.sh " + echo " e.g train_lms.sh data/local/dict/lexicon.txt data/local/train/text data/local/lm" + exit 1; +fi + +lexicon=$1 +text=$2 +dir=$3 + +for f in "$text" "$lexicon"; do + [ ! -f $x ] && echo "$0: No such file $f" && exit 1; +done + +kaldi_lm=`which train_lm.sh` +if [ -z $kaldi_lm ]; then + echo "$0: train_lm.sh is not found. That might mean it's not installed" + echo "$0: or it is not added to PATH" + echo "$0: Use the script tools/extras/install_kaldi_lm.sh to install it" + exit 1 +fi + +mkdir -p $dir +cleantext=$dir/text.no_oov + +cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } + {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ");} } printf("\n");}' \ + > $cleantext || exit 1; + +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ + sort -nr > $dir/word.counts || exit 1; + +# Get counts from acoustic training transcripts, and add one-count +# for each word in the lexicon (but not silence, we don't want it +# in the LM-- we'll add it optionally later). +cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ + cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ + sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1; + +# note: we probably won't really make use of as there aren't any OOVs +cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "" "" "" > $dir/word_map \ + || exit 1; + +# note: ignore 1st field of train.txt, it's the utterance-id. +cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} + { for(n=2;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz \ + || exit 1; + +train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; + +# note: output is +# data/local/lm/3gram-mincount/lm_unpruned.gz + +echo "local/train_lms.sh succeeded" +exit 0 + + +# From here is some commands to do a baseline with SRILM (assuming +# you have it installed). +heldout_sent=10000 # Don't change this if you want result to be comparable with + # kaldi_lm results +sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities. 
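For reference, the two language-model steps that local/prepare_all.sh drives (its stages 4 and 5) reduce to the calls below; the paths are the recipe defaults.

```bash
# Train a 3-gram LM with kaldi_lm on the word-segmented training transcripts,
# then compile G.fst into data/lang_test for decoding.
local/train_lms.sh data/local/dict/lexicon.txt data/local/train/text data/local/lm
utils/format_lm.sh data/lang data/local/lm/3gram-mincount/lm_unpruned.gz \
  data/local/dict/lexicon.txt data/lang_test
```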
+mkdir -p $sdir +cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $sdir/heldout +cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n $sdir/train + +cat $dir/word_map | awk '{print $1}' | cat - <(echo ""; echo "" ) > $sdir/wordlist + + +ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \ + -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz +ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout +# 0 zeroprobs, logprob= -250954 ppl= 90.5091 ppl1= 132.482 + +# Note: perplexity SRILM gives to Kaldi-LM model is same as kaldi-lm reports above. +# Difference in WSJ must have been due to different treatment of . +ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout +# 0 zeroprobs, logprob= -250913 ppl= 90.4439 ppl1= 132.379 + +echo "local/train_lms.sh succeeded" +exit 0 diff --git a/egs/aishell2/s5/local/wer_hyp_filter b/egs/aishell2/s5/local/wer_hyp_filter new file mode 100755 index 00000000000..c6660e4efe1 --- /dev/null +++ b/egs/aishell2/s5/local/wer_hyp_filter @@ -0,0 +1,19 @@ +#!/usr/bin/env perl + +@filters=('',''); + +foreach $w (@filters) { + $bad{$w} = 1; +} + +while() { + @A = split(" ", $_); + $id = shift @A; + print "$id "; + foreach $a (@A) { + if (!defined $bad{$a}) { + print "$a "; + } + } + print "\n"; +} diff --git a/egs/aishell2/s5/local/wer_output_filter b/egs/aishell2/s5/local/wer_output_filter new file mode 100755 index 00000000000..aceeeec41b4 --- /dev/null +++ b/egs/aishell2/s5/local/wer_output_filter @@ -0,0 +1,25 @@ +#!/usr/bin/env perl +# Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal) +# Apache 2.0 +use utf8; + +use open qw(:encoding(utf8)); +binmode STDIN, ":utf8"; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; + +while (<>) { + @F = split " "; + print $F[0] . " "; + foreach $s (@F[1..$#F]) { + if (($s =~ /\[.*\]/) || ($s =~ /\<.*\>/) || ($s =~ "!SIL")) { + print ""; + } else { + print "$s" + } + print " "; + } + print "\n"; +} + + diff --git a/egs/aishell2/s5/local/wer_ref_filter b/egs/aishell2/s5/local/wer_ref_filter new file mode 100755 index 00000000000..c6660e4efe1 --- /dev/null +++ b/egs/aishell2/s5/local/wer_ref_filter @@ -0,0 +1,19 @@ +#!/usr/bin/env perl + +@filters=('',''); + +foreach $w (@filters) { + $bad{$w} = 1; +} + +while() { + @A = split(" ", $_); + $id = shift @A; + print "$id "; + foreach $a (@A) { + if (!defined $bad{$a}) { + print "$a "; + } + } + print "\n"; +} diff --git a/egs/aishell2/s5/local/word_segmentation.py b/egs/aishell2/s5/local/word_segmentation.py new file mode 100644 index 00000000000..1cb2c1e7350 --- /dev/null +++ b/egs/aishell2/s5/local/word_segmentation.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python +# encoding=utf-8 +# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG) +# 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU) +# Apache 2.0 + +import sys +import jieba +reload(sys) +sys.setdefaultencoding('utf-8') + +if len(sys.argv) < 3: + sys.stderr.write("word_segmentation.py > \n") + exit(1) + +vocab_file=sys.argv[1] +trans_file=sys.argv[2] + +jieba.set_dictionary(vocab_file) +for line in open(trans_file): + key,trans = line.strip().split('\t',1) + words = jieba.cut(trans) + new_line = key + '\t' + " ".join(words) + print(new_line) diff --git a/egs/aishell2/s5/path.sh b/egs/aishell2/s5/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/aishell2/s5/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . 
$KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/aishell2/s5/run.sh b/egs/aishell2/s5/run.sh new file mode 100755 index 00000000000..8afdd3ed310 --- /dev/null +++ b/egs/aishell2/s5/run.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG) +# 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU) +# Apache 2.0 + +# AISHELL-2 provides: +# * a Mandarin speech corpus (~1000hrs), free for non-commercial research/education use +# * a baseline recipe setup for large scale Mandarin ASR system +# For more details, read $KALDI_ROOT/egs/aishell2/README.txt + +# modify this to your AISHELL-2 training data path +# e.g: +# trn_set=/disk10/data/AISHELL-2/iOS/data +# dev_set=/disk10/data/AISHELL-2/iOS/dev +# tst_set=/disk10/data/AISHELL-2/iOS/test +trn_set= +dev_set= +tst_set= + +nj=20 +stage=1 +gmm_stage=1 + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +# prepare trn/dev/tst data, lexicon, lang etc +if [ $stage -le 1 ]; then + local/prepare_all.sh ${trn_set} ${dev_set} ${tst_set} || exit 1; +fi + +# GMM +if [ $stage -le 2 ]; then + local/run_gmm.sh --nj $nj --stage $gmm_stage +fi + +# chain +if [ $stage -le 3 ]; then + local/chain/run_tdnn.sh --nj $nj +fi + +local/show_results.sh + +exit 0; diff --git a/egs/aishell2/s5/steps b/egs/aishell2/s5/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/aishell2/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/aishell2/s5/utils b/egs/aishell2/s5/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/aishell2/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/ami/s5b/local/chain/run_tdnn_lstm_bs.sh b/egs/ami/s5b/local/chain/run_tdnn_lstm_bs.sh new file mode 120000 index 00000000000..c3c8dc56cc2 --- /dev/null +++ b/egs/ami/s5b/local/chain/run_tdnn_lstm_bs.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_bs_1a.sh \ No newline at end of file diff --git a/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_bs_1a.sh b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_bs_1a.sh new file mode 100755 index 00000000000..b672a44e572 --- /dev/null +++ b/egs/ami/s5b/local/chain/tuning/run_tdnn_lstm_bs_1a.sh @@ -0,0 +1,309 @@ +#!/bin/bash + +# same as tdnn_lstm_1o but use backstitch training. +# Also num-epochs and l2-regularize are tuned for best performance. + +# local/chain/tuning/run_tdnn_lstm_bs_1a.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned +# local/chain/compare_wer_general.sh sdm1 tdnn_lstm_bs_1a_sp_bi_ihmali_ld5 tdnn_lstm1o_sp_bi_ihmali_ld5 + +# System tdnn_lstm_bs_1a_sp_bi_ihmali_ld5 tdnn_lstm1o_sp_bi_ihmali_ld5 +# WER on dev 33.8 35.2 +# WER on eval 37.5 38.7 +# Final train prob -0.126056 -0.167549 +# Final valid prob -0.228452 -0.24847 +# Final train prob (xent) -1.51685 -1.7403 +# Final valid prob (xent) -2.04719 -2.13732 + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +min_seg_len=1.55 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3 # the gmm for the IHM system (if --use-ihm-ali true). 
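This AMI tuning script exposes the backstitch-specific knobs (the alpha scale and back_interval defined a few lines below) through utils/parse_options.sh, so a hypothetical comparison over different backstitch scales can reuse the same script; the --tlstm-affix values here are made up just to keep each run in its own experiment directory.

```bash
# Hypothetical sweep over the backstitch scale on the SDM1 setup.
for alpha in 0.1 0.2 0.3; do
  local/chain/tuning/run_tdnn_lstm_bs_1a.sh --mic sdm1 --use-ihm-ali true \
    --train-set train_cleaned --gmm tri3_cleaned \
    --alpha $alpha --tlstm-affix bs_a${alpha}
done
```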
+num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned +num_epochs=10 + +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +remove_egs=true +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1a #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +alpha=0.2 +back_interval=1 + +# decode options +extra_left_context=50 +frames_per_chunk= + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +xent_regularize=0.1 + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + tdnn_opts="l2-regularize=0.003" + lstm_opts="l2-regularize=0.005" + output_opts="l2-regularize=0.001" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=1024 $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 $tdnn_opts + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 $tdnn_opts + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 $tdnn_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 $tdnn_opts + fast-lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 $tdnn_opts + relu-batchnorm-layer name=tdnn8 
input=Append(-3,0,3) dim=1024 $tdnn_opts + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=1024 $tdnn_opts + fast-lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.max-models-combine=30 \ + --trainer.optimization.backstitch-training-scale $alpha \ + --trainer.optimization.backstitch-training-interval $back_interval \ + --trainer.max-param-change 2.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs $remove_egs \ + --cleanup true \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
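+  # (For orientation: chain models build the graph with --self-loop-scale 1.0,
+  # matching the acoustic scale of 1.0 used in the decoding stage below; the
+  # --post-decode-acwt 10.0 option there rescales the lattice scores so the
+  # usual LM-weight range still works for scoring.)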
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --online-ivector-dir exp/$mic/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/ami/s5b/local/nnet3/run_tdnn.sh b/egs/ami/s5b/local/nnet3/run_tdnn.sh index b7f770b4a44..cc6b60696b1 100755 --- a/egs/ami/s5b/local/nnet3/run_tdnn.sh +++ b/egs/ami/s5b/local/nnet3/run_tdnn.sh @@ -23,7 +23,6 @@ # local/nnet3/run_tdnn.sh --mic sdm1 --use-ihm-ali true --affix _cleaned2 --gmm tri4a --train-set train_cleaned2 & - set -e -o pipefail -u # First the options that are passed through to run_ivector_common.sh diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn.sh index f5cb2249b95..4f485edf7da 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn.sh @@ -35,7 +35,7 @@ train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. tdnn_affix= #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. common_egs_dir= # you can set this to use previously dumped egs. - +chunk_width=150,120,90,75 # End configuration section. echo "$0 $@" # Print the command line for logging @@ -141,15 +141,15 @@ if [ $stage -le 17 ]; then fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=450 - relu-renorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=450 - relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=450 - relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=450 - relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=450 - relu-renorm-layer name=tdnn7 input=Append(-6,-3,0) dim=450 + relu-batchnorm-layer name=tdnn1 dim=450 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1,2) dim=450 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=450 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=450 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=450 + relu-batchnorm-layer name=tdnn7 input=Append(-6,-3,0) dim=450 ## adding the layers for chain branch - relu-renorm-layer name=prefinal-chain input=tdnn7 dim=450 target-rms=0.5 + relu-batchnorm-layer name=prefinal-chain input=tdnn7 dim=450 target-rms=0.5 output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 # adding the layers for xent branch @@ -161,7 +161,7 @@ if [ $stage -le 17 ]; then # final-layer learns at a rate independent of the regularization # constant; and the 0.5 was tuned so as to make the relative progress # similar in the xent and regular final layers. 
- relu-renorm-layer name=prefinal-xent input=tdnn7 dim=450 target-rms=0.5 + relu-batchnorm-layer name=prefinal-xent input=tdnn7 dim=450 target-rms=0.5 output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 EOF @@ -188,7 +188,7 @@ if [ $stage -le 18 ]; then --chain.lm-opts="--num-extra-lm-states=2000" \ --egs.dir "$common_egs_dir" \ --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ + --egs.chunk-width $chunk_width \ --trainer.num-chunk-per-minibatch 128 \ --trainer.frames-per-iter 1500000 \ --trainer.num-epochs 4 \ diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm.sh index 881323cd22c..72f7a3c32dd 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm.sh @@ -35,6 +35,7 @@ train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. tdnn_affix= #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. common_egs_dir= # you can set this to use previously dumped egs. +chunk_width=150,120,90,75 # End configuration section. echo "$0 $@" # Print the command line for logging @@ -143,17 +144,17 @@ if [ $stage -le 17 ]; then fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=1024 - relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 - relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + relu-batchnorm-layer name=tdnn1 dim=1024 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts - relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 - relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts - relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 - relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts ## adding the layers for chain branch @@ -194,7 +195,7 @@ if [ $stage -le 18 ]; then --chain.lm-opts="--num-extra-lm-states=2000" \ --egs.dir "$common_egs_dir" \ --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ + --egs.chunk-width $chunk_width \ --trainer.num-chunk-per-minibatch 128 \ --trainer.frames-per-iter 1500000 \ --trainer.num-epochs 4 \ diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab1.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab1.sh index 4a2fd806a17..be0c2cc4b9b 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab1.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab1.sh @@ -33,6 +33,7 @@ train_stage=-10 tree_affix= # affix for tree directory, e.g. 
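+  # (A note on the relu-renorm -> relu-batchnorm changes in this file: the
+  # renorm layers use Kaldi's NormalizeComponent, which scales each frame to
+  # unit RMS, while the batchnorm layers use BatchNormComponent; the batchnorm
+  # variant is generally reported to train somewhat better in nnet3 recipes,
+  # which is presumably the motivation for switching these layers.)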
"a" or "b", in case we change the configuration. tdnn_affix=_bab1 #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. common_egs_dir=exp/chain_cleaned/tdnn_lstm_sp/egs # you can set this to use previously dumped egs. +chunk_width=150,120,90,75 # End configuration section. echo "$0 $@" # Print the command line for logging @@ -141,17 +142,17 @@ if [ $stage -le 17 ]; then fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=512 - relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=512 - relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=512 + relu-batchnorm-layer name=tdnn1 dim=512 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=512 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=512 # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults fast-lstmp-layer name=fastlstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstm_opts - relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=512 - relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=512 fast-lstmp-layer name=fastlstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstm_opts - relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=512 - relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=512 fast-lstmp-layer name=fastlstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstm_opts ## adding the layers for chain branch @@ -192,7 +193,7 @@ if [ $stage -le 18 ]; then --chain.lm-opts="--num-extra-lm-states=2000" \ --egs.dir "$common_egs_dir" \ --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ + --egs.chunk-width $chunk_width \ --trainer.num-chunk-per-minibatch 128 \ --trainer.frames-per-iter 1500000 \ --trainer.num-epochs 4 \ diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab2.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab2.sh index 733331aa6db..8f21a239794 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab2.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab2.sh @@ -33,6 +33,7 @@ train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. tdnn_affix=_bab2 #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. common_egs_dir=exp/chain_cleaned/tdnn_lstm_sp/egs # you can set this to use previously dumped egs. +chunk_width=150,120,90,75 # End configuration section. 
echo "$0 $@" # Print the command line for logging @@ -141,17 +142,17 @@ if [ $stage -le 17 ]; then fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=512 - relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=512 - relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=512 + relu-batchnorm-layer name=tdnn1 dim=512 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=512 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=512 # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults fast-lstmp-layer name=fastlstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstm_opts - relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=512 - relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=512 fast-lstmp-layer name=fastlstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstm_opts - relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=512 - relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=512 fast-lstmp-layer name=fastlstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstm_opts ## adding the layers for chain branch @@ -192,7 +193,7 @@ if [ $stage -le 18 ]; then --chain.lm-opts="--num-extra-lm-states=2000" \ --egs.dir "$common_egs_dir" \ --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ + --egs.chunk-width $chunk_width \ --trainer.num-chunk-per-minibatch 128 \ --trainer.frames-per-iter 1500000 \ --trainer.num-epochs 4 \ diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab3.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab3.sh index 13d75b8a10c..7898d172242 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab3.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab3.sh @@ -34,6 +34,7 @@ train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. tdnn_affix=_bab3 #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. common_egs_dir=exp/chain_cleaned/tdnn_lstm_sp/egs # you can set this to use previously dumped egs. +chunk_width=150,120,90,75 # End configuration section. 
echo "$0 $@" # Print the command line for logging @@ -142,17 +143,17 @@ if [ $stage -le 17 ]; then fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=512 - relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=512 - relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=512 + relu-batchnorm-layer name=tdnn1 dim=512 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=512 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=512 # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults fast-lstmp-layer name=fastlstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstm_opts - relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=512 - relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=512 fast-lstmp-layer name=fastlstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstm_opts - relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=512 - relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=512 fast-lstmp-layer name=fastlstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstm_opts ## adding the layers for chain branch @@ -193,7 +194,7 @@ if [ $stage -le 18 ]; then --chain.lm-opts="--num-extra-lm-states=2000" \ --egs.dir "$common_egs_dir" \ --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ + --egs.chunk-width $chunk_width \ --trainer.num-chunk-per-minibatch 128 \ --trainer.frames-per-iter 1500000 \ --trainer.num-epochs 2 \ diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab4.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab4.sh index fdac3e6847b..49462573245 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab4.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab4.sh @@ -34,6 +34,7 @@ train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. tdnn_affix=_bab4 #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. common_egs_dir=exp/chain_cleaned/tdnn_lstm_sp/egs # you can set this to use previously dumped egs. +chunk_width=150,120,90,75 # End configuration section. 
echo "$0 $@" # Print the command line for logging @@ -142,17 +143,17 @@ if [ $stage -le 17 ]; then fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=1024 - relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 - relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + relu-batchnorm-layer name=tdnn1 dim=1024 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts - relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 - relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts - relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 - relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts ## adding the layers for chain branch @@ -193,7 +194,7 @@ if [ $stage -le 18 ]; then --chain.lm-opts="--num-extra-lm-states=2000" \ --egs.dir "$common_egs_dir" \ --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ + --egs.chunk-width $chunk_width \ --trainer.num-chunk-per-minibatch 128 \ --trainer.frames-per-iter 1500000 \ --trainer.num-epochs 4 \ diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab5.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab5.sh index 1ffd0dfcc0e..c888d985f5e 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab5.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab5.sh @@ -34,6 +34,7 @@ train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. tdnn_affix=_bab5 #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. common_egs_dir=exp/chain_cleaned/tdnn_lstm_sp/egs # you can set this to use previously dumped egs. +chunk_width=150,120,90,75 # End configuration section. 
echo "$0 $@" # Print the command line for logging @@ -142,17 +143,17 @@ if [ $stage -le 17 ]; then fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=1024 - relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 - relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + relu-batchnorm-layer name=tdnn1 dim=1024 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstm_opts - relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 - relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstm_opts - relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 - relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstm_opts ## adding the layers for chain branch @@ -193,7 +194,7 @@ if [ $stage -le 18 ]; then --chain.lm-opts="--num-extra-lm-states=2000" \ --egs.dir "$common_egs_dir" \ --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ + --egs.chunk-width $chunk_width \ --trainer.num-chunk-per-minibatch 128 \ --trainer.frames-per-iter 1500000 \ --trainer.num-epochs 4 \ diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab6.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab6.sh index bb00446357f..e9a045e113a 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab6.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab6.sh @@ -34,6 +34,7 @@ train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. tdnn_affix=_bab6 #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. common_egs_dir=exp/chain_cleaned/tdnn_lstm_sp/egs # you can set this to use previously dumped egs. +chunk_width=150,120,90,75 # End configuration section. 
echo "$0 $@" # Print the command line for logging @@ -142,17 +143,17 @@ if [ $stage -le 17 ]; then fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=512 - relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=512 - relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=512 + relu-batchnorm-layer name=tdnn1 dim=512 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=512 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=512 # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults fast-lstmp-layer name=fastlstm1 cell-dim=512 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts - relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=512 - relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=512 fast-lstmp-layer name=fastlstm2 cell-dim=512 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts - relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=512 - relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=512 fast-lstmp-layer name=fastlstm3 cell-dim=512 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts ## adding the layers for chain branch @@ -193,7 +194,7 @@ if [ $stage -le 18 ]; then --chain.lm-opts="--num-extra-lm-states=2000" \ --egs.dir "$common_egs_dir" \ --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ + --egs.chunk-width $chunk_width \ --trainer.num-chunk-per-minibatch 128 \ --trainer.frames-per-iter 1500000 \ --trainer.num-epochs 4 \ diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7.sh index e4ce12269dc..ce192a91665 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7.sh @@ -35,6 +35,7 @@ train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. tdnn_affix="_bab7" #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. common_egs_dir=exp/chain_cleaned/tdnn_lstm_sp/egs # you can set this to use previously dumped egs. +chunk_width=150,120,90,75 # End configuration section. 
echo "$0 $@" # Print the command line for logging @@ -143,17 +144,17 @@ if [ $stage -le 17 ]; then fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=1024 - relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 - relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + relu-batchnorm-layer name=tdnn1 dim=1024 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts - relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 - relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts - relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 - relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts ## adding the layers for chain branch @@ -194,7 +195,7 @@ if [ $stage -le 18 ]; then --chain.lm-opts="--num-extra-lm-states=2000" \ --egs.dir "$common_egs_dir" \ --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ + --egs.chunk-width $chunk_width \ --trainer.num-chunk-per-minibatch 128 \ --trainer.frames-per-iter 1500000 \ --trainer.num-epochs 4 \ diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7_bs_batchnorm.sh b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab8.sh similarity index 83% rename from egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7_bs_batchnorm.sh rename to egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab8.sh index d35b84a3a6c..3fc0ef2206c 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7_bs_batchnorm.sh +++ b/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab8.sh @@ -2,28 +2,26 @@ # by default, with cleanup -# this is basically the same as the run_tdnn_lstm_bab7.sh -# with dropout and backstitch (+more iterations as the convergence with dropout and backstitch is slower) # please note that the language(s) was not selected for any particular reason (other to represent the various sizes of babel datasets) -# 304-lithuanian | %WER 38.1 | 20041 61492 | 64.7 25.8 9.5 2.7 38.1 28.0 | -0.290 | exp/chain_cleaned/tdnn_lstm_bab7_bs_batchnorm_sp/decode_dev10h.pem/score_9/dev10h.pem.ctm.sys -# num-iters=120 nj=2..12 num-params=36.7M dim=43+100->3273 combine=-0.152->-0.140 -# xent:train/valid[79,119,final]=(-2.33,-1.63,-1.62/-2.51,-2.01,-2.01) -# logprob:train/valid[79,119,final]=(-0.184,-0.126,-0.125/-0.221,-0.201,-0.202) -# 206-zulu | %WER 50.6 | 22805 52162 | 53.5 37.4 9.1 4.1 50.6 30.4 | -0.635 | exp/chain_cleaned/tdnn_lstm_bab7_bs_batchnorm_sp/decode_dev10h.pem/score_10/dev10h.pem.ctm.sys -# num-iters=167 nj=2..12 num-params=36.7M dim=43+100->3274 combine=-0.191->-0.182 -# xent:train/valid[110,166,final]=(-2.46,-1.82,-1.74/-2.58,-2.13,-2.10) -# 
logprob:train/valid[110,166,final]=(-0.231,-0.170,-0.160/-0.269,-0.244,-0.244) -# 104-pashto | %WER 38.6 | 21825 101803 | 65.4 25.1 9.4 4.0 38.6 29.6 | -0.473 | exp/chain_cleaned/tdnn_lstm_bab7_bs_batchnorm_sp/decode_dev10h.pem/score_9/dev10h.pem.ctm.sys -# num-iters=214 nj=2..12 num-params=36.8M dim=43+100->3328 combine=-0.169->-0.164 -# xent:train/valid[141,213,final]=(-2.39,-1.69,-1.63/-2.57,-2.05,-2.03) -# logprob:train/valid[141,213,final]=(-0.210,-0.151,-0.144/-0.259,-0.228,-0.227) +# 304-lithuanian | %WER 39.9 | 20041 61492 | 62.7 27.3 10.0 2.6 39.9 28.6 | -0.268 | exp/chain_cleaned/tdnn_lstm_bab7_sp/decode_dev10h.pem/score_10/dev10h.pem.ctm.sys +# num-iters=48 nj=2..12 num-params=36.7M dim=43+100->3273 combine=-0.204->-0.179 +# xent:train/valid[31,47,final]=(-2.35,-1.89,-1.86/-2.49,-2.19,-2.17) +# logprob:train/valid[31,47,final]=(-0.199,-0.158,-0.154/-0.236,-0.221,-0.222) +# 206-zulu | %WER 52.2 | 22805 52162 | 51.6 38.2 10.2 3.8 52.2 30.7 | -0.629 | exp/chain_cleaned/tdnn_lstm_bab7_sp/decode_dev10h.pem/score_11/dev10h.pem.ctm.sys +# num-iters=66 nj=2..12 num-params=36.7M dim=43+100->3274 combine=-0.237->-0.215 +# xent:train/valid[43,65,final]=(-2.42,-1.96,-1.94/-2.53,-2.25,-2.24) +# logprob:train/valid[43,65,final]=(-0.239,-0.188,-0.186/-0.279,-0.267,-0.266) +# 104-pashto | %WER 40.2 | 21825 101803 | 63.8 25.8 10.4 3.9 40.2 29.8 | -0.438 | exp/chain_cleaned/tdnn_lstm_bab7_sp/decode_dev10h.pem/score_10/dev10h.pem.ctm.sys +# num-iters=85 nj=2..12 num-params=36.8M dim=43+100->3328 combine=-0.203->-0.189 +# xent:train/valid[55,84,final]=(-2.27,-1.81,-1.79/-2.46,-2.18,-2.17) +# logprob:train/valid[55,84,final]=(-0.213,-0.166,-0.163/-0.264,-0.249,-0.250) + set -e -o pipefail # First the options that are passed through to run_ivector_common.sh # (some of which are also used in this script directly). stage=17 nj=30 -dropout_schedule='0,0@0.20,0.3@0.50,0' train_set=train_cleaned gmm=tri5_cleaned # the gmm for the target data langdir=data/langp/tri5_ali @@ -34,8 +32,10 @@ nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned # are just hardcoded at this level, in the commands below. train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_affix="_bab7_bs_batchnorm" #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix="_bab8" #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. common_egs_dir=exp/chain_cleaned/tdnn_lstm_sp/egs # you can set this to use previously dumped egs. +dropout_schedule='0,0@0.20,0.3@0.50,0' +chunk_width=150,120,90,75 # End configuration section. 
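+# Note on the dropout_schedule value above: it is a comma-separated list of
+# dropout proportions, each optionally tagged with @<fraction-of-training>, and
+# is interpreted piecewise-linearly; '0,0@0.20,0.3@0.50,0' roughly means no
+# dropout until 20% of training, ramping up to 0.3 by the halfway point and
+# back down to 0 by the end.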
echo "$0 $@" # Print the command line for logging @@ -130,7 +130,7 @@ if [ $stage -le 17 ]; then num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - lstm_opts="decay-time=20 dropout-proportion=0.0" + lstm_opts="decay-time=20 dropout-proportion=0.0 " label_delay=5 mkdir -p $dir/configs @@ -195,7 +195,7 @@ if [ $stage -le 18 ]; then --chain.lm-opts="--num-extra-lm-states=2000" \ --egs.dir "$common_egs_dir" \ --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ + --egs.chunk-width $chunk_width \ --trainer.num-chunk-per-minibatch 128 \ --trainer.frames-per-iter 1500000 \ --trainer.num-epochs 10 \ diff --git a/egs/callhome_diarization/v1/diarization/extract_ivectors.sh b/egs/callhome_diarization/v1/diarization/extract_ivectors.sh index 9219de63293..370a37b873e 100755 --- a/egs/callhome_diarization/v1/diarization/extract_ivectors.sh +++ b/egs/callhome_diarization/v1/diarization/extract_ivectors.sh @@ -45,6 +45,8 @@ if [ $# != 3 ]; then echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." echo " --window # Sliding window length in seconds" echo " --period # Period of sliding windows in seconds" + echo " --pca-dim # If provided, the whitening transform also" + echo " # performs dimension reduction." echo " --min-segment # Minimum segment length in seconds per ivector" echo " --hard-min # Removes segments less than min-segment if true." echo " # Useful for extracting training ivectors." @@ -66,7 +68,6 @@ for f in $srcdir/final.ie $srcdir/final.ubm $data/feats.scp ; do [ ! -f $f ] && echo "No such file $f" && exit 1; done - sub_data=$dir/subsegments_data mkdir -p $sub_data @@ -103,7 +104,6 @@ else feats="ark,s,cs:add-deltas $delta_opts scp:$sub_sdata/JOB/feats.scp ark:- |" fi - if [ $stage -le 1 ]; then echo "$0: extracting iVectors" dubm="fgmm-global-to-gmm $srcdir/final.ubm -|" diff --git a/egs/callhome_diarization/v1/diarization/make_rttm.py b/egs/callhome_diarization/v1/diarization/make_rttm.py index 270fd8d787f..1705411069f 100755 --- a/egs/callhome_diarization/v1/diarization/make_rttm.py +++ b/egs/callhome_diarization/v1/diarization/make_rttm.py @@ -5,8 +5,11 @@ # Apache 2.0. """This script converts a segments and labels file to a NIST RTTM -file. It handles overlapping segments (e.g. the output of a sliding- -window diarization system). +file. It creates flat segmentation (i.e. no overlapping regions) +from overlapping segments, e.g. the output of a sliding-window +diarization system. The speaker boundary between two overlapping +segments by different speakers is placed at the midpoint between +the end of the first segment and the start of the second segment. The segments file format is: diff --git a/egs/callhome_diarization/v1/diarization/nnet3/xvector/extract_xvectors.sh b/egs/callhome_diarization/v1/diarization/nnet3/xvector/extract_xvectors.sh new file mode 100755 index 00000000000..d7591a6a3a8 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/nnet3/xvector/extract_xvectors.sh @@ -0,0 +1,162 @@ +#!/bin/bash + +# Copyright 2017-2018 Daniel Povey +# 2017-2018 David Snyder +# 2017-2018 Matthew Maciejewski +# Apache 2.0. + +# This script is a modified version of diarization/extract_ivectors.sh +# that extracts x-vectors instead of i-vectors for speaker diarization. 
+# +# The script assumes that the x-vector DNN has already been trained, and +# a data directory that contains a segments file and features for the +# x-vector DNN exists. The segments file was most likely created by a +# speech activity detection system that identified the speech segments in +# the recordings. This script performs a subsegmentation, that further +# splits the speech segments into very short overlapping subsegments (e.g., +# 1.5 seconds, with a 0.75 overlap). Finally, x-vectors are extracted +# for each of the subsegments. After this, you will most likely use +# diarization/nnet3/xvector/score_plda.sh to compute the similarity +# between all pairs of x-vectors in a recording. + +# Begin configuration section. +nj=30 +cmd="run.pl" +chunk_size=-1 # The chunk size over which the embedding is extracted. + # If left unspecified, it uses the max_chunk_size in the nnet + # directory. +stage=0 +window=1.5 +period=0.75 +pca_dim= +min_segment=0.5 +hard_min=false +apply_cmn=true # If true, apply sliding window cepstral mean normalization +use_gpu=false +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 3 ]; then + echo "Usage: $0 " + echo " e.g.: $0 exp/xvector_nnet data/train exp/xvectors" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --window # Sliding window length in seconds" + echo " --period # Period of sliding windows in seconds" + echo " --pca-dim # If provided, the whitening transform also" + echo " # performs dimension reduction" + echo " --min-segment # Minimum segment length in seconds per xvector" + echo " --hard-min # Removes segments less than min-segment if true." + echo " # Useful for extracting training xvectors." + echo " --chunk-size # If provided, extracts embeddings with specified" + echo " # chunk size, and averages to produce final embedding" + echo " --apply-cmn # If true, apply sliding window cepstral mean" + echo " # normalization to features" + echo " --nj # Number of jobs" + echo " --stage # To control partial reruns" + exit 1; +fi + +srcdir=$1 +data=$2 +dir=$3 + +for f in $srcdir/final.raw $srcdir/min_chunk_size $srcdir/max_chunk_size $data/feats.scp ; do + [ ! 
-f $f ] && echo "No such file $f" && exit 1; +done + +min_chunk_size=`cat $srcdir/min_chunk_size 2>/dev/null` +max_chunk_size=`cat $srcdir/max_chunk_size 2>/dev/null` + +nnet=$srcdir/final.raw +if [ -f $srcdir/extract.config ] ; then + echo "$0: using $srcdir/extract.config to extract xvectors" + nnet="nnet3-copy --nnet-config=$srcdir/extract.config $srcdir/final.raw - |" +fi + +if [ $chunk_size -le 0 ]; then + chunk_size=$max_chunk_size +fi + +if [ $max_chunk_size -lt $chunk_size ]; then + echo "$0: specified chunk size of $chunk_size is larger than the maximum chunk size, $max_chunk_size" && exit 1; +fi + +sub_data=$dir/subsegments_data +mkdir -p $sub_data + +# Set up sliding-window subsegments +if [ $stage -le 0 ]; then + if $hard_min; then + awk -v min=$min_segment '{if($4-$3 >= min){print $0}}' $data/segments \ + > $dir/pruned_segments + segments=$dir/pruned_segments + else + segments=$data/segments + fi + utils/data/get_uniform_subsegments.py \ + --max-segment-duration=$window \ + --overlap-duration=$(echo "$window-$period" | bc) \ + --max-remaining-duration=$min_segment \ + --constant-duration=True \ + $segments > $dir/subsegments + utils/data/subsegment_data_dir.sh $data \ + $dir/subsegments $sub_data +fi + +# Set various variables. +mkdir -p $dir/log +sub_sdata=$sub_data/split$nj; +utils/split_data.sh $sub_data $nj || exit 1; + +## Set up features. +if $apply_cmn; then + feats="ark,s,cs:apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 scp:${sub_sdata}/JOB/feats.scp ark:- |" +else + feats="scp:${sub_sdata}/JOB/feats.scp" +fi + +if [ $stage -le 1 ]; then + echo "$0: extracting xvectors from nnet" + if $use_gpu; then + for g in $(seq $nj); do + $cmd --gpu 1 ${dir}/log/extract.$g.log \ + nnet3-xvector-compute --use-gpu=yes --min-chunk-size=$min_chunk_size --chunk-size=$chunk_size \ + "$nnet" "`echo $feats | sed s/JOB/$g/g`" ark,scp:${dir}/xvector.$g.ark,${dir}/xvector.$g.scp || exit 1 & + done + wait + else + $cmd JOB=1:$nj ${dir}/log/extract.JOB.log \ + nnet3-xvector-compute --use-gpu=no --min-chunk-size=$min_chunk_size --chunk-size=$chunk_size \ + "$nnet" "$feats" ark,scp:${dir}/xvector.JOB.ark,${dir}/xvector.JOB.scp || exit 1; + fi +fi + +if [ $stage -le 2 ]; then + echo "$0: combining xvectors across jobs" + for j in $(seq $nj); do cat $dir/xvector.$j.scp; done >$dir/xvector.scp || exit 1; + cp $sub_data/{segments,spk2utt,utt2spk} $dir +fi + +if [ $stage -le 3 ]; then + echo "$0: Computing mean of xvectors" + $cmd $dir/log/mean.log \ + ivector-mean scp:$dir/xvector.scp $dir/mean.vec || exit 1; +fi + +if [ $stage -le 4 ]; then + if [ -z "$pca_dim" ]; then + pca_dim=-1 + fi + echo "$0: Computing whitening transform" + $cmd $dir/log/transform.log \ + est-pca --read-vectors=true --normalize-mean=false \ + --normalize-variance=true --dim=$pca_dim \ + scp:$dir/xvector.scp $dir/transform.mat || exit 1; +fi diff --git a/egs/callhome_diarization/v1/diarization/nnet3/xvector/score_plda.sh b/egs/callhome_diarization/v1/diarization/nnet3/xvector/score_plda.sh new file mode 100755 index 00000000000..703bafd8912 --- /dev/null +++ b/egs/callhome_diarization/v1/diarization/nnet3/xvector/score_plda.sh @@ -0,0 +1,84 @@ +#!/bin/bash +# Copyright 2016-2018 David Snyder +# 2017-2018 Matthew Maciejewski +# Apache 2.0. + +# This script is a modified version of diarization/score_plda.sh +# that replaces i-vectors with x-vectors. +# +# This script computes PLDA scores from pairs of x-vectors extracted +# from segments of a recording. 
These scores are in the form of +# affinity matrices, one for each recording. Most likely, the x-vectors +# were computed using diarization/nnet3/xvector/extract_xvectors.sh. +# The affinity matrices are most likely going to be clustered using +# diarization/cluster.sh. + +# Begin configuration section. +cmd="run.pl" +stage=0 +target_energy=0.1 +nj=10 +cleanup=true +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 3 ]; then + echo "Usage: $0 " + echo " e.g.: $0 exp/xvectors_callhome_heldout exp/xvectors_callhome_test exp/xvectors_callhome_test" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --nj # Number of jobs (also see num-processes and num-threads)" + echo " --stage # To control partial reruns" + echo " --target-energy # Target energy remaining in xvectors after applying" + echo " # a conversation dependent PCA." + echo " --cleanup # If true, remove temporary files" + exit 1; +fi + +pldadir=$1 +xvecdir=$2 +dir=$3 + +mkdir -p $dir/tmp + +for f in $xvecdir/xvector.scp $xvecdir/spk2utt $xvecdir/utt2spk $xvecdir/segments $pldadir/plda $pldadir/mean.vec $pldadir/transform.mat; do + [ ! -f $f ] && echo "No such file $f" && exit 1; +done +cp $xvecdir/xvector.scp $dir/tmp/feats.scp +cp $xvecdir/spk2utt $dir/tmp/ +cp $xvecdir/utt2spk $dir/tmp/ +cp $xvecdir/segments $dir/tmp/ +cp $xvecdir/spk2utt $dir/ +cp $xvecdir/utt2spk $dir/ +cp $xvecdir/segments $dir/ + +utils/fix_data_dir.sh $dir/tmp > /dev/null + +sdata=$dir/tmp/split$nj; +utils/split_data.sh $dir/tmp $nj || exit 1; + +# Set various variables. +mkdir -p $dir/log + +feats="ark:ivector-subtract-global-mean $pldadir/mean.vec scp:$sdata/JOB/feats.scp ark:- | transform-vec $pldadir/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" +if [ $stage -le 0 ]; then + echo "$0: scoring xvectors" + $cmd JOB=1:$nj $dir/log/plda_scoring.JOB.log \ + ivector-plda-scoring-dense --target-energy=$target_energy $pldadir/plda \ + ark:$sdata/JOB/spk2utt "$feats" ark,scp:$dir/scores.JOB.ark,$dir/scores.JOB.scp || exit 1; +fi + +if [ $stage -le 1 ]; then + echo "$0: combining PLDA scores across jobs" + for j in $(seq $nj); do cat $dir/scores.$j.scp; done >$dir/scores.scp || exit 1; +fi + +if $cleanup ; then + rm -rf $dir/tmp || exit 1; +fi diff --git a/egs/callhome_diarization/v1/diarization/vad_to_segments.sh b/egs/callhome_diarization/v1/diarization/vad_to_segments.sh index 5f1daf9656a..d653e0313ea 100755 --- a/egs/callhome_diarization/v1/diarization/vad_to_segments.sh +++ b/egs/callhome_diarization/v1/diarization/vad_to_segments.sh @@ -11,12 +11,13 @@ nj=2 stage=0 cmd=run.pl segmentation_opts= # E.g. set this as --segmentation-opts "--silance-proportion 0.2 --max-segment-length 10" +min_duration=0.25 # end configuration section. echo "$0 $@" # Print the command line for logging -[ -f .path.sh ] && . ./path.sh +if [ -f path.sh ]; then . ./path.sh; fi . parse_options.sh || exit 1; if [ $# -ne 2 ]; then @@ -26,6 +27,7 @@ if [ $# -ne 2 ]; then echo " --stage (0|1) # start script from part-way through" echo " --cmd (run.pl|queue.pl...) 
# specify how to run the sub-processes" echo " --segmentation-opts '--opt1 opt1val --opt2 opt2val' # options for segmentation.pl" + echo " --min-duration # min duration in seconds for segments (smaller ones are discarded)" echo "e.g.:" echo "$0 data/train data/train_segmented" exit 1; @@ -58,7 +60,9 @@ if [ $stage -le 0 ]; then for n in `seq $nj`; do cat $sdata/$n/subsegments - done | sort > $data/subsegments || exit 1; + done | sort | \ + awk -v m=$min_duration '{if ($4 - $3 >= m) { print $0 }}' \ + > $data/subsegments || exit 1; fi if [ $stage -le 1 ]; then diff --git a/egs/callhome_diarization/v1/local/make_musan.py b/egs/callhome_diarization/v1/local/make_musan.py new file mode 100755 index 00000000000..b3f6652ba40 --- /dev/null +++ b/egs/callhome_diarization/v1/local/make_musan.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python3 +# Copyright 2015 David Snyder +# Apache 2.0. +# +# This file is meant to be invoked by make_musan.sh. + +import os, sys + +def process_music_annotations(path): + utt2spk = {} + utt2vocals = {} + lines = open(path, 'r').readlines() + for line in lines: + utt, genres, vocals, musician = line.rstrip().split()[:4] + # For this application, the musican ID isn't important + utt2spk[utt] = utt + utt2vocals[utt] = vocals == "Y" + return utt2spk, utt2vocals + +def prepare_music(root_dir, use_vocals): + utt2vocals = {} + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + music_dir = os.path.join(root_dir, "music") + for root, dirs, files in os.walk(music_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + elif str(file) == "ANNOTATIONS": + utt2spk_part, utt2vocals_part = process_music_annotations(file_path) + utt2spk.update(utt2spk_part) + utt2vocals.update(utt2vocals_part) + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2vocals: + if utt in utt2wav: + if use_vocals or not utt2vocals[utt]: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print("In music directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") + return utt2spk_str, utt2wav_str + +def prepare_speech(root_dir): + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + speech_dir = os.path.join(root_dir, "speech") + for root, dirs, files in os.walk(speech_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print("In speech directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") + return utt2spk_str, utt2wav_str + +def prepare_noise(root_dir): + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + noise_dir = os.path.join(root_dir, "noise") + for root, dirs, files in os.walk(noise_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str 
= "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print("In noise directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") + return utt2spk_str, utt2wav_str + +def main(): + in_dir = sys.argv[1] + out_dir = sys.argv[2] + use_vocals = sys.argv[3] == "Y" + utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals) + utt2spk_speech, utt2wav_speech = prepare_speech(in_dir) + utt2spk_noise, utt2wav_noise = prepare_noise(in_dir) + utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise + utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise + wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') + wav_fi.write(utt2wav) + utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') + utt2spk_fi.write(utt2spk) + + +if __name__=="__main__": + main() diff --git a/egs/callhome_diarization/v1/local/make_musan.sh b/egs/callhome_diarization/v1/local/make_musan.sh new file mode 100755 index 00000000000..694940ad70f --- /dev/null +++ b/egs/callhome_diarization/v1/local/make_musan.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Copyright 2015 David Snyder +# Apache 2.0. +# +# This script, called by ../run.sh, creates the MUSAN +# data directory. The required dataset is freely available at +# http://www.openslr.org/17/ + +set -e +in_dir=$1 +data_dir=$2 +use_vocals='Y' + +mkdir -p local/musan.tmp + +echo "Preparing ${data_dir}/musan..." +mkdir -p ${data_dir}/musan +local/make_musan.py ${in_dir} ${data_dir}/musan ${use_vocals} + +utils/fix_data_dir.sh ${data_dir}/musan + +grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music +grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech +grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise +utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \ + ${data_dir}/musan ${data_dir}/musan_music +utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \ + ${data_dir}/musan ${data_dir}/musan_speech +utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \ + ${data_dir}/musan ${data_dir}/musan_noise + +utils/fix_data_dir.sh ${data_dir}/musan_music +utils/fix_data_dir.sh ${data_dir}/musan_speech +utils/fix_data_dir.sh ${data_dir}/musan_noise + +rm -rf local/musan.tmp + diff --git a/egs/callhome_diarization/v1/local/make_swbd2_phase2.pl b/egs/callhome_diarization/v1/local/make_swbd2_phase2.pl index 0520d21bce4..337ab9d9708 100755 --- a/egs/callhome_diarization/v1/local/make_swbd2_phase2.pl +++ b/egs/callhome_diarization/v1/local/make_swbd2_phase2.pl @@ -1,4 +1,5 @@ #!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter # # Copyright 2013 Daniel Povey # Apache 2.0 @@ -22,7 +23,7 @@ @badAudio = ("3", "4"); -$tmp_dir = "$out_base/tmp"; +$tmp_dir = "$out_dir/tmp"; if (system("mkdir -p $tmp_dir") != 0) { die "Error making directory $tmp_dir"; } @@ -31,7 +32,7 @@ die "Error getting list of sph files"; } -open(WAVLIST, "<", "$tmp_dir/sph.list") or die "cannot open wav list"; +open(WAVLIST, "<$tmp_dir/sph.list") or die "cannot open wav list"; while() { chomp; @@ -42,7 +43,6 @@ $wav{$uttId} = $sph; } - while () { $line = $_ ; $ci = ; @@ -93,7 +93,6 @@ } } - close(WAV) || die; close(SPKR) || die; close(GNDR) || die; diff --git a/egs/callhome_diarization/v1/local/make_swbd2_phase3.pl 
b/egs/callhome_diarization/v1/local/make_swbd2_phase3.pl index f9c82b7f7d9..f27853415a0 100755 --- a/egs/callhome_diarization/v1/local/make_swbd2_phase3.pl +++ b/egs/callhome_diarization/v1/local/make_swbd2_phase3.pl @@ -1,4 +1,5 @@ #!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter # # Copyright 2013 Daniel Povey # Apache 2.0 @@ -21,7 +22,7 @@ @badAudio = ("3", "4"); -$tmp_dir = "$out_base/tmp"; +$tmp_dir = "$out_dir/tmp"; if (system("mkdir -p $tmp_dir") != 0) { die "Error making directory $tmp_dir"; } @@ -30,7 +31,7 @@ die "Error getting list of sph files"; } -open(WAVLIST, "<", "$tmp_dir/sph.list") or die "cannot open wav list"; +open(WAVLIST, "<$tmp_dir/sph.list") or die "cannot open wav list"; while() { chomp; $sph = $_; @@ -40,7 +41,6 @@ $wav{$uttId} = $sph; } - while () { $line = $_ ; @A = split(",", $line); @@ -88,7 +88,6 @@ } } - close(WAV) || die; close(SPKR) || die; close(GNDR) || die; diff --git a/egs/callhome_diarization/v1/local/make_swbd_cellular1.pl b/egs/callhome_diarization/v1/local/make_swbd_cellular1.pl index a2478c10bfe..e30c710e6fa 100755 --- a/egs/callhome_diarization/v1/local/make_swbd_cellular1.pl +++ b/egs/callhome_diarization/v1/local/make_swbd_cellular1.pl @@ -1,4 +1,5 @@ #!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter # # Copyright 2013 Daniel Povey # Apache 2.0 @@ -68,7 +69,6 @@ } } - close(WAV) || die; close(SPKR) || die; close(GNDR) || die; diff --git a/egs/callhome_diarization/v1/local/make_swbd_cellular2.pl b/egs/callhome_diarization/v1/local/make_swbd_cellular2.pl index 5c936ae61db..4de954c194c 100755 --- a/egs/callhome_diarization/v1/local/make_swbd_cellular2.pl +++ b/egs/callhome_diarization/v1/local/make_swbd_cellular2.pl @@ -1,4 +1,5 @@ #!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter # # Copyright 2013 Daniel Povey # Apache 2.0 @@ -68,7 +69,6 @@ } } - close(WAV) || die; close(SPKR) || die; close(GNDR) || die; diff --git a/egs/callhome_diarization/v1/local/nnet3/xvector/prepare_feats.sh b/egs/callhome_diarization/v1/local/nnet3/xvector/prepare_feats.sh new file mode 100755 index 00000000000..62879623df4 --- /dev/null +++ b/egs/callhome_diarization/v1/local/nnet3/xvector/prepare_feats.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# +# Apache 2.0. + +# This script applies sliding window CMVN and writes the features to disk. +# +# Although this kind of script isn't necessary in speaker recognition recipes, +# it can be helpful in the diarization recipes. The script +# diarization/nnet3/xvector/extract_xvectors.sh extracts x-vectors from very +# short (e.g., 1-2 seconds) segments. Therefore, in order to apply the sliding +# window CMVN in a meaningful way, it must be performed prior to performing +# the subsegmentation. + +nj=40 +cmd="run.pl" +stage=0 +norm_vars=false +center=true +compress=true +cmn_window=300 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features" + echo "Options: " + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --norm-vars # If true, normalize variances in the sliding window cmvn" + exit 1; +fi + +data_in=$1 +data_out=$2 +dir=$3 + +name=`basename $data_in` + +for f in $data_in/feats.scp ; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +# Set various variables. 
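+# (As a rough guide: with the usual 10 ms frame shift, cmn_window=300 frames is
+# about 3 seconds of context for the sliding-window CMVN applied below, which
+# is why it is computed on the full recordings here rather than on the short
+# 1-2 second subsegments used later for x-vector extraction.)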
+mkdir -p $dir/log +mkdir -p $data_out +featdir=$(utils/make_absolute.sh $dir) + +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/xvector_cmvn_feats/storage $featdir/storage +fi + +for n in $(seq $nj); do + # the next command does nothing unless $featdir/storage/ exists, see + # utils/create_data_link.pl for more info. + utils/create_data_link.pl $featdir/xvector_cmvn_feats_${name}.${n}.ark +done + +cp $data_in/utt2spk $data_out/utt2spk +cp $data_in/spk2utt $data_out/spk2utt +cp $data_in/wav.scp $data_out/wav.scp + +write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB" + +sdata_in=$data_in/split$nj; +utils/split_data.sh $data_in $nj || exit 1; + +$cmd JOB=1:$nj $dir/log/create_xvector_cmvn_feats_${name}.JOB.log \ + apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=$cmn_window \ + scp:${sdata_in}/JOB/feats.scp ark:- \| \ + copy-feats --compress=$compress $write_num_frames_opt ark:- \ + ark,scp:$featdir/xvector_cmvn_feats_${name}.JOB.ark,$featdir/xvector_cmvn_feats_${name}.JOB.scp || exit 1; + +for n in $(seq $nj); do + cat $featdir/xvector_cmvn_feats_${name}.$n.scp || exit 1; +done > ${data_out}/feats.scp || exit 1 + +for n in $(seq $nj); do + cat $featdir/log/utt2num_frames.$n || exit 1; +done > $data_out/utt2num_frames || exit 1 +rm $featdir/log/utt2num_frames.* + +echo "$0: Succeeded creating xvector features for $name" diff --git a/egs/callhome_diarization/v1/local/nnet3/xvector/prepare_feats_for_egs.sh b/egs/callhome_diarization/v1/local/nnet3/xvector/prepare_feats_for_egs.sh new file mode 100755 index 00000000000..dcdbe1b1593 --- /dev/null +++ b/egs/callhome_diarization/v1/local/nnet3/xvector/prepare_feats_for_egs.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# +# Apache 2.0. + +# This script applies sliding window CMVN and removes silence frames. This +# is performed on the raw features prior to generating examples for training +# the x-vector system. Once the training examples are generated, the features +# created by this script can be removed. + +nj=40 +cmd="run.pl" +stage=0 +norm_vars=false +center=true +compress=true +cmn_window=300 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features" + echo "Options: " + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --norm-vars # If true, normalize variances in the sliding window cmvn" + exit 1; +fi + +data_in=$1 +data_out=$2 +dir=$3 + +name=`basename $data_in` + +for f in $data_in/feats.scp $data_in/vad.scp ; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +# Set various variables. +mkdir -p $dir/log +mkdir -p $data_out +featdir=$(utils/make_absolute.sh $dir) + +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/xvector_feats/storage $featdir/storage +fi + +for n in $(seq $nj); do + # the next command does nothing unless $featdir/storage/ exists, see + # utils/create_data_link.pl for more info. 
+ utils/create_data_link.pl $featdir/xvector_feats_${name}.${n}.ark +done + +cp $data_in/utt2spk $data_out/utt2spk +cp $data_in/spk2utt $data_out/spk2utt +cp $data_in/wav.scp $data_out/wav.scp + +write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB" + +sdata_in=$data_in/split$nj; +utils/split_data.sh $data_in $nj || exit 1; + +$cmd JOB=1:$nj $dir/log/create_xvector_feats_${name}.JOB.log \ + apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=$cmn_window \ + scp:${sdata_in}/JOB/feats.scp ark:- \| \ + select-voiced-frames ark:- scp,s,cs:${sdata_in}/JOB/vad.scp ark:- \| \ + copy-feats --compress=$compress $write_num_frames_opt ark:- \ + ark,scp:$featdir/xvector_feats_${name}.JOB.ark,$featdir/xvector_feats_${name}.JOB.scp || exit 1; + +for n in $(seq $nj); do + cat $featdir/xvector_feats_${name}.$n.scp || exit 1; +done > ${data_out}/feats.scp || exit 1 + +for n in $(seq $nj); do + cat $featdir/log/utt2num_frames.$n || exit 1; +done > $data_out/utt2num_frames || exit 1 +rm $featdir/log/utt2num_frames.* + +echo "$0: Succeeded creating xvector features for $name" diff --git a/egs/callhome_diarization/v1/local/nnet3/xvector/run_xvector.sh b/egs/callhome_diarization/v1/local/nnet3/xvector/run_xvector.sh new file mode 120000 index 00000000000..585b63fd2dd --- /dev/null +++ b/egs/callhome_diarization/v1/local/nnet3/xvector/run_xvector.sh @@ -0,0 +1 @@ +tuning/run_xvector_1a.sh \ No newline at end of file diff --git a/egs/callhome_diarization/v1/local/nnet3/xvector/tuning/run_xvector_1a.sh b/egs/callhome_diarization/v1/local/nnet3/xvector/tuning/run_xvector_1a.sh new file mode 100755 index 00000000000..4fdf0cfbad6 --- /dev/null +++ b/egs/callhome_diarization/v1/local/nnet3/xvector/tuning/run_xvector_1a.sh @@ -0,0 +1,149 @@ +#!/bin/bash +# Copyright 2018 David Snyder +# 2018 Johns Hopkins University (Author: Daniel Garcia-Romero) +# 2018 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0. + +# This script trains the x-vector DNN. The recipe is similar to the one +# described in "Diarization is Hard: Some Experiences and Lessons Learned +# for the JHU Team in the Inaugural DIHARD Challenge" by Sell et al. + +. ./cmd.sh +set -e + +stage=1 +train_stage=-1 +use_gpu=true +remove_egs=false + +data=data/train +nnet_dir=exp/xvector_nnet_1a/ +egs_dir=exp/xvector_nnet_1a/egs + +. ./path.sh +. ./cmd.sh +. ./utils/parse_options.sh + +num_pdfs=$(awk '{print $2}' $data/utt2spk | sort | uniq -c | wc -l) + +# Now we create the nnet examples using sid/nnet3/xvector/get_egs.sh. +# The argument --num-repeats is related to the number of times a speaker +# repeats per archive. If it seems like you're getting too many archives +# (e.g., more than 200) try increasing the --frames-per-iter option. The +# arguments --min-frames-per-chunk and --max-frames-per-chunk specify the +# minimum and maximum length (in terms of number of frames) of the features +# in the examples. +# +# To make sense of the egs script, it may be necessary to put an "exit 1" +# command immediately after stage 3. Then, inspect +# exp//egs/temp/ranges.* . The ranges files specify the examples that +# will be created, and which archives they will be stored in. Each line of +# ranges.* has the following form: +# +# For example: +# 100304-f-sre2006-kacg-A 1 2 4079 881 23 + +# If you're satisfied with the number of archives (e.g., 50-150 archives is +# reasonable) and with the number of examples per speaker (e.g., 1000-5000 +# is reasonable) then you can let the script continue to the later stages. 
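+# As a rough sanity check only (the exact bookkeeping is done inside
+# sid/nnet3/xvector/get_egs.sh), the number of archives scales roughly with
+# total_frames * num_repeats / frames_per_iter; the total frame count of the
+# training data can be obtained with, e.g.,
+#   awk '{n += $2} END {print n}' $data/utt2num_frames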
+# Otherwise, try increasing or decreasing the --num-repeats option. You might +# need to fiddle with --frames-per-iter. Increasing this value decreases the +# the number of archives and increases the number of examples per archive. +# Decreasing this value increases the number of archives, while decreasing the +# number of examples per archive. +if [ $stage -le 4 ]; then + echo "$0: Getting neural network training egs"; + # dump egs. + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b{03,04,05,06}/$USER/kaldi-data/egs/callhome_diarization/v2/xvector-$(date +'%m_%d_%H_%M')/$egs_dir/storage $egs_dir/storage + fi + sid/nnet3/xvector/get_egs.sh --cmd "$train_cmd" \ + --nj 8 \ + --stage 0 \ + --frames-per-iter 1000000000 \ + --frames-per-iter-diagnostic 500000 \ + --min-frames-per-chunk 200 \ + --max-frames-per-chunk 400 \ + --num-diagnostic-archives 3 \ + --num-repeats 40 \ + "$data" $egs_dir +fi + +if [ $stage -le 5 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(wc -w $egs_dir/pdf2num | awk '{print $1}') + feat_dim=$(cat $egs_dir/info/feat_dim) + + # This chunk-size corresponds to the maximum number of frames the + # stats layer is able to pool over. In this script, it corresponds + # to 4 seconds. If the input recording is greater than 4 seconds, + # we will compute multiple xvectors from the same recording and average + # to produce the final xvector. + max_chunk_size=400 + + # The smallest number of frames we're comfortable computing an xvector from. + # Note that the hard minimum is given by the left and right context of the + # frame-level layers. + min_chunk_size=20 + mkdir -p $nnet_dir/configs + cat < $nnet_dir/configs/network.xconfig + # please note that it is important to have input layer with the name=input + + # The frame-level layers + input dim=${feat_dim} name=input + relu-batchnorm-layer name=tdnn1 input=Append(-2,-1,0,1,2) dim=512 + relu-batchnorm-layer name=tdnn2 input=Append(-2,0,2) dim=512 + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn4 dim=512 + relu-batchnorm-layer name=tdnn5 dim=1500 + + # The stats pooling layer. Layers after this are segment-level. + # In the config below, the first and last argument (0, and ${max_chunk_size}) + # means that we pool over an input segment starting at frame 0 + # and ending at frame ${max_chunk_size} or earlier. The other arguments (1:1) + # mean that no subsampling is performed. + stats-layer name=stats config=mean+stddev(0:1:1:${max_chunk_size}) + + # This is where we usually extract the embedding (aka xvector) from. 
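+  # The stats layer output is the mean and standard deviation of tdnn5
+  # concatenated, i.e. 2 x 1500 = 3000 dimensions; tdnn6 projects this down to
+  # the 128-dimensional embedding. The extract.config written below reads the
+  # embedding from tdnn6.affine, i.e. before the ReLU and batchnorm of tdnn6.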
+ relu-batchnorm-layer name=tdnn6 dim=128 input=stats + output-layer name=output include-log-softmax=true dim=${num_targets} +EOF + + steps/nnet3/xconfig_to_configs.py \ + --xconfig-file $nnet_dir/configs/network.xconfig \ + --config-dir $nnet_dir/configs/ + cp $nnet_dir/configs/final.config $nnet_dir/nnet.config + + # These three files will be used by sid/nnet3/xvector/extract_xvectors.sh + echo "output-node name=output input=tdnn6.affine" > $nnet_dir/extract.config + echo "$max_chunk_size" > $nnet_dir/max_chunk_size + echo "$min_chunk_size" > $nnet_dir/min_chunk_size +fi + +dropout_schedule='0,0@0.20,0.1@0.50,0' +srand=123 +if [ $stage -le 6 ]; then + steps/nnet3/train_raw_dnn.py --stage=$train_stage \ + --cmd="$train_cmd" \ + --trainer.optimization.proportional-shrink 10 \ + --trainer.optimization.momentum=0.5 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=8 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.minibatch-size=64 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2 \ + --trainer.num-epochs=3 \ + --trainer.dropout-schedule="$dropout_schedule" \ + --trainer.shuffle-buffer-size=1000 \ + --egs.frames-per-eg=1 \ + --egs.dir="$egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval=10 \ + --use-gpu=true \ + --dir=$nnet_dir || exit 1; +fi + +exit 0; diff --git a/egs/callhome_diarization/v2/cmd.sh b/egs/callhome_diarization/v2/cmd.sh new file mode 100644 index 00000000000..88231ea3798 --- /dev/null +++ b/egs/callhome_diarization/v2/cmd.sh @@ -0,0 +1,13 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl" diff --git a/egs/callhome_diarization/v2/conf/mfcc.conf b/egs/callhome_diarization/v2/conf/mfcc.conf new file mode 100644 index 00000000000..d32a2217988 --- /dev/null +++ b/egs/callhome_diarization/v2/conf/mfcc.conf @@ -0,0 +1,6 @@ +--sample-frequency=8000 +--frame-length=25 # the default is 25 +--low-freq=20 # the default. +--high-freq=3700 # the default is zero meaning use the Nyquist (4k in this case). +--num-ceps=23 # higher than the default which is 12. 
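+# --snip-edges=false below makes the frame count approximately the file length
+# divided by the frame shift (rather than dropping partial frames at the edges);
+# this is the usual choice when frame indices must line up with externally
+# defined segment boundaries, as in the diarization steps of this recipe.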
+--snip-edges=false diff --git a/egs/callhome_diarization/v2/conf/vad.conf b/egs/callhome_diarization/v2/conf/vad.conf new file mode 100644 index 00000000000..c9f5e8b3072 --- /dev/null +++ b/egs/callhome_diarization/v2/conf/vad.conf @@ -0,0 +1,4 @@ +--vad-energy-threshold=5.5 +--vad-energy-mean-scale=0.5 +--vad-proportion-threshold=0.12 +--vad-frames-context=2 diff --git a/egs/callhome_diarization/v2/diarization b/egs/callhome_diarization/v2/diarization new file mode 120000 index 00000000000..1aac1562a8a --- /dev/null +++ b/egs/callhome_diarization/v2/diarization @@ -0,0 +1 @@ +../v1/diarization/ \ No newline at end of file diff --git a/egs/callhome_diarization/v2/local b/egs/callhome_diarization/v2/local new file mode 120000 index 00000000000..740b697d6fd --- /dev/null +++ b/egs/callhome_diarization/v2/local @@ -0,0 +1 @@ +../v1/local/ \ No newline at end of file diff --git a/egs/callhome_diarization/v2/path.sh b/egs/callhome_diarization/v2/path.sh new file mode 100755 index 00000000000..851c14e27c3 --- /dev/null +++ b/egs/callhome_diarization/v2/path.sh @@ -0,0 +1,5 @@ +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/callhome_diarization/v2/run.sh b/egs/callhome_diarization/v2/run.sh new file mode 100755 index 00000000000..4f730d4753c --- /dev/null +++ b/egs/callhome_diarization/v2/run.sh @@ -0,0 +1,358 @@ +#!/bin/bash +# Copyright 2017-2018 David Snyder +# 2017-2018 Matthew Maciejewski +# +# Apache 2.0. +# +# This recipe demonstrates the use of x-vectors for speaker diarization. +# The scripts are based on the recipe in ../v1/run.sh, but clusters x-vectors +# instead of i-vectors. It is similar to the x-vector-based diarization system +# described in "Diarization is Hard: Some Experiences and Lessons Learned for +# the JHU Team in the Inaugural DIHARD Challenge" by Sell et al. The main +# difference is that we haven't implemented the VB resegmentation yet. + +. ./cmd.sh +. ./path.sh +set -e +mfccdir=`pwd`/mfcc +vaddir=`pwd`/mfcc +data_root=/export/corpora5/LDC +stage=0 +nnet_dir=exp/xvector_nnet_1a/ + +# Prepare datasets +if [ $stage -le 0 ]; then + # Prepare a collection of NIST SRE data. This will be used to train, + # x-vector DNN and PLDA model. + local/make_sre.sh $data_root data + + # Prepare SWB for x-vector DNN training. + local/make_swbd2_phase1.pl /export/corpora/LDC/LDC98S75 \ + data/swbd2_phase1_train + local/make_swbd2_phase2.pl $data_root/LDC99S79 \ + data/swbd2_phase2_train + local/make_swbd2_phase3.pl $data_root/LDC2002S06 \ + data/swbd2_phase3_train + local/make_swbd_cellular1.pl $data_root/LDC2001S13 \ + data/swbd_cellular1_train + local/make_swbd_cellular2.pl $data_root/LDC2004S07 \ + data/swbd_cellular2_train + + # Prepare the Callhome portion of NIST SRE 2000. + local/make_callhome.sh /export/corpora/NIST/LDC2001S97/ data/ + + utils/combine_data.sh data/train \ + data/swbd_cellular1_train data/swbd_cellular2_train \ + data/swbd2_phase1_train \ + data/swbd2_phase2_train data/swbd2_phase3_train data/sre +fi + +# Prepare features +if [ $stage -le 1 ]; then + # The script local/make_callhome.sh splits callhome into two parts, called + # callhome1 and callhome2. 
Each partition is treated like a held-out + # dataset, and used to estimate various quantities needed to perform + # diarization on the other part (and vice versa). + for name in train callhome1 callhome2; do + steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 \ + --cmd "$train_cmd" --write-utt2num-frames true \ + data/$name exp/make_mfcc $mfccdir + utils/fix_data_dir.sh data/$name + done + + for name in train callhome1 callhome2; do + sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \ + data/$name exp/make_vad $vaddir + utils/fix_data_dir.sh data/$name + done + + # The sre dataset is a subset of train + cp data/train/{feats,vad}.scp data/sre/ + utils/fix_data_dir.sh data/sre + + # This writes features to disk after applying the sliding window CMN. + # Although this is somewhat wasteful in terms of disk space, for diarization + # it ends up being preferable to performing the CMN in memory. If the CMN + # were performed in memory (e.g., we used --apply-cmn true in + # diarization/nnet3/xvector/extract_xvectors.sh) it would need to be + # performed after the subsegmentation, which leads to poorer results. + for name in sre callhome1 callhome2; do + local/nnet3/xvector/prepare_feats.sh --nj 40 --cmd "$train_cmd" \ + data/$name data/${name}_cmn exp/${name}_cmn + cp data/$name/vad.scp data/${name}_cmn/ + if [ -f data/$name/segments ]; then + cp data/$name/segments data/${name}_cmn/ + fi + utils/fix_data_dir.sh data/${name}_cmn + done + + echo "0.01" > data/sre_cmn/frame_shift + # Create segments to extract x-vectors from for PLDA training data. + # The segments are created using an energy-based speech activity + # detection (SAD) system, but this is not necessary. You can replace + # this with segments computed from your favorite SAD. + diarization/vad_to_segments.sh --nj 40 --cmd "$train_cmd" \ + data/sre_cmn data/sre_cmn_segmented +fi + +# In this section, we augment the training data with reverberation, +# noise, music, and babble, and combined it with the clean data. +# The combined list will be used to train the xvector DNN. The SRE +# subset will be used to train the PLDA model. +if [ $stage -le 2 ]; then + frame_shift=0.01 + awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' data/train/utt2num_frames > data/train/reco2dur + if [ ! -d "RIRS_NOISES" ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + # Make a version with reverberated speech + rvb_opts=() + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + + # Make a reverberated version of the SWBD+SRE list. Note that we don't add any + # additive noise here. + python steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --speech-rvb-probability 1 \ + --pointsource-noise-addition-probability 0 \ + --isotropic-noise-addition-probability 0 \ + --num-replications 1 \ + --source-sampling-rate 8000 \ + data/train data/train_reverb + cp data/train/vad.scp data/train_reverb/ + utils/copy_data_dir.sh --utt-suffix "-reverb" data/train_reverb data/train_reverb.new + rm -rf data/train_reverb + mv data/train_reverb.new data/train_reverb + + # Prepare the MUSAN corpus, which consists of music, speech, and noise + # suitable for augmentation. 
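+  # local/make_musan.sh (see ../v1/local) builds data/musan and then the
+  # data/musan_{music,speech,noise} subsets used below; the wav.scp entries it
+  # writes resample everything to 8 kHz via a sox pipe.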
+  local/make_musan.sh /export/corpora/JHU/musan data
+
+  # Get the duration of the MUSAN recordings. This will be used by the
+  # script augment_data_dir.py.
+  for name in speech noise music; do
+    utils/data/get_utt2dur.sh data/musan_${name}
+    mv data/musan_${name}/utt2dur data/musan_${name}/reco2dur
+  done
+
+  # Augment with musan_noise
+  python steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" data/train data/train_noise
+  # Augment with musan_music
+  python steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/train data/train_music
+  # Augment with musan_speech
+  python steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/train data/train_babble
+
+  # Combine reverb, noise, music, and babble into one directory.
+  utils/combine_data.sh data/train_aug data/train_reverb data/train_noise data/train_music data/train_babble
+
+  # Take a random subset of the augmentations (128k is somewhat larger than twice
+  # the size of the SWBD+SRE list)
+  utils/subset_data_dir.sh data/train_aug 128000 data/train_aug_128k
+  utils/fix_data_dir.sh data/train_aug_128k
+
+  # Make MFCCs for the augmented data. Note that we do not compute a new
+  # vad.scp file here. Instead, we use the vad.scp from the clean version of
+  # the list.
+  steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \
+    data/train_aug_128k exp/make_mfcc $mfccdir
+
+  # Combine the clean and augmented SWBD+SRE list. This is now roughly
+  # double the size of the original clean list.
+  utils/combine_data.sh data/train_combined data/train_aug_128k data/train
+fi
+
+# Now we prepare the features to generate examples for xvector training.
+if [ $stage -le 3 ]; then
+  # This script applies CMN and removes nonspeech frames. Note that this is somewhat
+  # wasteful, as it roughly doubles the amount of training data on disk. After
+  # creating training examples, this can be removed.
+  local/nnet3/xvector/prepare_feats_for_egs.sh --nj 40 --cmd "$train_cmd" \
+    data/train_combined data/train_combined_cmn_no_sil exp/train_combined_cmn_no_sil
+  utils/fix_data_dir.sh data/train_combined_cmn_no_sil
+
+  # Now, we need to remove features that are too short after removing silence
+  # frames. We want at least 5s (500 frames) per utterance.
+  min_len=500
+  mv data/train_combined_cmn_no_sil/utt2num_frames data/train_combined_cmn_no_sil/utt2num_frames.bak
+  awk -v min_len=${min_len} '$2 > min_len {print $1, $2}' data/train_combined_cmn_no_sil/utt2num_frames.bak > data/train_combined_cmn_no_sil/utt2num_frames
+  utils/filter_scp.pl data/train_combined_cmn_no_sil/utt2num_frames data/train_combined_cmn_no_sil/utt2spk > data/train_combined_cmn_no_sil/utt2spk.new
+  mv data/train_combined_cmn_no_sil/utt2spk.new data/train_combined_cmn_no_sil/utt2spk
+  utils/fix_data_dir.sh data/train_combined_cmn_no_sil
+
+  # We also want several utterances per speaker. Now we'll throw out speakers
+  # with fewer than 8 utterances.
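+  # (spk2utt lines have the form "<spk-id> <utt-id1> <utt-id2> ...", so the
+  # NF-1 in the awk command below is the utterance count for each speaker.)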
+ min_num_utts=8 + awk '{print $1, NF-1}' data/train_combined_cmn_no_sil/spk2utt > data/train_combined_cmn_no_sil/spk2num + awk -v min_num_utts=${min_num_utts} '$2 >= min_num_utts {print $1, $2}' \ + data/train_combined_cmn_no_sil/spk2num | utils/filter_scp.pl - data/train_combined_cmn_no_sil/spk2utt \ + > data/train_combined_cmn_no_sil/spk2utt.new + mv data/train_combined_cmn_no_sil/spk2utt.new data/train_combined_cmn_no_sil/spk2utt + utils/spk2utt_to_utt2spk.pl data/train_combined_cmn_no_sil/spk2utt > data/train_combined_cmn_no_sil/utt2spk + + utils/filter_scp.pl data/train_combined_cmn_no_sil/utt2spk data/train_combined_cmn_no_sil/utt2num_frames > data/train_combined_cmn_no_sil/utt2num_frames.new + mv data/train_combined_cmn_no_sil/utt2num_frames.new data/train_combined_cmn_no_sil/utt2num_frames + + # Now we're ready to create training examples. + utils/fix_data_dir.sh data/train_combined_cmn_no_sil +fi + +local/nnet3/xvector/tuning/run_xvector_1a.sh --stage $stage --train-stage -1 \ + --data data/train_combined_cmn_no_sil --nnet-dir $nnet_dir \ + --egs-dir $nnet_dir/egs + +# Extract x-vectors +if [ $stage -le 7 ]; then + # Extract x-vectors for the two partitions of callhome. + diarization/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 5G" \ + --nj 40 --window 1.5 --period 0.75 --apply-cmn false \ + --min-segment 0.5 $nnet_dir \ + data/callhome1_cmn $nnet_dir/xvectors_callhome1 + + diarization/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 5G" \ + --nj 40 --window 1.5 --period 0.75 --apply-cmn false \ + --min-segment 0.5 $nnet_dir \ + data/callhome2_cmn $nnet_dir/xvectors_callhome2 + + # Reduce the amount of training data for the PLDA, + utils/subset_data_dir.sh data/sre_cmn_segmented 128000 data/sre_cmn_segmented_128k + # Extract x-vectors for the SRE, which is our PLDA training + # data. A long period is used here so that we don't compute too + # many x-vectors for each recording. + diarization/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 10G" \ + --nj 40 --window 3.0 --period 10.0 --min-segment 1.5 --apply-cmn false \ + --hard-min true $nnet_dir \ + data/sre_cmn_segmented_128k $nnet_dir/xvectors_sre_segmented_128k +fi + +# Train PLDA models +if [ $stage -le 8 ]; then + # Train a PLDA model on SRE, using callhome1 to whiten. + # We will later use this to score x-vectors in callhome2. + "$train_cmd" $nnet_dir/xvectors_callhome1/log/plda.log \ + ivector-compute-plda ark:$nnet_dir/xvectors_sre_segmented_128k/spk2utt \ + "ark:ivector-subtract-global-mean \ + scp:$nnet_dir/xvectors_sre_segmented_128k/xvector.scp ark:- \ + | transform-vec $nnet_dir/xvectors_callhome1/transform.mat ark:- ark:- \ + | ivector-normalize-length ark:- ark:- |" \ + $nnet_dir/xvectors_callhome1/plda || exit 1; + + # Train a PLDA model on SRE, using callhome2 to whiten. + # We will later use this to score x-vectors in callhome1. + "$train_cmd" $nnet_dir/xvectors_callhome2/log/plda.log \ + ivector-compute-plda ark:$nnet_dir/xvectors_sre_segmented_128k/spk2utt \ + "ark:ivector-subtract-global-mean \ + scp:$nnet_dir/xvectors_sre_segmented_128k/xvector.scp ark:- \ + | transform-vec $nnet_dir/xvectors_callhome2/transform.mat ark:- ark:- \ + | ivector-normalize-length ark:- ark:- |" \ + $nnet_dir/xvectors_callhome2/plda || exit 1; +fi + +# Perform PLDA scoring +if [ $stage -le 9 ]; then + # Perform PLDA scoring on all pairs of segments for each recording. 
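+  # (The result is, per recording, a set of pairwise PLDA log-likelihood-ratio
+  # scores between the x-vectors of its sub-segments; the clustering stage
+  # below uses these scores as its similarity measure.)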
+ # The first directory contains the PLDA model that used callhome2 + # to perform whitening (recall that we're treating callhome2 as a + # held-out dataset). The second directory contains the x-vectors + # for callhome1. + diarization/nnet3/xvector/score_plda.sh --cmd "$train_cmd --mem 4G" \ + --nj 20 $nnet_dir/xvectors_callhome2 $nnet_dir/xvectors_callhome1 \ + $nnet_dir/xvectors_callhome1/plda_scores + + # Do the same thing for callhome2. + diarization/nnet3/xvector/score_plda.sh --cmd "$train_cmd --mem 4G" \ + --nj 20 $nnet_dir/xvectors_callhome1 $nnet_dir/xvectors_callhome2 \ + $nnet_dir/xvectors_callhome2/plda_scores +fi + +# Cluster the PLDA scores using a stopping threshold. +if [ $stage -le 10 ]; then + # First, we find the threshold that minimizes the DER on each partition of + # callhome. + mkdir -p $nnet_dir/tuning + for dataset in callhome1 callhome2; do + echo "Tuning clustering threshold for $dataset" + best_der=100 + best_threshold=0 + utils/filter_scp.pl -f 2 data/$dataset/wav.scp \ + data/callhome/fullref.rttm > data/$dataset/ref.rttm + + # The threshold is in terms of the log likelihood ratio provided by the + # PLDA scores. In a perfectly calibrated system, the threshold is 0. + # In the following loop, we evaluate the clustering on a heldout dataset + # (callhome1 is heldout for callhome2 and vice-versa) using some reasonable + # thresholds for a well-calibrated system. + for threshold in -0.3 -0.2 -0.1 -0.05 0 0.05 0.1 0.2 0.3; do + diarization/cluster.sh --cmd "$train_cmd --mem 4G" --nj 20 \ + --threshold $threshold $nnet_dir/xvectors_$dataset/plda_scores \ + $nnet_dir/xvectors_$dataset/plda_scores_t$threshold + + md-eval.pl -1 -c 0.25 -r data/$dataset/ref.rttm \ + -s $nnet_dir/xvectors_$dataset/plda_scores_t$threshold/rttm \ + 2> $nnet_dir/tuning/${dataset}_t${threshold}.log \ + > $nnet_dir/tuning/${dataset}_t${threshold} + + der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ + $nnet_dir/tuning/${dataset}_t${threshold}) + if [ $(echo $der'<'$best_der | bc -l) -eq 1 ]; then + best_der=$der + best_threshold=$threshold + fi + done + echo "$best_threshold" > $nnet_dir/tuning/${dataset}_best + done + + # Cluster callhome1 using the best threshold found for callhome2. This way, + # callhome2 is treated as a held-out dataset to discover a reasonable + # stopping threshold for callhome1. + diarization/cluster.sh --cmd "$train_cmd --mem 4G" --nj 20 \ + --threshold $(cat $nnet_dir/tuning/callhome2_best) \ + $nnet_dir/xvectors_callhome1/plda_scores $nnet_dir/xvectors_callhome1/plda_scores + + # Do the same thing for callhome2, treating callhome1 as a held-out dataset + # to discover a stopping threshold. + diarization/cluster.sh --cmd "$train_cmd --mem 4G" --nj 20 \ + --threshold $(cat $nnet_dir/tuning/callhome1_best) \ + $nnet_dir/xvectors_callhome2/plda_scores $nnet_dir/xvectors_callhome2/plda_scores + + mkdir -p $nnet_dir/results + # Now combine the results for callhome1 and callhome2 and evaluate it + # together. + cat $nnet_dir/xvectors_callhome1/plda_scores/rttm \ + $nnet_dir/xvectors_callhome2/plda_scores/rttm | md-eval.pl -1 -c 0.25 -r \ + data/callhome/fullref.rttm -s - 2> $nnet_dir/results/threshold.log \ + > $nnet_dir/results/DER_threshold.txt + der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' 
\ + $nnet_dir/results/DER_threshold.txt) + # Using supervised calibration, DER: 8.39% + # Compare to 10.36% in ../v1/run.sh + echo "Using supervised calibration, DER: $der%" +fi + +# Cluster the PLDA scores using the oracle number of speakers +if [ $stage -le 11 ]; then + # In this section, we show how to do the clustering if the number of speakers + # (and therefore, the number of clusters) per recording is known in advance. + diarization/cluster.sh --cmd "$train_cmd --mem 4G" \ + --reco2num-spk data/callhome1/reco2num_spk \ + $nnet_dir/xvectors_callhome1/plda_scores $nnet_dir/xvectors_callhome1/plda_scores_num_spk + + diarization/cluster.sh --cmd "$train_cmd --mem 4G" \ + --reco2num-spk data/callhome2/reco2num_spk \ + $nnet_dir/xvectors_callhome2/plda_scores $nnet_dir/xvectors_callhome2/plda_scores_num_spk + + mkdir -p $nnet_dir/results + # Now combine the results for callhome1 and callhome2 and evaluate it together. + cat $nnet_dir/xvectors_callhome1/plda_scores_num_spk/rttm \ + $nnet_dir/xvectors_callhome2/plda_scores_num_spk/rttm \ + | md-eval.pl -1 -c 0.25 -r data/callhome/fullref.rttm -s - 2> $nnet_dir/results/num_spk.log \ + > $nnet_dir/results/DER_num_spk.txt + der=$(grep -oP 'DIARIZATION\ ERROR\ =\ \K[0-9]+([.][0-9]+)?' \ + $nnet_dir/results/DER_num_spk.txt) + # Using the oracle number of speakers, DER: 7.12% + # Compare to 8.69% in ../v1/run.sh + echo "Using the oracle number of speakers, DER: $der%" +fi diff --git a/egs/callhome_diarization/v2/sid b/egs/callhome_diarization/v2/sid new file mode 120000 index 00000000000..5cb0274b7d6 --- /dev/null +++ b/egs/callhome_diarization/v2/sid @@ -0,0 +1 @@ +../../sre08/v1/sid/ \ No newline at end of file diff --git a/egs/callhome_diarization/v2/steps b/egs/callhome_diarization/v2/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/callhome_diarization/v2/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/callhome_diarization/v2/utils b/egs/callhome_diarization/v2/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/callhome_diarization/v2/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/chime5/s5/RESULTS b/egs/chime5/s5/RESULTS index 941b63ece52..0dcea1f0031 100644 --- a/egs/chime5/s5/RESULTS +++ b/egs/chime5/s5/RESULTS @@ -11,3 +11,23 @@ %WER 47.91 [ 28212 / 58881, 2843 ins, 8957 del, 16412 sub ] exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_dev_worn/wer_9_0.0 %WER 81.28 [ 47859 / 58881, 4210 ins, 27511 del, 16138 sub ] exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_dev_beamformit_ref/wer_9_0.5 +# result with the challenge submission format (July 9, 2018) +# before the fix of speaker ID across arrays +session S02 room DINING: #words 8288, #errors 6593, wer 79.54 % +session S02 room KITCHEN: #words 12696, #errors 11096, wer 87.39 % +session S02 room LIVING: #words 15460, #errors 12219, wer 79.03 % +session S09 room DINING: #words 5766, #errors 4651, wer 80.66 % +session S09 room KITCHEN: #words 8911, #errors 7277, wer 81.66 % +session S09 room LIVING: #words 7760, #errors 6023, wer 77.61 % +overall: #words 58881, #errors 47859, wer 81.28 % + +# result with the challenge submission format (July 9, 2018) +# after the fix of speaker ID across arrays +==== development set ==== +session S02 room DINING: #words 8288, #errors 6556, wer 79.10 % +session S02 room KITCHEN: #words 12696, #errors 11096, wer 87.39 % +session S02 room LIVING: #words 15460, #errors 12182, wer 78.79 % +session S09 room DINING: 
#words 5766, #errors 4648, wer 80.61 % +session S09 room KITCHEN: #words 8911, #errors 7277, wer 81.66 % +session S09 room LIVING: #words 7760, #errors 6022, wer 77.60 % +overall: #words 58881, #errors 47781, wer 81.14 % diff --git a/egs/chime5/s5/local/prepare_data.sh b/egs/chime5/s5/local/prepare_data.sh index a037f371e34..98087322c38 100755 --- a/egs/chime5/s5/local/prepare_data.sh +++ b/egs/chime5/s5/local/prepare_data.sh @@ -26,8 +26,8 @@ adir=$1 jdir=$2 dir=$3 -json_count=$(find $jdir -name "*.json" | wc -l) -wav_count=$(find $adir -name "*.wav" | wc -l) +json_count=$(find -L $jdir -name "*.json" | wc -l) +wav_count=$(find -L $adir -name "*.wav" | wc -l) if [ "$json_count" -eq 0 ]; then echo >&2 "We expect that the directory $jdir will contain json files." @@ -56,7 +56,7 @@ if [ $mictype == "worn" ]; then # convert the filenames to wav.scp format, use the basename of the file # as a the wav.scp key, add .L and .R for left and right channel # i.e. each file will have two entries (left and right channel) - find $adir -name "S[0-9]*_P[0-9]*.wav" | \ + find -L $adir -name "S[0-9]*_P[0-9]*.wav" | \ perl -ne '{ chomp; $path = $_; @@ -81,7 +81,7 @@ elif [ $mictype == "ref" ]; then # first get a text, which will be used to extract reference arrays perl -ne 's/-/.ENH-/;print;' $dir/text.orig | sort > $dir/text - find $adir | grep "\.wav" | sort > $dir/wav.flist + find -L $adir | grep "\.wav" | sort > $dir/wav.flist # following command provide the argument for grep to extract only reference arrays grep `cut -f 1 -d"-" $dir/text | awk -F"_" '{print $2 "_" $3}' | sed -e "s/\.ENH//" | sort | uniq | sed -e "s/^/ -e /" | tr "\n" " "` $dir/wav.flist > $dir/wav.flist2 paste -d" " \ @@ -91,7 +91,7 @@ else # array mic case # convert the filenames to wav.scp format, use the basename of the file # as a the wav.scp key - find $adir -name "*.wav" -ipath "*${mictype}*" |\ + find -L $adir -name "*.wav" -ipath "*${mictype}*" |\ perl -ne '$p=$_;chomp $_;@F=split "/";$F[$#F]=~s/\.wav//;print "$F[$#F] $p";' |\ sort -u > $dir/wav.scp diff --git a/egs/chime5/s5/local/run_beamformit.sh b/egs/chime5/s5/local/run_beamformit.sh index 176fd108d5d..aa3badd90d8 100755 --- a/egs/chime5/s5/local/run_beamformit.sh +++ b/egs/chime5/s5/local/run_beamformit.sh @@ -45,7 +45,7 @@ echo "the number of channels: $numch" # wavfiles.list can be used as the name of the output files output_wavfiles=$expdir/wavfiles.list -find ${sdir} | grep -i ${array} | awk -F "/" '{print $NF}' | sed -e "s/\.CH.\.wav//" | sort | uniq > $expdir/wavfiles.list +find -L ${sdir} | grep -i ${array} | awk -F "/" '{print $NF}' | sed -e "s/\.CH.\.wav//" | sort | uniq > $expdir/wavfiles.list # this is an input file list of the microphones # format: 1st_wav 2nd_wav ... nth_wav diff --git a/egs/chime5/s5/local/run_recog.sh b/egs/chime5/s5/local/run_recog.sh new file mode 100755 index 00000000000..5c74c9ff242 --- /dev/null +++ b/egs/chime5/s5/local/run_recog.sh @@ -0,0 +1,164 @@ +#!/bin/bash +# +# Based mostly on the TED-LIUM and Switchboard recipe +# +# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe and Yenda Trmal) +# Apache 2.0 +# +# This is a subset of run.sh to only perform recognition experiments with evaluation data + +# Begin configuration section. +decode_nj=20 +stage=0 +enhancement=beamformit # for a new enhancement method, + # change this variable and stage 4 +# End configuration section +. ./utils/parse_options.sh + +. ./cmd.sh +. 
./path.sh + + +set -e # exit on error + +# chime5 main directory path +# please change the path accordingly +chime5_corpus=/export/corpora4/CHiME5 +json_dir=${chime5_corpus}/transcriptions +audio_dir=${chime5_corpus}/audio + +# training and test data +train_set=train_worn_u100k +test_sets="eval_${enhancement}_ref" + +# This script also needs the phonetisaurus g2p, srilm, beamformit +./local/check_tools.sh || exit 1 + +if [ $stage -le 4 ]; then + # Beamforming using reference arrays + # enhanced WAV directory + enhandir=enhan + for dset in eval; do + for mictype in u01 u02 u03 u04 u05 u06; do + local/run_beamformit.sh --cmd "$train_cmd" \ + ${audio_dir}/${dset} \ + ${enhandir}/${dset}_${enhancement}_${mictype} \ + ${mictype} + done + done + + for dset in eval; do + local/prepare_data.sh --mictype ref "$PWD/${enhandir}/${dset}_${enhancement}_u0*" \ + ${json_dir}/${dset} data/${dset}_${enhancement}_ref + done +fi + +if [ $stage -le 6 ]; then + # fix speaker ID issue (thanks to Dr. Naoyuki Kanda) + # add array ID to the speaker ID to avoid the use of other array information to meet regulations + # Before this fix + # $ head -n 2 data/eval_beamformit_ref_nosplit/utt2spk + # P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01 + # P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01 + # After this fix + # $ head -n 2 data/eval_beamformit_ref_nosplit_fix/utt2spk + # P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01_U02 + # P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01_U02 + for dset in ${test_sets}; do + utils/copy_data_dir.sh data/${dset} data/${dset}_nosplit + mkdir -p data/${dset}_nosplit_fix + cp data/${dset}_nosplit/{segments,text,wav.scp} data/${dset}_nosplit_fix/ + awk -F "_" '{print $0 "_" $3}' data/${dset}_nosplit/utt2spk > data/${dset}_nosplit_fix/utt2spk + utils/utt2spk_to_spk2utt.pl data/${dset}_nosplit_fix/utt2spk > data/${dset}_nosplit_fix/spk2utt + done + + # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and + # lets us use more jobs for decoding etc. + for dset in ${test_sets}; do + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_nosplit_fix data/${dset} + done +fi + +if [ $stage -le 7 ]; then + # Now make MFCC features. + # mfccdir should be some place with a largish disk where you + # want to store MFCC features. + mfccdir=mfcc + for x in ${test_sets}; do + steps/make_mfcc.sh --nj 20 --cmd "$train_cmd" \ + data/$x exp/make_mfcc/$x $mfccdir + steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir + utils/fix_data_dir.sh data/$x + done +fi + +if [ $stage -le 17 ]; then + nnet3_affix=_${train_set}_cleaned + for datadir in ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + for datadir in ${test_sets}; do + steps/make_mfcc.sh --nj 20 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires || exit 1; + done + for data in $test_sets; do + steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 20 \ + data/${data}_hires exp/nnet3${nnet3_affix}/extractor \ + exp/nnet3${nnet3_affix}/ivectors_${data}_hires + done +fi + +if [ $stage -le 18 ]; then + # First the options that are passed through to run_ivector_common.sh + # (some of which are also used in this script directly). + lm_suffix= + + # The rest are configs specific to this script. Most of the parameters + # are just hardcoded at this level, in the commands below. 
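+  # They have to point at the chain model already trained by run.sh (via
+  # local/chain/run_tdnn.sh), since this script only runs decoding with that
+  # existing model.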
+ affix=1a # affix for the TDNN directory name + tree_affix= + tree_dir=exp/chain${nnet3_affix}/tree_sp${tree_affix:+_$tree_affix} + dir=exp/chain${nnet3_affix}/tdnn${affix}_sp + + # training options + # training chunk-options + chunk_width=140,100,160 + # we don't need extra left/right context for TDNN systems. + chunk_left_context=0 + chunk_right_context=0 + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang${lm_suffix}/ \ + $tree_dir $tree_dir/graph${lm_suffix} || exit 1; + + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + steps/nnet3/decode.sh \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj 8 --cmd "$decode_cmd" --num-threads 4 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \ + $tree_dir/graph${lm_suffix} data/${data}_hires ${dir}/decode${lm_suffix}_${data} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +if [ $stage -le 20 ]; then + # final scoring to get the official challenge result + # please specify both dev and eval set directories so that the search parameters + # (insertion penalty and language model weight) will be tuned using the dev set + local/score_for_submit.sh \ + --dev exp/chain_${train_set}_cleaned/tdnn1a_sp/decode_dev_${enhancement}_ref \ + --eval exp/chain_${train_set}_cleaned/tdnn1a_sp/decode_eval_${enhancement}_ref +fi diff --git a/egs/chime5/s5/local/score_for_submit.sh b/egs/chime5/s5/local/score_for_submit.sh new file mode 100755 index 00000000000..5502c5994e5 --- /dev/null +++ b/egs/chime5/s5/local/score_for_submit.sh @@ -0,0 +1,119 @@ +#!/bin/bash +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey, Yenda Trmal) +# Apache 2.0 +# +# This script provides official CHiME-5 challenge submission scores per room and session. +# It first calculates the best search parameter configurations by using the dev set +# and also create the transcriptions for dev and eval sets to be submitted. +# The default setup does not calculate scores of the evaluation set since +# the evaluation transcription is not distributed (July 9 2018) + +cmd=run.pl +dev=exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_dev_beamformit_ref +eval=exp/chain_train_worn_u100k_cleaned/tdnn1a_sp/decode_eval_beamformit_ref +do_eval=false + +echo "$0 $@" # Print the command line for logging +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 0 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)]" + echo "This script provides official CHiME-5 challenge submission scores" + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." 
+ echo " --dev # dev set decoding directory" + echo " --eval # eval set decoding directory" + exit 1; +fi + +# get language model weight and word insertion penalty from the dev set +best_lmwt=`cat $dev/scoring_kaldi/wer_details/lmwt` +best_wip=`cat $dev/scoring_kaldi/wer_details/wip` + +echo "best LM weight: $best_lmwt" +echo "insertion penalty weight: $best_wip" + +echo "==== development set ====" +# development set +# get the scoring result per utterance +score_result=$dev/scoring_kaldi/wer_details/per_utt +for session in S02 S09; do + for room in DINING KITCHEN LIVING; do + # get nerror + nerr=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$4+$5+$6} END {print sum}'` + # get nwords from references (NF-2 means to exclude utterance id and " ref ") + nwrd=`grep " ref " $score_result | grep $room | grep $session | sed -e "s/\*//g" | awk '{sum+=NF-2} END {print sum}'` + # compute wer with scale=2 + wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` + + # report the results + echo -n "session $session " + echo -n "room $room: " + echo -n "#words $nwrd, " + echo -n "#errors $nerr, " + echo "wer $wer %" + done +done +echo -n "overall: " +# get nerror +nerr=`grep "\#csid" $score_result | awk '{sum+=$4+$5+$6} END {print sum}'` +# get nwords from references (NF-2 means to exclude utterance id and " ref ") +nwrd=`grep " ref " $score_result | sed -e "s/\*//g" | awk '{sum+=NF-2} END {print sum}'` +# compute wer with scale=2 +wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` +echo -n "#words $nwrd, " +echo -n "#errors $nerr, " +echo "wer $wer %" + +echo "==== evaluation set ====" +# evaluation set +# get the scoring result per utterance. Copied from local/score.sh +mkdir -p $eval/scoring_kaldi/wer_details_devbest +$cmd $eval/scoring_kaldi/log/stats1.log \ + cat $eval/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \ + align-text --special-symbol="'***'" ark:$eval/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \ + utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \> $eval/scoring_kaldi/wer_details_devbest/per_utt +score_result=$eval/scoring_kaldi/wer_details_devbest/per_utt +for session in S01 S21; do + for room in DINING KITCHEN LIVING; do + if $do_eval; then + # get nerror + nerr=`grep "\#csid" $score_result | grep $room | grep $session | awk '{sum+=$4+$5+$6} END {print sum}'` + # get nwords from references (NF-2 means to exclude utterance id and " ref ") + nwrd=`grep " ref " $score_result | grep $room | grep $session | sed -e "s/\*//g" | awk '{sum+=NF-2} END {print sum}'` + # compute wer with scale=2 + wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` + + # report the results + echo -n "session $session " + echo -n "room $room: " + echo -n "#words $nwrd, " + echo -n "#errors $nerr, " + echo "wer $wer %" + fi + done +done +if $do_eval; then + # get nerror + nerr=`grep "\#csid" $score_result | awk '{sum+=$4+$5+$6} END {print sum}'` + # get nwords from references (NF-2 means to exclude utterance id and " ref ") + nwrd=`grep " ref " $score_result | sed -e "s/\*//g" | awk '{sum+=NF-2} END {print sum}'` + # compute wer with scale=2 + wer=`echo "scale=2; 100 * $nerr / $nwrd" | bc` + echo -n "overall: " + echo -n "#words $nwrd, " + echo -n "#errors $nerr, " + echo "wer $wer %" +else + echo "skip evaluation scoring" + echo "" + echo "==== when you submit your result to the CHiME-5 challenge ====" + echo "Please rename your recognition results of " + echo "$dev/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt" + echo "$eval/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt" + echo "with 
{dev,eval}__.txt, e.g., dev_watanabe_jhu.txt and eval_watanabe_jhu.txt, " + echo "and submit both of them as your final challenge result" + echo "==================================================================" +fi + diff --git a/egs/chime5/s5/run.sh b/egs/chime5/s5/run.sh index c63249b086b..024c0190b3e 100755 --- a/egs/chime5/s5/run.sh +++ b/egs/chime5/s5/run.sh @@ -29,9 +29,7 @@ audio_dir=${chime5_corpus}/audio # training and test data train_set=train_worn_u100k -test_sets="dev_worn dev_${enhancement}_ref" -# use the below once you obtain the evaluation data. Also remove the comment #eval# in the lines below -#eval#test_sets="dev_worn dev_${enhancement}_ref eval_${enhancement}_ref" +test_sets="dev_worn dev_${enhancement}_ref eval_${enhancement}_ref" # This script also needs the phonetisaurus g2p, srilm, beamformit ./local/check_tools.sh || exit 1 @@ -42,7 +40,6 @@ if [ $stage -le 1 ]; then local/prepare_data.sh --mictype ${mictype} \ ${audio_dir}/train ${json_dir}/train data/train_${mictype} done - #eval#for dataset in dev eval; do for dataset in dev; do for mictype in worn; do local/prepare_data.sh --mictype ${mictype} \ @@ -76,8 +73,7 @@ if [ $stage -le 4 ]; then # Beamforming using reference arrays # enhanced WAV directory enhandir=enhan - #eval#for dset in dev eval; do - for dset in dev; do + for dset in dev eval; do for mictype in u01 u02 u03 u04 u05 u06; do local/run_beamformit.sh --cmd "$train_cmd" \ ${audio_dir}/${dset} \ @@ -86,8 +82,7 @@ if [ $stage -le 4 ]; then done done - #eval#for dset in dev eval; do - for dset in dev; do + for dset in dev eval; do local/prepare_data.sh --mictype ref "$PWD/${enhandir}/${dset}_${enhancement}_u0*" \ ${json_dir}/${dset} data/${dset}_${enhancement}_ref done @@ -109,7 +104,6 @@ if [ $stage -le 5 ]; then # only use left channel for worn mic recognition # you can use both left and right channels for training - #eval#for dset in train dev eval; do for dset in train dev; do utils/copy_data_dir.sh data/${dset}_worn data/${dset}_worn_stereo grep "\.L-" data/${dset}_worn_stereo/text > data/${dset}_worn/text @@ -118,12 +112,33 @@ if [ $stage -le 5 ]; then fi if [ $stage -le 6 ]; then + # fix speaker ID issue (thanks to Dr. Naoyuki Kanda) + # add array ID to the speaker ID to avoid the use of other array information to meet regulations + # Before this fix + # $ head -n 2 data/eval_beamformit_ref_nosplit/utt2spk + # P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01 + # P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01 + # After this fix + # $ head -n 2 data/eval_beamformit_ref_nosplit_fix/utt2spk + # P01_S01_U02_KITCHEN.ENH-0000192-0001278 P01_U02 + # P01_S01_U02_KITCHEN.ENH-0001421-0001481 P01_U02 + for dset in dev_${enhancement}_ref eval_${enhancement}_ref; do + utils/copy_data_dir.sh data/${dset} data/${dset}_nosplit + mkdir -p data/${dset}_nosplit_fix + cp data/${dset}_nosplit/{segments,text,wav.scp} data/${dset}_nosplit_fix/ + awk -F "_" '{print $0 "_" $3}' data/${dset}_nosplit/utt2spk > data/${dset}_nosplit_fix/utt2spk + utils/utt2spk_to_spk2utt.pl data/${dset}_nosplit_fix/utt2spk > data/${dset}_nosplit_fix/spk2utt + done + # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and # lets us use more jobs for decoding etc. 
- for dset in ${train_set} ${test_sets}; do + for dset in ${train_set} dev_worn; do utils/copy_data_dir.sh data/${dset} data/${dset}_nosplit utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_nosplit data/${dset} done + for dset in dev_${enhancement}_ref eval_${enhancement}_ref; do + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}_nosplit_fix data/${dset} + done fi if [ $stage -le 7 ]; then @@ -204,3 +219,12 @@ if [ $stage -le 17 ]; then # chain TDNN local/chain/run_tdnn.sh --nj ${nj} --train-set ${train_set}_cleaned --test-sets "$test_sets" --gmm tri3_cleaned --nnet3-affix _${train_set}_cleaned fi + +if [ $stage -le 18 ]; then + # final scoring to get the official challenge result + # please specify both dev and eval set directories so that the search parameters + # (insertion penalty and language model weight) will be tuned using the dev set + local/score_for_submit.sh \ + --dev exp/chain_${train_set}_cleaned/tdnn1a_sp/decode_dev_${enhancement}_ref \ + --eval exp/chain_${train_set}_cleaned/tdnn1a_sp/decode_eval_${enhancement}_ref +fi diff --git a/egs/cifar/v1/image/fix_data_dir.sh b/egs/cifar/v1/image/fix_data_dir.sh new file mode 100755 index 00000000000..b85623b6e85 --- /dev/null +++ b/egs/cifar/v1/image/fix_data_dir.sh @@ -0,0 +1,186 @@ +#!/bin/bash + +# This script makes sure that only the segments present in +# all of "feats.scp", "images.scp" [if present], segments [if present] +# text, and utt2spk are present in any of them. +# It puts the original contents of data-dir into +# data-dir/.backup + +utt_extra_files= +spk_extra_files= + +. utils/parse_options.sh + +if [ $# != 1 ]; then + echo "Usage: utils/data/fix_data_dir.sh " + echo "e.g.: utils/data/fix_data_dir.sh data/train" + echo "This script helps ensure that the various files in a data directory" + echo "are correctly sorted and filtered, for example removing utterances" + echo "that have no features (if feats.scp is present)" + exit 1 +fi + +data=$1 +mkdir -p $data/.backup + +[ ! -d $data ] && echo "$0: no such directory $data" && exit 1; + +[ ! -f $data/utt2spk ] && echo "$0: no such file $data/utt2spk" && exit 1; + +set -e -o pipefail -u + +tmpdir=$(mktemp -d /tmp/kaldi.XXXX); +trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM + +export LC_ALL=C + +function check_sorted { + file=$1 + sort -k1,1 -u <$file >$file.tmp + if ! cmp -s $file $file.tmp; then + echo "$0: file $1 is not in sorted order or not unique, sorting it" + mv $file.tmp $file + else + rm $file.tmp + fi +} + +for x in utt2spk spk2utt feats.scp text segments images.scp cmvn.scp vad.scp \ + reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur utt2num_frames; do + if [ -f $data/$x ]; then + cp $data/$x $data/.backup/$x + check_sorted $data/$x + fi +done + + +function filter_file { + filter=$1 + file_to_filter=$2 + cp $file_to_filter ${file_to_filter}.tmp + utils/filter_scp.pl $filter ${file_to_filter}.tmp > $file_to_filter + if ! cmp ${file_to_filter}.tmp $file_to_filter >&/dev/null; then + length1=$(cat ${file_to_filter}.tmp | wc -l) + length2=$(cat ${file_to_filter} | wc -l) + if [ $length1 -ne $length2 ]; then + echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter." + fi + fi + rm $file_to_filter.tmp +} + +function filter_recordings { + # We call this once before the stage when we filter on utterance-id, and once + # after. 
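+  # (Filtering on utterance-id can leave recordings in images.scp with no
+  # remaining utterances in segments, so the recording-level files have to be
+  # re-filtered afterwards.)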
+ + if [ -f $data/segments ]; then + # We have a segments file -> we need to filter this and the file images.scp, and + # reco2file_and_utt, if it exists, to make sure they have the same list of + # recording-ids. + + if [ ! -f $data/images.scp ]; then + echo "$0: $data/segments exists but not $data/images.scp" + exit 1; + fi + awk '{print $2}' < $data/segments | sort | uniq > $tmpdir/recordings + n1=$(cat $tmpdir/recordings | wc -l) + [ ! -s $tmpdir/recordings ] && \ + echo "Empty list of recordings (bad file $data/segments)?" && exit 1; + utils/filter_scp.pl $data/images.scp $tmpdir/recordings > $tmpdir/recordings.tmp + mv $tmpdir/recordings.tmp $tmpdir/recordings + + + cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments + filter_file $tmpdir/recordings $data/segments + cp $data/segments{,.tmp}; awk '{print $2, $1, $3, $4}' <$data/segments.tmp >$data/segments + rm $data/segments.tmp + + filter_file $tmpdir/recordings $data/images.scp + [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel + true + fi +} + +function filter_speakers { + # throughout this program, we regard utt2spk as primary and spk2utt as derived, so... + utils/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt + + cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers + for s in cmvn.scp spk2gender; do + f=$data/$s + if [ -f $f ]; then + filter_file $f $tmpdir/speakers + fi + done + + filter_file $tmpdir/speakers $data/spk2utt + utils/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk + + for s in cmvn.scp spk2gender $spk_extra_files; do + f=$data/$s + if [ -f $f ]; then + filter_file $tmpdir/speakers $f + fi + done +} + +function filter_utts { + cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts + + ! cat $data/utt2spk | sort | cmp - $data/utt2spk && \ + echo "utt2spk is not in sorted order (fix this yourself)" && exit 1; + + ! cat $data/utt2spk | sort -k2 | cmp - $data/utt2spk && \ + echo "utt2spk is not in sorted order when sorted first on speaker-id " && \ + echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; + + ! cat $data/spk2utt | sort | cmp - $data/spk2utt && \ + echo "spk2utt is not in sorted order (fix this yourself)" && exit 1; + + if [ -f $data/utt2uniq ]; then + ! cat $data/utt2uniq | sort | cmp - $data/utt2uniq && \ + echo "utt2uniq is not in sorted order (fix this yourself)" && exit 1; + fi + + maybe_image= + [ ! -f $data/segments ] && maybe_image=images.scp # images indexed by utts only if segments does not exist. + for x in feats.scp text segments utt2lang $maybe_image; do + if [ -f $data/$x ]; then + utils/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp + mv $tmpdir/utts.tmp $tmpdir/utts + fi + done + [ ! -s $tmpdir/utts ] && echo "fix_data_dir.sh: no utterances remained: not proceeding further." && \ + rm $tmpdir/utts && exit 1; + + + if [ -f $data/utt2spk ]; then + new_nutts=$(cat $tmpdir/utts | wc -l) + old_nutts=$(cat $data/utt2spk | wc -l) + if [ $new_nutts -ne $old_nutts ]; then + echo "fix_data_dir.sh: kept $new_nutts utterances out of $old_nutts" + else + echo "fix_data_dir.sh: kept all $old_nutts utterances." + fi + fi + + for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2dur utt2num_frames $maybe_image $utt_extra_files; do + if [ -f $data/$x ]; then + cp $data/$x $data/.backup/$x + if ! 
cmp -s $data/$x <( utils/filter_scp.pl $tmpdir/utts $data/$x ) ; then + utils/filter_scp.pl $tmpdir/utts $data/.backup/$x > $data/$x + fi + fi + done + +} + +filter_recordings +filter_speakers +filter_utts +filter_speakers +filter_recordings + +utils/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt + +echo "fix_data_dir.sh: old files are kept in $data/.backup" diff --git a/egs/cifar/v1/image/get_image2num_frames.py b/egs/cifar/v1/image/get_image2num_frames.py index 3c003bb9947..a965be9e1cc 100755 --- a/egs/cifar/v1/image/get_image2num_frames.py +++ b/egs/cifar/v1/image/get_image2num_frames.py @@ -14,7 +14,7 @@ import os import sys import numpy as np -from scipy import misc +from PIL import Image parser = argparse.ArgumentParser(description="""Computes the image lengths (i.e. width) in an image data dir and writes them (by default) to image2num_frames.""") @@ -33,8 +33,7 @@ def get_scaled_image_length(im): scale_size = args.feat_dim - sx = im.shape[1] - sy = im.shape[0] + sx, sy = im.size scale = (1.0 * scale_size) / sy nx = int(scale * sx) return nx @@ -55,7 +54,7 @@ def get_scaled_image_length(im): line_vect = line.split(' ') image_id = line_vect[0] image_path = line_vect[1] - im = misc.imread(image_path) + im = Image.open(image_path) im_len = get_scaled_image_length(im) + (args.padding * 2) print('{} {}'.format(image_id, im_len), file=out_fh) diff --git a/egs/cifar/v1/image/validate_data_dir.sh b/egs/cifar/v1/image/validate_data_dir.sh new file mode 100755 index 00000000000..e4db9c2c92c --- /dev/null +++ b/egs/cifar/v1/image/validate_data_dir.sh @@ -0,0 +1,340 @@ +#!/bin/bash + + +no_feats=false +no_image=false +no_text=false +no_spk_sort=false + +for x in `seq 4`; do + if [ "$1" == "--no-feats" ]; then + no_feats=true + shift; + fi + if [ "$1" == "--no-text" ]; then + no_text=true + shift; + fi + if [ "$1" == "--no-image" ]; then + no_image=true + shift; + fi + if [ "$1" == "--no-spk-sort" ]; then + no_spk_sort=true + shift; + fi +done + +if [ $# -ne 1 ]; then + echo "Usage: $0 [--no-feats] [--no-text] [--no-image] [--no-spk-sort] " + echo "The --no-xxx options mean that the script does not require " + echo "xxx.scp to be present, but it will check it if it is present." + echo "--no-spk-sort means that the script does not require the utt2spk to be " + echo "sorted by the speaker-id in addition to being sorted by utterance-id." + echo "By default, utt2spk is expected to be sorted by both, which can be " + echo "achieved by making the speaker-id prefixes of the utterance-ids" + echo "e.g.: $0 data/train" + exit 1; +fi + +data=$1 + +if [ ! -d $data ]; then + echo "$0: no such directory $data" + exit 1; +fi + +for f in spk2utt utt2spk; do + if [ ! -f $data/$f ]; then + echo "$0: no such file $f" + exit 1; + fi + if [ ! -s $data/$f ]; then + echo "$0: empty file $f" + exit 1; + fi +done + +! cat $data/utt2spk | awk '{if (NF != 2) exit(1); }' && \ + echo "$0: $data/utt2spk has wrong format." && exit; + +ns=$(wc -l < $data/spk2utt) +if [ "$ns" == 1 ]; then + echo "$0: WARNING: you have only one speaker. This probably a bad idea." + echo " Search for the word 'bold' in http://kaldi-asr.org/doc/data_prep.html" + echo " for more information." +fi + + +tmpdir=$(mktemp -d /tmp/kaldi.XXXX); +trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM + +export LC_ALL=C + +function check_sorted_and_uniq { + ! 
awk '{print $1}' $1 | sort | uniq | cmp -s - <(awk '{print $1}' $1) && \ + echo "$0: file $1 is not in sorted order or has duplicates" && exit 1; +} + +function partial_diff { + diff $1 $2 | head -n 6 + echo "..." + diff $1 $2 | tail -n 6 + n1=`cat $1 | wc -l` + n2=`cat $2 | wc -l` + echo "[Lengths are $1=$n1 versus $2=$n2]" +} + +check_sorted_and_uniq $data/utt2spk + +if ! $no_spk_sort; then + ! cat $data/utt2spk | sort -k2 | cmp -s - $data/utt2spk && \ + echo "$0: utt2spk is not in sorted order when sorted first on speaker-id " && \ + echo "(fix this by making speaker-ids prefixes of utt-ids)" && exit 1; +fi + +check_sorted_and_uniq $data/spk2utt + +! cmp -s <(cat $data/utt2spk | awk '{print $1, $2;}') \ + <(utils/spk2utt_to_utt2spk.pl $data/spk2utt) && \ + echo "$0: spk2utt and utt2spk do not seem to match" && exit 1; + +cat $data/utt2spk | awk '{print $1;}' > $tmpdir/utts + +if [ ! -f $data/text ] && ! $no_text; then + echo "$0: no such file $data/text (if this is by design, specify --no-text)" + exit 1; +fi + +num_utts=`cat $tmpdir/utts | wc -l` +if [ -f $data/text ]; then + utils/validate_text.pl $data/text || exit 1; + check_sorted_and_uniq $data/text + text_len=`cat $data/text | wc -l` + illegal_sym_list=" #0" + for x in $illegal_sym_list; do + if grep -w "$x" $data/text > /dev/null; then + echo "$0: Error: in $data, text contains illegal symbol $x" + exit 1; + fi + done + awk '{print $1}' < $data/text > $tmpdir/utts.txt + if ! cmp -s $tmpdir/utts{,.txt}; then + echo "$0: Error: in $data, utterance lists extracted from utt2spk and text" + echo "$0: differ, partial diff is:" + partial_diff $tmpdir/utts{,.txt} + exit 1; + fi +fi + +if [ -f $data/segments ] && [ ! -f $data/images.scp ]; then + echo "$0: in directory $data, segments file exists but no images.scp" + exit 1; +fi + + +if [ ! -f $data/images.scp ] && ! $no_image; then + echo "$0: no such file $data/images.scp (if this is by design, specify --no-image)" + exit 1; +fi + +if [ -f $data/images.scp ]; then + check_sorted_and_uniq $data/images.scp + + if grep -E -q '^\S+\s+~' $data/images.scp; then + # note: it's not a good idea to have any kind of tilde in images.scp, even if + # part of a command, as it would cause compatibility problems if run by + # other users, but this used to be not checked for so we let it slide unless + # it's something of the form "foo ~/foo.wav" (i.e. a plain file name) which + # would definitely cause problems as the fopen system call does not do + # tilde expansion. + echo "$0: Please do not use tilde (~) in your images.scp." + exit 1; + fi + + if [ -f $data/segments ]; then + + check_sorted_and_uniq $data/segments + # We have a segments file -> interpret wav file as "recording-ids" not utterance-ids. + ! cat $data/segments | \ + awk '{if (NF != 4 || $4 <= $3) { print "Bad line in segments file", $0; exit(1); }}' && \ + echo "$0: badly formatted segments file" && exit 1; + + segments_len=`cat $data/segments | wc -l` + if [ -f $data/text ]; then + ! cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \ + echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \ + echo "$0: Lengths are $segments_len vs $num_utts" && \ + exit 1 + fi + + cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings + awk '{print $1}' $data/images.scp > $tmpdir/recordings.wav + if ! 
cmp -s $tmpdir/recordings{,.wav}; then + echo "$0: Error: in $data, recording-ids extracted from segments and images.scp" + echo "$0: differ, partial diff is:" + partial_diff $tmpdir/recordings{,.wav} + exit 1; + fi + if [ -f $data/reco2file_and_channel ]; then + # this file is needed only for ctm scoring; it's indexed by recording-id. + check_sorted_and_uniq $data/reco2file_and_channel + ! cat $data/reco2file_and_channel | \ + awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { + if ( NF == 3 && $3 == "1" ) { + warning_issued = 1; + } else { + print "Bad line ", $0; exit 1; + } + } + } + END { + if (warning_issued == 1) { + print "The channel should be marked as A or B, not 1! You should change it ASAP! " + } + }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; + cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/recordings.r2fc + if ! cmp -s $tmpdir/recordings{,.r2fc}; then + echo "$0: Error: in $data, recording-ids extracted from segments and reco2file_and_channel" + echo "$0: differ, partial diff is:" + partial_diff $tmpdir/recordings{,.r2fc} + exit 1; + fi + fi + else + # No segments file -> assume images.scp indexed by utterance. + cat $data/images.scp | awk '{print $1}' > $tmpdir/utts.wav + if ! cmp -s $tmpdir/utts{,.wav}; then + echo "$0: Error: in $data, utterance lists extracted from utt2spk and images.scp" + echo "$0: differ, partial diff is:" + partial_diff $tmpdir/utts{,.wav} + exit 1; + fi + + if [ -f $data/reco2file_and_channel ]; then + # this file is needed only for ctm scoring; it's indexed by recording-id. + check_sorted_and_uniq $data/reco2file_and_channel + ! cat $data/reco2file_and_channel | \ + awk '{if (NF != 3 || ($3 != "A" && $3 != "B" )) { + if ( NF == 3 && $3 == "1" ) { + warning_issued = 1; + } else { + print "Bad line ", $0; exit 1; + } + } + } + END { + if (warning_issued == 1) { + print "The channel should be marked as A or B, not 1! You should change it ASAP! " + } + }' && echo "$0: badly formatted reco2file_and_channel file" && exit 1; + cat $data/reco2file_and_channel | awk '{print $1}' > $tmpdir/utts.r2fc + if ! cmp -s $tmpdir/utts{,.r2fc}; then + echo "$0: Error: in $data, utterance-ids extracted from segments and reco2file_and_channel" + echo "$0: differ, partial diff is:" + partial_diff $tmpdir/utts{,.r2fc} + exit 1; + fi + fi + fi +fi + +if [ ! -f $data/feats.scp ] && ! $no_feats; then + echo "$0: no such file $data/feats.scp (if this is by design, specify --no-feats)" + exit 1; +fi + +if [ -f $data/feats.scp ]; then + check_sorted_and_uniq $data/feats.scp + cat $data/feats.scp | awk '{print $1}' > $tmpdir/utts.feats + if ! cmp -s $tmpdir/utts{,.feats}; then + echo "$0: Error: in $data, utterance-ids extracted from utt2spk and features" + echo "$0: differ, partial diff is:" + partial_diff $tmpdir/utts{,.feats} + exit 1; + fi +fi + + +if [ -f $data/cmvn.scp ]; then + check_sorted_and_uniq $data/cmvn.scp + cat $data/cmvn.scp | awk '{print $1}' > $tmpdir/speakers.cmvn + cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers + if ! cmp -s $tmpdir/speakers{,.cmvn}; then + echo "$0: Error: in $data, speaker lists extracted from spk2utt and cmvn" + echo "$0: differ, partial diff is:" + partial_diff $tmpdir/speakers{,.cmvn} + exit 1; + fi +fi + +if [ -f $data/spk2gender ]; then + check_sorted_and_uniq $data/spk2gender + ! 
cat $data/spk2gender | awk '{if (!((NF == 2 && ($2 == "m" || $2 == "f")))) exit 1; }' && \ + echo "$0: Mal-formed spk2gender file" && exit 1; + cat $data/spk2gender | awk '{print $1}' > $tmpdir/speakers.spk2gender + cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers + if ! cmp -s $tmpdir/speakers{,.spk2gender}; then + echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2gender" + echo "$0: differ, partial diff is:" + partial_diff $tmpdir/speakers{,.spk2gender} + exit 1; + fi +fi + +if [ -f $data/spk2warp ]; then + check_sorted_and_uniq $data/spk2warp + ! cat $data/spk2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ + echo "$0: Mal-formed spk2warp file" && exit 1; + cat $data/spk2warp | awk '{print $1}' > $tmpdir/speakers.spk2warp + cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers + if ! cmp -s $tmpdir/speakers{,.spk2warp}; then + echo "$0: Error: in $data, speaker lists extracted from spk2utt and spk2warp" + echo "$0: differ, partial diff is:" + partial_diff $tmpdir/speakers{,.spk2warp} + exit 1; + fi +fi + +if [ -f $data/utt2warp ]; then + check_sorted_and_uniq $data/utt2warp + ! cat $data/utt2warp | awk '{if (!((NF == 2 && ($2 > 0.5 && $2 < 1.5)))){ print; exit 1; }}' && \ + echo "$0: Mal-formed utt2warp file" && exit 1; + cat $data/utt2warp | awk '{print $1}' > $tmpdir/utts.utt2warp + cat $data/utt2spk | awk '{print $1}' > $tmpdir/utts + if ! cmp -s $tmpdir/utts{,.utt2warp}; then + echo "$0: Error: in $data, utterance lists extracted from utt2spk and utt2warp" + echo "$0: differ, partial diff is:" + partial_diff $tmpdir/utts{,.utt2warp} + exit 1; + fi +fi + +# check some optionally-required things +for f in vad.scp utt2lang utt2uniq; do + if [ -f $data/$f ]; then + check_sorted_and_uniq $data/$f + if ! cmp -s <( awk '{print $1}' $data/utt2spk ) \ + <( awk '{print $1}' $data/$f ); then + echo "$0: error: in $data, $f and utt2spk do not have identical utterance-id list" + exit 1; + fi + fi +done + + +if [ -f $data/utt2dur ]; then + check_sorted_and_uniq $data/utt2dur + cat $data/utt2dur | awk '{print $1}' > $tmpdir/utts.utt2dur + if ! cmp -s $tmpdir/utts{,.utt2dur}; then + echo "$0: Error: in $data, utterance-ids extracted from utt2spk and utt2dur file" + echo "$0: differ, partial diff is:" + partial_diff $tmpdir/utts{,.utt2dur} + exit 1; + fi + cat $data/utt2dur | \ + awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 +fi + + +echo "$0: Successfully validated data-directory $data" diff --git a/egs/commonvoice/s5/run.sh b/egs/commonvoice/s5/run.sh index 1ceabde1940..3e0e46c89f1 100755 --- a/egs/commonvoice/s5/run.sh +++ b/egs/commonvoice/s5/run.sh @@ -13,6 +13,8 @@ data_url=https://common-voice-data-download.s3.amazonaws.com/cv_corpus_v1.tar.gz stage=0 +. 
./utils/parse_options.sh + set -euo pipefail if [ $stage -le 0 ]; then @@ -144,8 +146,8 @@ if [ $stage -le 7 ]; then utils/mkgraph.sh data/lang_test exp/tri4b exp/tri4b/graph for testset in valid_dev; do steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ - exp/tri4b/graph_nosp_tgsmall data/$testset \ - exp/tri4b/decode_nosp_tgsmall_$testset + exp/tri4b/graph data/$testset \ + exp/tri4b/decode_$testset done )& fi diff --git a/egs/csj/s5/local/csj_make_trans/csj_autorun.sh b/egs/csj/s5/local/csj_make_trans/csj_autorun.sh index 18db2b9f3a8..f288e4fb4d3 100755 --- a/egs/csj/s5/local/csj_make_trans/csj_autorun.sh +++ b/egs/csj/s5/local/csj_make_trans/csj_autorun.sh @@ -21,7 +21,7 @@ set -e # exit on error case "$csjv" in "merl" ) SDB=sdb/ ; WAV=WAV/ ; disc=CSJ2004 ;; # Set SDB directory and WAV directory respectively. "usb" ) SDB=MORPH/SDB/ ; WAV=WAV/ ; disc="core noncore" ;; # Set SDB directory and WAV directory respectively. - "dvd" ) num=dvd ; SDB= ; WAV= ; disc=$num`seq -s " "$num 3 17` ;; # Set preserved format name to $num. + "dvd" ) num=dvd ; SDB= ; WAV= ; disc=$num`seq -s " "$num 3 17| sed "s/ $num$//"` ;; # Set preserved format name to $num. *) echo "Input variable is usb or dvd only. $csjv is UNAVAILABLE VERSION." && exit 1; esac diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh index 9ba7da6e361..e95de232304 100644 --- a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_100k_semisupervised_1a.sh @@ -285,13 +285,9 @@ fi left_context=$model_left_context right_context=$model_right_context -left_context_initial=$model_left_context -right_context_final=$model_right_context egs_left_context=$(perl -e "print int($left_context + $frame_subsampling_factor / 2)") egs_right_context=$(perl -e "print int($right_context + $frame_subsampling_factor / 2)") -egs_left_context_initial=$(perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)") -egs_right_context_final=$(perl -e "print int($right_context_final + $frame_subsampling_factor / 2)") if [ -z "$sup_egs_dir" ]; then sup_egs_dir=$dir/egs_${supervised_set_perturbed} @@ -308,7 +304,6 @@ if [ -z "$sup_egs_dir" ]; then echo "$0: generating egs from the supervised data" steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ --left-context $egs_left_context --right-context $egs_right_context \ - --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ --frame-subsampling-factor $frame_subsampling_factor \ --alignment-subsampling-factor $frame_subsampling_factor \ --frames-per-eg $frames_per_eg \ @@ -349,7 +344,6 @@ if [ -z "$unsup_egs_dir" ]; then --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ --left-tolerance $tolerance --right-tolerance $tolerance \ --left-context $egs_left_context --right-context $egs_right_context \ - --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ --frame-subsampling-factor $frame_subsampling_factor \ --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ diff --git a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh index ad5d2b106b5..2d5b2f8480e 100755 --- 
a/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh +++ b/egs/fisher_english/s5/local/semisup/chain/tuning/run_tdnn_50k_semisupervised_1a.sh @@ -304,13 +304,9 @@ fi left_context=$model_left_context right_context=$model_right_context -left_context_initial=$model_left_context -right_context_final=$model_right_context egs_left_context=$(perl -e "print int($left_context + $frame_subsampling_factor / 2)") egs_right_context=$(perl -e "print int($right_context + $frame_subsampling_factor / 2)") -egs_left_context_initial=$(perl -e "print int($left_context_initial + $frame_subsampling_factor / 2)") -egs_right_context_final=$(perl -e "print int($right_context_final + $frame_subsampling_factor / 2)") if [ -z "$sup_egs_dir" ]; then sup_egs_dir=$dir/egs_${supervised_set_perturbed} @@ -327,7 +323,6 @@ if [ -z "$sup_egs_dir" ]; then echo "$0: generating egs from the supervised data" steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ --left-context $egs_left_context --right-context $egs_right_context \ - --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ --frame-subsampling-factor $frame_subsampling_factor \ --alignment-subsampling-factor $frame_subsampling_factor \ --frames-per-eg $frames_per_eg \ @@ -368,7 +363,6 @@ if [ -z "$unsup_egs_dir" ]; then --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ --left-tolerance $tolerance --right-tolerance $tolerance \ --left-context $egs_left_context --right-context $egs_right_context \ - --left-context-initial $egs_left_context_initial --right-context-final $egs_right_context_final \ --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ --frame-subsampling-factor $frame_subsampling_factor \ --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ diff --git a/egs/heroico/s5/RESULTS b/egs/heroico/s5/RESULTS index 1dd129ed522..9717e95e6e2 100644 --- a/egs/heroico/s5/RESULTS +++ b/egs/heroico/s5/RESULTS @@ -1,27 +1,22 @@ -%WER 75.78 [ 6983 / 9215, 1377 ins, 507 del, 5099 sub ] exp/tri3b/decode_nonnative.si/wer_17_1.0 -%WER 74.25 [ 5680 / 7650, 1187 ins, 431 del, 4062 sub ] exp/tri3b/decode_devtest.si/wer_16_1.0 -%WER 73.85 [ 6805 / 9215, 517 ins, 1653 del, 4635 sub ] exp/chain/tdnn1c_sp/decode_nonnative/wer_7_1.0 -%WER 73.76 [ 12328 / 16713, 2541 ins, 804 del, 8983 sub ] exp/tri3b/decode_test.si/wer_17_1.0 -%WER 73.20 [ 6745 / 9215, 518 ins, 1566 del, 4661 sub ] exp/chain/tdnn1c_sp_online/decode_nonnative/wer_7_1.0 -%WER 71.86 [ 5497 / 7650, 530 ins, 959 del, 4008 sub ] exp/mono/decode_devtest/wer_7_1.0 -%WER 71.64 [ 6602 / 9215, 646 ins, 939 del, 5017 sub ] exp/mono/decode_nonnative/wer_7_1.0 -%WER 71.26 [ 5343 / 7498, 1159 ins, 293 del, 3891 sub ] exp/tri3b/decode_native.si/wer_17_1.0 -%WER 69.84 [ 11673 / 16713, 986 ins, 2523 del, 8164 sub ] exp/chain/tdnn1c_sp/decode_test/wer_7_1.0 -%WER 69.59 [ 11630 / 16713, 1153 ins, 1643 del, 8834 sub ] exp/mono/decode_test/wer_7_1.0 -%WER 69.10 [ 11548 / 16713, 976 ins, 2402 del, 8170 sub ] exp/chain/tdnn1c_sp_online/decode_test/wer_7_1.0 -%WER 67.09 [ 6182 / 9215, 907 ins, 626 del, 4649 sub ] exp/tri1/decode_nonnative/wer_14_1.0 -%WER 66.98 [ 5022 / 7498, 503 ins, 700 del, 3819 sub ] exp/mono/decode_native/wer_7_1.0 -%WER 66.78 [ 6154 / 9215, 1048 ins, 537 del, 4569 sub ] exp/tri2b/decode_nonnative/wer_15_1.0 -%WER 66.64 [ 6141 / 9215, 1226 ins, 425 del, 4490 sub ] exp/tri3b/decode_nonnative/wer_16_1.0 -%WER 66.33 [ 5074 / 7650, 921 ins, 481 del, 3672 sub ] exp/tri1/decode_devtest/wer_11_1.0 -%WER 66.30 [ 5072 / 7650, 1198 
ins, 328 del, 3546 sub ] exp/tri3b/decode_devtest/wer_11_1.0 -%WER 65.88 [ 5040 / 7650, 985 ins, 450 del, 3605 sub ] exp/tri2b/decode_devtest/wer_13_1.0 -%WER 65.05 [ 10872 / 16713, 1725 ins, 959 del, 8188 sub ] exp/tri1/decode_test/wer_13_1.0 -%WER 64.76 [ 4856 / 7498, 461 ins, 862 del, 3533 sub ] exp/chain/tdnn1c_sp/decode_native/wer_7_1.0 -%WER 64.45 [ 10772 / 16713, 2261 ins, 698 del, 7813 sub ] exp/tri3b/decode_test/wer_16_1.0 -%WER 64.33 [ 10751 / 16713, 1955 ins, 845 del, 7951 sub ] exp/tri2b/decode_test/wer_14_1.0 -%WER 64.03 [ 4801 / 7498, 463 ins, 825 del, 3513 sub ] exp/chain/tdnn1c_sp_online/decode_native/wer_7_1.0 -%WER 62.54 [ 4689 / 7498, 781 ins, 379 del, 3529 sub ] exp/tri1/decode_native/wer_13_1.0 -%WER 61.66 [ 4623 / 7498, 1038 ins, 267 del, 3318 sub ] exp/tri3b/decode_native/wer_15_1.0 -%WER 61.28 [ 4595 / 7498, 899 ins, 309 del, 3387 sub ] exp/tri2b/decode_native/wer_13_1.0 -john@A-TEAM19054:~/work/kaldi/egs/heroico/s5$ \ No newline at end of file +# for dir in $(echo exp/tri*/decode* | grep -v 'si/'); do grep WER $dir/wer* | utils/best_wer.sh; done + +%WER 67.01 [ 5126 / 7650, 837 ins, 575 del, 3714 sub ] exp/tri1/decode_devtest/wer_14_1.0 +%WER 62.39 [ 4678 / 7498, 768 ins, 397 del, 3513 sub ] exp/tri1/decode_native/wer_13_1.0 +%WER 67.05 [ 6179 / 9215, 895 ins, 606 del, 4678 sub ] exp/tri1/decode_nonnative/wer_13_1.0 +%WER 64.97 [ 10859 / 16713, 1678 ins, 999 del, 8182 sub ] exp/tri1/decode_test/wer_13_1.0 +%WER 65.90 [ 5041 / 7650, 1016 ins, 416 del, 3609 sub ] exp/tri2b/decode_devtest/wer_12_1.0 +%WER 61.26 [ 4593 / 7498, 908 ins, 300 del, 3385 sub ] exp/tri2b/decode_native/wer_14_1.0 +%WER 67.51 [ 6221 / 9215, 1085 ins, 524 del, 4612 sub ] exp/tri2b/decode_nonnative/wer_14_1.0 +%WER 64.87 [ 10842 / 16713, 2004 ins, 838 del, 8000 sub ] exp/tri2b/decode_test/wer_14_1.0 +%WER 66.09 [ 5056 / 7650, 1078 ins, 402 del, 3576 sub ] exp/tri3b/decode_devtest/wer_16_1.0 +%WER 74.88 [ 5728 / 7650, 1210 ins, 426 del, 4092 sub ] exp/tri3b/decode_devtest.si/wer_15_1.0 +%WER 61.19 [ 4588 / 7498, 1038 ins, 255 del, 3295 sub ] exp/tri3b/decode_native/wer_14_1.0 +%WER 70.99 [ 5323 / 7498, 1185 ins, 301 del, 3837 sub ] exp/tri3b/decode_native.si/wer_16_1.0 +%WER 66.35 [ 6114 / 9215, 1186 ins, 421 del, 4507 sub ] exp/tri3b/decode_nonnative/wer_17_1.0 +%WER 76.36 [ 7037 / 9215, 1420 ins, 467 del, 5150 sub ] exp/tri3b/decode_nonnative.si/wer_16_1.0 +%WER 64.06 [ 10706 / 16713, 2245 ins, 657 del, 7804 sub ] exp/tri3b/decode_test/wer_15_1.0 +%WER 73.97 [ 12362 / 16713, 2608 ins, 766 del, 8988 sub ] exp/tri3b/decode_test.si/wer_16_1.0 +%WER 53.07 [ 4060 / 7650, 744 ins, 376 del, 2940 sub ] exp/chain/tdnn1e_sp/decode_devtest/wer_7_1.0 +%WER 54.47 [ 4084 / 7498, 536 ins, 475 del, 3073 sub ] exp/chain/tdnn1e_sp/decode_native/wer_7_1.0 +%WER 63.01 [ 5806 / 9215, 685 ins, 784 del, 4337 sub ] exp/chain/tdnn1e_sp/decode_nonnative/wer_7_1.0 +%WER 59.25 [ 9903 / 16713, 1226 ins, 1259 del, 7418 sub ] exp/chain/tdnn1e_sp/decode_test/wer_7_1.0 diff --git a/egs/heroico/s5/cmd.sh b/egs/heroico/s5/cmd.sh index 109060ff617..a427f3c16a5 100755 --- a/egs/heroico/s5/cmd.sh +++ b/egs/heroico/s5/cmd.sh @@ -10,8 +10,8 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
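# Note on the change below: utils/retry.pl is (to my understanding) a thin wrapper that
# re-submits the wrapped queue.pl job if it fails, which helps with transient grid errors.
# On a single machine with no grid scheduler, the usual settings would instead be e.g.:
#   export train_cmd=run.pl
#   export decode_cmd=run.pl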
-export train_cmd=queue.pl -export decode_cmd="queue.pl --mem 2G" +export train_cmd="retry.pl queue.pl" +export decode_cmd="retry.pl queue.pl --mem 2G" # queue_conf=$HOME/queue_conf/default.conf # see example /homes/kazi/iveselyk/queue_conf/default.conf, # export train_cmd="queue.pl --config $queue_conf --mem 2G --matylda 0.2" diff --git a/egs/heroico/s5/local/chain/compare_wer.sh b/egs/heroico/s5/local/chain/compare_wer.sh index 9b212cb5fbb..3ee755dee36 100755 --- a/egs/heroico/s5/local/chain/compare_wer.sh +++ b/egs/heroico/s5/local/chain/compare_wer.sh @@ -58,23 +58,17 @@ set_names() { -echo -n "# System " +echo -n "# System " for x in $*; do printf "% 10s" " $(basename $x)"; done echo -strings=( - "#WER devtest " - "#WER native " - nonnative - "#WER test ") +test_sets=(devtest test native nonnative) -for n in 0 1 2 3; do - echo -n "${strings[$n]}" +for t in ${test_sets[@]}; do + printf '# %%WER % 14s ' $t for x in $*; do set_names $x # sets $dirname and $epoch_infix - decode_names=(devtest native nonnative test) - - wer=$(cat $dirname/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + wer=$(cat $dirname/decode_$t/wer_* | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo @@ -82,7 +76,7 @@ for n in 0 1 2 3; do echo -n "# [looped:] " for x in $*; do set_names $x # sets $dirname and $epoch_infix - wer=$(cat $dirname/decode_looped_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + wer=$(cat $dirname/decode_looped_$t/wer_* | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo @@ -91,7 +85,7 @@ for n in 0 1 2 3; do echo -n "# [online:] " for x in $*; do set_names $x # sets $dirname and $epoch_infix - wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + wer=$(cat ${dirname}_online/decode_$t/wer_* | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo @@ -104,14 +98,14 @@ if $used_epochs; then fi -echo -n "# Final train prob " +echo -n "# Final train prob " for x in $*; do prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') printf "% 10s" $prob done echo -echo -n "# Final valid prob " +echo -n "# Final valid prob " for x in $*; do prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') printf "% 10s" $prob @@ -131,3 +125,9 @@ for x in $*; do printf "% 10s" $prob done echo + +echo -n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done +echo diff --git a/egs/heroico/s5/local/chain/run_tdnn.sh b/egs/heroico/s5/local/chain/run_tdnn.sh index 34499362831..61f8f499182 120000 --- a/egs/heroico/s5/local/chain/run_tdnn.sh +++ b/egs/heroico/s5/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1a.sh \ No newline at end of file +tuning/run_tdnn_1b.sh \ No newline at end of file diff --git a/egs/heroico/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/heroico/s5/local/chain/tuning/run_tdnn_1a.sh index 6179957feae..4658f4d3d6d 100755 --- a/egs/heroico/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/heroico/s5/local/chain/tuning/run_tdnn_1a.sh @@ -1,42 +1,20 @@ #!/bin/bash -# 1e -# lower number of epochs to 7 from 10 (avoid overfitting?) 
- -# compare with 1d -# ./local/chain/compare_wer.sh exp/chain/tdnn1d_sp exp/chain/tdnn1e_sp -# System tdnn1d_sp tdnn1e_sp -#WER devtest 52.78 52.21 -#WER native 55.32 53.43 -nonnative 64.35 61.03 -# test 60.28 57.70 -# Final train prob -0.0229 -0.0250 -# Final valid prob -0.0683 -0.0678 -# Final train prob (xent) -0.7525 -0.7887 -# Final valid prob (xent) -1.0296 -1.0419 - -# info -#exp/chain/tdnn1e_sp: -# num-iters=105 -# nj=1..1 -# num-params=6.6M -# dim=40+100->1392 -# combine=-0.036->-0.033 -# xent:train/valid[69,104,final]=(-1.20,-0.917,-0.789/-1.35,-1.16,-1.04) -# logprob:train/valid[69,104,final]=(-0.049,-0.030,-0.025/-0.082,-0.075,-0.068) - -# Word Error Rates on folds -%WER 61.03 [ 5624 / 9215, 630 ins, 727 del, 4267 sub ] exp/chain/tdnn1e_sp/decode_nonnative/wer_8_1.0 -%WER 57.70 [ 9644 / 16713, 1249 ins, 1040 del, 7355 sub ] exp/chain/tdnn1e_sp/decode_test/wer_7_1.0 -%WER 53.43 [ 4006 / 7498, 558 ins, 408 del, 3040 sub ] exp/chain/tdnn1e_sp/decode_native/wer_7_1.0 -%WER 52.21 [ 3994 / 7650, 585 ins, 456 del, 2953 sub ] exp/chain/tdnn1e_sp/decode_devtest/wer_9_1.0 - -# | fold | 1a | 1b | 1c | 1d | 1e | -#| devtest | 54.46 | 54.20 | 54.16 | 52.78 | 52.21 | -#| native | 62.14 | 62.32 | 61.70 | 55.32 | 53.43 | -#| nonnative | 70.58 | 71.20 | 71.68 | 64.35 | 61.03 | -#| test | 66.85 | 67.21 | 67.25 | 60.28 | 57.70 | - -# this script came from the mini librispeech recipe + +# local/chain/compare_wer.sh exp/chain/tdnn1a_sp +# System tdnn1a_sp +# %WER devtest 53.07 +# %WER test 59.25 +# %WER native 54.47 +# %WER nonnative 63.01 +# Final train prob -0.0253 +# Final valid prob -0.0687 +# Final train prob (xent) -0.7715 +# Final valid prob (xent) -1.0719 +# Num-params 6567648 + +# steps/info/chain_dir_info.pl exp/chain/tdnn1a_sp/ +#exp/chain/tdnn1a_sp/: num-iters=105 nj=1..1 num-params=6.6M dim=40+100->1392 combine=-0.040->-0.033 (over 7) xent:train/valid[69,104,final]=(-1.12,-0.880,-0.771/-1.33,-1.21,-1.07) logprob:train/valid[69,104,final]=(-0.050,-0.031,-0.025/-0.079,-0.080,-0.069) + # Set -e here so that we catch if any executable fails immediately set -euo pipefail @@ -51,7 +29,7 @@ nnet3_affix= # The rest are configs specific to this script. Most of the parameters # are just hardcoded at this level, in the commands below. -affix=1e # affix for the TDNN directory name +affix=1a # affix for the TDNN directory name tree_affix= train_stage=-10 get_egs_stage=-10 @@ -313,13 +291,9 @@ if $test_online_decoding && [ $stage -le 17 ]; then # note: we just give it "data/${data}" as it only uses the wav.scp, the # feature type does not matter. steps/online/nnet3/decode.sh \ - --acwt 1.0 \ - --post-decode-acwt 10.0 \ - --nj $nspk \ - --cmd "$decode_cmd" \ - $tree_dir/graph \ - data/${data} \ - ${dir}_online/decode_${data} || exit 1 + --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nspk --cmd "$decode_cmd" \ + $tree_dir/graph data/${data} ${dir}_online/decode_${data} || exit 1 ) || touch $dir/.error & done wait diff --git a/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh new file mode 100755 index 00000000000..33ce1556d29 --- /dev/null +++ b/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh @@ -0,0 +1,302 @@ +#!/bin/bash + +# 1b is as 1a but a re-tuned model with quite a few changes, including moving to +# a resnet-style factored TDNN-F model. 
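+# (Background, roughly summarised: a 'tdnnf-layer' factors each TDNN weight matrix through
+# a low-rank bottleneck whose first factor is kept semi-orthogonal during training, and adds
+# a scaled resnet-style bypass connection.  A typical layer from stage 13 of this script,
+# with $tdnnf_opts expanded, looks like:
+#   tdnnf-layer name=tdnnf2 l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66 \
+#     dim=768 bottleneck-dim=64 time-stride=1
+# )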
+# +# local/chain/compare_wer.sh exp/chain/tdnn1a_sp exp/chain/tdnn1b_sp +# System tdnn1a_sp tdnn1b_sp +# %WER devtest 53.07 52.54 +# %WER test 59.25 53.70 +# %WER native 54.47 48.76 +# %WER nonnative 63.01 57.66 +# Final train prob -0.0253 -0.0547 +# Final valid prob -0.0687 -0.0694 +# Final train prob (xent) -0.7715 -0.9502 +# Final valid prob (xent) -1.0719 -1.0849 +# Num-params 6567648 3321312 + + +# steps/info/chain_dir_info.pl exp/chain/tdnn1b_sp +# exp/chain/tdnn1b_sp: num-iters=34 nj=2..5 num-params=3.3M dim=40+100->1392 combine=-0.059->-0.059 (over 1) xent:train/valid[21,33,final]=(-1.28,-0.986,-0.950/-1.38,-1.10,-1.08) logprob:train/valid[21,33,final]=(-0.085,-0.063,-0.055/-0.090,-0.074,-0.069) + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train +test_sets="native nonnative devtest test" +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1b # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +num_leaves=3500 + +# training options +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +dropout_schedule='0,0@0.20,0.3@0.50,0' +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --cmd "$train_cmd" \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + $num_leaves \ + ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + affine_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.015" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=768 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + ## adding the layers for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=768 small-dim=192 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + # adding the layers for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=768 small-dim=192 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + steps/nnet3/chain/train.py \ + --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=8 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + 
--trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 \ + data/lang_test \ + $tree_dir \ + $tree_dir/graph || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l \ - $tmpdir/heroico/lists/train/$x - done + local/get_wav_list.sh $datadir/data - for x in wav.scp utt2spk text; do - cat $tmpdir/heroico/recordings/devtest/$x \ - | \ - sed -e s/ // \ - | \ - sort \ - -k1,1 \ - -u \ - > \ - $tmpdir/heroico/lists/devtest/$x - done + # make separate lists for heroico answers and recordings + # the transcripts are converted to UTF8 + export LC_ALL=en_US.UTF-8 + cat $answers_transcripts | iconv -f ISO-8859-1 -t UTF-8 | \ + sed -e 's/\r//' | local/heroico_answers_make_lists.pl - utils/fix_data_dir.sh $tmpdir/heroico/lists/train - utils/fix_data_dir.sh $tmpdir/heroico/lists/devtest -fi + utils/fix_data_dir.sh $tmpdir/heroico/answers -if [ $stage -le 1 ]; then - # make separate lists for usma native and nonnative - cat \ - $usma_transcripts \ - | \ - iconv -f ISO-8859-1 -t UTF-8 \ - | \ - sed -e s/ // \ - | \ - local/usma_native_make_lists.pl - - cat \ - $usma_transcripts \ - | \ - iconv -f ISO-8859-1 -t UTF-8 \ - | \ - sed -e s/ // \ - | \ - local/usma_nonnative_make_lists.pl + cat $recordings_transcripts | iconv -f ISO-8859-1 -t UTF-8 | \ + sed -e 's/\r//' | local/heroico_recordings_make_lists.pl - for n in native nonnative; do - mkdir -p $tmpdir/usma/$n/lists + utils/fix_data_dir.sh $tmpdir/heroico/recordings/train + utils/fix_data_dir.sh $tmpdir/heroico/recordings/devtest - for x in wav.scp utt2spk text; do - sort \ - $tmpdir/usma/$n/$x \ - > \ - $tmpdir/usma/$n/lists/$x - done + # consolidate heroico lists + mkdir -p $tmpdir/heroico/lists/train $tmpdir/heroico/lists/devtest - utils/fix_data_dir.sh \ - $tmpdir/usma/$n/lists - done + for x in wav.scp utt2spk text; do + cat $tmpdir/heroico/answers/$x $tmpdir/heroico/recordings/train/$x | \ + sed -e 's/\r//' | sort -k1,1 -u >$tmpdir/heroico/lists/train/$x + done - mkdir -p data/train - mkdir -p $tmpdir/lists/train - mkdir -p data/devtest - mkdir -p $tmpdir/lists/devtest + for x in wav.scp utt2spk text; do + cat $tmpdir/heroico/recordings/devtest/$x | sed -e 's/\r//' | \ + sort -k1,1 -u >$tmpdir/heroico/lists/devtest/$x + done - # get training lists - for x in wav.scp utt2spk text; do - cat \ - $tmpdir/heroico/answers/${x} \ - $tmpdir/heroico/recordings/train/${x} \ - | \ - sed -e s/ // \ - > \ - $tmpdir/lists/train/$x - - sort \ - $tmpdir/lists/train/$x \ - > \ - data/train/$x - done + utils/fix_data_dir.sh $tmpdir/heroico/lists/train + utils/fix_data_dir.sh $tmpdir/heroico/lists/devtest +fi + +if [ $stage -le 1 ]; then + # make separate lists for usma (US military academy) native and nonnative + cat $usma_transcripts | iconv -f 
ISO-8859-1 -t UTF-8 | \ + sed -e 's/\r//' | local/usma_native_make_lists.pl - # get devtest lists + cat $usma_transcripts | iconv -f ISO-8859-1 -t UTF-8 | \ + sed -e 's/\r//' | local/usma_nonnative_make_lists.pl + + for n in native nonnative; do + mkdir -p $tmpdir/usma/$n/lists for x in wav.scp utt2spk text; do - cat \ - $tmpdir/heroico/lists/devtest/$x \ - | \ - sed -e s/ // \ - > \ - $tmpdir/lists/devtest/$x - - sort \ - $tmpdir/lists/devtest/$x \ - > \ - data/devtest/$x + sort $tmpdir/usma/$n/$x >$tmpdir/usma/$n/lists/$x done - utils/utt2spk_to_spk2utt.pl \ - data/train/utt2spk \ - | \ - sort \ - > \ - data/train/spk2utt - - utils/utt2spk_to_spk2utt.pl \ - data/devtest/utt2spk \ - | \ - sort \ - > \ - data/devtest/spk2utt - - utils/fix_data_dir.sh data/train - utils/fix_data_dir.sh data/devtest - -# make testing lists - - mkdir \ - -p data/test data/native data/nonnative $tmpdir/usma/lists - - for x in wav.scp text utt2spk; do - # get testing lists - for n in native nonnative; do - cat \ - $tmpdir/usma/$n/lists/$x \ - >> \ - $tmpdir/usma/lists/$x - done - - cat \ - $tmpdir/usma/lists/$x \ - > \ - data/test/$x - - for n in native nonnative; do - sort \ - $tmpdir/usma/$n/$x \ - > \ - data/$n/$x - done - done + utils/fix_data_dir.sh $tmpdir/usma/$n/lists + done + + mkdir -p data/train $tmpdir/lists/train data/devtest $tmpdir/lists/devtest + + # get training lists + for x in wav.scp utt2spk text; do + cat $tmpdir/heroico/answers/${x} $tmpdir/heroico/recordings/train/${x} | \ + sed -e 's/\r//' >$tmpdir/lists/train/$x + sort $tmpdir/lists/train/$x >data/train/$x + done + + # get devtest lists + for x in wav.scp utt2spk text; do + cat $tmpdir/heroico/lists/devtest/$x | \ + sed -e 's/\r//' >$tmpdir/lists/devtest/$x + sort $tmpdir/lists/devtest/$x >data/devtest/$x + done + + utils/utt2spk_to_spk2utt.pl data/train/utt2spk | sort >data/train/spk2utt + utils/utt2spk_to_spk2utt.pl data/devtest/utt2spk | sort >data/devtest/spk2utt - for n in native nonnative test; do - utils/utt2spk_to_spk2utt.pl \ - data/$n/utt2spk \ - | \ - sort \ - > \ - data/$n/spk2utt + utils/fix_data_dir.sh data/train + utils/fix_data_dir.sh data/devtest - utils/fix_data_dir.sh \ - data/$n + # make testing lists + mkdir -p data/test data/native data/nonnative $tmpdir/usma/lists + + for x in wav.scp text utt2spk; do + for n in native nonnative; do + cat $tmpdir/usma/$n/lists/$x + done >$tmpdir/usma/lists/$x + + cat $tmpdir/usma/lists/$x >data/test/$x + + for n in native nonnative; do + sort $tmpdir/usma/$n/$x >data/$n/$x done + done + + for n in native nonnative test; do + utils/utt2spk_to_spk2utt.pl data/$n/utt2spk | sort >data/$n/spk2utt + utils/fix_data_dir.sh data/$n + done fi diff --git a/egs/heroico/s5/local/prepare_dict.sh b/egs/heroico/s5/local/prepare_dict.sh index d902aedd027..a6d182a6852 100755 --- a/egs/heroico/s5/local/prepare_dict.sh +++ b/egs/heroico/s5/local/prepare_dict.sh @@ -13,57 +13,29 @@ fi export LC_ALL=C -cut \ - -f2- \ - -d " " \ - data/local/tmp/dict/santiago.txt \ - | \ - tr -s '[:space:]' '[\n*]' \ - | \ - grep \ - -v \ - SPN \ - | \ - sort \ - | \ - uniq \ - > \ - data/local/dict/nonsilence_phones.txt +cut -f2- data/local/tmp/dict/santiago.txt | \ + tr -s '[:space:]' '[\n*]' | \ + grep -v SPN | sort -u >data/local/dict/nonsilence_phones.txt -expand \ - -t 1 \ - data/local/tmp/dict/santiago.txt \ - | \ - sort \ - | \ - uniq \ - | \ - sed "1d" \ - > \ - data/local/dict/lexicon.txt +# sed "1d" deletes the last line. 
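+# (note: "1d" actually removes the *first* line of the sorted output, not the last;
+#  e.g.  printf "a a\nb b\n" | sed "1d"  prints only "b b".)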
+expand -t 1 data/local/tmp/dict/santiago.txt | sort -u | + sed "1d" >data/local/dict/lexicon.txt -echo " SPN" \ - >> \ - data/local/dict/lexicon.txt +echo " SPN" >> data/local/dict/lexicon.txt # silence phones, one per line. { echo SIL; echo SPN; -} \ - > \ - data/local/dict/silence_phones.txt +} >data/local/dict/silence_phones.txt -echo \ - SIL \ - > \ - data/local/dict/optional_silence.txt +echo SIL >data/local/dict/optional_silence.txt ( - tr '\n' ' ' < data/local/dict/silence_phones.txt; - echo; - tr '\n' ' ' < data/local/dict/nonsilence_phones.txt; - echo; + tr '\n' ' ' data/local/dict/extra_questions.txt echo "Finished dictionary preparation." diff --git a/egs/heroico/s5/local/prepare_lm.sh b/egs/heroico/s5/local/prepare_lm.sh index 5c72d64fbfe..e2a92ba3c5a 100755 --- a/egs/heroico/s5/local/prepare_lm.sh +++ b/egs/heroico/s5/local/prepare_lm.sh @@ -16,18 +16,30 @@ fi corpus=$1 -ngram-count \ - -order 3 \ - -interpolate \ - -unk \ - -map-unk "" \ - -limit-vocab \ - -text $corpus \ - -lm data/local/lm/threegram.arpa || exit 1; - - if [ -e "data/local/lm/threegram.arpa.gz" ]; then - rm data/local/lm/threegram.arpa.gz - fi - - gzip \ - data/local/lm/threegram.arpa +if [ ! -f $corpus ]; then + echo "$0: input data $corpus not found." + exit 1 +fi + +if ! command ngram-count >/dev/null; then + if uname -a | grep 64 >/dev/null; then # some kind of 64 bit... + sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64 + else + sdir=$KALDI_ROOT/tools/srilm/bin/i686 + fi + if [ -f $sdir/ngram-count ]; then + echo Using SRILM tools from $sdir + export PATH=$PATH:$sdir + else + echo You appear to not have SRILM tools installed, either on your path, + echo or installed in $sdir. See tools/install_srilm.sh for installation + echo instructions. + exit 1 + fi +fi + + +ngram-count -order 3 -interpolate -unk -map-unk "" \ + -limit-vocab -text $corpus -lm data/local/lm/trigram.arpa || exit 1; + +gzip -f data/local/lm/trigram.arpa diff --git a/egs/heroico/s5/run.sh b/egs/heroico/s5/run.sh index ae1ae4fe7ec..711bece3c66 100755 --- a/egs/heroico/s5/run.sh +++ b/egs/heroico/s5/run.sh @@ -1,23 +1,26 @@ #!/bin/bash -# -*- tab-width: 2; indent-tabs-mode: nil; -*- . ./cmd.sh . ./path.sh stage=0 -. ./utils/parse_options.sh +. utils/parse_options.sh set -e set -o pipefail set -u -# the location of the LDC corpus -datadir=/mnt/corpora/LDC2006S37/data +# the location of the LDC corpus; this location works for the CLSP grid. +datadir=/export/corpora5/LDC/LDC2006S37 -# location of subs text data -subsdata="http://opus.lingfil.uu.se/download.php?f=OpenSubtitles2016/en-es.txt.zip" +#datadir=/mnt/corpora/LDC2006S37 + +# location of subtitles text data +# note: this is not used so I'm commenting it out; dan. +#subsdata="http://opus.lingfil.uu.se/download.php?f=OpenSubtitles2016/en-es.txt.zip" lexicon="http://www.openslr.org/resources/34/santiago.tar.gz" +# don't change tmpdir, the location is used explicitly in scripts in local/. 
tmpdir=data/local/tmp if [ $stage -le 0 ]; then @@ -42,7 +45,6 @@ if [ $stage -le 1 ]; then fi ( - #run in shell, so we don't have to remember the path cd $tmpdir/dict tar -xzf santiago.tar.gz ) @@ -52,28 +54,29 @@ if [ $stage -le 1 ]; then # prepare the lang directory utils/prepare_lang.sh \ data/local/dict "" \ - data/local/lang data/lang || exit 1; + data/local/lang data/lang fi if [ $stage -le 2 ]; then # use am training text to train lm mkdir -p $tmpdir/heroico/lm - + echo "point 1" # get the text from data/train/text cut -d " " -f 2- data/train/text > $tmpdir/heroico/lm/train.txt - + echo "point 2" # build lm local/prepare_lm.sh $tmpdir/heroico/lm/train.txt + echo "point 3" utils/format_lm.sh \ - data/lang data/local/lm/threegram.arpa.gz data/local/dict/lexicon.txt \ + data/lang data/local/lm/trigram.arpa.gz data/local/dict/lexicon.txt \ data/lang_test # delete temporary work - rm -Rf data/local/tmp + rm -rf data/local/tmp fi -if [ $stage -le 5 ]; then +if [ $stage -le 3 ]; then # extract acoustic features mkdir -p exp @@ -82,87 +85,79 @@ if [ $stage -le 5 ]; then rm data/$fld/cmvn.scp fi - steps/make_mfcc.sh --cmd "$train_cmd" --nj 4 \ - data/$fld exp/make_mfcc/$fld mfcc || exit 1; - - utils/fix_data_dir.sh data/$fld || exit 1; - - steps/compute_cmvn_stats.sh data/$fld exp/make_mfcc mfcc || exit 1; - - utils/fix_data_dir.sh data/$fld || exit 1; + steps/make_mfcc.sh --cmd "$train_cmd" --nj 4 data/$fld + utils/fix_data_dir.sh data/$fld + steps/compute_cmvn_stats.sh data/$fld + utils/fix_data_dir.sh data/$fld done +fi +if [ $stage -le 4 ]; then echo "$0 monophone training" - steps/train_mono.sh \ - --nj 4 --cmd "$train_cmd" \ - data/train data/lang exp/mono || exit 1; + steps/train_mono.sh --nj 8 --cmd "$train_cmd" data/train data/lang exp/mono || exit 1; # evaluation ( - # make decoding graph for monophones - utils/mkgraph.sh \ - data/lang_test \ - exp/mono \ - exp/mono/graph || exit 1; - - # test monophones - for x in native nonnative devtest test; do - steps/decode.sh --nj 8 \ - exp/mono/graph data/$x exp/mono/decode_${x} || exit 1; - done + # make decoding graph for monophones + utils/mkgraph.sh data/lang_test exp/mono exp/mono/graph || exit 1; + + # test monophones + for x in native nonnative devtest test; do + steps/decode.sh --nj 8 exp/mono/graph data/$x exp/mono/decode_${x} || exit 1; + done ) & +fi + +if [ $stage -le 5 ]; then # align with monophones - steps/align_si.sh \ - --nj 8 --cmd "$train_cmd" \ - data/train data/lang exp/mono exp/mono_ali || exit 1; + steps/align_si.sh --nj 8 --cmd "$train_cmd" \ + data/train data/lang exp/mono exp/mono_ali echo "$0 Starting triphone training in exp/tri1" - steps/train_deltas.sh \ - --cmd "$train_cmd" \ - --cluster-thresh 100 \ - 1500 25000 \ - data/train data/lang exp/mono_ali exp/tri1 || exit 1; + steps/train_deltas.sh --cmd "$train_cmd" --cluster-thresh 100 \ + 1500 25000 data/train data/lang exp/mono_ali exp/tri1 - # test cd gmm hmm models - # make decoding graphs for tri1 + wait # wait for the previous decoding jobs to finish in case there's just one + # machine. 
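+  # (The mkgraph/decode block below runs in a background subshell -- note the ") &" at its
+  # end -- so the next training stage can start while decoding runs; that is why the "wait"
+  # above is needed when everything runs on a single machine.)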
( - utils/mkgraph.sh \ + utils/mkgraph.sh \ data/lang_test exp/tri1 exp/tri1/graph || exit 1; - # decode test data with tri1 models - for x in native nonnative devtest test; do - steps/decode.sh \ - --nj 8 \ - exp/tri1/graph data/$x exp/tri1/decode_${x} || exit 1; - done + for x in native nonnative devtest test; do + steps/decode.sh --nj 8 exp/tri1/graph data/$x exp/tri1/decode_${x} || exit 1; + done ) & +fi + +if [ $stage -le 6 ]; then + echo "$0: Starting delta system alignment" + # align with triphones steps/align_si.sh \ - --nj 8 --cmd "$train_cmd" \ - data/train data/lang exp/tri1 exp/tri1_ali -fi + --nj 8 --cmd "$train_cmd" data/train data/lang exp/tri1 exp/tri1_ali + + echo "$0: starting lda+mllt triphone training in exp/tri2b" -if [ $stage -le 7 ]; then - echo "$0 Starting (lda_mllt) triphone training in exp/tri2b" steps/train_lda_mllt.sh \ --splice-opts "--left-context=3 --right-context=3" \ - 2000 30000 \ - data/train data/lang exp/tri1_ali exp/tri2b + 2000 30000 data/train data/lang exp/tri1_ali exp/tri2b + + wait # wait for the previous decoding jobs to finish in case there's just one + # machine. ( - # make decoding FSTs for tri2b models - utils/mkgraph.sh \ - data/lang_test exp/tri2b exp/tri2b/graph || exit 1; - - # decode test with tri2b models - for x in native nonnative devtest test; do - steps/decode.sh \ - --nj 8 \ - exp/tri2b/graph data/$x exp/tri2b/decode_${x} || exit 1; - done + utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph || exit 1; + + for x in native nonnative devtest test; do + steps/decode.sh --nj 8 exp/tri2b/graph data/$x exp/tri2b/decode_${x} || exit 1; + done ) & +fi + +if [ $stage -le 7 ]; then + echo "$0: Starting LDA+MLLT system alignment" # align with lda and mllt adapted triphones steps/align_si.sh \ @@ -172,28 +167,26 @@ if [ $stage -le 7 ]; then echo "$0 Starting (SAT) triphone training in exp/tri3b" steps/train_sat.sh \ --cmd "$train_cmd" \ - 3100 50000 \ - data/train data/lang exp/tri2b_ali exp/tri3b + 3100 50000 data/train data/lang exp/tri2b_ali exp/tri3b # align with tri3b models echo "$0 Starting exp/tri3b_ali" steps/align_fmllr.sh \ --nj 8 --cmd "$train_cmd" \ data/train data/lang exp/tri3b exp/tri3b_ali -fi -if [ $stage -le 8 ]; then + wait # wait for the previous decoding jobs to finish in case there's just one + # machine. ( - # make decoding graphs for SAT models - utils/mkgraph.sh \ - data/lang_test exp/tri3b exp/tri3b/graph || exit 1; - - # decode test sets with tri3b models - for x in native nonnative devtest test; do - steps/decode_fmllr.sh \ - --nj 8 --cmd "$decode_cmd" \ - exp/tri3b/graph data/$x exp/tri3b/decode_${x} - done + # make decoding graphs for SAT models + utils/mkgraph.sh \ + data/lang_test exp/tri3b exp/tri3b/graph || exit 1; + + # decode test sets with tri3b models + for x in native nonnative devtest test; do + steps/decode_fmllr.sh \ + --nj 8 --cmd "$decode_cmd" exp/tri3b/graph data/$x exp/tri3b/decode_${x} + done ) & fi diff --git a/egs/hkust/s5/local/chain/compare_wer.sh b/egs/hkust/s5/local/chain/compare_wer.sh new file mode 100755 index 00000000000..b3376871a69 --- /dev/null +++ b/egs/hkust/s5/local/chain/compare_wer.sh @@ -0,0 +1,97 @@ +#!/bin/bash +# Copyright 2018 Emotech LTD (Author: Xuechen Liu) + +# compare wer between diff. models in hkust chain directory +# exemplar usage: local/chain/compare_wer.sh --online exp/chain/tdnn_7h_sp +# note: this script is made quite general since we kinda wanna give more flexibility to +# users on adding affix for their own use when training models. + +set -e +. 
./cmd.sh +. ./path.sh + +if [ $# == 0 ]; then + echo "Usage: $0: [--online] [ ... ]" + echo "e.g.: $0 --online exp/chain/tdnn_7h_sp" + exit 1 +fi + +echo "# $0 $*" + +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) +} + +# print model names +echo -n "# Model " +for x in $*; do + printf "% 10s" " $(basename $x)" +done +echo + +# print decode WER results +echo -n "# WER(%) " +for x in $*; do + set_names $x + wer=$([ -d $x ] && grep WER $x/decode/cer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +# so how about online WER? +if $include_online; then + echo -n "# WER(%)[online] " + for x in $*; do + set_names $x + wer=$(cat ${x}_online/decode/cer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + echo -n "# WER(%)[per-utt] " + for x in $*; do + set_names $x + wer_per_utt=$(cat ${x}_online/decode_per_utt/cer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer_per_utt + done + echo +fi + +# print final log prob for train & validation +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf($8)}' | cut -c1-7) + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf($8)}' | cut -c1-7) + printf "% 10s" $prob +done +echo + +# do the same for xent objective +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/hkust/s5/local/chain/run_tdnn.sh b/egs/hkust/s5/local/chain/run_tdnn.sh deleted file mode 100755 index c73c46fcef6..00000000000 --- a/egs/hkust/s5/local/chain/run_tdnn.sh +++ /dev/null @@ -1,209 +0,0 @@ -#!/bin/bash - -# This script is based on tun_tdnn_7h.sh in swbd chain recipe. - -set -e - -# configs for 'chain' -affix= -stage=12 -train_stage=-10 -get_egs_stage=-10 -dir=exp/chain/tdnn_7h # Note: _sp will get added to this if $speed_perturb == true. -decode_iter= - -# training options -num_epochs=4 -initial_effective_lrate=0.001 -final_effective_lrate=0.0001 -max_param_change=2.0 -final_layer_normalize_target=0.5 -num_jobs_initial=2 -num_jobs_final=12 -minibatch_size=128 -frames_per_eg=150 -remove_egs=true -common_egs_dir= -xent_regularize=0.1 - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - -if ! cuda-compiled; then - cat <$lang/topo -fi - -if [ $stage -le 11 ]; then - # Build a tree using our new topology. This is the critically different - # step compared with other recipes. 
- steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir -fi - -if [ $stage -le 12 ]; then - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=43 name=input - - # please note that it is important to have input layer with the name=input - # as the layer immediately preceding the fixed-affine-layer to enable - # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - - # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=625 - relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=625 - relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=625 - relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=625 - relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=625 - relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=625 - - ## adding the layers for chain branch - relu-renorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5 - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 - - # adding the layers for xent branch - # This block prints the configs for a separate output that will be - # trained with a cross-entropy objective in the 'chain' models... this - # has the effect of regularizing the hidden parts of the model. we use - # 0.5 / args.xent_regularize as the learning rate factor- the factor of - # 0.5 / args.xent_regularize is suitable as it means the xent - # final-layer learns at a rate independent of the regularization - # constant; and the 0.5 was tuned so as to make the relative progress - # similar in the xent and regular final layers. - relu-renorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 - -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - -if [ $stage -le 13 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/hkust-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ - --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ - --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize $xent_regularize \ - --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize 0.00005 \ - --chain.apply-deriv-weights false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --egs.dir "$common_egs_dir" \ - --egs.stage $get_egs_stage \ - --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width $frames_per_eg \ - --trainer.num-chunk-per-minibatch $minibatch_size \ - --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs $num_epochs \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ - --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ - --trainer.optimization.final-effective-lrate $final_effective_lrate \ - --trainer.max-param-change $max_param_change \ - --cleanup.remove-egs $remove_egs \ - --feat-dir data/${train_set}_hires \ - --tree-dir $treedir \ - --lat-dir exp/tri5a_sp_lats \ - --dir $dir || exit 1; -fi - -if [ $stage -le 14 ]; then - # Note: it might appear that this $lang directory is mismatched, and it is as - # far as the 'topo' is concerned, but this script doesn't read the 'topo' from - # the lang directory. - utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph -fi - -graph_dir=$dir/graph -if [ $stage -le 15 ]; then - iter_opts= - if [ ! -z $decode_iter ]; then - iter_opts=" --iter $decode_iter " - fi - steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --nj 10 --cmd "$decode_cmd" $iter_opts \ - --online-ivector-dir exp/nnet3/ivectors_dev \ - $graph_dir data/dev_hires $dir/decode || exit 1; -fi - -if [ $stage -le 16 ]; then - steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \ - --add-pitch true \ - data/lang exp/nnet2_online/extractor "$dir" ${dir}_online || exit 1; -fi - -if [ $stage -le 17 ]; then - # do the actual online decoding with iVectors, carrying info forward from - # previous utterances of the same speaker. - steps/online/nnet3/decode.sh --config conf/decode.config \ - --cmd "$decode_cmd" --nj 10 --acwt 1.0 --post-decode-acwt 10.0 \ - "$graph_dir" data/dev_hires \ - ${dir}_online/decode || exit 1; -fi - -if [ $stage -le 18 ]; then - # this version of the decoding treats each utterance separately - # without carrying forward speaker information. - steps/online/nnet3/decode.sh --config conf/decode.config \ - --cmd "$decode_cmd" --nj 10 --per-utt true --acwt 1.0 --post-decode-acwt 10.0 \ - "$graph_dir" data/dev_hires \ - ${dir}_online/decode_per_utt || exit 1; -fi diff --git a/egs/hkust/s5/local/chain/run_tdnn.sh b/egs/hkust/s5/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..b0749843c8c --- /dev/null +++ b/egs/hkust/s5/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_2a.sh \ No newline at end of file diff --git a/egs/hkust/s5/local/chain/tuning/run_tdnn_2a.sh b/egs/hkust/s5/local/chain/tuning/run_tdnn_2a.sh new file mode 100644 index 00000000000..0fc0de36a45 --- /dev/null +++ b/egs/hkust/s5/local/chain/tuning/run_tdnn_2a.sh @@ -0,0 +1,247 @@ +#!/bin/bash + +# This script is based on run_tdnn_7p.sh in swbd chain recipe. 
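+# local/chain/run_tdnn.sh is now a symlink to this tuning script, so a typical invocation
+# from egs/hkust/s5 would be e.g.
+#   local/chain/run_tdnn.sh --stage 12 --train-stage -10
+# (both options are declared below and picked up by ./utils/parse_options.sh).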
+ +# Results +# local/chain/compare_wer.sh --online exp/chain/tdnn_7h_chain_2b_sp +# Model tdnn_7h_chain_2b_sp +# WER(%) 23.67 +# WER(%)[online] 23.69 +# WER(%)[per-utt] 24.67 +# Final train prob -0.0895 +# Final valid prob -0.1251 +# Final train prob (xent) -1.3628 +# Final valid prob (xent) -1.5590 + +# exp 2b: changes on network arch with multiple training options, referencing swbd +set -euxo pipefail + +# configs for 'chain' +affix=chain_2a +stage=12 +nj=10 +train_stage=-10 +get_egs_stage=-10 +dir=exp/chain/tdnn_7h # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= + +# training options +num_epochs=4 +initial_effective_lrate=0.0005 +final_effective_lrate=0.00005 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=3 +minibatch_size=128 +frames_per_eg=150,110,100 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + ivector_dim=$(feat-to-dim scp:exp/nnet3/ivectors_${train_set}/ivector_online.scp -) + feat_dim=$(feat-to-dim scp:data/${train_set}_hires/feats.scp -) + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.004 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + linear_opts="orthonormal-constraint=-1.0 l2-regularize=0.004" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=$ivector_dim name=ivector + input dim=$feat_dim name=input + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=1024 + linear-component name=tdnn2l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn2 $opts input=Append(0,1) dim=1024 + linear-component name=tdnn3l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=1024 input=Append(0,1) + linear-component name=tdnn4l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn4l dim=256 $linear_opts input=Append(0,1) + relu-batchnorm-dropout-layer name=tdnn4 $opts input=Append(0,1) dim=1024 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=1024 input=Append(0, tdnn3l) + linear-component name=tdnn6l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn6 $opts 
input=Append(0,3) dim=1280 + linear-component name=tdnn7l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn7l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1024 + linear-component name=tdnn8l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn8l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn9l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn5l) dim=1024 + linear-component name=tdnn10l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn10l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn11l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn9l,tdnn7l) dim=1024 + linear-component name=prefinal-l dim=256 $linear_opts + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280 + linear-component name=prefinal-chain-l dim=256 $linear_opts + batchnorm-component name=prefinal-chain-batchnorm + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280 + linear-component name=prefinal-xent-l dim=256 $linear_opts + batchnorm-component name=prefinal-xent-batchnorm + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/hkust-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.optimization.momentum 0.0 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_sp_lats \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph +fi + +graph_dir=$dir/graph +if [ $stage -le 15 ]; then + iter_opts= + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_dev \ + $graph_dir data/dev_hires $dir/decode || exit 1; +fi + +if [ $stage -le 16 ]; then + steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \ + --add-pitch true \ + data/lang exp/nnet3/extractor "$dir" ${dir}_online || exit 1; +fi + +if [ $stage -le 17 ]; then + # do the actual online decoding with iVectors, carrying info forward from + # previous utterances of the same speaker. + steps/online/nnet3/decode.sh --config conf/decode.config \ + --cmd "$decode_cmd" --nj $nj --acwt 1.0 --post-decode-acwt 10.0 \ + "$graph_dir" data/dev_hires \ + ${dir}_online/decode || exit 1; +fi + +if [ $stage -le 18 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. 
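  # (With --per-utt true, steps/online/nnet3/decode.sh treats every utterance
  # as if it came from a new speaker, so the i-vector estimate is rebuilt from
  # scratch each time instead of being carried over. Comparing this number with
  # the previous stage, 24.67% vs. 23.69% in the results quoted at the top of
  # this script, gives a rough idea of how much the carried-over speaker
  # context helps.)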
+ steps/online/nnet3/decode.sh --config conf/decode.config \ + --cmd "$decode_cmd" --nj $nj --per-utt true --acwt 1.0 --post-decode-acwt 10.0 \ + "$graph_dir" data/dev_hires \ + ${dir}_online/decode_per_utt || exit 1; +fi diff --git a/egs/hkust/s5/local/hkust_segment.py b/egs/hkust/s5/local/hkust_segment.py index ba5ffc053d5..92d3add0e3e 100755 --- a/egs/hkust/s5/local/hkust_segment.py +++ b/egs/hkust/s5/local/hkust_segment.py @@ -1,8 +1,8 @@ #!/usr/bin/env python #coding:utf-8 -import sys from __future__ import print_function +import sys from mmseg import seg_txt for line in sys.stdin: blks = str.split(line) diff --git a/egs/hkust/s5/local/nnet3/compare_wer.sh b/egs/hkust/s5/local/nnet3/compare_wer.sh new file mode 100755 index 00000000000..252fab12e18 --- /dev/null +++ b/egs/hkust/s5/local/nnet3/compare_wer.sh @@ -0,0 +1,82 @@ +#!/bin/bash +# Copyright 2018 Emotech LTD (Author: Xuechen Liu) + +# compare wer between diff. models in hkust nnet3 directory +# exemplar usage: local/nnet3/compare_wer_general.sh exp/nnet3/tdnn_sp exp/nnet3/tdnn_sp_pr43 +# note: this script is made quite general since we kinda wanna give more flexibility to +# users on adding affix for their own use when training models. + +set -e +. ./cmd.sh +. ./path.sh + +if [ $# == 0 ]; then + echo "Usage: $0: [--online] [ ... ]" + echo "e.g.: $0 exp/nnet3/tdnn_sp exp/nnet3/tdnn_sp_pr" + exit 1 +fi + +echo "# $0 $*" + +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) +} + +# print model names +echo -n "# Model " +for x in $*; do + printf "% 10s" " $(basename $x)" +done +echo + +# print decode WER results +echo -n "# WER(%) " +for x in $*; do + set_names $x + wer=$([ -d $x ] && grep WER $x/decode/cer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer +done +echo + +# so how about online WER? +if $include_online; then + echo -n "# WER(%)[online] " + for x in $*; do + set_names $x + wer=$(cat ${x}_online/decode/cer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + echo -n "# WER(%)[per-utt] " + for x in $*; do + set_names $x + wer_per_utt=$(cat ${x}_online/decode_per_utt/cer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer_per_utt + done + echo +fi + +# print log for train & validation +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.combined.log | grep log-like | awk '{printf($8)}' | cut -c1-7) + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.combined.log | grep log-like | awk '{printf($8)}' | cut -c1-7) + printf "% 10s" $prob +done +echo diff --git a/egs/hkust/s5/local/nnet3/run_ivector_common.sh b/egs/hkust/s5/local/nnet3/run_ivector_common.sh index 4e7ae5e6ac1..de952e08904 100755 --- a/egs/hkust/s5/local/nnet3/run_ivector_common.sh +++ b/egs/hkust/s5/local/nnet3/run_ivector_common.sh @@ -58,20 +58,19 @@ for line in sys.stdin.readlines(): fi if [ $stage -le 2 ] && [ -z $ivector_extractor ]; then - # Train a system just for its LDA+MLLT transform. We use --num-iters 13 - # because after we get the transform (12th iter is the last), any further - # training is pointless. - steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ - --realign-iters "" \ + # perform PCA on the data + echo "$0: computing a PCA transform from the no-pitch hires data." 
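  # (The PCA transform computed here is only used to decorrelate the spliced
  # hires features before the diagonal UBM and i-vector extractor trained in
  # the later stages; for that purpose an unsupervised estimate is enough, so
  # this replaces the earlier step that trained a throwaway LDA+MLLT GMM
  # system just to obtain a transform.)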
+ steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \ --splice-opts "--left-context=3 --right-context=3" \ - 5000 10000 data/train_hires_nopitch data/lang \ - ${gmm_dir}_ali exp/nnet3/tri5 + --max-utts 10000 --subsample 2 \ + data/${train_set}_hires_nopitch \ + exp/nnet3/tri5_pca fi if [ $stage -le 3 ] && [ -z $ivector_extractor ]; then steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 \ --num-frames 700000 \ - data/train_hires_nopitch 512 exp/nnet3/tri5 exp/nnet3/diag_ubm + data/train_hires_nopitch 512 exp/nnet3/tri5_pca exp/nnet3/diag_ubm fi if [ $stage -le 4 ] && [ -z $ivector_extractor ]; then diff --git a/egs/hkust/s5/local/nnet3/run_tdnn.sh b/egs/hkust/s5/local/nnet3/run_tdnn.sh deleted file mode 100755 index 68e95307faa..00000000000 --- a/egs/hkust/s5/local/nnet3/run_tdnn.sh +++ /dev/null @@ -1,134 +0,0 @@ -#!/bin/bash - -# This script is based on swbd/s5c/local/nnet3/run_tdnn.sh - -# this is the standard "tdnn" system, built in nnet3; it's what we use to -# call multi-splice. - -# At this script level we don't support not running on GPU, as it would be painfully slow. -# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, -# --num-threads 16 and --minibatch-size 128. -set -e - -stage=0 -train_stage=-10 -affix= -common_egs_dir= - -# training options -initial_effective_lrate=0.0015 -final_effective_lrate=0.00015 -num_epochs=4 -num_jobs_initial=2 -num_jobs_final=12 -remove_egs=true - -# feature options -use_ivectors=true - -# End configuration section. - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - -if ! cuda-compiled; then - cat < $dir/configs/network.xconfig + input dim=$ivector_dim name=ivector + input dim=$feat_dim name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 dim=1024 + relu-renorm-layer name=tdnn2 input=Append(-1,2) dim=1024 + relu-renorm-layer name=tdnn3 input=Append(-3,3) dim=1024 + relu-renorm-layer name=tdnn4 input=Append(-7,2) dim=1024 + relu-renorm-layer name=tdnn5 input=Append(-3,3) dim=1024 + relu-renorm-layer name=tdnn6 dim=1024 + + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 9 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/hkust-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs $num_epochs \ + --trainer.samples-per-iter=400000 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 500 \ + --use-gpu true \ + --feat-dir=data/${train_set}_hires \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 10 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. + for decode_set in dev; do + ( + num_jobs=`cat data/${decode_set}_hires/utt2spk|cut -d' ' -f2|sort -u|wc -l` + decode_dir=${dir}/decode + ivector_opts=" --online-ivector-dir exp/nnet3/ivectors_${decode_set} " + + steps/nnet3/decode.sh --nj $num_jobs --cmd "$decode_cmd" $ivector_opts \ + $graph_dir data/${decode_set}_hires $decode_dir || exit 1; + ) & + done +fi + +if [ $stage -le 11 ]; then + steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \ + --add-pitch true \ + data/lang exp/nnet3/extractor "$dir" ${dir}_online || exit 1; +fi + +if [ $stage -le 12 ]; then + # do the actual online decoding with iVectors, carrying info forward from + # previous utterances of the same speaker. + graph_dir=exp/tri5a/graph + steps/online/nnet3/decode.sh --config conf/decode.config \ + --cmd "$decode_cmd" --nj $nj \ + "$graph_dir" data/dev_hires \ + ${dir}_online/decode || exit 1; +fi + +if [ $stage -le 13 ]; then + # this version of the decoding treats each utterance separately + # without carrying forward speaker information. 
+ graph_dir=exp/tri5a/graph + steps/online/nnet3/decode.sh --config conf/decode.config \ + --cmd "$decode_cmd" --nj $nj --per-utt true \ + "$graph_dir" data/dev_hires \ + ${dir}_online/decode_per_utt || exit 1; +fi + +wait; +exit 0; diff --git a/egs/hub4_english/s5/local/data_prep/prepare_1996_csr_hub4_lm_corpus.sh b/egs/hub4_english/s5/local/data_prep/prepare_1996_csr_hub4_lm_corpus.sh index f3f9c939e0b..03ad94d7b95 100755 --- a/egs/hub4_english/s5/local/data_prep/prepare_1996_csr_hub4_lm_corpus.sh +++ b/egs/hub4_english/s5/local/data_prep/prepare_1996_csr_hub4_lm_corpus.sh @@ -40,7 +40,7 @@ done | sort > $dir/filelist mkdir -p $dir/split$nj/ if [ $stage -le 1 ]; then - eval utils/split_scp.pl $dir/filelist $dir/split$nj/filelist.{`seq -s, $nj`} + eval utils/split_scp.pl $dir/filelist $dir/split$nj/filelist.{`seq -s, $nj | sed 's/,$//'`} $cmd JOB=1:$nj $dir/log/process_text.JOB.log \ local/data_prep/process_1996_csr_hub4_lm_filelist.py \ $dir/split$nj/filelist.JOB $dir diff --git a/egs/hub4_english/s5/local/data_prep/prepare_na_news_text_corpus.sh b/egs/hub4_english/s5/local/data_prep/prepare_na_news_text_corpus.sh index 9835d69a37e..84baadb98f4 100755 --- a/egs/hub4_english/s5/local/data_prep/prepare_na_news_text_corpus.sh +++ b/egs/hub4_english/s5/local/data_prep/prepare_na_news_text_corpus.sh @@ -44,7 +44,7 @@ for x in $SOURCE_DIR/*/*/*; do mkdir -p $d/split$nj eval utils/split_scp.pl $d/articles.list \ - $d/split$nj/articles.list.{`seq -s, $nj`} + $d/split$nj/articles.list.{`seq -s, $nj | sed 's/,$//'`} $cmd JOB=1:$nj $d/log/get_processed_text.JOB.log \ local/data_prep/process_na_news_text.py $d/split$nj/articles.list.JOB \ diff --git a/egs/hub4_english/s5/local/data_prep/prepare_na_news_text_supplement.sh b/egs/hub4_english/s5/local/data_prep/prepare_na_news_text_supplement.sh index f7f810c2326..3bf453080f5 100755 --- a/egs/hub4_english/s5/local/data_prep/prepare_na_news_text_supplement.sh +++ b/egs/hub4_english/s5/local/data_prep/prepare_na_news_text_supplement.sh @@ -53,7 +53,7 @@ for x in $SOURCE_DIR/nyt/*/ $SOURCE_DIR/latwp/ $SOURCE_DIR/apws/*/; do mkdir -p $d/split$nj eval utils/split_scp.pl $d/articles.list \ - $d/split$nj/articles.list.{`seq -s, $nj`} + $d/split$nj/articles.list.{`seq -s, $nj | sed 's/,$//'`} $cmd JOB=1:$nj $d/log/get_processed_text.JOB.log \ local/data_prep/process_na_news_text.py $d/split$nj/articles.list.JOB \ diff --git a/egs/iam/v1/local/chain/run_cnn_1a.sh b/egs/iam/v1/local/chain/run_cnn_1a.sh index 05cb9948bd9..41a76920e37 100755 --- a/egs/iam/v1/local/chain/run_cnn_1a.sh +++ b/egs/iam/v1/local/chain/run_cnn_1a.sh @@ -9,12 +9,12 @@ # local/chain/compare_wer.sh exp/chain/cnn_1a/ # System cnn_1a -# WER 18.58 -# CER 10.17 -# Final train prob -0.0122 -# Final valid prob -0.0999 -# Final train prob (xent) -0.5652 -# Final valid prob (xent) -0.9758 +# WER 18.52 +# CER 10.07 +# Final train prob -0.0077 +# Final valid prob -0.0970 +# Final train prob (xent) -0.5484 +# Final valid prob (xent) -0.9643 # Parameters 4.36M set -e -o pipefail @@ -89,7 +89,7 @@ if [ $stage -le 1 ]; then # topo file. [note, it really has two states.. the first one is only repeated # once, the second one has zero or more repeats.] if [ -d $lang ]; then - if [ $lang/L.fst -nt data/$lang_test/L.fst ]; then + if [ $lang/L.fst -nt data/lang/L.fst ]; then echo "$0: $lang already exists, not overwriting it; continuing" else echo "$0: $lang already exists and seems to be older than data/lang..." 
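# (The "-nt" in the hunk above is bash's newer-than file test: the existing
# $lang directory is kept only if its L.fst is newer than the data/lang/L.fst
# it was copied from; otherwise the script stops rather than silently reusing
# a stale copy. A minimal check along the same lines:
#   [ $lang/L.fst -nt data/lang/L.fst ] && echo "$lang is up to date"
# )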
@@ -97,7 +97,7 @@ if [ $stage -le 1 ]; then exit 1; fi else - cp -r data/$lang_test $lang + cp -r data/lang $lang silphonelist=$(cat $lang/phones/silence.csl) || exit 1; nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; # Use our special topology... note that later on may have to tune this @@ -110,7 +110,7 @@ if [ $stage -le 2 ]; then # Get the alignments as lattices (gives the chain training more freedom). # use the same num-jobs as the alignments steps/align_fmllr_lats.sh --nj $nj --cmd "$cmd" ${train_data_dir} \ - data/$lang_test $gmm_dir $lat_dir + data/lang $gmm_dir $lat_dir rm $lat_dir/fsts.*.gz # save space fi diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1a.sh b/egs/iam/v1/local/chain/run_cnn_chainali_1a.sh index 2c8b6c91e5a..ee3a1a3d92c 100755 --- a/egs/iam/v1/local/chain/run_cnn_chainali_1a.sh +++ b/egs/iam/v1/local/chain/run_cnn_chainali_1a.sh @@ -2,21 +2,6 @@ # chainali_1a is as 1a except it uses chain alignments (using 1a system) instead of gmm alignments -# ./local/chain/compare_wer.sh exp/chain/cnn_chainali_1a/ exp/chain/cnn_1a/ -# System cnn_chainali_1a cnn_1a -# WER 6.69 9.13 -# Final train prob -0.0128 -0.0297 -# Final valid prob -0.0447 -0.0975 -# Final train prob (xent) -0.6448 -0.5915 -# Final valid prob (xent) -0.9924 -1.0022 - -# steps/info/chain_dir_info.pl exp/chain/cnn_chainali_1a/ -# exp/chain/cnn_chainali_1a/: num-iters=21 nj=2..4 num-params=4.4M dim=40->364 combine=-0.002->0.000 xent:train/valid[13,20,final]=(-0.929,-0.711,-0.645/-1.16,-1.04,-0.992) logprob:train/valid[13,20,final]=(-0.029,-0.016,-0.013/-0.051,-0.047,-0.045) - -# cat exp/chain/cnn_chainali_1a/decode_test/scoring_kaldi/best_* -# %WER 3.94 [ 2600 / 65921, 549 ins, 837 del, 1214 sub ] exp/chain/cnn_chainali_1a/decode_test/cer_15_0.0 -# %WER 6.69 [ 1241 / 18542, 135 ins, 358 del, 748 sub ] exp/chain/cnn_chainali_1a/decode_test/wer_15_0.5 - set -e -o pipefail stage=0 @@ -28,7 +13,7 @@ gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. ali=tri3_ali -chain_model_dir=exp/chain${nnet3_affix}/cnn${affix} +chain_model_dir=exp/chain${nnet3_affix}/cnn_1a common_egs_dir= reporting_email= @@ -90,7 +75,7 @@ if [ $stage -le 1 ]; then # topo file. [note, it really has two states.. the first one is only repeated # once, the second one has zero or more repeats.] if [ -d $lang ]; then - if [ $lang/L.fst -nt data/$lang_test/L.fst ]; then + if [ $lang/L.fst -nt data/lang/L.fst ]; then echo "$0: $lang already exists, not overwriting it; continuing" else echo "$0: $lang already exists and seems to be older than data/lang..." @@ -98,7 +83,7 @@ if [ $stage -le 1 ]; then exit 1; fi else - cp -r data/$lang_test $lang + cp -r data/lang $lang silphonelist=$(cat $lang/phones/silence.csl) || exit 1; nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; # Use our special topology... note that later on may have to tune this @@ -111,8 +96,9 @@ if [ $stage -le 2 ]; then # Get the alignments as lattices (gives the chain training more freedom). 
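  # (The --acoustic-scale 1.0 added below reflects how 'chain' models are used:
  # GMM or conventional nnet3 alignment uses an acoustic scale of about 0.1,
  # but chain-model likelihoods are meant to be used unscaled, so the numerator
  # lattices here are generated with scale 1.0, matching the --acwt 1.0 used
  # when decoding these models.)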
# use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ - ${train_data_dir} data/$lang_test $chain_model_dir $lat_dir + ${train_data_dir} data/lang $chain_model_dir $lat_dir cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts fi diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh b/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh index d6d0ee780f4..c6876fbafcb 100755 --- a/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh +++ b/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh @@ -4,12 +4,12 @@ # local/chain/compare_wer.sh exp/chain/cnn_1a/ exp/chain/cnn_chainali_1b/ # System cnn_1a cnn_chainali_1b -# WER 18.58 14.67 -# CER 10.17 7.31 -# Final train prob -0.0122 0.0042 -# Final valid prob -0.0999 -0.0256 -# Final train prob (xent) -0.5652 -0.6282 -# Final valid prob (xent) -0.9758 -0.9096 +# WER 18.52 14.38 +# CER 10.07 7.14 +# Final train prob -0.0077 -0.0113 +# Final valid prob -0.0970 -0.0400 +# Final train prob (xent) -0.5484 -0.6043 +# Final valid prob (xent) -0.9643 -0.9030 # Parameters 4.36M 3.96M # steps/info/chain_dir_info.pl exp/chain/chainali_cnn_1b/ @@ -27,7 +27,7 @@ gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. ali=tri3_ali -chain_model_dir=exp/chain${nnet3_affix}/cnn${affix} +chain_model_dir=exp/chain${nnet3_affix}/cnn_1a common_egs_dir= reporting_email= @@ -89,7 +89,7 @@ if [ $stage -le 1 ]; then # topo file. [note, it really has two states.. the first one is only repeated # once, the second one has zero or more repeats.] if [ -d $lang ]; then - if [ $lang/L.fst -nt data/$lang_test/L.fst ]; then + if [ $lang/L.fst -nt data/lang/L.fst ]; then echo "$0: $lang already exists, not overwriting it; continuing" else echo "$0: $lang already exists and seems to be older than data/lang..." @@ -97,7 +97,7 @@ if [ $stage -le 1 ]; then exit 1; fi else - cp -r data/$lang_test $lang + cp -r data/lang $lang silphonelist=$(cat $lang/phones/silence.csl) || exit 1; nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; # Use our special topology... note that later on may have to tune this @@ -110,8 +110,9 @@ if [ $stage -le 2 ]; then # Get the alignments as lattices (gives the chain training more freedom). 
# use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ - ${train_data_dir} data/$lang_test $chain_model_dir $lat_dir + ${train_data_dir} data/lang $chain_model_dir $lat_dir cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts fi diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh b/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh index 6ff76490303..54c52d913de 100755 --- a/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh +++ b/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh @@ -3,12 +3,12 @@ # chainali_1c is as chainali_1b except it uses l2-regularize # local/chain/compare_wer.sh exp/chain/cnn_chainali_1b exp/chain/cnn_chainali_1c # System cnn_chainali_1b cnn_chainali_1c -# WER 14.67 12.84 -# CER 7.31 6.40 -# Final train prob 0.0042 -0.0120 -# Final valid prob -0.0256 -0.0199 -# Final train prob (xent) -0.6282 -0.9973 -# Final valid prob (xent) -0.9096 -1.1537 +# WER 14.38 12.72 +# CER 7.14 5.99 +# Final train prob -0.0113 -0.0291 +# Final valid prob -0.0400 -0.0359 +# Final train prob (xent) -0.6043 -0.9781 +# Final valid prob (xent) -0.9030 -1.1544 # Parameters 3.96M 3.96M # steps/info/chain_dir_info.pl exp/chain/cnn_chainali_1c @@ -25,7 +25,7 @@ gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. affix=_1c #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. ali=tri3_ali -chain_model_dir=exp/chain${nnet3_affix}/cnn${affix} +chain_model_dir=exp/chain${nnet3_affix}/cnn_1a common_egs_dir= reporting_email= @@ -33,7 +33,6 @@ reporting_email= train_stage=-10 xent_regularize=0.1 frame_subsampling_factor=4 -alignment_subsampling_factor=1 # training chunk-options chunk_width=340,300,200,100 num_leaves=500 @@ -75,7 +74,6 @@ tree_dir=exp/chain${nnet3_affix}/tree_chain # you should probably name it differently. lang=data/lang_chain for f in $train_data_dir/feats.scp \ - $train_data_dir/feats.scp $gmm_dir/final.mdl \ $ali_dir/ali.1.gz $gmm_dir/final.mdl; do [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 done @@ -87,7 +85,7 @@ if [ $stage -le 1 ]; then # topo file. [note, it really has two states.. the first one is only repeated # once, the second one has zero or more repeats.] if [ -d $lang ]; then - if [ $lang/L.fst -nt data/$lang_test/L.fst ]; then + if [ $lang/L.fst -nt data/lang/L.fst ]; then echo "$0: $lang already exists, not overwriting it; continuing" else echo "$0: $lang already exists and seems to be older than data/lang..." @@ -95,7 +93,7 @@ if [ $stage -le 1 ]; then exit 1; fi else - cp -r data/$lang_test $lang + cp -r data/lang $lang silphonelist=$(cat $lang/phones/silence.csl) || exit 1; nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; # Use our special topology... note that later on may have to tune this @@ -108,8 +106,9 @@ if [ $stage -le 2 ]; then # Get the alignments as lattices (gives the chain training more freedom). 
# use the same num-jobs as the alignments steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ - ${train_data_dir} data/$lang_test $chain_model_dir $lat_dir + ${train_data_dir} data/lang $chain_model_dir $lat_dir cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts fi @@ -136,12 +135,12 @@ if [ $stage -le 4 ]; then num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - opts="l2-regularize=0.075" - opts_2="l2-regularize=0.075" - opts_3="l2-regularize=0.1" - common1="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" - common2="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" - common3="$opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" mkdir -p $dir/configs cat < $dir/configs/network.xconfig input dim=40 name=input @@ -153,13 +152,13 @@ if [ $stage -le 4 ]; then conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 - relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $opts_2 - relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $opts_2 - relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $opts_2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts ## adding the layers for chain branch - relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $opts_2 - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $opts_3 + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts # adding the layers for xent branch # This block prints the configs for a separate output that will be @@ -170,8 +169,8 @@ if [ $stage -le 4 ]; then # final-layer learns at a rate independent of the regularization # constant; and the 0.5 was tuned so as to make the relative progress # similar in the xent and regular final layers. 
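  # (As a concrete instance of the rule described above: with the
  # xent_regularize=0.1 that these scripts set, learning_rate_factor works out
  # to 0.5 / 0.1 = 5.0, i.e. the xent output layer trains five times faster
  # than its nominal learning rate. Assuming 'python' points at Python 2, as
  # the recipe's own learning_rate_factor line does, the value can be checked
  # with:
  #   echo "print 0.5/0.1" | python    # prints 5.0
  # )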
- relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $opts_2 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $opts_3 + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ fi @@ -192,7 +191,9 @@ if [ $stage -le 5 ]; then --chain.apply-deriv-weights=false \ --chain.lm-opts="--num-extra-lm-states=500" \ --chain.frame-subsampling-factor=$frame_subsampling_factor \ - --chain.alignment-subsampling-factor=$alignment_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ --trainer.srand=$srand \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=4 \ diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1d.sh b/egs/iam/v1/local/chain/run_cnn_chainali_1d.sh new file mode 100755 index 00000000000..19de3af7f1d --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn_chainali_1d.sh @@ -0,0 +1,249 @@ +#!/bin/bash + +# chainali_1d is as chainali_1c except it uses unconstrained egs + +# local/chain/compare_wer.sh /home/hhadian/kaldi-rnnlm/egs/iam/v1/exp/chain/cnn_chainali_1c exp/chain/cnn_chainali_1d +# System cnn_chainali_1c cnn_chainali_1d +# WER 13.14 12.33 +# CER 6.40 5.72 +# Final train prob -0.0260 -0.0037 +# Final valid prob -0.0451 -0.0132 +# Final train prob (xent) -0.9993 -0.8647 +# Final valid prob (xent) -1.1549 -1.0101 +# Parameters 3.97M 3.97M + +# steps/info/chain_dir_info.pl exp/chain/cnn_chainali_1d +# exp/chain/cnn_chainali_1d: num-iters=21 nj=2..4 num-params=4.0M dim=40->376 combine=-0.002->-0.002 (over 1) xent:train/valid[13,20,final]=(-1.66,-1.01,-0.865/-1.72,-1.12,-1.01) logprob:train/valid[13,20,final]=(-0.058,-0.019,-0.004/-0.055,-0.027,-0.013) + + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1c_uc #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +ali=tri3_ali +chain_model_dir=exp/chain${nnet3_affix}/cnn_1a_uc +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=false +lang_test=lang_unk +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $chain_model_dir $lat_dir + cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. 
We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
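  # (If a different lang directory is substituted here, a quick sanity check is
  # possible with Kaldi's utils/lang/check_phones_compatible.sh, which exits
  # with an error when the two phone symbol tables differ; for example,
  # assuming the $lang directory built earlier in this script:
  #   utils/lang/check_phones_compatible.sh data/$lang_test/phones.txt $lang/phones.txt
  # )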
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/iam/v1/local/chain/run_cnn_e2eali_1a.sh b/egs/iam/v1/local/chain/run_cnn_e2eali_1a.sh new file mode 100755 index 00000000000..ba28f681708 --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn_e2eali_1a.sh @@ -0,0 +1,245 @@ +#!/bin/bash + +# e2eali_1a is the same as chainali_1c but uses the e2e chain model to get the +# lattice alignments and to build a tree + +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a exp/chain/cnn_chainali_1c exp/chain/cnn_e2eali_1a +# System e2e_cnn_1a cnn_chainali_1c cnn_e2eali_1a +# WER 13.87 12.72 12.70 +# CER 6.54 5.99 5.75 +# Final train prob -0.0371 -0.0291 -0.0557 +# Final valid prob -0.0636 -0.0359 -0.0770 +# Final train prob (xent) -0.9781 -0.8847 +# Final valid prob (xent) -1.1544 -1.0370 +# Parameters 9.13M 3.96M 3.95M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1a +# exp/chain/cnn_e2eali_1a: num-iters=21 nj=2..4 num-params=4.0M dim=40->360 combine=-0.056->-0.056 (over 1) xent:train/valid[13,20,final]=(-1.47,-0.978,-0.918/-1.54,-1.10,-1.06) logprob:train/valid[13,20,final]=(-0.106,-0.065,-0.056/-0.113,-0.086,-0.079) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=true +lang_test=lang_unk +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/iam/v1/local/chain/run_cnn_e2eali_1b.sh b/egs/iam/v1/local/chain/run_cnn_e2eali_1b.sh new file mode 100755 index 00000000000..6d8cca876bf --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn_e2eali_1b.sh @@ -0,0 +1,244 @@ +#!/bin/bash + +# e2eali_1b is the same as e2eali_1a but uses unconstrained egs + +# local/chain/compare_wer.sh /home/hhadian/kaldi-rnnlm/egs/iam/v1/exp/chain/cnn_e2eali_1a exp/chain/cnn_e2eali_1b +# System cnn_e2eali_1a cnn_e2eali_1b +# WER 12.79 12.23 +# CER 5.73 5.48 +# Final train prob -0.0556 -0.0367 +# Final valid prob -0.0795 -0.0592 +# Final train prob (xent) -0.9178 -0.8382 +# Final valid prob (xent) -1.0604 -0.9853 +# Parameters 3.95M 3.95M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1b +# exp/chain/cnn_e2eali_1b: num-iters=21 nj=2..4 num-params=4.0M dim=40->360 combine=-0.038->-0.038 (over 1) xent:train/valid[13,20,final]=(-1.34,-0.967,-0.838/-1.40,-1.07,-0.985) logprob:train/valid[13,20,final]=(-0.075,-0.054,-0.037/-0.083,-0.072,-0.059) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=true +lang_test=lang_unk +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh b/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh index 65eeedcc75b..56c897137f4 100755 --- a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh +++ b/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh @@ -5,16 +5,16 @@ # local/chain/compare_wer.sh exp/chain/cnn_1a exp/chain/cnn_chainali_1c exp/chain/e2e_cnn_1a # System cnn_1a cnn_chainali_1c e2e_cnn_1a -# WER 18.58 12.84 15.46 -# CER 10.17 6.40 7.21 -# Final train prob -0.0122 -0.0120 -0.0426 -# Final valid prob -0.0999 -0.0199 -0.0724 -# Final train prob (xent) -0.5652 -0.9973 -# Final valid prob (xent) -0.9758 -1.1537 +# WER 18.52 12.72 13.87 +# CER 10.07 5.99 6.54 +# Final train prob -0.0077 -0.0291 -0.0371 +# Final valid prob -0.0970 -0.0359 -0.0636 +# Final train prob (xent) -0.5484 -0.9781 +# Final valid prob (xent) -0.9643 -1.1544 # Parameters 4.36M 3.96M 9.13M -# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/ -# exp/chain/e2e_cnn_1a/: num-iters=21 nj=2..4 num-params=9.1M dim=40->12640 combine=-0.040->-0.040 (over 1) logprob:train/valid[13,20,final]=(-0.065,-0.046,-0.043/-0.081,-0.073,-0.072) +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a +# exp/chain/e2e_cnn_1a: num-iters=21 nj=2..4 num-params=9.1M dim=40->12640 combine=-0.033->-0.033 (over 1) logprob:train/valid[13,20,final]=(-0.058,-0.042,-0.035/-0.070,-0.064,-0.059) set -e @@ -34,8 +34,8 @@ common_egs_dir= l2_regularize=0.00005 frames_per_iter=1000000 cmvn_opts="--norm-means=true --norm-vars=true" -train_set=train_e2e -lang_test=lang_test +train_set=train +lang_test=lang_unk # End configuration section. 
echo "$0 $@" # Print the command line for logging @@ -74,19 +74,24 @@ if [ $stage -le 1 ]; then --shared-phones true \ --type biphone \ data/$train_set $lang $treedir - cp exp/chain/e2e_base/phone_lm.fst $treedir/ + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst fi if [ $stage -le 2 ]; then echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') - opts="l2-regularize=0.075" - opts_2="l2-regularize=0.075" - opts_3="l2-regularize=0.1" - common1="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" - common2="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" - common3="$opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" mkdir -p $dir/configs cat < $dir/configs/network.xconfig input dim=40 name=input @@ -98,13 +103,13 @@ if [ $stage -le 2 ]; then conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 - relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $opts_2 - relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $opts_2 - relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $opts_2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts ## adding the layers for chain branch - relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $opts_2 - output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $opts_3 + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs diff --git a/egs/iam/v1/local/check_tools.sh b/egs/iam/v1/local/check_tools.sh index aa4fe70fa64..5b4d3107d3b 100755 --- a/egs/iam/v1/local/check_tools.sh +++ b/egs/iam/v1/local/check_tools.sh @@ -18,7 +18,7 @@ [ -f ./path.sh ] && . ./path.sh set +e -command -v python3 2>/dev/null \ +command -v python3 >&/dev/null \ || { echo >&2 "python3 not found on PATH. 
You will have to install Python3, preferably >= 3.6"; exit 1; } python3 -c "import numpy" @@ -41,5 +41,3 @@ fi exit 0 - - diff --git a/egs/iam/v1/local/make_features.py b/egs/iam/v1/local/make_features.py index 8cfca5ee830..84e012daedb 100755 --- a/egs/iam/v1/local/make_features.py +++ b/egs/iam/v1/local/make_features.py @@ -124,4 +124,4 @@ def get_scaled_image(im, allowed_lengths = None): write_kaldi_matrix(out_fh, data, image_id) print('Generated features for {} images. Failed for {} (iamge too ' - 'long).'.format(num_ok, num_fail)) + 'long).'.format(num_ok, num_fail), file=sys.stderr) diff --git a/egs/iam/v1/local/prepare_data.sh b/egs/iam/v1/local/prepare_data.sh index e751d5ff71a..73d711c73f0 100755 --- a/egs/iam/v1/local/prepare_data.sh +++ b/egs/iam/v1/local/prepare_data.sh @@ -18,6 +18,7 @@ stage=0 download_dir=data/download +wellington_dir= username= password= # username and password for downloading the IAM database # if you have not already downloaded the database, please @@ -43,6 +44,7 @@ xml=data/local/xml ascii=data/local/ascii bcorpus=data/local/browncorpus lobcorpus=data/local/lobcorpus +wcorpus=data/local/wellingtoncorpus data_split_info=data/local/largeWriterIndependentTextLineRecognitionTask lines_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/lines/lines.tgz xml_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/xml/xml.tgz @@ -50,6 +52,7 @@ data_split_info_url=http://www.fki.inf.unibe.ch/DBs/iamDB/tasks/largeWriterIndep ascii_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/ascii/ascii.tgz brown_corpus_url=http://www.sls.hawaii.edu/bley-vroman/brown.txt lob_corpus_url=http://ota.ox.ac.uk/text/0167.zip +wellington_corpus_loc=/export/corpora5/Wellington/WWC/ mkdir -p $download_dir data/local # download and extact images and transcription @@ -124,6 +127,23 @@ else echo "$0: Done downloading the Brown text corpus" fi +if [ -d $wcorpus ]; then + echo "$0: Not copying Wellington corpus as it is already there." +elif [ ! -z $wellington_dir ]; then + mkdir -p $wcorpus + cp -r $wellington_dir/. $wcorpus + + # Combine Wellington corpora and replace some of their annotations + cat data/local/wellingtoncorpus/Section{A,B,C,D,E,F,G,H,J,K,L}.txt | \ + cut -d' ' -f3- | sed "s/^[ \t]*//" > data/local/wellingtoncorpus/Wellington_annotated.txt + + cat data/local/wellingtoncorpus/Wellington_annotated.txt | local/remove_wellington_annotations.py > data/local/wellingtoncorpus/Wellington_annotation_removed.txt + + echo "$0: Done copying Wellington corpus" +else + echo "$0: Wellington Corpus not included because wellington_dir not provided" +fi + mkdir -p data/{train,test,val} file_name=largeWriterIndependentTextLineRecognitionTask diff --git a/egs/iam/v1/local/prepare_dict.sh b/egs/iam/v1/local/prepare_dict.sh index 8b981de3abd..f691d577fba 100755 --- a/egs/iam/v1/local/prepare_dict.sh +++ b/egs/iam/v1/local/prepare_dict.sh @@ -8,6 +8,9 @@ set -e dir=data/local/dict +vocab_size=50000 +. 
./utils/parse_options.sh + mkdir -p $dir # First get the set of all letters that occur in data/train/text @@ -22,7 +25,7 @@ cat data/train/text | \ export letters=$(cat $dir/nonsilence_phones.txt | tr -d "\n") -cat data/local/local_lm/data/wordlist | \ +head -n $vocab_size data/local/local_lm/data/word_count | awk '{print $2}' | \ perl -e '$letters=$ENV{letters}; while(<>){ chop; diff --git a/egs/iam/v1/local/remove_wellington_annotations.py b/egs/iam/v1/local/remove_wellington_annotations.py new file mode 100755 index 00000000000..260a3542985 --- /dev/null +++ b/egs/iam/v1/local/remove_wellington_annotations.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +# Copyright 2018 Chun-Chieh Chang + +import sys +import io +import re +from collections import OrderedDict + +sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf8"); +sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8"); + +prev2_line = " "; +prev_line = " "; +for line in sys.stdin: + line = line.strip() + pattern = re.compile("\\*\\*\\[.*?\\*\\*\\]|\\*[0-9]|\\\\[0-9]{0,2}|\\*\\*?[\|,\?,\#,\=,\;,\:,\<,\>]|\||\^") + line_fixed = pattern.sub("", line) + dict=OrderedDict([("*+$","$"), ("*+","£"), ("*-","-"), ("*/","*"), ("*{","{"), ("*}","}"), + ("**\"","\""), ("*\"","\""), ("**'","'"), ("*'","'"), ("*@","°")]) + pattern = re.compile("|".join(re.escape(key) for key in dict.keys())); + line_fixed = pattern.sub(lambda x: dict[x.group()], line_fixed) + + line_fixed = prev2_line + "\n" + prev_line + "\n" + line_fixed + + pattern = re.compile("\{[0-9]{0,2}(.*?)\}", re.DOTALL) + line_fixed = pattern.sub(lambda x: x.group(1), line_fixed) + + output, prev2_line, prev_line = line_fixed.split("\n") + + sys.stdout.write(output + "\n") +sys.stdout.write(prev2_line + "\n") +sys.stdout.write(prev_line + "\n") diff --git a/egs/iam/v1/local/train_lm.sh b/egs/iam/v1/local/train_lm.sh index a673c5b3f2d..a15fbea2af3 100755 --- a/egs/iam/v1/local/train_lm.sh +++ b/egs/iam/v1/local/train_lm.sh @@ -61,7 +61,10 @@ if [ $stage -le 0 ]; then cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt | \ local/remove_test_utterances_from_lob.py data/test/text data/val/text \ > ${dir}/data/text/lob.txt - cat data/local/browncorpus/brown.txt >> ${dir}/data/text/brown.txt + cat data/local/browncorpus/brown.txt > ${dir}/data/text/brown.txt + if [ -d "data/local/wellingtoncorpus" ]; then + cat data/local/wellingtoncorpus/Wellington_annotation_removed.txt > ${dir}/data/text/wellington.txt + fi # use the validation data as the dev set. # Note: the name 'dev' is treated specially by pocolm, it automatically @@ -81,7 +84,12 @@ if [ $stage -le 0 ]; then cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt # get the wordlist from IAM text - cat ${dir}/data/text/{iam,lob,brown}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + if [ -d "data/local/wellingtoncorpus" ]; then + cat ${dir}/data/text/{iam,lob,brown,wellington}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + else + echo "$0: Wellington Corpus not found. Proceeding without using that corpus." 
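+    # As in the branch above, this writes ${dir}/data/word_count with one
+    # "<count> <word>" pair per line, sorted by descending count (e.g. a line
+    # like "51234 the" -- the count shown is only illustrative); the wordlist
+    # below is then just the word field of the top $vocab_size entries.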
+ cat ${dir}/data/text/{iam,lob,brown}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + fi head -n $vocab_size ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist fi @@ -108,7 +116,6 @@ if [ $stage -le 1 ]; then ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' - #log-prob: -5.05603614242 [perplexity = 156.967086371] over 19477.0 words fi if [ $stage -le 2 ]; then @@ -118,9 +125,6 @@ if [ $stage -le 2 ]; then prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' - # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_3_prune_big was -5.06654404785 per word [perplexity = 158.625177948] over 19477.0 words - # current results, after adding --limit-unk-history=true: - mkdir -p ${dir}/data/arpa format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz @@ -134,9 +138,6 @@ if [ $stage -le 3 ]; then prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' - # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_3_prune_small was -5.24719139498 per word [perplexity = 190.031793995] over 19477.0 words - # current results, after adding --limit-unk-history=true (needed for modeling OOVs and not blowing up LG.fst): - format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz fi diff --git a/egs/iam/v1/local/unk_arc_post_to_transcription.py b/egs/iam/v1/local/unk_arc_post_to_transcription.py index c86d35e4b8a..c5ad1235427 100755 --- a/egs/iam/v1/local/unk_arc_post_to_transcription.py +++ b/egs/iam/v1/local/unk_arc_post_to_transcription.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright 2017 Ashish Arora @@ -12,20 +12,22 @@ parser.add_argument('--input-ark', type=str, default='-', help='where to read the input data') parser.add_argument('--out-ark', type=str, default='-', help='where to write the output data') args = parser.parse_args() + + ### main ### -phone_fh = open(args.phones, 'r') -word_fh = open(args.words, 'r') -unk_fh = open(args.unk,'r') +phone_fh = open(args.phones, 'r', encoding='latin-1') +word_fh = open(args.words, 'r', encoding='latin-1') +unk_fh = open(args.unk, 'r', encoding='latin-1') if args.input_ark == '-': input_fh = sys.stdin else: - input_fh = open(args.input_ark,'r') + input_fh = open(args.input_ark, 'r', encoding='latin-1') if args.out_ark == '-': out_fh = sys.stdout else: - out_fh = open(args.out_ark,'wb') + out_fh = open(args.out_ark, 'w', encoding='latin-1') -phone_dict = dict()# stores phoneID and phone mapping +phone_dict = dict() # Stores phoneID and phone mapping phone_data_vect = phone_fh.read().strip().split("\n") for key_val in phone_data_vect: key_val = key_val.split(" ") @@ -38,14 +40,14 @@ unk_val = unk_fh.read().strip().split(" ")[0] utt_word_dict = dict() -utt_phone_dict = dict()# stores utteranceID and phoneID +utt_phone_dict = dict() # Stores utteranceID and phoneID unk_word_dict = dict() count=0 for line in input_fh: 
line_vect = line.strip().split("\t") if len(line_vect) < 6: - print "IndexError" - print line_vect + print("Bad line: '{}' Expecting 6 fields. Skipping...".format(line), + file=sys.stderr) continue uttID = line_vect[0] word = line_vect[4] @@ -59,7 +61,7 @@ utt_phone_dict[uttID] = dict() utt_word_dict[uttID][count] = word utt_phone_dict[uttID][count] = phones - if word == unk_val: # get character sequence for unk + if word == unk_val: # Get character sequence for unk phone_key_vect = phones.split(" ") phone_val_vect = list() for pkey in phone_key_vect: @@ -78,9 +80,9 @@ count += 1 transcription = "" -for key in sorted(utt_word_dict.iterkeys()): +for key in sorted(utt_word_dict.keys()): transcription = key - for index in sorted(utt_word_dict[key].iterkeys()): + for index in sorted(utt_word_dict[key].keys()): value = utt_word_dict[key][index] transcription = transcription + " " + value out_fh.write(transcription + '\n') diff --git a/egs/iam/v1/run.sh b/egs/iam/v1/run.sh index f5c4a2b8f80..b943870f530 100755 --- a/egs/iam/v1/run.sh +++ b/egs/iam/v1/run.sh @@ -7,6 +7,7 @@ set -e stage=0 nj=20 +decode_gmm=false username= password= # iam_database points to the database path on the JHU grid. If you have not @@ -14,6 +15,11 @@ password= # like "data/download" and follow the instructions # in "local/prepare_data.sh" to download the database: iam_database=/export/corpora5/handwriting_ocr/IAM +# wellington_database points to the database path on the JHU grid. The Wellington +# corpus contains two directories WWC and WSC (Wellington Written and Spoken Corpus). +# This corpus is of written NZ English that can be purchased here: +# "https://www.victoria.ac.nz/lals/resources/corpora-default" +wellington_database=/export/corpora5/Wellington/WWC/ . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. @@ -26,6 +32,7 @@ iam_database=/export/corpora5/handwriting_ocr/IAM if [ $stage -le 0 ]; then echo "$0: Preparing data..." local/prepare_data.sh --download-dir "$iam_database" \ + --wellington-dir "$wellington_database" \ --username "$username" --password "$password" fi mkdir -p data/{train,test}/data @@ -44,22 +51,32 @@ if [ $stage -le 2 ]; then echo "$0: Estimating a language model for decoding..." # We do this stage before dict preparation because prepare_dict.sh # generates the lexicon from pocolm's wordlist - local/train_lm.sh --vocab-size 50000 + local/train_lm.sh --vocab-size 50k fi if [ $stage -le 3 ]; then echo "$0: Preparing dictionary and lang..." - local/prepare_dict.sh + + # This is for training. Use a large vocab size, e.g. 500k to include all the + # training words: + local/prepare_dict.sh --vocab-size 500k --dir data/local/dict # this is for training utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \ data/local/dict "" data/lang/temp data/lang - utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_big.arpa.gz \ - data/local/dict/lexicon.txt data/lang_test + + # This is for decoding. We use a 50k lexicon to be consistent with the papers + # reporting WERs on IAM: + local/prepare_dict.sh --vocab-size 50k --dir data/local/dict_50k # this is for decoding + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \ + data/local/dict_50k "" data/lang_test/temp data/lang_test + utils/format_lm.sh data/lang_test data/local/local_lm/data/arpa/3gram_big.arpa.gz \ + data/local/dict_50k/lexicon.txt data/lang_test + echo "$0: Preparing the unk model for open-vocab decoding..." 
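+  # Roughly, make_unk_lm.sh estimates a small n-gram model (here 4-gram) over
+  # the letter sequences of the lexicon entries and writes it as
+  # exp/unk_lang_model/unk_fst.txt; passing that to prepare_lang.sh via
+  # --unk-fst lets <unk> expand to letter sequences, so out-of-vocabulary
+  # words can still be decoded.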
utils/lang/make_unk_lm.sh --ngram-order 4 --num-extra-ngrams 7500 \ - data/local/dict exp/unk_lang_model + data/local/dict_50k exp/unk_lang_model utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 \ --unk-fst exp/unk_lang_model/unk_fst.txt \ - data/local/dict "" data/local/temp data/lang_unk + data/local/dict_50k "" data/lang_unk/temp data/lang_unk cp data/lang_test/G.fst data/lang_unk/G.fst fi @@ -68,7 +85,7 @@ if [ $stage -le 4 ]; then data/lang exp/mono fi -if [ $stage -le 5 ]; then +if [ $stage -le 5 ] && $decode_gmm; then utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph steps/decode.sh --nj $nj --cmd $cmd exp/mono/graph data/test \ @@ -83,7 +100,7 @@ if [ $stage -le 6 ]; then exp/mono_ali exp/tri fi -if [ $stage -le 7 ]; then +if [ $stage -le 7 ] && $decode_gmm; then utils/mkgraph.sh data/lang_test exp/tri exp/tri/graph steps/decode.sh --nj $nj --cmd $cmd exp/tri/graph data/test \ @@ -99,7 +116,7 @@ if [ $stage -le 8 ]; then data/train data/lang exp/tri_ali exp/tri2 fi -if [ $stage -le 9 ]; then +if [ $stage -le 9 ] && $decode_gmm; then utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph steps/decode.sh --nj $nj --cmd $cmd exp/tri2/graph \ @@ -114,7 +131,7 @@ if [ $stage -le 10 ]; then data/train data/lang exp/tri2_ali exp/tri3 fi -if [ $stage -le 11 ]; then +if [ $stage -le 11 ] && $decode_gmm; then utils/mkgraph.sh data/lang_test exp/tri3 exp/tri3/graph steps/decode_fmllr.sh --nj $nj --cmd $cmd exp/tri3/graph \ @@ -127,9 +144,9 @@ if [ $stage -le 12 ]; then fi if [ $stage -le 13 ]; then - local/chain/run_cnn_1a.sh + local/chain/run_cnn_1a.sh --lang-test lang_unk fi if [ $stage -le 14 ]; then - local/chain/run_cnn_chainali_1b.sh --chain-model-dir exp/chain/cnn_1a --stage 2 + local/chain/run_cnn_chainali_1c.sh --chain-model-dir exp/chain/cnn_1a --stage 2 fi diff --git a/egs/iam/v1/run_end2end.sh b/egs/iam/v1/run_end2end.sh index d479bfa2a73..6df93e739f4 100755 --- a/egs/iam/v1/run_end2end.sh +++ b/egs/iam/v1/run_end2end.sh @@ -11,6 +11,11 @@ password= # like "data/download" and follow the instructions # in "local/prepare_data.sh" to download the database: iam_database=/export/corpora5/handwriting_ocr/IAM +# wellington_database points to the database path on the JHU grid. The Wellington +# corpus contains two directories WWC and WSC (Wellington Written and Spoken Corpus). +# This corpus is of written NZ English that can be purchased here: +# "https://www.victoria.ac.nz/lals/resources/corpora-default" +wellington_database=/export/corpora5/Wellington/WWC/ . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. @@ -24,12 +29,13 @@ iam_database=/export/corpora5/handwriting_ocr/IAM if [ $stage -le 0 ]; then echo "$0: Preparing data..." local/prepare_data.sh --download-dir "$iam_database" \ + --wellington-dir "$wellington_database" \ --username "$username" --password "$password" fi mkdir -p data/{train,test}/data if [ $stage -le 1 ]; then - get_image2num_frames.py data/train # This will be needed for the next command + image/get_image2num_frames.py data/train # This will be needed for the next command # The next command creates a "allowed_lengths.txt" file in data/train # which will be used by local/make_features.py to enforce the images to # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. @@ -45,32 +51,51 @@ if [ $stage -le 1 ]; then fi if [ $stage -le 2 ]; then + echo "$0: Estimating a language model for decoding..." 
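+  # local/train_lm.sh trains pocolm 3-gram LMs on the IAM training transcripts
+  # plus the LOB, Brown and (if provided) Wellington corpora, and writes the
+  # pruned ARPA LMs under data/local/local_lm/data/arpa/.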
+ # We do this stage before dict preparation because prepare_dict.sh + # generates the lexicon from pocolm's wordlist + local/train_lm.sh --vocab-size 50k +fi + +if [ $stage -le 3 ]; then echo "$0: Preparing dictionary and lang..." - local/prepare_dict.sh + + # This is for training. Use a large vocab size, e.g. 500k to include all the + # training words: + local/prepare_dict.sh --vocab-size 500k --dir data/local/dict utils/prepare_lang.sh --sil-prob 0.95 \ data/local/dict "" data/lang/temp data/lang -fi -if [ $stage -le 3 ]; then - echo "$0: Estimating a language model for decoding..." - local/train_lm.sh - utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_big.arpa.gz \ - data/local/dict/lexicon.txt data/lang_test -fi + # This is for decoding. We use a 50k lexicon to be consistent with the papers + # reporting WERs on IAM. + local/prepare_dict.sh --vocab-size 50k --dir data/local/dict_50k + utils/prepare_lang.sh --sil-prob 0.95 data/local/dict_50k \ + "" data/lang_test/temp data/lang_test + utils/format_lm.sh data/lang_test data/local/local_lm/data/arpa/3gram_big.arpa.gz \ + data/local/dict_50k/lexicon.txt data/lang_test + echo "$0: Preparing the unk model for open-vocab decoding..." + utils/lang/make_unk_lm.sh --ngram-order 4 --num-extra-ngrams 7500 \ + data/local/dict_50k exp/unk_lang_model + utils/prepare_lang.sh --unk-fst exp/unk_lang_model/unk_fst.txt \ + data/local/dict_50k "" data/lang_unk/temp data/lang_unk + cp data/lang_test/G.fst data/lang_unk/G.fst +fi if [ $stage -le 4 ]; then - echo "$0: estimating phone language model for the denominator graph" - mkdir -p exp/chain/e2e_base/log - $cmd exp/chain/e2e_base/log/make_phone_lm.log \ - cat data/train/text \| \ - steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ - utils/sym2int.pl -f 2- data/lang/phones.txt \| \ - chain-est-phone-lm --num-extra-lm-states=1000 \ - ark:- exp/chain/e2e_base/phone_lm.fst + echo "$0: Calling the flat-start chain recipe..." + local/chain/run_flatstart_cnn1a.sh fi if [ $stage -le 5 ]; then - echo "$0: calling the flat-start chain recipe..." - local/chain/run_flatstart_cnn1a.sh + echo "$0: Aligning the training data using the e2e chain model..." + steps/nnet3/align.sh --nj 50 --cmd "$cmd" \ + --use-gpu false \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ + data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train +fi + +if [ $stage -le 6 ]; then + echo "$0: Building a tree and training a regular chain model using the e2e alignments..." + local/chain/run_cnn_e2eali_1a.sh fi diff --git a/egs/iban/s5/local/chain/run_tdnn.sh b/egs/iban/s5/local/chain/run_tdnn.sh new file mode 120000 index 00000000000..34499362831 --- /dev/null +++ b/egs/iban/s5/local/chain/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1a.sh \ No newline at end of file diff --git a/egs/iban/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/iban/s5/local/chain/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..d320f49d3aa --- /dev/null +++ b/egs/iban/s5/local/chain/tuning/run_tdnn_1a.sh @@ -0,0 +1,263 @@ +#!/bin/bash + +# Copyright 2017-2018 Johns Hopkins University (author: Daniel Povey) +# 2017-2018 Yiming Wang + +# 1a is trying an architecture with factored parameter matrices with dropout. 
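+# This script is normally invoked via the local/chain/run_tdnn.sh symlink from
+# egs/iban/s5, after the GMM stages of run.sh have produced the tri3b system it
+# assumes (see the gmm= option below).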
+ +# cat exp/chain/tdnn_1a/decode_dev/scoring_kaldi/best_wer +# %WER 18.29 [ 2013 / 11006, 243 ins, 378 del, 1392 sub ] exp/chain/tdnn_1a/decode_dev/wer_10_0.0 +# cat exp/chain/tdnn_1a/decode_dev.rescored/scoring_kaldi/best_wer +# %WER 16.40 [ 1805 / 11006, 218 ins, 347 del, 1240 sub ] exp/chain/tdnn_1a/decode_dev.rescored/wer_10_0.0 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_1a +# exp/chain/tdnn_1a: num-iters=38 nj=2..5 num-params=12.6M dim=40+50->1592 combine=-0.069->-0.067 (over 2) xent:train/valid[24,37,final]=(-1.41,-1.18,-1.12/-1.68,-1.54,-1.47) logprob:train/valid[24,37,final]=(-0.071,-0.057,-0.053/-0.124,-0.122,-0.121) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train +test_sets="dev" +gmm=tri3b + +# Options which are not passed through to run_ivector_common.sh +affix=1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +get_egs_stage=-10 +xent_regularize=0.1 + +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +num_epochs=15 + +# training options +srand=0 +remove_egs=true + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 10 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 50 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 12 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.08 dropout-per-dim-continuous=true" + output_opts="l2-regularize=0.02 bottleneck-dim=256" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=50 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=768 + relu-batchnorm-dropout-layer name=tdnn2 $opts dim=768 input=Append(-1,0,1) + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=768 + relu-batchnorm-dropout-layer name=tdnn4 $opts dim=768 input=Append(-1,0,1) + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=768 + relu-batchnorm-dropout-layer name=tdnn6 $opts dim=768 input=Append(-3,0,3) + relu-batchnorm-dropout-layer name=tdnn7 $opts dim=768 input=Append(-3,0,3) + relu-batchnorm-dropout-layer name=tdnn8 $opts dim=768 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain $opts dim=768 + output-layer name=output include-log-softmax=false $output_opts dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
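+  # For example, with the default xent_regularize=0.1 set above, the
+  # learning_rate_factor computed earlier is 0.5/0.1 = 5.0, i.e. the xent
+  # output layer trains at 5x the base learning rate.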
+ relu-batchnorm-layer name=prefinal-xent input=tdnn8 $opts dim=768 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 13 ]; then + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=$num_epochs \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=0 \ + --egs.chunk-right-context=0 \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $tree_dir $tree_dir/graph || exit 1; +fi + +if [ $stage -le 15 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 1592 combine=-0.062->-0.061 (over 2) xent:train/valid[24,37,final]=(-1.28,-1.03,-0.988/-1.61,-1.43,-1.36) logprob:train/valid[24,37,final]=(-0.069,-0.053,-0.049/-0.128,-0.124,-0.120) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train +test_sets="dev" +gmm=tri3b + +# Options which are not passed through to run_ivector_common.sh +affix=1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +get_egs_stage=-10 +xent_regularize=0.1 + +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +num_epochs=15 + +# training options +srand=0 +remove_egs=true + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 10 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 50 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 12 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.08 dropout-per-dim=true dropout-per-dim-continuous=true" + linear_opts="orthonormal-constraint=-1.0" + output_opts="l2-regularize=0.04" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=50 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=768 + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn2 $opts input=Append(0,1) dim=768 + linear-component name=tdnn3l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=768 + linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn4 $opts input=Append(0,1) dim=768 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=768 input=Append(0, tdnn3l) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn6 $opts input=Append(0,3) dim=1024 + linear-component name=tdnn7l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=768 + linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn8 $opts input=Append(0,3) dim=1024 + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn5l) dim=768 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn10 $opts input=Append(0,3) dim=1024 + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn9l,tdnn7l) dim=768 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1024 + output-layer name=output include-log-softmax=false dim=$num_targets bottleneck-dim=256 max-change=1.5 $output_opts + + # adding the 
layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1024 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor bottleneck-dim=256 max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 13 ]; then + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=$num_epochs \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=0 \ + --egs.chunk-right-context=0 \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). 
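+  # The graph is written to $tree_dir/graph (rather than under $dir) so that it
+  # can be shared by other chain models built on the same tree; --self-loop-scale
+  # 1.0 matches the acoustic scaling conventionally used for 'chain' models.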
+ utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $tree_dir $tree_dir/graph || exit 1; +fi + +if [ $stage -le 15 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $dir/configs/network.xconfig + input dim=50 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-renorm-layer name=tdnn1 input=Append(-1,0,1) dim=256 + relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=256 + relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=256 + relu-renorm-layer name=tdnn4 input=Append(-1,0,1) dim=256 + relu-renorm-layer name=tdnn5 input=Append(-1,0,1) dim=256 + relu-renorm-layer name=tdnn6 dim=256 + + output-layer name=output input=tdnn6 dim=$num_targets max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + + +if [ $stage -le 9 ]; then + + steps/nnet3/train_dnn.py --stage $train_stage \ + --cmd="$decode_cmd" \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 4 \ + --trainer.num-epochs 3 \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --trainer.optimization.initial-effective-lrate 0.005 \ + --trainer.optimization.final-effective-lrate 0.0005 \ + --trainer.samples-per-iter 120000 \ + --egs.dir "$common_egs_dir" \ + --cleanup.preserve-model-interval 10 \ + --cleanup.remove-egs=$remove_egs \ + --feat-dir=$train_data_dir \ + --ali-dir=$ali_dir \ + --lang data/lang \ + --dir=$dir || exit 1; +fi + + +if [ $stage -le 10 ]; then + # this does offline decoding that should give the same results as the real + # online decoding. + graph_dir=exp/tri3b/graph + # use already-built graphs. + steps/nnet3/decode.sh --nj 6 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_dev_hires --iter final \ + $graph_dir data/dev_hires $dir/decode_dev || exit 1; +fi + +if [ $stage -le 11 ]; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_test/ data/lang_big/ data/dev_hires \ + ${dir}/decode_dev ${dir}/decode_dev.rescored +fi + +exit 0; + diff --git a/egs/ifnenit/v1/README.txt b/egs/ifnenit/v1/README.txt new file mode 100644 index 00000000000..e847680668d --- /dev/null +++ b/egs/ifnenit/v1/README.txt @@ -0,0 +1,3 @@ +This directory contains example scripts for handwriting recognition on +the Arabic IFN/ENIT dataset: http://www.ifnenit.com +You'll need to register at their website to be able to download the dataset. \ No newline at end of file diff --git a/egs/ifnenit/v1/RESULTS.txt b/egs/ifnenit/v1/RESULTS.txt new file mode 100644 index 00000000000..410c5d7a049 --- /dev/null +++ b/egs/ifnenit/v1/RESULTS.txt @@ -0,0 +1,3 @@ +WER% abc-d abcd-e abcde-f abcde-s +cnn_1a 1.46% 4.18% 5.79% 9.15% +cnn_chainali_1b 1.96% 3.28% 4.80% 7.63% diff --git a/egs/ifnenit/v1/cmd.sh b/egs/ifnenit/v1/cmd.sh new file mode 100644 index 00000000000..8c01fd76a3d --- /dev/null +++ b/egs/ifnenit/v1/cmd.sh @@ -0,0 +1,16 @@ +# you can change cmd.sh depending on what type of queue you are using. 
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export cmd="queue.pl" + + + diff --git a/egs/ifnenit/v1/conf/decode.config b/egs/ifnenit/v1/conf/decode.config new file mode 100644 index 00000000000..e69de29bb2d diff --git a/egs/ifnenit/v1/config/extra_questions.txt b/egs/ifnenit/v1/config/extra_questions.txt new file mode 100755 index 00000000000..04e8eac75fc --- /dev/null +++ b/egs/ifnenit/v1/config/extra_questions.txt @@ -0,0 +1,60 @@ +seMllL seM seE naMllL naM taMllL taM taE baM baMllL baE thM thE yaM yaMllL shMllL shM shE +seBllL seB seA naB naBllL taB taA baB baBllL baA thB yaB yaBllL thA shBllL shB shA +teE heE +teA heA +alE eeE yaEllL yaE +alA eeA yaA +laMllL laM keE +laB keA +ayB ghB hhA +ayM ayMllL ghM +ayE ghE +ayA ghA +aaE amE aeE ahE +aaA amA aeA ahA +toMllL toM dzM saMllL saM deM toEllL toE dzE saE deE +toB dzB saB deBllL deB toA dzA saA deA +raEllL raE zaEllL zaE +raAllL raA zaA +waEllL waE whE +waA waAllL whA +faMllL faM kaMllL kaM faE kaE +faB kaB kaBllL faA kaA +jaB haB khBllL khB +jaM jaMllL haM haMllL khM +jaE haE khE +jaA haA khA +daA daAllL dhA +daE daEllL dhEllL dhE +waEllL waE whE raEllL raE zaEllL zaE waAllL waA whA raAllL raA zaA +waEllL waE whE raEllL raE zaEllL zaE waAllL waA whA raAllL raA zaA +naE naA laEllL laE laA +seBllL seB seMllL seM naBllL naB naMllL naM taB taMllL taM baB baBllL baMllL baM thB thM yaB yaBllL yaM yaMllL shBllL shB shMllL shM +aaE amE aeE ahE aaA amA aeA ahA +alA eeA yaA alE eeE yaEllL yaE +jaB haB khBllL khB jaM jaMllL haMllL haM khM +jaE haE khE jaA haA khA +ayB ghB +ayM ayMllL ghM +ayE ghE +ayA ghA +daAllL daA dhA daEllL daE dhEllL dhE +saMllL saM deM saB deBllL deB +toMllL toM dzM toB dzB +seE seA shE shA saE saA deE deA +0A 1A 2A 6A 7A 8A 9A +0A 1A 2A 6A 7A 8A 9A +aaElaB aaElaM +aaElaB aaElaM +haMlaB haMmaMlaB haMllLnaB haMnaB +haMlaB haMmaMlaB haMllLnaB haMnaB +heBllL heB heM +keBllL keB keMllL keM +keBllL keB +keMllL keM +maMllL maM maBllL maB +maMllL maM maE +maBllL maB maAllL maA +maAllL maA maE +ayMllL baBllL baMllL daAllL daEllL deBllL dhEllL faMllL haMllL haMllLnaB heBllL jaMllL kaBllL kaMllL keBllL keMllL khBllL laEllL laMllL maAllL maBllL maMllL naBllL naMllL raAllL raEllL saMllL seBllL seMllL shBllL shMllL taMllL toEllL toMllL waAllL waEllL yaBllL yaEllL yaMllL zaEllL zaMllL +ayMllL baBllL baMllL daAllL daEllL deBllL dhEllL faMllL haMllL haMllLnaB heBllL jaMllL kaBllL kaMllL keBllL keMllL khBllL laEllL laMllL maAllL maBllL maMllL naBllL naMllL raAllL raEllL saMllL seBllL seMllL shBllL shMllL taMllL toEllL toMllL waAllL waEllL yaBllL yaEllL yaMllL zaEllL zaMllL diff --git a/egs/ifnenit/v1/config/ligatures b/egs/ifnenit/v1/config/ligatures new file mode 100644 index 00000000000..af36d636901 --- /dev/null +++ b/egs/ifnenit/v1/config/ligatures @@ -0,0 +1,69 @@ +bfLE bB nE +bfLE bM nE +bhLE bB hE +bhLE bM hE +bhLM bB hM +bhLM bM hM +bjLE bB jE +bjLE bM jE +bjLM bB 
jM +bjLM bM jM +bmLE bB mE +bmLE bM mE +bmLM bB mM +bmLM bM mM +fjLE fB jE +fjLE fM jE +fjLM fB jM +fjLM fM jM +hjLE hB jE +hjLE hM jE +hjLM hB jM +hjLM hM jM +ijLE iB jE +ijLE iM jE +ijLM iB jM +ijLM iM jM +jmLE jB mE +jmLE jM mE +jmLM jB mM +jmLM jM mM +kjLE kB jE +kjLE kM jE +kjLM kB jM +kjLM kM jM +kmLE kB mE +kmLE kM mE +kmLM kB mM +kmLM kM mM +lhLE lB hE +lhLE lM hE +lhLM lB hM +lhLM lM hM +ljLE lB jE +ljLE lM jE +ljLM lB jM +ljLM lM jM +lljLE lB lM jE +lljLE lM lM jE +lljLM lB lM jM +lljLM lM lM jM +llmLM lB lM mM +llmLM lM lM mM +lmLE lB mE +lmLE lM mE +lmLM lB mM +lmLM lM mM +mjLE mB jE +mjLE mM jE +mjLM mB jM +mjLM mM jM +mohammad maB haM maM daE +ojLE oB jE +ojLE oM jE +ojLM oB jM +ojLM oM jM +sjLE sB jE +sjLE sM jE +sjLM sB jM +sjLM sM jM diff --git a/egs/ifnenit/v1/image b/egs/ifnenit/v1/image new file mode 120000 index 00000000000..1668ee99922 --- /dev/null +++ b/egs/ifnenit/v1/image @@ -0,0 +1 @@ +../../cifar/v1/image/ \ No newline at end of file diff --git a/egs/ifnenit/v1/local/add_ligature_variants.py b/egs/ifnenit/v1/local/add_ligature_variants.py new file mode 100755 index 00000000000..c98fc08d8f3 --- /dev/null +++ b/egs/ifnenit/v1/local/add_ligature_variants.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import os, sys, io + +# This script is originally from qatip project (http://qatsdemo.cloudapp.net/qatip/demo/) +# of Qatar Computing Research Institute (http://qcri.qa/) + +# This script adds ligatures as pronunciation variants. We allow only one ligature +# per pronunciation but apply all possible rules + +classMap = { + 'hh': 'x', + 'am': 'a', + 'ae': 'a', + 'ah': 'a', + 'al': 'a', + 'aa': 'a', + 'ba': 'b', + 'te': 'x', + 'ta': 'b', + 'th': 'b', + 'ja': 'h', + 'ha': 'h', + 'kh': 'h', + 'da': 'd', + 'dh': 'd', + 'ra': 'd', + 'zy': 'd', + 'se': 's', + 'sh': 's', + 'sa': 'o', + 'de': 'o', + 'to': 't', + 'za': 't', + 'ay': 'i', + 'gh': 'i', + 'fa': 'f', + 'ka': 'f', + 'ke': 'k', + 'la': 'l', + 'ma': 'm', + 'na': 'n', + 'he': 'x', + 'wa': 'x', + 'ee': 'j', + 'ya': 'j' +} + +def match(phoneme, placeholder): + if phoneme == placeholder: + return True + if len(phoneme) < 2 or len(placeholder) < 2: + return False + p = phoneme[:-1] + if not p in classMap: + return False + return (phoneme[-1:] == placeholder[-1:]) and (classMap[p] == placeholder[:-1]) + +# Load ligature file +rules = dict() +with open(sys.argv[1], encoding="utf-8") as f: + for x in f: + parts = x.strip().split() + if len(parts) < 2 or parts[0].startswith('#'): + continue + name = parts.pop(0) + if name not in rules: + rules[name] = [] + rules[name].append(parts) + +# Read stdin +in_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +out_stream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +for line in in_stream: + out_stream.write(line) + phonemes = line.strip().split() + word = phonemes.pop(0) + for start in range(0, len(phonemes) - 1): + if phonemes[start] == 'conn' or phonemes[start] == 'sil': + continue + for ruleName in rules: + for variant in rules[ruleName]: + matched = True + for offset in range(0, len(variant)): + if not match(phonemes[start+2*offset], variant[offset]): + matched = False + break + if matched: + out_stream.write(word + " " + + ((' '.join(phonemes[0:start])) + ' ' + + ruleName + ' ' + + (' '.join(phonemes[start+2*offset+1:]))).strip() + "\n") + break diff --git a/egs/ifnenit/v1/local/chain/compare_wer.sh b/egs/ifnenit/v1/local/chain/compare_wer.sh new file mode 100755 index 00000000000..ff2a766f9e2 --- /dev/null +++ 
b/egs/ifnenit/v1/local/chain/compare_wer.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# ./local/chain/compare_wer.sh exp/chainfsf4/cnn1a_3 exp/chainfsf4/cnn1a_4 +# System cnn1a_3 cnn1a_4 +# WER 17.60 17.92 +# Final train prob -0.0112 -0.0113 +# Final valid prob -0.0961 -0.0955 +# Final train prob (xent) -0.5676 -0.5713 +# Final valid prob (xent) -0.9767 -0.9702 + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/ifnenit/v1/local/chain/run_cnn_1a.sh b/egs/ifnenit/v1/local/chain/run_cnn_1a.sh new file mode 100755 index 00000000000..b0e147d157b --- /dev/null +++ b/egs/ifnenit/v1/local/chain/run_cnn_1a.sh @@ -0,0 +1,227 @@ +#!/bin/bash + +# steps/info/chain_dir_info.pl exp/chainfsf4/cnn1a_1/ +# exp/chainfsf4/cnn1a_1/: num-iters=21 nj=2..4 num-params=4.4M dim=40->380 combine=-0.033->-0.025 xent:train/valid[13,20,final]=(-1.07,-1.31,-0.560/-1.30,-1.70,-0.978) logprob:train/valid[13,20,final]=(-0.064,-0.119,-0.011/-0.115,-0.208,-0.096) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +ali=tri3_ali +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +alignment_subsampling_factor=1 +# training chunk-options +chunk_width="-1" +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=false +lang_test=lang +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
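+  # (align_fmllr_lats.sh produces lattices of alternative alignments rather than
+  # a single forced alignment; chain training uses them as numerator supervision,
+  # which is where that extra freedom comes from.)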
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj $nj --cmd "$cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + common1="required-time-offsets=0 height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="required-time-offsets=0 height-offsets=-2,-1,0,1,2 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn4 dim=$tdnn_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/ifnenit-$(date +'%m_%d_%H_%M')/v1/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=$frame_subsampling_factor \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch="100=128,64/300=64,32/500=32" \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 300 \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/ifnenit/v1/local/chain/run_cnn_chainali_1a.sh b/egs/ifnenit/v1/local/chain/run_cnn_chainali_1a.sh new file mode 100755 index 00000000000..b1f33b41a0c --- /dev/null +++ b/egs/ifnenit/v1/local/chain/run_cnn_chainali_1a.sh @@ -0,0 +1,232 @@ +#!/bin/bash + +# chainali_1b is as chainali_1a except it has 3 more cnn layers and 1 less tdnn layer. + +# steps/info/chain_dir_info.pl exp/chain/chainali_cnn_1b/ +# exp/chain/chainali_cnn_1b/: num-iters=21 nj=2..4 num-params=4.0M dim=40->364 combine=-0.009->-0.005 xent:train/valid[13,20,final]=(-1.47,-0.728,-0.623/-1.69,-1.02,-0.940) logprob:train/valid[13,20,final]=(-0.068,-0.030,-0.011/-0.086,-0.056,-0.038) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix_ali=_1a # affix for the chain model using for alignment. +affix=_1a #affix for TDNN+LSTM directory e.g. 
"1a" or "1b", in case we change the configuration. +ali=tri3_ali +chain_model_dir=exp/chain${nnet3_affix}/cnn${affix_ali} +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +alignment_subsampling_factor=1 +# training chunk-options +chunk_width="-1" +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=false +lang_test=lang +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $chain_model_dir $lat_dir + cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + # adding the layers for xent branch + # This block prints the configs 
for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/ifnenit-$(date +'%m_%d_%H_%M')/v1/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=$alignment_subsampling_factor \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch="100=128,64/300=64,32/500=32" \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
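+  # As a hedged illustration (the directory name below is hypothetical and is
+  # not created by this recipe): if you had prepared another lang directory
+  # such as data/lang_test_bg with a different LM but the same phones.txt,
+  # you could build a second graph from the same model with:
+  #   utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test_bg $dir $dir/graph_bg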
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk 300 \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/ifnenit/v1/local/ienit_initialize.sh b/egs/ifnenit/v1/local/ienit_initialize.sh new file mode 100755 index 00000000000..e9412eb715f --- /dev/null +++ b/egs/ifnenit/v1/local/ienit_initialize.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# This script is originally from qatip project (http://qatsdemo.cloudapp.net/qatip/demo/) +# of Qatar Computing Research Institute (http://qcri.qa/) + +database_dir= # directory of the dataset +train_sets= # sets for training +test_sets= # sets for testing + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +# Fetch transcriptions for Test and Train. +# If you want try different sets for test and train +# you should change this file and the "local/process_data.py" script. + +mkdir -p data/tru + +folders=$train_sets +touch tmp.flist +for set in 'train' 'test' +do + rm tmp.flist + for folder in $folders + do + echo "$folder" + cp $database_dir/$folder/tru/*.tru data/tru + ls $database_dir/$folder/png/*.png >> tmp.flist + done + cat tmp.flist | xargs -n 1 -IBLA basename BLA '.png' > tmp.uttids + cat tmp.flist | sed 's/\/png\/\([a-z0-9_]\+\).png$/\/tru\/\1.tru/' | xargs egrep '^LBL:' | iconv -f 'cp1256' -t 'UTF-8' | python3 local/remove_diacritics.py | cut -d':' -f5- | cut -d';' -f1 | paste -d' ' tmp.uttids - > data/text.$set + folders=$test_sets +done +rm tmp.flist tmp.uttids diff --git a/egs/ifnenit/v1/local/make_features.py b/egs/ifnenit/v1/local/make_features.py new file mode 100755 index 00000000000..3a485e32eb1 --- /dev/null +++ b/egs/ifnenit/v1/local/make_features.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python3 + + +""" This script converts images to Kaldi-format feature matrices. The input to + this script is the path to a data directory, e.g. "data/train". This script + reads the images listed in images.scp and writes them to standard output + (by default) as Kaldi-formatted matrices (in text form). It also scales the + images so they have the same height (via --feat-dim). It can optionally pad + the images (on left/right sides) with white pixels. + + eg. 
local/make_features.py data/train --feat-dim 40 +""" + + +import argparse +import os +import sys +import scipy.io as sio +import numpy as np +from scipy import misc +import math + +from signal import signal, SIGPIPE, SIG_DFL +signal(SIGPIPE,SIG_DFL) + +parser = argparse.ArgumentParser(description="""Generates and saves the feature vectors""") +parser.add_argument('dir', type=str, help='directory of images.scp and is also output directory') +parser.add_argument('--out-ark', type=str, default='-', help='where to write the output feature file') +parser.add_argument('--feat-dim', type=int, default=40, help='size to scale the height of all images') +parser.add_argument('--padding', type=int, default=5, help='size to scale the height of all images') +args = parser.parse_args() + + +def write_kaldi_matrix(file_handle, matrix, key): + file_handle.write(key + " [ ") + num_rows = len(matrix) + if num_rows == 0: + raise Exception("Matrix is empty") + num_cols = len(matrix[0]) + + for row_index in range(len(matrix)): + if num_cols != len(matrix[row_index]): + raise Exception("All the rows of a matrix are expected to " + "have the same length") + file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + if row_index != num_rows - 1: + file_handle.write("\n") + file_handle.write(" ]\n") + +def get_scaled_image(im): + scale_size = args.feat_dim + sx = im.shape[1] + sy = im.shape[0] + scale = (1.0 * scale_size) / sy + nx = int(scale_size) + ny = int(scale * sx) + im = misc.imresize(im, (nx, ny)) + padding_x = 0 + padding_y = 0 + for i in range(0,30): + im_x = im.shape[1] + im_y = im.shape[0] + if im_x >= (28 + (20*i)) and im_x <= (28 + (20*(i+1))): + padding_x = (30 + (20*(i+1))) - im_x + padding_y = im_y + else: + continue + im_pad = np.concatenate((255 * np.ones((padding_y, math.ceil(1.0 * padding_x / 2)) , dtype=int), im), axis=1) + im_pad1 = np.concatenate((im_pad,255 * np.ones((padding_y, int(1.0 * padding_x / 2)), dtype=int)), axis=1) + return im_pad1 + +### main ### +data_list_path = os.path.join(args.dir,'images.scp') + +if args.out_ark == '-': + out_fh = sys.stdout +else: + out_fh = open(args.out_ark,'wb') + +with open(data_list_path) as f: + for line in f: + line = line.strip() + line_vect = line.split(' ') + image_id = line_vect[0] + image_path = line_vect[1] + im = misc.imread(image_path) + im_scale = get_scaled_image(im) + im_scale_inversed = np.fliplr(im_scale) + data = np.transpose(im_scale_inversed, (1, 0)) + data = np.divide(data, 255.0) + write_kaldi_matrix(out_fh, data, image_id) diff --git a/egs/ifnenit/v1/local/make_latin_words.py b/egs/ifnenit/v1/local/make_latin_words.py new file mode 100755 index 00000000000..7abda89123e --- /dev/null +++ b/egs/ifnenit/v1/local/make_latin_words.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# This script is originally from qatip project (http://qatsdemo.cloudapp.net/qatip/demo/) +# of Qatar Computing Research Institute (http://qcri.qa/) + +# Convert unicode words to position dependent latin form. +# This script make creating lexicon very easy. 
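+# Labelling scheme, summarised from the code below for orientation: every
+# Arabic letter is mapped to a two-letter Latin code (the `map` dict), and a
+# positional suffix is appended depending on whether the letter joins its
+# neighbours: B = beginning of a connected group, M = middle, E = end,
+# A = alone.  Between consecutive letters the script emits "conn" when they
+# are joined and "sil" when they are not.  For instance, the two-letter word
+# "با" (baa' + alif, where only the baa' connects forward) should come out
+# roughly as:
+#   با sil baB conn aaE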
+ +import os, sys, io + +map = { + 'ء': 'hh', + 'آ': 'am', + 'أ': 'ae', + 'إ': 'ah', + 'ئ': 'al', + 'ا': 'aa', + 'ب': 'ba', + 'ة': 'te', + 'ت': 'ta', + 'ث': 'th', + 'ج': 'ja', + 'ح': 'ha', + 'خ': 'kh', + 'د': 'da', + 'ذ': 'dh', + 'ر': 'ra', + 'ز': 'zy', + 'س': 'se', + 'ش': 'sh', + 'ص': 'sa', + 'ض': 'de', + 'ط': 'to', + 'ظ': 'za', + 'ع': 'ay', + 'غ': 'gh', + 'ف': 'fa', + 'ق': 'ka', + 'ك': 'ke', + 'ل': 'la', + 'م': 'ma', + 'ن': 'na', + 'ه': 'he', + 'و': 'wa', + 'ى': 'ee', + 'ي': 'ya', +} + +connecting = { + 'hh': False, + 'am': False, + 'ae': False, + 'ah': False, + 'al': False, + 'aa': False, + 'ba': True, + 'te': False, + 'ta': True, + 'th': True, + 'ja': True, + 'ha': True, + 'kh': True, + 'da': False, + 'dh': False, + 'ra': False, + 'zy': False, + 'se': True, + 'sh': True, + 'sa': True, + 'de': True, + 'to': True, + 'za': True, + 'ay': True, + 'gh': True, + 'fa': True, + 'ka': True, + 'ke': True, + 'la': True, + 'ma': True, + 'na': True, + 'he': True, + 'wa': False, + 'ee': False, + 'ya': True +} + +in_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +out_stream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +for line in in_stream: + connected = False + lastChar = '' + lastType = '' + out_stream.write(line.strip()) + for char in line.strip(): + if char == '+': + continue + if char == '=': + connected = True + continue + out_stream.write((" " if lastChar else "") + lastChar + lastType + (" conn" if connected else " sil")) + if char in map: + lastChar = map[char] + if connected: + if connecting[lastChar]: + lastType="M" + else: + lastType="E" + else: + if connecting[lastChar]: + lastType="B" + else: + lastType="A" + connected=connecting[lastChar] + else: # Not in map + if char == '#': + lastChar = 'hash' + elif char == '_': + lastChar = 'uscore' + elif char == '<': + lastChar = 'ltchar' + elif char == '>': + lastChar = 'gtchar' + else: + lastChar = char + lastType = "A" + connected=False + if lastType == "M": + lastType = "E" + elif lastType == "B": + lastType = "A" + out_stream.write(" "+lastChar+lastType) + out_stream.write("\n") diff --git a/egs/ifnenit/v1/local/map_to_rareA.py b/egs/ifnenit/v1/local/map_to_rareA.py new file mode 100755 index 00000000000..b1f0dd3c0a8 --- /dev/null +++ b/egs/ifnenit/v1/local/map_to_rareA.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import os, sys, io + +# This script is originally from qatip project (http://qatsdemo.cloudapp.net/qatip/demo/) +# of Qatar Computing Research Institute (http://qcri.qa/) + +# Map unknown phonemes to "rareA" for creating lexicon.txt using phonemeset as dictionary. + +d = dict() +with open(sys.argv[1], encoding="utf-8") as f: + for x in f: + d[x.strip()] = True + +in_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +out_stream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +for line in in_stream: + str = "" + for word in line.strip().split(): + if not str: # leave first word untouched + str = word + elif not word in d: + str = str+" rareA" + else: + str = str+" "+word + out_stream.write(str.strip() + "\n") diff --git a/egs/ifnenit/v1/local/prepare_data.sh b/egs/ifnenit/v1/local/prepare_data.sh new file mode 100755 index 00000000000..ee20822d557 --- /dev/null +++ b/egs/ifnenit/v1/local/prepare_data.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# To be run from one directory above this script. +# Creat text, utt2spk, spk2utt, images.scp, and feats.scp for test and train. 
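+# The resulting data/{train,test} directories follow the usual Kaldi layout
+# (sketched here for orientation; the formats are the standard Kaldi ones,
+# not specific to this recipe):
+#   text        <utt-id> <word1> <word2> ...
+#   utt2spk     <utt-id> <writer-id>
+#   spk2utt     <writer-id> <utt-id1> <utt-id2> ...
+#   images.scp  <utt-id> <path-to-png>        (plays the role of wav.scp)
+#   feats.scp   <utt-id> <path-to-images.ark>:<offset>, written by copy-feats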
+ +database_dir= # directory of the dataset +train_sets= # sets for training +test_sets= # sets for testing + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +mkdir -p data +for set in 'train' 'test' +do + ## Clean up + if [[ -f tmp.unsorted ]] + then + rm tmp.unsorted + fi + if [ -d "data/$set" ]; then + rm -r data/$set + fi + + ## Gather transcriptions + mkdir data/$set + cat data/text.$set > tmp.unsorted + # done + cat tmp.unsorted | sort -k1 > tmp.sorted + cat tmp.sorted | cut -d' ' -f1 > data/$set/uttids + cat tmp.sorted | cut -d' ' -f2- | python3 local/remove_diacritics.py | python3 local/replace_arabic_punctuation.py | tr '+' '\\' | tr '=' '\\' | sed 's/\xA0/X/g' | sed 's/\x00\xA0/X/g' | sed 's/\xC2\xA0/X/g' | sed 's/\s\+/ /g' | sed 's/ \+$//' | sed 's/^ \+$//' | paste -d' ' data/$set/uttids - > data/$set/text + rm tmp.unsorted tmp.sorted + + local/process_data.py $database_dir data/$set --dataset $set --train_sets "$train_sets" --test_sets "$test_sets" || exit 1 + sort data/$set/images.scp -o data/$set/images.scp + sort data/$set/utt2spk -o data/$set/utt2spk + + utils/utt2spk_to_spk2utt.pl data/$set/utt2spk > data/$set/spk2utt + + mkdir -p data/{train,test}/data + + local/make_features.py data/$set --feat-dim 40 | \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:data/$set/data/images.ark,data/$set/feats.scp || exit 1 + + steps/compute_cmvn_stats.sh data/$set || exit 1; + +done diff --git a/egs/ifnenit/v1/local/prepare_dict.sh b/egs/ifnenit/v1/local/prepare_dict.sh new file mode 100755 index 00000000000..de0ee6a433c --- /dev/null +++ b/egs/ifnenit/v1/local/prepare_dict.sh @@ -0,0 +1,46 @@ +#!/bin/bash +. ./cmd.sh +. ./path.sh + +# This script is originally from qatip project (http://qatsdemo.cloudapp.net/qatip/demo/) +# of Qatar Computing Research Institute (http://qcri.qa/) + +# To be run from one directory above this script. +# Prepare the dict folder. +# Creating lexicon.txt, phonemeset, nonsilence_phones.txt, extra_questions.txt and silence_phones.txt. 
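+# Rough sketch of what ends up in data/local/dict (standard Kaldi dict-dir
+# layout, summarised from the commands below):
+#   lexicon.txt            <word> <phone1> <phone2> ...  (position-dependent Latin phones)
+#   nonsilence_phones.txt  one phone per line
+#   silence_phones.txt     ,A .A conn rareA sil
+#   optional_silence.txt   sil
+#   extra_questions.txt    phone groupings used as extra tree questions
+# The phoneme set itself is derived from the training transcripts: units seen
+# more than 50 times (or any 3-character unit) are kept, and everything else
+# is backed off to the "rareA" phone by local/map_to_rareA.py.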
+ +if [ -d "data/local" ]; then + rm -r data/local +fi + +## Determine phoneme set +mkdir -p data/local/lm +cat data/train/text | cut -d' ' -f2- | tr ' ' "\n" | sort -u > data/local/lm/train.vocab +cat data/local/lm/train.vocab | python3 local/make_latin_words.py > data/train/words2latin +cat data/train/text | cut -d' ' -f2- | python3 local/transcript_to_latin.py data/train/words2latin | cut -d' ' -f2- | tr ' ' "\n" | sort | uniq -c | awk '{if ($1 > 50 || length($2) == 3) print $2}' | fgrep -v '~A' > data/local/phonemeset + +## Lexicon and word/phoneme lists +mkdir -p data/lang/ +mkdir -p data/local/dict +echo '' > data/lang/oov.txt +cat data/train/words2latin | python3 local/map_to_rareA.py data/local/phonemeset > data/local/dict/lexicon.txt +echo " rareA" >> data/local/dict/lexicon.txt +echo "!SIL sil" >> data/local/dict/lexicon.txt + +cat data/local/phonemeset | fgrep -v '.A' | fgrep -v ',A' | fgrep -v 'conn' | fgrep -v 'sil' | sort > data/local/dict/nonsilence_phones.txt + +echo ',A' > data/local/dict/silence_phones.txt +echo '.A' >> data/local/dict/silence_phones.txt +echo 'conn' >> data/local/dict/silence_phones.txt +echo 'rareA' >> data/local/dict/silence_phones.txt +echo 'sil' >> data/local/dict/silence_phones.txt +echo 'sil' > data/local/dict/optional_silence.txt +# config folder +cat config/extra_questions.txt| python3 local/reduce_to_vocabulary.py data/local/dict/nonsilence_phones.txt | sort -u | fgrep ' ' > data/local/dict/extra_questions.txt + +mv data/local/dict/lexicon.txt data/local/dict/prelexicon.txt +# # add ligatures +cat data/local/dict/prelexicon.txt | sed 's/\s\+la[BM]\{1\}\s\+conn\s\+a[meha]\{1\}E/ laLE/g' | python3 local/add_ligature_variants.py config/ligatures > data/local/dict/lexicon.txt +cat data/local/dict/lexicon.txt| cut -d' ' -f2- | tr ' ' "\n" | sort -u > data/local/phonemeset +cat data/local/phonemeset | fgrep -v 'rare' | fgrep -v '.A' | fgrep -v ',A' | fgrep -v 'conn' | fgrep -v 'sil' | sort > data/local/dict/nonsilence_phones.txt + diff --git a/egs/ifnenit/v1/local/process_data.py b/egs/ifnenit/v1/local/process_data.py new file mode 100755 index 00000000000..1d826c98bff --- /dev/null +++ b/egs/ifnenit/v1/local/process_data.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 + +import argparse +import os +import sys +import numpy as np +from scipy import misc +import xml.dom.minidom as minidom + +parser = argparse.ArgumentParser(description="""Creates text utt2spk + and image file """) +parser.add_argument('database_path', type=str, + help='path to downloaded iam data') +parser.add_argument('out_dir', type=str, + help='where to write output files') +parser.add_argument('--train_sets', type=str, + help='sets for training') +parser.add_argument('--test_sets', type=str, + help='sets for testing') +parser.add_argument('--dataset', type=str, default='train', + choices=['train','test'], + help='choose trainset, testset, validationset1, or validationset2') +args = parser.parse_args() + +### main ### +print('processing word model') + +image_file = os.path.join(args.out_dir + '/', 'images.scp') +image_fh = open(image_file, 'w+') + +utt2spk_file = os.path.join(args.out_dir + '/', 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w+') + +text_dict = {} +utt_dict = {} +img_dict = {} + +imgs_y = [] +imgs_x = [] +sets = {} +if args.dataset == 'train': + sets = args.train_sets.split(" ") +else: + sets = args.test_sets.split(" ") +for dir_name in sorted(sets): + if( dir_name == "set_e" or dir_name == "set_f" or dir_name == "set_s"): + png_path = args.database_path + '/' + dir_name 
+ '/png' + tru_path = args.database_path + '/' + dir_name + '/tru' + for i in range(0,len(os.listdir(png_path))): + png_file_name = sorted(os.listdir(png_path))[i][:-4] + writer_id = png_file_name[0:5] + utt_id = png_file_name + image_fh.write(utt_id + ' ' + png_path + '/' + sorted(os.listdir(png_path))[i] + '\n' ) + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + else: + png_path = args.database_path + '/' + dir_name + '/png' + tru_path = args.database_path + '/' + dir_name + '/tru' + for i in range(0,len(os.listdir(png_path))): + png_file_name = sorted(os.listdir(png_path))[i][:-4] + writer_id = png_file_name[0:4] + utt_id = png_file_name + image_fh.write(utt_id + ' ' + png_path + '/' + sorted(os.listdir(png_path))[i] + '\n' ) + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + + + + + + + diff --git a/egs/ifnenit/v1/local/reduce_to_vocabulary.py b/egs/ifnenit/v1/local/reduce_to_vocabulary.py new file mode 100755 index 00000000000..a9679e49453 --- /dev/null +++ b/egs/ifnenit/v1/local/reduce_to_vocabulary.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import os, sys, io + +# This script is originally from qatip project (http://qatsdemo.cloudapp.net/qatip/demo/) +# of Qatar Computing Research Institute (http://qcri.qa/) + +# Remove all phonemes which are not in the phonemeset from extra_question.txt + +d = dict() +with open(sys.argv[1], encoding="utf-8") as f: + for x in f: + d[x.strip()] = True + +in_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +out_stream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +for line in in_stream: + str = "" + for word in line.strip().split(): + if word in d: + str = str+" "+word + out_stream.write(str.strip() + "\n") diff --git a/egs/ifnenit/v1/local/remove_diacritics.py b/egs/ifnenit/v1/local/remove_diacritics.py new file mode 100755 index 00000000000..5df0cf574da --- /dev/null +++ b/egs/ifnenit/v1/local/remove_diacritics.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 + +# This script is originally from qatip project (http://qatsdemo.cloudapp.net/qatip/demo/) +# of Qatar Computing Research Institute (http://qcri.qa/) + +# Convert unicode transcripts to Normal Form D (NFD). +# Delete Mark,Nonspacing unicode characters. + +import unicodedata +import sys, io +def strip_accents(s): + return ''.join(c for c in unicodedata.normalize('NFD', s) + if unicodedata.category(c) != 'Mn') + +in_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +out_stream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +for line in in_stream: + out_stream.write(strip_accents(line)) diff --git a/egs/ifnenit/v1/local/replace_arabic_punctuation.py b/egs/ifnenit/v1/local/replace_arabic_punctuation.py new file mode 100755 index 00000000000..81fa66a6317 --- /dev/null +++ b/egs/ifnenit/v1/local/replace_arabic_punctuation.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +import re +import sys, io + +# This script is originally from qatip project (http://qatsdemo.cloudapp.net/qatip/demo/) +# of Qatar Computing Research Institute (http://qcri.qa/) + +# Repalce Arabic Punctuations and Brackets instead of latin ones. 
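+# Design note: the chained str.replace() calls below rescan the line once per
+# mapping.  A roughly equivalent alternative (a sketch, not what this script
+# uses) would be a single translation table:
+#   table = str.maketrans({'،': ',', '؛': ':', '؟': '?'})
+#   out_stream.write(line.translate(table))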
+ +in_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +out_stream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +for line in in_stream: + out_stream.write( + line + .replace(" ", " ") + .replace("٭", "*") + .replace("×", "x") + .replace("،", ",") + .replace("؛", ":") + .replace("؟", "?") + .replace("–", "-") + .replace("‘", "'") + .replace("[", "(") + .replace("{", "(") + .replace("﴾", "(") + .replace("]", ")") + .replace("}", ")") + .replace("}", "﴿") + ) diff --git a/egs/ifnenit/v1/local/score.sh b/egs/ifnenit/v1/local/score.sh new file mode 120000 index 00000000000..6a200b42ed3 --- /dev/null +++ b/egs/ifnenit/v1/local/score.sh @@ -0,0 +1 @@ +../steps/scoring/score_kaldi_wer.sh \ No newline at end of file diff --git a/egs/ifnenit/v1/local/transcript_to_latin.py b/egs/ifnenit/v1/local/transcript_to_latin.py new file mode 100755 index 00000000000..470fbcc783b --- /dev/null +++ b/egs/ifnenit/v1/local/transcript_to_latin.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# This script is originally from qatip project (http://qatsdemo.cloudapp.net/qatip/demo/) +# of Qatar Computing Research Institute (http://qcri.qa/) + +# Convert every utterance transcript to position dependent latin format using "data/train/words2latin" as dictionary. + +import os, sys, re, io + +with open(sys.argv[1], encoding="utf-8") as f: + d = dict(x.rstrip().split(None, 1) for x in f) + +in_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +for line in in_stream: + mappedWords = [] + for word in line.split(): + mappedWords.append(d[word]) + sys.stdout.write(re.sub(" +", " ", " ~A ".join(mappedWords).strip()) + "\n") diff --git a/egs/ifnenit/v1/path.sh b/egs/ifnenit/v1/path.sh new file mode 100755 index 00000000000..0d7641cd5c1 --- /dev/null +++ b/egs/ifnenit/v1/path.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# path to Kaldi's root directory +export KALDI_ROOT=`pwd`/../../.. + +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export LD_LIBRARY_PATH=/home/dpovey/libs:$KALDI_ROOT/src/chainbin:/usr/local/lib:$LD_LIBRARY_PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/ifnenit/v1/run.sh b/egs/ifnenit/v1/run.sh new file mode 100755 index 00000000000..2df5a06161c --- /dev/null +++ b/egs/ifnenit/v1/run.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +stage=0 +nj=8 +. ./path.sh + +# ienit_database points to the database path on the JHU grid. +# you can change this to your local directory of the dataset +ienit_database="/export/b01/babak/IFN-ENIT/ifnenit_v2.0p1e/data" +train_sets="set_a set_b set_c" +test_sets="set_d" + +. ./path.sh +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. utils/parse_options.sh # e.g. this parses the --stage option if supplied. 
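+# Stage map (summarised from the blocks below), useful when resuming a
+# partial run with the --stage option, e.g.:
+#   ./run.sh --stage 12
+#  0      data preparation (ienit_initialize.sh + prepare_data.sh)
+#  1      dictionary and lang directory (prepare_dict.sh + prepare_lang.sh)
+#  2      unigram G.fst built from the training transcripts
+#  3-4    monophone training and decoding
+#  5-6    delta + delta-delta triphones (tri) and decoding
+#  7-8    LDA+MLLT (tri2) and decoding
+#  9-10   LDA+MLLT+SAT (tri3) and fMLLR decoding
+#  11     fMLLR alignments with tri3
+#  12-13  CNN chain models (run_cnn_1a.sh, then run_cnn_chainali_1a.sh)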
+ +if [ $stage -le 0 ]; then + # data preparation + echo "data preparation" + local/ienit_initialize.sh --database_dir $ienit_database \ + --train_sets "$train_sets" --test_sets "$test_sets" + + local/prepare_data.sh --database_dir $ienit_database \ + --train_sets "$train_sets" --test_sets "$test_sets" +fi + +if [ $stage -le 1 ]; then + # dict folder preparation + echo "dict folder preparation" + local/prepare_dict.sh + utils/prepare_lang.sh --num-sil-states 3 --num-nonsil-states 4 --position-dependent-phones false data/local/dict "" data/local/lang data/lang +fi + +if [ $stage -le 2 ]; then + # LM preparation + echo "LM preparation" + cat data/train/text | cut -d' ' -f2- | utils/make_unigram_grammar.pl | \ + fstcompile --isymbols=data/lang/words.txt --osymbols=data/lang/words.txt > data/lang/G.fst +fi + +if [ $stage -le 3 ]; then + steps/train_mono.sh --nj $nj data/train data/lang \ + exp/mono +fi + +if [ $stage -le 4 ]; then + utils/mkgraph.sh --mono data/lang exp/mono exp/mono/graph + + steps/decode.sh --nj $nj --cmd $cmd exp/mono/graph data/test \ + exp/mono/decode_test +fi + +if [ $stage -le 5 ]; then + steps/align_si.sh --nj $nj data/train data/lang \ + exp/mono exp/mono_ali + + steps/train_deltas.sh 500 20000 data/train data/lang \ + exp/mono_ali exp/tri +fi + +if [ $stage -le 6 ]; then + utils/mkgraph.sh data/lang exp/tri exp/tri/graph + + steps/decode.sh --nj $nj --cmd $cmd exp/tri/graph data/test \ + exp/tri/decode_test +fi + +if [ $stage -le 7 ]; then + steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ + exp/mono exp/mono_ali + + steps/train_lda_mllt.sh --cmd $cmd \ + --splice-opts "--left-context=3 --right-context=3" 500 20000 \ + data/train data/lang exp/mono_ali exp/tri2 +fi + +if [ $stage -le 8 ]; then + utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph + + steps/decode.sh --nj $nj --cmd $cmd exp/tri2/graph data/test \ + exp/tri2/decode_test +fi + +if [ $stage -le 9 ]; then + steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ + data/train data/lang exp/tri2 exp/tri2_ali + + steps/train_sat.sh --cmd $cmd 500 20000 \ + data/train data/lang exp/tri2_ali exp/tri3 +fi + +if [ $stage -le 10 ]; then + utils/mkgraph.sh data/lang exp/tri3 exp/tri3/graph + + steps/decode_fmllr.sh --nj $nj --cmd $cmd exp/tri3/graph \ + data/test exp/tri3/decode_test +fi + +if [ $stage -le 11 ]; then + steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ + data/train data/lang exp/tri3 exp/tri3_ali +fi + +if [ $stage -le 12 ]; then + local/chain/run_cnn_1a.sh +fi + +if [ $stage -le 13 ]; then + local/chain/run_cnn_chainali_1a.sh --stage 2 +fi diff --git a/egs/ifnenit/v1/steps b/egs/ifnenit/v1/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/ifnenit/v1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/ifnenit/v1/utils b/egs/ifnenit/v1/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/ifnenit/v1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/librispeech/s5/local/chain/compare_wer.sh b/egs/librispeech/s5/local/chain/compare_wer.sh index be60ef60df5..ec205670b76 100755 --- a/egs/librispeech/s5/local/chain/compare_wer.sh +++ b/egs/librispeech/s5/local/chain/compare_wer.sh @@ -143,3 +143,10 @@ for x in $*; do printf "% 10s" $prob done echo + +echo -n "# Num-parameters " +for x in $*; do + num_params=$(grep num-parameters $x/log/progress.1.log | awk '{print $2}') + printf "% 10d" $num_params +done +echo diff --git 
a/egs/librispeech/s5/local/chain/run_tdnn.sh b/egs/librispeech/s5/local/chain/run_tdnn.sh index d48449e28bd..e1adaa9346d 120000 --- a/egs/librispeech/s5/local/chain/run_tdnn.sh +++ b/egs/librispeech/s5/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1c.sh \ No newline at end of file +tuning/run_tdnn_1d.sh \ No newline at end of file diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1a.sh index f0f3aed4405..d4c789f7794 100755 --- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1a.sh @@ -41,7 +41,6 @@ set -e # (some of which are also used in this script directly). stage=0 decode_nj=50 -min_seg_len=1.55 train_set=train_960_cleaned gmm=tri6b_cleaned # the gmm for the target data nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned @@ -84,20 +83,19 @@ fi # nnet3 setup, and you can skip them by setting "--stage 11" if you have already # run those things. local/nnet3/run_ivector_common.sh --stage $stage \ - --min-seg-len $min_seg_len \ --train-set $train_set \ --gmm $gmm \ --nnet3-affix "$nnet3_affix" || exit 1; gmm_dir=exp/$gmm -ali_dir=exp/${gmm}_ali_${train_set}_sp_comb +ali_dir=exp/${gmm}_ali_${train_set}_sp tree_dir=exp/chain${nnet3_affix}/tree_sp${tree_affix:+_$tree_affix} lang=data/lang_chain -lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats +lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats dir=exp/chain${nnet3_affix}/tdnn${affix:+_$affix}_sp -train_data_dir=data/${train_set}_sp_hires_comb -lores_train_data_dir=data/${train_set}_sp_comb -train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb +train_data_dir=data/${train_set}_sp_hires +lores_train_data_dir=data/${train_set}_sp +train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \ $lores_train_data_dir/feats.scp $ali_dir/ali.1.gz; do @@ -105,7 +103,7 @@ for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector done # Please take this as a reference on how to specify all the options of -# local/chain/run_chain_common.sh +# local/chain/run_chain_common.sh local/chain/run_chain_common.sh --stage $stage \ --gmm-dir $gmm_dir \ --ali-dir $ali_dir \ diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh index cd26773f50f..7129827fe19 100755 --- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh @@ -43,7 +43,6 @@ set -e # (some of which are also used in this script directly). stage=0 decode_nj=50 -min_seg_len=1.55 train_set=train_960_cleaned gmm=tri6b_cleaned # the gmm for the target data nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned @@ -86,20 +85,19 @@ fi # nnet3 setup, and you can skip them by setting "--stage 11" if you have already # run those things. 
local/nnet3/run_ivector_common.sh --stage $stage \ - --min-seg-len $min_seg_len \ --train-set $train_set \ --gmm $gmm \ --nnet3-affix "$nnet3_affix" || exit 1; gmm_dir=exp/$gmm -ali_dir=exp/${gmm}_ali_${train_set}_sp_comb +ali_dir=exp/${gmm}_ali_${train_set}_sp tree_dir=exp/chain${nnet3_affix}/tree_sp${tree_affix:+_$tree_affix} lang=data/lang_chain -lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats +lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats dir=exp/chain${nnet3_affix}/tdnn${affix:+_$affix}_sp -train_data_dir=data/${train_set}_sp_hires_comb -lores_train_data_dir=data/${train_set}_sp_comb -train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb +train_data_dir=data/${train_set}_sp_hires +lores_train_data_dir=data/${train_set}_sp +train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires for f in $gmm_dir/final.mdl $train_data_dir/feats.scp $train_ivector_dir/ivector_online.scp \ $lores_train_data_dir/feats.scp $ali_dir/ali.1.gz; do @@ -130,12 +128,12 @@ if [ $stage -le 14 ]; then cat < $dir/configs/network.xconfig input dim=100 name=ivector input dim=40 name=input - + # please note that it is important to have input layer with the name=input # as the layer immediately preceding the fixed-affine-layer to enable # the use of short notation for the descriptor fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat - + # the first splicing is moved before the lda layer, so no splicing here relu-batchnorm-layer name=tdnn1 dim=$relu_dim relu-batchnorm-layer name=tdnn2 dim=$relu_dim input=Append(-1,0,1,2) @@ -147,7 +145,7 @@ if [ $stage -le 14 ]; then ## adding the layers for chain branch relu-batchnorm-layer name=prefinal-chain dim=$relu_dim target-rms=0.5 output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 - + # adding the layers for xent branch # This block prints the configs for a separate output that will be # trained with a cross-entropy objective in the 'chain' models... this diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh index ceadd890b5c..29ebe62ddde 100755 --- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1c.sh @@ -34,7 +34,6 @@ set -e # configs for 'chain' stage=0 decode_nj=50 -min_seg_len=1.55 train_set=train_960_cleaned gmm=tri6b_cleaned nnet3_affix=_cleaned @@ -75,21 +74,20 @@ fi # run those things. 
local/nnet3/run_ivector_common.sh --stage $stage \ - --min-seg-len $min_seg_len \ --train-set $train_set \ --gmm $gmm \ - --num-threads-ubm 6 --num-processes 3 \ + --num-threads-ubm 6 --num-processes 3 \ --nnet3-affix "$nnet3_affix" || exit 1; gmm_dir=exp/$gmm -ali_dir=exp/${gmm}_ali_${train_set}_sp_comb +ali_dir=exp/${gmm}_ali_${train_set}_sp tree_dir=exp/chain${nnet3_affix}/tree_sp${tree_affix:+_$tree_affix} lang=data/lang_chain -lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_comb_lats +lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats dir=exp/chain${nnet3_affix}/tdnn${affix:+_$affix}_sp -train_data_dir=data/${train_set}_sp_hires_comb -lores_train_data_dir=data/${train_set}_sp_comb -train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires_comb +train_data_dir=data/${train_set}_sp_hires +lores_train_data_dir=data/${train_set}_sp +train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires # if we are using the speed-perturbed data we need to generate # alignments for it. diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh new file mode 100755 index 00000000000..81b621ef86f --- /dev/null +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1d.sh @@ -0,0 +1,363 @@ +#!/bin/bash +set -e + +# 1d is as 1c but a recipe based on the newer, more compact configs, and with +# various configuration changes; it also includes dropout (although I'm not +# sure whether dropout was actually helpful, that needs to be tested). +# +# local/chain/compare_wer.sh exp/chain_cleaned/tdnn_1c_sp exp/chain_cleaned/tdnn_1d_sp +# System tdnn_1c_sp tdnn_1d_sp +# WER on dev(fglarge) 3.31 3.29 +# WER on dev(tglarge) 3.41 3.44 +# WER on dev(tgmed) 4.30 4.22 +# WER on dev(tgsmall) 4.81 4.72 +# WER on dev_other(fglarge) 8.73 8.71 +# WER on dev_other(tglarge) 9.22 9.05 +# WER on dev_other(tgmed) 11.24 11.09 +# WER on dev_other(tgsmall) 12.29 12.13 +# WER on test(fglarge) 3.88 3.80 +# WER on test(tglarge) 4.05 3.89 +# WER on test(tgmed) 4.86 4.72 +# WER on test(tgsmall) 5.30 5.19 +# WER on test_other(fglarge) 9.09 8.76 +# WER on test_other(tglarge) 9.54 9.19 +# WER on test_other(tgmed) 11.65 11.22 +# WER on test_other(tgsmall) 12.77 12.24 +# Final train prob -0.0510 -0.0378 +# Final valid prob -0.0619 -0.0374 +# Final train prob (xent) -0.7499 -0.6099 +# Final valid prob (xent) -0.8118 -0.6353 +# Num-parameters 20093920 22623456 + + +# +# 1c23 is as 1c22 but with bypass-scale increased to 0.75 Better! +# 1c22 is as 1c21 but with bottleneck-dim reduced from 192 to 160. +# 1c21 is as 1c19 but with 2.5 million, instead of 5 million, frames-per-iter. +# 1c19 is a rerun of 1c{14,16} but with --constrained false in the egs.opts, +# and upgrading to new-style configs. +# 1c16 is (by mistake) a rerun of 1c14. 
+ +# local/chain/compare_wer.sh exp/chain_cleaned/tdnn_1c14_sp exp/chain_cleaned/tdnn_1c16_sp +# System tdnn_1c14_sp tdnn_1c16_sp +# WER on dev(fglarge) 3.38 3.34 +# WER on dev(tglarge) 3.44 3.40 +# WER on dev(tgmed) 4.33 4.34 +# WER on dev(tgsmall) 4.80 4.79 +# WER on dev_other(fglarge) 8.63 8.66 +# WER on dev_other(tglarge) 9.04 9.11 +# WER on dev_other(tgmed) 11.03 11.21 +# WER on dev_other(tgsmall) 12.21 12.26 +# WER on test(fglarge) 3.79 3.77 +# WER on test(tglarge) 3.92 3.96 +# WER on test(tgmed) 4.80 4.79 +# WER on test(tgsmall) 5.34 5.31 +# WER on test_other(fglarge) 8.94 8.94 +# WER on test_other(tglarge) 9.35 9.28 +# WER on test_other(tgmed) 11.32 11.28 +# WER on test_other(tgsmall) 12.43 12.39 +# Final train prob -0.0491 -0.0486 +# Final valid prob -0.0465 -0.0465 +# Final train prob (xent) -0.6463 -0.6371 +# Final valid prob (xent) -0.6668 -0.6593 +# Num-parameters 23701728 23701728 + +# 1c14 is as 1c13 but with two more layers. +# A bit better! Overfits slightly. +# local/chain/compare_wer.sh exp/chain_cleaned/tdnn_1c_sp exp/chain_cleaned/tdnn_1c10_sp exp/chain_cleaned/tdnn_1c11_sp exp/chain_cleaned/tdnn_1c12_sp exp/chain_cleaned/tdnn_1c13_sp exp/chain_cleaned/tdnn_1c14_sp +# System tdnn_1c_sp tdnn_1c10_sp tdnn_1c11_sp tdnn_1c12_sp tdnn_1c13_sp tdnn_1c14_sp +# WER on dev(fglarge) 3.31 3.43 3.37 3.36 3.33 3.38 +# WER on dev(tglarge) 3.41 3.50 3.45 3.43 3.40 3.44 +# WER on dev(tgmed) 4.30 4.37 4.30 4.40 4.25 4.33 +# WER on dev(tgsmall) 4.81 4.79 4.82 4.86 4.74 4.80 +# WER on dev_other(fglarge) 8.73 9.10 8.61 8.49 8.78 8.63 +# WER on dev_other(tglarge) 9.22 9.46 9.11 8.92 9.23 9.04 +# WER on dev_other(tgmed) 11.24 11.33 11.23 10.91 11.10 11.03 +# WER on dev_other(tgsmall) 12.29 12.58 12.23 12.07 12.33 12.21 +# WER on test(fglarge) 3.88 3.86 3.83 3.78 3.84 3.79 +# WER on test(tglarge) 4.05 4.01 3.96 3.93 3.96 3.92 +# WER on test(tgmed) 4.86 4.80 4.83 4.81 4.77 4.80 +# WER on test(tgsmall) 5.30 5.31 5.24 5.24 5.22 5.34 +# WER on test_other(fglarge) 9.09 9.02 9.05 8.88 9.02 8.94 +# WER on test_other(tglarge) 9.54 9.58 9.47 9.20 9.42 9.35 +# WER on test_other(tgmed) 11.65 11.63 11.35 11.28 11.46 11.32 +# WER on test_other(tgsmall) 12.77 12.69 12.51 12.38 12.60 12.43 +# Final train prob -0.0510 -0.0423 -0.0449 -0.0517 -0.0460 -0.0491 +# Final valid prob -0.0619 -0.0446 -0.0456 -0.0503 -0.0460 -0.0465 +# Final train prob (xent) -0.7499 -0.5974 -0.6351 -0.6660 -0.6329 -0.6463 +# Final valid prob (xent) -0.8118 -0.6331 -0.6612 -0.6854 -0.6588 -0.6668 +# Num-parameters 20093920 21339360 21339360 22297824 21339360 23701728 + +# 1c13 is as 1c12 but changing tdnnf5-layer back to tdnnf6-layer. +# 1c12 is as 1c11 but with changes to the learning rates (reduced) and l2 +# (doubled for non-final layers), a larger frames-per-iter, and +# changing to tdnnf5-layer, i.e. keeping the extra splicing. +# 1c11 is as 1c10 but with double the l2-regularize. +# 1c10 is as 1c but using a newer type of setup based on the Swbd +# setup I'm working on, with tdnnf6-layers. +# Basing it on 7p10m. Making it 4 epochs, for speed. 
+ +# 7n is a kind of factorized TDNN, with skip connections + +# steps/info/chain_dir_info.pl exp/chain_cleaned/tdnn_1c_sp +# exp/chain_cleaned/tdnn_1c_sp: num-iters=1307 nj=3..16 num-params=20.1M dim=40+100->6024 combine=-0.051->-0.050 (over 23) xent:train/valid[869,1306,final]=(-0.808,-0.767,-0.771/-0.828,-0.780,-0.787) logprob:train/valid[869,1306,final]=(-0.051,-0.049,-0.047/-0.059,-0.056,-0.056) + +# local/chain/compare_wer.sh exp/chain_cleaned/tdnn_1b_sp exp/chain_cleaned/tdnn_1c_sp +# System tdnn_1b_sp tdnn_1c_sp +# WER on dev(fglarge) 3.77 3.35 +# WER on dev(tglarge) 3.90 3.49 +# WER on dev(tgmed) 4.89 4.30 +# WER on dev(tgsmall) 5.47 4.78 +# WER on dev_other(fglarge) 10.05 8.76 +# WER on dev_other(tglarge) 10.80 9.26 +# WER on dev_other(tgmed) 13.07 11.21 +# WER on dev_other(tgsmall) 14.46 12.47 +# WER on test(fglarge) 4.20 3.87 +# WER on test(tglarge) 4.28 4.08 +# WER on test(tgmed) 5.31 4.80 +# WER on test(tgsmall) 5.97 5.25 +# WER on test_other(fglarge) 10.44 8.95 +# WER on test_other(tglarge) 11.05 9.41 +# WER on test_other(tgmed) 13.36 11.52 +# WER on test_other(tgsmall) 14.90 12.66 +# Final train prob -0.0670 -0.0475 +# Final valid prob -0.0704 -0.0555 +# Final train prob (xent) -1.0502 -0.7708 +# Final valid prob (xent) -1.0441 -0.7874 + +# configs for 'chain' +stage=0 +decode_nj=50 +train_set=train_960_cleaned +gmm=tri6b_cleaned +nnet3_affix=_cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1d +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# TDNN options +frames_per_eg=150,110,100 +remove_egs=true +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +test_online_decoding=true # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf16 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf17 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 15 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b{09,10,11,12}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.chunk-width $frames_per_eg \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 2500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.00015 \ + --trainer.optimization.final-effective-lrate 0.000015 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir || exit 1; + +fi + +graph_dir=$dir/graph_tgsmall +if [ $stage -le 16 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov data/lang_test_tgsmall $dir $graph_dir + # remove from the graph, and convert back to const-FST. + fstrmsymbols --apply-to-output=true --remove-arcs=true "echo 3|" $graph_dir/HCLG.fst - | \ + fstconvert --fst_type=const > $graph_dir/temp.fst + mv $graph_dir/temp.fst $graph_dir/HCLG.fst +fi + +iter_opts= +if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 17 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in test_clean test_other dev_clean dev_other; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${decode_set}_hires \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_tgsmall || exit 1 + steps/lmrescore.sh --cmd "$decode_cmd" --self-loop-scale 1.0 data/lang_test_{tgsmall,tgmed} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tgmed} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,tglarge} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ + data/${decode_set}_hires $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_{tgsmall,fglarge} || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 18 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. 
+ steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3${nnet3_affix}/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for data in test_clean test_other dev_clean dev_other; do + ( + nspk=$(wc -l /dev/null 2>&1 ; then # HogWild works much faster if all threads are binded to the same phisical cpu - rnnlm_cmd="taskset -c $(seq -s, 0 $(( $num_threads - 1 )) ) $rnnlm_cmd" + rnnlm_cmd="taskset -c $(seq -s, 0 $(( $num_threads - 1 )) | sed 's/,$//') $rnnlm_cmd" fi $rnnlm_cmd -rnnlm $modeldir/rnnlm.tmp \ -train $data_dir/librispeech-lm-norm.train.txt \ diff --git a/egs/librispeech/s5/run.sh b/egs/librispeech/s5/run.sh index 1b12f5126fd..b2386489100 100755 --- a/egs/librispeech/s5/run.sh +++ b/egs/librispeech/s5/run.sh @@ -9,28 +9,36 @@ data=/export/a15/vpanayotov/data # base url for downloads. data_url=www.openslr.org/resources/12 lm_url=www.openslr.org/resources/11 +stage=1 . ./cmd.sh . ./path.sh +. parse_options.sh # you might not want to do this for interactive shells. set -e -# download the data. Note: we're using the 100 hour setup for -# now; later in the script we'll download more and use it to train neural -# nets. -for part in dev-clean test-clean dev-other test-other train-clean-100; do - local/download_and_untar.sh $data $data_url $part -done -# download the LM resources -local/download_lm.sh $lm_url data/local/lm +if [ $stage -le 1 ]; then + # download the data. Note: we're using the 100 hour setup for + # now; later in the script we'll download more and use it to train neural + # nets. + for part in dev-clean test-clean dev-other test-other train-clean-100; do + local/download_and_untar.sh $data $data_url $part + done + -# format the data as Kaldi data directories -for part in dev-clean test-clean dev-other test-other train-clean-100; do - # use underscore-separated names in data directories. - local/data_prep.sh $data/LibriSpeech/$part data/$(echo $part | sed s/-/_/g) -done + # download the LM resources + local/download_lm.sh $lm_url data/local/lm +fi + +if [ $stage -le 2 ]; then + # format the data as Kaldi data directories + for part in dev-clean test-clean dev-other test-other train-clean-100; do + # use underscore-separated names in data directories. 
+ local/data_prep.sh $data/LibriSpeech/$part data/$(echo $part | sed s/-/_/g) + done +fi ## Optional text corpus normalization and LM training ## These scripts are here primarily as a documentation of the process that has been @@ -46,290 +54,330 @@ done ## document our G2P model creation process #local/g2p/train_g2p.sh data/local/dict/cmudict data/local/lm -# when "--stage 3" option is used below we skip the G2P steps, and use the -# lexicon we have already downloaded from openslr.org/11/ -local/prepare_dict.sh --stage 3 --nj 30 --cmd "$train_cmd" \ +if [ $stage -le 3 ]; then + # when the "--stage 3" option is used below we skip the G2P steps, and use the + # lexicon we have already downloaded from openslr.org/11/ + local/prepare_dict.sh --stage 3 --nj 30 --cmd "$train_cmd" \ data/local/lm data/local/lm data/local/dict_nosp -utils/prepare_lang.sh data/local/dict_nosp \ - "" data/local/lang_tmp_nosp data/lang_nosp + utils/prepare_lang.sh data/local/dict_nosp \ + "" data/local/lang_tmp_nosp data/lang_nosp -local/format_lms.sh --src-dir data/lang_nosp data/local/lm + local/format_lms.sh --src-dir data/lang_nosp data/local/lm +fi -# Create ConstArpaLm format language model for full 3-gram and 4-gram LMs -utils/build_const_arpa_lm.sh data/local/lm/lm_tglarge.arpa.gz \ - data/lang_nosp data/lang_nosp_test_tglarge -utils/build_const_arpa_lm.sh data/local/lm/lm_fglarge.arpa.gz \ - data/lang_nosp data/lang_nosp_test_fglarge +if [ $stage -le 4 ]; then + # Create ConstArpaLm format language model for full 3-gram and 4-gram LMs + utils/build_const_arpa_lm.sh data/local/lm/lm_tglarge.arpa.gz \ + data/lang_nosp data/lang_nosp_test_tglarge + utils/build_const_arpa_lm.sh data/local/lm/lm_fglarge.arpa.gz \ + data/lang_nosp data/lang_nosp_test_fglarge +fi -mfccdir=mfcc -# spread the mfccs over various machines, as this data-set is quite large. -if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then - mfcc=$(basename mfccdir) # in case was absolute pathname (unlikely), get basename. - utils/create_split_dir.pl /export/b{02,11,12,13}/$USER/kaldi-data/egs/librispeech/s5/$mfcc/storage \ - $mfccdir/storage +if [ $stage -le 5 ]; then + mfccdir=mfcc + # spread the mfccs over various machines, as this data-set is quite large. + if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then + mfcc=$(basename mfccdir) # in case was absolute pathname (unlikely), get basename. + utils/create_split_dir.pl /export/b{02,11,12,13}/$USER/kaldi-data/egs/librispeech/s5/$mfcc/storage \ + $mfccdir/storage + fi fi -for part in dev_clean test_clean dev_other test_other train_clean_100; do - steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/$part exp/make_mfcc/$part $mfccdir - steps/compute_cmvn_stats.sh data/$part exp/make_mfcc/$part $mfccdir -done +if [ $stage -le 6 ]; then + for part in dev_clean test_clean dev_other test_other train_clean_100; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/$part exp/make_mfcc/$part $mfccdir + steps/compute_cmvn_stats.sh data/$part exp/make_mfcc/$part $mfccdir + done +fi -# Make some small data subsets for early system-build stages. Note, there are 29k -# utterances in the train_clean_100 directory which has 100 hours of data. -# For the monophone stages we select the shortest utterances, which should make it -# easier to align the data from a flat start. +if [ $stage -le 7 ]; then + # Make some small data subsets for early system-build stages. Note, there are 29k + # utterances in the train_clean_100 directory which has 100 hours of data. 
+ # For the monophone stages we select the shortest utterances, which should make it + # easier to align the data from a flat start. -utils/subset_data_dir.sh --shortest data/train_clean_100 2000 data/train_2kshort -utils/subset_data_dir.sh data/train_clean_100 5000 data/train_5k -utils/subset_data_dir.sh data/train_clean_100 10000 data/train_10k + utils/subset_data_dir.sh --shortest data/train_clean_100 2000 data/train_2kshort + utils/subset_data_dir.sh data/train_clean_100 5000 data/train_5k + utils/subset_data_dir.sh data/train_clean_100 10000 data/train_10k +fi -# train a monophone system -steps/train_mono.sh --boost-silence 1.25 --nj 20 --cmd "$train_cmd" \ - data/train_2kshort data/lang_nosp exp/mono +if [ $stage -le 8 ]; then + # train a monophone system + steps/train_mono.sh --boost-silence 1.25 --nj 20 --cmd "$train_cmd" \ + data/train_2kshort data/lang_nosp exp/mono + + # decode using the monophone model + ( + utils/mkgraph.sh data/lang_nosp_test_tgsmall \ + exp/mono exp/mono/graph_nosp_tgsmall + for test in test_clean test_other dev_clean dev_other; do + steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/mono/graph_nosp_tgsmall \ + data/$test exp/mono/decode_nosp_tgsmall_$test + done + )& +fi -# decode using the monophone model -( - utils/mkgraph.sh data/lang_nosp_test_tgsmall \ - exp/mono exp/mono/graph_nosp_tgsmall - for test in test_clean test_other dev_clean dev_other; do - steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/mono/graph_nosp_tgsmall \ - data/$test exp/mono/decode_nosp_tgsmall_$test - done -)& - -steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ - data/train_5k data/lang_nosp exp/mono exp/mono_ali_5k - -# train a first delta + delta-delta triphone system on a subset of 5000 utterances -steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ - 2000 10000 data/train_5k data/lang_nosp exp/mono_ali_5k exp/tri1 - -# decode using the tri1 model -( - utils/mkgraph.sh data/lang_nosp_test_tgsmall \ - exp/tri1 exp/tri1/graph_nosp_tgsmall - for test in test_clean test_other dev_clean dev_other; do - steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/tri1/graph_nosp_tgsmall \ - data/$test exp/tri1/decode_nosp_tgsmall_$test - steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \ - data/$test exp/tri1/decode_nosp_{tgsmall,tgmed}_$test - steps/lmrescore_const_arpa.sh \ - --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \ - data/$test exp/tri1/decode_nosp_{tgsmall,tglarge}_$test - done -)& - -steps/align_si.sh --nj 10 --cmd "$train_cmd" \ - data/train_10k data/lang_nosp exp/tri1 exp/tri1_ali_10k - - -# train an LDA+MLLT system. 
-steps/train_lda_mllt.sh --cmd "$train_cmd" \ - --splice-opts "--left-context=3 --right-context=3" 2500 15000 \ - data/train_10k data/lang_nosp exp/tri1_ali_10k exp/tri2b - -# decode using the LDA+MLLT model -( - utils/mkgraph.sh data/lang_nosp_test_tgsmall \ - exp/tri2b exp/tri2b/graph_nosp_tgsmall - for test in test_clean test_other dev_clean dev_other; do - steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/tri2b/graph_nosp_tgsmall \ - data/$test exp/tri2b/decode_nosp_tgsmall_$test - steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \ - data/$test exp/tri2b/decode_nosp_{tgsmall,tgmed}_$test - steps/lmrescore_const_arpa.sh \ - --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \ - data/$test exp/tri2b/decode_nosp_{tgsmall,tglarge}_$test - done -)& - -# Align a 10k utts subset using the tri2b model -steps/align_si.sh --nj 10 --cmd "$train_cmd" --use-graphs true \ - data/train_10k data/lang_nosp exp/tri2b exp/tri2b_ali_10k - -# Train tri3b, which is LDA+MLLT+SAT on 10k utts -steps/train_sat.sh --cmd "$train_cmd" 2500 15000 \ - data/train_10k data/lang_nosp exp/tri2b_ali_10k exp/tri3b - -# decode using the tri3b model -( - utils/mkgraph.sh data/lang_nosp_test_tgsmall \ - exp/tri3b exp/tri3b/graph_nosp_tgsmall - for test in test_clean test_other dev_clean dev_other; do - steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ - exp/tri3b/graph_nosp_tgsmall data/$test \ - exp/tri3b/decode_nosp_tgsmall_$test - steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \ - data/$test exp/tri3b/decode_nosp_{tgsmall,tgmed}_$test - steps/lmrescore_const_arpa.sh \ - --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \ - data/$test exp/tri3b/decode_nosp_{tgsmall,tglarge}_$test - done -)& - -# align the entire train_clean_100 subset using the tri3b model -steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \ - data/train_clean_100 data/lang_nosp \ - exp/tri3b exp/tri3b_ali_clean_100 - -# train another LDA+MLLT+SAT system on the entire 100 hour subset -steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \ - data/train_clean_100 data/lang_nosp \ - exp/tri3b_ali_clean_100 exp/tri4b - -# decode using the tri4b model -( - utils/mkgraph.sh data/lang_nosp_test_tgsmall \ - exp/tri4b exp/tri4b/graph_nosp_tgsmall - for test in test_clean test_other dev_clean dev_other; do - steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ - exp/tri4b/graph_nosp_tgsmall data/$test \ - exp/tri4b/decode_nosp_tgsmall_$test - steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \ - data/$test exp/tri4b/decode_nosp_{tgsmall,tgmed}_$test - steps/lmrescore_const_arpa.sh \ - --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \ - data/$test exp/tri4b/decode_nosp_{tgsmall,tglarge}_$test - steps/lmrescore_const_arpa.sh \ - --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,fglarge} \ - data/$test exp/tri4b/decode_nosp_{tgsmall,fglarge}_$test - done -)& - -# Now we compute the pronunciation and silence probabilities from training data, -# and re-create the lang directory. 
-steps/get_prons.sh --cmd "$train_cmd" \ - data/train_clean_100 data/lang_nosp exp/tri4b -utils/dict_dir_add_pronprobs.sh --max-normalize true \ - data/local/dict_nosp \ - exp/tri4b/pron_counts_nowb.txt exp/tri4b/sil_counts_nowb.txt \ - exp/tri4b/pron_bigram_counts_nowb.txt data/local/dict - -utils/prepare_lang.sh data/local/dict \ - "" data/local/lang_tmp data/lang -local/format_lms.sh --src-dir data/lang data/local/lm - -utils/build_const_arpa_lm.sh \ - data/local/lm/lm_tglarge.arpa.gz data/lang data/lang_test_tglarge -utils/build_const_arpa_lm.sh \ - data/local/lm/lm_fglarge.arpa.gz data/lang data/lang_test_fglarge - -# decode using the tri4b model with pronunciation and silence probabilities -( - utils/mkgraph.sh \ - data/lang_test_tgsmall exp/tri4b exp/tri4b/graph_tgsmall - for test in test_clean test_other dev_clean dev_other; do - steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ - exp/tri4b/graph_tgsmall data/$test \ - exp/tri4b/decode_tgsmall_$test - steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ - data/$test exp/tri4b/decode_{tgsmall,tgmed}_$test - steps/lmrescore_const_arpa.sh \ - --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ - data/$test exp/tri4b/decode_{tgsmall,tglarge}_$test - steps/lmrescore_const_arpa.sh \ - --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ - data/$test exp/tri4b/decode_{tgsmall,fglarge}_$test - done -)& - -# align train_clean_100 using the tri4b model -steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ - data/train_clean_100 data/lang exp/tri4b exp/tri4b_ali_clean_100 - -# if you want at this point you can train and test NN model(s) on the 100 hour -# subset -local/nnet2/run_5a_clean_100.sh - -local/download_and_untar.sh $data $data_url train-clean-360 - -# now add the "clean-360" subset to the mix ... -local/data_prep.sh \ - $data/LibriSpeech/train-clean-360 data/train_clean_360 -steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/train_clean_360 \ - exp/make_mfcc/train_clean_360 $mfccdir -steps/compute_cmvn_stats.sh \ - data/train_clean_360 exp/make_mfcc/train_clean_360 $mfccdir - -# ... and then combine the two sets into a 460 hour one -utils/combine_data.sh \ - data/train_clean_460 data/train_clean_100 data/train_clean_360 - -# align the new, combined set, using the tri4b model -steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \ - data/train_clean_460 data/lang exp/tri4b exp/tri4b_ali_clean_460 - -# create a larger SAT model, trained on the 460 hours of data. -steps/train_sat.sh --cmd "$train_cmd" 5000 100000 \ - data/train_clean_460 data/lang exp/tri4b_ali_clean_460 exp/tri5b - -# decode using the tri5b model -( - utils/mkgraph.sh data/lang_test_tgsmall \ - exp/tri5b exp/tri5b/graph_tgsmall - for test in test_clean test_other dev_clean dev_other; do - steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ - exp/tri5b/graph_tgsmall data/$test \ - exp/tri5b/decode_tgsmall_$test - steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ - data/$test exp/tri5b/decode_{tgsmall,tgmed}_$test - steps/lmrescore_const_arpa.sh \ - --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ - data/$test exp/tri5b/decode_{tgsmall,tglarge}_$test - steps/lmrescore_const_arpa.sh \ - --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ - data/$test exp/tri5b/decode_{tgsmall,fglarge}_$test - done -)& - -# train a NN model on the 460 hour set -local/nnet2/run_6a_clean_460.sh - -local/download_and_untar.sh $data $data_url train-other-500 - -# prepare the 500 hour subset. 
-local/data_prep.sh \ - $data/LibriSpeech/train-other-500 data/train_other_500 -steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/train_other_500 \ - exp/make_mfcc/train_other_500 $mfccdir -steps/compute_cmvn_stats.sh \ - data/train_other_500 exp/make_mfcc/train_other_500 $mfccdir - -# combine all the data -utils/combine_data.sh \ - data/train_960 data/train_clean_460 data/train_other_500 - -steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \ - data/train_960 data/lang exp/tri5b exp/tri5b_ali_960 - -# train a SAT model on the 960 hour mixed data. Use the train_quick.sh script -# as it is faster. -steps/train_quick.sh --cmd "$train_cmd" \ - 7000 150000 data/train_960 data/lang exp/tri5b_ali_960 exp/tri6b - -# decode using the tri6b model -( - utils/mkgraph.sh data/lang_test_tgsmall \ - exp/tri6b exp/tri6b/graph_tgsmall - for test in test_clean test_other dev_clean dev_other; do - steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ - exp/tri6b/graph_tgsmall data/$test exp/tri6b/decode_tgsmall_$test - steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ - data/$test exp/tri6b/decode_{tgsmall,tgmed}_$test - steps/lmrescore_const_arpa.sh \ - --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ - data/$test exp/tri6b/decode_{tgsmall,tglarge}_$test - steps/lmrescore_const_arpa.sh \ - --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ - data/$test exp/tri6b/decode_{tgsmall,fglarge}_$test - done -)& +if [ $stage -le 9 ]; then + steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ + data/train_5k data/lang_nosp exp/mono exp/mono_ali_5k + + # train a first delta + delta-delta triphone system on a subset of 5000 utterances + steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ + 2000 10000 data/train_5k data/lang_nosp exp/mono_ali_5k exp/tri1 + + # decode using the tri1 model + ( + utils/mkgraph.sh data/lang_nosp_test_tgsmall \ + exp/tri1 exp/tri1/graph_nosp_tgsmall + for test in test_clean test_other dev_clean dev_other; do + steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/tri1/graph_nosp_tgsmall \ + data/$test exp/tri1/decode_nosp_tgsmall_$test + steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \ + data/$test exp/tri1/decode_nosp_{tgsmall,tgmed}_$test + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \ + data/$test exp/tri1/decode_nosp_{tgsmall,tglarge}_$test + done + )& +fi + +if [ $stage -le 10 ]; then + steps/align_si.sh --nj 10 --cmd "$train_cmd" \ + data/train_10k data/lang_nosp exp/tri1 exp/tri1_ali_10k + + + # train an LDA+MLLT system. + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" 2500 15000 \ + data/train_10k data/lang_nosp exp/tri1_ali_10k exp/tri2b + + # decode using the LDA+MLLT model + ( + utils/mkgraph.sh data/lang_nosp_test_tgsmall \ + exp/tri2b exp/tri2b/graph_nosp_tgsmall + for test in test_clean test_other dev_clean dev_other; do + steps/decode.sh --nj 20 --cmd "$decode_cmd" exp/tri2b/graph_nosp_tgsmall \ + data/$test exp/tri2b/decode_nosp_tgsmall_$test + steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \ + data/$test exp/tri2b/decode_nosp_{tgsmall,tgmed}_$test + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \ + data/$test exp/tri2b/decode_nosp_{tgsmall,tglarge}_$test + done + )& +fi -# this does some data-cleaning. The cleaned data should be useful when we add -# the neural net and chain systems. 
-local/run_cleanup_segmentation.sh +if [ $stage -le 11 ]; then + # Align a 10k utts subset using the tri2b model + steps/align_si.sh --nj 10 --cmd "$train_cmd" --use-graphs true \ + data/train_10k data/lang_nosp exp/tri2b exp/tri2b_ali_10k + + # Train tri3b, which is LDA+MLLT+SAT on 10k utts + steps/train_sat.sh --cmd "$train_cmd" 2500 15000 \ + data/train_10k data/lang_nosp exp/tri2b_ali_10k exp/tri3b + + # decode using the tri3b model + ( + utils/mkgraph.sh data/lang_nosp_test_tgsmall \ + exp/tri3b exp/tri3b/graph_nosp_tgsmall + for test in test_clean test_other dev_clean dev_other; do + steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ + exp/tri3b/graph_nosp_tgsmall data/$test \ + exp/tri3b/decode_nosp_tgsmall_$test + steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \ + data/$test exp/tri3b/decode_nosp_{tgsmall,tgmed}_$test + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \ + data/$test exp/tri3b/decode_nosp_{tgsmall,tglarge}_$test + done + )& +fi + +if [ $stage -le 12 ]; then + # align the entire train_clean_100 subset using the tri3b model + steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \ + data/train_clean_100 data/lang_nosp \ + exp/tri3b exp/tri3b_ali_clean_100 + + # train another LDA+MLLT+SAT system on the entire 100 hour subset + steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \ + data/train_clean_100 data/lang_nosp \ + exp/tri3b_ali_clean_100 exp/tri4b + + # decode using the tri4b model + ( + utils/mkgraph.sh data/lang_nosp_test_tgsmall \ + exp/tri4b exp/tri4b/graph_nosp_tgsmall + for test in test_clean test_other dev_clean dev_other; do + steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ + exp/tri4b/graph_nosp_tgsmall data/$test \ + exp/tri4b/decode_nosp_tgsmall_$test + steps/lmrescore.sh --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tgmed} \ + data/$test exp/tri4b/decode_nosp_{tgsmall,tgmed}_$test + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,tglarge} \ + data/$test exp/tri4b/decode_nosp_{tgsmall,tglarge}_$test + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_nosp_test_{tgsmall,fglarge} \ + data/$test exp/tri4b/decode_nosp_{tgsmall,fglarge}_$test + done + )& +fi + +if [ $stage -le 13 ]; then + # Now we compute the pronunciation and silence probabilities from training data, + # and re-create the lang directory. 
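+  # Roughly: steps/get_prons.sh collects per-pronunciation and silence counts
+  # from the tri4b alignments, utils/dict_dir_add_pronprobs.sh folds those
+  # counts into a new dictionary directory (data/local/dict), and
+  # prepare_lang/format_lms then rebuild the lang and test directories from it.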
+ steps/get_prons.sh --cmd "$train_cmd" \ + data/train_clean_100 data/lang_nosp exp/tri4b + utils/dict_dir_add_pronprobs.sh --max-normalize true \ + data/local/dict_nosp \ + exp/tri4b/pron_counts_nowb.txt exp/tri4b/sil_counts_nowb.txt \ + exp/tri4b/pron_bigram_counts_nowb.txt data/local/dict + + utils/prepare_lang.sh data/local/dict \ + "" data/local/lang_tmp data/lang + local/format_lms.sh --src-dir data/lang data/local/lm + + utils/build_const_arpa_lm.sh \ + data/local/lm/lm_tglarge.arpa.gz data/lang data/lang_test_tglarge + utils/build_const_arpa_lm.sh \ + data/local/lm/lm_fglarge.arpa.gz data/lang data/lang_test_fglarge + + # decode using the tri4b model with pronunciation and silence probabilities + ( + utils/mkgraph.sh \ + data/lang_test_tgsmall exp/tri4b exp/tri4b/graph_tgsmall + for test in test_clean test_other dev_clean dev_other; do + steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ + exp/tri4b/graph_tgsmall data/$test \ + exp/tri4b/decode_tgsmall_$test + steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ + data/$test exp/tri4b/decode_{tgsmall,tgmed}_$test + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ + data/$test exp/tri4b/decode_{tgsmall,tglarge}_$test + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ + data/$test exp/tri4b/decode_{tgsmall,fglarge}_$test + done + )& +fi + +if [ $stage -le 14 ] && false; then + # This stage is for nnet2 training on 100 hours; we're commenting it out + # as it's deprecated. + # align train_clean_100 using the tri4b model + steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ + data/train_clean_100 data/lang exp/tri4b exp/tri4b_ali_clean_100 + + # This nnet2 training script is deprecated. + local/nnet2/run_5a_clean_100.sh +fi + +if [ $stage -le 15 ]; then + local/download_and_untar.sh $data $data_url train-clean-360 + + # now add the "clean-360" subset to the mix ... + local/data_prep.sh \ + $data/LibriSpeech/train-clean-360 data/train_clean_360 + steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/train_clean_360 \ + exp/make_mfcc/train_clean_360 $mfccdir + steps/compute_cmvn_stats.sh \ + data/train_clean_360 exp/make_mfcc/train_clean_360 $mfccdir + + # ... and then combine the two sets into a 460 hour one + utils/combine_data.sh \ + data/train_clean_460 data/train_clean_100 data/train_clean_360 +fi + +if [ $stage -le 16 ]; then + # align the new, combined set, using the tri4b model + steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \ + data/train_clean_460 data/lang exp/tri4b exp/tri4b_ali_clean_460 + + # create a larger SAT model, trained on the 460 hours of data. 
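+  # (as in the earlier steps/train_sat.sh calls, the two numeric arguments are
+  # the target number of tree leaves and the total number of Gaussians; both
+  # are larger here since we now train on 460 hours.)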
+ steps/train_sat.sh --cmd "$train_cmd" 5000 100000 \ + data/train_clean_460 data/lang exp/tri4b_ali_clean_460 exp/tri5b + + # decode using the tri5b model + ( + utils/mkgraph.sh data/lang_test_tgsmall \ + exp/tri5b exp/tri5b/graph_tgsmall + for test in test_clean test_other dev_clean dev_other; do + steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ + exp/tri5b/graph_tgsmall data/$test \ + exp/tri5b/decode_tgsmall_$test + steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ + data/$test exp/tri5b/decode_{tgsmall,tgmed}_$test + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ + data/$test exp/tri5b/decode_{tgsmall,tglarge}_$test + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ + data/$test exp/tri5b/decode_{tgsmall,fglarge}_$test + done + )& +fi + + +# The following command trains an nnet3 model on the 460 hour setup. This +# is deprecated now. +## train a NN model on the 460 hour set +#local/nnet2/run_6a_clean_460.sh + +if [ $stage -le 17 ]; then + # prepare the remaining 500 hours of data + local/download_and_untar.sh $data $data_url train-other-500 + + # prepare the 500 hour subset. + local/data_prep.sh \ + $data/LibriSpeech/train-other-500 data/train_other_500 + steps/make_mfcc.sh --cmd "$train_cmd" --nj 40 data/train_other_500 \ + exp/make_mfcc/train_other_500 $mfccdir + steps/compute_cmvn_stats.sh \ + data/train_other_500 exp/make_mfcc/train_other_500 $mfccdir + + # combine all the data + utils/combine_data.sh \ + data/train_960 data/train_clean_460 data/train_other_500 +fi + +if [ $stage -le 18 ]; then + steps/align_fmllr.sh --nj 40 --cmd "$train_cmd" \ + data/train_960 data/lang exp/tri5b exp/tri5b_ali_960 + + # train a SAT model on the 960 hour mixed data. Use the train_quick.sh script + # as it is faster. + steps/train_quick.sh --cmd "$train_cmd" \ + 7000 150000 data/train_960 data/lang exp/tri5b_ali_960 exp/tri6b + + # decode using the tri6b model + ( + utils/mkgraph.sh data/lang_test_tgsmall \ + exp/tri6b exp/tri6b/graph_tgsmall + for test in test_clean test_other dev_clean dev_other; do + steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ + exp/tri6b/graph_tgsmall data/$test exp/tri6b/decode_tgsmall_$test + steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ + data/$test exp/tri6b/decode_{tgsmall,tgmed}_$test + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ + data/$test exp/tri6b/decode_{tgsmall,tglarge}_$test + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ + data/$test exp/tri6b/decode_{tgsmall,fglarge}_$test + done + )& +fi + + +if [ $stage -le 19 ]; then + # this does some data-cleaning. The cleaned data should be useful when we add + # the neural net and chain systems. (although actually it was pretty clean already.) + local/run_cleanup_segmentation.sh +fi # steps/cleanup/debug_lexicon.sh --remove-stress true --nj 200 --cmd "$train_cmd" data/train_clean_100 \ # data/lang exp/tri6b data/local/dict/lexicon.txt exp/debug_lexicon_100h @@ -352,8 +400,10 @@ local/run_cleanup_segmentation.sh # --rnnlm-tag "h150-me3-400-nce20" $data data/local/lm -# train nnet3 tdnn models on the entire data with data-cleaning (xent and chain) -local/chain/run_tdnn.sh # set "--stage 11" if you have already run local/nnet3/run_tdnn.sh +if [ $stage -le 20 ]; then + # train and test nnet3 tdnn models on the entire data with data-cleaning. 
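+  # If everything up to here has already been run, this stage can be reached
+  # directly with something like "./run.sh --stage 20" (assuming --stage is
+  # parsed near the top of this script, e.g. via utils/parse_options.sh).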
+ local/chain/run_tdnn.sh # set "--stage 11" if you have already run local/nnet3/run_tdnn.sh +fi # The nnet3 TDNN recipe: # local/nnet3/run_tdnn.sh # set "--stage 11" if you have already run local/chain/run_tdnn.sh diff --git a/egs/madcat_ar/v1/README.txt b/egs/madcat_ar/v1/README.txt new file mode 100644 index 00000000000..405d8881383 --- /dev/null +++ b/egs/madcat_ar/v1/README.txt @@ -0,0 +1,15 @@ +MADCAT (Multilingual Automatic Document Classification Analysis and Translation) +Arabic Corpus is a LDC dataset (LDC2012T15, LDC2013T09, LDC2013T15) for handwriting recognition. +The dataset contains abstracts from News related passages and blogs. The xml file for each page +provides line segmentation and word segmentation information and also provides the writing +condition (writing style, speed, carefulness) of the page. It is a large size dataset with +total 42k page images and 750k (600k training, 75k dev, 75k eval) line images and 305 writers. +The major text is in Arabic but it also contains English letters and numerals. The dataset contains +about 95k unique words and 160 unique characters. The dataset has been used in NIST 2010 and 2013 +(Openhart Arabic large vocabulary unconstrained handwritten text recognition competition) evaluation +(maybe with different splits) for line level recognition task. 16.1% WER was obtained for line level +recognition in that competition. + +More info: https://catalog.ldc.upenn.edu/LDC2012T15, +https://catalog.ldc.upenn.edu/LDC2013T09/, +https://catalog.ldc.upenn.edu/LDC2013T15/. diff --git a/egs/madcat_ar/v1/cmd.sh b/egs/madcat_ar/v1/cmd.sh new file mode 100644 index 00000000000..3c8eb9f93a5 --- /dev/null +++ b/egs/madcat_ar/v1/cmd.sh @@ -0,0 +1,13 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export cmd="queue.pl" diff --git a/egs/madcat_ar/v1/image b/egs/madcat_ar/v1/image new file mode 120000 index 00000000000..1668ee99922 --- /dev/null +++ b/egs/madcat_ar/v1/image @@ -0,0 +1 @@ +../../cifar/v1/image/ \ No newline at end of file diff --git a/egs/madcat_ar/v1/local/chain/compare_wer.sh b/egs/madcat_ar/v1/local/chain/compare_wer.sh new file mode 100755 index 00000000000..ad90710b13f --- /dev/null +++ b/egs/madcat_ar/v1/local/chain/compare_wer.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi +. 
./path.sh + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Parameters " +for x in $*; do + params=$(nnet3-info $x/final.mdl 2>/dev/null | grep num-parameters | cut -d' ' -f2 | awk '{printf "%0.2fM\n",$1/1000000}') + printf "% 10s" $params +done +echo diff --git a/egs/madcat_ar/v1/local/chain/run_cnn_1a.sh b/egs/madcat_ar/v1/local/chain/run_cnn_1a.sh new file mode 100755 index 00000000000..a3a98ce5ad5 --- /dev/null +++ b/egs/madcat_ar/v1/local/chain/run_cnn_1a.sh @@ -0,0 +1,224 @@ +#!/bin/bash + +# Copyright 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +set -e -o pipefail + +stage=0 + +nj=70 +train_set=train +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +ali=tri3_ali +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=false +lang_test=lang_test +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj $nj --cmd "$cmd" ${train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn4 dim=$tdnn_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=$frame_subsampling_factor \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/madcat_ar/v1/local/chain/run_cnn_chainali_1a.sh b/egs/madcat_ar/v1/local/chain/run_cnn_chainali_1a.sh new file mode 100755 index 00000000000..b652eab034a --- /dev/null +++ b/egs/madcat_ar/v1/local/chain/run_cnn_chainali_1a.sh @@ -0,0 +1,226 @@ +#!/bin/bash + +set -e -o pipefail + +stage=0 + +nj=70 +train_set=train +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +ali=tri3_ali +chain_model_dir=exp/chain${nnet3_affix}/cnn_1a +common_egs_dir= +reporting_email= +lats_affix= +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. 
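+# (chunk_width above gives the allowed training-example lengths in frames as a
+# comma-separated list, and frame_subsampling_factor=4 means the chain model's
+# outputs are evaluated at one quarter of the input frame rate.)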
+chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=false +lang_test=lang_test +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $chain_model_dir $lat_dir + cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
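+  # For instance (with a hypothetical directory name), a graph for another LM
+  # could be built as:
+  #   utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test_big $dir $dir/graph_big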
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1a.sh b/egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1a.sh new file mode 100755 index 00000000000..38387ce2fcc --- /dev/null +++ b/egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1a.sh @@ -0,0 +1,230 @@ +#!/bin/bash + +# e2eali_1a is the same as chainali_1c but uses the e2e chain model to get the +# lattice alignments and to build a tree + +set -e -o pipefail + +stage=0 + +nj=70 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=true +lang_test=lang_test +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=2 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=96,64 \ + --trainer.optimization.momentum=0.0 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
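+  # (--self-loop-scale 1.0 here, like the --acwt 1.0 / --post-decode-acwt 10.0
+  # used for decoding below, is the scaling normally used with 'chain' models.)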
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1b.sh b/egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1b.sh new file mode 100755 index 00000000000..75c246f5ffe --- /dev/null +++ b/egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1b.sh @@ -0,0 +1,243 @@ +#!/bin/bash + +# e2eali_1b is the same as chainali_1a but uses the e2e chain model to get the +# lattice alignments and to build a tree + +# local/chain/compare_wer.sh exp/chain/exp/chain/cnn_e2eali_1b +# System cnn_e2eali_1b +# WER 10.78 +# CER 2.99 +# Final train prob -0.0587 +# Final valid prob -0.0609 +# Final train prob (xent) -0.4471 +# Final valid prob (xent) -0.4653 +# Parameters 3.37M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1b +#exp/chain/cnn_e2eali_1b: num-iters=179 nj=8..16 num-params=3.4M dim=40->416 combine=-0.058->-0.058 (over 3) xent:train/valid[118,178,final]=(-0.463,-0.445,-0.447/-0.477,-0.462,-0.465) logprob:train/valid[118,178,final]=(-0.062,-0.059,-0.059/-0.063,-0.061,-0.061) + +set -e -o pipefail + +stage=0 + +nj=70 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=true +lang_test=lang_test +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts + +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh b/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh new file mode 100755 index 00000000000..2c85e982ce6 --- /dev/null +++ b/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh @@ -0,0 +1,166 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. 
from scratch) + +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a +# System e2e_cnn_1a +# WER 10.71 +# CER 2.85 +# Final train prob -0.0859 +# Final valid prob -0.1266 +# Final train prob (xent) +# Final valid prob (xent) +# Parameters 2.94M + +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/ +# exp/chain/e2e_cnn_1a/: num-iters=195 nj=6..16 num-params=2.9M dim=40->324 combine=-0.065->-0.064 (over 5) logprob:train/valid[129,194,final]=(-0.078,-0.077,-0.086/-0.129,-0.126,-0.127) + +set -e + +# configs for 'chain' +stage=0 +nj=70 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=450 +num_epochs=2 +num_jobs_initial=6 +num_jobs_final=16 +minibatch_size=150=128,64/300=128,64/600=64,32/1200=32,16 +common_egs_dir= +l2_regularize=0.00005 +frames_per_iter=1000000 +cmvn_opts="--norm-means=true --norm-vars=true" +train_set=train +lang_test=lang_test + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj $nj --cmd "$cmd" \ + --shared-phones true \ + --type mono \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. 
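+  # (to keep the egs for reuse instead, set --cleanup.remove-egs false below
+  # and pass the kept egs directory via --egs.dir, i.e. the common_egs_dir
+  # variable, on later runs.)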
+ + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize $l2_regularize \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter $frames_per_iter \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/madcat_ar/v1/local/check_tools.sh b/egs/madcat_ar/v1/local/check_tools.sh new file mode 100755 index 00000000000..00de9778808 --- /dev/null +++ b/egs/madcat_ar/v1/local/check_tools.sh @@ -0,0 +1,49 @@ +#!/bin/bash -u + +# Copyright 2015 (c) Johns Hopkins University (Jan Trmal ) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +[ -f ./path.sh ] && . ./path.sh +set +e + +command -v python3 >&/dev/null \ + || { echo >&2 "python3 not found on PATH. You will have to install Python3, preferably >= 3.6"; exit 1; } + +python3 -c "import numpy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs numpy installed." + exit 1 +fi + +python3 -c "import scipy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy installed." + exit 1 +fi + +python3 -c "from scipy.spatial import ConvexHull" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy installed." + exit 1 +fi + +python3 -c "import scipy.misc; scipy.misc.__dict__['imread'];" +if [ $? 
-ne 0 ] ; then + echo >&2 "This recipe needs scipy-image, scikit-image and Pillow installed." + exit 1 +fi + + +exit 0 diff --git a/egs/madcat_ar/v1/local/create_line_image_from_page_image.py b/egs/madcat_ar/v1/local/create_line_image_from_page_image.py new file mode 100755 index 00000000000..ba35f8b9ace --- /dev/null +++ b/egs/madcat_ar/v1/local/create_line_image_from_page_image.py @@ -0,0 +1,573 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora +# Apache 2.0 +# minimum bounding box part in this script is originally from +#https://github.com/BebeSparkelSparkel/MinimumBoundingBox +#https://startupnextdoor.com/computing-convex-hull-in-python/ +""" This module will be used for extracting line images from page image. + Given the word segmentation (bounding box around a word) for every word, it will + extract line segmentation. To extract line segmentation, it will take word bounding + boxes of a line as input, will create a minimum area bounding box that will contain + all corner points of word bounding boxes. The obtained bounding box (will not necessarily + be vertically or horizontally aligned). Hence to extract line image from line bounding box, + page image is rotated and line image is cropped and saved. +""" + +import sys +import argparse +import os +import xml.dom.minidom as minidom +import numpy as np +from math import atan2, cos, sin, pi, degrees, sqrt +from collections import namedtuple + +from scipy.spatial import ConvexHull +from PIL import Image +from scipy.misc import toimage +import logging + +sys.path.insert(0, 'steps') +logger = logging.getLogger('libs') +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) + +parser = argparse.ArgumentParser(description="Creates line images from page image", + epilog="E.g. 
" + sys.argv[0] + " data/LDC2012T15" + " data/LDC2013T09 data/LDC2013T15 data/madcat.train.raw.lineid " + " data/local/lines ", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('database_path1', type=str, + help='Path to the downloaded madcat data directory 1') +parser.add_argument('database_path2', type=str, + help='Path to the downloaded madcat data directory 2') +parser.add_argument('database_path3', type=str, + help='Path to the downloaded madcat data directory 3') +parser.add_argument('data_splits', type=str, + help='Path to file that contains the train/test/dev split information') +parser.add_argument('out_dir', type=str, + help='directory location to write output files') +parser.add_argument('writing_condition1', type=str, + help='Path to the downloaded (and extracted) writing conditions file 1') +parser.add_argument('writing_condition2', type=str, + help='Path to the downloaded (and extracted) writing conditions file 2') +parser.add_argument('writing_condition3', type=str, + help='Path to the downloaded (and extracted) writing conditions file 3') +parser.add_argument('--padding', type=int, default=400, + help='padding across horizontal/verticle direction') +args = parser.parse_args() + +""" +bounding_box is a named tuple which contains: + area (float): area of the rectangle + length_parallel (float): length of the side that is parallel to unit_vector + length_orthogonal (float): length of the side that is orthogonal to unit_vector + rectangle_center(int, int): coordinates of the rectangle center + (use rectangle_corners to get the corner points of the rectangle) + unit_vector (float, float): direction of the length_parallel side. + (it's orthogonal vector can be found with the orthogonal_vector function + unit_vector_angle (float): angle of the unit vector to be in radians. + corner_points [(float, float)]: set that contains the corners of the rectangle +""" + +bounding_box_tuple = namedtuple('bounding_box_tuple', 'area ' + 'length_parallel ' + 'length_orthogonal ' + 'rectangle_center ' + 'unit_vector ' + 'unit_vector_angle ' + 'corner_points' + ) + + +def unit_vector(pt0, pt1): + """ Given two points pt0 and pt1, return a unit vector that + points in the direction of pt0 to pt1. + Returns + ------- + (float, float): unit vector + """ + dis_0_to_1 = sqrt((pt0[0] - pt1[0])**2 + (pt0[1] - pt1[1])**2) + return (pt1[0] - pt0[0]) / dis_0_to_1, \ + (pt1[1] - pt0[1]) / dis_0_to_1 + + +def orthogonal_vector(vector): + """ Given a vector, returns a orthogonal/perpendicular vector of equal length. + Returns + ------ + (float, float): A vector that points in the direction orthogonal to vector. + """ + return -1 * vector[1], vector[0] + + +def bounding_area(index, hull): + """ Given index location in an array and convex hull, it gets two points + hull[index] and hull[index+1]. From these two points, it returns a named + tuple that mainly contains area of the box that bounds the hull. This + bounding box orintation is same as the orientation of the lines formed + by the point hull[index] and hull[index+1]. + Returns + ------- + a named tuple that contains: + area: area of the rectangle + length_parallel: length of the side that is parallel to unit_vector + length_orthogonal: length of the side that is orthogonal to unit_vector + rectangle_center: coordinates of the rectangle center + unit_vector: direction of the length_parallel side. 
+ (it's orthogonal vector can be found with the orthogonal_vector function) + """ + unit_vector_p = unit_vector(hull[index], hull[index+1]) + unit_vector_o = orthogonal_vector(unit_vector_p) + + dis_p = tuple(np.dot(unit_vector_p, pt) for pt in hull) + dis_o = tuple(np.dot(unit_vector_o, pt) for pt in hull) + + min_p = min(dis_p) + min_o = min(dis_o) + len_p = max(dis_p) - min_p + len_o = max(dis_o) - min_o + + return {'area': len_p * len_o, + 'length_parallel': len_p, + 'length_orthogonal': len_o, + 'rectangle_center': (min_p + len_p / 2, min_o + len_o / 2), + 'unit_vector': unit_vector_p, + } + + +def to_xy_coordinates(unit_vector_angle, point): + """ Given angle from horizontal axis and a point from origin, + returns converted unit vector coordinates in x, y coordinates. + angle of unit vector should be in radians. + Returns + ------ + (float, float): converted x,y coordinate of the unit vector. + """ + angle_orthogonal = unit_vector_angle + pi / 2 + return point[0] * cos(unit_vector_angle) + point[1] * cos(angle_orthogonal), \ + point[0] * sin(unit_vector_angle) + point[1] * sin(angle_orthogonal) + + +def rotate_points(center_of_rotation, angle, points): + """ Rotates a point cloud around the center_of_rotation point by angle + input + ----- + center_of_rotation (float, float): angle of unit vector to be in radians. + angle (float): angle of rotation to be in radians. + points [(float, float)]: Points to be a list or tuple of points. Points to be rotated. + Returns + ------ + [(float, float)]: Rotated points around center of rotation by angle + """ + rot_points = [] + ang = [] + for pt in points: + diff = tuple([pt[d] - center_of_rotation[d] for d in range(2)]) + diff_angle = atan2(diff[1], diff[0]) + angle + ang.append(diff_angle) + diff_length = sqrt(sum([d**2 for d in diff])) + rot_points.append((center_of_rotation[0] + diff_length * cos(diff_angle), + center_of_rotation[1] + diff_length * sin(diff_angle))) + + return rot_points + + +def rectangle_corners(rectangle): + """ Given rectangle center and its inclination, returns the corner + locations of the rectangle. + Returns + ------ + [(float, float)]: 4 corner points of rectangle. + """ + corner_points = [] + for i1 in (.5, -.5): + for i2 in (i1, -1 * i1): + corner_points.append((rectangle['rectangle_center'][0] + i1 * rectangle['length_parallel'], + rectangle['rectangle_center'][1] + i2 * rectangle['length_orthogonal'])) + + return rotate_points(rectangle['rectangle_center'], rectangle['unit_vector_angle'], corner_points) + + +def get_orientation(origin, p1, p2): + """ + Given origin and two points, return the orientation of the Point p1 with + regards to Point p2 using origin. + Returns + ------- + integer: Negative if p1 is clockwise of p2. + """ + difference = ( + ((p2[0] - origin[0]) * (p1[1] - origin[1])) + - ((p1[0] - origin[0]) * (p2[1] - origin[1])) + ) + return difference + + +def compute_hull(points): + """ + Given input list of points, return a list of points that + made up the convex hull. 
+ Returns + ------- + [(float, float)]: convexhull points + """ + hull_points = [] + start = points[0] + min_x = start[0] + for p in points[1:]: + if p[0] < min_x: + min_x = p[0] + start = p + + point = start + hull_points.append(start) + + far_point = None + while far_point is not start: + p1 = None + for p in points: + if p is point: + continue + else: + p1 = p + break + + far_point = p1 + + for p2 in points: + if p2 is point or p2 is p1: + continue + else: + direction = get_orientation(point, far_point, p2) + if direction > 0: + far_point = p2 + + hull_points.append(far_point) + point = far_point + return hull_points + + +def minimum_bounding_box(points): + """ Given a list of 2D points, it returns the minimum area rectangle bounding all + the points in the point cloud. + Returns + ------ + returns a namedtuple that contains: + area: area of the rectangle + length_parallel: length of the side that is parallel to unit_vector + length_orthogonal: length of the side that is orthogonal to unit_vector + rectangle_center: coordinates of the rectangle center + unit_vector: direction of the length_parallel side. RADIANS + unit_vector_angle: angle of the unit vector + corner_points: set that contains the corners of the rectangle + """ + + if len(points) <= 2: raise ValueError('More than two points required.') + + hull_ordered = [points[index] for index in ConvexHull(points).vertices] + hull_ordered.append(hull_ordered[0]) + #hull_ordered = compute_hull(points) + hull_ordered = tuple(hull_ordered) + + min_rectangle = bounding_area(0, hull_ordered) + for i in range(1, len(hull_ordered)-1): + rectangle = bounding_area(i, hull_ordered) + if rectangle['area'] < min_rectangle['area']: + min_rectangle = rectangle + + min_rectangle['unit_vector_angle'] = atan2(min_rectangle['unit_vector'][1], min_rectangle['unit_vector'][0]) + min_rectangle['rectangle_center'] = to_xy_coordinates(min_rectangle['unit_vector_angle'], min_rectangle['rectangle_center']) + + return bounding_box_tuple( + area = min_rectangle['area'], + length_parallel = min_rectangle['length_parallel'], + length_orthogonal = min_rectangle['length_orthogonal'], + rectangle_center = min_rectangle['rectangle_center'], + unit_vector = min_rectangle['unit_vector'], + unit_vector_angle = min_rectangle['unit_vector_angle'], + corner_points = set(rectangle_corners(min_rectangle)) + ) + + +def get_center(im): + """ Given image, returns the location of center pixel + Returns + ------- + (int, int): center of the image + """ + center_x = im.size[0] / 2 + center_y = im.size[1] / 2 + return int(center_x), int(center_y) + + +def get_horizontal_angle(unit_vector_angle): + """ Given an angle in radians, returns angle of the unit vector in + first or fourth quadrant. + Returns + ------ + (float): updated angle of the unit vector to be in radians. + It is only in first or fourth quadrant. + """ + if unit_vector_angle > pi / 2 and unit_vector_angle <= pi: + unit_vector_angle = unit_vector_angle - pi + elif unit_vector_angle > -pi and unit_vector_angle < -pi / 2: + unit_vector_angle = unit_vector_angle + pi + + return unit_vector_angle + + +def get_smaller_angle(bounding_box): + """ Given a rectangle, returns its smallest absolute angle from horizontal axis. + Returns + ------ + (float): smallest angle of the rectangle to be in radians. 
+ """ + unit_vector = bounding_box.unit_vector + unit_vector_angle = bounding_box.unit_vector_angle + ortho_vector = orthogonal_vector(unit_vector) + ortho_vector_angle = atan2(ortho_vector[1], ortho_vector[0]) + + unit_vector_angle_updated = get_horizontal_angle(unit_vector_angle) + ortho_vector_angle_updated = get_horizontal_angle(ortho_vector_angle) + + if abs(unit_vector_angle_updated) < abs(ortho_vector_angle_updated): + return unit_vector_angle_updated + else: + return ortho_vector_angle_updated + + +def rotated_points(bounding_box, center): + """ Given the rectangle, returns corner points of rotated rectangle. + It rotates the rectangle around the center by its smallest angle. + Returns + ------- + [(int, int)]: 4 corner points of rectangle. + """ + p1, p2, p3, p4 = bounding_box.corner_points + x1, y1 = p1 + x2, y2 = p2 + x3, y3 = p3 + x4, y4 = p4 + center_x, center_y = center + rotation_angle_in_rad = -get_smaller_angle(bounding_box) + x_dash_1 = (x1 - center_x) * cos(rotation_angle_in_rad) - (y1 - center_y) * sin(rotation_angle_in_rad) + center_x + x_dash_2 = (x2 - center_x) * cos(rotation_angle_in_rad) - (y2 - center_y) * sin(rotation_angle_in_rad) + center_x + x_dash_3 = (x3 - center_x) * cos(rotation_angle_in_rad) - (y3 - center_y) * sin(rotation_angle_in_rad) + center_x + x_dash_4 = (x4 - center_x) * cos(rotation_angle_in_rad) - (y4 - center_y) * sin(rotation_angle_in_rad) + center_x + + y_dash_1 = (y1 - center_y) * cos(rotation_angle_in_rad) + (x1 - center_x) * sin(rotation_angle_in_rad) + center_y + y_dash_2 = (y2 - center_y) * cos(rotation_angle_in_rad) + (x2 - center_x) * sin(rotation_angle_in_rad) + center_y + y_dash_3 = (y3 - center_y) * cos(rotation_angle_in_rad) + (x3 - center_x) * sin(rotation_angle_in_rad) + center_y + y_dash_4 = (y4 - center_y) * cos(rotation_angle_in_rad) + (x4 - center_x) * sin(rotation_angle_in_rad) + center_y + return x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 + + +def pad_image(image): + """ Given an image, returns a padded image around the border. + This routine save the code from crashing if bounding boxes that are + slightly outside the page boundary. + Returns + ------- + image: page image + """ + offset = int(args.padding // 2) + padded_image = Image.new('RGB', (image.size[0] + int(args.padding), image.size[1] + int(args.padding)), "white") + padded_image.paste(im = image, box = (offset, offset)) + return padded_image + + +def update_minimum_bounding_box_input(bounding_box_input): + """ Given list of 2D points, returns list of 2D points shifted by an offset. + Returns + ------ + points [(float, float)]: points, a list or tuple of 2D coordinates + """ + updated_minimum_bounding_box_input = [] + offset = int(args.padding // 2) + for point in bounding_box_input: + x, y = point + new_x = x + offset + new_y = y + offset + word_coordinate = (new_x, new_y) + updated_minimum_bounding_box_input.append(word_coordinate) + + return updated_minimum_bounding_box_input + + +def set_line_image_data(image, line_id, image_file_name, image_fh): + """ Given an image, saves a flipped line image. Line image file name + is formed by appending the line id at the end page image name. 
+ """ + + base_name = os.path.splitext(os.path.basename(image_file_name))[0] + line_id = '_' + line_id.zfill(4) + line_image_file_name = base_name + line_id + '.png' + image_path = os.path.join(args.out_dir, line_image_file_name) + imgray = image.convert('L') + imgray_rev_arr = np.fliplr(imgray) + imgray_rev = toimage(imgray_rev_arr) + imgray_rev.save(image_path) + image_fh.write(image_path + '\n') + + +def get_line_images_from_page_image(image_file_name, madcat_file_path, image_fh): + """ Given a page image, extracts the line images from it. + Input + ----- + image_file_name (string): complete path and name of the page image. + madcat_file_path (string): complete path and name of the madcat xml file + corresponding to the page image. + """ + im_wo_pad = Image.open(image_file_name) + im = pad_image(im_wo_pad) + doc = minidom.parse(madcat_file_path) + zone = doc.getElementsByTagName('zone') + for node in zone: + id = node.getAttribute('id') + token_image = node.getElementsByTagName('token-image') + minimum_bounding_box_input = [] + for token_node in token_image: + word_point = token_node.getElementsByTagName('point') + for word_node in word_point: + word_coordinate = (int(word_node.getAttribute('x')), int(word_node.getAttribute('y'))) + minimum_bounding_box_input.append(word_coordinate) + updated_mbb_input = update_minimum_bounding_box_input(minimum_bounding_box_input) + bounding_box = minimum_bounding_box(updated_mbb_input) + + p1, p2, p3, p4 = bounding_box.corner_points + x1, y1 = p1 + x2, y2 = p2 + x3, y3 = p3 + x4, y4 = p4 + min_x = int(min(x1, x2, x3, x4)) + min_y = int(min(y1, y2, y3, y4)) + max_x = int(max(x1, x2, x3, x4)) + max_y = int(max(y1, y2, y3, y4)) + box = (min_x, min_y, max_x, max_y) + region_initial = im.crop(box) + rot_points = [] + p1_new = (x1 - min_x, y1 - min_y) + p2_new = (x2 - min_x, y2 - min_y) + p3_new = (x3 - min_x, y3 - min_y) + p4_new = (x4 - min_x, y4 - min_y) + rot_points.append(p1_new) + rot_points.append(p2_new) + rot_points.append(p3_new) + rot_points.append(p4_new) + + cropped_bounding_box = bounding_box_tuple(bounding_box.area, + bounding_box.length_parallel, + bounding_box.length_orthogonal, + bounding_box.length_orthogonal, + bounding_box.unit_vector, + bounding_box.unit_vector_angle, + set(rot_points) + ) + + rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box) + img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample = Image.BICUBIC) + x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 = rotated_points( + cropped_bounding_box, get_center(region_initial)) + + min_x = int(min(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + min_y = int(min(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + max_x = int(max(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + box = (min_x, min_y, max_x, max_y) + region_final = img2.crop(box) + set_line_image_data(region_final, id, image_file_name, image_fh) + + +def check_file_location(base_name, wc_dict1, wc_dict2, wc_dict3): + """ Returns the complete path of the page image and corresponding + xml file. + Returns + ------- + image_file_name (string): complete path and name of the page image. + madcat_file_path (string): complete path and name of the madcat xml file + corresponding to the page image. 
+ """ + madcat_file_path1 = os.path.join(args.database_path1, 'madcat', base_name + '.madcat.xml') + madcat_file_path2 = os.path.join(args.database_path2, 'madcat', base_name + '.madcat.xml') + madcat_file_path3 = os.path.join(args.database_path3, 'madcat', base_name + '.madcat.xml') + + image_file_path1 = os.path.join(args.database_path1, 'images', base_name + '.tif') + image_file_path2 = os.path.join(args.database_path2, 'images', base_name + '.tif') + image_file_path3 = os.path.join(args.database_path3, 'images', base_name + '.tif') + + if os.path.exists(madcat_file_path1): + return madcat_file_path1, image_file_path1, wc_dict1 + + if os.path.exists(madcat_file_path2): + return madcat_file_path2, image_file_path2, wc_dict2 + + if os.path.exists(madcat_file_path3): + return madcat_file_path3, image_file_path3, wc_dict3 + + return None, None, None + + +def parse_writing_conditions(writing_conditions): + """ Given writing condition file path, returns a dictionary which have writing condition + of each page image. + Returns + ------ + (dict): dictionary with key as page image name and value as writing condition. + """ + with open(writing_conditions) as f: + file_writing_cond = dict() + for line in f: + line_list = line.strip().split("\t") + file_writing_cond[line_list[0]] = line_list[3] + return file_writing_cond + + +def check_writing_condition(wc_dict, base_name): + """ Given writing condition dictionary, checks if a page image is writing + in a specifed writing condition. + It is used to create subset of dataset based on writing condition. + Returns + (bool): True if writing condition matches. + """ + return True + writing_condition = wc_dict[base_name].strip() + if writing_condition != 'IUC': + return False + + return True + + +### main ### + +def main(): + + wc_dict1 = parse_writing_conditions(args.writing_condition1) + wc_dict2 = parse_writing_conditions(args.writing_condition2) + wc_dict3 = parse_writing_conditions(args.writing_condition3) + output_directory = args.out_dir + image_file = os.path.join(output_directory, 'images.scp') + image_fh = open(image_file, 'w', encoding='utf-8') + + splits_handle = open(args.data_splits, 'r') + splits_data = splits_handle.read().strip().split('\n') + prev_base_name = '' + for line in splits_data: + base_name = os.path.splitext(os.path.splitext(line.split(' ')[0])[0])[0] + if prev_base_name != base_name: + prev_base_name = base_name + madcat_file_path, image_file_path, wc_dict = check_file_location(base_name, wc_dict1, wc_dict2, wc_dict3) + if wc_dict is None or not check_writing_condition(wc_dict, base_name): + continue + if madcat_file_path is not None: + get_line_images_from_page_image(image_file_path, madcat_file_path, image_fh) + + +if __name__ == '__main__': + main() + diff --git a/egs/madcat_ar/v1/local/download_data.sh b/egs/madcat_ar/v1/local/download_data.sh new file mode 100755 index 00000000000..7061be49c2a --- /dev/null +++ b/egs/madcat_ar/v1/local/download_data.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# Copyright 2018 Ashish Arora +# Apache 2.0 + +# This script downloads data splits for MADCAT Arabic dataset. +# It also check if madcat arabic data is present or not. 
+ +download_dir1=/export/corpora/LDC/LDC2012T15/data +download_dir2=/export/corpora/LDC/LDC2013T09/data +download_dir3=/export/corpora/LDC/LDC2013T15/data +train_split_url=http://www.openslr.org/resources/48/madcat.train.raw.lineid +test_split_url=http://www.openslr.org/resources/48/madcat.test.raw.lineid +dev_split_url=http://www.openslr.org/resources/48/madcat.dev.raw.lineid +data_splits=data/download/data_splits + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +if [ -d $data_splits ]; then + echo "$0: Not downloading the data splits as it is already there." +else + if [ ! -f $data_splits/madcat.train.raw.lineid ]; then + mkdir -p $data_splits + echo "$0: Downloading the data splits..." + wget -P $data_splits $train_split_url || exit 1; + wget -P $data_splits $test_split_url || exit 1; + wget -P $data_splits $dev_split_url || exit 1; + fi + echo "$0: Done downloading the data splits" +fi + +if [ -d $download_dir1 ]; then + echo "$0: madcat arabic data directory is present." +else + if [ ! -f $download_dir1/madcat/*.madcat.xml ]; then + echo "$0: please download madcat data..." + fi +fi diff --git a/egs/madcat_ar/v1/local/extract_features.sh b/egs/madcat_ar/v1/local/extract_features.sh new file mode 100755 index 00000000000..70c5498626c --- /dev/null +++ b/egs/madcat_ar/v1/local/extract_features.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Copyright 2017 Yiwen Shao +# 2018 Ashish Arora + +nj=4 +cmd=run.pl +feat_dim=40 +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +data=$1 +featdir=$data/data +scp=$data/images.scp +logdir=$data/log + +mkdir -p $logdir +mkdir -p $featdir + +# make $featdir an absolute pathname +featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}` + +for n in $(seq $nj); do + split_scps="$split_scps $logdir/images.$n.scp" +done + +# split images.scp +utils/split_scp.pl $scp $split_scps || exit 1; + +$cmd JOB=1:$nj $logdir/extract_features.JOB.log \ + local/make_features.py $logdir/images.JOB.scp \ + --allowed_len_file_path $data/allowed_lengths.txt \ + --feat-dim $feat_dim \| \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp + +## aggregates the output scp's to get feats.scp +for n in $(seq $nj); do + cat $featdir/images.$n.scp || exit 1; +done > $data/feats.scp || exit 1 diff --git a/egs/madcat_ar/v1/local/extract_lines.sh b/egs/madcat_ar/v1/local/extract_lines.sh new file mode 100755 index 00000000000..50129ad38c9 --- /dev/null +++ b/egs/madcat_ar/v1/local/extract_lines.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# Copyright 2018 Ashish Arora + +nj=4 +cmd=run.pl +download_dir1=/export/corpora/LDC/LDC2012T15/data +download_dir2=/export/corpora/LDC/LDC2013T09/data +download_dir3=/export/corpora/LDC/LDC2013T15/data +writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab +writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab +writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab +data_split_file=data/download/data_splits/madcat.dev.raw.lineid +data=data/local/dev +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh || exit 1; + +log_dir=$data/log + +mkdir -p $log_dir +mkdir -p $data + +for n in $(seq $nj); do + split_scps="$split_scps $log_dir/lines.$n.scp" +done + +utils/split_scp.pl $data_split_file $split_scps || exit 1; + +for n in $(seq $nj); do + mkdir -p $data/$n +done + +$cmd JOB=1:$nj $log_dir/extract_lines.JOB.log \ + local/create_line_image_from_page_image.py $download_dir1 $download_dir2 $download_dir3 \ + $log_dir/lines.JOB.scp $data/JOB $writing_condition1 $writing_condition2 $writing_condition3 \ + || exit 1; + +## concatenate the .scp files together. +for n in $(seq $nj); do + cat $data/$n/images.scp || exit 1; +done > $data/images.scp || exit 1 diff --git a/egs/madcat_ar/v1/local/make_features.py b/egs/madcat_ar/v1/local/make_features.py new file mode 100755 index 00000000000..a21276d32c2 --- /dev/null +++ b/egs/madcat_ar/v1/local/make_features.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2018 Hossein Hadian + +""" This script converts images to Kaldi-format feature matrices. The input to + this script is the path to a data directory, e.g. "data/train". This script + reads the images listed in images.scp and writes them to standard output + (by default) as Kaldi-formatted matrices (in text form). It also scales the + images so they have the same height (via --feat-dim). It can optionally pad + the images (on left/right sides) with white pixels. + If an 'image2num_frames' file is found in the data dir, it will be used + to enforce the images to have the specified length in that file by padding + white pixels (the --padding option will be ignored in this case). This relates + to end2end chain training. + + eg. local/make_features.py data/train --feat-dim 40 +""" + +import argparse +import os +import sys +import numpy as np +from scipy import misc + +parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and + writes them to standard output in text format.""") +parser.add_argument('images_scp_path', type=str, + help='Path of images.scp file') +parser.add_argument('--allowed_len_file_path', type=str, default=None, + help='If supplied, each images will be padded to reach the ' + 'target length (this overrides --padding).') +parser.add_argument('--out-ark', type=str, default='-', + help='Where to write the output feature file') +parser.add_argument('--feat-dim', type=int, default=40, + help='Size to scale the height of all images') +parser.add_argument('--padding', type=int, default=5, + help='Number of white pixels to pad on the left' + 'and right side of the image.') + + +args = parser.parse_args() + + +def write_kaldi_matrix(file_handle, matrix, key): + file_handle.write(key + " [ ") + num_rows = len(matrix) + if num_rows == 0: + raise Exception("Matrix is empty") + num_cols = len(matrix[0]) + + for row_index in range(len(matrix)): + if num_cols != len(matrix[row_index]): + raise Exception("All the rows of a matrix are expected to " + "have the same length") + file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + if row_index != num_rows - 1: + file_handle.write("\n") + file_handle.write(" ]\n") + + +def get_scaled_image(im): + scale_size = args.feat_dim + sx = im.shape[1] # width + sy = im.shape[0] # height + scale = (1.0 * scale_size) / sy + nx = int(scale_size) + ny = int(scale * sx) + im = misc.imresize(im, (nx, ny)) + return im + + +def horizontal_pad(im, allowed_lengths = None): + if allowed_lengths is None: + left_padding = 
right_padding = args.padding + else: # Find an allowed length for the image + imlen = im.shape[1] # width + allowed_len = 0 + for l in allowed_lengths: + if l > imlen: + allowed_len = l + break + if allowed_len == 0: + # No allowed length was found for the image (the image is too long) + return None + padding = allowed_len - imlen + left_padding = int(padding // 2) + right_padding = padding - left_padding + dim_y = im.shape[0] # height + im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), + dtype=int), im), axis=1) + im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), + dtype=int)), axis=1) + return im_pad1 + + +### main ### + +data_list_path = args.images_scp_path + +if args.out_ark == '-': + out_fh = sys.stdout +else: + out_fh = open(args.out_ark,'wb') + +allowed_lengths = None +allowed_len_handle = args.allowed_len_file_path +if os.path.isfile(allowed_len_handle): + print("Found 'allowed_lengths.txt' file...", file=sys.stderr) + allowed_lengths = [] + with open(allowed_len_handle) as f: + for line in f: + allowed_lengths.append(int(line.strip())) + print("Read {} allowed lengths and will apply them to the " + "features.".format(len(allowed_lengths)), file=sys.stderr) + +num_fail = 0 +num_ok = 0 +with open(data_list_path) as f: + for line in f: + line = line.strip() + line_vect = line.split(' ') + image_id = line_vect[0] + image_path = line_vect[1] + im = misc.imread(image_path) + im_scaled = get_scaled_image(im) + im_horizontal_padded = horizontal_pad(im_scaled, allowed_lengths) + if im_horizontal_padded is None: + num_fail += 1 + continue + data = np.transpose(im_horizontal_padded, (1, 0)) + data = np.divide(data, 255.0) + num_ok += 1 + write_kaldi_matrix(out_fh, data, image_id) + +print('Generated features for {} images. Failed for {} (image too ' + 'long).'.format(num_ok, num_fail), file=sys.stderr) diff --git a/egs/madcat_ar/v1/local/prepare_data.sh b/egs/madcat_ar/v1/local/prepare_data.sh new file mode 100755 index 00000000000..d808d736845 --- /dev/null +++ b/egs/madcat_ar/v1/local/prepare_data.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 + +# This script prepares the training and test data for MADCAT Arabic dataset +# (i.e text, images.scp, utt2spk and spk2utt). It calls process_data.py. + +# Eg. local/prepare_data.sh +# Eg. text file: LDC0001_000404_NHR_ARB_20070113.0052_11_LDC0001_00z2 ﻮﺠﻫ ﻮﻌﻘﻟ ﻍﺍﺮﻗ ﺢﺗّﻯ ﺎﻠﻨﺧﺎﻋ +# utt2spk file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 LDC0001 +# images.scp file: LDC0009_000000_arb-NG-2-76513-5612324_2_LDC0009_00z0 +# data/local/lines/1/arb-NG-2-76513-5612324_2_LDC0009_00z0.tif + +stage=0 +download_dir1=/export/corpora/LDC/LDC2012T15/data +download_dir2=/export/corpora/LDC/LDC2013T09/data +download_dir3=/export/corpora/LDC/LDC2013T15/data +writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab +writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab +writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab +data_splits_dir=data/download/data_splits +images_scp_dir=data/local + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +mkdir -p data/{train,test,dev} + +if [ $stage -le 1 ]; then + echo "$0: Processing dev, train and test data..." + echo "Date: $(date)." 
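
For reference, before the per-split process_data.py calls: the text-form matrices that make_features.py (above) writes to stdout, and that extract_features.sh pipes into copy-feats, follow the layout shown by this small sketch. The key and values here are invented; only the layout mirrors write_kaldi_matrix(): the key, an opening bracket, one row of pixel values per frame, and a closing bracket.

import sys
import numpy as np

def to_kaldi_text(key, matrix):
    # Same text layout as write_kaldi_matrix() in make_features.py above.
    rows = [" ".join(str(x) for x in row) for row in matrix]
    return key + " [ " + "\n".join(rows) + " ]\n"

feats = np.round(np.random.rand(3, 4), 2)  # toy: 3 frames x 4 feature dims
sys.stdout.write(to_kaldi_text("toy_utt_0001", feats))
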
+ local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ + $data_splits_dir/madcat.dev.raw.lineid data/dev $images_scp_dir/dev/images.scp \ + $writing_condition1 $writing_condition2 $writing_condition3 || exit 1 + + local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ + $data_splits_dir/madcat.test.raw.lineid data/test $images_scp_dir/test/images.scp \ + $writing_condition1 $writing_condition2 $writing_condition3 || exit 1 + + local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ + $data_splits_dir/madcat.train.raw.lineid data/train $images_scp_dir/train/images.scp \ + $writing_condition1 $writing_condition2 $writing_condition3 || exit 1 + + for dataset in dev test train; do + echo "$0: Fixing data directory for dataset: $dataset" + echo "Date: $(date)." + image/fix_data_dir.sh data/$dataset + done +fi diff --git a/egs/madcat_ar/v1/local/prepare_dict.sh b/egs/madcat_ar/v1/local/prepare_dict.sh new file mode 100755 index 00000000000..d8093658c30 --- /dev/null +++ b/egs/madcat_ar/v1/local/prepare_dict.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +# Copyright 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# This script prepares the dictionary. + +set -e +dir=data/local/dict +mkdir -p $dir + +local/prepare_lexicon.py $dir + +cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt diff --git a/egs/madcat_ar/v1/local/prepare_lexicon.py b/egs/madcat_ar/v1/local/prepare_lexicon.py new file mode 100755 index 00000000000..5a6ac5b6dbf --- /dev/null +++ b/egs/madcat_ar/v1/local/prepare_lexicon.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora + +import argparse +import os + +parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""") +parser.add_argument('dir', type=str, help='output path') +args = parser.parse_args() + +### main ### +lex = {} +text_path = os.path.join('data', 'train', 'text') +text_fh = open(text_path, 'r', encoding='utf-8') + +with open(text_path, 'r', encoding='utf-8') as f: + for line in f: + line_vect = line.strip().split(' ') + for i in range(1, len(line_vect)): + characters = list(line_vect[i]) + # Put SIL instead of "|". Because every "|" in the beginning of the words is for initial-space of that word + characters = " ".join(['SIL' if char == '|' else char for char in characters]) + lex[line_vect[i]] = characters + if line_vect[i] == '#': + lex[line_vect[i]] = "" + +with open(os.path.join(args.dir, 'lexicon.txt'), 'w', encoding='utf-8') as fp: + for key in sorted(lex): + fp.write(key + " " + lex[key] + "\n") diff --git a/egs/madcat_ar/v1/local/prepend_words.py b/egs/madcat_ar/v1/local/prepend_words.py new file mode 100755 index 00000000000..d53eb8974bf --- /dev/null +++ b/egs/madcat_ar/v1/local/prepend_words.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# This script, prepend '|' to every words in the transcript to mark +# the beginning of the words for finding the initial-space of every word +# after decoding. 
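
As a concrete illustration of the two helpers above (prepend_words.py here, and prepare_lexicon.py just before it), the sketch below shows the '|' marker being attached to each word and the per-character lexicon entry it produces, with SIL standing in for the word-initial space. The words are invented; the expressions mirror the two scripts.

line = "qala al kitab"                       # toy transliterated line
marked = " ".join("|" + w for w in line.split())
print(marked)                                # -> |qala |al |kitab

word = "|kitab"
entry = " ".join("SIL" if c == "|" else c for c in word)
print(word, entry)                           # -> |kitab SIL k i t a b
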
+ +import sys, io + +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +for line in infile: + output.write(' '.join(["|" + word for word in line.split()]) + '\n') diff --git a/egs/madcat_ar/v1/local/process_data.py b/egs/madcat_ar/v1/local/process_data.py new file mode 100755 index 00000000000..b57500cf2fa --- /dev/null +++ b/egs/madcat_ar/v1/local/process_data.py @@ -0,0 +1,211 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Ashish Arora + +""" This script reads MADCAT files and creates the following files (for the + data subset selected via --dataset) :text, utt2spk, images.scp. + Eg. local/process_data.py data/local /export/corpora/LDC/LDC2012T15 /export/corpora/LDC/LDC2013T09 + /export/corpora/LDC/LDC2013T15 data/download/data_splits/madcat.train.raw.lineid + data/dev data/local/lines/images.scp + Eg. text file: LDC0001_000404_NHR_ARB_20070113.0052_11_LDC0001_00z2 وجه وعقل غارق حتّى النخاع + utt2spk file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 LDC0001 + images.scp file: LDC0009_000000_arb-NG-2-76513-5612324_2_LDC0009_00z0 + data/local/lines/1/arb-NG-2-76513-5612324_2_LDC0009_00z0.tif +""" + +import argparse +import os +import sys +import xml.dom.minidom as minidom +import unicodedata + +parser = argparse.ArgumentParser(description="Creates text, utt2spk and images.scp files", + epilog="E.g. " + sys.argv[0] + " data/LDC2012T15" + " data/LDC2013T09 data/LDC2013T15 data/madcat.train.raw.lineid " + " data/train data/local/lines ", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('database_path1', type=str, + help='Path to the downloaded (and extracted) madcat data') +parser.add_argument('database_path2', type=str, + help='Path to the downloaded (and extracted) madcat data') +parser.add_argument('database_path3', type=str, + help='Path to the downloaded (and extracted) madcat data') +parser.add_argument('data_splits', type=str, + help='Path to file that contains the train/test/dev split information') +parser.add_argument('out_dir', type=str, + help='directory location to write output files.') +parser.add_argument('images_scp_path', type=str, + help='Path of input images.scp file(maps line image and location)') +parser.add_argument('writing_condition1', type=str, + help='Path to the downloaded (and extracted) writing conditions file 1') +parser.add_argument('writing_condition2', type=str, + help='Path to the downloaded (and extracted) writing conditions file 2') +parser.add_argument('writing_condition3', type=str, + help='Path to the downloaded (and extracted) writing conditions file 3') +args = parser.parse_args() + + +def check_file_location(): + """ Returns the complete path of the page image and corresponding + xml file. + Args: + Returns: + image_file_name (string): complete path and name of the page image. + madcat_file_path (string): complete path and name of the madcat xml file + corresponding to the page image. 
+ """ + madcat_file_path1 = os.path.join(args.database_path1, 'madcat', base_name + '.madcat.xml') + madcat_file_path2 = os.path.join(args.database_path2, 'madcat', base_name + '.madcat.xml') + madcat_file_path3 = os.path.join(args.database_path3, 'madcat', base_name + '.madcat.xml') + + image_file_path1 = os.path.join(args.database_path1, 'images', base_name + '.tif') + image_file_path2 = os.path.join(args.database_path2, 'images', base_name + '.tif') + image_file_path3 = os.path.join(args.database_path3, 'images', base_name + '.tif') + + if os.path.exists(madcat_file_path1): + return madcat_file_path1, image_file_path1, wc_dict1 + + if os.path.exists(madcat_file_path2): + return madcat_file_path2, image_file_path2, wc_dict2 + + if os.path.exists(madcat_file_path3): + return madcat_file_path3, image_file_path3, wc_dict3 + + return None, None, None + + +def parse_writing_conditions(writing_conditions): + """ Returns a dictionary which have writing condition of each page image. + Args: + writing_conditions(string): complete path of writing condition file. + Returns: + (dict): dictionary with key as page image name and value as writing condition. + """ + with open(writing_conditions) as f: + file_writing_cond = dict() + for line in f: + line_list = line.strip().split("\t") + file_writing_cond[line_list[0]] = line_list[3] + return file_writing_cond + + +def check_writing_condition(wc_dict): + """ Checks if a given page image is writing in a given writing condition. + It is used to create subset of dataset based on writing condition. + Args: + wc_dict (dict): dictionary with key as page image name and value as writing condition. + Returns: + (bool): True if writing condition matches. + """ + return True + writing_condition = wc_dict[base_name].strip() + if writing_condition != 'IUC': + return False + + return True + + +def get_word_line_mapping(madcat_file_path): + """ Maps every word in the page image to a corresponding line. + Args: + madcat_file_path (string): complete path and name of the madcat xml file + corresponding to the page image. + Returns: + """ + doc = minidom.parse(madcat_file_path) + zone = doc.getElementsByTagName('zone') + for node in zone: + line_id = node.getAttribute('id') + line_word_dict[line_id] = list() + word_image = node.getElementsByTagName('token-image') + for tnode in word_image: + word_id = tnode.getAttribute('id') + line_word_dict[line_id].append(word_id) + word_line_dict[word_id] = line_id + + +def read_text(madcat_file_path): + """ Maps every word in the page image to a corresponding line. + Args: + madcat_file_path (string): complete path and name of the madcat xml file + corresponding to the page image. + Returns: + dict: Mapping every word in the page image to a corresponding line. 
+ """ + text_line_word_dict = dict() + doc = minidom.parse(madcat_file_path) + segment = doc.getElementsByTagName('segment') + for node in segment: + token = node.getElementsByTagName('token') + for tnode in token: + ref_word_id = tnode.getAttribute('ref_id') + word = tnode.getElementsByTagName('source')[0].firstChild.nodeValue + word = unicodedata.normalize('NFKC',word) + ref_line_id = word_line_dict[ref_word_id] + if ref_line_id not in text_line_word_dict: + text_line_word_dict[ref_line_id] = list() + text_line_word_dict[ref_line_id].append(word) + return text_line_word_dict + + +def get_line_image_location(): + image_loc_dict = dict() # Stores image base name and location + image_loc_vect = input_image_fh.read().strip().split("\n") + for line in image_loc_vect: + base_name = os.path.basename(line) + location_vect = line.split('/') + location = "/".join(location_vect[:-1]) + image_loc_dict[base_name]=location + return image_loc_dict + + +### main ### + +print("Processing '{}' data...".format(args.out_dir)) + +text_file = os.path.join(args.out_dir, 'text') +text_fh = open(text_file, 'w', encoding='utf-8') +utt2spk_file = os.path.join(args.out_dir, 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w', encoding='utf-8') +image_file = os.path.join(args.out_dir, 'images.scp') +image_fh = open(image_file, 'w', encoding='utf-8') + +input_image_file = args.images_scp_path +input_image_fh = open(input_image_file, 'r', encoding='utf-8') + +wc_dict1 = parse_writing_conditions(args.writing_condition1) +wc_dict2 = parse_writing_conditions(args.writing_condition2) +wc_dict3 = parse_writing_conditions(args.writing_condition3) +image_loc_dict = get_line_image_location() + +image_num = 0 +with open(args.data_splits) as f: + prev_base_name = '' + for line in f: + base_name = os.path.splitext(os.path.splitext(line.split(' ')[0])[0])[0] + if prev_base_name != base_name: + prev_base_name = base_name + madcat_xml_path, image_file_path, wc_dict = check_file_location() + if wc_dict is None or not check_writing_condition(wc_dict): + continue + if madcat_xml_path is not None: + madcat_doc = minidom.parse(madcat_xml_path) + writer = madcat_doc.getElementsByTagName('writer') + writer_id = writer[0].getAttribute('id') + line_word_dict = dict() + word_line_dict = dict() + get_word_line_mapping(madcat_xml_path) + text_line_word_dict = read_text(madcat_xml_path) + base_name = os.path.basename(image_file_path) + base_name, b = base_name.split('.tif') + for lineID in sorted(text_line_word_dict): + updated_base_name = base_name + '_' + str(lineID).zfill(4) +'.png' + location = image_loc_dict[updated_base_name] + image_file_path = os.path.join(location, updated_base_name) + line = text_line_word_dict[lineID] + text = ' '.join(line) + utt_id = writer_id + '_' + str(image_num).zfill(6) + '_' + base_name + '_' + str(lineID).zfill(4) + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') + image_num += 1 diff --git a/egs/madcat_ar/v1/local/reverse.py b/egs/madcat_ar/v1/local/reverse.py new file mode 100755 index 00000000000..8e8887095ab --- /dev/null +++ b/egs/madcat_ar/v1/local/reverse.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# This script, reverse all latin and digits sequences +# (including words like MP3) to put them in the right order in the images. 
+ +import re, os, sys, io + +in_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +out_stream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +for line in in_stream: + out_stream.write(re.sub(r'[a-zA-Z0-9][a-zA-Z0-9\s\.\:]*[a-zA-Z0-9]', + lambda m:m.group(0)[::-1], line)) diff --git a/egs/madcat_ar/v1/local/score.sh b/egs/madcat_ar/v1/local/score.sh new file mode 100755 index 00000000000..2c11aba3e13 --- /dev/null +++ b/egs/madcat_ar/v1/local/score.sh @@ -0,0 +1,5 @@ +#!/bin/bash + + +steps/scoring/score_kaldi_wer.sh --word_ins_penalty 0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,5.5,6.0,6.5,7.0 "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 --word_ins_penalty 0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,5.5,6.0,6.5,7.0 "$@" diff --git a/egs/madcat_ar/v1/local/train_lm.sh b/egs/madcat_ar/v1/local/train_lm.sh new file mode 100755 index 00000000000..3b8a382cb00 --- /dev/null +++ b/egs/madcat_ar/v1/local/train_lm.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 +# +# This script trains a LM on the MADCAT training transcriptions. +# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 + +echo "$0 $@" # Print the command line for logging +. ./utils/parse_options.sh || exit 1; + +dir=data/local/local_lm +lm_dir=${dir}/data +segments=data/train/segmented_words + + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. + else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. +#bypass_metaparam_optim_opt= +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # use the validation data as the dev set. + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + + cat data/dev/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + + # use the training data as an additional data source. + # we can later fold the dev data into this. + cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/madcat.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. 
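
The wordlist built a few lines below (the tr/sort/uniq -c pipeline) is simply a frequency-ordered list of the training words. An equivalent Python reading, purely for illustration and not used by the recipe, assuming the madcat.txt file created above:

from collections import Counter

# Count words in the pocolm training text and write them most-frequent first,
# mirroring: tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr
with open("data/local/local_lm/data/text/madcat.txt", encoding="utf-8") as f:
    counts = Counter(w for line in f for w in line.split())
with open("data/local/local_lm/data/wordlist", "w", encoding="utf-8") as out:
    out.writelines(word + "\n" for word, _ in counts.most_common())
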
+ cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from MADCAT text + cat ${dir}/data/text/madcat.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +order=3 + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='train=2 madcat=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=5 --warm-start-ratio=1 \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz +fi diff --git a/egs/madcat_ar/v1/local/wer_output_filter b/egs/madcat_ar/v1/local/wer_output_filter new file mode 100755 index 00000000000..c0f03e7178a --- /dev/null +++ b/egs/madcat_ar/v1/local/wer_output_filter @@ -0,0 +1,46 @@ +#!/usr/bin/env perl +# Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal) +# Apache 2.0 + +use utf8; + +use open qw(:encoding(utf8)); +binmode STDIN, ":utf8"; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; + +# Arabic-specific normalization +while (<>) { + @F = split " "; + print $F[0]; + foreach $s (@F[1..$#F]) { + $s =~ s/\x{0623}/\x{0627}/g; + $s =~ s/\x{0625}/\x{0627}/g; + $s =~ s/\x{0622}/\x{0627}/g; + $s =~ s/\x{0624}/\x{0648}/g; + $s =~ s/\x{0626}/\x{064A}/g; + $s =~ s/\x{0649}/\x{064A}/g; + $s =~ s/\x{0629}/\x{0647}/g; + $s =~ s/\x{0660}/0/g; + $s =~ s/\x{0661}/1/g; + $s =~ s/\x{0662}/2/g; + $s =~ s/\x{0663}/3/g; + $s =~ s/\x{0664}/4/g; + $s =~ s/\x{0665}/5/g; + $s =~ s/\x{0666}/6/g; + $s =~ s/\x{0667}/7/g; + $s =~ s/\x{0668}/8/g; + $s =~ s/\x{0669}/9/g; + $s =~ s/\x{0621}//g; + $s =~ s/[\x{064b}-\x{0655}]//g; + $s =~ s/\x{0640}//g; + $s =~ s/\|/ /g; + if ($s ne "") { + print "$s"; + } else { + print ""; + } + } + print "\n"; +} + diff --git a/egs/madcat_ar/v1/path.sh b/egs/madcat_ar/v1/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/madcat_ar/v1/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/madcat_ar/v1/run.sh b/egs/madcat_ar/v1/run.sh new file mode 100755 index 00000000000..14c8bf7a6ce --- /dev/null +++ b/egs/madcat_ar/v1/run.sh @@ -0,0 +1,140 @@ +#!/bin/bash + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2017 Hossein Hadian + +set -e +stage=0 +nj=70 +decode_gmm=false +# download_dir{1,2,3} points to the database path on the JHU grid. 
If you have not +# already downloaded the database you can set it to a local directory +# This corpus can be purchased here: +# https://catalog.ldc.upenn.edu/LDC2012T15, +# https://catalog.ldc.upenn.edu/LDC2013T09/, +# https://catalog.ldc.upenn.edu/LDC2013T15/. +download_dir1=/export/corpora/LDC/LDC2012T15/data +download_dir2=/export/corpora/LDC/LDC2013T09/data +download_dir3=/export/corpora/LDC/LDC2013T15/data +writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab +writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab +writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab +data_splits_dir=data/download/data_splits + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. + +./local/check_tools.sh + +mkdir -p data/{train,test,dev}/data +mkdir -p data/local/{train,test,dev} + +if [ $stage -le 0 ]; then + echo "$0: Downloading data splits..." + echo "Date: $(date)." + local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ + --download_dir2 $download_dir2 --download_dir3 $download_dir3 +fi + +if [ $stage -le 1 ]; then + for dataset in test train dev; do + data_split_file=$data_splits_dir/madcat.$dataset.raw.lineid + local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \ + --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ + --download_dir3 $download_dir3 --writing_condition1 $writing_condition1 \ + --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ + --data data/local/$dataset + done +fi + +if [ $stage -le 2 ]; then + echo "$0: Preparing data..." + local/prepare_data.sh --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ + --download_dir3 $download_dir3 --images_scp_dir data/local \ + --data_splits_dir $data_splits_dir --writing_condition1 $writing_condition1 \ + --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 +fi + +mkdir -p data/{train,test,dev}/data + +if [ $stage -le 3 ]; then + for dataset in test train; do + local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$dataset + steps/compute_cmvn_stats.sh data/$dataset || exit 1; + done + utils/fix_data_dir.sh data/train +fi + +if [ $stage -le 4 ]; then + echo "$0: Preparing dictionary and lang..." + local/prepare_dict.sh + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \ + data/local/dict "" data/lang/temp data/lang +fi + +if [ $stage -le 5 ]; then + echo "$0: Estimating a language model for decoding..." 
+ local/train_lm.sh + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ + data/local/dict/lexicon.txt data/lang_test +fi + +if [ $stage -le 6 ]; then + steps/train_mono.sh --nj $nj --cmd $cmd --totgauss 10000 data/train \ + data/lang exp/mono +fi + +if [ $stage -le 7 ] && $decode_gmm; then + utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph + + steps/decode.sh --nj $nj --cmd $cmd exp/mono/graph data/test \ + exp/mono/decode_test +fi + +if [ $stage -le 8 ]; then + steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ + exp/mono exp/mono_ali + + steps/train_deltas.sh --cmd $cmd 500 20000 data/train data/lang \ + exp/mono_ali exp/tri +fi + +if [ $stage -le 9 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri exp/tri/graph + + steps/decode.sh --nj $nj --cmd $cmd exp/tri/graph data/test \ + exp/tri/decode_test +fi + +if [ $stage -le 10 ]; then + steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ + exp/tri exp/tri_ali + + steps/train_lda_mllt.sh --cmd $cmd \ + --splice-opts "--left-context=3 --right-context=3" 500 20000 \ + data/train data/lang exp/tri_ali exp/tri3 +fi + +if [ $stage -le 11 ] && $decode_gmm; then + utils/mkgraph.sh data/lang_test exp/tri3 exp/tri3/graph + + steps/decode.sh --nj $nj --cmd $cmd exp/tri3/graph \ + data/test exp/tri3/decode_test +fi + +if [ $stage -le 12 ]; then + steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ + data/train data/lang exp/tri3 exp/tri3_ali +fi + +if [ $stage -le 13 ]; then + local/chain/run_cnn_1a.sh +fi + +if [ $stage -le 14 ]; then + local/chain/run_cnn_chainali_1a.sh --stage 2 +fi diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh new file mode 100755 index 00000000000..5d27476d3e1 --- /dev/null +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -0,0 +1,128 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian +# 2018 Ashish Arora +set -e +stage=0 +nj=70 +# download_dir{1,2,3} points to the database path on the JHU grid. If you have not +# already downloaded the database you can set it to a local directory +# This corpus can be purchased here: +# https://catalog.ldc.upenn.edu/LDC2012T15, +# https://catalog.ldc.upenn.edu/LDC2013T09/, +# https://catalog.ldc.upenn.edu/LDC2013T15/. +download_dir1=/export/corpora/LDC/LDC2012T15/data +download_dir2=/export/corpora/LDC/LDC2013T09/data +download_dir3=/export/corpora/LDC/LDC2013T15/data +writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab +writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab +writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab +data_splits_dir=data/download/data_splits + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. +./local/check_tools.sh + +mkdir -p data/{train,test,dev}/data +mkdir -p data/local/{train,test,dev} + +if [ $stage -le 0 ]; then + echo "$0: Downloading data splits..." + echo "Date: $(date)." 
+ local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ + --download_dir2 $download_dir2 --download_dir3 $download_dir3 +fi + +if [ $stage -le 1 ]; then + for dataset in test train dev; do + data_split_file=$data_splits_dir/madcat.$dataset.raw.lineid + local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \ + --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ + --download_dir3 $download_dir3 --writing_condition1 $writing_condition1 \ + --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ + --data data/local/$dataset + done +fi + +if [ $stage -le 2 ]; then + echo "$0: Preparing data..." + local/prepare_data.sh --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ + --download_dir3 $download_dir3 --images_scp_dir data/local \ + --data_splits_dir $data_splits_dir --writing_condition1 $writing_condition1 \ + --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 +fi + +if [ $stage -le 3 ]; then + echo "$0: Obtaining image groups. calling get_image2num_frames" + echo "Date: $(date)." + image/get_image2num_frames.py data/train # This will be needed for the next command + # The next command creates a "allowed_lengths.txt" file in data/train + # which will be used by local/make_features.py to enforce the images to + # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. + echo "$0: Obtaining image groups. calling get_allowed_lengths" + echo "Date: $(date)." + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train +fi + +if [ $stage -le 4 ]; then + for dataset in test train; do + echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. " + echo "Date: $(date)." + local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$dataset + steps/compute_cmvn_stats.sh data/$dataset || exit 1; + done + echo "$0: Fixing data directory for train dataset" + echo "Date: $(date)." + utils/fix_data_dir.sh data/train +fi + +if [ $stage -le 5 ]; then + echo "$0: Preparing dictionary and lang..." + cut -d' ' -f2- data/train/text | local/reverse.py | \ + local/prepend_words.py | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/train/bpe.out + for set in test train dev; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | local/reverse.py | \ + local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/train/bpe.out \ + | sed 's/@@//g' > data/$set/bpe_text + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + done + local/prepare_dict.sh + # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. + # So we set --sil-prob to 0.0 + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang +fi + +if [ $stage -le 6 ]; then + echo "$0: Estimating a language model for decoding..." + local/train_lm.sh + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ + data/local/dict/lexicon.txt data/lang_test +fi + +if [ $stage -le 7 ]; then + echo "$0: Calling the flat-start chain recipe..." + echo "Date: $(date)." + local/chain/run_flatstart_cnn1a.sh --nj $nj +fi + +if [ $stage -le 8 ]; then + echo "$0: Aligning the training data using the e2e chain model..." + echo "Date: $(date)." 
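+  # The e2e chain model was trained with an acoustic scale of 1.0, so we also
+  # align with --acoustic-scale=1.0 and --self-loop-scale=1.0 (rather than the
+  # 0.1 values used for conventional systems); --use-gpu false lets the
+  # alignment jobs run on CPU.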
+ steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ + --use-gpu false \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ + data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train +fi + +if [ $stage -le 9 ]; then + echo "$0: Building a tree and training a regular chain model using the e2e alignments..." + echo "Date: $(date)." + local/chain/run_cnn_e2eali_1b.sh --nj $nj +fi diff --git a/egs/madcat_ar/v1/steps b/egs/madcat_ar/v1/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/madcat_ar/v1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/madcat_ar/v1/utils b/egs/madcat_ar/v1/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/madcat_ar/v1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/mini_librispeech/s5/local/chain/run_tdnn.sh b/egs/mini_librispeech/s5/local/chain/run_tdnn.sh index cebb2b84f16..3922170ac12 120000 --- a/egs/mini_librispeech/s5/local/chain/run_tdnn.sh +++ b/egs/mini_librispeech/s5/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1g.sh \ No newline at end of file +tuning/run_tdnn_1h.sh \ No newline at end of file diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g20.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g20.sh new file mode 100755 index 00000000000..18540806028 --- /dev/null +++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1g20.sh @@ -0,0 +1,324 @@ +#!/bin/bash + +# 1g20 is as 1g but adding the option "--constrained false" to --egs.opts. +# This is the new 'unconstrained egs' code where it uses the e2e examples. +# +# local/chain/compare_wer.sh exp/chain/tdnn1g_sp exp/chain/tdnn1g20_sp +# System tdnn1g_sp tdnn1g20_sp +#WER dev_clean_2 (tgsmall) 13.55 13.55 +#WER dev_clean_2 (tglarge) 9.74 9.66 +# Final train prob -0.0454 -0.0318 +# Final valid prob -0.0920 -0.0800 +# Final train prob (xent) -1.1679 -1.1831 +# Final valid prob (xent) -1.4506 -1.5074 +# Num-params 6227338 6227338 + +# 1g is as 1f but adding dropout (well, something like dropout-- the mask +# is shared across time and it's continuous rather than zero-one), increasing +# the hidden dimension, and training for more epochs. + +# local/chain/compare_wer.sh --online exp/chain/tdnn1f_sp exp/chain/tdnn1g_sp +# System tdnn1f_sp tdnn1g_sp +#WER dev_clean_2 (tgsmall) 14.21 13.76 +# [online:] 14.18 13.72 +#WER dev_clean_2 (tglarge) 10.32 9.65 +# [online:] 10.25 9.85 +# Final train prob -0.0507 -0.0453 +# Final valid prob -0.0912 -0.0892 +# Final train prob (xent) -1.3550 -1.1694 +# Final valid prob (xent) -1.6018 -1.4486 +# Num-params 4205322 6227338 + + +# steps/info/chain_dir_info.pl exp/chain/tdnn1{f,g}_sp +# exp/chain/tdnn1f_sp: num-iters=17 nj=2..5 num-params=4.2M dim=40+100->2309 combine=-0.060->-0.060 (over 1) xent:train/valid[10,16,final]=(-1.61,-1.41,-1.36/-1.82,-1.66,-1.60) logprob:train/valid[10,16,final]=(-0.067,-0.057,-0.051/-0.106,-0.097,-0.091) +# exp/chain/tdnn1g_sp: num-iters=25 nj=2..5 num-params=6.2M dim=40+100->2309 combine=-0.054->-0.053 (over 2) xent:train/valid[15,24,final]=(-1.49,-1.22,-1.17/-1.75,-1.51,-1.45) logprob:train/valid[15,24,final]=(-0.063,-0.050,-0.045/-0.106,-0.096,-0.089) + + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). 
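+# A typical way to reproduce the comparison above, assuming the standard
+# mini_librispeech data preparation and the tdnn1g baseline have already been
+# run (usage sketch):
+#   local/chain/tuning/run_tdnn_1g20.sh
+#   local/chain/compare_wer.sh exp/chain/tdnn1g_sp exp/chain/tdnn1g20_sp
+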
+stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1g20 # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.05 dropout-per-dim-continuous=true" + output_opts="l2-regularize=0.02 bottleneck-dim=192" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=512 + relu-batchnorm-dropout-layer name=tdnn2 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=512 + relu-batchnorm-dropout-layer name=tdnn4 $opts dim=512 input=Append(-1,0,1) + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=512 + relu-batchnorm-dropout-layer name=tdnn6 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-dropout-layer name=tdnn7 $opts dim=512 input=Append(-3,0,3) + relu-batchnorm-dropout-layer name=tdnn8 $opts dim=512 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain $opts dim=512 + output-layer name=output include-log-softmax=false $output_opts dim=$num_targets max-change=1.5 + + # adding the layers for xent 
branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn8 $opts dim=512 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=15 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). 
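+  # For chain models the graph is built with --self-loop-scale 1.0, matching
+  # the scale used in training, instead of the 0.1 default used for
+  # conventional models.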
+ utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 2328 combine=-0.056->-0.055 (over 3) xent:train/valid[15,24,final]=(-1.50,-1.23,-1.17/-1.73,-1.52,-1.45) logprob:train/valid[15,24,final]=(-0.063,-0.051,-0.046/-0.101,-0.094,-0.089) +# exp/chain/tdnn1h_sp: num-iters=34 nj=2..5 num-params=5.2M dim=40+100->2328 combine=-0.049->-0.046 (over 4) xent:train/valid[21,33,final]=(-1.50,-1.22,-1.17/-1.66,-1.44,-1.39) logprob:train/valid[21,33,final]=(-0.068,-0.055,-0.049/-0.097,-0.088,-0.080) +# exp/chain/tdnn1h2_sp: num-iters=34 nj=2..5 num-params=5.2M dim=40+100->2328 combine=-0.049->-0.046 (over 4) xent:train/valid[21,33,final]=(-1.50,-1.22,-1.17/-1.67,-1.43,-1.39) logprob:train/valid[21,33,final]=(-0.068,-0.055,-0.049/-0.096,-0.087,-0.080) +# exp/chain/tdnn1h3_sp: num-iters=34 nj=2..5 num-params=5.2M dim=40+100->2328 combine=-0.050->-0.046 (over 4) xent:train/valid[21,33,final]=(-1.51,-1.23,-1.17/-1.67,-1.45,-1.39) logprob:train/valid[21,33,final]=(-0.068,-0.055,-0.049/-0.097,-0.089,-0.081) + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1h # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +dropout_schedule='0,0@0.20,0.3@0.50,0' +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.015" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + ## adding the layers for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + # adding the layers for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
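+  # The tree is built with a frame-subsampling factor of 3, i.e. the chain
+  # model will output at one frame per 30 ms instead of the usual 10 ms.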
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 11000 data/$train_set $lang $build_tree_ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.0015 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + linear_opts="l2-regularize=0.0015 orthonormal-constraint=-1.0" + output_opts="l2-regularize=0.001" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=1536 + linear-component name=tdnn2l0 dim=320 $linear_opts input=Append(-1,0) + linear-component name=tdnn2l dim=320 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn2 $opts input=Append(0,1) dim=1536 + linear-component name=tdnn3l dim=320 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=1536 input=Append(0,1) + linear-component name=tdnn4l0 dim=320 $linear_opts input=Append(-1,0) + linear-component name=tdnn4l dim=320 $linear_opts input=Append(0,1) + relu-batchnorm-dropout-layer name=tdnn4 $opts input=Append(0,1) dim=1536 + linear-component name=tdnn5l dim=320 $linear_opts + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=1536 input=Append(0, tdnn3l) + linear-component name=tdnn6l0 dim=320 $linear_opts input=Append(-3,0) + linear-component name=tdnn6l dim=320 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn6 $opts input=Append(0,3) dim=1792 + linear-component name=tdnn7l0 dim=320 $linear_opts input=Append(-3,0) + linear-component name=tdnn7l dim=320 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1536 + linear-component name=tdnn8l0 dim=320 $linear_opts input=Append(-3,0) + linear-component name=tdnn8l dim=320 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn8 $opts input=Append(0,3) dim=1792 + linear-component name=tdnn9l0 dim=320 $linear_opts input=Append(-3,0) + linear-component name=tdnn9l dim=320 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn5l) dim=1536 + linear-component name=tdnn10l0 dim=320 $linear_opts input=Append(-3,0) + linear-component name=tdnn10l dim=320 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn10 $opts input=Append(0,3) dim=1792 + linear-component name=tdnn11l0 dim=320 $linear_opts input=Append(-3,0) + linear-component name=tdnn11l dim=320 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn9l,tdnn7l) dim=1536 + linear-component name=prefinal-l dim=320 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1792 + linear-component 
name=prefinal-chain-l dim=320 $linear_opts + batchnorm-component name=prefinal-chain-batchnorm + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1792 + linear-component name=prefinal-xent-l dim=320 $linear_opts + batchnorm-component name=prefinal-xent-batchnorm + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b{03,07,09,18}/$USER/kaldi-data/egs/multi-en-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/multi_a/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate $initial_effective_lrate \ + --trainer.optimization.final-effective-lrate $final_effective_lrate \ + --trainer.max-param-change $max_param_change \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/multi_a/tri5a_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${multi}_${gmm}_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg +fi + +decode_suff=fsh_sw1_tg +graph_dir=$dir/graph_fsh_sw1_tg + +if [ $stage -le 15 ]; then + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + if $rescore && [ ! 
-f data/lang_${multi}_${gmm}_fsh_sw1_fg/G.carpa ]; then + LM_fg=data/local/lm/4gram-mincount/lm_unpruned.gz + utils/build_const_arpa_lm.sh $LM_fg data/lang_${multi}_${gmm}_fsh_sw1_tg data/lang_${multi}_${gmm}_fsh_sw1_fg + fi + for decode_set in rt03 eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/multi_a/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $rescore; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_${multi}_${gmm}_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; + fi + ) & + done +fi + +test_online_decoding=true +lang=data/lang_${multi}_${gmm}_fsh_sw1_tg +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/multi_a/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in rt03 eval2000; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj 50 --cmd "$decode_cmd" $iter_opts \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $rescore; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_${multi}_${gmm}_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in online decoding" + exit 1 + fi +fi + +exit 0; diff --git a/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1a.sh b/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1a.sh new file mode 100755 index 00000000000..98e7c2ed6c1 --- /dev/null +++ b/egs/multi_en/s5/local/chain/tuning/run_tdnn_opgru_1a.sh @@ -0,0 +1,320 @@ +#!/bin/bash +# Copyright 2018 Xiaohui Zhang +# 2017 University of Chinese Academy of Sciences (UCAS) Gaofeng Cheng +# Apache 2.0 + +# This is based on TDNN_LSTM_1b (from egs/swbd/s5c), but using the NormOPGRU to replace the LSTMP, +# and adding chunk-{left,right}-context-initial=0 +# Different from the vanilla OPGRU, Norm-OPGRU adds batchnorm in its output (forward direction) +# and renorm in its recurrence. Experiments show that the TDNN-NormOPGRU could achieve similar +# results than TDNN-LSTMP and BLSTMP in both large or small data sets (80 ~ 2300 Hrs). 
+ +# ./local/chain/compare_wer_general.sh tdnn_5b_sp tdnn_opgru_1a_sp +# System tdnn_5b_sp tdnn_opgru_1a_sp +# WER on eval2000(tg) 11.7 11.6 +# WER on eval2000(fg) 11.5 11.5 +# WER on rt03(tg) 11.9 11.5 +# WER on rt03(fg) 11.5 11.2 +# Final train prob -0.097 -0.088 +# Final valid prob -0.090 -0.088 +# Final train prob (xent) -1.042 -1.048 +# Final valid prob (xent) -0.9712 -1.0253 +# Num-parameters 34818416 37364848 + +# ./steps/info/chain_dir_info.pl exp/multi_a/chain/tdnn_opgru_1a_sp +# exp/multi_a/chain/tdnn_opgru_1a_sp: num-iters=2621 nj=3..16 num-params=37.4M dim=40+100->8504 combine=-0.082->-0.082 (over 2) +# xent:train/valid[1744,2620,final]=(-1.62,-1.05,-1.05/-1.56,-1.02,-1.03) +# logprob:train/valid[1744,2620,final]=(-0.118,-0.089,-0.088/-0.112,-0.089,-0.088) + +# online results +# Eval2000 +# %WER 14.5 | 2628 21594 | 87.6 8.9 3.6 2.1 14.5 49.3 | exp/multi_a/chain/tdnn_opgru_1a_sp_online/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys +# %WER 11.5 | 4459 42989 | 90.1 7.2 2.7 1.6 11.5 46.4 | exp/multi_a/chain/tdnn_opgru_1a_sp_online/decode_eval2000_fsh_sw1_tg/score_8_1.0/eval2000_hires.ctm.filt.sys +# %WER 8.4 | 1831 21395 | 92.8 5.3 1.9 1.1 8.4 41.8 | exp/multi_a/chain/tdnn_opgru_1a_sp_online/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +# %WER 14.4 | 2628 21594 | 87.7 8.8 3.5 2.1 14.4 49.4 | exp/multi_a/chain/tdnn_opgru_1a_sp_online/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys +# %WER 11.4 | 4459 42989 | 90.2 7.1 2.7 1.7 11.4 46.3 | exp/multi_a/chain/tdnn_opgru_1a_sp_online/decode_eval2000_fsh_sw1_fg/score_8_1.0/eval2000_hires.ctm.filt.sys +# %WER 8.3 | 1831 21395 | 92.9 5.2 1.9 1.2 8.3 41.1 | exp/multi_a/chain/tdnn_opgru_1a_sp_online/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys + +# RT03 +# %WER 9.3 | 3970 36721 | 91.6 5.3 3.1 0.9 9.3 40.0 | exp/multi_a/chain/tdnn_opgru_1a_sp_online/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.fsh.filt.sys +# %WER 11.4 | 8420 76157 | 89.8 6.7 3.5 1.2 11.4 42.1 | exp/multi_a/chain/tdnn_opgru_1a_sp_online/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.filt.sys +# %WER 13.3 | 4450 39436 | 88.1 7.9 4.0 1.4 13.3 43.9 | exp/multi_a/chain/tdnn_opgru_1a_sp_online/decode_rt03_fsh_sw1_tg/score_8_0.5/rt03_hires.ctm.swbd.filt.sys +# %WER 9.2 | 3970 36721 | 91.9 5.4 2.7 1.1 9.2 39.6 | exp/multi_a/chain/tdnn_opgru_1a_sp_online/decode_rt03_fsh_sw1_fg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys +# %WER 11.2 | 8420 76157 | 90.0 6.5 3.5 1.2 11.2 41.9 | exp/multi_a/chain/tdnn_opgru_1a_sp_online/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.filt.sys +# %WER 13.1 | 4450 39436 | 88.3 7.8 3.9 1.4 13.1 43.6 | exp/multi_a/chain/tdnn_opgru_1a_sp_online/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys + + + +set -e + +# configs for 'chain' +stage=1 +train_stage=576 +get_egs_stage=-10 +speed_perturb=true +multi=multi_a +gmm=tri5a +dir=exp/multi_a/chain/tdnn_opgru_1a # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= +rescore=true # whether to rescore lattices +dropout_schedule='0,0@0.20,0.2@0.50,0' + +# training options +leftmost_questions_truncate=-1 +num_epochs=4 +chunk_width=150 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.025 +self_repair_scale=0.00001 +label_delay=5 +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk= + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. 
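+# Note: the variables above can be overridden on the command line via
+# utils/parse_options.sh, e.g. (usage sketch) to train from scratch rather
+# than resuming from iteration 576:
+#   local/chain/tuning/run_tdnn_opgru_1a.sh --train-stage -10
+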
+echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 11000 data/$train_set $lang $build_tree_ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + gru_opts="dropout-per-frame=true dropout-proportion=0.0 " + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2, ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=1024 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + norm-opgru-layer name=opgru1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + norm-opgru-layer name=opgru2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + norm-opgru-layer name=opgru3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts + + ## adding the layers for chain branch + output-layer name=output input=opgru3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=opgru3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,7,9,8}/$USER/kaldi-data/egs/multi-en-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/multi_a/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1200000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/multi_a/tri5a_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${multi}_${gmm}_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg +fi + +decode_suff=fsh_sw1_tg +graph_dir=$dir/graph_fsh_sw1_tg +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + [ -z $frames_per_chunk ] && frames_per_chunk=$chunk_width; + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + if $rescore && [ ! 
-f data/lang_${multi}_${gmm}_fsh_sw1_fg/G.carpa ]; then + LM_fg=data/local/lm/4gram-mincount/lm_unpruned.gz + utils/build_const_arpa_lm.sh $LM_fg data/lang_${multi}_${gmm}_fsh_sw1_tg data/lang_${multi}_${gmm}_fsh_sw1_fg + fi + for decode_set in rt03 eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk" \ + --online-ivector-dir exp/multi_a/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $rescore; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_${multi}_${gmm}_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +test_online_decoding=true +lang=data/lang_${multi}_${gmm}_fsh_sw1_tg +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/multi_a/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in rt03 eval2000; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. + + steps/online/nnet3/decode.sh --nj 50 --cmd "$decode_cmd" $iter_opts \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $rescore; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_${multi}_${gmm}_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in online decoding" + exit 1 + fi +fi + +exit 0; diff --git a/egs/multi_en/s5/local/g2p/apply_g2p.sh b/egs/multi_en/s5/local/g2p/apply_g2p.sh index f8e50302c29..8484155800d 100755 --- a/egs/multi_en/s5/local/g2p/apply_g2p.sh +++ b/egs/multi_en/s5/local/g2p/apply_g2p.sh @@ -29,7 +29,7 @@ mkdir -p $workdir echo 'Gathering missing words...' 
cat data/*/train/text | \ local/count_oovs.pl $lexicon | \ - awk '{for(i=4; i 3 ) {for(i=4; i $workdir/missing.txt cat $workdir/missing.txt | \ diff --git a/egs/multi_en/s5/local/make_partitions.sh b/egs/multi_en/s5/local/make_partitions.sh index bf0915029f2..74f23ae9746 100755 --- a/egs/multi_en/s5/local/make_partitions.sh +++ b/egs/multi_en/s5/local/make_partitions.sh @@ -40,8 +40,9 @@ fi # swbd 100k (nodup) if [ $stage -eq 3 ]; then utils/subset_data_dir.sh --speakers data/swbd/train 100000 data/swbd/train_100k - utils/data/remove_dup_utts.sh 200 data/swbd/train_100k $data_dir/tri1b_ali - ln -nfs tri1b_ali $data_dir/tri2 + utils/data/remove_dup_utts.sh 200 data/swbd/train_100k $data_dir/train_100k_nodup + ln -nfs train_100k_nodup $data_dir/tri1b_ali + ln -nfs train_100k_nodup $data_dir/tri2 fi # whole swbd diff --git a/egs/multi_en/s5/local/nnet3/run_ivector_common.sh b/egs/multi_en/s5/local/nnet3/run_ivector_common.sh index ee77222866a..d36cb0e6083 100755 --- a/egs/multi_en/s5/local/nnet3/run_ivector_common.sh +++ b/egs/multi_en/s5/local/nnet3/run_ivector_common.sh @@ -14,67 +14,57 @@ train_stage=-10 generate_alignments=true # false if doing chain training speed_perturb=true multi=multi_a +gmm=tri5a . ./path.sh . ./utils/parse_options.sh # perturbed data preparation -train_set=$multi/tdnn +train_set=$multi/$gmm if [ "$speed_perturb" == "true" ]; then if [ $stage -le 1 ]; then #Although the nnet will be trained by high resolution data, we still have to perturbe the normal data to get the alignment # _sp stands for speed-perturbed - - for datadir in $multi/tdnn; do - utils/perturb_data_dir_speed.sh 0.9 data/${datadir} data/temp1 - utils/perturb_data_dir_speed.sh 1.1 data/${datadir} data/temp2 - utils/combine_data.sh data/${datadir}_tmp data/temp1 data/temp2 - utils/validate_data_dir.sh --no-feats data/${datadir}_tmp - rm -r data/temp1 data/temp2 - - steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \ - data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp || exit 1; - steps/compute_cmvn_stats.sh data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp || exit 1; - utils/fix_data_dir.sh data/${datadir}_tmp - - utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp0 - utils/combine_data.sh data/${datadir}_sp data/${datadir}_tmp data/temp0 - utils/fix_data_dir.sh data/${datadir}_sp - rm -r data/temp0 data/${datadir}_tmp - done + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc.sh --nj 70 --cmd "$train_cmd" \ + data/${train_set}_sp || exit 1 + steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1 + utils/fix_data_dir.sh data/${train_set}_sp || exit 1 fi if [ $stage -le 2 ] && [ "$generate_alignments" == "true" ]; then #obtain the alignment of the perturbed data steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ - data/$multi/tdnn_sp data/lang exp/$multi/tri5 exp/$multi/tri5_ali_sp || exit 1 + data/${train_set}_sp data/lang_${multi}_${gmm} exp/$multi/$gmm exp/$multi/${gmm}_ali_sp || exit 1 fi - train_set=$multi/tdnn_sp + train_set=$multi/${gmm}_sp fi if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. + echo "$0: creating high-resolution MFCC features" + mfccdir=mfcc_hires + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $mfccdir/storage ]; then + date=$(date +'%m_%d_%H_%M') + utils/create_split_dir.pl /export/b0{1,3,7,9}/$USER/kaldi-data/mfcc/multi-en-$date/s5/$mfccdir/storage $mfccdir/storage + fi + # the 100k_nodup directory is copied seperately, as - # we want to use exp/tri1b_ali_100k_nodup for lda_mllt training + # we want to use exp/${multi}/${gmm}_ali_100k_nodup for lda_mllt training # the main train directory might be speed_perturbed - for dataset in $train_set $multi/tdnn_100k; do + for dataset in $train_set $multi/train_100k_nodup; do utils/copy_data_dir.sh data/$dataset data/${dataset}_hires - # scale the waveforms, this is useful as we don't use CMVN - data_dir=data/${dataset}_hires - cat $data_dir/wav.scp | python -c " -import sys, os, subprocess, re, random -scale_low = 1.0/8 -scale_high = 2.0 -for line in sys.stdin.readlines(): - if len(line.strip()) == 0: - continue - print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high)) -"| sort -k1,1 -u > $data_dir/wav.scp_scaled || exit 1; - mv $data_dir/wav.scp_scaled $data_dir/wav.scp - - steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf --cmd "$train_cmd" \ - data/${dataset}_hires exp/make_hires/$dataset; - steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/${dataset}; + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${dataset}_hires + + steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/${dataset} $mfccdir; # Remove the small number of utterances that couldn't be extracted for some # reason (e.g. too short; no such file). @@ -83,58 +73,74 @@ for line in sys.stdin.readlines(): for dataset in eval2000 rt03; do # Create MFCCs for the eval set - utils/copy_data_dir.sh data/$dataset/test data/${dataset}_hires/test + utils/copy_data_dir.sh data/$dataset/test data/${dataset}_hires steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 --mfcc-config conf/mfcc_hires.conf \ - data/${dataset}_hires/test exp/make_hires/$dataset; - steps/compute_cmvn_stats.sh data/${dataset}_hires/test exp/make_hires/$dataset; - utils/fix_data_dir.sh data/${dataset}_hires/test # remove segments with problems + data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/$dataset $mfccdir; + utils/fix_data_dir.sh data/${dataset}_hires # remove segments with problems done - # Take the first 30k utterances, which will be used for the diagubm training + # Take the first 30k utterances (about 1/8th of the data) this will be used + # for the diagubm training utils/subset_data_dir.sh --first data/${train_set}_hires 30000 data/${train_set}_30k_hires - utils/data/remove_dup_utts.sh 200 data/${train_set}_30k_hires data/${train_set}_30k_nodup_hires + utils/data/remove_dup_utts.sh 200 data/${train_set}_30k_hires data/${train_set}_30k_nodup_hires # 33hr fi -# ivector extractor training if [ $stage -le 5 ]; then - # We need to build a small system just because we need the LDA+MLLT transform - # to train the diag-UBM on top of. We use --num-iters 13 because after we get - # the transform (12th iter is the last), any further training is pointless. 
- # this decision is based on fisher_english - steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \ + echo "$0: computing a PCA transform from the hires data." + steps/online/nnet2/get_pca_transform.sh --cmd "$train_cmd" \ --splice-opts "--left-context=3 --right-context=3" \ - 5500 90000 data/$multi/tdnn_100k_hires \ - data/lang exp/$multi/tri4_ali exp/$multi/nnet3/tri2b + --max-utts 10000 --subsample 2 \ + data/${train_set}_30k_nodup_hires exp/$multi/nnet3/pca fi if [ $stage -le 6 ]; then # To train a diagonal UBM we don't need very much data, so use the smallest subset. - steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \ - data/${train_set}_30k_nodup_hires 512 exp/$multi/nnet3/tri2b exp/$multi/nnet3/diag_ubm + echo "$0: training the diagonal UBM." + steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \ + data/${train_set}_30k_nodup_hires 512 exp/$multi/nnet3/pca exp/$multi/nnet3/diag_ubm fi if [ $stage -le 7 ]; then # iVector extractors can be sensitive to the amount of data, but this one has a # fairly small dim (defaults to 100) so we don't use all of it, we use just the # 100k subset (just under half the data). + echo "$0: training the iVector extractor" steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \ - data/$multi/tdnn_100k_hires exp/$multi/nnet3/diag_ubm exp/$multi/nnet3/extractor || exit 1; + data/$multi/train_100k_nodup_hires exp/$multi/nnet3/diag_ubm exp/$multi/nnet3/extractor || exit 1; fi if [ $stage -le 8 ]; then - # We extract iVectors on all the train_nodup data, which will be what we - # train the system on. + # We extract iVectors on the speed-perturbed training data after combining + # short segments, which will be what we train the system on. With + # --utts-per-spk-max 2, the script pairs the utterances into twos, and treats + # each of these pairs as one speaker; this gives more diversity in iVectors.. + # Note that these are extracted 'online'. + + # note, we don't encode the 'max2' in the name of the ivectordir even though + # that's the data we extract the ivectors from, as it's still going to be + # valid for the non-'max2' data, the utterance list is the same. + + ivectordir=exp/$multi/nnet3/ivectors_${train_set} + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $ivectordir/storage ]; then + utils/create_split_dir.pl /export/b0{1,3,7,9}/$USER/kaldi-data/ivectors/fisher_swbd-$(date +'%m_%d_%H_%M')/s5/$ivectordir/storage $ivectordir/storage + fi + # having a larger number of speakers is helpful for generalization, and to # handle per-utterance decoding well (iVector starts at zero). 
- steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_max2_hires + temp_data_root=${ivectordir} + utils/data/modify_speaker_info.sh --utts-per-spk-max 2 \ + data/${train_set}_hires ${temp_data_root}/${train_set}_hires_max2 steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ - data/${train_set}_max2_hires exp/$multi/nnet3/extractor exp/$multi/nnet3/ivectors_$train_set || exit 1; + ${temp_data_root}/${train_set}_hires_max2 \ + exp/$multi/nnet3/extractor $ivectordir + # Also extract iVectors for the test data for data_set in eval2000 rt03; do steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \ - data/${data_set}_hires/test exp/$multi/nnet3/extractor exp/$multi/nnet3/ivectors_$data_set || exit 1; + data/${data_set}_hires exp/$multi/nnet3/extractor exp/$multi/nnet3/ivectors_${data_set} || exit 1; done fi diff --git a/egs/multi_en/s5/local/score_sclite.sh b/egs/multi_en/s5/local/score_sclite.sh index 845caed2e99..07dd63950d5 100755 --- a/egs/multi_en/s5/local/score_sclite.sh +++ b/egs/multi_en/s5/local/score_sclite.sh @@ -2,7 +2,7 @@ ########################################################################################### # This script was copied from egs/fisher_swbd/s5/local/score_sclite.sh -# The source commit was e69198c3dc5633f98eb88e1cdf20b2521a598f21 +# The source commit was 5dfa20aa3da217cc4d51d2e844995db1139b7bcd # No changes were made ########################################################################################### @@ -12,8 +12,7 @@ cmd=run.pl stage=0 min_lmwt=5 -max_lmwt=20 -reverse=false +max_lmwt=17 word_ins_penalty=0.0,0.5,1.0 #end configuration section. @@ -27,7 +26,6 @@ if [ $# -ne 3 ]; then echo " --stage (0|1|2) # start scoring script from part-way through." echo " --min_lmwt # minumum LM-weight for lattice rescoring " echo " --max_lmwt # maximum LM-weight for lattice rescoring " - echo " --reverse (true/false) # score with time reversed features " exit 1; fi @@ -37,7 +35,7 @@ dir=$3 model=$dir/../final.mdl # assume model one level up from decoding dir. -hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl +hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl [ ! -f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1; hubdir=`dirname $hubscr` @@ -46,38 +44,31 @@ for f in $data/stm $data/glm $lang/words.txt $lang/phones/word_boundary.int \ [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; done -# the structure of data dirs in this recipe is like data/eval2000/test -data_dir=`dirname $data`; -name=`basename $data_dir`; # e.g. eval2000 +if [ -f $dir/../frame_shift ]; then + frame_shift_opt="--frame-shift=$(cat $dir/../frame_shift)" + echo "$0: $dir/../frame_shift exists, using $frame_shift_opt" +elif [ -f $dir/../frame_subsampling_factor ]; then + factor=$(cat $dir/../frame_subsampling_factor) || exit 1 + frame_shift_opt="--frame-shift=0.0$factor" + echo "$0: $dir/../frame_subsampling_factor exists, using $frame_shift_opt" +fi + +name=`basename $data`; # e.g. 
eval2000 mkdir -p $dir/scoring/log if [ $stage -le 0 ]; then for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do - if $reverse; then - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.${wip}.log \ - mkdir -p $dir/score_LMWT_${wip}/ '&&' \ - lattice-scale --lm-scale==LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \|\ - lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ - lattice-1best ark:- ark:- \| \ - lattice-reverse ark:- ark:- \| \ - lattice-align-words --reorder=false $lang/phones/word_boundary.int $model ark:- ark:- \| \ - nbest-to-ctm ark:- - \| \ - utils/int2sym.pl -f 5 $lang/words.txt \| \ - utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ - '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1; - else - $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.${wip}.log \ - mkdir -p $dir/score_LMWT_${wip}/ '&&' \ - lattice-scale --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ - lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ - lattice-1best ark:- ark:- \| \ - lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \ - nbest-to-ctm ark:- - \| \ - utils/int2sym.pl -f 5 $lang/words.txt \| \ - utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ - '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1; - fi + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.${wip}.log \ + mkdir -p $dir/score_LMWT_${wip}/ '&&' \ + lattice-scale --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-1best ark:- ark:- \| \ + lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \ + nbest-to-ctm $frame_shift_opt ark:- - \| \ + utils/int2sym.pl -f 5 $lang/words.txt \| \ + utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ + '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1; done fi @@ -102,7 +93,7 @@ fi # Score the set... if [ $stage -le 2 ]; then - for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.${wip}.log \ cp $data/stm $dir/score_LMWT_${wip}/ '&&' \ $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm $dir/score_LMWT_${wip}/${name}.ctm || exit 1; @@ -113,7 +104,7 @@ fi case "$name" in eval2000* ) # Score only the, swbd part... if [ $stage -le 3 ]; then - for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.swbd.LMWT.${wip}.log \ grep -v '^en_' $data/stm '>' $dir/score_LMWT_${wip}/stm.swbd '&&' \ grep -v '^en_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.swbd '&&' \ @@ -122,7 +113,7 @@ case "$name" in eval2000* ) fi # Score only the, callhome part... if [ $stage -le 3 ]; then - for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.callhm.LMWT.${wip}.log \ grep -v '^sw_' $data/stm '>' $dir/score_LMWT_${wip}/stm.callhm '&&' \ grep -v '^sw_' $dir/score_LMWT_${wip}/${name}.ctm '>' $dir/score_LMWT_${wip}/${name}.ctm.callhm '&&' \ @@ -132,7 +123,7 @@ case "$name" in eval2000* ) ;; rt03* ) - + # Score only the swbd part... 
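A note on the frame_shift_opt handling added near the top of this script: models with a frame_subsampling_factor (e.g. chain models) emit output at a reduced frame rate, so the CTM produced by nbest-to-ctm must use a frame shift of factor times 10 ms to stay in real time. A hedged Python sketch of the same derivation; the file names match what the script reads, but the directory is hypothetical.

import os

def frame_shift_opt(model_dir):
    # Mirror of the shell logic: prefer an explicit frame_shift file, else
    # derive the shift from frame_subsampling_factor (0.01 s per frame).
    fs_file = os.path.join(model_dir, "frame_shift")
    fsf_file = os.path.join(model_dir, "frame_subsampling_factor")
    if os.path.isfile(fs_file):
        return "--frame-shift=" + open(fs_file).read().strip()
    if os.path.isfile(fsf_file):
        factor = int(open(fsf_file).read().strip())
        return "--frame-shift=%.2f" % (0.01 * factor)
    return ""  # plain 10 ms frames, nothing extra to pass to nbest-to-ctm

# A model dir containing frame_subsampling_factor=3 yields
# "--frame-shift=0.03", the same value the shell builds as "0.0$factor".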
if [ $stage -le 3 ]; then for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do diff --git a/egs/multi_en/s5/local/train_lms.sh b/egs/multi_en/s5/local/train_lms.sh index e7313b0ed93..02fd66e0368 100755 --- a/egs/multi_en/s5/local/train_lms.sh +++ b/egs/multi_en/s5/local/train_lms.sh @@ -6,7 +6,6 @@ # Changes made: # - Specified path to path.sh # - Modified paths to match multi_en naming conventions -# - Disabled 4-gram LM creation ########################################################################################### # To be run from one directory above this script. @@ -64,7 +63,7 @@ cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1] train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; # Perplexity over 88307.000000 words (excluding 691.000000 OOVs) is 71.241332 -#train_lm.sh --arpa --lmtype 4gram-mincount $dir || exit 1; +train_lm.sh --arpa --lmtype 4gram-mincount $dir || exit 1; # note: output is # data/local/lm/3gram-mincount/lm_unpruned.gz diff --git a/egs/sitw/README.txt b/egs/sitw/README.txt new file mode 100644 index 00000000000..c823c721b8d --- /dev/null +++ b/egs/sitw/README.txt @@ -0,0 +1,16 @@ + + This directory (sitw) contains example scripts for the Speakers in the + Wild (SITW) Speaker Recognition Challenge. The SITW corpus is required, + and can be obtained by following the directions at the url + http://www.speech.sri.com/projects/sitw/ + + Additional data sources (e.g., VoxCeleb and MUSAN) are required to train + the systems in the subdirectories. See the corresponding README.txt files + in the subdirectories for more details. + + Note: This recipe requires ffmpeg to be installed and its location included + in $PATH. + + The subdirectories "v1" and so on are different speaker recognition + recipes. The recipe in v1 is a traditional i-vector system while the v2 + recipe uses DNN embeddings called x-vectors. diff --git a/egs/sitw/v1/README.txt b/egs/sitw/v1/README.txt new file mode 100644 index 00000000000..f5fa2de2314 --- /dev/null +++ b/egs/sitw/v1/README.txt @@ -0,0 +1,14 @@ + + This is a traditional i-vector recipe for Speakers in the Wild. The + following datasets are used: + + Evaluation + + Speakers in the Wild http://www.speech.sri.com/projects/sitw + + System Development + + VoxCeleb 1 http://www.robots.ox.ac.uk/~vgg/data/voxceleb + VoxCeleb 2 http://www.robots.ox.ac.uk/~vgg/data/voxceleb2 + MUSAN http://www.openslr.org/17 + RIR_NOISES http://www.openslr.org/28 diff --git a/egs/sitw/v1/cmd.sh b/egs/sitw/v1/cmd.sh new file mode 100755 index 00000000000..d1ca1a6d126 --- /dev/null +++ b/egs/sitw/v1/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
+ +export train_cmd="queue.pl --mem 4G" + + diff --git a/egs/sitw/v1/conf/mfcc.conf b/egs/sitw/v1/conf/mfcc.conf new file mode 100644 index 00000000000..649cffb9de8 --- /dev/null +++ b/egs/sitw/v1/conf/mfcc.conf @@ -0,0 +1,7 @@ +--sample-frequency=16000 +--frame-length=25 # the default is 25 +--low-freq=20 # the default. +--high-freq=7600 # the default is zero meaning use the Nyquist (8k in this case). +--num-mel-bins=30 +--num-ceps=24 +--snip-edges=false diff --git a/egs/sitw/v1/conf/vad.conf b/egs/sitw/v1/conf/vad.conf new file mode 100644 index 00000000000..a0ca2449b10 --- /dev/null +++ b/egs/sitw/v1/conf/vad.conf @@ -0,0 +1,2 @@ +--vad-energy-threshold=5.5 +--vad-energy-mean-scale=0.5 diff --git a/egs/sitw/v1/local/make_musan.py b/egs/sitw/v1/local/make_musan.py new file mode 100755 index 00000000000..74c434990fb --- /dev/null +++ b/egs/sitw/v1/local/make_musan.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 +# Copyright 2015 David Snyder +# 2018 Ewald Enzinger +# Apache 2.0. +# +# Modified version of egs/sre16/v1/local/make_musan.py (commit e3fb7c4a0da4167f8c94b80f4d3cc5ab4d0e22e8). +# This version uses the raw MUSAN audio files (16 kHz) and does not use sox to resample at 8 kHz. +# +# This file is meant to be invoked by make_musan.sh. + +import os, sys + +def process_music_annotations(path): + utt2spk = {} + utt2vocals = {} + lines = open(path, 'r').readlines() + for line in lines: + utt, genres, vocals, musician = line.rstrip().split()[:4] + # For this application, the musican ID isn't important + utt2spk[utt] = utt + utt2vocals[utt] = vocals == "Y" + return utt2spk, utt2vocals + +def prepare_music(root_dir, use_vocals): + utt2vocals = {} + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + music_dir = os.path.join(root_dir, "music") + for root, dirs, files in os.walk(music_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + elif str(file) == "ANNOTATIONS": + utt2spk_part, utt2vocals_part = process_music_annotations(file_path) + utt2spk.update(utt2spk_part) + utt2vocals.update(utt2vocals_part) + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2vocals: + if utt in utt2wav: + if use_vocals or not utt2vocals[utt]: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print("In music directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") + return utt2spk_str, utt2wav_str + +def prepare_speech(root_dir): + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + speech_dir = os.path.join(root_dir, "speech") + for root, dirs, files in os.walk(speech_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print("In speech directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") + return utt2spk_str, utt2wav_str + +def prepare_noise(root_dir): + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + noise_dir = 
os.path.join(root_dir, "noise") + for root, dirs, files in os.walk(noise_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print("In noise directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") + return utt2spk_str, utt2wav_str + +def main(): + in_dir = sys.argv[1] + out_dir = sys.argv[2] + use_vocals = sys.argv[3] == "Y" + utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals) + utt2spk_speech, utt2wav_speech = prepare_speech(in_dir) + utt2spk_noise, utt2wav_noise = prepare_noise(in_dir) + utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise + utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise + wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') + wav_fi.write(utt2wav) + utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') + utt2spk_fi.write(utt2spk) + + +if __name__=="__main__": + main() diff --git a/egs/sitw/v1/local/make_musan.sh b/egs/sitw/v1/local/make_musan.sh new file mode 100755 index 00000000000..1565ef0d85c --- /dev/null +++ b/egs/sitw/v1/local/make_musan.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# Copyright 2015 David Snyder +# Apache 2.0. +# +# Copy of egs/sre16/v1/local/make_musan.sh (commit e3fb7c4a0da4167f8c94b80f4d3cc5ab4d0e22e8). +# +# This script, called by ../run.sh, creates the MUSAN +# data directory. The required dataset is freely available at +# http://www.openslr.org/17/ + +set -e +in_dir=$1 +data_dir=$2 +use_vocals='Y' + +mkdir -p local/musan.tmp + +echo "Preparing ${data_dir}/musan..." +mkdir -p ${data_dir}/musan +local/make_musan.py ${in_dir} ${data_dir}/musan ${use_vocals} + +utils/fix_data_dir.sh ${data_dir}/musan + +grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music +grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech +grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise +utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \ + ${data_dir}/musan ${data_dir}/musan_music +utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \ + ${data_dir}/musan ${data_dir}/musan_speech +utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \ + ${data_dir}/musan ${data_dir}/musan_noise + +utils/fix_data_dir.sh ${data_dir}/musan_music +utils/fix_data_dir.sh ${data_dir}/musan_speech +utils/fix_data_dir.sh ${data_dir}/musan_noise + +rm -rf local/musan.tmp + diff --git a/egs/sitw/v1/local/make_sitw.sh b/egs/sitw/v1/local/make_sitw.sh new file mode 100755 index 00000000000..7c0bcd0fea1 --- /dev/null +++ b/egs/sitw/v1/local/make_sitw.sh @@ -0,0 +1,86 @@ +#!/bin/bash +# Copyrigh 2017 Ignacio Viñals +# 2017-2018 David Snyder +# +# This script prepares the SITW data. It creates separate directories +# for dev enroll, eval enroll, dev test, and eval test. It also prepares +# multiple trials files, in the test directories, but we usually only use the +# core-core.lst list. 
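The trials files mentioned above are derived from the SITW key lists further down in this script: the audio/ prefix and .flac extension are stripped from the test-file field, the tgt label is mapped to Kaldi's "target", and anything else to "nontarget". A small Python sketch of that per-line conversion; the example lines are made up.

def key_to_trial(line):
    # e.g. "sp0001 audio/abcde.flac tgt" -> "sp0001 abcde target"
    enroll_id, path, label = line.split()[:3]
    utt_id = path.replace("audio/", "").replace(".flac", "")
    return "%s %s %s" % (enroll_id, utt_id,
                         "target" if label == "tgt" else "nontarget")

print(key_to_trial("sp0001 audio/abcde.flac tgt"))   # sp0001 abcde target
print(key_to_trial("sp0001 audio/fghij.flac imp"))   # sp0001 fghij nontarget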
+ +if [ $# != 2 ]; then + echo "Usage: make_sitw.sh " + echo "E.g.: make_sitw.sh /export/corpora/SRI/sitw data" + exit 1 +fi + +in_dir=$1 +out_dir=$2 + +# Prepare the enrollment data +for mode in dev eval; do + this_out_dir=${out_dir}/sitw_${mode}_enroll + mkdir -p $this_out_dir 2>/dev/null + WAVFILE=$this_out_dir/wav.scp + SPKFILE=$this_out_dir/utt2spk + MODFILE=$this_out_dir/utt2cond + rm $WAVFILE $SPKFILE $MODFILE 2>/dev/null + this_in_dir=${in_dir}/$mode + + for enroll in core assist; do + cat $this_in_dir/lists/enroll-${enroll}.lst | \ + while read line; do + wav_id=`echo $line| awk '{print $2}' |\ + awk 'BEGIN{FS="[./]"}{print $(NF-1)}'` + spkr_id=`echo $line| awk '{print $1}'` + WAV=`echo $line | awk '{print this_in_dir"/"$2}' this_in_dir=$this_in_dir` + echo "${spkr_id}_${wav_id} sox -t flac $WAV -t wav -r 16k -b 16 - channels 1 |" >> $WAVFILE + echo "${spkr_id}_${wav_id} ${spkr_id}" >> $SPKFILE + echo "${spkr_id}_${wav_id} $enroll $mode" >> $MODFILE + done + done + utils/fix_data_dir.sh $this_out_dir +done + +# Prepare the test data +for mode in dev eval; do + this_out_dir=${out_dir}/sitw_${mode}_test + mkdir -p $this_out_dir 2>/dev/null + WAVFILE=$this_out_dir/wav.scp + SPKFILE=$this_out_dir/utt2spk + MODFILE=$this_out_dir/utt2cond + rm $WAVFILE $SPKFILE $MODFILE 2>/dev/null + mkdir -p $this_out_dir/trials 2>/dev/null + mkdir -p $this_out_dir/trials/aux 2>/dev/null + this_in_dir=${in_dir}/$mode + + for trial in core multi; do + cat $this_in_dir/lists/test-${trial}.lst | awk '{print $1,$2}' |\ + while read line; do + wav_id=`echo $line | awk 'BEGIN{FS="[./]"} {print $(NF-1)}'` + WAV=`echo $line | awk '{print this_in_dir"/"$1}' this_in_dir=$this_in_dir` + echo "${wav_id} sox -t flac $WAV -t wav -r 16k -b 16 - channels 1 |" >> $WAVFILE + echo "${wav_id} ${wav_id}" >> $SPKFILE + echo "${wav_id} $trial $mode" >> $MODFILE + done + done + + for trial in core-core core-multi assist-core assist-multi; do + cat $this_in_dir/keys/$trial.lst | sed 's@audio/@@g' | sed 's@.flac@@g' |\ + awk '{if ($3=="tgt") + {print $1,$2,"target"} + else + {print $1,$2,"nontarget"} + }' > $this_out_dir/trials/${trial}.lst + done + + for trial in $this_in_dir/keys/aux/* ; do + trial_name=`basename $trial` + cat $trial | sed 's@audio/@@g' | sed 's@.flac@@g' |\ + awk '{if ($3=="tgt") + {print $1,$2,"target"} + else + {print $1,$2,"nontarget"} + }' > $this_out_dir/trials/aux/${trial_name} + done + utils/fix_data_dir.sh $this_out_dir +done diff --git a/egs/sitw/v1/local/make_voxceleb1.pl b/egs/sitw/v1/local/make_voxceleb1.pl new file mode 100755 index 00000000000..e56483563b8 --- /dev/null +++ b/egs/sitw/v1/local/make_voxceleb1.pl @@ -0,0 +1,84 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# 2018 David Snyder +# +# Usage: make_voxceleb1.pl /export/voxceleb1 data/ +# Note that this script also downloads a list of speakers that overlap +# with our evaluation set, SITW. These speakers are removed from VoxCeleb1 +# prior to preparing the dataset. + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/voxceleb1 data/\n"; + exit(1); +} + +($data_base, $out_dir) = @ARGV; +my $out_dir = "$out_dir/voxceleb1"; + +if (system("mkdir -p $out_dir") != 0) { + die "Error making directory $out_dir"; +} + +# This file provides the list of speakers that overlap between SITW and VoxCeleb1. +if (! 
-e "$out_dir/voxceleb1_sitw_overlap.txt") { + system("wget -O $out_dir/voxceleb1_sitw_overlap.txt http://www.openslr.org/resources/49/voxceleb1_sitw_overlap.txt"); +} + +# sitw_overlap contains the list of speakers that also exist in our evaluation set, SITW. +my %sitw_overlap = (); +open(OVERLAP, "<", "$out_dir/voxceleb1_sitw_overlap.txt") or die "Could not open the overlap file $out_dir/voxceleb1_sitw_overlap.txt"; +while () { + chomp; + my $spkr_id = $_; + $sitw_overlap{$spkr_id} = (); +} + +opendir my $dh, "$data_base/voxceleb1_wav" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$data_base/voxceleb1_wav/$_" && ! /^\.{1,2}$/} readdir($dh); +closedir $dh; + +open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; + +foreach (@spkr_dirs) { + my $spkr_id = $_; + # Only keep the speaker if it isn't in the overlap list. + if (not exists $sitw_overlap{$spkr_id}) { + opendir my $dh, "$data_base/voxceleb1_wav/$spkr_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); + closedir $dh; + foreach (@files) { + my $filename = $_; + my $rec_id = substr($filename, 0, 11); + my $segment = substr($filename, 12, 7); + my $utt_id = "$spkr_id-$rec_id-$segment"; + my $wav = "$data_base/voxceleb1_wav/$spkr_id/$filename.wav"; + print WAV "$utt_id", " $wav", "\n"; + print SPKR "$utt_id", " $spkr_id", "\n"; + } + } +} + +close(SPKR) or die; +close(WAV) or die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} + +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/sitw/v1/local/make_voxceleb2.pl b/egs/sitw/v1/local/make_voxceleb2.pl new file mode 100755 index 00000000000..34c1591eba3 --- /dev/null +++ b/egs/sitw/v1/local/make_voxceleb2.pl @@ -0,0 +1,70 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# +# Usage: make_voxceleb2.pl /export/voxceleb2 dev data/dev +# +# Note: This script requires ffmpeg to be installed and its location included in $PATH. + +if (@ARGV != 3) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/voxceleb2 dev data/dev\n"; + exit(1); +} + +# Check that ffmpeg is installed. +if (`which ffmpeg` eq "") { + die "Error: this script requires that ffmpeg is installed."; +} + +($data_base, $dataset, $out_dir) = @ARGV; + +if ("$dataset" ne "dev" && "$dataset" ne "test") { + die "dataset parameter must be 'dev' or 'test'!"; +} + +opendir my $dh, "$data_base/$dataset/aac" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$data_base/$dataset/aac/$_" && ! 
/^\.{1,2}$/} readdir($dh); +closedir $dh; + +if (system("mkdir -p $out_dir") != 0) { + die "Error making directory $out_dir"; +} + +open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; + +foreach (@spkr_dirs) { + my $spkr_id = $_; + + opendir my $dh, "$data_base/$dataset/aac/$spkr_id/" or die "Cannot open directory: $!"; + my @rec_dirs = grep {-d "$data_base/$dataset/aac/$spkr_id/$_" && ! /^\.{1,2}$/} readdir($dh); + closedir $dh; + + foreach (@rec_dirs) { + my $rec_id = $_; + + opendir my $dh, "$data_base/$dataset/aac/$spkr_id/$rec_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.m4a$/} readdir($dh); + closedir $dh; + + foreach (@files) { + my $name = $_; + my $wav = "ffmpeg -v 8 -i $data_base/$dataset/aac/$spkr_id/$rec_id/$name.m4a -f wav -acodec pcm_s16le -|"; + my $utt_id = "$spkr_id-$rec_id-$name"; + print WAV "$utt_id", " $wav", "\n"; + print SPKR "$utt_id", " $spkr_id", "\n"; + } + } +} +close(SPKR) or die; +close(WAV) or die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/sitw/v1/local/nnet3/xvector/prepare_feats_for_egs.sh b/egs/sitw/v1/local/nnet3/xvector/prepare_feats_for_egs.sh new file mode 100755 index 00000000000..480b2cc2fe8 --- /dev/null +++ b/egs/sitw/v1/local/nnet3/xvector/prepare_feats_for_egs.sh @@ -0,0 +1,84 @@ +#!/bin/bash +# +# Copied from egs/sre16/v1/local/nnet3/xvector/prepare_feats_for_egs.sh (commit 3ea534070fd2cccd2e4ee21772132230033022ce). +# +# Apache 2.0. + +# This script applies sliding window cmvn and removes silence frames. This +# is performed on the raw features prior to generating examples for training +# the xvector system. + +nj=40 +cmd="run.pl" +stage=0 +norm_vars=false +center=true +compress=true +cmn_window=300 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features" + echo "Options: " + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --norm-vars # If true, normalize variances in the sliding window cmvn" + exit 1; +fi + +data_in=$1 +data_out=$2 +dir=$3 + +name=`basename $data_in` + +for f in $data_in/feats.scp $data_in/vad.scp ; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +# Set various variables. +mkdir -p $dir/log +mkdir -p $data_out +featdir=$(utils/make_absolute.sh $dir) + +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$USER/kaldi-data/egs/voxceleb2/v2/xvector-$(date +'%m_%d_%H_%M')/xvector_feats/storage $featdir/storage +fi + +for n in $(seq $nj); do + # the next command does nothing unless $featdir/storage/ exists, see + # utils/create_data_link.pl for more info. 
+ utils/create_data_link.pl $featdir/xvector_feats_${name}.${n}.ark +done + +cp $data_in/utt2spk $data_out/utt2spk +cp $data_in/spk2utt $data_out/spk2utt +cp $data_in/wav.scp $data_out/wav.scp + +write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB" + +sdata_in=$data_in/split$nj; +utils/split_data.sh $data_in $nj || exit 1; + +$cmd JOB=1:$nj $dir/log/create_xvector_feats_${name}.JOB.log \ + apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=$cmn_window \ + scp:${sdata_in}/JOB/feats.scp ark:- \| \ + select-voiced-frames ark:- scp,s,cs:${sdata_in}/JOB/vad.scp ark:- \| \ + copy-feats --compress=$compress $write_num_frames_opt ark:- \ + ark,scp:$featdir/xvector_feats_${name}.JOB.ark,$featdir/xvector_feats_${name}.JOB.scp || exit 1; + +for n in $(seq $nj); do + cat $featdir/xvector_feats_${name}.$n.scp || exit 1; +done > ${data_out}/feats.scp || exit 1 + +for n in $(seq $nj); do + cat $featdir/log/utt2num_frames.$n || exit 1; +done > $data_out/utt2num_frames || exit 1 +rm $featdir/log/utt2num_frames.* + +echo "$0: Succeeded creating xvector features for $name" diff --git a/egs/sitw/v1/local/nnet3/xvector/run_xvector.sh b/egs/sitw/v1/local/nnet3/xvector/run_xvector.sh new file mode 120000 index 00000000000..585b63fd2dd --- /dev/null +++ b/egs/sitw/v1/local/nnet3/xvector/run_xvector.sh @@ -0,0 +1 @@ +tuning/run_xvector_1a.sh \ No newline at end of file diff --git a/egs/sitw/v1/local/nnet3/xvector/tuning/run_xvector_1a.sh b/egs/sitw/v1/local/nnet3/xvector/tuning/run_xvector_1a.sh new file mode 100755 index 00000000000..892c1ad55bd --- /dev/null +++ b/egs/sitw/v1/local/nnet3/xvector/tuning/run_xvector_1a.sh @@ -0,0 +1,153 @@ +#!/bin/bash +# Copyright 2017 David Snyder +# 2017 Johns Hopkins University (Author: Daniel Garcia-Romero) +# 2017 Johns Hopkins University (Author: Daniel Povey) +# +# Copied from egs/sre16/v1/local/nnet3/xvector/tuning/run_xvector_1a.sh (commit e082c17d4a8f8a791428ae4d9f7ceb776aef3f0b). +# +# Apache 2.0. + +# This script trains a DNN similar to the recipe described in +# http://www.danielpovey.com/files/2018_icassp_xvectors.pdf + +. ./cmd.sh +set -e + +stage=1 +train_stage=0 +use_gpu=true +remove_egs=false + +data=data/train +nnet_dir=exp/xvector_nnet_1a/ +egs_dir=exp/xvector_nnet_1a/egs + +. ./path.sh +. ./cmd.sh +. ./utils/parse_options.sh + +# Now we create the nnet examples using sid/nnet3/xvector/get_egs.sh. +# The argument --num-repeats is related to the number of times a speaker +# repeats per archive. If it seems like you're getting too many archives +# (e.g., more than 200) try increasing the --frames-per-iter option. The +# arguments --min-frames-per-chunk and --max-frames-per-chunk specify the +# minimum and maximum length (in terms of number of frames) of the features +# in the examples. +# +# To make sense of the egs script, it may be necessary to put an "exit 1" +# command immediately after stage 3. Then, inspect +# exp//egs/temp/ranges.* . The ranges files specify the examples that +# will be created, and which archives they will be stored in. Each line of +# ranges.* has the following form: +# +# For example: +# 100304-f-sre2006-kacg-A 1 2 4079 881 23 + +# If you're satisfied with the number of archives (e.g., 50-150 archives is +# reasonable) and with the number of examples per speaker (e.g., 1000-5000 +# is reasonable) then you can let the script continue to the later stages. +# Otherwise, try increasing or decreasing the --num-repeats option. You might +# need to fiddle with --frames-per-iter. 
Increasing this value decreases the +# the number of archives and increases the number of examples per archive. +# Decreasing this value increases the number of archives, while decreasing the +# number of examples per archive. +if [ $stage -le 6 ]; then + echo "$0: Getting neural network training egs"; + # dump egs. + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b{03,04,05,06}/$USER/kaldi-data/egs/voxceleb2/v2/xvector-$(date +'%m_%d_%H_%M')/$egs_dir/storage $egs_dir/storage + fi + sid/nnet3/xvector/get_egs.sh --cmd "$train_cmd" \ + --nj 8 \ + --stage 0 \ + --frames-per-iter 1000000000 \ + --frames-per-iter-diagnostic 100000 \ + --min-frames-per-chunk 200 \ + --max-frames-per-chunk 400 \ + --num-diagnostic-archives 3 \ + --num-repeats 50 \ + "$data" $egs_dir +fi + +if [ $stage -le 7 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(wc -w $egs_dir/pdf2num | awk '{print $1}') + feat_dim=$(cat $egs_dir/info/feat_dim) + + # This chunk-size corresponds to the maximum number of frames the + # stats layer is able to pool over. In this script, it corresponds + # to 100 seconds. If the input recording is greater than 100 seconds, + # we will compute multiple xvectors from the same recording and average + # to produce the final xvector. + max_chunk_size=10000 + + # The smallest number of frames we're comfortable computing an xvector from. + # Note that the hard minimum is given by the left and right context of the + # frame-level layers. + min_chunk_size=25 + mkdir -p $nnet_dir/configs + cat < $nnet_dir/configs/network.xconfig + # please note that it is important to have input layer with the name=input + + # The frame-level layers + input dim=${feat_dim} name=input + relu-batchnorm-layer name=tdnn1 input=Append(-2,-1,0,1,2) dim=512 + relu-batchnorm-layer name=tdnn2 input=Append(-2,0,2) dim=512 + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn4 dim=512 + relu-batchnorm-layer name=tdnn5 dim=1500 + + # The stats pooling layer. Layers after this are segment-level. + # In the config below, the first and last argument (0, and ${max_chunk_size}) + # means that we pool over an input segment starting at frame 0 + # and ending at frame ${max_chunk_size} or earlier. The other arguments (1:1) + # mean that no subsampling is performed. + stats-layer name=stats config=mean+stddev(0:1:1:${max_chunk_size}) + + # This is where we usually extract the embedding (aka xvector) from. + relu-batchnorm-layer name=tdnn6 dim=512 input=stats + + # This is where another layer the embedding could be extracted + # from, but usually the previous one works better. 
+ relu-batchnorm-layer name=tdnn7 dim=512 + output-layer name=output include-log-softmax=true dim=${num_targets} +EOF + + steps/nnet3/xconfig_to_configs.py \ + --xconfig-file $nnet_dir/configs/network.xconfig \ + --config-dir $nnet_dir/configs/ + cp $nnet_dir/configs/final.config $nnet_dir/nnet.config + + # These three files will be used by sid/nnet3/xvector/extract_xvectors.sh + echo "output-node name=output input=tdnn6.affine" > $nnet_dir/extract.config + echo "$max_chunk_size" > $nnet_dir/max_chunk_size + echo "$min_chunk_size" > $nnet_dir/min_chunk_size +fi + +dropout_schedule='0,0@0.20,0.1@0.50,0' +srand=123 +if [ $stage -le 8 ]; then + steps/nnet3/train_raw_dnn.py --stage=$train_stage \ + --cmd="$train_cmd" \ + --trainer.optimization.proportional-shrink 10 \ + --trainer.optimization.momentum=0.5 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=8 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.minibatch-size=64 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2 \ + --trainer.num-epochs=3 \ + --trainer.dropout-schedule="$dropout_schedule" \ + --trainer.shuffle-buffer-size=1000 \ + --egs.frames-per-eg=1 \ + --egs.dir="$egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval=10 \ + --use-gpu=true \ + --dir=$nnet_dir || exit 1; +fi + +exit 0; diff --git a/egs/sitw/v1/path.sh b/egs/sitw/v1/path.sh new file mode 100755 index 00000000000..e50f57c5271 --- /dev/null +++ b/egs/sitw/v1/path.sh @@ -0,0 +1,5 @@ +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/sitw/v1/run.sh b/egs/sitw/v1/run.sh new file mode 100755 index 00000000000..68d08dfc170 --- /dev/null +++ b/egs/sitw/v1/run.sh @@ -0,0 +1,254 @@ +#!/bin/bash +# Copyright 2017 Johns Hopkins University (Author: Daniel Garcia-Romero) +# 2017 Johns Hopkins University (Author: Daniel Povey) +# 2017-2018 David Snyder +# 2018 Ewald Enzinger +# Apache 2.0. +# +# This is an i-vector-based recipe for Speakers in the Wild (SITW). +# See ../README.txt for more info on data required. The recipe uses +# VoxCeleb 1 and 2 for training the UBM and T matrix, and an augmented +# version of those datasets for PLDA training. The augmentation consists +# of MUSAN noises, music, and babble and reverberation from the Room +# Impulse Response and Noise Database. Note that there are 60 speakers +# in VoxCeleb 1 that overlap with our evaluation dataset, SITW. The recipe +# removes those 60 speakers prior to training. See ../README.txt for more +# info on data required. The results are reported in terms of EER and minDCF, +# and are inline in the comments below. + +. ./cmd.sh +. ./path.sh +set -e +mfccdir=`pwd`/mfcc +vaddir=`pwd`/mfcc + +voxceleb1_root=/export/corpora/VoxCeleb1 +voxceleb2_root=/export/corpora/VoxCeleb2 +sitw_root=/export/corpora/SRI/sitw +musan_root=/export/corpora/JHU/musan + +sitw_dev_trials_core=data/sitw_dev_test/trials/core-core.lst +sitw_eval_trials_core=data/sitw_eval_test/trials/core-core.lst + +stage=0 + +if [ $stage -le 0 ]; then + # Prepare the VoxCeleb1 dataset. 
The script also downloads a list from + # http://www.openslr.org/resources/49/voxceleb1_sitw_overlap.txt that + # contains the speakers that overlap between VoxCeleb1 and our evaluation + # set SITW. The script removes the overlapping speakers from VoxCeleb1. + local/make_voxceleb1.pl $voxceleb1_root data + + # Prepare the dev portion of the VoxCeleb2 dataset. + local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train + + # The original version of this recipe included the test portion of VoxCeleb2 + # in the training list. Unfortunately, it turns out that there's an overlap + # with our evaluation set, Speakers in the Wild. Therefore, we've removed + # this dataset from the training list. + # local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test + + # We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1. + # This should leave 7,185 speakers and 1,236,567 utterances. + utils/combine_data.sh data/train data/voxceleb2_train data/voxceleb1 + + # Prepare Speakers in the Wild. This is our evaluation dataset. + local/make_sitw.sh $sitw_root data +fi + +if [ $stage -le 1 ]; then + # Make MFCCs and compute the energy-based VAD for each dataset + for name in sitw_eval_enroll sitw_eval_test sitw_dev_enroll sitw_dev_test train; do + steps/make_mfcc.sh --write-utt2num-frames true --mfcc-config conf/mfcc.conf --nj 80 --cmd "$train_cmd" \ + data/${name} exp/make_mfcc $mfccdir + utils/fix_data_dir.sh data/${name} + sid/compute_vad_decision.sh --nj 80 --cmd "$train_cmd" \ + data/${name} exp/make_vad $vaddir + utils/fix_data_dir.sh data/${name} + done +fi + +if [ $stage -le 2 ]; then + # Train the UBM on VoxCeleb 1 and 2. + sid/train_diag_ubm.sh --cmd "$train_cmd --mem 4G" \ + --nj 40 --num-threads 8 \ + data/train 2048 \ + exp/diag_ubm + + sid/train_full_ubm.sh --cmd "$train_cmd --mem 25G" \ + --nj 40 --remove-low-count-gaussians false \ + data/train \ + exp/diag_ubm exp/full_ubm +fi + +if [ $stage -le 3 ]; then + # In this stage, we train the i-vector extractor on a subset of VoxCeleb 1 + # and 2. + # + # Note that there are well over 1 million utterances in our training set, + # and it takes an extremely long time to train the extractor on all of this. + # Also, most of those utterances are very short. Short utterances are + # harmful for training the i-vector extractor. Therefore, to reduce the + # training time and improve performance, we will only train on the 100k + # longest utterances. + utils/subset_data_dir.sh \ + --utt-list <(sort -n -k 2 data/train/utt2num_frames | tail -n 100000) \ + data/train data/train_100k + + # Train the i-vector extractor. + sid/train_ivector_extractor.sh --cmd "$train_cmd --mem 16G" \ + --ivector-dim 400 --num-iters 5 \ + exp/full_ubm/final.ubm data/train_100k \ + exp/extractor +fi + +# In this section, we augment the VoxCeleb 1 and 2 data with reverberation, +# noise, music, and babble, and combine it with the clean data. This will +# later be used to train out PLDA model. +if [ $stage -le 4 ]; then + frame_shift=0.01 + awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' data/train_100k/utt2num_frames > data/train_100k/reco2dur + + if [ ! 
-d "RIRS_NOISES" ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + # Make a version with reverberated speech + rvb_opts=() + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + + # Make a reverberated version of the VoxCeleb2 list. Note that we don't add any + # additive noise here. + python steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --speech-rvb-probability 1 \ + --pointsource-noise-addition-probability 0 \ + --isotropic-noise-addition-probability 0 \ + --num-replications 1 \ + --source-sampling-rate 16000 \ + data/train_100k data/train_100k_reverb + cp data/train_100k/vad.scp data/train_100k_reverb/ + utils/copy_data_dir.sh --utt-suffix "-reverb" data/train_100k_reverb data/train_100k_reverb.new + rm -rf data/train_100k_reverb + mv data/train_100k_reverb.new data/train_100k_reverb + + # Prepare the MUSAN corpus, which consists of music, speech, and noise + # suitable for augmentation. + local/make_musan.sh $musan_root data + + # Get the duration of the MUSAN recordings. This will be used by the + # script augment_data_dir.py. + for name in speech noise music; do + utils/data/get_utt2dur.sh data/musan_${name} + mv data/musan_${name}/utt2dur data/musan_${name}/reco2dur + done + + # Augment with musan_noise + python steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" data/train_100k data/train_100k_noise + # Augment with musan_music + python steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/train_100k data/train_100k_music + # Augment with musan_speech + python steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/train_100k data/train_100k_babble + + # Combine reverb, noise, music, and babble into one directory. + utils/combine_data.sh data/train_aug data/train_100k_reverb data/train_100k_noise data/train_100k_music data/train_100k_babble +fi + +if [ $stage -le 5 ]; then + # Take a 100k subset of the augmentations. + utils/subset_data_dir.sh data/train_aug 100000 data/train_100k_aug + utils/fix_data_dir.sh data/train_100k_aug + + # Make MFCCs for the augmented data. Note that we do not compute a new + # vad.scp file here. Instead, we use the vad.scp from the clean version of + # the list. + steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 80 --cmd "$train_cmd" \ + data/train_100k_aug exp/make_mfcc $mfccdir + + # Combine the clean and augmented VoxCeleb list. This is now roughly + # double the size of the original clean list. + utils/combine_data.sh data/train_combined_200k data/train_100k_aug data/train_100k +fi + +if [ $stage -le 6 ]; then + # These i-vectors will be used for mean-subtraction, LDA, and PLDA training. + sid/extract_ivectors.sh --cmd "$train_cmd --mem 4G" --nj 80 \ + exp/extractor data/train_combined_200k \ + exp/ivectors_train_combined_200k + + # Extract i-vectors for the SITW dev and eval sets. 
+ for name in sitw_eval_enroll sitw_eval_test sitw_dev_enroll sitw_dev_test; do + sid/extract_ivectors.sh --cmd "$train_cmd --mem 4G" --nj 40 \ + exp/extractor data/$name \ + exp/ivectors_$name + done +fi + +if [ $stage -le 7 ]; then + # Compute the mean vector for centering the evaluation i-vectors. + $train_cmd exp/ivectors_train_combined_200k/log/compute_mean.log \ + ivector-mean scp:exp/ivectors_train_combined_200k/ivector.scp \ + exp/ivectors_train_combined_200k/mean.vec || exit 1; + + # This script uses LDA to decrease the dimensionality prior to PLDA. + lda_dim=150 + $train_cmd exp/ivectors_train_combined_200k/log/lda.log \ + ivector-compute-lda --total-covariance-factor=0.0 --dim=$lda_dim \ + "ark:ivector-subtract-global-mean scp:exp/ivectors_train_combined_200k/ivector.scp ark:- |" \ + ark:data/train_combined_200k/utt2spk exp/ivectors_train_combined_200k/transform.mat || exit 1; + + # Train the PLDA model. + $train_cmd exp/ivectors_train_combined_200k/log/plda.log \ + ivector-compute-plda ark:data/train_combined_200k/spk2utt \ + "ark:ivector-subtract-global-mean scp:exp/ivectors_train_combined_200k/ivector.scp ark:- | transform-vec exp/ivectors_train_combined_200k/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \ + exp/ivectors_train_combined_200k/plda || exit 1; +fi + +if [ $stage -le 8 ]; then + # Compute PLDA scores for SITW dev core-core trials + $train_cmd exp/scores/log/sitw_dev_core_scoring.log \ + ivector-plda-scoring --normalize-length=true \ + --num-utts=ark:exp/ivectors_sitw_dev_enroll/num_utts.ark \ + "ivector-copy-plda --smoothing=0.0 exp/ivectors_train_combined_200k/plda - |" \ + "ark:ivector-mean ark:data/sitw_dev_enroll/spk2utt scp:exp/ivectors_sitw_dev_enroll/ivector.scp ark:- | ivector-subtract-global-mean exp/ivectors_train_combined_200k/mean.vec ark:- ark:- | transform-vec exp/ivectors_train_combined_200k/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \ + "ark:ivector-subtract-global-mean exp/ivectors_train_combined_200k/mean.vec scp:exp/ivectors_sitw_dev_test/ivector.scp ark:- | transform-vec exp/ivectors_train_combined_200k/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \ + "cat '$sitw_dev_trials_core' | cut -d\ --fields=1,2 |" exp/scores/sitw_dev_core_scores || exit 1; + + # SITW Dev Core: + # EER: 4.813% + # minDCF(p-target=0.01): 0.4250 + # minDCF(p-target=0.001): 0.5727 + echo "SITW Dev Core:" + eer=$(paste $sitw_dev_trials_core exp/scores/sitw_dev_core_scores | awk '{print $6, $3}' | compute-eer - 2>/dev/null) + mindcf1=`sid/compute_min_dcf.py --p-target 0.01 exp/scores/sitw_dev_core_scores $sitw_dev_trials_core 2> /dev/null` + mindcf2=`sid/compute_min_dcf.py --p-target 0.001 exp/scores/sitw_dev_core_scores $sitw_dev_trials_core 2> /dev/null` + echo "EER: $eer%" + echo "minDCF(p-target=0.01): $mindcf1" + echo "minDCF(p-target=0.001): $mindcf2" +fi + +if [ $stage -le 9 ]; then + # Compute PLDA scores for SITW eval core-core trials + $train_cmd exp/scores/log/sitw_eval_core_scoring.log \ + ivector-plda-scoring --normalize-length=true \ + --num-utts=ark:exp/ivectors_sitw_eval_enroll/num_utts.ark \ + "ivector-copy-plda --smoothing=0.0 exp/ivectors_train_combined_200k/plda - |" \ + "ark:ivector-mean ark:data/sitw_eval_enroll/spk2utt scp:exp/ivectors_sitw_eval_enroll/ivector.scp ark:- | ivector-subtract-global-mean exp/ivectors_train_combined_200k/mean.vec ark:- ark:- | transform-vec exp/ivectors_train_combined_200k/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \ + 
"ark:ivector-subtract-global-mean exp/ivectors_train_combined_200k/mean.vec scp:exp/ivectors_sitw_eval_test/ivector.scp ark:- | transform-vec exp/ivectors_train_combined_200k/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \ + "cat '$sitw_eval_trials_core' | cut -d\ --fields=1,2 |" exp/scores/sitw_eval_core_scores || exit 1; + + # SITW Eval Core: + # EER: 5.659% + # minDCF(p-target=0.01): 0.4637 + # minDCF(p-target=0.001): 0.6290 + echo -e "\nSITW Eval Core:"; + eer=$(paste $sitw_eval_trials_core exp/scores/sitw_eval_core_scores | awk '{print $6, $3}' | compute-eer - 2>/dev/null) + mindcf1=`sid/compute_min_dcf.py --p-target 0.01 exp/scores/sitw_eval_core_scores $sitw_eval_trials_core 2> /dev/null` + mindcf2=`sid/compute_min_dcf.py --p-target 0.001 exp/scores/sitw_eval_core_scores $sitw_eval_trials_core 2> /dev/null` + echo "EER: $eer%" + echo "minDCF(p-target=0.01): $mindcf1" + echo "minDCF(p-target=0.001): $mindcf2" +fi diff --git a/egs/sitw/v1/sid b/egs/sitw/v1/sid new file mode 120000 index 00000000000..5cb0274b7d6 --- /dev/null +++ b/egs/sitw/v1/sid @@ -0,0 +1 @@ +../../sre08/v1/sid/ \ No newline at end of file diff --git a/egs/sitw/v1/steps b/egs/sitw/v1/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/sitw/v1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/sitw/v1/utils b/egs/sitw/v1/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/sitw/v1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/sitw/v2/README.txt b/egs/sitw/v2/README.txt new file mode 100644 index 00000000000..7944745bad1 --- /dev/null +++ b/egs/sitw/v2/README.txt @@ -0,0 +1,26 @@ + This recipe replaces i-vectors used in the v1 recipe with embeddings extracted + from a deep neural network. In the scripts, we refer to these embeddings as + "x-vectors." The recipe in local/nnet3/xvector/tuning/run_xvector_1a.sh is + closesly based on the following paper: + + @inproceedings{snyder2018xvector, + title={X-vectors: Robust DNN Embeddings for Speaker Recognition}, + author={Snyder, D. and Garcia-Romero, D. and Sell, G. and Povey, D. and Khudanpur, S.}, + booktitle={2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, + year={2018}, + organization={IEEE}, + url={http://www.danielpovey.com/files/2018_icassp_xvectors.pdf} + } + + The recipe uses the following datasets: + + Evaluation + + Speakers in the Wild http://www.speech.sri.com/projects/sitw + + System Development + + VoxCeleb 1 http://www.robots.ox.ac.uk/~vgg/data/voxceleb + VoxCeleb 2 http://www.robots.ox.ac.uk/~vgg/data/voxceleb2 + MUSAN http://www.openslr.org/17 + RIR_NOISES http://www.openslr.org/28 diff --git a/egs/sitw/v2/cmd.sh b/egs/sitw/v2/cmd.sh new file mode 100755 index 00000000000..d1ca1a6d126 --- /dev/null +++ b/egs/sitw/v2/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. 
Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" + + diff --git a/egs/sitw/v2/conf/mfcc.conf b/egs/sitw/v2/conf/mfcc.conf new file mode 100644 index 00000000000..9e125706aae --- /dev/null +++ b/egs/sitw/v2/conf/mfcc.conf @@ -0,0 +1,7 @@ +--sample-frequency=16000 +--frame-length=25 # the default is 25 +--low-freq=20 # the default. +--high-freq=7600 # the default is zero meaning use the Nyquist (8k in this case). +--num-mel-bins=30 +--num-ceps=30 +--snip-edges=false diff --git a/egs/sitw/v2/conf/vad.conf b/egs/sitw/v2/conf/vad.conf new file mode 100644 index 00000000000..c9f5e8b3072 --- /dev/null +++ b/egs/sitw/v2/conf/vad.conf @@ -0,0 +1,4 @@ +--vad-energy-threshold=5.5 +--vad-energy-mean-scale=0.5 +--vad-proportion-threshold=0.12 +--vad-frames-context=2 diff --git a/egs/sitw/v2/local b/egs/sitw/v2/local new file mode 120000 index 00000000000..740b697d6fd --- /dev/null +++ b/egs/sitw/v2/local @@ -0,0 +1 @@ +../v1/local/ \ No newline at end of file diff --git a/egs/sitw/v2/path.sh b/egs/sitw/v2/path.sh new file mode 100755 index 00000000000..e50f57c5271 --- /dev/null +++ b/egs/sitw/v2/path.sh @@ -0,0 +1,5 @@ +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/sitw/v2/run.sh b/egs/sitw/v2/run.sh new file mode 100755 index 00000000000..499d436366a --- /dev/null +++ b/egs/sitw/v2/run.sh @@ -0,0 +1,270 @@ +#!/bin/bash +# Copyright 2017 Johns Hopkins University (Author: Daniel Povey) +# 2017 Johns Hopkins University (Author: Daniel Garcia-Romero) +# 2018 Ewald Enzinger +# 2018 David Snyder +# Apache 2.0. +# +# This is an x-vector-based recipe for Speakers in the Wild (SITW). +# It is based on "X-vectors: Robust DNN Embeddings for Speaker Recognition" +# by Snyder et al. The recipe uses augmented VoxCeleb 1 and 2 for training. +# The augmentation consists of MUSAN noises, music, and babble and +# reverberation from the Room Impulse Response and Noise Database. Note that +# there are 60 speakers in VoxCeleb 1 that overlap with our evaluation +# dataset, SITW. The recipe removes those 60 speakers prior to training. +# See ../README.txt for more info on data required. The results are reported +# in terms of EER and minDCF, and are inline in the comments below. + +. ./cmd.sh +. ./path.sh +set -e +mfccdir=`pwd`/mfcc +vaddir=`pwd`/mfcc + +voxceleb1_root=/export/corpora/VoxCeleb1 +voxceleb2_root=/export/corpora/VoxCeleb2 +sitw_root=/export/corpora/SRI/sitw +nnet_dir=exp/xvector_nnet_1a +musan_root=/export/corpora/JHU/musan + +sitw_dev_trials_core=data/sitw_dev_test/trials/core-core.lst +sitw_eval_trials_core=data/sitw_eval_test/trials/core-core.lst + +stage=0 + +if [ $stage -le 0 ]; then + # Prepare the VoxCeleb1 dataset. The script also downloads a list from + # http://www.openslr.org/resources/49/voxceleb1_sitw_overlap.txt that + # contains the speakers that overlap between VoxCeleb1 and our evaluation + # set SITW. The script removes these overlapping speakers from VoxCeleb1. + local/make_voxceleb1.pl $voxceleb1_root data + + # Prepare the dev portion of the VoxCeleb2 dataset. 
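The overlap handling described above boils down to a set lookup: make_voxceleb1.pl reads the downloaded voxceleb1_sitw_overlap.txt and skips any VoxCeleb1 speaker directory whose ID appears in it, before the VoxCeleb2 preparation that follows. A rough Python equivalent of that filtering; the speaker IDs below are stand-ins.

def load_overlap(path):
    # One speaker ID per line, as in voxceleb1_sitw_overlap.txt.
    with open(path) as f:
        return {line.strip() for line in f if line.strip()}

def keep_speakers(all_speakers, overlap):
    return [spk for spk in all_speakers if spk not in overlap]

overlap = {"Speaker_A", "Speaker_B"}                        # stand-in for the real list
print(keep_speakers(["Speaker_A", "Speaker_C"], overlap))   # ['Speaker_C']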
+ local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train + + # The original version of this recipe included the test portion of VoxCeleb2 + # in the training list. Unfortunately, it turns out that there's an overlap + # with our evaluation set, Speakers in the Wild. Therefore, we've removed + # this dataset from the training list. + # local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test + + # We'll train on the dev portion of VoxCeleb2, plus VoxCeleb1 (minus the + # speakers that overlap with SITW). + # This should leave 7,185 speakers and 1,236,567 utterances. + utils/combine_data.sh data/train data/voxceleb2_train data/voxceleb1 + + # Prepare Speakers in the Wild. This is our evaluation dataset. + local/make_sitw.sh $sitw_root data +fi + +if [ $stage -le 1 ]; then + # Make MFCCs and compute the energy-based VAD for each dataset + for name in sitw_eval_enroll sitw_eval_test sitw_dev_enroll sitw_dev_test train; do + steps/make_mfcc.sh --write-utt2num-frames true --mfcc-config conf/mfcc.conf --nj 80 --cmd "$train_cmd" \ + data/${name} exp/make_mfcc $mfccdir + utils/fix_data_dir.sh data/${name} + sid/compute_vad_decision.sh --nj 80 --cmd "$train_cmd" \ + data/${name} exp/make_vad $vaddir + utils/fix_data_dir.sh data/${name} + done +fi + +# In this section, we augment the VoxCeleb2 data with reverberation, +# noise, music, and babble, and combine it with the clean data. +if [ $stage -le 2 ]; then + frame_shift=0.01 + awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' data/train/utt2num_frames > data/train/reco2dur + + if [ ! -d "RIRS_NOISES" ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + # Make a version with reverberated speech + rvb_opts=() + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + + # Make a reverberated version of the VoxCeleb2 list. Note that we don't add any + # additive noise here. + python steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --speech-rvb-probability 1 \ + --pointsource-noise-addition-probability 0 \ + --isotropic-noise-addition-probability 0 \ + --num-replications 1 \ + --source-sampling-rate 16000 \ + data/train data/train_reverb + cp data/train/vad.scp data/train_reverb/ + utils/copy_data_dir.sh --utt-suffix "-reverb" data/train_reverb data/train_reverb.new + rm -rf data/train_reverb + mv data/train_reverb.new data/train_reverb + + # Prepare the MUSAN corpus, which consists of music, speech, and noise + # suitable for augmentation. + local/make_musan.sh $musan_root data + + # Get the duration of the MUSAN recordings. This will be used by the + # script augment_data_dir.py. 
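Both duration files in this stage feed the augmentation tools: the awk one-liner at the top converts utt2num_frames to reco2dur by multiplying frame counts by the 10 ms frame shift, and the loop that follows produces the equivalent for the MUSAN recordings via get_utt2dur.sh. A tiny Python check of the frame-count conversion, on a made-up entry.

frame_shift = 0.01                        # seconds per frame
utt2num_frames = {"sp1-utt001": 1234}     # made-up utt2num_frames entry
reco2dur = {utt: n * frame_shift for utt, n in utt2num_frames.items()}
print(reco2dur)                           # {'sp1-utt001': 12.34}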
+ for name in speech noise music; do + utils/data/get_utt2dur.sh data/musan_${name} + mv data/musan_${name}/utt2dur data/musan_${name}/reco2dur + done + + # Augment with musan_noise + python steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" data/train data/train_noise + # Augment with musan_music + python steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/train data/train_music + # Augment with musan_speech + python steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/train data/train_babble + + # Combine reverb, noise, music, and babble into one directory. + utils/combine_data.sh data/train_aug data/train_reverb data/train_noise data/train_music data/train_babble +fi + +if [ $stage -le 3 ]; then + # Take a random subset of the augmentations + utils/subset_data_dir.sh data/train_aug 1000000 data/train_aug_1m + utils/fix_data_dir.sh data/train_aug_1m + + # Make MFCCs for the augmented data. Note that we do not compute a new + # vad.scp file here. Instead, we use the vad.scp from the clean version of + # the list. + steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 80 --cmd "$train_cmd" \ + data/train_aug_1m exp/make_mfcc $mfccdir + + # Combine the clean and augmented VoxCeleb2 list. This is now roughly + # double the size of the original clean list. + utils/combine_data.sh data/train_combined data/train_aug_1m data/train +fi + +# Now we prepare the features to generate examples for xvector training. +if [ $stage -le 4 ]; then + # This script applies CMVN and removes nonspeech frames. Note that this is somewhat + # wasteful, as it roughly doubles the amount of training data on disk. After + # creating training examples, this can be removed. + local/nnet3/xvector/prepare_feats_for_egs.sh --nj 80 --cmd "$train_cmd" \ + data/train_combined data/train_combined_no_sil exp/train_combined_no_sil + utils/fix_data_dir.sh data/train_combined_no_sil +fi + +if [ $stage -le 5 ]; then + # Now, we need to remove features that are too short after removing silence + # frames. We want atleast 5s (500 frames) per utterance. + min_len=400 + mv data/train_combined_no_sil/utt2num_frames data/train_combined_no_sil/utt2num_frames.bak + awk -v min_len=${min_len} '$2 > min_len {print $1, $2}' data/train_combined_no_sil/utt2num_frames.bak > data/train_combined_no_sil/utt2num_frames + utils/filter_scp.pl data/train_combined_no_sil/utt2num_frames data/train_combined_no_sil/utt2spk > data/train_combined_no_sil/utt2spk.new + mv data/train_combined_no_sil/utt2spk.new data/train_combined_no_sil/utt2spk + utils/fix_data_dir.sh data/train_combined_no_sil + + # We also want several utterances per speaker. Now we'll throw out speakers + # with fewer than 8 utterances. 
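Taken together, this stage applies two filters: utterances must keep more than min_len frames after silence removal, and a speaker must retain at least min_num_utts utterances to be useful for the classification-style x-vector training. The awk/filter_scp.pl commands around this comment implement it; below is a compact Python sketch of the same logic on made-up data.

from collections import defaultdict

MIN_LEN, MIN_NUM_UTTS = 400, 8

def filter_training_data(utt2num_frames, utt2spk):
    # Keep utterances longer than MIN_LEN frames, then keep only speakers
    # that still have at least MIN_NUM_UTTS utterances.
    kept = {u for u, n in utt2num_frames.items() if n > MIN_LEN}
    spk2utts = defaultdict(list)
    for utt in kept:
        spk2utts[utt2spk[utt]].append(utt)
    return {spk: sorted(utts) for spk, utts in spk2utts.items()
            if len(utts) >= MIN_NUM_UTTS}

u2f = {"a-1": 500, "a-2": 350, "b-1": 900}
u2s = {"a-1": "a", "a-2": "a", "b-1": "b"}
print(filter_training_data(u2f, u2s))   # {} -- both toy speakers end up below 8 utterances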
+ min_num_utts=8 + awk '{print $1, NF-1}' data/train_combined_no_sil/spk2utt > data/train_combined_no_sil/spk2num + awk -v min_num_utts=${min_num_utts} '$2 >= min_num_utts {print $1, $2}' data/train_combined_no_sil/spk2num | utils/filter_scp.pl - data/train_combined_no_sil/spk2utt > data/train_combined_no_sil/spk2utt.new + mv data/train_combined_no_sil/spk2utt.new data/train_combined_no_sil/spk2utt + utils/spk2utt_to_utt2spk.pl data/train_combined_no_sil/spk2utt > data/train_combined_no_sil/utt2spk + + utils/filter_scp.pl data/train_combined_no_sil/utt2spk data/train_combined_no_sil/utt2num_frames > data/train_combined_no_sil/utt2num_frames.new + mv data/train_combined_no_sil/utt2num_frames.new data/train_combined_no_sil/utt2num_frames + + # Now we're ready to create training examples. + utils/fix_data_dir.sh data/train_combined_no_sil +fi + +# Stages 6 through 8 are handled in run_xvector.sh +local/nnet3/xvector/run_xvector.sh --stage $stage --train-stage -1 \ + --data data/train_combined_no_sil --nnet-dir $nnet_dir \ + --egs-dir $nnet_dir/egs + +if [ $stage -le 9 ]; then + # Now we will extract x-vectors used for centering, LDA, and PLDA training. + # Note that data/train_combined has well over 2 million utterances, + # which is far more than is needed to train the generative PLDA model. + # In addition, many of the utterances are very short, which causes a + # mismatch with our evaluation conditions. In the next command, we + # create a data directory that contains the longest 200,000 recordings, + # which we will use to train the backend. + utils/subset_data_dir.sh \ + --utt-list <(sort -n -k 2 data/train_combined_no_sil/utt2num_frames | tail -n 200000) \ + data/train_combined data/train_combined_200k + + sid/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 4G" --nj 80 \ + $nnet_dir data/train_combined_200k \ + $nnet_dir/xvectors_train_combined_200k + + # Extract x-vectors used in the evaluation. + for name in sitw_eval_enroll sitw_eval_test sitw_dev_enroll sitw_dev_test; do + sid/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 4G" --nj 40 \ + $nnet_dir data/$name \ + $nnet_dir/xvectors_$name + done +fi + +if [ $stage -le 10 ]; then + # Compute the mean.vec used for centering. + $train_cmd $nnet_dir/xvectors_train_combined_200k/log/compute_mean.log \ + ivector-mean scp:$nnet_dir/xvectors_train_combined_200k/xvector.scp \ + $nnet_dir/xvectors_train_combined_200k/mean.vec || exit 1; + + # Use LDA to decrease the dimensionality prior to PLDA. + lda_dim=128 + $train_cmd $nnet_dir/xvectors_train_combined_200k/log/lda.log \ + ivector-compute-lda --total-covariance-factor=0.0 --dim=$lda_dim \ + "ark:ivector-subtract-global-mean scp:$nnet_dir/xvectors_train_combined_200k/xvector.scp ark:- |" \ + ark:data/train_combined_200k/utt2spk $nnet_dir/xvectors_train_combined_200k/transform.mat || exit 1; + + # Train the PLDA model. 
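Every x-vector that reaches PLDA training or scoring in the commands around this point goes through the same preprocessing pipe: subtract the global mean (mean.vec), project with the LDA matrix (transform.mat, lda_dim=128 here), and length-normalize. Below is a conceptual numpy sketch of that chain with random stand-in values, not Kaldi's exact implementation (for instance, the transform written by ivector-compute-lda may be affine, and ivector-normalize-length uses its own target scaling rather than the unit length shown here).

import numpy as np

rng = np.random.default_rng(0)
mean = rng.normal(size=512)           # stand-in for mean.vec
lda = rng.normal(size=(128, 512))     # stand-in for transform.mat
xvector = rng.normal(size=512)        # stand-in for one extracted x-vector

def preprocess(x):
    # Center, reduce dimensionality, then length-normalize (simplified).
    y = lda @ (x - mean)
    return y / np.linalg.norm(y)

print(preprocess(xvector).shape)      # (128,)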
+ $train_cmd $nnet_dir/xvectors_train_combined_200k/log/plda.log \ + ivector-compute-plda ark:data/train_combined_200k/spk2utt \ + "ark:ivector-subtract-global-mean scp:$nnet_dir/xvectors_train_combined_200k/xvector.scp ark:- | transform-vec $nnet_dir/xvectors_train_combined_200k/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \ + $nnet_dir/xvectors_train_combined_200k/plda || exit 1; +fi + +if [ $stage -le 11 ]; then + # Compute PLDA scores for SITW dev core-core trials + $train_cmd $nnet_dir/scores/log/sitw_dev_core_scoring.log \ + ivector-plda-scoring --normalize-length=true \ + --num-utts=ark:$nnet_dir/xvectors_sitw_dev_enroll/num_utts.ark \ + "ivector-copy-plda --smoothing=0.0 $nnet_dir/xvectors_train_combined_200k/plda - |" \ + "ark:ivector-mean ark:data/sitw_dev_enroll/spk2utt scp:$nnet_dir/xvectors_sitw_dev_enroll/xvector.scp ark:- | ivector-subtract-global-mean $nnet_dir/xvectors_train_combined_200k/mean.vec ark:- ark:- | transform-vec $nnet_dir/xvectors_train_combined_200k/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \ + "ark:ivector-subtract-global-mean $nnet_dir/xvectors_train_combined_200k/mean.vec scp:$nnet_dir/xvectors_sitw_dev_test/xvector.scp ark:- | transform-vec $nnet_dir/xvectors_train_combined_200k/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \ + "cat '$sitw_dev_trials_core' | cut -d\ --fields=1,2 |" $nnet_dir/scores/sitw_dev_core_scores || exit 1; + + # SITW Dev Core: + # EER: 3.003% + # minDCF(p-target=0.01): 0.3119 + # minDCF(p-target=0.001): 0.4955 + echo "SITW Dev Core:" + eer=$(paste $sitw_dev_trials_core $nnet_dir/scores/sitw_dev_core_scores | awk '{print $6, $3}' | compute-eer - 2>/dev/null) + mindcf1=`sid/compute_min_dcf.py --p-target 0.01 $nnet_dir/scores/sitw_dev_core_scores $sitw_dev_trials_core 2> /dev/null` + mindcf2=`sid/compute_min_dcf.py --p-target 0.001 $nnet_dir/scores/sitw_dev_core_scores $sitw_dev_trials_core 2> /dev/null` + echo "EER: $eer%" + echo "minDCF(p-target=0.01): $mindcf1" + echo "minDCF(p-target=0.001): $mindcf2" +fi + +if [ $stage -le 12 ]; then + # Compute PLDA scores for SITW eval core-core trials + $train_cmd $nnet_dir/scores/log/sitw_eval_core_scoring.log \ + ivector-plda-scoring --normalize-length=true \ + --num-utts=ark:$nnet_dir/xvectors_sitw_eval_enroll/num_utts.ark \ + "ivector-copy-plda --smoothing=0.0 $nnet_dir/xvectors_train_combined_200k/plda - |" \ + "ark:ivector-mean ark:data/sitw_eval_enroll/spk2utt scp:$nnet_dir/xvectors_sitw_eval_enroll/xvector.scp ark:- | ivector-subtract-global-mean $nnet_dir/xvectors_train_combined_200k/mean.vec ark:- ark:- | transform-vec $nnet_dir/xvectors_train_combined_200k/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \ + "ark:ivector-subtract-global-mean $nnet_dir/xvectors_train_combined_200k/mean.vec scp:$nnet_dir/xvectors_sitw_eval_test/xvector.scp ark:- | transform-vec $nnet_dir/xvectors_train_combined_200k/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \ + "cat '$sitw_eval_trials_core' | cut -d\ --fields=1,2 |" $nnet_dir/scores/sitw_eval_core_scores || exit 1; + + # SITW Eval Core: + # EER: 3.499% + # minDCF(p-target=0.01): 0.3424 + # minDCF(p-target=0.001): 0.5164 + echo -e "\nSITW Eval Core:"; + eer=$(paste $sitw_eval_trials_core $nnet_dir/scores/sitw_eval_core_scores | awk '{print $6, $3}' | compute-eer - 2>/dev/null) + mindcf1=`sid/compute_min_dcf.py --p-target 0.01 $nnet_dir/scores/sitw_eval_core_scores $sitw_eval_trials_core 2> /dev/null` + mindcf2=`sid/compute_min_dcf.py 
--p-target 0.001 $nnet_dir/scores/sitw_eval_core_scores $sitw_eval_trials_core 2> /dev/null` + echo "EER: $eer%" + echo "minDCF(p-target=0.01): $mindcf1" + echo "minDCF(p-target=0.001): $mindcf2" +fi diff --git a/egs/sitw/v2/sid b/egs/sitw/v2/sid new file mode 120000 index 00000000000..5cb0274b7d6 --- /dev/null +++ b/egs/sitw/v2/sid @@ -0,0 +1 @@ +../../sre08/v1/sid/ \ No newline at end of file diff --git a/egs/sitw/v2/steps b/egs/sitw/v2/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/sitw/v2/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/sitw/v2/utils b/egs/sitw/v2/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/sitw/v2/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/sre08/v1/sid/compute_min_dcf.py b/egs/sre08/v1/sid/compute_min_dcf.py new file mode 100755 index 00000000000..41b7b0fc594 --- /dev/null +++ b/egs/sre08/v1/sid/compute_min_dcf.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +# Copyright 2018 David Snyder +# Apache 2.0 + +# This script computes the minimum detection cost function, which is a common +# error metric used in speaker recognition. Compared to equal error-rate, +# which assigns equal weight to false negatives and false positives, this +# error-rate is usually used to assess performance in settings where achieving +# a low false positive rate is more important than achieving a low false +# negative rate. See the NIST 2016 Speaker Recognition Evaluation Plan at +# https://www.nist.gov/sites/default/files/documents/2016/10/07/sre16_eval_plan_v1.3.pdf +# for more details about the metric. +from __future__ import print_function +from operator import itemgetter +import sys, argparse, os + +def GetArgs(): + parser = argparse.ArgumentParser(description="Compute the minimum " + "detection cost function along with the threshold at which it occurs. " + "Usage: sid/compute_min_dcf.py [options...] " + " " + "E.g., sid/compute_min_dcf.py --p-target 0.01 --c-miss 1 --c-fa 1 " + "exp/scores/trials data/test/trials", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--p-target', type=float, dest = "p_target", + default = 0.01, + help='The prior probability of the target speaker in a trial.') + parser.add_argument('--c-miss', type=float, dest = "c_miss", default = 1, + help='Cost of a missed detection. This is usually not changed.') + parser.add_argument('--c-fa', type=float, dest = "c_fa", default = 1, + help='Cost of a spurious detection. This is usually not changed.') + parser.add_argument("scores_filename", + help="Input scores file, with columns of the form " + " ") + parser.add_argument("trials_filename", + help="Input trials file, with columns of the form " + " ") + sys.stderr.write(' '.join(sys.argv) + "\n") + args = parser.parse_args() + args = CheckArgs(args) + return args + +def CheckArgs(args): + if args.c_fa <= 0: + raise Exception("--c-fa must be greater than 0") + if args.c_miss <= 0: + raise Exception("--c-miss must be greater than 0") + if args.p_target <= 0 or args.p_target >= 1: + raise Exception("--p-target must be greater than 0 and less than 1") + return args + +# Creates a list of false-negative rates, a list of false-positive rates +# and a list of decision thresholds that give those error-rates. +def ComputeErrorRates(scores, labels): + + # Sort the scores from smallest to largest, and also get the corresponding + # indexes of the sorted scores. 
We will treat the sorted scores as the
+    # thresholds at which the error-rates are evaluated.
+    sorted_indexes, thresholds = zip(*sorted(
+        [(index, threshold) for index, threshold in enumerate(scores)],
+        key=itemgetter(1)))
+    sorted_labels = []
+    labels = [labels[i] for i in sorted_indexes]
+    fnrs = []
+    fprs = []
+
+    # At the end of this loop, fnrs[i] is the number of target scores less
+    # than or equal to thresholds[i], i.e. the misses (false rejections) at
+    # that threshold.  And, fprs[i] is the number of nontarget scores less
+    # than or equal to thresholds[i], i.e. the correct rejections at that
+    # threshold.
+    for i in range(0, len(labels)):
+        if i == 0:
+            fnrs.append(labels[i])
+            fprs.append(1 - labels[i])
+        else:
+            fnrs.append(fnrs[i-1] + labels[i])
+            fprs.append(fprs[i-1] + 1 - labels[i])
+    fnrs_norm = sum(labels)
+    fprs_norm = len(labels) - fnrs_norm
+
+    # Now divide by the total number of target trials to
+    # obtain the false negative rates across all thresholds.
+    fnrs = [x / float(fnrs_norm) for x in fnrs]
+
+    # Divide by the total number of nontarget trials to get the
+    # true negative rate.  Subtract these quantities from 1 to
+    # get the false positive rates.
+    fprs = [1 - x / float(fprs_norm) for x in fprs]
+    return fnrs, fprs, thresholds
+
+# Computes the minimum of the detection cost function.  The comments refer to
+# equations in Section 3 of the NIST 2016 Speaker Recognition Evaluation Plan.
+def ComputeMinDcf(fnrs, fprs, thresholds, p_target, c_miss, c_fa):
+    min_c_det = float("inf")
+    min_c_det_threshold = thresholds[0]
+    for i in range(0, len(fnrs)):
+        # See Equation (2).  It is a weighted sum of false negative
+        # and false positive errors.
+        c_det = c_miss * fnrs[i] * p_target + c_fa * fprs[i] * (1 - p_target)
+        if c_det < min_c_det:
+            min_c_det = c_det
+            min_c_det_threshold = thresholds[i]
+    # See Equations (3) and (4).  Now we normalize the cost.
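+    # For example, with the script defaults (p_target=0.01, c_miss=c_fa=1),
+    # c_def = min(1 * 0.01, 1 * 0.99) = 0.01, so the minimum detection cost
+    # is divided by 0.01, i.e. scaled up by a factor of 100.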
+ c_def = min(c_miss * p_target, c_fa * (1 - p_target)) + min_dcf = min_c_det / c_def + return min_dcf, min_c_det_threshold + +def main(): + args = GetArgs() + scores_file = open(args.scores_filename, 'r').readlines() + trials_file = open(args.trials_filename, 'r').readlines() + c_miss = args.c_miss + c_fa = args.c_fa + p_target = args.p_target + + scores = [] + labels = [] + + trials = {} + for line in trials_file: + utt1, utt2, target = line.rstrip().split() + trial = utt1 + " " + utt2 + trials[trial] = target + + for line in scores_file: + utt1, utt2, score = line.rstrip().split() + trial = utt1 + " " + utt2 + if trial in trials: + scores.append(float(score)) + if trials[trial] == "target": + labels.append(1) + else: + labels.append(0) + else: + raise Exception("Missing entry for " + utt1 + " and " + utt2 + + " " + args.scores_filename) + + fnrs, fprs, thresholds = ComputeErrorRates(scores, labels) + mindcf, threshold = ComputeMinDcf(fnrs, fprs, thresholds, p_target, + c_miss, c_fa) + sys.stdout.write("{0:.4f}\n".format(mindcf)) + sys.stderr.write("minDCF is {0:.4f} at threshold {1:.4f} (p-target={2}, c-miss={3}," + "c-fa={4})\n".format(mindcf, threshold, p_target,c_miss, c_fa)) + +if __name__ == "__main__": + main() diff --git a/egs/sre16/v1/local/make_mx6_calls.pl b/egs/sre16/v1/local/make_mx6_calls.pl index ed9d6375248..0e38a350890 100755 --- a/egs/sre16/v1/local/make_mx6_calls.pl +++ b/egs/sre16/v1/local/make_mx6_calls.pl @@ -39,6 +39,7 @@ if (system("find $db_base/mx6_speech/data/ulaw_sphere/ -name '*.sph' > $tmp_dir/sph.list") != 0) { die "Error getting list of sph files"; } + open(SPHLIST, "<$tmp_dir/sph.list") or die "cannot open wav list"; while() { diff --git a/egs/sre16/v1/local/nnet3/xvector/prepare_feats_for_egs.sh b/egs/sre16/v1/local/nnet3/xvector/prepare_feats_for_egs.sh index 029070422a8..a35b94150c4 100755 --- a/egs/sre16/v1/local/nnet3/xvector/prepare_feats_for_egs.sh +++ b/egs/sre16/v1/local/nnet3/xvector/prepare_feats_for_egs.sh @@ -43,7 +43,7 @@ mkdir -p $dir/log mkdir -p $data_out featdir=$(utils/make_absolute.sh $dir) -if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then utils/create_split_dir.pl \ /export/b{14,15,16,17}/$USER/kaldi-data/egs/sre16/v2/xvector-$(date +'%m_%d_%H_%M')/xvector_feats/storage $featdir/storage fi diff --git a/egs/sre16/v1/local/nnet3/xvector/tuning/run_xvector_1a.sh b/egs/sre16/v1/local/nnet3/xvector/tuning/run_xvector_1a.sh index 6e87b30f284..6a103ea8bf0 100755 --- a/egs/sre16/v1/local/nnet3/xvector/tuning/run_xvector_1a.sh +++ b/egs/sre16/v1/local/nnet3/xvector/tuning/run_xvector_1a.sh @@ -53,7 +53,7 @@ num_pdfs=$(awk '{print $2}' $data/utt2spk | sort | uniq -c | wc -l) if [ $stage -le 4 ]; then echo "$0: Getting neural network training egs"; # dump egs. - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $egs_dir/storage ]; then utils/create_split_dir.pl \ /export/b{03,04,05,06}/$USER/kaldi-data/egs/sre16/v2/xvector-$(date +'%m_%d_%H_%M')/$egs_dir/storage $egs_dir/storage fi diff --git a/egs/sre16/v2/README.txt b/egs/sre16/v2/README.txt index 0c9cc0d158e..3fe63d3e221 100644 --- a/egs/sre16/v2/README.txt +++ b/egs/sre16/v2/README.txt @@ -1,8 +1,16 @@ This recipe replaces iVectors used in the v1 recipe with embeddings extracted from a deep neural network. In the scripts, we refer to these embeddings as - "xvectors." 
The recipe is based on - http://www.danielpovey.com/files/2017_interspeech_embeddings.pdf but with - improvements due to augmentation in the DNN training data. + "xvectors." The recipe in local/nnet3/xvector/tuning/run_xvector_1a.sh is + closesly based on the following paper: + + @inproceedings{snyder2018xvector, + title={X-vectors: Robust DNN Embeddings for Speaker Recognition}, + author={Snyder, D. and Garcia-Romero, D. and Sell, G. and Povey, D. and Khudanpur, S.}, + booktitle={2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, + year={2018}, + organization={IEEE}, + url={http://www.danielpovey.com/files/2018_icassp_xvectors.pdf} + } The recipe uses the following data for system development. This is in addition to the NIST SRE 2016 dataset used for evaluation (see ../README.txt). diff --git a/egs/sre16/v2/run.sh b/egs/sre16/v2/run.sh index 375b07c0e53..0bc06431138 100755 --- a/egs/sre16/v2/run.sh +++ b/egs/sre16/v2/run.sh @@ -8,11 +8,12 @@ # Results (mostly EERs) are inline in comments below. # # This example demonstrates a "bare bones" NIST SRE 2016 recipe using xvectors. -# In the future, we will add score-normalization and a more effective form of -# PLDA domain adaptation. +# It is closely based on "X-vectors: Robust DNN Embeddings for Speaker +# Recognition" by Snyder et al. In the future, we will add score-normalization +# and a more effective form of PLDA domain adaptation. # -# Pretrained models are available for this recipe. -# See http://kaldi-asr.org/models.html and +# Pretrained models are available for this recipe. See +# http://kaldi-asr.org/models.html and # https://david-ryan-snyder.github.io/2017/10/04/model_sre16_v2.html # for details. diff --git a/egs/swbd/s5c/local/chain/run_tdnn.sh b/egs/swbd/s5c/local/chain/run_tdnn.sh index 2f050be93f2..2f256c1a5aa 120000 --- a/egs/swbd/s5c/local/chain/run_tdnn.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_7o.sh \ No newline at end of file +tuning/run_tdnn_7q.sh \ No newline at end of file diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh index b927cc86823..fb47b1e88ad 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh @@ -197,6 +197,7 @@ if [ $stage -le 13 ]; then steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$train_cmd" \ --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ --chain.xent-regularize $xent_regularize \ diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7p.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7p.sh new file mode 100755 index 00000000000..096ed9c54fd --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7p.sh @@ -0,0 +1,286 @@ +#!/bin/bash + +# 7p is as 7o but adding the option "--constrained false" to --egs.opts. +# This is the new 'unconstrained egs' code where it uses the e2e examples. +# This leads to ~40% speed-up in egs generation. 
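+# The option is passed through --egs.opts in stage 13 below, i.e.
+#   --egs.opts "--frames-overlap-per-eg 0 --constrained false"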
+# +# +# local/chain/compare_wer_general.sh --rt03 tdnn7o_sp tdnn7p_sp +# System tdnn7o_sp tdnn7p_sp +# WER on train_dev(tg) 11.74 11.75 +# WER on train_dev(fg) 10.69 10.83 +# WER on eval2000(tg) 14.6 14.1 +# WER on eval2000(fg) 13.1 12.8 +# WER on rt03(tg) 17.5 17.3 +# WER on rt03(fg) 15.4 15.0 +# Final train prob -0.070 -0.055 +# Final valid prob -0.084 -0.069 +# Final train prob (xent) -0.883 -0.872 +# Final valid prob (xent) -0.9110 -0.9020 +# Num-parameters 22865188 22886776 + +# steps/info/chain_dir_info.pl exp/chain/tdnn7p_sp +# exp/chain/tdnn7p_sp: num-iters=525 nj=3..16 num-params=22.9M dim=40+100->6076 combine=-0.059->-0.058 (over 4) xent:train/valid[348,524,final]=(-1.20,-0.874,-0.872/-1.22,-0.910,-0.902) logprob:train/valid[348,524,final]=(-0.082,-0.055,-0.055/-0.096,-0.070,-0.069) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=7p +suffix= +$speed_perturb && suffix=_sp +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +dir=exp/chain/tdnn${affix}${suffix} +decode_iter= +decode_nj=50 + +# training options +frames_per_eg=150,110,100 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.004 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + linear_opts="orthonormal-constraint=-1.0 l2-regularize=0.004" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=1280 + linear-component name=tdnn2l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn2 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=1280 input=Append(0,1) + linear-component name=tdnn4l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn4l dim=256 $linear_opts input=Append(0,1) + relu-batchnorm-dropout-layer name=tdnn4 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=1280 
input=Append(0, tdnn3l) + linear-component name=tdnn6l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn6 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn7l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn7l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 + linear-component name=tdnn8l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn8l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn8 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn9l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn5l) dim=1280 + linear-component name=tdnn10l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn10l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn10 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn11l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn9l,tdnn7l) dim=1280 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1536 + linear-component name=prefinal-chain-l dim=256 $linear_opts + batchnorm-component name=prefinal-chain-batchnorm + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1536 + linear-component name=prefinal-xent-l dim=256 $linear_opts + batchnorm-component name=prefinal-xent-batchnorm + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + +# --cmd "queue.pl --config /home/dpovey/queue_conly.conf" \ + + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$train_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 8 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0005 \ + --trainer.optimization.final-effective-lrate 0.00005 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
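+      # The online decoder extracts MFCCs (and i-vectors) on the fly from the
+      # recordings listed in wav.scp, using the configuration that
+      # prepare_online_decoding.sh wrote into ${dir}_online/conf, so the
+      # precomputed _hires features on disk are not used here.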
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh new file mode 100755 index 00000000000..8eab54a9dc2 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7q.sh @@ -0,0 +1,274 @@ +#!/bin/bash + +# 7q is as 7p but a modified topology with resnet-style skip connections, more layers, +# skinnier bottlenecks, removing the 3-way splicing and skip-layer splicing, +# and re-tuning the learning rate and l2 regularize. The configs are +# standardized and substantially simplified. There isn't any advantage in WER +# on this setup; the advantage of this style of config is that it also works +# well on smaller datasets, and we adopt this style here also for consistency. + +# local/chain/compare_wer_general.sh --rt03 tdnn7p_sp tdnn7q_sp +# System tdnn7p_sp tdnn7q_sp +# WER on train_dev(tg) 11.80 11.79 +# WER on train_dev(fg) 10.77 10.84 +# WER on eval2000(tg) 14.4 14.3 +# WER on eval2000(fg) 13.0 12.9 +# WER on rt03(tg) 17.5 17.6 +# WER on rt03(fg) 15.3 15.2 +# Final train prob -0.057 -0.058 +# Final valid prob -0.069 -0.073 +# Final train prob (xent) -0.886 -0.894 +# Final valid prob (xent) -0.9005 -0.9106 +# Num-parameters 22865188 18702628 + + +# steps/info/chain_dir_info.pl exp/chain/tdnn7q_sp +# exp/chain/tdnn7q_sp: num-iters=394 nj=3..16 num-params=18.7M dim=40+100->6034 combine=-0.058->-0.057 (over 8) xent:train/valid[261,393,final]=(-1.20,-0.897,-0.894/-1.20,-0.919,-0.911) logprob:train/valid[261,393,final]=(-0.090,-0.059,-0.058/-0.098,-0.073,-0.073) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=7q +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +decode_iter= +decode_nj=50 + +# training options +frames_per_eg=150,110,100 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +suffix= +$speed_perturb && suffix=_sp +dir=exp/chain/tdnn${affix}${suffix} + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
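+  # With --frame-subsampling-factor 3 the 'chain' model produces outputs at
+  # one third of the input frame rate (one output every 30 ms for 10 ms
+  # frames), so the tree is built on correspondingly subsampled alignments.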
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + affine_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1536 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=1536 bottleneck-dim=160 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1536 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + +# --cmd "queue.pl --config /home/dpovey/queue_conly.conf" \ + + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$train_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 6 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.00025 \ + --trainer.optimization.final-effective-lrate 0.000025 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0; diff --git a/egs/swbd/s5c/local/rnnlm/run_tdnn_lstm_back.sh b/egs/swbd/s5c/local/rnnlm/run_tdnn_lstm_back.sh new file mode 120000 index 00000000000..0fe3d76834e --- /dev/null +++ b/egs/swbd/s5c/local/rnnlm/run_tdnn_lstm_back.sh @@ -0,0 +1 @@ +tuning/run_tdnn_lstm_back_1e.sh \ No newline at end of file diff --git a/egs/swbd/s5c/local/rnnlm/tuning/run_tdnn_lstm_1e.sh b/egs/swbd/s5c/local/rnnlm/tuning/run_tdnn_lstm_1e.sh index 7a99100132f..b7e53b016ea 100755 --- a/egs/swbd/s5c/local/rnnlm/tuning/run_tdnn_lstm_1e.sh +++ b/egs/swbd/s5c/local/rnnlm/tuning/run_tdnn_lstm_1e.sh @@ -32,6 +32,8 @@ train_stage=-10 # variables for lattice rescoring run_lat_rescore=true run_nbest_rescore=true +run_backward_rnnlm=false + ac_model_dir=exp/nnet3/tdnn_lstm_1a_adversarial0.3_epochs12_ld5_sp decode_dir_suffix=rnnlm_1e ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order @@ -130,10 +132,10 @@ if [ $stage -le 4 ] && $run_lat_rescore; then # Lattice rescoring rnnlm/lmrescore$pruned.sh \ --cmd "$decode_cmd --mem 4G" \ - --weight 0.5 --max-ngram-order $ngram_order \ + --weight 0.45 --max-ngram-order $ngram_order \ data/lang_$LM $dir \ data/${decode_set}_hires ${decode_dir} \ - ${decode_dir}_${decode_dir_suffix} + ${decode_dir}_${decode_dir_suffix}_0.45 done fi @@ -151,4 +153,10 @@ if [ $stage -le 5 ] && $run_nbest_rescore; then done fi +# running backward RNNLM, which further improves WERS by combining backward with +# the forward RNNLM trained in this script. +if [ $stage -le 6 ] && $run_backward_rnnlm; then + local/rnnlm/run_tdnn_lstm_back.sh +fi + exit 0 diff --git a/egs/swbd/s5c/local/rnnlm/tuning/run_tdnn_lstm_back_1e.sh b/egs/swbd/s5c/local/rnnlm/tuning/run_tdnn_lstm_back_1e.sh new file mode 100755 index 00000000000..4f85ed8f28b --- /dev/null +++ b/egs/swbd/s5c/local/rnnlm/tuning/run_tdnn_lstm_back_1e.sh @@ -0,0 +1,142 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) +# 2015 Guoguo Chen +# 2017 Hainan Xu +# 2017 Xiaohui Zhang + +# This script trains a backward LMs on the swbd LM-training data, and use it +# to rescore either decoded lattices, or lattices that are just rescored with +# a forward RNNLM. In order to run this, you must first run the forward RNNLM +# recipe at local/rnnlm/run_tdnn_lstm.sh + +# rnnlm/train_rnnlm.sh: best iteration (out of 35) was 34, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 41.8 / 55.1. 
+# Train objf: -5.18 -4.46 -4.26 -4.18 -4.12 -4.07 -4.04 -4.00 -3.99 -3.98 -3.95 -3.93 -3.91 -3.90 -3.88 -3.87 -3.86 -3.85 -3.83 -3.82 -3.82 -3.81 -3.79 -3.79 -3.78 -3.77 -3.76 -3.77 -3.75 -3.74 -3.74 -3.73 -3.72 -3.71 -3.71 +# Dev objf: -10.32 -4.89 -4.57 -4.45 -4.37 -4.33 -4.29 -4.26 -4.24 -4.22 -4.18 -4.17 -4.15 -4.14 -4.13 -4.12 -4.11 -4.10 -4.09 -4.08 -4.07 -4.06 -4.06 -4.05 -4.05 -4.05 -4.04 -4.04 -4.03 -4.03 -4.02 -4.02 -4.02 -4.01 -4.01 + +# %WER 11.1 | 1831 21395 | 89.9 6.4 3.7 1.0 11.1 46.3 | exp/nnet3/tdnn_lstm_1a_adversarial0.3_epochs12_ld5_sp/decode_eval2000_sw1_fsh_fg_looped/score_13_0.0/eval2000_hires.ctm.swbd.filt.sys +# %WER 9.9 | 1831 21395 | 91.0 5.8 3.2 0.9 9.9 43.2 | exp/nnet3/tdnn_lstm_1a_adversarial0.3_epochs12_ld5_sp/decode_eval2000_sw1_fsh_fg_looped_rnnlm_1e/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys +# %WER 9.5 | 1831 21395 | 91.4 5.5 3.1 0.9 9.5 42.5 | exp/nnet3/tdnn_lstm_1a_adversarial0.3_epochs12_ld5_sp/decode_eval2000_sw1_fsh_fg_looped_rnnlm_1e_back/score_11_0.0/eval2000_hires.ctm.swbd.filt.sys + +# %WER 15.9 | 4459 42989 | 85.7 9.7 4.6 1.6 15.9 51.6 | exp/nnet3/tdnn_lstm_1a_adversarial0.3_epochs12_ld5_sp/decode_eval2000_sw1_fsh_fg_looped/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 14.4 | 4459 42989 | 87.0 8.7 4.3 1.5 14.4 49.4 | exp/nnet3/tdnn_lstm_1a_adversarial0.3_epochs12_ld5_sp/decode_eval2000_sw1_fsh_fg_looped_rnnlm_1e/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 13.9 | 4459 42989 | 87.6 8.4 4.0 1.5 13.9 48.6 | exp/nnet3/tdnn_lstm_1a_adversarial0.3_epochs12_ld5_sp/decode_eval2000_sw1_fsh_fg_looped_rnnlm_1e_back/score_10_0.0/eval2000_hires.ctm.filt.sys + +# Begin configuration section. + +dir=exp/rnnlm_lstm_1e_backward +embedding_dim=1024 +lstm_rpd=256 +lstm_nrpd=256 +stage=-10 +train_stage=-10 + +# variables for lattice rescoring +run_lat_rescore=true +ac_model_dir=exp/nnet3/tdnn_lstm_1a_adversarial0.3_epochs12_ld5_sp +decode_dir_suffix_forward=rnnlm_1e +decode_dir_suffix_backward=rnnlm_1e_back +ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order + # if it's set, it merges histories in the lattice if they share + # the same ngram history and this prevents the lattice from + # exploding exponentially + +. ./cmd.sh +. ./utils/parse_options.sh + +text=data/train_nodev/text +fisher_text=data/local/lm/fisher/text1.gz +lexicon=data/local/dict_nosp/lexiconp.txt +text_dir=data/rnnlm/text_nosp_1e_back +mkdir -p $dir/config +set -e + +for f in $text $lexicon; do + [ ! -f $f ] && \ + echo "$0: expected file $f to exist; search for local/wsj_extend_dict.sh in run.sh" && exit 1 +done + +if [ $stage -le 0 ]; then + mkdir -p $text_dir + echo -n >$text_dir/dev.txt + # hold out one in every 50 lines as dev data. + cat $text | cut -d ' ' -f2- | awk '{for(i=NF;i>0;i--) printf("%s ", $i); print""}' | awk -v text_dir=$text_dir '{if(NR%50 == 0) { print >text_dir"/dev.txt"; } else {print;}}' >$text_dir/swbd.txt + cat > $dir/config/hesitation_mapping.txt <0;i--) printf("%s ", $i); print""}' > $text_dir/fisher.txt +fi + +if [ $stage -le 1 ]; then + cp data/lang/words.txt $dir/config/ + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. 
+ echo "" >$dir/config/oov.txt + + cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --special-words=',,,,[noise],[laughter],[vocalized-noise]' \ + $dir/config/words.txt > $dir/config/features.txt + + cat >$dir/config/xconfig <3600 combine=-0.060->-0.060 (over 2) xent:train/valid[71,107,final]=(-1.30,-0.985,-0.979/-1.29,-1.00,-0.995) logprob:train/valid[71,107,final]=(-0.098,-0.065,-0.064/-0.100,-0.075,-0.075) + +## how you run this (note: this assumes that the run_tdnn.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn.sh + +# without cleanup: +# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnn_affix=1g #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. +remove_egs=true + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +if [ $stage -le 17 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + affine_opts="l2-regularize=0.008 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.008 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.008 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.008" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=1024 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + linear-component name=prefinal-l dim=256 $linear_opts + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1024 small-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + +fi + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0 --constrained false" \ + --egs.chunk-width 150,110,100 \ + --trainer.num-chunk-per-minibatch 64 \ + --trainer.frames-per-iter 5000000 \ + --trainer.num-epochs 6 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.00025 \ + --trainer.optimization.final-effective-lrate 0.000025 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/tedlium/s5_r2/local/ted_download_lm.sh b/egs/tedlium/s5_r2/local/ted_download_lm.sh new file mode 100755 index 00000000000..11ee5bc82d6 --- /dev/null +++ b/egs/tedlium/s5_r2/local/ted_download_lm.sh @@ -0,0 +1,17 @@ +#!/bin/bash +# +# Copyright 2018 David Snyder +# Apache 2.0 +# +# This script downloads pre-built language models trained on the Cantab-Tedlium +# text data and Tedlium acoustic training data. If you want to build these +# models yourself, run the script local/ted_train_lm.sh. 
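+#
+# The ARPA files are placed under data/local/local_lm/data/arpa (see the wget
+# destinations below), which is the same location local/ted_train_lm.sh would
+# write them to, so the later LM formatting stages should work unchanged.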
+ +set -e + +echo "$0: downloading Tedlium 4 gram language models (it won't re-download if it was already downloaded.)" +wget --continue http://kaldi-asr.org/models/5/4gram_small.arpa.gz -P data/local/local_lm/data/arpa || exit 1 +wget --continue http://kaldi-asr.org/models/5/4gram_big.arpa.gz -P data/local/local_lm/data/arpa || exit 1 + +exit 0 + diff --git a/egs/tedlium/s5_r2/run.sh b/egs/tedlium/s5_r2/run.sh index 161c416d1f9..e7b5df6055e 100755 --- a/egs/tedlium/s5_r2/run.sh +++ b/egs/tedlium/s5_r2/run.sh @@ -57,10 +57,12 @@ if [ $stage -le 3 ]; then fi if [ $stage -le 4 ]; then - # later on we'll change this script so you have the option to - # download the pre-built LMs from openslr.org instead of building them + # Download the pre-built LMs from kaldi-asr.org instead of building them # locally. - local/ted_train_lm.sh + local/ted_download_lm.sh + # Uncomment this script to build the language models instead of + # downloading them from kaldi-asr.org. + # local/ted_train_lm.sh fi if [ $stage -le 5 ]; then diff --git a/egs/tedlium/s5_r3/cmd.sh b/egs/tedlium/s5_r3/cmd.sh new file mode 100755 index 00000000000..56c1d783a9e --- /dev/null +++ b/egs/tedlium/s5_r3/cmd.sh @@ -0,0 +1,15 @@ +# "queue.pl" uses qsub. The options to it are +# options to qsub. If you have GridEngine installed, +# change this to a queue you have access to. +# Otherwise, use "run.pl", which will run jobs locally +# (make sure your --num-jobs options are no more than +# the number of cpus on your machine. + +# Run locally: +#export train_cmd=run.pl +#export decode_cmd=run.pl + +# JHU cluster (or most clusters using GridEngine, with a suitable +# conf/queue.conf). +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" diff --git a/egs/tedlium/s5_r3/conf/mfcc.conf b/egs/tedlium/s5_r3/conf/mfcc.conf new file mode 100644 index 00000000000..32988403b00 --- /dev/null +++ b/egs/tedlium/s5_r3/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false +--sample-frequency=16000 diff --git a/egs/tedlium/s5_r3/conf/mfcc_hires.conf b/egs/tedlium/s5_r3/conf/mfcc_hires.conf new file mode 100644 index 00000000000..434834a6725 --- /dev/null +++ b/egs/tedlium/s5_r3/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. +--high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) diff --git a/egs/tedlium/s5_r3/conf/online_cmvn.conf b/egs/tedlium/s5_r3/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/tedlium/s5_r3/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/tedlium/s5_r3/local/chain/compare_wer_general.sh b/egs/tedlium/s5_r3/local/chain/compare_wer_general.sh new file mode 100755 index 00000000000..88dde1ff0e2 --- /dev/null +++ b/egs/tedlium/s5_r3/local/chain/compare_wer_general.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. 
local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_c_sp exp/chain_cleaned/tdnn_c_sp_smbr:{1,2,3} + + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. +# If called with a colon-free directory name, like: +# set_names exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain_cleaned/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain_cleaned/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain_cleaned/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=("# WER on dev(orig) " "# WER on dev(rescored) " "# WER on test(orig) " "# WER on test(rescored)") + +for n in 0 1 2 3; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(dev${epoch_infix} dev${epoch_infix}_rescore test${epoch_infix} test${epoch_infix}_rescore) + wer=$(grep Sum $dirname/decode_${decode_names[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(dev${epoch_infix} dev${epoch_infix}_rescore test${epoch_infix} test${epoch_infix}_rescore) + wer=$(grep Sum $dirname/decode_looped_${decode_names[$n]}/score*/*ys | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
+fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done +echo diff --git a/egs/tedlium/s5_r3/local/chain/run_tdnnf.sh b/egs/tedlium/s5_r3/local/chain/run_tdnnf.sh new file mode 120000 index 00000000000..61f8f499182 --- /dev/null +++ b/egs/tedlium/s5_r3/local/chain/run_tdnnf.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1b.sh \ No newline at end of file diff --git a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7_bs.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh similarity index 58% rename from egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7_bs.sh rename to egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh index 694e8e05027..40cdcb5b5ff 100755 --- a/egs/babel/s5d/local/chain/tuning/run_tdnn_lstm_bab7_bs.sh +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1a.sh @@ -1,42 +1,57 @@ #!/bin/bash +# Results -# by default, with cleanup -# this is basically the same as the run_tdnn_lstm_bab7.sh -# with dropout and backstitch (+more iterations as the convergence with dropout and backstitch is slower) -# please note that the language(s) was not selected for any particular reason (other to represent the various sizes of babel datasets) -# 304-lithuanian | %WER 38.1 | 20041 61492 | 64.5 26.1 9.4 2.6 38.1 28.1 | -0.242 | exp/chain_cleaned/tdnn_lstm_bab7_bs_sp/decode_dev10h.pem/score_9/dev10h.pem.ctm.sys -# num-iters=120 nj=2..12 num-params=36.7M dim=43+100->3273 combine=-0.161->-0.151 -# xent:train/valid[79,119,final]=(-2.35,-1.73,-1.71/-2.49,-2.04,-2.03) -# logprob:train/valid[79,119,final]=(-0.191,-0.138,-0.136/-0.225,-0.201,-0.202) -# 206-zulu | %WER 50.7 | 22805 52162 | 53.1 36.8 10.0 3.8 50.7 30.5 | -0.553 | exp/chain_cleaned/tdnn_lstm_bab7_bs_sp/decode_dev10h.pem/score_10/dev10h.pem.ctm.sys -# num-iters=167 nj=2..12 num-params=36.7M dim=43+100->3274 combine=-0.197->-0.190 -# xent:train/valid[110,166,final]=(-2.44,-1.85,-1.83/-2.55,-2.14,-2.13) -# logprob:train/valid[110,166,final]=(-0.230,-0.171,-0.170/-0.269,-0.244,-0.245) -# 104-pashto | %WER 38.5 | 21825 101803 | 65.4 24.6 10.0 3.8 38.5 29.7 | -0.418 | exp/chain_cleaned/tdnn_lstm_bab7_bs_sp/decode_dev10h.pem/score_9/dev10h.pem.ctm.sys -# num-iters=214 nj=2..12 num-params=36.8M dim=43+100->3328 combine=-0.173->-0.168 -# xent:train/valid[141,213,final]=(-2.37,-1.69,-1.69/-2.54,-2.05,-2.05) -# logprob:train/valid[141,213,final]=(-0.208,-0.151,-0.151/-0.256,-0.224,-0.224) +# System tdnn_1a +# Scoring script sclite +# WER on dev(orig) 8.2 +# WER on dev(rescored ngram) 7.6 +# WER on dev(rescored rnnlm) 6.3 +# WER on test(orig) 8.1 +# WER on test(rescored ngram) 7.7 +# WER on test(rescored rnnlm) 6.7 +# Final train prob -0.0802 +# Final valid prob -0.0980 +# Final train prob (xent) -1.1450 +# Final valid prob 
(xent) -1.2498 +# Num-params 26651840 + + + +## how you run this (note: this assumes that the run_tdnn.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn.sh + +# without cleanup: +# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run the corresponding non-chain nnet3 system +# (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + +# This script is like run_tdnn_1a.sh except it uses an xconfig-based mechanism +# to get the configuration. set -e -o pipefail # First the options that are passed through to run_ivector_common.sh # (some of which are also used in this script directly). -stage=17 +stage=0 nj=30 -dropout_schedule='0,0@0.20,0.3@0.50,0' +decode_nj=30 +min_seg_len=1.55 +xent_regularize=0.1 train_set=train_cleaned -gmm=tri5_cleaned # the gmm for the target data -langdir=data/langp/tri5_ali -num_threads_ubm=12 +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned # The rest are configs specific to this script. Most of the parameters # are just hardcoded at this level, in the commands below. train_stage=-10 tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. -tdnn_affix="_bab7_bs" #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. -common_egs_dir=exp/chain_cleaned/tdnn_lstm_sp/egs # you can set this to use previously dumped egs. +tdnn_affix=_1a #affix for TDNN directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. # End configuration section. echo "$0 $@" # Print the command line for logging @@ -54,8 +69,9 @@ where "nvcc" is installed. EOF fi -local/chain/run_ivector_common.sh --stage $stage \ +local/nnet3/run_ivector_common.sh --stage $stage \ --nj $nj \ + --min-seg-len $min_seg_len \ --train-set $train_set \ --gmm $gmm \ --num-threads-ubm $num_threads_ubm \ @@ -66,7 +82,7 @@ gmm_dir=exp/$gmm ali_dir=exp/${gmm}_ali_${train_set}_sp tree_dir=exp/chain${nnet3_affix}/tree${tree_affix} lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats -dir=exp/chain${nnet3_affix}/tdnn_lstm${tdnn_affix}_sp +dir=exp/chain${nnet3_affix}/tdnn${tdnn_affix}_sp train_data_dir=data/${train_set}_sp_hires lores_train_data_dir=data/${train_set}_sp train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires @@ -91,7 +107,7 @@ if [ $stage -le 14 ]; then exit 1; fi else - cp -r $langdir data/lang_chain + cp -r data/lang data/lang_chain silphonelist=$(cat data/lang_chain/phones/silence.csl) || exit 1; nonsilphonelist=$(cat data/lang_chain/phones/nonsilence.csl) || exit 1; # Use our special topology... note that later on may have to tune this @@ -104,7 +120,7 @@ if [ $stage -le 15 ]; then # Get the alignments as lattices (gives the chain training more freedom). 
# use the same num-jobs as the alignments steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ - $langdir $gmm_dir $lat_dir + data/lang $gmm_dir $lat_dir rm $lat_dir/fsts.*.gz # save space fi @@ -118,48 +134,38 @@ if [ $stage -le 16 ]; then fi steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ --context-opts "--context-width=2 --central-position=1" \ - --leftmost-questions-truncate -1 \ --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir fi -xent_regularize=0.1 if [ $stage -le 17 ]; then mkdir -p $dir echo "$0: creating neural net configs using the xconfig parser"; num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - [ -z $num_targets ] && { echo "$0: error getting num-targets"; exit 1; } learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - lstm_opts="decay-time=20 dropout-proportion=0.0" - label_delay=5 mkdir -p $dir/configs cat < $dir/configs/network.xconfig input dim=100 name=ivector - input dim=43 name=input + input dim=40 name=input # please note that it is important to have input layer with the name=input # as the layer immediately preceding the fixed-affine-layer to enable # the use of short notation for the descriptor - fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat # the first splicing is moved before the lda layer, so no splicing here - relu-renorm-layer name=tdnn1 dim=1024 - relu-renorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 - relu-renorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 - - # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults - fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts - relu-renorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 - relu-renorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 - fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts - relu-renorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 - relu-renorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 - fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-batchnorm-layer name=tdnn1 dim=1024 self-repair-scale=1.0e-04 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1,2) dim=1024 + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn6 input=Append(-6,-3,0) dim=1024 ## adding the layers for chain branch - output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=1024 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 # adding the layers for xent branch # This block prints the configs for a separate output that will be @@ -170,7 +176,8 @@ if [ $stage -le 17 ]; then # final-layer learns at a rate independent of the regularization # constant; and the 0.5 was tuned so as to make the relative progress # similar in the xent and regular final layers. 
- output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=1024 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 EOF steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ @@ -180,35 +187,31 @@ fi if [ $stage -le 18 ]; then if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then utils/create_split_dir.pl \ - /export/b0{5,6,7,8}/$USER/kaldi-data/egs/babel-$(date +'%m_%d_%H_%M')/s5d/$RANDOM/$dir/egs/storage $dir/egs/storage + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage fi - [ ! -d $dir/egs ] && mkdir -p $dir/egs/ - touch $dir/egs/.nodelete # keep egs around when that run dies. steps/nnet3/chain/train.py --stage $train_stage \ --cmd "$decode_cmd" \ --feat.online-ivector-dir $train_ivector_dir \ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ - --chain.xent-regularize $xent_regularize \ + --chain.xent-regularize 0.1 \ --chain.leaky-hmm-coefficient 0.1 \ --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --chain.lm-opts="--num-extra-lm-states=2000" \ --egs.dir "$common_egs_dir" \ --egs.opts "--frames-overlap-per-eg 0" \ - --egs.chunk-width 150 \ + --egs.chunk-width 150,110,100 \ --trainer.num-chunk-per-minibatch 128 \ --trainer.frames-per-iter 1500000 \ - --trainer.num-epochs 10 \ + --trainer.num-epochs 4 \ + --trainer.optimization.proportional-shrink 10 \ --trainer.optimization.num-jobs-initial 2 \ - --trainer.optimization.num-jobs-final 12 \ - --trainer.dropout-schedule $dropout_schedule \ - --trainer.optimization.backstitch-training-scale 1 \ - --trainer.optimization.backstitch-training-interval 4 \ + --trainer.optimization.num-jobs-final 6 \ --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.max-param-change 2.0 \ - --cleanup.remove-egs true \ + --cleanup.remove-egs false \ --feat-dir $train_data_dir \ --tree-dir $tree_dir \ --lat-dir $lat_dir \ @@ -221,7 +224,26 @@ if [ $stage -le 19 ]; then # Note: it might appear that this data/lang_chain directory is mismatched, and it is as # far as the 'topo' is concerned, but this script doesn't read the 'topo' from # the lang directory. 
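# The --self-loop-scale 1.0 below is the standard setting for building 'chain' graphs;
# it goes together with the --acwt 1.0 --post-decode-acwt 10.0 options used in the
# decoding stage that follows. A quick sanity check once the graph has been built
# (fstinfo is an OpenFst tool; the path matches $dir/graph for the default affixes):
#   fstinfo exp/chain_cleaned/tdnn_1a_sp/graph/HCLG.fst | grep -E '# of (states|arcs)'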
- utils/mkgraph.sh --self-loop-scale 1.0 data/langp_test $dir $dir/graph + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph fi +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi exit 0 diff --git a/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh new file mode 100755 index 00000000000..f8eec8c5213 --- /dev/null +++ b/egs/tedlium/s5_r3/local/chain/tuning/run_tdnn_1b.sh @@ -0,0 +1,257 @@ +#!/bin/bash + +# run_tdnn_1b.sh is the script which results are presented in the corpus release paper. +# It uses 2 to 6 jobs and add proportional-shrink 10. + +# WARNING +# This script is flawed and misses key elements to optimize the tdnnf setup. +# You can run it as is to reproduce results from the corpus release paper, +# but a more up-to-date version should be looked at in other egs until another +# setup is added here. + +# local/chain/compare_wer_general.sh exp/chain_cleaned/tdnn_1a exp/chain_cleaned/tdnn_1b +# System tdnn_1a tdnn_1b tdnn_1b +# Scoring script sclite sclite score_basic +# WER on dev(orig) 8.2 7.9 7.9 +# WER on dev(rescored ngram) 7.6 7.4 7.5 +# WER on dev(rescored rnnlm) 6.3 6.2 6.2 +# WER on test(orig) 8.1 8.0 8.2 +# WER on test(rescored ngram) 7.7 7.7 7.9 +# WER on test(rescored rnnlm) 6.7 6.7 6.8 +# Final train prob -0.0802 -0.0899 +# Final valid prob -0.0980 -0.0974 +# Final train prob (xent) -1.1450 -0.9449 +# Final valid prob (xent) -1.2498 -1.0002 +# Num-params 26651840 25782720 + + +## how you run this (note: this assumes that the run_tdnn.sh soft link points here; +## otherwise call it directly in its location). +# by default, with cleanup: +# local/chain/run_tdnn.sh + +# without cleanup: +# local/chain/run_tdnn.sh --train-set train --gmm tri3 --nnet3-affix "" & + +# note, if you have already run the corresponding non-chain nnet3 system +# (local/nnet3/run_tdnn.sh), you may want to run with --stage 14. + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +decode_nj=30 +min_seg_len=1.55 +xent_regularize=0.1 +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +num_threads_ubm=32 +nnet3_affix=_cleaned # cleanup affix for nnet3 and chain dirs, e.g. _cleaned + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tdnnf_affix=_1a #affix for TDNNF directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! 
cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 15 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 16 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 4000 ${lores_train_data_dir} data/lang_chain $ali_dir $tree_dir +fi + +if [ $stage -le 17 ]; then + mkdir -p $dir + + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=1280 + linear-component name=tdnn2l dim=256 input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 input=Append(0,1) dim=1280 + linear-component name=tdnn3l dim=256 + relu-batchnorm-layer name=tdnn3 dim=1280 + linear-component name=tdnn4l dim=256 input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 input=Append(0,1) dim=1280 + linear-component name=tdnn5l dim=256 + relu-batchnorm-layer name=tdnn5 dim=1280 input=Append(tdnn5l, tdnn3l) + linear-component name=tdnn6l dim=256 input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 input=Append(0,3) dim=1280 + linear-component name=tdnn7l dim=256 input=Append(-3,0) + relu-batchnorm-layer name=tdnn7 input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1280 + linear-component name=tdnn8l dim=256 input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 input=Append(0,3) dim=1280 + linear-component name=tdnn9l dim=256 input=Append(-3,0) + relu-batchnorm-layer name=tdnn9 input=Append(0,3,tdnn8l,tdnn6l,tdnn4l) dim=1280 + linear-component name=tdnn10l dim=256 input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 input=Append(0,3) dim=1280 + linear-component name=tdnn11l dim=256 input=Append(-3,0) + relu-batchnorm-layer name=tdnn11 input=Append(0,3,tdnn10l,tdnn8l,tdnn6l) dim=1280 + linear-component name=prefinal-l dim=256 + relu-batchnorm-layer name=prefinal-chain input=prefinal-l dim=1280 + output-layer name=output include-log-softmax=false dim=$num_targets + relu-batchnorm-layer name=prefinal-xent input=prefinal-l dim=1280 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + +fi + +if [ $stage -le 18 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize 0.1 \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width 150 \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.proportional-shrink 10 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 6 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs false \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + +if [ $stage -le 19 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang $dir $dir/graph +fi + +if [ $stage -le 20 ]; then + rm $dir/.error 2>/dev/null || true + for dset in dev test; do + ( + steps/nnet3/decode.sh --num-threads 4 --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${dset}_hires \ + --scoring-opts "--min-lmwt 5 " \ + $dir/graph data/${dset}_hires $dir/decode_${dset} || exit 1; + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset}_hires ${dir}/decode_${dset} ${dir}/decode_${dset}_rescore || exit 1 + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi +exit 0 diff --git a/egs/tedlium/s5_r3/local/download_data.sh b/egs/tedlium/s5_r3/local/download_data.sh new file mode 100755 index 00000000000..49de5b12372 --- /dev/null +++ b/egs/tedlium/s5_r3/local/download_data.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# Copyright 2014 Nickolay V. Shmyrev +# 2014 Brno University of Technology (Author: Karel Vesely) +# 2016 John Hopkins University (author: Daniel Povey) +# Apache 2.0 + +mkdir -p db + +cd db ### Note: the rest of this script is executed from the directory 'db'. + +# TED-LIUM database: +if [[ $(hostname -f) == *.clsp.jhu.edu ]] ; then + if [ ! -e TEDLIUM_release-3 ]; then + ln -sf /export/corpora5/TEDLIUM_release-3 + fi + echo "$0: linking the TEDLIUM data from /export/corpora5/TEDLIUM_release-3" +else + if [ ! -e TEDLIUM_release-3 ]; then + echo "$0: downloading TEDLIUM_release-3 data (it won't re-download if it was already downloaded.)" + # the following command won't re-get it if it's already there + # because of the --continue switch. + wget --continue http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz || exit 1 + tar xf "TEDLIUM_release-3.tar.gz" + else + echo "$0: not downloading or un-tarring TEDLIUM_release2 because it already exists." 
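# For reference, later stages of this recipe expect the following layout inside the
# extracted release under db/ (all of these paths are referenced by scripts added here):
#   TEDLIUM_release-3/data/                              .sph audio (count checked below)
#   TEDLIUM_release-3/legacy/{train,dev,test}/{stm,sph}  used by local/prepare_data.sh
#   TEDLIUM_release-3/LM/*.en.gz                         used by local/ted_train_lm.sh
#   TEDLIUM_release-3/TEDLIUM.152k.dic                   used by local/prepare_dict.sh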
+ fi +fi + + +num_sph=$(find TEDLIUM_release-3/data -name '*.sph' | wc -l) +if [ "$num_sph" != 2351 ]; then + echo "$0: expected to find 2351 .sph files in the directory db/TEDLIUM_release-3, found $num_sph" + exit 1 +fi + +exit 0 + diff --git a/egs/tedlium/s5_r3/local/format_lms.sh b/egs/tedlium/s5_r3/local/format_lms.sh new file mode 100755 index 00000000000..bba5bbd17ec --- /dev/null +++ b/egs/tedlium/s5_r3/local/format_lms.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# +# Copyright 2014 Nickolay V. Shmyrev +# Apache 2.0 + +if [ -f path.sh ]; then . path.sh; fi + + +small_arpa_lm=data/local/local_lm/data/arpa/4gram_small.arpa.gz +big_arpa_lm=data/local/local_lm/data/arpa/4gram_big.arpa.gz + +for f in $small_arpa_lm $big_arpa_lm data/lang_nosp/words.txt; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 +done + + +set -e + +if [ -f data/lang_nosp/G.fst ] && [ data/lang_nosp/G.fst -nt $small_arpa_lm ]; then + echo "$0: not regenerating data/lang_nosp/G.fst as it already exists and " + echo ".. is newer than the source LM." +else + arpa2fst --disambig-symbol=#0 --read-symbol-table=data/lang_nosp/words.txt \ + "gunzip -c $small_arpa_lm|" data/lang_nosp/G.fst + echo "$0: Checking how stochastic G is (the first of these numbers should be small):" + fstisstochastic data/lang_nosp/G.fst || true + utils/validate_lang.pl --skip-determinization-check data/lang_nosp +fi + + + +if [ -f data/lang_nosp_rescore/G.carpa ] && [ data/lang_nosp_rescore/G.carpa -nt $big_arpa_lm ] && \ + [ data/lang_nosp_rescore/G.carpa -nt data/lang_nosp/words.txt ]; then + echo "$0: not regenerating data/lang_nosp_rescore/ as it seems to already by up to date." +else + utils/build_const_arpa_lm.sh $big_arpa_lm data/lang_nosp data/lang_nosp_rescore || exit 1; +fi + +exit 0; diff --git a/egs/tedlium/s5_r3/local/nnet3/run_ivector_common.sh b/egs/tedlium/s5_r3/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..5322da6240f --- /dev/null +++ b/egs/tedlium/s5_r3/local/nnet3/run_ivector_common.sh @@ -0,0 +1,184 @@ +#!/bin/bash + +set -e -o pipefail + + +# This script is called from local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh (and may eventually +# be called by more scripts). It contains the common feature preparation and iVector-related parts +# of the script. See those scripts for examples of usage. + + +stage=0 +nj=30 + +train_set=train_cleaned # you might set this to e.g. train. +gmm=tri3_cleaned # This specifies a GMM-dir from the features of the type you're training the system on; + # it should contain alignments for 'train_set'. + +num_threads_ubm=32 +nnet3_affix=_cleaned # affix for exp/nnet3 directory to put iVector stuff in, so it + # becomes exp/nnet3_cleaned or whatever. + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + + + +if [ $stage -le 2 ] && [ -f data/${train_set}_sp_hires/feats.scp ]; then + echo "$0: data/${train_set}_sp_hires/feats.scp already exists." + echo " ... Please either remove it, or rerun this script with stage > 2." + exit 1 +fi + + +if [ $stage -le 1 ]; then + echo "$0: preparing directory for speed-perturbed data" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: creating high-resolution MFCC features" + + # this shows how you can split across multiple file-systems. 
we'll split the + # MFCC dir across multiple locations. You might want to be careful here, if you + # have multiple copies of Kaldi checked out and run the same recipe, not to let + # them overwrite each other. + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b0{5,6,7,8}/$USER/kaldi-data/mfcc/tedlium-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp dev test; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires + + for datadir in ${train_set}_sp dev test; do + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires + steps/compute_cmvn_stats.sh data/${datadir}_hires + utils/fix_data_dir.sh data/${datadir}_hires + done +fi + +if [ $stage -le 3 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + # train a diagonal UBM using a subset of about a quarter of the data + num_utts_total=$(wc -l -> : map dev stm labels to be coherent with train + test, + # - -> : --||-- + # - (2) -> null : remove pronunciation variants in transcripts, keep in dictionary + # - -> null : remove marked , it is modelled implicitly (in kaldi) + # - (...) -> null : remove utterance names from end-lines of train + # - it 's -> it's : merge words that contain apostrophe (if compound in dictionary, local/join_suffix.py) + { # Add STM header, so sclite can prepare the '.lur' file + echo ';; +;; LABEL "o" "Overall" "Overall results" +;; LABEL "f0" "f0" "Wideband channel" +;; LABEL "f2" "f2" "Telephone channel" +;; LABEL "male" "Male" "Male Talkers" +;; LABEL "female" "Female" "Female Talkers" +;;' + # Process the STMs + cat db/TEDLIUM_release-3/legacy/$set/stm/*.stm | sort -k1,1 -k2,2 -k4,4n | \ + sed -e 's:([^ ]*)$::' | \ + awk '{ $2 = "A"; print $0; }' + } | local/join_suffix.py > data/$set.orig/stm + + # Prepare 'text' file + # - {NOISE} -> [NOISE] : map the tags to match symbols in dictionary + cat $dir/stm | grep -v -e 'ignore_time_segment_in_scoring' -e ';;' | \ + awk '{ printf ("%s-%07d-%07d", $1, $4*100, $5*100); + for (i=7;i<=NF;i++) { printf(" %s", $i); } + printf("\n"); + }' | tr '{}' '[]' | sort -k1,1 > $dir/text || exit 1 + + # Prepare 'segments', 'utt2spk', 'spk2utt' + cat $dir/text | cut -d" " -f 1 | awk -F"-" '{printf("%s %s %07.2f %07.2f\n", $0, $1, $2/100.0, $3/100.0)}' > $dir/segments + cat $dir/segments | awk '{print $1, $2}' > $dir/utt2spk + cat $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt + + # Prepare 'wav.scp', 'reco2file_and_channel' + cat $dir/spk2utt | awk -v set=$set -v pwd=$PWD '{ printf("%s sph2pipe -f wav -p %s/db/TEDLIUM_release-3/legacy/%s/sph/%s.sph |\n", $1, pwd, set, $1); }' > $dir/wav.scp + cat $dir/wav.scp | awk '{ print $1, $1, "A"; }' > $dir/reco2file_and_channel + + # Create empty 'glm' file + echo ';; empty.glm + [FAKE] => %HESITATION / [ ] __ [ ] ;; hesitation token + ' > data/$set.orig/glm + + # The training set seems to not have enough silence padding in the segmentations, + # especially at the beginning of segments. Extend the times. 
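# Illustrative effect of the padding applied below (times are invented, not from the
# corpus): a segments line such as
#   TalkA-0001230-0002340 TalkA 12.30 23.40
# becomes approximately
#   TalkA-0001230-0002340 TalkA 12.15 23.50
# i.e. the start moves ~0.15s earlier and the end ~0.1s later, subject to the limits
# the script enforces (segments cannot extend past the recording or into each other).
# Afterwards, e.g.:  awk '{print $4-$3}' data/train.orig/segments | sort -n | head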
+ if [ $set == "train" ]; then + mv data/$set.orig/segments data/$set.orig/segments.temp + utils/data/extend_segment_times.py --start-padding=0.15 \ + --end-padding=0.1 data/$set.orig/segments || exit 1 + rm data/$set.orig/segments.temp + fi + + # Check that data dirs are okay! + utils/validate_data_dir.sh --no-feats $dir || exit 1 +done + diff --git a/egs/tedlium/s5_r3/local/prepare_dict.sh b/egs/tedlium/s5_r3/local/prepare_dict.sh new file mode 100755 index 00000000000..204b3f910e5 --- /dev/null +++ b/egs/tedlium/s5_r3/local/prepare_dict.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# +# Copyright 2014 Nickolay V. Shmyrev +# 2014 Brno University of Technology (Author: Karel Vesely) +# 2016 Daniel Galvez +# 2016 Vincent Nguyen +# Apache 2.0 +# + +dir=data/local/dict_nosp +mkdir -p $dir + +srcdict=db/TEDLIUM_release-3/TEDLIUM.152k.dic + +[ ! -r $srcdict ] && echo "Missing $srcdict" && exit 1 + +# Join dicts and fix some troubles +cat $srcdict | grep -v -w "" | grep -v -w "" | grep -v -w "" | \ + LANG= LC_ALL= sort | sed 's:([0-9])::g' > $dir/lexicon_words.txt + +cat $dir/lexicon_words.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \ + grep -v SIL | sort > $dir/nonsilence_phones.txt + +( echo SIL; echo NSN ) > $dir/silence_phones.txt + +echo SIL > $dir/optional_silence.txt + +# No "extra questions" in the input to this setup, as we don't +# have stress or tone. +echo -n >$dir/extra_questions.txt + +# Add to the lexicon the silences, noises etc. +# Typically, you would use " NSN" here, but the Cantab Research language models +# use instead of to represent out of vocabulary words. +echo ' NSN' | cat - $dir/lexicon_words.txt | sort | uniq > $dir/lexicon.txt + +# Check that the dict dir is okay! +utils/validate_dict_dir.pl $dir || exit 1 diff --git a/egs/tedlium/s5_r3/local/rnnlm/average_rnnlm.sh b/egs/tedlium/s5_r3/local/rnnlm/average_rnnlm.sh new file mode 100755 index 00000000000..61ad07645ff --- /dev/null +++ b/egs/tedlium/s5_r3/local/rnnlm/average_rnnlm.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# +# Copyright 2018 François Hernandez (Ubiqus) +# +# This script takes a rnnlm_dir and averages its models. +# +# Takes the default rnnlm_dir of tedlium s5_r3 recipe, +# and average the best model and the 10 previous and +# following ones (if they exist). + + +. ./cmd.sh +. ./path.sh + +set -e -o pipefail -u + +rnnlm_dir=exp/rnnlm_lstm_tdnn_a +begin= +end= + +. utils/parse_options.sh # accept options + +# get the best iteration +best_iter=$(rnnlm/get_best_model.py $rnnlm_dir) + +# get num_iters +info=$(grep "num_iters" $rnnlm_dir/info.txt) +num_iters=${info##*=} + + +# test if begin and end exist +if [ -z $begin ] && [ -z $end ]; then + begin=$(($best_iter-10)) + end=$(($best_iter+10)) + if [ $begin -le 1 ]; then + begin=1 + fi + if [ ! 
$end -le $num_iters ]; then + end=$num_iters + fi +fi + +# create list of models and embeddings files to merge +models="" +embeddings="" +for num in $(seq -s' ' $begin $end); do + [ -f $rnnlm_dir/$num.raw ] && \ + models=$models" $rnnlm_dir/$num.raw" + [ -f $rnnlm_dir/feat_embedding.$num.mat ] && \ + embeddings=$embeddings" $rnnlm_dir/feat_embedding.$num.mat" +done + +# merge list of files +mkdir -p ${rnnlm_dir}_averaged +nnet3-average $models ${rnnlm_dir}_averaged/final.raw +matrix-sum --average=true $embeddings ${rnnlm_dir}_averaged/feat_embedding.final.mat + +# copy other files to averaged rnnlm_dir +cp -r $rnnlm_dir/{info.txt,word_feats.txt,config,special_symbol_opts.txt} ${rnnlm_dir}_averaged + diff --git a/egs/tedlium/s5_r3/local/rnnlm/prepare_rnnlm_data.sh b/egs/tedlium/s5_r3/local/rnnlm/prepare_rnnlm_data.sh new file mode 100755 index 00000000000..ba6252450da --- /dev/null +++ b/egs/tedlium/s5_r3/local/rnnlm/prepare_rnnlm_data.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# To be run from the egs/ directory. + +. path.sh + +set -e -o pipefail -u + +# it should contain things like +# foo.txt, bar.txt, and dev.txt (dev.txt is a special filename that's +# obligatory). +data_dir=data/rnnlm +dir=exp/rnnlm/ +mkdir -p $dir + +# validata data dir +rnnlm/validate_data_dir.py $data_dir/data/ + +# get unigram counts +rnnlm/get_unigram_counts.sh $data_dir/data/ + +# get vocab +mkdir -p $data_dir/vocab +rnnlm/get_vocab.py $data_dir/data > $data_dir/vocab/words.txt + +# Choose weighting and multiplicity of data. +# The following choices would mean that data-source 'foo' +# is repeated once per epoch and has a weight of 0.5 in the +# objective function when training, and data-source 'bar' is repeated twice +# per epoch and has a data -weight of 1.5. +# There is no contraint that the average of the data weights equal one. +# Note: if a data-source has zero multiplicity, it just means you are ignoring +# it; but you must include all data-sources. +#cat > exp/foo/data_weights.txt < $dir/data_weights.txt < $dir/unigram_probs.txt + +# choose features +rnnlm/choose_features.py --unigram-probs=$dir/unigram_probs.txt \ + $data_dir/vocab/words.txt > $dir/features.txt +# validate features +rnnlm/validate_features.py $dir/features.txt + +# make features for word +rnnlm/make_word_features.py --unigram-probs=$dir/unigram_probs.txt \ + $data_dir/vocab/words.txt $dir/features.txt \ + > $dir/word_feats.txt + +# validate word features +rnnlm/validate_word_features.py --features-file $dir/features.txt \ + $dir/word_feats.txt diff --git a/egs/tedlium/s5_r3/local/rnnlm/run_lstm_tdnn.sh b/egs/tedlium/s5_r3/local/rnnlm/run_lstm_tdnn.sh new file mode 120000 index 00000000000..72a3172db41 --- /dev/null +++ b/egs/tedlium/s5_r3/local/rnnlm/run_lstm_tdnn.sh @@ -0,0 +1 @@ +tuning/run_lstm_tdnn_a.sh \ No newline at end of file diff --git a/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh b/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh new file mode 100755 index 00000000000..32252db937d --- /dev/null +++ b/egs/tedlium/s5_r3/local/rnnlm/tuning/run_lstm_tdnn_a.sh @@ -0,0 +1,109 @@ +#!/bin/bash + +# Copyright 2012 Johns Hopkins University (author: Daniel Povey) Tony Robinson +# 2017 Hainan Xu +# 2017 Ke Li +# 2018 François Hernandez (Ubiqus) +# +# rnnlm/train_rnnlm.sh: best iteration (out of 1060) was 1050, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 90.0 / 92.0. 
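# Usage sketch for local/rnnlm/average_rnnlm.sh (added above in this change), which is
# normally run right after this script finishes; by default it averages the models from
# roughly 10 iterations either side of the best one, and --begin/--end can override
# that window (the iteration numbers here are only illustrative):
#   local/rnnlm/average_rnnlm.sh --rnnlm-dir exp/rnnlm_lstm_tdnn_a
#   local/rnnlm/average_rnnlm.sh --rnnlm-dir exp/rnnlm_lstm_tdnn_a --begin 1040 --end 1060
# The result goes to exp/rnnlm_lstm_tdnn_a_averaged/ (final.raw, feat_embedding.final.mat).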
+ +# System tdnn_1a tdnnf_1a +# WER on dev(orig) 8.2 7.9 +# WER on dev(ngram) 7.6 7.2 +# WER on dev(rnnlm) 6.3 6.1 +# WER on test(orig) 8.1 8.0 +# WER on test(ngram) 7.7 7.5 +# WER on test(rnnlm) 6.7 6.6 + +# Begin configuration section. +dir=exp/rnnlm_lstm_tdnn_a +embedding_dim=800 +lstm_rpd=200 +lstm_nrpd=200 +stage=-10 +train_stage=-10 +epochs=20 + +. ./cmd.sh +. utils/parse_options.sh +[ -z "$cmd" ] && cmd=$train_cmd + +text_from_audio=data/train/text +text=data/LM/train.txt +wordlist=data/lang_chain/words.txt +dev_sents=10000 +text_dir=data/rnnlm/text +mkdir -p $dir/config +set -e + +for f in $text $wordlist; do + [ ! -f $f ] && \ + echo "$0: expected file $f to exist; search for local/prepare_data.sh and utils/prepare_lang.sh in run.sh" && exit 1 +done + +if [ $stage -le 0 ]; then + mkdir -p $text_dir + # shuffle text from audio and lm + cat $text_from_audio | cut -d ' ' -f2- | cat $text |\ + shuf > data/rnnlm/full_lm_data.shuffled + # create dev and train sets based on audio and LM data + cat data/rnnlm/full_lm_data.shuffled | head -n $dev_sents> $text_dir/dev.txt + cat data/rnnlm/full_lm_data.shuffled | tail -n +$[$dev_sents+1] > $text_dir/ted.txt + +fi + +if [ $stage -le 1 ]; then + cp $wordlist $dir/config/ + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. + echo "" >$dir/config/oov.txt + + cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --top-word-features=10000 \ + --min-frequency 1.0e-03 \ + --special-words=',,,' \ + $dir/config/words.txt > $dir/config/features.txt + + cat >$dir/config/xconfig < " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --min_lmwt # minumum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + exit 1; +fi + +data=$1 +lang_or_graph=$2 +dir=$3 + +symtab=$lang_or_graph/words.txt + +for f in $symtab $dir/lat.1.gz $data/text; do + [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; +done + +mkdir -p $dir/scoring/log + +cat $data/text > $dir/scoring/test_filt.txt + +$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \ + lattice-best-path --lm-scale=LMWT --word-symbol-table=$symtab \ + "ark:gunzip -c $dir/lat.*.gz|" ark,t:$dir/scoring/LMWT.tra || exit 1; + +# Note: the double level of quoting for the sed command + +$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \ + cat $dir/scoring/LMWT.tra \| \ + utils/int2sym.pl -f 2- $symtab \| \ + sed "'s:::g'" \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT || exit 1; + +# Show results +for f in $dir/wer_*; do echo $f; egrep '(WER)|(SER)' < $f; done + +exit 0; diff --git a/egs/tedlium/s5_r3/local/score_sclite.sh b/egs/tedlium/s5_r3/local/score_sclite.sh new file mode 100755 index 00000000000..16c8b30e52f --- /dev/null +++ b/egs/tedlium/s5_r3/local/score_sclite.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# +# Copyright Johns Hopkins University (Author: Daniel Povey) 2012, +# Brno University of Technology (Author: Karel Vesely) 2014, +# Apache 2.0 +# + +# begin configuration section. +cmd=run.pl +stage=0 +decode_mbr=true +beam=7 # speed-up, but may affect MBR confidences. 
+word_ins_penalty=0.0,0.5,1.0 +min_lmwt=7 +max_lmwt=17 +iter=final +#end configuration section. + +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: local/score_sclite.sh [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --stage (0|1|2) # start scoring script from part-way through." + echo " --min_lmwt # minumum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + exit 1; +fi + +data=$1 +lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. +dir=$3 + +model=$dir/../$iter.mdl # assume model one level up from decoding dir. + +hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl +[ ! -f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1; +hubdir=`dirname $hubscr` + +for f in $data/stm $data/glm $lang/words.txt $lang/phones/word_boundary.int \ + $model $data/segments $data/reco2file_and_channel $dir/lat.1.gz; do + [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; +done + +# name=`basename $data`; # e.g. eval2000 +nj=$(cat $dir/num_jobs) + +mkdir -p $dir/scoring/log + +if [ -f $dir/../frame_shift ]; then + frame_shift_opt="--frame-shift=$(cat $dir/../frame_shift)" + echo "$0: $dir/../frame_shift exists, using $frame_shift_opt" +elif [ -f $dir/../frame_subsampling_factor ]; then + factor=$(cat $dir/../frame_subsampling_factor) || exit 1 + frame_shift_opt="--frame-shift=0.0$factor" + echo "$0: $dir/../frame_subsampling_factor exists, using $frame_shift_opt" +fi + +if [ $stage -le 0 ]; then + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.${wip}.log \ + set -e -o pipefail \; \ + mkdir -p $dir/score_LMWT_${wip}/ '&&' \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-prune --beam=$beam ark:- ark:- \| \ + lattice-align-words --output-error-lats=true --max-expand=10.0 --test=false \ + $lang/phones/word_boundary.int $model ark:- ark:- \| \ + lattice-to-ctm-conf --decode-mbr=$decode_mbr $frame_shift_opt ark:- - \| \ + utils/int2sym.pl -f 5 $lang/words.txt \| \ + utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \| \ + sort -k1,1 -k2,2 -k3,3nb '>' $dir/score_LMWT_${wip}/ctm || exit 1; + done +fi + +if [ $stage -le 1 ]; then + # Remove some stuff we don't want to score, from the ctm. + for x in $dir/score_*/ctm; do + # `-i` is not needed in the following. It is added for robustness in ase this code is copy-pasted + # into another script that, e.g., uses instead of + grep -v -w -i '' <$x > ${x}.filt || exit 1; + done +fi + +# Score the set... 
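# Scoring creates one score_LMWT_wip/ directory per LM-weight and word-insertion
# penalty; after the sclite stage below has run, the best operating point can be read
# off with the usual helper (the decode directory name is illustrative):
#   grep Sum exp/chain_cleaned/tdnn_1a_sp/decode_dev/score_*/*.sys | utils/best_wer.sh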
+if [ $stage -le 2 ]; then + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.${wip}.log \ + cp $data/stm $dir/score_LMWT_${wip}/ '&&' \ + $hubscr -p $hubdir -V -l english -h hub5 -g $data/glm -r $dir/score_LMWT_${wip}/stm $dir/score_LMWT_${wip}/ctm.filt || exit 1; + done +fi + +exit 0 diff --git a/egs/tedlium/s5_r3/local/ted_download_lm.sh b/egs/tedlium/s5_r3/local/ted_download_lm.sh new file mode 100755 index 00000000000..ad833555b5f --- /dev/null +++ b/egs/tedlium/s5_r3/local/ted_download_lm.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# +# Copyright 2018 David Snyder +# Apache 2.0 +# +# This script downloads pre-built language models trained on the Cantab-Tedlium +# text data and Tedlium acoustic training data. If you want to build these +# models yourself, run the script local/ted_train_lm.sh. + +set -e + +echo "$0: downloading Tedlium 4 gram language models (it won't re-download if it was already downloaded.)" +wget --continue http://kaldi-asr.org/models/5/4gram_small.arpa.gz -P data/local/local_lm/data/arpa || exit 1 +wget --continue http://kaldi-asr.org/models/5/4gram_big.arpa.gz -P data/local/local_lm/data/arpa || exit 1 + +exit 0 \ No newline at end of file diff --git a/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh b/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh new file mode 100755 index 00000000000..431d44c6ff6 --- /dev/null +++ b/egs/tedlium/s5_r3/local/ted_download_rnnlm.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# +# Copyright 2018 François Hernandez +# Apache 2.0 +# +# This script downloads pre-built RNN language models trained on the TED-LIUM +# text data and acoustic training data. If you want to build these +# models yourself, run the script local/ted_train_rnnlm.sh. + +set -e + +echo "$0: downloading Tedlium RNNLM models (it won't re-download if it was already downloaded.)" +wget --continue http://kaldi-asr.org/models/5/tedlium_rnnlm.tgz -P exp/rnnlm_lstm_tdnn_a_averaged || exit 1 +cd exp/rnnlm_lstm_tdnn_a_averaged +tar -xvzf tedlium_rnnlm.tgz || exit 1 +rm tedlium_rnnlm.tgz +mkdir config +cd ../.. +cp data/lang/words.txt exp/rnnlm_lstm_tdnn_a_averaged/config/words.txt +echo " 152217" >> exp/rnnlm_lstm_tdnn_a_averaged/config/words.txt + +exit 0 diff --git a/egs/tedlium/s5_r3/local/ted_train_lm.sh b/egs/tedlium/s5_r3/local/ted_train_lm.sh new file mode 100755 index 00000000000..3c587f63094 --- /dev/null +++ b/egs/tedlium/s5_r3/local/ted_train_lm.sh @@ -0,0 +1,139 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 +# +# This script trains a LM on the Cantab-Tedlium text data and tedlium acoustic training data. +# It is based on the example scripts distributed with PocoLM + +# It will first check if pocolm is installed and if not will process with installation +# It will then get the source data from the pre-downloaded Cantab-Tedlium files +# and the pre-prepared data/train text source. + + +set -e +stage=0 + +echo "$0 $@" # Print the command line for logging +. utils/parse_options.sh || exit 1; + +dir=data/local/local_lm +lm_dir=${dir}/data + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. 
+ else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +num_dev_sentences=10000 + +#bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. +bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.854,0.0722,0.5808,0.338,0.166,0.015,0.999,0.6228,0.340,0.172,0.999,0.788,0.501,0.406" +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # Unzip TEDLIUM 6 data sources, remove , gzip the result. + gunzip -c db/TEDLIUM_release-3/LM/*.en.gz | sed 's/ <\/s>//g' | gzip -c > ${dir}/data/text/train.txt.gz + # use a subset of the annotated training data as the dev set . + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + head -n $num_dev_sentences < data/train/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + # .. and the rest of the training data as an additional data source. + # we can later fold the dev data into this. + tail -n +$[$num_dev_sentences+1] < data/train/text | cut -d " " -f 2- > ${dir}/data/text/ted.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (a subset of the training data is used as ${dir}/data/text/ted.txt to work + # out interpolation weights. + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < data/dev/text > ${dir}/data/real_dev_set.txt + + # get wordlist + awk '{print $1}' db/TEDLIUM_release-3/TEDLIUM.152k.dic | sed 's:([0-9])::g' | sort | uniq > ${dir}/data/wordlist +fi + +order=4 + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='train=2 ted=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \ + --limit-unk-history=true \ + --fold-dev-into=ted ${bypass_metaparam_optim_opt} \ + --min-counts="${min_counts}" \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + #[perplexity = 157.87] over 18290.0 words +fi + +if [ $stage -le 2 ]; then + echo "$0: pruning the LM (to larger size)" + # Using 10 million n-grams for a big LM for rescoring purposes. 
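# For context: the two pruned LMs written by this stage and the next are what
# local/format_lms.sh (added earlier in this change) later converts for decoding and
# rescoring, roughly:
#   arpa2fst --disambig-symbol=#0 --read-symbol-table=data/lang_nosp/words.txt \
#     "gunzip -c data/local/local_lm/data/arpa/4gram_small.arpa.gz|" data/lang_nosp/G.fst
#   utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/4gram_big.arpa.gz \
#     data/lang_nosp data/lang_nosp_rescore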
+ size=10000000 + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + + # current results, after adding --limit-unk-history=true: + # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_big was -5.16562818753 per word [perplexity = 175.147449465] over 18290.0 words. + + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + echo "$0: pruning the LM (to smaller size)" + # Using 2 million n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. + size=2000000 + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + + # current results, after adding --limit-unk-history=true (needed for modeling OOVs and not blowing up LG.fst): + # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_small was -5.29432352378 per word [perplexity = 199.202824404 over 18290.0 words. + + + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/tedlium/s5_r3/path.sh b/egs/tedlium/s5_r3/path.sh new file mode 100755 index 00000000000..16d5314b9c2 --- /dev/null +++ b/egs/tedlium/s5_r3/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5 +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/tedlium/s5_r3/results.sh b/egs/tedlium/s5_r3/results.sh new file mode 100755 index 00000000000..98bcab94ec5 --- /dev/null +++ b/egs/tedlium/s5_r3/results.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +filter_regexp=. +[ $# -ge 1 ] && filter_regexp=$1 + +for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; done 2>/dev/null + for x in exp/{mono,tri,sgmm,nnet,dnn,lstm,chain}*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null | grep $filter_regexp + for x in exp/{mono,tri,sgmm,nnet,dnn,lstm,chain}*/*/decode*; do [ -d $x ] && grep Sum $x/score_*/*.sys | utils/best_wer.sh; done 2>/dev/null | grep $filter_regexp +exit 0 + diff --git a/egs/tedlium/s5_r3/rnnlm b/egs/tedlium/s5_r3/rnnlm new file mode 120000 index 00000000000..e136939ba72 --- /dev/null +++ b/egs/tedlium/s5_r3/rnnlm @@ -0,0 +1 @@ +../../../scripts/rnnlm/ \ No newline at end of file diff --git a/egs/tedlium/s5_r3/run.sh b/egs/tedlium/s5_r3/run.sh new file mode 100755 index 00000000000..d4f3a38fd49 --- /dev/null +++ b/egs/tedlium/s5_r3/run.sh @@ -0,0 +1,224 @@ +#!/bin/bash +# +# Based mostly on the Switchboard recipe. The training database is TED-LIUM, +# it consists of TED talks with cleaned automatic transcripts: +# +# https://lium.univ-lemans.fr/ted-lium3/ +# http://www.openslr.org/resources (Mirror). 
+# +# The data is distributed under 'Creative Commons BY-NC-ND 3.0' license, +# which allow free non-commercial use, while only a citation is required. +# +# Copyright 2014 Nickolay V. Shmyrev +# 2014 Brno University of Technology (Author: Karel Vesely) +# 2016 Vincent Nguyen +# 2016 Johns Hopkins University (Author: Daniel Povey) +# 2018 François Hernandez +# +# Apache 2.0 +# + +. ./cmd.sh +. ./path.sh + + +set -e -o pipefail -u + +nj=35 +decode_nj=30 # note: should not be >38 which is the number of speakers in the dev set + # after applying --seconds-per-spk-max 180. We decode with 4 threads, so + # this will be too many jobs if you're using run.pl. +stage=0 +train_rnnlm=false +train_lm=false + +. utils/parse_options.sh # accept options + +# Data preparation +if [ $stage -le 0 ]; then + local/download_data.sh +fi + +if [ $stage -le 1 ]; then + local/prepare_data.sh + # Split speakers up into 3-minute chunks. This doesn't hurt adaptation, and + # lets us use more jobs for decoding etc. + # [we chose 3 minutes because that gives us 38 speakers for the dev data, which is + # more than our normal 30 jobs.] + for dset in dev test train; do + utils/data/modify_speaker_info.sh --seconds-per-spk-max 180 data/${dset}.orig data/${dset} + done +fi + + +if [ $stage -le 2 ]; then + local/prepare_dict.sh +fi + +if [ $stage -le 3 ]; then + utils/prepare_lang.sh data/local/dict_nosp \ + "" data/local/lang_nosp data/lang_nosp +fi + +if [ $stage -le 4 ]; then + # later on we'll change this script so you have the option to + # download the pre-built LMs from openslr.org instead of building them + # locally. + if $train_lm; then + local/ted_train_lm.sh + else + local/ted_download_lm.sh + fi +fi + +if [ $stage -le 5 ]; then + local/format_lms.sh +fi + +# Feature extraction +if [ $stage -le 6 ]; then + for set in test dev train; do + dir=data/$set + steps/make_mfcc.sh --nj 30 --cmd "$train_cmd" $dir + steps/compute_cmvn_stats.sh $dir + done +fi + +# Now we have 452 hours of training data. +# Well create a subset with 10k short segments to make flat-start training easier: +if [ $stage -le 7 ]; then + utils/subset_data_dir.sh --shortest data/train 10000 data/train_10kshort + utils/data/remove_dup_utts.sh 10 data/train_10kshort data/train_10kshort_nodup +fi + +# Train +if [ $stage -le 8 ]; then + steps/train_mono.sh --nj 20 --cmd "$train_cmd" \ + data/train_10kshort_nodup data/lang_nosp exp/mono +fi + +if [ $stage -le 9 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang_nosp exp/mono exp/mono_ali + steps/train_deltas.sh --cmd "$train_cmd" \ + 2500 30000 data/train data/lang_nosp exp/mono_ali exp/tri1 +fi + +if [ $stage -le 10 ]; then + utils/mkgraph.sh data/lang_nosp exp/tri1 exp/tri1/graph_nosp + + # The slowest part about this decoding is the scoring, which we can't really + # control as the bottleneck is the NIST tools. 
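# Once a few systems have been decoded, ./results.sh (added in this recipe) prints a
# one-line-per-system WER summary; its optional argument is used as a grep regexp, e.g.:
#   ./results.sh          # all decode directories
#   ./results.sh tri3     # restrict the sclite-scored summaries to tri3 systems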
+ for dset in dev test; do + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri1/graph_nosp data/${dset} exp/tri1/decode_nosp_${dset} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_nosp data/lang_nosp_rescore \ + data/${dset} exp/tri1/decode_nosp_${dset} exp/tri1/decode_nosp_${dset}_rescore + done +fi + +if [ $stage -le 11 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang_nosp exp/tri1 exp/tri1_ali + + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + 4000 50000 data/train data/lang_nosp exp/tri1_ali exp/tri2 +fi + +if [ $stage -le 12 ]; then + utils/mkgraph.sh data/lang_nosp exp/tri2 exp/tri2/graph_nosp + for dset in dev test; do + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri2/graph_nosp data/${dset} exp/tri2/decode_nosp_${dset} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_nosp data/lang_nosp_rescore \ + data/${dset} exp/tri2/decode_nosp_${dset} exp/tri2/decode_nosp_${dset}_rescore + done +fi + +if [ $stage -le 13 ]; then + steps/get_prons.sh --cmd "$train_cmd" data/train data/lang_nosp exp/tri2 + utils/dict_dir_add_pronprobs.sh --max-normalize true \ + data/local/dict_nosp exp/tri2/pron_counts_nowb.txt \ + exp/tri2/sil_counts_nowb.txt \ + exp/tri2/pron_bigram_counts_nowb.txt data/local/dict +fi + +if [ $stage -le 14 ]; then + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang + cp -rT data/lang data/lang_rescore + cp data/lang_nosp/G.fst data/lang/ + cp data/lang_nosp_rescore/G.carpa data/lang_rescore/ + + utils/mkgraph.sh data/lang exp/tri2 exp/tri2/graph + + for dset in dev test; do + steps/decode.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri2/graph data/${dset} exp/tri2/decode_${dset} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset} exp/tri2/decode_${dset} exp/tri2/decode_${dset}_rescore + done +fi + +if [ $stage -le 15 ]; then + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang exp/tri2 exp/tri2_ali + + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 data/train data/lang exp/tri2_ali exp/tri3 + + utils/mkgraph.sh data/lang exp/tri3 exp/tri3/graph + + for dset in dev test; do + steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ + exp/tri3/graph data/${dset} exp/tri3/decode_${dset} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ + data/${dset} exp/tri3/decode_${dset} exp/tri3/decode_${dset}_rescore + done +fi + +if [ $stage -le 16 ]; then + # this does some data-cleaning. It actually degrades the GMM-level results + # slightly, but the cleaned data should be useful when we add the neural net and chain + # systems. If not we'll remove this stage. + local/run_cleanup_segmentation.sh +fi + +if [ $stage -le 17 ]; then + # This will only work if you have GPUs on your system (and note that it requires + # you to have the queue set up the right way... 
see kaldi-asr.org/doc/queue.html) + local/chain/run_tdnnf.sh +fi + +if [ $stage -le 18 ]; then + # You can either train your own rnnlm or download a pre-trained one + if $train_rnnlm; then + local/rnnlm/tuning/run_lstm_tdnn_a.sh + local/rnnlm/average_rnnlm.sh + else + local/ted_download_rnnlm.sh + fi +fi + +if [ $stage -le 19 ]; then + # Here we rescore the lattices generated at stage 17 + rnnlm_dir=exp/rnnlm_lstm_tdnn_a_averaged + lang_dir=data/lang_chain + ngram_order=4 + + for dset in dev test; do + data_dir=data/${dset}_hires + decoding_dir=exp/chain_cleaned/tdnnf_1a + suffix=$(basename $rnnlm_dir) + output_dir=${decoding_dir}_$suffix + + rnnlm/lmrescore_pruned.sh \ + --cmd "$decode_cmd --mem 4G" \ + --weight 0.5 --max-ngram-order $ngram_order \ + $lang_dir $rnnlm_dir \ + $data_dir $decoding_dir \ + $output_dir + done +fi + +echo "$0: success." +exit 0 diff --git a/egs/tedlium/s5_r3/steps b/egs/tedlium/s5_r3/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/tedlium/s5_r3/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/tedlium/s5_r3/utils b/egs/tedlium/s5_r3/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/tedlium/s5_r3/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/uw3/v1/local/chain/run_cnn_1a.sh b/egs/uw3/v1/local/chain/run_cnn_1a.sh index ad7421e1261..582bfc90105 100755 --- a/egs/uw3/v1/local/chain/run_cnn_1a.sh +++ b/egs/uw3/v1/local/chain/run_cnn_1a.sh @@ -46,7 +46,6 @@ lat_dir=exp/chain${nnet3_affix}/tri2_train_lats dir=exp/chain${nnet3_affix}/cnn${affix} train_data_dir=data/train lores_train_data_dir=$train_data_dir # for the start, use the same data for gmm and chain -gmm_lang=data/lang lang_test=data/lang_unk tree_dir=exp/chain${nnet3_affix}/tree${affix} @@ -84,7 +83,7 @@ if [ $stage -le 1 ]; then # topo file. [note, it really has two states.. the first one is only repeated # once, the second one has zero or more repeats.] if [ -d $lang ]; then - if [ $lang/L.fst -nt $lang_test/L.fst ]; then + if [ $lang/L.fst -nt data/lang/L.fst ]; then echo "$0: $lang already exists, not overwriting it; continuing" else echo "$0: $lang already exists and seems to be older than data/lang..." @@ -92,7 +91,7 @@ if [ $stage -le 1 ]; then exit 1; fi else - cp -r $lang_test $lang + cp -r data/lang $lang silphonelist=$(cat $lang/phones/silence.csl) || exit 1; nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1; # Use our special topology... note that later on may have to tune this @@ -105,7 +104,7 @@ if [ $stage -le 2 ]; then # Get the alignments as lattices (gives the chain training more freedom). # use the same num-jobs as the alignments steps/align_fmllr_lats.sh --nj $nj --cmd "$cmd" ${lores_train_data_dir} \ - $lang_test $gmm_dir $lat_dir + data/lang $gmm_dir $lat_dir rm $lat_dir/fsts.*.gz # save space fi diff --git a/egs/voxceleb/README.txt b/egs/voxceleb/README.txt new file mode 100644 index 00000000000..02e328594f3 --- /dev/null +++ b/egs/voxceleb/README.txt @@ -0,0 +1,13 @@ + + This is a Kaldi recipe for speaker verification using the VoxCeleb1 and + VoxCeleb2 corpora. See http://www.robots.ox.ac.uk/~vgg/data/voxceleb/ and + http://www.robots.ox.ac.uk/~vgg/data/voxceleb2/ for additional details and + information on how to obtain them. + + Note: This recipe requires ffmpeg to be installed and its location included + in $PATH + + The subdirectories "v1" and so on are different speaker recognition + recipes. 
The recipe in v1 demonstrates a standard approach using a + full-covariance GMM-UBM, iVectors, and a PLDA backend. The example + in v2 demonstrates DNN speaker embeddings with a PLDA backend. diff --git a/egs/voxceleb/v1/cmd.sh b/egs/voxceleb/v1/cmd.sh new file mode 100755 index 00000000000..d1ca1a6d126 --- /dev/null +++ b/egs/voxceleb/v1/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" + + diff --git a/egs/voxceleb/v1/conf/mfcc.conf b/egs/voxceleb/v1/conf/mfcc.conf new file mode 100644 index 00000000000..649cffb9de8 --- /dev/null +++ b/egs/voxceleb/v1/conf/mfcc.conf @@ -0,0 +1,7 @@ +--sample-frequency=16000 +--frame-length=25 # the default is 25 +--low-freq=20 # the default. +--high-freq=7600 # the default is zero meaning use the Nyquist (8k in this case). +--num-mel-bins=30 +--num-ceps=24 +--snip-edges=false diff --git a/egs/voxceleb/v1/conf/vad.conf b/egs/voxceleb/v1/conf/vad.conf new file mode 100644 index 00000000000..a0ca2449b10 --- /dev/null +++ b/egs/voxceleb/v1/conf/vad.conf @@ -0,0 +1,2 @@ +--vad-energy-threshold=5.5 +--vad-energy-mean-scale=0.5 diff --git a/egs/voxceleb/v1/local/make_musan.py b/egs/voxceleb/v1/local/make_musan.py new file mode 100755 index 00000000000..74c434990fb --- /dev/null +++ b/egs/voxceleb/v1/local/make_musan.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 +# Copyright 2015 David Snyder +# 2018 Ewald Enzinger +# Apache 2.0. +# +# Modified version of egs/sre16/v1/local/make_musan.py (commit e3fb7c4a0da4167f8c94b80f4d3cc5ab4d0e22e8). +# This version uses the raw MUSAN audio files (16 kHz) and does not use sox to resample at 8 kHz. +# +# This file is meant to be invoked by make_musan.sh. 
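+#
+# Given the MUSAN root directory and an output directory, it walks the music,
+# speech and noise subsets and writes Kaldi-style wav.scp and utt2spk files.
+# Each file is treated as its own "speaker", and music files with vocals are
+# skipped unless use_vocals is "Y".  Illustrative output lines (hypothetical
+# utterance ID and path):
+#   wav.scp:  noise-free-sound-0000 /path/to/musan/noise/free-sound/noise-free-sound-0000.wav
+#   utt2spk:  noise-free-sound-0000 noise-free-sound-0000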
+ +import os, sys + +def process_music_annotations(path): + utt2spk = {} + utt2vocals = {} + lines = open(path, 'r').readlines() + for line in lines: + utt, genres, vocals, musician = line.rstrip().split()[:4] + # For this application, the musican ID isn't important + utt2spk[utt] = utt + utt2vocals[utt] = vocals == "Y" + return utt2spk, utt2vocals + +def prepare_music(root_dir, use_vocals): + utt2vocals = {} + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + music_dir = os.path.join(root_dir, "music") + for root, dirs, files in os.walk(music_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + elif str(file) == "ANNOTATIONS": + utt2spk_part, utt2vocals_part = process_music_annotations(file_path) + utt2spk.update(utt2spk_part) + utt2vocals.update(utt2vocals_part) + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2vocals: + if utt in utt2wav: + if use_vocals or not utt2vocals[utt]: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print("In music directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") + return utt2spk_str, utt2wav_str + +def prepare_speech(root_dir): + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + speech_dir = os.path.join(root_dir, "speech") + for root, dirs, files in os.walk(speech_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print("In speech directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") + return utt2spk_str, utt2wav_str + +def prepare_noise(root_dir): + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + noise_dir = os.path.join(root_dir, "noise") + for root, dirs, files in os.walk(noise_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + utt2wav_str = utt2wav_str + utt + " " + utt2wav[utt] + "\n" + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print("In noise directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") + return utt2spk_str, utt2wav_str + +def main(): + in_dir = sys.argv[1] + out_dir = sys.argv[2] + use_vocals = sys.argv[3] == "Y" + utt2spk_music, utt2wav_music = prepare_music(in_dir, use_vocals) + utt2spk_speech, utt2wav_speech = prepare_speech(in_dir) + utt2spk_noise, utt2wav_noise = prepare_noise(in_dir) + utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise + utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise + wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') + wav_fi.write(utt2wav) + utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') + utt2spk_fi.write(utt2spk) + + +if 
__name__=="__main__": + main() diff --git a/egs/voxceleb/v1/local/make_musan.sh b/egs/voxceleb/v1/local/make_musan.sh new file mode 100755 index 00000000000..1565ef0d85c --- /dev/null +++ b/egs/voxceleb/v1/local/make_musan.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# Copyright 2015 David Snyder +# Apache 2.0. +# +# Copy of egs/sre16/v1/local/make_musan.sh (commit e3fb7c4a0da4167f8c94b80f4d3cc5ab4d0e22e8). +# +# This script, called by ../run.sh, creates the MUSAN +# data directory. The required dataset is freely available at +# http://www.openslr.org/17/ + +set -e +in_dir=$1 +data_dir=$2 +use_vocals='Y' + +mkdir -p local/musan.tmp + +echo "Preparing ${data_dir}/musan..." +mkdir -p ${data_dir}/musan +local/make_musan.py ${in_dir} ${data_dir}/musan ${use_vocals} + +utils/fix_data_dir.sh ${data_dir}/musan + +grep "music" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_music +grep "speech" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_speech +grep "noise" ${data_dir}/musan/utt2spk > local/musan.tmp/utt2spk_noise +utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_music \ + ${data_dir}/musan ${data_dir}/musan_music +utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_speech \ + ${data_dir}/musan ${data_dir}/musan_speech +utils/subset_data_dir.sh --utt-list local/musan.tmp/utt2spk_noise \ + ${data_dir}/musan ${data_dir}/musan_noise + +utils/fix_data_dir.sh ${data_dir}/musan_music +utils/fix_data_dir.sh ${data_dir}/musan_speech +utils/fix_data_dir.sh ${data_dir}/musan_noise + +rm -rf local/musan.tmp + diff --git a/egs/voxceleb/v1/local/make_voxceleb1.pl b/egs/voxceleb/v1/local/make_voxceleb1.pl new file mode 100755 index 00000000000..916e11020d2 --- /dev/null +++ b/egs/voxceleb/v1/local/make_voxceleb1.pl @@ -0,0 +1,113 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# 2018 David Snyder +# +# Usage: make_voxceleb1.pl /export/voxceleb1 data/ + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/voxceleb1 data/\n"; + exit(1); +} + +($data_base, $out_dir) = @ARGV; +my $out_test_dir = "$out_dir/voxceleb1_test"; +my $out_train_dir = "$out_dir/voxceleb1_train"; + +if (! -e "$data_base/voxceleb1_test.txt") { + system("wget -O $data_base/voxceleb1_test.txt http://www.openslr.org/resources/49/voxceleb1_test.txt"); +} + +if (system("mkdir -p $out_test_dir") != 0) { + die "Error making directory $out_test_dir"; +} + +if (system("mkdir -p $out_train_dir") != 0) { + die "Error making directory $out_train_dir"; +} + +opendir my $dh, "$data_base/voxceleb1_wav" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$data_base/voxceleb1_wav/$_" && ! 
/^\.{1,2}$/} readdir($dh); +closedir $dh; + +open(TRIAL_IN, "<", "$data_base/voxceleb1_test.txt") or die "Could not open the verification trials file $data_base/voxceleb1_test.txt"; +open(SPKR_TEST, ">", "$out_test_dir/utt2spk") or die "Could not open the output file $out_test_dir/utt2spk"; +open(WAV_TEST, ">", "$out_test_dir/wav.scp") or die "Could not open the output file $out_test_dir/wav.scp"; +open(SPKR_TRAIN, ">", "$out_train_dir/utt2spk") or die "Could not open the output file $out_train_dir/utt2spk"; +open(WAV_TRAIN, ">", "$out_train_dir/wav.scp") or die "Could not open the output file $out_train_dir/wav.scp"; +open(TRIAL_OUT, ">", "$out_test_dir/trials") or die "Could not open the output file $out_test_dir/trials"; + +my $test_spkrs = (); +while () { + chomp; + my ($tar_or_none, $path1, $path2) = split; + + # Create entry for left-hand side of trial + my $wav = "$data_base/voxceleb1_wav/$path1"; + my ($spkr_id, $filename) = split('/', $path1); + my $rec_id = substr($filename, 0, 11); + my $segment = substr($filename, 12, 7); + my $utt_id1 = "$spkr_id-$rec_id-$segment"; + $test_spkrs{$spkr_id} = (); + + # Create entry for right-hand side of trial + my $wav = "$data_base/voxceleb1_wav/$path2"; + my ($spkr_id, $filename) = split('/', $path2); + my $rec_id = substr($filename, 0, 11); + my $segment = substr($filename, 12, 7); + my $utt_id2 = "$spkr_id-$rec_id-$segment"; + $test_spkrs{$spkr_id} = (); + + my $target = "nontarget"; + if ($tar_or_none eq "1") { + $target = "target"; + } + print TRIAL_OUT "$utt_id1 $utt_id2 $target\n"; +} + +foreach (@spkr_dirs) { + my $spkr_id = $_; + opendir my $dh, "$data_base/voxceleb1_wav/$spkr_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); + closedir $dh; + foreach (@files) { + my $filename = $_; + my $rec_id = substr($filename, 0, 11); + my $segment = substr($filename, 12, 7); + my $utt_id = "$spkr_id-$rec_id-$segment"; + my $wav = "$data_base/voxceleb1_wav/$spkr_id/$filename.wav"; + if (exists $test_spkrs{$spkr_id}) { + print WAV_TEST "$utt_id", " $wav", "\n"; + print SPKR_TEST "$utt_id", " $spkr_id", "\n"; + } else { + print WAV_TRAIN "$utt_id", " $wav", "\n"; + print SPKR_TRAIN "$utt_id", " $spkr_id", "\n"; + } + } +} + +close(SPKR_TEST) or die; +close(WAV_TEST) or die; +close(SPKR_TRAIN) or die; +close(WAV_TRAIN) or die; +close(TRIAL_OUT) or die; +close(TRIAL_IN) or die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_test_dir/utt2spk >$out_test_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_test_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_test_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_test_dir") != 0) { + die "Error validating directory $out_test_dir"; +} + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_train_dir/utt2spk >$out_train_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_train_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_train_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_train_dir") != 0) { + die "Error validating directory $out_train_dir"; +} diff --git a/egs/voxceleb/v1/local/make_voxceleb2.pl b/egs/voxceleb/v1/local/make_voxceleb2.pl new file mode 100755 index 00000000000..34c1591eba3 --- /dev/null +++ b/egs/voxceleb/v1/local/make_voxceleb2.pl @@ -0,0 +1,70 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# +# Usage: make_voxceleb2.pl /export/voxceleb2 dev data/dev +# +# Note: This 
script requires ffmpeg to be installed and its location included in $PATH. + +if (@ARGV != 3) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/voxceleb2 dev data/dev\n"; + exit(1); +} + +# Check that ffmpeg is installed. +if (`which ffmpeg` eq "") { + die "Error: this script requires that ffmpeg is installed."; +} + +($data_base, $dataset, $out_dir) = @ARGV; + +if ("$dataset" ne "dev" && "$dataset" ne "test") { + die "dataset parameter must be 'dev' or 'test'!"; +} + +opendir my $dh, "$data_base/$dataset/aac" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$data_base/$dataset/aac/$_" && ! /^\.{1,2}$/} readdir($dh); +closedir $dh; + +if (system("mkdir -p $out_dir") != 0) { + die "Error making directory $out_dir"; +} + +open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; + +foreach (@spkr_dirs) { + my $spkr_id = $_; + + opendir my $dh, "$data_base/$dataset/aac/$spkr_id/" or die "Cannot open directory: $!"; + my @rec_dirs = grep {-d "$data_base/$dataset/aac/$spkr_id/$_" && ! /^\.{1,2}$/} readdir($dh); + closedir $dh; + + foreach (@rec_dirs) { + my $rec_id = $_; + + opendir my $dh, "$data_base/$dataset/aac/$spkr_id/$rec_id/" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.m4a$/} readdir($dh); + closedir $dh; + + foreach (@files) { + my $name = $_; + my $wav = "ffmpeg -v 8 -i $data_base/$dataset/aac/$spkr_id/$rec_id/$name.m4a -f wav -acodec pcm_s16le -|"; + my $utt_id = "$spkr_id-$rec_id-$name"; + print WAV "$utt_id", " $wav", "\n"; + print SPKR "$utt_id", " $spkr_id", "\n"; + } + } +} +close(SPKR) or die; +close(WAV) or die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/voxceleb/v1/local/nnet3/xvector/prepare_feats_for_egs.sh b/egs/voxceleb/v1/local/nnet3/xvector/prepare_feats_for_egs.sh new file mode 100755 index 00000000000..21efb8e3dad --- /dev/null +++ b/egs/voxceleb/v1/local/nnet3/xvector/prepare_feats_for_egs.sh @@ -0,0 +1,84 @@ +#!/bin/bash +# +# Copied from egs/sre16/v1/local/nnet3/xvector/prepare_feats_for_egs.sh (commit 3ea534070fd2cccd2e4ee21772132230033022ce). +# +# Apache 2.0. + +# This script applies sliding window cmvn and removes silence frames. This +# is performed on the raw features prior to generating examples for training +# the xvector system. + +nj=40 +cmd="run.pl" +stage=0 +norm_vars=false +center=true +compress=true +cmn_window=300 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; +if [ $# != 3 ]; then + echo "Usage: $0 " + echo "e.g.: $0 data/train data/train_no_sil exp/make_xvector_features" + echo "Options: " + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --norm-vars # If true, normalize variances in the sliding window cmvn" + exit 1; +fi + +data_in=$1 +data_out=$2 +dir=$3 + +name=`basename $data_in` + +for f in $data_in/feats.scp $data_in/vad.scp ; do + [ ! -f $f ] && echo "$0: No such file $f" && exit 1; +done + +# Set various variables. 
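+# $featdir is the absolute path under which the processed feature archives
+# (xvector_feats_${name}.*.ark) are written; $data_out ends up as a copy of
+# the input data directory whose feats.scp points at these sliding-window
+# CMVN-normalized, silence-removed features.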
+mkdir -p $dir/log +mkdir -p $data_out +featdir=$(utils/make_absolute.sh $dir) + +if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $featdir/storage ]; then + utils/create_split_dir.pl \ + /export/b{14,15,16,17}/$USER/kaldi-data/egs/voxceleb2/v2/xvector-$(date +'%m_%d_%H_%M')/xvector_feats/storage $featdir/storage +fi + +for n in $(seq $nj); do + # the next command does nothing unless $featdir/storage/ exists, see + # utils/create_data_link.pl for more info. + utils/create_data_link.pl $featdir/xvector_feats_${name}.${n}.ark +done + +cp $data_in/utt2spk $data_out/utt2spk +cp $data_in/spk2utt $data_out/spk2utt +cp $data_in/wav.scp $data_out/wav.scp + +write_num_frames_opt="--write-num-frames=ark,t:$featdir/log/utt2num_frames.JOB" + +sdata_in=$data_in/split$nj; +utils/split_data.sh $data_in $nj || exit 1; + +$cmd JOB=1:$nj $dir/log/create_xvector_feats_${name}.JOB.log \ + apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=$cmn_window \ + scp:${sdata_in}/JOB/feats.scp ark:- \| \ + select-voiced-frames ark:- scp,s,cs:${sdata_in}/JOB/vad.scp ark:- \| \ + copy-feats --compress=$compress $write_num_frames_opt ark:- \ + ark,scp:$featdir/xvector_feats_${name}.JOB.ark,$featdir/xvector_feats_${name}.JOB.scp || exit 1; + +for n in $(seq $nj); do + cat $featdir/xvector_feats_${name}.$n.scp || exit 1; +done > ${data_out}/feats.scp || exit 1 + +for n in $(seq $nj); do + cat $featdir/log/utt2num_frames.$n || exit 1; +done > $data_out/utt2num_frames || exit 1 +rm $featdir/log/utt2num_frames.* + +echo "$0: Succeeded creating xvector features for $name" diff --git a/egs/voxceleb/v1/local/nnet3/xvector/run_xvector.sh b/egs/voxceleb/v1/local/nnet3/xvector/run_xvector.sh new file mode 120000 index 00000000000..585b63fd2dd --- /dev/null +++ b/egs/voxceleb/v1/local/nnet3/xvector/run_xvector.sh @@ -0,0 +1 @@ +tuning/run_xvector_1a.sh \ No newline at end of file diff --git a/egs/voxceleb/v1/local/nnet3/xvector/tuning/run_xvector_1a.sh b/egs/voxceleb/v1/local/nnet3/xvector/tuning/run_xvector_1a.sh new file mode 100755 index 00000000000..0c2c77bb5bd --- /dev/null +++ b/egs/voxceleb/v1/local/nnet3/xvector/tuning/run_xvector_1a.sh @@ -0,0 +1,155 @@ +#!/bin/bash +# Copyright 2017 David Snyder +# 2017 Johns Hopkins University (Author: Daniel Garcia-Romero) +# 2017 Johns Hopkins University (Author: Daniel Povey) +# +# Copied from egs/sre16/v1/local/nnet3/xvector/tuning/run_xvector_1a.sh (commit e082c17d4a8f8a791428ae4d9f7ceb776aef3f0b). +# +# Apache 2.0. + +# This script trains a DNN similar to the recipe described in +# http://www.danielpovey.com/files/2018_icassp_xvectors.pdf + +. ./cmd.sh +set -e + +stage=1 +train_stage=0 +use_gpu=true +remove_egs=false + +data=data/train +nnet_dir=exp/xvector_nnet_1a/ +egs_dir=exp/xvector_nnet_1a/egs + +. ./path.sh +. ./cmd.sh +. ./utils/parse_options.sh + +num_pdfs=$(awk '{print $2}' $data/utt2spk | sort | uniq -c | wc -l) + +# Now we create the nnet examples using sid/nnet3/xvector/get_egs.sh. +# The argument --num-repeats is related to the number of times a speaker +# repeats per archive. If it seems like you're getting too many archives +# (e.g., more than 200) try increasing the --frames-per-iter option. The +# arguments --min-frames-per-chunk and --max-frames-per-chunk specify the +# minimum and maximum length (in terms of number of frames) of the features +# in the examples. +# +# To make sense of the egs script, it may be necessary to put an "exit 1" +# command immediately after stage 3. Then, inspect +# exp//egs/temp/ranges.* . 
The ranges files specify the examples that +# will be created, and which archives they will be stored in. Each line of +# ranges.* has the following form: +# +# For example: +# 100304-f-sre2006-kacg-A 1 2 4079 881 23 + +# If you're satisfied with the number of archives (e.g., 50-150 archives is +# reasonable) and with the number of examples per speaker (e.g., 1000-5000 +# is reasonable) then you can let the script continue to the later stages. +# Otherwise, try increasing or decreasing the --num-repeats option. You might +# need to fiddle with --frames-per-iter. Increasing this value decreases the +# the number of archives and increases the number of examples per archive. +# Decreasing this value increases the number of archives, while decreasing the +# number of examples per archive. +if [ $stage -le 6 ]; then + echo "$0: Getting neural network training egs"; + # dump egs. + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b{03,04,05,06}/$USER/kaldi-data/egs/voxceleb2/v2/xvector-$(date +'%m_%d_%H_%M')/$egs_dir/storage $egs_dir/storage + fi + sid/nnet3/xvector/get_egs.sh --cmd "$train_cmd" \ + --nj 8 \ + --stage 0 \ + --frames-per-iter 1000000000 \ + --frames-per-iter-diagnostic 100000 \ + --min-frames-per-chunk 200 \ + --max-frames-per-chunk 400 \ + --num-diagnostic-archives 3 \ + --num-repeats 50 \ + "$data" $egs_dir +fi + +if [ $stage -le 7 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(wc -w $egs_dir/pdf2num | awk '{print $1}') + feat_dim=$(cat $egs_dir/info/feat_dim) + + # This chunk-size corresponds to the maximum number of frames the + # stats layer is able to pool over. In this script, it corresponds + # to 100 seconds. If the input recording is greater than 100 seconds, + # we will compute multiple xvectors from the same recording and average + # to produce the final xvector. + max_chunk_size=10000 + + # The smallest number of frames we're comfortable computing an xvector from. + # Note that the hard minimum is given by the left and right context of the + # frame-level layers. + min_chunk_size=25 + mkdir -p $nnet_dir/configs + cat < $nnet_dir/configs/network.xconfig + # please note that it is important to have input layer with the name=input + + # The frame-level layers + input dim=${feat_dim} name=input + relu-batchnorm-layer name=tdnn1 input=Append(-2,-1,0,1,2) dim=512 + relu-batchnorm-layer name=tdnn2 input=Append(-2,0,2) dim=512 + relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn4 dim=512 + relu-batchnorm-layer name=tdnn5 dim=1500 + + # The stats pooling layer. Layers after this are segment-level. + # In the config below, the first and last argument (0, and ${max_chunk_size}) + # means that we pool over an input segment starting at frame 0 + # and ending at frame ${max_chunk_size} or earlier. The other arguments (1:1) + # mean that no subsampling is performed. + stats-layer name=stats config=mean+stddev(0:1:1:${max_chunk_size}) + + # This is where we usually extract the embedding (aka xvector) from. + relu-batchnorm-layer name=tdnn6 dim=512 input=stats + + # This is where another layer the embedding could be extracted + # from, but usually the previous one works better. 
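+  # (extract.config, written below, takes its output from tdnn6.affine, so
+  # tdnn7 and the softmax output layer are only used during training.)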
+ relu-batchnorm-layer name=tdnn7 dim=512 + output-layer name=output include-log-softmax=true dim=${num_targets} +EOF + + steps/nnet3/xconfig_to_configs.py \ + --xconfig-file $nnet_dir/configs/network.xconfig \ + --config-dir $nnet_dir/configs/ + cp $nnet_dir/configs/final.config $nnet_dir/nnet.config + + # These three files will be used by sid/nnet3/xvector/extract_xvectors.sh + echo "output-node name=output input=tdnn6.affine" > $nnet_dir/extract.config + echo "$max_chunk_size" > $nnet_dir/max_chunk_size + echo "$min_chunk_size" > $nnet_dir/min_chunk_size +fi + +dropout_schedule='0,0@0.20,0.1@0.50,0' +srand=123 +if [ $stage -le 8 ]; then + steps/nnet3/train_raw_dnn.py --stage=$train_stage \ + --cmd="$train_cmd" \ + --trainer.optimization.proportional-shrink 10 \ + --trainer.optimization.momentum=0.5 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=8 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.minibatch-size=64 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2 \ + --trainer.num-epochs=3 \ + --trainer.dropout-schedule="$dropout_schedule" \ + --trainer.shuffle-buffer-size=1000 \ + --egs.frames-per-eg=1 \ + --egs.dir="$egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval=10 \ + --use-gpu=true \ + --dir=$nnet_dir || exit 1; +fi + +exit 0; diff --git a/egs/voxceleb/v1/local/prepare_for_eer.py b/egs/voxceleb/v1/local/prepare_for_eer.py new file mode 100755 index 00000000000..6bfa04e011b --- /dev/null +++ b/egs/voxceleb/v1/local/prepare_for_eer.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python3 +# +# Copyright 2015 David Snyder +# Apache 2.0. +# +# Copied from egs/sre10/v1/local/prepare_for_eer.py (commit 9cb4c4c2fb0223ee90c38d98af11305074eb7ef8) +# +# Given a trials and scores file, this script +# prepares input for the binary compute-eer. +import sys +trials = open(sys.argv[1], 'r').readlines() +scores = open(sys.argv[2], 'r').readlines() +spkrutt2target = {} +for line in trials: + spkr, utt, target = line.strip().split() + spkrutt2target[spkr+utt]=target +for line in scores: + spkr, utt, score = line.strip().split() + print(score, spkrutt2target[spkr+utt]) diff --git a/egs/voxceleb/v1/path.sh b/egs/voxceleb/v1/path.sh new file mode 100755 index 00000000000..e50f57c5271 --- /dev/null +++ b/egs/voxceleb/v1/path.sh @@ -0,0 +1,5 @@ +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/voxceleb/v1/run.sh b/egs/voxceleb/v1/run.sh new file mode 100755 index 00000000000..8af2226423d --- /dev/null +++ b/egs/voxceleb/v1/run.sh @@ -0,0 +1,129 @@ +#!/bin/bash +# Copyright 2017 Johns Hopkins University (Author: Daniel Garcia-Romero) +# 2017 Johns Hopkins University (Author: Daniel Povey) +# 2017-2018 David Snyder +# 2018 Ewald Enzinger +# Apache 2.0. +# +# See ../README.txt for more info on data required. +# Results (mostly equal error-rates) are inline in comments below. + +. ./cmd.sh +. ./path.sh +set -e +mfccdir=`pwd`/mfcc +vaddir=`pwd`/mfcc + +# The trials file is downloaded by local/make_voxceleb1.pl. 
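+# Each line of that trials file has the form
+#   <spkr1>-<rec1>-<seg1> <spkr2>-<rec2>-<seg2> target|nontarget
+# where "target" marks same-speaker pairs; the scoring stages below pass these
+# labels to compute-eer and sid/compute_min_dcf.py.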
+voxceleb1_trials=data/voxceleb1_test/trials +voxceleb1_root=/export/corpora/VoxCeleb1 +voxceleb2_root=/export/corpora/VoxCeleb2 + +stage=0 + +if [ $stage -le 0 ]; then + local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train + local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test + # This script reates data/voxceleb1_test and data/voxceleb1_train. + # Our evaluation set is the test portion of VoxCeleb1. + local/make_voxceleb1.pl $voxceleb1_root data + # We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1. + # This should give 7,351 speakers and 1,277,503 utterances. + utils/combine_data.sh data/train data/voxceleb2_train data/voxceleb2_test data/voxceleb1_train +fi + +if [ $stage -le 1 ]; then + # Make MFCCs and compute the energy-based VAD for each dataset + for name in train voxceleb1_test; do + steps/make_mfcc.sh --write-utt2num-frames true \ + --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \ + data/${name} exp/make_mfcc $mfccdir + utils/fix_data_dir.sh data/${name} + sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \ + data/${name} exp/make_vad $vaddir + utils/fix_data_dir.sh data/${name} + done +fi + +if [ $stage -le 2 ]; then + # Train the UBM. + sid/train_diag_ubm.sh --cmd "$train_cmd --mem 4G" \ + --nj 40 --num-threads 8 \ + data/train 2048 \ + exp/diag_ubm + + sid/train_full_ubm.sh --cmd "$train_cmd --mem 25G" \ + --nj 40 --remove-low-count-gaussians false \ + data/train \ + exp/diag_ubm exp/full_ubm +fi + +if [ $stage -le 3 ]; then + # In this stage, we train the i-vector extractor. + # + # Note that there are well over 1 million utterances in our training set, + # and it takes an extremely long time to train the extractor on all of this. + # Also, most of those utterances are very short. Short utterances are + # harmful for training the i-vector extractor. Therefore, to reduce the + # training time and improve performance, we will only train on the 100k + # longest utterances. + utils/subset_data_dir.sh \ + --utt-list <(sort -n -k 2 data/train/utt2num_frames | tail -n 100000) \ + data/train data/train_100k + # Train the i-vector extractor. + sid/train_ivector_extractor.sh --cmd "$train_cmd --mem 16G" \ + --ivector-dim 400 --num-iters 5 \ + exp/full_ubm/final.ubm data/train_100k \ + exp/extractor +fi + +if [ $stage -le 4 ]; then + sid/extract_ivectors.sh --cmd "$train_cmd --mem 4G" --nj 80 \ + exp/extractor data/train \ + exp/ivectors_train + + sid/extract_ivectors.sh --cmd "$train_cmd --mem 4G" --nj 40 \ + exp/extractor data/voxceleb1_test \ + exp/ivectors_voxceleb1_test +fi + +if [ $stage -le 5 ]; then + # Compute the mean vector for centering the evaluation i-vectors. + $train_cmd exp/ivectors_train/log/compute_mean.log \ + ivector-mean scp:exp/ivectors_train/ivector.scp \ + exp/ivectors_train/mean.vec || exit 1; + + # This script uses LDA to decrease the dimensionality prior to PLDA. + lda_dim=200 + $train_cmd exp/ivectors_train/log/lda.log \ + ivector-compute-lda --total-covariance-factor=0.0 --dim=$lda_dim \ + "ark:ivector-subtract-global-mean scp:exp/ivectors_train/ivector.scp ark:- |" \ + ark:data/train/utt2spk exp/ivectors_train/transform.mat || exit 1; + + # Train the PLDA model. 
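+  # The training i-vectors are mean-subtracted, projected with the LDA
+  # transform estimated above and length-normalized on the fly (the piped
+  # rspecifier below) before ivector-compute-plda estimates the PLDA model
+  # from the speaker labels in spk2utt.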
+ $train_cmd exp/ivectors_train/log/plda.log \ + ivector-compute-plda ark:data/train/spk2utt \ + "ark:ivector-subtract-global-mean scp:exp/ivectors_train/ivector.scp ark:- | transform-vec exp/ivectors_train/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \ + exp/ivectors_train/plda || exit 1; +fi + +if [ $stage -le 6 ]; then + $train_cmd exp/scores/log/voxceleb1_test_scoring.log \ + ivector-plda-scoring --normalize-length=true \ + "ivector-copy-plda --smoothing=0.0 exp/ivectors_train/plda - |" \ + "ark:ivector-subtract-global-mean exp/ivectors_train/mean.vec scp:exp/ivectors_voxceleb1_test/ivector.scp ark:- | transform-vec exp/ivectors_train/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \ + "ark:ivector-subtract-global-mean exp/ivectors_train/mean.vec scp:exp/ivectors_voxceleb1_test/ivector.scp ark:- | transform-vec exp/ivectors_train/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \ + "cat '$voxceleb1_trials' | cut -d\ --fields=1,2 |" exp/scores_voxceleb1_test || exit 1; +fi + +if [ $stage -le 7 ]; then + eer=`compute-eer <(local/prepare_for_eer.py $voxceleb1_trials exp/scores_voxceleb1_test) 2> /dev/null` + mindcf1=`sid/compute_min_dcf.py --p-target 0.01 exp/scores_voxceleb1_test $voxceleb1_trials 2> /dev/null` + mindcf2=`sid/compute_min_dcf.py --p-target 0.001 exp/scores_voxceleb1_test $voxceleb1_trials 2> /dev/null` + echo "EER: $eer%" + echo "minDCF(p-target=0.01): $mindcf1" + echo "minDCF(p-target=0.001): $mindcf2" + # EER: 5.329% + # minDCF(p-target=0.01): 0.4933 + # minDCF(p-target=0.001): 0.6168 +fi diff --git a/egs/voxceleb/v1/sid b/egs/voxceleb/v1/sid new file mode 120000 index 00000000000..5cb0274b7d6 --- /dev/null +++ b/egs/voxceleb/v1/sid @@ -0,0 +1 @@ +../../sre08/v1/sid/ \ No newline at end of file diff --git a/egs/voxceleb/v1/steps b/egs/voxceleb/v1/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/voxceleb/v1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/voxceleb/v1/utils b/egs/voxceleb/v1/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/voxceleb/v1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/voxceleb/v2/README.txt b/egs/voxceleb/v2/README.txt new file mode 100644 index 00000000000..2dbcade69c1 --- /dev/null +++ b/egs/voxceleb/v2/README.txt @@ -0,0 +1,11 @@ + This recipe replaces ivectors used in the v1 recipe with embeddings extracted + from a deep neural network. In the scripts, we refer to these embeddings as + "xvectors." The recipe is closely based on the following paper: + http://www.danielpovey.com/files/2018_icassp_xvectors.pdf but uses a wideband + rather than narrowband MFCC config. + + In addition to the VoxCeleb datasets used for training and evaluation (see + ../README.txt) we also use the following datasets for augmentation. + + MUSAN http://www.openslr.org/17 + RIR_NOISES http://www.openslr.org/28 diff --git a/egs/voxceleb/v2/cmd.sh b/egs/voxceleb/v2/cmd.sh new file mode 100755 index 00000000000..d1ca1a6d126 --- /dev/null +++ b/egs/voxceleb/v2/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. 
Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" + + diff --git a/egs/voxceleb/v2/conf/mfcc.conf b/egs/voxceleb/v2/conf/mfcc.conf new file mode 100644 index 00000000000..9e125706aae --- /dev/null +++ b/egs/voxceleb/v2/conf/mfcc.conf @@ -0,0 +1,7 @@ +--sample-frequency=16000 +--frame-length=25 # the default is 25 +--low-freq=20 # the default. +--high-freq=7600 # the default is zero meaning use the Nyquist (8k in this case). +--num-mel-bins=30 +--num-ceps=30 +--snip-edges=false diff --git a/egs/voxceleb/v2/conf/vad.conf b/egs/voxceleb/v2/conf/vad.conf new file mode 100644 index 00000000000..c9f5e8b3072 --- /dev/null +++ b/egs/voxceleb/v2/conf/vad.conf @@ -0,0 +1,4 @@ +--vad-energy-threshold=5.5 +--vad-energy-mean-scale=0.5 +--vad-proportion-threshold=0.12 +--vad-frames-context=2 diff --git a/egs/voxceleb/v2/local b/egs/voxceleb/v2/local new file mode 120000 index 00000000000..740b697d6fd --- /dev/null +++ b/egs/voxceleb/v2/local @@ -0,0 +1 @@ +../v1/local/ \ No newline at end of file diff --git a/egs/voxceleb/v2/path.sh b/egs/voxceleb/v2/path.sh new file mode 100755 index 00000000000..e50f57c5271 --- /dev/null +++ b/egs/voxceleb/v2/path.sh @@ -0,0 +1,5 @@ +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/voxceleb/v2/run.sh b/egs/voxceleb/v2/run.sh new file mode 100755 index 00000000000..e57799cee27 --- /dev/null +++ b/egs/voxceleb/v2/run.sh @@ -0,0 +1,216 @@ +#!/bin/bash +# Copyright 2017 Johns Hopkins University (Author: Daniel Garcia-Romero) +# 2017 Johns Hopkins University (Author: Daniel Povey) +# 2017-2018 David Snyder +# 2018 Ewald Enzinger +# Apache 2.0. +# +# See ../README.txt for more info on data required. +# Results (mostly equal error-rates) are inline in comments below. + +. ./cmd.sh +. ./path.sh +set -e +mfccdir=`pwd`/mfcc +vaddir=`pwd`/mfcc + + +# The trials file is downloaded by local/make_voxceleb1.pl. +voxceleb1_trials=data/voxceleb1_test/trials +voxceleb1_root=/export/corpora/VoxCeleb1 +voxceleb2_root=/export/corpora/VoxCeleb2 +nnet_dir=exp/xvector_nnet_1a +musan_root=/export/corpora/JHU/musan + +stage=0 + +if [ $stage -le 0 ]; then + local/make_voxceleb2.pl $voxceleb2_root dev data/voxceleb2_train + local/make_voxceleb2.pl $voxceleb2_root test data/voxceleb2_test + # This script reates data/voxceleb1_test and data/voxceleb1_train. + # Our evaluation set is the test portion of VoxCeleb1. + local/make_voxceleb1.pl $voxceleb1_root data + # We'll train on all of VoxCeleb2, plus the training portion of VoxCeleb1. + # This should give 7,351 speakers and 1,277,503 utterances. 
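+  # utils/combine_data.sh merges the wav.scp, utt2spk and spk2utt files of the
+  # listed source directories into a single data/train directory.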
+ utils/combine_data.sh data/train data/voxceleb2_train data/voxceleb2_test data/voxceleb1_train +fi + +if [ $stage -le 1 ]; then + # Make MFCCs and compute the energy-based VAD for each dataset + for name in train voxceleb1_test; do + steps/make_mfcc.sh --write-utt2num-frames true --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \ + data/${name} exp/make_mfcc $mfccdir + utils/fix_data_dir.sh data/${name} + sid/compute_vad_decision.sh --nj 40 --cmd "$train_cmd" \ + data/${name} exp/make_vad $vaddir + utils/fix_data_dir.sh data/${name} + done +fi + +# In this section, we augment the VoxCeleb2 data with reverberation, +# noise, music, and babble, and combine it with the clean data. +if [ $stage -le 2 ]; then + frame_shift=0.01 + awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' data/train/utt2num_frames > data/train/reco2dur + + if [ ! -d "RIRS_NOISES" ]; then + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + # Make a version with reverberated speech + rvb_opts=() + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + + # Make a reverberated version of the VoxCeleb2 list. Note that we don't add any + # additive noise here. + python steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --speech-rvb-probability 1 \ + --pointsource-noise-addition-probability 0 \ + --isotropic-noise-addition-probability 0 \ + --num-replications 1 \ + --source-sampling-rate 16000 \ + data/train data/train_reverb + cp data/train/vad.scp data/train_reverb/ + utils/copy_data_dir.sh --utt-suffix "-reverb" data/train_reverb data/train_reverb.new + rm -rf data/train_reverb + mv data/train_reverb.new data/train_reverb + + # Prepare the MUSAN corpus, which consists of music, speech, and noise + # suitable for augmentation. + local/make_musan.sh $musan_root data + + # Get the duration of the MUSAN recordings. This will be used by the + # script augment_data_dir.py. + for name in speech noise music; do + utils/data/get_utt2dur.sh data/musan_${name} + mv data/musan_${name}/utt2dur data/musan_${name}/reco2dur + done + + # Augment with musan_noise + python steps/data/augment_data_dir.py --utt-suffix "noise" --fg-interval 1 --fg-snrs "15:10:5:0" --fg-noise-dir "data/musan_noise" data/train data/train_noise + # Augment with musan_music + python steps/data/augment_data_dir.py --utt-suffix "music" --bg-snrs "15:10:8:5" --num-bg-noises "1" --bg-noise-dir "data/musan_music" data/train data/train_music + # Augment with musan_speech + python steps/data/augment_data_dir.py --utt-suffix "babble" --bg-snrs "20:17:15:13" --num-bg-noises "3:4:5:6:7" --bg-noise-dir "data/musan_speech" data/train data/train_babble + + # Combine reverb, noise, music, and babble into one directory. + utils/combine_data.sh data/train_aug data/train_reverb data/train_noise data/train_music data/train_babble +fi + +if [ $stage -le 3 ]; then + # Take a random subset of the augmentations + utils/subset_data_dir.sh data/train_aug 1000000 data/train_aug_1m + utils/fix_data_dir.sh data/train_aug_1m + + # Make MFCCs for the augmented data. Note that we do not compute a new + # vad.scp file here. Instead, we use the vad.scp from the clean version of + # the list. 
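+  # (Reusing the clean vad.scp should be safe here: reverberation and additive
+  # noise change the signal but not which frames of each utterance contain
+  # speech.)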
+ steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj 40 --cmd "$train_cmd" \ + data/train_aug_1m exp/make_mfcc $mfccdir + + # Combine the clean and augmented VoxCeleb2 list. This is now roughly + # double the size of the original clean list. + utils/combine_data.sh data/train_combined data/train_aug_1m data/train +fi + +# Now we prepare the features to generate examples for xvector training. +if [ $stage -le 4 ]; then + # This script applies CMVN and removes nonspeech frames. Note that this is somewhat + # wasteful, as it roughly doubles the amount of training data on disk. After + # creating training examples, this can be removed. + local/nnet3/xvector/prepare_feats_for_egs.sh --nj 40 --cmd "$train_cmd" \ + data/train_combined data/train_combined_no_sil exp/train_combined_no_sil + utils/fix_data_dir.sh data/train_combined_no_sil +fi + +if [ $stage -le 5 ]; then + # Now, we need to remove features that are too short after removing silence + # frames. We want atleast 5s (500 frames) per utterance. + min_len=400 + mv data/train_combined_no_sil/utt2num_frames data/train_combined_no_sil/utt2num_frames.bak + awk -v min_len=${min_len} '$2 > min_len {print $1, $2}' data/train_combined_no_sil/utt2num_frames.bak > data/train_combined_no_sil/utt2num_frames + utils/filter_scp.pl data/train_combined_no_sil/utt2num_frames data/train_combined_no_sil/utt2spk > data/train_combined_no_sil/utt2spk.new + mv data/train_combined_no_sil/utt2spk.new data/train_combined_no_sil/utt2spk + utils/fix_data_dir.sh data/train_combined_no_sil + + # We also want several utterances per speaker. Now we'll throw out speakers + # with fewer than 8 utterances. + min_num_utts=8 + awk '{print $1, NF-1}' data/train_combined_no_sil/spk2utt > data/train_combined_no_sil/spk2num + awk -v min_num_utts=${min_num_utts} '$2 >= min_num_utts {print $1, $2}' data/train_combined_no_sil/spk2num | utils/filter_scp.pl - data/train_combined_no_sil/spk2utt > data/train_combined_no_sil/spk2utt.new + mv data/train_combined_no_sil/spk2utt.new data/train_combined_no_sil/spk2utt + utils/spk2utt_to_utt2spk.pl data/train_combined_no_sil/spk2utt > data/train_combined_no_sil/utt2spk + + utils/filter_scp.pl data/train_combined_no_sil/utt2spk data/train_combined_no_sil/utt2num_frames > data/train_combined_no_sil/utt2num_frames.new + mv data/train_combined_no_sil/utt2num_frames.new data/train_combined_no_sil/utt2num_frames + + # Now we're ready to create training examples. + utils/fix_data_dir.sh data/train_combined_no_sil +fi + +# Stages 6 through 8 are handled in run_xvector.sh +local/nnet3/xvector/run_xvector.sh --stage $stage --train-stage -1 \ + --data data/train_combined_no_sil --nnet-dir $nnet_dir \ + --egs-dir $nnet_dir/egs + +if [ $stage -le 9 ]; then + # Extract x-vectors for centering, LDA, and PLDA training. + sid/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 4G" --nj 80 \ + $nnet_dir data/train \ + $nnet_dir/xvectors_train + + # Extract x-vectors used in the evaluation. + sid/nnet3/xvector/extract_xvectors.sh --cmd "$train_cmd --mem 4G" --nj 40 \ + $nnet_dir data/voxceleb1_test \ + $nnet_dir/xvectors_voxceleb1_test +fi + +if [ $stage -le 10 ]; then + # Compute the mean vector for centering the evaluation xvectors. + $train_cmd $nnet_dir/xvectors_train/log/compute_mean.log \ + ivector-mean scp:$nnet_dir/xvectors_train/xvector.scp \ + $nnet_dir/xvectors_train/mean.vec || exit 1; + + # This script uses LDA to decrease the dimensionality prior to PLDA. 
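+  # ivector-compute-lda estimates a speaker-discriminative projection from the
+  # mean-subtracted training x-vectors and their utt2spk labels and writes it
+  # to transform.mat; the same matrix is applied again at scoring time below.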
+ lda_dim=200 + $train_cmd $nnet_dir/xvectors_train/log/lda.log \ + ivector-compute-lda --total-covariance-factor=0.0 --dim=$lda_dim \ + "ark:ivector-subtract-global-mean scp:$nnet_dir/xvectors_train/xvector.scp ark:- |" \ + ark:data/train/utt2spk $nnet_dir/xvectors_train/transform.mat || exit 1; + + # Train the PLDA model. + $train_cmd $nnet_dir/xvectors_train/log/plda.log \ + ivector-compute-plda ark:data/train/spk2utt \ + "ark:ivector-subtract-global-mean scp:$nnet_dir/xvectors_train/xvector.scp ark:- | transform-vec $nnet_dir/xvectors_train/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \ + $nnet_dir/xvectors_train/plda || exit 1; +fi + +if [ $stage -le 11 ]; then + $train_cmd exp/scores/log/voxceleb1_test_scoring.log \ + ivector-plda-scoring --normalize-length=true \ + "ivector-copy-plda --smoothing=0.0 $nnet_dir/xvectors_train/plda - |" \ + "ark:ivector-subtract-global-mean $nnet_dir/xvectors_train/mean.vec scp:$nnet_dir/xvectors_voxceleb1_test/xvector.scp ark:- | transform-vec $nnet_dir/xvectors_train/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \ + "ark:ivector-subtract-global-mean $nnet_dir/xvectors_train/mean.vec scp:$nnet_dir/xvectors_voxceleb1_test/xvector.scp ark:- | transform-vec $nnet_dir/xvectors_train/transform.mat ark:- ark:- | ivector-normalize-length ark:- ark:- |" \ + "cat '$voxceleb1_trials' | cut -d\ --fields=1,2 |" exp/scores_voxceleb1_test || exit 1; +fi + +if [ $stage -le 12 ]; then + eer=`compute-eer <(local/prepare_for_eer.py $voxceleb1_trials exp/scores_voxceleb1_test) 2> /dev/null` + mindcf1=`sid/compute_min_dcf.py --p-target 0.01 exp/scores_voxceleb1_test $voxceleb1_trials 2> /dev/null` + mindcf2=`sid/compute_min_dcf.py --p-target 0.001 exp/scores_voxceleb1_test $voxceleb1_trials 2> /dev/null` + echo "EER: $eer%" + echo "minDCF(p-target=0.01): $mindcf1" + echo "minDCF(p-target=0.001): $mindcf2" + # EER: 3.128% + # minDCF(p-target=0.01): 0.3258 + # minDCF(p-target=0.001): 0.5003 + # + # For reference, here's the ivector system from ../v1: + # EER: 5.329% + # minDCF(p-target=0.01): 0.4933 + # minDCF(p-target=0.001): 0.6168 +fi diff --git a/egs/voxceleb/v2/sid b/egs/voxceleb/v2/sid new file mode 120000 index 00000000000..5cb0274b7d6 --- /dev/null +++ b/egs/voxceleb/v2/sid @@ -0,0 +1 @@ +../../sre08/v1/sid/ \ No newline at end of file diff --git a/egs/voxceleb/v2/steps b/egs/voxceleb/v2/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/voxceleb/v2/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/voxceleb/v2/utils b/egs/voxceleb/v2/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/voxceleb/v2/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/voxforge/gst_demo/run-live.py b/egs/voxforge/gst_demo/run-live.py index 14119fa67f5..725a306c42c 100755 --- a/egs/voxforge/gst_demo/run-live.py +++ b/egs/voxforge/gst_demo/run-live.py @@ -68,7 +68,7 @@ def init_gst(self): self.asr.set_property("acoustic-scale", 0.0769) else: print >> sys.stderr, "Couldn't create the onlinegmmfasterdecoder element. " - if os.environ.has_key("GST_PLUGIN_PATH"): + if "GST_PLUGIN_PATH" in os.environ: print >> sys.stderr, "Have you compiled the Kaldi GStreamer plugin?" 
else: print >> sys.stderr, "You probably need to set the GST_PLUGIN_PATH envoronment variable" diff --git a/egs/wsj/s5/local/chain/e2e/run_tdnn_flatstart.sh b/egs/wsj/s5/local/chain/e2e/run_tdnn_flatstart.sh index 6fb00150378..9a4f0c87c8d 100755 --- a/egs/wsj/s5/local/chain/e2e/run_tdnn_flatstart.sh +++ b/egs/wsj/s5/local/chain/e2e/run_tdnn_flatstart.sh @@ -13,23 +13,23 @@ # of phone_lm.fst (in stage 1 below) # local/chain/compare_wer.sh exp/chain/e2e_tdnn_1a -# System e2e_tdnn_1a -#WER dev93 (tgpr) 9.70 -#WER dev93 (tg) 9.05 -#WER dev93 (big-dict,tgpr) 7.20 -#WER dev93 (big-dict,fg) 6.36 -#WER eval92 (tgpr) 5.88 -#WER eval92 (tg) 5.32 -#WER eval92 (big-dict,tgpr) 3.67 -#WER eval92 (big-dict,fg) 3.05 -# Final train prob -0.0741 -# Final valid prob -0.0951 +# System e2e_tdnn_1a +#WER dev93 (tgpr) 9.63 +#WER dev93 (tg) 9.07 +#WER dev93 (big-dict,tgpr) 7.41 +#WER dev93 (big-dict,fg) 6.55 +#WER eval92 (tgpr) 5.90 +#WER eval92 (tg) 5.17 +#WER eval92 (big-dict,tgpr) 3.56 +#WER eval92 (big-dict,fg) 2.85 +# Final train prob -0.0726 +# Final valid prob -0.0884 # Final train prob (xent) # Final valid prob (xent) -# Num-params 5562234 +# Num-params 3740934 # steps/info/chain_dir_info.pl exp/chain/e2e_tdnn_1a -# exp/chain/e2e_tdnn_1a: num-iters=68 nj=2..5 num-params=5.6M dim=40->84 combine=-0.094->-0.094 logprob:train/valid[44,67,final]=(-0.083,-0.073,-0.072/-0.097,-0.095,-0.095) +# exp/chain/e2e_tdnn_1a: num-iters=102 nj=2..5 num-params=3.7M dim=40->84 combine=-0.117->-0.116 (over 3) logprob:train/valid[67,101,final]=(-0.080,-0.073,-0.073/-0.090,-0.089,-0.088) set -e @@ -37,7 +37,7 @@ set -e stage=0 train_stage=-10 get_egs_stage=-10 -affix=1a_dim450 +affix=1a # training options num_epochs=4 @@ -85,10 +85,18 @@ if [ $stage -le 0 ]; then fi if [ $stage -le 1 ]; then + echo "$0: Estimating a phone language model for the denominator graph..." + mkdir -p $treedir/log + $train_cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py --between-silprob 0.1 \ + data/lang_nosp \| \ + utils/sym2int.pl -f 2- data/lang_nosp/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=2000 \ + ark:- $treedir/phone_lm.fst steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$train_cmd" \ --shared-phones true \ data/$train_set $lang $treedir - cp exp/chain/e2e_base/phone_lm.fst $treedir/ fi if [ $stage -le 2 ]; then diff --git a/egs/wsj/s5/local/chain/e2e/run_tdnn_lstm_flatstart.sh b/egs/wsj/s5/local/chain/e2e/run_tdnn_lstm_flatstart.sh index f73012b444f..cc7c64f3cc8 100755 --- a/egs/wsj/s5/local/chain/e2e/run_tdnn_lstm_flatstart.sh +++ b/egs/wsj/s5/local/chain/e2e/run_tdnn_lstm_flatstart.sh @@ -92,11 +92,18 @@ if [ $stage -le 0 ]; then fi if [ $stage -le 1 ]; then + echo "$0: Estimating a phone language model for the denominator graph..." 
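+  # The resulting $treedir/phone_lm.fst is the phone-level LM from which the
+  # denominator graph for LF-MMI (chain) training is built later on.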
+ mkdir -p $treedir/log + $train_cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang_nosp \| \ + utils/sym2int.pl -f 2- data/lang_nosp/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=2000 \ + ark:- $treedir/phone_lm.fst steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$train_cmd" \ --type biphone \ --shared-phones true \ data/$train_set $lang $treedir - cp exp/chain/e2e_base/char_lm.fst $treedir/phone_lm.fst fi if [ $stage -le 2 ]; then diff --git a/egs/wsj/s5/local/chain/run_tdnn.sh b/egs/wsj/s5/local/chain/run_tdnn.sh index cb5756188a4..cebb2b84f16 120000 --- a/egs/wsj/s5/local/chain/run_tdnn.sh +++ b/egs/wsj/s5/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1f.sh \ No newline at end of file +tuning/run_tdnn_1g.sh \ No newline at end of file diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh new file mode 100755 index 00000000000..1724c057e12 --- /dev/null +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh @@ -0,0 +1,339 @@ +#!/bin/bash + +# 1g is like 1f but upgrading to a "resnet-style TDNN-F model", i.e. +# with bypass resnet connections, and re-tuned. + +# local/chain/compare_wer.sh exp/chain/tdnn1f_sp exp/chain/tdnn1g_sp +# System tdnn1f_sp tdnn1g_sp +#WER dev93 (tgpr) 7.03 6.68 +#WER dev93 (tg) 6.83 6.57 +#WER dev93 (big-dict,tgpr) 4.99 4.60 +#WER dev93 (big-dict,fg) 4.52 4.26 +#WER eval92 (tgpr) 5.19 4.54 +#WER eval92 (tg) 4.73 4.32 +#WER eval92 (big-dict,tgpr) 2.94 2.62 +#WER eval92 (big-dict,fg) 2.68 2.32 +# Final train prob -0.0461 -0.0417 +# Final valid prob -0.0588 -0.0487 +# Final train prob (xent) -0.9042 -0.6461 +# Final valid prob (xent) -0.9447 -0.6882 +# Num-params 6071244 8354636 + +# steps/info/chain_dir_info.pl exp/chain/tdnn1g_sp +# exp/chain/tdnn1g_sp: num-iters=108 nj=2..8 num-params=8.4M dim=40+100->2854 combine=-0.042->-0.042 (over 2) xent:train/valid[71,107,final]=(-0.975,-0.640,-0.646/-0.980,-0.678,-0.688) logprob:train/valid[71,107,final]=(-0.067,-0.043,-0.042/-0.069,-0.050,-0.049) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1g #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
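+  # (Lattice supervision lets the LF-MMI objective sum over alternative
+  # alignments and pronunciations rather than committing to a single forced
+  # alignment.)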
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 15 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=1024 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] 
&& [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=5000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=8 \ + --trainer.optimization.initial-effective-lrate=0.0005 \ + --trainer.optimization.final-effective-lrate=0.00005 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=0 \ + --egs.chunk-right-context=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgpr \ + $tree_dir $tree_dir/graph_tgpr || exit 1; + + utils/lang/check_phones_compatible.sh \ + data/lang_test_bd_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + $tree_dir $tree_dir/graph_bd_tgpr || exit 1; +fi + +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l data/${trainset}_spe2e_hires/utt2uniq utils/fix_data_dir.sh data/${trainset}_spe2e_hires @@ -101,17 +101,6 @@ else fi if [ $stage -le 5 ]; then - echo "$0: estimating character language model for the denominator graph" - mkdir -p exp/chain/e2e_base/log - $train_cmd exp/chain/e2e_base/log/make_char_lm.log \ - cat data/$trainset/text \| \ - steps/nnet3/chain/e2e/text_to_phones.py data/lang_char \| \ - utils/sym2int.pl -f 2- data/lang_char/phones.txt \| \ - chain-est-phone-lm --num-extra-lm-states=2000 \ - ark:- exp/chain/e2e_base/char_lm.fst -fi - -if [ $stage -le 6 ]; then echo "$0: calling the flat-start chain recipe..." 
local/chain/e2e/run_tdnn_lstm_flatstart.sh fi diff --git a/egs/wsj/s5/local/e2e/run_end2end_phone.sh b/egs/wsj/s5/local/e2e/run_end2end_phone.sh index f1e443073ba..3d33a4a57b5 100755 --- a/egs/wsj/s5/local/e2e/run_end2end_phone.sh +++ b/egs/wsj/s5/local/e2e/run_end2end_phone.sh @@ -70,8 +70,8 @@ if [ $stage -le 2 ]; then # 12 in the following command means the allowed lengths are spaced # by 12% change in length. - python utils/data/perturb_speed_to_allowed_lengths.py 12 data/${trainset} \ - data/${trainset}_spe2e_hires + utils/data/perturb_speed_to_allowed_lengths.py 12 data/${trainset} \ + data/${trainset}_spe2e_hires cat data/${trainset}_spe2e_hires/utt2dur | \ awk '{print $1 " " substr($1,5)}' >data/${trainset}_spe2e_hires/utt2uniq utils/fix_data_dir.sh data/${trainset}_spe2e_hires @@ -85,17 +85,6 @@ if [ $stage -le 3 ]; then fi if [ $stage -le 4 ]; then - echo "$0: estimating phone language model for the denominator graph" - mkdir -p exp/chain/e2e_base/log - $train_cmd exp/chain/e2e_base/log/make_phone_lm.log \ - cat data/$trainset/text \| \ - steps/nnet3/chain/e2e/text_to_phones.py data/lang_nosp \| \ - utils/sym2int.pl -f 2- data/lang_nosp/phones.txt \| \ - chain-est-phone-lm --num-extra-lm-states=2000 \ - ark:- exp/chain/e2e_base/phone_lm.fst -fi - -if [ $stage -le 5 ]; then echo "$0: calling the flat-start chain recipe..." local/chain/e2e/run_tdnn_flatstart.sh fi diff --git a/egs/wsj/s5/run.sh b/egs/wsj/s5/run.sh index f0c0c95dddd..277252cecc3 100755 --- a/egs/wsj/s5/run.sh +++ b/egs/wsj/s5/run.sh @@ -148,8 +148,6 @@ if [ $stage -le 2 ]; then fi fi -exit 0 ## TEMP - if [ $stage -le 3 ]; then # tri2b. there is no special meaning in the "b"-- it's historical. if $train; then diff --git a/egs/wsj/s5/steps/chain b/egs/wsj/s5/steps/chain new file mode 120000 index 00000000000..937364099ee --- /dev/null +++ b/egs/wsj/s5/steps/chain @@ -0,0 +1 @@ +nnet3/chain \ No newline at end of file diff --git a/egs/wsj/s5/steps/cleanup/decode_fmllr_segmentation.sh b/egs/wsj/s5/steps/cleanup/decode_fmllr_segmentation.sh index de99fd8e624..d1297ccd836 100755 --- a/egs/wsj/s5/steps/cleanup/decode_fmllr_segmentation.sh +++ b/egs/wsj/s5/steps/cleanup/decode_fmllr_segmentation.sh @@ -111,7 +111,7 @@ delta_opts=`cat $srcdir/delta_opts 2>/dev/null` || true silphonelist=`cat $graphdir/phones/silence.csl` || exit 1; -utils/lang/check_phones_compatible.sh $graph_dir/phones.txt $srcdir/phones.txt +utils/lang/check_phones_compatible.sh $graphdir/phones.txt $srcdir/phones.txt # Some checks. Note: we don't need $srcdir/tree but we expect # it should exist, given the current structure of the scripts. diff --git a/egs/wsj/s5/steps/cleanup/decode_segmentation.sh b/egs/wsj/s5/steps/cleanup/decode_segmentation.sh index 437cc751189..628741e1e7c 100755 --- a/egs/wsj/s5/steps/cleanup/decode_segmentation.sh +++ b/egs/wsj/s5/steps/cleanup/decode_segmentation.sh @@ -103,7 +103,7 @@ for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $model $graphdir/HCLG.fsts.scp; do [ ! 
-f $f ] && echo "$0: no such file $f" && exit 1; done -utils/lang/check_phones_compatible.sh $graph_dir/phones.txt $srcdir/phones.txt +utils/lang/check_phones_compatible.sh $graphdir/phones.txt $srcdir/phones.txt # Split HCLG.fsts.scp by input utterance n1=$(cat $graphdir/HCLG.fsts.scp | wc -l) diff --git a/egs/wsj/s5/steps/decode.sh b/egs/wsj/s5/steps/decode.sh index b3819ed802d..e9ca2f17cc5 100755 --- a/egs/wsj/s5/steps/decode.sh +++ b/egs/wsj/s5/steps/decode.sh @@ -80,7 +80,7 @@ if [ $(basename $model) != final.alimdl ] ; then fi for f in $sdata/1/feats.scp $sdata/1/cmvn.scp $model $graphdir/HCLG.fst; do - [ ! -f $f ] && echo "decode.sh: no such file $f" && exit 1; + [ ! -f $f ] && echo "$0: Error: no such file $f" && exit 1; done if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi @@ -96,13 +96,13 @@ thread_string= case $feat_type in delta) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |";; - *) echo "Invalid feature type $feat_type" && exit 1; + *) echo "$0: Error: Invalid feature type $feat_type" && exit 1; esac if [ ! -z "$transform_dir" ]; then # add transforms to features... echo "Using fMLLR transforms from $transform_dir" [ ! -f $transform_dir/trans.1 ] && echo "Expected $transform_dir/trans.1 to exist." [ ! -s $transform_dir/num_jobs ] && \ - echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; + echo "$0: Error: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; nj_orig=$(cat $transform_dir/num_jobs) if [ $nj -ne $nj_orig ]; then # Copy the transforms into an archive with an index. @@ -119,7 +119,7 @@ fi if [ $stage -le 0 ]; then if [ -f "$graphdir/num_pdfs" ]; then [ "`cat $graphdir/num_pdfs`" -eq `am-info --print-args=false $model | grep pdfs | awk '{print $NF}'` ] || \ - { echo "Mismatch in number of pdfs with $model"; exit 1; } + { echo "$0: Error: Mismatch in number of pdfs with $model"; exit 1; } fi $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \ gmm-latgen-faster$thread_string --max-active=$max_active --beam=$beam --lattice-beam=$lattice_beam \ @@ -134,9 +134,9 @@ fi if ! $skip_scoring ; then [ ! -x local/score.sh ] && \ - echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + echo "$0: Not scoring because local/score.sh does not exist or not executable." && exit 1; local/score.sh --cmd "$cmd" $scoring_opts $data $graphdir $dir || - { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; } + { echo "$0: Error: scoring failed. 
(ignore by '--skip-scoring true')"; exit 1; } fi exit 0; diff --git a/egs/wsj/s5/steps/diagnostic/analyze_alignments.sh b/egs/wsj/s5/steps/diagnostic/analyze_alignments.sh index b641cd18cbb..ff0a87ae295 100755 --- a/egs/wsj/s5/steps/diagnostic/analyze_alignments.sh +++ b/egs/wsj/s5/steps/diagnostic/analyze_alignments.sh @@ -44,7 +44,7 @@ $cmd JOB=1:$num_jobs $dir/log/get_phone_alignments.JOB.log \ set -o pipefail '&&' ali-to-phones --write-lengths=true "$model" \ "ark:gunzip -c $dir/ali.JOB.gz|" ark,t:- \| \ sed -E 's/^[^ ]+ //' \| \ - awk 'BEGIN{FS=" ; "; OFS="\n";} {print "begin " $1; print "end " $NF; for (n=1;n<=NF;n++) print "all " $n; }' \| \ + awk 'BEGIN{FS=" ; "; OFS="\n";} {print "begin " $1; if (NF>1) print "end " $NF; for (n=1;n<=NF;n++) print "all " $n; }' \| \ sort \| uniq -c \| gzip -c '>' $dir/phone_stats.JOB.gz || exit 1 if ! $cmd $dir/log/analyze_alignments.log \ diff --git a/egs/wsj/s5/steps/diagnostic/analyze_lats.sh b/egs/wsj/s5/steps/diagnostic/analyze_lats.sh index 98b33d9d09d..d580f516527 100755 --- a/egs/wsj/s5/steps/diagnostic/analyze_lats.sh +++ b/egs/wsj/s5/steps/diagnostic/analyze_lats.sh @@ -51,7 +51,7 @@ $cmd JOB=1:$num_jobs $dir/log/lattice_best_path.JOB.log \ $cmd JOB=1:$num_jobs $dir/log/get_lattice_stats.JOB.log \ ali-to-phones --write-lengths=true "$model" "ark:gunzip -c $dir/ali_tmp.JOB.gz|" ark,t:- \| \ sed -E 's/^[^ ]+ //' \| \ - awk 'BEGIN{FS=" ; "; OFS="\n";} {print "begin " $1; print "end " $NF; for (n=1;n<=NF;n++) print "all " $n; }' \| \ + awk 'BEGIN{FS=" ; "; OFS="\n";} {print "begin " $1; if (NF>1) print "end " $NF; for (n=1;n<=NF;n++) print "all " $n; }' \| \ sort \| uniq -c \| gzip -c '>' $dir/phone_stats.JOB.gz || exit 1 diff --git a/egs/wsj/s5/steps/get_ctm.sh b/egs/wsj/s5/steps/get_ctm.sh index 690e6cee4f2..85286e47bea 100755 --- a/egs/wsj/s5/steps/get_ctm.sh +++ b/egs/wsj/s5/steps/get_ctm.sh @@ -1,8 +1,10 @@ #!/bin/bash # Copyright Johns Hopkins University (Author: Daniel Povey) 2012. Apache 2.0. -# This script produces CTM files from a decoding directory that has lattices -# present. +# This script produces CTM files from a decoding directory that has lattices +# present. It does this for a range of language model weights; see also +# get_ctm_fast.sh which does it for just one LM weight and also supports +# the word insertion penalty. # begin configuration section. @@ -34,7 +36,8 @@ if [ $# -ne 3 ]; then echo " # not equal to 0.01 seconds" echo "e.g.:" echo "$0 data/train data/lang exp/tri4a/decode/" - echo "See also: steps/get_train_ctm.sh" + echo "See also: steps/get_train_ctm.sh, steps/get_ctm_fast.sh" + exit 1; fi @@ -87,4 +90,3 @@ if [ $stage -le 0 ]; then fi fi - diff --git a/egs/wsj/s5/steps/get_ctm_fast.sh b/egs/wsj/s5/steps/get_ctm_fast.sh index 613061f7df8..75b666300fe 100755 --- a/egs/wsj/s5/steps/get_ctm_fast.sh +++ b/egs/wsj/s5/steps/get_ctm_fast.sh @@ -1,18 +1,21 @@ #!/bin/bash # Copyright Johns Hopkins University (Author: Daniel Povey) 2012. Apache 2.0. # Copyright 2017 Vimal Manohar +# Music Technology Group, Universitat Pompeu Fabra, 2018. Apache 2.0 # This script produces CTM files from a decoding directory that has lattices -# present. +# present. It does this for one LM weight and also supports +# the word insertion penalty. # This is similar to get_ctm.sh, but gets the CTM at the utterance-level. # It can be faster than steps/get_ctm.sh --use-segments false as it splits -# the process across many jobs. +# the process across many jobs. # begin configuration section. 
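
A note on the analyze_alignments.sh / analyze_lats.sh hunks above: the added "if (NF>1)" guard stops a single-phone alignment from being counted as both the "begin" and the "end" entry for the same phone. The Python sketch below just restates the counting that the awk one-liner performs; it is an illustration, not part of the patch:

from collections import Counter

def phone_boundary_stats(alignments):
    # Each utterance is a list of "phone-id duration" entries, i.e. the fields the
    # awk script sees after ali-to-phones --write-lengths=true and the sed command.
    stats = Counter()
    for phones in alignments:
        stats[("begin", phones[0])] += 1
        if len(phones) > 1:                    # the added NF>1 guard: a one-phone
            stats[("end", phones[-1])] += 1    # utterance has no separate "end" entry
        for p in phones:
            stats[("all", p)] += 1
    return stats

# A one-phone utterance now contributes a "begin" and an "all" count, but no "end":
print(phone_boundary_stats([["3 10"], ["3 10", "7 4", "5 2"]]))
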
cmd=run.pl stage=0 frame_shift=0.01 lmwt=10 +wip=0.0 print_silence=false #end configuration section. @@ -57,7 +60,7 @@ echo $nj > $dir/num_jobs if [ -f $lang/phones/word_boundary.int ]; then $cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \ set -o pipefail '&&' \ - lattice-1best --lm-scale=$lmwt "ark:gunzip -c $decode_dir/lat.JOB.gz|" ark:- \| \ + lattice-1best --lm-scale=$lmwt --word-ins-penalty=$wip "ark:gunzip -c $decode_dir/lat.JOB.gz|" ark:- \| \ lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \ nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \ utils/int2sym.pl -f 5 $lang/words.txt \ @@ -65,7 +68,7 @@ if [ -f $lang/phones/word_boundary.int ]; then elif [ -f $lang/phones/align_lexicon.int ]; then $cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \ set -o pipefail '&&' \ - lattice-1best --lm-scale=$lmwt "ark:gunzip -c $decode_dir/lat.JOB.gz|" ark:- \| \ + lattice-1best --lm-scale=$lmwt --word-ins-penalty=$wip "ark:gunzip -c $decode_dir/lat.JOB.gz|" ark:- \| \ lattice-align-words-lexicon $lang/phones/align_lexicon.int $model ark:- ark:- \| \ lattice-1best ark:- ark:- \| \ nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \ diff --git a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py index 63b1c12c759..73f4e5b6533 100755 --- a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py +++ b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py @@ -85,7 +85,7 @@ def fill_nonlin_stats_table_with_regex_result(groups, gate_index, stats_table): if len(groups) <= 9: try: - if stats_table[component_name]['stats'].has_key(iteration): + if iteration in stats_table[component_name]['stats']: stats_table[component_name]['stats'][iteration].extend( [value_mean, value_stddev, deriv_mean, deriv_stddev, @@ -117,7 +117,7 @@ def fill_nonlin_stats_table_with_regex_result(groups, gate_index, stats_table): oderiv_50th = float(oderiv_percentiles_split[6]) oderiv_95th = float(oderiv_percentiles_split[9]) try: - if stats_table[component_name]['stats'].has_key(iteration): + if iteration in stats_table[component_name]['stats']: stats_table[component_name]['stats'][iteration].extend( [value_mean, value_stddev, deriv_mean, deriv_stddev, @@ -540,4 +540,4 @@ def generate_acc_logprob_report(exp_dir, key="accuracy", output="output"): total_time += times[iter] report.append("Total training time is {0}\n".format( str(datetime.timedelta(seconds=total_time)))) - return ["\n".join(report), times, data] \ No newline at end of file + return ["\n".join(report), times, data] diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 854a37a52b7..229f290e94c 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -69,7 +69,7 @@ def generate_chain_egs(dir, data, lat_dir, egs_dir, alignment_subsampling_factor=3, online_ivector_dir=None, frames_per_iter=20000, frames_per_eg_str="20", srand=0, - egs_opts=None, cmvn_opts=None, transform_dir=None): + egs_opts=None, cmvn_opts=None): """Wrapper for steps/nnet3/chain/get_egs.sh See options in that script. 
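
The has_key() changes in log_parse.py (and the similar ones in parser.py and make_multisplice_configs.py further down) are Python 3 compatibility fixes: dict.has_key() was removed in Python 3, while the "in" operator works under both Python 2 and 3. A minimal illustration, not part of the patch:

stats = {3: [0.1, 0.2]}

# Python 2 only -- AttributeError under Python 3:
#   if stats.has_key(3): ...

# Works under both Python 2 and Python 3:
if 3 in stats:
    stats[3].extend([0.3, 0.4])
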
@@ -79,7 +79,6 @@ def generate_chain_egs(dir, data, lat_dir, egs_dir, """steps/nnet3/chain/get_egs.sh {egs_opts} \ --cmd "{command}" \ --cmvn-opts "{cmvn_opts}" \ - --transform-dir "{transform_dir}" \ --online-ivector-dir "{ivector_dir}" \ --left-context {left_context} \ --right-context {right_context} \ @@ -96,9 +95,6 @@ def generate_chain_egs(dir, data, lat_dir, egs_dir, {data} {dir} {lat_dir} {egs_dir}""".format( command=run_opts.egs_command, cmvn_opts=cmvn_opts if cmvn_opts is not None else '', - transform_dir=(transform_dir - if transform_dir is not None - else ''), ivector_dir=(online_ivector_dir if online_ivector_dir is not None else ''), @@ -490,7 +486,7 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, """{command} {dir}/log/compute_prob_valid.{iter}.log \ nnet3-chain-compute-prob --l2-regularize={l2} \ --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ - "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ + {model} {dir}/den.fst \ "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/valid_diagnostic{egs_suffix} \ ark:- | nnet3-chain-merge-egs --minibatch-size=1:64 ark:- ark:- |" \ """.format(command=run_opts.command, dir=dir, iter=iter, model=model, @@ -509,7 +505,7 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, """{command} {dir}/log/compute_prob_train.{iter}.log \ nnet3-chain-compute-prob --l2-regularize={l2} \ --leaky-hmm-coefficient={leaky} --xent-regularize={xent_reg} \ - "nnet3-am-copy --raw=true {model} - |" {dir}/den.fst \ + {model} {dir}/den.fst \ "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/train_diagnostic{egs_suffix} \ ark:- | nnet3-chain-merge-egs --minibatch-size=1:64 ark:- ark:- |" \ """.format(command=run_opts.command, dir=dir, iter=iter, model=model, diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index b20c64ab9ba..720164e5436 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -195,7 +195,7 @@ def validate_chunk_width(chunk_width): for elem in a: try: i = int(elem) - if i < 1: + if i < 1 and i != -1: return False except: return False @@ -320,7 +320,7 @@ def copy_egs_properties_to_exp_dir(egs_dir, dir): for file in ['cmvn_opts', 'splice_opts', 'info/final.ie.id', 'final.mat']: file_name = '{dir}/{file}'.format(dir=egs_dir, file=file) if os.path.isfile(file_name): - shutil.copy2(file_name, dir) + shutil.copy(file_name, dir) except IOError: logger.error("Error while trying to copy egs " "property files to {dir}".format(dir=dir)) @@ -535,7 +535,10 @@ def smooth_presoftmax_prior_scale_vector(pdf_counts, return scaled_counts -def prepare_initial_network(dir, run_opts, srand=-3): +def prepare_initial_network(dir, run_opts, srand=-3, input_model=None): + if input_model is not None: + shutil.copy(input_model, "{0}/0.raw".format(dir)) + return if os.path.exists(dir+"/configs/init.config"): common_lib.execute_command( """{command} {dir}/log/add_first_layer.log \ @@ -744,11 +747,6 @@ def __init__(self, to the right of the *last* input chunk extracted from an utterance. 
If negative, defaults to the same as --egs.chunk-right-context""") - self.parser.add_argument("--egs.transform_dir", type=str, - dest='transform_dir', default=None, - action=common_lib.NullstrToNoneAction, - help="String to provide options directly to " - "steps/nnet3/get_egs.sh script") self.parser.add_argument("--egs.dir", type=str, dest='egs_dir', default=None, action=common_lib.NullstrToNoneAction, diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py index 1785e437687..4a39ed9dae6 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py @@ -24,7 +24,7 @@ def generate_egs(data, alidir, egs_dir, left_context_initial=-1, right_context_final=-1, online_ivector_dir=None, samples_per_iter=20000, frames_per_eg_str="20", srand=0, - egs_opts=None, cmvn_opts=None, transform_dir=None): + egs_opts=None, cmvn_opts=None): """ Wrapper for calling steps/nnet3/get_egs.sh @@ -36,7 +36,6 @@ def generate_egs(data, alidir, egs_dir, """steps/nnet3/get_egs.sh {egs_opts} \ --cmd "{command}" \ --cmvn-opts "{cmvn_opts}" \ - --transform-dir "{transform_dir}" \ --online-ivector-dir "{ivector_dir}" \ --left-context {left_context} \ --right-context {right_context} \ @@ -49,9 +48,6 @@ def generate_egs(data, alidir, egs_dir, {data} {alidir} {egs_dir} """.format(command=run_opts.egs_command, cmvn_opts=cmvn_opts if cmvn_opts is not None else '', - transform_dir=(transform_dir - if transform_dir is not None else - ''), ivector_dir=(online_ivector_dir if online_ivector_dir is not None else ''), @@ -66,21 +62,27 @@ def generate_egs(data, alidir, egs_dir, def prepare_initial_acoustic_model(dir, alidir, run_opts, - srand=-3): + srand=-3, input_model=None): """ Adds the first layer; this will also add in the lda.mat and presoftmax_prior_scale.vec. It will also prepare the acoustic model - with the transition model.""" + with the transition model. + If 'input_model' is specified, no initial network preparation(adding + the first layer) is done and this model is used as initial 'raw' model + instead of '0.raw' model to prepare '0.mdl' as acoustic model by adding the + transition model. + """ - common_train_lib.prepare_initial_network(dir, run_opts, - srand=srand) + if input_model is None: + common_train_lib.prepare_initial_network(dir, run_opts, + srand=srand) # Convert to .mdl, train the transitions, set the priors. common_lib.execute_command( """{command} {dir}/log/init_mdl.log \ - nnet3-am-init {alidir}/final.mdl {dir}/0.raw - \| \ + nnet3-am-init {alidir}/final.mdl {raw_mdl} - \| \ nnet3-am-train-transitions - \ "ark:gunzip -c {alidir}/ali.*.gz|" {dir}/0.mdl """.format(command=run_opts.command, - dir=dir, alidir=alidir)) - - + dir=dir, alidir=alidir, + raw_mdl=(input_model if input_model is not None + else '{0}/0.raw'.format(dir)))) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index c18003a626e..cc5c9693a12 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -200,7 +200,6 @@ def train_one_iteration(dir, iter, srand, egs_dir, # Set off jobs doing some diagnostics, in the background. 
# Use the egs dir from the previous iteration for the diagnostics - logger.info("Training neural net (pass {0})".format(iter)) # check if different iterations use the same random seed if os.path.exists('{0}/srand'.format(dir)): @@ -257,15 +256,6 @@ def train_one_iteration(dir, iter, srand, egs_dir, cur_minibatch_size_str = common_train_lib.halve_minibatch_size_str(minibatch_size_str) cur_max_param_change = float(max_param_change) / math.sqrt(2) - shrink_info_str = '' - if shrinkage_value != 1.0: - shrink_info_str = ' and shrink value is {0}'.format(shrinkage_value) - - logger.info("On iteration {0}, learning rate is {1}" - "{shrink_info}.".format( - iter, learning_rate, - shrink_info=shrink_info_str)) - train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs, num_archives_processed=num_archives_processed, num_archives=num_archives, @@ -393,8 +383,7 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, run_opts, use_multitask_egs=False, compute_per_dim_accuracy=False): if get_raw_nnet_from_am: - model = "nnet3-am-copy --raw=true {dir}/{iter}.mdl - |".format( - dir=dir, iter=iter) + model = "{dir}/{iter}.mdl".format(dir=dir, iter=iter) else: model = "{dir}/{iter}.raw".format(dir=dir, iter=iter) @@ -598,7 +587,7 @@ def get_realign_iters(realign_times, num_iters, return realign_iters -def align(dir, data, lang, run_opts, iter=None, transform_dir=None, +def align(dir, data, lang, run_opts, iter=None, online_ivector_dir=None): alidir = '{dir}/ali{ali_suffix}'.format( @@ -612,7 +601,6 @@ def align(dir, data, lang, run_opts, iter=None, transform_dir=None, """steps/nnet3/align.sh --nj {num_jobs_align} \ --cmd "{align_cmd} {align_queue_opt}" \ --use-gpu {align_use_gpu} \ - --transform-dir "{transform_dir}" \ --online-ivector-dir "{online_ivector_dir}" \ --iter "{iter}" {data} {lang} {dir} {alidir}""".format( dir=dir, align_use_gpu=("yes" @@ -621,9 +609,6 @@ def align(dir, data, lang, run_opts, iter=None, transform_dir=None, align_cmd=run_opts.realign_command, align_queue_opt=run_opts.realign_queue_opt, num_jobs_align=run_opts.realign_num_jobs, - transform_dir=(transform_dir - if transform_dir is not None - else ""), online_ivector_dir=(online_ivector_dir if online_ivector_dir is not None else ""), @@ -635,7 +620,7 @@ def align(dir, data, lang, run_opts, iter=None, transform_dir=None, def realign(dir, iter, feat_dir, lang, prev_egs_dir, cur_egs_dir, prior_subset_size, num_archives, - run_opts, transform_dir=None, online_ivector_dir=None): + run_opts, online_ivector_dir=None): raise Exception("Realignment stage has not been implemented in nnet3") logger.info("Getting average posterior for purposes of adjusting " "the priors.") @@ -654,7 +639,7 @@ def realign(dir, iter, feat_dir, lang, prev_egs_dir, cur_egs_dir, adjust_am_priors(dir, model, avg_post_vec_file, model, run_opts) alidir = align(dir, feat_dir, lang, run_opts, iter, - transform_dir, online_ivector_dir) + online_ivector_dir) common_lib.execute_command( """steps/nnet3/relabel_egs.sh --cmd "{command}" --iter {iter} \ {alidir} {prev_egs_dir} {cur_egs_dir}""".format( diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py index 4e3a78cb3a0..7f21af06a16 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/raw_model.py @@ -24,8 +24,7 @@ def generate_egs_using_targets(data, targets_scp, egs_dir, online_ivector_dir=None, target_type='dense', num_targets=-1, 
samples_per_iter=20000, frames_per_eg_str="20", - srand=0, egs_opts=None, cmvn_opts=None, - transform_dir=None): + srand=0, egs_opts=None, cmvn_opts=None): """ Wrapper for calling steps/nnet3/get_egs_targets.sh This method generates egs directly from an scp file of targets, instead of @@ -52,7 +51,6 @@ def generate_egs_using_targets(data, targets_scp, egs_dir, """steps/nnet3/get_egs_targets.sh {egs_opts} \ --cmd "{command}" \ --cmvn-opts "{cmvn_opts}" \ - --transform-dir "{transform_dir}" \ --online-ivector-dir "{ivector_dir}" \ --left-context {left_context} \ --right-context {right_context} \ @@ -67,9 +65,6 @@ def generate_egs_using_targets(data, targets_scp, egs_dir, {data} {targets_scp} {egs_dir} """.format(command=run_opts.egs_command, cmvn_opts=cmvn_opts if cmvn_opts is not None else '', - transform_dir=(transform_dir - if transform_dir is not None - else ''), ivector_dir=(online_ivector_dir if online_ivector_dir is not None else ''), diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/attention.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/attention.py index fb68cb298a5..e870c1a60cf 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/attention.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/attention.py @@ -22,6 +22,7 @@ # Documentation for the rest of the parameters (related to the # attention component) can be found in nnet-attention-component.h + class XconfigAttentionLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): # Here we just list some likely combinations.. you can just add any @@ -146,7 +147,6 @@ def _add_components(self, input_desc, input_dim, nonlinearities): if learning_rate_factor != 1.0 else '') l2_regularize_option = ('l2-regularize={0} '.format(l2_regularize) if l2_regularize != 0.0 else '') - configs = [] # First the affine node. line = ('component name={0}.affine' @@ -154,7 +154,7 @@ def _add_components(self, input_desc, input_dim, nonlinearities): ' input-dim={1}' ' output-dim={2}' ' max-change={3}' - ' {4} {5} {6} ' + ' {4} {5} {6}' ''.format(self.name, input_dim, dim, max_change, ng_affine_options, learning_rate_option, l2_regularize_option)) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 99a4fb28ff6..e95de336586 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -500,16 +500,22 @@ def set_default_configs(self): 'objective-type': 'linear', # see Nnet::ProcessOutputNodeConfigLine in # nnet-nnet.cc for other options - 'learning-rate-factor': 1.0, - # used in DNN (not RNN) training when using - # frame-level objfns, + 'output-delay': 0, + 'ng-affine-options': '', + 'ng-linear-options': '', # only affects bottleneck output layers. + + # The following are just passed through to the affine + # component, and (in the bottleneck case) the linear + # component. + 'learning-rate-factor': '', # effective default: 1.0 + 'l2-regularize': '', # effective default: 0.0 'max-change': 1.5, + + # The following are passed through to the affine component only. + # It tends to be beneficial to initialize the output layer with + # zero values, unlike the hidden layers. 'param-stddev': 0.0, 'bias-stddev': 0.0, - 'l2-regularize': 0.0, - 'output-delay': 0, - 'ng-affine-options': '', - 'ng-linear-options': '' # only affects bottleneck output layers. 
} def check_configs(self): @@ -524,10 +530,10 @@ def check_configs(self): " invalid value {0}" "".format(self.config['objective-type'])) - if self.config['learning-rate-factor'] <= 0.0: - raise RuntimeError("In output-layer, learning-rate-factor has" - " invalid value {0}" - "".format(self.config['learning-rate-factor'])) + if self.config['orthonormal-constraint'] <= 0.0: + raise RuntimeError("output-layer does not support negative (floating) " + "orthonormal constraint; use a separate linear-component " + "followed by batchnorm-component.") def auxiliary_outputs(self): @@ -587,18 +593,14 @@ def _generate_config(self): output_dim = self.config['dim'] bottleneck_dim = self.config['bottleneck-dim'] objective_type = self.config['objective-type'] - learning_rate_factor = self.config['learning-rate-factor'] include_log_softmax = self.config['include-log-softmax'] - param_stddev = self.config['param-stddev'] - bias_stddev = self.config['bias-stddev'] - l2_regularize = self.config['l2-regularize'] output_delay = self.config['output-delay'] - max_change = self.config['max-change'] - ng_affine_options = self.config['ng-affine-options'] - learning_rate_option = ('learning-rate-factor={0} '.format(learning_rate_factor) if - learning_rate_factor != 1.0 else '') - l2_regularize_option = ('l2-regularize={0} '.format(l2_regularize) - if l2_regularize != 0.0 else '') + + affine_options = self.config['ng-affine-options'] + for opt in [ 'learning-rate-factor', 'l2-regularize', 'max-change', + 'param-stddev', 'bias-stddev' ]: + if self.config[opt] != '': + affine_options += ' {0}={1}'.format(opt, self.config[opt]) cur_node = descriptor_final_string cur_dim = input_dim @@ -613,6 +615,10 @@ def _generate_config(self): # We don't include the l2-regularize option because it's useless # given the orthonormality constraint. linear_options = self.config['ng-linear-options'] + for opt in [ 'learning-rate-factor', 'l2-regularize', 'max-change' ]: + if self.config[opt] != '': + linear_options += ' {0}={1}'.format(opt, self.config[opt]) + # note: by default the LinearComponent uses natural gradient. line = ('component name={0}.linear type=LinearComponent ' @@ -631,14 +637,8 @@ def _generate_config(self): line = ('component name={0}.affine' ' type=NaturalGradientAffineComponent' - ' input-dim={1}' - ' output-dim={2}' - ' param-stddev={3}' - ' bias-stddev={4}' - ' max-change={5} {6} {7} {8}' - ''.format(self.name, cur_dim, output_dim, - param_stddev, bias_stddev, max_change, ng_affine_options, - learning_rate_option, l2_regularize_option)) + ' input-dim={1} output-dim={2} {3}' + ''.format(self.name, cur_dim, output_dim, affine_options)) configs.append(line) line = ('component-node name={0}.affine' ' component={0}.affine input={1}' @@ -711,7 +711,9 @@ def set_default_configs(self): # the most recent layer. self.config = {'input': '[-1]', 'dim': -1, - 'bottleneck-dim': -1, + 'bottleneck-dim': -1, # Deprecated! Use tdnnf-layer for + # factorized TDNNs, or prefinal-layer + # for bottlenecks just before the output. 'self-repair-scale': 1.0e-05, 'target-rms': 1.0, 'ng-affine-options': '', @@ -726,7 +728,8 @@ def set_default_configs(self): # continuous-valued (not zero-one) mask. 'add-log-stddev': False, # the following are not really inspected by this level of - # code, just passed through (but not if left at ''). + # code, just passed through to the affine component if + # their value is not ''. 
'bias-stddev': '', 'l2-regularize': '', 'learning-rate-factor': '', @@ -821,6 +824,9 @@ def _add_components(self, input_desc, input_dim, nonlinearities): # First the affine node (or linear then affine, if bottleneck). if self.config['bottleneck-dim'] > 0: + # The 'bottleneck-dim' option is deprecated and may eventually be + # removed. Best to use tdnnf-layer if you want factorized TDNNs. + # This is the bottleneck case (it doesn't necessarily imply we # will be using the features from the bottleneck; it's just a factorization # of the matrix into two pieces without a nonlinearity in between). diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/composite_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/composite_layers.py new file mode 100644 index 00000000000..e1905d0aa48 --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/composite_layers.py @@ -0,0 +1,319 @@ +# Copyright 2018 Johns Hopkins University (Dan Povey) +# Apache 2.0. + +""" This module contains some composite layers, which is basically a catch-all + term for things like TDNN-F that contain several affine or linear comopnents. +""" +from __future__ import print_function +import math +import re +import sys +from libs.nnet3.xconfig.basic_layers import XconfigLayerBase + +# This class is intended to implement an extension of the factorized TDNN +# (TDNN-F) that supports resnet-type 'bypass' connections. It is for lines like +# the following: +# +# tdnnf-layer name=tdnnf2 dim=1024 bottleneck-dim=128 dropout-proportion=0.0 time-stride=3 +# +# The line above would be roughly equivalent to the following four lines (except +# for different naming, and the use of TdnnComponent, for efficiency, in place +# of AffineComponent). Assume that the previous layer (the default input) was tdnnf1: +# +# linear-component name=tdnnf2.linear dim=128 orthonormal-constraint=-1.0 input=Append(Offset(-3, tdnnf1), tdnnf1) +# relu-batchnorm-dropout-layer name=tdnnf2.affine dim=1024 dropout-proportion=0.0 \ +# dropout-per-dim-continuous=true input=Append(0,3) +# no-op-component name=tdnnf2 input=Sum(Scale(0.66,tdnnf1), tdnn2.affine) + +# Documentation of some of the important options: +# +# - dropout-proportion +# This gets passed through to the dropout component. If you don't set +# 'dropout-proportion', no dropout component will be included; it would be like +# using a relu-batchnorm-layer in place of a relu-batchnorm-dropout-layer. You +# should only set 'dropout-proportion' if you intend to use dropout (it would +# usually be combined with the --dropout-schedule option to train.py). If you +# use the --dropout-schedule option, the value doesn't really matter since it +# will be changed during training, and 0 is recommended. +# +# - time-stride +# Controls the time offsets in the splicing, e.g. if you set time-stride to +# 1 instead of the 3 in the example, the time-offsets would be -1 and 1 instead +# of 1 and 3. +# If you set time-stride=0, as a special case no splicing over time will be +# performed (so no Append() expressions) and the second linear component (named +# tdnnf2l in the example) would be omitted, since it would add no modeling +# power. +# You can set time-stride to a negative number which will negate all the +# time indexes; it might potentially be useful to alternate negative and positive +# time-stride if you wanted to force the overall network to have symmetric +# context, since with positive time stride, this layer has more negative +# than positive time context (i.e. more left than right). 
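
To make the time-stride behaviour described above concrete: the layer splices over offsets (-time-stride, 0) going into the bottleneck and (0, +time-stride) coming out of it, and time-stride=0 collapses both to a single frame. The sketch below mirrors what the layer's _generate_config does further down in this file; it is an illustration only, not part of the patch:

def tdnnf_time_offsets(time_stride):
    # Mirrors how the layer turns time-stride into the time-offsets of its two
    # TdnnComponents (see the _generate_config code further down in this file).
    if time_stride != 0:
        offsets_linear = "{0},0".format(-time_stride)   # splicing before the bottleneck
        offsets_affine = "0,{0}".format(time_stride)    # splicing after the bottleneck
    else:
        offsets_linear = offsets_affine = "0"           # time-stride=0: no splicing over time
    return offsets_linear, offsets_affine

print(tdnnf_time_offsets(3))    # ('-3,0', '0,3')
print(tdnnf_time_offsets(-3))   # ('3,0', '0,-3')  -- negated context, as described above
print(tdnnf_time_offsets(0))    # ('0', '0')
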
+# +# - bypass-scale + +# A scale on the previous layer's output, used in bypass (resnet-type) +# connections. Should not exceed 1.0. The default is 0.66. If you set it to +# zero, the layer will lack the bypass (but we don't recommend this). won't use +# a bypass connection at all, so it would be like conventional TDNN-F Note: the +# layer outputs are added together after the batchnorm so the model cannot +# control their relative magnitudes and this does actually affect what it can +# model. When we experimented with having this scale trainable it did not seem +# to give an advantage. +# +# - l2-regularize +# This is passed through to the linear and affine components. You'll normally +# want this to be set to a nonzero value, e.g. 0.004. + +class XconfigTdnnfLayer(XconfigLayerBase): + + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == "tdnnf-layer" + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input':'[-1]', + 'dim':-1, + 'bottleneck-dim':-1, + 'bypass-scale':0.66, + 'dropout-proportion':-1.0, + 'time-stride':1, + 'l2-regularize':0.0, + 'max-change': 0.75, + 'self-repair-scale': 1.0e-05} + + def set_derived_configs(self): + pass + + def check_configs(self): + if self.config['bottleneck-dim'] <= 0: + raise RuntimeError("bottleneck-dim must be set and >0.") + if self.config['dim'] <= self.config['bottleneck-dim']: + raise RuntimeError("dim must be greater than bottleneck-dim") + + dropout = self.config['dropout-proportion'] + if dropout != -1.0 and not (dropout >= 0.0 and dropout < 1.0): + raise RuntimeError("invalid value for dropout-proportion") + + if abs(self.config['bypass-scale']) > 1.0: + raise RuntimeError("bypass-scale has invalid value") + + input_dim = self.descriptors['input']['dim'] + output_dim = self.config['dim'] + if output_dim != input_dim and self.config['bypass-scale'] != 0.0: + raise RuntimeError('bypass-scale is nonzero but output-dim != input-dim: {0} != {1}' + ''.format(output_dim, input_dim)) + + + def output_name(self, auxiliary_output=None): + assert auxiliary_output is None + output_component = '' + if self.config['bypass-scale'] != 0.0: + # the no-op component is used to cache something that we don't want + # to have to recompute. 
+ output_component = 'noop' + elif self.config['dropout-proportion'] != -1.0: + output_component = 'dropout' + else: + output_component = 'batchnorm' + return '{0}.{1}'.format(self.name, output_component) + + + def output_dim(self, auxiliary_output=None): + return self.config['dim'] + + def get_full_config(self): + ans = [] + config_lines = self._generate_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + ans.append((config_name, line)) + return ans + + + def _generate_config(self): + configs = [] + name = self.name + + input_dim = self.descriptors['input']['dim'] + input_descriptor = self.descriptors['input']['final-string'] + output_dim = self.config['dim'] + assert output_dim == input_dim + bottleneck_dim = self.config['bottleneck-dim'] + bypass_scale = self.config['bypass-scale'] + dropout_proportion = self.config['dropout-proportion'] + time_stride = self.config['time-stride'] + if time_stride != 0: + time_offsets1 = '{0},0'.format(-time_stride) + time_offsets2 = '0,{0}'.format(time_stride) + else: + time_offsets1 = '0' + time_offsets2 = '0' + l2_regularize = self.config['l2-regularize'] + max_change = self.config['max-change'] + self_repair_scale = self.config['self-repair-scale'] + + # The first linear layer, from input-dim (spliced x2) to bottleneck-dim + configs.append('component name={0}.linear type=TdnnComponent input-dim={1} ' + 'output-dim={2} l2-regularize={3} max-change={4} use-bias=false ' + 'time-offsets={5} orthonormal-constraint=-1.0'.format( + name, input_dim, bottleneck_dim, l2_regularize, + max_change, time_offsets1)) + configs.append('component-node name={0}.linear component={0}.linear ' + 'input={1}'.format(name, input_descriptor)) + + # The affine layer, from bottleneck-dim (spliced x2) to output-dim + configs.append('component name={0}.affine type=TdnnComponent ' + 'input-dim={1} output-dim={2} l2-regularize={3} max-change={4} ' + 'time-offsets={5}'.format( + name, bottleneck_dim, output_dim, l2_regularize, + max_change, time_offsets2)) + configs.append('component-node name={0}.affine component={0}.affine ' + 'input={0}.linear'.format(name)) + + # The ReLU layer + configs.append('component name={0}.relu type=RectifiedLinearComponent dim={1} ' + 'self-repair-scale={2}'.format( + name, output_dim, self_repair_scale)) + configs.append('component-node name={0}.relu component={0}.relu ' + 'input={0}.affine'.format(name)) + + # The BatchNorm layer + configs.append('component name={0}.batchnorm type=BatchNormComponent ' + 'dim={1}'.format(name, output_dim)) + configs.append('component-node name={0}.batchnorm component={0}.batchnorm ' + 'input={0}.relu'.format(name)) + + if dropout_proportion != -1: + # This is not normal dropout. It's dropout where the mask is shared + # across time, and (thanks to continuous=true), instead of a + # zero-or-one scale, it's a continuously varying scale whose + # expected value is 1, drawn from a uniform distribution over an + # interval of a size that varies with dropout-proportion. + configs.append('component name={0}.dropout type=GeneralDropoutComponent ' + 'dim={1} dropout-proportion={2} continuous=true'.format( + name, output_dim, dropout_proportion)) + configs.append('component-node name={0}.dropout component={0}.dropout ' + 'input={0}.batchnorm'.format(name)) + cur_component_type = 'dropout' + else: + cur_component_type = 'batchnorm' + + if bypass_scale != 0.0: + # Add a NoOpComponent to cache the weighted sum of the input and the + # output. 
We could easily have the output of the component be a + # Descriptor like 'Append(Scale(0.66, tdnn1.batchnorm), tdnn2.batchnorm)', + # but if we did that and you used many of this component in sequence, + # the weighted sums would have more and more terms as you went deeper + # in the network. + configs.append('component name={0}.noop type=NoOpComponent ' + 'dim={1}'.format(name, output_dim)) + configs.append('component-node name={0}.noop component={0}.noop ' + 'input=Sum(Scale({1}, {2}), {0}.{3})'.format( + name, bypass_scale, input_descriptor, + cur_component_type)) + + return configs + +# This is for lines like the following: +# prefinal-layer name=prefinal-chain input=prefinal-l l2-regularize=0.02 big-dim=1024 small-dim=256 +# +# which is equivalent to the following sequence of components (except for +# name differences): +# relu-batchnorm-layer name=prefinal-chain input=prefinal-l l2-regularize=0.02 dim=1024 +# linear-comonent name=prefinal-chain-l dim=256 l2-regularize=0.02 orthonormal-constraint=-1.0 +# batchnorm-component name=prefinal-chain-batchnorm +# +# This layer is really just for convenience in writing config files: it doesn't +# do anything that's particular hard or unusual, but it encapsulates a commonly +# repeated pattern. +class XconfigPrefinalLayer(XconfigLayerBase): + + def __init__(self, first_token, key_to_value, prev_names = None): + assert first_token == "prefinal-layer" + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input':'[-1]', + 'big-dim':-1, + 'small-dim':-1, + 'l2-regularize':0.0, + 'max-change': 0.75, + 'self-repair-scale': 1.0e-05} + + def set_derived_configs(self): + pass + + def check_configs(self): + if self.config['small-dim'] <= 0: + raise RuntimeError("small-dim must be set and >0.") + if self.config['big-dim'] <= self.config['small-dim']: + raise RuntimeError("big-dim must be greater than small-dim") + + def output_name(self, auxiliary_output=None): + assert auxiliary_output is None + return '{0}.batchnorm2'.format(self.name) + + def output_dim(self, auxiliary_output=None): + return self.config['small-dim'] + + def get_full_config(self): + ans = [] + config_lines = self._generate_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + ans.append((config_name, line)) + return ans + + + def _generate_config(self): + configs = [] + name = self.name + + input_dim = self.descriptors['input']['dim'] + input_descriptor = self.descriptors['input']['final-string'] + small_dim = self.config['small-dim'] + big_dim = self.config['big-dim'] + l2_regularize = self.config['l2-regularize'] + max_change = self.config['max-change'] + self_repair_scale = self.config['self-repair-scale'] + + # The affine layer, from input-dim to big-dim. 
+ configs.append('component name={0}.affine type=NaturalGradientAffineComponent ' + 'input-dim={1} output-dim={2} l2-regularize={3} max-change={4}'.format( + name, input_dim, big_dim, l2_regularize, max_change)) + configs.append('component-node name={0}.affine component={0}.affine ' + 'input={1}'.format(name, input_descriptor)) + + # The ReLU layer + configs.append('component name={0}.relu type=RectifiedLinearComponent dim={1} ' + 'self-repair-scale={2}'.format( + name, big_dim, self_repair_scale)) + configs.append('component-node name={0}.relu component={0}.relu ' + 'input={0}.affine'.format(name)) + + # The first BatchNorm layer + configs.append('component name={0}.batchnorm1 type=BatchNormComponent ' + 'dim={1}'.format(name, big_dim)) + configs.append('component-node name={0}.batchnorm1 component={0}.batchnorm1 ' + 'input={0}.relu'.format(name)) + + # The linear layer, from big-dim to small-dim, with orthonormal-constraint=-1 + # ("floating" orthonormal constraint). + configs.append('component name={0}.linear type=LinearComponent ' + 'input-dim={1} output-dim={2} l2-regularize={3} max-change={4} ' + 'orthonormal-constraint=-1 '.format( + name, big_dim, small_dim, + l2_regularize, max_change)) + configs.append('component-node name={0}.linear component={0}.linear ' + 'input={0}.batchnorm1'.format(name)) + + # The second BatchNorm layer + configs.append('component name={0}.batchnorm2 type=BatchNormComponent ' + 'dim={1}'.format(name, small_dim)) + configs.append('component-node name={0}.batchnorm2 component={0}.batchnorm2 ' + 'input={0}.linear'.format(name)) + + return configs diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py index 32d6e87eba1..5b722fee5b4 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py @@ -10,3 +10,4 @@ from .gru import * from .stats_layer import * from .trivial_layers import * +from .composite_layers import * diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 99911b39fb2..1d284146e35 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -68,10 +68,13 @@ 'opgru-layer' : xlayers.XconfigOpgruLayer, 'norm-pgru-layer' : xlayers.XconfigNormPgruLayer, 'norm-opgru-layer' : xlayers.XconfigNormOpgruLayer, + 'tdnnf-layer': xlayers.XconfigTdnnfLayer, + 'prefinal-layer': xlayers.XconfigPrefinalLayer, 'renorm-component': xlayers.XconfigRenormComponent, 'batchnorm-component': xlayers.XconfigBatchnormComponent, 'no-op-component': xlayers.XconfigNoOpComponent, - 'linear-component': xlayers.XconfigLinearComponent + 'linear-component': xlayers.XconfigLinearComponent, + 'scale-component': xlayers.XconfigPerElementScaleComponent } # Turn a config line and a list of previous layers into @@ -85,7 +88,7 @@ def xconfig_line_to_object(config_line, prev_layers = None): if x is None: return None (first_token, key_to_value) = x - if not config_to_layer.has_key(first_token): + if not first_token in config_to_layer: raise RuntimeError("No such layer type '{0}'".format(first_token)) return config_to_layer[first_token](first_token, key_to_value, prev_layers) except Exception: @@ -165,7 +168,9 @@ def get_model_component_info(model_filename): # layers but are actual component node names from an existing neural net model # and created using get_model_component_info function). # 'existing' layers can be used as input to component-nodes in layers of xconfig file. 
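
The parser.py hunk above is where the new layer types become usable from xconfig files: config_to_layer maps the first token of an xconfig line to the class that handles it, and xconfig_line_to_object simply looks the token up and instantiates that class. The toy sketch below shows the same dispatch pattern with a made-up layer type; none of its names come from the patch:

# Toy version of the dispatch in xconfig_line_to_object; 'toy-layer' and ToyLayer
# are invented for illustration (the real table maps 'tdnnf-layer', 'prefinal-layer',
# 'scale-component', etc. to the classes added in this patch).
class ToyLayer(object):
    def __init__(self, first_token, key_to_value, prev_layers=None):
        self.name = key_to_value.get("name")

config_to_layer = {"toy-layer": ToyLayer}

def line_to_object(first_token, key_to_value, prev_layers=None):
    if first_token not in config_to_layer:   # same membership test the patch switches to
        raise RuntimeError("No such layer type '{0}'".format(first_token))
    return config_to_layer[first_token](first_token, key_to_value, prev_layers)

print(line_to_object("toy-layer", {"name": "tdnnf2", "dim": "1024"}).name)   # tdnnf2
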
-def read_xconfig_file(xconfig_filename, existing_layers=[]): +def read_xconfig_file(xconfig_filename, existing_layers=None): + if existing_layers is None: + existing_layers = [] try: f = open(xconfig_filename, 'r') except Exception as e: diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py index f7da8956d1c..6b8e3c3a5c2 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py @@ -253,3 +253,78 @@ def _generate_config(self): self.name, input_desc)) configs.append(line) return configs + + +class XconfigPerElementScaleComponent(XconfigLayerBase): + """This class is for parsing lines like + 'scale-component name=scale1 input=Append(-3,0,3)' + which will produce just a single component, of type NaturalGradientPerElementScaleComponent, with + output-dim 1024 in this case, and input-dim determined by the dimension of the input . + + Parameters of the class, and their defaults: + input='[-1]' [Descriptor giving the input of the layer.] + + The following (shown with their effective defaults) are just passed through + to the component's config line. (These defaults are mostly set in the + code). + + max-change=0.75 + l2-regularize=0.0 + param-mean=1.0 # affects initialization + param-stddev=0.0 # affects initialization + learning-rate-factor=1.0 + """ + def __init__(self, first_token, key_to_value, prev_names=None): + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input': '[-1]', + 'l2-regularize': '', + 'max-change': 0.75, + 'param-mean': '', + 'param-stddev': '', + 'learning-rate-factor': '' } + + def check_configs(self): + pass + + def output_name(self, auxiliary_output=None): + assert auxiliary_output is None + return self.name + + def output_dim(self, auxiliary_output=None): + assert auxiliary_output is None + return self.descriptors['input']['dim'] + + def get_full_config(self): + ans = [] + config_lines = self._generate_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in this layer + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + def _generate_config(self): + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. 
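
The read_xconfig_file change above (existing_layers=[] becoming existing_layers=None plus an explicit check) avoids the usual Python mutable-default-argument pitfall: a default list is created once, when the function is defined, and is then shared across calls. A minimal illustration with invented names, not code from the patch:

def read_items(filename, existing=[]):      # BAD: the same list object is reused
    existing.append(filename)               # across calls, so state leaks between them
    return existing

print(read_items("a.xconfig"))   # ['a.xconfig']
print(read_items("b.xconfig"))   # ['a.xconfig', 'b.xconfig']  -- surprise

def read_items_fixed(filename, existing=None):
    if existing is None:                    # the pattern the patch adopts
        existing = []
    existing.append(filename)
    return existing
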
+ input_desc = self.descriptors['input']['final-string'] + dim = self.descriptors['input']['dim'] + + opts = '' + for opt_name in ['learning-rate-factor', 'max-change', 'l2-regularize', 'param-mean', + 'param-stddev' ]: + value = self.config[opt_name] + if value != '': + opts += ' {0}={1}'.format(opt_name, value) + + configs = [] + line = ('component name={0} type=NaturalGradientPerElementScaleComponent dim={1} {2} ' + ''.format(self.name, dim, opts)) + configs.append(line) + line = ('component-node name={0} component={0} input={1}'.format( + self.name, input_desc)) + configs.append(line) + return configs diff --git a/egs/wsj/s5/steps/nnet2/make_multisplice_configs.py b/egs/wsj/s5/steps/nnet2/make_multisplice_configs.py index cff85b7f60d..6e7bff3fa17 100755 --- a/egs/wsj/s5/steps/nnet2/make_multisplice_configs.py +++ b/egs/wsj/s5/steps/nnet2/make_multisplice_configs.py @@ -90,7 +90,7 @@ def create_config_files(output_dir, params): for i in xrange(1, params.num_hidden_layers): #just run till num_hidden_layers-1 since we do not add splice before the final affine transform lines=[] context_len = 1 - if contexts.has_key(i): + if i in contexts: # Adding the splice component as a context is provided lines.append("SpliceComponent input-dim=%d context=%s " % (pnorm_output_dim, contexts[i])) context_len = len(contexts[i].split(":")) diff --git a/egs/wsj/s5/steps/nnet3/align.sh b/egs/wsj/s5/steps/nnet3/align.sh index 48614d362f9..cf1cc9124d3 100755 --- a/egs/wsj/s5/steps/nnet3/align.sh +++ b/egs/wsj/s5/steps/nnet3/align.sh @@ -6,6 +6,8 @@ # Apache 2.0 # Computes training alignments using nnet3 DNN +# Warning: this script uses GPUs by default, and this is generally not +# an efficient use of GPUs. Set --use-gpu false to make it run on CPU. # Begin configuration section. nj=4 @@ -14,7 +16,6 @@ cmd=run.pl scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" beam=10 retry_beam=40 -transform_dir= iter=final use_gpu=true frames_per_chunk=50 @@ -31,8 +32,11 @@ echo "$0 $@" # Print the command line for logging . parse_options.sh || exit 1; if [ $# != 4 ]; then - echo "Usage: $0 [--transform-dir ] " + echo "Usage: $0 " echo "e.g.: $0 data/train data/lang exp/nnet4 exp/nnet4_ali" + echo "Warning: this script uses GPUs by default, and this is generally not" + echo "an efficient use of GPUs. Set --use-gpu false to make it run on CPU." + echo "" echo "main options (for others, see top of script file)" echo " --config # config containing options" echo " --nj # number of parallel jobs" @@ -48,6 +52,7 @@ dir=$4 oov=`cat $lang/oov.int` || exit 1; mkdir -p $dir/log echo $nj > $dir/num_jobs +touch $dir/per_utt sdata=$data/split${nj}utt [[ -d $sdata && $data/feats.scp -ot $sdata ]] || \ split_data.sh --per-utt $data $nj || exit 1; @@ -84,27 +89,6 @@ cp $srcdir/cmvn_opts $dir 2>/dev/null feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" -if [ ! -z "$transform_dir" ]; then - echo "$0: using transforms from $transform_dir" - [ ! -s $transform_dir/num_jobs ] && \ - echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; - nj_orig=$(cat $transform_dir/num_jobs) - - if [ ! -f $transform_dir/raw_trans.1 ]; then - echo "$0: expected $transform_dir/raw_trans.1 to exist (--transform-dir option)" - exit 1; - fi - if [ $nj -ne $nj_orig ]; then - # Copy the transforms into an archive with an index. 
- for n in $(seq $nj_orig); do cat $transform_dir/raw_trans.$n; done | \ - copy-feats ark:- ark,scp:$dir/raw_trans.ark,$dir/raw_trans.scp || exit 1; - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/raw_trans.scp ark:- ark:- |" - else - # number of jobs matches with alignment dir. - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |" - fi -fi - ivector_opts= if [ ! -z "$online_ivector_dir" ]; then ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; diff --git a/egs/wsj/s5/steps/nnet3/align_lats.sh b/egs/wsj/s5/steps/nnet3/align_lats.sh index 2f9042467ff..4edc38751c8 100755 --- a/egs/wsj/s5/steps/nnet3/align_lats.sh +++ b/egs/wsj/s5/steps/nnet3/align_lats.sh @@ -16,7 +16,6 @@ stage=-1 scale_opts="--transition-scale=1.0 --self-loop-scale=0.1" acoustic_scale=0.1 beam=20 -transform_dir= iter=final frames_per_chunk=50 extra_left_context=0 @@ -34,7 +33,7 @@ echo "$0 $@" # Print the command line for logging . parse_options.sh || exit 1; if [ $# != 4 ]; then - echo "Usage: $0 [--transform-dir ] " + echo "Usage: $0 " echo "e.g.: $0 data/train data/lang exp/nnet4 exp/nnet4_ali" echo "main options (for others, see top of script file)" echo " --config # config containing options" @@ -51,6 +50,7 @@ dir=$4 oov=`cat $lang/oov.int` || exit 1; mkdir -p $dir/log echo $nj > $dir/num_jobs +touch $dir/per_utt sdata=$data/split${nj}utt [[ -d $sdata && $data/feats.scp -ot $sdata ]] || \ split_data.sh --per-utt $data $nj || exit 1; @@ -79,27 +79,6 @@ cp $srcdir/cmvn_opts $dir 2>/dev/null feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" -if [ ! -z "$transform_dir" ]; then - echo "$0: using transforms from $transform_dir" - [ ! -s $transform_dir/num_jobs ] && \ - echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; - nj_orig=$(cat $transform_dir/num_jobs) - - if [ ! -f $transform_dir/raw_trans.1 ]; then - echo "$0: expected $transform_dir/raw_trans.1 to exist (--transform-dir option)" - exit 1; - fi - if [ $nj -ne $nj_orig ]; then - # Copy the transforms into an archive with an index. - for n in $(seq $nj_orig); do cat $transform_dir/raw_trans.$n; done | \ - copy-feats ark:- ark,scp:$dir/raw_trans.ark,$dir/raw_trans.scp || exit 1; - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/raw_trans.scp ark:- ark:- |" - else - # number of jobs matches with alignment dir. - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |" - fi -fi - ivector_opts= if [ ! -z "$online_ivector_dir" ]; then ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; diff --git a/egs/wsj/s5/steps/nnet3/chain/build_tree.sh b/egs/wsj/s5/steps/nnet3/chain/build_tree.sh index 23fb62d7a87..757963f13a7 100755 --- a/egs/wsj/s5/steps/nnet3/chain/build_tree.sh +++ b/egs/wsj/s5/steps/nnet3/chain/build_tree.sh @@ -22,6 +22,7 @@ cmd=run.pl context_opts= # e.g. set this to "--context-width 5 --central-position 2" for quinphone. 
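
The "touch $dir/per_utt" lines added to align.sh and align_lats.sh above record that those directories were produced with a per-utterance data split; the checks added to build_tree.sh (just below) and to chain/get_egs.sh (further down) then pick data/split${nj}utt rather than the default data/split$nj, so that job N of the consumer lines up with ali.N.gz or lat.N.gz from the producer. Sketched in Python purely to make the convention explicit; the helper and its example paths are illustrative, not code from the patch:

import os

def split_data_dir(data, nj, producer_dir):
    # Pick the data split matching how the producer (alignment/lattice dir) split
    # its jobs, so each consumer job reads the same utterances as ali.N.gz / lat.N.gz.
    if os.path.isfile(os.path.join(producer_dir, "per_utt")):
        return os.path.join(data, "split{0}utt".format(nj))   # split_data.sh --per-utt
    return os.path.join(data, "split{0}".format(nj))          # default split_data.sh

# e.g. split_data_dir("data/train_sp", 100, "exp/tri4b_ali") gives
# "data/train_sp/split100" unless exp/tri4b_ali/per_utt exists.
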
cluster_thresh=-1 # for build-tree control final bottom-up clustering of leaves frame_subsampling_factor=1 +alignment_subsampling_factor= leftmost_questions_truncate=-1 # note: this option is deprecated and has no effect tree_stats_opts= cluster_phones_opts= @@ -74,7 +75,6 @@ oov=`cat $lang/oov.int` nj=`cat $alidir/num_jobs` || exit 1; silphonelist=`cat $lang/phones/silence.csl` ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1; -sdata=$data/split$nj; splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options. cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null` delta_opts=`cat $alidir/delta_opts 2>/dev/null` @@ -88,7 +88,13 @@ utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exi cp $lang/phones.txt $dir || exit 1; echo $nj >$dir/num_jobs -[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +if [ -f $alidir/per_utt ]; then + sdata=$data/split${nj}utt + utils/split_data.sh --per-utt $data $nj +else + sdata=$data/split$nj + utils/split_data.sh $data $nj +fi # Set up features. @@ -116,6 +122,10 @@ if [ $frame_subsampling_factor -gt 1 ]; then feats="$feats subsample-feats --n=$frame_subsampling_factor ark:- ark:- |" fi +if [ -z $alignment_subsampling_factor ]; then + alignment_subsampling_factor=$frame_subsampling_factor +fi + if [ $stage -le -5 ]; then echo "$0: Initializing monophone model (for alignment conversion, in case topology changed)" @@ -137,7 +147,7 @@ if [ $stage -le -4 ]; then # Get tree stats. echo "$0: Accumulating tree stats" $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \ - convert-ali --frame-subsampling-factor=$frame_subsampling_factor \ + convert-ali --frame-subsampling-factor=$alignment_subsampling_factor \ $alidir/final.mdl $dir/mono.mdl $dir/mono.tree "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ acc-tree-stats $context_opts $tree_stats_opts --ci-phones=$ciphonelist $dir/mono.mdl \ "$feats" ark:- $dir/JOB.treeacc || exit 1; @@ -180,7 +190,7 @@ if [ $stage -le -1 ]; then echo "$0: Converting alignments from $alidir to use current tree" $cmd JOB=1:$nj $dir/log/convert.JOB.log \ convert-ali --repeat-frames=$repeat_frames \ - --frame-subsampling-factor=$frame_subsampling_factor \ + --frame-subsampling-factor=$alignment_subsampling_factor \ $alidir/final.mdl $dir/1.mdl $dir/tree \ "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; fi diff --git a/egs/wsj/s5/steps/nnet3/chain/e2e/get_egs_e2e.sh b/egs/wsj/s5/steps/nnet3/chain/e2e/get_egs_e2e.sh index 01e3ae6551a..e642fad29bd 100755 --- a/egs/wsj/s5/steps/nnet3/chain/e2e/get_egs_e2e.sh +++ b/egs/wsj/s5/steps/nnet3/chain/e2e/get_egs_e2e.sh @@ -35,8 +35,6 @@ frames_per_iter=400000 # each iteration of training, see this many frames per # used. This is just a guideline; it will pick a number # that divides the number of samples in the entire data. 
-transform_dir= # If supplied, overrides latdir as the place to find fMLLR transforms - stage=0 nj=15 # This should be set to the maximum number of jobs you are # comfortable to run in parallel; you can increase it if your disk @@ -118,7 +116,7 @@ frames_per_eg=$(cat $data/allowed_lengths.txt | tr '\n' , | sed 's/,$//') cat $data/utt2len | \ awk '{print $1}' | \ - utils/shuffle_list.pl | head -$num_utts_subset > $dir/valid_uttlist || exit 1; + utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset > $dir/valid_uttlist len_uttlist=`wc -l $dir/valid_uttlist | awk '{print $1}'` @@ -143,32 +141,12 @@ fi cat $data/utt2len | \ awk '{print $1}' | \ utils/filter_scp.pl --exclude $dir/valid_uttlist | \ - utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset_uttlist || exit 1; + utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset > $dir/train_subset_uttlist len_uttlist=`wc -l $dir/train_subset_uttlist | awk '{print $1}'` if [ $len_uttlist -lt $num_utts_subset ]; then echo "Number of utterances which have length at least $frames_per_eg is really low. Please check your data." && exit 1; fi -[ -z "$transform_dir" ] && transform_dir=$latdir - -# because we'll need the features with a different number of jobs than $latdir, -# copy to ark,scp. -if [ -f $transform_dir/trans.1 ] && [ $feat_type != "raw" ]; then - echo "$0: using transforms from $transform_dir" - if [ $stage -le 0 ]; then - $cmd $dir/log/copy_transforms.log \ - copy-feats "ark:cat $transform_dir/trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" - fi -fi -if [ -f $transform_dir/raw_trans.1 ] && [ $feat_type == "raw" ]; then - echo "$0: using raw transforms from $transform_dir" - if [ $stage -le 0 ]; then - $cmd $dir/log/copy_transforms.log \ - copy-feats "ark:cat $transform_dir/raw_trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" - fi -fi - - ## Set up features. echo "$0: feature type is $feat_type" @@ -182,12 +160,6 @@ case $feat_type in *) echo "$0: invalid feature type --feat-type '$feat_type'" && exit 1; esac -if [ -f $dir/trans.scp ]; then - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |" - valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" - train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" -fi - if [ ! -z "$online_ivector_dir" ]; then ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; echo $ivector_dim > $dir/info/ivector_dim @@ -408,9 +380,8 @@ if [ $stage -le 6 ]; then # there are some extra soft links that we should delete. for f in $dir/cegs.*.*.ark; do rm $f; done fi - echo "$0: removing temporary alignments and transforms" - # Ignore errors below because trans.* might not exist. 
- rm $dir/{ali,trans}.{ark,scp} 2>/dev/null + echo "$0: removing temporary alignments" + rm $dir/ali.{ark,scp} 2>/dev/null fi diff --git a/egs/wsj/s5/steps/nnet3/chain/e2e/train_e2e.py b/egs/wsj/s5/steps/nnet3/chain/e2e/train_e2e.py index 99f622d79a7..e96f2a10820 100755 --- a/egs/wsj/s5/steps/nnet3/chain/e2e/train_e2e.py +++ b/egs/wsj/s5/steps/nnet3/chain/e2e/train_e2e.py @@ -487,7 +487,7 @@ def train(args, run_opts): if args.cleanup: - # do a clean up everythin but the last 2 models, under certain + # do a clean up everything but the last 2 models, under certain # conditions common_train_lib.remove_model( args.dir, iter-2, num_iters, models_to_combine, diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index 99e7499bd30..9996820d6d3 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -31,6 +31,10 @@ alignment_subsampling_factor=3 # frames-per-second of input alignments divided left_context=4 # amount of left-context per eg (i.e. extra frames of input features # not present in the output supervision). right_context=4 # amount of right-context per eg. +constrained=true # 'constrained=true' is the traditional setup; 'constrained=false' + # gives you the 'unconstrained' egs creation in which the time + # boundaries are not enforced inside chunks. + left_context_initial=-1 # if >=0, left-context for first chunk of an utterance right_context_final=-1 # if >=0, right-context for last chunk of an utterance compress=true # set this to false to disable compression (e.g. if you want to see whether @@ -49,8 +53,6 @@ frames_per_iter=400000 # each iteration of training, see this many frames per right_tolerance= # chain right tolerance == max label delay. left_tolerance= -transform_dir= # If supplied, overrides latdir as the place to find fMLLR transforms - stage=0 max_jobs_run=15 # This should be set to the maximum number of nnet3-chain-get-egs jobs you are # comfortable to run in parallel; you can increase it if your disk @@ -65,9 +67,9 @@ cmvn_opts= # can be used for specifying CMVN options, if feature type is not ld # LDA transform). This is used to turn off CMVN in the online-nnet experiments. lattice_lm_scale= # If supplied, the graph/lm weight of the lattices will be # used (with this scale) in generating supervisions - # This is 0 by default for conventional supervised training, - # but may be close to 1 for the unsupervised part of the data - # in semi-supervised training. The optimum is usually + # This is 0 by default for conventional supervised training, + # but may be close to 1 for the unsupervised part of the data + # in semi-supervised training. The optimum is usually # 0.5 for unsupervised data. lattice_prune_beam= # If supplied, the lattices will be pruned to this beam, # before being used to get supervisions. @@ -136,24 +138,25 @@ for f in $data/feats.scp $latdir/lat.1.gz $latdir/final.mdl \ done nj=$(cat $latdir/num_jobs) || exit 1 - -sdata=$data/split$nj -utils/split_data.sh $data $nj +if [ -f $latdir/per_utt ]; then + sdata=$data/split${nj}utt + utils/split_data.sh --per-utt $data $nj +else + sdata=$data/split$nj + utils/split_data.sh $data $nj +fi mkdir -p $dir/log $dir/info # Get list of validation utterances. 
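# A sketch of what the new --constrained option (declared above) changes
# further down in this file: with --constrained false the supervision-creation
# options gain --convert-to-pdfs=false, and nnet3-chain-get-egs is additionally
# given the transition model, i.e. roughly
#
#   chain_supervision_all_opts="$chain_supervision_all_opts --convert-to-pdfs=false"
#   trans_mdl_opt=--transition-model=$chaindir/0.trans_mdl
#
# so a top-level script only needs to pass the flag through, e.g. (hypothetical
# paths, assuming the usual <data> <chain-dir> <lat-dir> <egs-dir> arguments):
#
#   steps/nnet3/chain/get_egs.sh --constrained false ... \
#     data/train_sp_hires exp/chain/tdnn1a_sp exp/chain/tri5a_sp_lats exp/chain/tdnn1a_sp/egs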
- frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 -utils/data/get_utt2dur.sh $data -cat $data/utt2dur | \ - awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ - utils/shuffle_list.pl | head -$num_utts_subset > $dir/valid_uttlist || exit 1; +awk '{print $1}' $data/utt2spk | \ + utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset > $dir/valid_uttlist -len_uttlist=`wc -l $dir/valid_uttlist | awk '{print $1}'` +len_uttlist=$(wc -l < $dir/valid_uttlist) if [ $len_uttlist -lt $num_utts_subset ]; then - echo "Number of utterances which have length at least $frames_per_eg is really low. Please check your data." && exit 1; + echo "Number of utterances is very small. Please check your data." && exit 1; fi if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. @@ -171,25 +174,12 @@ fi echo "$0: creating egs. To ensure they are not deleted later you can do: touch $dir/.nodelete" -cat $data/utt2dur | \ - awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ +awk '{print $1}' $data/utt2spk | \ utils/filter_scp.pl --exclude $dir/valid_uttlist | \ - utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset_uttlist || exit 1; -len_uttlist=`wc -l $dir/train_subset_uttlist | awk '{print $1}'` + utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset > $dir/train_subset_uttlist +len_uttlist=$(wc -l <$dir/train_subset_uttlist) if [ $len_uttlist -lt $num_utts_subset ]; then - echo "Number of utterances which have length at least $frames_per_eg is really low. Please check your data." && exit 1; -fi - -[ -z "$transform_dir" ] && transform_dir=$latdir - -# because we'll need the features with a different number of jobs than $latdir, -# copy to ark,scp. -if [ -f $transform_dir/raw_trans.1 ]; then - echo "$0: using raw transforms from $transform_dir" - if [ $stage -le 0 ]; then - $cmd $dir/log/copy_transforms.log \ - copy-feats "ark:cat $transform_dir/raw_trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" - fi + echo "Number of utterances is very small. Please check your data." && exit 1; fi ## Set up features. @@ -199,12 +189,6 @@ valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | a train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" echo $cmvn_opts >$dir/cmvn_opts # caution: the top-level nnet training script should copy this to its own dir now. -if [ -f $dir/trans.scp ]; then - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |" - valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" - train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" -fi - tree-info $chaindir/tree | grep num-pdfs | awk '{print $2}' > $dir/info/num_pdfs || exit 1 if [ ! -z "$online_ivector_dir" ]; then @@ -293,6 +277,14 @@ chain_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$ali [ ! -z $left_tolerance ] && \ chain_supervision_all_opts="$chain_supervision_all_opts --left-tolerance=$left_tolerance" +if ! $constrained; then + chain_supervision_all_opts="$chain_supervision_all_opts --convert-to-pdfs=false" + trans_mdl_opt=--transition-model=$chaindir/0.trans_mdl +else + trans_mdl_opt= +fi + + lats_rspecifier="ark:gunzip -c $latdir/lat.JOB.gz |" if [ ! 
-z $lattice_prune_beam ]; then if [ "$lattice_prune_beam" == "0" ] || [ "$lattice_prune_beam" == "0.0" ]; then @@ -338,7 +330,8 @@ if [ $stage -le 2 ]; then chain-get-supervision $chain_supervision_all_opts $chaindir/tree $chaindir/0.trans_mdl \ ark:- ark:- \| \ nnet3-chain-get-egs $ivector_opts --srand=$srand \ - $egs_opts --normalization-fst-scale=$normalization_fst_scale $chaindir/normalization.fst \ + $egs_opts --normalization-fst-scale=$normalization_fst_scale \ + $trans_mdl_opt $chaindir/normalization.fst \ "$valid_feats" ark,s,cs:- "ark:$dir/valid_all.cegs" || exit 1 $cmd $dir/log/create_train_subset.log \ utils/filter_scp.pl $dir/train_subset_uttlist $dir/lat_special.scp \| \ @@ -346,7 +339,8 @@ if [ $stage -le 2 ]; then chain-get-supervision $chain_supervision_all_opts \ $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ nnet3-chain-get-egs $ivector_opts --srand=$srand \ - $egs_opts --normalization-fst-scale=$normalization_fst_scale $chaindir/normalization.fst \ + $egs_opts --normalization-fst-scale=$normalization_fst_scale \ + $trans_mdl_opt $chaindir/normalization.fst \ "$train_subset_feats" ark,s,cs:- "ark:$dir/train_subset_all.cegs" || exit 1 wait sleep 5 # wait for file system to sync. @@ -376,7 +370,6 @@ if [ $stage -le 2 ]; then if $generate_egs_scp; then cat $dir/valid_combine.cegs $dir/train_combine.cegs | \ nnet3-chain-copy-egs ark:- ark,scp:$dir/combine.cegs,$dir/combine.scp - rm $dir/{train,valid}_combine.scp else cat $dir/valid_combine.cegs $dir/train_combine.cegs > $dir/combine.cegs fi @@ -413,7 +406,7 @@ if [ $stage -le 4 ]; then chain-get-supervision $chain_supervision_all_opts \ $chaindir/tree $chaindir/0.trans_mdl ark:- ark:- \| \ nnet3-chain-get-egs $ivector_opts --srand=\$[JOB+$srand] $egs_opts \ - --num-frames-overlap=$frames_overlap_per_eg \ + --num-frames-overlap=$frames_overlap_per_eg $trans_mdl_opt \ "$feats" ark,s,cs:- ark:- \| \ nnet3-chain-copy-egs --random=true --srand=\$[JOB+$srand] ark:- $egs_list || exit 1; fi @@ -504,14 +497,13 @@ if [ $stage -le 6 ]; then # 'storage' directory. rm cegs_orig.*.ark 2>/dev/null ) - if [ $archives_multiple -gt 1 ]; then + if ! $generate_egs_scp && [ $archives_multiple -gt 1 ]; then # there are some extra soft links that we should delete. for f in $dir/cegs.*.*.ark; do rm $f; done fi - echo "$0: removing temporary alignments and transforms" - # Ignore errors below because trans.* might not exist. - rm $dir/{ali,trans}.{ark,scp} 2>/dev/null - + echo "$0: removing temporary alignments, lattices and transforms" + rm $dir/ali.{ark,scp} 2>/dev/null + rm $dir/lat_special.*.{ark,scp} 2>/dev/null fi echo "$0: Finished preparing training examples" diff --git a/egs/wsj/s5/steps/nnet3/chain/get_phone_post.sh b/egs/wsj/s5/steps/nnet3/chain/get_phone_post.sh new file mode 100755 index 00000000000..9925403a3ac --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/get_phone_post.sh @@ -0,0 +1,244 @@ +#!/bin/bash +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + + + +# This script obtains phone posteriors from a trained chain model, using either +# the xent output or the forward-backward posteriors from the denominator fst. +# The phone posteriors will be in matrices where the column index can be +# interpreted as phone-index - 1. + +# You may want to mess with the compression options. Be careful: with the current +# settings, you might sometimes get exact zeros as the posterior values. + +# CAUTION! 
This script isn't very suitable for dumping features from recurrent +# architectures such as LSTMs, because it doesn't support setting the chunk size +# and left and right context. (Those would have to be passed into nnet3-compute +# or nnet3-chain-compute-post). + +# Begin configuration section. +stage=0 + +nj=1 # Number of jobs to run. +cmd=run.pl +remove_word_position_dependency=false +use_xent_output=false +online_ivector_dir= +use_gpu=false +count_smoothing=1.0 # this should be some small number, I don't think it's critical; + # it will mainly affect the probability we assign to phones that + # were never seen in training. note: this is added to the raw + # transition-id occupation counts, so 1.0 means, add a single + # frame's count to each transition-id's counts. + +# End configuration section. + +set -e -u +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# != 5 ]; then + echo "Usage: $0 " + echo " e.g.: $0 --remove-word-position-dependency true --online-ivector-dir exp/nnet3/ivectors_test_eval92_hires \\" + echo " exp/chain/tree_a_sp exp/chain/tdnn1a_sp data/lang data/test_eval92_hires exp/chain/tdnn1a_sp_post_eval92" + echo " ... you'll normally want to set the --nj and --cmd options as well." + echo "" + echo "Main options (for others, see top of script file)" + echo " --cmd (run.pl|queue.pl|... ) # how to run jobs." + echo " --config # config containing options" + echo " --stage # stage to do partial re-run from." + echo " --nj # Number of parallel jobs to run, default:1" + echo " --remove-word-position-dependency # If true, remove word-position-dependency" + echo " # info when dumping posteriors (default: false)" + echo " --use-xent-output # If true, use the cross-entropy output of the" + echo " # neural network when dumping posteriors" + echo " # (default: false, will use chain denominator FST)" + echo " --online-ivector-dir # Directory where we dumped online-computed" + echo " # ivectors corresponding to the data in " + echo " --use-gpu # Set to true to use GPUs (not recommended as the" + echo " # binary is very poorly optimized for GPU use)." + exit 1; +fi + + +tree_dir=$1 +model_dir=$2 +lang=$3 +data=$4 +dir=$5 + + +for f in $tree_dir/tree $tree_dir/final.mdl $tree_dir/ali.1.gz $tree_dir/num_jobs \ + $model_dir/final.mdl $model_dir/frame_subsampling_factor $model_dir/den.fst \ + $data/feats.scp $lang/phones.txt; do + [ ! -f $f ] && echo "train_sat.sh: no such file $f" && exit 1; +done + +sdata=$data/split${nj}utt +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh --per-utt $data $nj || exit 1; + +use_ivector=false + +cmvn_opts=$(cat $model_dir/cmvn_opts) +feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" + +if [ ! -z "$online_ivector_dir" ];then + steps/nnet2/check_ivectors_compatible.sh $model_dir $online_ivector_dir || exit 1; + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_feats="scp:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp |" + ivector_opts="--online-ivector-period=$ivector_period --online-ivectors='$ivector_feats'" +else + ivector_opts= +fi + +if $use_gpu; then + gpu_queue_opt="--gpu 1" + gpu_opt="--use-gpu=yes" + if ! cuda-compiled; then + echo "$0: WARNING: you are running with one thread but you have not compiled" + echo " for CUDA. You may be running a setup optimized for GPUs. 
If you have" + echo " GPUs and have nvcc installed, go to src/ and do ./configure; make" + exit 1 + fi +else + gpu_queue_opts= + gpu_opt="--use-gpu=no" +fi +frame_subsampling_factor=$(cat $model_dir/frame_subsampling_factor) + +mkdir -p $dir/log +cp $model_dir/frame_subsampling_factor $dir/ + +if [ $stage -le 0 ]; then + if [ ! -f $dir/tacc ] || [ $dir/tacc -ot $tree_dir/ali.1.gz ]; then + echo "$0: obtaining transition-id counts in $dir/tacc" + # Obtain counts for each transition-id, from the alignments. + this_nj=$(cat $tree_dir/num_jobs) + + + $cmd JOB=1:$this_nj $dir/log/acc_taccs.JOB.log \ + ali-to-post "ark:gunzip -c $tree_dir/ali.JOB.gz|" ark:- \| \ + post-to-tacc $tree_dir/final.mdl ark:- $dir/tacc.JOB + + input_taccs=$(for n in $(seq $this_nj); do echo $dir/tacc.$n; done) + + $cmd $dir/log/sum_taccs.log \ + vector-sum --binary=false $input_taccs $dir/tacc + + rm $dir/tacc.* + else + echo "$0: skipping creation of $dir/tacc since it already exists." + fi +fi + + +if [ $stage -le 1 ] && $remove_word_position_dependency; then + echo "$0: creating $dir/phone_map.int" + utils/lang/get_word_position_phone_map.pl $lang $dir +else + # Either way, $dir/phones.txt will be a symbol table for the phones that + # we are dumping (although the matrices we dump won't contain anything + # for symbol 0 which is ). + grep -v '^#' $lang/phones.txt > $dir/phones.txt +fi + +if [ $stage -le 1 ]; then + # we want the phones in integer form as it's safer for processing by script. + # $data/fake_phones.txt will just contain e.g. "0 0\n1 1\n....", it's used + # to force show-transitions to print the phones as integers. + awk '{print $2,$2}' <$lang/phones.txt >$dir/fake_phones.txt + + + # The format of the 'show-transitions' command below is like the following: + #show-transitions tempdir/phone_map.int exp/chain/tree_a_sp/final.mdl + #Transition-state 1: phone = 1 hmm-state = 0 forward-pdf = 0 self-loop-pdf = 51 + # Transition-id = 1 p = 0.5 [self-loop] + # Transition-id = 2 p = 0.5 [0 -> 1] + #Transition-state 2: phone = 10 hmm-state = 0 forward-pdf = 0 self-loop-pdf = 51 + # Transition-id = 3 p = 0.5 [self-loop] + # Transition-id = 4 p = 0.5 [0 -> 1] + + # The following inline script processes that info about the transition model + # into the file $dir/phones_and_pdfs.txt, which has a line for each transition-id + # (starting from number 1), and the format of each line is + # + show-transitions $dir/fake_phones.txt $tree_dir/final.mdl | \ + perl -ane ' if(m/Transition-state.* phone = (\d+) pdf = (\d+)/) { $phone = $1; $forward_pdf = $2; $self_loop_pdf = $2; } + if(m/Transition-state.* phone = (\d+) .* forward-pdf = (\d+) self-loop-pdf = (\d+)/) { + $phone = $1; $forward_pdf = $2; $self_loop_pdf = $3; } + if(m/Transition-id/) { if (m/self-loop/) { print "$phone $self_loop_pdf\n"; } + else { print "$phone $forward_pdf\n" } } ' > $dir/phones_and_pdfs.txt + + + # The following command just separates the 'tacc' file into a similar format + # to $dir/phones_and_pdfs.txt, with one count per line, and a line per transition-id + # starting from number 1. We skip the first two fields which are "[ 0" (the 0 is + # for transition-id=0, since transition-ids are 1-based), and the last field which is "]". 
+ awk '{ for (n=3;n<NF;n++) print $n; }' <$dir/tacc >$dir/transition_counts.txt + + num_lines1=$(wc -l <$dir/phones_and_pdfs.txt) + num_lines2=$(wc -l <$dir/transition_counts.txt) + if [ $num_lines1 -ne $num_lines2 ]; then + echo "$0: mismatch in num-lines between phones_and_pdfs.txt and transition_counts.txt: $num_lines1 vs $num_lines2" + exit 1 + fi + + # after 'paste', the format of the data will be + # <phone-id> <pdf-id> <count> + # we add the count smoothing at this point. + paste $dir/phones_and_pdfs.txt $dir/transition_counts.txt | \ + awk -v s=$count_smoothing '{print $1, $2, (s+$3);}' > $dir/combined_info.txt + + if $remove_word_position_dependency; then + # map the phones to word-position-independent phones; you can see $dir/phones.txt + # to interpret the final output. + utils/apply_map.pl -f 1 $dir/phone_map.int <$dir/combined_info.txt > $dir/temp.txt + mv $dir/temp.txt $dir/combined_info.txt + fi + + awk 'BEGIN{num_phones=1;num_pdfs=1;} { phone=$1; pdf=$2; count=$3; pdf_count[pdf] += count; counts[pdf,phone] += count; + if (phone>num_phones) num_phones=phone; if (pdf>=num_pdfs) num_pdfs = pdf + 1; } + END{ print "[ "; for(phone=1;phone<=num_phones;phone++) { + for (pdf=0;pdf<num_pdfs;pdf++) printf("%.3g ", counts[pdf,phone] / pdf_count[pdf]); print ""; } print "]"; }' <$dir/combined_info.txt >$dir/transform.mat + +fi + + +if [ $stage -le 2 ]; then + + # note: --compression-method=3 is kTwoByteAuto: Each element is stored in two + # bytes as a uint16, with the representable range of values chosen + # automatically with the minimum and maximum elements of the matrix as its + # edges. + compress_opts="--compress=true --compression-method=3" + + if $use_xent_output; then + # This block uses the 'output-xent' output of the nnet. + + model="nnet3-copy '--edits-config=echo remove-output-nodes name=output; echo rename-node old-name=output-xent new-name=output|' $model_dir/final.mdl -|" + + $cmd $gpu_queue_opts JOB=1:$nj $dir/log/get_phone_post.JOB.log \ + nnet3-compute $gpu_opt $ivector_opts \ + --frame-subsampling-factor=$frame_subsampling_factor --apply-exp=true \ + "$model" "$feats" ark:- \| \ + transform-feats $dir/transform.mat ark:- ark:- \| \ + copy-feats $compress_opts ark:- ark,scp:$dir/phone_post.JOB.ark,$dir/phone_post.JOB.scp + else + # This block is when we are using the 'chain' output (recommended as the posteriors + # will be much more accurate). + $cmd $gpu_queue_opts JOB=1:$nj $dir/log/get_phone_post.JOB.log \ + nnet3-chain-compute-post $gpu_opt $ivector_opts --transform-mat=$dir/transform.mat \ + --frame-subsampling-factor=$frame_subsampling_factor \ + $model_dir/final.mdl $model_dir/den.fst "$feats" ark:- \| \ + copy-feats $compress_opts ark:- ark,scp:$dir/phone_post.JOB.ark,$dir/phone_post.JOB.scp + fi + + sleep 5 + # Make a single .scp file, for convenience. + for n in $(seq $nj); do cat $dir/phone_post.$n.scp; done > $dir/phone_post.scp + +fi diff --git a/egs/wsj/s5/steps/nnet3/chain/multilingual/combine_egs.sh b/egs/wsj/s5/steps/nnet3/chain/multilingual/combine_egs.sh index 76793e8fa25..410a8710b2f 100755 --- a/egs/wsj/s5/steps/nnet3/chain/multilingual/combine_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/multilingual/combine_egs.sh @@ -75,7 +75,7 @@ combine_scp_list= # read paramter from $egs_dir[0]/info and cmvn_opts # to write in multilingual egs_dir. 
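# A small usage sketch for the phone posteriors produced by get_phone_post.sh
# above (the directory name is hypothetical): each row of phone_post.* is one
# frame, and column j can be read as the posterior of the phone with index j+1
# in that script's $dir/phones.txt, so the matrices can be inspected as text
# with ordinary Kaldi I/O:
#
#   copy-feats scp:exp/chain/tdnn1a_sp_post_eval92/phone_post.scp ark,t:- | head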
-check_params="info/feat_dim info/ivector_dim info/left_context info/right_context cmvn_opts" +check_params="info/feat_dim info/ivector_dim info/left_context info/right_context info/left_context_initial info/right_context_final cmvn_opts" ivec_dim=`cat ${args[0]}/info/ivector_dim` if [ $ivec_dim -ne 0 ];then check_params="$check_params info/final.ie.id"; fi diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index acabf733c94..a832f57cd8f 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -224,8 +224,6 @@ def process_args(args): "and exist; or the {0}/configs directory should exist." "".format(args.dir)) - if args.transform_dir is None: - args.transform_dir = args.lat_dir # set the options corresponding to args.use_gpu run_opts = common_train_lib.RunOpts() if args.use_gpu in ["true", "false"]: @@ -384,7 +382,6 @@ def train(args, run_opts): cmvn_opts=args.cmvn_opts, online_ivector_dir=args.online_ivector_dir, frames_per_iter=args.frames_per_iter, - transform_dir=args.transform_dir, stage=args.egs_stage) if args.egs_dir is None: @@ -581,7 +578,7 @@ def train(args, run_opts): "{0}/final.mdl".format(args.dir)) chain_lib.compute_train_cv_probabilities( dir=args.dir, iter=num_iters, egs_dir=egs_dir, - l2_regularize=l2_regularize, xent_regularize=xent_regularize, + l2_regularize=args.l2_regularize, xent_regularize=args.xent_regularize, leaky_hmm_coefficient=args.leaky_hmm_coefficient, run_opts=run_opts, use_multitask_egs=use_multitask_egs) diff --git a/egs/wsj/s5/steps/nnet3/compute_output.sh b/egs/wsj/s5/steps/nnet3/compute_output.sh index 80ce2a120ff..da3cb704878 100755 --- a/egs/wsj/s5/steps/nnet3/compute_output.sh +++ b/egs/wsj/s5/steps/nnet3/compute_output.sh @@ -4,11 +4,11 @@ # 2016 Vimal Manohar # Apache 2.0. -# This script does forward propagation through a neural network. +# This script does forward propagation through a neural network. # Begin configuration section. stage=1 -nj=4 # number of jobs. +nj=4 # number of jobs. cmd=run.pl use_gpu=false frames_per_chunk=50 @@ -77,16 +77,11 @@ cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1; echo $nj > $dir/num_jobs ## Set up features. -if [ -f $srcdir/final.mat ]; then +if [ -f $srcdir/final.mat ]; then echo "$0: ERROR: lda feature type is no longer supported." && exit 1 fi feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" -if grep 'transform-feats --utt2spk' $srcdir/log/train.1.log >&/dev/null; then - echo "$0: **WARNING**: you seem to be using a neural net system trained with transforms," - echo " but this is no longer supported." -fi - if [ ! -z "$online_ivector_dir" ]; then ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index 37a67b41f94..5b8374a5a1d 100755 --- a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -3,14 +3,11 @@ # Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). # Apache 2.0. -# This script does decoding with a neural-net. If the neural net was built on -# top of fMLLR transforms from a conventional system, you should provide the -# --transform-dir option. +# This script does decoding with a neural-net. # Begin configuration section. stage=1 -transform_dir= # dir to find fMLLR transforms. 
-nj=4 # number of decoding jobs. If --transform-dir set, must match that number! +nj=4 # number of decoding jobs. acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the # regular scoring script works. @@ -45,8 +42,6 @@ if [ $# -ne 3 ]; then echo "--online-ivector-dir exp/nnet2_online/ivectors_test_eval92 \\" echo " exp/tri4b/graph_bg data/test_eval92_hires $dir/decode_bg_eval92" echo "main options (for others, see top of script file)" - echo " --transform-dir # directory of previous decoding" - echo " # where we can find transforms for SAT systems." echo " --config # config containing options" echo " --nj # number of parallel jobs" echo " --cmd # Command to run in parallel with" @@ -90,30 +85,6 @@ echo $nj > $dir/num_jobs echo "$0: feature type is raw" feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" -if [ ! -z "$transform_dir" ]; then - echo "$0: using transforms from $transform_dir" - [ ! -s $transform_dir/num_jobs ] && \ - echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; - nj_orig=$(cat $transform_dir/num_jobs) - - if [ ! -f $transform_dir/raw_trans.1 ]; then - echo "$0: expected $transform_dir/raw_trans.1 to exist (--transform-dir option)" - exit 1; - fi - if [ $nj -ne $nj_orig ]; then - # Copy the transforms into an archive with an index. - for n in $(seq $nj_orig); do cat $transform_dir/raw_trans.$n; done | \ - copy-feats ark:- ark,scp:$dir/raw_trans.ark,$dir/raw_trans.scp || exit 1; - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/raw_trans.scp ark:- ark:- |" - else - # number of jobs matches with alignment dir. - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |" - fi -elif grep 'transform-feats --utt2spk' $srcdir/log/train.1.log >&/dev/null; then - echo "$0: **WARNING**: you seem to be using a neural net system trained with transforms," - echo " but you are not providing the --transform-dir option in test time." -fi -## if [ ! -z "$online_ivector_dir" ]; then ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; diff --git a/egs/wsj/s5/steps/nnet3/decode_looped.sh b/egs/wsj/s5/steps/nnet3/decode_looped.sh index e83483cd828..f90dc0325d5 100755 --- a/egs/wsj/s5/steps/nnet3/decode_looped.sh +++ b/egs/wsj/s5/steps/nnet3/decode_looped.sh @@ -21,14 +21,11 @@ # that you trained with, . [note: if not specified during training, it defaults to # the same as the regular --extra-left-context -# This script does decoding with a neural-net. If the neural net was built on -# top of fMLLR transforms from a conventional system, you should provide the -# --transform-dir option. +# This script does decoding with a neural-net. # Begin configuration section. stage=1 -transform_dir= # dir to find fMLLR transforms. -nj=4 # number of decoding jobs. If --transform-dir set, must match that number! +nj=4 # number of decoding jobs. acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the # regular scoring script works. 
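# With --transform-dir gone, decoding a raw-feature nnet3 model with the decode
# scripts touched here reduces to something like the sketch below (paths follow
# the usage example above, with a made-up decode directory; the model directory
# is normally inferred from the parent of the decode directory).  For a 'chain'
# model one would additionally pass --acwt 1.0 --post-decode-acwt 10.0, the
# "scale acoustics by 10" setup described in the comments above:
#
#   steps/nnet3/decode.sh --nj 8 --cmd "$decode_cmd" \
#     --online-ivector-dir exp/nnet2_online/ivectors_test_eval92 \
#     exp/tri4b/graph_bg data/test_eval92_hires exp/nnet3/tdnn1a/decode_bg_eval92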
@@ -59,8 +56,6 @@ if [ $# -ne 3 ]; then echo "--online-ivector-dir exp/nnet2_online/ivectors_test_eval92 \\" echo " exp/tri4b/graph_bg data/test_eval92_hires $dir/decode_bg_eval92" echo "main options (for others, see top of script file)" - echo " --transform-dir # directory of previous decoding" - echo " # where we can find transforms for SAT systems." echo " --config # config containing options" echo " --nj # number of parallel jobs" echo " --cmd # Command to run in parallel with" @@ -98,30 +93,6 @@ echo "$0: feature type is raw" splice_opts=`cat $srcdir/splice_opts 2>/dev/null` feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" -if [ ! -z "$transform_dir" ]; then - echo "$0: using transforms from $transform_dir" - [ ! -s $transform_dir/num_jobs ] && \ - echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; - nj_orig=$(cat $transform_dir/num_jobs) - - if [ ! -f $transform_dir/raw_trans.1 ]; then - echo "$0: expected $transform_dir/raw_trans.1 to exist (--transform-dir option)" - exit 1; - fi - if [ $nj -ne $nj_orig ]; then - # Copy the transforms into an archive with an index. - for n in $(seq $nj_orig); do cat $transform_dir/raw_trans.$n; done | \ - copy-feats ark:- ark,scp:$dir/raw_trans.ark,$dir/raw_trans.scp || exit 1; - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/raw_trans.scp ark:- ark:- |" - else - # number of jobs matches with alignment dir. - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |" - fi -elif grep 'transform-feats --utt2spk' $srcdir/log/train.1.log >&/dev/null; then - echo "$0: **WARNING**: you seem to be using a neural net system trained with transforms," - echo " but you are not providing the --transform-dir option in test time." -fi -## if [ ! -z "$online_ivector_dir" ]; then ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; diff --git a/egs/wsj/s5/steps/nnet3/decode_score_fusion.sh b/egs/wsj/s5/steps/nnet3/decode_score_fusion.sh new file mode 100755 index 00000000000..2fcc4a1944d --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/decode_score_fusion.sh @@ -0,0 +1,262 @@ +#!/bin/bash + +# Copyright 2018 Tien-Hong Lo + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + + +# Script for system combination using output of the neural networks. +# This calls nnet3-compute, matrix-sum and latgen-faster-mapped to create a system combination. +set -euo pipefail +# begin configuration section. +cmd=run.pl + +# Neural Network +stage=0 +iter=final +nj=30 +output_name="output" +ivector_scale=1.0 +apply_exp=false # Apply exp i.e. 
write likelihoods instead of log-likelihoods +compress=false # Specifies whether the output should be compressed before + # dumping to disk +use_gpu=false +skip_diagnostics=false +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +online_ivector_dir= +frame_subsampling_factor=1 +frames_per_chunk=150 +average=true + +# Decode +beam=15.0 # prune the lattices prior to MBR decoding, for speed. +max_active=7000 +min_active=200 +acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. +post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the + # regular scoring script works. +lattice_beam=8.0 # Beam we use in lattice generation. +num_threads=1 # if >1, will use latgen-faster--map-parallel +min_lmwt=5 +max_lmwt=15 +parallel_opts="--num-threads 3" +scoring_opts= +minimize=false +skip_scoring=false + +word_determinize=false # If set to true, then output lattice does not retain + # alternate paths a sequence of words (with alternate pronunciations). + # Setting to true is the default in steps/nnet3/decode.sh. + # However, setting this to false + # is useful for generation w of semi-supervised training + # supervision and frame-level confidences. +write_compact=true # If set to false, then writes the lattice in non-compact format, + # retaining the acoustic scores on each arc. This is + # required to be false for LM rescoring undeterminized + # lattices (when --word-determinize is false) +#end configuration section. + +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + + +if [ $# -lt 5 ]; then + echo "Usage: $0 [options] [ ... ] " + echo "e.g.: local/socal/score_fusion.sh --nj 8 \\" + echo "--online-ivector-dir exp/nnet3/ivectors_test_eval92 \\" + echo " data/test_eval92_hires exp/nnet3/tdnn/graph exp/nnet3/tdnn/output exp/nnet3/tdnn1/output .. \\" + echo " exp/nnet3/tdnn_comb/decode_dev" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --iter # Iteration of model to decode; default is final." + exit 1; +fi + +echo "$0 $@" + +data=$1 +graphdir=$2 +dir=${@: -1} # last argument to the script +shift 2; +model_dirs=( $@ ) # read the remaining arguments into an array +unset model_dirs[${#model_dirs[@]}-1] # 'pop' the last argument which is odir +num_sys=${#model_dirs[@]} # number of systems to combine + +for f in $graphdir/words.txt $graphdir/phones/word_boundary.int ; do + [ ! -f $f ] && echo "$0: file $f does not exist" && exit 1; +done + +[ ! -z "$online_ivector_dir" ] && \ + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" + +if [ ! -z "$online_ivector_dir" ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +fi + +frame_subsampling_opt= +if [ $frame_subsampling_factor -ne 1 ]; then + # e.g. 
for 'chain' systems + frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor" +fi + +# convert $dir to absolute pathname +fdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD}` + +# Possibly use multi-threaded decoder +thread_string= +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" + +mkdir -p $dir/temp + +for i in `seq 0 $[num_sys-1]`; do + srcdir=${model_dirs[$i]} + + model=$srcdir/$iter.mdl + if [ ! -f $srcdir/$iter.mdl ]; then + echo "$0: Error: no such file $srcdir/$iter.raw. Trying $srcdir/$iter.mdl exit" && exit 1; + fi + + # check that they have the same tree + show-transitions $graphdir/phones.txt $model > $dir/temp/transition.${i}.txt + cmp_tree=`diff -q $dir/temp/transition.0.txt $dir/temp/transition.${i}.txt | awk '{print $5}'` + if [ ! -z $cmp_tree ]; then + echo "$0 tree must be the same." + exit 0; + fi + + # check that they have the same frame-subsampling-factor + if [ $frame_subsampling_factor -ne `cat $srcdir/frame_subsampling_factor` ]; then + echo "$0 frame_subsampling_factor must be the same." + exit 0; + fi + + for f in $data/feats.scp $model $extra_files; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; + done + + if [ ! -z "$output_name" ] && [ "$output_name" != "output" ]; then + echo "$0: Using output-name $output_name" + model="nnet3-copy --edits='remove-output-nodes name=output;rename-node old-name=$output_name new-name=output' $model - |" + fi + + ## Set up features. + if [ -f $srcdir/final.mat ]; then + echo "$0: Error: lda feature type is no longer supported." && exit 1 + fi + + sdata=$data/split$nj; + cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1; + + feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" + + if $apply_exp; then + output_wspecifier="ark:| copy-matrix --apply-exp ark:- ark:-" + else + output_wspecifier="ark:| copy-feats --compress=$compress ark:- ark:-" + fi + + gpu_opt="--use-gpu=no" + gpu_queue_opt= + + if $use_gpu; then + gpu_queue_opt="--gpu 1" + gpu_opt="--use-gpu=yes" + fi + + echo "$i $model"; + models[$i]="ark,s,cs:nnet3-compute $gpu_opt $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + '$model' '$feats' '$output_wspecifier' |" +done + +# remove tempdir +rm -rf $dir/temp + +# split data to nj +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + + +# Assume the nnet trained by +# the same tree and frame subsampling factor. +mkdir -p $dir/log + +if [ -f $model ]; then + echo "$0: $model exists, copy model to $dir/../" + cp $model $dir/../ +fi + +if [ -f $srcdir/frame_shift ]; then + cp $srcdir/frame_shift $dir/../ + echo "$0: $srcdir/frame_shift exists, copy $srcdir/frame_shift to $dir/../" +elif [ -f $srcdir/frame_subsampling_factor ]; then + cp $srcdir/frame_subsampling_factor $dir/../ + echo "$0: $srcdir/frame_subsampling_factor exists, copy $srcdir/frame_subsampling_factor to $dir/../" +fi + +lat_wspecifier="ark:|" +extra_opts= +if ! 
$write_compact; then + extra_opts="--determinize-lattice=false" + lat_wspecifier="ark:| lattice-determinize-phone-pruned --beam=$lattice_beam --acoustic-scale=$acwt --minimize=$minimize --word-determinize=$word_determinize --write-compact=false $model ark:- ark:- |" +fi + +if [ "$post_decode_acwt" == 1.0 ]; then + lat_wspecifier="$lat_wspecifier gzip -c >$dir/lat.JOB.gz" +else + lat_wspecifier="$lat_wspecifier lattice-scale --acoustic-scale=$post_decode_acwt --write-compact=$write_compact ark:- ark:- | gzip -c >$dir/lat.JOB.gz" +fi + + +if [ $stage -le 0 ]; then + $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \ + matrix-sum --average=$average "${models[@]}" ark:- \| \ + latgen-faster-mapped$thread_string --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ + --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ + --word-symbol-table=$graphdir/words.txt ${extra_opts} "$model" \ + $graphdir/HCLG.fst ark:- "$lat_wspecifier" +fi + +if [ $stage -le 1 ]; then + if ! $skip_diagnostics ; then + [ ! -z $iter ] && iter_opt="--iter $iter" + steps/diagnostic/analyze_lats.sh --cmd "$cmd" $iter_opt $graphdir $dir + fi +fi + +if ! $skip_scoring ; then + if [ $stage -le 2 ]; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + echo "score best paths" + [ "$iter" != "final" ] && iter_opt="--iter $iter" + scoring_opts="--min_lmwt $min_lmwt" + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + echo "score confidence and timing with sclite" + fi +fi + + +exit 0 + diff --git a/egs/wsj/s5/steps/nnet3/decode_semisup.sh b/egs/wsj/s5/steps/nnet3/decode_semisup.sh index b742835f588..25ce232b2c6 100755 --- a/egs/wsj/s5/steps/nnet3/decode_semisup.sh +++ b/egs/wsj/s5/steps/nnet3/decode_semisup.sh @@ -3,14 +3,11 @@ # Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). # Apache 2.0. -# This script does decoding with a neural-net. If the neural net was built on -# top of fMLLR transforms from a conventional system, you should provide the -# --transform-dir option. +# This script does decoding with a neural-net. # Begin configuration section. stage=1 -transform_dir= # dir to find fMLLR transforms. -nj=4 # number of decoding jobs. If --transform-dir set, must match that number! +nj=4 # number of decoding jobs. acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the # regular scoring script works. @@ -34,13 +31,13 @@ online_ivector_dir= minimize=false word_determinize=false # If set to true, then output lattice does not retain # alternate paths a sequence of words (with alternate pronunciations). - # Setting to true is the default in steps/nnet3/decode.sh. + # Setting to true is the default in steps/nnet3/decode.sh. # However, setting this to false # is useful for generation w of semi-supervised training # supervision and frame-level confidences. write_compact=true # If set to false, then writes the lattice in non-compact format, - # retaining the acoustic scores on each arc. This is - # required to be false for LM rescoring undeterminized + # retaining the acoustic scores on each arc. This is + # required to be false for LM rescoring undeterminized # lattices (when --word-determinize is false) # Useful for semi-supervised training with rescored lattices. # End configuration section. 
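# The essence of decode_score_fusion.sh above, as a single-job sketch: each
# system's per-frame (log-)likelihood matrices come from nnet3-compute, they are
# averaged with matrix-sum --average=true, and the averaged matrix is decoded
# once by latgen-faster-mapped; this is why the script insists that all systems
# share the same tree and frame-subsampling factor.  Here exp/sysA and exp/sysB
# are placeholder model directories and "..." stands for the i-vector, context
# and chunking options set up earlier in the script:
#
#   matrix-sum --average=true \
#     "ark:nnet3-compute ... exp/sysA/final.mdl '$feats' ark:- |" \
#     "ark:nnet3-compute ... exp/sysB/final.mdl '$feats' ark:- |" ark:- | \
#   latgen-faster-mapped --acoustic-scale=$acwt --beam=$beam --lattice-beam=$lattice_beam \
#     --word-symbol-table=$graphdir/words.txt "$model" $graphdir/HCLG.fst \
#     ark:- "ark:|gzip -c >$dir/lat.1.gz"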
@@ -101,30 +98,6 @@ echo $nj > $dir/num_jobs echo "$0: feature type is raw" feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" -if [ ! -z "$transform_dir" ]; then - echo "$0: using transforms from $transform_dir" - [ ! -s $transform_dir/num_jobs ] && \ - echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; - nj_orig=$(cat $transform_dir/num_jobs) - - if [ ! -f $transform_dir/raw_trans.1 ]; then - echo "$0: expected $transform_dir/raw_trans.1 to exist (--transform-dir option)" - exit 1; - fi - if [ $nj -ne $nj_orig ]; then - # Copy the transforms into an archive with an index. - for n in $(seq $nj_orig); do cat $transform_dir/raw_trans.$n; done | \ - copy-feats ark:- ark,scp:$dir/raw_trans.ark,$dir/raw_trans.scp || exit 1; - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/raw_trans.scp ark:- ark:- |" - else - # number of jobs matches with alignment dir. - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |" - fi -elif grep 'transform-feats --utt2spk' $srcdir/log/train.1.log >&/dev/null; then - echo "$0: **WARNING**: you seem to be using a neural net system trained with transforms," - echo " but you are not providing the --transform-dir option in test time." -fi -## if [ ! -z "$online_ivector_dir" ]; then ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; @@ -150,6 +123,9 @@ if [ -f $srcdir/frame_subsampling_factor ]; then frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" fi +# Copy the model as it is required when generating egs +cp $model $dir/ || exit 1 + if [ $stage -le 1 ]; then $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \ nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \ @@ -165,7 +141,6 @@ if [ $stage -le 1 ]; then $graphdir/HCLG.fst "$feats" "$lat_wspecifier" || exit 1; fi - if [ $stage -le 2 ]; then if ! $skip_diagnostics ; then [ ! 
-z $iter ] && iter_opt="--iter $iter" diff --git a/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py b/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py index 2a6499090e2..f8cd357fa3b 100755 --- a/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py +++ b/egs/wsj/s5/steps/nnet3/dot/nnet3_to_dot.py @@ -122,9 +122,9 @@ def ProcessAppendDescriptor(segment, parent_node_name, affix, edge_attributes = attr_string = '' if edge_attributes is not None: - if edge_attributes.has_key('label'): + if 'label' in edge_attributes: attr_string += " label={0} ".format(edge_attributes['label']) - if edge_attributes.has_key('style'): + if 'style' in edge_attributes: attr_string += ' style={0} '.format(edge_attributes['style']) dot_string = '{0} -> {1} [tailport=s]'.format(GetDotNodeName(desc_name)['node'], GetDotNodeName(parent_node_name)['node']) @@ -142,9 +142,9 @@ def ProcessRoundDescriptor(segment, parent_node_name, affix, edge_attributes = N label = 'Round ({0})'.format(segment['arguments'][1]) style = None if edge_attributes is not None: - if edge_attributes.has_key('label'): + if 'label' in edge_attributes: label = "{0} {1}".format(edge_attributes['label'], label) - if edge_attributes.has_key('style'): + if 'style' in edge_attributes: style = 'style={0}'.format(edge_attributes['style']) attr_string = 'label="{0}"'.format(label) @@ -164,9 +164,9 @@ def ProcessOffsetDescriptor(segment, parent_node_name, affix, edge_attributes = label = 'Offset ({0})'.format(segment['arguments'][1]) style = None if edge_attributes is not None: - if edge_attributes.has_key('label'): + if 'label' in edge_attributes: label = "{0} {1}".format(edge_attributes['label'], label) - if edge_attributes.has_key('style'): + if 'style' in edge_attributes: style = 'style={0}'.format(edge_attributes['style']) attr_string = 'label="{0}"'.format(label) @@ -204,9 +204,9 @@ def ProcessSumDescriptor(segment, parent_node_name, affix, edge_attributes = Non attr_string = '' if edge_attributes is not None: - if edge_attributes.has_key('label'): + if 'label' in edge_attributes: attr_string += " label={0} ".format(edge_attributes['label']) - if edge_attributes.has_key('style'): + if 'style' in edge_attributes: attr_string += ' style={0} '.format(edge_attributes['style']) dot_string = '{0} -> {1}'.format(GetDotNodeName(desc_name)['node'], GetDotNodeName(parent_node_name)['node']) @@ -221,9 +221,9 @@ def ProcessReplaceIndexDescriptor(segment, parent_node_name, affix, edge_attribu label = 'ReplaceIndex({0}, {1})'.format(segment['arguments'][1], segment['arguments'][2]) style = None if edge_attributes is not None: - if edge_attributes.has_key('label'): + if 'label' in edge_attributes: label = "{0} {1}".format(edge_attributes['label'], label) - if edge_attributes.has_key('style'): + if 'style' in edge_attributes: style = 'style={0}'.format(edge_attributes['style']) attr_string = 'label="{0}"'.format(label) @@ -324,7 +324,7 @@ def Nnet3ComponentToDot(component_config, component_attributes = None): attributes_to_print = set(component_attributes).intersection(component_config.keys()) # process the known fields for key in attributes_to_print: - if component_config.has_key(key): + if key in component_config: label += '{0} = {1}\\n'.format(key, component_config[key]) attr_string = '' @@ -370,7 +370,9 @@ def Nnet3ComponentNodeToDot(parsed_config): GetDotNodeName(parsed_config['name'])['node'])) return dot_graph -def GroupConfigs(configs, node_prefixes = []): +def GroupConfigs(configs, node_prefixes = None): + if node_prefixes is None: + node_prefixes = [] # we make the assumption that 
nodes belonging to the same sub-graph have a # commong prefix. grouped_configs = {} @@ -388,7 +390,9 @@ def GroupConfigs(configs, node_prefixes = []): return grouped_configs -def ParseConfigLines(lines, node_prefixes = [], component_attributes = None ): +def ParseConfigLines(lines, node_prefixes = None, component_attributes = None ): + if node_prefixes is None: + node_prefixes = [] config_lines = [] dot_graph=[] configs = [] diff --git a/egs/wsj/s5/steps/nnet3/get_degs.sh b/egs/wsj/s5/steps/nnet3/get_degs.sh index 44a7886fc3f..8098b59c4ad 100755 --- a/egs/wsj/s5/steps/nnet3/get_degs.sh +++ b/egs/wsj/s5/steps/nnet3/get_degs.sh @@ -13,7 +13,6 @@ cmd=run.pl max_copy_jobs=5 # Limit disk I/O # feature options -transform_dir= # If this is a SAT system, directory for transforms online_ivector_dir= # example splitting and context options @@ -116,8 +115,6 @@ dir=$5 extra_files= [ ! -z $online_ivector_dir ] && \ extra_files="$extra_files $online_ivector_dir/ivector_period $online_ivector_dir/ivector_online.scp" -[ ! -z $transform_dir ] && \ - extra_files="$extra_files $transform_dir/trans.1 $transform_dir/num_jobs" # Check some files. for f in $data/feats.scp $lang/L.fst $lang/phones/silence.csl $srcdir/${iter}.mdl $srcdir/tree \ @@ -147,28 +144,6 @@ feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdat cp $srcdir/{splice_opts,cmvn_opts} $dir 2>/dev/null || true -if [ ! -z "$transform_dir" ]; then - echo "$0: using transforms from $transform_dir" - [ ! -s $transform_dir/num_jobs ] && \ - echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; - nj_orig=$(cat $transform_dir/num_jobs) - - if [ ! -f $transform_dir/raw_trans.1 ]; then - echo "$0: expected $transform_dir/raw_trans.1 to exist (--transform-dir option)" - exit 1; - fi - if [ $nj -ne $nj_orig ]; then - # Copy the transforms into an archive with an index. - for n in $(seq $nj_orig); do cat $transform_dir/raw_trans.$n; done | \ - copy-feats ark:- ark,scp:$dir/raw_trans.ark,$dir/raw_trans.scp || exit 1; - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/raw_trans.scp ark:- ark:- |" - else - # number of jobs matches with alignment dir. - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |" - fi -fi - - ## set iVector options if [ ! -z "$online_ivector_dir" ]; then online_ivector_period=$(cat $online_ivector_dir/ivector_period) diff --git a/egs/wsj/s5/steps/nnet3/get_egs.sh b/egs/wsj/s5/steps/nnet3/get_egs.sh index c8cbf67c8b8..2888f77ed59 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs.sh @@ -42,8 +42,6 @@ samples_per_iter=400000 # this is the target number of egs in each archive of eg # a number that divides the number of samples in the # entire data. -transform_dir= # If supplied, overrides alidir as the place to find fMLLR transforms - stage=0 nj=6 # This should be set to the maximum number of jobs you are # comfortable to run in parallel; you can increase it if your disk @@ -119,8 +117,8 @@ if ! [ $num_utts -gt $[$num_utts_subset*4] ]; then fi # Get list of validation utterances. -awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \ - > $dir/valid_uttlist || exit 1; +awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset \ + > $dir/valid_uttlist if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. 
echo "File $data/utt2uniq exists, so augmenting valid_uttlist to" @@ -134,22 +132,10 @@ if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. fi awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \ - utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset_uttlist || exit 1; - -[ -z "$transform_dir" ] && transform_dir=$alidir + utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset > $dir/train_subset_uttlist echo "$0: creating egs. To ensure they are not deleted later you can do: touch $dir/.nodelete" -# because we'll need the features with a different number of jobs than $alidir, -# copy to ark,scp. -if [ -f $transform_dir/raw_trans.1 ]; then - echo "$0: using raw transforms from $transform_dir" - if [ $stage -le 0 ]; then - $cmd $dir/log/copy_transforms.log \ - copy-feats "ark:cat $transform_dir/raw_trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" - fi -fi - ## Set up features. echo "$0: feature type is raw" @@ -158,12 +144,6 @@ valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | a train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" echo $cmvn_opts >$dir/cmvn_opts # caution: the top-level nnet training script should copy this to its own dir now. -if [ -f $dir/trans.scp ]; then - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |" - valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" - train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" -fi - if [ ! -z "$online_ivector_dir" ]; then ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; echo $ivector_dim > $dir/info/ivector_dim @@ -426,9 +406,9 @@ if [ $stage -le 6 ]; then # there are some extra soft links that we should delete. for f in $dir/egs.*.*.ark; do rm $f; done fi - echo "$0: removing temporary alignments and transforms" + echo "$0: removing temporary alignments" # Ignore errors below because trans.* might not exist. 
- rm $dir/{ali,trans}.{ark,scp} 2>/dev/null + rm $dir/ali.{ark,scp} 2>/dev/null fi echo "$0: Finished preparing training examples" diff --git a/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh b/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh index 00ac2c7390c..d315ff925f4 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs_discriminative.sh @@ -42,7 +42,6 @@ stage=0 max_jobs_run=15 max_shuffle_jobs_run=15 -transform_dir= # If this is a SAT system, directory for transforms online_ivector_dir= cmvn_opts= # can be used for specifying CMVN options, if feature type is not lda (if lda, # it doesn't make sense to use different options than were used as input to the @@ -128,8 +127,6 @@ fi awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \ utils/shuffle_list.pl | head -$num_utts_subset > $dir/train_subset_uttlist || exit 1; -[ -z "$transform_dir" ] && transform_dir=$alidir - if [ $stage -le 1 ]; then nj_ali=$(cat $alidir/num_jobs) alis=$(for n in $(seq $nj_ali); do echo -n "$alidir/ali.$n.gz "; done) @@ -140,14 +137,6 @@ fi prior_ali_rspecifier="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $dir/ali.scp | ali-to-pdf $alidir/final.mdl scp:- ark:- |" -if [ -f $transform_dir/raw_trans.1 ]; then - echo "$0: using raw transforms from $transform_dir" - if [ $stage -le 0 ]; then - $cmd $dir/log/copy_transforms.log \ - copy-feats "ark:cat $transform_dir/raw_trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" - fi -fi - silphonelist=`cat $lang/phones/silence.csl` || exit 1; cp $alidir/tree $dir cp $lang/phones/silence.csl $dir/info/ @@ -158,23 +147,12 @@ awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlis utils/shuffle_list.pl | head -$num_priors_subset \ > $dir/priors_uttlist || exit 1; -## We don't support deltas here, only LDA or raw (mainly because deltas are less -## frequently used). -echo "$0: feature type is raw" - feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |" valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" priors_feats="ark,s,cs:utils/filter_scp.pl $dir/priors_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" echo $cmvn_opts > $dir/cmvn_opts -if [ -f $dir/trans.scp ]; then - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |" - valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp|' ark:- ark:- |" - train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp|' ark:- ark:- |" - priors_feats="$priors_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp|' ark:- ark:- |" -fi - if [ ! -z $online_ivector_dir ]; then ivector_period=$(cat $online_ivector_dir/ivector_period) ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1; @@ -433,9 +411,8 @@ if [ $stage -le 7 ]; then fi echo "$0: removing temporary lattices" rm $dir/lat.* - echo "$0: removing temporary alignments and transforms" - # Ignore errors below because trans.* might not exist. 
- rm $dir/{ali,trans}.{ark,scp} 2>/dev/null + echo "$0: removing temporary alignments" + rm $dir/ali.{ark,scp} 2>/dev/null fi wait diff --git a/egs/wsj/s5/steps/nnet3/get_egs_targets.sh b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh index f4179fb28b4..2e368283ed4 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs_targets.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh @@ -41,7 +41,7 @@ compress=true # set this to false to disable compression (e.g. if you want to # results are affected). num_utts_subset=300 # number of utterances in validation and training # subsets used for shrinkage and diagnostics. -num_utts_subset_valid= # number of utterances in validation +num_utts_subset_valid= # number of utterances in validation # subsets used for shrinkage and diagnostics # if provided, overrides num-utts-subset num_utts_subset_train= # number of utterances in training @@ -56,8 +56,6 @@ samples_per_iter=400000 # this is the target number of egs in each archive of eg # a number that divides the number of samples in the # entire data. -transform_dir= - stage=0 nj=6 # This should be set to the maximum number of jobs you are # comfortable to run in parallel; you can increase it if your disk @@ -149,14 +147,6 @@ fi awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \ utils/shuffle_list.pl | head -$num_utts_subset_train | sort > $dir/train_subset_uttlist || exit 1; -if [ -f $transform_dir/raw_trans.1 ]; then - echo "$0: using raw transforms from $transform_dir" - if [ $stage -le 0 ]; then - $cmd $dir/log/copy_transforms.log \ - copy-feats "ark:cat $transform_dir/raw_trans.* |" "ark,scp:$dir/trans.ark,$dir/trans.scp" - fi -fi - ## Set up features. echo "$0: feature type is raw" @@ -165,12 +155,6 @@ valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | a train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" echo $cmvn_opts >$dir/cmvn_opts # caution: the top-level nnet training script should copy this to its own dir now. -if [ -f $dir/trans.scp ]; then - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/trans.scp ark:- ark:- |" - valid_feats="$valid_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" - train_subset_feats="$train_subset_feats transform-feats --utt2spk=ark:$data/utt2spk scp:$dir/trans.scp ark:- ark:- |" -fi - if [ ! -z "$online_ivector_dir" ]; then steps/nnet2/get_ivector_id.sh $online_ivector_dir > $dir/info/final.ie.id || exit 1 ivector_dim=$(feat-to-dim scp:$online_ivector_dir/ivector_online.scp -) || exit 1 @@ -291,7 +275,7 @@ case $target_type in "sparse") get_egs_program="nnet3-get-egs --num-pdfs=$num_targets" targets="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $targets_scp_split | ali-to-post scp:- ark:- |" - valid_targets="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $targets_scp | ali-to-post scp:- ark:- |" + valid_targets="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $targets_scp | ali-to-post scp:- ark:- |" train_subset_targets="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $targets_scp | ali-to-post scp:- ark:- |" ;; default) @@ -455,8 +439,7 @@ if [ $stage -le 6 ]; then for f in $dir/egs.*.*.ark; do rm $f; done fi echo "$0: removing temporary stuff" - # Ignore errors below because trans.* might not exist. 
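The get_egs_targets.sh hunk above distinguishes the "sparse" target type, where frame-level integer labels are turned into one-hot posteriors by ali-to-post and consumed by nnet3-get-egs --num-pdfs, from the dense "default" type, where the targets are already matrices. Per utterance, the sparse conversion amounts to the following (a minimal sketch of what ali-to-post does, not a reimplementation of it):

def ali_to_post(alignment):
    # alignment: list of integer labels, one per frame.
    # Each frame becomes a single (index, weight=1.0) posterior entry.
    return [[(int(label), 1.0)] for label in alignment]

# ali_to_post([3, 3, 7]) -> [[(3, 1.0)], [(3, 1.0)], [(7, 1.0)]]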
- rm -f $dir/trans.{ark,scp} $dir/targets.*.scp 2>/dev/null + rm -f $dir/targets.*.scp 2>/dev/null fi echo "$0: Finished preparing training examples" diff --git a/egs/wsj/s5/steps/nnet3/get_saturation.pl b/egs/wsj/s5/steps/nnet3/get_saturation.pl index 3d5ec5c2661..ed18fc1c399 100755 --- a/egs/wsj/s5/steps/nnet3/get_saturation.pl +++ b/egs/wsj/s5/steps/nnet3/get_saturation.pl @@ -35,7 +35,7 @@ # deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 # 95,98,99,100)=(0.0001,0.003,0.004,0.03 0.12,0.18,0.22,0.24,0.25 # 0.25,0.25,0.25,0.25), mean=0.198, stddev=0.0591] - if (m/deriv-avg=.+mean=([^,]+),/) { + if (m/deriv-avg=[^m]+mean=([^,]+),/) { $num_nonlinearities += 1; my $this_saturation = 1.0 - ($1 / 0.25); $total_saturation += $this_saturation; @@ -43,7 +43,7 @@ print STDERR "$0: could not make sense of line (no deriv-avg?): $_"; } } elsif (m/type=TanhComponent/) { - if (m/deriv-avg=.+mean=([^,]+),/) { + if (m/deriv-avg=[^m]+mean=([^,]+),/) { $num_nonlinearities += 1; my $this_saturation = 1.0 - ($1 / 1.0); $total_saturation += $this_saturation; diff --git a/egs/wsj/s5/steps/nnet3/make_bottleneck_features.sh b/egs/wsj/s5/steps/nnet3/make_bottleneck_features.sh index 09f4263918a..d9c04784406 100755 --- a/egs/wsj/s5/steps/nnet3/make_bottleneck_features.sh +++ b/egs/wsj/s5/steps/nnet3/make_bottleneck_features.sh @@ -3,6 +3,10 @@ # Copyright 2016 Pegah Ghahremani # This script dumps bottleneck feature for model trained using nnet3. +# CAUTION! This script isn't very suitable for dumping features from recurrent +# architectures such as LSTMs, because it doesn't support setting the chunk size +# and left and right context. (Those would have to be passed into nnet3-compute). +# See also chain/get_phone_post.sh. # Begin configuration section. stage=1 @@ -21,7 +25,7 @@ echo "$0 $@" # Print the command line for logging if [[ ( $# -lt 4 ) || ( $# -gt 6 ) ]]; then echo "usage: steps/nnet3/make_bottleneck_features.sh [ [] ]" echo "e.g.: steps/nnet3/make_bottleneck_features.sh tdnn_bn.renorm data/train data/train_bnf exp/nnet3/tdnn_bnf exp_bnf/dump_bnf bnf" - echo "Note: dafaults to /log and defaults to" + echo "Note: defaults to /log and defaults to" echo " /data" echo "main options (for others, see top of script file)" echo " --config # config containing options" @@ -49,8 +53,12 @@ fi cmvn_opts=`cat $nnetdir/cmvn_opts`; bnf_nnet=$nnetdir/final.raw if [ ! -f $bnf_nnet ] ; then - echo "$0: No such file $bnf_nnet"; - exit 1; + if [ ! -f $nnetdir/final.mdl ]; then + echo "$0: No such file $bnf_nnet or $nnetdir/final.mdl"; + exit 1; + else + bnf_nnet=$nnetdir/final.mdl + fi fi if $use_gpu; then @@ -77,6 +85,7 @@ mkdir -p $bnf_data mkdir -p $bnfdir echo $nj > $nnetdir/num_jobs +[ ! -f $data/feats.scp ] && echo >&2 "The file $data/feats.scp does not exist!" && exit 1; [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; use_ivector=false @@ -89,10 +98,10 @@ feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdat ivector_feats="scp:utils/filter_scp.pl $sdata/JOB/utt2spk $ivector_dir/ivector_online.scp |" if [ $stage -le 1 ]; then - echo "$0: Generating bottleneck features using $bnf_nnet model as output of " + echo "$0: Generating bottleneck (BNF) features using $bnf_nnet model as output of " echo " component-node with name $bnf_name." 
echo "output-node name=output input=$bnf_name" > $bnf_data/output.config - modified_bnf_nnet="nnet3-copy --edits='remove-output-nodes name=output' $bnf_nnet - | nnet3-copy --nnet-config=$bnf_data/output.config - - |" + modified_bnf_nnet="nnet3-copy --nnet-config=$bnf_data/output.config $bnf_nnet - |" ivector_opts= if $use_ivector; then ivector_period=$(cat $ivector_dir/ivector_period) || exit 1; @@ -107,7 +116,7 @@ fi N0=$(cat $data/feats.scp | wc -l) N1=$(cat $bnfdir/raw_bnfeat_$name.*.scp | wc -l) if [[ "$N0" != "$N1" ]]; then - echo "$0: Error happens when generating BNF for $name (Original:$N0 BNF:$N1)" + echo "$0: Error generating BNF features for $name (original:$N0 utterances, BNF:$N1 utterances)" exit 1; fi @@ -121,6 +130,6 @@ done echo "$0: computing CMVN stats." steps/compute_cmvn_stats.sh $bnf_data -echo "$0: done making BNF feats.scp." +echo "$0: done making BNF features." exit 0; diff --git a/egs/wsj/s5/steps/nnet3/make_denlats.sh b/egs/wsj/s5/steps/nnet3/make_denlats.sh index f2bb4df712d..36da179bbaf 100755 --- a/egs/wsj/s5/steps/nnet3/make_denlats.sh +++ b/egs/wsj/s5/steps/nnet3/make_denlats.sh @@ -21,7 +21,6 @@ self_loop_scale=0.1 acwt=0.1 max_active=5000 min_active=200 -transform_dir= max_mem=20000000 # This will stop the processes getting too large. # This is in bytes, but not "real" bytes-- you have to multiply # by something like 5 or 10 to get real bytes (not sure why so large) @@ -47,8 +46,6 @@ num_threads=1 # Fixed to 1 for now if [ $# != 4 ]; then echo "Usage: steps/nnet3/make_denlats.sh [options] " echo " e.g.: steps/nnet3/make_denlats.sh data/train data/lang exp/nnet4 exp/nnet4_denlats" - echo "Works for (delta|lda) features, and (with --transform-dir option) such features" - echo " plus transforms." echo "" echo "Main options (for others, see top of script file)" echo " --config # config containing options" @@ -57,7 +54,6 @@ if [ $# != 4 ]; then echo " --sub-split # e.g. 40; use this for " echo " # large databases so your jobs will be smaller and" echo " # will (individually) finish reasonably soon." - echo " --transform-dir # directory to find fMLLR transforms." echo " --num-threads # number of threads per decoding job" exit 1; fi @@ -115,28 +111,6 @@ echo "$0: feature type is raw" feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" -if [ ! -z "$transform_dir" ]; then - echo "$0: using transforms from $transform_dir" - [ ! -s $transform_dir/num_jobs ] && \ - echo "$0: expected $transform_dir/num_jobs to contain the number of jobs." && exit 1; - nj_orig=$(cat $transform_dir/num_jobs) - - if [ ! -f $transform_dir/raw_trans.1 ]; then - echo "$0: expected $transform_dir/raw_trans.1 to exist (--transform-dir option)" - exit 1; - fi - if [ $nj -ne $nj_orig ]; then - # Copy the transforms into an archive with an index. - for n in $(seq $nj_orig); do cat $transform_dir/raw_trans.$n; done | \ - copy-feats ark:- ark,scp:$dir/raw_trans.ark,$dir/raw_trans.scp || exit 1; - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk scp:$dir/raw_trans.scp ark:- ark:- |" - else - # number of jobs matches with alignment dir. - feats="$feats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$transform_dir/raw_trans.JOB ark:- ark:- |" - fi -fi - - # if this job is interrupted by the user, we want any background jobs to be # killed too. 
cleanup() { @@ -195,7 +169,7 @@ else split_data.sh --per-utt $sdata/$n $sub_split || exit 1; mkdir -p $dir/log/$n mkdir -p $dir/part - feats_subset=`echo $feats | sed "s/trans.JOB/trans.$n/g" | sed s:JOB/:$n/split${sub_split}utt/JOB/:g` + feats_subset=`echo $feats | sed s:JOB/:$n/split${sub_split}utt/JOB/:g` $cmd --num-threads $num_threads JOB=1:$sub_split $dir/log/$n/decode_den.JOB.log \ nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \ diff --git a/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh b/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh index e1aeb0b70d6..cdf55ea81d3 100755 --- a/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh +++ b/egs/wsj/s5/steps/nnet3/multilingual/combine_egs.sh @@ -71,7 +71,7 @@ combine_scp_list= # read paramter from $egs_dir[0]/info and cmvn_opts # to write in multilingual egs_dir. -check_params="info/feat_dim info/ivector_dim info/left_context info/right_context cmvn_opts" +check_params="info/feat_dim info/ivector_dim info/left_context info/right_context info/left_context_initial info/right_context_final cmvn_opts" ivec_dim=`cat ${args[0]}/info/ivector_dim` if [ $ivec_dim -ne 0 ];then check_params="$check_params info/final.ie.id"; fi diff --git a/egs/wsj/s5/steps/nnet3/remove_egs.sh b/egs/wsj/s5/steps/nnet3/remove_egs.sh new file mode 120000 index 00000000000..44cd36cbc60 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/remove_egs.sh @@ -0,0 +1 @@ +../nnet2/remove_egs.sh \ No newline at end of file diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index dd1c97b350d..0c881b4dbdf 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -57,6 +57,14 @@ def get_args(): help="Number of output labels per example") # trainer options + parser.add_argument("--trainer.input-model", type=str, + dest='input_model', default=None, + action=common_lib.NullstrToNoneAction, + help="""If specified, this model is used as initial + raw model (0.raw in the script) instead of initializing + the model from xconfig. Configs dir is not expected to + exist and left/right context is computed from this + model.""") parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', default=20000, help="Number of samples for computing priors") @@ -107,14 +115,14 @@ def process_args(args): if not common_train_lib.validate_minibatch_size_str(args.minibatch_size): raise Exception("--trainer.rnn.num-chunk-per-minibatch has an invalid value") - if (not os.path.exists(args.dir) - or not os.path.exists(args.dir+"/configs")): - raise Exception("This scripts expects {0} to exist and have a configs " - "directory which is the output of " - "make_configs.py script") - - if args.transform_dir is None: - args.transform_dir = args.ali_dir + if (not os.path.exists(args.dir)): + raise Exception("This script expects --dir={0} to exist.") + if (not os.path.exists(args.dir+"/configs") and + (args.input_model is None or not os.path.exists(args.input_model))): + raise Exception("Either --trainer.input-model option should be supplied, " + "and exist; or the {0}/configs directory should exist." 
+ "{0}/configs is the output of make_configs.py" + "".format(args.dir)) # set the options corresponding to args.use_gpu run_opts = common_train_lib.RunOpts() @@ -187,10 +195,15 @@ def train(args, run_opts): with open('{0}/num_jobs'.format(args.dir), 'w') as f: f.write(str(num_jobs)) - config_dir = '{0}/configs'.format(args.dir) - var_file = '{0}/vars'.format(config_dir) + if args.input_model is None: + config_dir = '{0}/configs'.format(args.dir) + var_file = '{0}/vars'.format(config_dir) - variables = common_train_lib.parse_generic_config_vars_file(var_file) + variables = common_train_lib.parse_generic_config_vars_file(var_file) + else: + # If args.input_model is specified, the model left and right contexts + # are computed using input_model. + variables = common_train_lib.get_input_model_info(args.input_model) # Set some variables. try: @@ -208,7 +221,8 @@ def train(args, run_opts): # we do this as it's a convenient way to get the stats for the 'lda-like' # transform. - if (args.stage <= -5) and os.path.exists(args.dir+"/configs/init.config"): + if (args.stage <= -5) and os.path.exists(args.dir+"/configs/init.config") and \ + (args.input_model is None): logger.info("Initializing a basic network for estimating " "preconditioning matrix") common_lib.execute_command( @@ -234,7 +248,6 @@ def train(args, run_opts): cmvn_opts=args.cmvn_opts, online_ivector_dir=args.online_ivector_dir, samples_per_iter=args.samples_per_iter, - transform_dir=args.transform_dir, stage=args.egs_stage) if args.egs_dir is None: @@ -257,7 +270,7 @@ def train(args, run_opts): # use during decoding common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir) - if args.stage <= -3 and os.path.exists(args.dir+"/configs/init.config"): + if args.stage <= -3 and os.path.exists(args.dir+"/configs/init.config") and (args.input_model is None): logger.info('Computing the preconditioning matrix for input features') train_lib.common.compute_preconditioning_matrix( @@ -265,7 +278,7 @@ def train(args, run_opts): max_lda_jobs=args.max_lda_jobs, rand_prune=args.rand_prune) - if args.stage <= -2: + if args.stage <= -2 and (args.input_model is None): logger.info("Computing initial vector for FixedScaleComponent before" " softmax, using priors^{prior_scale} and rescaling to" " average 1".format( @@ -278,7 +291,8 @@ def train(args, run_opts): if args.stage <= -1: logger.info("Preparing the initial acoustic model.") train_lib.acoustic_model.prepare_initial_acoustic_model( - args.dir, args.ali_dir, run_opts) + args.dir, args.ali_dir, run_opts, + input_model=args.input_model) # set num_iters so that as close as possible, we process the data # $num_epochs times, i.e. 
$num_iters*$avg_num_jobs) == @@ -324,6 +338,19 @@ def train(args, run_opts): "shrink-value={1}".format(args.proportional_shrink, shrinkage_value)) + percent = num_archives_processed * 100.0 / num_archives_to_process + epoch = (num_archives_processed * args.num_epochs + / num_archives_to_process) + shrink_info_str = '' + if shrinkage_value != 1.0: + shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value) + logger.info("Iter: {0}/{1} " + "Epoch: {2:0.2f}/{3:0.1f} ({4:0.1f}% complete) " + "lr: {5:0.6f} {6}".format(iter, num_iters - 1, + epoch, args.num_epochs, + percent, + lrate, shrink_info_str)) + train_lib.common.train_one_iteration( dir=args.dir, iter=iter, diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 0e787b0b647..34214169d5d 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -60,6 +60,14 @@ def get_args(): help="Image augmentation options") # trainer options + parser.add_argument("--trainer.input-model", type=str, + dest='input_model', default=None, + action=common_lib.NullstrToNoneAction, + help="""If specified, this model is used as initial + raw model (0.raw in the script) instead of initializing + the model from xconfig. Configs dir is not expected to + exist and left/right context is computed from this + model.""") parser.add_argument("--trainer.prior-subset-size", type=int, dest='prior_subset_size', default=20000, help="Number of samples for computing priors") @@ -118,11 +126,14 @@ def process_args(args): if not common_train_lib.validate_minibatch_size_str(args.minibatch_size): raise Exception("--trainer.optimization.minibatch-size has an invalid value") - if (not os.path.exists(args.dir) - or not os.path.exists(args.dir+"/configs")): - raise Exception("This scripts expects {0} to exist and have a configs " - "directory which is the output of " - "make_configs.py script") + if (not os.path.exists(args.dir)): + raise Exception("This script expects --dir={0} to exist.") + if (not os.path.exists(args.dir+"/configs") and + (args.input_model is None or not os.path.exists(args.input_model))): + raise Exception("Either --trainer.input-model option should be supplied, " + "and exist; or the {0}/configs directory should exist." + "{0}/configs is the output of make_configs.py" + "".format(args.dir)) # set the options corresponding to args.use_gpu run_opts = common_train_lib.RunOpts() @@ -185,7 +196,15 @@ def train(args, run_opts): config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) - variables = common_train_lib.parse_generic_config_vars_file(var_file) + if args.input_model is None: + config_dir = '{0}/configs'.format(args.dir) + var_file = '{0}/vars'.format(config_dir) + + variables = common_train_lib.parse_generic_config_vars_file(var_file) + else: + # If args.input_model is specified, the model left and right contexts + # are computed using input_model. + variables = common_train_lib.get_input_model_info(args.input_model) # Set some variables. try: @@ -204,7 +223,8 @@ def train(args, run_opts): # matrix. This first config just does any initial splicing that we do; # we do this as it's a convenient way to get the stats for the 'lda-like' # transform. 
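The per-iteration progress line added to train_dnn.py above (and repeated in the other train_*.py scripts below) boils down to two ratios over the archive counts plus optional shrinkage info. A small self-contained version for reference (an illustrative sketch, not the scripts' exact code):

def progress_line(it, num_iters, num_archives_processed, num_archives_to_process,
                  num_epochs, lrate, shrinkage_value=1.0):
    percent = num_archives_processed * 100.0 / num_archives_to_process
    epoch = num_archives_processed * num_epochs / float(num_archives_to_process)
    shrink_info = ('shrink: {0:0.5f}'.format(shrinkage_value)
                   if shrinkage_value != 1.0 else '')
    return ('Iter: {0}/{1} Epoch: {2:0.2f}/{3:0.1f} ({4:0.1f}% complete) '
            'lr: {5:0.6f} {6}'.format(it, num_iters - 1, epoch, num_epochs,
                                      percent, lrate, shrink_info))

# progress_line(10, 100, 40, 400, 4, 0.0015) returns
# 'Iter: 10/99 Epoch: 0.40/4.0 (10.0% complete) lr: 0.001500 '
# (the shrink field is filled in only when shrinkage_value != 1.0).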
- if (args.stage <= -5) and os.path.exists(args.dir+"/configs/init.config"): + if (args.stage <= -4) and os.path.exists(args.dir+"/configs/init.config") and \ + (args.input_model is None): logger.info("Initializing the network for computing the LDA stats") common_lib.execute_command( """{command} {dir}/log/nnet_init.log \ @@ -213,7 +233,7 @@ def train(args, run_opts): dir=args.dir)) default_egs_dir = '{0}/egs'.format(args.dir) - if (args.stage <= -4) and args.egs_dir is None: + if (args.stage <= -3) and args.egs_dir is None: if args.targets_scp is None or args.feat_dir is None: raise Exception("If you don't supply the --egs-dir option, the " "--targets-scp and --feat-dir options are required.") @@ -250,7 +270,6 @@ def train(args, run_opts): cmvn_opts=args.cmvn_opts, online_ivector_dir=args.online_ivector_dir, samples_per_iter=args.samples_per_iter, - transform_dir=args.transform_dir, stage=args.egs_stage, target_type=target_type, num_targets=num_targets) @@ -275,7 +294,8 @@ def train(args, run_opts): # use during decoding common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir) - if args.stage <= -3 and os.path.exists(args.dir+"/configs/init.config"): + if args.stage <= -2 and os.path.exists(args.dir+"/configs/init.config") and \ + (args.input_model is None): logger.info('Computing the preconditioning matrix for input features') train_lib.common.compute_preconditioning_matrix( @@ -285,7 +305,7 @@ def train(args, run_opts): if args.stage <= -1: logger.info("Preparing the initial network.") - common_train_lib.prepare_initial_network(args.dir, run_opts) + common_train_lib.prepare_initial_network(args.dir, run_opts, args.srand, args.input_model) # set num_iters so that as close as possible, we process the data # $num_epochs times, i.e. $num_iters*$avg_num_jobs) == @@ -347,6 +367,19 @@ def train(args, run_opts): "shrink-value={1}".format(args.proportional_shrink, shrinkage_value)) + percent = num_archives_processed * 100.0 / num_archives_to_process + epoch = (num_archives_processed * args.num_epochs + / num_archives_to_process) + shrink_info_str = '' + if shrinkage_value != 1.0: + shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value) + logger.info("Iter: {0}/{1} " + "Epoch: {2:0.2f}/{3:0.1f} ({4:0.1f}% complete) " + "lr: {5:0.6f} {6}".format(iter, num_iters - 1, + epoch, args.num_epochs, + percent, + lrate, shrink_info_str)) + train_lib.common.train_one_iteration( dir=args.dir, iter=iter, diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index bd94fb7cb94..e797c86b323 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -68,6 +68,14 @@ def get_args(): is the 'principal' chunk-width, used preferentially""") # trainer options + parser.add_argument("--trainer.input-model", type=str, + dest='input_model', default=None, + action=common_lib.NullstrToNoneAction, + help="""If specified, this model is used as initial + raw model (0.raw in the script) instead of initializing + the model from xconfig. 
Configs dir is not expected to + exist and left/right context is computed from this + model.""") parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', default=20000, help="""This is really the number of egs in each @@ -171,11 +179,14 @@ def process_args(args): if args.chunk_right_context < 0: raise Exception("--egs.chunk-right-context should be non-negative") - if (not os.path.exists(args.dir) - or not os.path.exists(args.dir+"/configs")): - raise Exception("This scripts expects {0} to exist and have a configs " - "directory which is the output of " - "make_configs.py script") + if (not os.path.exists(args.dir)): + raise Exception("This script expects --dir={0} to exist.") + if (not os.path.exists(args.dir+"/configs") and + (args.input_model is None or not os.path.exists(args.input_model))): + raise Exception("Either --trainer.input-model option should be supplied, " + "and exist; or the {0}/configs directory should exist." + "{0}/configs is the output of make_configs.py" + "".format(args.dir)) # set the options corresponding to args.use_gpu run_opts = common_train_lib.RunOpts() @@ -233,11 +244,15 @@ def train(args, run_opts): ivector_dim = common_lib.get_ivector_dim(args.online_ivector_dir) ivector_id = common_lib.get_ivector_extractor_id(args.online_ivector_dir) + if args.input_model is None: + config_dir = '{0}/configs'.format(args.dir) + var_file = '{0}/vars'.format(config_dir) - config_dir = '{0}/configs'.format(args.dir) - var_file = '{0}/vars'.format(config_dir) - - variables = common_train_lib.parse_generic_config_vars_file(var_file) + variables = common_train_lib.parse_generic_config_vars_file(var_file) + else: + # If args.input_model is specified, the model left and right contexts + # are computed using input_model. + variables = common_train_lib.get_input_model_info(args.input_model) # Set some variables. try: @@ -259,7 +274,8 @@ def train(args, run_opts): # we do this as it's a convenient way to get the stats for the 'lda-like' # transform. - if (args.stage <= -4) and os.path.exists(args.dir+"/configs/init.config"): + if (args.stage <= -4) and os.path.exists(args.dir+"/configs/init.config") and \ + (args.input_model is None): logger.info("Initializing the network for computing the LDA stats") common_lib.execute_command( """{command} {dir}/log/nnet_init.log \ @@ -304,7 +320,6 @@ def train(args, run_opts): cmvn_opts=args.cmvn_opts, online_ivector_dir=args.online_ivector_dir, samples_per_iter=args.samples_per_iter, - transform_dir=args.transform_dir, stage=args.egs_stage, target_type=target_type, num_targets=num_targets) @@ -334,7 +349,8 @@ def train(args, run_opts): # use during decoding common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir) - if args.stage <= -2 and os.path.exists(args.dir+"/configs/init.config"): + if args.stage <= -2 and os.path.exists(args.dir+"/configs/init.config") and \ + (args.input_model is None): logger.info('Computing the preconditioning matrix for input features') train_lib.common.compute_preconditioning_matrix( @@ -344,7 +360,7 @@ def train(args, run_opts): if args.stage <= -1: logger.info("Preparing the initial network.") - common_train_lib.prepare_initial_network(args.dir, run_opts) + common_train_lib.prepare_initial_network(args.dir, run_opts, args.srand, args.input_model) # set num_iters so that as close as possible, we process the data # $num_epochs times, i.e. 
$num_iters*$avg_num_jobs) == @@ -423,6 +439,19 @@ def train(args, run_opts): get_raw_nnet_from_am=False) else shrinkage_value) + percent = num_archives_processed * 100.0 / num_archives_to_process + epoch = (num_archives_processed * args.num_epochs + / num_archives_to_process) + shrink_info_str = '' + if shrinkage_value != 1.0: + shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value) + logger.info("Iter: {0}/{1} " + "Epoch: {2:0.2f}/{3:0.1f} ({4:0.1f}% complete) " + "lr: {5:0.6f} {6}".format(iter, num_iters - 1, + epoch, args.num_epochs, + percent, + lrate, shrink_info_str)) + train_lib.common.train_one_iteration( dir=args.dir, iter=iter, diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 83a1da8eca1..25e7dced19b 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -67,6 +67,14 @@ def get_args(): should halve --trainer.samples-per-iter. May be a comma-separated list of alternatives: first width is the 'principal' chunk-width, used preferentially""") + parser.add_argument("--trainer.input-model", type=str, + dest='input_model', default=None, + action=common_lib.NullstrToNoneAction, + help="""If specified, this model is used as initial + raw model (0.raw in the script) instead of initializing + the model from xconfig. Configs dir is not expected to + exist and left/right context is computed from this + model.""") parser.add_argument("--trainer.samples-per-iter", type=int, dest='samples_per_iter', default=20000, help="""This is really the number of egs in each @@ -162,14 +170,15 @@ def process_args(args): if args.chunk_right_context < 0: raise Exception("--egs.chunk-right-context should be non-negative") - if (not os.path.exists(args.dir) - or not os.path.exists(args.dir+"/configs")): - raise Exception("This scripts expects {0} to exist and have a configs " - "directory which is the output of " - "make_configs.py script") + if (not os.path.exists(args.dir)): + raise Exception("This script expects --dir={0} to exist.") - if args.transform_dir is None: - args.transform_dir = args.ali_dir + if (not os.path.exists(args.dir+"/configs") and + (args.input_model is None or not os.path.exists(args.input_model))): + raise Exception("Either --trainer.input-model option should be supplied, " + "and exist; or the {0}/configs directory should exist." + "{0}/configs is the output of make_configs.py" + "".format(args.dir)) # set the options corresponding to args.use_gpu run_opts = common_train_lib.RunOpts() @@ -244,7 +253,15 @@ def train(args, run_opts): config_dir = '{0}/configs'.format(args.dir) var_file = '{0}/vars'.format(config_dir) - variables = common_train_lib.parse_generic_config_vars_file(var_file) + if args.input_model is None: + config_dir = '{0}/configs'.format(args.dir) + var_file = '{0}/vars'.format(config_dir) + + variables = common_train_lib.parse_generic_config_vars_file(var_file) + else: + # If args.input_model is specified, the model left and right contexts + # are computed using input_model. + variables = common_train_lib.get_input_model_info(args.input_model) # Set some variables. try: @@ -266,7 +283,7 @@ def train(args, run_opts): # we do this as it's a convenient way to get the stats for the 'lda-like' # transform. 
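Each of the train_*.py scripts above now accepts --trainer.input-model; when it is given, the configs directory is not required and the model's left/right context is taken from the model itself via common_train_lib.get_input_model_info, whose body is not shown in this patch. As a rough idea of what such a lookup can involve, one could parse the left-context/right-context lines printed by nnet3-info for the raw model; the helper name, return keys and exact parsing below are assumptions, not the library's actual code:

import subprocess

def context_from_model(input_model):
    # Sketch: read `nnet3-info <model.raw>` output and pick up the context lines.
    info = subprocess.check_output(['nnet3-info', input_model]).decode('utf-8')
    context = {}
    for line in info.splitlines():
        if line.startswith('left-context:'):
            context['model_left_context'] = int(line.split()[1])
        elif line.startswith('right-context:'):
            context['model_right_context'] = int(line.split()[1])
    return context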
- if args.stage <= -5: + if (args.stage <= -5) and (args.input_model is None): logger.info("Initializing a basic network for estimating " "preconditioning matrix") common_lib.execute_command( @@ -296,7 +313,6 @@ def train(args, run_opts): cmvn_opts=args.cmvn_opts, online_ivector_dir=args.online_ivector_dir, samples_per_iter=args.samples_per_iter, - transform_dir=args.transform_dir, stage=args.egs_stage) if args.egs_dir is None: @@ -323,7 +339,7 @@ def train(args, run_opts): # use during decoding common_train_lib.copy_egs_properties_to_exp_dir(egs_dir, args.dir) - if args.stage <= -3: + if args.stage <= -3 and (args.input_model is None): logger.info('Computing the preconditioning matrix for input features') train_lib.common.compute_preconditioning_matrix( @@ -331,7 +347,7 @@ def train(args, run_opts): max_lda_jobs=args.max_lda_jobs, rand_prune=args.rand_prune) - if args.stage <= -2: + if args.stage <= -2 and (args.input_model is None): logger.info("Computing initial vector for FixedScaleComponent before" " softmax, using priors^{prior_scale} and rescaling to" " average 1".format( @@ -344,7 +360,8 @@ def train(args, run_opts): if args.stage <= -1: logger.info("Preparing the initial acoustic model.") train_lib.acoustic_model.prepare_initial_acoustic_model( - args.dir, args.ali_dir, run_opts) + args.dir, args.ali_dir, run_opts, + input_model=args.input_model) # set num_iters so that as close as possible, we process the data # $num_epochs times, i.e. $num_iters*$avg_num_jobs) == @@ -405,6 +422,19 @@ def train(args, run_opts): iter, model_file, args.shrink_saturation_threshold) else 1.0) + percent = num_archives_processed * 100.0 / num_archives_to_process + epoch = (num_archives_processed * args.num_epochs + / num_archives_to_process) + shrink_info_str = '' + if shrinkage_value != 1.0: + shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value) + logger.info("Iter: {0}/{1} " + "Epoch: {2:0.2f}/{3:0.1f} ({4:0.1f}% complete) " + "lr: {5:0.6f} {6}".format(iter, num_iters - 1, + epoch, args.num_epochs, + percent, + lrate, shrink_info_str)) + train_lib.common.train_one_iteration( dir=args.dir, iter=iter, diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py index d74135e5980..3b8dc82fe48 100755 --- a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py +++ b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py @@ -296,10 +296,10 @@ def check_model_contexts(config_dir, nnet_edits=None, existing_model=None): if key in ['left-context', 'right-context']: contexts[file_name][key] = value - if contexts.has_key('init'): - assert(contexts.has_key('ref')) - if (contexts['init'].has_key('left-context') and - contexts['ref'].has_key('left-context')): + if 'init' in contexts: + assert('ref' in contexts) + if ('left-context' in contexts['init'] and + 'left-context' in contexts['ref']): if ((contexts['init']['left-context'] > contexts['ref']['left-context']) or (contexts['init']['right-context'] diff --git a/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh b/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh index 8a1ab0fe840..a423be7aa20 100755 --- a/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh +++ b/egs/wsj/s5/steps/online/nnet2/extract_ivectors.sh @@ -182,7 +182,7 @@ if [ $sub_speaker_frames -gt 0 ]; then cat $data/spk2utt | python -c " import sys utt_counts = {} -trash = map(lambda x: utt_counts.update({x.split()[0]:float(x.split()[1])}), open('$dir/utt_counts').readlines()) +trash = list(map(lambda x: utt_counts.update({x.split()[0]:float(x.split()[1])}), 
open('$dir/utt_counts').readlines())) sub_speaker_frames = $sub_speaker_frames lines = sys.stdin.readlines() total_counts = {} @@ -212,7 +212,7 @@ for line_index in range(len(lines)): if ((current_count >= $sub_speaker_frames) and ((total_counts[spk] - covered_count) >= $sub_speaker_frames)) or (utt == parts[-1]): spk_partial = '{0}-{1:06x}'.format(spk, numeric_id) numeric_id += 1 - print '{0} {1}'.format(spk_partial, ' '.join(current_utts)) + print ('{0} {1}'.format(spk_partial, ' '.join(current_utts))) current_utts = [] current_count = 0 "> $dir/spk2utt || exit 1; diff --git a/egs/wsj/s5/steps/online/prepare_online_decoding.sh b/egs/wsj/s5/steps/online/prepare_online_decoding.sh index de69c2afb05..a6c17a4f303 100755 --- a/egs/wsj/s5/steps/online/prepare_online_decoding.sh +++ b/egs/wsj/s5/steps/online/prepare_online_decoding.sh @@ -27,7 +27,7 @@ echo "$0 $@" # Print the command line for logging . parse_options.sh || exit 1; if [ $# -ne 4 -a $# -ne 5 ]; then - echo "Usage: $0 [options] [] " + echo "Usage: $0 [options] [] " echo "e.g.: $0 data/train data/lang exp/tri3b exp/tri3b_mmi/final.mdl exp/tri3b_online" echo "main options (for others, see top of script file)" echo " --feature-type # Type of the base features; " @@ -80,8 +80,8 @@ echo $nj >$dir/num_jobs || exit 1; utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1; cp $lang/phones.txt $dir || exit 1; -splice_opts=`cat $srcdir/splice_opts 2>/dev/null` -cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` +cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` silphonelist=`cat $lang/phones/silence.csl` || exit 1; cp $srcdir/splice_opts $srcdir/cmvn_opts $srcdir/final.mat $srcdir/final.mdl $dir/ 2>/dev/null @@ -116,7 +116,7 @@ case $feat_type in esac # Set up the adapted features "$feats" for training set. -if [ -f $srcdir/trans.1 ]; then +if [ -f $srcdir/trans.1 ]; then feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$srcdir/trans.JOB ark:- ark:- |"; else feats="$sifeats"; @@ -136,13 +136,13 @@ fi if [ $stage -le 0 ]; then echo "$0: Accumulating statistics for basis-fMLLR computation" # Note: we get Gaussian level alignments with the "final.mdl" and the -# speaker adapted features. +# speaker adapted features. $cmd JOB=1:$nj $dir/log/basis_acc.JOB.log \ ali-to-post "ark:gunzip -c $srcdir/ali.JOB.gz|" ark:- \| \ weight-silence-post $silence_weight $silphonelist $dir/final.mdl ark:- ark:- \| \ gmm-post-to-gpost $dir/final.mdl "$feats" ark:- ark:- \| \ gmm-basis-fmllr-accs-gpost $spk2utt_opt \ - $dir/final.mdl "$sifeats" ark,s,cs:- $dir/basis.acc.JOB || exit 1; + $dir/final.mdl "$sifeats" ark,s,cs:- $dir/basis.acc.JOB || exit 1; fi if [ $stage -le 1 ]; then @@ -199,7 +199,7 @@ if [ $stage -le 3 ]; then *) echo "Unknown feature type $feature_type" esac - if ! cp $online_cmvn_config $dir/conf/online_cmvn.conf; then + if ! 
cp $online_cmvn_config $dir/conf/online_cmvn.conf; then echo "$0: error copying online cmvn config to $dir/conf/" exit 1; fi @@ -242,7 +242,7 @@ if [ $stage -le 3 ]; then # perl -e '$_ = <>; s/^\s+|\s+$//g; ($t, $c) = (split)[13, 16]; print -$t/$c;'); #echo "--pov-offset=$offset" >>$dir/conf/pitch_process.conf fi - + echo "--fmllr-basis=$dir/fmllr.basis" >>$conf echo "--online-alignment-model=$dir/final.oalimdl" >>$conf echo "--model=$dir/final.mdl" >>$conf diff --git a/egs/wsj/s5/steps/segmentation/internal/sad_to_segments.py b/egs/wsj/s5/steps/segmentation/internal/sad_to_segments.py index 4c4b8f3301b..9b1c0f12b9a 100755 --- a/egs/wsj/s5/steps/segmentation/internal/sad_to_segments.py +++ b/egs/wsj/s5/steps/segmentation/internal/sad_to_segments.py @@ -162,6 +162,8 @@ def pad_speech_segments(self, segment_padding, max_duration=float("inf")): """Pads segments by duration 'segment_padding' on either sides, but ensures that the segments don't go beyond the neighboring segments or the duration of the utterance 'max_duration'.""" + if max_duration == None: + max_duration = float("inf") for i, segment in enumerate(self.segments): assert segment[2] == 2, segment segment[0] -= segment_padding # try adding padding on the left side diff --git a/egs/wsj/s5/utils/combine_data.sh b/egs/wsj/s5/utils/combine_data.sh index 1dc3da6b742..a43cf9d77f3 100755 --- a/egs/wsj/s5/utils/combine_data.sh +++ b/egs/wsj/s5/utils/combine_data.sh @@ -94,7 +94,7 @@ else echo "$0 [info]: not combining segments as it does not exist" fi -for file in utt2spk utt2lang utt2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do +for file in utt2spk utt2lang utt2dur reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do exists_somewhere=false absent_somewhere=false for d in $*; do diff --git a/egs/wsj/s5/utils/copy_data_dir.sh b/egs/wsj/s5/utils/copy_data_dir.sh index 5b0b3946d25..f3b885c5e79 100755 --- a/egs/wsj/s5/utils/copy_data_dir.sh +++ b/egs/wsj/s5/utils/copy_data_dir.sh @@ -103,6 +103,13 @@ fi if [ -f $srcdir/utt2dur ]; then utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur fi +if [ -f $srcdir/reco2dur ]; then + if [ -f $srcdir/segments ]; then + cp $srcdir/reco2dur $destdir/reco2dur + else + utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur + fi +fi if [ -f $srcdir/spk2gender ]; then utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender fi diff --git a/egs/wsj/s5/utils/data/extract_wav_segments_data_dir.sh b/egs/wsj/s5/utils/data/extract_wav_segments_data_dir.sh index 7babad5a52c..129977415e0 100755 --- a/egs/wsj/s5/utils/data/extract_wav_segments_data_dir.sh +++ b/egs/wsj/s5/utils/data/extract_wav_segments_data_dir.sh @@ -7,13 +7,19 @@ # wav segments (according to the 'segments' file) # so that the resulting data directory does not have a 'segments' file anymore. -. utils/parse_options.sh +nj=4 +cmd=run.pl + +. ./utils/parse_options.sh . ./path.sh if [ $# != 2 ]; then echo "Usage: $0 " - echo " This script copies data directory to and gets" - echo "rid of the 'segments' file by extracting the wav segments." + echo " This script copies data directory to and removes" + echo " the 'segments' file by extracting the wav segments." + echo "Options: " + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." exit 1; fi @@ -22,17 +28,32 @@ export LC_ALL=C srcdir=$1 dir=$2 - +logdir=$dir/log if ! 
mkdir -p $dir/data; then echo "$0: failed to create directory $dir/data" exit 1 fi +mkdir -p $logdir -set -e -o pipefail +set -eu -o pipefail utils/copy_data_dir.sh $srcdir $dir -extract-segments scp:$srcdir/wav.scp $srcdir/segments \ - ark,scp:$dir/data/wav_segments.ark,$dir/data/wav_segments.scp +split_segments="" +for n in $(seq $nj); do + split_segments="$split_segments $logdir/segments.$n" +done + +utils/split_scp.pl $srcdir/segments $split_segments + +$cmd JOB=1:$nj $logdir/extract_wav_segments.JOB.log \ + extract-segments scp,p:$srcdir/wav.scp $logdir/segments.JOB \ + ark,scp:$dir/data/wav_segments.JOB.ark,$dir/data/wav_segments.JOB.scp + +# concatenate the .scp files together. +for n in $(seq $nj); do + cat $dir/data/wav_segments.$n.scp +done > $dir/data/wav_segments.scp + cat $dir/data/wav_segments.scp | awk '{ print $1 " wav-copy " $2 " - |" }' >$dir/wav.scp -rm $dir/reco2file_and_channel || true +rm $dir/{segments,reco2file_and_channel} 2>/dev/null || true diff --git a/egs/wsj/s5/utils/data/get_reco2dur.sh b/egs/wsj/s5/utils/data/get_reco2dur.sh new file mode 100755 index 00000000000..943e739c53c --- /dev/null +++ b/egs/wsj/s5/utils/data/get_reco2dur.sh @@ -0,0 +1,143 @@ +#!/bin/bash + +# Copyright 2016 Johns Hopkins University (author: Daniel Povey) +# 2018 Andrea Carmantini +# Apache 2.0 + +# This script operates on a data directory, such as in data/train/, and adds the +# reco2dur file if it does not already exist. The file 'reco2dur' maps from +# recording to the duration of the recording in seconds. This script works it +# out from the 'wav.scp' file, or, if utterance-ids are the same as recording-ids, from the +# utt2dur file (it first tries interrogating the headers, and if this fails, it reads the wave +# files in entirely.) +# We could use durations from segments file, but that's not the duration of the recordings +# but the sum of utterance lenghts (silence in between could be excluded from segments) +# For sum of utterance lenghts: +# awk 'FNR==NR{uttdur[$1]=$2;next} +# { for(i=2;i<=NF;i++){dur+=uttdur[$i];} +# print $1 FS dur; dur=0 }' $data/utt2dur $data/reco2utt + + +frame_shift=0.01 +cmd=run.pl +nj=4 + +. utils/parse_options.sh +. ./path.sh + +if [ $# != 1 ]; then + echo "Usage: $0 [options] " + echo "e.g.:" + echo " $0 data/train" + echo " Options:" + echo " --frame-shift # frame shift in seconds. Only relevant when we are" + echo " # getting duration from feats.scp (default: 0.01). " + exit 1 +fi + +export LC_ALL=C + +data=$1 + + +if [ -s $data/reco2dur ] && \ + [ $(wc -l < $data/wav.scp) -eq $(wc -l < $data/reco2dur) ]; then + echo "$0: $data/reco2dur already exists with the expected length. We won't recompute it." + exit 0; +fi + +if [ -s $data/utt2dur ] && \ + [ $(wc -l < $data/utt2spk) -eq $(wc -l < $data/utt2dur) ] && \ + [ ! -s $data/segments ]; then + + echo "$0: $data/wav.scp indexed by utt-id; copying utt2dur to reco2dur" + cp $data/utt2dur $data/reco2dur && exit 0; + +elif [ -f $data/wav.scp ]; then + echo "$0: obtaining durations from recordings" + + # if the wav.scp contains only lines of the form + # utt1 /foo/bar/sph2pipe -f wav /baz/foo.sph | + if cat $data/wav.scp | perl -e ' + while (<>) { s/\|\s*$/ |/; # make sure final | is preceded by space. 
+ @A = split; if (!($#A == 5 && $A[1] =~ m/sph2pipe$/ && + $A[2] eq "-f" && $A[3] eq "wav" && $A[5] eq "|")) { exit(1); } + $reco = $A[0]; $sphere_file = $A[4]; + + if (!open(F, "<$sphere_file")) { die "Error opening sphere file $sphere_file"; } + $sample_rate = -1; $sample_count = -1; + for ($n = 0; $n <= 30; $n++) { + $line = ; + if ($line =~ m/sample_rate -i (\d+)/) { $sample_rate = $1; } + if ($line =~ m/sample_count -i (\d+)/) { $sample_count = $1; } + if ($line =~ m/end_head/) { break; } + } + close(F); + if ($sample_rate == -1 || $sample_count == -1) { + die "could not parse sphere header from $sphere_file"; + } + $duration = $sample_count * 1.0 / $sample_rate; + print "$reco $duration\n"; + } ' > $data/reco2dur; then + echo "$0: successfully obtained recording lengths from sphere-file headers" + else + echo "$0: could not get recording lengths from sphere-file headers, using wav-to-duration" + if ! command -v wav-to-duration >/dev/null; then + echo "$0: wav-to-duration is not on your path" + exit 1; + fi + + read_entire_file=false + if grep -q 'sox.*speed' $data/wav.scp; then + read_entire_file=true + echo "$0: reading from the entire wav file to fix the problem caused by sox commands with speed perturbation. It is going to be slow." + echo "... It is much faster if you call get_reco2dur.sh *before* doing the speed perturbation via e.g. perturb_data_dir_speed.sh or " + echo "... perturb_data_dir_speed_3way.sh." + fi + + num_recos=$(wc -l <$data/wav.scp) + if [ $nj -gt $num_recos ]; then + nj=$num_recos + fi + + temp_data_dir=$data/wav${nj}split + wavscps=$(for n in `seq $nj`; do echo $temp_data_dir/$n/wav.scp; done) + subdirs=$(for n in `seq $nj`; do echo $temp_data_dir/$n; done) + + if ! mkdir -p $subdirs >&/dev/null; then + for n in `seq $nj`; do + mkdir -p $temp_data_dir/$n + done + fi + + utils/split_scp.pl $data/wav.scp $wavscps + + + $cmd JOB=1:$nj $data/log/get_reco_durations.JOB.log \ + wav-to-duration --read-entire-file=$read_entire_file \ + scp:$temp_data_dir/JOB/wav.scp ark,t:$temp_data_dir/JOB/reco2dur || \ + { echo "$0: there was a problem getting the durations"; exit 1; } # This could + + for n in `seq $nj`; do + cat $temp_data_dir/$n/reco2dur + done > $data/reco2dur + fi + rm -r $temp_data_dir +else + echo "$0: Expected $data/wav.scp to exist" + exit 1 +fi + +len1=$(wc -l < $data/wav.scp) +len2=$(wc -l < $data/reco2dur) +if [ "$len1" != "$len2" ]; then + echo "$0: warning: length of reco2dur does not equal that of wav.scp, $len2 != $len1" + if [ $len1 -gt $[$len2*2] ]; then + echo "$0: less than half of recordings got a duration: failing." + exit 1 + fi +fi + +echo "$0: computed $data/reco2dur" + +exit 0 diff --git a/egs/wsj/s5/utils/data/get_utt2dur.sh b/egs/wsj/s5/utils/data/get_utt2dur.sh index e6a344d7d50..995136a5575 100755 --- a/egs/wsj/s5/utils/data/get_utt2dur.sh +++ b/egs/wsj/s5/utils/data/get_utt2dur.sh @@ -32,14 +32,14 @@ export LC_ALL=C data=$1 if [ -s $data/utt2dur ] && \ - [ $(cat $data/utt2spk | wc -l) -eq $(cat $data/utt2dur | wc -l) ]; then + [ $(wc -l < $data/utt2spk) -eq $(wc -l < $data/utt2dur) ]; then echo "$0: $data/utt2dur already exists with the expected length. We won't recompute it." 
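The perl block in get_reco2dur.sh above reads each NIST SPHERE header and derives the recording duration as sample_count / sample_rate, falling back to wav-to-duration when the wav.scp entries are not simple sph2pipe pipes. The same header read expressed in Python looks roughly like this (a sketch under the same assumptions as the perl, not part of the patch):

import re

def sphere_duration(sphere_file):
    sample_rate = sample_count = -1
    with open(sphere_file, 'rb') as f:
        for _ in range(31):  # like the perl, only scan the first lines of the header
            line = f.readline().decode('ascii', errors='replace')
            m = re.search(r'sample_rate -i (\d+)', line)
            if m:
                sample_rate = int(m.group(1))
            m = re.search(r'sample_count -i (\d+)', line)
            if m:
                sample_count = int(m.group(1))
            if 'end_head' in line:
                break
    if sample_rate <= 0 or sample_count <= 0:
        raise RuntimeError('could not parse sphere header from ' + sphere_file)
    return sample_count / float(sample_rate)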
exit 0; fi if [ -s $data/segments ]; then echo "$0: working out $data/utt2dur from $data/segments" - cat $data/segments | awk '{len=$4-$3; print $1, len;}' > $data/utt2dur + awk '{len=$4-$3; print $1, len;}' < $data/segments > $data/utt2dur elif [ -f $data/wav.scp ]; then echo "$0: segments file does not exist so getting durations from wave files" @@ -75,13 +75,19 @@ elif [ -f $data/wav.scp ]; then fi read_entire_file=false - if cat $data/wav.scp | grep -q 'sox.*speed'; then + if grep -q 'sox.*speed' $data/wav.scp; then read_entire_file=true echo "$0: reading from the entire wav file to fix the problem caused by sox commands with speed perturbation. It is going to be slow." echo "... It is much faster if you call get_utt2dur.sh *before* doing the speed perturbation via e.g. perturb_data_dir_speed.sh or " echo "... perturb_data_dir_speed_3way.sh." fi + + num_utts=$(wc -l <$data/utt2spk) + if [ $nj -gt $num_utts ]; then + nj=$num_utts + fi + utils/data/split_data.sh --per-utt $data $nj sdata=$data/split${nj}utt @@ -102,8 +108,8 @@ else exit 1 fi -len1=$(cat $data/utt2spk | wc -l) -len2=$(cat $data/utt2dur | wc -l) +len1=$(wc -l < $data/utt2spk) +len2=$(wc -l < $data/utt2dur) if [ "$len1" != "$len2" ]; then echo "$0: warning: length of utt2dur does not equal that of utt2spk, $len2 != $len1" if [ $len1 -gt $[$len2*2] ]; then diff --git a/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh b/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh index f857ae2bdd7..cd291427398 100755 --- a/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh +++ b/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh @@ -47,10 +47,11 @@ if [ -f $destdir/feats.scp ]; then exit 1 fi -echo "$0: making sure the utt2dur file is present in ${srcdir}, because " -echo "... obtaining it after speed-perturbing would be very slow, and" -echo "... you might need it." +echo "$0: making sure the utt2dur and the reco2dur files are present" +echo "... in ${srcdir}, because obtaining it after speed-perturbing" +echo "... would be very slow, and you might need them." utils/data/get_utt2dur.sh ${srcdir} +utils/data/get_reco2dur.sh ${srcdir} utils/data/perturb_data_dir_speed.sh 0.9 ${srcdir} ${destdir}_speed0.9 || exit 1 utils/data/perturb_data_dir_speed.sh 1.1 ${srcdir} ${destdir}_speed1.1 || exit 1 diff --git a/egs/wsj/s5/utils/fix_data_dir.sh b/egs/wsj/s5/utils/fix_data_dir.sh index 8ebfc8d49fe..ca0972ca85b 100755 --- a/egs/wsj/s5/utils/fix_data_dir.sh +++ b/egs/wsj/s5/utils/fix_data_dir.sh @@ -6,6 +6,8 @@ # It puts the original contents of data-dir into # data-dir/.backup +cmd="$@" + utt_extra_files= spk_extra_files= @@ -21,6 +23,12 @@ if [ $# != 1 ]; then fi data=$1 + +if [ -f $data/images.scp ]; then + image/fix_data_dir.sh $cmd + exit $? +fi + mkdir -p $data/.backup [ ! -d $data ] && echo "$0: no such directory $data" && exit 1; @@ -46,7 +54,7 @@ function check_sorted { } for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \ - reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur utt2num_frames; do + reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur reco2dur utt2num_frames; do if [ -f $data/$x ]; then cp $data/$x $data/.backup/$x check_sorted $data/$x @@ -97,6 +105,7 @@ function filter_recordings { filter_file $tmpdir/recordings $data/wav.scp [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel + [ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur true fi } @@ -143,7 +152,9 @@ function filter_utts { fi maybe_wav= - [ ! 
-f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. + maybe_reco2dur= + [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist. + [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts for x in feats.scp text segments utt2lang $maybe_wav; do if [ -f $data/$x ]; then utils/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp @@ -164,7 +175,7 @@ function filter_utts { fi fi - for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2dur utt2num_frames $maybe_wav $utt_extra_files; do + for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do if [ -f $data/$x ]; then cp $data/$x $data/.backup/$x if ! cmp -s $data/$x <( utils/filter_scp.pl $tmpdir/utts $data/$x ) ; then diff --git a/egs/wsj/s5/utils/lang/add_unigrams_arpa.pl b/egs/wsj/s5/utils/lang/add_unigrams_arpa.pl new file mode 100755 index 00000000000..11e43be17bf --- /dev/null +++ b/egs/wsj/s5/utils/lang/add_unigrams_arpa.pl @@ -0,0 +1,93 @@ +#!/usr/bin/env perl + +# Copyright 2018 Xiaohui Zhang +# Apache 2.0. +# +use strict; +use warnings; +use Getopt::Long; + +my $Usage = < output-arpa + contains a list of words and their probabilities, e.g. "jack 0.2". All probs will be +scaled by a positive scalar and then be used as the unigram prob. of the added word. +The scale should approximiately relect the OOV rate of the language in concern. +EOU + +my @F; +my @OOVS; + +if (@ARGV != 2) { + die $Usage; +} + +# Gets parameters. +my $oov_prob_file = shift @ARGV; +my $scale = shift @ARGV; +my $arpa_in = shift @ARGV; +my $arpa_out = shift @ARGV; + +# Opens files. +open(F, "<$oov_prob_file") || die "$0: Fail to open $oov_prob_file\n"; +while () { push @OOVS, $_; } +my $num_oovs = @OOVS; + +$scale > 0.0 || die "Bad scale"; +print STDERR "$0: Creating LM file with additional unigrams, using $oov_prob_file\n"; + +my %vocab; +my $unigram = 0; +my $num_unigrams = 0; +my @lines; + +# Parse and record the head and unigrams in the ARPA LM. +while() { + if (m/^ngram 1=(\d+)/) { $num_unigrams = $1; } + + if (m/^\\2-grams:$/) { last; } + if (m/^\\1-grams:$/) { $unigram = 1; push(@lines, $_); next; } + if (m/^\\2-grams:$/) { $unigram = 0; } + + my @col = split(" ", $_); + if ( $unigram == 1 ) { + # Record in-vocab words into a map. + if ( @col > 0 ) { + my $word = $col[1]; + $vocab{$word} = 1; + push(@lines, $_); + } else { + # Insert out-of-vocab words and their probs into the unigram list. + foreach my $l (@OOVS) { + my @A = split(" ", $l); + @A == 2 || die "bad line in oov2prob: $_;"; + my $word = $A[0]; + my $prob = $A[1]; + if (exists($vocab{$word})) { next; } + $num_unigrams ++; + my $log10prob = (log($prob * $scale) / log(10.0)); + $vocab{$word} = 1; + my $line = sprintf("%.6f\t$word\n", $log10prob); + push(@lines, $line); + } + } + } else { push(@lines, $_); } +} + +# Print the head and unigrams, with the updated # unigrams in the head. +foreach my $l (@lines) { + if ($l =~ m/ngram 1=/) { + print "ngram 1=$num_unigrams\n"; + } else { + print $l; + } +} + +# Print the left fields. 
+print "\n\\2-grams:\n"; +while() { + print; +} + +close(F); +exit 0 diff --git a/egs/wsj/s5/utils/lang/adjust_unk_arpa.pl b/egs/wsj/s5/utils/lang/adjust_unk_arpa.pl new file mode 100755 index 00000000000..c3a6640b8bc --- /dev/null +++ b/egs/wsj/s5/utils/lang/adjust_unk_arpa.pl @@ -0,0 +1,68 @@ +#!/usr/bin/env perl + +# Copyright 2018 Xiaohui Zhang +# Apache 2.0. +# +use strict; +use warnings; +use Getopt::Long; + +my $Usage = < output-arpa + +Allowed options: + --fixed-value (true|false) : If true, interpret the unk-scale as a fixed value we'll set to + the unigram prob of the OOV dict entry, rather than using it to + scale the probs. In this case higher order n-grams containing + the OOV dict entry remain untouched. This is useful when the OOV + dict entry doesn't appear in n-grams (n>1) as the predicted word. +EOU + +my $fixed_value = "false"; +GetOptions('fixed-value=s' => \$fixed_value); + +($fixed_value eq "true" || $fixed_value eq "false") || + die "$0: Bad value for option --fixed-value\n"; + +if (@ARGV != 2) { + die $Usage; +} + +# Gets parameters. +my $unk_word = shift @ARGV; +my $unk_scale = shift @ARGV; +my $arpa_in = shift @ARGV; +my $arpa_out = shift @ARGV; + +$unk_scale > 0.0 || die "Bad unk_scale"; # this must be positive +if ( $fixed_value eq "true" ) { + print STDERR "$0: Setting the unigram prob of $unk_word in LM file as $unk_scale.\n"; +} else { + print STDERR "$0: Scaling the probs of ngrams where $unk_word is the predicted word in LM file by $unk_scale.\n"; +} + +my $ngram = 0; # the order of ngram we are visiting + +# Change the unigram prob of the unk-word in the ARPA LM. +while() { + if (m/^\\1-grams:$/) { $ngram = 1; } + if (m/^\\2-grams:$/) { $ngram = 2; } + if (m/^\\3-grams:$/) { $ngram = 3; } + if (m/^\\4-grams:$/) { $ngram = 4; } + if (m/^\\5-grams:$/) { $ngram = 5; } + my @col = split(" ", $_); + if ( @col > 1 && $ngram > 0 && $col[$ngram] eq $unk_word ) { + if ( $fixed_value eq "true" && $ngram == 1 ) { + $col[0] = (log($unk_scale) / log(10.0)); + } else { + $col[0] += (log($unk_scale) / log(10.0)); + } + my $line = join("\t", @col); + print "$line\n"; + } else { + print; + } +} + +exit 0 diff --git a/egs/wsj/s5/utils/lang/adjust_unk_graph.sh b/egs/wsj/s5/utils/lang/adjust_unk_graph.sh new file mode 100755 index 00000000000..9a40a9960f2 --- /dev/null +++ b/egs/wsj/s5/utils/lang/adjust_unk_graph.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# Copyright 2018 Xiaohui Zhang +# Apache 2.0 + +# This script copies a fully expanded decoding graph (HCLG.fst) and scales the scores +# of all arcs whose output symbol is a user-specified OOV symbol (or any other word). +# This achieves an equivalent effect of utils/lang/adjust_unk_arpa.pl, which scales +# the LM prob of all ngrams predicting an OOV symbol, while avoiding re-creating the graph. + +set -o pipefail + +if [ $# != 4 ]; then + echo "Usage: utils/adjust_unk_graph.sh " + echo "e.g.: utils/adjust_unk_graph.sh \"\" 0.1 exp/tri1/graph exp/tri1/graph_unk_scale_0.1" + exit 1; +fi + +if [ -f path.sh ]; then . ./path.sh; fi + +oov_word=$1 +unk_scale=$2 +graphdir_in=$3 +graphdir_out=$4 + +mkdir -p $graphdir_out + +required="HCLG.fst words.txt disambig_tid.int num_pdfs phones phones.txt words.txt" +for f in $required; do + [ ! 
-f $graphdir_in/$f ] && echo "adjust_unk_graph.sh: expected $graphdir_in/$f to exist" && exit 1; + cp -r $graphdir_in/$f $graphdir_out +done + +cp -r $graphdir_in/{disambig_tid.int,num_pdfs,phones,phones.txt,words.txt} $graphdir_out + +oov_id=`echo $oov_word | utils/sym2int.pl $graphdir_in/words.txt` +[ -z $oov_id ] && echo "adjust_unk_graph.sh: the specified oov symbol $oov_word is out of the vocabulary." && exit 1; +fstprint $graphdir_in/HCLG.fst | awk -v oov=$oov_id -v unk_scale=$unk_scale '{if($4==oov) $5=$5-log(unk_scale);print $0}' | \ + fstcompile > $graphdir_out/HCLG.fst || exit 1; diff --git a/egs/wsj/s5/utils/lang/bpe/add_final_optional_silence.sh b/egs/wsj/s5/utils/lang/bpe/add_final_optional_silence.sh new file mode 100755 index 00000000000..11e6b897382 --- /dev/null +++ b/egs/wsj/s5/utils/lang/bpe/add_final_optional_silence.sh @@ -0,0 +1,57 @@ +#!/bin/bash +. ./path.sh + +final_sil_prob=0.5 + +echo "$0 $@" # Print the command line for logging + +. ./utils/parse_options.sh + +if [ $# -ne 1 ]; then + echo "Usage: $0 " + echo " Add final optional silence to lexicon FSTs (L.fst and L_disambig.fst) in" + echo " lang/ directory ." + echo " This can be useful in systems with byte-pair encoded (BPE) lexicons, in which" + echo " the word-initial silence is part of the lexicon, so we turn off the standard" + echo " optional silence in the lexicon" + echo "options:" + echo " --final-sil-prob # default 0.5" + exit 1; +fi + +lang=$1 + +if [ $lang/phones/final_sil_prob -nt $lang/phones/nonsilence.txt ]; then + echo "$0 $lang/phones/final_sil_prob exists. Exiting..." + exit 1; +fi + +silphone=$(cat $lang/phones/optional_silence.int) + +sil_eq_zero=$(echo $(perl -e "if ( $final_sil_prob == 0.0) {print 'true';} else {print 'false';}")) +sil_eq_one=$(echo $(perl -e "if ( $final_sil_prob == 1.0) {print 'true';} else {print 'false';}")) +sil_lt_zero=$(echo $(perl -e "if ( $final_sil_prob < 0.0) {print 'true';} else {print 'false';}")) +sil_gt_one=$(echo $(perl -e "if ( $final_sil_prob > 1.0) {print 'true';} else {print 'false';}")) + +if $sil_lt_zero || $sil_gt_one; then + echo "$0 final-sil-prob should be between 0.0 and 1.0. Final silence was not added." + exit 1; +else + if $sil_eq_zero; then + echo "$0 final-sil-prob = 0 => Final silence was not added." + exit 0; + elif $sil_eq_one; then + ( echo "0 1 $silphone 0"; + echo "1" ) | fstcompile > $lang/final_sil.fst + else + log_silprob=$(echo $(perl -e "print log $final_sil_prob")) + ( echo "0 1 $silphone 0 $log_silprob"; + echo "0 $log_silprob"; + echo "1" ) | fstcompile > $lang/final_sil.fst + fi + mv $lang/L.fst $lang/L.fst.orig + mv $lang/L_disambig.fst $lang/L_disambig.fst.orig + fstconcat $lang/L.fst.orig $lang/final_sil.fst | fstarcsort --sort_type=olabel > $lang/L.fst + fstconcat $lang/L_disambig.fst.orig $lang/final_sil.fst | fstarcsort --sort_type=olabel > $lang/L_disambig.fst + echo "$final_sil_prob" > $lang/phones/final_sil_prob +fi diff --git a/egs/wsj/s5/utils/lang/bpe/apply_bpe.py b/egs/wsj/s5/utils/lang/bpe/apply_bpe.py new file mode 100755 index 00000000000..9edc726d596 --- /dev/null +++ b/egs/wsj/s5/utils/lang/bpe/apply_bpe.py @@ -0,0 +1,340 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Author: Rico Sennrich +# Released under the MIT License. + +"""Use operations learned with learn_bpe.py to encode a new text. +The text will not be smaller, but use only a fixed vocabulary, with rare words +encoded as variable-length sequences of subword units. + +Reference: +Rico Sennrich, Barry Haddow and Alexandra Birch (2015). 
Neural Machine Translation of Rare Words with Subword Units. +Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. +""" + +from __future__ import unicode_literals, division + +import sys +import codecs +import io +import argparse +import re + +# hack for python2/3 compatibility +from io import open +argparse.open = open + +class BPE(object): + + def __init__(self, codes, merges=-1, separator='@@', vocab=None, glossaries=None): + + codes.seek(0) + + # check version information + firstline = codes.readline() + if firstline.startswith('#version:'): + self.version = tuple([int(x) for x in re.sub(r'(\.0+)*$','', firstline.split()[-1]).split(".")]) + else: + self.version = (0, 1) + codes.seek(0) + + self.bpe_codes = [tuple(item.strip().split(' ')) for (n, item) in enumerate(codes) if (n < merges or merges == -1)] + + for item in self.bpe_codes: + if len(item) != 2: + sys.stderr.write('Error: invalid line in BPE codes file: {0}\n'.format(' '.join(item))) + sys.stderr.write('The line should exist of exactly two subword units, separated by whitespace\n'.format(' '.join(item))) + sys.exit(1) + + # some hacking to deal with duplicates (only consider first instance) + self.bpe_codes = dict([(code,i) for (i,code) in reversed(list(enumerate(self.bpe_codes)))]) + + self.bpe_codes_reverse = dict([(pair[0] + pair[1], pair) for pair,i in self.bpe_codes.items()]) + + self.separator = separator + + self.vocab = vocab + + self.glossaries = glossaries if glossaries else [] + + self.cache = {} + + def process_line(self, line): + """segment line, dealing with leading and trailing whitespace""" + + out = "" + + leading_whitespace = len(line)-len(line.lstrip()) + if leading_whitespace: + out += line[:leading_whitespace] + + out += self.segment(line) + + trailing_whitespace = len(line)-len(line.rstrip()) + if trailing_whitespace: + out += line[-trailing_whitespace:] + + return out + + def segment(self, sentence): + """segment single sentence (whitespace-tokenized string) with BPE encoding""" + output = [] + for word in sentence.strip().split(' '): + # eliminate double spaces + if not word: + continue + new_word = [out for segment in self._isolate_glossaries(word) + for out in encode(segment, + self.bpe_codes, + self.bpe_codes_reverse, + self.vocab, + self.separator, + self.version, + self.cache, + self.glossaries)] + + for item in new_word[:-1]: + output.append(item + self.separator) + output.append(new_word[-1]) + + return ' '.join(output) + + def _isolate_glossaries(self, word): + word_segments = [word] + for gloss in self.glossaries: + word_segments = [out_segments for segment in word_segments + for out_segments in isolate_glossary(segment, gloss)] + return word_segments + +def create_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="learn BPE-based word segmentation") + + parser.add_argument( + '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, + metavar='PATH', + help="Input file (default: standard input).") + parser.add_argument( + '--codes', '-c', type=argparse.FileType('r'), metavar='PATH', + required=True, + help="File with BPE codes (created by learn_bpe.py).") + parser.add_argument( + '--merges', '-m', type=int, default=-1, + metavar='INT', + help="Use this many BPE operations (<= number of learned symbols)"+ + "default: Apply all the learned merge operations") + parser.add_argument( + '--output', '-o', type=argparse.FileType('w'), default=sys.stdout, + 
metavar='PATH', + help="Output file (default: standard output)") + parser.add_argument( + '--separator', '-s', type=str, default='@@', metavar='STR', + help="Separator between non-final subword units (default: '%(default)s'))") + parser.add_argument( + '--vocabulary', type=argparse.FileType('r'), default=None, + metavar="PATH", + help="Vocabulary file (built with get_vocab.py). If provided, this script reverts any merge operations that produce an OOV.") + parser.add_argument( + '--vocabulary-threshold', type=int, default=None, + metavar="INT", + help="Vocabulary threshold. If vocabulary is provided, any word with frequency < threshold will be treated as OOV") + parser.add_argument( + '--glossaries', type=str, nargs='+', default=None, + metavar="STR", + help="Glossaries. The strings provided in glossaries will not be affected"+ + "by the BPE (i.e. they will neither be broken into subwords, nor concatenated with other subwords") + + return parser + +def get_pairs(word): + """Return set of symbol pairs in a word. + + word is represented as tuple of symbols (symbols being variable-length strings) + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + +def encode(orig, bpe_codes, bpe_codes_reverse, vocab, separator, version, cache, glossaries=None): + """Encode word based on list of BPE merge operations, which are applied consecutively + """ + + if orig in cache: + return cache[orig] + + if orig in glossaries: + cache[orig] = (orig,) + return (orig,) + + if version == (0, 1): + word = tuple(orig) + ('',) + elif version == (0, 2): # more consistent handling of word-final segments + word = tuple(orig[:-1]) + ( orig[-1] + '',) + else: + raise NotImplementedError + + pairs = get_pairs(word) + + if not pairs: + return orig + + while True: + bigram = min(pairs, key = lambda pair: bpe_codes.get(pair, float('inf'))) + if bigram not in bpe_codes: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word)-1 and word[i+1] == second: + new_word.append(first+second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + + # don't print end-of-word symbols + if word[-1] == '': + word = word[:-1] + elif word[-1].endswith(''): + word = word[:-1] + (word[-1].replace('',''),) + + if vocab: + word = check_vocab_and_split(word, bpe_codes_reverse, vocab, separator) + + cache[orig] = word + return word + +def recursive_split(segment, bpe_codes, vocab, separator, final=False): + """Recursively split segment into smaller units (by reversing BPE merges) + until all units are either in-vocabulary, or cannot be split futher.""" + + try: + if final: + left, right = bpe_codes[segment + ''] + right = right[:-4] + else: + left, right = bpe_codes[segment] + except: + #sys.stderr.write('cannot split {0} further.\n'.format(segment)) + yield segment + return + + if left + separator in vocab: + yield left + else: + for item in recursive_split(left, bpe_codes, vocab, separator, False): + yield item + + if (final and right in vocab) or (not final and right + separator in vocab): + yield right + else: + for item in recursive_split(right, bpe_codes, vocab, separator, final): + yield item + +def check_vocab_and_split(orig, bpe_codes, vocab, separator): + """Check for each 
segment in word if it is in-vocabulary, + and segment OOV segments into smaller units by reversing the BPE merge operations""" + + out = [] + + for segment in orig[:-1]: + if segment + separator in vocab: + out.append(segment) + else: + #sys.stderr.write('OOV: {0}\n'.format(segment)) + for item in recursive_split(segment, bpe_codes, vocab, separator, False): + out.append(item) + + segment = orig[-1] + if segment in vocab: + out.append(segment) + else: + #sys.stderr.write('OOV: {0}\n'.format(segment)) + for item in recursive_split(segment, bpe_codes, vocab, separator, True): + out.append(item) + + return out + + +def read_vocabulary(vocab_file, threshold): + """read vocabulary file produced by get_vocab.py, and filter according to frequency threshold. + """ + + vocabulary = set() + + for line in vocab_file: + word, freq = line.strip().split(' ') + freq = int(freq) + if threshold == None or freq >= threshold: + vocabulary.add(word) + + return vocabulary + +def isolate_glossary(word, glossary): + """ + Isolate a glossary present inside a word. + + Returns a list of subwords. In which all 'glossary' glossaries are isolated + + For example, if 'USA' is the glossary and '1934USABUSA' the word, the return value is: + ['1934', 'USA', 'B', 'USA'] + """ + if word == glossary or glossary not in word: + return [word] + else: + splits = word.split(glossary) + segments = [segment.strip() for split in splits[:-1] for segment in [split, glossary] if segment != ''] + return segments + [splits[-1].strip()] if splits[-1] != '' else segments + +if __name__ == '__main__': + + # python 2/3 compatibility + if sys.version_info < (3, 0): + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin) + else: + sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') + sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8') + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', write_through=True, line_buffering=True) + + parser = create_parser() + args = parser.parse_args() + + # read/write files as UTF-8 + args.codes = codecs.open(args.codes.name, encoding='utf-8') + if args.input.name != '': + args.input = codecs.open(args.input.name, encoding='utf-8') + if args.output.name != '': + args.output = codecs.open(args.output.name, 'w', encoding='utf-8') + if args.vocabulary: + args.vocabulary = codecs.open(args.vocabulary.name, encoding='utf-8') + + if args.vocabulary: + vocabulary = read_vocabulary(args.vocabulary, args.vocabulary_threshold) + else: + vocabulary = None + + bpe = BPE(args.codes, args.merges, args.separator, vocabulary, args.glossaries) + + for line in args.input: + args.output.write(bpe.process_line(line)) diff --git a/egs/wsj/s5/utils/lang/bpe/learn_bpe.py b/egs/wsj/s5/utils/lang/bpe/learn_bpe.py new file mode 100755 index 00000000000..70f18f2d1d9 --- /dev/null +++ b/egs/wsj/s5/utils/lang/bpe/learn_bpe.py @@ -0,0 +1,256 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Author: Rico Sennrich +# Released under the MIT License. + +"""Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text. +Unlike the original BPE, it does not compress the plain text, but can be used to reduce the vocabulary +of a text to a configurable number of symbols, with only a small increase in the number of tokens. + +Reference: +Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units. 
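The encode() function in apply_bpe.py above repeatedly merges the adjacent symbol pair with the best learned rank until no learned pair remains. The following minimal sketch restates that greedy loop under the version-0.2 end-of-word convention; apply_bpe_word and the toy merge table are illustrative only, and the real script additionally handles caching, glossaries, separators, and vocabulary-based re-splitting.

```python
def apply_bpe_word(word, merge_ranks, end_marker="</w>"):
    # Greedy BPE encoding of one word: repeatedly merge the adjacent symbol
    # pair with the lowest (earliest-learned) rank, as in encode().
    symbols = list(word[:-1]) + [word[-1] + end_marker]
    while len(symbols) > 1:
        ranked = [(merge_ranks.get(pair, float("inf")), i)
                  for i, pair in enumerate(zip(symbols, symbols[1:]))]
        rank, i = min(ranked)
        if rank == float("inf"):          # no adjacent pair was ever learned
            break
        symbols[i:i + 2] = [symbols[i] + symbols[i + 1]]
    if symbols[-1].endswith(end_marker):  # hide the end-of-word marker again
        symbols[-1] = symbols[-1][:-len(end_marker)]
    return [s for s in symbols if s]

# merge table as written by learn_bpe.py; lower rank = learned earlier
ranks = {("l", "o"): 0, ("lo", "w"): 1, ("e", "r</w>"): 2}
print(apply_bpe_word("lower", ranks))    # ['low', 'er']
```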
+Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany. +""" + +from __future__ import unicode_literals + +import sys +import codecs +import re +import copy +import argparse +from collections import defaultdict, Counter + +# hack for python2/3 compatibility +from io import open +argparse.open = open + +def create_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description="learn BPE-based word segmentation") + + parser.add_argument( + '--input', '-i', type=argparse.FileType('r'), default=sys.stdin, + metavar='PATH', + help="Input text (default: standard input).") + + parser.add_argument( + '--output', '-o', type=argparse.FileType('w'), default=sys.stdout, + metavar='PATH', + help="Output file for BPE codes (default: standard output)") + parser.add_argument( + '--symbols', '-s', type=int, default=10000, + help="Create this many new symbols (each representing a character n-gram) (default: %(default)s))") + parser.add_argument( + '--min-frequency', type=int, default=2, metavar='FREQ', + help='Stop if no symbol pair has frequency >= FREQ (default: %(default)s))') + parser.add_argument('--dict-input', action="store_true", + help="If set, input file is interpreted as a dictionary where each line contains a word-count pair") + parser.add_argument( + '--verbose', '-v', action="store_true", + help="verbose mode.") + + return parser + +def get_vocabulary(fobj, is_dict=False): + """Read text and return dictionary that encodes vocabulary + """ + vocab = Counter() + for i, line in enumerate(fobj): + if is_dict: + try: + word, count = line.strip().split(' ') + except: + print('Failed reading vocabulary file at line {0}: {1}'.format(i, line)) + sys.exit(1) + vocab[word] += int(count) + else: + for word in line.strip().split(' '): + if word: + vocab[word] += 1 + return vocab + +def update_pair_statistics(pair, changed, stats, indices): + """Minimally update the indices and frequency of symbol pairs + + if we merge a pair of symbols, only pairs that overlap with occurrences + of this pair are affected, and need to be updated. + """ + stats[pair] = 0 + indices[pair] = defaultdict(int) + first, second = pair + new_pair = first+second + for j, word, old_word, freq in changed: + + # find all instances of pair, and update frequency/indices around it + i = 0 + while True: + # find first symbol + try: + i = old_word.index(first, i) + except ValueError: + break + # if first symbol is followed by second symbol, we've found an occurrence of pair (old_word[i:i+2]) + if i < len(old_word)-1 and old_word[i+1] == second: + # assuming a symbol sequence "A B C", if "B C" is merged, reduce the frequency of "A B" + if i: + prev = old_word[i-1:i+1] + stats[prev] -= freq + indices[prev][j] -= 1 + if i < len(old_word)-2: + # assuming a symbol sequence "A B C B", if "B C" is merged, reduce the frequency of "C B". 
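learn_bpe.py first counts, for every adjacent symbol pair, how often it occurs weighted by word frequency, and then repeatedly merges the most frequent pair, updating the counts incrementally as shown above. Below is a reduced illustration of the initial counting step only; get_pair_statistics here is a simplified stand-in (the pair-to-word index and the incremental updates of update_pair_statistics() are omitted).

```python
from collections import defaultdict

def get_pair_statistics(vocab):
    # Frequency of every adjacent symbol pair, weighted by word count.
    stats = defaultdict(int)
    for word, freq in vocab:
        for pair in zip(word, word[1:]):
            stats[pair] += freq
    return stats

# vocab in the form learn_bpe.py builds it: tuples of symbols, with the
# end-of-word marker glued onto the last character.
vocab = [(("l", "o", "w</w>"), 5), (("l", "o", "w", "e", "r</w>"), 2)]
stats = get_pair_statistics(vocab)
print(stats[("l", "o")])                 # 7 -> candidate for the first merge
```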
+ # however, skip this if the sequence is A B C B C, because the frequency of "C B" will be reduced by the previous code block + if old_word[i+2] != first or i >= len(old_word)-3 or old_word[i+3] != second: + nex = old_word[i+1:i+3] + stats[nex] -= freq + indices[nex][j] -= 1 + i += 2 + else: + i += 1 + + i = 0 + while True: + try: + # find new pair + i = word.index(new_pair, i) + except ValueError: + break + # assuming a symbol sequence "A BC D", if "B C" is merged, increase the frequency of "A BC" + if i: + prev = word[i-1:i+1] + stats[prev] += freq + indices[prev][j] += 1 + # assuming a symbol sequence "A BC B", if "B C" is merged, increase the frequency of "BC B" + # however, if the sequence is A BC BC, skip this step because the count of "BC BC" will be incremented by the previous code block + if i < len(word)-1 and word[i+1] != new_pair: + nex = word[i:i+2] + stats[nex] += freq + indices[nex][j] += 1 + i += 1 + + +def get_pair_statistics(vocab): + """Count frequency of all symbol pairs, and create index""" + + # data structure of pair frequencies + stats = defaultdict(int) + + #index from pairs to words + indices = defaultdict(lambda: defaultdict(int)) + + for i, (word, freq) in enumerate(vocab): + prev_char = word[0] + for char in word[1:]: + stats[prev_char, char] += freq + indices[prev_char, char][i] += 1 + prev_char = char + + return stats, indices + + +def replace_pair(pair, vocab, indices): + """Replace all occurrences of a symbol pair ('A', 'B') with a new symbol 'AB'""" + first, second = pair + pair_str = ''.join(pair) + pair_str = pair_str.replace('\\','\\\\') + changes = [] + pattern = re.compile(r'(?'); + # version numbering allows bckward compatibility + outfile.write('#version: 0.2\n') + + vocab = get_vocabulary(infile, is_dict) + vocab = dict([(tuple(x[:-1])+(x[-1]+'',) ,y) for (x,y) in vocab.items()]) + sorted_vocab = sorted(vocab.items(), key=lambda x: x[1], reverse=True) + + stats, indices = get_pair_statistics(sorted_vocab) + big_stats = copy.deepcopy(stats) + # threshold is inspired by Zipfian assumption, but should only affect speed + threshold = max(stats.values()) / 10 + for i in range(num_symbols): + if stats: + most_frequent = max(stats, key=lambda x: (stats[x], x)) + + # we probably missed the best pair because of pruning; go back to full statistics + if not stats or (i and stats[most_frequent] < threshold): + prune_stats(stats, big_stats, threshold) + stats = copy.deepcopy(big_stats) + most_frequent = max(stats, key=lambda x: (stats[x], x)) + # threshold is inspired by Zipfian assumption, but should only affect speed + threshold = stats[most_frequent] * i/(i+10000.0) + prune_stats(stats, big_stats, threshold) + + if stats[most_frequent] < min_frequency: + sys.stderr.write('no pair has frequency >= {0}. 
Stopping\n'.format(min_frequency)) + break + + if verbose: + sys.stderr.write('pair {0}: {1} {2} -> {1}{2} (frequency {3})\n'.format(i, most_frequent[0], most_frequent[1], stats[most_frequent])) + outfile.write('{0} {1}\n'.format(*most_frequent)) + changes = replace_pair(most_frequent, sorted_vocab, indices) + update_pair_statistics(most_frequent, changes, stats, indices) + stats[most_frequent] = 0 + if not i % 100: + prune_stats(stats, big_stats, threshold) + + +if __name__ == '__main__': + + # python 2/3 compatibility + if sys.version_info < (3, 0): + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin) + else: + sys.stderr = codecs.getwriter('UTF-8')(sys.stderr.buffer) + sys.stdout = codecs.getwriter('UTF-8')(sys.stdout.buffer) + sys.stdin = codecs.getreader('UTF-8')(sys.stdin.buffer) + + parser = create_parser() + args = parser.parse_args() + + # read/write files as UTF-8 + if args.input.name != '': + args.input = codecs.open(args.input.name, encoding='utf-8') + if args.output.name != '': + args.output = codecs.open(args.output.name, 'w', encoding='utf-8') + + main(args.input, args.output, args.symbols, args.min_frequency, args.verbose, is_dict=args.dict_input) diff --git a/egs/wsj/s5/utils/lang/compute_sentence_probs_arpa.py b/egs/wsj/s5/utils/lang/compute_sentence_probs_arpa.py new file mode 100755 index 00000000000..5a7743badee --- /dev/null +++ b/egs/wsj/s5/utils/lang/compute_sentence_probs_arpa.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python + +# Dongji Gao + +# We're using python 3.x style but want it to work in python 2.x + +from __future__ import print_function +import argparse +import sys +import math + +parser = argparse.ArgumentParser(description="This script evaluates the log probabilty (default log base is e) of each sentence " + "from data (in text form), given a language model in arpa form " + "and a specific ngram order.", + epilog="e.g. 
./compute_sentence_probs_arpa.py ARPA_LM NGRAM_ORDER TEXT_IN PROB_FILE --log-base=LOG_BASE", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument("arpa_lm", type=str, + help="Input language model in arpa form.") +parser.add_argument("ngram_order", type=int, + help="Order of ngram") +parser.add_argument("text_in", type=str, + help="Filename of input text file (each line will be interpreted as a sentence).") +parser.add_argument("prob_file", type=str, + help="Filename of output probability file.") +parser.add_argument("--log-base", type=float, default=math.exp(1), + help="Log base for log porbability") +args = parser.parse_args() + +def check_args(args): + args.text_in_handle = sys.stdin if args.text_in == "-" else open(args.text_in, "r") + args.prob_file_handle = sys.stdout if args.prob_file == "-" else open(args.prob_file, "w") + if args.log_base <= 0: + sys.exit("compute_sentence_probs_arpa.py: Invalid log base (must be greater than 0)") + +def is_logprob(input): + if input[0] == "-": + try: + float(input[1:]) + return True + except: + return False + else: + return False + +def check_number(model_file, tot_num): + cur_num = 0 + max_ngram_order = 0 + with open(model_file) as model: + lines = model.readlines() + for line in lines[1:]: + if "=" not in line: + return (cur_num == tot_num), max_ngram_order + cur_num += int(line.split("=")[-1]) + max_ngram_order = int(line.split("=")[0].split()[-1]) + +# This function load language model in arpa form and save in a dictionary for +# computing sentence probabilty of input text file. +def load_model(model_file): + with open(model_file) as model: + ngram_dict = {} + lines = model.readlines() + + # check arpa form + if lines[0][:-1] != "\\data\\": + sys.exit("compute_sentence_probs_arpa.py: Please make sure that language model is in arpa form.") + + # read line + for line in lines: + if line[0] == "-": + line_split = line.split() + if is_logprob(line_split[-1]): + ngram_key = " ".join(line_split[1:-1]) + if ngram_key in ngram_dict: + sys.exit("compute_sentence_probs_arpa.py: Duplicated ngram in arpa language model: {}.".format(ngram_key)) + ngram_dict[ngram_key] = (line_split[0], line_split[-1]) + else: + ngram_key = " ".join(line_split[1:]) + if ngram_key in ngram_dict: + sys.exit("compute_sentence_probs_arpa.py: Duplicated ngram in arpa language model: {}.".format(ngram_key)) + ngram_dict[ngram_key] = (line_split[0],) + + return ngram_dict, len(ngram_dict) + +def compute_sublist_prob(sub_list): + if len(sub_list) == 0: + sys.exit("compute_sentence_probs_arpa.py: Ngram substring not found in arpa language model, please check.") + + sub_string = " ".join(sub_list) + if sub_string in ngram_dict: + return -float(ngram_dict[sub_string][0][1:]) + else: + backoff_substring = " ".join(sub_list[:-1]) + backoff_weight = 0.0 if (backoff_substring not in ngram_dict or len(ngram_dict[backoff_substring]) < 2) \ + else -float(ngram_dict[backoff_substring][1][1:]) + return compute_sublist_prob(sub_list[1:]) + backoff_weight + +def compute_begin_prob(sub_list): + logprob = 0 + for i in range(1, len(sub_list) - 1): + logprob += compute_sublist_prob(sub_list[:i + 1]) + return logprob + +# The probability is computed in this way: +# p(word_N | word_N-1 ... word_1) = ngram_dict[word_1 ... word_N][0]. +# Here gram_dict is a dictionary stores a tuple corresponding to ngrams. +# The first element of tuple is probablity and the second is backoff probability (if exists). +# If the particular ngram (word_1 ... 
word_N) is not in the dictionary, then +# p(word_N | word_N-1 ... word_1) = p(word_N | word_(N-1) ... word_2) * backoff_weight(word_(N-1) | word_(N-2) ... word_1) +# If the sequence (word_(N-1) ... word_1) is not in the dictionary, then the backoff_weight gets replaced with 0.0 (log1) +# More details can be found in https://cmusphinx.github.io/wiki/arpaformat/ +def compute_sentence_prob(sentence, ngram_order): + sentence_split = sentence.split() + for i in range(len(sentence_split)): + if sentence_split[i] not in ngram_dict: + sentence_split[i] = "" + sen_length = len(sentence_split) + + if sen_length < ngram_order: + return compute_begin_prob(sentence_split) + else: + logprob = 0 + begin_sublist = sentence_split[:ngram_order] + logprob += compute_begin_prob(begin_sublist) + + for i in range(sen_length - ngram_order + 1): + cur_sublist = sentence_split[i : i + ngram_order] + logprob += compute_sublist_prob(cur_sublist) + + return logprob + + +def output_result(text_in_handle, output_file_handle, ngram_order): + lines = text_in_handle.readlines() + logbase_modifier = math.log(10, args.log_base) + for line in lines: + new_line = " " + line[:-1] + " " + logprob = compute_sentence_prob(new_line, ngram_order) + new_logprob = logprob * logbase_modifier + output_file_handle.write("{}\n".format(new_logprob)) + text_in_handle.close() + output_file_handle.close() + + +if __name__ == "__main__": + check_args(args) + ngram_dict, tot_num = load_model(args.arpa_lm) + + num_valid, max_ngram_order = check_number(args.arpa_lm, tot_num) + if not num_valid: + sys.exit("compute_sentence_probs_arpa.py: Wrong loading model.") + if args.ngram_order <= 0 or args.ngram_order > max_ngram_order: + sys.exit("compute_sentence_probs_arpa.py: " + + "Invalid ngram_order (either negative or greater than maximum ngram number ({}) allowed)".format(max_ngram_order)) + + output_result(args.text_in_handle, args.prob_file_handle, args.ngram_order) diff --git a/egs/wsj/s5/utils/lang/get_word_position_phone_map.pl b/egs/wsj/s5/utils/lang/get_word_position_phone_map.pl new file mode 100755 index 00000000000..cd1af044d72 --- /dev/null +++ b/egs/wsj/s5/utils/lang/get_word_position_phone_map.pl @@ -0,0 +1,154 @@ +#!/usr/bin/env perl + +# Copyright 2018 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0. +# +use strict; +use warnings; + +my $Usage = < + + is a conventional lang dir as validated by validate_lang.pl. +It is an error if does not have word-position-dependent phones. + +To will be written the following files: + phones.txt is a conventional symbol table, similar in format to the one + in , but without word-position-dependency or disambiguation + symbols. + phone_map.int is a mapping from the input 's phones to + the phones in /phones.txt, containing integers, i.e. + . + phone_map.txt is the text form of the mapping in phone_map.int, mostly + provided for reference. +EOU + + +if (@ARGV != 2) { + die $Usage; +} + +my $lang_dir = shift @ARGV; +my $output_dir = shift @ARGV; + +foreach my $filename ( ("phones.txt", "phones/disambig.int") ) { + if (! -f "$lang_dir/$filename") { + die "$0: expected file $lang_dir/$filename to exist"; + } +} + +if (! -d $output_dir) { + die "$0: expected directory $output_dir to exist"; +} + + +# %is_disambig is a hash indexed by integer phone index in the input $lang_dir, +# which will contain 1 for each (integer) disambiguation symbol. 
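The comment block above describes the standard ARPA backoff recursion that compute_sentence_probs_arpa.py implements in compute_sublist_prob(). The sketch below restates it with a toy bigram table; ngram_logprob and the table are invented for illustration, and the real script keeps the probabilities as the string fields of the ARPA file and maps out-of-vocabulary words to the unknown-word symbol before scoring.

```python
def ngram_logprob(words, lm):
    # ARPA-style backoff: use the full n-gram if present, otherwise add the
    # backoff weight of the history (0.0 if it has none) and recurse on the
    # shorter n-gram. Assumes every unigram is present, as the script enforces.
    key = " ".join(words)
    if key in lm:
        return lm[key][0]
    history = " ".join(words[:-1])
    backoff = lm[history][1] if history in lm and len(lm[history]) > 1 else 0.0
    return backoff + ngram_logprob(words[1:], lm)

# toy 2-gram table: unigrams carry (log10 prob, log10 backoff weight)
lm = {
    "<s>": (-99.0, -0.30),
    "hello": (-1.20, -0.25),
    "world": (-1.50, -0.40),
    "<s> hello": (-0.50,),
}
print(ngram_logprob(["<s>", "hello"], lm))  # -0.5  (bigram found directly)
print(ngram_logprob(["<s>", "world"], lm))  # -1.8  (-0.30 backoff + -1.50)
```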
+my %is_disambig; + +open(D, "<$lang_dir/phones/disambig.int") || die "opening $lang_dir/phones/disambig.int"; +while () { + my $disambig_sym = int($_); + $is_disambig{$disambig_sym} = 1; +} +close(D); + +## @orig_phone_list will be an array indexed by integer index, containing +## the written form of the original, non-word-position-dependent phones. +## (but excluding disambiguation symbols like #0, #1 and so on). +## E.g. @orig_phone_list = ( "", "SIL", "SIL_B", "SIL_E", "SIL_I", "SIL_S", ... ) +my @orig_phone_list = (); + + +## @mapped_phones will be an array of the same size as @orig_phone_list, but +## containing the same phone mapped to context-independent form, +## e.g. ( "", "SIL", "SIL", "SIL", SIL", "SIL",... ) +my @mapped_phones = (); + + +## @mapped_phone_list will contain the distinct mapped phones in order, +## e.g. ( "", "SIL", "AA", ... ) +my @mapped_phone_list = (); + +## mapped_phone_to_int will be a mapping from the strings in @mapped_phones, +## such as "" and "SIL", to an integer like 0, 1, .... +my %mapped_phone_to_int; + +# $cur_mapped_int keeps track of the symbols we've used in the output +# phones.txt. +my $cur_mapped_int = 0; + +# $cur_line is the current line index in input phones.txt +my $cur_line = 0; + +open(F, "<$lang_dir/phones.txt") || die "$0: failed to open $lang_dir/phones.txt for reading"; + +while () { + chop; # remove newline from $_ (the line we just read) for easy printing. + my @A = split; # split $_ on space. + if (@A != 2) { # if the array @A does not have length 2... + die "$0: bad line $_ in file $lang_dir/phones.txt"; + } + my $phone_name = $A[0]; # e.g. "" or "SIL" or "SIL_B" ... + my $phone_int = int($A[1]); + if ($phone_int != $cur_line) { + die ("$0: unexpected line $_ in $lang_dir/phones.txt, expected integer to be $cur_line"); + } + if (! $is_disambig{$phone_int}) { + # if it's not a disambiguation symbol... + my $mapped_phone_name = $phone_name; + $mapped_phone_name =~ s/_[BESI]$//; + + push @orig_phone_list, $phone_name; + push @mapped_phones, $mapped_phone_name; + + if (!defined $mapped_phone_to_int{$mapped_phone_name}) { + $mapped_phone_to_int{$mapped_phone_name} = $cur_mapped_int++; + push @mapped_phone_list, $mapped_phone_name; + } + } + $cur_line++; +} +close(F); + +if ($cur_line == 0) { + die "$0: empty $lang_dir/phones.txt"; +} + +if ($cur_mapped_int == @orig_phone_list) { + # if the number of distinct mapped phones is the same as the + # number of input phones (including epsilon), it means the mapping + # was a no-op. This is an error, because it doesn't make sense to + # run this script on input that was not word-position-dependent. 
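get_word_position_phone_map.pl strips the word-position suffixes _B, _E, _I and _S from every non-disambiguation phone and assigns fresh consecutive ids to the distinct base phones. The same idea as a small Python sketch; build_phone_map and the example phone list are illustrative only, not part of the patch.

```python
import re

def build_phone_map(phones, disambig_ids=frozenset()):
    # Strip word-position suffixes and assign new consecutive ids to the
    # distinct base phones, skipping disambiguation symbols.
    mapped_ids = {}          # mapped phone -> new integer id
    phone_map = []           # (orig id, new id, orig name, mapped name)
    for orig_id, name in enumerate(phones):
        if orig_id in disambig_ids:
            continue
        mapped = re.sub(r"_[BEIS]$", "", name)
        if mapped not in mapped_ids:
            mapped_ids[mapped] = len(mapped_ids)
        phone_map.append((orig_id, mapped_ids[mapped], name, mapped))
    return mapped_ids, phone_map

phones = ["<eps>", "SIL", "SIL_B", "SIL_E", "SIL_I", "SIL_S", "AA_B", "AA_E"]
table, mapping = build_phone_map(phones)
print(table)          # {'<eps>': 0, 'SIL': 1, 'AA': 2}
print(mapping[2])     # (2, 1, 'SIL_B', 'SIL')
```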
+ die "input lang dir $lang_dir was not word-position-dependent."; +} + +open(P, ">$output_dir/phones.txt") || die "failed to open $output_dir/phones.txt for writing."; +open(I, ">$output_dir/phone_map.int") || die "failed to open $output_dir/phone_map.int for writing."; +open(T, ">$output_dir/phone_map.txt") || die "failed to open $output_dir/phone_map.txt for writing."; + +for (my $x = 0; $x <= $#mapped_phone_list; $x++) { + print P "$mapped_phone_list[$x] $x\n"; +} + + +for (my $x = 0; $x <= $#orig_phone_list; $x++) { + my $orig_phone_name = $orig_phone_list[$x]; + my $mapped_phone_name = $mapped_phones[$x]; + my $y = $mapped_phone_to_int{$mapped_phone_name}; + defined $y || die "code error"; + + print I "$x $y\n"; + print T "$orig_phone_name $mapped_phone_name\n"; +} + + +(close(I) && close(T) && close(P)) || die "failed to close file (disk full?)"; + + +exit(0); diff --git a/egs/wsj/s5/utils/lang/limit_arpa_unk_history.py b/egs/wsj/s5/utils/lang/limit_arpa_unk_history.py new file mode 100755 index 00000000000..81c0df36d2b --- /dev/null +++ b/egs/wsj/s5/utils/lang/limit_arpa_unk_history.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Armin Oliya +# Apache 2.0. + +''' +This script takes an existing ARPA lanugage model and limits the history +to make it suitable for downstream modeling. +This is for the case when you don't have access +to the original text corpus that is used for creating the LM. +If you do, you can use pocolm with the option --limit-unk-history=true. +This keeps the graph compact after adding the unk model. +''' + +import argparse +import io +import re +import sys +from collections import defaultdict + + +parser = argparse.ArgumentParser( + description='''This script takes an existing ARPA lanugage model + and limits the history to make it suitable + for downstream modeling. + It supports up to 5-grams.''', + usage='''utils/lang/limit_arpa_unk_history.py + output-arpa''', + epilog='''E.g.: gunzip -c src.arpa.gz | + utils/lang/limit_arpa_unk_history.py "" | gzip -c >dest.arpa.gz''') + +parser.add_argument( + 'oov_dict_entry', + help='oov identifier, for example ""', type=str) +args = parser.parse_args() + + +def get_ngram_stats(old_lm_lines): + ngram_counts = defaultdict(int) + + for i in range(10): + g = re.search(r"ngram (\d)=(\d+)", old_lm_lines[i]) + if g: + ngram_counts[int(g.group(1))] = int(g.group(2)) + + if len(ngram_counts) == 0: + sys.exit("""Couldn't get counts per ngram section. + The input doesn't seem to be a valid ARPA language model.""") + + max_ngrams = list(ngram_counts.keys())[-1] + skip_rows = ngram_counts[1] + + if max_ngrams > 5: + sys.exit("This script supports up to 5-gram language models.") + + return max_ngrams, skip_rows, ngram_counts + + +def find_and_replace_unks(old_lm_lines, max_ngrams, skip_rows): + ngram_diffs = defaultdict(int) + unk_pattern = re.compile( + "[0-9.-]+(?:[\s\\t]\S+){1,3}[\s\\t]" + args.oov_dict_entry + + "[\s\\t](?!-[0-9]+\.[0-9]+).*") + backoff_pattern = re.compile( + "[0-9.-]+(?:[\s\\t]\S+){1,3}[\s\\t][\s\\t]-[0-9]+\.[0-9]+") + passed_2grams, last_ngram = False, False + unk_row_count, backoff_row_count = 0, 0 + + print("Upadting the language model .. 
", file=sys.stderr) + new_lm_lines = old_lm_lines[:skip_rows] + + for i in range(skip_rows, len(old_lm_lines)): + line = old_lm_lines[i].strip() + + if "\{}-grams:".format(3) in line: + passed_2grams = True + if "\{}-grams:".format(max_ngrams) in line: + last_ngram = True + + # remove any n-gram states of the form: foo -> X + # that is, any n-grams of order > 2 where + # is the second-to-last word + # here we skip 1-gram and 2-gram sections of arpa + + if passed_2grams: + g_unk = unk_pattern.search(line) + if g_unk: + ngram = len(g_unk.group(0).split()) - 1 + ngram_diffs[ngram] = ngram_diffs[ngram] - 1 + unk_row_count += 1 + continue + + # remove backoff probability from the lines that end with + # for example, the -0.64 in -4.09 every -0.64 + # here we skip the last n-gram section because it + # doesn't include backoff probabilities + + if not last_ngram: + g_backoff = backoff_pattern.search(line) + if g_backoff: + updated_row = g_backoff.group(0).split()[:-1] + updated_row = updated_row[0] + \ + "\t" + " ".join(updated_row[1:]) + "\n" + new_lm_lines.append(updated_row) + backoff_row_count += 1 + continue + + new_lm_lines.append(line+"\n") + + print("Removed {} lines including {} as second-to-last term.".format( + unk_row_count, args.oov_dict_entry), file=sys.stderr) + print("Removed backoff probabilties from {} lines.".format( + backoff_row_count), file=sys.stderr) + + return new_lm_lines, ngram_diffs + + +def read_old_lm(): + print("Reading ARPA LM frome input stream .. ", file=sys.stderr) + + with io.TextIOWrapper( + sys.stdin.buffer, + encoding="latin-1") as input_stream: + old_lm_lines = input_stream.readlines() + + return old_lm_lines + + +def write_new_lm(new_lm_lines, ngram_counts, ngram_diffs): + ''' Update n-gram counts that go in the header of the arpa lm ''' + + for i in range(10): + g = re.search(r"ngram (\d)=(\d+)", new_lm_lines[i]) + if g: + n = int(g.group(1)) + if n in ngram_diffs: + # ngram_diffs contains negative values + new_num_ngrams = ngram_counts[n] + ngram_diffs[n] + new_lm_lines[i] = "ngram {}={}\n".format( + n, new_num_ngrams) + + with io.TextIOWrapper( + sys.stdout.buffer, + encoding="latin-1") as output_stream: + output_stream.writelines(new_lm_lines) + + +def main(): + old_lm_lines = read_old_lm() + max_ngrams, skip_rows, ngram_counts = get_ngram_stats(old_lm_lines) + new_lm_lines, ngram_diffs = find_and_replace_unks( + old_lm_lines, max_ngrams, skip_rows) + write_new_lm(new_lm_lines, ngram_counts, ngram_diffs) + + +if __name__ == "__main__": + main() diff --git a/egs/wsj/s5/utils/nnet/subset_data_tr_cv.sh b/egs/wsj/s5/utils/nnet/subset_data_tr_cv.sh index 788e041dc5c..d8694bdf36d 100755 --- a/egs/wsj/s5/utils/nnet/subset_data_tr_cv.sh +++ b/egs/wsj/s5/utils/nnet/subset_data_tr_cv.sh @@ -39,7 +39,7 @@ if [ -z "$cv_spk_list" ]; then # Select 'cv_spk_percent' speakers randomly, cat $src_data/spk2utt | awk '{ print $1; }' | utils/shuffle_list.pl --srand $seed >$tmp/speakers n_spk=$(wc -l <$tmp/speakers) - n_spk_cv=$((cv_spk_percent * n_spk / 100)) + n_spk_cv=$(perl -e "print int($cv_spk_percent * $n_spk / 100); ") # head -n $n_spk_cv $tmp/speakers >$tmp/speakers_cv tail -n+$((n_spk_cv+1)) $tmp/speakers >$tmp/speakers_trn diff --git a/egs/wsj/s5/utils/parallel/queue.pl b/egs/wsj/s5/utils/parallel/queue.pl index 10fd3b1a885..e14af5ef6e3 100755 --- a/egs/wsj/s5/utils/parallel/queue.pl +++ b/egs/wsj/s5/utils/parallel/queue.pl @@ -94,7 +94,8 @@ sub caught_signal { if ( defined $sge_job_id ) { # Signal trapped after submitting jobs my $signal = $!; system ("qdel 
$sge_job_id"); - die "Caught a signal: $signal , deleting SGE task: $sge_job_id and exiting\n"; + print STDERR "Caught a signal: $signal , deleting SGE task: $sge_job_id and exiting\n"; + exit(2); } } @@ -395,6 +396,7 @@ sub caught_signal { if (!close(Q)) { # close was not successful... || die "Could not close script file $shfile"; die "Failed to close the script file (full disk?)"; } +chmod 0755, $queue_scriptfile; # This block submits the job to the queue. for (my $try = 1; $try < 5; $try++) { diff --git a/egs/wsj/s5/utils/parallel/retry.pl b/egs/wsj/s5/utils/parallel/retry.pl new file mode 100755 index 00000000000..a039d6f5a74 --- /dev/null +++ b/egs/wsj/s5/utils/parallel/retry.pl @@ -0,0 +1,106 @@ +#!/usr/bin/env perl +use strict; +use warnings; + +# Copyright 2018 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + +use File::Basename; +use Cwd; +use Getopt::Long; + + +# retry.pl is a wrapper for queue.pl. It can be used to retry jobs that failed, +# e.g. if your command line was "queue.pl [args]", you can replace that +# with "retry.pl queue.pl [args]" and it will retry jobs that failed. + + +my $num_tries = 2; + +sub print_usage() { + print STDERR + "Usage: retry.pl \n" . + " e.g.: retry.pl [options] queue.pl foo.log do_something\n" . + "This will retry jobs that failed (only once)\n" . + "Options:\n" . + " --num-tries # default: 2\n"; + exit 1; +} + +if ($ARGV[0] eq "--num-tries") { + shift; + $num_tries = $ARGV[0] + 0; + if ($num_tries < 1) { + die "$0: invalid option --num-tries $ARGV[0]"; + } + shift; +} + +if (@ARGV < 3) { + print_usage(); +} + + +sub get_log_file { + my $n; + # First just look for the first command-line arg that ends in ".log". If that + # exists, it's almost certainly the log file. + for ($n = 1; $n < @ARGV; $n++) { + if ($ARGV[$n] =~ m/\.log$/) { + return $ARGV[$n]; + } + } + for ($n = 1; $n < @ARGV; $n++) { + # If this arg isn't of the form "-some-option', and isn't of the form + # "JOB=1:10", and the previous arg wasn't of the form "-some-option", and this + # isn't just a number (note: the 'not-a-number' things is mostly to exclude + # things like the 5 in "-pe smp 5" which is an older but still-supported + # option to queue.pl)... then assume it's a log file. + if ($ARGV[$n] !~ m/^-=/ && $ARGV[$n] !~ m/=/ && $ARGV[$n] !~ m/^\d+$/ && + $ARGV[$n-1] !~ m/^-/) { + return $ARGV[$n]; + } + } + print STDERR "$0: failed to parse log-file name from args:" . join(" ", @ARGV); + exit(1); +} + + +my $log_file = get_log_file(); +my $return_status; + +for (my $n = 1; $n <= $num_tries; $n++) { + system(@ARGV); + $return_status = $?; + if ($return_status == 0) { + exit(0); # The command succeeded. We return success. + } elsif ($return_status != 256) { + # The command did not "die normally". When queue.pl and similar scripts + # detect a normal error, they exit(1), which becomes a status of 256 + # in perl's $? variable. + # See http://perldoc.perl.org/perlvar.html#%24CHILD_ERROR for more info. + # An example of an abnormal death that would cause us to want to exit + # immediately, is when the user does ctrl-c or KILLs the script, + # which gets caught by 'caught_signal' in queue.pl and causes that program + # to return with exit status 2. + exit(1); + } + + + if ($n < $num_tries) { + if (! -f $log_file) { + # $log_file doesn't exist as a file. Maybe it was an array job. + # This script doesn't yet support array jobs. We just give up. + # Later on we might want to figure out which array jobs failed + # and have to be rerun, but for now we just die. 
+ print STDERR "$0: job failed and log file $log_file does not exist (array job?).\n"; + exit($return_status) + } else { + rename($log_file, $log_file . ".bak"); + print STDERR "$0: job failed; renaming log file to ${log_file}.bak and rerunning\n"; + } + } +} + +print STDERR "$0: job failed $num_tries times; log is in $log_file\n"; +exit(1); diff --git a/egs/wsj/s5/utils/perturb_data_dir_speed.sh b/egs/wsj/s5/utils/perturb_data_dir_speed.sh index 667bd934f04..a50cdb04be4 100755 --- a/egs/wsj/s5/utils/perturb_data_dir_speed.sh +++ b/egs/wsj/s5/utils/perturb_data_dir_speed.sh @@ -2,6 +2,7 @@ # Copyright 2013 Johns Hopkins University (author: Daniel Povey) # 2014 Tom Ko +# 2018 Emotech LTD (author: Pawel Swietojanski) # Apache 2.0 # This script operates on a directory, such as in data/train/, @@ -10,6 +11,8 @@ # spk2utt # utt2spk # text +# utt2dur +# reco2dur # # It generates the files which are used for perturbing the speed of the original data. @@ -52,6 +55,7 @@ mkdir -p $destdir cat $srcdir/utt2spk | awk -v p=$utt_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/utt_map cat $srcdir/spk2utt | awk -v p=$spk_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/spk_map +cat $srcdir/wav.scp | awk -v p=$spk_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/reco_map if [ ! -f $srcdir/utt2uniq ]; then cat $srcdir/utt2spk | awk -v p=$utt_prefix '{printf("%s%s %s\n", p, $1, $1);}' > $destdir/utt2uniq else @@ -65,8 +69,6 @@ cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \ utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt if [ -f $srcdir/segments ]; then - # also apply the spk_prefix to the recording-ids. - cat $srcdir/wav.scp | awk -v p=$spk_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/reco_map utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments | \ utils/apply_map.pl -f 2 $destdir/reco_map | \ @@ -83,7 +85,6 @@ if [ -f $srcdir/segments ]; then utils/apply_map.pl -f 1 $destdir/reco_map <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel fi - rm $destdir/reco_map 2>/dev/null else # no segments->wav indexed by utterance. if [ -f $srcdir/wav.scp ]; then utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/wav.scp | sed 's/| *$/ |/' | \ @@ -102,15 +103,23 @@ if [ -f $srcdir/spk2gender ]; then utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender fi +#prepare speed-perturbed utt2dur if [ ! -f $srcdir/utt2dur ]; then # generate utt2dur if it does not exist in srcdir utils/data/get_utt2dur.sh $srcdir fi - cat $srcdir/utt2dur | utils/apply_map.pl -f 1 $destdir/utt_map | \ awk -v factor=$factor '{print $1, $2/factor;}' >$destdir/utt2dur -rm $destdir/spk_map $destdir/utt_map 2>/dev/null +#prepare speed-perturbed reco2dur +if [ ! 
-f $srcdir/reco2dur ]; then + # generate reco2dur if it does not exist in srcdir + utils/data/get_reco2dur.sh $srcdir +fi +cat $srcdir/reco2dur | utils/apply_map.pl -f 1 $destdir/reco_map | \ + awk -v factor=$factor '{print $1, $2/factor;}' >$destdir/reco2dur + +rm $destdir/spk_map $destdir/utt_map $destdir/reco_map 2>/dev/null echo "$0: generated speed-perturbed version of data in $srcdir, in $destdir" utils/validate_data_dir.sh --no-feats --no-text $destdir diff --git a/egs/wsj/s5/utils/prepare_extended_lang.sh b/egs/wsj/s5/utils/prepare_extended_lang.sh new file mode 100755 index 00000000000..824654cabf1 --- /dev/null +++ b/egs/wsj/s5/utils/prepare_extended_lang.sh @@ -0,0 +1,165 @@ +#!/bin/bash +# Copyright 2018 Xiaohui Zhang + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# This script adds word-position-dependent phones and constructs a host of other +# derived files, that go in data/lang/. + +# Begin configuration section. +prep_lang_opts= +stage=0 +word_list= # if a word list (mapping words from the srcdict to IDs) is provided, +# we'll make sure the IDs of these words are kept as before. +# end configuration sections + +echo "$0 $@" # Print the command line for logging + +. utils/parse_options.sh + +if [ $# -ne 7 ]; then + echo "usage: utils/prepare_extended_lang.sh " + echo " " + echo "e.g.: utils/prepare_extended_lang.sh data/local/dict '' lexicon_extra.txt" + echo "data/lang/phones.txt data/local/dict_ext data/local/lang_ext data/lang_ext" + echo "The goal is to extend the lexicon from with extra lexical entries from " + echo ", putting the extended lexicon into , and then build" + echo "a valid lang dir . This is useful when we want to extend the vocab" + echo "in test time." + echo " must be a valid dictionary dir and is the oov word " + echo "(see utils/prepare_lang.sh for details). A phone symbol table from a previsouly built " + echo "lang dir is required, for validating provided lexical entries." + echo "options: " + echo " --prep-lang-opts STRING # options to pass to utils/prepare_lang.sh" + echo " --word-list # default: \"\"; if not empty, re-order the " + echo " # words in the generated words.txt so that the" + echo " # words from the provided list have their ids" + echo " # kept unchanged." + exit 1; +fi + +srcdict=$1 +oov_word=$2 +extra_lexicon=$3 +phone_symbol_table=$4 +extdict=$5 # extended dict dir +tmpdir=$6 +extlang=$7 # extended lang dir + +mkdir -p $extlang $tmpdir + +[ -f path.sh ] && . ./path.sh + +! utils/validate_dict_dir.pl $srcdict && \ + echo "*Error validating directory $srcdict*" && exit 1; + +if [[ ! -f $srcdict/lexicon.txt ]]; then + echo "**Creating $dir/lexicon.txt from $dir/lexiconp.txt" + perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' < $srcdict/lexiconp.txt \ + > $srcdict/lexicon.txt || exit 1; +fi + +if [[ ! 
-f $srcdict/lexiconp.txt ]]; then + echo "**Creating $srcdict/lexiconp.txt from $srcdict/lexicon.txt" + perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdict/lexicon.txt > $srcdict/lexiconp.txt || exit 1; +fi + +# Checks if the phone sets match. +echo "$(basename $0): Validating the source lexicon" +cat $srcdict/lexicon.txt | awk -v f=$phone_symbol_table ' +BEGIN { while ((getline < f) > 0) { sub(/_[BEIS]$/, "", $1); phones[$1] = 1; }} +{ for (x = 2; x <= NF; ++x) { + if (!($x in phones)) { + print "The source lexicon contains a phone not in the phones.txt: "$x; + print "You must provide a phones.txt from the lang built with the source lexicon."; + exit 1; + } +}}' || exit 1; + +echo "$(basename $0): Validating the extra lexicon" +cat $extra_lexicon | awk -v f=$phone_symbol_table ' +BEGIN { while ((getline < f) > 0) { sub(/_[BEIS]$/, "", $1); phones[$1] = 1; }} +{ for (x = 2; x <= NF; ++x) { if (!($x in phones)) { + print "The extra lexicon contains a phone not in the phone symbol table: "$x; exit 1; } + } +}' || exit 1; + +if [ $stage -le 0 ]; then + # Genearte the extended dict dir + echo "$(basename $0): Creating the extended lexicon $extdict/lexicon.txt" + [ -d $extdict ] && rm -r $extdict 2>/dev/null + cp -R $srcdict $extdict 2>/dev/null + + # Reformat the source lexicon + perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' <$srcdict/lexiconp.txt | awk '{ gsub(/\t/, " "); print }' \ + >$tmpdir/lexicon.txt || exit 1; + + # Filter lexical entries which are already in the source lexicon + awk '{ gsub(/\t/, " "); print }' $extra_lexicon | sort -u | \ + awk 'NR==FNR{a[$0]=1;next} {if (!($0 in a)) print $0 }' $tmpdir/lexicon.txt - \ + > $extdict/lexicon_extra.txt || exit 1; + + echo "$(basename $0): Creating $extdict/lexiconp.txt from $srcdict/lexiconp.txt and $extdict/lexicon_extra.txt" + perl -ape 's/(\S+\s+)(.+)/${1}1 $2/;' < $extdict/lexicon_extra.txt | \ + cat $srcdict/lexiconp.txt - | awk '{ gsub(/\t/, " "); print }' | \ + sort -u -k1,1 -k2g,2 -k3 > $extdict/lexiconp.txt || exit 1; + + perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' <$extdict/lexiconp.txt >$extdict/lexicon.txt || exit 1; + + # Create lexicon_silprobs.txt + silprob=false + [ -f $srcdict/lexiconp_silprob.txt ] && silprob=true + if "$silprob"; then + echo "$(basename $0): Creating $extdict/lexiconp_silprob.txt from $srcdict/lexiconp_silprob.txt" + # Here we assume no acoustic evidence for the extra word-pron pairs. + # So we assign silprob1 = overall_silprob, silprob2 = silprob3 = 1.00 + overall_silprob=`awk '{if ($1=="overall") print $2}' $srcdict/silprob.txt` + awk -v overall=$overall_silprob '{ + printf("%s %d %.1f %.2f %.2f",$1, 1, overall, 1.00, 1.00); + for(n=2;n<=NF;n++) printf " "$n; printf("\n"); + }' $extdict/lexicon_extra.txt | cat $srcdict/lexiconp_silprob.txt - | \ + sort -k1,1 -k2g,2 -k6 \ + > $extdict/lexiconp_silprob.txt || exit 1; + fi + + if ! utils/validate_dict_dir.pl $extdict >&/dev/null; then + utils/validate_dict_dir.pl $extdict # show the output. + echo "$(basename $0): Validation failed on the extended dict" + exit 1; + fi +fi + +if [ $stage -le 1 ]; then + echo "$(basename $0): Preparing the extended lang dir." + [ -d $extlang ] && rm -r $extlang 2>/dev/null + utils/prepare_lang.sh $prep_lang_opts $extdict \ + $oov_word $tmpdir $extlang || exit 1; + + # If a word list is provided, make sure the word-ids of these words are kept unchanged + # in the extended word list. + if [ -f $word_list ]; then + # First, make sure there's no OOV in the provided word-list. 
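When a --word-list is supplied, prepare_extended_lang.sh re-orders the freshly generated words.txt so that the original words keep their integer ids and only genuinely new words receive ids after them (the awk block that follows below). A small Python restatement of that re-ordering, assuming the usual consecutive 0-based ids of a words.txt; reorder_words is an invented name for illustration.

```python
def reorder_words(orig_word_list, extended_words):
    # Keep the ids from the original word list and append words that exist
    # only in the extended vocabulary after them.
    lines = list(orig_word_list)                 # [(word, id)] as in words.txt
    known = {w for w, _ in lines}
    next_id = len(lines)
    for w in extended_words:
        if w not in known:
            lines.append((w, next_id))
            next_id += 1
    return lines

orig = [("<eps>", 0), ("a", 1), ("abandon", 2)]
extended = ["<eps>", "a", "aardvark", "abandon"]   # words in the new words.txt
print(reorder_words(orig, extended))
# [('<eps>', 0), ('a', 1), ('abandon', 2), ('aardvark', 3)]
```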
+ if [ `awk -v s=$extlang/words.txt 'BEGIN{ while((getline < s) > 0) { vocab[$1] = 1;}} \ + {if (!($1 in vocab)) print $0}' $word_list | wc -l` -gt 0 ]; then + echo "$(basename $0): The provided word list contains words out of the extended vocab." + exit 1; + fi + awk -v s=$word_list -v oov=$oov_word -v boost=$oov_unigram_prob -v prob=$oov_prob \ + 'BEGIN{ while((getline < s) > 0) { vocab[$1] = 1; n+=1; print $0}} \ + { if (!($1 in vocab)) {print $1" "n; n+=1;}}' $extlang/words.txt > $extlang/words.txt.$$ + mv $extlang/words.txt.$$ $extlang/words.txt + fi +fi + +exit 0; diff --git a/egs/wsj/s5/utils/retry.pl b/egs/wsj/s5/utils/retry.pl new file mode 120000 index 00000000000..05a756c9747 --- /dev/null +++ b/egs/wsj/s5/utils/retry.pl @@ -0,0 +1 @@ +parallel/retry.pl \ No newline at end of file diff --git a/egs/wsj/s5/utils/validate_data_dir.sh b/egs/wsj/s5/utils/validate_data_dir.sh index dbbaeb10d5d..a8b0542c1bb 100755 --- a/egs/wsj/s5/utils/validate_data_dir.sh +++ b/egs/wsj/s5/utils/validate_data_dir.sh @@ -1,5 +1,6 @@ #!/bin/bash +cmd="$@" no_feats=false no_wav=false @@ -44,6 +45,12 @@ if [ ! -d $data ]; then exit 1; fi +if [ -f $data/images.scp ]; then + cmd=${cmd/--no-wav/} # remove --no-wav if supplied + image/validate_data_dir.sh $cmd + exit $? +fi + for f in spk2utt utt2spk; do if [ ! -f $data/$f ]; then echo "$0: no such file $f" @@ -337,4 +344,27 @@ if [ -f $data/utt2dur ]; then fi +if [ -f $data/reco2dur ]; then + check_sorted_and_uniq $data/reco2dur + cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur + if [ -f $tmpdir/recordings ]; then + if ! cmp -s $tmpdir/recordings{,.reco2dur}; then + echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file" + echo "$0: differ, partial diff is:" + partial_diff $tmpdir/recordings{,.reco2dur} + exit 1; + fi + else + if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then + echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file" + echo "$0: differ, partial diff is:" + partial_diff $tmpdir/{utts,recordings.reco2dur} + exit 1; + fi + fi + cat $data/reco2dur | \ + awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1 +fi + + echo "$0: Successfully validated data-directory $data" diff --git a/egs/wsj/s5/utils/validate_dict_dir.pl b/egs/wsj/s5/utils/validate_dict_dir.pl index 25e45da97e5..981dc005116 100755 --- a/egs/wsj/s5/utils/validate_dict_dir.pl +++ b/egs/wsj/s5/utils/validate_dict_dir.pl @@ -58,7 +58,11 @@ sub validate_utf8_whitespaces { my $current_line = $unicode_lines->[$i]; # we replace TAB, LF, CR, and SPACE # this is to simplify the test - $current_line =~ s/[\x{0009}\x{000a}\x{000d}\x{0020}]/./g; + if ($current_line =~ /\x{000d}/) { + print STDERR "$0: The current line (nr. $i) contains CR (0x0D) character\n"; + return 1; + } + $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; if ($current_line =~/\s/) { return 1; } diff --git a/egs/wsj/s5/utils/validate_lang.pl b/egs/wsj/s5/utils/validate_lang.pl index 7e95545b284..2501d25c8f3 100755 --- a/egs/wsj/s5/utils/validate_lang.pl +++ b/egs/wsj/s5/utils/validate_lang.pl @@ -58,7 +58,11 @@ sub validate_utf8_whitespaces { my $current_line = $unicode_lines->[$i]; # we replace TAB, LF, CR, and SPACE # this is to simplify the test - $current_line =~ s/[\x{0009}\x{000a}\x{000d}\x{0020}]/./g; + if ($current_line =~ /\x{000d}/) { + print STDERR "$0: The current line (nr. 
$i) contains CR (0x0D) character\n"; + return 1; + } + $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; if ($current_line =~/\s/) { return 1; } diff --git a/egs/wsj/s5/utils/validate_text.pl b/egs/wsj/s5/utils/validate_text.pl index 70ed73d6f15..172396c867e 100755 --- a/egs/wsj/s5/utils/validate_text.pl +++ b/egs/wsj/s5/utils/validate_text.pl @@ -74,10 +74,17 @@ sub validate_utf8_whitespaces { use feature 'unicode_strings'; for (my $i = 0; $i < scalar @{$unicode_lines}; $i++) { my $current_line = $unicode_lines->[$i]; + my @A = split(" ", $current_line); + my $utt_id = $A[0]; # we replace TAB, LF, CR, and SPACE # this is to simplify the test - $current_line =~ s/[\x{0009}\x{000a}\x{000d}\x{0020}]/./g; + if ($current_line =~ /\x{000d}/) { + print STDERR "$0: The line for utterance $utt_id contains CR (0x0D) character\n"; + return 1; + } + $current_line =~ s/[\x{0009}\x{000a}\x{0020}]/./g; if ($current_line =~/\s/) { + print STDERR "$0: The line for utterance $utt_id contains disallowed Unicode whitespaces\n"; return 1; } } diff --git a/misc/docker/ubuntu-cuda/Dockerfile b/misc/docker/ubuntu-cuda/Dockerfile new file mode 100644 index 00000000000..f61d4403355 --- /dev/null +++ b/misc/docker/ubuntu-cuda/Dockerfile @@ -0,0 +1,28 @@ +FROM nvidia/cuda:9.1-devel-ubuntu16.04 + +MAINTAINER sih4sing5hong5 + +ENV CPU_CORE 4 + +RUN \ + apt-get update -qq && \ + apt-get install -y \ + git bzip2 wget \ + g++ make python python3 \ + zlib1g-dev automake autoconf libtool subversion \ + libatlas-base-dev + + +WORKDIR /usr/local/ +# Use the newest kaldi version +RUN git clone https://github.com/kaldi-asr/kaldi.git + + +WORKDIR /usr/local/kaldi/tools +RUN extras/check_dependencies.sh +RUN make -j $CPU_CORE + +WORKDIR /usr/local/kaldi/src +RUN ./configure && make depend -j $CPU_CORE && make -j $CPU_CORE + + diff --git a/misc/docker/ubuntu/Dockerfile b/misc/docker/ubuntu/Dockerfile index 3199b360b6f..6e2bc5def92 100644 --- a/misc/docker/ubuntu/Dockerfile +++ b/misc/docker/ubuntu/Dockerfile @@ -10,7 +10,7 @@ RUN \ git bzip2 wget \ g++ make python python3 \ zlib1g-dev automake autoconf libtool subversion \ - libatlas-dev libatlas-base-dev + libatlas-base-dev WORKDIR /usr/local/ diff --git a/scripts/rnnlm/change_vocab.sh b/scripts/rnnlm/change_vocab.sh new file mode 100755 index 00000000000..96fdfa14477 --- /dev/null +++ b/scripts/rnnlm/change_vocab.sh @@ -0,0 +1,76 @@ +#!/usr/bin/env bash + +# Copyright 2018 Xiaohui Zhang + +# This script prepares a new rnnlm-dir commpatible with a new vocab from a provided word-list, +# given an exisiting rnnlm-dir containing a trained rnnlm. Basically, we copy the feature +# embedding, a trained rnnlm and some config files from the old rnnlm-dir. And then we re- +# generate the unigram_probs.txt (a fixed unigram prob is assigned to words out of the orignal vocab), +# word_feats.txt and word embeddings. + +cmd=run.pl +oov_unigram_prob=0.0000001 + +. utils/parse_options.sh + +if [ $# != 3 ]; then + echo "Usage: $0 [options] " + echo "Prepare a new directory with a given and a valid ." + echo " is a vocabulary file with mapping to integers." + exit 1 +fi + +set -e +[ -f path.sh ] && . ./path.sh + +word_list=$1 +rnnlm_in_dir=$2 +rnnlm_out_dir=$3 + +for f in features.txt data_weights.txt oov.txt xconfig; do + if [ ! -f $rnnlm_in_dir/config/$f ]; then + echo "$0: file $rnnlm_in_dir/config/$f is not present." + exit 1 + fi +done + +for f in unigram_probs.txt feat_embedding.final.mat final.raw; do + if [ ! -f $rnnlm_in_dir/$f ]; then + echo "$0: file $rnnlm_in_dir/$f is not present." 
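The changes to validate_dict_dir.pl, validate_lang.pl and validate_text.pl above replace the old blanket whitespace substitution with an explicit rule: any carriage return is reported as an error, and only TAB, LF and SPACE count as legal whitespace. The equivalent check in Python, for illustration only (check_whitespace is not part of the patch):

```python
import re

def check_whitespace(line):
    # Reject carriage returns outright, then blank out the allowed TAB/LF/SPACE
    # and reject anything that still matches a Unicode whitespace class.
    if "\r" in line:
        return "contains CR (0x0D) character"
    if re.search(r"\s", re.sub("[\t\n ]", ".", line)):
        return "contains disallowed Unicode whitespace"
    return None

print(check_whitespace("utt1 hello world\n"))        # None
print(check_whitespace("utt2 hello\u00a0world\n"))   # disallowed whitespace
print(check_whitespace("utt3 hello\r\n"))            # CR character
```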
+ exit 1 + fi +done + +echo "$0: Copying config directory." +mkdir -p $rnnlm_out_dir/config +for f in features.txt data_weights.txt oov.txt xconfig; do + cp $rnnlm_in_dir/config/$f $rnnlm_out_dir/config +done + +for f in feat_embedding.final.mat final.raw; do + cp -L $rnnlm_in_dir/$f $rnnlm_out_dir/ +done + +echo "$0: Re-generating words.txt, unigram_probs.txt, word_feats.txt and word_embedding.final.mat." +cp $word_list $rnnlm_out_dir/config/words.txt + +brk_id=`cat $rnnlm_out_dir/config/words.txt | wc -l` +echo " $brk_id" >> $rnnlm_out_dir/config/words.txt + +# Generate new unigram_probs.txt. For words within the original vocab, we just take the prob +# from the original unigram_probs.txt. For new words added, we assign the prob as $oov_unigram_prob. +awk -v s=$rnnlm_in_dir/unigram_probs.txt -v t=$rnnlm_in_dir/config/words.txt -v oov_prob=$oov_unigram_prob \ + 'BEGIN { while ((getline 0) { id2prob[$1] = $2; } + while ((getline 0) { word2prob[$1] = id2prob[$2]; } + } + { if ($1 in word2prob) print $2" "word2prob[$1]; else print $2" "oov_prob; }' \ + $rnnlm_out_dir/config/words.txt | sort -k1,1 -n > $rnnlm_out_dir/unigram_probs.txt + +rnnlm/get_special_symbol_opts.py < $rnnlm_out_dir/config/words.txt > $rnnlm_out_dir/special_symbol_opts.txt + +# Re-compute words_feats.txt and word embeddings. +rnnlm/get_word_features.py --unigram-probs=$rnnlm_out_dir/unigram_probs.txt --treat-as-bos='#0' \ + $rnnlm_out_dir/config/words.txt $rnnlm_out_dir/config/features.txt > $rnnlm_out_dir/word_feats.txt + +rnnlm-get-word-embedding $rnnlm_out_dir/word_feats.txt $rnnlm_out_dir/feat_embedding.final.mat \ + $rnnlm_out_dir/word_embedding.final.mat diff --git a/scripts/rnnlm/choose_features.py b/scripts/rnnlm/choose_features.py index dd742cb82e1..0686c8f88c6 100755 --- a/scripts/rnnlm/choose_features.py +++ b/scripts/rnnlm/choose_features.py @@ -8,8 +8,10 @@ import sys import math from collections import defaultdict -sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) +sys.stdout = open(1, 'w', encoding='latin-1', closefd=False) +import re +tab_or_space = re.compile('[ \t]') parser = argparse.ArgumentParser(description="This script chooses the sparse feature representation of words. " "To be more specific, it chooses the set of features-- you compute " @@ -84,9 +86,9 @@ # and 'wordlist' is a list indexed by integer id, that returns the string-valued word. def read_vocab(vocab_file): vocab = {} - with open(vocab_file, 'r', encoding="utf-8") as f: + with open(vocab_file, 'r', encoding="latin-1") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) == 2 if fields[0] in vocab: sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}" @@ -113,9 +115,9 @@ def read_vocab(vocab_file): # id of the word, which evaluates to the unigram prob of the word. 
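rnnlm/change_vocab.sh regenerates unigram_probs.txt for the new vocabulary: a word that already existed keeps the probability looked up through the old words.txt, while newly added words get the small fixed fallback --oov-unigram-prob (default 1e-7). A compact Python restatement of that awk pipeline; remap_unigram_probs and the toy tables are invented for the example.

```python
def remap_unigram_probs(old_words, old_probs, new_words, oov_prob=1e-7):
    # Words kept from the old vocab keep their old probability; words that are
    # new in the extended vocab get the fixed fallback probability.
    old_prob_by_word = {w: old_probs[i] for w, i in old_words.items()
                        if i in old_probs}
    return {new_id: old_prob_by_word.get(w, oov_prob)
            for w, new_id in new_words.items()}

old_words = {"<s>": 1, "hello": 2}      # word -> id in the old words.txt
old_probs = {1: 0.05, 2: 0.001}         # id -> prob from unigram_probs.txt
new_words = {"<s>": 1, "hello": 2, "aardvark": 3}
print(remap_unigram_probs(old_words, old_probs, new_words))
# {1: 0.05, 2: 0.001, 3: 1e-07}
```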
def read_unigram_probs(unigram_probs_file): unigram_probs = [] - with open(unigram_probs_file, 'r', encoding="utf-8") as f: + with open(unigram_probs_file, 'r', encoding="latin-1") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) == 2 idx = int(fields[0]) if idx >= len(unigram_probs): diff --git a/scripts/rnnlm/get_best_model.py b/scripts/rnnlm/get_best_model.py index 45487b18b0c..e8c6bd8a2f4 100755 --- a/scripts/rnnlm/get_best_model.py +++ b/scripts/rnnlm/get_best_model.py @@ -22,7 +22,7 @@ num_iters=None try: - with open(args.rnnlm_dir + "/info.txt", encoding="utf-8") as f: + with open(args.rnnlm_dir + "/info.txt", encoding="latin-1") as f: for line in f: a = line.split("=") if a[0] == "num_iters": @@ -38,10 +38,10 @@ best_objf=-2000 best_iter=-1 -for i in range(num_iters): +for i in range(1, num_iters): this_logfile = "{0}/log/compute_prob.{1}.log".format(args.rnnlm_dir, i) try: - f = open(this_logfile, 'r', encoding='utf-8') + f = open(this_logfile, 'r', encoding='latin-1') except: sys.exit(sys.argv[0] + ": could not open log-file {0}".format(this_logfile)) this_objf=-1000 diff --git a/scripts/rnnlm/get_embedding_dim.py b/scripts/rnnlm/get_embedding_dim.py index b6810ef2cbf..a5ddb8c25f3 100755 --- a/scripts/rnnlm/get_embedding_dim.py +++ b/scripts/rnnlm/get_embedding_dim.py @@ -45,7 +45,7 @@ left_context=0 right_context=0 for line in out_lines: - line = line.decode('utf-8') + line = line.decode('latin-1') m = re.search(r'input-node name=input dim=(\d+)', line) if m is not None: try: diff --git a/scripts/rnnlm/get_num_splits.sh b/scripts/rnnlm/get_num_splits.sh index 93d1f7f169c..974fd8bf204 100755 --- a/scripts/rnnlm/get_num_splits.sh +++ b/scripts/rnnlm/get_num_splits.sh @@ -65,7 +65,7 @@ tot_with_multiplicities=0 for f in $text/*.counts; do if [ "$f" != "$text/dev.counts" ]; then - this_tot=$(cat $f | awk '{tot += $2} END{print tot}') + this_tot=$(cat $f | awk '{tot += $2} END{printf("%d", tot)}') if ! [ $this_tot -gt 0 ]; then echo "$0: there were no counts in counts file $f" 1>&2 exit 1 diff --git a/scripts/rnnlm/get_special_symbol_opts.py b/scripts/rnnlm/get_special_symbol_opts.py index 2120d9a5d26..83f7d708a49 100755 --- a/scripts/rnnlm/get_special_symbol_opts.py +++ b/scripts/rnnlm/get_special_symbol_opts.py @@ -8,6 +8,9 @@ import argparse import sys +import re +tab_or_space = re.compile('[ \t]') + parser = argparse.ArgumentParser(description="This script checks whether the special symbols " "appear in words.txt with expected values, if not, it will " "print out the options with correct value to stdout, which may look like " @@ -25,9 +28,10 @@ lower_ids = {} upper_ids = {} -input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='latin-1') for line in input_stream: - fields = line.split() + fields = re.split(tab_or_space, line) + assert(len(fields) == 2) sym = fields[0] if sym in special_symbols: assert sym not in lower_ids diff --git a/scripts/rnnlm/get_unigram_probs.py b/scripts/rnnlm/get_unigram_probs.py index 98721d728ba..abb8515f330 100755 --- a/scripts/rnnlm/get_unigram_probs.py +++ b/scripts/rnnlm/get_unigram_probs.py @@ -7,6 +7,9 @@ import argparse import sys +import re +tab_or_space = re.compile('[ \t]') + parser = argparse.ArgumentParser(description="This script gets the unigram probabilities of words.", epilog="E.g. 
" + sys.argv[0] + " --vocab-file=data/rnnlm/vocab/words.txt " "--data-weights-file=exp/rnnlm/data_weights.txt data/rnnlm/data " @@ -74,10 +77,10 @@ def get_all_data_sources_except_dev(text_dir): # value is a tuple (repeated_times_per_epoch, weight) def read_data_weights(weights_file, data_sources): data_weights = {} - with open(weights_file, 'r', encoding="utf-8") as f: + with open(weights_file, 'r', encoding="latin-1") as f: for line in f: try: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) == 3 if fields[0] in data_weights: raise Exception("duplicated data source({0}) specified in " @@ -99,9 +102,9 @@ def read_data_weights(weights_file, data_sources): # return the vocab, which is a dict mapping the word to a integer id. def read_vocab(vocab_file): vocab = {} - with open(vocab_file, 'r', encoding="utf-8") as f: + with open(vocab_file, 'r', encoding="latin-1") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) == 2 if fields[0] in vocab: sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}" @@ -128,10 +131,11 @@ def get_counts(data_sources, data_weights, vocab): if weight == 0.0: continue - with open(counts_file, 'r', encoding="utf-8") as f: + with open(counts_file, 'r', encoding="latin-1") as f: for line in f: - fields = line.split() - assert len(fields) == 2 + fields = re.split(tab_or_space, line) + if len(fields) != 2: print("Warning, should be 2 cols:", fields, line, file=sys.stderr); + assert(len(fields) == 2) word = fields[0] count = fields[1] if word not in vocab: diff --git a/scripts/rnnlm/get_vocab.py b/scripts/rnnlm/get_vocab.py index f290ef721c1..e30ce4a94c9 100755 --- a/scripts/rnnlm/get_vocab.py +++ b/scripts/rnnlm/get_vocab.py @@ -6,7 +6,10 @@ import os import argparse import sys -sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) +sys.stdout = open(1, 'w', encoding='latin-1', closefd=False) + +import re +tab_or_space = re.compile('[ \t]') parser = argparse.ArgumentParser(description="This script get a vocab from unigram counts " "of words produced by get_unigram_counts.sh", @@ -25,10 +28,10 @@ # Add the count for every word in counts_file # the result is written into word_counts def add_counts(word_counts, counts_file): - with open(counts_file, 'r', encoding="utf-8") as f: + with open(counts_file, 'r', encoding="latin-1") as f: for line in f: line = line.strip() - word_and_count = line.split() + word_and_count = re.split(tab_or_space, line) assert len(word_and_count) == 2 if word_and_count[0] in word_counts: word_counts[word_and_count[0]] += int(word_and_count[1]) diff --git a/scripts/rnnlm/get_word_features.py b/scripts/rnnlm/get_word_features.py index 744c486cfd9..54d84077060 100755 --- a/scripts/rnnlm/get_word_features.py +++ b/scripts/rnnlm/get_word_features.py @@ -9,6 +9,9 @@ import math from collections import defaultdict +import re +tab_or_space = re.compile('[ \t]') + parser = argparse.ArgumentParser(description="This script turns the words into the sparse feature representation, " "using features from rnnlm/choose_features.py.", epilog="E.g. " + sys.argv[0] + " --unigram-probs=exp/rnnlm/unigram_probs.txt " @@ -38,9 +41,9 @@ # return the vocab, which is a dict mapping the word to a integer id. 
def read_vocab(vocab_file): vocab = {} - with open(vocab_file, 'r', encoding="utf-8") as f: + with open(vocab_file, 'r', encoding="latin-1") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) == 2 if fields[0] in vocab: sys.exit(sys.argv[0] + ": duplicated word({0}) in vocab: {1}" @@ -59,9 +62,9 @@ def read_vocab(vocab_file): # return a list of unigram_probs, indexed by word id def read_unigram_probs(unigram_probs_file): unigram_probs = [] - with open(unigram_probs_file, 'r', encoding="utf-8") as f: + with open(unigram_probs_file, 'r', encoding="latin-1") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) == 2 idx = int(fields[0]) if idx >= len(unigram_probs): @@ -100,9 +103,9 @@ def read_features(features_file): feats['min_ngram_order'] = 10000 feats['max_ngram_order'] = -1 - with open(features_file, 'r', encoding="utf-8") as f: + with open(features_file, 'r', encoding="latin-1") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert(len(fields) in [3, 4, 5]) feat_id = int(fields[0]) diff --git a/scripts/rnnlm/lmrescore.sh b/scripts/rnnlm/lmrescore.sh index cd0cf793d8d..9da22ae75a2 100755 --- a/scripts/rnnlm/lmrescore.sh +++ b/scripts/rnnlm/lmrescore.sh @@ -72,6 +72,12 @@ awk -v n=$0 -v w=$weight 'BEGIN {if (w < 0 || w > 1) { print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \ || exit 1; +if ! head -n -1 $rnnlm_dir/config/words.txt | cmp $oldlang/words.txt -; then + # the last word of the RNNLM word list is an added word + echo "$0: Word lists mismatch for lattices and RNNLM." + exit 1 +fi + oldlm_command="fstproject --project_output=true $oldlm |" special_symbol_opts=$(cat $rnnlm_dir/special_symbol_opts.txt) diff --git a/scripts/rnnlm/lmrescore_back.sh b/scripts/rnnlm/lmrescore_back.sh new file mode 100755 index 00000000000..5a8052a201d --- /dev/null +++ b/scripts/rnnlm/lmrescore_back.sh @@ -0,0 +1,125 @@ +#!/bin/bash + +# Copyright 2017 Hainan Xu +# Apache 2.0 + +# This script rescores lattices with KALDI RNNLM trained on reversed text. +# The input directory should already be rescored with a forward RNNLM, preferably +# with the pruned algorithm, since smaller lattices make rescoring much faster. +# An example of the forward pruned rescoring is at +# egs/swbd/s5c/local/rnnlm/run_tdnn_lstm.sh +# One example script for backward RNNLM rescoring is at +# egs/swbd/s5c/local/rnnlm/run_tdnn_lstm_back.sh + +# Begin configuration section. +cmd=run.pl +skip_scoring=false +max_ngram_order=4 # Approximate the lattice-rescoring by limiting the max-ngram-order + # if it's set, it merges histories in the lattice if they share + # the same ngram history and this prevents the lattice from + # exploding exponentially. Details of the n-gram approximation + # method are described in section 2.3 of the paper + # http://www.danielpovey.com/files/2018_icassp_lattice_pruning.pdm + +weight=0.5 # Interpolation weight for RNNLM. +normalize=false # If true, we add a normalization step to the output of the RNNLM + # so that it adds up to *exactly* 1. Note that this is not necessary + # as in our RNNLM setup, a properly trained network would automatically + # have its normalization term close to 1. The details of this + # could be found at http://www.danielpovey.com/files/2018_icassp_rnnlm.pdf + +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +. 
./utils/parse_options.sh + +if [ $# != 5 ]; then + echo "Does language model rescoring of lattices (remove old LM, add new LM)" + echo "with Kaldi RNNLM trained on reversed text. See comments in file for details" + echo "" + echo "Usage: $0 [options] \\" + echo " " + echo " e.g.: $0 data/lang_tg exp/rnnlm_lstm/ data/test \\" + echo " exp/tri3/test_rnnlm_forward exp/tri3/test_rnnlm_bidirection" + echo "options: [--cmd (run.pl|queue.pl [queue opts])]" + exit 1; +fi + +[ -f path.sh ] && . ./path.sh; + +oldlang=$1 +rnnlm_dir=$2 +data=$3 +indir=$4 +outdir=$5 + +oldlm=$oldlang/G.fst +if [ ! -f $oldlm ]; then + echo "$0: file $oldlm not found; using $oldlang/G.carpa" + oldlm=$oldlang/G.carpa +fi + +[ ! -f $oldlm ] && echo "$0: Missing file $oldlm" && exit 1; +[ ! -f $rnnlm_dir/final.raw ] && echo "$0: Missing file $rnnlm_dir/final.raw" && exit 1; +[ ! -f $rnnlm_dir/feat_embedding.final.mat ] && [ ! -f $rnnlm_dir/word_embedding.final.mat ] && echo "$0: Missing word embedding file" && exit 1; + +[ ! -f $oldlang/words.txt ] &&\ + echo "$0: Missing file $oldlang/words.txt" && exit 1; +! ls $indir/lat.*.gz >/dev/null &&\ + echo "$0: No lattices input directory $indir" && exit 1; +awk -v n=$0 -v w=$weight 'BEGIN {if (w < 0 || w > 1) { + print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \ + || exit 1; + +normalize_opt= +if $normalize; then + normalize_opt="--normalize-probs=true" +fi +oldlm_command="fstproject --project_output=true $oldlm |" +special_symbol_opts=$(cat $rnnlm_dir/special_symbol_opts.txt) + +word_embedding= +if [ -f $rnnlm_dir/word_embedding.final.mat ]; then + word_embedding=$rnnlm_dir/word_embedding.final.mat +else + word_embedding="'rnnlm-get-word-embedding $rnnlm_dir/word_feats.txt $rnnlm_dir/feat_embedding.final.mat -|'" +fi + +mkdir -p $outdir/log +nj=`cat $indir/num_jobs` || exit 1; +cp $indir/num_jobs $outdir + +# In order to rescore with a backward RNNLM, we first remove the original LM +# scores with lattice-lmrescore, before reversing the lattices +oldlm_weight=$(perl -e "print -1.0 * $weight;") +if [ "$oldlm" == "$oldlang/G.fst" ]; then + $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ + lattice-lmrescore --lm-scale=$oldlm_weight \ + "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlm_command" ark:- \| \ + lattice-reverse ark:- ark:- \| \ + lattice-lmrescore-kaldi-rnnlm --lm-scale=$weight $special_symbol_opts \ + --max-ngram-order=$max_ngram_order $normalize_opt \ + $word_embedding "$rnnlm_dir/final.raw" ark:- ark:- \| \ + lattice-reverse ark:- "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1; +else + $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ + lattice-lmrescore-const-arpa --lm-scale=$oldlm_weight \ + "ark:gunzip -c $indir/lat.JOB.gz|" "$oldlm" ark:- \| \ + lattice-reverse ark:- ark:- \| \ + lattice-lmrescore-kaldi-rnnlm --lm-scale=$weight $special_symbol_opts \ + --max-ngram-order=$max_ngram_order $normalize_opt \ + $word_embedding "$rnnlm_dir/final.raw" ark:- ark:- \| \ + lattice-reverse ark:- "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1; +fi + +if ! $skip_scoring ; then + err_msg="$0: Not scoring because local/score.sh does not exist or not executable." + [ ! -x local/score.sh ] && echo $err_msg && exit 1; + echo local/score.sh --cmd "$cmd" $data $oldlang $outdir + local/score.sh --cmd "$cmd" $data $oldlang $outdir +else + echo "$0: Not scoring because --skip-scoring was specified." 
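
On the weight bookkeeping in lmrescore_back.sh: the lattice arrives with the old LM applied at scale 1.0, the first lattice-lmrescore pass adds the old LM again at scale -$weight, and the RNNLM pass adds its score at scale +$weight, so the net LM contribution on any path is the usual linear interpolation. A toy sketch of that arithmetic (signs and scaling conventions of real lattice costs are glossed over):

```python
# Illustrative only: how the interpolation weight in lmrescore_back.sh combines
# the two language models on a single lattice path.

def interpolated_lm_logprob(old_lm_logprob, rnnlm_logprob, weight=0.5):
    assert 0.0 <= weight <= 1.0, "Interpolation weight should be in the range of [0, 1]"
    oldlm_weight = -1.0 * weight              # matches: perl -e "print -1.0 * $weight;"
    score = old_lm_logprob                    # already baked into the lattice
    score += oldlm_weight * old_lm_logprob    # lattice-lmrescore with --lm-scale=$oldlm_weight
    score += weight * rnnlm_logprob           # lattice-lmrescore-kaldi-rnnlm with --lm-scale=$weight
    return score                              # equals (1 - weight) * old + weight * rnnlm

print(interpolated_lm_logprob(-12.0, -9.5, weight=0.5))   # -10.75
```
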
+fi + +exit 0; diff --git a/scripts/rnnlm/lmrescore_pruned.sh b/scripts/rnnlm/lmrescore_pruned.sh index 506527f4f6b..9ba78415708 100755 --- a/scripts/rnnlm/lmrescore_pruned.sh +++ b/scripts/rnnlm/lmrescore_pruned.sh @@ -16,7 +16,7 @@ max_ngram_order=4 # Approximate the lattice-rescoring by limiting the max-ngram- # the same ngram history and this prevents the lattice from # exploding exponentially. Details of the n-gram approximation # method are described in section 2.3 of the paper - # http://www.danielpovey.com/files/2018_icassp_lattice_pruning.pdm + # http://www.danielpovey.com/files/2018_icassp_lattice_pruning.pdf max_arcs= # limit the max arcs in lattice while rescoring. E.g., 20000 acwt=0.1 @@ -26,6 +26,8 @@ normalize=false # If true, we add a normalization step to the output of the RNNL # as in our RNNLM setup, a properly trained network would automatically # have its normalization term close to 1. The details of this # could be found at http://www.danielpovey.com/files/2018_icassp_rnnlm.pdf +lattice_prune_beam=4 # Beam used in pruned lattice composition + # This option affects speed and how large the composed lattice may be # End configuration section. @@ -73,6 +75,12 @@ awk -v n=$0 -v w=$weight 'BEGIN {if (w < 0 || w > 1) { print n": Interpolation weight should be in the range of [0, 1]"; exit 1;}}' \ || exit 1; +if ! head -n -1 $rnnlm_dir/config/words.txt | cmp $oldlang/words.txt -; then + # the last word of the RNNLM word list is an added word + echo "$0: Word lists mismatch for lattices and RNNLM." + exit 1 +fi + normalize_opt= if $normalize; then normalize_opt="--normalize-probs=true" @@ -97,6 +105,7 @@ cp $indir/num_jobs $outdir $cmd JOB=1:$nj $outdir/log/rescorelm.JOB.log \ lattice-lmrescore-kaldi-rnnlm-pruned --lm-scale=$weight $special_symbol_opts \ + --lattice-compose-beam=$lattice_prune_beam \ --acoustic-scale=$acwt --max-ngram-order=$max_ngram_order $normalize_opt $max_arcs_opt \ $carpa_option $oldlm $word_embedding "$rnnlm_dir/final.raw" \ "ark:gunzip -c $indir/lat.JOB.gz|" "ark,t:|gzip -c>$outdir/lat.JOB.gz" || exit 1; diff --git a/scripts/rnnlm/prepare_split_data.py b/scripts/rnnlm/prepare_split_data.py index 9cc4f69d09f..e39f4504f37 100755 --- a/scripts/rnnlm/prepare_split_data.py +++ b/scripts/rnnlm/prepare_split_data.py @@ -8,6 +8,9 @@ import argparse import sys +import re +tab_or_space = re.compile('[ \t]') + parser = argparse.ArgumentParser(description="This script prepares files containing integerized text, " "for consumption by nnet3-get-egs.", epilog="E.g. 
" + sys.argv[0] + " --vocab-file=data/rnnlm/vocab/words.txt " @@ -63,10 +66,10 @@ def get_all_data_sources_except_dev(text_dir): # value is a tuple (repeated_times_per_epoch, weight) def read_data_weights(weights_file, data_sources): data_weights = {} - with open(weights_file, 'r', encoding="utf-8") as f: + with open(weights_file, 'r', encoding="latin-1") as f: for line in f: try: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) == 3 if fields[0] in data_weights: raise Exception("duplicated data source({0}) specified in " @@ -94,7 +97,7 @@ def distribute_to_outputs(source_filename, weight, output_filehandles): num_outputs = len(output_filehandles) n = 0 try: - f = open(source_filename, 'r', encoding="utf-8") + f = open(source_filename, 'r', encoding="latin-1") except Exception as e: sys.exit(sys.argv[0] + ": failed to open file {0} for reading: {1} ".format( source_filename, str(e))) @@ -121,7 +124,7 @@ def distribute_to_outputs(source_filename, weight, output_filehandles): os.makedirs(args.split_dir + "/info") # set up the 'num_splits' file, which contains an integer. -with open("{0}/info/num_splits".format(args.split_dir), 'w', encoding="utf-8") as f: +with open("{0}/info/num_splits".format(args.split_dir), 'w', encoding="latin-1") as f: print(args.num_splits, file=f) # e.g. set temp_files = [ 'foo/1.tmp', 'foo/2.tmp', ..., 'foo/5.tmp' ] @@ -133,7 +136,7 @@ def distribute_to_outputs(source_filename, weight, output_filehandles): temp_filehandles = [] for fname in temp_files: try: - temp_filehandles.append(open(fname, 'w', encoding="utf-8")) + temp_filehandles.append(open(fname, 'w', encoding="latin-1")) except Exception as e: sys.exit(sys.argv[0] + ": failed to open file: " + str(e) + ".. if this is a max-open-filehandles limitation, you may " diff --git a/scripts/rnnlm/show_word_features.py b/scripts/rnnlm/show_word_features.py index 89d84d53f3e..5fe049cb8ce 100755 --- a/scripts/rnnlm/show_word_features.py +++ b/scripts/rnnlm/show_word_features.py @@ -6,7 +6,10 @@ import os import argparse import sys -sys.stdout = open(1, 'w', encoding='utf-8', closefd=False) +sys.stdout = open(1, 'w', encoding='latin-1', closefd=False) + +import re +tab_or_space = re.compile('[ \t]') parser = argparse.ArgumentParser(description="This script turns the word features to a human readable format.", epilog="E.g. " + sys.argv[0] + "exp/rnnlm/word_feats.txt exp/rnnlm/features.txt " @@ -27,9 +30,9 @@ def read_feature_type_and_key(features_file): feat_types = {} - with open(features_file, 'r', encoding="utf-8") as f: + with open(features_file, 'r', encoding="latin-1") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert(len(fields) in [2, 3, 4]) feat_id = int(fields[0]) @@ -44,9 +47,9 @@ def read_feature_type_and_key(features_file): feat_type_and_key = read_feature_type_and_key(args.features_file) num_word_feats = 0 -with open(args.word_features_file, 'r', encoding="utf-8") as f: +with open(args.word_features_file, 'r', encoding="latin-1") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) % 2 == 1 print(int(fields[0]), end='\t') diff --git a/scripts/rnnlm/train_rnnlm.sh b/scripts/rnnlm/train_rnnlm.sh index f76d02cfd41..aedfc470ac9 100755 --- a/scripts/rnnlm/train_rnnlm.sh +++ b/scripts/rnnlm/train_rnnlm.sh @@ -188,12 +188,16 @@ while [ $x -lt $num_iters ]; do else dest_number=$[x+1]; fi # in the normal case $repeated data will be just one copy. 
repeated_data=$(for n in $(seq $num_repeats); do echo -n $dir/text/$split.txt ''; done) - + rnnlm_l2_factor=$(perl -e "print (1.0/$this_num_jobs);") embedding_l2_regularize=$(perl -e "print ($embedding_l2/$this_num_jobs);") + # allocate queue-slots for threads doing sampling, + num_threads_=$[$num_egs_threads*2/3] + [ -f $dir/sampling.lm ] && queue_thread_opt="--num-threads $num_threads_" || queue_thread_opt= + # Run the training job or jobs. - $cmd $queue_gpu_opt $dir/log/train.$x.$n.log \ + $cmd $queue_gpu_opt $queue_thread_opt $dir/log/train.$x.$n.log \ rnnlm-train \ --rnnlm.max-param-change=$rnnlm_max_change \ --rnnlm.l2_regularize_factor=$rnnlm_l2_factor \ @@ -220,12 +224,12 @@ while [ $x -lt $num_iters ]; do fi ) - num_splits_processed=$[num_splits_processed+this_num_jobs] # the error message below is not that informative, but $cmd will # have printed a more specific one. [ -f $dir/.error ] && echo "$0: error with diagnostics on iteration $x of training" && exit 1; fi x=$[x+1] + num_splits_processed=$[num_splits_processed+this_num_jobs] done wait # wait for diagnostic jobs in the background. @@ -247,11 +251,11 @@ fi # Now get some diagnostics about the evolution of the objective function. if [ $stage -le $[num_iters+1] ]; then ( - logs=$(for iter in $(seq 0 $[$num_iters-1]); do echo -n $dir/log/train.$iter.1.log ''; done) + logs=$(for iter in $(seq 1 $[$num_iters-1]); do echo -n $dir/log/train.$iter.1.log ''; done) # in the non-sampling case the exact objf is printed and we plot that # in the sampling case we print the approximated objf for training. grep 'Overall objf' $logs | awk 'BEGIN{printf("Train objf: ")} /exact/{printf("%.2f ", $NF);next} {printf("%.2f ", $10)} END{print "";}' - logs=$(for iter in $(seq 0 $[$num_iters-1]); do echo -n $dir/log/compute_prob.$iter.log ''; done) + logs=$(for iter in $(seq 1 $[$num_iters-1]); do echo -n $dir/log/compute_prob.$iter.log ''; done) grep 'Overall objf' $logs | awk 'BEGIN{printf("Dev objf: ")} {printf("%.2f ", $NF)} END{print "";}' ) > $dir/report.txt cat $dir/report.txt diff --git a/scripts/rnnlm/validate_features.py b/scripts/rnnlm/validate_features.py index a650092b086..010ceb72615 100755 --- a/scripts/rnnlm/validate_features.py +++ b/scripts/rnnlm/validate_features.py @@ -7,6 +7,9 @@ import argparse import sys +import re +tab_or_space = re.compile('[ \t]') + parser = argparse.ArgumentParser(description="Validates features file, produced by rnnlm/choose_features.py.", epilog="E.g. " + sys.argv[0] + " exp/rnnlm/features.txt", formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -21,7 +24,7 @@ if not os.path.isfile(args.features_file): sys.exit(sys.argv[0] + ": Expected file {0} to exist".format(args.features_file)) -with open(args.features_file, 'r', encoding="utf-8") as f: +with open(args.features_file, 'r', encoding="latin-1") as f: has_unigram = False has_length = False idx = 0 @@ -30,7 +33,7 @@ final_feats = {} word_feats = {} for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert(len(fields) in [3, 4, 5]) assert idx == int(fields[0]) diff --git a/scripts/rnnlm/validate_text_dir.py b/scripts/rnnlm/validate_text_dir.py index d644d77911e..4b311a8abbd 100755 --- a/scripts/rnnlm/validate_text_dir.py +++ b/scripts/rnnlm/validate_text_dir.py @@ -7,6 +7,9 @@ import argparse import sys +import re +tab_or_space = re.compile('[ \t]') + parser = argparse.ArgumentParser(description="Validates data directory containing text " "files from one or more data sources, including dev.txt.", epilog="E.g. 
" + sys.argv[0] + " data/rnnlm/data", @@ -37,7 +40,7 @@ def check_text_file(text_file): - with open(text_file, 'r', encoding="utf-8") as f: + with open(text_file, 'r', encoding="latin-1") as f: found_nonempty_line = False lineno = 0 if args.allow_internal_eos == 'true': @@ -51,7 +54,7 @@ def check_text_file(text_file): lineno += 1 if args.spot_check == 'true' and lineno > 10: break - words = line.split() + words = re.split(tab_or_space, line) if len(words) != 0: found_nonempty_line = True for word in words: @@ -73,9 +76,9 @@ def check_text_file(text_file): # with some kind of utterance-id first_field_set = set() other_fields_set = set() - with open(text_file, 'r', encoding="utf-8") as f: + with open(text_file, 'r', encoding="latin-1") as f: for line in f: - array = line.split() + array = re.split(tab_or_space, line) if len(array) > 0: first_word = array[0] if first_word in first_field_set or first_word in other_fields_set: diff --git a/scripts/rnnlm/validate_word_features.py b/scripts/rnnlm/validate_word_features.py index 3dc9b23aa41..f8eb5858d95 100755 --- a/scripts/rnnlm/validate_word_features.py +++ b/scripts/rnnlm/validate_word_features.py @@ -7,6 +7,9 @@ import argparse import sys +import re +tab_or_space = re.compile('[ \t]') + parser = argparse.ArgumentParser(description="Validates word features file, produced by rnnlm/get_word_features.py.", epilog="E.g. " + sys.argv[0] + " --features-file=exp/rnnlm/features.txt " "exp/rnnlm/word_feats.txt", @@ -25,9 +28,9 @@ unigram_feat_id = -1 length_feat_id = -1 max_feat_id = -1 -with open(args.features_file, 'r', encoding="utf-8") as f: +with open(args.features_file, 'r', encoding="latin-1") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert(len(fields) in [3, 4, 5]) feat_id = int(fields[0]) @@ -49,9 +52,9 @@ if feat_id > max_feat_id: max_feat_id = feat_id -with open(args.word_features_file, 'r', encoding="utf-8") as f: +with open(args.word_features_file, 'r', encoding="latin-1") as f: for line in f: - fields = line.split() + fields = re.split(tab_or_space, line) assert len(fields) > 0 and len(fields) % 2 == 1 word_id = int(fields[0]) diff --git a/src/base/kaldi-math.cc b/src/base/kaldi-math.cc index cf06875030b..991e46a590c 100644 --- a/src/base/kaldi-math.cc +++ b/src/base/kaldi-math.cc @@ -42,8 +42,8 @@ int32 RoundUpToNearestPowerOfTwo(int32 n) { static std::mutex _RandMutex; int Rand(struct RandomState* state) { -#ifdef _MSC_VER - // On Windows, just call Rand() +#if defined(_MSC_VER) || defined(__CYGWIN__) + // On Windows and Cygwin, just call Rand() return rand(); #else if (state) { diff --git a/src/base/kaldi-math.h b/src/base/kaldi-math.h index afddc5105d4..21665ddfc63 100644 --- a/src/base/kaldi-math.h +++ b/src/base/kaldi-math.h @@ -183,6 +183,7 @@ inline Float RandPrune(Float post, BaseFloat prune_thresh, inline double LogAdd(double x, double y) { double diff; + if (x < y) { diff = x - y; x = y; @@ -203,6 +204,7 @@ inline double LogAdd(double x, double y) { inline float LogAdd(float x, float y) { float diff; + if (x < y) { diff = x - y; x = y; diff --git a/src/base/kaldi-utils.cc b/src/base/kaldi-utils.cc index 1ae1dc0b758..432da426bc3 100644 --- a/src/base/kaldi-utils.cc +++ b/src/base/kaldi-utils.cc @@ -45,6 +45,8 @@ std::string CharToString(const char &c) { void Sleep(float seconds) { #if defined(_MSC_VER) || defined(MINGW) ::Sleep(static_cast(seconds * 1000.0)); +#elif defined(__CYGWIN__) + sleep(static_cast(seconds)); #else usleep(static_cast(seconds * 1000000.0)); #endif diff --git 
a/src/base/kaldi-utils.h b/src/base/kaldi-utils.h index bd2da25dce7..c9d6fd950ab 100644 --- a/src/base/kaldi-utils.h +++ b/src/base/kaldi-utils.h @@ -46,10 +46,14 @@ #endif #endif -#ifdef _MSC_VER +#if defined(_MSC_VER) # define KALDI_MEMALIGN(align, size, pp_orig) \ (*(pp_orig) = _aligned_malloc(size, align)) # define KALDI_MEMALIGN_FREE(x) _aligned_free(x) +#elif defined(__CYGWIN__) +# define KALDI_MEMALIGN(align, size, pp_orig) \ + (*(pp_orig) = aligned_alloc(align, size)) +# define KALDI_MEMALIGN_FREE(x) free(x) #else # define KALDI_MEMALIGN(align, size, pp_orig) \ (!posix_memalign(pp_orig, align, size) ? *(pp_orig) : NULL) @@ -134,8 +138,11 @@ template<> class KaldiCompileTimeAssert { KaldiCompileTimeAssert::is_specialized \ && !std::numeric_limits::is_integer>::Check() -#ifdef _MSC_VER +#if defined(_MSC_VER) #define KALDI_STRCASECMP _stricmp +#elif defined(__CYGWIN__) +#include +#define KALDI_STRCASECMP strcasecmp #else #define KALDI_STRCASECMP strcasecmp #endif diff --git a/src/bin/add-self-loops.cc b/src/bin/add-self-loops.cc index c349caba547..b223dfe317d 100644 --- a/src/bin/add-self-loops.cc +++ b/src/bin/add-self-loops.cc @@ -97,13 +97,13 @@ int main(int argc, char *argv[]) { if (!fst) KALDI_ERR << "add-self-loops: error reading input FST."; + bool check_no_self_loops = true; // The work gets done here. AddSelfLoops(trans_model, disambig_syms_in, self_loop_scale, - reorder, - fst); + reorder, check_no_self_loops, fst); if (! fst->Write(fst_out_filename) ) KALDI_ERR << "add-self-loops: error writing FST to " @@ -117,4 +117,3 @@ int main(int argc, char *argv[]) { return -1; } } - diff --git a/src/bin/copy-matrix.cc b/src/bin/copy-matrix.cc index 46656345065..30f50f094bb 100644 --- a/src/bin/copy-matrix.cc +++ b/src/bin/copy-matrix.cc @@ -44,7 +44,7 @@ int main(int argc, char *argv[]) { " or: copy-matrix [options] \n" " e.g.: copy-matrix --binary=false 1.mat -\n" " copy-matrix ark:2.trans ark,t:-\n" - "See also: copy-feats\n"; + "See also: copy-feats, matrix-sum\n"; bool binary = true; bool apply_log = false; @@ -140,5 +140,3 @@ int main(int argc, char *argv[]) { return -1; } } - - diff --git a/src/bin/draw-tree.cc b/src/bin/draw-tree.cc index a534fdf78de..ad1dd41a53f 100644 --- a/src/bin/draw-tree.cc +++ b/src/bin/draw-tree.cc @@ -39,7 +39,7 @@ void MakeEvent(std::string &qry, fst::SymbolTable *phone_syms, } else { value = static_cast(phone_syms->Find(valstr.c_str())); - if (value == fst::SymbolTable::kNoSymbol) { + if (value == -1) { // fst::kNoSymbol KALDI_ERR << "Bad query: invalid symbol (" << valstr << ')' << std::endl << std::endl; } @@ -49,7 +49,7 @@ void MakeEvent(std::string &qry, fst::SymbolTable *phone_syms, } std::string valstr = qry.substr(old_found); EventValueType value = static_cast(phone_syms->Find(valstr.c_str())); - if (value == fst::SymbolTable::kNoSymbol) { + if (value == -1) { // fst::kNoSymbol KALDI_ERR << "Bad query: invalid symbol (" << valstr << ')' << std::endl << std::endl; } @@ -69,7 +69,7 @@ int main(int argc, char **argv) { "Outputs a decision tree description in GraphViz format\n" "Usage: draw-tree [options] \n" "e.g.: draw-tree phones.txt tree | dot -Gsize=8,10.5 -Tps | ps2pdf - tree.pdf\n"; - + ParseOptions po(usage); po.Register("query", &qry, "a query to trace through the tree" diff --git a/src/bin/get-post-on-ali.cc b/src/bin/get-post-on-ali.cc index 40ae806936c..6d6dfd0d3df 100644 --- a/src/bin/get-post-on-ali.cc +++ b/src/bin/get-post-on-ali.cc @@ -46,7 +46,7 @@ int main(int argc, char *argv[]) { "See 
http://kaldi-asr.org/doc/hmm.html#transition_model_identifiers for an\n" "explanation of these types of indexes.\n" "\n" - "See also: weight-post, post-to-weights, reverse-weights\n" + "See also: post-to-tacc, weight-post, post-to-weights, reverse-weights\n" "\n" "Usage: get-post-on-ali [options] \n" "e.g.: get-post-on-ali ark:post.ark ark,s,cs:ali.ark ark:weights.ark\n"; @@ -68,7 +68,7 @@ int main(int argc, char *argv[]) { SequentialPosteriorReader posterior_reader(posteriors_rspecifier); RandomAccessInt32VectorReader alignments_reader(alignments_rspecifier); BaseFloatVectorWriter confidences_writer(confidences_wspecifier); - + for (; !posterior_reader.Done(); posterior_reader.Next()) { std::string key = posterior_reader.Key(); if (!alignments_reader.HasKey(key)) { @@ -108,7 +108,3 @@ int main(int argc, char *argv[]) { return -1; } } - - - - diff --git a/src/bin/matrix-sum.cc b/src/bin/matrix-sum.cc index 7110fdf8b82..8a7b5a39e00 100644 --- a/src/bin/matrix-sum.cc +++ b/src/bin/matrix-sum.cc @@ -103,6 +103,79 @@ int32 TypeOneUsage(const ParseOptions &po, return (n_success != 0 && n_missing < (n_success - n_missing)) ? 0 : 1; } +int32 TypeOneUsageAverage(const ParseOptions &po) { + int32 num_args = po.NumArgs(); + std::string matrix_in_fn1 = po.GetArg(1), + matrix_out_fn = po.GetArg(num_args); + BaseFloat scale = 1.0 / (num_args - 1); + + // Output matrix + BaseFloatMatrixWriter matrix_writer(matrix_out_fn); + + // Input matrices + SequentialBaseFloatMatrixReader matrix_reader1(matrix_in_fn1); + std::vector + matrix_readers(num_args-2, + static_cast(NULL)); + std::vector matrix_in_fns(num_args-2); + for (int32 i = 2; i < num_args; ++i) { + matrix_readers[i-2] = new RandomAccessBaseFloatMatrixReader(po.GetArg(i)); + matrix_in_fns[i-2] = po.GetArg(i); + } + + int32 n_utts = 0, n_total_matrices = 0, + n_success = 0, n_missing = 0, n_other_errors = 0; + + for (; !matrix_reader1.Done(); matrix_reader1.Next()) { + std::string key = matrix_reader1.Key(); + Matrix matrix1 = matrix_reader1.Value(); + matrix_reader1.FreeCurrent(); + n_utts++; + n_total_matrices++; + + matrix1.Scale(scale); + + Matrix matrix_out(matrix1); + + for (int32 i = 0; i < num_args-2; ++i) { + if (matrix_readers[i]->HasKey(key)) { + Matrix matrix2 = matrix_readers[i]->Value(key); + n_total_matrices++; + if (SameDim(matrix2, matrix_out)) { + matrix_out.AddMat(scale, matrix2, kNoTrans); + } else { + KALDI_WARN << "Dimension mismatch for utterance " << key + << " : " << matrix2.NumRows() << " by " + << matrix2.NumCols() << " for " + << "system " << (i + 2) << ", rspecifier: " + << matrix_in_fns[i] << " vs " << matrix_out.NumRows() + << " by " << matrix_out.NumCols() + << " primary matrix, rspecifier:" << matrix_in_fn1; + n_other_errors++; + } + } else { + KALDI_WARN << "No matrix found for utterance " << key << " for " + << "system " << (i + 2) << ", rspecifier: " + << matrix_in_fns[i]; + n_missing++; + } + } + + matrix_writer.Write(key, matrix_out); + n_success++; + } + + KALDI_LOG << "Processed " << n_utts << " utterances: with a total of " + << n_total_matrices << " matrices across " << (num_args-1) + << " different systems"; + KALDI_LOG << "Produced output for " << n_success << " utterances; " + << n_missing << " total missing matrices"; + + DeletePointers(&matrix_readers); + + return (n_success != 0 && n_missing < (n_success - n_missing)) ? 0 : 1; +} + int32 TypeTwoUsage(const ParseOptions &po, bool binary) { KALDI_ASSERT(po.NumArgs() == 2); @@ -207,7 +280,7 @@ int main(int argc, char *argv[]) { " matrix-sum [options] ..." 
" \n" " e.g.: matrix-sum --binary=false 1.mat 2.mat 3.mat sum.mat\n" - "See also: matrix-sum-rows\n"; + "See also: matrix-sum-rows, copy-matrix\n"; BaseFloat scale1 = 1.0, scale2 = 1.0; @@ -223,7 +296,7 @@ int main(int argc, char *argv[]) { po.Register("binary", &binary, "If true, write output as binary (only " "relevant for usage types two or three"); po.Register("average", &average, "If true, compute average instead of " - "sum; only currently compatible with type 3 usage."); + "sum; currently compatible with type 3 or type 1 usage."); po.Read(argc, argv); @@ -232,9 +305,11 @@ int main(int argc, char *argv[]) { if (po.NumArgs() >= 2 && ClassifyWspecifier(po.GetArg(N), NULL, NULL, NULL) != kNoWspecifier) { if (average) - KALDI_ERR << "--average option not compatible with type one usage."; - // output to table. - exit_status = TypeOneUsage(po, scale1, scale2); + // average option with type one usage."; + exit_status = TypeOneUsageAverage(po); + else + // output to table. + exit_status = TypeOneUsage(po, scale1, scale2); } else if (po.NumArgs() == 2 && ClassifyRspecifier(po.GetArg(1), NULL, NULL) != kNoRspecifier && ClassifyWspecifier(po.GetArg(N), NULL, NULL, NULL) == @@ -260,3 +335,4 @@ int main(int argc, char *argv[]) { return -1; } } + diff --git a/src/bin/post-to-tacc.cc b/src/bin/post-to-tacc.cc index 6456195e998..afa5315d6b4 100644 --- a/src/bin/post-to-tacc.cc +++ b/src/bin/post-to-tacc.cc @@ -27,7 +27,7 @@ int main(int argc, char *argv[]) { try { using namespace kaldi; - typedef kaldi::int32 int32; + typedef kaldi::int32 int32; const char *usage = "From posteriors, compute transition-accumulators\n" @@ -35,7 +35,8 @@ int main(int argc, char *argv[]) { "Note: the model is only read in order to get the size of the vector\n" "\n" "Usage: post-to-tacc [options] \n" - " e.g.: post-to-tacc --binary=false 1.mdl \"ark:ali-to-post 1.ali|\" 1.tacc\n"; + " e.g.: post-to-tacc --binary=false 1.mdl \"ark:ali-to-post 1.ali|\" 1.tacc\n" + "See also: get-post-on-ali\n"; bool binary = true; bool per_pdf = false; @@ -49,25 +50,25 @@ int main(int argc, char *argv[]) { po.PrintUsage(); exit(1); } - + std::string model_rxfilename = po.GetArg(1), post_rspecifier = po.GetArg(2), accs_wxfilename = po.GetArg(3); kaldi::SequentialPosteriorReader posterior_reader(post_rspecifier); - + int32 num_transition_ids; - + bool binary_in; Input ki(model_rxfilename, &binary_in); TransitionModel trans_model; trans_model.Read(ki.Stream(), binary_in); num_transition_ids = trans_model.NumTransitionIds(); - + Vector transition_accs(num_transition_ids+1); // +1 because they're // 1-based; position zero is empty. We'll write as float. - int32 num_done = 0; - + int32 num_done = 0; + for (; !posterior_reader.Done(); posterior_reader.Next()) { const kaldi::Posterior &posterior = posterior_reader.Value(); int32 num_frames = static_cast(posterior.size()); @@ -109,4 +110,3 @@ int main(int argc, char *argv[]) { return -1; } } - diff --git a/src/chain/chain-den-graph.cc b/src/chain/chain-den-graph.cc index 5386f959b1f..62d2f3aaa56 100644 --- a/src/chain/chain-den-graph.cc +++ b/src/chain/chain-den-graph.cc @@ -347,10 +347,13 @@ void CreateDenominatorFst(const ContextDependency &ctx_dep, BaseFloat self_loop_scale = 1.0; // We have to be careful to use the same // value in test time. + // 'reorder' must always be set to true for chain models. bool reorder = true; + bool check_no_self_loops = true; + // add self-loops to the FST with transition-ids as its labels. 
AddSelfLoops(trans_model, disambig_syms_h, self_loop_scale, reorder, - &transition_id_fst); + check_no_self_loops, &transition_id_fst); // at this point transition_id_fst will have transition-ids as its ilabels and // context-dependent phones (indexes into ILabelInfo()) as its olabels. // Discard the context-dependent phones by projecting on the input, keeping diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc index 3a767721c6d..e41e942e266 100644 --- a/src/chain/chain-denominator.cc +++ b/src/chain/chain-denominator.cc @@ -47,6 +47,11 @@ DenominatorComputation::DenominatorComputation( tot_log_prob_(num_sequences_, kUndefined), log_correction_term_(num_sequences_, kUndefined), ok_(true) { + // We don't let leaky_hmm_coefficient be exactly zero (although that would + // make sense mathematically, corresponding to "turning off" the leaky HMM), + // because that would lead to underflow and eventually NaN's or inf's + // appearing in the computation, since we do this computation not in + // log-space. KALDI_ASSERT(opts_.leaky_hmm_coefficient > 0.0 && opts_.leaky_hmm_coefficient < 1.0); // make sure the alpha sums and beta sums are zeroed. diff --git a/src/chain/chain-denominator.h b/src/chain/chain-denominator.h index f44588e434f..d76e4244ae2 100644 --- a/src/chain/chain-denominator.h +++ b/src/chain/chain-denominator.h @@ -210,9 +210,11 @@ class DenominatorComputation { int32 num_sequences, const CuMatrixBase &nnet_output); - // Does the forward computation, and returns the total negated log-like summed - // over all sequences. You will have to scale this by any supervision - // weighting factor, manually. + // Does the forward computation, and returns the total log-like summed over + // all sequences. You will have to scale this by any supervision weighting + // factor, manually. Note: this log-like will be negated before it + // is added into the objective function, since this is the denominator + // computation. BaseFloat Forward(); // this adds deriv_weight times (the derivative of the log-prob w.r.t. 
the diff --git a/src/chain/chain-generic-numerator.cc b/src/chain/chain-generic-numerator.cc index 392f7b02def..d3a114242c2 100644 --- a/src/chain/chain-generic-numerator.cc +++ b/src/chain/chain-generic-numerator.cc @@ -21,6 +21,10 @@ #include "chain/chain-generic-numerator.h" #include "chain/chain-kernels-ansi.h" +#include +#include +#include + namespace kaldi { namespace chain { @@ -33,59 +37,45 @@ GenericNumeratorComputation::GenericNumeratorComputation( const Supervision &supervision, const CuMatrixBase &nnet_output): supervision_(supervision), - nnet_output_deriv_transposed_( - nnet_output.NumCols(), - std::min(nnet_output.NumRows(), - static_cast(kMaxDerivTimeSteps) * - supervision.num_sequences)), - tot_prob_(supervision.num_sequences, kUndefined), - ok_(true) { + nnet_output_(nnet_output) { KALDI_ASSERT(supervision.num_sequences * supervision.frames_per_sequence == nnet_output.NumRows() && supervision.label_dim == nnet_output.NumCols()); - { - CuMatrix exp_nnet_output_transposed_gpu(nnet_output, kTrans); - exp_nnet_output_transposed_gpu.ApplyExp(); - exp_nnet_output_transposed_.Resize(nnet_output.NumCols(), - nnet_output.NumRows(), kUndefined); - exp_nnet_output_transposed_.CopyFromMat(exp_nnet_output_transposed_gpu); - } using std::vector; - int32 B = supervision_.num_sequences, - num_frames = supervision_.frames_per_sequence; - KALDI_ASSERT(supervision_.e2e_fsts.size() == B); + int num_sequences = supervision_.num_sequences; + KALDI_ASSERT(supervision_.e2e_fsts.size() == num_sequences); // Find the maximum number of HMM states and then // initialize final probs, alpha, and beta. - max_num_hmm_states_ = 0; - for (int32 i = 0; i < B; i++) { + int max_num_hmm_states = 0; + for (int i = 0; i < num_sequences; i++) { KALDI_ASSERT(supervision_.e2e_fsts[i].Properties(fst::kIEpsilons, true) == 0); - if (supervision_.e2e_fsts[i].NumStates() > max_num_hmm_states_) - max_num_hmm_states_ = supervision_.e2e_fsts[i].NumStates(); + if (supervision_.e2e_fsts[i].NumStates() > max_num_hmm_states) + max_num_hmm_states = supervision_.e2e_fsts[i].NumStates(); } - final_probs_.Resize(max_num_hmm_states_, B, kSetZero); - alpha_.Resize(num_frames + 1, - max_num_hmm_states_ * B + B, - kSetZero); - // The extra B is for storing alpha sums - beta_.Resize(2, max_num_hmm_states_ * B, kSetZero); + final_probs_.Resize(num_sequences, max_num_hmm_states); // Initialize incoming transitions for easy access - in_transitions_.resize(B); // indexed by seq, state - out_transitions_.resize(B); // indexed by seq, state - for (int32 seq = 0; seq < B; seq++) { + in_transitions_.resize(num_sequences); // indexed by seq, state + out_transitions_.resize(num_sequences); // indexed by seq, state + for (int seq = 0; seq < num_sequences; seq++) { in_transitions_[seq] = vector >( supervision_.e2e_fsts[seq].NumStates()); out_transitions_[seq] = vector >( supervision_.e2e_fsts[seq].NumStates()); } - offsets_.Resize(B); - for (int32 seq = 0; seq < B; seq++) { + offsets_.Resize(num_sequences); + std::unordered_map pdf_to_index; + int32 pdf_stride = nnet_output_.Stride(); + int32 view_stride = nnet_output_.Stride() * num_sequences; + pdf_to_index.reserve(view_stride); + nnet_output_stride_ = pdf_stride; + for (int seq = 0; seq < num_sequences; seq++) { for (int32 s = 0; s < supervision_.e2e_fsts[seq].NumStates(); s++) { - final_probs_(s, seq) = exp(-supervision_.e2e_fsts[seq].Final(s).Value()); + final_probs_(seq, s)= -supervision_.e2e_fsts[seq].Final(s).Value(); BaseFloat offset = 0.0; if (s == 0) { for (fst::ArcIterator aiter( @@ 
-98,13 +88,25 @@ GenericNumeratorComputation::GenericNumeratorComputation( } for (fst::ArcIterator aiter( - supervision_.e2e_fsts[seq], s); + supervision_.e2e_fsts[seq], s); !aiter.Done(); aiter.Next()) { const fst::StdArc &arc = aiter.Value(); DenominatorGraphTransition transition; - transition.transition_prob = exp(-(arc.weight.Value() - offset)); - transition.pdf_id = arc.ilabel - 1; + transition.transition_prob = -(arc.weight.Value() - offset); + + int32 pdf_id = arc.ilabel - 1; // note: the FST labels were pdf-id plus one. + + // remap to a unique index in the remapped space + pdf_id = pdf_id + seq * pdf_stride; + KALDI_ASSERT(pdf_id < view_stride); + + if (pdf_to_index.find(pdf_id) == pdf_to_index.end()) { + index_to_pdf_.push_back(pdf_id); + pdf_to_index[pdf_id] = index_to_pdf_.size() - 1; + } + + transition.pdf_id = pdf_to_index[pdf_id]; transition.hmm_state = s; in_transitions_[seq][arc.nextstate].push_back(transition); transition.hmm_state = arc.nextstate; @@ -115,229 +117,289 @@ GenericNumeratorComputation::GenericNumeratorComputation( } -void GenericNumeratorComputation::AlphaFirstFrame() { - const int32 num_sequences = supervision_.num_sequences, - num_states = max_num_hmm_states_; - // Set alpha_0(0) for all sequences to 1.0 and leave the rest to be 0.0. - double *first_frame_alpha = alpha_.RowData(0); - SubVector alpha_hmm_state0(first_frame_alpha, num_sequences); - alpha_hmm_state0.Set(1.0); - - // Now compute alpha-sums for t=0 which is obviously 1.0 for each sequence - SubVector alpha_sum_vec(first_frame_alpha + - num_states * num_sequences, - num_sequences); - alpha_sum_vec.Set(1.0); +void GenericNumeratorComputation::AlphaFirstFrame(int seq, + Matrix *alpha) { + const int32 num_frames = supervision_.frames_per_sequence, + num_states = supervision_.e2e_fsts[seq].NumStates(); + alpha->Resize(num_frames + 1, num_states + 1, kSetZero); + alpha->Set(-std::numeric_limits::infinity()); + (*alpha)(0, 0) = 0.0; + (*alpha)(0, num_states) = 0.0; } +void GenericNumeratorComputation::CopySpecificPdfsIndirect( + const CuMatrixBase &nnet_output, + const std::vector &indices, + Matrix *out) { + KALDI_ASSERT(nnet_output_stride_ == nnet_output_.Stride()); + const int32 num_sequences = supervision_.num_sequences, + frames_per_sequence = supervision_.frames_per_sequence; + + const BaseFloat *starting_ptr = nnet_output.RowData(0); + const int view_stride = num_sequences * nnet_output.Stride(); + + const CuSubMatrix sequence_view(starting_ptr, + frames_per_sequence, + view_stride, + view_stride); + + CuArray indices_gpu(indices); + CuMatrix required_pdfs(frames_per_sequence, + indices.size()); + + required_pdfs.CopyCols(sequence_view, indices_gpu); + out->Swap(&required_pdfs); +} + // The alpha computation for some 0 < t <= num_time_steps_. -void GenericNumeratorComputation::AlphaGeneralFrame(int32 t) { +BaseFloat GenericNumeratorComputation::AlphaRemainingFrames(int seq, + const Matrix &probs, + Matrix *alpha) { // Define some variables to make things nicer - const int32 - num_sequences = supervision_.num_sequences, - num_frames = supervision_.frames_per_sequence, - num_pdfs = exp_nnet_output_transposed_.NumRows(), - num_states = max_num_hmm_states_; - KALDI_ASSERT(t > 0 && t <= num_frames); - - SubMatrix this_alpha(alpha_.RowData(t), num_states, - num_sequences, num_sequences); - const SubMatrix prev_alpha(alpha_.RowData(t - 1), num_states + 1, - num_sequences, num_sequences); - // 'probs' is the matrix of pseudo-likelihoods for frame t - 1. 
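
CopySpecificPdfsIndirect() above leans on the "sequence view" described in the long comment added to chain-generic-numerator.h further down: because rows of the nnet output are ordered frame-major over sequences (row = t * num_sequences + seq), the matrix can be re-viewed as frames_per_sequence rows of num_sequences * stride columns, and only the columns a numerator FST actually touches need to be copied off the GPU. With a contiguous NumPy array (so stride == num_pdfs) the same trick looks like this; the toy dimensions are made up:

```python
import numpy as np

num_sequences, frames_per_sequence, num_pdfs = 3, 4, 5

# rows ordered as t * num_sequences + seq, like the chain nnet output
nnet_output = np.arange(num_sequences * frames_per_sequence * num_pdfs,
                        dtype=np.float32).reshape(frames_per_sequence * num_sequences, num_pdfs)

# "sequence view": one row per frame, all sequences side by side
sequence_view = nnet_output.reshape(frames_per_sequence, num_sequences * num_pdfs)

# suppose the numerator FSTs only ever touch these (seq, pdf) pairs
needed = [(0, 2), (1, 0), (2, 4)]
index_to_pdf = [seq * num_pdfs + pdf for seq, pdf in needed]   # remapped column indices

probs = sequence_view[:, index_to_pdf]   # the only columns the CPU forward-backward needs

# sanity check against direct indexing of the original matrix
t, (seq, pdf) = 2, needed[1]
assert probs[t, 1] == nnet_output[t * num_sequences + seq, pdf]
```
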
- SubMatrix probs(exp_nnet_output_transposed_, 0, num_pdfs, - (t - 1) * num_sequences, num_sequences); - - for (int32 seq = 0; seq < num_sequences; seq++) { - double inv_arbitrary_scale = prev_alpha(num_states, seq); + const int32 num_sequences = supervision_.num_sequences, + num_frames = supervision_.frames_per_sequence; + + KALDI_ASSERT(seq >= 0 && seq < num_sequences); + + // variables for log_likelihood computation + double log_scale_product = 0, + log_prob_product = 0; + + for (int t = 1; t <= num_frames; ++t) { + const BaseFloat *probs_tm1 = probs.RowData(t - 1); + BaseFloat *alpha_t = alpha->RowData(t); + const BaseFloat *alpha_tm1 = alpha->RowData(t - 1); + for (int32 h = 0; h < supervision_.e2e_fsts[seq].NumStates(); h++) { for (auto tr = in_transitions_[seq][h].begin(); - tr != in_transitions_[seq][h].end(); tr++) { - double transition_prob = tr->transition_prob; - int32 pdf_id = tr->pdf_id, prev_hmm_state = tr->hmm_state; - double prob = probs(pdf_id, seq); - this_alpha(h, seq) += prev_alpha(prev_hmm_state, seq) / - inv_arbitrary_scale * transition_prob * prob; + tr != in_transitions_[seq][h].end(); ++tr) { + BaseFloat transition_prob = tr->transition_prob; + int32 pdf_id = tr->pdf_id, + prev_hmm_state = tr->hmm_state; + BaseFloat prob = probs_tm1[pdf_id]; + alpha_t[h] = LogAdd(alpha_t[h], + alpha_tm1[prev_hmm_state] + transition_prob + prob); } } + double sum = alpha_tm1[alpha->NumCols() - 1]; + SubMatrix alpha_t_mat(*alpha, t, 1, 0, + alpha->NumCols() - 1); + alpha_t_mat.Add(-sum); + sum = alpha_t_mat.LogSumExp(); + + alpha_t[alpha->NumCols() - 1] = sum; + log_scale_product += sum; } - - if (t == num_frames) // last alpha - this_alpha.MulElements(final_probs_); - // Now compute alpha-sums for frame t: - SubVector alpha_sum_vec(alpha_.RowData(t) + num_states * num_sequences, - num_sequences); - alpha_sum_vec.AddRowSumMat(1.0, this_alpha, 0.0); + SubMatrix last_alpha(*alpha, alpha->NumRows() - 1, 1, + 0, alpha->NumCols() - 1); + SubVector final_probs(final_probs_.RowData(seq), + alpha->NumCols() - 1); + + // adjust last_alpha + double sum = (*alpha)(alpha->NumRows() - 1, alpha->NumCols() - 1); + log_scale_product -= sum; + last_alpha.AddVecToRows(1.0, final_probs); + sum = last_alpha.LogSumExp(); + (*alpha)(alpha->NumRows() - 1, alpha->NumCols() - 1) = sum; + + // second part of criterion + log_prob_product = sum - offsets_(seq); + + return log_prob_product + log_scale_product; } -BaseFloat GenericNumeratorComputation::Forward() { - AlphaFirstFrame(); - for (int32 t = 1; t <= supervision_.frames_per_sequence; t++) { - AlphaGeneralFrame(t); +bool GenericNumeratorComputation::ForwardBackward( + BaseFloat *total_loglike, + CuMatrixBase *nnet_output_deriv) { + KALDI_ASSERT(total_loglike != NULL); + KALDI_ASSERT(nnet_output_deriv != NULL); + KALDI_ASSERT(nnet_output_deriv->NumCols() == nnet_output_.NumCols()); + KALDI_ASSERT(nnet_output_deriv->NumRows() == nnet_output_.NumRows()); + + BaseFloat partial_loglike = 0; + const int32 num_sequences = supervision_.num_sequences; + + bool ok = true; + Matrix alpha; + Matrix beta; + Matrix probs; + Matrix derivs; + + // We selectively copy only those pdfs we need + CopySpecificPdfsIndirect(nnet_output_, index_to_pdf_, &probs); + + derivs.Resize(probs.NumRows(), probs.NumCols()); + derivs.Set(-std::numeric_limits::infinity()); + + for (int seq = 0; seq < num_sequences; ++seq) { + // Forward part + AlphaFirstFrame(seq, &alpha); + partial_loglike += AlphaRemainingFrames(seq, probs, &alpha); + + // Backward part + BetaLastFrame(seq, alpha, 
&beta); + BetaRemainingFrames(seq, probs, alpha, &beta, &derivs); + if (GetVerboseLevel() >= 1) + ok = ok && CheckValues(seq, probs, alpha, beta, derivs); } - return ComputeTotLogLike(); + // Transfer and add the derivatives to the values in the matrix + AddSpecificPdfsIndirect(&derivs, index_to_pdf_, nnet_output_deriv); + *total_loglike = partial_loglike; + return ok; } -BaseFloat GenericNumeratorComputation::ComputeTotLogLike() { - const int32 - num_sequences = supervision_.num_sequences, - num_frames = supervision_.frames_per_sequence, - num_states = max_num_hmm_states_; - - // View the last alpha as a matrix of size num-hmm-states by num-sequences. - SubMatrix last_alpha(alpha_.RowData(num_frames), num_states, - num_sequences, num_sequences); - tot_prob_.AddRowSumMat(1.0, last_alpha, 0.0); - Vector tot_log_probs(tot_prob_); - tot_log_probs.ApplyLog(); - tot_log_probs.AddVec(-1.0, offsets_); - double tot_log_prob = tot_log_probs.Sum(); - SubMatrix inv_arbitrary_scales(alpha_, 0, num_frames, - num_sequences * num_states, - num_sequences); - Matrix log_inv_arbitrary_scales(inv_arbitrary_scales); - log_inv_arbitrary_scales.ApplyLog(); - double log_inv_arbitrary_scales_product = - log_inv_arbitrary_scales.Sum(); - return tot_log_prob + log_inv_arbitrary_scales_product; -} +BaseFloat GenericNumeratorComputation::ComputeObjf() { + BaseFloat partial_loglike = 0; + const int32 num_sequences = supervision_.num_sequences; + Matrix alpha; + Matrix probs; -bool GenericNumeratorComputation::Backward( - CuMatrixBase *nnet_output_deriv) { - const int32 - num_sequences = supervision_.num_sequences, - num_frames = supervision_.frames_per_sequence, - num_pdfs = exp_nnet_output_transposed_.NumRows(); - BetaLastFrame(); - for (int32 t = num_frames - 1; t >= 0; t--) { - BetaGeneralFrame(t); - if (GetVerboseLevel() >= 1 || t == 0 || t == num_frames - 1) - BetaGeneralFrameDebug(t); - if (t % kMaxDerivTimeSteps == 0) { - // Commit the derivative stored in exp_nnet_output_transposed_ by adding - // its transpose to the appropriate sub-matrix of 'nnet_output_deriv'. - int32 chunk_frames = std::min(static_cast(kMaxDerivTimeSteps), - num_frames - t); - SubMatrix transposed_deriv_part( - nnet_output_deriv_transposed_, - 0, num_pdfs, - 0, chunk_frames * num_sequences); - CuMatrix tmp(transposed_deriv_part); - CuSubMatrix output_deriv_part( - *nnet_output_deriv, - t * num_sequences, chunk_frames * num_sequences, - 0, num_pdfs); - output_deriv_part.AddMat(supervision_.weight, tmp, kTrans); - if (t != 0) - transposed_deriv_part.SetZero(); - } + // We selectively copy only those pdfs we need + CopySpecificPdfsIndirect(nnet_output_, index_to_pdf_, &probs); + + for (int seq = 0; seq < num_sequences; ++seq) { + // Forward part + AlphaFirstFrame(seq, &alpha); + partial_loglike += AlphaRemainingFrames(seq, probs, &alpha); } - return ok_; + return partial_loglike; +} + +BaseFloat GenericNumeratorComputation::GetTotalProb( + const Matrix &alpha) { + return alpha(alpha.NumRows() - 1, alpha.NumCols() - 1); } -void GenericNumeratorComputation::BetaLastFrame() { +void GenericNumeratorComputation::BetaLastFrame(int seq, + const Matrix &alpha, + Matrix *beta) { // Sets up the beta quantity on the last frame (frame == // frames_per_sequence_). Note that the betas we use here contain a // 1/(tot-prob) factor in order to simplify the backprop. 
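
AlphaRemainingFrames() above is the standard log-space forward recursion with a per-frame rescaling term whose logs are accumulated into the objective. Stripped of the Kaldi-specific bookkeeping (the offsets_ correction and the extra column that stores the scale), the recursion reduces to the sketch below, reusing the LogAdd recipe from src/base/kaldi-math.h; this is an illustration, not the patch's code:

```python
import math

def log_add(x, y):
    if x < y:
        x, y = y, x
    return x if y == -math.inf else x + math.log1p(math.exp(y - x))

def log_forward(num_states, in_transitions, final_logprobs, obs_logprobs):
    """in_transitions[s]: list of (prev_state, transition_logprob, pdf_id);
    obs_logprobs[t][pdf_id]: log pseudo-likelihood at frame t.
    Returns the total log-likelihood, rescaling every frame as AlphaRemainingFrames does."""
    num_frames = len(obs_logprobs)
    alpha = [-math.inf] * num_states
    alpha[0] = 0.0                       # start state, as in AlphaFirstFrame
    log_scale_product = 0.0
    for t in range(1, num_frames + 1):
        new_alpha = [-math.inf] * num_states
        for s in range(num_states):
            for prev, trans_logprob, pdf in in_transitions[s]:
                new_alpha[s] = log_add(new_alpha[s],
                                       alpha[prev] + trans_logprob + obs_logprobs[t - 1][pdf])
        scale = max(new_alpha)           # the real code uses LogSumExp of the row
        alpha = [a - scale for a in new_alpha]
        log_scale_product += scale
    final = -math.inf
    for s in range(num_states):
        final = log_add(final, alpha[s] + final_logprobs[s])
    return log_scale_product + final

# tiny 2-state example: 0 -> 1, then 1 self-loops; pdf 0 on the first arc, pdf 1 afterwards
in_trans = {0: [], 1: [(0, 0.0, 0), (1, 0.0, 1)]}
finals = [-math.inf, 0.0]
obs = [[math.log(0.7), math.log(0.1)], [math.log(0.2), math.log(0.5)]]
print(log_forward(2, in_trans, finals, obs))   # equals log(0.7 * 0.5)
```
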
- int32 t = supervision_.frames_per_sequence; - double *last_frame_beta = beta_.RowData(t % 2); + const int32 num_frames = supervision_.frames_per_sequence, + num_states = supervision_.e2e_fsts[seq].NumStates(); + float tot_prob = GetTotalProb(alpha); - SubMatrix beta_mat(last_frame_beta, - max_num_hmm_states_, - supervision_.num_sequences, - supervision_.num_sequences); + beta->Resize(2, num_states); + beta->Set(-std::numeric_limits::infinity()); - Vector inv_tot_prob(tot_prob_); - inv_tot_prob.InvertElements(); + SubVector beta_mat(beta->RowData(num_frames % 2), num_states); + SubVector final_probs(final_probs_.RowData(seq), num_states); - beta_mat.CopyRowsFromVec(inv_tot_prob); - beta_mat.MulElements(final_probs_); + BaseFloat inv_tot_prob = -tot_prob; + beta_mat.Set(inv_tot_prob); + beta_mat.AddVec(1.0, final_probs); } -void GenericNumeratorComputation::BetaGeneralFrame(int32 t) { +void GenericNumeratorComputation::BetaRemainingFrames(int seq, + const Matrix &probs, + const Matrix &alpha, + Matrix *beta, + Matrix *derivs) { const int32 num_sequences = supervision_.num_sequences, num_frames = supervision_.frames_per_sequence, - num_pdfs = exp_nnet_output_transposed_.NumRows(), - num_states = max_num_hmm_states_; - KALDI_ASSERT(t >= 0 && t < num_frames); - - // t_wrapped gives us the time-index we use when indexing - // nnet_output_deriv_transposed_; to save memory we limit the size of the - // matrix, storing only chunks of frames at a time, and we add it to the - // non-transposed output whenever we finish a chunk. - int32 t_wrapped = t % static_cast(kMaxDerivTimeSteps); - const SubMatrix this_alpha(alpha_.RowData(t), num_states, - num_sequences, num_sequences); - SubMatrix this_beta(beta_.RowData(t % 2), num_states, - num_sequences, num_sequences); - const SubMatrix next_beta(beta_.RowData((t + 1) % 2), num_states, - num_sequences, num_sequences); - - SubMatrix probs(exp_nnet_output_transposed_, 0, num_pdfs, - t * num_sequences, num_sequences), - log_prob_deriv(nnet_output_deriv_transposed_, 0, num_pdfs, - t_wrapped * num_sequences, num_sequences); - - for (int32 seq = 0; seq < num_sequences; seq++) { + num_states = supervision_.e2e_fsts[seq].NumStates(); + KALDI_ASSERT(seq >= 0 && seq < num_sequences); + + for (int t = num_frames - 1; t >= 0; --t) { + const BaseFloat *alpha_t = alpha.RowData(t), + *beta_tp1 = beta->RowData((t + 1) % 2), + *probs_t = probs.RowData(t); + BaseFloat *log_prob_deriv_t = derivs->RowData(t), + *beta_t = beta->RowData(t % 2); + + BaseFloat inv_arbitrary_scale = alpha_t[num_states]; for (int32 h = 0; h < supervision_.e2e_fsts[seq].NumStates(); h++) { - BaseFloat inv_arbitrary_scale = this_alpha(num_states, seq); - double tot_variable_factor = 0.0; + BaseFloat tot_variable_factor; + tot_variable_factor = -std::numeric_limits::infinity(); for (auto tr = out_transitions_[seq][h].begin(); - tr != out_transitions_[seq][h].end(); tr++) { + tr != out_transitions_[seq][h].end(); ++tr) { BaseFloat transition_prob = tr->transition_prob; int32 pdf_id = tr->pdf_id, next_hmm_state = tr->hmm_state; - double variable_factor = transition_prob * - next_beta(next_hmm_state, seq) * - probs(pdf_id, seq) / inv_arbitrary_scale; - tot_variable_factor += variable_factor; - double occupation_prob = variable_factor * this_alpha(h, seq); - log_prob_deriv(pdf_id, seq) += occupation_prob; + BaseFloat variable_factor = transition_prob + + beta_tp1[next_hmm_state] + + probs_t[pdf_id] - inv_arbitrary_scale; + tot_variable_factor = LogAdd(tot_variable_factor, + variable_factor); + + 
BaseFloat occupation_prob = variable_factor + alpha_t[h]; + log_prob_deriv_t[pdf_id] = LogAdd(log_prob_deriv_t[pdf_id], + occupation_prob); } - this_beta(h, seq) = tot_variable_factor; + beta_t[h] = tot_variable_factor; } } } -void GenericNumeratorComputation::BetaGeneralFrameDebug(int32 t) { - int32 alpha_beta_size = max_num_hmm_states_ * supervision_.num_sequences; - SubVector this_alpha(alpha_.RowData(t), alpha_beta_size), - this_beta(beta_.RowData(t % 2), alpha_beta_size); - int32 t_wrapped = t % static_cast(kMaxDerivTimeSteps), - num_pdfs = exp_nnet_output_transposed_.NumRows(); - SubMatrix this_log_prob_deriv( - nnet_output_deriv_transposed_, 0, num_pdfs, - t_wrapped * supervision_.num_sequences, supervision_.num_sequences); - double alpha_beta_product = VecVec(this_alpha, - this_beta), - this_log_prob_deriv_sum = this_log_prob_deriv.Sum(); - if (!ApproxEqual(alpha_beta_product, supervision_.num_sequences)) { - KALDI_WARN << "On time " << t << ", alpha-beta product " - << alpha_beta_product << " != " << supervision_.num_sequences - << " alpha-sum = " << this_alpha.Sum() - << ", beta-sum = " << this_beta.Sum(); - if (fabs(alpha_beta_product - supervision_.num_sequences) > 2.0 - || alpha_beta_product - alpha_beta_product != 0) { - KALDI_WARN << "Excessive error detected, will abandon this minibatch"; - ok_ = false; - } + +void GenericNumeratorComputation::AddSpecificPdfsIndirect( + Matrix *logprobs, + const std::vector &indices, + CuMatrixBase *output) { + const int32 num_sequences = supervision_.num_sequences, + frames_per_sequence = supervision_.frames_per_sequence; + + const int view_stride = output->Stride() * num_sequences; + + KALDI_ASSERT(frames_per_sequence * num_sequences == output->NumRows()); + + CuMatrix specific_pdfs; + specific_pdfs.Swap(logprobs); + specific_pdfs.ApplyExp(); + specific_pdfs.Scale(supervision_.weight); + + std::vector indices_expanded(view_stride, -1); + for (int i = 0; i < indices.size(); ++i) { + int pdf_index = indices[i]; + int sequence_local_pdf_index = pdf_index % nnet_output_stride_; + int sequence_index = pdf_index / nnet_output_stride_; + pdf_index = sequence_local_pdf_index + + sequence_index * output->Stride(); + KALDI_ASSERT(pdf_index < view_stride); + KALDI_ASSERT(i < specific_pdfs.NumCols()); + indices_expanded[pdf_index] = i; } - // Use higher tolerance, since we are using randomized pruning for the - // log-prob derivatives. 
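
Because BetaRemainingFrames() above keeps the occupation statistics in log space, they are only exponentiated when copied back to the GPU, and the new CheckValues() diagnostic further down simply verifies that the exponentiated derivatives on a frame sum to one. A stand-alone version of that check (tolerances approximate the ones in the code, data made up):

```python
import math

def check_deriv_sums(log_derivs, tol=0.05):
    """log_derivs: one list per frame of log occupation probs
    (one entry per pdf actually used by this sequence's FST)."""
    for t, frame in enumerate(log_derivs):
        deriv_sum = sum(math.exp(v) for v in frame)
        if abs(deriv_sum - 1.0) > 1e-3:
            print(f"On time {t}, deriv sum {deriv_sum} != 1.0")
        if abs(deriv_sum - 1.0) > tol or deriv_sum != deriv_sum:   # second test catches NaN
            print("Excessive error detected, will abandon this minibatch")
            return False
    return True

good = [math.log(0.25)] * 4   # posteriors sum to 1
bad = [math.log(0.4)] * 4     # posteriors sum to 1.6
print(check_deriv_sums([good]))   # True
print(check_deriv_sums([bad]))    # False
```
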
- if (!ApproxEqual(this_log_prob_deriv_sum, - supervision_.num_sequences, 0.01)) { - KALDI_WARN << "On time " << t << ", log-prob-deriv sum " - << this_log_prob_deriv_sum << " != " - << supervision_.num_sequences; - if (fabs(this_log_prob_deriv_sum - supervision_.num_sequences) > 2.0 || - this_log_prob_deriv_sum - this_log_prob_deriv_sum != 0) { - KALDI_WARN << "Excessive error detected, will abandon this minibatch"; - ok_ = false; + + CuArray cu_indices(indices_expanded); + CuSubMatrix out(output->Data(), frames_per_sequence, + view_stride, view_stride); + + out.AddCols(specific_pdfs, cu_indices); +} + +bool GenericNumeratorComputation::CheckValues(int seq, + const Matrix &probs, + const Matrix &alpha, + const Matrix &beta, + const Matrix &derivs) const { + const int32 num_frames = supervision_.frames_per_sequence; + // only check the derivs for the first and last frames + const std::vector times = {0, num_frames - 1}; + for (const int32 t: times) { + BaseFloat deriv_sum = 0.0; + for (int32 n = 0; n < probs.NumCols(); n++) { + int32 pdf_stride = nnet_output_.Stride(); + int32 pdf2seq = index_to_pdf_[n] / pdf_stride; + if (pdf2seq != seq) // this pdf is not in the space of this sequence + continue; + deriv_sum += Exp(derivs(t, n)); + } + + if (!ApproxEqual(deriv_sum, 1.0)) { + KALDI_WARN << "On time " << t + << " for seq " << seq << ", deriv sum " + << deriv_sum << " != 1.0"; + if (fabs(deriv_sum - 1.0) > 0.05 || deriv_sum - deriv_sum != 0) { + KALDI_WARN << "Excessive error detected, will abandon this minibatch"; + return false; + } } } + return true; } } // namespace chain diff --git a/src/chain/chain-generic-numerator.h b/src/chain/chain-generic-numerator.h index e7bcb524a95..fc5e00b2c63 100644 --- a/src/chain/chain-generic-numerator.h +++ b/src/chain/chain-generic-numerator.h @@ -1,6 +1,7 @@ // chain/chain-generic-numerator.h // Copyright 2017 Hossein Hadian +// 2018 Johns Hopkins University (Jan "Yenda" Trmal) // See ../../COPYING for clarification regarding multiple authors @@ -66,10 +67,38 @@ namespace chain { training). It is the same as DenominatorComputation with 2 differences: [1] it runs on CPU [2] it does not use leakyHMM + The F-B computation is done in log-domain. When the 'e2e' flag of a supervision is set, the ComputeChainObjfAndDeriv function in chain-training.cc uses GenericNumeratorComputation (instead of NumeratorCompuation) to compute the numerator derivatives. + + The implementation tries to optimize the memory transfers. The optimization + uses the observation that for each supervision graph, only very limited + number of pdfs is needed to evaluate the possible transitions from state + to state. That means that for the F-B, we don't have to transfer the whole + neural network output, we can copy only the limited set of pdfs activation + values that will be needed for F-B on the given graph. + + To streamline things, in the constructor of this class, we remap the pdfs + indices to a new space and store the bookkeeping info in the index_to_pdf_ + structure. This can be seen as if for each FST we create a subspace that + has only the pdfs that are needed for the given FST (possibly ordered + differently). + + Morover, we optimize memory transfers. The matrix of nnet outputs can be + reshaped (viewed) as a matrix of dimensions + (frames_per_sequence) x (num_sequences * pdf_stride), where the pdf_stride + is the stride of the original matrix and pdf_stride >= num_pdfs. 
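A small sketch of that reshaped view and of the per-sequence index bookkeeping it enables (simplified to a single stride; ViewColumn and BuildIndexMap are hypothetical names, while the real code builds a CuSubMatrix over the viewed data and gathers columns with AddCols):

    #include <vector>

    // Column of pdf 'p' of sequence 'seq' when the per-sequence (T x stride)
    // blocks are laid side by side as one (T x num_seqs * stride) matrix.
    inline int ViewColumn(int seq, int p, int pdf_stride) {
      return seq * pdf_stride + p;   // 0 <= p < num_pdfs <= pdf_stride
    }

    // Inverse bookkeeping: for every column of the view, which remapped pdf
    // (i.e. which column of the small per-graph matrix) should be copied there;
    // -1 marks columns that no graph transition ever looks at.
    std::vector<int> BuildIndexMap(const std::vector<int> &index_to_pdf,
                                   int pdf_stride, int num_sequences) {
      std::vector<int> map(num_sequences * pdf_stride, -1);
      for (int i = 0; i < static_cast<int>(index_to_pdf.size()); ++i) {
        int seq = index_to_pdf[i] / pdf_stride,
            p   = index_to_pdf[i] % pdf_stride;
        map[ViewColumn(seq, p, pdf_stride)] = i;
      }
      return map;
    }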
+ When the matrix is viewed this way, it becomes obvious that the pdfs of the + k-th supervision sequence have column index k * pdf_stride + original_pdf_index + Once this is understood, the way how copy all pdfs in one shot should become + obvious. + + The complete F-B is then done in this remapped space and only + when copying the activation values from the GPU memory or copying + the computed derivatives to GPU memory, we use the bookkeeping info to + map the values correctly. */ @@ -81,90 +110,94 @@ namespace chain { // and the numerator FSTs are stored in 'e2e_fsts' instead of 'fst' class GenericNumeratorComputation { - public: - /// Initializes the object. GenericNumeratorComputation(const Supervision &supervision, const CuMatrixBase &nnet_output); - // Does the forward computation. Returns the total log-prob multiplied - // by supervision_.weight. - BaseFloat Forward(); - - // Does the backward computation and (efficiently) adds the derivative of the + // Does the forward-backward computation. Returns the total log-prob + // multiplied by supervision_.weight. + // In the backward computation, add (efficiently) the derivative of the // nnet output w.r.t. the (log-prob times supervision_.weight times // deriv_weight) to 'nnet_output_deriv'. - bool Backward(CuMatrixBase *nnet_output_deriv); + bool ForwardBackward(BaseFloat *total_loglike, + CuMatrixBase *nnet_output_deriv); + BaseFloat ComputeObjf(); private: - - // Defining this constant as an enum is easier. it controls a memory/speed - // tradeoff, determining how many frames' worth of the transposed derivative - // we store at a time. It's not very critical; the only disadvantage from - // setting it small is that we have to invoke an AddMat kernel more times. - enum { kMaxDerivTimeSteps = 8 }; + // For the remapped FSTs, copy the appropriate activations to CPU memory. + // For explanation of what remapped FST is, see the large comment in the + // beginning of the file + void CopySpecificPdfsIndirect( + const CuMatrixBase &nnet_output, + const std::vector &indices, + Matrix *output); + + // For the remapped FSTs, copy the computed values back to gpu, + // expand to the original shape and add to the output matrix. + // For explanation of what remapped FST is, see the large comment in the + // beginning of the file. + void AddSpecificPdfsIndirect( + Matrix *logprobs, + const std::vector &indices, + CuMatrixBase *output); // sets up the alpha for frame t = 0. - void AlphaFirstFrame(); - - // the alpha computation for some 0 < t <= num_time_steps_. - void AlphaGeneralFrame(int32 t); - - BaseFloat ComputeTotLogLike(); - - // sets up the beta for frame t = num_time_steps_. - void BetaLastFrame(); - - // the beta computation for 0 <= beta < num_time_steps_. - void BetaGeneralFrame(int32 t); + void AlphaFirstFrame(int seq, Matrix *alpha); + + // the alpha computation for 0 < t <= supervision_.frames_per_sequence + // for some 0 <= seq < supervision_.num_sequences. + BaseFloat AlphaRemainingFrames(int seq, + const Matrix &probs, + Matrix *alpha); + + // the beta computation for 0 <= t < supervision_.frames_per_sequence + // for some 0 <= seq < supervision_.num_sequences. 
+ void BetaRemainingFrames(int32 seq, + const Matrix &probs, + const Matrix &alpha, + Matrix *beta, + Matrix *derivs); + + // the beta computation for t = supervision_.frames_per_sequence + void BetaLastFrame(int seq, + const Matrix &alpha, + Matrix *beta); + + // returns total prob for the given matrix alpha (assumes the alpha + // matrix was computed using AlphaFirstFrame() and AlphaRemainingFrames() + // (it's exactly like 'tot_probe_' in DenominatorComputation) + BaseFloat GetTotalProb(const Matrix &alpha); // some checking that we can do if debug mode is activated, or on frame zero. - // Sets ok_ to false if a bad problem is detected. - void BetaGeneralFrameDebug(int32 t); + // Returns false if a bad problem is detected. + bool CheckValues(int32 seq, + const Matrix &probs, + const Matrix &alpha, + const Matrix &beta, + const Matrix &derivs) const; const Supervision &supervision_; - // the transposed neural net output. - Matrix exp_nnet_output_transposed_; + // a reference to the nnet output. + const CuMatrixBase &nnet_output_; + int32 nnet_output_stride_; // we keep the original stride extra + // as the matrix can change before ForwardBackward // in_transitions_ lists all the incoming transitions for // each state of each numerator graph // out_transitions_ does the same but for the outgoing transitions - std::vector > > - in_transitions_, out_transitions_; + typedef std::vector > TransitionMap; + std::vector in_transitions_, out_transitions_; + std::vector index_to_pdf_; // final probs for each state of each numerator graph - Matrix final_probs_; // indexed by seq, state + Matrix final_probs_; // indexed by seq, state // an offset subtracted from the logprobs of transitions out of the first - // state of each graph to help reduce numerical problems. Note the - // generic forward-backward computations cannot be done in log-space. + // state of each graph to help reduce numerical problems. Vector offsets_; - - // maximum number of states among all the numerator graphs - // (it is used as a stride in alpha_ and beta_) - int32 max_num_hmm_states_; - - // the derivs w.r.t. the nnet outputs (transposed) - // (the dimensions and functionality is the same as in - // DenominatorComputation) - Matrix nnet_output_deriv_transposed_; - - // forward and backward probs matrices. These have the - // same dimension and functionality as alpha_ and beta_ - // in DenominatorComputation except here we don't use beta - // sums (becasue we don't use leakyHMM). However, we use - // alpha sums to help avoid numerical issues. - Matrix alpha_; - Matrix beta_; - - // vector of total probs (i.e. 
for all the sequences) - // (it's exactly like 'tot_probe_' in DenominatorComputation) - Vector tot_prob_; - - bool ok_; }; } // namespace chain diff --git a/src/chain/chain-supervision-test.cc b/src/chain/chain-supervision-test.cc index f93fa3aef7b..7ee5ee117b0 100644 --- a/src/chain/chain-supervision-test.cc +++ b/src/chain/chain-supervision-test.cc @@ -158,7 +158,7 @@ void TestSupervisionAppend(const TransitionModel &trans_model, for (int32 i = 0; i < num_append; i++) input[i] = &supervision; Supervision output; - AppendSupervision(input, &output); + MergeSupervision(input, &output); KALDI_ASSERT(output.frames_per_sequence == supervision.frames_per_sequence && output.num_sequences == num_append); @@ -364,7 +364,7 @@ void TestSupervisionSplitting(const ContextDependency &ctx_dep, std::vector to_append(num_ranges); for (int32 i = 0; i < num_ranges; i++) to_append[i] = &(split_supervision[i]); - AppendSupervision(to_append, &reattached_supervision); + MergeSupervision(to_append, &reattached_supervision); ChainTrainingTest(den_graph, reattached_supervision); if (num_frames % frames_per_range == 0) { TestSupervisionReattached(trans_model, @@ -495,7 +495,7 @@ void ChainSupervisionTest() { Supervision supervision; if (!ProtoSupervisionToSupervision(*ctx_dep, *trans_model, - proto_sup1, &supervision)) { + proto_sup1, true, &supervision)) { // we shouldn't fail because we multiplied by // 'subsample_factor' when creating the duration. KALDI_ERR << "Failed creating supervision."; @@ -521,6 +521,15 @@ void ChainSupervisionTest() { ChainTrainingTest(den_graph, supervision); } + // Test IO for supervisions which have transition id's as labels + if (!ProtoSupervisionToSupervision(*ctx_dep, *trans_model, + proto_sup1, false, &supervision)) { + KALDI_ERR << "Failed creating supervision with transition-ids as labels."; + } else { + supervision.Check(*trans_model); + TestSupervisionIo(supervision); + } + delete ctx_dep; delete trans_model; } diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 30aff50170b..8f95034c437 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -223,7 +223,7 @@ bool PhoneLatticeToProtoSupervision(const SupervisionOptions &opts, if (opts.lm_scale != 0.0) fst::Push(&(proto_supervision->fst), fst::REWEIGHT_TO_INITIAL, fst::kDelta, true); - + return true; } @@ -237,10 +237,14 @@ bool TimeEnforcerFst::GetArc(StateId s, Label ilabel, fst::StdArc* oarc) { } if (std::binary_search(allowed_phones_[s].begin(), allowed_phones_[s].end(), phone)) { - // the olabel will be a pdf-id plus one, not a transition-id. - int32 pdf_id = trans_model_.TransitionIdToPdf(ilabel); oarc->ilabel = ilabel; - oarc->olabel = pdf_id + 1; + if (convert_to_pdfs_) { + // the olabel will be a pdf-id plus one, not a transition-id. 
+ int32 pdf_id = trans_model_.TransitionIdToPdf(ilabel); + oarc->olabel = pdf_id + 1; + } else { + oarc->olabel = ilabel; + } oarc->weight = fst::TropicalWeight::One(); oarc->nextstate = s + 1; return true; @@ -250,7 +254,7 @@ bool TimeEnforcerFst::GetArc(StateId s, Label ilabel, fst::StdArc* oarc) { } bool TrainingGraphToSupervisionE2e( - const fst::StdVectorFst& training_graph, + const fst::StdVectorFst &training_graph, const TransitionModel &trans_model, int32 num_frames, Supervision *supervision) { @@ -276,7 +280,6 @@ bool TrainingGraphToSupervisionE2e( aiter.SetValue(arc2); } } - supervision->e2e = true; supervision->e2e_fsts.clear(); supervision->e2e_fsts.resize(1); supervision->e2e_fsts[0] = trans2word_fst; @@ -291,6 +294,7 @@ bool ProtoSupervisionToSupervision( const ContextDependencyInterface &ctx_dep, const TransitionModel &trans_model, const ProtoSupervision &proto_supervision, + bool convert_to_pdfs, Supervision *supervision) { using fst::VectorFst; using fst::StdArc; @@ -338,10 +342,13 @@ bool ProtoSupervisionToSupervision( // when we compose with the denominator graph. BaseFloat self_loop_scale = 0.0; - bool reorder = true; // more efficient in general; won't affect results. + // You should always set reorder to true; for the current chain-model + // topologies, it will affect results if you are inconsistent about this. + bool reorder = true, + check_no_self_loops = true; // add self-loops to the FST with transition-ids as its labels. AddSelfLoops(trans_model, disambig_syms_h, self_loop_scale, reorder, - &transition_id_fst); + check_no_self_loops, &transition_id_fst); // at this point transition_id_fst will have transition-ids as its ilabels and // context-dependent phones (indexes into ILabelInfo()) as its olabels. @@ -357,14 +364,19 @@ bool ProtoSupervisionToSupervision( // The last step is to enforce that phones can only appear on the frames they // are 'allowed' to appear on. This will also convert the FST to have pdf-ids // plus one as the labels - TimeEnforcerFst enforcer_fst(trans_model, proto_supervision.allowed_phones); + TimeEnforcerFst enforcer_fst(trans_model, + convert_to_pdfs, + proto_supervision.allowed_phones); ComposeDeterministicOnDemand(transition_id_fst, &enforcer_fst, &(supervision->fst)); fst::Connect(&(supervision->fst)); - // at this point supervision->fst will have pdf-ids plus one as the olabels, - // but still transition-ids as the ilabels. Copy olabels to ilabels. - fst::Project(&(supervision->fst), fst::PROJECT_OUTPUT); + + if (convert_to_pdfs) { + // at this point supervision->fst will have pdf-ids plus one as the olabels, + // but still transition-ids as the ilabels. Copy olabels to ilabels. 
+ fst::Project(&(supervision->fst), fst::PROJECT_OUTPUT); + } KALDI_ASSERT(supervision->fst.Properties(fst::kIEpsilons, true) == 0); if (supervision->fst.NumStates() == 0) { @@ -377,13 +389,15 @@ bool ProtoSupervisionToSupervision( supervision->weight = 1.0; supervision->num_sequences = 1; supervision->frames_per_sequence = proto_supervision.allowed_phones.size(); - supervision->label_dim = trans_model.NumPdfs(); + if (convert_to_pdfs) + supervision->label_dim = trans_model.NumPdfs(); + else + supervision->label_dim = trans_model.NumTransitionIds(); SortBreadthFirstSearch(&(supervision->fst)); return true; } - SupervisionSplitter::SupervisionSplitter( const Supervision &supervision): supervision_(supervision), @@ -397,50 +411,10 @@ SupervisionSplitter::SupervisionSplitter( KALDI_WARN << "Splitting already-reattached sequence (only expected in " << "testing code)"; } - int32 num_states = fst.NumStates(), - num_frames = supervision_.frames_per_sequence * supervision_.num_sequences; - KALDI_ASSERT(num_states > 0); - int32 start_state = fst.Start(); - // FST should be top-sorted and connected, so start-state must be 0. - KALDI_ASSERT(start_state == 0 && "Expecting start-state to be 0"); - frame_[start_state] = 0; - for (int32 state = 0; state < num_states; state++) { - int32 cur_frame = frame_[state]; - if (cur_frame == -1) { - // If this happens it means the Supervision does not have the required - // properties, e.g. being top-sorted and connected. - KALDI_ERR << "Error computing frame indexes for Supervision"; - } - for (fst::ArcIterator aiter(fst, state); - !aiter.Done(); aiter.Next()) { - const fst::StdArc &arc = aiter.Value(); - // The FST is supposed to be an epsilon-free acceptor. - KALDI_ASSERT(arc.ilabel == arc.olabel && arc.ilabel > 0); - int32 nextstate = arc.nextstate; - KALDI_ASSERT(nextstate >= 0 && nextstate < num_states); - // all arcs go from some t to t + 1. - int32 &next_frame = frame_[nextstate]; - if (next_frame == -1) - next_frame = cur_frame + 1; - else - KALDI_ASSERT(next_frame == cur_frame + 1); - } - } - // The following assert checks that the number of frames in the FST - // matches the num_frames stored in the supervision object; it also relies - // on the topological sorting and connectedness of the FST. - KALDI_ASSERT(frame_.back() == num_frames); - std::vector::iterator iter = frame_.begin(), - end = iter + (frame_.size() - 1); - // check that the frame-indexes of states are monotonically non-decreasing, as - // they should be based on the top-sorting. We rely on this property to - // compute the frame ranges while splitting. - while (iter != end) { - int32 cur_t = *iter; - ++iter; - int32 next_t = *iter; - KALDI_ASSERT(next_t >= cur_t); - } + int32 num_frames = supervision_.frames_per_sequence * + supervision_.num_sequences; + int32 ans = ComputeFstStateTimes(fst, &frame_); + KALDI_ASSERT(ans == num_frames); } void SupervisionSplitter::GetFrameRange(int32 begin_frame, int32 num_frames, @@ -574,7 +548,10 @@ void Supervision::Write(std::ostream &os, bool binary) const { WriteBasicType(os, binary, label_dim); KALDI_ASSERT(frames_per_sequence > 0 && label_dim > 0 && num_sequences > 0); + bool e2e = !e2e_fsts.empty(); WriteToken(os, binary, ""); + // the following is of course redundant, but it's for back compatibility + // reasons. 
WriteBasicType(os, binary, e2e); if (!e2e) { if (binary == false) { @@ -606,6 +583,10 @@ void Supervision::Write(std::ostream &os, bool binary) const { } WriteToken(os, binary, ""); } + if (!alignment_pdfs.empty()) { + WriteToken(os, binary, ""); + WriteIntegerVector(os, binary, alignment_pdfs); + } WriteToken(os, binary, ""); } @@ -615,8 +596,8 @@ void Supervision::Swap(Supervision *other) { std::swap(frames_per_sequence, other->frames_per_sequence); std::swap(label_dim, other->label_dim); std::swap(fst, other->fst); - std::swap(e2e, other->e2e); std::swap(e2e_fsts, other->e2e_fsts); + std::swap(alignment_pdfs, other->alignment_pdfs); } void Supervision::Read(std::istream &is, bool binary) { @@ -629,12 +610,9 @@ void Supervision::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &frames_per_sequence); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &label_dim); - if (PeekToken(is, binary) == 'E') { - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &e2e); - } else { - e2e = false; - } + bool e2e; + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &e2e); if (!e2e) { if (!binary) { ReadFstKaldi(is, binary, &fst); @@ -665,6 +643,12 @@ void Supervision::Read(std::istream &is, bool binary) { } ExpectToken(is, binary, ""); } + if (PeekToken(is, binary) == 'A') { + ExpectToken(is, binary, ""); + ReadIntegerVector(is, binary, &alignment_pdfs); + } else { + alignment_pdfs.clear(); + } ExpectToken(is, binary, ""); } @@ -698,6 +682,17 @@ int32 ComputeFstStateTimes(const fst::StdVectorFst &fst, KALDI_ERR << "Input FST does not have required properties."; } } + std::vector::iterator iter = state_times->begin(), + end = iter + (num_states - 1); + // check that the frame-indexes of states are monotonically non-decreasing, as + // they should be based on the top-sorting. We rely on this property to + // compute the frame ranges while splitting. + while (iter != end) { + int32 cur_t = *iter; + ++iter; + int32 next_t = *iter; + KALDI_ASSERT(next_t >= cur_t); + } if (total_length < 0) KALDI_ERR << "Input FST does not have required properties."; return total_length; @@ -707,28 +702,33 @@ Supervision::Supervision(const Supervision &other): weight(other.weight), num_sequences(other.num_sequences), frames_per_sequence(other.frames_per_sequence), label_dim(other.label_dim), fst(other.fst), - e2e(other.e2e), e2e_fsts(other.e2e_fsts) { } + e2e_fsts(other.e2e_fsts), alignment_pdfs(other.alignment_pdfs) { } -// This static function is called by AppendSupervision if the supervisions +// This static function is called by MergeSupervision if the supervisions // are end2end. It simply puts all e2e FST's into 1 supervision. 
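A hypothetical caller-side sketch of the renamed MergeSupervision() API, e.g. when combining several single-sequence supervisions into one minibatch-level object (the e2e helper it dispatches to follows right below; MergeForMinibatch is an assumed wrapper name):

    #include <vector>
    #include "chain/chain-supervision.h"

    void MergeForMinibatch(
        const std::vector<kaldi::chain::Supervision> &supervisions,
        kaldi::chain::Supervision *merged) {
      std::vector<const kaldi::chain::Supervision*> ptrs;
      ptrs.reserve(supervisions.size());
      for (size_t i = 0; i < supervisions.size(); i++)
        ptrs.push_back(&(supervisions[i]));
      // For e2e supervisions, 'merged' ends up with num_sequences == ptrs.size()
      // and one e2e_fsts entry per input; label_dim must match across inputs.
      kaldi::chain::MergeSupervision(ptrs, merged);
    }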
-void AppendSupervisionE2e(const std::vector &input, +void MergeSupervisionE2e(const std::vector &input, Supervision *output_supervision) { KALDI_ASSERT(!input.empty()); - KALDI_ASSERT(input[0]->e2e); KALDI_ASSERT(input[0]->e2e_fsts.size() == 1); *output_supervision = *(input[0]); - for (int32 i = 1; i < input.size(); i++) { + output_supervision->e2e_fsts.reserve(input.size()); + int32 frames_per_sequence = output_supervision->frames_per_sequence, + num_seqs = input.size(); + for (int32 i = 1; i < num_seqs; i++) { output_supervision->num_sequences++; KALDI_ASSERT(input[i]->e2e_fsts.size() == 1); KALDI_ASSERT(input[i]->frames_per_sequence == - output_supervision->frames_per_sequence); + frames_per_sequence); output_supervision->e2e_fsts.push_back(input[i]->e2e_fsts[0]); } + output_supervision->alignment_pdfs.clear(); + // The program nnet3-chain-acc-lda-stats works on un-merged egs, + // and there is no need to support merging of 'alignment_pdfs' } -void AppendSupervision(const std::vector &input, - Supervision *output_supervision) { +void MergeSupervision(const std::vector &input, + Supervision *output_supervision) { KALDI_ASSERT(!input.empty()); int32 label_dim = input[0]->label_dim, num_inputs = input.size(); @@ -736,14 +736,16 @@ void AppendSupervision(const std::vector &input, *output_supervision = *(input[0]); return; } - if (input[0]->e2e) { - AppendSupervisionE2e(input, output_supervision); + if (!input[0]->e2e_fsts.empty()) { + MergeSupervisionE2e(input, output_supervision); return; } - for (int32 i = 1; i < num_inputs; i++) + for (int32 i = 1; i < num_inputs; i++) { KALDI_ASSERT(input[i]->label_dim == label_dim && "Trying to append incompatible Supervision objects"); + KALDI_ASSERT(input[i]->alignment_pdfs.empty()); + } *output_supervision = *(input[num_inputs-1]); for (int32 i = num_inputs - 2; i >= 0; i--) { const Supervision &src = *(input[i]); @@ -773,7 +775,6 @@ void AppendSupervision(const std::vector &input, // be sorted). bool AddWeightToSupervisionFstE2e(const fst::StdVectorFst &normalization_fst, Supervision *supervision) { - KALDI_ASSERT(supervision->e2e); KALDI_ASSERT(supervision->num_sequences == 1); KALDI_ASSERT(supervision->e2e_fsts.size() == 1); // Remove epsilons before composing. 'normalization_fst' has no epsilons so @@ -798,7 +799,7 @@ bool AddWeightToSupervisionFstE2e(const fst::StdVectorFst &normalization_fst, bool AddWeightToSupervisionFst(const fst::StdVectorFst &normalization_fst, Supervision *supervision) { - if (supervision->e2e) + if (!supervision->e2e_fsts.empty()) return AddWeightToSupervisionFstE2e(normalization_fst, supervision); // remove epsilons before composing. 
'normalization_fst' has noepsilons so @@ -902,9 +903,11 @@ void Supervision::Check(const TransitionModel &trans_mdl) const { KALDI_ERR << "Invalid frames_per_sequence: " << frames_per_sequence; if (num_sequences <= 0) KALDI_ERR << "Invalid num_sequences: " << num_sequences; - if (label_dim != trans_mdl.NumPdfs()) + if (!(label_dim == trans_mdl.NumPdfs() || + label_dim == trans_mdl.NumTransitionIds())) KALDI_ERR << "Invalid label-dim: " << label_dim - << ", expected " << trans_mdl.NumPdfs(); + << ", expected " << trans_mdl.NumPdfs() + << " or " << trans_mdl.NumTransitionIds(); std::vector state_times; if (frames_per_sequence * num_sequences != ComputeFstStateTimes(fst, &state_times)) @@ -956,6 +959,158 @@ void GetWeightsForRanges(int32 range_length, } } +bool ConvertSupervisionToUnconstrained( + const TransitionModel &trans_mdl, + Supervision *supervision) { + KALDI_ASSERT(supervision->label_dim == trans_mdl.NumTransitionIds() && + supervision->fst.NumStates() > 0 && + supervision->e2e_fsts.empty() && + supervision->alignment_pdfs.empty()); + + // Remove epsilons that will have been introduced into supervision->fst by + // class SupervisionSplitter (they make it harder to identify arcs that are on + // the first frame). + fst::RmEpsilon(&(supervision->fst)); + + { // Set supervision->alignment_pdfs to the label sequence on a randomly chosen + // path through supervision->fst. This is only needed for computing LDA + // stats in `nnet3-chain-acc-lda-stats`. + fst::UniformArcSelector selector; + fst::RandGenOptions > randgen_opts( + selector); + fst::StdVectorFst single_path_fst; + fst::RandGen(supervision->fst, &single_path_fst, randgen_opts); + fst::GetLinearSymbolSequence(single_path_fst, &(supervision->alignment_pdfs), + static_cast*>(NULL), + static_cast(NULL)); + + if (static_cast(supervision->alignment_pdfs.size()) != + supervision->frames_per_sequence) { + KALDI_ERR << "Length mismatch between FST and frames-per-sequence."; + } + for (int32 i = 0; i < supervision->frames_per_sequence; i++) { + supervision->alignment_pdfs[i] = + trans_mdl.TransitionIdToPdf(supervision->alignment_pdfs[i]); + } + } + + + { + int32 num_states = supervision->fst.NumStates(), + start_state = supervision->fst.Start(), + num_transition_ids = trans_mdl.NumTransitionIds(); + for (int32 s = 0; s < num_states; s++) { + for (fst::MutableArcIterator aiter( + &(supervision->fst), s); + !aiter.Done(); aiter.Next()) { + fst::StdArc arc = aiter.Value(); + // First replace all output labels with epsilon. + arc.olabel = 0; + int32 transition_id = arc.ilabel; + KALDI_ASSERT(transition_id <= num_transition_ids); + // Then remove all self-loop transitions except those on the 1st frame + // (which must come from the start state, since the FST was epsilon free). + // The reason for allowing them on the 1st frame, if they were already + // there, is because we want to allow phones to be cut in half on + // chunk boundaries. We don't have to do anything special on the + // last frame. (Note that the self-loops come after forward transitions, + // because these graphs are always built with reorder == true; if it was + // built with reorder == false, we'd have to treat the last, not first, + // frame specially.) + if (trans_mdl.IsSelfLoop(transition_id) && s != start_state) + arc.ilabel = 0; + aiter.SetValue(arc); + } + } + } + + { + // We determinize using DeterminizeStar, which removes epsilons while + // determinizing. It can't fail because the FST is functional (all output + // paths are epsilons) and acyclic. 
[Note: by "functional" here we have a + // more natural definition of functional than Mohri likely uses in the + // context of determinization; we mean, functional after removing epsilons] + supervision->e2e_fsts.resize(1); + bool is_partial = fst::DeterminizeStar(supervision->fst, + &(supervision->e2e_fsts[0])); + if (is_partial) { + KALDI_WARN << "Partial FST generated when determinizing supervision; " + "abandoning this chunk."; + return false; + } + supervision->fst.DeleteStates(); + fst::Minimize(&(supervision->e2e_fsts[0])); + if (supervision->e2e_fsts[0].NumStates() == 0) { + // this should not happen-- likely a code bug or mismatch of some kind. + KALDI_WARN << "Supervision FST became empty."; + return false; + } + } + + { // Add self-loops to the FST. (At this point we move it to + // supervision->e2e_fsts[0]). + + // There are be no disambiguation symbols here. + std::vector disambig_syms; + // We're not adding transition probabilities; we rely on compsition with the + // normalization FST for that. (note: all transition probabilities are just + // 0.5 anyway, for the typical chain topology). + BaseFloat self_loop_scale = 0.0; + // 'reorder' must always be true for chain models. + bool reorder = true; + // The FST we're about to call AddSelfLoops() on will have self-loops, on + // the first frame, so disable the check that the FST was originally + // self-loop-free. + bool check_no_self_loops = false; + supervision->e2e_fsts.resize(1); + AddSelfLoops(trans_mdl, disambig_syms, self_loop_scale, + reorder, check_no_self_loops, &(supervision->e2e_fsts[0])); + } + + { // Convert transition-ids to pdf-ids+1 on the FST labels, + // and copy ilabels to olabels. + fst::StdVectorFst &e2e_fst = supervision->e2e_fsts[0]; + int32 num_states = e2e_fst.NumStates(); + for (int32 s = 0; s < num_states; s++) { + for (fst::MutableArcIterator aiter(&e2e_fst, s); + !aiter.Done(); aiter.Next()) { + fst::StdArc arc = aiter.Value(); + // There will be a few zero ilabels at this point, due to how + // AddSelfLoops() works (it calls MakePrecedingInputSymbolsSame(), which + // adds epsilons). zero olabels. + if (arc.ilabel != 0) { + int32 pdf_id_plus_one = trans_mdl.TransitionIdToPdf(arc.ilabel) + 1; + arc.ilabel = pdf_id_plus_one; + arc.olabel = pdf_id_plus_one; + aiter.SetValue(arc); + } + } + } + supervision->label_dim = trans_mdl.NumPdfs(); + } + + { + // AddSelfLoops() adds epsilons, and we don't want these. Determinize-star + // (which removes epsilons) and minimize again. + fst::StdVectorFst temp_fst(supervision->e2e_fsts[0]); + bool is_partial = fst::DeterminizeStar(temp_fst, + &(supervision->e2e_fsts[0])); + if (is_partial) { + KALDI_WARN << "Partial FST generated when determinizing supervision; " + "abandoning this chunk."; + return false; + } + fst::Minimize(&(supervision->e2e_fsts[0])); + fst::Connect(&(supervision->e2e_fsts[0])); + if (supervision->e2e_fsts[0].NumStates() == 0) { + // this should not happen-- likely a code bug or mismatch of some kind. 
+ KALDI_WARN << "Supervision FST became empty."; + return false; + } + } + return true; +} + } // namespace chain } // namespace kaldi diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h index e52602e1c12..f1a796dc2f8 100644 --- a/src/chain/chain-supervision.h +++ b/src/chain/chain-supervision.h @@ -52,12 +52,14 @@ struct SupervisionOptions { int32 frame_subsampling_factor; BaseFloat weight; BaseFloat lm_scale; + bool convert_to_pdfs; SupervisionOptions(): left_tolerance(5), right_tolerance(5), frame_subsampling_factor(1), weight(1.0), - lm_scale(0.0) { } + lm_scale(0.0), + convert_to_pdfs(true) { } void Register(OptionsItf *opts) { opts->Register("left-tolerance", &left_tolerance, "Left tolerance for " @@ -76,6 +78,8 @@ struct SupervisionOptions { opts->Register("lm-scale", &lm_scale, "The scale with which the graph/lm " "weights from the phone lattice are included in the " "supervision fst."); + opts->Register("convert-to-pdfs", &convert_to_pdfs, "If true, convert " + "transition-ids to pdf-ids + 1 in supervision FSTs."); } void Check() const; }; @@ -178,8 +182,10 @@ class TimeEnforcerFst: typedef fst::StdArc::Label Label; TimeEnforcerFst(const TransitionModel &trans_model, + bool convert_to_pdfs, const std::vector > &allowed_phones): trans_model_(trans_model), + convert_to_pdfs_(convert_to_pdfs), allowed_phones_(allowed_phones) { } // We cannot use "const" because the pure virtual function in the interface is @@ -199,6 +205,10 @@ class TimeEnforcerFst: private: const TransitionModel &trans_model_; + // if convert_to_pdfs_ is true, this FST will map from transition-id (on the + // input side) to pdf-id plus one (on the output side); if false, both sides' + // labels will be transition-id. + bool convert_to_pdfs_; const std::vector > &allowed_phones_; }; @@ -223,33 +233,61 @@ struct Supervision { int32 frames_per_sequence; // the maximum possible value of the labels in 'fst' (which go from 1 to - // label_dim). This should equal the NumPdfs() in the TransitionModel object. - // Included to avoid training on mismatched egs. + // label_dim). For fully-processed examples this will equal the NumPdfs() in the + // TransitionModel object, but for newer-style "unconstrained" examples + // that have been output by chain-get-supervision but not yet processed + // by nnet3-chain-get-egs, it will be the NumTransitionIds() of the + // TransitionModel object. int32 label_dim; // This is an epsilon-free unweighted acceptor that is sorted in increasing // order of frame index (this implies it's topologically sorted but it's a - // stronger condition). The labels are pdf-ids plus one (to avoid epsilons, - // since pdf-ids are zero-based). Each successful path in 'fst' has exactly - // 'frames_per_sequence * num_sequences' arcs on it (first 'frames_per_sequence' arcs for the - // first sequence; then 'frames_per_sequence' arcs for the second sequence, and so on). + // stronger condition). The labels will normally be pdf-ids plus one (to avoid epsilons, + // since pdf-ids are zero-based), but for newer-style "unconstrained" examples + // that have been output by chain-get-supervision but not yet processed + // by nnet3-chain-get-egs, they will be transition-ids. + // Each successful path in 'fst' has exactly 'frames_per_sequence * + // num_sequences' arcs on it (first 'frames_per_sequence' arcs for the first + // sequence; then 'frames_per_sequence' arcs for the second sequence, and so + // on). 
fst::StdVectorFst fst; - // if the 'e2e' flag is set to true, it means that this supervision is meant - // to be used in end-to-end (i.e. flat-start) chain training. In that case, - // the numerator FST's are no longer stored in 'fst' but instead they are - // stored in e2e_fsts which is a list (with size() == 'num_sequences'). - // That's because end-to-end numerator FST's are similar to training FST's - // used in gmm monophone flat-start training (i.e. they have self-loops) - // and therefore they can't be appended into a single long FST. - // The function responsible for creating an end-to-end 'supervision' - // is TrainingGraphToSupervision(). - // To find out more about end-to-end training, see chain-generic-numerator.h - bool e2e; // end to end + // 'e2e_fsts' may be set as an alternative to 'fst'. These FSTs are used + // when the numerator computation will be done with 'full forward_backward' + // instead of constrained in time. (The 'constrained in time' fsts are + // how we described it in the original LF-MMI paper, where each phone can + // only occur at the same time it occurred in the lattice, extended by + // a tolerance). + // + // This 'e2e_fsts' is an array of FSTs, one per sequence, that are acceptors + // with (pdf_id + 1) on the labels, just like 'fst', but which are cyclic FSTs. + // Unlike with 'fst', it is not the case with 'e2e_fsts' that each arc + // corresponds to a specific frame). + // + // There are two situations 'e2e_fsts' might be set. + // The first is in 'end-to-end' training, where we train without a tree from + // a flat start. The function responsible for creating this object in that + // case is TrainingGraphToSupervision(); to find out more about end-to-end + // training, see chain-generic-numerator.h + // The second situation is where we create the supervision from lattices, + // and split them into chunks using the time marks in the lattice, but then + // make a cyclic FST, and don't enforce the times on the lattice inside the + // chunk. [Code location TBD]. std::vector e2e_fsts; + + // This member is only set to a nonempty value if we are creating 'unconstrained' + // egs. These are egs that are split into chunks using the lattice alignments, + // but then within the chunks we remove the frame-level constraints on which + // phones can appear when, and use the 'e2e_fsts' member. + // + // It is only required in order to accumulate the LDA stats using + // `nnet3-chain-acc-lda-stats`, and it is not merged by nnet3-chain-merge-egs; + // it will only be present for un-merged egs. + std::vector alignment_pdfs; + Supervision(): weight(1.0), num_sequences(1), frames_per_sequence(-1), - label_dim(-1), e2e(false) { } + label_dim(-1) { } Supervision(const Supervision &other); @@ -267,17 +305,21 @@ struct Supervision { /** This function creates a Supervision object from a ProtoSupervision object. - The labels will be pdf-ids plus one. It sets supervision->label_dim - trans_model.NumPdfs(). - It returns true on success, and false on failure; the only failure mode is - that it might return false on that would not be a bug, is when the FST is + If convert_to_pdfs is true then the labels will be pdf-ids plus one and + supervision->label_dim will be set to trans_model.NumPdfs(); otherwise, the + labels will be transition-ids and supervision->label_dim will be + trans_model.NumTransitionIds(). 
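A hedged usage sketch of this extended interface (MakeSupervision is a hypothetical wrapper, shown only to illustrate the new argument):

    #include "tree/context-dep.h"
    #include "hmm/transition-model.h"
    #include "chain/chain-supervision.h"

    // convert_to_pdfs == true  -> labels are pdf-ids + 1, label_dim = NumPdfs()
    // convert_to_pdfs == false -> labels are transition-ids,
    //                             label_dim = NumTransitionIds()
    bool MakeSupervision(const kaldi::ContextDependencyInterface &ctx_dep,
                         const kaldi::TransitionModel &trans_model,
                         const kaldi::chain::ProtoSupervision &proto,
                         bool convert_to_pdfs,
                         kaldi::chain::Supervision *sup) {
      return kaldi::chain::ProtoSupervisionToSupervision(ctx_dep, trans_model,
                                                         proto, convert_to_pdfs,
                                                         sup);
    }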
+ + It returns true on success, and false on failure; the only failure mode for + which it might return false that would not be a bug, is when the FST is empty because there were too many phones for the number of frames. */ bool ProtoSupervisionToSupervision( const ContextDependencyInterface &ctx_dep, const TransitionModel &trans_model, const ProtoSupervision &proto_supervision, + bool convert_to_pdfs, Supervision *supervision); /** This function creates and initializes an end-to-end supervision object @@ -388,17 +430,11 @@ int32 ComputeFstStateTimes(const fst::StdVectorFst &fst, std::vector *state_times); -/// This function appends a list of supervision objects to create what will -/// usually be a single such object, but if the weights and num-frames are not -/// all the same it will only append Supervision objects where successive ones -/// have the same weight and num-frames. -/// The normal use-case for this is when you are combining neural-net examples for -/// training; appending them like this helps to simplify the training process. -/// This function will crash if the values of label_dim in the inputs are not -/// all the same. -void AppendSupervision(const std::vector &input, - Supervision *output_supervision); +/// This function merges a list of supervision objects, which must have the +/// same num-frames and label-dim. +void MergeSupervision(const std::vector &input, + Supervision *output_supervision); /// This function helps you to pseudo-randomly split a sequence of length 'num_frames', @@ -439,6 +475,18 @@ void GetWeightsForRanges(int32 range_length, std::vector > *weights); +/// This function converts a 'Supervision' object that has a non-cyclic FST +/// as its 'fst' member, and converts it to one that has a cyclic FST in +/// its e2e_fsts[0], and has 'alignment_pdfs' set to a random path through +/// the original 'fst' (this used only in the binary nnet3-chain-acc-lda-stats). +/// This can be used to train without any constraints on the alignment of phones +/// internal to chunks, while still imposing constraints at chunk boundaries. +/// It returns true on success, and false if some kind of error happened +/// (this is not expected). +bool ConvertSupervisionToUnconstrained( + const TransitionModel &trans_mdl, + Supervision *supervision); + typedef TableWriter > SupervisionWriter; typedef SequentialTableReader > SequentialSupervisionReader; diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 1d357ace106..6b4a7b593c2 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -75,25 +75,22 @@ void ComputeChainObjfAndDerivE2e(const ChainTrainingOptions &opts, GenericNumeratorComputation numerator(supervision, nnet_output); // note: supervision.weight is included as a factor in the derivative from // the numerator object, as well as the returned logprob. 
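The call pattern that the hunk below switches to can be summarized with a small hedged sketch (NumeratorPass is an assumed helper name; the declarations match the header changes above):

    #include "cudamatrix/cu-matrix.h"
    #include "chain/chain-generic-numerator.h"

    // Returns false if the numerator computation failed or produced a
    // non-finite log-prob; on success *num_logprob_weighted holds the weighted
    // numerator log-prob and, if 'deriv' is non-NULL, the derivative has been
    // added to it.
    bool NumeratorPass(const kaldi::chain::Supervision &supervision,
                       const kaldi::CuMatrixBase<kaldi::BaseFloat> &nnet_output,
                       kaldi::CuMatrixBase<kaldi::BaseFloat> *deriv,
                       kaldi::BaseFloat *num_logprob_weighted) {
      kaldi::chain::GenericNumeratorComputation numerator(supervision,
                                                          nnet_output);
      bool ok;
      if (deriv != NULL) {
        ok = numerator.ForwardBackward(num_logprob_weighted, deriv);
      } else {
        *num_logprob_weighted = numerator.ComputeObjf();
        ok = true;
      }
      // x - x == 0 is false for NaN and +/-inf, i.e. a cheap finiteness check.
      return ok && (*num_logprob_weighted - *num_logprob_weighted == 0);
    }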
- num_logprob_weighted = numerator.Forward(); - KALDI_VLOG(2) << "Numerator logprob per frame: " - << num_logprob_weighted / (*weight); - numerator_ok = (num_logprob_weighted - num_logprob_weighted == 0); - if (!numerator_ok) - KALDI_LOG << "Numerator forward failed."; - - if (xent_output_deriv && numerator_ok) { - numerator_ok = numerator.Backward(xent_output_deriv); - if (!numerator_ok) - KALDI_LOG << "Numerator backward failed."; - if (nnet_output_deriv) + if (xent_output_deriv) { + numerator_ok = numerator.ForwardBackward(&num_logprob_weighted, + xent_output_deriv); + if (numerator_ok && nnet_output_deriv) nnet_output_deriv->AddMat(1.0, *xent_output_deriv); - } else if (nnet_output_deriv && numerator_ok) { - numerator_ok = numerator.Backward(nnet_output_deriv); - if (!numerator_ok) - KALDI_LOG << "Numerator backward failed."; + } else if (nnet_output_deriv) { + numerator_ok = numerator.ForwardBackward(&num_logprob_weighted, + nnet_output_deriv); + } else { + num_logprob_weighted = numerator.ComputeObjf(); } + if (!numerator_ok) + KALDI_WARN << "Numerator forward-backward failed."; } + numerator_ok = numerator_ok && + (num_logprob_weighted - num_logprob_weighted == 0); *objf = num_logprob_weighted - den_logprob_weighted; if (!((*objf) - (*objf) == 0) || !denominator_ok || !numerator_ok) { @@ -150,7 +147,7 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, CuMatrix *xent_output_deriv) { - if (supervision.e2e) { + if (!supervision.e2e_fsts.empty()) { ComputeChainObjfAndDerivE2e(opts, den_graph, supervision, nnet_output, objf, l2_term, weight, nnet_output_deriv, xent_output_deriv); diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index d6535902625..6ea70b5ca41 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -53,6 +53,7 @@ struct ChainTrainingOptions { // B, with transition from A to B, so we don't have to consider epsilon loops- // or just imagine the coefficient is small enough that we can ignore the // epsilon loops. + // Note: we generally set leaky_hmm_coefficient to 0.1. BaseFloat leaky_hmm_coefficient; diff --git a/src/chain/context-dep-topology.h b/src/chain/context-dep-topology.h deleted file mode 100644 index 5eae267a5cf..00000000000 --- a/src/chain/context-dep-topology.h +++ /dev/null @@ -1,129 +0,0 @@ -// chain/context-dep-topology.h - -// Copyright 2015 Johns Hopkins University (Author: Daniel Povey) - - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef KALDI_CHAIN_CONTEXT_DEP_TOPOLOGY_H_ -#define KALDI_CHAIN_CONTEXT_DEP_TOPOLOGY_H_ - -#include -#include - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "fstext/fstext-lib.h" -#include "chain/phone-topology.h" -#include "chain/phone-context.h" - -namespace kaldi { -namespace chain { - - -/** - The 'ContextDepTopology' object is responsible for combining the - 'PhoneTopology' model, which describes the quasi-HMM topology for each phone, - and the 'PhoneContext' model, which describes how we create left-context - dependent phones. It also allocates 'graph-labels' and 'output-labels'. It - is analogous to 'HC' in the 'HCLG' recipe. It's of a manageable size as an - FST, because we limit ourselves to left context. - - A 'graph-label' is one-based, is sufficient to identify the logical CD-phone - and the label in the topology, and can also be mapped to an 'output-label'. - - The output-label is also one-based; it is sufficient to identify the physical - CD-phone and the label in the topology object, but won't let you identify - the monophone (because output-labels may be shared between monophones). - - The neural-net output is indexed by the output-label minus one (to form - a zero-based index). -*/ - -class ContextDepTopology { - public: - - ContextDepTopology(); - - ContextDepTopology(const PhoneTopology &topology, - const PhoneContext &context); - - const PhoneTopology &GetPhoneTopology() { return phone_topology_; } - - const PhoneContext &GetPhoneContext() { return phone_context_; } - - // Returns the number of output-labels (labels corresponding to the neural-net - // output). The actual neural-net output matrix is indexed by the label minus - // one, which we call an output-index. - int32 NumOutputLabels(); - - // Returns the number of graph-labels. A graph-label is what will typically - // appear in HCLG decoding graphs; it is mappable to an output-label, but we - // also ensure that it is mappable to a phone. - int32 NumGraphLabels(); - - // convenience function to return the number of phones. - int32 NumPhones() { return phone_topology_.NumPhones(); } - - // maps a graph-label to an output-label. - int32 GraphLabelToOutputLabel(int32 graph_label); - - // maps a graph label to a phone. - int32 GraphLabelToPhone(int32 graph_label); - - // maps a graph label to a logical cd-phone [a logical cd-phone is always - // mappable to the monophone]. - int32 GraphLabelToLogicalCdPhone(int32 graph_label); - - // maps a graph label to a physical cd-phone, as defined by the PhoneContext - // object. - int32 GraphLabelToPhysicalCdPhone(int32 graph_label); - - // maps a graph label to a label in the phone's topology object (needed to - // work out phone alignments). - int32 GraphLabelToTopologyLabel(int32 graph_label); - - // Outputs to 'output' an FST that represents this object-- it's essentially - // the 'HC' object in the 'HCLG' recipe. It's an unweighted transducer where - // the input labels are phones (or epsilon) and the output labels are - // 'graph-labels'. Note: we will ensure that there are no epsilons on - // the 'output side'. - void GetAsFst(fst::VectorFst* output) const; - - // This variant of of GetAsFst gives you 'output-labels' as the olabels, instead - // of graph-labels. These are indexes-into-the-nnet-output plus one. 
- void GetAsFstWithOutputLabels(fst::VectorFst* output) const; - - void Write(std::ostream &os, bool binary) const; - - void Read(std::istream &is, bool binary); - - private: - PhoneTopology phone_topology_; - PhoneContext phone_context_; - - struct GraphLabelInfo { - int32 logical_cd_phone; - int32 topology_label; - int32 output_label; - }; -}; - - -} // namespace chain -} // namespace kaldi - -#endif // KALDI_CHAIN_CONTEXT_DEP_TOPOLOGY_H_ diff --git a/src/chain/phone-context.h b/src/chain/phone-context.h deleted file mode 100644 index bfcb56e64d1..00000000000 --- a/src/chain/phone-context.h +++ /dev/null @@ -1,188 +0,0 @@ -// chain/phone-context.h - -// Copyright 2015 Johns Hopkins University (Author: Daniel Povey) - - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - - -#ifndef KALDI_CHAIN_PHONE_CONTEXT_H_ -#define KALDI_CHAIN_PHONE_CONTEXT_H_ - -#include -#include - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "fstext/fstext-lib.h" - -namespace kaldi { -namespace chain { - - -/** - The 'PhoneContext' object is responsible for mapping phones in left-context to - cd-phones (context-dependent phones). In the 'chain' models, we only support - left-context, in order to make phone-level discriminative training - sufficiently efficient. The 'PhoneContext' model represents all the - information we need to know about the phonetic-context decision tree (so after - building the decision tree, we can build the PhoneContext object and then - discard the tree). - - There two types of cd-phones: cd-phones, and physical cd-phones. The logical - ones can be mapped to physical. The logical cd-phones are the ones that we - actually put in the graph, which will enable us to work out the phone sequence - (assuming the topology is 'alignable', which it normally will be). Logical - cd-phones are mappable to the (mono) phone; the physical ones are less - detailed, and can't necessarily be mapped to the monophones. - - Note that the PhoneTopology and PhoneContext will be incorporated as data - members in the ContextDependentTopology model, which contains information - about topology and context, and also controls the allocation of output-ids - (which are indexes into the neural net output, and roughly correspond to - context-dependent states in a conventional HMM-based system). -*/ - -class PhoneContext: public fst::DeterministicOnDemandFst { - public: - /* First, members that relate to the base class. */ - - // repeat the typedefs (they're not inherited automatically; we could inherit - // but they are boilerplate so we just repeat them). - typedef typename fst::StdArc Arc; - typedef typename Arc::StateId StateId; // should be int32. - typedef typename Arc::Weight Weight; - typedef typename Arc::Label Label; // should be int32. - - // The following are part of the interface from DeterministicOnDemandFst. 
- virtual StateId Start() { return 0; } - - // all states are final. - virtual Weight Final(StateId s) { return Weight::One(); } - - // Assuming 0 <= s < NumStates() and 1 <= phone <= NumPhones(), - // this function will return true and output to Arc as follows: - // ilabel = phone, olabel = logical-cd-phone, weight = One(), - // nextstate = [the next state after seeing this phone.] - virtual bool GetArc(StateId s, Label phone, Arc *oarc) = 0; - - // There is a concept of states in this model, whereby when it outputs a phone - // it advances the state. So it's an FST-like representation of the decision - // tree. States are numbered from 0 to NumStates() - 1. This function is - // actually not in the interface, but it is the same as in ExpandedFst. - int32 NumStates() const { return transitions_.size(); } - - virtual ~PhoneContext(); - - /* Next members not relating to the base class. */ - - PhoneContext(); - - // Initialization from a tree (which must be left-context only, i.e. - // CentralPosition() == ContextWidth() - 1). The initialization method relies - // on enumerating all possible contexts, so it will be slow if you have a - // ridiculously large context. - - // Note: we hope not to use this, we will use a separate version of the - // tree-building code that tries to reduce the number of 'context states'. - PhoneContext(int32 num_phones, const ContextDependencyInterface &ctx_dep); - - // Phones are numbered from 1 to NumPhones(). - int32 NumPhones() const { return num_phones_; } - - - // Return the number of distinct labels on the topology FST for this phone: - // the labels must be contiguously numbered from 1, so this is the same as - // the largest topology label. - bool GetNumLabels(int32 phone) const; - - // Logical context-dependent phones are numbered from 1 to - // NumLogicalCdPhones(). - int32 NumLogicalCdPhones() const { return logical_to_phone_.size() - 1; } - - // Physical context-dependent phones are numbered from 1 to - // NumPhysicalCdPhones(). - int32 NumPhysicalCdPhones() const { return num_physical_cd_phones_; } - - // This function tells you how many phones of left-context the underlying - // decision tree was built with: 0 for monophone, 1 for left-biphone, etc. It - // amounts to an assertion that if you take a given phone sequence of length - // LeftContext(), and starting from any FST state, use that phone-sequence as - // ilabels, you'll always end up in the same state. - int32 LeftContext() const { return left_context_; } - - // Maps a logical CD-phone to the phone index (i.e. of the monophone with - // no context)-- you cannot map to a full context, that is not what - // logical CD-phones mean in this code. - int32 LogicalToPhone(int32 logical_cd_phone) const; - - // Maps a logical CD-phone to a physical CD-phone. - int32 LogicalToPhysical(int32 logical_cd_phone) const; - - // Given a context-dependent phone index, return the set of phones it may - // correspond to (in most cases this would be a set of just one element). - // We'll implement this when we need it- it will require storing derived - // variables, to make it efficient. - // - // void CdPhoneToPhones(int32 cd_phone, std::vector *phones); - - - void Write(std::ostream &os, bool binary) const; - - void Read(std::istream &is); - - // Outputs to 'output' an FST that's a copy of this object in the normal FST - // format (as opposed to DeterministicOnDemandFst). This is the 'C' FST - // (the context-dependency FST) in the HCLG recipe. - // ilabels are phones, olabels are cd-phones. 
Note: can be implemented by - // taking an FST 'f' with one state that's initial and final, with self-loops - // for each phone, and then calling ComposeDeterministicOnDemand(f, *this, - // output). - void GetAsFst(fst::VectorFst* output) const; - private: - void Check(); - // Sets up the cd_phone_to_phone_ array. - void ComputeCdPhoneToPhone(); - - int32 num_phones_; - int32 num_physical_cd_phones_; - int32 left_context_; - - // 'transitions_' is indexed by state, then by phone - 1 (each vector of pairs - // is of length num_phones), and each pair is (cd-phone-index, next-state). - // For instance (bear in mind that 0 is the initial-state that you get at the - // begining of a phone_sequence), transitions_[0][p].first is the - // logical-cd-phone you get from seeing phone p with the left-context being the - // beginning of a sequence (i.e. a left-context of all zeros, as far as the - // tree is concerned); and transitions_[0][p].second is the context state you - // go to after seeing that phone. - std::vector > > transitions_; - - // map logical CD-phones to phones. Indexed by logical CD-phone (zeroth - // element not used). - std::vector logical_to_phone_; - - // map logical CD-phones to physical CD-phones. Indexed by logical CD-phone (zeroth - // element not used). - std::vector logical_to_physical_; - -}; - - -} // namespace chain -} // namespace kaldi - -#endif // KALDI_CHAIN_PHONE_CONTEXT_H_ diff --git a/src/chain/phone-topology.cc b/src/chain/phone-topology.cc deleted file mode 100644 index e0a3fb639b7..00000000000 --- a/src/chain/phone-topology.cc +++ /dev/null @@ -1,98 +0,0 @@ -// chain/phone-topology.cc - -// Copyright 2015 Johns Hopkins University (author: Daniel Povey) -// 2015 Xingyu Na - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#include "chain/phone-topology.h" - -namespace kaldi { -namespace chain { - - -const fst::VectorFst& PhoneTopolgy::TopologyForPhone (int32 phone) { - return fsts_[phone]; -} - -PhoneTopology::PhoneTopology (int32 num_phones) { - fsts_.clear(); - fsts_.resize(num_phones + 1); - for (int32 i = 1; i <= num_phones; i++) { - fst::VectorFst fst; - fst.AddState(); // state 0 - fst.SetStart(0); // set start state - fst.AddState(); // state 1 - fst.AddArc(0, StdArc(1, 1, 0.5, 1)); - fst.AddArc(1, StdArc(2, 2, 0.5, 1)); - fst.SetFinal(1); // set final state - fsts_[i] = fst; - } -} - -void PhoneTopology::Write(std::ostream &os, bool binary) const{ - WriteToken(os, binary, ""); - if (!binary) os << "\n"; - int num_phones = fsts_.size() - 1; - WriteToken(os, binary, ""); - WriteBasicType(os, binary, num_phones); - if (!binary) os << "\n"; - std::vector >::iterator fiter = fsts_.begin(), - fend = fsts_.end(); - for (++fiter; fiter != fend; ++fiter) - WriteFstKaldi(os, binary, *fiter); - WriteToken(os, binary, ""); -} - -void PhoneTopology::Read(std::istream &is, bool binary) const{ - ExpectToken(is, binary, ""); - int num_phones; - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &num_phones); - fsts_.resize(num_phones + 1); - std::vector >::iterator fiter = fsts_.begin(), - fend = fsts_.end(); - for (++fiter; fiter != fend; ++fiter) - ReadFstKaldi(os, binary, fiter); - ExpectToken(is, binary, ""); -} - -bool PhonoTopology::IsAlignable() { - std::vector >::iterator fiter = fsts_.begin(), - fend = fsts_.end(); - for (++fiter; fiter != fend; ++fiter) { - // Get start state symbles - unordered_set syms; - for (ArcIterator >aiter(*fiter, fiter->Start()); !aiter.Done(); aiter.Next()) { - const Arc &arc = aiter.Value(); - syms.insert(arc.ilabel); - } - for (StateIterator siter(*fiter); !siter.Done(); siter.Next()) { - typename Arc::StateId s = siter.Value(); - for (ArcIterator >aiter(*fiter, s); !aiter.Done(); aiter.Next()) { - const Arc &arc = aiter.Value(); - if (arc.nextstate == fiter->Start()) - return false; - if (s != fiter->Start() && syms.find(arc.ilabel) != syms.end()) - return false; - } - } - } - return true; -} - -} // namespace chain -} // namespace kaldi diff --git a/src/chain/phone-topology.h b/src/chain/phone-topology.h deleted file mode 100644 index cec7e28686d..00000000000 --- a/src/chain/phone-topology.h +++ /dev/null @@ -1,99 +0,0 @@ -// chain/phone-topology.h - -// Copyright 2015 Johns Hopkins University (Author: Daniel Povey) - - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - - -#ifndef KALDI_CHAIN_PHONE_TOPOLOGY_H_ -#define KALDI_CHAIN_PHONE_TOPOLOGY_H_ - -#include -#include - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "fstext/fstext-lib.h" - -namespace kaldi { -namespace chain { - - -/** - The 'PhoneTopology' object stores the topology for each of the phones that the - system handles. 
This is the equivalent of a HMM topology, except that the - emission probabilities are on the arcs not the states (so it's much more - FST-like), and there are no transition probabilities (these are just folded - into the emission probabilities). Note that it's the fact that the 'chain' - system is trained discriminatively from the start is what enables us to treat - the transition probabilities this way. - - A topology is an epsilon-free finite state acceptor. The - 'normal' topology that you get if you don't do anything special, is as - follows: - -0 1 1 # transition from state 0 to state 1 with label 1. -1 1 2 # transition from state 1 to state 1 (self-loop) with label 2. -1 0 # this says that state 1 is final. - - The FSTs have the following properties: - - they are epsilon free - - the start state is numbered zero. - - the start state is not final. - - all states are used. - - the symbols on the labels of the FST start from 1 and are contiguous (no - unused symbols between the smallest and largest symbol). - - - Phones are given indexes from 1 to NumPhones() (no gaps are allowed here). - - A topology for a phone is an FST - */ - -class PhoneTopology { - public: - int32 NumPhones() { returns static_cast(fsts_.size()) - 1; } - - // Returns the topology for a given phone. - const fst::VectorFst &TopologyForPhone(int32 phone); - - // This constructor gives the phones the default topology. If you want to - // give it a different topology, then you can create the text-form of this - // object using a script. - PhoneTopology(int32 num_phones); - - void Write(std::ostream &os, bool binary) const; - - void Read(std::istream &is, bool binary) const; - - // returns true if all the phones' FSTs have the following properties: - // - the symbols on arcs out of the start-state are disjoint from the - // symbols on arcs out of other states. - // - there are no arcs ending in the start state. - bool IsAlignable(); - private: - void Check(); - - // index zero is not used. 
- std::vector > fsts_; -}; - - -} // namespace chain -} // namespace kaldi - -#endif // KALDI_CHAIN_PHONE_TOPOLOGY_H_ diff --git a/src/chainbin/Makefile b/src/chainbin/Makefile index e886d454da0..61f653f174f 100644 --- a/src/chainbin/Makefile +++ b/src/chainbin/Makefile @@ -11,7 +11,7 @@ BINFILES = chain-est-phone-lm chain-get-supervision chain-make-den-fst \ nnet3-chain-shuffle-egs nnet3-chain-subset-egs \ nnet3-chain-acc-lda-stats nnet3-chain-train nnet3-chain-compute-prob \ nnet3-chain-combine nnet3-chain-normalize-egs \ - nnet3-chain-e2e-get-egs + nnet3-chain-e2e-get-egs nnet3-chain-compute-post OBJFILES = diff --git a/src/chainbin/chain-get-supervision.cc b/src/chainbin/chain-get-supervision.cc index b05f1166da4..6090d9f0058 100644 --- a/src/chainbin/chain-get-supervision.cc +++ b/src/chainbin/chain-get-supervision.cc @@ -33,10 +33,12 @@ static bool ProcessSupervision(const TransitionModel &trans_model, const ContextDependencyInterface &ctx_dep, const ProtoSupervision &proto_sup, const std::string &key, + bool convert_to_pdfs, SupervisionWriter *supervision_writer) { Supervision supervision; if (!ProtoSupervisionToSupervision(ctx_dep, trans_model, - proto_sup, &supervision)) { + proto_sup, convert_to_pdfs, + &supervision)) { KALDI_WARN << "Failed creating supervision for utterance " << key; return false; @@ -118,7 +120,9 @@ int main(int argc, char *argv[]) { continue; } if (ProcessSupervision(trans_model, ctx_dep, - proto_supervision, key, &supervision_writer)) + proto_supervision, key, + sup_opts.convert_to_pdfs, + &supervision_writer)) num_utts_done++; else num_utts_error++; @@ -134,7 +138,9 @@ int main(int argc, char *argv[]) { AlignmentToProtoSupervision(sup_opts, ali, &proto_supervision); if (ProcessSupervision(trans_model, ctx_dep, - proto_supervision, key, &supervision_writer)) + proto_supervision, key, + sup_opts.convert_to_pdfs, + &supervision_writer)) num_utts_done++; else num_utts_error++; diff --git a/src/chainbin/nnet3-chain-acc-lda-stats.cc b/src/chainbin/nnet3-chain-acc-lda-stats.cc index b195f5ba1fb..693eb2dad86 100644 --- a/src/chainbin/nnet3-chain-acc-lda-stats.cc +++ b/src/chainbin/nnet3-chain-acc-lda-stats.cc @@ -56,7 +56,11 @@ class NnetChainLdaStatsAccumulator { computer.AcceptInputs(nnet_, eg.inputs); computer.Run(); const CuMatrixBase &nnet_output = computer.GetOutput("output"); - AccStatsFromOutput(eg, nnet_output); + if (eg.outputs[0].supervision.fst.NumStates() > 0) { + AccStatsFst(eg, nnet_output); + } else { + AccStatsAlignment(eg, nnet_output); + } } void WriteStats(const std::string &stats_wxfilename, bool binary) { @@ -70,8 +74,8 @@ class NnetChainLdaStatsAccumulator { } } private: - void AccStatsFromOutput(const NnetChainExample &eg, - const CuMatrixBase &nnet_output) { + void AccStatsFst(const NnetChainExample &eg, + const CuMatrixBase &nnet_output) { BaseFloat rand_prune = rand_prune_; if (eg.outputs.size() != 1 || eg.outputs[0].name != "output") @@ -85,6 +89,7 @@ class NnetChainLdaStatsAccumulator { int32 num_frames = supervision.frames_per_sequence, num_pdfs = supervision.label_dim; KALDI_ASSERT(num_frames == nnet_output.NumRows()); + const fst::StdVectorFst &fst = supervision.fst; Lattice lat; @@ -128,6 +133,48 @@ class NnetChainLdaStatsAccumulator { } } + + void AccStatsAlignment(const NnetChainExample &eg, + const CuMatrixBase &nnet_output) { + BaseFloat rand_prune = rand_prune_; + + if (eg.outputs.size() != 1 || eg.outputs[0].name != "output") + KALDI_ERR << "Expecting the example to have one output named 'output'."; + + const 
chain::Supervision &supervision = eg.outputs[0].supervision; + // handling the one-sequence-per-eg case is easier so we just do that. + KALDI_ASSERT(supervision.num_sequences == 1 && + "This program expects one sequence per eg."); + + int32 num_frames = supervision.frames_per_sequence, + num_pdfs = supervision.label_dim; + KALDI_ASSERT(num_frames == nnet_output.NumRows()); + + if (supervision.alignment_pdfs.size() != + static_cast(num_frames)) + KALDI_ERR << "Alignment pdfs not present or wrong length. Using e2e egs?"; + + if (lda_stats_.Dim() == 0) + lda_stats_.Init(num_pdfs, + nnet_output.NumCols()); + + for (int32 t = 0; t < num_frames; t++) { + // the following, transferring row by row to CPU, would be wasteful if we + // actually were using a GPU, but we don't anticipate using a GPU in this + // program. + CuSubVector cu_row(nnet_output, t); + // "row" is actually just a redudant copy, since we're likely on CPU, + // but we're about to do an outer product, so this doesn't dominate. + Vector row(cu_row); + + int32 pdf = supervision.alignment_pdfs[t]; + BaseFloat weight = 1.0; + BaseFloat pruned_weight = RandPrune(weight, rand_prune); + if (pruned_weight != 0.0) + lda_stats_.Accumulate(row, pdf, pruned_weight); + } + } + BaseFloat rand_prune_; const Nnet &nnet_; CachingOptimizingCompiler compiler_; diff --git a/src/chainbin/nnet3-chain-compute-post.cc b/src/chainbin/nnet3-chain-compute-post.cc new file mode 100644 index 00000000000..914c70bb7a8 --- /dev/null +++ b/src/chainbin/nnet3-chain-compute-post.cc @@ -0,0 +1,283 @@ +// nnet3bin/nnet3-chain-compute-post.cc + +// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2015 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "nnet3/nnet-am-decodable-simple.h" +#include "base/timer.h" +#include "nnet3/nnet-utils.h" +#include "chain/chain-denominator.h" + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Compute posteriors from 'denominator FST' of chain model and optionally " + "map them to phones.\n" + "\n" + "Usage: nnet3-chain-compute-post [options] \n" + " e.g.: nnet3-chain-compute-post --transform-mat=transform.mat final.raw den.fst scp:feats.scp ark:nnet_prediction.ark\n" + "See also: nnet3-compute\n" + "See steps/nnet3/chain/get_phone_post.sh for example of usage.\n" + "Note: this program makes *extremely inefficient* use of the GPU.\n" + "You are advised to run this on CPU until it's improved.\n"; + + ParseOptions po(usage); + Timer timer; + + BaseFloat leaky_hmm_coefficient = 0.1; + NnetSimpleComputationOptions opts; + opts.acoustic_scale = 1.0; // by default do no acoustic scaling. 
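    // A summary of the per-utterance loop further down (nothing here is new;
    // it just restates what the code below does):
    //   1. run the nnet forward over the whole utterance, collecting one row
    //      of output per frame (one column per pdf-id);
    //   2. run chain::DenominatorComputation on that output: Forward() gives
    //      the total log-prob and Backward() with scale 1.0 fills 'posteriors'
    //      with the per-frame occupation probabilities over pdf-ids;
    //   3. if --transform-mat was supplied, multiply by its transpose to map
    //      the pdf-level posteriors to phone-level posteriors before writing.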
+ + std::string use_gpu = "yes"; + + std::string transform_mat_rxfilename; + std::string ivector_rspecifier, + online_ivector_rspecifier, + utt2spk_rspecifier; + int32 online_ivector_period = 0; + opts.Register(&po); + + po.Register("ivectors", &ivector_rspecifier, "Rspecifier for " + "iVectors as vectors (i.e. not estimated online); per utterance " + "by default, or per speaker if you provide the --utt2spk option."); + po.Register("utt2spk", &utt2spk_rspecifier, "Rspecifier for " + "utt2spk option used to get ivectors per speaker"); + po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier for " + "iVectors estimated online, as matrices. If you supply this," + " you must set the --online-ivector-period option."); + po.Register("online-ivector-period", &online_ivector_period, "Number of frames " + "between iVectors in matrices supplied to the --online-ivectors " + "option"); + po.Register("use-gpu", &use_gpu, + "yes|no|optional|wait, only has effect if compiled with CUDA"); + po.Register("leaky-hmm-coefficient", &leaky_hmm_coefficient, "'Leaky HMM' " + "coefficient: smaller values will tend to lead to more " + "confident posteriors. 0.1 is what we normally use in " + "training."); + po.Register("transform-mat", &transform_mat_rxfilename, "Location to read " + "the matrix to transform posteriors to phones. Matrix is " + "of dimension num-phones by num-pdfs."); + + po.Read(argc, argv); + + if (po.NumArgs() != 4) { + po.PrintUsage(); + exit(1); + } + +#if HAVE_CUDA==1 + CuDevice::Instantiate().SelectGpuId(use_gpu); +#endif + + std::string nnet_rxfilename = po.GetArg(1), + den_fst_rxfilename = po.GetArg(2), + feature_rspecifier = po.GetArg(3), + matrix_wspecifier = po.GetArg(4); + + Nnet nnet; + ReadKaldiObject(nnet_rxfilename, &nnet); + + SetBatchnormTestMode(true, &nnet); + SetDropoutTestMode(true, &nnet); + CollapseModel(CollapseModelConfig(), &nnet); + + RandomAccessBaseFloatMatrixReader online_ivector_reader( + online_ivector_rspecifier); + RandomAccessBaseFloatVectorReaderMapped ivector_reader( + ivector_rspecifier, utt2spk_rspecifier); + + CachingOptimizingCompiler compiler(nnet, opts.optimize_config); + + chain::ChainTrainingOptions chain_opts; + // the only option that actually gets used here is + // opts_.leaky_hmm_coefficient, and that's the only one we expose on the + // command line. + chain_opts.leaky_hmm_coefficient = leaky_hmm_coefficient; + + fst::StdVectorFst den_fst; + ReadFstKaldi(den_fst_rxfilename, &den_fst); + int32 num_pdfs = nnet.OutputDim("output"); + if (num_pdfs < 0) { + KALDI_ERR << "Neural net '" << nnet_rxfilename + << "' has no output named 'output'"; + } + chain::DenominatorGraph den_graph(den_fst, num_pdfs); + + + CuSparseMatrix transform_sparse_mat; + if (!transform_mat_rxfilename.empty()) { + Matrix transform_mat; + ReadKaldiObject(transform_mat_rxfilename, &transform_mat); + if (transform_mat.NumCols() != num_pdfs) + KALDI_ERR << "transform-mat from " << transform_mat_rxfilename + << " has " << transform_mat.NumCols() << " cols, expected " + << num_pdfs; + SparseMatrix temp_sparse_mat(transform_mat); + // the following is just a shallow swap if we're on CPU. This program + // actually won't actually work very fast on GPU, but doing it this way + // will make it easier to modify it later if we really want efficient + // operation on GPU. 
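      // (For reference: with the AddMatSmat(..., kTrans, ...) call further
      //  down, phone_post(t, p) = sum_j posteriors(t, j) * transform_mat(p, j),
      //  i.e. row p of --transform-mat says how the pdf posteriors are pooled
      //  into phone p.)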
+ transform_sparse_mat.Swap(&temp_sparse_mat); + } + + BaseFloatMatrixWriter matrix_writer(matrix_wspecifier); + + int32 num_success = 0, num_fail = 0; + int64 tot_input_frames = 0, tot_output_frames = 0; + double tot_forward_prob = 0.0; + + SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); + + for (; !feature_reader.Done(); feature_reader.Next()) { + std::string utt = feature_reader.Key(); + const Matrix &features (feature_reader.Value()); + if (features.NumRows() == 0) { + KALDI_WARN << "Zero-length utterance: " << utt; + num_fail++; + continue; + } + const Matrix *online_ivectors = NULL; + const Vector *ivector = NULL; + if (!ivector_rspecifier.empty()) { + if (!ivector_reader.HasKey(utt)) { + KALDI_WARN << "No iVector available for utterance " << utt; + num_fail++; + continue; + } else { + ivector = &ivector_reader.Value(utt); + } + } + if (!online_ivector_rspecifier.empty()) { + if (!online_ivector_reader.HasKey(utt)) { + KALDI_WARN << "No online iVector available for utterance " << utt; + num_fail++; + continue; + } else { + online_ivectors = &online_ivector_reader.Value(utt); + } + } + + Vector priors; // empty vector, we don't need priors here. + DecodableNnetSimple nnet_computer( + opts, nnet, priors, + features, &compiler, + ivector, online_ivectors, + online_ivector_period); + + Matrix matrix(nnet_computer.NumFrames(), + nnet_computer.OutputDim()); + for (int32 t = 0; t < nnet_computer.NumFrames(); t++) { + SubVector row(matrix, t); + nnet_computer.GetOutputForFrame(t, &row); + } + + // Of course it makes no sense to copy to GPU and then back again. + // But anyway this program woudn't work very well if we actually ran + // with --use-gpu=yes. In the CPU case the following is just a shallow + // swap. + CuMatrix gpu_nnet_output; + gpu_nnet_output.Swap(&matrix); + + + chain::DenominatorComputation den_computation( + chain_opts, den_graph, + 1, // num_sequences, + gpu_nnet_output); + + + int32 num_frames = gpu_nnet_output.NumRows(); + BaseFloat forward_prob = den_computation.Forward(); + + CuMatrix posteriors(num_frames, num_pdfs); + BaseFloat scale = 1.0; + bool ok = den_computation.Backward(scale, &posteriors); + + KALDI_VLOG(1) << "For utterance " << utt << ", log-prob per frame was " + << (forward_prob / num_frames) << " over " + << num_frames << " frames."; + + if (!ok || !(forward_prob - forward_prob == 0)) { // if or NaN + KALDI_WARN << "Something went wrong for utterance " << utt + << "; forward-prob = " << forward_prob + << ", num-frames = " << num_frames; + num_fail++; + continue; + } + + num_success++; + tot_input_frames += features.NumRows(); + tot_output_frames += num_frames; + tot_forward_prob += forward_prob; + + // Write out the posteriors. + if (transform_mat_rxfilename.empty()) { + // write out posteriors over pdfs. + Matrix posteriors_cpu; + posteriors.Swap(&posteriors_cpu); + matrix_writer.Write(utt, posteriors_cpu); + } else { + // write out posteriors over (most likely) phones. + int32 num_phones = transform_sparse_mat.NumRows(); + CuMatrix phone_post(num_frames, num_phones); + phone_post.AddMatSmat(1.0, posteriors, + transform_sparse_mat, kTrans, 0.0); + Matrix phone_post_cpu; + phone_post.Swap(&phone_post_cpu); + // write out posteriors over phones. 
+ matrix_writer.Write(utt, phone_post_cpu); + + if (GetVerboseLevel() >= 1 || RandInt(0,99)==0) { + BaseFloat sum = posteriors.Sum(); + if (((sum / num_frames) - 1.0) > 0.01) { + KALDI_WARN << "Expected sum of posteriors " << sum + << " to be close to num-frames " << num_frames; + } + } + } + } + +#if HAVE_CUDA==1 + CuDevice::Instantiate().PrintProfile(); +#endif + double elapsed = timer.Elapsed(); + KALDI_LOG << "Time taken "<< elapsed + << "s: real-time factor assuming 100 input frames/sec is " + << (elapsed*100.0/tot_input_frames); + KALDI_LOG << "Done " << num_success << " utterances, failed for " + << num_fail; + + KALDI_LOG << "Overall log-prob per (output) frame was " + << (tot_forward_prob / tot_output_frames) + << " over " << tot_output_frames << " frames."; + + if (num_success != 0) return 0; + else return 1; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/chainbin/nnet3-chain-e2e-get-egs.cc b/src/chainbin/nnet3-chain-e2e-get-egs.cc index a2941d98840..8cdda8deb32 100644 --- a/src/chainbin/nnet3-chain-e2e-get-egs.cc +++ b/src/chainbin/nnet3-chain-e2e-get-egs.cc @@ -1,6 +1,7 @@ // chainbin/nnet3-chain-e2e-get-egs.cc // Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// 2017, 2018 Hossein Hadian // See ../../COPYING for clarification regarding multiple authors // @@ -32,6 +33,39 @@ namespace kaldi { namespace nnet3 { +/** + This function finds the minimum number of arcs required to + traverse the input fst from the initial state to a final state. +*/ + +static int32 FindMinimumLengthPath( + const fst::StdVectorFst &fst) { + using fst::VectorFst; + using fst::StdArc; + using fst::StdVectorFst; + StdVectorFst distance_fst(fst); + // Modify distance_fst such that all the emitting + // arcs have cost 1 and others (and final-probs) a cost of zero + int32 num_states = distance_fst.NumStates(); + for (int32 state = 0; state < num_states; state++) { + for (fst::MutableArcIterator aiter(&distance_fst, state); + !aiter.Done(); aiter.Next()) { + const StdArc &arc = aiter.Value(); + StdArc arc2(arc); + if (arc.olabel == 0) + arc2.weight = fst::TropicalWeight::One(); + else + arc2.weight = fst::TropicalWeight(1.0); + aiter.SetValue(arc2); + } + if (distance_fst.Final(state) != fst::TropicalWeight::Zero()) + distance_fst.Final(state) = fst::TropicalWeight::One(); + } + VectorFst shortest_path; + fst::ShortestPath(distance_fst, &shortest_path); + return shortest_path.NumStates() - 1; +} + /** This function does all the processing for one utterance, and outputs the supervision objects to 'example_writer'. 
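As a concrete, purely hypothetical illustration of FindMinimumLengthPath() above: if supervision.e2e_fsts[0] were a linear FST with 7 emitting arcs, the re-weighted copy has a shortest path of cost 7, ShortestPath() returns that linear path with 8 states, and the function returns 8 - 1 = 7; the length check added below then rejects any utterance with fewer than 7 subsampled frames.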
Note: if normalization_fst is the @@ -79,6 +113,16 @@ static bool ProcessFile(const ExampleGenerationConfig &opts, if (!TrainingGraphToSupervisionE2e(training_fst, trans_model, num_output_frames, &supervision)) return false; + + int32 min_fst_duration = FindMinimumLengthPath(supervision.e2e_fsts[0]); + if (min_fst_duration > num_frames_subsampled) { + KALDI_WARN << "For utterance " << utt_id + << ", there are too many phones for too few frames; " + << "Number of subsampled frames: " << num_frames_subsampled + << ", Minimum number of frames required by the fst: " << min_fst_duration; + return false; + } + if (normalization_fst.NumStates() > 0 && !AddWeightToSupervisionFst(normalization_fst, &supervision)) { diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc index ef545ab9162..1032b7e2125 100644 --- a/src/chainbin/nnet3-chain-get-egs.cc +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -33,48 +33,61 @@ namespace nnet3 { /** This function does all the processing for one utterance, and outputs the - supervision objects to 'example_writer'. Note: if normalization_fst is the - empty FST (with no states), it skips the final stage of egs preparation and - you should do it later with nnet3-chain-normalize-egs. - + supervision objects to 'example_writer'. + + @param [in] trans_mdl The transition-model for the tree for which we + are dumping egs. This is expected to be + NULL if the input examples already contain + pdfs-ids+1 in their FSTs, and non-NULL if the + input examples contain transition-ids in + their FSTs and need to be converted to + unconstrained 'e2e' (end-to-end) style FSTs + which contain pdf-ids+1 but which won't enforce any + alignment constraints interior to the + utterance. @param [in] normalization_fst A version of denominator FST used to add weights - to the created supervision. It is + to the created supervision. It is actually an FST expected to have the - labels as (pdf-id+1) - @param [in] feats Input feature matrix - @param [in] ivector_feats Online iVector matrix sub-sampled at a + labels as (pdf-id+1). If this has no states, + we skip the final stage of egs preparation + in which we compose with the normalization + FST, and you should do it later with + nnet3-chain-normalize-egs. + @param [in] feats Input feature matrix + @param [in] ivector_feats Online iVector matrix sub-sampled at a rate of "ivector_period". - If NULL, iVector will not be added + If NULL, iVector will not be added as in input to the egs. @param [in] ivector_period Number of frames between iVectors in "ivector_feats" matrix. - @param [in] supervision Supervision for 'chain' training created + @param [in] supervision Supervision for 'chain' training created from the binary chain-get-supervision. - This is expected to be at a - sub-sampled rate if + This is expected to be at a + sub-sampled rate if --frame-subsampling-factor > 1. @param [in] deriv_weights Vector of per-frame weights that scale a frame's gradient during backpropagation. If NULL, this is equivalent to specifying - a vector of all 1s. - The dimension of the vector is expected - to be the supervision size, which is - at a sub-sampled rate if + a vector of all 1s. + The dimension of the vector is expected + to be the supervision size, which is + at a sub-sampled rate if --frame-subsampling-factor > 1. 
@param [in] supervision_length_tolerance - Tolerance for difference in num-frames-subsampled between - supervision and deriv weights, and also between supervision + Tolerance for difference in num-frames-subsampled between + supervision and deriv weights, and also between supervision and input frames. @param [in] utt_id Utterance-id @param [in] compress If true, compresses the feature matrices. @param [out] utt_splitter Pointer to UtteranceSplitter object, - which helps to split an utterance into + which helps to split an utterance into chunks. This also stores some stats. @param [out] example_writer Pointer to egs writer. **/ -static bool ProcessFile(const fst::StdVectorFst &normalization_fst, +static bool ProcessFile(const TransitionModel *trans_mdl, + const fst::StdVectorFst &normalization_fst, const GeneralMatrix &feats, const MatrixBase *ivector_feats, int32 ivector_period, @@ -103,6 +116,16 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, supervision_length_tolerance)) return false; // LengthsMatch() will have printed a warning. + // It can happen if people mess with the feature frame-width options, that + // there can be small mismatches in length between the supervisions (derived + // from lattices) and the features; if this happens, and + // supervision_length_tolerance is nonzero, and the num-input-frames is larger + // than plausible for this num_output_frames, then it could lead us to try to + // access frames in the supervision that don't exist. The following + // if-statement is to prevent that happening. + if (num_input_frames > num_output_frames * frame_subsampling_factor) + num_input_frames = num_output_frames * frame_subsampling_factor; + std::vector chunks; utt_splitter->GetChunksForUtterance(num_input_frames, &chunks); @@ -127,6 +150,9 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, num_frames_subsampled, &supervision_part); + if (trans_mdl != NULL) + ConvertSupervisionToUnconstrained(*trans_mdl, &supervision_part); + if (normalization_fst.NumStates() > 0 && !AddWeightToSupervisionFst(normalization_fst, &supervision_part)) { @@ -139,7 +165,7 @@ static bool ProcessFile(const fst::StdVectorFst &normalization_fst, int32 first_frame = 0; // we shift the time-indexes of all these parts so // that the supervised part starts from frame 0. 
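    // (Concrete example of the num_input_frames clipping near the top of this
    //  function: with frame_subsampling_factor == 3 and num_output_frames ==
    //  100, an utterance whose features claim, say, 305 frames is treated as
    //  having 300, so the chunks produced above can never ask for a
    //  supervision frame past index 99.)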
- + NnetChainExample nnet_chain_eg; nnet_chain_eg.outputs.resize(1); @@ -234,7 +260,7 @@ int main(int argc, char *argv[]) { "\n" "An example [where $feats expands to the actual features]:\n" "chain-get-supervision [args] | \\\n" - " nnet3-chain-get-egs --left-context=25 --right-context=9 --num-frames=20 dir/normalization.fst \\\n" + " nnet3-chain-get-egs --left-context=25 --right-context=9 --num-frames=150,100,90 dir/normalization.fst \\\n" " \"$feats\" ark,s,cs:- ark:cegs.1.ark\n" "Note: the --frame-subsampling-factor option must be the same as given to\n" "chain-get-supervision.\n"; @@ -248,7 +274,9 @@ int main(int argc, char *argv[]) { BaseFloat normalization_fst_scale = 1.0; int32 srand_seed = 0; - std::string online_ivector_rspecifier, deriv_weights_rspecifier; + std::string online_ivector_rspecifier, + deriv_weights_rspecifier, + trans_mdl_rxfilename; ParseOptions po(usage); po.Register("compress", &compress, "If true, write egs with input features " @@ -266,7 +294,7 @@ int main(int argc, char *argv[]) { po.Register("srand", &srand_seed, "Seed for random number generator "); po.Register("length-tolerance", &length_tolerance, "Tolerance for " "difference in num-frames between feat and ivector matrices"); - po.Register("supervision-length-tolerance", &supervision_length_tolerance, + po.Register("supervision-length-tolerance", &supervision_length_tolerance, "Tolerance for difference in num-frames-subsampled between " "supervision and deriv weights, and also between supervision " "and input frames."); @@ -275,10 +303,14 @@ int main(int argc, char *argv[]) { "backpropagation. " "Not specifying this is equivalent to specifying a vector of " "all 1s."); - po.Register("normalization-fst-scale", &normalization_fst_scale, + po.Register("normalization-fst-scale", &normalization_fst_scale, "Scale the weights from the " "'normalization' FST before applying them to the examples. 
" "(Useful for semi-supervised training)"); + po.Register("transition-model", &trans_mdl_rxfilename, + "Filename of transition model to read; should only be supplied " + "if you want 'unconstrained' egs, and if you supplied " + "--convert-to-pdfs=false to chain-get-supervision."); eg_config.Register(&po); @@ -296,6 +328,7 @@ int main(int argc, char *argv[]) { feature_rspecifier, supervision_rspecifier, examples_wspecifier; + if (po.NumArgs() == 3) { feature_rspecifier = po.GetArg(1); supervision_rspecifier = po.GetArg(2); @@ -311,11 +344,21 @@ int main(int argc, char *argv[]) { eg_config.ComputeDerived(); UtteranceSplitter utt_splitter(eg_config); + + const TransitionModel *trans_mdl_ptr = NULL; + TransitionModel trans_mdl; + if (!trans_mdl_rxfilename.empty()) { + ReadKaldiObject(trans_mdl_rxfilename, + &trans_mdl); + trans_mdl_ptr = &trans_mdl; + } + + fst::StdVectorFst normalization_fst; if (!normalization_fst_rxfilename.empty()) { ReadFstKaldi(normalization_fst_rxfilename, &normalization_fst); KALDI_ASSERT(normalization_fst.NumStates() > 0); - + if (normalization_fst_scale <= 0.0) KALDI_ERR << "Invalid scale on normalization FST; must be > 0.0"; @@ -366,7 +409,7 @@ int main(int argc, char *argv[]) { num_err++; continue; } - + const Vector *deriv_weights = NULL; if (!deriv_weights_rspecifier.empty()) { if (!deriv_weights_reader.HasKey(key)) { @@ -380,7 +423,7 @@ int main(int argc, char *argv[]) { } } - if (!ProcessFile(normalization_fst, feats, + if (!ProcessFile(trans_mdl_ptr, normalization_fst, feats, online_ivector_feats, online_ivector_period, supervision, deriv_weights, supervision_length_tolerance, key, compress, diff --git a/src/configure b/src/configure index 277bb340781..a954583d3fb 100755 --- a/src/configure +++ b/src/configure @@ -88,7 +88,7 @@ Configuration options: --speex-incdir=DIR SPEEX include directory --host=HOST Host triple in the format 'cpu-vendor-os' If provided, it is prepended to all toolchain programs. - --android-incdir=DIR Andraid include directory + --android-incdir=DIR Android include directory Following environment variables can be used to override the default toolchain. 
CXX C++ compiler [default=g++] @@ -411,10 +411,14 @@ function configure_cuda { MIN_UNSUPPORTED_GCC_VER="6.0" MIN_UNSUPPORTED_GCC_VER_NUM=60000; ;; - 9_*) + 9_0 | 9_1) MIN_UNSUPPORTED_GCC_VER="7.0" MIN_UNSUPPORTED_GCC_VER_NUM=70000; ;; + 9_2 | 9_*) + MIN_UNSUPPORTED_GCC_VER="8.0" + MIN_UNSUPPORTED_GCC_VER_NUM=80000; + ;; *) echo "Unsupported CUDA_VERSION (CUDA_VERSION=$CUDA_VERSION), please report it to Kaldi mailing list, together with 'nvcc -h' or 'ptxas -h' which lists allowed -gencode values..."; exit 1; ;; @@ -450,6 +454,8 @@ function configure_cuda { else cat makefiles/cuda_64bit.mk >> kaldi.mk fi + elif [ "`uname -m`" == "aarch64" ]; then + cat makefiles/cuda_64bit.mk >> kaldi.mk elif [ "`uname -m`" == "ppc64le" ]; then cat makefiles/cuda_64bit.mk >> kaldi.mk else diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 6b99a77e73b..ebbcb9da5ff 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -5,7 +5,7 @@ // 2013 Hainan Xu // 2013 Xiaohui Zhang // 2013-2015 Guoguo Chen -// 2016-2017 Shiyin Kang +// 2016-2018 Shiyin Kang // See ../../COPYING for clarification regarding multiple authors // @@ -70,11 +70,12 @@ void cudaF_add_diag_mat_mat_MNT(int Gr, int Bl, const float alpha, void cudaD_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const double alpha, const double* M, const int strid_M, const double* N, const MatrixDim dim_N, - const double beta, double* v); + const double beta, double* v, + const int stride_v); void cudaF_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const float alpha, const float* M, const int strid_M, const float* N, const MatrixDim dim_N, - const float beta, float* v); + const float beta, float* v, const int stride_v); void cudaD_add_diag_packed(int Gr, int Bl, double* mat, double value, int dim); void cudaF_add_diag_packed(int Gr, int Bl, float* mat, float value, int dim); void cudaD_add_diag_vec_mat(dim3 Gr, dim3 Bl, double alpha, double *mat, diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 934a860a055..4101d5ba52f 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -6,7 +6,7 @@ // 2013 Hainan Xu // 2013 Xiaohui Zhang // 2013-2015 Guoguo Chen -// 2016-2017 Shiyin Kang +// 2016-2018 Shiyin Kang // 2017 Hossein Hadian, Daniel Galvez // Licensed under the Apache License, Version 2.0 (the "License"); @@ -1118,7 +1118,7 @@ __global__ static void _add_diag_mat_mat_MTN(const Real alpha, const Real* M, const int stride_M, const Real* N, const MatrixDim dim_N, const Real beta, - Real* v) { + Real* v, const int stride_v) { __shared__ Real ssum[CU1DBLOCK]; const int tid = threadIdx.y * blockDim.x + threadIdx.x; const int j = blockIdx.x * blockDim.x + threadIdx.x; @@ -1127,9 +1127,12 @@ static void _add_diag_mat_mat_MTN(const Real alpha, const Real* M, return; // Loop along the matrix column. - // Reduce to CU1DBLOCK / TileDim elements per column. + // Reduce to gridDim.y * CU1DBLOCK / TileDim elements per column. 
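  // (Grid-stride reduction: each thread starts at row
  //  blockIdx.y * blockDim.y + threadIdx.y and steps by blockDim.y * gridDim.y,
  //  so the gridDim.y blocks covering column j each produce one partial sum,
  //  written to their own row of 'v'; when gridDim.y > 1 the caller adds those
  //  rows together afterwards with AddRowSumMat().)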
Real tsum = Real(0); - for (int i = threadIdx.y; i < dim_N.rows; i += blockDim.y) { + + const int grid_stride_y = blockDim.y * gridDim.y; + for (int i = blockIdx.y * blockDim.y + threadIdx.y; i < dim_N.rows; i += + grid_stride_y) { tsum += M[i * stride_M + j] * N[i * dim_N.stride + j]; } ssum[tid] = tsum; @@ -1156,7 +1159,12 @@ static void _add_diag_mat_mat_MTN(const Real alpha, const Real* M, // output TileDim sums per thread block if (tid < TileDim) { - v[j] = alpha * ssum[tid] + beta * v[j]; + if (beta != Real(0)) { + v[blockIdx.y * stride_v + j] = alpha * ssum[tid] + + beta * v[blockIdx.y * stride_v + j]; + } else { + v[blockIdx.y * stride_v + j] = alpha * ssum[tid]; + } } } @@ -4084,11 +4092,14 @@ void cudaF_add_diag_mat_mat_MNT(int Gr, int Bl, const float alpha, void cudaF_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const float alpha, const float* M, const int stride_M, const float* N, const MatrixDim dim_N, - const float beta, float* v) { + const float beta, float* v, + const int stride_v) { if (Bl.x == 16) { - _add_diag_mat_mat_MTN<16> <<>>(alpha,M,stride_M,N,dim_N,beta,v); - } else if (Bl.x==32) { - _add_diag_mat_mat_MTN<32><<>>(alpha,M,stride_M,N,dim_N,beta,v); + _add_diag_mat_mat_MTN<16> <<>>(alpha, M, stride_M, N, dim_N, beta, + v, stride_v); + } else if (Bl.x == 32) { + _add_diag_mat_mat_MTN<32> <<>>(alpha, M, stride_M, N, dim_N, beta, + v, stride_v); } } @@ -4781,11 +4792,14 @@ void cudaD_add_diag_mat_mat_MNT(int Gr, int Bl, const double alpha, void cudaD_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const double alpha, const double* M, const int stride_M, const double* N, const MatrixDim dim_N, - const double beta, double* v) { + const double beta, double* v, + const int stride_v) { if (Bl.x == 16) { - _add_diag_mat_mat_MTN<16> <<>>(alpha,M,stride_M,N,dim_N,beta,v); - } else if (Bl.x==32) { - _add_diag_mat_mat_MTN<32><<>>(alpha,M,stride_M,N,dim_N,beta,v); + _add_diag_mat_mat_MTN<16> <<>>(alpha, M, stride_M, N, dim_N, beta, + v, stride_v); + } else if (Bl.x == 32) { + _add_diag_mat_mat_MTN<32> <<>>(alpha, M, stride_M, N, dim_N, beta, + v, stride_v); } } diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 8f719a8c4a1..6c24ce0dd58 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -6,7 +6,7 @@ // 2013 Hainan Xu // 2013 Xiaohui Zhang // 2013-2015 Guoguo Chen -// 2016-2017 Shiyin Kang +// 2016-2018 Shiyin Kang // See ../../COPYING for clarification regarding multiple authors // @@ -85,14 +85,18 @@ inline void cuda_add_diag_mat_mat_MNT(int Gr, int Bl, const float alpha, inline void cuda_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const double alpha, const double* M, const int stride_M, const double* N, const MatrixDim dim_N, - const double beta, double* v) { - cudaD_add_diag_mat_mat_MTN(Gr, Bl, alpha, M, stride_M, N, dim_N, beta, v); + const double beta, double* v, + const int stride_v) { + cudaD_add_diag_mat_mat_MTN(Gr, Bl, alpha, M, stride_M, N, dim_N, beta, v, + stride_v); } inline void cuda_add_diag_mat_mat_MTN(dim3 Gr, dim3 Bl, const float alpha, const float* M, const int stride_M, const float* N, const MatrixDim dim_N, - const float beta, float* v) { - cudaF_add_diag_mat_mat_MTN(Gr, Bl, alpha, M, stride_M, N, dim_N, beta, v); + const float beta, float* v, + const int stride_v) { + cudaF_add_diag_mat_mat_MTN(Gr, Bl, alpha, M, stride_M, N, dim_N, beta, v, + stride_v); } inline void cuda_add_diag_packed(int Gr, int Bl, double* mat, double value, int dim) { diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 
03e69b639d3..85aa4c049e7 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -693,8 +693,8 @@ class CuMatrixBase { // The following two functions should only be called if we did not compile // with CUDA or could not get a CUDA card; in that case the contents are - // interpreted the same as a regular matrix. Don't use these unless you know - // what you are doing! + // interpreted the same as a regular matrix. DON'T USE THESE UNLESS YOU KNOW + // WHAT YOU ARE DOING! inline const MatrixBase &Mat() const { return *(reinterpret_cast* >(this)); } diff --git a/src/cudamatrix/cu-vector-speed-test.cc b/src/cudamatrix/cu-vector-speed-test.cc index b5efda3d8de..a532e01b069 100644 --- a/src/cudamatrix/cu-vector-speed-test.cc +++ b/src/cudamatrix/cu-vector-speed-test.cc @@ -1,7 +1,9 @@ // cudamatrix/cu-vector-speed-test.cc -// Copyright 2013 Johns Hopkins University (author: Daniel Povey) -// Copyright 2017 Daniel Galvez +// Copyright 2013 Johns Hopkins University (author: Daniel Povey) +// 2017 Daniel Galvez +// 2016-2018 Shiyin Kang + // See ../../COPYING for clarification regarding multiple authors // @@ -191,6 +193,32 @@ template void TestCuVectorAddDiagMatMat(int32 dim, } +template void TestCuVectorAddDiagMat2OnVariousShapes( + int32 dim, MatrixTransposeType trans) { + BaseFloat time_in_secs = 0.02; + int32 size = 1024 * 32; + CuVector v(trans == kNoTrans ? size / dim : dim); + v.SetRandn(); + CuMatrix N(size / dim, dim); + N.SetRandn(); + + Timer tim; + int32 iter = 0; + + for (; tim.Elapsed() < time_in_secs; iter++) { + v.AddDiagMat2(1.0, N, trans, 0.0); + } + + BaseFloat fdim = size; + BaseFloat gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09); + KALDI_LOG << "For CuVector::AddDiagMat2Shapes" << NameOf() + << (trans == kTrans ? "[trans]" : "[no-trans]") << ", for dim = (" + << size / dim << ", " << dim << "), speed was " << gflops + << " gigaflops."; +} + + + template void TestCuVectorAddDiagMat2(int32 dim, MatrixTransposeType trans) { BaseFloat time_in_secs = 0.02; CuVector v(dim); @@ -293,8 +321,8 @@ template void TestCuVectorApplyFloorNoCount(int32 dim) { BaseFloat fdim = dim; BaseFloat gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuVector::ApplyFloor (no count variety)" << NameOf() - << ", for dim = " << dim << ", speed was " << gflops - << " gigaflops."; + << ", for dim = " << dim << ", speed was " << gflops + << " gigaflops."; } @@ -335,20 +363,60 @@ template void TestCuVectorApplyCeilingNoCount(int32 dim) { BaseFloat fdim = dim; BaseFloat gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09); KALDI_LOG << "For CuVector::ApplyCeiling (no count variety)" << NameOf() - << ", for dim = " << dim << ", speed was " << gflops - << " gigaflops."; + << ", for dim = " << dim << ", speed was " << gflops + << " gigaflops."; } +template void TestCuVectorAddDiagMatMatShape( + int32 num_rows, int32 num_cols, MatrixTransposeType transM, + MatrixTransposeType transN) { + BaseFloat time_in_secs = 0.02; + CuVector v(transM == kTrans ? num_cols : num_rows); + v.SetRandn(); + CuMatrix M(num_rows, num_cols); + CuMatrix N(transM != transN ? num_rows : num_cols, + transM != transN ? 
num_cols : num_rows); + M.SetRandn(); + N.SetRandn(); + + Timer tim; + int32 iter = 0; + + for (;tim.Elapsed() < time_in_secs; iter++) { + v.AddDiagMatMat(1.0, M, transM, N, transN, 1.0); + } + + BaseFloat fnr = num_rows; + BaseFloat fnc = num_cols; + BaseFloat gflops = (fnr * fnc * iter) / (tim.Elapsed() * 1.0e+09); + KALDI_LOG << "For CuVector::AddDiagMatMat" << NameOf() + << (transM == kNoTrans ? "[no-trans],":"[trans],") + << (transN == kNoTrans ? "[no-trans],":"[trans],") + << " for dim = "<< num_rows << ", " << num_cols + << ", speed was " << gflops << " gigaflops."; +} + + template void CudaVectorSpeedTest() { + const size_t a = 1 << 5; + const size_t b = 1 << 8; + for (size_t i = a; i <= b; i *= 2) { + for (size_t j = a; j <= b; j *= 2) { + if (i * j <= a * b) { + TestCuVectorAddDiagMatMatShape(i, j, kNoTrans, kNoTrans); + TestCuVectorAddDiagMatMatShape(i, j, kNoTrans, kTrans); + TestCuVectorAddDiagMatMatShape(i, j, kTrans, kNoTrans); + TestCuVectorAddDiagMatMatShape(i, j, kTrans, kTrans); + } + } + } + std::vector sizes; - sizes.push_back(16); - sizes.push_back(32); - sizes.push_back(64); - sizes.push_back(128); - sizes.push_back(256); - sizes.push_back(1024); + for (int i = 32; i <= 1024; i *= 2) { + sizes.push_back(i); + } int32 ns = sizes.size(); for (int32 s = 0; s < ns; s++) TestCuVectorSoftmax(sizes[s]); @@ -369,6 +437,10 @@ template void CudaVectorSpeedTest() { TestCuVectorAddDiagMatMat(sizes[s], kTrans, kNoTrans); TestCuVectorAddDiagMatMat(sizes[s], kTrans, kTrans); } + for (int32 s = 0; s < ns; s++) { + TestCuVectorAddDiagMat2OnVariousShapes(sizes[s], kNoTrans); + TestCuVectorAddDiagMat2OnVariousShapes(sizes[s], kTrans); + } for (int32 s = 0; s < ns; s++) { TestCuVectorAddDiagMat2(sizes[s], kNoTrans); TestCuVectorAddDiagMat2(sizes[s], kTrans); diff --git a/src/cudamatrix/cu-vector-test.cc b/src/cudamatrix/cu-vector-test.cc index 0aa8ae931a4..6604b5ed851 100644 --- a/src/cudamatrix/cu-vector-test.cc +++ b/src/cudamatrix/cu-vector-test.cc @@ -345,7 +345,16 @@ template void CuVectorUnitTestSum() { A.SetRandn(); ones.Set(1.0); - AssertEqual(VecVec(A, ones), A.Sum()); + Real x = VecVec(A, ones); + Real y = A.Sum(); + Real diff = std::abs(x - y); + // Note: CuVectorBase<> does not have an ApplyAbs() member + // function, so we copy back to a host vector for simplicity in + // this test case. + Vector A_host(A); + A_host.ApplyAbs(); + Real s = A_host.Sum(); + KALDI_ASSERT ( diff <= 1.0e-04 * s); } } diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index f85d20d37f1..5ea3a236b0a 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -3,6 +3,7 @@ // Copyright 2012-2013 Karel Vesely // 2012-2014 Johns Hopkins University (author: Daniel Povey) // 2017 Daniel Galvez +// 2016-2018 Shiyin Kang // See ../../COPYING for clarification regarding multiple authors // @@ -569,8 +570,8 @@ void CuVectorBase::AddDiagMat2(Real alpha, const CuMatrixBase &M, if (CuDevice::Instantiate().Enabled()) { if (dim_ == 0) return; MatrixTransposeType other_trans = (trans == kTrans ? kNoTrans : kTrans); - this->AddDiagMatMat(alpha, M, trans, - M, other_trans, beta); + KALDI_ASSERT(dim_ == (trans == kNoTrans ? M.NumRows() : M.NumCols())); + this->AddDiagMatMat(alpha, M, trans, M, other_trans, beta); } else #endif { @@ -601,15 +602,34 @@ void CuVectorBase::AddDiagMatMat(Real alpha, const CuMatrixBase &M, } else { // Case 2: diag(M'*N) == sum(M.*N, 1) // 16x16 or 8x32 2D block for coalesced memory access. - // One block per 'tile_dim' columns of N. 
- // Large tile dim only for large matrix - // 1D grid expands along the row of N. - int tile_dim = - sizeof(Real) == sizeof(float) && N.NumCols() >= 1536 ? 32 : 16; + // Grid shape is designed as follows, + // 1. for small matrices, use 1D grid with only 1 row of 16x16 block, + // to avoid multiple kernel launch; + // 2. for large enough matrices (no matter thin or fat), + // use 1- or 2-D grid so that the grid contains + // at least and not much larger than 'kOptNumBlocks' blocks + // to fully utilize the GPU; + const int32 warpSize = 32; + const int32 kOptNumBlocks = 512; + const int32 tile_dim = + (N.NumRows() < 4096 && N.NumCols() < kOptNumBlocks * warpSize) ? + 16 : 32; dim3 dimBlock(tile_dim, CU1DBLOCK / tile_dim); - dim3 dimGrid(n_blocks(N.NumCols(), tile_dim)); - cuda_add_diag_mat_mat_MTN(dimGrid, dimBlock, alpha, M.Data(), - M.Stride(), N.Data(), N.Dim(), beta, data_); + dim3 dimGrid(n_blocks(N.NumCols(), dimBlock.x), + n_blocks(N.NumRows(), dimBlock.y)); + dimGrid.y = std::min(dimGrid.y, (kOptNumBlocks - 1) / dimGrid.x + 1); + dimGrid.y = tile_dim == 16 ? 1 : dimGrid.y; + if (dimGrid.y > 1) { + CuMatrix buf(dimGrid.y, N.NumCols()); + cuda_add_diag_mat_mat_MTN(dimGrid, dimBlock, Real(1), M.Data(), + M.Stride(), N.Data(), N.Dim(), Real(0), + buf.Data(), buf.Stride()); + this->AddRowSumMat(alpha, buf, beta); + } else { + cuda_add_diag_mat_mat_MTN(dimGrid, dimBlock, alpha, M.Data(), + M.Stride(), N.Data(), N.Dim(), beta, data_, + dim_); + } } } else { KALDI_ASSERT(M.NumCols() == N.NumRows()); diff --git a/src/decoder/decoder-wrappers.cc b/src/decoder/decoder-wrappers.cc index 6d3326f7b12..150d9e513a8 100644 --- a/src/decoder/decoder-wrappers.cc +++ b/src/decoder/decoder-wrappers.cc @@ -450,8 +450,6 @@ void AlignUtteranceWrapper( return; } - - fst::StdArc::Label special_symbol = 0; if (config.careful) ModifyGraphForCarefulAlignment(fst); diff --git a/src/decoder/lattice-faster-decoder.cc b/src/decoder/lattice-faster-decoder.cc index 963430a63f1..b837d836a70 100644 --- a/src/decoder/lattice-faster-decoder.cc +++ b/src/decoder/lattice-faster-decoder.cc @@ -870,7 +870,7 @@ void LatticeFasterDecoder::ProcessNonemittingWrapper(BaseFloat cost_cutoff) { } else if (fst_.Type() == "vector") { return LatticeFasterDecoder::ProcessNonemitting>(cost_cutoff); } else { - return LatticeFasterDecoder::ProcessNonemitting>(cost_cutoff); + return LatticeFasterDecoder::ProcessNonemitting>(cost_cutoff); } } diff --git a/src/decoder/lattice-faster-decoder.h b/src/decoder/lattice-faster-decoder.h index 56e4af1b95b..9c6ddd67acd 100644 --- a/src/decoder/lattice-faster-decoder.h +++ b/src/decoder/lattice-faster-decoder.h @@ -233,7 +233,7 @@ class LatticeFasterDecoder { // links from it when we process the next frame. struct Token { BaseFloat tot_cost; // would equal weight.Value()... cost up to this point. - BaseFloat extra_cost; // >= 0. This is used in pruning a way tokens. + BaseFloat extra_cost; // >= 0. This is used in pruning away tokens. // there is a comment in lattice-faster-decoder.cc explaining this; // search for "a note on the definition of extra_cost". @@ -340,7 +340,7 @@ class LatticeFasterDecoder { /// Processes emitting arcs for one frame. Propagates from prev_toks_ to cur_toks_. /// Returns the cost cutoff for subsequent ProcessNonemitting() to use. - /// Templated on FST type for speed; called via ProcessEmittingWrapper(). + /// Templated on FST type for speed; called via ProcessEmittingWrapper(). 
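  /// (The wrapper inspects fst_.Type() at runtime -- "const", "vector", or
  /// anything else -- and calls the matching instantiation for ConstFst,
  /// VectorFst, or the generic Fst base class, so the inner decoding loop
  /// avoids virtual-call overhead for the common FST types; see
  /// ProcessNonemittingWrapper() above for the same dispatch pattern.)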
template BaseFloat ProcessEmitting(DecodableInterface *decodable); BaseFloat ProcessEmittingWrapper(DecodableInterface *decodable); diff --git a/src/decoder/lattice-faster-online-decoder.cc b/src/decoder/lattice-faster-online-decoder.cc index 5fb2ef25a3d..0a921438f94 100644 --- a/src/decoder/lattice-faster-online-decoder.cc +++ b/src/decoder/lattice-faster-online-decoder.cc @@ -1054,7 +1054,7 @@ void LatticeFasterOnlineDecoder::ProcessNonemittingWrapper( ProcessNonemitting>(cost_cutoff); } else { return LatticeFasterOnlineDecoder:: - ProcessNonemitting>(cost_cutoff); + ProcessNonemitting>(cost_cutoff); } } diff --git a/src/decoder/training-graph-compiler.cc b/src/decoder/training-graph-compiler.cc index d528a48d3e9..8b28ad2d11f 100644 --- a/src/decoder/training-graph-compiler.cc +++ b/src/decoder/training-graph-compiler.cc @@ -39,7 +39,7 @@ TrainingGraphCompiler::TrainingGraphCompiler(const TransitionModel &trans_model, disambig_syms_[i])) KALDI_ERR << "Disambiguation symbol " << disambig_syms_[i] << " is also a phone."; - + int32 subseq_symbol = 1 + phone_syms.back(); if (!disambig_syms_.empty() && subseq_symbol <= disambig_syms_.back()) subseq_symbol = 1 + disambig_syms_.back(); @@ -111,10 +111,10 @@ bool TrainingGraphCompiler::CompileGraph(const fst::VectorFst &word trans_model_, h_cfg, &disambig_syms_h); - + VectorFst &trans2word_fst = *out_fst; // transition-id to word. TableCompose(*H, ctx2word_fst, &trans2word_fst); - + KALDI_ASSERT(trans2word_fst.Start() != kNoStateId); // Epsilon-removal and determinization combined. This will fail if not determinizable. @@ -128,15 +128,17 @@ bool TrainingGraphCompiler::CompileGraph(const fst::VectorFst &word RemoveEpsLocal(&trans2word_fst); } - + // Encoded minimization. MinimizeEncoded(&trans2word_fst); std::vector disambig; + bool check_no_self_loops = true; AddSelfLoops(trans_model_, disambig, opts_.self_loop_scale, opts_.reorder, + check_no_self_loops, &trans2word_fst); delete H; @@ -154,7 +156,7 @@ bool TrainingGraphCompiler::CompileGraphsFromText( VectorFst *word_fst = new VectorFst(); MakeLinearAcceptor(transcripts[i], word_fst); word_fsts[i] = word_fst; - } + } bool ans = CompileGraphs(word_fsts, out_fsts); for (size_t i = 0; i < transcripts.size(); i++) delete word_fsts[i]; @@ -192,7 +194,7 @@ bool TrainingGraphCompiler::CompileGraphs( KALDI_ASSERT(phone2word_fst.Start() != kNoStateId && "Perhaps you have words missing in your lexicon?"); - + VectorFst ctx2word_fst; ComposeContextFst(*cfst, phone2word_fst, &ctx2word_fst); // ComposeContextFst is like Compose but faster for this particular Fst type. @@ -226,15 +228,17 @@ bool TrainingGraphCompiler::CompileGraphs( if (opts_.rm_eps) RemoveEpsLocal(&trans2word_fst); } - + // Encoded minimization. MinimizeEncoded(&trans2word_fst); std::vector disambig; + bool check_no_self_loops = true; AddSelfLoops(trans_model_, disambig, opts_.self_loop_scale, opts_.reorder, + check_no_self_loops, &trans2word_fst); KALDI_ASSERT(trans2word_fst.Start() != kNoStateId); diff --git a/src/doc/history.dox b/src/doc/history.dox index bf114a3a9e0..40d46c7e32f 100644 --- a/src/doc/history.dox +++ b/src/doc/history.dox @@ -82,9 +82,13 @@ Excellence. BUT researchers were partially supported during this time by Czech Ministry of Trade and Commerce project no. FR-TI1/034, Grant Agency of Czech Republic project no. 102/08/0707, and Czech Ministry of Education project - no. MSM0021630528. 
Arnab Ghoshal was partially supported by the European - Community's Seventh Framework Programme under grant agreement number 213850 - (SCALE). + no. MSM0021630528. + Arnab Ghoshal was affiliated with Saarland University supported by + the European Community's Seventh Framework Programme + grant number 213850 (SCALE), and with The University of Edinburgh + supported by United Kingdom's Engineering and Physical Sciences + Research Council grant number EP/I031022/1 (Natural Speech + Technology)" The work of BUT researchers on Kaldi was supported by the Technology Agency of the Czech Republic under project No. TA01011328. diff --git a/src/doc/io.dox b/src/doc/io.dox index 9c1f786c322..dc958f57a6f 100644 --- a/src/doc/io.dox +++ b/src/doc/io.dox @@ -73,10 +73,10 @@ namespace kaldi { \code // we suppose that class_member_ is of type int32. void SomeKaldiClass::Read(std::istream &is, bool binary) { - ReadBasicType(binary, &class_member_); + ReadBasicType(is, binary, &class_member_); } - void SomeKaldiClass::Write(std::ostream &is, bool binary) const { - WriteBasicType(binary, class_member_); + void SomeKaldiClass::Write(std::ostream &os, bool binary) const { + WriteBasicType(os, binary, class_member_); } \endcode We have assumed that \c class_member_ is of type int32, which is a type of known diff --git a/src/doc/lattices.dox b/src/doc/lattices.dox index 0b222ec5f1a..1b05102c721 100644 --- a/src/doc/lattices.dox +++ b/src/doc/lattices.dox @@ -476,7 +476,10 @@ programs. \verbatim lattice-lmrescore --lm-scale=1.0 ark:nolm.lats G_new.fst ark:out.lats \endverbatim -Note that there are other ways to do this; see +Note: the above examples are slightly simplified; actually you would have +to "project" the G.fst FST's on the output to remove disambiguation symbols +using `fstproject` with `--project_output=true`, see steps/lmrescore.sh for examples. +Also there are other ways to do this; see the documentation for lattice-compose below. What the program \ref lattice-lmrescore.cc "lattice-lmrescore" does is this... actually, we will first first describe the simple version that is not exactly what diff --git a/src/feat/feature-window.h b/src/feat/feature-window.h index 88002ae1903..a897c6fa4b0 100644 --- a/src/feat/feature-window.h +++ b/src/feat/feature-window.h @@ -86,7 +86,7 @@ struct FrameExtractionOptions { "frame-length. If false, the number of frames depends only on the " "frame-shift, and we reflect the data at the ends."); opts->Register("allow-downsample", &allow_downsample, - "If true, allow the input waveform to have a higher frequency than" + "If true, allow the input waveform to have a higher frequency than " "the specified --sample-frequency (and we'll downsample)."); } int32 WindowShift() const { @@ -116,7 +116,7 @@ struct FeatureWindowFunction { file with the given number of samples in it (assumed to have the same sampling rate as specified in 'opts'). - @param [in] wave_length The number of samples in the wave file. + @param [in] num_samples The number of samples in the wave file. 
@param [in] opts The frame-extraction options class @param [in] flush True if we are asserting that this number of samples is diff --git a/src/feat/mel-computations.cc b/src/feat/mel-computations.cc index 714d963f01b..810b6247e93 100644 --- a/src/feat/mel-computations.cc +++ b/src/feat/mel-computations.cc @@ -37,13 +37,9 @@ MelBanks::MelBanks(const MelBanksOptions &opts, int32 num_bins = opts.num_bins; if (num_bins < 3) KALDI_ERR << "Must have at least 3 mel bins"; BaseFloat sample_freq = frame_opts.samp_freq; - int32 window_length = static_cast(frame_opts.samp_freq*0.001*frame_opts.frame_length_ms); - int32 window_length_padded = - (frame_opts.round_to_power_of_two ? - RoundUpToNearestPowerOfTwo(window_length) : - window_length); + int32 window_length_padded = frame_opts.PaddedWindowSize(); KALDI_ASSERT(window_length_padded % 2 == 0); - int32 num_fft_bins = window_length_padded/2; + int32 num_fft_bins = window_length_padded / 2; BaseFloat nyquist = 0.5 * sample_freq; BaseFloat low_freq = opts.low_freq, high_freq; @@ -73,7 +69,9 @@ MelBanks::MelBanks(const MelBanksOptions &opts, BaseFloat vtln_low = opts.vtln_low, vtln_high = opts.vtln_high; - if (vtln_high < 0.0) vtln_high += nyquist; + if (vtln_high < 0.0) { + vtln_high += nyquist; + } if (vtln_warp_factor != 1.0 && (vtln_low < 0.0 || vtln_low <= low_freq diff --git a/src/featbin/extract-rows.cc b/src/featbin/extract-rows.cc deleted file mode 100644 index e4e2a927e6b..00000000000 --- a/src/featbin/extract-rows.cc +++ /dev/null @@ -1,168 +0,0 @@ -// featbin/extract-rows.cc - -// Copyright 2013 Korbinian Riedhammer -// 2015 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "matrix/kaldi-matrix.h" - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - using namespace std; - - const char *usage = - "Extract certain row ranges of matrices. This is most useful to extract segments\n" - "from feature files, for example to modify segmentations or to extract features\n" - "corresponding to certain alignments. The program expects a segments file in the\n" - "form of\n" - " segment-name utterance-id start end\n" - "where the segment-name is chosen by the user and utterance-id indexes the input matrices.\n" - "By default, 'start' and 'end' are row numbers (zero-based), but if you specify the --frame-shift\n" - "option (e.g. --frame-shift=0.01), then they represent a time in seconds, which are converted\n" - "to integers by dividing by frame-shift.\n" - "\n" - "Usage: extract-rows [options] \n" - " e.g. extract-rows --frame-shift=0.01 segments ark:feats-in.ark ark:feats-out.ark\n" - "See also: select-feats, subset-feats, subsample-feats\n"; - - ParseOptions po(usage); - - float frame_shift = 0; - - po.Register("frame-shift", &frame_shift, - "Frame shift in sec (e.g. 
0.01), if segment files contains times " - "instead of frames"); - - po.Read(argc, argv); - - if (po.NumArgs() != 3) { - po.PrintUsage(); - exit(1); - } - - string segment_rspecifier = po.GetArg(1); - string feat_rspecifier = po.GetArg(2); - string feat_wspecifier = po.GetArg(3); - - Input ki(segment_rspecifier); - RandomAccessBaseFloatMatrixReader reader(feat_rspecifier); - BaseFloatMatrixWriter writer(feat_wspecifier); - - int32 num_done = 0, num_err = 0; - - string line; - - /* read each line from segments file */ - while (std::getline(ki.Stream(), line)) { - - vector split_line; - SplitStringToVector(line, " \t\r", true, &split_line); - if (split_line.size() != 4) { - KALDI_WARN << "Invalid line in segments file: " << line; - num_err++; - continue; - } - - string utt = split_line[0], - recording = split_line[1], - start_str = split_line[2], - end_str = split_line[3]; - - // if the segments are in time, we need to convert them to frame numbers - int32 start = 0; - int32 end = 0; - if (frame_shift > 0) { - // Convert the start time and endtime to real from string. Segment is - // ignored if start or end time cannot be converted to real. - double t1, t2; - if (!ConvertStringToReal(start_str, &t1)) { - KALDI_ERR << "Invalid line in segments file [bad start]: " << line; - continue; - } - if (!ConvertStringToReal(end_str, &t2)) { - KALDI_ERR << "Invalid line in segments file [bad end]: " << line; - continue; - } - - start = (int) (t1 / frame_shift); - end = (int) (t2 / frame_shift); - } else { - if (!ConvertStringToInteger(start_str, &start)) { - KALDI_ERR << "Invalid line in segments file [bad start]: " << line; - continue; - } - if (!ConvertStringToInteger(end_str, &end)) { - KALDI_ERR << "Invalid line in segments file [bad end]: " << line; - continue; - } - } - - if (start < 0 || end - start <= 0) { - KALDI_WARN << "Invalid line in segments file [less than one frame]: " << line; - num_err++; - continue; - } - - if (reader.HasKey(recording)) { - const Matrix &feats = reader.Value(recording); - - if (feats.NumRows() < end) { - if (feats.NumRows() > start) { - KALDI_WARN << "Truncating end time of segment " << utt << " from " - << end << " to " << feats.NumRows(); - end = feats.NumRows(); - } else { - KALDI_WARN << "Segment " << utt << " is outside of input range: " - << "input num-rows " << feats.NumRows() << " vs. " - << line; - num_err++; - continue; - } - } - - Matrix to_write(feats.RowRange(start, (end-start))); - writer.Write(utt, to_write); - num_done++; - } else { - KALDI_WARN << "No recording-id " << recording << " present in features."; - num_err++; - } - } - - KALDI_LOG << "Processed " << num_done << " segments successfully; " - << "errors on " << num_err; - - return (num_done > 0 ? 0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what(); - return -1; - } -} - -/* -testing: -cat < ... \n" + " e.g.: fsts-concat scp:fsts1.scp scp:fsts2.scp ... 
ark:fsts_out.ark\n" + "\n" + "see also: fstconcat (from the OpenFst toolkit)\n"; + + ParseOptions po(usage); + + po.Read(argc, argv); + + if (po.NumArgs() < 3) { + po.PrintUsage(); + exit(1); + } + + std::string fsts_rspecifier = po.GetArg(1), + fsts_wspecifier = po.GetArg(po.NumArgs()); + + SequentialTableReader fst_reader(fsts_rspecifier); + std::vector*> fst_readers; + TableWriter fst_writer(fsts_wspecifier); + + for (int32 i = 2; i < po.NumArgs(); i++) + fst_readers.push_back(new RandomAccessTableReader(po.GetArg(i))); + const int32 num_fst_readers = fst_readers.size(); + + int32 n_done = 0, + n_skipped = 0; + + for (; !fst_reader.Done(); fst_reader.Next()) { + std::string key = fst_reader.Key(); + + // Check that the key exists in all 'fst_readers'. + bool skip_key = false; + for (int32 i = 0; i < num_fst_readers; i++) { + if (!fst_readers[i]->HasKey(key)) { + KALDI_WARN << "Skipping '" << key << "'" + << " due to missing the fst in " << (i+2) << "th : " + << "'" << po.GetArg(i+2) << "'"; + skip_key = true; + } + } + if (skip_key) { + n_skipped++; + continue; + } + + // Concatenate! + VectorFst fst_out = fst_readers.back()->Value(key); + // Loop from (last-1) to first, as 'prepending' the fsts is faster, + // see: http://www.openfst.org/twiki/bin/view/FST/ConcatDoc + for (int32 i = num_fst_readers-2; i >= 0; i--) { + fst::Concat(fst_readers[i]->Value(key), &fst_out); + } + // Finally, prepend the fst from the 'Sequential' reader. + fst::Concat(fst_reader.Value(), &fst_out); + + // Write the output. + fst_writer.Write(key, fst_out); + n_done++; + } + + // Cleanup. + for (int32 i = 0; i < num_fst_readers; i++) + delete fst_readers[i]; + fst_readers.clear(); + + KALDI_LOG << "Produced " << n_done << " FSTs by concatenating " << po.NumArgs()-1 + << " streams " << "(" << n_skipped << " keys skipped)."; + return (n_done != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/fstext/determinize-star.h b/src/fstext/determinize-star.h index 6377ad7352f..253068ec572 100644 --- a/src/fstext/determinize-star.h +++ b/src/fstext/determinize-star.h @@ -40,7 +40,7 @@ namespace fst { /* DeterminizeStar implements determinization with epsilon removal, which we distinguish with a star. - + We define a determinized* FST as one in which no state has more than one transition with the same input-label. Epsilon input labels are not allowed except starting from states that have exactly one arc exiting them (and are @@ -54,8 +54,8 @@ namespace fst { float-weight. It does epsilon removal and determinization. This algorithm may fail if the input has epsilon cycles under certain circumstances (i.e. the semiring is non-idempotent, e.g. the log - semiring, or there are negative cost epsilon cycles). - + semiring, or there are negative cost epsilon cycles). + This implementation is much less fancy than the one in fst/determinize.h, and does not have an "on-demand" version. @@ -80,8 +80,10 @@ namespace fst { If allow_partial is true, the algorithm will output partial results when the specified max_states is reached (when larger than zero), instead of throwing out an error. - The function will return false if partial FST is generated, and true if the - complete determinized FST is generated. + + Caution, the return status is un-intuitive: this function will return false if + determinization completed normally, and true if it was stopped early by + reaching the 'max-states' limit, and a partial FST was generated. 
*/ template bool DeterminizeStar(F &ifst, MutableFst *ofst, @@ -100,8 +102,10 @@ bool DeterminizeStar(F &ifst, MutableFst *ofst, If allow_partial is true, the algorithm will output partial results when the specified max_states is reached (when larger than zero), instead of throwing out an error. - The function will return false if partial FST is generated, and true if the - complete determinized FST is generated. + + Caution, the return status is un-intuitive: this function will return false if + determinization completed normally, and true if it was stopped early by + reaching the 'max-states' limit, and a partial FST was generated. */ template bool DeterminizeStar(F &ifst, MutableFst > *ofst, diff --git a/src/fstext/fstext-utils-inl.h b/src/fstext/fstext-utils-inl.h index 923c67c07e2..756e449fcaa 100644 --- a/src/fstext/fstext-utils-inl.h +++ b/src/fstext/fstext-utils-inl.h @@ -1132,7 +1132,7 @@ inline bool IsStochasticFst(const Fst &fst, // Will override this for LogArc where NaturalLess will not work. template -bool IsStochasticFst(const Fst &fst, +inline bool IsStochasticFst(const Fst &fst, float delta, typename Arc::Weight *min_sum, typename Arc::Weight *max_sum) { @@ -1168,7 +1168,7 @@ bool IsStochasticFst(const Fst &fst, // Overriding template for LogArc as NaturalLess does not work there. template<> -bool IsStochasticFst(const Fst &fst, +inline bool IsStochasticFst(const Fst &fst, float delta, LogArc::Weight *min_sum, LogArc::Weight *max_sum) { @@ -1208,7 +1208,7 @@ bool IsStochasticFst(const Fst &fst, // This function deals with the generic fst. // This version currently supports ConstFst or VectorFst. // Otherwise, it will be died with an error. -bool IsStochasticFstInLog(const Fst &fst, +inline bool IsStochasticFstInLog(const Fst &fst, float delta, StdArc::Weight *min_sum, StdArc::Weight *max_sum) { diff --git a/src/fstext/kaldi-fst-io.cc b/src/fstext/kaldi-fst-io.cc index cda146104d0..acbd9e59000 100644 --- a/src/fstext/kaldi-fst-io.cc +++ b/src/fstext/kaldi-fst-io.cc @@ -54,7 +54,7 @@ Fst *ReadFstKaldiGeneric(std::string rxfilename, bool throw_on_err) { << kaldi::PrintableRxfilename(rxfilename); } else { KALDI_WARN << "We fail to read FST header from " - << kaldi::PrintableRxfilename(rxfilename) + << kaldi::PrintableRxfilename(rxfilename) << ". A NULL pointer is returned."; return NULL; } @@ -92,16 +92,15 @@ Fst *ReadFstKaldiGeneric(std::string rxfilename, bool throw_on_err) { } VectorFst *CastOrConvertToVectorFst(Fst *fst) { - // This version currently supports ConstFst or VectorFst + // This version currently supports ConstFst or VectorFst std::string real_type = fst->Type(); KALDI_ASSERT(real_type == "vector" || real_type == "const"); if (real_type == "vector") { return dynamic_cast *>(fst); } else { - // As the 'fst' can't cast to VectorFst, I'm creating a new - // VectorFst initialized by 'fst', and deletes 'fst'. + // As the 'fst' can't cast to VectorFst, we carete a new + // VectorFst initialized by 'fst', and delete 'fst'. 
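For reference, a minimal call-site sketch (not part of this patch) of the return-value convention documented above for DeterminizeStar(): with allow_partial = true, a return of true means the max-states limit stopped determinization early and only a partial FST was produced. This assumes the header's delta / debug_ptr / max_states / allow_partial argument order and the usual Kaldi/OpenFst headers.

```
#include "base/kaldi-common.h"
#include "fstext/determinize-star.h"

// Hypothetical helper: determinize 'ifst' into 'ofst' with a state limit,
// warning (rather than throwing) if only a partial result was produced.
void DeterminizeStarWithLimit(fst::VectorFst<fst::StdArc> *ifst,
                              fst::VectorFst<fst::StdArc> *ofst) {
  bool stopped_early = fst::DeterminizeStar(*ifst, ofst, fst::kDelta,
                                            /*debug_ptr=*/NULL,
                                            /*max_states=*/100000,
                                            /*allow_partial=*/true);
  if (stopped_early)  // note the un-intuitive convention described above
    KALDI_WARN << "DeterminizeStar hit max-states; the output FST is partial.";
}
```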
VectorFst *new_fst = new VectorFst(*fst); - KALDI_WARN << "The 'fst' is deleted."; delete fst; return new_fst; } diff --git a/src/fstext/lattice-weight-test.cc b/src/fstext/lattice-weight-test.cc index 8bcbeb4eba3..ae768e711f7 100644 --- a/src/fstext/lattice-weight-test.cc +++ b/src/fstext/lattice-weight-test.cc @@ -34,11 +34,12 @@ typedef CompactLatticeWeightCommonDivisorTpl LatticeWeight RandomLatticeWeight() { - if (kaldi::Rand() % 3 == 0) { + int tmp = kaldi::Rand() % 4; + if (tmp == 0) { return LatticeWeight::Zero(); - } else if (kaldi::Rand() % 3 == 0) { + } else if (tmp == 1) { return LatticeWeight( 1, 2); // sometimes return special values.. - } else if (kaldi::Rand() % 3 == 0) { + } else if (tmp == 2) { return LatticeWeight( 2, 1); // this tests more thoroughly certain properties... } else { return LatticeWeight( 100 * kaldi::RandGauss(), 100 * kaldi::RandGauss()); @@ -107,7 +108,7 @@ void LatticeWeightTest() { s1 << l1; std::istringstream s2(s1.str()); s2 >> l2; - KALDI_ASSERT(ApproxEqual(l1, l2)); + KALDI_ASSERT(ApproxEqual(l1, l2, 0.001)); std::cout << s1.str() << '\n'; { std::ostringstream s1b; diff --git a/src/fstext/lattice-weight.h b/src/fstext/lattice-weight.h index 462fc94f204..7281dc9ba50 100644 --- a/src/fstext/lattice-weight.h +++ b/src/fstext/lattice-weight.h @@ -330,6 +330,34 @@ class NaturalLess > { return (Compare(w1, w2) == 1); } }; +template<> +class NaturalLess > { + public: + typedef LatticeWeightTpl Weight; + + NaturalLess() {} + + bool operator()(const Weight &w1, const Weight &w2) const { + // NaturalLess is a negative order (opposite to normal ordering). + // This operator () corresponds to "<" in the negative order, which + // corresponds to the ">" in the normal order. + return (Compare(w1, w2) == 1); + } +}; +template<> +class NaturalLess > { + public: + typedef LatticeWeightTpl Weight; + + NaturalLess() {} + + bool operator()(const Weight &w1, const Weight &w2) const { + // NaturalLess is a negative order (opposite to normal ordering). + // This operator () corresponds to "<" in the negative order, which + // corresponds to the ">" in the normal order. + return (Compare(w1, w2) == 1); + } +}; template inline LatticeWeightTpl Times(const LatticeWeightTpl &w1, @@ -592,6 +620,34 @@ class NaturalLess, IntType> return (Compare(w1, w2) == 1); } }; +template<> +class NaturalLess, int32> > { + public: + typedef CompactLatticeWeightTpl, int32> Weight; + + NaturalLess() {} + + bool operator()(const Weight &w1, const Weight &w2) const { + // NaturalLess is a negative order (opposite to normal ordering). + // This operator () corresponds to "<" in the negative order, which + // corresponds to the ">" in the normal order. + return (Compare(w1, w2) == 1); + } +}; +template<> +class NaturalLess, int32> > { + public: + typedef CompactLatticeWeightTpl, int32> Weight; + + NaturalLess() {} + + bool operator()(const Weight &w1, const Weight &w2) const { + // NaturalLess is a negative order (opposite to normal ordering). + // This operator () corresponds to "<" in the negative order, which + // corresponds to the ">" in the normal order. + return (Compare(w1, w2) == 1); + } +}; // Make sure Compare is defined for TropicalWeight, so everything works // if we substitute LatticeWeight for TropicalWeight. 
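The NaturalLess specializations added above all reduce to Compare(w1, w2) == 1; the convention behind this is that Compare() returns 1 when w1 is the better (lower-cost) weight, so "less" in the natural order means "better". A self-contained toy sketch of that convention, using hypothetical names rather than Kaldi's templates:

```
#include <iostream>

struct ToyWeight { float cost; };  // smaller cost == better path

// Returns 1 if w1 is better than w2, -1 if worse, 0 if equal.
inline int Compare(const ToyWeight &w1, const ToyWeight &w2) {
  if (w1.cost < w2.cost) return 1;
  if (w1.cost > w2.cost) return -1;
  return 0;
}

struct ToyNaturalLess {
  // "Less" in the natural order means "better", hence Compare(...) == 1.
  bool operator()(const ToyWeight &w1, const ToyWeight &w2) const {
    return Compare(w1, w2) == 1;
  }
};

int main() {
  ToyWeight a{1.5f}, b{2.0f};
  std::cout << std::boolalpha << ToyNaturalLess()(a, b) << '\n';  // true
  return 0;
}
```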
diff --git a/src/hmm/hmm-utils.cc b/src/hmm/hmm-utils.cc index fe6c5b32d6e..a122ca5dc05 100644 --- a/src/hmm/hmm-utils.cc +++ b/src/hmm/hmm-utils.cc @@ -388,40 +388,38 @@ fst::VectorFst *GetPdfToTransitionIdTransducer(const TransitionMode - -// this is the code that expands an FST from transition-states to -// transition-ids, in the case where "reorder == true", -// i.e. non-optional transition is before the self-loop. - - - class TidToTstateMapper { public: // Function object used in MakePrecedingInputSymbolsSameClass and - // MakeFollowingInputSymbolsSameClass (as called by AddSelfLoopsBefore - // and AddSelfLoopsAfter). It maps transition-ids to transition-states - // (and -1 to -1, 0 to 0 and disambiguation symbols to 0). It also - // checks that there are no self-loops in the graph (i.e. in the labels - // it is called with). This is just a convenient place to put this check. + // MakeFollowingInputSymbolsSameClass (as called by AddSelfLoopsReorder and + // AddSelfLoopsNoReorder). It maps transition-ids to transition-states (and + // -1 to -1, 0 to 0 and disambiguation symbols to 0). If check_no_self_loops + // == true, it also checks that there are no self-loops in the graph (i.e. in + // the labels it is called with). This is just a convenient place to put this + // check. // This maps valid transition-ids to transition states, maps kNoLabel to -1, and // maps all other symbols (i.e. epsilon symbols and disambig symbols) to zero. // Its point is to provide an equivalence class on labels that's relevant to what // the self-loop will be on the following (or preceding) state. TidToTstateMapper(const TransitionModel &trans_model, - const std::vector &disambig_syms): + const std::vector &disambig_syms, + bool check_no_self_loops): trans_model_(trans_model), - disambig_syms_(disambig_syms) { } + disambig_syms_(disambig_syms), + check_no_self_loops_(check_no_self_loops) { } typedef int32 Result; int32 operator() (int32 label) const { if (label == static_cast(fst::kNoLabel)) return -1; // -1 -> -1 else if (label >= 1 && label <= trans_model_.NumTransitionIds()) { - if (trans_model_.IsSelfLoop(label)) + if (check_no_self_loops_ && trans_model_.IsSelfLoop(label)) KALDI_ERR << "AddSelfLoops: graph already has self-loops."; return trans_model_.TransitionIdToTransitionState(label); } else { // 0 or (presumably) disambiguation symbol. Map to zero if (label != 0) - KALDI_ASSERT(std::binary_search(disambig_syms_.begin(), disambig_syms_.end(), label)); // or invalid label + KALDI_ASSERT(std::binary_search(disambig_syms_.begin(), + disambig_syms_.end(), + label)); // or invalid label return 0; } } @@ -429,21 +427,28 @@ class TidToTstateMapper { private: const TransitionModel &trans_model_; const std::vector &disambig_syms_; // sorted. + bool check_no_self_loops_; }; -static void AddSelfLoopsBefore(const TransitionModel &trans_model, - const std::vector &disambig_syms, - BaseFloat self_loop_scale, - fst::VectorFst *fst) { +// This is the code that expands an FST from transition-states to +// transition-ids, in the case where reorder == true, i.e. the non-optional +// transition is before the self-loop. 
+static void AddSelfLoopsReorder(const TransitionModel &trans_model, + const std::vector &disambig_syms, + BaseFloat self_loop_scale, + bool check_no_self_loops, + fst::VectorFst *fst) { using namespace fst; typedef StdArc Arc; typedef Arc::Label Label; typedef Arc::StateId StateId; typedef Arc::Weight Weight; - TidToTstateMapper f(trans_model, disambig_syms); - // Duplicate states as necessary so that each state has at most one self-loop - // on it. + TidToTstateMapper f(trans_model, disambig_syms, check_no_self_loops); + // Duplicate states as necessary so that each state will require at most one + // self-loop to be added to it. Approximately this means that if a + // state has multiple different symbols on arcs entering it, it will be + // duplicated, with one copy per incoming symbol. MakePrecedingInputSymbolsSameClass(true, fst, f); int32 kNoTransState = f(kNoLabel); @@ -508,13 +513,14 @@ static void AddSelfLoopsBefore(const TransitionModel &trans_model, // this is the code that expands an FST from transition-states to -// transition-ids, in the case where "reorder == false", i.e. non-optional transition -// is after the self-loop. - -static void AddSelfLoopsAfter(const TransitionModel &trans_model, - const std::vector &disambig_syms, - BaseFloat self_loop_scale, - fst::VectorFst *fst) { +// transition-ids, in the case where reorder == false, i.e. non-optional +// transition is after the self-loop. +static void AddSelfLoopsNoReorder( + const TransitionModel &trans_model, + const std::vector &disambig_syms, + BaseFloat self_loop_scale, + bool check_no_self_loops, + fst::VectorFst *fst) { using namespace fst; typedef StdArc Arc; typedef Arc::Label Label; @@ -523,7 +529,7 @@ static void AddSelfLoopsAfter(const TransitionModel &trans_model, // Duplicate states as necessary so that each state has at most one self-loop // on it. - TidToTstateMapper f(trans_model, disambig_syms); + TidToTstateMapper f(trans_model, disambig_syms, check_no_self_loops); MakeFollowingInputSymbolsSameClass(true, fst, f); StateId num_states = fst->NumStates(); @@ -559,13 +565,16 @@ static void AddSelfLoopsAfter(const TransitionModel &trans_model, void AddSelfLoops(const TransitionModel &trans_model, const std::vector &disambig_syms, BaseFloat self_loop_scale, - bool reorder, // true->dan-style, false->lukas-style. + bool reorder, + bool check_no_self_loops, fst::VectorFst *fst) { KALDI_ASSERT(fst->Start() != fst::kNoStateId); if (reorder) - AddSelfLoopsBefore(trans_model, disambig_syms, self_loop_scale, fst); + AddSelfLoopsReorder(trans_model, disambig_syms, self_loop_scale, + check_no_self_loops, fst); else - AddSelfLoopsAfter(trans_model, disambig_syms, self_loop_scale, fst); + AddSelfLoopsNoReorder(trans_model, disambig_syms, self_loop_scale, + check_no_self_loops, fst); } // IsReordered returns true if the transitions were possibly reordered. This reordering diff --git a/src/hmm/hmm-utils.h b/src/hmm/hmm-utils.h index 3d51cbe1f14..628f018a3fa 100644 --- a/src/hmm/hmm-utils.h +++ b/src/hmm/hmm-utils.h @@ -158,12 +158,23 @@ void GetIlabelMapping (const std::vector > &ilabel_info_old, * @param self_loop_scale [in] Transition-probability scale for self-loops; c.f. * \ref hmm_scale * @param reorder [in] If true, reorders the transitions (see \ref hmm_reorder). + * You'll normally want this to be true. + * @param check_no_self_loops [in] If true, it will check that there are no + * self-loops in the original graph; you'll normally want + * this to be true. 
If false, it will allow them, and + * will add self-loops after the original self-loop + * transitions, assuming reorder==true... this happens to + * be what we want when converting normal to unconstrained + * chain examples. WARNING: this was added in 2018; + * if you get a compilation error, add this as 'true', + * which emulates the behavior of older code. * @param fst [in, out] The FST to be modified. */ void AddSelfLoops(const TransitionModel &trans_model, const std::vector &disambig_syms, // used as a check only. BaseFloat self_loop_scale, - bool reorder, // true->dan-style, false->lukas-style. + bool reorder, + bool check_no_self_loops, fst::VectorFst *fst); /** diff --git a/src/hmm/transition-model.h b/src/hmm/transition-model.h index 442de8fd2e0..9843dff946b 100644 --- a/src/hmm/transition-model.h +++ b/src/hmm/transition-model.h @@ -130,7 +130,7 @@ class TransitionModel { /// Constructor that takes no arguments: typically used prior to calling Read. - TransitionModel() { } + TransitionModel(): num_pdfs_(0) { } void Read(std::istream &is, bool binary); // note, no symbol table: topo object always read/written w/o symbols. void Write(std::ostream &os, bool binary) const; @@ -184,7 +184,7 @@ class TransitionModel { // an unseen phone has the highest-numbered pdf, this might be different. int32 NumPdfs() const { return num_pdfs_; } - // This loops over the triples and finds the highest phone index present. If + // This loops over the tuples and finds the highest phone index present. If // the FST symbol table for the phones is created in the expected way, i.e.: // starting from 1 ( is 0) and numbered contiguously till the last phone, // this will be the total number of phones. @@ -288,9 +288,9 @@ class TransitionModel { HmmTopology topo_; - /// Triples indexed by transition state minus one; - /// the triples are in sorted order which allows us to do the reverse mapping from - /// triple to transition state + /// Tuples indexed by transition state minus one; + /// the tuples are in sorted order which allows us to do the reverse mapping from + /// tuple to transition state std::vector tuples_; /// Gives the first transition_id of each transition-state; indexed by diff --git a/src/ivectorbin/agglomerative-cluster.cc b/src/ivectorbin/agglomerative-cluster.cc index 0a144c40d7c..9dca9bfeb83 100644 --- a/src/ivectorbin/agglomerative-cluster.cc +++ b/src/ivectorbin/agglomerative-cluster.cc @@ -57,7 +57,7 @@ int main(int argc, char *argv[]) { " is less than this threshold."); po.Register("read-costs", &read_costs, "If true, the first" " argument is interpreted as a matrix of costs rather than a" - "similarity matrix."); + " similarity matrix."); po.Read(argc, argv); @@ -75,6 +75,8 @@ int main(int argc, char *argv[]) { RandomAccessInt32Reader reco2num_spk_reader(reco2num_spk_rspecifier); Int32Writer label_writer(label_wspecifier); + if (!read_costs) + threshold = -threshold; for (; !scores_reader.Done(); scores_reader.Next()) { std::string reco = scores_reader.Key(); Matrix costs = scores_reader.Value(); @@ -84,7 +86,6 @@ int main(int argc, char *argv[]) { // clustering code requires. 
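Stepping back to the AddSelfLoops() signature change above (src/hmm/hmm-utils.{h,cc}): a hedged sketch of how an out-of-tree call site would be updated. The wrapper below is hypothetical; passing true for the new check_no_self_loops argument reproduces the pre-change behavior, as the header comment advises.

```
#include <vector>
#include "base/kaldi-common.h"
#include "hmm/transition-model.h"
#include "hmm/hmm-utils.h"

// Hypothetical wrapper, not part of Kaldi: shows the updated argument list.
void AddSelfLoopsCompat(const kaldi::TransitionModel &trans_model,
                        const std::vector<kaldi::int32> &disambig_syms,
                        kaldi::BaseFloat self_loop_scale,
                        fst::VectorFst<fst::StdArc> *fst) {
  // reorder = true is the normal setting; check_no_self_loops = true
  // emulates the behavior of the code before this change.
  kaldi::AddSelfLoops(trans_model, disambig_syms, self_loop_scale,
                      /*reorder=*/true, /*check_no_self_loops=*/true, fst);
}
```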
if (!read_costs) costs.Scale(-1); - threshold = -threshold; std::vector uttlist = reco2utt_reader.Value(reco); std::vector spk_ids; if (reco2num_spk_rspecifier.size()) { diff --git a/src/lat/compose-lattice-pruned.cc b/src/lat/compose-lattice-pruned.cc index 46c44d76595..c6e4dafc008 100644 --- a/src/lat/compose-lattice-pruned.cc +++ b/src/lat/compose-lattice-pruned.cc @@ -159,6 +159,10 @@ class PrunedCompactLatticeComposer { int32 lat_state; int32 lm_state; + // the number of arcs on the path from the start state to this state, in the + // composed lattice, by which this state was first reached. + int32 depth; + // If you have just called RecomputePruningInfo(), then // 'forward_cost' will equal the cost of the best path from the start-state // to this state, in the composed output. @@ -284,7 +288,26 @@ class PrunedCompactLatticeComposer { // BaseFloat expected_cost_offset; }; - + // This bool variable is initialized to false, and will be updated to true + // the first time a Final() function is called on the det_fst_. Then we will + // immediately call RecomputeRruningInfo() so that the output_best_cost_ is + // changed from +inf to a finite value, to be used in beam search. This is the + // only time the RecomputeRruningInfo() function is called manually; otherwise + // it always follows an automatic schedule based on the num-arcs of the output + // lattice. + bool output_reached_final_; + + // This variable, which we set initially to -1000, makes sure that in the + // beginning of the algorithm, we always prioritize exploring the lattice + // in a depth-first way. Once we find a path reaching a final state, this + // variable will be reset to 0. + // The reason we do this is because the beam-search depends on a good estimate + // of the composed-best-cost, which before we reach a final state, we instead + // borrow the value from best-cost from the input lattice, which is usually + // systematically worse than the RNNLM scores, and makes the algorithm spend + // a lot of time before reaching any final state, especially if the input + // lattices are large. + float depth_penalty_; const ComposeLatticePrunedOptions &opts_; const CompactLattice &clat_in_; fst::DeterministicOnDemandFst *det_fst_; @@ -412,6 +435,8 @@ void PrunedCompactLatticeComposer::ComputeForwardCosts( std::vector::iterator state_iter = composed_state_info_.begin(), state_end = composed_state_info_.end(); + + state_iter->depth = 0; // start state has depth 0 ++state_iter; // Skip over the start state. // Set all other forward_cost fields to infinity and prev_composed_state to // -1. @@ -441,6 +466,7 @@ void PrunedCompactLatticeComposer::ComputeForwardCosts( if (next_info.forward_cost > next_forward_cost) { next_info.forward_cost = next_forward_cost; next_info.prev_composed_state = composed_state_index; + next_info.depth = composed_state_info_[composed_state_index].depth + 1; } } } @@ -493,7 +519,7 @@ void PrunedCompactLatticeComposer::ComputeDeltaBackwardCosts( // backward_cost was +infinity. This is OK; we'll set them all to // finite values later in this function. info.delta_backward_cost = - info.backward_cost - lat_state_info_[lat_state].backward_cost; + info.backward_cost - lat_state_info_[lat_state].backward_cost + info.depth * depth_penalty_; } // 'queue_elements' is a list of items (expected_cost_offset, @@ -523,7 +549,7 @@ void PrunedCompactLatticeComposer::ComputeDeltaBackwardCosts( // Check that prev_info.delta_backward_cost is finite. 
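The assertion just below uses the x - x == 0 idiom to check that a float is finite (for +/-inf or NaN the subtraction yields NaN and the comparison fails). A tiny standalone illustration, independent of Kaldi:

```
#include <cassert>
#include <limits>

int main() {
  float finite = 3.5f;
  float inf = std::numeric_limits<float>::infinity();
  assert(finite - finite == 0.0f);   // finite values pass
  assert(!(inf - inf == 0.0f));      // inf - inf is NaN, so the check fails
  return 0;
}
```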
KALDI_ASSERT(prev_info.delta_backward_cost - prev_info.delta_backward_cost == 0.0); - info.delta_backward_cost = prev_info.delta_backward_cost; + info.delta_backward_cost = prev_info.delta_backward_cost + depth_penalty_; } } double lat_backward_cost = lat_state_info_[info.lat_state].backward_cost; @@ -596,13 +622,14 @@ PrunedCompactLatticeComposer::PrunedCompactLatticeComposer( const ComposeLatticePrunedOptions &opts, const CompactLattice &clat_in, fst::DeterministicOnDemandFst *det_fst, - CompactLattice* composed_clat): + CompactLattice* composed_clat): output_reached_final_(false), opts_(opts), clat_in_(clat_in), det_fst_(det_fst), clat_out_(composed_clat), num_arcs_out_(0), output_best_cost_(std::numeric_limits::infinity()), current_cutoff_(std::numeric_limits::infinity()) { clat_out_->DeleteStates(); + depth_penalty_ = -1000; } @@ -614,6 +641,7 @@ void PrunedCompactLatticeComposer::AddFirstState() { ComposedStateInfo &composed_state = composed_state_info_[0]; composed_state.lat_state = 0; composed_state.lm_state = det_fst_->Start(); + composed_state.depth = 0; composed_state.forward_cost = 0.0; composed_state.backward_cost = std::numeric_limits::infinity(); composed_state.delta_backward_cost = 0.0; @@ -709,6 +737,11 @@ void PrunedCompactLatticeComposer::ProcessQueueElement( double final_cost = ConvertToCost(final_lat_weight); if (final_cost < src_composed_state_info.backward_cost) src_composed_state_info.backward_cost = final_cost; + if (!output_reached_final_) { + output_reached_final_ = true; + depth_penalty_ = 0.0; + RecomputePruningInfo(); + } } } else { // It really was an arc. This code is very complicated, so we make it its @@ -782,13 +815,14 @@ void PrunedCompactLatticeComposer::ProcessTransition(int32 src_composed_state, dest_lat_state_info.composed_states.push_back(new_composed_state); dest_info->lat_state = dest_lat_state; dest_info->lm_state = dest_lm_state; + dest_info->depth = src_info->depth + 1; dest_info->forward_cost = src_info->forward_cost + ConvertToCost(lat_arc.weight) + lm_arc.weight.Value(); dest_info->backward_cost = std::numeric_limits::infinity(); dest_info->delta_backward_cost = - src_info->delta_backward_cost; + src_info->delta_backward_cost + dest_info->depth * depth_penalty_; // The 'prev_composed_state' field will not be read again until after it's // overwritten; we set it as below only for debugging purposes (the // negation is also for debugging purposes). diff --git a/src/lat/determinize-lattice-pruned.cc b/src/lat/determinize-lattice-pruned.cc index 8c790e749a3..447c951d02c 100644 --- a/src/lat/determinize-lattice-pruned.cc +++ b/src/lat/determinize-lattice-pruned.cc @@ -1437,7 +1437,7 @@ bool DeterminizeLatticePhonePruned( // If --phone-determinize is true, do the determinization on phone + word // lattices. if (opts.phone_determinize) { - KALDI_VLOG(1) << "Doing first pass of determinization on phone + word " + KALDI_VLOG(3) << "Doing first pass of determinization on phone + word " << "lattices."; ans = DeterminizeLatticePhonePrunedFirstPass( trans_model, beam, ifst, det_opts) && ans; @@ -1452,14 +1452,14 @@ bool DeterminizeLatticePhonePruned( // If --word-determinize is true, do the determinization on word lattices. if (opts.word_determinize) { - KALDI_VLOG(1) << "Doing second pass of determinization on word lattices."; + KALDI_VLOG(3) << "Doing second pass of determinization on word lattices."; ans = DeterminizeLatticePruned( *ifst, beam, ofst, det_opts) && ans; } // If --minimize is true, push and minimize after determinization. 
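The KALDI_VLOG(1) -> KALDI_VLOG(3) changes in this file demote per-lattice messages to a higher verbosity level, so they are printed only when the global verbosity is at least 3. A short sketch of that behavior using Kaldi's standard logging facilities (SetVerboseLevel / KALDI_VLOG):

```
#include "base/kaldi-common.h"

void VerbosityDemo() {
  kaldi::SetVerboseLevel(1);
  KALDI_VLOG(1) << "printed at verbosity >= 1";
  KALDI_VLOG(3) << "suppressed until verbosity >= 3";
  kaldi::SetVerboseLevel(3);
  KALDI_VLOG(3) << "now printed";
}
```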
if (opts.minimize) { - KALDI_VLOG(1) << "Pushing and minimizing on word lattices."; + KALDI_VLOG(3) << "Pushing and minimizing on word lattices."; ans = PushCompactLatticeStrings(ofst) && ans; ans = PushCompactLatticeWeights(ofst) && ans; ans = MinimizeCompactLattice(ofst) && ans; diff --git a/src/lat/sausages.h b/src/lat/sausages.h index 9dab0b68713..f613097b190 100644 --- a/src/lat/sausages.h +++ b/src/lat/sausages.h @@ -83,10 +83,18 @@ class MinimumBayesRisk { MinimumBayesRiskOptions opts = MinimumBayesRiskOptions()); // Uses the provided as instead of using the lattice best path. + // Note that the default value of opts.decode_mbr is true. If you provide 1-best + // hypothesis from MAP decoding, the output ctm from MBR decoding may be + // mismatched with the provided ( would be used as the starting + // point of optimization). MinimumBayesRisk(const CompactLattice &clat, const std::vector &words, MinimumBayesRiskOptions opts = MinimumBayesRiskOptions()); // Uses the provided as and of bins instead of using the lattice best path. + // Note that the default value of opts.decode_mbr is true. If you provide 1-best + // hypothesis from MAP decoding, the output ctm from MBR decoding may be + // mismatched with the provided ( would be used as the starting + // point of optimization). MinimumBayesRisk(const CompactLattice &clat, const std::vector &words, const std::vector > ×, diff --git a/src/latbin/Makefile b/src/latbin/Makefile index bcffbb43168..afff54cb845 100644 --- a/src/latbin/Makefile +++ b/src/latbin/Makefile @@ -25,7 +25,7 @@ BINFILES = lattice-best-path lattice-prune lattice-equivalent lattice-to-nbest \ lattice-determinize-phone-pruned-parallel lattice-expand-ngram \ lattice-lmrescore-const-arpa lattice-lmrescore-rnnlm nbest-to-prons \ lattice-arc-post lattice-determinize-non-compact lattice-lmrescore-kaldi-rnnlm \ - lattice-lmrescore-pruned lattice-lmrescore-kaldi-rnnlm-pruned + lattice-lmrescore-pruned lattice-lmrescore-kaldi-rnnlm-pruned lattice-reverse OBJFILES = diff --git a/src/latbin/lattice-1best.cc b/src/latbin/lattice-1best.cc index f6723687790..e03736561f8 100644 --- a/src/latbin/lattice-1best.cc +++ b/src/latbin/lattice-1best.cc @@ -1,6 +1,8 @@ // latbin/lattice-1best.cc // Copyright 2009-2012 Stefan Kombrink Johns Hopkins University (Author: Daniel Povey) +// 2018 Music Technology Group, Universitat Pompeu Fabra (Rong Gong) + // See ../../COPYING for clarification regarding multiple authors // @@ -44,11 +46,14 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); BaseFloat acoustic_scale = 1.0; BaseFloat lm_scale = 1.0; + BaseFloat word_ins_penalty = 0.0; po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods"); po.Register("lm-scale", &lm_scale, "Scaling factor for language model scores."); + po.Register("word-ins-penalty", &word_ins_penalty, + "Word insertion penality."); po.Read(argc, argv); @@ -74,6 +79,9 @@ int main(int argc, char *argv[]) { CompactLattice clat = clat_reader.Value(); clat_reader.FreeCurrent(); fst::ScaleLattice(fst::LatticeScale(lm_scale, acoustic_scale), &clat); + if (word_ins_penalty > 0.0) { + AddWordInsPenToCompactLattice(word_ins_penalty, &clat); + } CompactLattice best_path; CompactLatticeShortestPath(clat, &best_path); @@ -85,6 +93,9 @@ int main(int argc, char *argv[]) { } else { fst::ScaleLattice(fst::LatticeScale(1.0 / lm_scale, 1.0/acoustic_scale), &best_path); + if (word_ins_penalty > 0.0) { + AddWordInsPenToCompactLattice(word_ins_penalty, &clat); + } compact_1best_writer.Write(key, 
best_path); n_done++; } diff --git a/src/latbin/lattice-copy.cc b/src/latbin/lattice-copy.cc index f66eb699705..22bddef4575 100644 --- a/src/latbin/lattice-copy.cc +++ b/src/latbin/lattice-copy.cc @@ -25,13 +25,13 @@ #include "lat/kaldi-lattice.h" namespace kaldi { - int32 CopySubsetLattices(std::string filename, + int32 CopySubsetLattices(std::string filename, SequentialLatticeReader *lattice_reader, LatticeWriter *lattice_writer, - bool include = true, bool ignore_missing = false - ) { + bool include = true, bool ignore_missing = false, + bool sorted = false) { unordered_set subset; - std::set subset_list; + std::set subset_list; bool binary; Input ki(filename, &binary); @@ -50,7 +50,8 @@ namespace kaldi { int32 num_total = 0; size_t num_success = 0; for (; !lattice_reader->Done(); lattice_reader->Next(), num_total++) { - if (include && lattice_reader->Key() > *(subset_list.rbegin())) { + if (include && sorted && subset_list.size() > 0 + && lattice_reader->Key() > *(subset_list.rbegin())) { KALDI_LOG << "The utterance " << lattice_reader->Key() << " is larger than " << "the last key in the include list. Not reading further."; @@ -75,14 +76,14 @@ namespace kaldi { return (num_success != 0 ? 0 : 1); } - int32 CopySubsetLattices(std::string filename, + int32 CopySubsetLattices(std::string filename, SequentialCompactLatticeReader *lattice_reader, CompactLatticeWriter *lattice_writer, - bool include = true, bool ignore_missing = false - ) { + bool include = true, bool ignore_missing = false, + bool sorted = false) { unordered_set subset; - std::set subset_list; - + std::set subset_list; + bool binary; Input ki(filename, &binary); KALDI_ASSERT(!binary); @@ -100,7 +101,8 @@ namespace kaldi { int32 num_total = 0; size_t num_success = 0; for (; !lattice_reader->Done(); lattice_reader->Next(), num_total++) { - if (include && lattice_reader->Key() > *(subset_list.rbegin())) { + if (include && sorted && subset_list.size() > 0 + && lattice_reader->Key() > *(subset_list.rbegin())) { KALDI_LOG << "The utterance " << lattice_reader->Key() << " is larger than " << "the last key in the include list. 
Not reading further."; @@ -145,19 +147,20 @@ int main(int argc, char *argv[]) { "Only one of --include and --exclude can be supplied.\n" "Usage: lattice-copy [options] lattice-rspecifier lattice-wspecifier\n" " e.g.: lattice-copy --write-compact=false ark:1.lats ark,t:text.lats\n" - "See also: lattice-to-fst, and the script egs/wsj/s5/utils/convert_slf.pl\n"; - + "See also: lattice-scale, lattice-to-fst, and\n" + " the script egs/wsj/s5/utils/convert_slf.pl\n"; + ParseOptions po(usage); bool write_compact = true, ignore_missing = false; std::string include_rxfilename; std::string exclude_rxfilename; po.Register("write-compact", &write_compact, "If true, write in normal (compact) form."); - po.Register("include", &include_rxfilename, + po.Register("include", &include_rxfilename, "Text file, the first field of each " "line being interpreted as the " "utterance-id whose lattices will be included"); - po.Register("exclude", &exclude_rxfilename, + po.Register("exclude", &exclude_rxfilename, "Text file, the first field of each " "line being interpreted as an utterance-id " "whose lattices will be excluded"); @@ -174,21 +177,25 @@ int main(int argc, char *argv[]) { std::string lats_rspecifier = po.GetArg(1), lats_wspecifier = po.GetArg(2); + RspecifierOptions opts; + ClassifyRspecifier(lats_rspecifier, NULL, &opts); + bool sorted = opts.sorted; + int32 n_done = 0; - + if (write_compact) { SequentialCompactLatticeReader lattice_reader(lats_rspecifier); CompactLatticeWriter lattice_writer(lats_wspecifier); - + if (include_rxfilename != "") { if (exclude_rxfilename != "") { KALDI_ERR << "should not have both --exclude and --include option!"; } - return CopySubsetLattices(include_rxfilename, + return CopySubsetLattices(include_rxfilename, &lattice_reader, &lattice_writer, - true, ignore_missing); + true, ignore_missing, sorted); } else if (exclude_rxfilename != "") { - return CopySubsetLattices(exclude_rxfilename, + return CopySubsetLattices(exclude_rxfilename, &lattice_reader, &lattice_writer, false, ignore_missing); } @@ -198,14 +205,14 @@ int main(int argc, char *argv[]) { } else { SequentialLatticeReader lattice_reader(lats_rspecifier); LatticeWriter lattice_writer(lats_wspecifier); - + if (include_rxfilename != "") { if (exclude_rxfilename != "") { KALDI_ERR << "should not have both --exclude and --include option!"; } return CopySubsetLattices(include_rxfilename, &lattice_reader, &lattice_writer, - true, ignore_missing); + true, ignore_missing, sorted); } else if (exclude_rxfilename != "") { return CopySubsetLattices(exclude_rxfilename, &lattice_reader, &lattice_writer, @@ -216,7 +223,7 @@ int main(int argc, char *argv[]) { lattice_writer.Write(lattice_reader.Key(), lattice_reader.Value()); } KALDI_LOG << "Done copying " << n_done << " lattices."; - + if (ignore_missing) return 0; return (n_done != 0 ? 
0 : 1); diff --git a/src/latbin/lattice-determinize-pruned-parallel.cc b/src/latbin/lattice-determinize-pruned-parallel.cc index 4ed6b926261..82f200c1bf9 100644 --- a/src/latbin/lattice-determinize-pruned-parallel.cc +++ b/src/latbin/lattice-determinize-pruned-parallel.cc @@ -62,6 +62,7 @@ class DeterminizeLatticeTask { } delete lat_; // This is no longer needed so we can delete it now; lat_ = NULL; + fst::Connect(&det_clat_); // remove states not leading to any final state, if (minimize_) { PushCompactLatticeStrings(&det_clat_); PushCompactLatticeWeights(&det_clat_); diff --git a/src/latbin/lattice-oracle.cc b/src/latbin/lattice-oracle.cc index 80c4e3e05d4..bd13fe0f4d7 100644 --- a/src/latbin/lattice-oracle.cc +++ b/src/latbin/lattice-oracle.cc @@ -23,21 +23,20 @@ #include "util/common-utils.h" #include "fstext/fstext-lib.h" #include "lat/kaldi-lattice.h" +#include "lat/lattice-functions.h" namespace kaldi { -using std::vector; -using std::set; - -typedef unordered_set LabelSet; +typedef fst::StdArc::Label Label; +typedef std::vector> LabelPairVector; void ReadSymbolList(const std::string &rxfilename, fst::SymbolTable *word_syms, - LabelSet *lset) { + LabelPairVector *lpairs) { Input ki(rxfilename); std::string line; - KALDI_ASSERT(lset != NULL); - lset->clear(); + KALDI_ASSERT(lpairs != NULL); + lpairs->clear(); while (getline(ki.Stream(), line)) { std::string sym; std::istringstream ss(line); @@ -47,35 +46,12 @@ void ReadSymbolList(const std::string &rxfilename, << ", file is: " << PrintableRxfilename(rxfilename); } fst::StdArc::Label lab = word_syms->Find(sym.c_str()); - if (lab == fst::SymbolTable::kNoSymbol) { + if (lab == -1) { // fst::kNoSymbol KALDI_ERR << "Can't find symbol in symbol table: " << line << ", file is: " << PrintableRxfilename(rxfilename); } - lset->insert(lab); - } -} - -void MapWildCards(const LabelSet &wildcards, fst::StdVectorFst *ofst) { - // map all wildcards symbols to epsilons - for (fst::StateIterator siter(*ofst); - !siter.Done(); siter.Next()) { - fst::StdArc::StateId s = siter.Value(); - for (fst::MutableArcIterator aiter(ofst, s); - !aiter.Done(); aiter.Next()) { - fst::StdArc arc(aiter.Value()); - LabelSet::const_iterator it = wildcards.find(arc.ilabel); - if (it != wildcards.end()) { - KALDI_VLOG(4) << "MapWildCards: mapping symbol " << arc.ilabel - << " to epsilon" << std::endl; - arc.ilabel = 0; - } - it = wildcards.find(arc.olabel); - if (it != wildcards.end()) { - arc.olabel = 0; - } - aiter.SetValue(arc); - } + lpairs->emplace_back(lab, 0); } } @@ -83,14 +59,14 @@ void MapWildCards(const LabelSet &wildcards, fst::StdVectorFst *ofst) { // also maps wildcard symbols to epsilons // then removes epsilons void ConvertLatticeToUnweightedAcceptor(const kaldi::Lattice &ilat, - const LabelSet &wildcards, + const LabelPairVector &wildcards, fst::StdVectorFst *ofst) { // first convert from lattice to normal FST fst::ConvertLattice(ilat, ofst); // remove weights, project to output, sort according to input arg fst::Map(ofst, fst::RmWeightMapper()); fst::Project(ofst, fst::PROJECT_OUTPUT); // The words are on the output side - MapWildCards(wildcards, ofst); + fst::Relabel(ofst, wildcards, wildcards); fst::RmEpsilon(ofst); // Don't tolerate epsilons as they make it hard to // tally errors fst::ArcSort(ofst, fst::StdILabelCompare()); @@ -259,7 +235,7 @@ int main(int argc, char *argv[]) { KALDI_ERR << "Could not read symbol table from file " << word_syms_filename; - LabelSet wildcards; + LabelPairVector wildcards; if (wild_syms_rxfilename != "") { KALDI_WARN << 
"--wildcard-symbols-list option deprecated."; KALDI_ASSERT(wildcard_symbols.empty() && "Do not use both " @@ -275,7 +251,7 @@ int main(int argc, char *argv[]) { << "--wildcard-symbols option, got: " << wildcard_symbols; } for (size_t i = 0; i < wildcard_symbols_vec.size(); i++) - wildcards.insert(wildcard_symbols_vec[i]); + wildcards.emplace_back(wildcard_symbols_vec[i], 0); } int32 n_done = 0, n_fail = 0; @@ -301,9 +277,9 @@ int main(int argc, char *argv[]) { const std::vector &reference = reference_reader.Value(key); VectorFst reference_fst; MakeLinearAcceptor(reference, &reference_fst); - MapWildCards(wildcards, &reference_fst); // Remove any wildcards in - // reference. + // Remove any wildcards in reference. + fst::Relabel(&reference_fst, wildcards, wildcards); CheckFst(reference_fst, "reference_fst_", key); // recreate edit distance fst if necessary @@ -384,7 +360,12 @@ int main(int argc, char *argv[]) { CompactLattice clat; CompactLattice oracle_clat; ConvertLattice(lat, &clat); - fst::Compose(oracle_clat_mask, clat, &oracle_clat); + fst::Relabel(&clat, wildcards, LabelPairVector()); + fst::ArcSort(&clat, fst::ILabelCompare()); + fst::Compose(oracle_clat_mask, clat, &oracle_clat_mask); + fst::ShortestPath(oracle_clat_mask, &oracle_clat); + fst::Project(&oracle_clat, fst::PROJECT_OUTPUT); + TopSortCompactLatticeIfNeeded(&oracle_clat); if (oracle_clat.Start() == fst::kNoStateId) { KALDI_WARN << "Failed to find the oracle path in the original " diff --git a/src/latbin/lattice-reverse.cc b/src/latbin/lattice-reverse.cc new file mode 100644 index 00000000000..ad288de04a7 --- /dev/null +++ b/src/latbin/lattice-reverse.cc @@ -0,0 +1,76 @@ +// latbin/lattice-reverse.cc + +// Copyright 2018 Hainan Xu + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-lib.h" +#include "lat/kaldi-lattice.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + using fst::SymbolTable; + using fst::VectorFst; + using fst::StdArc; + + const char *usage = + "Reverse a lattice in order to rescore the lattice with a RNNLM \n" + "trained reversed text. 
An example for its application is at \n" + "swbd/local/rnnlm/run_lstm_tdnn_back.sh\n" + "Usage: lattice-reverse lattice-rspecifier lattice-wspecifier\n" + " e.g.: lattice-reverse ark:forward.lats ark:backward.lats\n"; + + ParseOptions po(usage); + std::string include_rxfilename; + std::string exclude_rxfilename; + + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + std::string lats_rspecifier = po.GetArg(1), + lats_wspecifier = po.GetArg(2); + + int32 n_done = 0; + + SequentialLatticeReader lattice_reader(lats_rspecifier); + LatticeWriter lattice_writer(lats_wspecifier); + + for (; !lattice_reader.Done(); lattice_reader.Next(), n_done++) { + string key = lattice_reader.Key(); + Lattice &lat = lattice_reader.Value(); + Lattice olat; + fst::Reverse(lat, &olat); + lattice_writer.Write(lattice_reader.Key(), olat); + } + + KALDI_LOG << "Done reversing " << n_done << " lattices."; + + return (n_done != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/latbin/lattice-to-ctm-conf.cc b/src/latbin/lattice-to-ctm-conf.cc index 8aa0d7949ce..772197d6d41 100644 --- a/src/latbin/lattice-to-ctm-conf.cc +++ b/src/latbin/lattice-to-ctm-conf.cc @@ -35,7 +35,10 @@ int main(int argc, char *argv[]) { "sequence. In the 3-argument form, we read it from the\n" "<1best-rspecifier> input; otherwise it is the 1-best of the lattice.\n" "Then, if --decode-mbr=true, we iteratively refine the hypothesis\n" - "using Minimum Bayes Risk decoding. If you don't need confidences,\n" + "using Minimum Bayes Risk decoding. (Note that the default value of decode_mbr\n" + "is true. If you provide <1best-rspecifier> from MAP decoding, the output ctm\n" + "from MBR decoding may be mismatched with the provided 1best hypothesis (the\n" + "starting point of optimization). If you don't need confidences,\n" "you can do lattice-1best and pipe to nbest-to-ctm. 
The ctm this\n" "program produces will be relative to the utterance-id; a standard\n" "ctm relative to the filename can be obtained using\n" diff --git a/src/latbin/lattice-to-nbest.cc b/src/latbin/lattice-to-nbest.cc index 7fd54525488..f5ecbe044c3 100644 --- a/src/latbin/lattice-to-nbest.cc +++ b/src/latbin/lattice-to-nbest.cc @@ -51,7 +51,8 @@ int main(int argc, char *argv[]) { po.Register("lm-scale", &lm_scale, "Scaling factor for language model scores."); po.Register("n", &n, "Number of distinct paths"); po.Register("random", &random, - "If true, generate n random paths instead of n-best paths"); + "If true, generate n random paths instead of n-best paths" + "In this case, all costs in generated paths will be zero."); po.Register("srand", &srand_seed, "Seed for random number generator " "(only relevant if --random=true)"); diff --git a/src/latbin/nbest-to-prons.cc b/src/latbin/nbest-to-prons.cc index c049403daaa..aa6326e031c 100644 --- a/src/latbin/nbest-to-prons.cc +++ b/src/latbin/nbest-to-prons.cc @@ -42,17 +42,17 @@ int main(int argc, char *argv[]) { " lattice-align-words data/lang/phones/word_boundary.int exp/dir/final.mdl ark:- ark:- | \\\n" " nbest-to-prons exp/dir/final.mdl ark:- 1.prons\n" "Note: the type of the model doesn't matter as only the transition-model is read.\n"; - + ParseOptions po(usage); bool print_lengths_per_phone = false; - po.Register("print-lengths-per-phone", &print_lengths_per_phone, + po.Register("print-lengths-per-phone", &print_lengths_per_phone, "If true, in place of the length of the word, " "print out a comma-separated list of the lengths of each phone in the word."); po.Read(argc, argv); - + if (po.NumArgs() != 3) { po.PrintUsage(); exit(1); @@ -65,13 +65,13 @@ int main(int argc, char *argv[]) { TransitionModel trans_model; ReadKaldiObject(model_rxfilename, &trans_model); - + SequentialCompactLatticeReader clat_reader(lats_rspecifier); - + int32 n_done = 0, n_err = 0; Output ko(wxfilename, false); // false == non-binary write mode. - + for (; !clat_reader.Done(); clat_reader.Next()) { std::string utt = clat_reader.Key(); CompactLattice clat = clat_reader.Value(); @@ -96,16 +96,16 @@ int main(int argc, char *argv[]) { if (!print_lengths_per_phone) ko.Stream() << utt << ' ' << times[i] << ' ' << lengths[i] << ' ' - << words[i]; + << words[i]; else { ko.Stream() << utt << ' ' << times[i] << ' '; for (size_t pl = 0; pl < phone_lengths[i].size()-1; pl++) - ko.Stream() << phone_lengths[i][pl] << ','; + ko.Stream() << phone_lengths[i][pl] << ','; ko.Stream() << phone_lengths[i][phone_lengths[i].size()-1] - << ' ' << words[i]; - } - for (size_t j = 0; j < prons[i].size(); j++) - ko.Stream() << ' ' << prons[i][j]; + << ' ' << words[i]; + } + for (size_t j = 0; j < prons[i].size(); j++) + ko.Stream() << ' ' << prons[i][j]; ko.Stream() << std::endl; } n_done++; @@ -115,7 +115,7 @@ int main(int argc, char *argv[]) { // we just let them go out of scope and it happens automatically. // We do it this time in order to avoid wrongly printing out a success message // if the stream was going to fail to close - + KALDI_LOG << "Printed prons for " << n_done << " linear lattices; " << n_err << " had errors."; return (n_done != 0 ? 
0 : 1); diff --git a/src/lm/arpa-file-parser.cc b/src/lm/arpa-file-parser.cc index b012a919784..f3565eabf4e 100644 --- a/src/lm/arpa-file-parser.cc +++ b/src/lm/arpa-file-parser.cc @@ -209,7 +209,7 @@ void ArpaFileParser::Read(std::istream &is) { word = symbols_->AddSymbol(col[1 + index]); } else { word = symbols_->Find(col[1 + index]); - if (word == fst::SymbolTable::kNoSymbol) { + if (word == -1) { // fst::kNoSymbol switch (options_.oov_handling) { case ArpaParseOptions::kReplaceWithUnk: word = options_.unk_symbol; diff --git a/src/lmbin/arpa2fst.cc b/src/lmbin/arpa2fst.cc index d9b771ba13e..811e623ad48 100644 --- a/src/lmbin/arpa2fst.cc +++ b/src/lmbin/arpa2fst.cc @@ -92,7 +92,7 @@ int main(int argc, char *argv[]) { options.oov_handling = ArpaParseOptions::kSkipNGram; if (!disambig_symbol.empty()) { disambig_symbol_id = symbols->Find(disambig_symbol); - if (disambig_symbol_id == fst::SymbolTable::kNoSymbol) + if (disambig_symbol_id == -1) // fst::kNoSymbol KALDI_ERR << "Symbol table " << read_syms_filename << " has no symbol for " << disambig_symbol; } diff --git a/src/makefiles/cygwin.mk b/src/makefiles/cygwin.mk index c58cd3a42da..d2b9c0e6474 100644 --- a/src/makefiles/cygwin.mk +++ b/src/makefiles/cygwin.mk @@ -10,12 +10,12 @@ ifndef OPENFSTLIBS $(error OPENFSTLIBS not defined.) endif -CXXFLAGS = -std=c++11 -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ +CXXFLAGS = -std=c++11 -U__STRICT_ANSI__ -I.. -I$(OPENFSTINC) $(EXTRA_CXXFLAGS) \ -Wall -Wno-sign-compare -Wno-unused-local-typedefs \ -Wno-deprecated-declarations -Winit-self \ -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ -DHAVE_CLAPACK -I../../tools/CLAPACK/ \ - -msse -msse2 \ + -msse -msse2 -O -Wa,-mbig-obj \ -g # -O0 -DKALDI_PARANOID ifeq ($(KALDI_FLAVOR), dynamic) diff --git a/src/makefiles/default_rules.mk b/src/makefiles/default_rules.mk index 1b55b478d0c..ee0f3c2e90b 100644 --- a/src/makefiles/default_rules.mk +++ b/src/makefiles/default_rules.mk @@ -28,7 +28,7 @@ endif all: $(LIBFILE) $(BINFILES) $(LIBFILE): $(OBJFILES) - $(AR) -cru $(LIBNAME).a $(OBJFILES) + $(AR) -cr $(LIBNAME).a $(OBJFILES) $(RANLIB) $(LIBNAME).a ifeq ($(KALDI_FLAVOR), dynamic) ifeq ($(shell uname), Darwin) diff --git a/src/matrix/kaldi-gpsr-test.cc b/src/matrix/kaldi-gpsr-test.cc index e77d11857ef..6d895527e55 100644 --- a/src/matrix/kaldi-gpsr-test.cc +++ b/src/matrix/kaldi-gpsr-test.cc @@ -37,11 +37,11 @@ template static void InitRand(MatrixBase *M) { for (MatrixIndexT i = 0;i < M->NumRows();i++) for (MatrixIndexT j = 0;j < M->NumCols();j++) (*M)(i, j) = RandGauss(); - if (M->NumRows() != 0 && M->Cond() > 100) { - KALDI_WARN << "Condition number of random matrix large" << M->Cond() - << ": trying again (this is normal)"; - goto start; - } + if (M->NumRows() != 0 && M->Cond() > 100) { + KALDI_WARN << "Condition number of random matrix large" << M->Cond() + << ": trying again (this is normal)"; + goto start; + } } template static void InitRand(SpMatrix *M) { diff --git a/src/matrix/kaldi-vector.cc b/src/matrix/kaldi-vector.cc index 8cedc9c0487..c8ea35112ea 100644 --- a/src/matrix/kaldi-vector.cc +++ b/src/matrix/kaldi-vector.cc @@ -691,9 +691,11 @@ void VectorBase::CopyDiagFromPacked(const PackedMatrix &M) { template Real VectorBase::Sum() const { - double sum = 0.0; - for (MatrixIndexT i = 0; i < dim_; i++) { sum += data_[i]; } - return sum; + // Do a dot-product with a size-1 array with a stride of 0 to + // implement sum. This allows us to access SIMD operations in a + // cross-platform way via your BLAS library. 
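A standalone illustration of the stride-0 BLAS trick described in the comment above, written against plain CBLAS rather than Kaldi's cblas_Xdot wrapper: dotting the data with a single 1.0 whose increment is 0 makes the BLAS library compute the plain sum, typically with SIMD. Requires linking against a CBLAS implementation.

```
#include <cblas.h>
#include <cstdio>

int main() {
  float data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  float one = 1.0f;
  // incX = 1 walks over 'data'; incY = 0 keeps reusing the single 1.0.
  float sum = cblas_sdot(4, data, 1, &one, 0);
  std::printf("sum = %f\n", sum);  // 10.0
  return 0;
}
```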
+ Real one(1); + return cblas_Xdot(dim_, data_, 1, &one, 0); } template diff --git a/src/matrix/matrix-functions.h b/src/matrix/matrix-functions.h index 6c438cd2192..ca50ddda7c8 100644 --- a/src/matrix/matrix-functions.h +++ b/src/matrix/matrix-functions.h @@ -38,7 +38,6 @@ namespace kaldi { v is a vector of even dimension, interpreted for both input and output as a vector of complex numbers i.e. \f[ v = ( re_0, im_0, re_1, im_1, ... ) \f] - The dimension of v must be a power of 2. If "forward == true" this routine does the Discrete Fourier Transform (DFT), i.e.: diff --git a/src/matrix/sparse-matrix-test.cc b/src/matrix/sparse-matrix-test.cc index ac7f7c9ce0b..26b2c227bba 100644 --- a/src/matrix/sparse-matrix-test.cc +++ b/src/matrix/sparse-matrix-test.cc @@ -171,6 +171,29 @@ void UnitTestSparseMatrixAddToMat() { } } +template +void UnitTestSparseMatrixConstructor() { + int32 num_rows = RandInt(1, 10), + num_cols = RandInt(0, 10); + if (num_cols == 0) + num_rows = 0; + + Matrix mat(num_rows, num_cols); + + for (int32 r = 0; r < num_rows; r++) { + for (int32 c = 0; c < num_cols; c++) { + if (RandInt(0, 5) == 0) + mat(r, c) = RandGauss(); + } + } + SparseMatrix smat(mat); + + Matrix mat2(num_rows, num_cols); + mat2.SetRandn(); + smat.CopyToMat(&mat2); + AssertEqual(mat, mat2); +} + template void UnitTestSparseMatrixTraceMatSmat() { for (int32 i = 0; i < 10; i++) { @@ -300,6 +323,8 @@ void SparseMatrixUnitTest() { UnitTestSparseMatrixFrobeniusNorm(); UnitTestSparseMatrixAddToMat(); UnitTestSparseMatrixTraceMatSmat(); + for (int32 i = 0; i < 30; i++) + UnitTestSparseMatrixConstructor(); // Matrix functions involving sparse matrices. diff --git a/src/matrix/sparse-matrix.cc b/src/matrix/sparse-matrix.cc index 38ad940fb45..55d8edeb4b3 100644 --- a/src/matrix/sparse-matrix.cc +++ b/src/matrix/sparse-matrix.cc @@ -690,6 +690,16 @@ void SparseMatrix::Scale(Real alpha) { rows_[row].Scale(alpha); } +template +SparseMatrix::SparseMatrix(const MatrixBase &mat) { + MatrixIndexT num_rows = mat.NumRows(); + rows_.resize(num_rows); + for (int32 row = 0; row < num_rows; row++) { + SparseVector this_row(mat.Row(row)); + rows_[row].Swap(&this_row); + } +} + template Real TraceMatSmat(const MatrixBase &A, const SparseMatrix &B, @@ -1198,6 +1208,20 @@ Real SparseVector::Max(int32 *index_out) const { return 0.0; } +template +SparseVector::SparseVector(const VectorBase &vec) { + MatrixIndexT dim = vec.Dim(); + dim_ = dim; + if (dim == 0) + return; + const Real *ptr = vec.Data(); + for (MatrixIndexT i = 0; i < dim; i++) { + Real val = ptr[i]; + if (val != 0.0) + pairs_.push_back(std::pair(i,val)); + } +} + void GeneralMatrix::Swap(GeneralMatrix *other) { mat_.Swap(&(other->mat_)); cmat_.Swap(&(other->cmat_)); @@ -1262,6 +1286,7 @@ void ExtractRowRangeWithPadding( } + template class SparseVector; template class SparseVector; template class SparseMatrix; diff --git a/src/matrix/sparse-matrix.h b/src/matrix/sparse-matrix.h index 60085b93fbe..76f77f531d5 100644 --- a/src/matrix/sparse-matrix.h +++ b/src/matrix/sparse-matrix.h @@ -97,6 +97,9 @@ class SparseVector { SparseVector(MatrixIndexT dim, const std::vector > &pairs); + // constructor from a VectorBase that keeps only the nonzero elements of 'vec'. + explicit SparseVector(const VectorBase &vec); + /// Resizes to this dimension. resize_type == kUndefined /// behaves the same as kSetZero. 
void Resize(MatrixIndexT dim, MatrixResizeType resize_type = kSetZero); @@ -135,6 +138,12 @@ class SparseMatrix { Real FrobeniusNorm() const; + + /// This constructor creates a SparseMatrix that just contains the nonzero + /// elements of 'mat'. + explicit SparseMatrix(const MatrixBase &mat); + + /// Copy to matrix. It must already have the correct size. template void CopyToMat(MatrixBase *other, MatrixTransposeType t = kNoTrans) const; diff --git a/src/nnet/nnet-blstm-projected.h b/src/nnet/nnet-blstm-projected.h index c11cab14e03..45851f5d9fc 100644 --- a/src/nnet/nnet-blstm-projected.h +++ b/src/nnet/nnet-blstm-projected.h @@ -761,10 +761,10 @@ class BlstmProjected : public MultistreamComponent { CuSubMatrix y_i(F_YI.RowRange(t*S, S)); CuSubMatrix y_f(F_YF.RowRange(t*S, S)); CuSubMatrix y_o(F_YO.RowRange(t*S, S)); - CuSubMatrix y_c(F_YC.RowRange(t*S, S)); + // CuSubMatrix y_c(F_YC.RowRange(t*S, S)); CuSubMatrix y_h(F_YH.RowRange(t*S, S)); - CuSubMatrix y_m(F_YM.RowRange(t*S, S)); - CuSubMatrix y_r(F_YR.RowRange(t*S, S)); + // CuSubMatrix y_m(F_YM.RowRange(t*S, S)); + // CuSubMatrix y_r(F_YR.RowRange(t*S, S)); CuSubMatrix d_all(f_backpropagate_buf_.RowRange(t*S, S)); CuSubMatrix d_g(F_DG.RowRange(t*S, S)); @@ -890,10 +890,10 @@ class BlstmProjected : public MultistreamComponent { CuSubMatrix y_i(B_YI.RowRange(t*S, S)); CuSubMatrix y_f(B_YF.RowRange(t*S, S)); CuSubMatrix y_o(B_YO.RowRange(t*S, S)); - CuSubMatrix y_c(B_YC.RowRange(t*S, S)); + // CuSubMatrix y_c(B_YC.RowRange(t*S, S)); CuSubMatrix y_h(B_YH.RowRange(t*S, S)); - CuSubMatrix y_m(B_YM.RowRange(t*S, S)); - CuSubMatrix y_r(B_YR.RowRange(t*S, S)); + // CuSubMatrix y_m(B_YM.RowRange(t*S, S)); + // CuSubMatrix y_r(B_YR.RowRange(t*S, S)); CuSubMatrix d_all(b_backpropagate_buf_.RowRange(t*S, S)); CuSubMatrix d_g(B_DG.RowRange(t*S, S)); diff --git a/src/nnet/nnet-lstm-projected.h b/src/nnet/nnet-lstm-projected.h index 7f8780b28aa..cc2b7c24ed2 100644 --- a/src/nnet/nnet-lstm-projected.h +++ b/src/nnet/nnet-lstm-projected.h @@ -411,7 +411,7 @@ class LstmProjected : public MultistreamComponent { CuSubMatrix y_c(YC.RowRange(t*S, S)); CuSubMatrix y_h(YH.RowRange(t*S, S)); CuSubMatrix y_m(YM.RowRange(t*S, S)); - CuSubMatrix y_r(YR.RowRange(t*S, S)); + CuSubMatrix y_r(YR.RowRange(t*S, S)); CuSubMatrix y_gifo(YGIFO.RowRange(t*S, S)); // r(t-1) -> g, i, f, o @@ -514,10 +514,10 @@ class LstmProjected : public MultistreamComponent { CuSubMatrix y_i(YI.RowRange(t*S, S)); CuSubMatrix y_f(YF.RowRange(t*S, S)); CuSubMatrix y_o(YO.RowRange(t*S, S)); - CuSubMatrix y_c(YC.RowRange(t*S, S)); + // CuSubMatrix y_c(YC.RowRange(t*S, S)); CuSubMatrix y_h(YH.RowRange(t*S, S)); - CuSubMatrix y_m(YM.RowRange(t*S, S)); - CuSubMatrix y_r(YR.RowRange(t*S, S)); + // CuSubMatrix y_m(YM.RowRange(t*S, S)); + // CuSubMatrix y_r(YR.RowRange(t*S, S)); CuSubMatrix d_all(backpropagate_buf_.RowRange(t*S, S)); CuSubMatrix d_g(DG.RowRange(t*S, S)); diff --git a/src/nnet3/Makefile b/src/nnet3/Makefile index df0fb2d4502..135853cadc3 100644 --- a/src/nnet3/Makefile +++ b/src/nnet3/Makefile @@ -31,7 +31,7 @@ OBJFILES = nnet-common.o nnet-compile.o nnet-component-itf.o \ nnet-compile-looped.o decodable-simple-looped.o \ decodable-online-looped.o convolution.o \ nnet-convolutional-component.o attention.o \ - nnet-attention-component.o + nnet-attention-component.o nnet-tdnn-component.o LIBNAME = kaldi-nnet3 diff --git a/src/nnet3/convolution.cc b/src/nnet3/convolution.cc index b69215f8d54..287ab7f47dd 100644 --- a/src/nnet3/convolution.cc +++ b/src/nnet3/convolution.cc @@ -988,7 
+988,7 @@ static void ComputeTempMatrixSize(const ConvolutionComputationOptions &opts, temp_rows = new_num_t_out * computation->num_images; BaseFloat new_num_megabytes = (4 * temp_rows * temp_cols) / 1000000.0; // make sure we're within the memory limit. - if (new_num_megabytes > megabyte_limit) { + if (new_num_megabytes > 1.01 * megabyte_limit) { KALDI_WARN << "Memory consumed in convolution is more than requested " << "(maybe very long time sequence?)"; } diff --git a/src/nnet3/discriminative-supervision.cc b/src/nnet3/discriminative-supervision.cc index 716650f16fa..0f8f8a4aef7 100644 --- a/src/nnet3/discriminative-supervision.cc +++ b/src/nnet3/discriminative-supervision.cc @@ -399,7 +399,7 @@ void DiscriminativeSupervisionSplitter::ComputeLatticeScores(const Lattice &lat, // This check will fail if the lattice is not breadth-first search sorted } -void AppendSupervision(const std::vector &input, +void MergeSupervision(const std::vector &input, DiscriminativeSupervision *output_supervision) { KALDI_ASSERT(!input.empty()); int32 num_inputs = input.size(); diff --git a/src/nnet3/discriminative-supervision.h b/src/nnet3/discriminative-supervision.h index 6fd684093e2..17c0b1cdb1e 100644 --- a/src/nnet3/discriminative-supervision.h +++ b/src/nnet3/discriminative-supervision.h @@ -222,7 +222,7 @@ class DiscriminativeSupervisionSplitter { /// normal use-case for this is when you are combining neural-net examples for /// training; appending them like this helps to simplify the training process. -void AppendSupervision(const std::vector &input, +void MergeSupervision(const std::vector &input, DiscriminativeSupervision *output_supervision); diff --git a/src/nnet3/nnet-am-decodable-simple.cc b/src/nnet3/nnet-am-decodable-simple.cc index 341f87f0372..d66e24830c6 100644 --- a/src/nnet3/nnet-am-decodable-simple.cc +++ b/src/nnet3/nnet-am-decodable-simple.cc @@ -46,7 +46,7 @@ DecodableNnetSimple::DecodableNnetSimple( (feats_.NumRows() + opts_.frame_subsampling_factor - 1) / opts_.frame_subsampling_factor; KALDI_ASSERT(IsSimpleNnet(nnet)); - ComputeSimpleNnetContext(nnet, &nnet_left_context_, &nnet_right_context_); + compiler_.GetSimpleNnetContext(&nnet_left_context_, &nnet_right_context_); KALDI_ASSERT(!(ivector != NULL && online_ivectors != NULL)); KALDI_ASSERT(!(online_ivectors != NULL && online_ivector_period <= 0 && "You need to set the --online-ivector-period option!")); diff --git a/src/nnet3/nnet-analyze.cc b/src/nnet3/nnet-analyze.cc index ec1d3fa0f2e..584a7c19ab8 100644 --- a/src/nnet3/nnet-analyze.cc +++ b/src/nnet3/nnet-analyze.cc @@ -1441,10 +1441,18 @@ int64 GetMaxMemoryUse(const NnetComputation &computation) { max_memory_use = 0; int32 num_commands = computation.commands.size(), num_submatrices = computation.submatrices.size(); + // the vector 'num_compressed_bytes' is used to remember the number of bytes + // in the compressed matrices for each submatrix (this will only be used for + // those that correspond to a 'whole matrix). It's needed because the + // decompression command doesn't tell us what compression type was used for + // that matrix. + std::vector num_compressed_bytes(num_submatrices, -100000000); for (int32 command_index = 0; command_index < num_commands; ++command_index) { const NnetComputation::Command &c = computation.commands[command_index]; int64 this_num_bytes = -100000000, this_compressed_num_bytes = -10000000; + + if (c.arg1 >= 0 && c.arg1 < num_submatrices) { // if arg1 could plausibly be a sub-matrix index... 
const NnetComputation::SubMatrixInfo &submat_info = @@ -1452,11 +1460,16 @@ int64 GetMaxMemoryUse(const NnetComputation &computation) { this_num_bytes = static_cast(sizeof(BaseFloat)) * submat_info.num_rows * submat_info.num_cols; - this_compressed_num_bytes = - ((c.arg2 == static_cast(kCompressedMatrixInt8) || - c.arg2 == static_cast(kCompressedMatrixUint8)) ? - 1 : 2) * static_cast(submat_info.num_rows) * - submat_info.num_cols; + if (c.command_type == kCompressMatrix) { + this_compressed_num_bytes = + ((c.arg2 == static_cast(kCompressedMatrixInt8) || + c.arg2 == static_cast(kCompressedMatrixUint8)) ? + 1 : 2) * static_cast(submat_info.num_rows) * + submat_info.num_cols; + num_compressed_bytes[c.arg1] = this_compressed_num_bytes; + } else if (c.command_type == kDecompressMatrix) { + this_compressed_num_bytes = num_compressed_bytes[c.arg1]; + } } switch (c.command_type) { case kAllocMatrix: diff --git a/src/nnet3/nnet-chain-example.cc b/src/nnet3/nnet-chain-example.cc index a05c002c3af..c627bb1032a 100644 --- a/src/nnet3/nnet-chain-example.cc +++ b/src/nnet3/nnet-chain-example.cc @@ -207,8 +207,8 @@ static void MergeSupervision( for (int32 n = 0; n < num_inputs; n++) input_supervision.push_back(&(inputs[n]->supervision)); chain::Supervision output_supervision; - AppendSupervision(input_supervision, - &output_supervision); + MergeSupervision(input_supervision, + &output_supervision); output->supervision.Swap(&output_supervision); output->indexes.clear(); diff --git a/src/nnet3/nnet-chain-example.h b/src/nnet3/nnet-chain-example.h index 047f30cfc48..187bb4ef3a3 100644 --- a/src/nnet3/nnet-chain-example.h +++ b/src/nnet3/nnet-chain-example.h @@ -50,9 +50,13 @@ struct NnetChainSupervision { /// Be careful about the order of these indexes-- it is a little confusing. /// The indexes in the 'index' vector are ordered as: (frame 0 of each sequence); /// (frame 1 of each sequence); and so on. But in the 'supervision' object, - /// the FST contains (sequence 0; sequence 1; ...). So reordering is needed. - /// This is done for efficiency in the denominator computation (it helps memory - /// locality), as well as to match the ordering inside the neural net. + /// the FST contains (sequence 0; sequence 1; ...). So reordering is needed + /// when doing the numerator computation. + /// We order 'indexes' in this way for efficiency in the denominator + /// computation (it helps memory locality), as well as to avoid the need for + /// the nnet to reorder things internally to match the requested output + /// (for layers inside the neural net, the ordering is (frame 0; frame 1 ...) + /// as this corresponds to the order you get when you sort a vector of Index). std::vector indexes; @@ -101,7 +105,7 @@ struct NnetChainSupervision { bool operator == (const NnetChainSupervision &other) const; }; -/// NnetChainExample is like NnetExample, but specialized for +/// NnetChainExample is like NnetExample, but specialized for /// lattice-free (chain) training. 
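// Illustrative sketch (not from the patch itself) of the 'indexes' ordering
// described in the comment above, for S sequences and T output frames, with
// x always 0:
//
//   std::vector<Index> indexes;
//   for (int32 t = 0; t < T; t++) {     // frame t of every sequence comes
//     for (int32 n = 0; n < S; n++) {   // before frame t+1 of any sequence.
//       Index index;
//       index.n = n; index.t = t; index.x = 0;
//       indexes.push_back(index);
//     }
//   }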
struct NnetChainExample { diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 1d149b6f193..2ec2699ec97 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -245,17 +245,15 @@ void NnetChainTrainer::TrainInternalBackstitch(const NnetChainExample &eg, // delta_nnet is scaled by 1 + backstitch_training_scale when added to nnet; max_change_scale = 1.0 + nnet_config.backstitch_training_scale; scale_adding = 1.0 + nnet_config.backstitch_training_scale; + // If relevant, add in the part of the gradient that comes from L2 + // regularization. It may not be optimally inefficient to do it on both + // passes of the backstitch, like we do here, but it probably minimizes + // any harmful interactions with the max-change. + ApplyL2Regularization(*nnet_, + 1.0 / scale_adding * GetNumNvalues(eg.inputs, false) * + nnet_config.l2_regularize_factor, delta_nnet_); } - // If relevant, add in the part of the gradient that comes from L2 - // regularization. It may not be optimally inefficient to do it on both - // passes of the backstitch, like we do here, but it probably minimizes - // any harmful interactions with the max-change. - ApplyL2Regularization(*nnet_, - scale_adding * GetNumNvalues(eg.inputs, false) * - nnet_config.l2_regularize_factor, - delta_nnet_); - // Updates the parameters of nnet UpdateNnetWithMaxChange(*delta_nnet_, nnet_config.max_param_change, max_change_scale, scale_adding, nnet_, diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index c73f3fb921d..2c76805f5cc 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -68,6 +68,8 @@ ComponentPrecomputedIndexes* ComponentPrecomputedIndexes::NewComponentPrecompute ans = new RestrictedAttentionComponent::PrecomputedIndexes(); } else if (cpi_type == "GeneralDropoutComponentPrecomputedIndexes") { ans = new GeneralDropoutComponentPrecomputedIndexes(); + } else if (cpi_type == "TdnnComponentPrecomputedIndexes") { + ans = new TdnnComponent::PrecomputedIndexes(); } if (ans != NULL) { KALDI_ASSERT(cpi_type == ans->Type()); @@ -134,6 +136,8 @@ Component* Component::NewComponentOfType(const std::string &component_type) { ans = new ElementwiseProductComponent(); } else if (component_type == "ConvolutionComponent") { ans = new ConvolutionComponent(); + } else if (component_type == "TdnnComponent") { + ans = new TdnnComponent(); } else if (component_type == "MaxpoolingComponent") { ans = new MaxpoolingComponent(); } else if (component_type == "PermuteComponent") { diff --git a/src/nnet3/nnet-component-test.cc b/src/nnet3/nnet-component-test.cc index 39bd156e360..618fa7c0c45 100644 --- a/src/nnet3/nnet-component-test.cc +++ b/src/nnet3/nnet-component-test.cc @@ -166,6 +166,45 @@ void TestNnetComponentUpdatable(Component *c) { } } + +/* + This function gets the 'ComponentPrecomputedIndexes*' pointer from + a component, given the num-rows in the matrix of inputs we're testing it + with. It uses a plausible arrangement of indexes. + + Note: in this file we primarily test simple components, and simple + components don't return precomputed indexes; but we also test a + few non-simple components that operate with the same set of indexes + on the input and the output. Simple components would return NULL. 
+ */ +ComponentPrecomputedIndexes *GetPrecomputedIndexes(const Component &c, + int32 num_rows) { + std::vector input_indexes(num_rows); + int32 num_t_values; + if (num_rows % 3 == 0) { num_t_values = 3; } + else if (num_rows % 2 == 0) { num_t_values = 2; } + else { num_t_values = 1; } + + for (int32 i = 0; i < num_rows; i++) { + input_indexes[i].n = i % num_t_values; + input_indexes[i].x = 0; + input_indexes[i].t = i / num_t_values; + } + std::vector output_indexes(input_indexes); + + if (c.Properties()&kReordersIndexes) { + c.ReorderIndexes(&input_indexes, &output_indexes); + } + MiscComputationInfo misc_info; + bool need_backprop = true; // just in case. + ComponentPrecomputedIndexes *ans = c.PrecomputeIndexes(misc_info, + input_indexes, + output_indexes, + need_backprop); + // ans will be NULL in most cases. + return ans; +} + // tests the properties kPropagateAdds, kBackpropAdds, // kBackpropNeedsInput, kBackpropNeedsOutput. void TestSimpleComponentPropagateProperties(const Component &c) { @@ -202,13 +241,14 @@ void TestSimpleComponentPropagateProperties(const Component &c) { } ResetSeed(rand_seed, c); - void *memo = c.Propagate(NULL, input_data, &output_data1); + ComponentPrecomputedIndexes *indexes = GetPrecomputedIndexes(c, num_rows); + void *memo = c.Propagate(indexes, input_data, &output_data1); ResetSeed(rand_seed, c); - c.DeleteMemo(c.Propagate(NULL, input_data, &output_data2)); + c.DeleteMemo(c.Propagate(indexes, input_data, &output_data2)); if (properties & kPropagateInPlace) { ResetSeed(rand_seed, c); - c.DeleteMemo(c.Propagate(NULL, output_data3, &output_data3)); + c.DeleteMemo(c.Propagate(indexes, output_data3, &output_data3)); if (!output_data1.ApproxEqual(output_data3)) { KALDI_ERR << "Test of kPropagateInPlace flag for component of type " << c.Type() << " failed."; @@ -230,7 +270,7 @@ void TestSimpleComponentPropagateProperties(const Component &c) { CuMatrix empty_mat; // test with input_deriv1 that's zero - c.Backprop("foobar", NULL, + c.Backprop("foobar", indexes, ((properties & kBackpropNeedsInput) ? input_data : empty_mat), ((properties & kBackpropNeedsOutput) ? output_data1 : empty_mat), output_deriv, @@ -238,7 +278,7 @@ void TestSimpleComponentPropagateProperties(const Component &c) { c_copy, &input_deriv1); // test with input_deriv2 that's all ones. - c.Backprop("foobar", NULL, + c.Backprop("foobar", indexes, ((properties & kBackpropNeedsInput) ? input_data : empty_mat), ((properties & kBackpropNeedsOutput) ? output_data1 : empty_mat), output_deriv, @@ -247,7 +287,7 @@ void TestSimpleComponentPropagateProperties(const Component &c) { &input_deriv2); // test backprop in place, if supported. if (properties & kBackpropInPlace) { - c.Backprop("foobar", NULL, + c.Backprop("foobar", indexes, ((properties & kBackpropNeedsInput) ? input_data : empty_mat), ((properties & kBackpropNeedsOutput) ? 
output_data1 : empty_mat), input_deriv3, @@ -263,6 +303,7 @@ void TestSimpleComponentPropagateProperties(const Component &c) { if (properties & kBackpropInPlace) AssertEqual(input_deriv1, input_deriv3); delete c_copy; + delete indexes; } bool TestSimpleComponentDataDerivative(const Component &c, @@ -284,11 +325,12 @@ bool TestSimpleComponentDataDerivative(const Component &c, output_deriv.SetRandn(); ResetSeed(rand_seed, c); - void *memo = c.Propagate(NULL, input_data, &output_data); + ComponentPrecomputedIndexes *indexes = GetPrecomputedIndexes(c, num_rows); + void *memo = c.Propagate(indexes, input_data, &output_data); CuMatrix input_deriv(num_rows, input_dim, kSetZero, input_stride_type), empty_mat; - c.Backprop("foobar", NULL, + c.Backprop("foobar", indexes, ((properties & kBackpropNeedsInput) ? input_data : empty_mat), ((properties & kBackpropNeedsOutput) ? output_data : empty_mat), output_deriv, memo, NULL, &input_deriv); @@ -311,7 +353,7 @@ bool TestSimpleComponentDataDerivative(const Component &c, perturbed_input_data.AddMat(1.0, input_data); ResetSeed(rand_seed, c); - c.DeleteMemo(c.Propagate(NULL, perturbed_input_data, &perturbed_output_data)); + c.DeleteMemo(c.Propagate(indexes, perturbed_input_data, &perturbed_output_data)); measured_objf_change(i) = TraceMatMat(output_deriv, perturbed_output_data, kTrans) - original_objf; } @@ -336,6 +378,7 @@ bool TestSimpleComponentDataDerivative(const Component &c, << "it is ClipGradientComponent."; return true; } + delete indexes; return ans; } @@ -366,7 +409,8 @@ bool TestSimpleComponentModelDerivative(const Component &c, input_data.SetRandn(); output_deriv.SetRandn(); - void *memo = c.Propagate(NULL, input_data, &output_data); + ComponentPrecomputedIndexes *indexes = GetPrecomputedIndexes(c, num_rows); + void *memo = c.Propagate(indexes, input_data, &output_data); BaseFloat original_objf = TraceMatMat(output_deriv, output_data, kTrans); @@ -383,7 +427,7 @@ bool TestSimpleComponentModelDerivative(const Component &c, CuMatrix input_deriv(num_rows, input_dim, kSetZero, input_stride_type), empty_mat; - c.Backprop("foobar", NULL, + c.Backprop("foobar", indexes, ((properties & kBackpropNeedsInput) ? input_data : empty_mat), ((properties & kBackpropNeedsOutput) ? output_data : empty_mat), output_deriv, memo, c_copy, @@ -393,7 +437,7 @@ bool TestSimpleComponentModelDerivative(const Component &c, if (!test_derivative) { // Just testing that the model update is downhill. CuMatrix new_output_data(num_rows, output_dim, kSetZero, output_stride_type); - c.DeleteMemo(c_copy->Propagate(NULL, input_data, &new_output_data)); + c.DeleteMemo(c_copy->Propagate(indexes, input_data, &new_output_data)); BaseFloat new_objf = TraceMatMat(output_deriv, new_output_data, kTrans); bool ans = (new_objf > original_objf); @@ -402,6 +446,7 @@ bool TestSimpleComponentModelDerivative(const Component &c, << new_objf << " <= " << original_objf; } delete c_copy; + delete indexes; return ans; } else { // check that the model derivative is accurate. 
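// Summary sketch (not from the patch itself) of what the model-derivative
// test continued in the next hunk checks, in the variable names used there:
// for a random parameter perturbation,
//   predicted_objf_change(i) = gradient . (perturbed_params - params)
//                            = uc_copy->DotProduct(*uc_perturbed) - uc_copy->DotProduct(*uc)
//   measured_objf_change(i)  = TraceMatMat(output_deriv, perturbed_output_data, kTrans)
//                              - original_objf
// and the test succeeds if the two vectors approximately agree.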
@@ -420,7 +465,7 @@ bool TestSimpleComponentModelDerivative(const Component &c, predicted_objf_change(i) = uc_copy->DotProduct(*uc_perturbed) - uc_copy->DotProduct(*uc); - c_perturbed->Propagate(NULL, input_data, &perturbed_output_data); + c_perturbed->Propagate(indexes, input_data, &perturbed_output_data); measured_objf_change(i) = TraceMatMat(output_deriv, perturbed_output_data, kTrans) - original_objf; delete c_perturbed; @@ -436,6 +481,7 @@ bool TestSimpleComponentModelDerivative(const Component &c, << c.Type() << ", input-dim=" << input_dim << ", output-dim=" << output_dim; delete c_copy; + delete indexes; return ans; } } diff --git a/src/nnet3/nnet-compute-test.cc b/src/nnet3/nnet-compute-test.cc index ae0da75ac40..b8e20e50d88 100644 --- a/src/nnet3/nnet-compute-test.cc +++ b/src/nnet3/nnet-compute-test.cc @@ -284,7 +284,7 @@ int main() { using namespace kaldi::nnet3; // uncommenting the following activates extra checks during optimization, that // can help narrow down the source of problems. - SetVerboseLevel(4); + //SetVerboseLevel(4); for (kaldi::int32 loop = 0; loop < 2; loop++) { diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc index cae6f41f5f2..7ee7d7df717 100644 --- a/src/nnet3/nnet-compute.cc +++ b/src/nnet3/nnet-compute.cc @@ -411,7 +411,8 @@ void NnetComputer::ExecuteCommand() { matrices_[m].NumRows() == 0); matrices_[m].Resize(compressed_matrix->NumRows(), compressed_matrix->NumCols(), - kUndefined); + kUndefined, + computation_.matrices[m].stride_type); compressed_matrix->CopyToMat(&(matrices_[m])); delete compressed_matrix; compressed_matrices_[m] = NULL; diff --git a/src/nnet3/nnet-convolutional-component.cc b/src/nnet3/nnet-convolutional-component.cc index bea3b9d31d5..f48a3968c88 100644 --- a/src/nnet3/nnet-convolutional-component.cc +++ b/src/nnet3/nnet-convolutional-component.cc @@ -79,8 +79,7 @@ std::string TimeHeightConvolutionComponent::Info() const { << preconditioner_in_.GetNumMinibatchesHistory() << ", rank-in=" << preconditioner_in_.GetRank() << ", rank-out=" << preconditioner_out_.GetRank() - << ", alpha-in=" << preconditioner_in_.GetAlpha() - << ", alpha-out=" << preconditioner_out_.GetAlpha(); + << ", alpha=" << preconditioner_in_.GetAlpha(); } return stream.str(); } diff --git a/src/nnet3/nnet-convolutional-component.h b/src/nnet3/nnet-convolutional-component.h index 35cf0de11c9..e107962abc2 100644 --- a/src/nnet3/nnet-convolutional-component.h +++ b/src/nnet3/nnet-convolutional-component.h @@ -154,7 +154,7 @@ namespace nnet3 { num-rows of the parameter matrix. [note: I'm considering decreasing this default to e.g. 40 or 20]. num-minibatches-history - This is used setting the 'num_samples_history_in' + This is used setting the 'num_samples_history' configuration value of the natural gradient object. There is no concept of samples (frames) in the application of natural gradient to the convnet, because @@ -372,6 +372,263 @@ class TimeHeightConvolutionComponent: public UpdatableComponent { +/** + TdnnComponent is a more memory-efficient alternative to manually splicing + several frames of input and then using a NaturalGradientAffineComponent or + a LinearComponent. It does the splicing of the input itself, using + mechanisms similar to what TimeHeightConvolutionComponent uses. 
The + implementation is in nnet-tdnn-component.cc + + Parameters inherited from UpdatableComponent (see comment above declaration of + UpdadableComponent in nnet-component-itf.h for details): + learning-rate, learning-rate-factor, max-change + + Important parameters: + + input-dim The input feature dimension (before splicing). + + output-dim The output feature dimension + + time-offsets E.g. time-offsets=-1,0,1 or time-offsets=-3,0,3. + The time offsets that we require at the input to produce a given output. + comparable to the offsets used in TDNNs. They + must be unique (no repeats). + use-bias Defaults to true, but set to false if you want this to + be linear rather than affine in its input. + + + Extra parameters: + orthonormal-constraint=0.0 If you set this to 1.0, then the linear_params_ + matrix will be (approximately) constrained during training + to have orthonormal rows (or columns, whichever is + fewer).. it turns out the real name for this is a + "semi-orthogonal" matrix. You can choose a positive + nonzero value different than 1.0 to have a scaled + semi-orthgonal matrix, i.e. with singular values at the + selected value (e.g. 0.5, or 2.0). This is not enforced + inside the component itself; you have to call + ConstrainOrthonormal() from the training code to do this. + All this component does is return the + OrthonormalConstraint() value. If you set this to a + negative value, it's like saying "for any value", i.e. it + will constrain the parameter matrix to be closer to "any + alpha" times a semi-orthogonal matrix, without changing + its overall norm. + + + Initialization parameters: + param-stddev Standard deviation of the linear parameters of the + convolution. Defaults to + sqrt(1.0 / (input-dim * the number of time-offsets)) + bias-stddev Standard deviation of bias terms. default=0.0. + You should not set this if you set use-bias=false. + + + Natural-gradient related options are below; you won't normally have to + set these as the defaults are reasonable. + + use-natural-gradient e.g. use-natural-gradient=false (defaults to true). + You can set this to false to disable the natural gradient + updates (you won't normally want to do this). + rank-out Rank used in low-rank-plus-unit estimate of the Fisher-matrix + factor that has the dimension (num-rows of linear_params_), + which equals output_dim. It + defaults to the minimum of 80, or half of the output dim. + rank-in Rank used in low-rank-plus-unit estimate of the Fisher + matrix factor which has the dimension (num-cols of the + parameter matrix), which is input-dim times the number of + time offsets. It defaults to the minimum of 20, or half the + num-rows of the parameter matrix. + num-samples-history + This becomes the 'num_samples_history' + configuration value of the natural gradient objects. The + default value is 2000.0. + + */ +class TdnnComponent: public UpdatableComponent { + public: + + // The use of this constructor should only precede InitFromConfig() + TdnnComponent(); + + // Copy constructor + TdnnComponent(const TdnnComponent &other); + + virtual int32 InputDim() const { + return linear_params_.NumCols() / static_cast(time_offsets_.size()); + } + virtual int32 OutputDim() const { return linear_params_.NumRows(); } + + virtual std::string Info() const; + virtual void InitFromConfig(ConfigLine *cfl); + virtual std::string Type() const { return "TdnnComponent"; } + virtual int32 Properties() const { + return kUpdatableComponent|kReordersIndexes|kBackpropAdds| + (bias_params_.Dim() == 0 ? 
kPropagateAdds : 0)| + kBackpropNeedsInput; + } + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const; + + virtual void Read(std::istream &is, bool binary); + virtual void Write(std::ostream &os, bool binary) const; + virtual Component* Copy() const { + return new TdnnComponent(*this); + } + + + // Some functions that are only to be reimplemented for GeneralComponents. + + // This ReorderIndexes function may insert 'blank' indexes (indexes with + // t == kNoTime) as well as reordering the indexes. This is allowed + // behavior of ReorderIndexes functions. + virtual void ReorderIndexes(std::vector *input_indexes, + std::vector *output_indexes) const; + + virtual void GetInputIndexes(const MiscComputationInfo &misc_info, + const Index &output_index, + std::vector *desired_indexes) const; + + // This function returns true if at least one of the input indexes used to + // compute this output index is computable. + virtual bool IsComputable(const MiscComputationInfo &misc_info, + const Index &output_index, + const IndexSet &input_index_set, + std::vector *used_inputs) const; + + virtual ComponentPrecomputedIndexes* PrecomputeIndexes( + const MiscComputationInfo &misc_info, + const std::vector &input_indexes, + const std::vector &output_indexes, + bool need_backprop) const; + + // Some functions from base-class UpdatableComponent. + virtual void Scale(BaseFloat scale); + virtual void Add(BaseFloat alpha, const Component &other); + virtual void PerturbParams(BaseFloat stddev); + virtual BaseFloat DotProduct(const UpdatableComponent &other) const; + virtual int32 NumParameters() const; + virtual void Vectorize(VectorBase *params) const; + virtual void UnVectorize(const VectorBase ¶ms); + virtual void FreezeNaturalGradient(bool freeze); + + + class PrecomputedIndexes: public ComponentPrecomputedIndexes { + public: + PrecomputedIndexes() { } + PrecomputedIndexes(const PrecomputedIndexes &other): + row_stride(other.row_stride), row_offsets(other.row_offsets) { } + virtual PrecomputedIndexes *Copy() const; + virtual void Write(std::ostream &os, bool binary) const; + virtual void Read(std::istream &os, bool binary); + virtual std::string Type() const { + return "TdnnComponentPrecomputedIndexes"; + } + virtual ~PrecomputedIndexes() { } + + + // input_row_stride is the stride (in number of rows) we have to take in the + // input matrix each time we form a sub-matrix that will be part of the + // input to the tdnn operation. Normally this will be 1, but it may be, + // for example, 3 in layers where we do subsampling. + int32 row_stride; + + // 'row_offsets' is of the same dimension as time_offsets_. Each element + // describes the row offset (in the input matrix) of a sub-matrix, and each. + // We will append together these sub-matrices (row-wise) to be the input to + // the affine or linear transform. + std::vector row_offsets; + }; + + CuMatrixBase &LinearParams() { return linear_params_; } + + // This allows you to resize the vector in order to add a bias where + // there previously was none-- obviously this should be done carefully. 
+ CuVector &BiasParams() { return bias_params_; } + + BaseFloat OrthonormalConstraint() const { return orthonormal_constraint_; } + private: + + // This static function is a utility function that extracts a CuSubMatrix + // representing a subset of rows of 'input_matrix'. + // The numpy syntax would be: + // return input_matrix[row_offset:row_stride:num_output_rows*row_stride,:] + static CuSubMatrix GetInputPart( + const CuMatrixBase &input_matrix, + int32 num_output_rows, + int32 row_stride, + int32 row_offset); + + // see the definition for more explanation. + static void ModifyComputationIo(time_height_convolution::ConvolutionComputationIo *io); + + void Check() const; + + // Function that updates linear_params_, and bias_params_ if present, which + // uses the natural gradient code. + void UpdateNaturalGradient( + const PrecomputedIndexes &indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &out_deriv); + + // Function that updates linear_params_, and bias_params_ if present, which + // does not use the natural gradient code. + void UpdateSimple( + const PrecomputedIndexes &indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &out_deriv); + + + + + // time_offsets_ is the list of time-offsets of the input that + // we append together; it will typically be (-1,0,1) or (-3,0,3). + std::vector time_offsets_; + + // the linear parameters of the network; its NumRows() is the output + // dim, and its NumCols() equals the input dim times time_offsets_.size(). + CuMatrix linear_params_; + + // the bias parameters if this is an affine transform, or the empty vector if + // this is a linear operation (i.e. use-bias == false in the config). + CuVector bias_params_; + + // If nonzero, this controls how we apply an orthonormal constraint to the + // parameter matrix; see docs for ConstrainOrthonormal() in nnet-utils.h. + // This class just returns the value via the OrthonormalConstraint() function; + // it doesn't actually do anything with it directly. + BaseFloat orthonormal_constraint_; + + // Controls whether or not the natural-gradient is used. Note: even if this + // is true, if is_gradient_ (from the UpdatableComponent base class) is true, + // we'll do the 'simple' update that doesn't include natural gradient. + bool use_natural_gradient_; + + // Preconditioner for the input space, of dimension linear_params_.NumCols() + + // 1 (the 1 is for the bias). As with other natural-gradient objects, it's + // not stored with the model on disk but is reinitialized each time we start + // up. + OnlineNaturalGradient preconditioner_in_; + + // Preconditioner for the output space, of dimension + // linear_params_.NumRows(). 
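// Illustrative sketch (hypothetical values, not from the patch itself) of the
// row selection that GetInputPart() and row_offsets/row_stride above describe:
// for one time-offset, the sub-matrix is every row_stride-th row of the input
// starting at row_offset.
//
//   int32 num_output_rows = 4, row_stride = 3, row_offset = 1;
//   for (int32 i = 0; i < num_output_rows; i++) {
//     int32 input_row = row_offset + i * row_stride;   // rows 1, 4, 7, 10
//     // input row 'input_row' feeds output row i for this time-offset.
//   }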
+ OnlineNaturalGradient preconditioner_out_; +}; + + + + + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-descriptor.cc b/src/nnet3/nnet-descriptor.cc index fb3d152dc2e..78fedc3b00c 100644 --- a/src/nnet3/nnet-descriptor.cc +++ b/src/nnet3/nnet-descriptor.cc @@ -499,7 +499,7 @@ bool Descriptor::Parse(const std::vector &node_names, } if (**next_token != "end of input") KALDI_ERR << "Parsing Descriptor, expected end of input but got " - << "'" << *next_token << "'"; + << "'" << **next_token << "'"; Descriptor *desc = gen_desc->ConvertToDescriptor(); *this = *desc; delete desc; diff --git a/src/nnet3/nnet-discriminative-example.cc b/src/nnet3/nnet-discriminative-example.cc index 29eb65c30b1..b0ba56d1e35 100644 --- a/src/nnet3/nnet-discriminative-example.cc +++ b/src/nnet3/nnet-discriminative-example.cc @@ -197,7 +197,7 @@ void MergeSupervision( for (int32 n = 0; n < num_inputs; n++) input_supervision.push_back(&(inputs[n]->supervision)); discriminative::DiscriminativeSupervision output_supervision; - discriminative::AppendSupervision(input_supervision, + discriminative::MergeSupervision(input_supervision, &output_supervision); output->supervision.Swap(&(output_supervision)); diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index 07112c9d873..cc5fe3cc050 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -300,6 +300,9 @@ void RoundUpNumFrames(int32 frame_subsampling_factor, } void ExampleGenerationConfig::ComputeDerived() { + if (num_frames_str == "-1") { + return; + } if (!SplitStringToIntegers(num_frames_str, ",", false, &num_frames) || num_frames.empty()) { KALDI_ERR << "Invalid option (expected comma-separated list of integers): " @@ -341,11 +344,13 @@ UtteranceSplitter::UtteranceSplitter(const ExampleGenerationConfig &config): total_num_utterances_(0), total_input_frames_(0), total_frames_overlap_(0), total_num_chunks_(0), total_frames_in_chunks_(0) { - if (config.num_frames.empty()) { - KALDI_ERR << "You need to call ComputeDerived() on the " + if (config.num_frames_str != "-1") { + if (config.num_frames.empty()) { + KALDI_ERR << "You need to call ComputeDerived() on the " "ExampleGenerationConfig()."; + } + InitSplitForLength(); } - InitSplitForLength(); } UtteranceSplitter::~UtteranceSplitter() { @@ -377,6 +382,7 @@ UtteranceSplitter::~UtteranceSplitter() { KALDI_LOG << "Output frames are distributed among chunk-sizes as follows: " << os.str(); } + } float UtteranceSplitter::DefaultDurationOfSplit( @@ -816,23 +822,35 @@ void UtteranceSplitter::GetGapSizes(int32 utterance_length, void UtteranceSplitter::GetChunksForUtterance( int32 utterance_length, std::vector *chunk_info) { - std::vector chunk_sizes; - GetChunkSizesForUtterance(utterance_length, &chunk_sizes); - std::vector gaps(chunk_sizes.size()); - GetGapSizes(utterance_length, true, chunk_sizes, &gaps); - int32 num_chunks = chunk_sizes.size(); - chunk_info->resize(num_chunks); int32 t = 0; - for (int32 i = 0; i < num_chunks; i++) { - t += gaps[i]; - ChunkTimeInfo &info = (*chunk_info)[i]; - info.first_frame = t; - info.num_frames = chunk_sizes[i]; - info.left_context = (i == 0 && config_.left_context_initial >= 0 ? - config_.left_context_initial : config_.left_context); - info.right_context = (i == num_chunks - 1 && config_.right_context_final >= 0 ? 
- config_.right_context_final : config_.right_context); - t += chunk_sizes[i]; + if (config_.num_frames_str == "-1") { + ChunkTimeInfo info; + info.first_frame = 0; + info.num_frames = utterance_length; + info.left_context = (config_.left_context_initial >= 0 ? + config_.left_context_initial : config_.left_context); + info.right_context = (config_.right_context_final >= 0 ? + config_.right_context_final : config_.right_context); + chunk_info->push_back(info); + } else { + std::vector chunk_sizes; + GetChunkSizesForUtterance(utterance_length, &chunk_sizes); + std::vector gaps(chunk_sizes.size()); + GetGapSizes(utterance_length, true, chunk_sizes, &gaps); + int32 num_chunks = chunk_sizes.size(); + chunk_info->resize(num_chunks); + for (int32 i = 0; i < num_chunks; i++) { + t += gaps[i]; + ChunkTimeInfo &info = (*chunk_info)[i]; + info.first_frame = t; + info.num_frames = chunk_sizes[i]; + info.left_context = (i == 0 && config_.left_context_initial >= 0 ? + config_.left_context_initial : config_.left_context); + info.right_context = (i == num_chunks - 1 && config_.right_context_final >= 0 ? + config_.right_context_final : config_.right_context); + t += chunk_sizes[i]; + } } SetOutputWeights(utterance_length, chunk_info); AccStatsForUtterance(utterance_length, *chunk_info); diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h index c93d0dd2c81..52b2ebbf904 100644 --- a/src/nnet3/nnet-example-utils.h +++ b/src/nnet3/nnet-example-utils.h @@ -131,7 +131,8 @@ struct ExampleGenerationConfig { "that most of the time the number of frames will be 40, but to " "deal with odd-sized inputs we may also generate egs with these " "other sizes. All these values will be rounded up to the " - "closest multiple of --frame-subsampling-factor."); + "closest multiple of --frame-subsampling-factor. As a special case, " + "--num-frames=-1 means 'don't do any splitting'."); po->Register("num-frames-overlap", &num_frames_overlap, "Number of frames of " "overlap between adjacent eamples (applies to chunks of size " "equal to the primary [first-listed] --num-frames value... " @@ -149,7 +150,6 @@ struct ExampleGenerationConfig { struct ChunkTimeInfo is used by class UtteranceSplitter to output information about how we split an utterance into chunks.
*/ - struct ChunkTimeInfo { int32 first_frame; int32 num_frames; diff --git a/src/nnet3/nnet-nnet.cc b/src/nnet3/nnet-nnet.cc index f281d2294a3..334234f53db 100644 --- a/src/nnet3/nnet-nnet.cc +++ b/src/nnet3/nnet-nnet.cc @@ -279,7 +279,7 @@ void Nnet::ProcessComponentConfigLine( } if (config->HasUnusedValues()) KALDI_ERR << "Unused values '" << config->UnusedValues() - << " in config line: " << config->WholeLine(); + << "' in config line: " << config->WholeLine(); } diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index e587c7ff947..2a6523ddf06 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -696,18 +696,15 @@ void RenumberComputation(NnetComputation *computation) { } +static bool IsNoop(const NnetComputation::Command &command) { + return command.command_type == kNoOperation; +} + void RemoveNoOps(NnetComputation *computation) { - std::vector::iterator - input_iter = computation->commands.begin(), - input_end = computation->commands.end(), - output_iter = computation->commands.begin(); - for (; input_iter != input_end; ++input_iter) { - if (input_iter->command_type != kNoOperation) { - *output_iter = *input_iter; - ++output_iter; - } - } - computation->commands.resize(output_iter - computation->commands.begin()); + computation->commands.erase( + std::remove_if(computation->commands.begin(), + computation->commands.end(), + IsNoop), computation->commands.end()); } @@ -4695,7 +4692,7 @@ class MemoryCompressionOptimizer { /** @param [in] nnet The neural net the computation is for. @param [in] memory_compression_level. The level of compression: - 0 = no compression (the constructor should not be calle with this value). + 0 = no compression (the constructor should not be called with this value). 1 = compression that doesn't affect the results (but still takes time). 2 = compression that affects the results only very slightly 3 = compression that affects the results a little more. 
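The rewritten RemoveNoOps above is the standard erase-remove idiom. A minimal
self-contained illustration (made-up values, not from the patch itself):

  #include <algorithm>
  #include <cassert>
  #include <vector>

  int main() {
    std::vector<int> commands = {1, 0, 2, 0, 3};   // let 0 stand in for kNoOperation
    commands.erase(std::remove_if(commands.begin(), commands.end(),
                                  [](int c) { return c == 0; }),
                   commands.end());
    assert((commands == std::vector<int>{1, 2, 3}));  // relative order preserved
    return 0;
  }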
diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index 63a7e833c74..b0eaa4916ae 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -21,6 +21,7 @@ #include #include "nnet3/nnet-optimize.h" #include "nnet3/nnet-optimize-utils.h" +#include "nnet3/nnet-utils.h" #include "base/timer.h" namespace kaldi { @@ -638,7 +639,8 @@ CachingOptimizingCompiler::CachingOptimizingCompiler( seconds_taken_total_(0.0), seconds_taken_compile_(0.0), seconds_taken_optimize_(0.0), seconds_taken_expand_(0.0), seconds_taken_check_(0.0), seconds_taken_indexes_(0.0), - seconds_taken_io_(0.0), cache_(config.cache_capacity) { } + seconds_taken_io_(0.0), cache_(config.cache_capacity), + nnet_left_context_(-1), nnet_right_context_(-1) { } CachingOptimizingCompiler::CachingOptimizingCompiler( const Nnet &nnet, @@ -648,8 +650,18 @@ CachingOptimizingCompiler::CachingOptimizingCompiler( seconds_taken_total_(0.0), seconds_taken_compile_(0.0), seconds_taken_optimize_(0.0), seconds_taken_expand_(0.0), seconds_taken_check_(0.0), seconds_taken_indexes_(0.0), - seconds_taken_io_(0.0), cache_(config.cache_capacity) { } + seconds_taken_io_(0.0), cache_(config.cache_capacity), + nnet_left_context_(-1), nnet_right_context_(-1) { } +void CachingOptimizingCompiler::GetSimpleNnetContext( + int32 *nnet_left_context, int32 *nnet_right_context) { + if (nnet_left_context_ == -1) { + ComputeSimpleNnetContext(nnet_, &nnet_left_context_, + &nnet_right_context_); + } + *nnet_left_context = nnet_left_context_; + *nnet_right_context = nnet_right_context_; +} void CachingOptimizingCompiler::ReadCache(std::istream &is, bool binary) { { diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index 78763732469..0804729519d 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -242,6 +242,16 @@ class CachingOptimizingCompiler { void ReadCache(std::istream &is, bool binary); void WriteCache(std::ostream &os, bool binary); + + // GetSimpleNnetContext() is equivalent to calling: + // ComputeSimpleNnetContext(nnet_, &nnet_left_context, + // &nnet_right_context) + // but it caches it inside this class. This functionality is independent of + // the rest of the functionality of this class; it just happens to be a + // convenient place to put this mechanism. + void GetSimpleNnetContext(int32 *nnet_left_context, + int32 *nnet_right_context); + private: // This function just implements the work of Compile(); it's made a separate @@ -290,6 +300,10 @@ class CachingOptimizingCompiler { double seconds_taken_io_; ComputationCache cache_; + + // These following two variables are only used by the function GetSimpleNnetContext(). + int32 nnet_left_context_; + int32 nnet_right_context_; }; diff --git a/src/nnet3/nnet-parse.cc b/src/nnet3/nnet-parse.cc index bb3a209460a..c920790fd6f 100644 --- a/src/nnet3/nnet-parse.cc +++ b/src/nnet3/nnet-parse.cc @@ -481,7 +481,7 @@ static void PrintFloatSuccinctly(std::ostream &os, BaseFloat f) { // Returns a string that summarizes a vector fairly succintly, for // printing stats in info lines. 
-std::string SummarizeVector(const VectorBase &vec) { +std::string SummarizeVector(const VectorBase &vec) { std::ostringstream os; if (vec.Dim() < 10) { os << "[ "; diff --git a/src/nnet3/nnet-parse.h b/src/nnet3/nnet-parse.h index 0b2e0041aaa..a073a54f7e0 100644 --- a/src/nnet3/nnet-parse.h +++ b/src/nnet3/nnet-parse.h @@ -194,7 +194,7 @@ std::string ErrorContext(const std::string &str); "[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.001,0.003,0.003,0.004 \ 0.005,0.01,0.07,0.11,0.14 0.18,0.24,0.29,0.39), mean=0.0745, stddev=0.0611]" */ -std::string SummarizeVector(const VectorBase &vec); +std::string SummarizeVector(const VectorBase &vec); std::string SummarizeVector(const VectorBase &vec); diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 4eb078c0fcb..69f8442a08a 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -448,8 +448,84 @@ void NoOpComponent::Backprop(const std::string &debug_info, // to "this" or different. CuMatrixBase *in_deriv) const { in_deriv->CopyFromMat(out_deriv); + if (backprop_scale_ != 1.0) + in_deriv->Scale(backprop_scale_); } +void NoOpComponent::InitFromConfig(ConfigLine *cfl) { + backprop_scale_ = 1.0; + cfl->GetValue("backprop-scale", &backprop_scale_); + if (!cfl->GetValue("dim", &dim_) || + dim_ <= 0 || cfl->HasUnusedValues()) { + KALDI_ERR << "Invalid initializer for layer of type " + << Type() << ": \"" << cfl->WholeLine() << "\""; + } +} + +std::string NoOpComponent::Info() const { + std::ostringstream stream; + stream << Type() << ", dim=" << dim_; + if (backprop_scale_ != 1.0) + stream << ", backprop-scale=" << backprop_scale_; + return stream.str(); +} + +void NoOpComponent::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, backprop_scale_); + WriteToken(os, binary, ""); +} + +void NoOpComponent::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &dim_); + + if (PeekToken(is, binary) == 'V') { + // This is the old format, from when NoOpComponent inherited from + // NonlinearComponent. + backprop_scale_ = 1.0; + ExpectToken(is, binary, ""); + CuVector temp_vec; + temp_vec.Read(is, binary); + ExpectToken(is, binary, ""); + temp_vec.Read(is, binary); + ExpectToken(is, binary, ""); + BaseFloat temp_float; + ReadBasicType(is, binary, &temp_float); + if (PeekToken(is, binary) == 'O') { + ExpectToken(is, binary, ""); + temp_vec.Read(is, binary); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &temp_float); + } + std::string token; + ReadToken(is, binary, &token); + if (token[0] != '<') { + // this should happen only rarely, in case we couldn't push back the + // '<' to the stream in PeekToken(). + token = '<' + token; + } + if (token == "") { + ReadBasicType(is, binary, &temp_float); + ReadToken(is, binary, &token); + } + if (token == "") { + ReadBasicType(is, binary, &temp_float); + ReadToken(is, binary, &token); + } + KALDI_ASSERT(token == ""); + return; + } else { + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &backprop_scale_); + ExpectToken(is, binary, ""); + } +} + + void ClipGradientComponent::Read(std::istream &is, bool binary) { // might not see the "" part because // of how ReadNew() works. @@ -2803,7 +2879,7 @@ void NaturalGradientAffineComponent::InitFromConfig(ConfigLine *cfl) { // Set natural-gradient configs. 
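// Note on the defaults set just below (worked example, not from the patch
// itself): rank-in and rank-out now default to -1 and are then replaced by
//   rank_in  = std::min(20, (InputDim()  + 1) / 2);
//   rank_out = std::min(80, (OutputDim() + 1) / 2);
// so, e.g., a small layer with input-dim=10 and output-dim=16 gets rank-in=5
// and rank-out=8 instead of the old fixed defaults of 20 and 80.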
BaseFloat num_samples_history = 2000.0, alpha = 4.0; - int32 rank_in = 20, rank_out = 80, + int32 rank_in = -1, rank_out = -1, update_period = 4; cfl->GetValue("num-samples-history", &num_samples_history); cfl->GetValue("alpha", &alpha); @@ -2811,6 +2887,11 @@ void NaturalGradientAffineComponent::InitFromConfig(ConfigLine *cfl) { cfl->GetValue("rank-out", &rank_out); cfl->GetValue("update-period", &update_period); + if (rank_in < 0) + rank_in = std::min(20, (InputDim() + 1) / 2); + if (rank_out < 0) + rank_out = std::min(80, (OutputDim() + 1) / 2); + preconditioner_in_.SetNumSamplesHistory(num_samples_history); preconditioner_out_.SetNumSamplesHistory(num_samples_history); preconditioner_in_.SetAlpha(alpha); @@ -3012,7 +3093,7 @@ void LinearComponent::InitFromConfig(ConfigLine *cfl) { params_.Scale(param_stddev); } // Read various natural-gradient-related configs. - int32 rank_in = 20, rank_out = 80, update_period = 4; + int32 rank_in = -1, rank_out = -1, update_period = 4; BaseFloat alpha = 4.0, num_samples_history = 2000.0; @@ -3025,6 +3106,11 @@ void LinearComponent::InitFromConfig(ConfigLine *cfl) { cfl->GetValue("update-period", &update_period); cfl->GetValue("use-natural-gradient", &use_natural_gradient_); + if (rank_in < 0) + rank_in = std::min(20, (InputDim() + 1) / 2); + if (rank_out < 0) + rank_out = std::min(80, (OutputDim() + 1) / 2); + preconditioner_in_.SetAlpha(alpha); preconditioner_out_.SetAlpha(alpha); preconditioner_in_.SetRank(rank_in); @@ -5861,5 +5947,6 @@ void SumBlockComponent::Backprop( } + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index 3929c253aab..12ae99d716b 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -381,16 +381,35 @@ class FixedScaleComponent; class PerElementScaleComponent; class PerElementOffsetComponent; -// Affine means a linear function plus an offset. -// Note: although this class can be instantiated, it also -// functions as a base-class for more specialized versions of -// AffineComponent. +/* + Affine means a linear function plus an offset. + Note: although this class can be instantiated, it also + functions as a base-class for more specialized versions of + AffineComponent. + + Parameters accepted on the config line, with default if applicable: + + matrix If specified, a filename containing the parameters of the class as + a single matrix containing the linear_params, plus the bias_params + as the last column + + input-dim The input dimension of the component + output-dim The output dimension of the component + param-stddev=1/sqrt(input-dim) The standard deviation of the elements of the linear parameters + (they will have a Gaussian distribution with this standard deviation). + bias-stddev=1.0 The standard deviation of the elements of the bias parameters + + orthonormal-constraint=0.0 Can be used to constrain the linear parameter matrix + to be semi-orthogonal, see ConstraintOrhonormal() in nnet-utils.h, + and http://www.danielpovey.com/files/2018_interspeech_tdnnf.pdf. 
+*/ class AffineComponent: public UpdatableComponent { public: virtual int32 InputDim() const { return linear_params_.NumCols(); } virtual int32 OutputDim() const { return linear_params_.NumRows(); } BaseFloat OrthonormalConstraint() const { return orthonormal_constraint_; } + virtual std::string Info() const; virtual void InitFromConfig(ConfigLine *cfl); @@ -434,6 +453,7 @@ class AffineComponent: public UpdatableComponent { virtual void SetParams(const CuVectorBase &bias, const CuMatrixBase &linear); const CuVector &BiasParams() const { return bias_params_; } + CuVector &BiasParams() { return bias_params_; } const CuMatrix &LinearParams() const { return linear_params_; } CuMatrix &LinearParams() { return linear_params_; } explicit AffineComponent(const AffineComponent &other); @@ -468,6 +488,8 @@ class AffineComponent: public UpdatableComponent { const AffineComponent &operator = (const AffineComponent &other); // Disallow. CuMatrix linear_params_; CuVector bias_params_; + // see documentation at the top of this class for more information on the + // following. BaseFloat orthonormal_constraint_; }; @@ -1145,18 +1167,30 @@ class FixedBiasComponent: public Component { KALDI_DISALLOW_COPY_AND_ASSIGN(FixedBiasComponent); }; -/** NoOpComponent just duplicates its input. We don't anticipate this being used - very often, but it may sometimes make your life easier - The only config parameter it accepts is 'dim', e.g. 'dim=400'. +/** + NoOpComponent just duplicates its input. We don't anticipate this being used + very often, but it may sometimes make your life easier. Config parameters: + + dim E.g. dim=1024. Required. + backprop-scale Defaults to 1.0. May be set to a different value to scale + the derivatives being backpropagated. */ -class NoOpComponent: public NonlinearComponent { +class NoOpComponent: public Component { public: - explicit NoOpComponent(const NoOpComponent &other): NonlinearComponent(other) { } + explicit NoOpComponent(const NoOpComponent &other): + dim_(other.dim_), backprop_scale_(other.backprop_scale_) { } NoOpComponent() { } virtual std::string Type() const { return "NoOpComponent"; } virtual int32 Properties() const { - return kSimpleComponent|kPropagateInPlace; + return kSimpleComponent|kPropagateInPlace|kBackpropInPlace; } + virtual int32 InputDim() const { return dim_; } + virtual int32 OutputDim() const { return dim_; } + virtual Component *Copy() { return new NoOpComponent(*this); } + virtual void InitFromConfig(ConfigLine *cfl); + virtual void Read(std::istream &is, bool binary); + virtual void Write(std::ostream &os, bool binary) const; + virtual std::string Info() const; virtual Component* Copy() const { return new NoOpComponent(*this); } virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, @@ -1170,6 +1204,9 @@ class NoOpComponent: public NonlinearComponent { Component *to_update, CuMatrixBase *in_deriv) const; private: + int32 dim_; + BaseFloat backprop_scale_; + NoOpComponent &operator = (const NoOpComponent &other); // Disallow. }; @@ -1386,6 +1423,11 @@ class ClipGradientComponent: public Component { for each feature/activation dimension i: output(row, i) = input(row, column_map_[i]). + The only config value it accepts is 'column-map', e.g.: + column-map=0,10,1,11,...,9,19 + ... which should be a permutation of a contiguous block of integers + starting with 0 (i.e. something like '3,2,1,0' but not '0,4' or '0,0,2'). + See the equation above for how it is used. 
*/ class PermuteComponent: public Component { public: @@ -1555,6 +1597,12 @@ class PerElementScaleComponent: public UpdatableComponent { which does not support natural gradient directly-- in that case you have to use NaturalGradientPerElementScaleComponent if you want to use natural gradient update. + + Values inherited from UpdatableComponent (see its declaration in + nnet-component-itf for details): + learning-rate + learning-rate-factor + max-change */ class PerElementOffsetComponent: public UpdatableComponent { public: diff --git a/src/nnet3/nnet-tdnn-component.cc b/src/nnet3/nnet-tdnn-component.cc new file mode 100644 index 00000000000..52ad1031a4c --- /dev/null +++ b/src/nnet3/nnet-tdnn-component.cc @@ -0,0 +1,699 @@ +// nnet3/nnet-tdnn-component.h + +// Copyright 2017 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +// Note: the code defined here was declared in nnet-convolutional-component.h. + +#include +#include +#include +#include "nnet3/nnet-convolutional-component.h" +#include "nnet3/nnet-computation-graph.h" +#include "nnet3/nnet-parse.h" + +namespace kaldi { +namespace nnet3 { + + +TdnnComponent::TdnnComponent(): + orthonormal_constraint_(0.0), + use_natural_gradient_(true) { } + + +TdnnComponent::TdnnComponent( + const TdnnComponent &other): + UpdatableComponent(other), // initialize base-class + time_offsets_(other.time_offsets_), + linear_params_(other.linear_params_), + bias_params_(other.bias_params_), + orthonormal_constraint_(other.orthonormal_constraint_), + use_natural_gradient_(other.use_natural_gradient_), + preconditioner_in_(other.preconditioner_in_), + preconditioner_out_(other.preconditioner_out_) { + Check(); +} + + +void TdnnComponent::Check() const { + KALDI_ASSERT(linear_params_.NumRows() > 0 && + !time_offsets_.empty() && + std::set(time_offsets_.begin(), + time_offsets_.end()).size() == + time_offsets_.size() && + linear_params_.NumCols() % time_offsets_.size() == 0 && + (bias_params_.Dim() == 0 || + bias_params_.Dim() == linear_params_.NumRows())); +} + +std::string TdnnComponent::Info() const { + std::ostringstream stream; + stream << UpdatableComponent::Info(); + if (orthonormal_constraint_ != 0.0) + stream << ", orthonormal-constraint=" << orthonormal_constraint_; + stream << ", time-offsets="; + for (size_t i = 0; i < time_offsets_.size(); i++) { + if (i != 0) stream << ','; + stream << time_offsets_[i]; + } + PrintParameterStats(stream, "linear-params", linear_params_, + false, // include_mean + true, // include_row_norms + true, // include_column_norms + GetVerboseLevel() >= 2); // include_singular_values + if (bias_params_.Dim() == 0) { + stream << ", has-bias=false"; + } else { + PrintParameterStats(stream, "bias", bias_params_, true); + } + if (!use_natural_gradient_) { + stream << ", use-natural-gradient=false"; + } else { + stream << ", rank-in=" << 
preconditioner_in_.GetRank() + << ", rank-out=" << preconditioner_out_.GetRank() + << ", num-samples-history=" << preconditioner_in_.GetNumSamplesHistory() + << ", update-period=" << preconditioner_in_.GetUpdatePeriod() + << ", alpha-in=" << preconditioner_in_.GetAlpha() + << ", alpha-out=" << preconditioner_out_.GetAlpha(); + } + return stream.str(); +} + + +void TdnnComponent::InitFromConfig(ConfigLine *cfl) { + // 1. Config values inherited from UpdatableComponent. + InitLearningRatesFromConfig(cfl); + + // 2. Structural config values + std::string time_offsets; + + int32 input_dim = -1, output_dim = -1; + + bool ok = cfl->GetValue("time-offsets", &time_offsets) && + cfl->GetValue("input-dim", &input_dim) && + cfl->GetValue("output-dim", &output_dim); + if (!ok || input_dim <= 0 || output_dim <= 0 || + !SplitStringToIntegers(time_offsets, ",", false, &time_offsets_) || + time_offsets_.empty()) { + KALDI_ERR << "Bad initializer: there is a problem with " + "time-offsets, input-dim or output-dim (not defined?): " + << cfl->WholeLine(); + } + + if (std::set(time_offsets_.begin(), + time_offsets_.end()).size() != time_offsets_.size()) { + KALDI_ERR << "Bad initializer: repeated time-offsets: " + << cfl->WholeLine(); + } + + // 3. Parameter-initialization configs, "has-bias", and + // orthonormal-constraint. + orthonormal_constraint_ = 0.0; + BaseFloat param_stddev = -1, bias_mean = 0.0, bias_stddev = 1.0; + bool use_bias = true; + cfl->GetValue("param-stddev", ¶m_stddev); + cfl->GetValue("bias-stddev", &bias_stddev); + cfl->GetValue("bias-mean", &bias_mean); + cfl->GetValue("use-bias", &use_bias); + cfl->GetValue("orthonormal-constraint", &orthonormal_constraint_); + if (param_stddev < 0.0) { + param_stddev = 1.0 / sqrt(input_dim * time_offsets_.size()); + } + // initialize the parameters. + linear_params_.Resize(output_dim, + input_dim * time_offsets_.size()); + linear_params_.SetRandn(); + linear_params_.Scale(param_stddev); + + if (use_bias) { + bias_params_.Resize(output_dim); + bias_params_.SetRandn(); + bias_params_.Scale(bias_stddev); + bias_params_.Add(bias_mean); + } else { + bias_params_.Resize(0); + } + + // 4. Natural-gradient related configs. 
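// Worked example (hypothetical sizes, not from the patch itself) for the
// parameter initialization in steps 2-3 just above: with input-dim=40,
// output-dim=512 and time-offsets=-1,0,1,
//   int32 spliced_dim = 40 * 3;                       // = 120 columns
//   // linear_params_ is resized to 512 x 120, and
//   BaseFloat default_stddev = 1.0 / sqrt(40.0 * 3);  // ~= 0.091
// is the param-stddev used when none is given in the config.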
+ use_natural_gradient_ = true; + int32 rank_out = -1, rank_in = -1; + BaseFloat alpha_out = 4.0, alpha_in = 4.0, + num_samples_history = 2000.0; + cfl->GetValue("use-natural-gradient", &use_natural_gradient_); + cfl->GetValue("rank-in", &rank_in); + cfl->GetValue("rank-out", &rank_out); + cfl->GetValue("alpha-in", &alpha_in); + cfl->GetValue("alpha-out", &alpha_out); + cfl->GetValue("num-samples-history", &num_samples_history); + + int32 spliced_input_dim = + input_dim * static_cast(time_offsets_.size()); + if (rank_in < 0) + rank_in = std::min(20, (spliced_input_dim + 1) / 2); + preconditioner_in_.SetRank(rank_in); + if (rank_out < 0) + rank_out = std::min(80, (output_dim + 1) / 2); + preconditioner_out_.SetRank(rank_out); + preconditioner_in_.SetNumSamplesHistory(num_samples_history); + preconditioner_out_.SetNumSamplesHistory(num_samples_history); + + preconditioner_in_.SetAlpha(alpha_in); + preconditioner_out_.SetAlpha(alpha_out); + + preconditioner_in_.SetUpdatePeriod(4); + preconditioner_out_.SetUpdatePeriod(4); +} + +void* TdnnComponent::Propagate( + const ComponentPrecomputedIndexes *indexes_in, + const CuMatrixBase &in, + CuMatrixBase *out) const { + const PrecomputedIndexes *indexes = + dynamic_cast(indexes_in); + KALDI_ASSERT(indexes != NULL); + + if (bias_params_.Dim() != 0) + out->CopyRowsFromVec(bias_params_); + // if bias_params_.Dim() == 0 we don't need to zero 'out' at + // this point because in that case we set the flag kPropagateAdds, + // so the calling code knows that the Propagate function *adds to* + // the 'out' matrix, so it should (typicaly) be zeroed before calling + // Propagate(). + + KALDI_ASSERT(indexes->row_offsets.size() == time_offsets_.size()); + + int32 num_offsets = time_offsets_.size(), + input_dim = InputDim(); + for (int32 i = 0; i < num_offsets; i++) { + CuSubMatrix in_part = GetInputPart(in, out->NumRows(), + indexes->row_stride, + indexes->row_offsets[i]); + CuSubMatrix linear_params_part(linear_params_, + 0, linear_params_.NumRows(), + i * input_dim, input_dim); + out->AddMatMat(1.0, in_part, kNoTrans, linear_params_part, kTrans, 1.0); + } + return NULL; +} + +void TdnnComponent::Backprop( + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes_in, + const CuMatrixBase &in_value, + const CuMatrixBase &, // out_value + const CuMatrixBase &out_deriv, + void*, // memo + Component *to_update_in, + CuMatrixBase *in_deriv) const { + const PrecomputedIndexes *indexes = + dynamic_cast(indexes_in); + KALDI_ASSERT(indexes != NULL && + indexes->row_offsets.size() == time_offsets_.size()); + int32 num_offsets = time_offsets_.size(), + input_dim = InputDim(); + + if (in_deriv != NULL) { + // Propagate the derivatives back to the input data. + for (int32 i = 0; i < num_offsets; i++) { + CuSubMatrix in_deriv_part = + GetInputPart(*in_deriv, out_deriv.NumRows(), + indexes->row_stride, indexes->row_offsets[i]); + CuSubMatrix linear_params_part(linear_params_, + 0, linear_params_.NumRows(), + i * input_dim, input_dim); + // note: this component has the property kBackpropAdds, which is why the + // final 1.0 is there in the following call (otherwise we'd have to zero + // *in_deriv first). 
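// Note (not from the patch itself): writing W_i for columns
// [i*input_dim, (i+1)*input_dim) of linear_params_, Propagate() computed,
// for each output frame t,
//   out(t) = bias + sum_i in(t + time_offsets_[i]) * W_i^T,
// and the call just below accumulates the matching input derivative,
//   in_deriv(t + time_offsets_[i]) += out_deriv(t) * W_i.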
+ in_deriv_part.AddMatMat(1.0, out_deriv, kNoTrans, + linear_params_part, kNoTrans, 1.0); + } + } + + if (to_update_in != NULL) { + TdnnComponent *to_update = + dynamic_cast(to_update_in); + KALDI_ASSERT(to_update != NULL); + + if (to_update->learning_rate_ == 0.0) + return; + + if (to_update->is_gradient_ || !to_update->use_natural_gradient_) + to_update->UpdateSimple(*indexes, in_value, out_deriv); + else + to_update->UpdateNaturalGradient(*indexes, in_value, out_deriv); + } +} + +void TdnnComponent::UpdateSimple( + const PrecomputedIndexes &indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &out_deriv) { + + if (bias_params_.Dim() != 0) + bias_params_.AddRowSumMat(learning_rate_, out_deriv); + + int32 input_dim = in_value.NumCols(), + num_offsets = time_offsets_.size(); + for (int32 i = 0; i < num_offsets; i++) { + CuSubMatrix in_value_part = + GetInputPart(in_value, out_deriv.NumRows(), + indexes.row_stride, + indexes.row_offsets[i]); + CuSubMatrix linear_params_part(linear_params_, + 0, linear_params_.NumRows(), + i * input_dim, input_dim); + linear_params_part.AddMatMat(learning_rate_, out_deriv, kTrans, + in_value_part, kNoTrans, 1.0); + } +} + +void TdnnComponent::UpdateNaturalGradient( + const PrecomputedIndexes &indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &out_deriv) { + + int32 num_offsets = time_offsets_.size(), + num_rows = out_deriv.NumRows(), + input_dim = in_value.NumCols(), + spliced_input_dim = num_offsets * input_dim, + augmented_input_dim = + spliced_input_dim + (bias_params_.Dim() != 0 ? 1 : 0); + + // in_value_temp is the fully spliced input with a column of ones appended to + // it. + CuMatrix in_value_temp(num_rows, + augmented_input_dim); + if (bias_params_.Dim() != 0) { + // set the last column of in_value_temp to 1.0 + in_value_temp.Range(0, num_rows, spliced_input_dim, 1).Set(1.0); + } + + for (int32 i = 0; i < num_offsets; i++) { + CuSubMatrix in_value_temp_part(in_value_temp, + 0, num_rows, + i * input_dim, input_dim), + in_value_part = GetInputPart(in_value, + num_rows, + indexes.row_stride, + indexes.row_offsets[i]); + in_value_temp_part.CopyFromMat(in_value_part); + } + + CuMatrix out_deriv_temp(out_deriv); + + // These "scale" values get will get multiplied into the learning rate (faster + // than having the matrices scaled inside the preconditioning code). + BaseFloat in_scale, out_scale; + + preconditioner_in_.PreconditionDirections(&in_value_temp, &in_scale); + preconditioner_out_.PreconditionDirections(&out_deriv_temp, &out_scale); + + // "scale" is a scaling factor coming from the PreconditionDirections calls + // (it's faster to have them output a scaling factor than to have them scale + // their outputs). + BaseFloat scale = in_scale * out_scale, + local_lrate = scale * learning_rate_; + + if (bias_params_.Dim() != 0) { + // this "precon_ones" is what happens to the vector of 1's representing + // offsets, after multiplication by the preconditioner. 
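// Editorial note, not part of the original patch ('precon_ones', described in
// the comment above, is declared on the line right after this note).  For
// reference, the plain-SGD update performed by UpdateSimple() above is, in
// informal notation,
//     delta(bias) = lrate * sum_t  out_deriv(t)
//     delta(W_i)  = lrate * sum_t  out_deriv(t) * in(t + time_offsets_[i])^T
// for each input-dim-wide block W_i of linear_params_.  UpdateNaturalGradient
// (this function) forms the same outer products, but only after the in-side
// and out-side preconditioners have transformed the spliced input (with a
// column of ones appended to stand in for the bias) and the output
// derivative; the scaling factors they return are folded into the learning
// rate as 'local_lrate'.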
+ CuVector precon_ones(num_rows); + precon_ones.CopyColFromMat(in_value_temp, spliced_input_dim); + bias_params_.AddMatVec(local_lrate, out_deriv_temp, kTrans, + precon_ones, 1.0); + } + + CuSubMatrix in_value_precon_part(in_value_temp, + 0, num_rows, + 0, spliced_input_dim); + + linear_params_.AddMatMat(local_lrate, out_deriv_temp, kTrans, + in_value_precon_part, kNoTrans, 1.0); +} + +void TdnnComponent::ReorderIndexes( + std::vector *input_indexes, + std::vector *output_indexes) const { + using namespace time_height_convolution; + + // The following figures out a regular structure for the input and + // output indexes, in case there were gaps (which is unlikely in typical + // situations). + ConvolutionComputationIo io; + GetComputationIo(*input_indexes, *output_indexes, &io); + ModifyComputationIo(&io); + + std::vector modified_input_indexes, + modified_output_indexes; + // The following call ensures that 'modified_input_indexes' and + // 'modified_output_indexes' have the required ordering (where t has the + // largest stride and each (n,x) pair is repeated for each 't' value), as well + // as doing padding (setting t values to kNoTime where it had to insert + // elements to ensure regular structure). + GetIndexesForComputation(io, *input_indexes, *output_indexes, + &modified_input_indexes, + &modified_output_indexes); + + // It will be quite rare that this function actually changes + // 'input_indexes' or 'output_indexes', because in most cases, + // the indexes will already have the required structure and + // ordering. + input_indexes->swap(modified_input_indexes); + output_indexes->swap(modified_output_indexes); +} + +void TdnnComponent::Write(std::ostream &os, bool binary) const { + WriteUpdatableCommon(os, binary); // Write opening tag and learning rate. + WriteToken(os, binary, ""); + WriteIntegerVector(os, binary, time_offsets_); + WriteToken(os, binary, ""); + linear_params_.Write(os, binary); + WriteToken(os, binary, ""); + bias_params_.Write(os, binary); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, orthonormal_constraint_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, use_natural_gradient_); + int32 rank_in = preconditioner_in_.GetRank(), + rank_out = preconditioner_out_.GetRank(); + BaseFloat alpha_in = preconditioner_in_.GetAlpha(), + alpha_out = preconditioner_out_.GetAlpha(), + num_samples_history = preconditioner_in_.GetNumSamplesHistory(); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_samples_history); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, alpha_in); + WriteBasicType(os, binary, alpha_out); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, rank_in); + WriteBasicType(os, binary, rank_out); + WriteToken(os, binary, ""); +} + +void TdnnComponent::Read(std::istream &is, bool binary) { + std::string token = ReadUpdatableCommon(is, binary); + ExpectToken(is, binary, ""); + ReadIntegerVector(is, binary, &time_offsets_); + ExpectToken(is, binary, ""); + linear_params_.Read(is, binary); + ExpectToken(is, binary, ""); + bias_params_.Read(is, binary); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &orthonormal_constraint_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &use_natural_gradient_); + int32 rank_in, rank_out; + BaseFloat alpha_in, alpha_out, + num_samples_history; + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &num_samples_history); + { // This can be simplified after a while. 
It's to read a format of the model + // that was never checked into master, but with which I (Dan) did many of + // the experiments while tuning the resnet TDNN-F. + std::string token; + ReadToken(is, binary, &token); + if (token == "") { + ReadBasicType(is, binary, &alpha_in); + ReadBasicType(is, binary, &alpha_out); + } else { + KALDI_ASSERT(token == ""); + ReadBasicType(is, binary, &alpha_in); + alpha_out = alpha_in; + } + } + preconditioner_in_.SetAlpha(alpha_in); + preconditioner_out_.SetAlpha(alpha_out); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &rank_in); + ReadBasicType(is, binary, &rank_out); + preconditioner_in_.SetRank(rank_in); + preconditioner_out_.SetRank(rank_out); + preconditioner_in_.SetNumSamplesHistory(num_samples_history); + preconditioner_out_.SetNumSamplesHistory(num_samples_history); + // the update periods are not configurable. + preconditioner_in_.SetUpdatePeriod(4); + preconditioner_out_.SetUpdatePeriod(4); + ExpectToken(is, binary, ""); + Check(); +} + +void TdnnComponent::GetInputIndexes( + const MiscComputationInfo &misc_info, + const Index &output_index, + std::vector *desired_indexes) const { + KALDI_ASSERT(output_index.t != kNoTime); + size_t size = time_offsets_.size(); + desired_indexes->resize(size); + for (size_t i = 0; i < size; i++) { + (*desired_indexes)[i].n = output_index.n; + (*desired_indexes)[i].t = output_index.t + time_offsets_[i]; + (*desired_indexes)[i].x = output_index.x; + } +} + + +bool TdnnComponent::IsComputable( + const MiscComputationInfo &misc_info, + const Index &output_index, + const IndexSet &input_index_set, + std::vector *used_inputs) const { + KALDI_ASSERT(output_index.t != kNoTime); + size_t size = time_offsets_.size(); + Index index(output_index); + + if (used_inputs != NULL) { + used_inputs->clear(); + used_inputs->reserve(size); + } + for (size_t i = 0; i < size; i++) { + index.t = output_index.t + time_offsets_[i]; + if (input_index_set(index)) { + if (used_inputs != NULL) { + // This input index is available. + used_inputs->push_back(index); + } + } else { + return false; + } + } + return true; +} + +// static +CuSubMatrix TdnnComponent::GetInputPart( + const CuMatrixBase &input_matrix, + int32 num_output_rows, + int32 row_stride, + int32 row_offset) { + KALDI_ASSERT(row_offset >= 0 && row_stride >= 1 && + input_matrix.NumRows() >= + row_offset + (row_stride * num_output_rows) - (row_stride - 1)); + // constructor takes args: (data, num_rows, num_cols, stride). + return CuSubMatrix( + input_matrix.Data() + input_matrix.Stride() * row_offset, + num_output_rows, + input_matrix.NumCols(), + input_matrix.Stride() * row_stride); +} + +void TdnnComponent::ModifyComputationIo( + time_height_convolution::ConvolutionComputationIo *io) { + if (io->t_step_out == 0) { + // the 't_step' values may be zero if there was only one (input or output) + // index so the time-stride could not be determined. This code fixes them + // up in that case. (If there was only one value, the stride is a + // don't-care actually). + if (io->t_step_in == 0) + io->t_step_in = 1; + io->t_step_out = io->t_step_in; + } + // At this point the t_step_{in,out} values will be nonzero. + KALDI_ASSERT(io->t_step_out % io->t_step_in == 0); + // The following affects the ordering of the input indexes; it allows us to + // reshape the input matrix in the way that we need to, in cases where there + // is subsampling. See the explanation where the variable was declared in + // class ConvolutionComputationIo. 
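// Editorial note, not part of the original patch, with hypothetical numbers:
// for a typical TDNN-F style layer with 3-fold frame subsampling we would
// have t_step_in == 1 and t_step_out == 3, so the statement below sets
// reorder_t_in = 3, and the lines after it round num_t_in up to a multiple
// of 3.  The effect is that, in the reordered input, blocks of 3 consecutive
// input 't' values for a given (n, x) pair end up in adjacent rows.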
+ io->reorder_t_in = io->t_step_out / io->t_step_in; + + // make sure that num_t_in is a multiple of io->reorder_t_in by rounding up. + int32 n = io->reorder_t_in; + io->num_t_in = n * ((io->num_t_in + n - 1) / n); +} + +ComponentPrecomputedIndexes* TdnnComponent::PrecomputeIndexes( + const MiscComputationInfo &misc_info, + const std::vector &input_indexes, + const std::vector &output_indexes, + bool need_backprop) const { + using namespace time_height_convolution; + // The following figures out a regular structure for the input and + // output indexes, in case there were gaps (which is unlikely in typical + // situations). + ConvolutionComputationIo io; + GetComputationIo(input_indexes, output_indexes, &io); + ModifyComputationIo(&io); + + if (RandInt(0, 10) == 0) { + // Spot check that the provided indexes have the required properties; + // this is like calling this->ReorderIndexes() and checking that it + // doesn't change anything. + std::vector modified_input_indexes, + modified_output_indexes; + GetIndexesForComputation(io, input_indexes, output_indexes, + &modified_input_indexes, + &modified_output_indexes); + KALDI_ASSERT(modified_input_indexes == input_indexes && + modified_output_indexes == output_indexes); + } + + + PrecomputedIndexes *ans = new PrecomputedIndexes(); + ans->row_stride = io.reorder_t_in; + int32 num_offsets = time_offsets_.size(); + ans->row_offsets.resize(num_offsets); + for (int32 i = 0; i < num_offsets; i++) { + // For each offset, work out which row of the input has the same t value as + // the first t value in the output plus that offset. That becomes the start + // row of the corresponding sub-part of the input. + int32 time_offset = time_offsets_[i], + required_input_t = io.start_t_out + time_offset, + input_t = (required_input_t - io.start_t_in) / io.t_step_in; + + KALDI_ASSERT(required_input_t == io.start_t_in + io.t_step_in * input_t); + // input_t is a kind of normalized time offset in the input, relative to the + // first 't' value in the input and divided by the t-step in the input, so + // it's the numbering "as if" the input 't' values were numbered from 0,1,2. + // To turn input_t into an input row we need to take account of 'reorder_t_in'. + // If this is 1 then the input row is input_t times io.num_images. + // Otherwise it's a little more complicated and to understand it you should + // read the comment where 'reorder_t_in' is declared in convolution.h. + // Briefly: the part that is an integer multiple of 'reorder_t_in' gets + // multiplied by io.num_images; the remainder does not. + + int32 n = io.reorder_t_in, + input_t_multiple = n * (input_t / n), input_t_remainder = input_t % n; + // note: input_t == input_t_multiple + input_t_remainder . 
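// Editorial note, not part of the original patch; a worked example with
// hypothetical values.  Suppose io.start_t_out = 0, io.start_t_in = -1,
// io.t_step_in = 1, io.t_step_out = 3 (so io.reorder_t_in = 3),
// io.num_images = 2, and time_offsets_ = {-1, 0, 1}.  For the offset -1 we
// get required_input_t = -1 and input_t = 0, hence input_t_multiple = 0 and
// input_t_remainder = 0, and the formula below gives a row offset of 0; for
// offsets 0 and +1 we get input_t = 1 and 2, i.e. row offsets 1 and 2.
// Combined with row_stride = io.reorder_t_in = 3 (set above), each sub-part
// of the input used in Propagate() then starts at row 0, 1 or 2 and steps
// through the input matrix 3 rows at a time.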
+ int32 input_row_offset = input_t_multiple * io.num_images + + input_t_remainder; + ans->row_offsets[i] = input_row_offset; + } + return ans; +} + +void TdnnComponent::Scale(BaseFloat scale) { + if (scale == 0.0) { + linear_params_.SetZero(); + bias_params_.SetZero(); + } else { + linear_params_.Scale(scale); + bias_params_.Scale(scale); + } +} + +void TdnnComponent::Add(BaseFloat alpha, + const Component &other_in) { + const TdnnComponent *other = + dynamic_cast(&other_in); + KALDI_ASSERT(other != NULL); + linear_params_.AddMat(alpha, other->linear_params_); + if (bias_params_.Dim() != 0) + bias_params_.AddVec(alpha, other->bias_params_); +} + +void TdnnComponent::PerturbParams(BaseFloat stddev) { + CuMatrix temp_mat(linear_params_.NumRows(), + linear_params_.NumCols(), kUndefined); + temp_mat.SetRandn(); + linear_params_.AddMat(stddev, temp_mat); + if (bias_params_.Dim() != 0) { + CuVector temp_vec(bias_params_.Dim(), kUndefined); + temp_vec.SetRandn(); + bias_params_.AddVec(stddev, temp_vec); + } +} + +BaseFloat TdnnComponent::DotProduct( + const UpdatableComponent &other_in) const { + const TdnnComponent *other = + dynamic_cast(&other_in); + KALDI_ASSERT(other != NULL); + BaseFloat ans = TraceMatMat(linear_params_, other->linear_params_, kTrans); + if (bias_params_.Dim() != 0) + ans += VecVec(bias_params_, other->bias_params_); + return ans; +} + +int32 TdnnComponent::NumParameters() const { + // note: bias_param_.Dim() may actually be zero. + return linear_params_.NumRows() * linear_params_.NumCols() + + bias_params_.Dim(); +} + +void TdnnComponent::Vectorize( + VectorBase *params) const { + KALDI_ASSERT(params->Dim() == NumParameters()); + int32 linear_size = linear_params_.NumRows() * linear_params_.NumCols(), + bias_size = bias_params_.Dim(); + params->Range(0, linear_size).CopyRowsFromMat(linear_params_); + if (bias_size != 0) + params->Range(linear_size, bias_size).CopyFromVec(bias_params_); +} + +void TdnnComponent::UnVectorize( + const VectorBase ¶ms) { + KALDI_ASSERT(params.Dim() == NumParameters()); + int32 linear_size = linear_params_.NumRows() * linear_params_.NumCols(), + bias_size = bias_params_.Dim(); + linear_params_.CopyRowsFromVec(params.Range(0, linear_size)); + if (bias_size != 0) + bias_params_.CopyFromVec(params.Range(linear_size, bias_size)); +} + +void TdnnComponent::FreezeNaturalGradient(bool freeze) { + preconditioner_in_.Freeze(freeze); + preconditioner_out_.Freeze(freeze); +} + +TdnnComponent::PrecomputedIndexes* +TdnnComponent::PrecomputedIndexes::Copy() const { + return new PrecomputedIndexes(*this); +} + +void TdnnComponent::PrecomputedIndexes::Write( + std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, row_stride); + WriteToken(os, binary, ""); + WriteIntegerVector(os, binary, row_offsets); + WriteToken(os, binary, ""); +} + +void TdnnComponent::PrecomputedIndexes::Read( + std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, + "", + ""); + ReadBasicType(is, binary, &row_stride); + ExpectToken(is, binary, ""); + ReadIntegerVector(is, binary, &row_offsets); + ExpectToken(is, binary, ""); +} + + +} // namespace nnet3 +} // namespace kaldi diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc index e68321b3260..bae332cd584 100644 --- a/src/nnet3/nnet-test-utils.cc +++ b/src/nnet3/nnet-test-utils.cc @@ -69,20 +69,37 @@ void GenerateConfigSequenceSimpleContext( opts.output_dim : 100 + Rand() % 200); - os << "component name=affine1 
type=AffineComponent input-dim=" - << spliced_dim << " output-dim=" << output_dim << std::endl; + if (RandInt(0,1) == 0) { + // do it the traditional way with an AffineComponent and an Append() expression. + os << "component name=affine1 type=AffineComponent input-dim=" + << spliced_dim << " output-dim=" << output_dim << std::endl; - os << "input-node name=input dim=" << input_dim << std::endl; + os << "input-node name=input dim=" << input_dim << std::endl; - os << "component-node name=affine1_node component=affine1 input=Append("; - for (size_t i = 0; i < splice_context.size(); i++) { - int32 offset = splice_context[i]; - os << "Offset(input, " << offset << ")"; - if (i + 1 < splice_context.size()) - os << ", "; + os << "component-node name=affine1_node component=affine1 input=Append("; + for (size_t i = 0; i < splice_context.size(); i++) { + int32 offset = splice_context[i]; + os << "Offset(input, " << offset << ")"; + if (i + 1 < splice_context.size()) + os << ", "; + } + os << ")\n"; + os << "output-node name=output input=affine1_node\n"; + } else { + os << "component name=tdnn1 type=TdnnComponent input-dim=" + << input_dim << " output-dim=" << output_dim + << " time-offsets="; + for (size_t i = 0; i < splice_context.size(); i++) { + if (i>0) os << ','; + os << splice_context[i]; + } + os << " use-bias=" << (RandInt(0,1) == 0 ? "true":"false") + << " use-natural-gradient=" << (RandInt(0,1) == 0 ? "true":"false") + << std::endl; + os << "input-node name=input dim=" << input_dim << std::endl; + os << "component-node name=tdnn1_node component=tdnn1 input=input\n"; + os << "output-node name=output input=tdnn1_node\n"; } - os << ")\n"; - os << "output-node name=output input=affine1_node\n"; configs->push_back(os.str()); } @@ -1329,7 +1346,7 @@ void ComputeExampleComputationRequestSimple( int32 num_output_frames = 1 + Rand() % 10, output_start_frame = Rand() % 10, - num_examples = 1 + Rand() % 10, + num_examples = 1 + Rand() % 4, output_end_frame = output_start_frame + num_output_frames, input_start_frame = output_start_frame - left_context - (Rand() % 3), input_end_frame = output_end_frame + right_context + (Rand() % 3), @@ -1383,7 +1400,7 @@ void ComputeExampleComputationRequestSimple( static void GenerateRandomComponentConfig(std::string *component_type, std::string *config) { - int32 n = RandInt(0, 34); + int32 n = RandInt(0, 35); BaseFloat learning_rate = 0.001 * RandInt(1, 100); std::ostringstream os; @@ -1729,6 +1746,17 @@ static void GenerateRandomComponentConfig(std::string *component_type, << " learning-rate=" << learning_rate; break; } + case 35: { + // This is not technically a SimpleComponent, but it behaves as one + // if time-offsets=0. + *component_type = "TdnnComponent"; + int32 input_dim = RandInt(1, 50), output_dim = RandInt(1, 50); + os << "input-dim=" << input_dim << " output-dim=" << output_dim + << " learning-rate=" << learning_rate << " time-offsets=0" + << " use-natural-gradient=" << (RandInt(0,1) == 0 ? "true":"false") + << " use-bias=" << (RandInt(0,1) == 0 ? 
"true":"false"); + break; + } default: KALDI_ERR << "Error generating random component"; } @@ -1747,6 +1775,11 @@ Component *GenerateRandomSimpleComponent() { if (c == NULL) KALDI_ERR << "Invalid component type " << component_type; c->InitFromConfig(&config_line); + if (config_line.HasUnusedValues()) { + KALDI_ERR << "Config line " << config_line.WholeLine() + << " has unused values: " + << config_line.UnusedValues(); + } return c; } diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc index 49222549e4e..8fda24cd22d 100644 --- a/src/nnet3/nnet-training.cc +++ b/src/nnet3/nnet-training.cc @@ -153,17 +153,15 @@ void NnetTrainer::TrainInternalBackstitch(const NnetExample &eg, // delta_nnet is scaled by 1 + backstitch_training_scale when added to nnet; max_change_scale = 1.0 + config_.backstitch_training_scale; scale_adding = 1.0 + config_.backstitch_training_scale; + // If relevant, add in the part of the gradient that comes from L2 + // regularization. It may not be optimally inefficient to do it on both + // passes of the backstitch, like we do here, but it probably minimizes + // any harmful interactions with the max-change. + ApplyL2Regularization(*nnet_, + 1.0 / scale_adding * GetNumNvalues(eg.io, false) * + config_.l2_regularize_factor, delta_nnet_); } - // If relevant, add in the part of the gradient that comes from L2 - // regularization. It may not be optimally inefficient to do it on both - // passes of the backstitch, like we do here, but it probably minimizes - // any harmful interactions with the max-change. - ApplyL2Regularization(*nnet_, - scale_adding * GetNumNvalues(eg.io, false) * - config_.l2_regularize_factor, - delta_nnet_); - // Updates the parameters of nnet UpdateNnetWithMaxChange(*delta_nnet_, config_.max_param_change, max_change_scale, scale_adding, nnet_, diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index fa6ade55864..d16a728e2ab 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -83,10 +83,13 @@ void EvaluateComputationRequest( } } -// this non-exported function is used in ComputeSimpleNnetContext +// This non-exported function is used in ComputeSimpleNnetContext // to compute the left and right context of the nnet for a particular // window size and shift-length. -static void ComputeSimpleNnetContextForShift( +// It returns false if no outputs were computable, meaning the left and +// right context could not be computed. (Normally this means the window +// size is too small). +static bool ComputeSimpleNnetContextForShift( const Nnet &nnet, int32 input_start, int32 window_size, @@ -118,7 +121,6 @@ static void ComputeSimpleNnetContextForShift( ivector.indexes.push_back(Index(n, t)); } - ComputationRequest request; request.inputs.push_back(input); request.outputs.push_back(output); @@ -135,9 +137,10 @@ static void ComputeSimpleNnetContextForShift( int32 first_not_ok = std::find(iter, output_ok.end(), false) - output_ok.begin(); if (first_ok == window_size || first_not_ok <= first_ok) - KALDI_ERR << "No outputs were computable (perhaps not a simple nnet?)"; + return false; *left_context = first_ok; *right_context = window_size - first_not_ok; + return true; } void ComputeSimpleNnetContext(const Nnet &nnet, @@ -154,24 +157,43 @@ void ComputeSimpleNnetContext(const Nnet &nnet, std::vector left_contexts(modulus + 1); std::vector right_contexts(modulus + 1); - // This will crash if the total context (left + right) is greater - // than window_size. 
- int32 window_size = 200; - - // by going "<= modulus" instead of "< modulus" we do one more computation - // than we really need; it becomes a sanity check. - for (int32 input_start = 0; input_start <= modulus; input_start++) - ComputeSimpleNnetContextForShift(nnet, input_start, window_size, - &(left_contexts[input_start]), - &(right_contexts[input_start])); - KALDI_ASSERT(left_contexts[0] == left_contexts[modulus] && - "nnet does not have the properties we expect."); - KALDI_ASSERT(right_contexts[0] == right_contexts[modulus] && - "nnet does not have the properties we expect."); - *left_context = - *std::max_element(left_contexts.begin(), left_contexts.end()); - *right_context = - *std::max_element(right_contexts.begin(), right_contexts.end()); + // window_size is a number which needs to be greater than the total context + // of the nnet, else we won't be able to work out the context. Large window + // size will make this code slow, so we start off with small window size, and + // if it isn't enough, we keep doubling it up to a maximum. + int32 window_size = 40, max_window_size = 800; + + while (window_size < max_window_size) { + + // by going "<= modulus" instead of "< modulus" we do one more computation + // than we really need; it becomes a sanity check. + int32 input_start; + for (input_start = 0; input_start <= modulus; input_start++) { + if (!ComputeSimpleNnetContextForShift(nnet, input_start, window_size, + &(left_contexts[input_start]), + &(right_contexts[input_start]))) + break; + } + if (input_start <= modulus) { + // We broke from the loop over 'input_start', which means there was + // a failure in ComputeSimpleNnextContextForShift-- we assume at + // this point that it was because window_size was too small. + window_size *= 2; + continue; + } + + KALDI_ASSERT(left_contexts[0] == left_contexts[modulus] && + "nnet does not have the properties we expect."); + KALDI_ASSERT(right_contexts[0] == right_contexts[modulus] && + "nnet does not have the properties we expect."); + *left_context = + *std::max_element(left_contexts.begin(), left_contexts.end()); + *right_context = + *std::max_element(right_contexts.begin(), right_contexts.end()); + // Success. + return; + } + KALDI_ERR << "Failure in ComputeSimpleNnetContext (perhaps not a simple nnet?)"; } void PerturbParams(BaseFloat stddev, @@ -886,8 +908,23 @@ void ConstrainOrthonormalInternal(BaseFloat scale, CuMatrixBase *M) { P.SymAddMat2(1.0, *M, kNoTrans, 0.0); P.CopyLowerToUpper(); - if (scale < 0.0) { - // If scale < 0.0 then it's like letting the scale "float". + // The 'update_speed' is a constant that determines how fast we approach a + // matrix with the desired properties (larger -> faster). Larger values will + // update faster but will be more prone to instability. 0.125 (1/8) is the + // value that gives us the fastest possible convergence when we are already + // close to be a semi-orthogonal matrix (in fact, it will lead to quadratic + // convergence). + // See http://www.danielpovey.com/files/2018_interspeech_tdnnf.pdf + // for more details. + BaseFloat update_speed = 0.125; + bool floating_scale = (scale < 0.0); + + + if (floating_scale) { + // This (letting the scale "float") is described in Sec. 2.3 of + // http://www.danielpovey.com/files/2018_interspeech_tdnnf.pdf, + // where 'scale' here is written 'alpha' in the paper. + // // We pick the scale that will give us an update to M that is // orthogonal to M (viewed as a vector): i.e., if we're doing // an update M := M + X, then we want to have tr(M X^T) == 0. 
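Editorial aside, not part of the patch: to make the algebra in the surrounding comments easier to follow, here is the constraint update written out, using the code's 'scale' as alpha and 'update_speed' as nu (the correspondence the new comments note for the cited paper). This only restates what the comments above and in the next hunk describe, it is not an alternative formulation:

\[
P = M M^{\top}, \qquad
M \;\leftarrow\; M \;-\; \frac{4\nu}{\alpha^{2}}\,\bigl(P - \alpha^{2} I\bigr)\,M ,
\]

and in the "floating" case the next hunk picks

\[
\alpha^{2} \;=\; \frac{\operatorname{tr}(P P^{\top})}{\operatorname{tr}(P)},
\]

which is exactly the value that makes the update orthogonal to \(M\) viewed as a vector, i.e. \(\operatorname{tr}(M\,\Delta M^{\top}) = 0\).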
@@ -905,37 +942,64 @@ void ConstrainOrthonormalInternal(BaseFloat scale, CuMatrixBase *M) { // tr(P^2 - scale^2 P) == 0, // or scale^2 = tr(P^2) / tr(P). // Note: P is symmetric so it doesn't matter whether we use tr(P P) or - // tr(P^T P); we use tr(P^T P) becaus I believe it's faster to compute. - scale = std::sqrt(TraceMatMat(P, P, kTrans)/ P.Trace()); + // tr(P^T P); we use tr(P^T P) because I believe it's faster to compute. + + BaseFloat trace_P = P.Trace(), trace_P_P = TraceMatMat(P, P, kTrans); + + scale = std::sqrt(trace_P_P / trace_P); + + // The following is a tweak to avoid divergence when the eigenvalues aren't + // close to being the same. trace_P is the sum of eigenvalues of P, and + // trace_P_P is the sum-square of eigenvalues of P. Treat trace_P as a sum + // of positive values, and trace_P_P as their sumsq. Then mean = trace_P / + // dim, and trace_P_P cannot be less than dim * (trace_P / dim)^2, + // i.e. trace_P_P >= trace_P^2 / dim. If ratio = trace_P_P * dim / + // trace_P^2, then ratio >= 1.0, and the excess above 1.0 is a measure of + // how far we are from convergence. If we're far from convergence, we make + // the learning rate slower to reduce the risk of divergence, since the + // update may not be stable for starting points far from equilibrium. + BaseFloat ratio = (trace_P_P * P.NumRows() / (trace_P * trace_P)); + KALDI_ASSERT(ratio > 0.999); + if (ratio > 1.02) { + update_speed *= 0.5; // Slow down the update speed to reduce the risk of divergence. + if (ratio > 1.1) update_speed *= 0.5; // Slow it down even more. + } } - // The 'update_speed' is a constant that determines how fast we approach a - // matrix with the desired properties (larger -> faster). Larger values will - // update faster but will be more prone to instability. I believe - // 'update_speed' shouldn't be more than 0.25 or maybe 0.5, or it will always - // be unstable, but I haven't done the analysis. It should definitely be more - // than 0.0. - BaseFloat update_speed = 0.125; - - // The factor of 1/scale^2 is, I *believe*, going to give us the right kind of - // invariance w.r.t. the scale. To explain why this is the appropriate - // factor, look at the statement M_update.AddMatMat(-4.0 * alpha, P, kNoTrans, - // *M, kNoTrans, 0.0); where P is proportional to scale^2 and M to 'scale' and - // alpha to 1/scale^2, so change in M_update is proportional to 'scale'. - // We'd like 'M_update' to be proportional to 'scale'. This reasoning is very - // approximate but I think it can be made rigorous. This is about remaining - // stable (not prone to divergence) even for very large or small values of - // 'scale'. - BaseFloat alpha = update_speed / (scale * scale); - - P.AddToDiag(-1.0 * scale * scale); + // We may want to un-comment the following code block later on if we have a + // problem with instability in setups with a non-floating orthonormal + // constraint. + /* + if (!floating_scale) { + // This is analogous to the stuff with 'ratio' above, but when we don't have + // a floating scale. It reduces the chances of divergence when we have + // a bad initialization. + BaseFloat error = P.FrobeniusNorm(), + error_proportion = error * error / P.NumRows(); + // 'error_proportion' is the sumsq of elements in (P - I) divided by the + // sumsq of elements of I. It should be much less than one (i.e. close to + // zero) if the error is small. 
+ if (error_proportion > 0.02) { + update_speed *= 0.5; + if (error_proportion > 0.1) + update_speed *= 0.5; + } + } + */ + if (GetVerboseLevel() >= 1) { BaseFloat error = P.FrobeniusNorm(); KALDI_VLOG(2) << "Error in orthogonality is " << error; } + // see Sec. 2.2 of http://www.danielpovey.com/files/2018_interspeech_tdnnf.pdf + // for explanation of the 1/(scale*scale) factor, but there is a difference in + // notation; 'scale' here corresponds to 'alpha' in the paper, and + // 'update_speed' corresponds to 'nu' in the paper. + BaseFloat alpha = update_speed / (scale * scale); + // At this point, the matrix P contains what, in the math, would be Q = // P-scale^2*I. The derivative of the objective function w.r.t. an element q(i,j) // of Q is now equal to -2*alpha*q(i,j), i.e. we could write q_deriv(i,j) @@ -959,39 +1023,37 @@ void ConstrainOrthonormal(Nnet *nnet) { for (int32 c = 0; c < nnet->NumComponents(); c++) { Component *component = nnet->GetComponent(c); + CuMatrixBase *params = NULL; + BaseFloat orthonormal_constraint = 0.0; + LinearComponent *lc = dynamic_cast(component); if (lc != NULL && lc->OrthonormalConstraint() != 0.0) { - if (RandInt(0, 3) != 0) - continue; // For efficiency, only do this every 4 minibatches-- it won't - // stray far. - BaseFloat scale = lc->OrthonormalConstraint(); - - CuMatrixBase ¶ms = lc->Params(); - int32 rows = params.NumRows(), cols = params.NumCols(); - if (rows <= cols) { - ConstrainOrthonormalInternal(scale, ¶ms); - } else { - CuMatrix params_trans(params, kTrans); - ConstrainOrthonormalInternal(scale, ¶ms_trans); - params.CopyFromMat(params_trans, kTrans); - } + orthonormal_constraint = lc->OrthonormalConstraint(); + params = &(lc->Params()); } - AffineComponent *ac = dynamic_cast(component); if (ac != NULL && ac->OrthonormalConstraint() != 0.0) { - if (RandInt(0, 3) != 0) - continue; // For efficiency, only do this every 4 minibatches-- it won't - // stray far. - BaseFloat scale = ac->OrthonormalConstraint(); - CuMatrixBase ¶ms = ac->LinearParams(); - int32 rows = params.NumRows(), cols = params.NumCols(); - if (rows <= cols) { - ConstrainOrthonormalInternal(scale, ¶ms); - } else { - CuMatrix params_trans(params, kTrans); - ConstrainOrthonormalInternal(scale, ¶ms_trans); - params.CopyFromMat(params_trans, kTrans); - } + orthonormal_constraint = ac->OrthonormalConstraint(); + params = &(ac->LinearParams()); + } + TdnnComponent *tc = dynamic_cast(component); + if (tc != NULL && tc->OrthonormalConstraint() != 0.0) { + orthonormal_constraint = tc->OrthonormalConstraint(); + params = &(tc->LinearParams()); + } + if (orthonormal_constraint == 0.0 || RandInt(0, 3) != 0) { + // For efficiency, only do this every 4 or so minibatches-- it won't have + // time stray far from the constraint in between. + continue; + } + + int32 rows = params->NumRows(), cols = params->NumCols(); + if (rows <= cols) { + ConstrainOrthonormalInternal(orthonormal_constraint, params); + } else { + CuMatrix params_trans(*params, kTrans); + ConstrainOrthonormalInternal(orthonormal_constraint, ¶ms_trans); + params->CopyFromMat(params_trans, kTrans); } } } @@ -1507,8 +1569,8 @@ class ModelCollapser { 'component_index2' with input given by 'component_index1'. This handles the case where 'component_index1' is of type DropoutComponent or GeneralDropoutComponent, and where 'component_index2' is of type - AffineComponent, NaturalGradientAffineComponent or - TimeHeightConvolutionComponent. 
+ AffineComponent, NaturalGradientAffineComponent, LinearComponent, + TdnnComponent or TimeHeightConvolutionComponent. Returns -1 if this code can't produce a combined component (normally because the components have the wrong types). @@ -1743,13 +1805,12 @@ class ModelCollapser { and 'scale'. In practice it will be the component-index from where 'offset' and 'scale' were taken. - @param [in] component_index The component to be modified (not in-place, but - as a copy). The component described in 'component_index' must - be AffineComponent or NaturalGradientAffineComponent, and - case the dimension of 'offset'/'scale' should divide the - component input dimension, otherwise it's an error. - of 'offset' and 'scale' should equal 'scale_input' - (else it's an error). + @param [in] component_index The component to be modified (not in-place, + but as a copy). The component described in 'component_index' + must be AffineComponent, NaturalGradientAffineComponent, + LinearComponent or TdnnComponent, and the dimension of + 'offset'/'scale' should divide the component input dimension, + otherwise it's an error. @return Returns the component-index of a suitably modified component. If one like this already exists, the existing one will be returned. If the component in 'component_index' was not of a type that can @@ -1761,7 +1822,6 @@ class ModelCollapser { const CuVectorBase &scale, const std::string &src_identifier, int32 component_index) { - int32 transform_dim = offset.Dim(); KALDI_ASSERT(offset.Dim() > 0 && offset.Dim() == scale.Dim()); if (offset.Max() == 0.0 && offset.Min() == 0.0 && scale.Max() == 1.0 && scale.Min() == 1.0) @@ -1774,17 +1834,72 @@ class ModelCollapser { int32 new_component_index = nnet_->GetComponentIndex(new_component_name); if (new_component_index >= 0) return new_component_index; // we previously created this. + const Component *component = nnet_->GetComponent(component_index); const AffineComponent *affine_component = dynamic_cast(component); - if (affine_component == NULL) - return -1; // we can't do this. + const LinearComponent *linear_component = + dynamic_cast(component); + const TdnnComponent *tdnn_component = + dynamic_cast(component); + Component *new_component = NULL; + if (affine_component != NULL) { + new_component = component->Copy(); + AffineComponent *new_affine_component = + dynamic_cast(new_component); + PreMultiplyAffineParameters(offset, scale, + &(new_affine_component->BiasParams()), + &(new_affine_component->LinearParams())); + } else if (linear_component != NULL) { + CuVector bias_params(linear_component->OutputDim()); + AffineComponent *new_affine_component = + new AffineComponent(linear_component->Params(), + bias_params, + linear_component->LearningRate()); + PreMultiplyAffineParameters(offset, scale, + &(new_affine_component->BiasParams()), + &(new_affine_component->LinearParams())); + new_component = new_affine_component; + } else if (tdnn_component != NULL) { + new_component = tdnn_component->Copy(); + TdnnComponent *new_tdnn_component = + dynamic_cast(new_component); + if (new_tdnn_component->BiasParams().Dim() == 0) { + // make sure it has a bias even if it had none before. 
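// Editorial note, not part of the original patch: the Resize() on the next
// line gives the copied TdnnComponent a zero bias of the right dimension.
// This is needed because folding in the preceding offset becomes a bias
// update (b += A * offset) inside PreMultiplyAffineParameters() below, which
// asserts that the bias dimension equals the number of rows of the linear
// parameters; so a bias must exist even when the original component had none.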
+ new_tdnn_component->BiasParams().Resize( + new_tdnn_component->OutputDim()); + } + PreMultiplyAffineParameters(offset, scale, + &(new_tdnn_component->BiasParams()), + &(new_tdnn_component->LinearParams())); - int32 input_dim = affine_component->InputDim(); - if (input_dim % transform_dim != 0) { - KALDI_ERR << "Dimension mismatch when modifying affine component."; + } else { + return -1; // we can't do this: this component isn't of the right type. } + return nnet_->AddComponent(new_component_name, new_component); + } + + /** + This helper function, used GetDiagonallyPreModifiedComponentIndex, + modifies the linear and bias parameters of an affine transform to + capture the effect of preceding that affine transform by a + diagonal affine transform with parameters 'offset' and 'scale'. + The dimension of 'offset' and 'scale' must be the same and must + divide the input dim of the affine transform, i.e. must divide + linear_params->NumCols(). + */ + static void PreMultiplyAffineParameters( + const CuVectorBase &offset, + const CuVectorBase &scale, + CuVectorBase *bias_params, + CuMatrixBase *linear_params) { + int32 input_dim = linear_params->NumCols(), + transform_dim = offset.Dim(); + KALDI_ASSERT(bias_params->Dim() == linear_params->NumRows() && + offset.Dim() == scale.Dim() && + input_dim % transform_dim == 0); + // we may have to repeat 'offset' and scale' several times. // 'full_offset' and 'full_scale' may be repeated versions of // 'offset' and 'scale' in case input_dim > transform_dim. CuVector full_offset(input_dim), @@ -1793,20 +1908,17 @@ class ModelCollapser { full_offset.Range(d, transform_dim).CopyFromVec(offset); full_scale.Range(d, transform_dim).CopyFromVec(scale); } - CuVector bias_params(affine_component->BiasParams()); - CuMatrix linear_params(affine_component->LinearParams()); + // Image the affine component does y = a x + b, and by applying // the pre-transform we are replacing x with s x + o // s for scale and o for offset), so we have: // y = a s x + (b + a o). // do: b += a o. - bias_params.AddMatVec(1.0, linear_params, kNoTrans, full_offset, 1.0); + bias_params->AddMatVec(1.0, *linear_params, kNoTrans, full_offset, 1.0); // do: a = a * s. - linear_params.MulColsVec(full_scale); - AffineComponent *new_affine_component = - dynamic_cast(affine_component->Copy()); - new_affine_component->SetParams(bias_params, linear_params); - return nnet_->AddComponent(new_component_name, new_affine_component); + linear_params->MulColsVec(full_scale); + + } @@ -1815,7 +1927,7 @@ class ModelCollapser { will give the same output as the current component gives when its input is scaled by 'scale'. This will generally mean applying the scale to the linear parameters in the component, if it is - an affine or convolutional component. + an affine, linear or convolutional component. If the component referred to in 'component_index' is not an affine or convolutional component, and therefore cannot @@ -1837,26 +1949,33 @@ class ModelCollapser { dynamic_cast(current_component); const TimeHeightConvolutionComponent *conv_component = dynamic_cast(current_component); + const LinearComponent *linear_component = + dynamic_cast(current_component); + const TdnnComponent *tdnn_component = + dynamic_cast(current_component); + + if (affine_component == NULL && conv_component == NULL && + linear_component == NULL && tdnn_component == NULL) { + // We can't scale this component (at least, not using this code). 
+ return -1; + } + + Component *new_component = current_component->Copy(); + if (affine_component != NULL) { // AffineComponent or NaturalGradientAffineComponent. - CuVector bias_params(affine_component->BiasParams()); - CuMatrix linear_params(affine_component->LinearParams()); - linear_params.Scale(scale); - AffineComponent *new_affine_component = - dynamic_cast(current_component->Copy()); - new_affine_component->SetParams(bias_params, linear_params); - return nnet_->AddComponent(new_component_name, new_affine_component); + dynamic_cast(new_component)-> + LinearParams().Scale(scale); } else if (conv_component != NULL) { - TimeHeightConvolutionComponent *new_conv_component = - dynamic_cast( - current_component->Copy()); - // scale the linear but not the bias parameters. - new_conv_component->ScaleLinearParams(scale); - return nnet_->AddComponent(new_component_name, new_conv_component); + dynamic_cast(new_component)-> + ScaleLinearParams(scale); + } else if (linear_component != NULL) { + dynamic_cast(new_component)->Params().Scale(scale); } else { - // We can't scale this component (at least, not using this code). - return -1; + KALDI_ASSERT(tdnn_component != NULL); + dynamic_cast(new_component)->LinearParams().Scale(scale); } + return nnet_->AddComponent(new_component_name, new_component); } const CollapseModelConfig &config_; diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 3b304b8fb39..c54fcf87e64 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -471,6 +471,8 @@ void ScaleBatchnormStats(BaseFloat batchnorm_stats_scale, In order to make it efficient on GPU, it doesn't make it completely orthonormal, it just makes it closer to being orthonormal (times the 'orthonormal_constraint' value). Over multiple iterations this rapidly makes it almost exactly orthonormal. + + See http://www.danielpovey.com/files/2018_interspeech_tdnnf.pdf */ void ConstrainOrthonormal(Nnet *nnet); diff --git a/src/nnet3bin/nnet3-compute.cc b/src/nnet3bin/nnet3-compute.cc index 3cd56ef1c74..f67167bc819 100644 --- a/src/nnet3bin/nnet3-compute.cc +++ b/src/nnet3bin/nnet3-compute.cc @@ -41,7 +41,8 @@ int main(int argc, char *argv[]) { "\n" "Usage: nnet3-compute [options] \n" " e.g.: nnet3-compute final.raw scp:feats.scp ark:nnet_prediction.ark\n" - "See also: nnet3-compute-from-egs\n"; + "See also: nnet3-compute-from-egs, nnet3-chain-compute-post\n" + "Note: this program does not currently make very efficient use of the GPU.\n"; ParseOptions po(usage); Timer timer; @@ -52,7 +53,6 @@ int main(int argc, char *argv[]) { bool apply_exp = false, use_priors = false; std::string use_gpu = "yes"; - std::string word_syms_filename; std::string ivector_rspecifier, online_ivector_rspecifier, utt2spk_rspecifier; diff --git a/src/nnet3bin/nnet3-copy-egs.cc b/src/nnet3bin/nnet3-copy-egs.cc index 17053ad9b2d..19c205461ae 100644 --- a/src/nnet3bin/nnet3-copy-egs.cc +++ b/src/nnet3bin/nnet3-copy-egs.cc @@ -393,26 +393,30 @@ int main(int argc, char *argv[]) { weight = egs_weight_reader.Value(key); ScaleSupervisionWeight(weight, &eg); } - + + std::string new_output_name; if (!eg_output_name_rspecifier.empty()) { if (!output_name_reader.HasKey(key)) { KALDI_WARN << "No new output-name for example key " << key; num_err++; continue; } - std::string new_output_name = output_name_reader.Value(key); - RenameOutputs(new_output_name, &eg); + new_output_name = output_name_reader.Value(key); } for (int32 c = 0; c < count; c++) { int32 index = (random ? 
Rand() : num_written) % num_outputs; if (frame_str == "" && left_context == -1 && right_context == -1 && frame_shift == 0) { + if (!new_output_name.empty() && c == 0) + RenameOutputs(new_output_name, &eg); example_writers[index]->Write(key, eg); num_written++; } else { // the --frame option or context options were set. NnetExample eg_modified; if (SelectFromExample(eg, frame_str, left_context, right_context, frame_shift, &eg_modified)) { + if (!new_output_name.empty()) + RenameOutputs(new_output_name, &eg_modified); // this branch of the if statement will almost always be taken (should only // not be taken for shorter-than-normal egs from the end of a file. example_writers[index]->Write(key, eg_modified); diff --git a/src/nnet3bin/nnet3-egs-augment-image.cc b/src/nnet3bin/nnet3-egs-augment-image.cc index 6020036cc29..ef724d0c6a6 100644 --- a/src/nnet3bin/nnet3-egs-augment-image.cc +++ b/src/nnet3bin/nnet3-egs-augment-image.cc @@ -66,8 +66,8 @@ struct ImageAugmentationConfig { po->Register("rotation-prob", &rotation_prob, "Probability of doing rotation"); po->Register("fill-mode", &fill_mode_string, "Mode for dealing with " - "points outside the image boundary when applying transformation. " - "Choices = {nearest, reflect}"); + "points outside the image boundary when applying transformation. " + "Choices = {nearest, reflect}"); } void Check() const { @@ -87,10 +87,10 @@ struct ImageAugmentationConfig { fill_mode = kReflect; } else { if (fill_mode_string != "nearest") { - KALDI_ERR << "Choices for --fill-mode are 'nearest' or 'reflect', got: " - << fill_mode_string; + KALDI_ERR << "Choices for --fill-mode are 'nearest' or 'reflect', got: " + << fill_mode_string; } else { - fill_mode = kNearest; + fill_mode = kNearest; } } return fill_mode; @@ -243,7 +243,7 @@ void PerturbImage(const ImageAugmentationConfig &config, // 0 0 1 ] if (RandUniform() <= config.rotation_prob) { BaseFloat theta = (2 * config.rotation_degree * RandUniform() - - config.rotation_degree) / 180.0 * M_PI; + config.rotation_degree) / 180.0 * M_PI; rotation_mat(0, 0) = cos(theta); rotation_mat(0, 1) = -sin(theta); rotation_mat(1, 0) = sin(theta); @@ -325,8 +325,8 @@ void PerturbImageInNnetExample( } -} // namespace nnet3 -} // namespace kaldi +} // namespace nnet3 +} // namespace kaldi int main(int argc, char *argv[]) { try { diff --git a/src/nnet3bin/nnet3-xvector-compute.cc b/src/nnet3bin/nnet3-xvector-compute.cc index 33edc8c9fa6..a4bc89a7def 100644 --- a/src/nnet3bin/nnet3-xvector-compute.cc +++ b/src/nnet3bin/nnet3-xvector-compute.cc @@ -44,7 +44,7 @@ static void RunNnetComputation(const MatrixBase &features, output_spec.indexes.resize(1); request.outputs.resize(1); request.outputs[0].Swap(&output_spec); - std::shared_ptr computation = compiler->Compile(request); + std::shared_ptr computation(std::move(compiler->Compile(request))); Nnet *nnet_to_update = NULL; // we're not doing any update. 
NnetComputer computer(NnetComputeOptions(), *computation, nnet, nnet_to_update); @@ -98,6 +98,7 @@ int main(int argc, char *argv[]) { std::string use_gpu = "no"; int32 chunk_size = -1, min_chunk_size = 100; + bool pad_input = true; opts.Register(&po); compiler_config.Register(&po); @@ -109,6 +110,8 @@ int main(int argc, char *argv[]) { "If not set, extracts an xvector from all available features."); po.Register("min-chunk-size", &min_chunk_size, "Minimum chunk-size allowed when extracting xvectors."); + po.Register("pad-input", &pad_input, "If true, duplicate the first and " + "last frames of the input features as required to equal min-chunk-size."); po.Read(argc, argv); @@ -152,8 +155,7 @@ int main(int argc, char *argv[]) { int32 num_rows = features.NumRows(), feat_dim = features.NumCols(), this_chunk_size = chunk_size; - - if (num_rows < min_chunk_size) { + if (!pad_input && num_rows < min_chunk_size) { KALDI_WARN << "Minimum chunk size of " << min_chunk_size << " is greater than the number of rows " << "in utterance: " << utt; @@ -180,13 +182,29 @@ int main(int argc, char *argv[]) { // the nnet. int32 offset = std::min( this_chunk_size, num_rows - chunk_indx * this_chunk_size); - if (offset < min_chunk_size) + if (!pad_input && offset < min_chunk_size) continue; SubMatrix sub_features( features, chunk_indx * this_chunk_size, offset, 0, feat_dim); Vector xvector; tot_weight += offset; - RunNnetComputation(sub_features, nnet, &compiler, &xvector); + + // Pad input if the offset is less than the minimum chunk size + if (pad_input && offset < min_chunk_size) { + Matrix padded_features(min_chunk_size, feat_dim); + int32 left_context = (min_chunk_size - offset) / 2; + int32 right_context = min_chunk_size - offset - left_context; + for (int32 i = 0; i < left_context; i++) { + padded_features.Row(i).CopyFromVec(sub_features.Row(0)); + } + for (int32 i = 0; i < right_context; i++) { + padded_features.Row(min_chunk_size - i - 1).CopyFromVec(sub_features.Row(offset - 1)); + } + padded_features.Range(left_context, offset, 0, feat_dim).CopyFromMat(sub_features); + RunNnetComputation(padded_features, nnet, &compiler, &xvector); + } else { + RunNnetComputation(sub_features, nnet, &compiler, &xvector); + } xvector_avg.AddVec(offset, xvector); } xvector_avg.Scale(1.0 / tot_weight); diff --git a/src/nnetbin/cuda-gpu-available.cc b/src/nnetbin/cuda-gpu-available.cc index 89fd26be86f..390468d3046 100644 --- a/src/nnetbin/cuda-gpu-available.cc +++ b/src/nnetbin/cuda-gpu-available.cc @@ -41,7 +41,7 @@ void TestGpuComputation() { int main(int argc, char *argv[]) try { char hostname[100] = "UNKNOWN-HOSTNAME"; -#ifndef _MSC_VER +#if !defined(_MSC_VER) && !defined(__CYGWIN__) if (gethostname(hostname, 100)) { KALDI_WARN << "Cannot get hostname, " << strerror(errno); } diff --git a/src/rnnlm/rnnlm-embedding-training.cc b/src/rnnlm/rnnlm-embedding-training.cc index 4c42bd4ab39..c4238c7356a 100644 --- a/src/rnnlm/rnnlm-embedding-training.cc +++ b/src/rnnlm/rnnlm-embedding-training.cc @@ -175,7 +175,7 @@ void RnnlmEmbeddingTrainer::Train( if (config_.l2_regularize > 0.0) { BaseFloat l2_term = -2 * config_.l2_regularize; if (l2_term != 0.0) { - embedding_deriv->AddToRows(l2_term, active_words, embedding_mat_); + embedding_deriv->AddRows(l2_term, *embedding_mat_, active_words); } } BaseFloat scale = 1.0; @@ -229,8 +229,8 @@ void RnnlmEmbeddingTrainer::TrainBackstitch( if (config_.l2_regularize > 0.0 && !is_backstitch_step1) { BaseFloat l2_term = -2 * config_.l2_regularize; if (l2_term != 0.0) { - 
embedding_deriv->AddMat(1.0 / (1.0 + config_.backstitch_training_scale) * - l2_term, *embedding_mat_); + embedding_deriv->AddRows(l2_term / (1.0 + config_.backstitch_training_scale), + *embedding_mat_, active_words); } } BaseFloat scale = 1.0; diff --git a/src/rnnlm/rnnlm-example.h b/src/rnnlm/rnnlm-example.h index 3817752b992..1f3bcb957a9 100644 --- a/src/rnnlm/rnnlm-example.h +++ b/src/rnnlm/rnnlm-example.h @@ -118,10 +118,8 @@ struct RnnlmExample { // Shallow swap. void Swap(RnnlmExample *other); - // TODO: implement this. void Write(std::ostream &os, bool binary) const; - // TODO: implement this. void Read(std::istream &is, bool binary); }; diff --git a/src/rnnlm/rnnlm-test-utils.cc b/src/rnnlm/rnnlm-test-utils.cc index 84546987bd8..32e8b5a4236 100644 --- a/src/rnnlm/rnnlm-test-utils.cc +++ b/src/rnnlm/rnnlm-test-utils.cc @@ -79,7 +79,7 @@ void ConvertToInteger( (*int_sentences)[i].resize(string_sentences[i].size()); for (int j = 0; j < string_sentences[i].size(); j++) { kaldi::int64 key = symbol_table.Find(string_sentences[i][j]); - KALDI_ASSERT(key != fst::SymbolTable::kNoSymbol); + KALDI_ASSERT(key != -1); // fst::kNoSymbol (*int_sentences)[i][j] = static_cast(key); } } diff --git a/src/rnnlm/rnnlm-training.cc b/src/rnnlm/rnnlm-training.cc index 959906be2f2..370f6395dc0 100644 --- a/src/rnnlm/rnnlm-training.cc +++ b/src/rnnlm/rnnlm-training.cc @@ -42,9 +42,6 @@ RnnlmTrainer::RnnlmTrainer(bool train_embedding, embedding_trainer_(NULL), word_feature_mat_(word_feature_mat), num_minibatches_processed_(0), - end_of_input_(false), - previous_minibatch_empty_(1), - current_minibatch_empty_(1), srand_seed_(RandInt(0, 100000)) { @@ -75,13 +72,6 @@ RnnlmTrainer::RnnlmTrainer(bool train_embedding, << embedding_mat_->NumRows() << " (mismatch)."; } } - - // Start a thread that calls run_background_thread(this). - // That thread will be responsible for computing derived variables of - // the minibatch, since that can be done independently of the main - // training process. - background_thread_ = std::thread(run_background_thread, this); - } @@ -92,25 +82,40 @@ void RnnlmTrainer::Train(RnnlmExample *minibatch) { << VocabSize() << ", got " << minibatch->vocab_size; - // hand over 'minibatch' to the background thread to have its derived variable - // computed, via the class variable 'current_minibatch_'. - current_minibatch_empty_.Wait(); current_minibatch_.Swap(minibatch); - current_minibatch_full_.Signal(); num_minibatches_processed_++; - if (num_minibatches_processed_ == 1) { - return; // The first time this function is called, return immediately - // because there is no previous minibatch to train on. 
+ RnnlmExampleDerived derived; + CuArray active_words_cuda; + CuSparseMatrix active_word_features; + CuSparseMatrix active_word_features_trans; + + if (!current_minibatch_.sampled_words.empty()) { + std::vector active_words; + RenumberRnnlmExample(¤t_minibatch_, &active_words); + active_words_cuda.CopyFromVec(active_words); + + if (word_feature_mat_ != NULL) { + active_word_features.SelectRows(active_words_cuda, + *word_feature_mat_); + active_word_features_trans.CopyFromSmat(active_word_features, + kTrans); + } } - previous_minibatch_full_.Wait(); + GetRnnlmExampleDerived(current_minibatch_, train_embedding_, + &derived); + + derived_.Swap(&derived); + active_words_.Swap(&active_words_cuda); + active_word_features_.Swap(&active_word_features); + active_word_features_trans_.Swap(&active_word_features_trans); + TrainInternal(); - previous_minibatch_empty_.Signal(); } void RnnlmTrainer::GetWordEmbedding(CuMatrix *word_embedding_storage, CuMatrix **word_embedding) { - RnnlmExample &minibatch = previous_minibatch_; + RnnlmExample &minibatch = current_minibatch_; bool sampling = !minibatch.sampled_words.empty(); if (word_feature_mat_ == NULL) { @@ -148,7 +153,7 @@ void RnnlmTrainer::GetWordEmbedding(CuMatrix *word_embedding_storage, void RnnlmTrainer::TrainWordEmbedding( CuMatrixBase *word_embedding_deriv) { - RnnlmExample &minibatch = previous_minibatch_; + RnnlmExample &minibatch = current_minibatch_; bool sampling = !minibatch.sampled_words.empty(); if (word_feature_mat_ == NULL) { @@ -186,7 +191,7 @@ void RnnlmTrainer::TrainWordEmbedding( void RnnlmTrainer::TrainBackstitchWordEmbedding( bool is_backstitch_step1, CuMatrixBase *word_embedding_deriv) { - RnnlmExample &minibatch = previous_minibatch_; + RnnlmExample &minibatch = current_minibatch_; bool sampling = !minibatch.sampled_words.empty(); if (word_feature_mat_ == NULL) { @@ -239,7 +244,7 @@ void RnnlmTrainer::TrainInternal() { srand_seed_ % core_config_.backstitch_training_interval) { bool is_backstitch_step1 = true; srand(srand_seed_ + num_minibatches_processed_); - core_trainer_->TrainBackstitch(is_backstitch_step1, previous_minibatch_, + core_trainer_->TrainBackstitch(is_backstitch_step1, current_minibatch_, derived_, *word_embedding, (train_embedding_ ? &word_embedding_deriv : NULL)); if (train_embedding_) @@ -247,13 +252,13 @@ void RnnlmTrainer::TrainInternal() { is_backstitch_step1 = false; srand(srand_seed_ + num_minibatches_processed_); - core_trainer_->TrainBackstitch(is_backstitch_step1, previous_minibatch_, + core_trainer_->TrainBackstitch(is_backstitch_step1, current_minibatch_, derived_, *word_embedding, (train_embedding_ ? &word_embedding_deriv : NULL)); if (train_embedding_) TrainBackstitchWordEmbedding(is_backstitch_step1, &word_embedding_deriv); } else { - core_trainer_->Train(previous_minibatch_, derived_, *word_embedding, + core_trainer_->Train(current_minibatch_, derived_, *word_embedding, (train_embedding_ ? 
&word_embedding_deriv : NULL)); if (train_embedding_) TrainWordEmbedding(&word_embedding_deriv); @@ -265,61 +270,7 @@ int32 RnnlmTrainer::VocabSize() { else return embedding_mat_->NumRows(); } -void RnnlmTrainer::RunBackgroundThread() { - while (true) { - current_minibatch_full_.Wait(); - if (end_of_input_) - return; - RnnlmExampleDerived derived; - CuArray active_words_cuda; - CuSparseMatrix active_word_features; - CuSparseMatrix active_word_features_trans; - - if (!current_minibatch_.sampled_words.empty()) { - std::vector active_words; - RenumberRnnlmExample(¤t_minibatch_, &active_words); - active_words_cuda.CopyFromVec(active_words); - - if (word_feature_mat_ != NULL) { - active_word_features.SelectRows(active_words_cuda, - *word_feature_mat_); - active_word_features_trans.CopyFromSmat(active_word_features, - kTrans); - } - } - GetRnnlmExampleDerived(current_minibatch_, train_embedding_, - &derived); - - // Wait until the main thread is not currently processing - // previous_minibatch_; once we get this semaphore we are free to write to - // it and other related variables such as 'derived_'. - previous_minibatch_empty_.Wait(); - previous_minibatch_.Swap(¤t_minibatch_); - derived_.Swap(&derived); - active_words_.Swap(&active_words_cuda); - active_word_features_.Swap(&active_word_features); - active_word_features_trans_.Swap(&active_word_features_trans); - - // The following statement signals that 'previous_minibatch_' - // and related variables have been written to by this thread. - previous_minibatch_full_.Signal(); - // The following statement signals that 'current_minibatch_' - // has been consumed by this thread and is no longer needed. - current_minibatch_empty_.Signal(); - } -} - RnnlmTrainer::~RnnlmTrainer() { - // Train on the last minibatch, because Train() always trains on the previously - // provided one (for threading reasons). - if (num_minibatches_processed_ > 0) { - previous_minibatch_full_.Wait(); - TrainInternal(); - } - end_of_input_ = true; - current_minibatch_full_.Signal(); - background_thread_.join(); - // Note: the following delete statements may cause some diagnostics to be // issued, from the destructors of those classes. if (core_trainer_) diff --git a/src/rnnlm/rnnlm-training.h b/src/rnnlm/rnnlm-training.h index e1eec79a3ff..d0a9a1a32e4 100644 --- a/src/rnnlm/rnnlm-training.h +++ b/src/rnnlm/rnnlm-training.h @@ -20,7 +20,6 @@ #ifndef KALDI_RNNLM_RNNLM_TRAINING_H_ #define KALDI_RNNLM_RNNLM_TRAINING_H_ -#include #include "rnnlm/rnnlm-core-training.h" #include "rnnlm/rnnlm-embedding-training.h" #include "rnnlm/rnnlm-utils.h" @@ -79,10 +78,7 @@ class RnnlmTrainer { // Train on one example. The example is provided as a pointer because we - // acquire it destructively, via Swap(). Note: this function doesn't - // actually train on this eg; what it does is to train on the previous - // example, and provide this eg to the background thread that computes the - // derived parameters of the eg. + // acquire it destructively, via Swap(). void Train(RnnlmExample *minibatch); @@ -129,16 +125,6 @@ class RnnlmTrainer { bool is_backstitch_step1, CuMatrixBase *word_embedding_deriv); - /// This is the function-call that's run as the background thread which - /// computes the derived parameters for each minibatch. - void RunBackgroundThread(); - - /// This function is invoked by the newly created background thread. - static void run_background_thread(RnnlmTrainer *trainer) { - trainer->RunBackgroundThread(); - } - - bool train_embedding_; // true if we are training the embedding. 
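// Editorial note, not part of the original patch: with this change the
// trainer no longer runs a background thread.  The semaphores, the
// 'previous_minibatch_' buffer and the end-of-input flag are removed here,
// and (as the rnnlm-training.cc changes above show) Train() now renumbers the
// minibatch, computes the derived quantities and calls TrainInternal()
// synchronously on 'current_minibatch_'.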
const RnnlmCoreTrainerOptions &core_config_; const RnnlmEmbeddingTrainerOptions &embedding_config_;
@@ -173,32 +159,14 @@ class RnnlmTrainer { // it's needed. CuSparseMatrix<BaseFloat> word_feature_mat_transpose_; - - // num_minibatches_processed_ starts at zero is incremented each time we - // provide an example to the background thread for computing the derived - // parameters. int32 num_minibatches_processed_; - // 'current_minibatch' is where the Train() function puts the minibatch that - // is provided to Train(), so that the background thread can work on it. RnnlmExample current_minibatch_; - // View 'end_of_input_' as part of a unit with current_minibatch_, for threading/access - // purposes. It is set by the foreground thread from the destructor, while - // incrementing the current_minibatch_ready_ semaphore; and when the background - // thread decrements the semaphore and notices that end_of_input_ is true, it will - // exit. - bool end_of_input_; - - - // previous_minibatch_ is the previous minibatch that was provided to Train(), - // but the minibatch that we're currently trainig on. - RnnlmExample previous_minibatch_; - // The variables derived_ and active_words_ [and more that I'll add, TODO] are in the same - // group as previous_minibatch_ from the point of view - // of threading and access control. - RnnlmExampleDerived derived_; + + // The variables derived_ and active_words_ belong to the same group as current_minibatch_. + RnnlmExampleDerived derived_; // Only if we are doing subsampling (depends on the eg), active_words_ - // contains the list of active words for the minibatch 'previous_minibatch_'; + // contains the list of active words for the minibatch 'current_minibatch_'; // it is a CUDA version of the 'active_words' output by // RenumberRnnlmExample(). Otherwise it is empty. CuArray<int32> active_words_;
@@ -212,42 +180,6 @@ class RnnlmTrainer { // This is a derived quantity computed by the background thread. CuSparseMatrix<BaseFloat> active_word_features_trans_; - - // The 'previous_minibatch_full_' semaphore is incremented by the background - // thread once it has written to 'previous_minibatch_' and - // 'derived_previous_', to let the Train() function know that they are ready - // to be trained on. The Train() function waits on this semaphore. - Semaphore previous_minibatch_full_; - - // The 'previous_minibatch_empty_' semaphore is incremented by the foreground - // thread when it has done processing previous_minibatch_ and - // derived_ and active_words_ (and hence, it is safe for the background thread to write - // to these variables). The background thread waits on this semaphore once it - // has finished computing the derived variables; and when it successfully - // decrements it, it will write to those variables (quickly, via Swap()). - Semaphore previous_minibatch_empty_; - - - // The 'current_minibatch_ready_' semaphore is incremented by the foreground - // thread from Train(), when it has written the just-provided minibatch to - // 'current_minibatch_' (it's also incremented by the destructor, together - // with setting end_of_input_. The background thread waits on this semaphore - // before either processing previous_minibatch (if !end_of_input_), or exiting - // (if end_of_input_). - Semaphore current_minibatch_full_; - - // The 'current_minibatch_empty_' semaphore is incremented by the background - // thread when it has done processing current_minibatch_, - // so, it is safe for the foreground thread to write - // to this variable).
The foreground thread waits on this semaphore before - // writing to 'current_minibatch_' (in practice it should get the semaphore - // immediately since we expect that the foreground thread will have more to - // do than the background thread). - Semaphore current_minibatch_empty_; - - std::thread background_thread_; // Background thread for computing 'derived' - // parameters of a minibatch. - - // This value is used in backstitch training when we need to ensure // consistent dropout masks. It's set to a value derived from rand() // when the class is initialized.
diff --git a/src/rnnlm/sampling-lm-test.cc b/src/rnnlm/sampling-lm-test.cc index efb8385d7c3..2bf7eaef222 100644 --- a/src/rnnlm/sampling-lm-test.cc +++ b/src/rnnlm/sampling-lm-test.cc
@@ -64,7 +64,7 @@ void SamplingLmTest::ReadHistories(std::istream &is, bool binary, BaseFloat hist_weight = 0; for (int32 i = 0; i < tokens.size() - 1; ++i) { word = sym->Find(tokens[i]); - if (word == fst::SymbolTable::kNoSymbol) { + if (word == -1) { // fst::kNoSymbol KALDI_ERR << "Found history contains word that is not in Arpa LM"; } history.push_back(word);
diff --git a/src/rnnlmbin/rnnlm-sentence-probs.cc b/src/rnnlmbin/rnnlm-sentence-probs.cc index 0e7c74cbf6b..ec9f7fc40fb 100644 --- a/src/rnnlmbin/rnnlm-sentence-probs.cc +++ b/src/rnnlmbin/rnnlm-sentence-probs.cc
@@ -104,7 +104,9 @@ int main(int argc, char *argv[]) { while (ifile >> key) { getline(ifile, line); std::vector<int32> v; - KALDI_ASSERT(SplitStringToIntegers(line, " ", true, &v)); + if (!SplitStringToIntegers(line, " ", true, &v)) { + KALDI_ERR << "Input file should contain only integers."; + } RnnlmComputeState rnnlm_compute_state(info, opts.bos_index); std::cout << key << " ";
diff --git a/src/tfrnnlm/tensorflow-rnnlm.cc b/src/tfrnnlm/tensorflow-rnnlm.cc index 3f11564c7c6..e4de98abd12 100644 --- a/src/tfrnnlm/tensorflow-rnnlm.cc +++ b/src/tfrnnlm/tensorflow-rnnlm.cc
@@ -49,7 +49,7 @@ void SetUnkPenalties(const string &filename, float count, total_count = 0; while (ifile >> word >> count) { int id = fst_word_symbols.Find(word); - KALDI_ASSERT(id != fst::SymbolTable::kNoSymbol); + KALDI_ASSERT(id != -1); // fst::kNoSymbol (*out)[id] = count; total_count += count; }
@@ -145,7 +145,7 @@ KaldiTfRnnlmWrapper::KaldiTfRnnlmWrapper( rnn_label_to_word_.push_back(word); // vector[i] = word int fst_label = fst_word_symbols->Find(word); - if (fst::SymbolTable::kNoSymbol == fst_label) { + if (fst_label == -1) { // fst::kNoSymbol if (id == eos_) continue;
diff --git a/src/tree/context-dep.cc b/src/tree/context-dep.cc index 4eab67f52be..5583717633c 100644 --- a/src/tree/context-dep.cc +++ b/src/tree/context-dep.cc
@@ -319,8 +319,8 @@ void ContextDependency::GetPdfInfo( ContextDependency* -MonophoneContextDependency(const std::vector<int32> phones, - const std::vector<int32> phone2num_pdf_classes) { +MonophoneContextDependency(const std::vector<int32> &phones, + const std::vector<int32> &phone2num_pdf_classes) { std::vector<std::vector<int32> > phone_sets(phones.size()); for (size_t i = 0; i < phones.size(); i++) phone_sets[i].push_back(phones[i]); std::vector<bool> share_roots(phones.size(), false); // don't share roots.
@@ -331,8 +331,8 @@ MonophoneContextDependency(const std::vector<int32> phones, } ContextDependency* -MonophoneContextDependencyShared(const std::vector<std::vector<int32> > phone_sets, - const std::vector<int32> phone2num_pdf_classes) { +MonophoneContextDependencyShared(const std::vector<std::vector<int32> > &phone_sets, + const std::vector<int32> &phone2num_pdf_classes) { std::vector<bool> share_roots(phone_sets.size(), false); // don't share roots.
// N is context size, P = position of central phone (must be 0). int32 num_leaves = 0, P = 0, N = 1;
diff --git a/src/tree/context-dep.h b/src/tree/context-dep.h index 6342d89667b..e69c26f8638 100644 --- a/src/tree/context-dep.h +++ b/src/tree/context-dep.h
@@ -180,15 +180,15 @@ ContextDependency *GenRandContextDependencyLarge(const std::vector<int32> &phone // 0, 1, 2). ContextDependency* -MonophoneContextDependency(const std::vector<int32> phones, - const std::vector<int32> phone2num_pdf_classes); +MonophoneContextDependency(const std::vector<int32> &phones, + const std::vector<int32> &phone2num_pdf_classes); // MonophoneContextDependencyShared is as MonophoneContextDependency but lets // you define classes of phones which share pdfs (e.g. different stress-markers of a single // phone.) Each element of phone_classes is a set of phones that are in that class. ContextDependency* -MonophoneContextDependencyShared(const std::vector<std::vector<int32> > phone_classes, - const std::vector<int32> phone2num_pdf_classes); +MonophoneContextDependencyShared(const std::vector<std::vector<int32> > &phone_classes, + const std::vector<int32> &phone2num_pdf_classes); // Important note:
diff --git a/src/util/kaldi-io.cc b/src/util/kaldi-io.cc index d577c1e9a89..493a335f2db 100644 --- a/src/util/kaldi-io.cc +++ b/src/util/kaldi-io.cc
@@ -26,6 +26,8 @@ #include "util/kaldi-holder.h" #include "util/kaldi-pipebuf.h" #include "util/kaldi-table.h" // for Classify{W,R}specifier +#include +#include #ifdef KALDI_CYGWIN_COMPAT #include "util/kaldi-cygwin-io-inl.h"
@@ -34,7 +36,8 @@ #define MapOsPath(x) x #endif // KALDI_CYGWIN_COMPAT -#ifdef _MSC_VER + +#if defined(_MSC_VER) static FILE *popen(const char* command, const char* mode) { #ifdef KALDI_CYGWIN_COMPAT return kaldi::CygwinCompatPopen(command, mode);
@@ -280,7 +283,7 @@ class PipeOutputImpl: public OutputImplBase { KALDI_ASSERT(wxfilename.length() != 0 && wxfilename[0] == '|'); // should // start with '|' std::string cmd_name(wxfilename, 1); -#ifdef _MSC_VER +#if defined(_MSC_VER) || defined(__CYGWIN__) f_ = popen(cmd_name.c_str(), (binary ? "wb" : "w")); #else f_ = popen(cmd_name.c_str(), "w");
@@ -457,7 +460,7 @@ class PipeInputImpl: public InputImplBase { KALDI_ASSERT(rxfilename.length() != 0 && rxfilename[rxfilename.length()-1] == '|'); // should end with '|' std::string cmd_name(rxfilename, 0, rxfilename.length()-1); -#ifdef _MSC_VER +#if defined(_MSC_VER) || defined(__CYGWIN__) f_ = popen(cmd_name.c_str(), (binary ? "rb" : "r")); #else f_ = popen(cmd_name.c_str(), "r");
diff --git a/src/util/kaldi-thread.h b/src/util/kaldi-thread.h index 09901c84043..ac418cd0b62 100644 --- a/src/util/kaldi-thread.h +++ b/src/util/kaldi-thread.h
@@ -174,6 +174,7 @@ template<class C> class TaskSequencer { public: TaskSequencer(const TaskSequencerConfig &config): + num_threads_(config.num_threads), threads_avail_(config.num_threads), tot_threads_avail_(config.num_threads_total > 0 ? config.num_threads_total : config.num_threads + 20),
@@ -186,6 +187,13 @@ class TaskSequencer { /// This function takes ownership of the pointer "c", and will delete it /// in the same sequence as Run was called on the jobs. void Run(C *c) { + // run in main thread + if (num_threads_ == 0) { + (*c)(); + delete c; + return; + } + threads_avail_.Wait(); // wait till we have a thread for computation free. tot_threads_avail_.Wait(); // this ensures we don't have too many threads // waiting on I/O, and consume too much memory.
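For context, the effect of the kaldi-thread.h hunks above and below is that a TaskSequencer configured with num_threads == 0 now degenerates to synchronous, in-order execution in the calling thread, instead of blocking forever on the zero-initialized threads_avail_ semaphore. A minimal sketch of how calling code could exercise this path follows; CountTask is a hypothetical task type invented for the example, not part of Kaldi:

  #include <iostream>
  #include "util/kaldi-thread.h"

  // Hypothetical task type: TaskSequencer<C> calls operator()() to do the work
  // and runs the destructors in the same order in which Run() was called.
  struct CountTask {
    explicit CountTask(int n): n_(n), sum_(0) { }
    void operator() () { for (int i = 1; i <= n_; i++) sum_ += i; }
    ~CountTask() { std::cout << "sum(1.." << n_ << ") = " << sum_ << "\n"; }
    int n_, sum_;
  };

  int main() {
    kaldi::TaskSequencerConfig config;
    config.num_threads = 0;  // with this change, each task now runs
                             // immediately in the calling thread.
    kaldi::TaskSequencer<CountTask> sequencer(config);
    for (int n = 1; n <= 3; n++)
      sequencer.Run(new CountTask(n));  // Run() takes ownership and deletes it.
    sequencer.Wait();  // nothing left to wait for here, but harmless.
    return 0;
  }

The same behavior should be reachable from any binary that registers TaskSequencerConfig options, typically by passing --num-threads=0.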
@@ -260,6 +268,8 @@ class TaskSequencer { args->me->tot_threads_avail_.Signal(); } + int32 num_threads_; // copy of config.num_threads (since Semaphore doesn't store original count) + Semaphore threads_avail_; // Initialized to the number of threads we are // supposed to run with; the function Run() waits on this.
diff --git a/src/util/parse-options.h b/src/util/parse-options.h index 12ed62bb55d..3d76b692c7d 100644 --- a/src/util/parse-options.h +++ b/src/util/parse-options.h
@@ -38,7 +38,7 @@ class ParseOptions : public OptionsItf { explicit ParseOptions(const char *usage) : print_args_(true), help_(false), usage_(usage), argc_(0), argv_(NULL), prefix_(""), other_parser_(NULL) { -#ifndef _MSC_VER // This is just a convenient place to set the stderr to line +#if !defined(_MSC_VER) && !defined(__CYGWIN__) // This is just a convenient place to set the stderr to line setlinebuf(stderr); // buffering mode, since it's called at program start. #endif // This helps ensure different programs' output is not mixed up. RegisterStandard("config", &config_, "Configuration file to read (this "
diff --git a/tools/Makefile b/tools/Makefile index 478a7ae47f5..1d62e1a3765 100644 --- a/tools/Makefile +++ b/tools/Makefile
@@ -7,7 +7,7 @@ CC = gcc # used for sph2pipe # Note: OpenFst requires a relatively recent C++ compiler with C++11 support, # e.g. g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3. -OPENFST_VERSION ?= 1.6.5 +OPENFST_VERSION ?= 1.6.7 # Default features configured for OpenFST; can be overridden in the make command line. OPENFST_CONFIGURE ?= --enable-static --enable-shared --enable-far --enable-ngram-fsts
@@ -84,7 +84,7 @@ openfst-$(OPENFST_VERSION): openfst-$(OPENFST_VERSION).tar.gz tar xozf openfst-$(OPENFST_VERSION).tar.gz openfst-$(OPENFST_VERSION).tar.gz: - wget -T 10 -t 1 http://openfst.cs.nyu.edu/twiki/pub/FST/FstDownload/openfst-$(OPENFST_VERSION).tar.gz || \ + wget -T 10 -t 1 http://www.openfst.org/twiki/pub/FST/FstDownload/openfst-$(OPENFST_VERSION).tar.gz || \ wget -T 10 -t 3 http://www.openslr.org/resources/2/openfst-$(OPENFST_VERSION).tar.gz sclite: sclite_compiled
diff --git a/tools/extras/check_dependencies.sh b/tools/extras/check_dependencies.sh index 430a25671f0..cd9ec7f5c1e 100755 --- a/tools/extras/check_dependencies.sh +++ b/tools/extras/check_dependencies.sh
@@ -65,7 +65,7 @@ if ! echo "#include <zlib.h>" | $CXX -E - >&/dev/null; then add_packages zlib-devel zlib1g-dev zlib-devel fi -for f in make automake autoconf patch grep bzip2 gzip wget git; do +for f in make automake autoconf patch grep bzip2 gzip wget git sox; do if ! which $f >&/dev/null; then echo "$0: $f is not installed." add_packages $f $f $f
@@ -100,7 +100,7 @@ if ! which python3 >&/dev/null; then pythonok=false fi -( +( #Use a subshell so that sourcing env.sh does not have an influence on the rest of the script [ -f ./env.sh ] && . ./env.sh if $pythonok && ! which python2 >&/dev/null; then
@@ -111,14 +111,14 @@ if $pythonok && ! which python2 >&/dev/null; then fi if [[ -f $PWD/python/.use_default_python && -f $PWD/python/python ]]; then - rm $PWD/python/python + rm $PWD/python/python fi if $pythonok && which python >&/dev/null && [[ ! -f $PWD/python/.use_default_python ]]; then version=`python 2>&1 --version | awk '{print $2}' ` if [[ $version != "2.7"* ]] ; then echo "$0: WARNING python 2.7 is not the default python. We fixed this by adding a correct symlink more prominently on the path."
- echo "$0: If you really want to use python $version as default, add an empty file $PWD/python/.use_default_python and run this script again." + echo "$0: If you really want to use python $version as default, add an empty file $PWD/python/.use_default_python and run this script again." mkdir -p $PWD/python ln -s $(which python2.7) $PWD/python/python echo "export PATH=$PWD/python:\${PATH}" >> env.sh diff --git a/tools/extras/install_jieba.sh b/tools/extras/install_jieba.sh new file mode 100755 index 00000000000..49fe1b79804 --- /dev/null +++ b/tools/extras/install_jieba.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# The script downloads and installs jieba + +set -e + +# Make sure we are in the tools/ directory. +if [ `basename $PWD` == extras ]; then + cd .. +fi + +! [ `basename $PWD` == tools ] && \ + echo "You must call this script from the tools/ directory" && exit 1; + +echo "Installing jieba" + +if [ -d ./jieba ] ; then + echo >&2 "$0: Warning: old installation of jieba found. You should manually" + echo >&2 " delete the directory tools/jieba and " + echo >&2 " edit the file tools/env.sh and remove manually all references to it" + exit 1 +fi + +if [ ! -d ./jieba ]; then + git clone https://github.com/fxsjy/jieba.git || exit 1; +fi + +( +cd jieba +pyver=`python --version 2>&1 | sed -e 's:.*\([2-3]\.[0-9]\+\).*:\1:g'` +export PYTHONPATH=$PYTHONPATH:$PWD/lib/python${pyver}/site-packages/ +# we have to create those dir, as the install target does not create it +mkdir -p $PWD/lib/python${pyver}/site-packages/ +python setup.py install --prefix `pwd` +cd .. +) + +lib_dir=./lib/ +site_packages_dir=$(cd ./jieba; find $lib_dir -name "site-packages" -type d | head -n1) +( + echo "export JIEBA=\"$PWD/jieba\"" + echo "export PYTHONPATH=\"\${PYTHONPATH:-}:\$JIEBA/${site_packages_dir}\"" +) >> env.sh + +echo >&2 "Installation of jieba finished successfully" +echo >&2 "Please source tools/env.sh in your path.sh to enable it" diff --git a/tools/extras/install_sequitur.sh b/tools/extras/install_sequitur.sh index 65f82a14134..6ee4d9f4336 100755 --- a/tools/extras/install_sequitur.sh +++ b/tools/extras/install_sequitur.sh @@ -94,10 +94,19 @@ fi # the next two lines deal with the issue that the new setup tools # expect the directory in which we will be installing to be visible # as module directory to python -site_packages_dir=$(python -m site --user-site | grep -oE "lib.*") +site_packages_dir=$(PYTHONPATH="" python -m site --user-site | grep -oE "lib.*") SEQUITUR=$(pwd)/$site_packages_dir +# some bits of info to troubleshoot this in case people have problems +echo -n >&2 "USER SITE: "; PYTHONPATH="" python -m site --user-site +echo >&2 "SEQUITUR_PACKAGE: ${site_packages_dir:-}" +echo >&2 "SEQUITUR: $SEQUITUR" +echo >&2 "PYTHONPATH: ${PYTHONPATH:-}" +mkdir -p $SEQUITUR PYTHONPATH=${PYTHONPATH:-}:$SEQUITUR python setup.py install --prefix `pwd` -) +) || { + echo >&2 "Problem installing sequitur!" + exit 1 +} site_packages_dir=$(cd sequitur-g2p; find ./lib{,64} -type d -name site-packages | head -n 1) (