15 changes: 11 additions & 4 deletions .gitignore
@@ -73,15 +73,17 @@ GSYMS
/src/kaldi.mk.bak

# /egs/
/egs/*/s*/mfcc
/egs/*/s*/plp
/egs/*/s*/exp
/egs/*/s*/data
/egs/*/*/mfcc
/egs/*/*/plp
/egs/*/*/exp
/egs/*/*/data

# /tools/
/tools/pocolm/
/tools/ATLAS/
/tools/atlas3.8.3.tar.gz
/tools/irstlm/
/tools/mitlm/
/tools/openfst
/tools/openfst-1.3.2.tar.gz
/tools/openfst-1.3.2/
@@ -101,6 +103,8 @@ GSYMS
/tools/openfst-1.6.2/
/tools/openfst-1.6.5.tar.gz
/tools/openfst-1.6.5/
/tools/openfst-1.6.7.tar.gz
/tools/openfst-1.6.7/
/tools/BeamformIt/
/tools/libsndfile-1.0.25.tar.gz
/tools/libsndfile-1.0.25/
@@ -141,3 +145,6 @@ GSYMS
/tools/mmseg-1.3.0.tar.gz
/tools/mmseg-1.3.0/
/kaldiwin_vs*
/tools/cub-1.8.0.zip
/tools/cub-1.8.0/
/tools/cub
5 changes: 3 additions & 2 deletions .travis.yml
@@ -21,6 +21,7 @@ addons:
- gfortran-4.9
- liblapack-dev
- clang-3.8
- sox

branches:
only:
@@ -47,8 +48,8 @@ script:
# http://peter.eisentraut.org/blog/2014/12/01/ccache-and-clang-part-3/
# for the explanation why extra switches needed for clang with ccache.
- CXX="ccache clang++-3.8 -Qunused-arguments -fcolor-diagnostics -Wno-tautological-compare"
CFLAGS="-march=native"
LDFLAGS="-llapack"
CFLAGS=""
LDFLAGS="-llapack -Wl,-fuse-ld=gold"
INCDIRS="$XROOT/usr/include"
LIBDIRS="$XROOT/usr/lib"
tools/extras/travis_script.sh
12 changes: 6 additions & 6 deletions COPYING
@@ -56,7 +56,7 @@ contributors and original source material as well as the full text of the Apache
License v 2.0 are set forth below.

Individual Contributors (in alphabetical order)

Mohit Agarwal
Tanel Alumae
Gilles Boulianne
@@ -123,7 +123,7 @@ Individual Contributors (in alphabetical order)
Haihua Xu
Hainan Xu
Xiaohui Zhang

Other Source Material

This project includes a port and modification of materials from JAMA: A Java
@@ -136,9 +136,9 @@ Other Source Material
"Signal processing with lapped transforms," Artech House, Inc., 1992. The
current copyright holder, Henrique S. Malvar, has given his permission for the
release of this modified version under the Apache License 2.0.
This project includes material from the OpenFST Library v1.2.7 available at
http://www.openfst.org and released under the Apache License v. 2.0.

This project includes material from the OpenFST Library v1.2.7 available at
http://www.openfst.org and released under the Apache License v. 2.0.

[OpenFst COPYING file begins here]

@@ -147,7 +147,7 @@ Other Source Material
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
3 changes: 1 addition & 2 deletions README.md
@@ -1,5 +1,4 @@
[![Build Status](https://travis-ci.org/kaldi-asr/kaldi.svg?branch=master)](https://travis-ci.org/kaldi-asr/kaldi)

[![Build Status](https://travis-ci.com/kaldi-asr/kaldi.svg?branch=master)](https://travis-ci.com/kaldi-asr/kaldi)
Kaldi Speech Recognition Toolkit
================================

26 changes: 18 additions & 8 deletions egs/aishell/s5/RESULTS
@@ -1,8 +1,18 @@
%WER 33.82 [ 35432 / 104765, 743 ins, 3991 del, 30698 sub ] exp/mono/decode_test/cer_12_0.0
%WER 19.39 [ 20310 / 104765, 903 ins, 1452 del, 17955 sub ] exp/tri1/decode_test/cer_13_0.5
%WER 19.23 [ 20147 / 104765, 910 ins, 1287 del, 17950 sub ] exp/tri2/decode_test/cer_14_0.5
%WER 17.14 [ 17961 / 104765, 812 ins, 1024 del, 16125 sub ] exp/tri3a/decode_test/cer_14_0.0
%WER 13.64 [ 14294 / 104765, 669 ins, 736 del, 12889 sub ] exp/tri4a/decode_test/cer_14_0.5
%WER 12.23 [ 12809 / 104765, 656 ins, 580 del, 11573 sub ] exp/tri5a/decode_test/cer_13_1.0
%WER 8.45 [ 8849 / 104765, 312 ins, 538 del, 7999 sub ] exp/nnet3/tdnn_sp/decode_test/cer_13_1.0
%WER 7.46 [ 7813 / 104765, 287 ins, 472 del, 7054 sub ] exp/chain/tdnn_1a_sp/decode_test/cer_10_1.0
%WER 36.41 [ 38146 / 104765, 837 ins, 3114 del, 34195 sub ] exp/mono/decode_test/cer_10_0.0
%WER 18.76 [ 19654 / 104765, 949 ins, 1152 del, 17553 sub ] exp/tri1/decode_test/cer_13_0.5
%WER 18.64 [ 19531 / 104765, 941 ins, 1159 del, 17431 sub ] exp/tri2/decode_test/cer_14_0.5
%WER 17.04 [ 17849 / 104765, 810 ins, 1021 del, 16018 sub ] exp/tri3a/decode_test/cer_14_0.5
%WER 13.82 [ 14482 / 104765, 764 ins, 670 del, 13048 sub ] exp/tri4a/decode_test/cer_13_0.5
%WER 12.12 [ 12694 / 104765, 751 ins, 523 del, 11420 sub ] exp/tri5a/decode_test/cer_13_0.5
%WER 8.65 [ 9064 / 104765, 367 ins, 455 del, 8242 sub ] exp/nnet3/tdnn_sp/decode_test/cer_14_0.5
%WER 7.48 [ 7839 / 104765, 285 ins, 454 del, 7100 sub ] exp/chain/tdnn_1a_sp/decode_test/cer_10_1.0

# nnet3 tdnn with online pitch, local/nnet3/tuning/run_tdnn_2a.sh
%WER 8.64 [ 9050 / 104765, 349 ins, 521 del, 8180 sub ] exp/nnet3/tdnn_sp/decode_test/cer_15_0.5
%WER 8.72 [ 9135 / 104765, 367 ins, 422 del, 8346 sub ] exp/nnet3/tdnn_sp_online/decode_test/cer_12_1.0
%WER 9.36 [ 9807 / 104765, 386 ins, 441 del, 8980 sub ] exp/nnet3/tdnn_sp_online/decode_test_per_utt/cer_13_1.0

# chain with online pitch, local/chain/tuning/run_tdnn_2a.sh
%WER 7.45 [ 7807 / 104765, 340 ins, 497 del, 6970 sub ] exp/chain/tdnn_2a_sp/decode_test/cer_11_0.5
%WER 7.43 [ 7780 / 104765, 341 ins, 469 del, 6970 sub ] exp/chain/tdnn_2a_sp_online/decode_test/cer_11_0.5
%WER 7.92 [ 8296 / 104765, 384 ins, 472 del, 7440 sub ] exp/chain/tdnn_2a_sp_online/decode_test_per_utt/cer_11_0.5
4 changes: 4 additions & 0 deletions egs/aishell/s5/conf/online_pitch.conf
@@ -0,0 +1,4 @@
--sample-frequency=16000
--simulate-first-pass-online=true
--normalization-right-context=25
--frames-per-chunk=10
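
A hedged usage sketch (not part of this change): standard Kaldi recipes feed a config of this form to the online feature-extraction scripts; assuming steps/make_mfcc_pitch_online.sh and its --online-pitch-config option, the call might look like:

  steps/make_mfcc_pitch_online.sh --nj 10 --cmd "$train_cmd" \
    --online-pitch-config conf/online_pitch.conf \
    data/train exp/make_mfcc/train mfcc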
18 changes: 3 additions & 15 deletions egs/aishell/s5/local/aishell_prepare_dict.sh
@@ -15,21 +15,9 @@ mkdir -p $dict_dir
cp $res_dir/lexicon.txt $dict_dir

cat $dict_dir/lexicon.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}'| \
sort -u |\
perl -e '
my %ph_cl;
while (<STDIN>) {
$phone = $_;
chomp($phone);
chomp($_);
$phone = $_;
next if ($phone eq "sil");
if (exists $ph_cl{$phone}) { push(@{$ph_cl{$phone}}, $_) }
else { $ph_cl{$phone} = [$_]; }
}
foreach $key ( keys %ph_cl ) {
print "@{ $ph_cl{$key} }\n"
}
perl -e 'while(<>){ chomp($_); $phone = $_; next if ($phone eq "sil");
m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$1} .= "$phone "; }
foreach $l (values %q) {print "$l\n";}
' | sort -k1 > $dict_dir/nonsilence_phones.txt || exit 1;
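# A worked sketch of the one-liner above, on a hypothetical phone list.
# Given phones a1, a2, a4, ii, n on stdin (one per line), the match
# m:^([^\d]+)(\d*)$: strips the trailing tone digit, so all tone variants of
# a base phone accumulate on one line of nonsilence_phones.txt:
#   a1 a2 a4
#   ii
#   n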

echo sil > $dict_dir/silence_phones.txt
2 changes: 1 addition & 1 deletion egs/aishell/s5/local/aishell_train_lms.sh
@@ -23,7 +23,7 @@ kaldi_lm=`which train_lm.sh`
if [ -z $kaldi_lm ]; then
echo "$0: train_lm.sh is not found. That might mean it's not installed"
echo "$0: or it is not added to PATH"
echo "$0: Use the script tools/extra/install_kaldi_lm.sh to install it"
echo "$0: Use the script tools/extras/install_kaldi_lm.sh to install it"
exit 1
fi

2 changes: 1 addition & 1 deletion egs/aishell/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -90,7 +90,7 @@ if [ $stage -le 10 ]; then
echo "$0: creating neural net configs using the xconfig parser";

num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)

mkdir -p $dir/configs
cat <<EOF > $dir/configs/network.xconfig
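The added parentheses make the print portable: "print 0.5/x" is a syntax error under Python 3, while "print (0.5/x)" parses as a print statement under Python 2 and a function call under Python 3. A minimal check, assuming both interpreters are on the PATH:

  xent_regularize=0.1
  echo "print (0.5/$xent_regularize)" | python2   # 5.0
  echo "print (0.5/$xent_regularize)" | python3   # 5.0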
211 changes: 211 additions & 0 deletions egs/aishell/s5/local/chain/tuning/run_tdnn_2a.sh
@@ -0,0 +1,211 @@
#!/bin/bash

# This script is based on run_tdnn_1a.sh.
# This setup uses online pitch to train the neural network.
# It requires an online_pitch.conf in the conf dir.

set -e

# configs for 'chain'
affix=
stage=0
train_stage=-10
get_egs_stage=-10
dir=exp/chain/tdnn_2a # Note: _sp will get added to this
decode_iter=

# training options
num_epochs=4
initial_effective_lrate=0.001
final_effective_lrate=0.0001
max_param_change=2.0
final_layer_normalize_target=0.5
num_jobs_initial=2
num_jobs_final=12
minibatch_size=128
frames_per_eg=150,110,90
remove_egs=true
common_egs_dir=
xent_regularize=0.1

# End configuration section.
echo "$0 $@" # Print the command line for logging

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh

if ! cuda-compiled; then
cat <<EOF && exit 1
This script is intended to be used with GPUs, but you have not compiled Kaldi with CUDA.
If you want to use GPUs (and have them), go to src/, and configure and make on a machine
where "nvcc" is installed.
EOF
fi

# The iVector-extraction and feature-dumping parts are the same as the standard
# nnet3 setup, and you can skip them by setting "--stage 8" if you have already
# run those things.
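# For example (hypothetical invocation; --stage is parsed by
# utils/parse_options.sh above):
#   local/chain/tuning/run_tdnn_2a.sh --stage 8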

dir=${dir}${affix:+_$affix}_sp
train_set=train_sp
ali_dir=exp/tri5a_sp_ali
treedir=exp/chain/tri6_7d_tree_sp
lang=data/lang_chain


# if we are using the speed-perturbed data we need to generate
# alignments for it.
local/nnet3/run_ivector_common.sh --stage $stage --online true || exit 1;

if [ $stage -le 7 ]; then
# Get the alignments as lattices (gives the LF-MMI training more freedom).
# use the same num-jobs as the alignments
nj=$(cat $ali_dir/num_jobs) || exit 1;
steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/$train_set \
data/lang exp/tri5a exp/tri5a_sp_lats
rm exp/tri5a_sp_lats/fsts.*.gz # save space
fi

if [ $stage -le 8 ]; then
# Create a version of the lang/ directory that has one state per phone in the
# topo file. [note, it really has two states... the first one is only repeated
# once, the second one has zero or more repeats.]
rm -rf $lang
cp -r data/lang $lang
silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
# Use our special topology... note that later on we may have to tune this
# topology.
steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
fi

if [ $stage -le 9 ]; then
# Build a tree using our new topology. This is the critically different
# step compared with other recipes.
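# (The --frame-subsampling-factor 3 below matches the chain model's reduced
# output frame rate: one network output every 30 ms instead of every 10 ms.)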
steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
--context-opts "--context-width=2 --central-position=1" \
--cmd "$train_cmd" 5000 data/$train_set $lang $ali_dir $treedir
fi

if [ $stage -le 10 ]; then
echo "$0: creating neural net configs using the xconfig parser";

num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}')
learning_rate_factor=$(echo "print (0.5/$xent_regularize)" | python)

mkdir -p $dir/configs
cat <<EOF > $dir/configs/network.xconfig
input dim=100 name=ivector
input dim=43 name=input

# please note that it is important to have input layer with the name=input
# as the layer immediately preceding the fixed-affine-layer to enable
# the use of short notation for the descriptor
fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat

# the first splicing is moved before the lda layer, so no splicing here
relu-batchnorm-layer name=tdnn1 dim=625
relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=625
relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=625
relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=625
relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=625
relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=625

## adding the layers for chain branch
relu-batchnorm-layer name=prefinal-chain input=tdnn6 dim=625 target-rms=0.5
output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5

# adding the layers for xent branch
# This block prints the configs for a separate output that will be
# trained with a cross-entropy objective in the 'chain' models... this
# has the effect of regularizing the hidden parts of the model. we use
# 0.5 / args.xent_regularize as the learning rate factor- the factor of
# 0.5 / args.xent_regularize is suitable as it means the xent
# final-layer learns at a rate independent of the regularization
# constant; and the 0.5 was tuned so as to make the relative progress
# similar in the xent and regular final layers.
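# For example, with the default xent_regularize=0.1 set at the top of this
# script, the factor works out to 0.5/0.1 = 5.0.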
relu-batchnorm-layer name=prefinal-xent input=tdnn6 dim=625 target-rms=0.5
output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5

EOF
steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
fi

if [ $stage -le 11 ]; then
if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
utils/create_split_dir.pl \
/export/b0{5,6,7,8}/$USER/kaldi-data/egs/aishell-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
fi

steps/nnet3/chain/train.py --stage $train_stage \
--cmd "$decode_cmd" \
--feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \
--feat.cmvn-opts "--norm-means=false --norm-vars=false" \
--chain.xent-regularize $xent_regularize \
--chain.leaky-hmm-coefficient 0.1 \
--chain.l2-regularize 0.00005 \
--chain.apply-deriv-weights false \
--chain.lm-opts="--num-extra-lm-states=2000" \
--egs.dir "$common_egs_dir" \
--egs.stage $get_egs_stage \
--egs.opts "--frames-overlap-per-eg 0" \
--egs.chunk-width $frames_per_eg \
--trainer.num-chunk-per-minibatch $minibatch_size \
--trainer.frames-per-iter 1500000 \
--trainer.num-epochs $num_epochs \
--trainer.optimization.num-jobs-initial $num_jobs_initial \
--trainer.optimization.num-jobs-final $num_jobs_final \
--trainer.optimization.initial-effective-lrate $initial_effective_lrate \
--trainer.optimization.final-effective-lrate $final_effective_lrate \
--trainer.max-param-change $max_param_change \
--cleanup.remove-egs $remove_egs \
--feat-dir data/${train_set}_hires_online \
--tree-dir $treedir \
--lat-dir exp/tri5a_sp_lats \
--dir $dir || exit 1;
fi

if [ $stage -le 12 ]; then
# Note: it might appear that this $lang directory is mismatched, and it is as
# far as the 'topo' is concerned, but this script doesn't read the 'topo' from
# the lang directory.
utils/mkgraph.sh --self-loop-scale 1.0 data/lang_test $dir $dir/graph
fi

graph_dir=$dir/graph
if [ $stage -le 13 ]; then
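# (--acwt 1.0 matches the acoustic scale the chain model was trained with;
# --post-decode-acwt 10.0 scales the lattice acoustic scores back up so the
# standard scoring scripts, which sweep LM weights in the usual range, apply.)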
for test_set in dev test; do
steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
--nj 10 --cmd "$decode_cmd" \
--online-ivector-dir exp/nnet3/ivectors_$test_set \
$graph_dir data/${test_set}_hires_online $dir/decode_${test_set} || exit 1;
done
fi

if [ $stage -le 14 ]; then
steps/online/nnet3/prepare_online_decoding.sh --mfcc-config conf/mfcc_hires.conf \
--add-pitch true \
$lang exp/nnet3/extractor "$dir" ${dir}_online || exit 1;
fi

dir=${dir}_online
if [ $stage -le 15 ]; then
for test_set in dev test; do
steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
--nj 10 --cmd "$decode_cmd" \
--config conf/decode.config \
$graph_dir data/${test_set}_hires_online $dir/decode_${test_set} || exit 1;
done
fi

if [ $stage -le 16 ]; then
for test_set in dev test; do
steps/online/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
--nj 10 --cmd "$decode_cmd" --per-utt true \
--config conf/decode.config \
$graph_dir data/${test_set}_hires_online $dir/decode_${test_set}_per_utt || exit 1;
done
fi

exit;