diff --git a/egs/gop/README.md b/egs/gop/README.md new file mode 100644 index 00000000000..d95f4e966fd --- /dev/null +++ b/egs/gop/README.md @@ -0,0 +1,98 @@ +There is a copy of this document on Google Docs, which renders the equations better: +[link](https://docs.google.com/document/d/1pie-PU6u2NZZC_FzocBGGm6mpfBJMiCft9UoG0uA1kA/edit?usp=sharing) + +* * * + +# GOP on Kaldi + +The Goodness of Pronunciation (GOP) is a variation of the posterior probability, for phone level pronunciation scoring. +GOP is widely used in pronunciation evaluation and mispronunciation detection tasks. + +This implementation is mainly based on the following paper: + +Hu, W., Qian, Y., Soong, F. K., & Wang, Y. (2015). Improved mispronunciation detection with deep neural network trained acoustic models and transfer learning based logistic regression classifiers. Speech Communication, 67(January), 154-166. + +## GOP-GMM + +In the conventional GMM-HMM based system, GOP was first proposed in (Witt et al., 2000). It was defined as the duration normalised log of the posterior: + +$$ +GOP(p)=\frac{1}{t_e-t_s+1} \log p(p|\mathbf o) +$$ + +where $\mathbf o$ is the input observations, $p$ is the canonical phone, $t_s, t_e$ are the start and end frame indexes. + +Assuming $p(q_i)\approx p(q_j)$ for any $q_i, q_j$, we have: + +$$ +\log p(p|\mathbf o)=\frac{p(\mathbf o|p)p(p)}{\sum_{q\in Q} p(\mathbf o|q)p(q)} + \approx\frac{p(\mathbf o|p)}{\sum_{q\in Q} p(\mathbf o|q)} +$$ + +where $Q$ is the whole phone set. + +The numerator of the equation is calculated from forced alignment result and the denominator is calculated from an Viterbi decoding with a unconstrained phone loop. + +We do not implement GOP-GMM for Kaldi, as GOP-NN performs much better than GOP-GMM. + +## GOP-NN + +The definition of GOP-NN is a bit different from the GOP-GMM. GOP-NN was defined as the log phone posterior ratio between the canonical phone and the one with the highest score (Hu et al., 2015). 
+ +Firstly we define Log Phone Posterior (LPP): + +$$ +LPP(p)=\log p(p|\mathbf o; t_s,t_e) +$$ + +Then we define the GOP-NN using LPP: + +$$ +GOP(p)=\log \frac{LPP(p)}{\max_{q\in Q} LPP(q)} +$$ + +LPP could be calculated as: + +$$ +LPP(p) \approx \frac{1}{t_e-t_s+1} \sum_{t=t_s}^{t_e}\log p(p|o_t) +$$ + +$$ +p(p|o_t) = \sum_{s \in p} p(s|o_t) +$$ + +where $s$ is the senone label, $\{s|s \in p\}$ is the states belonging to those triphones whose current phone is $p$. + +## Phone-level Feature + +Normally the classifier-based approach achieves better performance than the GOP-based approach. + +Different from the GOP-based method, an extra supervised training process is needed. The input features for supervised training are phone-level, segmental features. The phone-level feature is defined as: + +$$ +{[LPP(p_1),\cdots,LPP(p_M), LPR(p_1|p_i), \cdots, LPR(p_j|p_i),\cdots]}^T +$$ + +where the Log Posterior Ratio (LPR) between phone $p_j$ and $p_i$ is defined as: + +$$ +LPR(p_j|p_i) = \log p(p_j|\mathbf o; t_s, t_e) - \log p(p_i|\mathbf o; t_s, t_e) +$$ + +## Implementation + +This implementation consists of an executable binary `bin/compute-gop` and some scripts. + +`compute-gop` computes GOP and extracts phone-level features using nnet output probabilities. +The output probabilities are assumed to be from a log-softmax layer. + +The script `run.sh` shows a typical pipeline based on librispeech's model and data. + +In Hu's paper, GOP was computed using a feed-forward DNN. +We have tried to use the output-xent of a chain model to compute GOP, but the result was not good. +We suspect that the HMM topology of the chain model may not be well suited to GOP. + +The nnet3's TDNN (no chain) model performs well in GOP computing, so this recipe uses it. + +## Acknowledgement +The author of this recipe would like to thank Xingyu Na for his work on model tuning and his helpful suggestions. 
diff --git a/egs/gop/s5/cmd.sh b/egs/gop/s5/cmd.sh new file mode 100644 index 00000000000..9139633e57a --- /dev/null +++ b/egs/gop/s5/cmd.sh @@ -0,0 +1,13 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export cmd="run.pl" diff --git a/egs/gop/s5/local/make_testcase.sh b/egs/gop/s5/local/make_testcase.sh new file mode 100755 index 00000000000..884563066b1 --- /dev/null +++ b/egs/gop/s5/local/make_testcase.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +src=$1 +dst=$2 + +# Select a very small set for testing +utils/subset_data_dir.sh --shortest $src 10 $dst + +# make fake transcripts as negative examples +cp $dst/text $dst/text.ori +sed -i "s/ THERE / THOSE /" $dst/text +sed -i "s/ IN / ON /" $dst/text diff --git a/egs/gop/s5/local/remove_phone_markers.pl b/egs/gop/s5/local/remove_phone_markers.pl new file mode 100755 index 00000000000..16236a749cf --- /dev/null +++ b/egs/gop/s5/local/remove_phone_markers.pl @@ -0,0 +1,72 @@ +#!/usr/bin/env perl +# Copyright 2019 Junbo Zhang + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +use strict; +use warnings; + +my $Usage = <new phone mapping file, in which each line is: "old-integer-id new-integer-id. + +Usage: utils/remove_phone_markers.pl + e.g.: utils/remove_phone_markers.pl phones.txt phones-pure.txt phone-to-pure-phone.int +EOU + +if (@ARGV < 3) { + die $Usage; +} + +my $old_phone_symbols_filename = shift @ARGV; +my $new_phone_symbols_filename = shift @ARGV; +my $mapping_filename = shift @ARGV; + +my %id_of_old_phone; +open(IN, $old_phone_symbols_filename) or die "Can't open $old_phone_symbols_filename"; +while () { + chomp; + my ($phone, $id) = split; + next if $phone =~ /\#/; + $id_of_old_phone{$phone} = $id; +} +close IN; + +my $new_id = 0; +my %id_of_new_phone; +my %id_old_to_new; +foreach (sort { $id_of_old_phone{$a} <=> $id_of_old_phone{$b} } keys %id_of_old_phone) { + my $old_phone = $_; + s/_[BIES]//; + s/\d//; + my $new_phone = $_; + $id_of_new_phone{$new_phone} = $new_id++ if not exists $id_of_new_phone{$new_phone}; + $id_old_to_new{$id_of_old_phone{$old_phone}} = $id_of_new_phone{$new_phone}; +} + +# Write to file +open(OUT, ">$new_phone_symbols_filename") or die "Can\'t write to $new_phone_symbols_filename"; +foreach (sort { $id_of_new_phone{$a} <=> $id_of_new_phone{$b} } keys %id_of_new_phone) { + print OUT "$_\t$id_of_new_phone{$_}\n"; +} +close OUT; + +open(OUT, ">$mapping_filename") or die "Can\'t write to $mapping_filename"; +foreach (sort { $a <=> $b } keys %id_old_to_new) { + next if $_ == 0; + print OUT "$_ $id_old_to_new{$_}\n"; +} +close OUT; 
diff --git a/egs/gop/s5/path.sh b/egs/gop/s5/path.sh new file mode 100755 index 00000000000..03df6dd9f2b --- /dev/null +++ b/egs/gop/s5/path.sh @@ -0,0 +1,27 @@ +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C + +# we use this both in the (optional) LM training and the G2P-related scripts +PYTHON='python2.7' + +### Below are the paths used by the optional parts of the recipe + +# We only need the Festival stuff below for the optional text normalization(for LM-training) step +FEST_ROOT=tools/festival +NSW_PATH=${FEST_ROOT}/festival/bin:${FEST_ROOT}/nsw/bin +export PATH=$PATH:$NSW_PATH + +# SRILM is needed for LM model building +SRILM_ROOT=$KALDI_ROOT/tools/srilm +SRILM_PATH=$SRILM_ROOT/bin:$SRILM_ROOT/bin/i686-m64 +export PATH=$PATH:$SRILM_PATH + +# Sequitur G2P executable +sequitur=$KALDI_ROOT/tools/sequitur/g2p.py +sequitur_path="$(dirname $sequitur)/lib/$PYTHON/site-packages" + +# Directory under which the LM training corpus should be extracted +LM_CORPUS_ROOT=./lm-corpus diff --git a/egs/gop/s5/run.sh b/egs/gop/s5/run.sh new file mode 100755 index 00000000000..a731b913552 --- /dev/null +++ b/egs/gop/s5/run.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +# Copyright 2019 Junbo Zhang +# Apache 2.0 + +# This script shows how to calculate Goodness of Pronunciation (GOP) and +# extract phone-level pronunciation feature for mispronunciations detection +# tasks. Read ../README.md or the following paper for details: +# +# "Hu et al., Improved mispronunciation detection with deep neural network +# trained acoustic models and transfer learning based logistic regression +# classifiers, 2015." + +# You might not want to do this for interactive shells. 
+set -e + +# Before running this recipe, you have to run the librispeech recipe firstly. +# This script assumes the following paths exist. +librispeech_eg=../../librispeech/s5 +model=$librispeech_eg/exp/nnet3_cleaned/tdnn_sp +ivector=$librispeech_eg/exp/nnet3_cleaned/ivectors_test_clean_hires +lang=$librispeech_eg/data/lang +test_data=$librispeech_eg/data/test_clean_hires + +for d in $model $ivector $lang $test_data; do + [ ! -d $d ] && echo "$0: no such path $d" && exit 1; +done + +# Global configurations +stage=0 +nj=4 + +data=test_10short +dir=exp/gop_$data + +. ./cmd.sh +. ./path.sh +. parse_options.sh + +if [ $stage -le 0 ]; then + # Prepare test data + [ -d data ] || mkdir -p data/$data + local/make_testcase.sh $test_data data/$data +fi + +if [ $stage -le 1 ]; then + # Compute Log-likelihoods + steps/nnet3/compute_output.sh --cmd "$cmd" --nj $nj \ + --online-ivector-dir $ivector data/$data $model exp/probs_$data +fi + +if [ $stage -le 2 ]; then + steps/nnet3/align.sh --cmd "$cmd" --nj $nj --use_gpu false \ + --online_ivector_dir $ivector data/$data $lang $model $dir +fi + +if [ $stage -le 3 ]; then + # make a map which converts phones to "pure-phones" + # "pure-phone" means the phone whose stress and pos-in-word markers are ignored + # eg. AE1_B --> AE, EH2_S --> EH, SIL --> SIL + local/remove_phone_markers.pl $lang/phones.txt $dir/phones-pure.txt \ + $dir/phone-to-pure-phone.int + + # Convert transition-id to pure-phone id + $cmd JOB=1:$nj $dir/log/ali_to_phones.JOB.log \ + ali-to-phones --per-frame=true $model/final.mdl "ark,t:gunzip -c $dir/ali.JOB.gz|" \ + "ark,t:-" \| utils/apply_map.pl -f 2- $dir/phone-to-pure-phone.int \| \ + gzip -c \>$dir/ali-pure-phone.JOB.gz || exit 1; +fi + +if [ $stage -le 4 ]; then + # The outputs of the binary compute-gop are the GOPs and the phone-level features. 
+ # +# An example of the GOP result (extracted from "ark,t:$dir/gop.3.txt"): +# 4446-2273-0031 [ 1 0 ] [ 12 0 ] [ 27 -5.382001 ] [ 40 -13.91807 ] [ 1 -0.2555897 ] \ +# [ 21 -0.2897284 ] [ 5 0 ] [ 31 0 ] [ 33 0 ] [ 3 -11.43557 ] [ 25 0 ] \ +# [ 16 0 ] [ 30 -0.03224623 ] [ 5 0 ] [ 25 0 ] [ 33 0 ] [ 1 0 ] +# It is in the posterior format, where each pair stands for [pure-phone-index gop-value]. +# For example, [ 27 -5.382001 ] means the GOP of the pure-phone 27 (it corresponds to the +# phone "OW", according to "$dir/phones-pure.txt") is -5.382001, indicating the audio +# segment of this phone should be a mispronunciation. +# +# The phone-level features are in matrix format: +# 4446-2273-0031 [ -0.2462088 -10.20292 -11.35369 ... +# -8.584108 -7.629755 -13.04877 ... +# ... +# ... ] +# The row number is the phone number of the utterance. In this case, it is 17. +# The column number is 2 * (pure-phone set size), as the feature consists of LPP + LPR. +# The phone-level features can be used to train a classifier with human labels. See Hu's +# paper for details. + $cmd JOB=1:$nj $dir/log/compute_gop.JOB.log \ + compute-gop --phone-map=$dir/phone-to-pure-phone.int $model/final.mdl \ + "ark,t:gunzip -c $dir/ali-pure-phone.JOB.gz|" \ + "ark:exp/probs_$data/output.JOB.ark" \ + "ark,t:$dir/gop.JOB.txt" "ark,t:$dir/phonefeat.JOB.txt" || exit 1; + echo "Done compute-gop, the results: \"$dir/gop..txt\" in posterior format." + + # We set -5 as a universal empirical threshold here. You can also determine multiple phone + # dependent thresholds based on the human-labeled mispronunciation data. + echo "The phones whose gop values less than -5 could be treated as mispronunciations." 
+fi diff --git a/egs/gop/s5/steps b/egs/gop/s5/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/gop/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/gop/s5/utils b/egs/gop/s5/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/gop/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/librispeech/s5/RESULTS b/egs/librispeech/s5/RESULTS index b45271765bc..dbf54b9384d 100644 --- a/egs/librispeech/s5/RESULTS +++ b/egs/librispeech/s5/RESULTS @@ -1,6 +1,6 @@ # In the results below, "tgsmall" is the pruned 3-gram LM, which is used for lattice generation. # The following language models are then used for rescoring: -# a) tgmed- slightly less pruned 3-gram LM +# a) tgmed- slightly less pruned 3-gram LM # b) tglarge- the full, non-pruned 3-gram LM # c) fglarge- non-pruned 4-gram LM # @@ -337,7 +337,7 @@ %WER 4.39 [ 2387 / 54402, 377 ins, 199 del, 1811 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_clean_tglarge/wer_14 %WER 5.36 [ 2918 / 54402, 328 ins, 338 del, 2252 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_clean_tgmed/wer_17 %WER 6.08 [ 3305 / 54402, 369 ins, 396 del, 2540 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch3_dev_clean_tgsmall/wer_15 -%WER 4.40 [ 2395 / 54402, 375 ins, 200 del, 1820 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_clean_tglarge/wer_14 +%WER 4.40 [ 2395 / 54402, 375 ins, 200 del, 1820 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_clean_tglarge/wer_14 %WER 5.35 [ 2909 / 54402, 328 ins, 339 del, 2242 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_clean_tgmed/wer_17 %WER 6.05 [ 3291 / 54402, 384 ins, 381 del, 2526 sub ] exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch4_dev_clean_tgsmall/wer_14 %WER 13.45 [ 6850 / 50948, 808 ins, 876 del, 5166 sub ] 
exp/nnet2_online/nnet_ms_a_smbr_0.000005/decode_epoch0_dev_other_tglarge/wer_15 @@ -423,7 +423,7 @@ %WER 17.64 [ 9231 / 52343, 764 ins, 1662 del, 6805 sub ] exp/nnet2_online/nnet_ms_a_online/decode_pp_test_other_tgsmall_utt_offline/wer_14 # Results with nnet3 tdnn -# local/nnet3/run_tdnn.sh +# local/nnet3/run_tdnn.sh (with old configs, now moved to local/nnet3/tuning/run_tdnn_1a.sh) # (4 epoch training on speed-perturbed data) # num_params=19.3M %WER 4.43 [ 2410 / 54402, 306 ins, 278 del, 1826 sub ] exp/nnet3/tdnn_sp/decode_dev_clean_fglarge/wer_13_1.0 @@ -444,7 +444,7 @@ %WER 16.29 [ 8528 / 52343, 828 ins, 1320 del, 6380 sub ] exp/nnet3/tdnn_sp/decode_test_other_tgsmall/wer_14_0.0 # Results with nnet3 tdnn -# local/nnet3/run_tdnn.sh +# local/nnet3/run_tdnn.sh (with old configs, now moved to local/nnet3/tuning/run_tdnn_1a.sh) # (4 epoch training on speed-perturbed and volumn-perturbed "cleaned" data) # num_params=19.3M, average training time=68.8s per job(on Tesla K80), real-time factor=1.23161 # for x in exp/nnet3_cleaned/tdnn_sp/decode_*; do grep WER $x/wer_* | utils/best_wer.sh ; done @@ -465,6 +465,24 @@ %WER 14.78 [ 7737 / 52343, 807 ins, 1115 del, 5815 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_other_tgmed/wer_15_0.0 %WER 16.28 [ 8521 / 52343, 843 ins, 1258 del, 6420 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_other_tgsmall/wer_14_0.0 +# Results with nnet3 tdnn with new configs, a.k.a. 
xconfig +# local/nnet3/run_tdnn.sh (linked to local/nnet3/tuning/run_tdnn_1b.sh) +%WER 4.60 [ 2502 / 54402, 324 ins, 286 del, 1892 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_clean_fglarge/wer_13_1.0 +%WER 4.80 [ 2612 / 54402, 350 ins, 285 del, 1977 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_clean_tglarge/wer_11_1.0 +%WER 5.97 [ 3248 / 54402, 460 ins, 310 del, 2478 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_clean_tgmed/wer_11_0.0 +%WER 6.66 [ 3625 / 54402, 479 ins, 392 del, 2754 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_clean_tgsmall/wer_11_0.0 +%WER 12.29 [ 6262 / 50948, 863 ins, 665 del, 4734 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_other_fglarge/wer_15_0.0 +%WER 12.89 [ 6565 / 50948, 773 ins, 853 del, 4939 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_other_tglarge/wer_14_0.5 +%WER 15.41 [ 7849 / 50948, 894 ins, 1083 del, 5872 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_other_tgmed/wer_15_0.0 +%WER 16.81 [ 8562 / 50948, 896 ins, 1215 del, 6451 sub ] exp/nnet3_cleaned/tdnn_sp/decode_dev_other_tgsmall/wer_14_0.0 +%WER 4.99 [ 2624 / 52576, 393 ins, 253 del, 1978 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_clean_fglarge/wer_13_0.5 +%WER 5.16 [ 2715 / 52576, 359 ins, 319 del, 2037 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_clean_tglarge/wer_12_1.0 +%WER 6.29 [ 3307 / 52576, 471 ins, 341 del, 2495 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_clean_tgmed/wer_12_0.0 +%WER 7.13 [ 3750 / 52576, 473 ins, 452 del, 2825 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_clean_tgsmall/wer_13_0.0 +%WER 12.73 [ 6665 / 52343, 894 ins, 711 del, 5060 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_other_fglarge/wer_14_0.0 +%WER 13.33 [ 6979 / 52343, 920 ins, 796 del, 5263 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_other_tglarge/wer_14_0.0 +%WER 15.90 [ 8323 / 52343, 921 ins, 1126 del, 6276 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_other_tgmed/wer_13_0.0 +%WER 17.28 [ 9044 / 52343, 894 ins, 1372 del, 6778 sub ] exp/nnet3_cleaned/tdnn_sp/decode_test_other_tgsmall/wer_14_0.0 # Results with 
nnet3 tdnn+sMBR # local/nnet3/run_tdnn_discriminative.sh diff --git a/egs/librispeech/s5/local/nnet3/run_tdnn.sh b/egs/librispeech/s5/local/nnet3/run_tdnn.sh deleted file mode 100755 index 28ee2b92004..00000000000 --- a/egs/librispeech/s5/local/nnet3/run_tdnn.sh +++ /dev/null @@ -1,127 +0,0 @@ -#!/bin/bash - -# this is the standard "tdnn" system, built in nnet3; it's what we use to -# call multi-splice. - -# without cleanup: -# local/nnet3/run_tdnn.sh --train-set train960 --gmm tri6b --nnet3-affix "" & - - -# At this script level we don't support not running on GPU, as it would be painfully slow. -# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, -# --num-threads 16 and --minibatch-size 128. - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). -stage=0 -decode_nj=30 -train_set=train_960_cleaned -gmm=tri6b_cleaned # this is the source gmm-dir for the data-type of interest; it - # should have alignments for the specified training data. -nnet3_affix=_cleaned - -# Options which are not passed through to run_ivector_common.sh -affix= -train_stage=-10 -common_egs_dir= -reporting_email= -remove_egs=true - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - - -if ! 
cuda-compiled; then - cat </dev/null || true - for test in test_clean test_other dev_clean dev_other; do - ( - steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ - --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test}_hires \ - ${graph_dir} data/${test}_hires $dir/decode_${test}_tgsmall || exit 1 - steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ - data/${test}_hires $dir/decode_${test}_{tgsmall,tgmed} || exit 1 - steps/lmrescore_const_arpa.sh \ - --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ - data/${test}_hires $dir/decode_${test}_{tgsmall,tglarge} || exit 1 - steps/lmrescore_const_arpa.sh \ - --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ - data/${test}_hires $dir/decode_${test}_{tgsmall,fglarge} || exit 1 - ) || touch $dir/.error & - done - wait - [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 -fi - -exit 0; diff --git a/egs/librispeech/s5/local/nnet3/run_tdnn.sh b/egs/librispeech/s5/local/nnet3/run_tdnn.sh new file mode 120000 index 00000000000..61f8f499182 --- /dev/null +++ b/egs/librispeech/s5/local/nnet3/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1b.sh \ No newline at end of file diff --git a/egs/librispeech/s5/local/nnet3/tuning/run_tdnn_1a.sh b/egs/librispeech/s5/local/nnet3/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..28ee2b92004 --- /dev/null +++ b/egs/librispeech/s5/local/nnet3/tuning/run_tdnn_1a.sh @@ -0,0 +1,127 @@ +#!/bin/bash + +# this is the standard "tdnn" system, built in nnet3; it's what we use to +# call multi-splice. + +# without cleanup: +# local/nnet3/run_tdnn.sh --train-set train960 --gmm tri6b --nnet3-affix "" & + + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. 
+ +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=30 +train_set=train_960_cleaned +gmm=tri6b_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +nnet3_affix=_cleaned + +# Options which are not passed through to run_ivector_common.sh +affix= +train_stage=-10 +common_egs_dir= +reporting_email= +remove_egs=true + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat </dev/null || true + for test in test_clean test_other dev_clean dev_other; do + ( + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test}_hires \ + ${graph_dir} data/${test}_hires $dir/decode_${test}_tgsmall || exit 1 + steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ + data/${test}_hires $dir/decode_${test}_{tgsmall,tgmed} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ + data/${test}_hires $dir/decode_${test}_{tgsmall,tglarge} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ + data/${test}_hires $dir/decode_${test}_{tgsmall,fglarge} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +exit 0; diff --git a/egs/librispeech/s5/local/nnet3/tuning/run_tdnn_1b.sh b/egs/librispeech/s5/local/nnet3/tuning/run_tdnn_1b.sh new file mode 100755 index 00000000000..a96a1b33e6c --- /dev/null +++ b/egs/librispeech/s5/local/nnet3/tuning/run_tdnn_1b.sh @@ -0,0 +1,135 @@ +#!/bin/bash + +# 1b is as 1a but uses xconfigs. + +# this is the standard "tdnn" system, built in nnet3; it's what we use to +# call multi-splice. 
+ +# without cleanup: +# local/nnet3/run_tdnn.sh --train-set train960 --gmm tri6b --nnet3-affix "" & + + +# At this script level we don't support not running on GPU, as it would be painfully slow. +# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, +# --num-threads 16 and --minibatch-size 128. + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=30 +train_set=train_960_cleaned +gmm=tri6b_cleaned # this is the source gmm-dir for the data-type of interest; it + # should have alignments for the specified training data. +nnet3_affix=_cleaned + +# Options which are not passed through to run_ivector_common.sh +affix= +train_stage=-10 +common_egs_dir= +reporting_email= +remove_egs=true + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + relu-batchnorm-layer name=tdnn0 dim=1280 + relu-batchnorm-layer name=tdnn1 dim=1280 input=Append(-1,2) + relu-batchnorm-layer name=tdnn2 dim=1280 input=Append(-3,3) + relu-batchnorm-layer name=tdnn3 dim=1280 input=Append(-7,2) + relu-batchnorm-layer name=tdnn4 dim=1280 + output-layer name=output input=tdnn4 dim=$num_targets max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig \ + --config-dir $dir/configs || exit 1; +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/train_dnn.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir $train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.0017 \ + --trainer.optimization.final-effective-lrate 0.00017 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --cleanup.preserve-model-interval 100 \ + --feat-dir=$train_data_dir \ + --ali-dir $ali_dir \ + --lang data/lang \ + --reporting.email="$reporting_email" \ + --dir=$dir || exit 1; + +fi + +if [ $stage -le 13 ]; then + # this does offline decoding that should give about the same results as the + # real online decoding (the one with --per-utt true) + rm $dir/.error 2>/dev/null || true + for test in test_clean test_other dev_clean dev_other; do + ( + steps/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${test}_hires \ + ${graph_dir} data/${test}_hires $dir/decode_${test}_tgsmall || exit 1 + steps/lmrescore.sh --cmd "$decode_cmd" data/lang_test_{tgsmall,tgmed} \ + data/${test}_hires $dir/decode_${test}_{tgsmall,tgmed} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,tglarge} \ + data/${test}_hires $dir/decode_${test}_{tgsmall,tglarge} || exit 1 + steps/lmrescore_const_arpa.sh \ + --cmd "$decode_cmd" data/lang_test_{tgsmall,fglarge} \ + data/${test}_hires $dir/decode_${test}_{tgsmall,fglarge} || exit 1 + ) || touch $dir/.error & + done + wait + [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1 +fi + +exit 0; diff --git a/src/bin/Makefile b/src/bin/Makefile index 7cb01b50120..bfb037fc792 
100644 --- a/src/bin/Makefile +++ b/src/bin/Makefile @@ -22,7 +22,7 @@ BINFILES = align-equal align-equal-compiled acc-tree-stats \ matrix-sum build-pfile-from-ali get-post-on-ali tree-info am-info \ vector-sum matrix-sum-rows est-pca sum-lda-accs sum-mllt-accs \ transform-vec align-text matrix-dim post-to-smat compile-graph \ - compare-int-vector + compare-int-vector compute-gop OBJFILES = diff --git a/src/bin/compute-gop.cc b/src/bin/compute-gop.cc new file mode 100644 index 00000000000..63b42212ee7 --- /dev/null +++ b/src/bin/compute-gop.cc @@ -0,0 +1,227 @@ +// bin/compute-gop.cc + +// Copyright 2019 Junbo Zhang + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +/** + This code computes Goodness of Pronunciation (GOP) and extracts phone-level + pronunciation feature for mispronunciations detection tasks, the reference: + + "Improved mispronunciation detection with deep neural network trained acoustic + models and transfer learning based logistic regression classifiers" + by Hu et al., Speech Comunication, 2015. + + GOP is widely used to detect mispronunciations. The DNN-based GOP was defined + as the log phone posterior ratio between the canonical phone and the one with + the highest score. 
+ + To compute GOP, we need to compute Log Phone Posterior (LPP): + LPP(p) = \log p(p|\mathbf o; t_s,t_e) + where {\mathbf o} is the input observations, p is the canonical phone, + {t_s, t_e} are the start and end frame indexes. + + LPP could be calculated as the average of the frame-level LPP, i.e. p(p|o_t): + LPP(p) = \frac{1}{t_e-t_s+1} \sum_{t=t_s}^{t_e}\log p(p|o_t) + p(p|o_t) = \sum_{s \in p} p(s|o_t) + where s is the senone label, {s|s \in p} is the states belonging to those + triphones whose current phone is p. + + GOP is extracted from LPP: + GOP(p) = \log \frac{LPP(p)}{\max_{q\in Q} LPP(q)} + + An array of a phone-level feature for each phone is extracted as well, which + could be used to train a classifier to detect mispronunciations. Normally the + classifier-based approach archives better performance than the GOP-based approach. + + The phone-level feature is defined as: + {[LPP(p_1),\cdots,LPP(p_M), LPR(p_1|p_i), \cdots, LPR(p_j|p_i),\cdots]}^T + + where the Log Posterior Ratio (LPR) between phone p_j and p_i is defined as: + LPR(p_j|p_i) = \log p(p_j|\mathbf o; t_s, t_e) - \log p(p_i|\mathbf o; t_s, t_e) + */ + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "hmm/transition-model.h" +#include "hmm/hmm-utils.h" +#include "hmm/tree-accu.h" +#include "hmm/posterior.h" + +namespace kaldi { + +/** FrameLevelLpp compute a log posterior for pure-phones by sum the posterior + of the states belonging to those triphones whose current phone is the canonical + phone: + + p(p|o_t) = \sum_{s \in p} p(s|o_t), + + where s is the senone label, {s|s \in p} is the states belonging to those + riphones whose current phone is the canonical phone p. + + */ +void FrameLevelLpp(const SubVector &prob_row, + const std::vector > &pdf2phones, + const std::vector *phone_map, + Vector *out_frame_level_lpp) { + for (int32 i = 0; i < prob_row.Dim(); i++) { + std::set dest_idxs; + for (int32 ph : pdf2phones.at(i)) { + dest_idxs.insert((phone_map != NULL) ? 
(*phone_map)[ph] - 1 : ph - 1); + } + + for (int32 idx : dest_idxs) { + KALDI_ASSERT(idx < out_frame_level_lpp->Dim()); + (*out_frame_level_lpp)(idx) += prob_row(i); + } + } + out_frame_level_lpp->ApplyLog(); +} + +} // namespace kaldi + +int main(int argc, char *argv[]) { + using namespace kaldi; + typedef kaldi::int32 int32; + try { + const char *usage = + "Compute Goodness Of Pronunciation (GOP) from a matrix of " + "probabilities (e.g. from nnet3-compute).\n" + "Usage: compute-gop [options] " + " " + "[]\n" + "e.g.:\n" + " nnet3-compute [args] | compute-gop 1.mdl ark:ali-phone.1 ark:-" + " ark:gop.1 ark:phone-feat.1\n"; + + ParseOptions po(usage); + + bool log_applied = true; + std::string phone_map_rxfilename; + + po.Register("log-applied", &log_applied, + "If true, assume the input probabilities have been applied log."); + po.Register("phone-map", &phone_map_rxfilename, + "File name containing old->new phone mapping (each line is: " + "old-integer-id new-integer-id)"); + + po.Read(argc, argv); + + if (po.NumArgs() != 4 && po.NumArgs() != 5) { + po.PrintUsage(); + exit(1); + } + + std::string model_filename = po.GetArg(1), + alignments_rspecifier = po.GetArg(2), + prob_rspecifier = po.GetArg(3), + gop_wspecifier = po.GetArg(4), + feat_wspecifier = po.GetArg(5); + + TransitionModel trans_model; + { + bool binary; + Input ki(model_filename, &binary); + trans_model.Read(ki.Stream(), binary); + } + std::vector > pdf2phones; + GetPdfToPhonesMap(trans_model, &pdf2phones); + int32 phone_num = trans_model.NumPhones(); + + std::vector phone_map; + if (phone_map_rxfilename != "") { + ReadPhoneMap(phone_map_rxfilename, &phone_map); + phone_num = phone_map[phone_map.size() - 1]; + } + + RandomAccessInt32VectorReader alignment_reader(alignments_rspecifier); + SequentialBaseFloatMatrixReader prob_reader(prob_rspecifier); + PosteriorWriter gop_writer(gop_wspecifier); + BaseFloatMatrixWriter feat_writer(feat_wspecifier); + + int32 num_done = 0; + for (; !prob_reader.Done(); 
prob_reader.Next()) { + std::string key = prob_reader.Key(); + auto alignment = alignment_reader.Value(key); + Matrix &probs = prob_reader.Value(); + if (log_applied) probs.ApplyExp(); + + int32 frame_num = alignment.size(); + if (alignment.size() != probs.NumRows()) { + KALDI_WARN << "The frame numbers of alignment and prob are not equal."; + if (frame_num > probs.NumRows()) frame_num = probs.NumRows(); + } + + KALDI_ASSERT(frame_num > 0); + int32 cur_phone_id = alignment[0] - 1; // start by 0, skipping + int32 duration = 0; + Vector phone_level_feat(phone_num * 2); // LPPs and LPRs + SubVector lpp_part(phone_level_feat, 0, phone_num); + std::vector > phone_level_feat_stdvector; + Posterior posterior_gop; + for (int32 i = 0; i < frame_num; i++) { + // Calculate LPP and LPR for each pure-phone + Vector frame_level_lpp(phone_num); + FrameLevelLpp(probs.Row(i), pdf2phones, + (phone_map_rxfilename != "") ? &phone_map : NULL, + &frame_level_lpp); + + // LPP(p)=\frac{1}{t_e-t_s+1} \sum_{t=t_s}^{t_e}\log p(p|o_t) + lpp_part.AddVec(1, frame_level_lpp); + duration++; + + int32 next_phone_id = (i < frame_num - 1) ? 
alignment[i + 1] - 1: -1; + if (next_phone_id != cur_phone_id) { + // The current phone's feature have been ready + lpp_part.Scale(1.0 / duration); + + // LPR(p_j|p_i)=\log p(p_j|\mathbf o; t_s, t_e)-\log p(p_i|\mathbf o; t_s, t_e) + for (int k = 0; k < phone_num; k++) + phone_level_feat(phone_num + k) = lpp_part(cur_phone_id) - lpp_part(k); + phone_level_feat_stdvector.push_back(phone_level_feat); + + // Compute GOP from LPP + // GOP(p)=\log \frac{LPP(p)}{\max_{q\in Q} LPP(q)} + BaseFloat gop = lpp_part(cur_phone_id) - lpp_part.Max(); + std::vector > posterior_item; + posterior_item.push_back(std::make_pair(cur_phone_id + 1, gop)); + posterior_gop.push_back(posterior_item); + + // Reset + phone_level_feat.Set(0); + duration = 0; + } + cur_phone_id = next_phone_id; + } + + // Write GOPs and the phone-level features + Matrix feats(phone_level_feat_stdvector.size(), phone_num * 2); + for (int32 i = 0; i < phone_level_feat_stdvector.size(); i++) { + SubVector row(feats, i); + row.AddVec(1.0, phone_level_feat_stdvector[i]); + } + feat_writer.Write(key, feats); + gop_writer.Write(key, posterior_gop); + num_done++; + } + + KALDI_LOG << "Processed " << num_done << " prob matrices."; + return (num_done != 0 ? 
0 : 1); + } catch (const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} diff --git a/src/hmm/hmm-utils.cc b/src/hmm/hmm-utils.cc index 06edf8d5976..15a1edfd255 100644 --- a/src/hmm/hmm-utils.cc +++ b/src/hmm/hmm-utils.cc @@ -1289,5 +1289,16 @@ void ChangeReorderingOfAlignment(const TransitionModel &trans_model, } } +void GetPdfToPhonesMap(const TransitionModel &trans_model, + std::vector > *pdf2phones) { + pdf2phones->clear(); + pdf2phones->resize(trans_model.NumPdfs()); + for (int32 i = 0; i < trans_model.NumTransitionIds(); i++) { + int32 trans_id = i + 1; + int32 pdf_id = trans_model.TransitionIdToPdf(trans_id); + int32 phone = trans_model.TransitionIdToPhone(trans_id); + (*pdf2phones)[pdf_id].insert(phone); + } +} } // namespace kaldi diff --git a/src/hmm/hmm-utils.h b/src/hmm/hmm-utils.h index a8ad846949e..4415927df4e 100644 --- a/src/hmm/hmm-utils.h +++ b/src/hmm/hmm-utils.h @@ -329,6 +329,12 @@ void GetRandomAlignmentForPhone(const ContextDependencyInterface &ctx_dep, void ChangeReorderingOfAlignment(const TransitionModel &trans_model, std::vector *alignment); + +// GetPdfToPhonesMap creates a map which maps each pdf-id into its +// corresponding monophones. +void GetPdfToPhonesMap(const TransitionModel &trans_model, + std::vector > *pdf2phones); + /// @} end "addtogroup hmm_group" } // end namespace kaldi