diff --git a/egs/swbd/s5c/local/chain/run_tdnn_2i.sh b/egs/swbd/s5c/local/chain/run_tdnn_2i.sh index eaa5a77949f..218890cc418 100755 --- a/egs/swbd/s5c/local/chain/run_tdnn_2i.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn_2i.sh @@ -1,69 +1,10 @@ #!/bin/bash - -# _2i is as _2d but with a new set of code for estimating the LM, in which we compute -# the log-like change when deciding which states to back off. The code is not the same -# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By -# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration -# is quite similar to 2d, except new/more-exact code is used. - -# see table in run_tdnn_2a.sh for results - -# _2d is as _2c but with different LM options: -# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" -# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. -# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions -# provided from the tree-building, and effectively puts the leftmost context position as a single -# set. -# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg -# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. - -# _2c is as _2a but after a code change in which we start using transition-scale -# and self-loop-scale of 1 instead of zero in training; we change the options to -# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect -# results at all; it's is mainly for convenience in pushing weights in graphs, -# and checking that graphs are stochastic. - -# _2a is as _z but setting --lm-opts "--num-extra-states=8000". - -# _z is as _x but setting --lm-opts "--num-extra-states=2000". -# (see also y, which has --num-extra-states=500). - -# _x is as _s but setting --lm-opts "--num-extra-states=0". -# this is a kind of repeat of the u->v experiment, where it seemed to make things -# worse, but there were other factors involved in that so I want to be sure. - -# _s is as _q but setting pdf-boundary-penalty to 0.0 -# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, -# and 18.07 -> 16.96 on train_dev, after fg rescoring. - -# _q is as _p except making the same change as from n->o, which -# reduces the parameters to try to reduce over-training. We reduce -# relu-dim from 1024 to 850, and target num-states from 12k to 9k, -# and modify the splicing setup. -# note: I don't rerun the tree-building, I just use the '5o' treedir. - -# _p is as _m except with a code change in which we switch to a different, more -# exact mechanism to deal with the edges of the egs, and correspondingly -# different script options... we now dump weights with the egs, and apply the -# weights to the derivative w.r.t. the output instead of using the -# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap -# to 30 also. This wil. give 10 frames on each side with zero derivs, then -# ramping up to a weight of 1.0 over 10 frames. - -# _m is as _k but after a code change that makes the denominator FST more -# compact. I am rerunning in order to verify that the WER is not changed (since -# it's possible in principle that due to edge effects related to weight-pushing, -# the results could be a bit different). -# The results are inconsistently different but broadly the same. On all of eval2000, -# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. 
-# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring.
-
-
-# _k is as _i but reverting the g->h change, removing the --scale-max-param-change
-# option and setting max-param-change to 1..  Using the same egs.
-
+# _2i is as _i but it uses speaker perturbation combined with speed perturbation.
 # _i is as _h but longer egs: 150 frames instead of 75, and
 # 128 elements per minibatch instead of 256.
+# be cautious comparing the valid probs with h though, because
+# we fixed the utt2uniq bug at this point, so from h on, the valid probs
+# are properly held out.
 
 # _h is as _g but different application of max-param-change (use --scale-max-param-change true)
 
@@ -93,21 +34,23 @@ set -e
 # configs for 'chain'
-stage=12
+stage=1
 train_stage=-10
 get_egs_stage=-10
 speed_perturb=true
+speaker_perturb=true
 dir=exp/chain/tdnn_2i  # Note: _sp will get added to this if $speed_perturb == true.
 
 # TDNN options
-splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -6,3 -6,3"
+splice_indexes="-2,-1,0,1,2 -1,2 -3,3 -9,0,9 0"
 
 # training options
 num_epochs=4
 initial_effective_lrate=0.001
 final_effective_lrate=0.0001
 leftmost_questions_truncate=30
-max_param_change=1.0
+max_param_change=0.3333
+scale_max_param_change=true
 final_layer_normalize_target=0.5
 num_jobs_initial=3
 num_jobs_final=16
@@ -138,16 +81,19 @@ suffix=
 if [ "$speed_perturb" == "true" ]; then
   suffix=_sp
 fi
+if [ "$speaker_perturb" == "true" ]; then
+  suffix=$suffix"_fp"
+fi
 dir=${dir}$suffix
 train_set=train_nodup$suffix
 ali_dir=exp/tri4_ali_nodup$suffix
-treedir=exp/chain/tri5o_tree$suffix
+treedir=exp/chain/tri5f_tree$suffix
 
 # if we are using the speed-perturbed data we need to generate
 # alignments for it.
-local/nnet3/run_ivector_common.sh --stage $stage \
-  --speed-perturb $speed_perturb \
+local/nnet3/run_ivector_common_v2.sh --stage $stage \
+  --speed-perturb $speed_perturb --speaker-perturb $speaker_perturb \
   --generate-alignments $speed_perturb || exit 1;
 
 
@@ -161,6 +107,7 @@ if [ $stage -le 9 ]; then
 fi
 
 
+if false; then #100
 if [ $stage -le 10 ]; then
   # Create a version of the lang/ directory that has one state per phone in the
   # topo file. [note, it really has two states.. the first one is only repeated
@@ -179,23 +126,23 @@ if [ $stage -le 11 ]; then
   # Build a tree using our new topology.
   steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \
       --leftmost-questions-truncate $leftmost_questions_truncate \
-      --cmd "$train_cmd" 9000 data/$train_set data/lang_chain_d $ali_dir $treedir
+      --cmd "$train_cmd" 12000 data/$train_set data/lang_chain_d $ali_dir $treedir
 fi
+fi #100
 
 if [ $stage -le 12 ]; then
  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
   utils/create_split_dir.pl \
-    /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
+    /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage
  fi
 
 touch $dir/egs/.nodelete # keep egs around when that run dies.
 steps/nnet3/chain/train_tdnn.sh --stage $train_stage \
-    --pdf-boundary-penalty 0.0 \
-    --lm-opts "--num-extra-lm-states=2000" \
     --get-egs-stage $get_egs_stage \
+    --left-deriv-truncate 5 --right-deriv-truncate 5 --right-tolerance 5 \
     --minibatch-size $minibatch_size \
-    --egs-opts "--frames-overlap-per-eg 30" \
+    --egs-opts "--frames-overlap-per-eg 10 --nj 40" \
     --frames-per-eg $frames_per_eg \
     --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \
     --splice-indexes "$splice_indexes" \
@@ -205,7 +152,7 @@ if [ $stage -le 12 ]; then
     --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \
     --max-param-change $max_param_change \
     --final-layer-normalize-target $final_layer_normalize_target \
-    --relu-dim 850 \
+    --relu-dim 1024 \
     --cmd "$decode_cmd" \
     --remove-egs $remove_egs \
     data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1;
@@ -215,7 +162,8 @@ if [ $stage -le 13 ]; then
   # Note: it might appear that this $lang directory is mismatched, and it is as
   # far as the 'topo' is concerned, but this script doesn't read the 'topo' from
   # the lang directory.
-  utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
+  utils/mkgraph.sh --transition-scale 0.0 \
+    --self-loop-scale 0.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg
 fi
 
 decode_suff=sw1_tg
diff --git a/egs/swbd/s5c/local/nnet3/run_ivector_common_v2.sh b/egs/swbd/s5c/local/nnet3/run_ivector_common_v2.sh
new file mode 100755
index 00000000000..d46d5cc7238
--- /dev/null
+++ b/egs/swbd/s5c/local/nnet3/run_ivector_common_v2.sh
@@ -0,0 +1,190 @@
+#!/bin/bash
+
+. ./cmd.sh
+set -e
+stage=1
+train_stage=-10
+generate_alignments=true # false if doing ctc training
+speed_perturb=true
+speaker_perturb=true
+lpc_order=100
+filter_nj=30
+spkf_per_spk=3
+perturb_suffix=""
+
+. ./path.sh
+. ./utils/parse_options.sh
+
+mkdir -p nnet3
+# perturbed data preparation
+train_set=train_nodup
+
+if $speed_perturb; then
+  perturb_suffix="_sp"
+fi
+
+if $speaker_perturb; then
+  perturb_suffix=$perturb_suffix"_fp"
+fi
+
+if [ "$speed_perturb" == "true" ]; then
+  if [ $stage -le 1 ]; then
+    echo "speed-perturb the data"
+    # Although the nnet will be trained on high-resolution data, we still have
+    # to perturb the normal-resolution data to get the alignments.
+    # _sp stands for speed-perturbed.
+
+    for datadir in train_nodup; do
+      utils/perturb_data_dir_speed.sh 0.9 data/${datadir} data/temp_sp1
+      utils/perturb_data_dir_speed.sh 0.95 data/${datadir} data/temp_sp2
+      utils/perturb_data_dir_speed.sh 1.05 data/${datadir} data/temp_sp3
+      utils/perturb_data_dir_speed.sh 1.1 data/${datadir} data/temp_sp4
+
+      utils/combine_data.sh data/${datadir}_temp_sp data/temp_sp1 data/temp_sp2 data/temp_sp3 data/temp_sp4
+      utils/validate_data_dir.sh --no-feats data/${datadir}_temp_sp
+      rm -r data/temp_sp1 data/temp_sp2 data/temp_sp3 data/temp_sp4
+
+      if [ "$speaker_perturb" == "true" ]; then
+        echo "speaker perturbation of data"
+        utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp_sp0
+        utils/combine_data.sh data/${datadir}_sp data/${datadir}_temp_sp data/temp_sp0
+        utils/fix_data_dir.sh data/${datadir}_sp
+
+        # compute a filter corresponding to each speed-perturbed speaker.
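+        # A rough sketch of the mechanism (see src/featbin/compute-filter.cc
+        # and src/featbin/apply-filter.cc below): compute-filter estimates a
+        # per-speaker LPC whitening filter [1, -a_1, ..., -a_P] from the
+        # accumulated autocorrelation of that speaker's waveforms, and
+        # apply-filter later convolves a signal with such a filter (FIR,
+        # removing the speaker's spectral envelope) or with its approximate
+        # inverse (IIR, imposing another speaker's envelope).  For example
+        # (hypothetical paths, a single-speaker wav.scp assumed):
+        #   compute-filter --lpc-order=$lpc_order scp:data/spk1/wav.scp \
+        #     ark,scp:spk1_filter.ark,spk1_filter.scp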
+        spk_filters=spkfilters
+        mkdir -p $spk_filters
+        utils/split_data.sh data/${datadir}_sp $filter_nj
+        echo $filter_nj > data/${datadir}_sp/num_filter_jobs
+
+        $decode_cmd JOB=1:$filter_nj data/${datadir}_sp/split$filter_nj/compute_filter.JOB.log \
+          compute-filter --lpc-order=$lpc_order scp:data/${datadir}_sp/split$filter_nj/JOB/wav.scp \
+          ark,scp:$spk_filters/spk_filter.JOB.ark,$spk_filters/spk_filter.JOB.scp || exit 1;
+
+        # combine the spk_filter.scp files together
+        for n in $(seq $filter_nj); do
+          cat $spk_filters/spk_filter.$n.scp || exit 1;
+        done > data/${datadir}_sp/spk_filter.scp
+        echo "Finished generating filters per speaker."
+
+        echo "Perturb data using speaker perturbation."
+        utils/perturb_data_signal_v2.sh $spkf_per_spk 'fp' data/${datadir}_sp data/${datadir}_temp_sp_fp
+        utils/validate_data_dir.sh --no-feats data/${datadir}_temp_sp_fp
+      fi
+
+      echo "perturb_suffix=$perturb_suffix"
+      mfccdir=mfcc_perturbed
+      echo "Generating features using perturbed data"
+      steps/make_mfcc.sh --cmd "$decode_cmd" --nj 50 \
+        data/${datadir}_temp${perturb_suffix} exp/make_mfcc/${datadir}_temp${perturb_suffix} $mfccdir || exit 1;
+      steps/compute_cmvn_stats.sh data/${datadir}_temp${perturb_suffix} exp/make_mfcc/${datadir}_temp${perturb_suffix} $mfccdir || exit 1;
+      utils/fix_data_dir.sh data/${datadir}_temp${perturb_suffix}
+
+      utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp0
+      utils/combine_data.sh data/${datadir}${perturb_suffix} data/${datadir}_temp${perturb_suffix} data/temp0
+      utils/fix_data_dir.sh data/${datadir}${perturb_suffix}
+      rm -r data/temp0 data/${datadir}_temp${perturb_suffix}
+    done
+  fi
+
+  if [ $stage -le 2 ] && [ "$generate_alignments" == "true" ]; then
+    # obtain the alignment of the perturbed data
+    steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \
+      data/train_nodup${perturb_suffix} data/lang_nosp exp/tri4 exp/tri4_ali_nodup${perturb_suffix} || exit 1
+  fi
+fi
+
+train_set=train_nodup${perturb_suffix}
+if [ $stage -le 3 ]; then
+  mfccdir=mfcc_hires
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then
+    date=$(date +'%m_%d_%H_%M')
+    utils/create_split_dir.pl /export/b0{1,2,3,4}/$USER/kaldi-data/egs/swbd-$date/s5b/$mfccdir/storage $mfccdir/storage
+  fi
+
+  # the 100k_nodup directory is copied separately, as
+  # we want to use exp/tri2_ali_100k_nodup for lda_mllt training
+  # the main train directory might be speed-perturbed
+  for dataset in $train_set train_100k_nodup; do
+    utils/copy_data_dir.sh data/$dataset data/${dataset}_hires
+
+    # scale the waveforms; this is useful as we don't use CMVN
+    data_dir=data/${dataset}_hires
+    cat $data_dir/wav.scp | python -c "
+import sys, os, subprocess, re, random
+scale_low = 1.0/8
+scale_high = 2.0
+for line in sys.stdin.readlines():
+  if len(line.strip()) == 0:
+    continue
+  print '{0} sox --vol {1} -t wav - -t wav - |'.format(line.strip(), random.uniform(scale_low, scale_high))
+"| sort -k1,1 -u > $data_dir/wav.scp_scaled || exit 1;
+    mv $data_dir/wav.scp_scaled $data_dir/wav.scp
+
+    steps/make_mfcc.sh --nj 70 --mfcc-config conf/mfcc_hires.conf \
+      --cmd "$train_cmd" data/${dataset}_hires exp/make_hires/$dataset $mfccdir;
+    steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/${dataset} $mfccdir;
+
+    # Remove the small number of utterances that couldn't be extracted for some
+    # reason (e.g. too short; no such file).
+    utils/fix_data_dir.sh data/${dataset}_hires;
+  done
+  if false; then #300
+  for dataset in eval2000 train_dev; do
+    # Create MFCCs for the eval set
+    utils/copy_data_dir.sh data/$dataset data/${dataset}_hires
+    steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 --mfcc-config conf/mfcc_hires.conf \
+        data/${dataset}_hires exp/make_hires/$dataset $mfccdir;
+    steps/compute_cmvn_stats.sh data/${dataset}_hires exp/make_hires/$dataset $mfccdir;
+    utils/fix_data_dir.sh data/${dataset}_hires  # remove segments with problems
+  done
+  fi #300
+  # Take the first 30k utterances (about 1/8th of the data); this will be used
+  # for the diag-UBM training
+  utils/subset_data_dir.sh --first data/${train_set}_hires 30000 data/${train_set}_30k_hires
+  local/remove_dup_utts.sh 200 data/${train_set}_30k_hires data/${train_set}_30k_nodup_hires  # 33hr
+fi
+if false; then #400
+# ivector extractor training
+if [ $stage -le 5 ]; then
+  # We need to build a small system just because we need the LDA+MLLT transform
+  # to train the diag-UBM on top of.  We use --num-iters 13 because after we get
+  # the transform (12th iter is the last), any further training is pointless.
+  # this decision is based on fisher_english
+  steps/train_lda_mllt.sh --cmd "$train_cmd" --num-iters 13 \
+    --splice-opts "--left-context=3 --right-context=3" \
+    5500 90000 data/train_100k_nodup_hires \
+    data/lang_nosp exp/tri2_ali_100k_nodup exp/nnet3/tri3b
+fi
+
+if [ $stage -le 6 ]; then
+  # To train a diagonal UBM we don't need very much data, so use the smallest subset.
+  steps/online/nnet2/train_diag_ubm.sh --cmd "$train_cmd" --nj 30 --num-frames 200000 \
+    data/${train_set}_30k_nodup_hires 512 exp/nnet3/tri3b exp/nnet3/diag_ubm
+fi
+
+if [ $stage -le 7 ]; then
+  # iVector extractors can be sensitive to the amount of data, but this one has a
+  # fairly small dim (defaults to 100) so we don't use all of it, we use just the
+  # 100k subset (just under half the data).
+  steps/online/nnet2/train_ivector_extractor.sh --cmd "$train_cmd" --nj 10 \
+    data/train_100k_nodup_hires exp/nnet3/diag_ubm exp/nnet3/extractor || exit 1;
+fi
+fi #400
+
+if [ $stage -le 8 ]; then
+  # We extract iVectors on all the train_nodup data, which will be what we
+  # train the system on.
+
+  # having a larger number of speakers is helpful for generalization, and to
+  # handle per-utterance decoding well (iVector starts at zero).
+  steps/online/nnet2/copy_data_dir.sh --utts-per-spk-max 2 data/${train_set}_hires data/${train_set}_max2_hires
+
+  steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
+    data/${train_set}_max2_hires exp/nnet3/extractor exp/nnet3/ivectors_$train_set || exit 1;
+
+  for data_set in eval2000 train_dev; do
+    steps/online/nnet2/extract_ivectors_online.sh --cmd "$train_cmd" --nj 30 \
+      data/${data_set}_hires exp/nnet3/extractor exp/nnet3/ivectors_$data_set || exit 1;
+  done
+fi
+
+exit 0;
diff --git a/egs/wsj/s5/utils/perturb_data_signal.sh b/egs/wsj/s5/utils/perturb_data_signal.sh
new file mode 100755
index 00000000000..dbc123769d1
--- /dev/null
+++ b/egs/wsj/s5/utils/perturb_data_signal.sh
@@ -0,0 +1,149 @@
+#!/bin/bash
+
+# Copyright 2013  Johns Hopkins University (author: Daniel Povey)
+#           2014  Tom Ko
+# Apache 2.0
+
+# This script operates on a directory, such as data/train/,
+# that contains some subset of the following files:
+#  wav.scp
+#  spk2utt
+#  utt2spk
+#  text
+#  spk_filter.scp
+# It generates the files which are used for perturbing the data at signal level.
+
+. utils/parse_options.sh
+
+if [ $# != 3 ]; then
+  echo "Usage: perturb_data_signal.sh <prefix> <srcdir> <destdir>"
+  echo "e.g.:"
+  echo " $0 'fp01' data/train_si284 data/train_si284p"
+  exit 1
+fi
+
+export LC_ALL=C
+
+prefix=$1
+srcdir=$2
+destdir=$3
+spk_prefix=$prefix"-"
+utt_prefix=$prefix"-"
+
+for f in spk2utt text utt2spk wav.scp spk_filter.scp; do
+  [ ! -f $srcdir/$f ] && echo "$0: no such file $srcdir/$f" && exit 1;
+done
+
+set -e;
+set -o pipefail
+
+mkdir -p $destdir
+
+cat $srcdir/utt2spk | awk -v p=$utt_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/utt_map
+cat $srcdir/spk2utt | awk -v p=$spk_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/spk_map
+cat $srcdir/utt2spk | awk -v p=$utt_prefix '{printf("%s%s %s\n", p, $1, $1);}' > $destdir/utt2uniq
+
+cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \
+  utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk
+
+utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt
+
+
+# The following perl script is the core part.
+
+echo $spk_prefix | perl -e '
+  $prefix = <STDIN>;
+  chomp($prefix);
+  ($u2s_in, $seg_in, $wav_in, $filt_in, $wav_out) = @ARGV;
+  if (open(SEG, "<$seg_in")) {
+    $have_segments="true";
+  } else {
+    $have_segments="false";
+  }
+  open(UI, "<$u2s_in") || die "Error: fail to open $u2s_in\n";
+  open(WI, "<$wav_in") || die "Error: fail to open $wav_in\n";
+  open(FI, "<$filt_in") || die "Error: fail to open $filt_in\n";
+  open(WO, ">$wav_out") || die "Error: fail to open $wav_out\n";
+  while (<UI>) {
+    chomp;
+    @col = split;
+    @col == 2 || die "Error: bad line $_\n";
+    ($utt_id, $spk) = @col;
+    $utt2spk{$utt_id} = $spk;
+  }
+  if ($have_segments eq "true") {
+    while (<SEG>) {
+      chomp;
+      @col = split;
+      $reco2utt{$col[1]} = $col[0];
+    }
+  }
+  while (<WI>) {
+    chomp;
+    @col = split;
+    $pipe = join(" ", @col[1..@col-1]);
+    $reco2pipe{$col[0]} = $pipe;
+    $recolist{$col[0]} = $col[0];
+    if ($have_segments eq "false") {
+      $reco2utt{$col[0]} = $col[0];
+    }
+  }
+  while (<FI>) {
+    chomp;
+    @col = split;
+    @col == 2 || die "Error: bad line $_\n";
+    $spk2filt{$col[0]} = $col[1];
+  }
+
+  foreach $reco (sort keys %recolist) {
+    #$reco2spk{$reco} = $utt2spk{$reco2utt{$reco}};
+    #$reco2filt{$reco} = $spk2filt{$utt2spk{$reco2utt{$reco}}};
+    $reco2spk{$reco} = $reco;
+    $reco2filt{$reco} = $spk2filt{$reco};
+    if ($reco2filt{$reco} eq "") {
+      $spk = (keys %spk2filt)[rand keys %spk2filt];
+      $reco2spk{$reco} = $spk;
+      $reco2filt{$reco} = $spk2filt{$spk};
+    }
+    while (1) {
+      # randomly pick a filter from another speaker
+      $spk = (keys %spk2filt)[rand keys %spk2filt];
+      $reco2perturbspk{$reco} = $spk;
+      $reco2perturbfilt{$reco} = $spk2filt{$spk};
+      if ($reco2perturbfilt{$reco} ne $reco2filt{$reco}) {
+        last;
+      }
+    }
+  }
+
+  foreach $reco (sort keys %recolist) {
+    print WO "$prefix$reco $reco2pipe{$reco} apply-filter --inverse=false \"ark:echo $reco2spk{$reco} $reco2filt{$reco} | \" - - | apply-filter --inverse=true \"ark:echo $reco2perturbspk{$reco} $reco2perturbfilt{$reco} | \" - - |\n";
  }

' $srcdir/utt2spk $srcdir/segments $srcdir/wav.scp \
$srcdir/spk_filter.scp $destdir/wav.scp

if [ -f $srcdir/segments ]; then
  # also apply the spk_prefix to the recording-ids.
+  cat $srcdir/wav.scp | awk -v p=$spk_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/reco_map
+
+  utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments | utils/apply_map.pl -f 2 $destdir/reco_map >$destdir/segments
+
+  if [ -f $srcdir/reco2file_and_channel ]; then
+    utils/apply_map.pl -f 1 $destdir/reco_map <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel
+  fi
+
+  rm $destdir/reco_map 2>/dev/null
+fi
+
+if [ -f $srcdir/text ]; then
+  utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text
+fi
+if [ -f $srcdir/spk2gender ]; then
+  utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender
+fi
+
+
+rm $destdir/spk_map $destdir/utt_map 2>/dev/null
+echo "$0: generated signal-perturbed version of data in $srcdir, in $destdir"
+utils/validate_data_dir.sh --no-feats $destdir
diff --git a/egs/wsj/s5/utils/perturb_data_signal_v2.sh b/egs/wsj/s5/utils/perturb_data_signal_v2.sh
new file mode 100755
index 00000000000..c205b67e5e0
--- /dev/null
+++ b/egs/wsj/s5/utils/perturb_data_signal_v2.sh
@@ -0,0 +1,187 @@
+#!/bin/bash
+
+# Copyright 2013  Johns Hopkins University (author: Daniel Povey)
+#           2014  Tom Ko
+# Apache 2.0
+
+# This script operates on a directory, such as data/train/,
+# that contains some subset of the following files:
+#  wav.scp
+#  spk2utt
+#  utt2spk
+#  text
+#  spk_filter.scp
+# It generates the files which are used for perturbing the data at signal level.
+
+. utils/parse_options.sh
+
+if [ $# != 4 ]; then
+  echo "Usage: perturb_data_signal_v2.sh <num-parts> <prefix> <srcdir> <destdir>"
+  echo "e.g.:"
+  echo " $0 3 'fp01' data/train_si284 data/train_si284p"
+  exit 1
+fi
+
+export LC_ALL=C
+
+num_parts=$1
+prefix=$2
+srcdir=$3
+destdir=$4
+spk_prefix=$prefix"-"
+utt_prefix=$prefix"-"
+
+for f in spk2utt text utt2spk wav.scp spk_filter.scp; do
+  [ ! -f $srcdir/$f ] && echo "$0: no such file $srcdir/$f" && exit 1;
+done
+
+set -e;
+set -o pipefail
+
+mkdir -p $destdir
+
+cat $srcdir/utt2spk | awk -v p=$utt_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/utt_map
+cat $srcdir/spk2utt | awk -v p=$spk_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/spk_map
+cat $srcdir/utt2spk | awk -v p=$utt_prefix '{printf("%s%s %s\n", p, $1, $1);}' > $destdir/utt2uniq
+
+cat $srcdir/utt2spk | utils/apply_map.pl -f 1 $destdir/utt_map | \
+  utils/apply_map.pl -f 2 $destdir/spk_map >$destdir/utt2spk
+
+utils/utt2spk_to_spk2utt.pl <$destdir/utt2spk >$destdir/spk2utt
+
+
+# The following perl script is the core part.
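+# In outline (a sketch of what the script below does): it reads utt2spk,
+# spk2utt, segments (if present), wav.scp and spk_filter.scp; splits each
+# speaker's utterances into num_parts parts; for each part picks a filter
+# from a randomly chosen *different* speaker; and writes wav.scp entries
+# whose pipes first whiten the signal with the original speaker's filter
+# (apply-filter, FIR) and then re-color it with the other speaker's inverse
+# filter (apply-filter --inverse=true, IIR).  A generated entry looks
+# roughly like this (hypothetical ids):
+#   fp01-sw02001-00 <original pipe> apply-filter "scp:echo sw02001 <filt> |" - - | \
+#     apply-filter --inverse=true "scp:echo sw02054 <filt> |" - - |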
+
+echo $spk_prefix | perl -e '
+  $prefix = <STDIN>;
+  chomp($prefix);
+  ($num_parts, $u2s_in, $s2u_in, $seg_in, $wav_in, $filt_in, $wav_out, $seg_out) = @ARGV;
+  if (open(SEG, "<$seg_in")) {
+    $have_segments="true";
+  } else {
+    $have_segments="false";
+  }
+  open(UI, "<$u2s_in") || die "Error: fail to open $u2s_in\n";
+  open(SI, "<$s2u_in") || die "Error: fail to open $s2u_in\n";
+  open(WI, "<$wav_in") || die "Error: fail to open $wav_in\n";
+  open(FI, "<$filt_in") || die "Error: fail to open $filt_in\n";
+  open(WO, ">$wav_out") || die "Error: fail to open $wav_out\n";
+  open(SO, ">$seg_out") || die "Error: fail to open $seg_out\n";
+  while (<UI>) {
+    chomp;
+    @col = split;
+    @col == 2 || die "Error: bad line $_\n";
+    ($utt_id, $spk) = @col;
+    $utt2spk{$utt_id} = $spk;
+  }
+  while (<SI>) {
+    chomp;
+    @col = split;
+    $spks = join(" ", @col[1..@col-1]);
+    $spk2utt{$col[0]} = $spks;
+  }
+  if ($have_segments eq "true") {
+    while (<SEG>) {
+      chomp;
+      @col = split;
+      $seg = join(" ", @col[2..@col-1]);
+      $reco2utt{$col[1]} = $col[0];
+      $utt2reco{$col[0]} = $col[1];
+      $utt2seg{$col[0]} = $seg;
+    }
+  }
+  while (<WI>) {
+    chomp;
+    @col = split;
+    $pipe = join(" ", @col[1..@col-1]);
+    $reco2pipe{$col[0]} = $pipe;
+    $recolist{$col[0]} = $col[0];
+    if ($have_segments eq "false") {
+      $reco2utt{$col[0]} = $col[0];
+    }
+  }
+  while (<FI>) {
+    chomp;
+    @col = split;
+    @col == 2 || die "Error: bad line $_\n";
+    $spk2filt{$col[0]} = $col[1];
+  }
+
+  foreach $reco (sort keys %recolist) {
+    #$reco2spk{$reco} = $utt2spk{$reco2utt{$reco}};
+    #$reco2filt{$reco} = $spk2filt{$utt2spk{$reco2utt{$reco}}};
+    $reco2spk{$reco} = $reco;
+    $reco2filt{$reco} = $spk2filt{$reco};
+    for (my $i=0; $i < $num_parts; $i++) {
+      $newreco2spk{$reco.$i} = $reco;
+    }
+    @spk2filt_rand{keys %spk2filt} = @spk2filt{keys %spk2filt};
+    delete $spk2filt_rand{$reco};
+    if ($reco2filt{$reco} eq "") {
+      $spk = (keys %spk2filt)[rand keys %spk2filt];
+      $reco2spk{$reco} = $spk;
+      $reco2filt{$reco} = $spk2filt{$spk};
+      delete $spk2filt_rand{$spk};
+    }
+    for (my $i=0; $i < $num_parts; $i++) {
+      # randomly pick a filter from another speaker
+      $spk = (keys %spk2filt_rand)[rand keys %spk2filt_rand];
+      $newreco2perturbspk{$reco.$i} = $spk;
+      $newreco2perturbfilt{$reco.$i} = $spk2filt{$spk};
+      delete $spk2filt_rand{$spk};
+    }
+  }
+
+  foreach $spk (sort keys %spk2utt) {
+    @utts = split(" ", $spk2utt{$spk});
+    $numutts = @utts;
+    if ($numutts < $num_parts) {
+      $partsize = $numutts;
+    } else {
+      $partsize = $numutts / $num_parts;
+    }
+    for (my $i=0; $i < $numutts; $i++) {
+      $partid = int($i / $partsize);
+      $utt = $utts[$i];
+      $filled = sprintf "%02d", $partid;
+      print SO "$prefix$utt $prefix$utt2reco{$utt}-$filled $utt2seg{$utt}\n";
+      $newrecolist{"$prefix$utt2reco{$utt}-$filled"} = "$prefix$utt2reco{$utt}-$filled";
+    }
+  }
+
+  foreach $reco (sort keys %recolist) {
+    for (my $i=0; $i < $num_parts; $i++) {
+      $filled = sprintf "%02d", $i;
+      if ($newrecolist{"$prefix$reco-$filled"} ne "") {
+        print WO "$prefix$reco-$filled $reco2pipe{$reco} apply-filter \"scp:echo $reco2spk{$reco} $reco2filt{$reco} |\" - - | apply-filter --inverse=true \"scp:echo $newreco2perturbspk{$reco.$i} $newreco2perturbfilt{$reco.$i} |\" - - |\n";
+      }
+    }
+  }
+
+' $num_parts $srcdir/utt2spk $srcdir/spk2utt $srcdir/segments $srcdir/wav.scp \
+$srcdir/spk_filter.scp $destdir/wav.scp $destdir/segments
+
+if [ -f $srcdir/segments ]; then
+  # also apply the spk_prefix to the recording-ids.
+  cat $srcdir/wav.scp | awk -v p=$spk_prefix '{printf("%s %s%s\n", $1, p, $1);}' > $destdir/reco_map
+
+#  utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/segments | utils/apply_map.pl -f 2 $destdir/reco_map >$destdir/segments
+
+#  if [ -f $srcdir/reco2file_and_channel ]; then
+#    utils/apply_map.pl -f 1 $destdir/reco_map <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel
+#  fi
+
+  rm $destdir/reco_map 2>/dev/null
+fi
+
+if [ -f $srcdir/text ]; then
+  utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/text >$destdir/text
+fi
+if [ -f $srcdir/spk2gender ]; then
+  utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender
+fi
+
+
+rm $destdir/spk_map $destdir/utt_map 2>/dev/null
+echo "$0: generated signal-perturbed version of data in $srcdir, in $destdir"
+utils/validate_data_dir.sh --no-feats $destdir
diff --git a/src/feat/signal.cc b/src/feat/signal.cc
index e8fbb0b84cf..a374c531e3d 100644
--- a/src/feat/signal.cc
+++ b/src/feat/signal.cc
@@ -38,7 +38,7 @@ void ConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal)
   signal_padded.SetZero();
   for (int32 i = 0; i < signal_length; i++) {
     for (int32 j = 0; j < filter_length; j++) {
-      signal_padded(i + j) += (*signal)(i) * filter(j);
+      signal_padded(i+j) += (*signal)(i) * filter(j);
     }
   }
   signal->CopyFromVec(signal_padded.Range(0, signal_length));
@@ -70,7 +70,8 @@ void FFTbasedConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat>
   signal->CopyFromVec(signal_padded.Range(0, signal_length));
 }
 
-void FFTbasedBlockConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal) {
+void FFTbasedBlockConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal,
+                                  bool apply_inverse) {
   int32 signal_length = signal->Dim();
   int32 filter_length = filter.Dim();
@@ -86,6 +87,30 @@ void FFTbasedBlockConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseF
   Vector<BaseFloat> filter_padded(fft_length);
   filter_padded.Range(0, filter_length).CopyFromVec(filter);
   srfft.Compute(filter_padded.Data(), true);
+
+  // If apply_inverse is true, the inverse of the filter is computed and
+  // the input signal is convolved with that inverse.
+  // The inverse filter H_inv(w) is estimated as
+  // conj(H(w)) / (abs(H(w))^2 + const)
+  if (apply_inverse) {
+    BaseFloat abs_Hw, const_val = 0.0;
+    int32 half_N = filter_padded.Dim() / 2;
+    Vector<BaseFloat> inv_filter_padded(filter_padded);
+    inv_filter_padded(0) =
+      filter_padded(0) / (filter_padded(0) * filter_padded(0) + const_val);
+    inv_filter_padded(1) =
+      filter_padded(1) / (filter_padded(1) * filter_padded(1) + const_val);
+    for (int32 bin = 1; bin < half_N; bin++) {
+      int32 w_real_ind = 2 * bin,
+          w_im_ind = 2 * bin + 1;
+      abs_Hw = filter_padded(w_real_ind) * filter_padded(w_real_ind) +
+          filter_padded(w_im_ind) * filter_padded(w_im_ind);
+
+      inv_filter_padded(w_real_ind) /= (abs_Hw + const_val);
+      inv_filter_padded(w_im_ind) *= -1.0 / (abs_Hw + const_val);
+    }
+    filter_padded.CopyFromVec(inv_filter_padded);
+  }
 
   Vector<BaseFloat> temp_pad(filter_length - 1);
   temp_pad.SetZero();
diff --git a/src/feat/signal.h b/src/feat/signal.h
index 7ff0ce33b52..b9a49473b96 100644
--- a/src/feat/signal.h
+++ b/src/feat/signal.h
@@ -44,7 +44,8 @@ void FFTbasedConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat>
    overlap-add method.  This is an efficient way to evaluate the discrete
    convolution of a long signal with a finite impulse response filter.
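+   If apply_inverse is true, the filter's FFT H(w) is first replaced by the
+   approximate inverse conj(H(w)) / (|H(w)|^2 + c) (see signal.cc above), so
+   the call deconvolves rather than convolves; c is a small regularizer
+   (currently 0.0) that could guard against division by zero at spectral
+   nulls.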
 */
-void FFTbasedBlockConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal);
+void FFTbasedBlockConvolveSignals(const Vector<BaseFloat> &filter, Vector<BaseFloat> *signal,
+                                  bool apply_inverse = false);
 
 } // namespace kaldi
diff --git a/src/featbin/Makefile b/src/featbin/Makefile
index 9843e7bbd4b..bff2b212a5b 100644
--- a/src/featbin/Makefile
+++ b/src/featbin/Makefile
@@ -15,7 +15,7 @@ BINFILES = compute-mfcc-feats compute-plp-feats compute-fbank-feats \
            process-kaldi-pitch-feats compare-feats wav-to-duration add-deltas-sdc \
            compute-and-process-kaldi-pitch-feats modify-cmvn-stats wav-copy \
            wav-reverberate append-vector-to-feats detect-sinusoids shift-feats \
-           concat-feats
+           concat-feats compute-filter apply-filter
 
 OBJFILES =
diff --git a/src/featbin/apply-filter.cc b/src/featbin/apply-filter.cc
new file mode 100644
index 00000000000..18069ab33a6
--- /dev/null
+++ b/src/featbin/apply-filter.cc
@@ -0,0 +1,90 @@
+// featbin/apply-filter.cc
+
+// Copyright 2016  Pegah Ghahremani
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "feat/wave-reader.h"
+#include "feat/signal.h"
+
+namespace kaldi {
+}
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+
+    const char *usage =
+      "Apply a filter to a wave file supplied via an input pipe, as an FIR or IIR filter.\n"
+      "If --inverse=false, the filter is applied as an FIR filter,\n"
+      "and if --inverse=true, the inverse of the filter is applied as an IIR filter.\n"
+      "Usage: apply-filter [options...] <filter-rspecifier> "
+      "<input-wave-file> <output-wave-file>\n"
+      "e.g.\n"
+      "apply-filter --inverse=false --utt2spkfilter=ark:data/train/utt2spkfilter \n"
+      "  ark:filter.ark input.wav output_1.wav\n";
+    ParseOptions po(usage);
+
+    bool inverse = false;
+    std::string utt2spkfilter_rspecifier = "";
+    po.Register("inverse", &inverse,
+                "If false, the filter is applied as an FIR filter; "
+                "otherwise its inverse is applied as an IIR filter.");
+    po.Register("utt2spkfilter", &utt2spkfilter_rspecifier,
+                "rspecifier for utterance to speaker-filter list map"
+                " used to filter each utterance");
+    po.Read(argc, argv);
+    if (po.NumArgs() != 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+    std::string input_wave_file = po.GetArg(2),
+      filter_file = po.GetArg(1),
+      output_wave_file = po.GetArg(3);
+
+    WaveData input_wave;
+    {
+      Input ki(input_wave_file);
+      input_wave.Read(ki.Stream());
+    }
+
+    SequentialBaseFloatVectorReader filter_reader(filter_file);
+    const Vector<BaseFloat> &lpc_filter = filter_reader.Value();
+
+    Vector<BaseFloat> filtered_wav(input_wave.Data().Row(0));
+    BaseFloat samp_freq_input = input_wave.SampFreq();
+    // If inverse = false, it does FFT-based block convolution of the filter
+    // with the long input signal.
+    // Otherwise the inverse of the filter is convolved with the input signal.
+    // If we use the LP coefficients [1, -a1, -a2, ..., -ap] as the filter,
+    // convolving the input with this filter is like a whitening transform.
+    // y'[n] = y[n] - sum_{i=1}^p {input_wav[n-i] * lpc_coeffs[i]}
+    //       = conv(y, [1, -lpc_coeffs])
+    FFTbasedBlockConvolveSignals(lpc_filter, &filtered_wav, inverse);
+    Matrix<BaseFloat> filtered_wav_mat(1, filtered_wav.Dim());
+    filtered_wav_mat.CopyRowsFromVec(filtered_wav);
+    WaveData out_wave(samp_freq_input, filtered_wav_mat);
+    Output ko(output_wave_file, false);
+    out_wave.Write(ko.Stream());
+    return 0;
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
diff --git a/src/featbin/apply-filters.cc b/src/featbin/apply-filters.cc
new file mode 100644
index 00000000000..11c6867b553
--- /dev/null
+++ b/src/featbin/apply-filters.cc
@@ -0,0 +1,96 @@
+// featbin/apply-filters.cc
+
+// Copyright 2016  Pegah Ghahremani
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "feat/wave-reader.h"
+#include "feat/signal.h"
+
+namespace kaldi {
+}
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+
+    const char *usage =
+      "Apply a filter to each input wave, as an FIR or IIR filter.\n"
+      "If --inverse=false, the filter is applied as an FIR filter,\n"
+      "and if --inverse=true, the inverse of the filter is applied as an IIR filter.\n"
+
+      "Usage: apply-filters [options...] <wav-rspecifier> "
+      "<filter-rspecifier> <wav-wspecifier>\n"
+      "e.g.\n"
+      "apply-filters --inverse=false scp:wav.scp ark:filters.ark ark:wav_out.ark\n";
+    ParseOptions po(usage);
+
+    bool inverse = false;
+
+    po.Register("inverse", &inverse,
+                "If false, the filter is applied as an FIR filter; "
+                "otherwise its inverse is applied as an IIR filter.");
+
+    po.Read(argc, argv);
+    if (po.NumArgs() != 3) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string wav_rspecifier = po.GetArg(1),
+      filter_rspecifier = po.GetArg(2),
+      out_wspecifier = po.GetArg(3);
+
+    SequentialTableReader<WaveHolder> wav_reader(wav_rspecifier);
+    TableWriter<WaveHolder> wav_writer(out_wspecifier);
+    RandomAccessBaseFloatVectorReader filters(filter_rspecifier);
+    int32 num_done = 0, num_err = 0;
+    for (; !wav_reader.Done(); wav_reader.Next()) {
+      std::string utt = wav_reader.Key();
+      const WaveData &wav = wav_reader.Value();
+      if (!filters.HasKey(utt)) {
+        KALDI_WARN << "Did not find filter for utterance " << utt;
+        num_err++;
+        continue;
+      }
+      const Vector<BaseFloat> &lpc_filter = filters.Value(utt);
+      Vector<BaseFloat> filtered_wav(wav.Data().Row(0));
+      BaseFloat samp_freq_input = wav.SampFreq();
+      // If inverse = false, it does FFT-based block convolution of the filter
+      // with the long input signal.
+      // Otherwise the inverse of the filter is convolved with the input signal.
+      // If we use the LP coefficients [1, -a1, -a2, ..., -ap] as the filter,
+      // convolving the input with this filter is like a whitening transform.
+      // y'[n] = y[n] - sum_{i=1}^p {input_wav[n-i] * lpc_coeffs[i]}
+      //       = conv(y, [1, -lpc_coeffs])
+      FFTbasedBlockConvolveSignals(lpc_filter, &filtered_wav, inverse);
+      Matrix<BaseFloat> filtered_wav_mat(1, filtered_wav.Dim());
+      filtered_wav_mat.CopyRowsFromVec(filtered_wav);
+      WaveData out_wav(samp_freq_input, filtered_wav_mat);
+      wav_writer.Write(utt, out_wav);
+      num_done++;
+    }
+    KALDI_LOG << "Applied filter on "
+              << num_done << " utterances; "
+              << num_err << " had errors.";
+    return (num_done != 0 ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
diff --git a/src/featbin/compute-filter.cc b/src/featbin/compute-filter.cc
new file mode 100644
index 00000000000..4f750e418cf
--- /dev/null
+++ b/src/featbin/compute-filter.cc
@@ -0,0 +1,300 @@
+// featbin/compute-filter.cc
+
+// Copyright 2016  Pegah Ghahremani
+
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "feat/feature-functions.h"
+#include "matrix/srfft.h"
+#include "matrix/kaldi-matrix-inl.h"
+#include "feat/wave-reader.h"
+
+namespace kaldi {
+
+// struct used to store the statistics required for
+// computing correlation coefficients
+struct CorrelationStats {
+  int32 corr_order;   // number of correlation coefficients, R[0],..,R[corr_order - 1]
+  int32 num_samp;     // number of samples
+  BaseFloat samp_sum; // sum of samples.
+  Vector<BaseFloat> l2_norms;   // l2_norms[j] - inner product of the shifted input with itself:
+                                // sum_{i=0}^corr_window_size x[i+j]^2
+  Vector<BaseFloat> inner_prod; // inner product of the input with its version shifted by j:
+                                // sum_{i=0}^corr_window_size x[i] * x[i+j]
+  CorrelationStats(): corr_order(100), num_samp(0), samp_sum(0) {
+    l2_norms.Resize(corr_order);
+    inner_prod.Resize(corr_order);
+  }
+
+  CorrelationStats(int32 corr_order): corr_order(corr_order),
+                                      num_samp(0), samp_sum(0) {
+    l2_norms.Resize(corr_order);
+    inner_prod.Resize(corr_order);
+  }
+};
+
+/*
+  This function computes and accumulates the statistics
+  required for computing the autocorrelation coefficients of waveform "wave",
+  e.g. the dot product of the input with its shifted versions.
+  inner_prod[j] - inner product of the input with its version shifted by j:
+    sum_{i=0}^corr_window_size x[i] * x[i+j]
+  l2_norms[j] - inner product of the shifted input with itself:
+    sum_{i=0}^corr_window_size x[i+j]^2
+  lpc_order is the size of autocorr_coeffs.
+*/
+void AccStatsForCorrelation(const VectorBase<BaseFloat> &wave,
+                            int32 lpc_order,
+                            CorrelationStats *acc_corr_stats) {
+  KALDI_ASSERT(acc_corr_stats->inner_prod.Dim() == lpc_order);
+  acc_corr_stats->samp_sum += wave.Sum();
+  acc_corr_stats->num_samp += wave.Dim();
+  int32 corr_window_size = wave.Dim() - lpc_order;
+  Vector<BaseFloat> norm_wave(wave);
+  SubVector<BaseFloat> sub_vec1(norm_wave, 0, corr_window_size);
+  BaseFloat local_l2_norm = VecVec(sub_vec1, sub_vec1), sum;
+
+  acc_corr_stats->inner_prod(0) += local_l2_norm;
+
+  for (int32 lag = 1; lag < lpc_order; lag++) {
+    SubVector<BaseFloat> sub_vec2(norm_wave, lag, corr_window_size);
+    int32 last_ind = corr_window_size + lag - 1;
+    local_l2_norm += (wave(last_ind) * wave(last_ind) -
+                      wave(lag - 1) * wave(lag - 1));
+    sum = VecVec(sub_vec1, sub_vec2);
+    acc_corr_stats->inner_prod(lag) += sum;
+    acc_corr_stats->l2_norms(lag) += local_l2_norm;
+  }
+}
+/*
+  Compute autocorrelation coefficients using the accumulated unnormalized
+  statistics, i.e. the inner products and l2 norms.
+  The inner products and l2_norms are normalized using the mean E[x].
+  autocorr[j] = sum_{i=0}^n (x[i] - E[x]) * (x[i+j] - E[x]) /
+    [(sum_{i=0}^n (x[i] - E[x])^2) * (sum_{i=0}^n (x[i+j] - E[x])^2)]^0.5
+  autocorr[j] = inner_prod[j] / (norms[0] * norms[j])^0.5
+  inner_prod[j] - inner product of the input with its version shifted by j:
+    sum_{i=0}^n x[i] * x[i+j]
+  l2_norms[j] - inner product of the shifted input with itself: sum_{i=0}^n x[i+j]^2
+*/
+void ComputeCorrelation(const CorrelationStats &acc_corr_stats,
+                        Vector<BaseFloat> *autocorr) {
+
+  KALDI_ASSERT(acc_corr_stats.inner_prod.Dim() == acc_corr_stats.l2_norms.Dim());
+
+  int32 lpc_order = acc_corr_stats.inner_prod.Dim();
+  autocorr->Resize(lpc_order);
+  for (int32 lag = 0; lag < lpc_order; lag++)
+    (*autocorr)(lag) = acc_corr_stats.inner_prod(lag);
+
+  // scale the autocorrelation between 0 and 1 using autocorr(0)
+  autocorr->Scale(1.0 / (*autocorr)(0));
+
+}
+/*
+  Durbin's recursion - converts autocorrelation coefficients to LPC coefficients.
+  pTmp - temporary storage [n]
+  pAC - autocorrelation coefficients [n + 1]
+  pLP - linear prediction coefficients [n] (predicted s[n] = sum_{i=1}^P a[i] * s[n-i])
+*/
+double DurbinInternal(int32 n, double *pAC, double *pLP, double *pTmp) {
+  double ki; // reflection coefficient
+
+  // we add this bias term to pAC[0].
+  // Adding the bias term is equivalent to T = toeplitz(pAC) + diag(bias),
+  // which shifts the eigenvalues of toeplitz(pAC) by the bias term,
+  // and so we can make sure T is invertible.
+  double durbin_bias = 1e-2;
+  int32 max_repeats = 20;
+
+  double E = pAC[0];
+  pLP[0] = 1.0;
+  for (int32 i = 1; i <= n; i++) {
+    // next reflection coefficient
+    ki = pAC[i];
+    for (int32 j = 1; j < i; j++)
+      ki += pLP[j] * pAC[i - j];
+    ki = ki / E;
+
+    if (std::abs(ki) > 1) {
+      int32 num_repeats = int((pAC[0] - 1.0) / durbin_bias);
+      KALDI_WARN << "In Durbin algorithm, abs(ki) > 1 "
+                 << " for iteration = " << i
+                 << " ki = " << ki
+                 << " autocorr[0] = " << pAC[0]
+                 << " num_repeats = " << num_repeats
+                 << "; the bias is added";
+      pAC[0] += durbin_bias;
+      if (num_repeats < max_repeats)
+        return -1;
+    }
+    // new error
+    double c = 1 - ki * ki;
+    if (c < 1.0e-5) // remove NaNs for a constant signal
+      c = 1.0e-5;
+
+    E *= c;
+    // new LP coefficients
+    pTmp[i] = -ki;
+    for (int32 j = 1; j < i; j++)
+      pTmp[j] = pLP[j] - ki * pLP[i - j];
+
+    for (int32 j = 1; j <= i; j++)
+      pLP[j] = pTmp[j];
+  }
+  return E;
+}
+/*
+  This function computes the coefficients of the forward linear predictor
+  from the autocorrelation coefficients by minimizing the prediction
+  error in the MSE sense.
+  Durbin's recursion is used to compute the LP coefficients from the
+  autocorrelation coefficients via the normal equations
+    R(j) = sum_{i=1}^P a[i] * R(j - i),   j = 1, ..., P
+  where P is the order of the linear prediction and
+    lp_filter = [1, -a[1], -a[2], ..., -a[P]],
+  with a[i] the linear prediction coefficients
+  (predicted_x[n] = sum_{i=1}^P a[i] * x[n-i]), so
+    x[n] - predicted_x[n] = sum_{i=0}^P lp_filter[i] * x[n-i] = conv(x, lp_filter)[n].
+  R(j) is the j-th autocorrelation coefficient.
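+
+  For reference, one step of Durbin's recursion (order i, given the previous
+  error E_{i-1} and coefficients a^{(i-1)}) is:
+    k_i = (R(i) + sum_{j=1}^{i-1} a_j^{(i-1)} R(i-j)) / E_{i-1}
+    a_i^{(i)} = -k_i;  a_j^{(i)} = a_j^{(i-1)} - k_i * a_{i-j}^{(i-1)} for j < i
+    E_i = (1 - k_i^2) E_{i-1}
+  which matches DurbinInternal above, with pLP holding the sign-flipped
+  coefficients and pLP[0] = 1.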
+*/
+void ComputeFilters(const VectorBase<BaseFloat> &autocorr,
+                    Vector<BaseFloat> *lp_filter) {
+  int32 n = autocorr.Dim();
+  lp_filter->Resize(n);
+  // compute the LPC coefficients from the autocorrelation coefficients
+  // with Durbin's algorithm
+  Vector<double> d_autocorr(autocorr),
+    d_lpc_coeffs(n), d_tmp(n);
+
+  KALDI_LOG << "compute lpc using correlations";
+  while (DurbinInternal(n, d_autocorr.Data(),
+                        d_lpc_coeffs.Data(),
+                        d_tmp.Data()) < 0);
+  lp_filter->CopyFromVec(d_lpc_coeffs);
+  if (KALDI_ISNAN(lp_filter->Sum())) {
+    KALDI_WARN << "NaN encountered in lpc coefficients derived from Durbin algorithm.";
+    lp_filter->Set(0.0);
+    (*lp_filter)(0) = 1.0;
+  }
+
+}
+
+} // namespace kaldi
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using kaldi::int32;
+
+    const char *usage =
+      "Computes LP coefficients per speaker, by minimizing the "
+      "prediction error in the MSE sense.\n"
+      "These coefficients contain speaker-dependent information for each speaker.\n"
+
+      "Usage: compute-filter [options] <wav-rspecifier> <filter-wspecifier>\n"
+      "e.g.: compute-filter "
+      " scp:data/train/wav.scp ark,scp:filter.ark,filter.scp\n";
+
+    ParseOptions po(usage);
+    std::string spk2utt_rspecifier;
+    bool binary = true;
+    int32 channel = -1,
+      lpc_order = 100;
+    po.Register("binary", &binary, "write in binary mode (applies only to global filters)");
+    po.Register("lpc-order", &lpc_order, "number of LP coefficients used to extract filters.");
+    // spk2utt and channel are used below, so they need to be registered
+    // for the per-speaker branch and channel selection to be reachable.
+    po.Register("spk2utt", &spk2utt_rspecifier,
+                "rspecifier for the speaker to utterance-list map; if supplied, "
+                "one filter is accumulated per speaker over all its utterances.");
+    po.Register("channel", &channel,
+                "Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right)");
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 2) {
+      po.PrintUsage();
+      exit(1);
+    }
+    int32 num_done = 0, num_err = 0;
+    std::string wav_rspecifier = po.GetArg(1),
+      wspecifier = po.GetArg(2);
+
+    BaseFloatVectorWriter writer(wspecifier);
+    if (spk2utt_rspecifier != "") {
+      SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier);
+      RandomAccessTableReader<WaveHolder> wav_reader(wav_rspecifier);
+      for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) {
+        std::string spk = spk2utt_reader.Key();
+        const std::vector<std::string> &uttlist = spk2utt_reader.Value();
+        CorrelationStats acc_corr_stats(lpc_order);
+        for (size_t i = 0; i < uttlist.size(); i++) {
+          std::string utt = uttlist[i];
+          if (!wav_reader.HasKey(utt)) {
+            KALDI_WARN << "Did not find wave for utterance " << utt;
+            num_err++;
+            continue;
+          }
+          const WaveData &wav_data = wav_reader.Value(utt);
+          int32 num_chan = wav_data.Data().NumRows(), this_chan = channel;
+          KALDI_ASSERT(num_chan > 0);
+          if (channel == -1) {
+            this_chan = 0;
+            if (num_chan != 1)
+              KALDI_WARN << "Channel not specified but you have data with "
+                         << num_chan << " channels; defaulting to zero";
+          } else {
+            if (this_chan >= num_chan) {
+              KALDI_WARN << "File with id " << spk << " has "
+                         << num_chan << " channels but you specified channel "
+                         << channel << ", producing no output.";
+              num_err++;
+              continue;
+            }
+          }
+          Vector<BaseFloat> waveform(wav_data.Data().Row(this_chan));
+          waveform.Scale(1.0 / (1 << 15));
+          AccStatsForCorrelation(waveform, lpc_order,
+                                 &acc_corr_stats);
+        }
+        Vector<BaseFloat> filter, autocorr(lpc_order);
+        ComputeCorrelation(acc_corr_stats,
+                           &autocorr);
+        ComputeFilters(autocorr, &filter);
+        writer.Write(spk, filter);
+        num_done++;
+      }
+    } else { // assume the input waveform is per-speaker.
+      SequentialTableReader<WaveHolder> wav_reader(wav_rspecifier);
+      for (; !wav_reader.Done(); wav_reader.Next()) {
+        std::string spk = wav_reader.Key();
+        const WaveData &wave_data = wav_reader.Value();
+        int32 num_chan = wave_data.Data().NumRows(), this_chan = channel;
+        KALDI_ASSERT(num_chan > 0);
+        if (channel == -1) {
+          this_chan = 0;
+          if (num_chan != 1)
+            KALDI_WARN << "Channel not specified but you have data with "
+                       << num_chan << " channels; defaulting to zero";
+        } else {
+          if (this_chan >= num_chan) {
+            KALDI_WARN << "File with id " << spk << " has "
+                       << num_chan << " channels but you specified channel "
+                       << channel << ", producing no output.";
+            num_err++;
+            continue;
+          }
+        }
+        Vector<BaseFloat> waveform(wave_data.Data().Row(this_chan));
+        Vector<BaseFloat> autocorr, filter;
+        waveform.Scale(1.0 / (1 << 15));
+        KALDI_ASSERT(waveform.Max() <= 1 && waveform.Min() >= -1);
+        CorrelationStats acc_corr_stats(lpc_order);
+
+        AccStatsForCorrelation(waveform, lpc_order,
+                               &acc_corr_stats);
+        ComputeCorrelation(acc_corr_stats,
+                           &autocorr);
+        //KALDI_LOG << "autocorr = " << autocorr;
+        ComputeFilters(autocorr, &filter);
+        writer.Write(spk, filter);
+        num_done++;
+      }
+    }
+    KALDI_LOG << "Done " << num_done << " speakers, " << num_err
+              << " with errors.";
+    return (num_done != 0 ? 0 : 1);
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
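+// End-to-end usage sketch (hypothetical paths; see
+// egs/swbd/s5c/local/nnet3/run_ivector_common_v2.sh for the real invocation;
+// note apply-filter reads the first filter in the supplied archive):
+//   compute-filter --lpc-order=100 scp:data/train/wav.scp \
+//     ark,scp:filters.ark,filters.scp
+//   apply-filter --inverse=false "scp:filters.scp" - - <in.wav >whitened.wav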