From df0913327878801b1b8e1c3301ec8becc5d82a9d Mon Sep 17 00:00:00 2001 From: Nagendra Kumar Goel Date: Tue, 9 Jan 2018 17:05:01 -0500 Subject: [PATCH 01/11] SWBD stats pooling VAD recipe --- .../s5c/local/run_cleanup_segmentation.sh | 56 ++++++++++++++ .../local/segmentation/copy_targets_dir.sh | 76 +++++++++++++++++++ 2 files changed, 132 insertions(+) create mode 100755 egs/swbd/s5c/local/run_cleanup_segmentation.sh create mode 100755 egs/swbd/s5c/local/segmentation/copy_targets_dir.sh diff --git a/egs/swbd/s5c/local/run_cleanup_segmentation.sh b/egs/swbd/s5c/local/run_cleanup_segmentation.sh new file mode 100755 index 00000000000..d08d3f0e0b4 --- /dev/null +++ b/egs/swbd/s5c/local/run_cleanup_segmentation.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +# Copyright 2016 Vimal Manohar +# 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +# This script demonstrates how to re-segment training data selecting only the +# "good" audio that matches the transcripts. +# The basic idea is to decode with an existing in-domain acoustic model, and a +# biased language model built from the reference, and then work out the +# segmentation from a ctm like file. + +# For nnet3 and chain results after cleanup, see the scripts in +# local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh + +# GMM Results for speaker-independent (SI) and speaker adaptive training (SAT) systems on dev and test sets +# [will add these later]. + +set -e +set -o pipefail +set -u + +stage=0 +cleanup_stage=0 +data=data/train +cleanup_affix=cleaned +srcdir=exp/tri4_mmi_b0.1 +langdir=data/lang_sw1_tg +nj=100 +decode_nj=16 +decode_num_threads=4 + +. ./path.sh +. ./cmd.sh +. utils/parse_options.sh + +cleaned_data=${data}_${cleanup_affix} + +dir=${srcdir}_${cleanup_affix}_work +cleaned_dir=${srcdir}_${cleanup_affix} + +if [ $stage -le 1 ]; then + # This does the actual data cleanup. + steps/cleanup/clean_and_segment_data.sh --stage $cleanup_stage --nj $nj --cmd "$train_cmd" \ + $data $langdir $srcdir $dir $cleaned_data +fi + +if [ $stage -le 2 ]; then + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + $cleaned_data $langdir $srcdir ${srcdir}_ali_${cleanup_affix} +fi + +if [ $stage -le 3 ]; then + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 $cleaned_data $langdir ${srcdir}_ali_${cleanup_affix} ${cleaned_dir} +fi diff --git a/egs/swbd/s5c/local/segmentation/copy_targets_dir.sh b/egs/swbd/s5c/local/segmentation/copy_targets_dir.sh new file mode 100755 index 00000000000..8be70b4715a --- /dev/null +++ b/egs/swbd/s5c/local/segmentation/copy_targets_dir.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +# Copyright 2014 Johns Hopkins University (author: Nagendra K Goel) +# Apache 2.0 + +# This script operates on a directory, such as in exp/segmentation_1a/train_whole_combined_targets_rev1, +# that contains some subset of the following files: +# targets.X.ark +# frame_subsampling_factor +# It copies to another directory, possibly adding a specified prefix or a suffix +# to the utterance names. + + +# begin configuration section +utt_prefix= +utt_suffix= +cmd=run.pl +# end configuration section + +. utils/parse_options.sh + +if [ $# != 2 ]; then + echo "Usage: " + echo " $0 [options] " + echo "e.g.:" + echo " $0 --utt-prefix=1- exp/segmentation_1a/train_whole_combined_targets_sub3 exp/segmentation_1a/train_whole_combined_targets_sub3_rev1" + echo "Options" + echo " --utt-prefix= # Prefix for utterance ids, default empty" + echo " --utt-suffix= # Suffix for utterance ids, default empty" + exit 1; +fi + + +export LC_ALL=C + +src_dir=$1 +dest_dir=$2 + +mkdir -p $dest_dir + +if [ ! -f $src_dir/targets.1.ark ]; then + echo "copy_targets_dir.sh: no such files $src_dir/targets.1.ark" + exit 1; +fi + +for f in frame_subsampling_factor; do + if [ ! -f $src_dir/$f ]; then + echo "$0: no such file $src_dir/$f this might be serious error." + continue + fi + cp $src_dir/$f $dest_dir/ +done + +nj=$(ls $src_dir/targets.*.ark | wc -l) +mkdir -p $dest_dir/temp +cat << EOF > $dest_dir/temp/copy_targets.sh +set -e; +id=\$1 +echo "$src_dir/targets.\$id.ark" +copy-matrix ark:$src_dir/targets.\$id.ark ark,t:- | \ +python -c " +import sys +for line in sys.stdin: + parts = line.split() + if \"[\" not in line: + print line.rstrip() + else: + print '$utt_prefix{0}$utt_suffix {1}'.format(parts[0], ' '.join(parts[1:])) +" | \ + copy-matrix ark,t:- ark:$dest_dir/targets.\$id.ark || exit 1; +set +o pipefail; # unset the pipefail option. +EOF +chmod +x $dest_dir/temp/copy_targets.sh +$cmd -v PATH JOB=1:$nj $dest_dir/temp/copy_targets.JOB.log $dest_dir/temp/copy_targets.sh JOB || exit 1; + +echo "$0: copied targets from $src_dir to $dest_dir" From b9c7161fbaf6b37907a47e3dfa510c28b5c4abdd Mon Sep 17 00:00:00 2001 From: Nagendra Kumar Goel Date: Tue, 9 Jan 2018 17:14:38 -0500 Subject: [PATCH 02/11] Add SWBD VAD recipe --- egs/swbd/s5c/local/run_asr_segmentation.sh | 83 ++++++++++++++----- .../s5c/local/run_cleanup_segmentation.sh | 3 +- .../local/segmentation/copy_targets_dir.sh | 3 +- .../tuning/train_lstm_asr_sad_1a.sh | 7 +- .../tuning/train_stats_asr_sad_1a.sh | 9 +- 5 files changed, 77 insertions(+), 28 deletions(-) diff --git a/egs/swbd/s5c/local/run_asr_segmentation.sh b/egs/swbd/s5c/local/run_asr_segmentation.sh index 32b2e3a8411..d87703d1e90 100755 --- a/egs/swbd/s5c/local/run_asr_segmentation.sh +++ b/egs/swbd/s5c/local/run_asr_segmentation.sh @@ -1,15 +1,17 @@ -#! /bin/bash +#!/bin/bash -# Copyright 2017 Vimal Manohar +# Copyright 2017 Nagendra Kumar Goel +# 2017 Vimal Manohar # Apache 2.0 -# Features configs (Must match the features used to train the models -# $sat_model_dir and $model_dir) +# We assume the run-1-main.sh (because we are using model directories like +# exp/tri4) and later we assumme run-4-anydecode.sh was run to prepare +# data/dev10h.pem -lang=data/lang_nosp # Must match the one used to train the models +lang=data/lang # Must match the one used to train the models lang_test=data/lang_nosp_sw1_tg # Lang directory for decoding. -data_dir=data/train_100k_nodup +data_dir=data/train # Model directory used to align the $data_dir to get target labels for training # SAD. This should typically be a speaker-adapted system. sat_model_dir=exp/tri4 @@ -40,8 +42,8 @@ affix=_1a stage=-1 nj=80 -. ./path.sh -. ./cmd.sh +. path.sh +. cmd.sh set -e -u -o pipefail . utils/parse_options.sh @@ -55,7 +57,7 @@ mkdir -p $dir # See $lang/phones.txt and decide which should be garbage garbage_phones="lau spn" -silence_phones="nsn SIL" +silence_phones="sil" for p in $garbage_phones; do for affix in "" "_B" "_E" "_I" "_S"; do @@ -85,8 +87,10 @@ fi # Extract features for the whole data directory ############################################################################### if [ $stage -le 1 ]; then - steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj --write-utt2num-frames true \ - ${whole_data_dir} || exit 1 + steps/make_mfcc.sh --nj 50 --cmd "$train_cmd" --write-utt2num-frames true \ + $whole_data_dir exp/make_mfcc/train_whole + steps/compute_cmvn_stats.sh $whole_data_dir exp/make_mfcc/train_whole + utils/fix_data_dir.sh $whole_data_dir fi ############################################################################### @@ -112,18 +116,27 @@ if [ $stage -le 3 ]; then fi if [ $stage -le 4 ]; then - utils/copy_data_dir.sh ${whole_data_dir} ${whole_data_dir}_hires_bp - steps/make_mfcc.sh --mfcc-config conf/mfcc_hires_bp.conf --nj 40 \ - ${whole_data_dir}_hires_bp - steps/compute_cmvn_stats.sh ${whole_data_dir}_hires_bp + utils/copy_data_dir.sh ${whole_data_dir} ${whole_data_dir}_hires + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 \ + ${whole_data_dir}_hires + steps/compute_cmvn_stats.sh ${whole_data_dir}_hires fi +# if [ $stage -le 4.5 ]; then +# # Train a TDNN-LSTM network for SAD +# local/segmentation/tuning/train_lstm_asr_sad_1a.sh \ +# --stage $nstage --train-stage $train_stage \ +# --targets-dir $dir \ +# --data-dir ${whole_data_dir}_hires +# fi + if [ $stage -le 5 ]; then # Train a TDNN-LSTM network for SAD - local/segmentation/tuning/train_lstm_asr_sad_1a.sh \ + + local/segmentation/tuning/train_stats_asr_sad_1a.sh \ --stage $nstage --train-stage $train_stage \ --targets-dir $dir \ - --data-dir ${whole_data_dir}_hires_bp + --data-dir ${whole_data_dir}_hires fi if [ $stage -le 6 ]; then @@ -137,9 +150,37 @@ if [ $stage -le 6 ]; then steps/segmentation/detect_speech_activity.sh \ --extra-left-context 70 --extra-right-context 0 --frames-per-chunk 150 \ --extra-left-context-initial 0 --extra-right-context-final 0 \ - --nj 32 --acwt 0.3 --stage $test_stage \ + --nj 32 --acwt 0.3 --mfcc-config "conf/mfcc_hires.conf" --stage $test_stage \ data/eval2000 \ - exp/segmentation_1a/tdnn_lstm_asr_sad_1a \ - mfcc_hires_bp \ - exp/segmentation_1a/tdnn_lstm_asr_sad_1a/{,eval2000} + exp/segmentation_1a/tdnn_stats_asr_sad_1a2 \ + mfcc_hires \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/{,eval2000} +fi + +if [ $stage -le 7 ]; then + # Do some diagnostics + steps/segmentation/evaluate_segmentation.pl data/eval2000/segments \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/segments &> \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/evaluate_segmentation.log + + steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/utt2spk \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/segments \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/sys.rttm + + steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \ + data/eval2000/utt2spk \ + data/eval2000/segments \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/ref.rttm + + export PATH=$PATH:$KALDI_ROOT/tools/sctk/bin + md-eval.pl -c 0.25 -r exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/ref.rttm \ + -s exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/sys.rttm > \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/md_eval.log +fi + +if [ $stage -le 8 ]; then + utils/copy_data_dir.sh exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg \ + data/eval2000.seg_asr_sad_1a fi + diff --git a/egs/swbd/s5c/local/run_cleanup_segmentation.sh b/egs/swbd/s5c/local/run_cleanup_segmentation.sh index d08d3f0e0b4..b286f10e0d3 100755 --- a/egs/swbd/s5c/local/run_cleanup_segmentation.sh +++ b/egs/swbd/s5c/local/run_cleanup_segmentation.sh @@ -1,6 +1,7 @@ #!/bin/bash -# Copyright 2016 Vimal Manohar +# 2017 Nagendra Kumar Goel +# 2016 Vimal Manohar # 2016 Johns Hopkins University (author: Daniel Povey) # Apache 2.0 diff --git a/egs/swbd/s5c/local/segmentation/copy_targets_dir.sh b/egs/swbd/s5c/local/segmentation/copy_targets_dir.sh index 8be70b4715a..81c9193d22e 100755 --- a/egs/swbd/s5c/local/segmentation/copy_targets_dir.sh +++ b/egs/swbd/s5c/local/segmentation/copy_targets_dir.sh @@ -1,6 +1,7 @@ #!/bin/bash -# Copyright 2014 Johns Hopkins University (author: Nagendra K Goel) +# Copyright 2017 Nagendra Kumar Goel +# 2014 Johns Hopkins University (author: Nagendra K Goel) # Apache 2.0 # This script operates on a directory, such as in exp/segmentation_1a/train_whole_combined_targets_rev1, diff --git a/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh b/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh index 63f78aa8092..9ea3e895f95 100755 --- a/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh +++ b/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh @@ -1,12 +1,15 @@ #!/bin/bash +# Copyright 2017 Nagendra Kumar Goel +# Apache 2.0 + # This is a script to train a TDNN-LSTM for speech activity detection (SAD) # using LSTM for long-context information. set -o pipefail set -u -. ./cmd.sh +. cmd.sh # At this script level we don't support not running on GPU, as it would be painfully slow. # If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, @@ -47,7 +50,7 @@ affix=1a data_dir=exp/segmentation_1a/train_whole_hires_bp targets_dir=exp/segmentation_1a/train_whole_combined_targets_sub3 -. ./cmd.sh +. cmd.sh . ./path.sh . ./utils/parse_options.sh diff --git a/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh b/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh index 2dfe9a0bb96..b3a6b6948a3 100755 --- a/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh +++ b/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh @@ -1,12 +1,15 @@ #!/bin/bash +# Copyright 2017 Nagendra Kumar Goel +# Apache 2.0 + # This is a script to train a TDNN for speech activity detection (SAD) # using statistics pooling for long-context information. set -o pipefail set -u -. ./cmd.sh +. cmd.sh # At this script level we don't support not running on GPU, as it would be painfully slow. # If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, @@ -46,7 +49,7 @@ affix=1a2 data_dir=exp/segmentation_1a/train_whole_hires_bp targets_dir=exp/segmentation_1a/train_whole_combined_targets_sub3 -. ./cmd.sh +. cmd.sh . ./path.sh . ./utils/parse_options.sh @@ -132,7 +135,7 @@ if [ $stage -le 6 ]; then copy-feats scp:$targets_dir/targets.scp ark:- | \ matrix-sum-rows ark:- ark:- | vector-sum --binary=false ark:- - | \ - awk '{print " [ "$2" "$3" ]"}' > $dir/post_output.vec + awk '{print " [ "$2" "$3" "$4" ]"}' > $dir/post_output.vec echo 3 > $dir/frame_subsampling_factor fi From 36747c4273685b3e88a25b1bacd4e8d3fa2a079e Mon Sep 17 00:00:00 2001 From: Nagendra Kumar Goel Date: Thu, 11 Jan 2018 12:13:10 -0500 Subject: [PATCH 03/11] path.sh convention and comments update --- egs/swbd/s5c/local/run_asr_segmentation.sh | 9 ++++----- egs/swbd/s5c/local/run_cleanup_segmentation.sh | 2 +- .../local/segmentation/tuning/train_lstm_asr_sad_1a.sh | 3 +-- .../local/segmentation/tuning/train_stats_asr_sad_1a.sh | 5 ++--- egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh | 1 + 5 files changed, 9 insertions(+), 11 deletions(-) diff --git a/egs/swbd/s5c/local/run_asr_segmentation.sh b/egs/swbd/s5c/local/run_asr_segmentation.sh index d87703d1e90..d986a481f8c 100755 --- a/egs/swbd/s5c/local/run_asr_segmentation.sh +++ b/egs/swbd/s5c/local/run_asr_segmentation.sh @@ -4,9 +4,8 @@ # 2017 Vimal Manohar # Apache 2.0 -# We assume the run-1-main.sh (because we are using model directories like -# exp/tri4) and later we assumme run-4-anydecode.sh was run to prepare -# data/dev10h.pem +# We assume the run.sh has been executed (because we are using model +# directories like exp/tri4) lang=data/lang # Must match the one used to train the models lang_test=data/lang_nosp_sw1_tg # Lang directory for decoding. @@ -42,8 +41,8 @@ affix=_1a stage=-1 nj=80 -. path.sh -. cmd.sh +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi set -e -u -o pipefail . utils/parse_options.sh diff --git a/egs/swbd/s5c/local/run_cleanup_segmentation.sh b/egs/swbd/s5c/local/run_cleanup_segmentation.sh index b286f10e0d3..8b08422d277 100755 --- a/egs/swbd/s5c/local/run_cleanup_segmentation.sh +++ b/egs/swbd/s5c/local/run_cleanup_segmentation.sh @@ -31,8 +31,8 @@ nj=100 decode_nj=16 decode_num_threads=4 -. ./path.sh . ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi . utils/parse_options.sh cleaned_data=${data}_${cleanup_affix} diff --git a/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh b/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh index 9ea3e895f95..e3baa67b606 100755 --- a/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh +++ b/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh @@ -50,8 +50,7 @@ affix=1a data_dir=exp/segmentation_1a/train_whole_hires_bp targets_dir=exp/segmentation_1a/train_whole_combined_targets_sub3 -. cmd.sh -. ./path.sh +if [ -f ./path.sh ]; then . ./path.sh; fi . ./utils/parse_options.sh if [ -z "$dir" ]; then diff --git a/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh b/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh index b3a6b6948a3..842f96ce1b9 100755 --- a/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh +++ b/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh @@ -9,7 +9,7 @@ set -o pipefail set -u -. cmd.sh +. ./cmd.sh # At this script level we don't support not running on GPU, as it would be painfully slow. # If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, @@ -49,8 +49,7 @@ affix=1a2 data_dir=exp/segmentation_1a/train_whole_hires_bp targets_dir=exp/segmentation_1a/train_whole_combined_targets_sub3 -. cmd.sh -. ./path.sh +if [ -f ./path.sh ]; then . ./path.sh; fi . ./utils/parse_options.sh if [ -z "$dir" ]; then diff --git a/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh b/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh index f8557a70177..bc646986eea 100755 --- a/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh +++ b/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh @@ -1,6 +1,7 @@ #! /bin/bash # Copyright 2017 Vimal Manohar +# 2017 Nagendra Kumar Goel # Apache 2.0 # This script prepares targets for training neural network for From 6390477ce0ce8e21806886f3362e5625d7c37c8e Mon Sep 17 00:00:00 2001 From: Nagendra Kumar Goel Date: Fri, 12 Jan 2018 17:25:17 -0500 Subject: [PATCH 04/11] add options for noise and reverberations --- egs/swbd/s5c/local/run_asr_segmentation.sh | 101 +++++++++++++----- .../segmentation/combine_targets_dirs.sh | 83 ++++++++++++++ .../tuning/train_stats_asr_sad_1a.sh | 13 +-- .../steps/segmentation/prepare_targets_gmm.sh | 2 +- 4 files changed, 163 insertions(+), 36 deletions(-) create mode 100755 egs/swbd/s5c/local/segmentation/combine_targets_dirs.sh diff --git a/egs/swbd/s5c/local/run_asr_segmentation.sh b/egs/swbd/s5c/local/run_asr_segmentation.sh index d986a481f8c..4d3356dc7b0 100755 --- a/egs/swbd/s5c/local/run_asr_segmentation.sh +++ b/egs/swbd/s5c/local/run_asr_segmentation.sh @@ -36,7 +36,8 @@ prepare_targets_stage=-10 nstage=-10 train_stage=-10 test_stage=-10 - +num_data_reps=1 +base_rirs=simulated affix=_1a stage=-1 nj=80 @@ -77,6 +78,7 @@ if ! cat $dir/garbage_phones.txt $dir/silence_phones.txt | \ fi whole_data_dir=${data_dir}_whole +rvb_data_dir=${whole_data_dir}_rvb if [ $stage -le 0 ]; then utils/data/convert_data_dir_to_whole.sh $data_dir $whole_data_dir @@ -115,30 +117,76 @@ if [ $stage -le 3 ]; then fi if [ $stage -le 4 ]; then - utils/copy_data_dir.sh ${whole_data_dir} ${whole_data_dir}_hires + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + if [ ! -f rirs_noises.zip ]; then + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + rvb_opts=() + if [ "$base_rirs" == "simulated" ]; then + # This is the config for the system using simulated RIRs and point-source noises + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + rvb_opts+=(--noise-set-parameters RIRS_NOISES/pointsource_noises/noise_list) + else + # This is the config for the JHU ASpIRE submission system + rvb_opts+=(--rir-set-parameters "1.0, RIRS_NOISES/real_rirs_isotropic_noises/rir_list") + rvb_opts+=(--noise-set-parameters RIRS_NOISES/real_rirs_isotropic_noises/noise_list) + fi + + foreground_snrs="20:10:15:5:0" + background_snrs="20:10:15:5:0" + num_reps=1 + # corrupt the data to generate multi-condition data + # for data_dir in train dev test; do + python steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --prefix "rev" \ + --foreground-snrs $foreground_snrs \ + --background-snrs $background_snrs \ + --speech-rvb-probability 0.5 \ + --pointsource-noise-addition-probability 0.5 \ + --isotropic-noise-addition-probability 0.7 \ + --num-replications $num_reps \ + --max-noises-per-minute 4 \ + --source-sampling-rate 8000 \ + $whole_data_dir $rvb_data_dir + + for i in `seq 1 $num_data_reps`; do + local/segmentation/copy_targets_dir.sh --cmd "$decode_cmd" --utt-prefix "rev${i}_" exp/segmentation_1a/train_whole_combined_targets_sub3 exp/segmentation_1a/train_whole_combined_targets_sub3_temp_$i || exit 1; + rvb_dirs+=" exp/segmentation_1a/train_whole_combined_targets_sub3_temp_$i" + done + + local/segmentation/combine_targets_dirs.sh $rvb_data_dir exp/segmentation_1a/train_whole_combined_targets_sub3_rvb $rvb_dirs || exit 1; + cp exp/segmentation_1a/train_whole_combined_targets_sub3_rvb/targets.scp exp/segmentation_1a/ +fi + +if [ $stage -le 5 ]; then + utils/copy_data_dir.sh ${rvb_data_dir} ${rvb_data_dir}_hires steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 \ - ${whole_data_dir}_hires - steps/compute_cmvn_stats.sh ${whole_data_dir}_hires + ${rvb_data_dir}_hires + steps/compute_cmvn_stats.sh ${rvb_data_dir}_hires fi -# if [ $stage -le 4.5 ]; then +# if [ $stage -le 6 ]; then # # Train a TDNN-LSTM network for SAD # local/segmentation/tuning/train_lstm_asr_sad_1a.sh \ # --stage $nstage --train-stage $train_stage \ # --targets-dir $dir \ -# --data-dir ${whole_data_dir}_hires +# --data-dir ${rvb_data_dir}_hires # fi -if [ $stage -le 5 ]; then - # Train a TDNN-LSTM network for SAD +if [ $stage -le 6 ]; then + # Train a STATS-pooling network for SAD local/segmentation/tuning/train_stats_asr_sad_1a.sh \ --stage $nstage --train-stage $train_stage \ --targets-dir $dir \ - --data-dir ${whole_data_dir}_hires + --data-dir ${rvb_data_dir}_hires fi -if [ $stage -le 6 ]; then +if [ $stage -le 7 ]; then # The options to this script must match the options used in the # nnet training script. # e.g. extra-left-context is 70, because the model is an LSTM trained with a @@ -149,37 +197,32 @@ if [ $stage -le 6 ]; then steps/segmentation/detect_speech_activity.sh \ --extra-left-context 70 --extra-right-context 0 --frames-per-chunk 150 \ --extra-left-context-initial 0 --extra-right-context-final 0 \ - --nj 32 --acwt 0.3 --mfcc-config "conf/mfcc_hires.conf" --stage $test_stage \ + --nj 32 --acwt 0.3 --stage $test_stage \ data/eval2000 \ exp/segmentation_1a/tdnn_stats_asr_sad_1a2 \ mfcc_hires \ exp/segmentation_1a/tdnn_stats_asr_sad_1a2/{,eval2000} fi -if [ $stage -le 7 ]; then +if [ $stage -le 8 ]; then # Do some diagnostics - steps/segmentation/evaluate_segmentation.pl data/eval2000/segments \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/segments &> \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/evaluate_segmentation.log + steps/segmentation/evalute_segmentation.pl data/dev10h.pem/segments \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/segments &> \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/evalutate_segmentation.log steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/utt2spk \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/segments \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/sys.rttm + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/utt2spk \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/segments \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/sys.rttm - steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \ - data/eval2000/utt2spk \ - data/eval2000/segments \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/ref.rttm - export PATH=$PATH:$KALDI_ROOT/tools/sctk/bin - md-eval.pl -c 0.25 -r exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/ref.rttm \ - -s exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/sys.rttm > \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/md_eval.log + md-eval.pl -c 0.25 -r $dev10h_rttm_file \ + -s exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/sys.rttm > \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/md_eval.log fi -if [ $stage -le 8 ]; then - utils/copy_data_dir.sh exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg \ - data/eval2000.seg_asr_sad_1a +if [ $stage -le 9 ]; then + utils/copy_data_dir.sh exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg \ + data/dev10h.seg_asr_sad_1a fi diff --git a/egs/swbd/s5c/local/segmentation/combine_targets_dirs.sh b/egs/swbd/s5c/local/segmentation/combine_targets_dirs.sh new file mode 100755 index 00000000000..48c4ce93db0 --- /dev/null +++ b/egs/swbd/s5c/local/segmentation/combine_targets_dirs.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# Copyright 2017 Nagendra Kumar Goel +# Apache 2.0. + +# This srcipt operates on targets directories, such as exp/segmentation_1a/train_whole_combined_targets_sub3 +# the output is a new targets dir which has targets from all the input targets dirs + +# Begin configuration section. +cmd=run.pl +extra_files= +num_jobs=4 +# End configuration section. +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [[ $# -lt 3 ]]; then + echo "Usage: $0 [options] ..." + echo "e.g.: $0 --num-jobs 32 data/train exp/targets_combined exp/targets_1 exp/targets_2" + echo "Options:" + echo " --extra-files # specify addtional files in 'src-targets-dir1' to copy" + echo " --num-jobs # number of jobs used to split the data directory." + echo " Note, files that don't appear in the first source dir will not be added even if they appear in later ones." + echo " Other than alignments, only files from the first src ali dir are copied." + exit 1; +fi + +data=$1; +shift; +dest=$1; +shift; +first_src=$1; + +mkdir -p $dest; +rm $dest/{targets.*.ark,frame_subsampling_factor} 2>/dev/null + +cp $first_src/frame_subsampling_factor $dest 2>/dev/null + +export LC_ALL=C + +for dir in $*; do + if [ ! -f $dir/targets.1.ark ]; then + echo "$0: check if targets (targets.*.ark) are present in $dir." + exit 1; + fi +done + +for dir in $*; do + for f in frame_subsampling_factor; do + diff $first_src/$f $dir/$f 1>/dev/null 2>&1 + if [ $? -ne 0 ]; then + echo "$0: Cannot combine alignment directories with different $f files." + fi + done +done + +for f in frame_subsampling_factor $extra_files; do + if [ ! -f $first_src/$f ]; then + echo "combine_targets_dir.sh: no such file $first_src/$f" + exit 1; + fi + cp $first_src/$f $dest/ +done + +src_id=0 +temp_dir=$dest/temp +[ -d $temp_dir ] && rm -r $temp_dir; +mkdir -p $temp_dir +echo "$0: dumping targets in each source directory as single archive and index." +for dir in $*; do + src_id=$((src_id + 1)) + cur_num_jobs=$(ls $dir/targets.*.ark | wc -l) || exit 1; + tgts=$(for n in $(seq $cur_num_jobs); do echo -n "$dir/targets.$n.ark "; done) + $cmd $dir/log/copy_targets.log \ + copy-matrix "ark:cat $tgts|" \ + ark,scp:$temp_dir/targets.$src_id.ark,$temp_dir/targets.$src_id.scp || exit 1; +done +sort -m $temp_dir/targets.*.scp > $dest/targets.scp || exit 1; + + +echo "Combined targets and stored in $dest" +exit 0 diff --git a/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh b/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh index 842f96ce1b9..feb88a53454 100755 --- a/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh +++ b/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh @@ -1,15 +1,15 @@ #!/bin/bash # Copyright 2017 Nagendra Kumar Goel +# 2016 Vimal Manohar # Apache 2.0 - # This is a script to train a TDNN for speech activity detection (SAD) # using statistics pooling for long-context information. set -o pipefail set -u -. ./cmd.sh +. cmd.sh # At this script level we don't support not running on GPU, as it would be painfully slow. # If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, @@ -30,11 +30,11 @@ extra_right_context=21 relu_dim=256 # training options -num_epochs=4 +num_epochs=2 initial_effective_lrate=0.0003 final_effective_lrate=0.00003 -num_jobs_initial=3 -num_jobs_final=8 +num_jobs_initial=1 +num_jobs_final=1 remove_egs=true max_param_change=0.2 # Small max-param change for small network @@ -49,6 +49,7 @@ affix=1a2 data_dir=exp/segmentation_1a/train_whole_hires_bp targets_dir=exp/segmentation_1a/train_whole_combined_targets_sub3 +. cmd.sh if [ -f ./path.sh ]; then . ./path.sh; fi . ./utils/parse_options.sh @@ -134,7 +135,7 @@ if [ $stage -le 6 ]; then copy-feats scp:$targets_dir/targets.scp ark:- | \ matrix-sum-rows ark:- ark:- | vector-sum --binary=false ark:- - | \ - awk '{print " [ "$2" "$3" "$4" ]"}' > $dir/post_output.vec + awk '{print " [ "$2" "$3" ]"}' > $dir/post_output.vec echo 3 > $dir/frame_subsampling_factor fi diff --git a/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh b/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh index bc646986eea..de19cfc6772 100755 --- a/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh +++ b/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh @@ -211,7 +211,7 @@ if [ $stage -le 5 ]; then # the speech / silence decisions, not the exact word sequences. steps/decode.sh --cmd "$decode_cmd --mem 2G" --nj $nj \ --max-active 1000 --beam 10.0 \ - --decode-extra-opts "--word-determinize=false" --skip-scoring true \ + --skip-scoring true \ $graph_dir $uniform_seg_data_dir $decode_dir fi From b62c2a87ce2bbdbc65f48afd6f6675b97c68c7f7 Mon Sep 17 00:00:00 2001 From: Nagendra Kumar Goel Date: Tue, 16 Jan 2018 08:46:49 -0500 Subject: [PATCH 05/11] Fix bugs in evaluations part --- egs/swbd/s5c/local/run_asr_segmentation.sh | 30 +++++++++---------- .../tuning/train_lstm_asr_sad_1a.sh | 2 +- .../tuning/train_stats_asr_sad_1a.sh | 12 ++++---- .../segmentation/detect_speech_activity.sh | 12 ++++---- 4 files changed, 30 insertions(+), 26 deletions(-) diff --git a/egs/swbd/s5c/local/run_asr_segmentation.sh b/egs/swbd/s5c/local/run_asr_segmentation.sh index 4d3356dc7b0..7129e905480 100755 --- a/egs/swbd/s5c/local/run_asr_segmentation.sh +++ b/egs/swbd/s5c/local/run_asr_segmentation.sh @@ -36,7 +36,7 @@ prepare_targets_stage=-10 nstage=-10 train_stage=-10 test_stage=-10 -num_data_reps=1 +num_data_reps=2 base_rirs=simulated affix=_1a stage=-1 @@ -164,7 +164,7 @@ fi if [ $stage -le 5 ]; then utils/copy_data_dir.sh ${rvb_data_dir} ${rvb_data_dir}_hires - steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 40 \ + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 10 \ ${rvb_data_dir}_hires steps/compute_cmvn_stats.sh ${rvb_data_dir}_hires fi @@ -206,23 +206,23 @@ fi if [ $stage -le 8 ]; then # Do some diagnostics - steps/segmentation/evalute_segmentation.pl data/dev10h.pem/segments \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/segments &> \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/evalutate_segmentation.log + steps/segmentation/evaluate_segmentation.pl data/eval2000/segments \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/segments &> \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/evalutate_segmentation.log steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/utt2spk \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/segments \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/sys.rttm - - export PATH=$PATH:$KALDI_ROOT/tools/sctk/bin - md-eval.pl -c 0.25 -r $dev10h_rttm_file \ - -s exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/sys.rttm > \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg/md_eval.log + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/utt2spk \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/segments \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/sys.rttm + +# export PATH=$PATH:$KALDI_ROOT/tools/sctk/bin +# md-eval.pl -c 0.25 -r $eval2000_rttm_file \ +# -s exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/sys.rttm > \ +# exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/md_eval.log fi if [ $stage -le 9 ]; then - utils/copy_data_dir.sh exp/segmentation_1a/tdnn_stats_asr_sad_1a2/dev10h_seg \ - data/dev10h.seg_asr_sad_1a + utils/copy_data_dir.sh exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg \ + data/eval2000.seg_asr_sad_1a fi diff --git a/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh b/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh index e3baa67b606..74697df099f 100755 --- a/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh +++ b/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh @@ -9,7 +9,7 @@ set -o pipefail set -u -. cmd.sh +. ./cmd.sh # At this script level we don't support not running on GPU, as it would be painfully slow. # If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, diff --git a/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh b/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh index feb88a53454..3254929306f 100755 --- a/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh +++ b/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh @@ -9,7 +9,7 @@ set -o pipefail set -u -. cmd.sh +. ./cmd.sh # At this script level we don't support not running on GPU, as it would be painfully slow. # If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, @@ -30,7 +30,7 @@ extra_right_context=21 relu_dim=256 # training options -num_epochs=2 +num_epochs=1 initial_effective_lrate=0.0003 final_effective_lrate=0.00003 num_jobs_initial=1 @@ -46,7 +46,7 @@ config_dir= dir= affix=1a2 -data_dir=exp/segmentation_1a/train_whole_hires_bp +data_dir=exp/segmentation_1a/train_whole_rvb_hires targets_dir=exp/segmentation_1a/train_whole_combined_targets_sub3 . cmd.sh @@ -132,10 +132,12 @@ if [ $stage -le 6 ]; then --targets-scp="$targets_dir/targets.scp" \ --egs.opts="--frame-subsampling-factor 3 --num-utts-subset $num_utts_subset" \ --dir=$dir || exit 1 +fi - copy-feats scp:$targets_dir/targets.scp ark:- | \ +if [ $stage -le 7 ]; then + copy-feats scp:$targets_dir/targets.scp ark:- | \ matrix-sum-rows ark:- ark:- | vector-sum --binary=false ark:- - | \ - awk '{print " [ "$2" "$3" ]"}' > $dir/post_output.vec + awk '{print " [ "$2" "$3" "$4" ]"}' > $dir/post_output.vec echo 3 > $dir/frame_subsampling_factor fi diff --git a/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh b/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh index 69f47c28d60..9bc8eea675c 100755 --- a/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh +++ b/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh @@ -1,6 +1,7 @@ #!/bin/bash # Copyright 2016-17 Vimal Manohar +# 2017 Nagendra Kumar Goel # Apache 2.0. # This script does nnet3-based speech activity detection given an input @@ -12,16 +13,17 @@ set -e set -o pipefail set -u -. ./path.sh +. ./cmd.sh +if [ -f ./path.sh ]; then . ./path.sh; fi affix= # Affix for the segmentation nj=32 -cmd=queue.pl +cmd=$decode_cmd stage=-1 # Feature options (Must match training) -mfcc_config=conf/mfcc_hires_bp.conf -feat_affix=bp # Affix for the type of feature used +mfcc_config=conf/mfcc_hires.conf +feat_affix=hires # Affix for the type of feature used convert_data_dir_to_whole=true # If true, the input data directory is # first converted to whole data directory (i.e. whole recordings) @@ -67,7 +69,7 @@ if [ $# -ne 5 ]; then echo "See script for details of the options to be supplied." echo "Usage: $0 " echo " e.g.: $0 ~/workspace/egs/ami/s5b/data/sdm1/dev exp/nnet3_sad_snr/nnet_tdnn_j_n4 \\" - echo " mfcc_hires_bp exp/segmentation_sad_snr/nnet_tdnn_j_n4 data/ami_sdm1_dev" + echo " mfcc_hires exp/segmentation_sad_snr/nnet_tdnn_j_n4 data/ami_sdm1_dev" echo "" echo "Options: " echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." From ce3cbab528f14db3bcaa4d15391004a101a0e19b Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 25 Jan 2018 18:20:21 -0500 Subject: [PATCH 06/11] Simplifying recipe --- egs/swbd/s5c/local/run_asr_segmentation.sh | 68 +++++++-------- .../s5c/local/run_cleanup_segmentation.sh | 15 ++-- .../segmentation/combine_targets_dirs.sh | 83 ------------------- .../local/segmentation/copy_targets_dir.sh | 77 ----------------- .../tuning/train_lstm_asr_sad_1a.sh | 3 +- .../tuning/train_stats_asr_sad_1a.sh | 13 +-- .../segmentation/combine_targets_dirs.sh | 55 ++++++++++++ .../s5/steps/segmentation/copy_targets_dir.sh | 46 ++++++++++ .../segmentation/detect_speech_activity.sh | 5 +- .../steps/segmentation/prepare_targets_gmm.sh | 3 +- 10 files changed, 148 insertions(+), 220 deletions(-) delete mode 100755 egs/swbd/s5c/local/segmentation/combine_targets_dirs.sh delete mode 100755 egs/swbd/s5c/local/segmentation/copy_targets_dir.sh create mode 100755 egs/wsj/s5/steps/segmentation/combine_targets_dirs.sh create mode 100755 egs/wsj/s5/steps/segmentation/copy_targets_dir.sh diff --git a/egs/swbd/s5c/local/run_asr_segmentation.sh b/egs/swbd/s5c/local/run_asr_segmentation.sh index 7129e905480..21c20b0a423 100755 --- a/egs/swbd/s5c/local/run_asr_segmentation.sh +++ b/egs/swbd/s5c/local/run_asr_segmentation.sh @@ -10,7 +10,7 @@ lang=data/lang # Must match the one used to train the models lang_test=data/lang_nosp_sw1_tg # Lang directory for decoding. -data_dir=data/train +data_dir=data/train # Model directory used to align the $data_dir to get target labels for training # SAD. This should typically be a speaker-adapted system. sat_model_dir=exp/tri4 @@ -37,7 +37,6 @@ nstage=-10 train_stage=-10 test_stage=-10 num_data_reps=2 -base_rirs=simulated affix=_1a stage=-1 nj=80 @@ -113,6 +112,7 @@ if [ $stage -le 3 ]; then --nj 80 --reco-nj 40 --lang-test $lang_test \ --garbage-phones-list $dir/garbage_phones.txt \ --silence-phones-list $dir/silence_phones.txt \ + --merge-weights $merge_weights \ $lang $data_dir $whole_data_dir $sat_model_dir $model_dir $dir fi @@ -124,20 +124,14 @@ if [ $stage -le 4 ]; then fi rvb_opts=() - if [ "$base_rirs" == "simulated" ]; then - # This is the config for the system using simulated RIRs and point-source noises - rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") - rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") - rvb_opts+=(--noise-set-parameters RIRS_NOISES/pointsource_noises/noise_list) - else - # This is the config for the JHU ASpIRE submission system - rvb_opts+=(--rir-set-parameters "1.0, RIRS_NOISES/real_rirs_isotropic_noises/rir_list") - rvb_opts+=(--noise-set-parameters RIRS_NOISES/real_rirs_isotropic_noises/noise_list) - fi + # This is the config for the system using simulated RIRs and point-source noises + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + rvb_opts+=(--noise-set-parameters RIRS_NOISES/pointsource_noises/noise_list) foreground_snrs="20:10:15:5:0" background_snrs="20:10:15:5:0" - num_reps=1 + num_data_reps=1 # corrupt the data to generate multi-condition data # for data_dir in train dev test; do python steps/data/reverberate_data_dir.py \ @@ -148,41 +142,36 @@ if [ $stage -le 4 ]; then --speech-rvb-probability 0.5 \ --pointsource-noise-addition-probability 0.5 \ --isotropic-noise-addition-probability 0.7 \ - --num-replications $num_reps \ + --num-replications $num_data_reps \ --max-noises-per-minute 4 \ --source-sampling-rate 8000 \ $whole_data_dir $rvb_data_dir + rvb_dirs=() for i in `seq 1 $num_data_reps`; do - local/segmentation/copy_targets_dir.sh --cmd "$decode_cmd" --utt-prefix "rev${i}_" exp/segmentation_1a/train_whole_combined_targets_sub3 exp/segmentation_1a/train_whole_combined_targets_sub3_temp_$i || exit 1; - rvb_dirs+=" exp/segmentation_1a/train_whole_combined_targets_sub3_temp_$i" + steps/segmentation/copy_targets_dir.sh --utt-prefix "rev${i}_" \ + exp/segmentation_1a/train_whole_combined_targets_sub3 \ + exp/segmentation_1a/train_whole_combined_targets_sub3_temp_$i || exit 1; + rvb_dirs+=(exp/segmentation_1a/train_whole_combined_targets_sub3_temp_$i) done - local/segmentation/combine_targets_dirs.sh $rvb_data_dir exp/segmentation_1a/train_whole_combined_targets_sub3_rvb $rvb_dirs || exit 1; - cp exp/segmentation_1a/train_whole_combined_targets_sub3_rvb/targets.scp exp/segmentation_1a/ + steps/segmentation/combine_targets_dirs.sh \ + $rvb_data_dir exp/segmentation_1a/train_whole_combined_targets_sub3_rvb \ + $rvb_dirs || exit 1; fi if [ $stage -le 5 ]; then utils/copy_data_dir.sh ${rvb_data_dir} ${rvb_data_dir}_hires - steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 10 \ + steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 80 \ ${rvb_data_dir}_hires steps/compute_cmvn_stats.sh ${rvb_data_dir}_hires fi -# if [ $stage -le 6 ]; then -# # Train a TDNN-LSTM network for SAD -# local/segmentation/tuning/train_lstm_asr_sad_1a.sh \ -# --stage $nstage --train-stage $train_stage \ -# --targets-dir $dir \ -# --data-dir ${rvb_data_dir}_hires -# fi - if [ $stage -le 6 ]; then # Train a STATS-pooling network for SAD - local/segmentation/tuning/train_stats_asr_sad_1a.sh \ --stage $nstage --train-stage $train_stage \ - --targets-dir $dir \ + --targets-dir exp/segmentation_1a/train_whole_combined_targets_sub3_rvb \ --data-dir ${rvb_data_dir}_hires fi @@ -199,30 +188,29 @@ if [ $stage -le 7 ]; then --extra-left-context-initial 0 --extra-right-context-final 0 \ --nj 32 --acwt 0.3 --stage $test_stage \ data/eval2000 \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2 \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a \ mfcc_hires \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/{,eval2000} + exp/segmentation_1a/tdnn_stats_asr_sad_1a/{,eval2000} fi if [ $stage -le 8 ]; then # Do some diagnostics steps/segmentation/evaluate_segmentation.pl data/eval2000/segments \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/segments &> \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/evalutate_segmentation.log + exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/segments &> \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/evalutate_segmentation.log steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/utt2spk \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/segments \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/sys.rttm + exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/utt2spk \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/segments \ + exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/sys.rttm # export PATH=$PATH:$KALDI_ROOT/tools/sctk/bin # md-eval.pl -c 0.25 -r $eval2000_rttm_file \ -# -s exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/sys.rttm > \ -# exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg/md_eval.log +# -s exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/sys.rttm > \ +# exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/md_eval.log fi if [ $stage -le 9 ]; then - utils/copy_data_dir.sh exp/segmentation_1a/tdnn_stats_asr_sad_1a2/eval2000_seg \ + utils/copy_data_dir.sh exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg \ data/eval2000.seg_asr_sad_1a fi - diff --git a/egs/swbd/s5c/local/run_cleanup_segmentation.sh b/egs/swbd/s5c/local/run_cleanup_segmentation.sh index 8b08422d277..c879a55d16a 100755 --- a/egs/swbd/s5c/local/run_cleanup_segmentation.sh +++ b/egs/swbd/s5c/local/run_cleanup_segmentation.sh @@ -1,8 +1,8 @@ #!/bin/bash -# 2017 Nagendra Kumar Goel -# 2016 Vimal Manohar -# 2016 Johns Hopkins University (author: Daniel Povey) +# Copyright 2016 Vimal Manohar +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Nagendra Kumar Goel # Apache 2.0 # This script demonstrates how to re-segment training data selecting only the @@ -23,9 +23,9 @@ set -u stage=0 cleanup_stage=0 -data=data/train +data=data/train_nodup cleanup_affix=cleaned -srcdir=exp/tri4_mmi_b0.1 +srcdir=exp/tri4 langdir=data/lang_sw1_tg nj=100 decode_nj=16 @@ -42,7 +42,8 @@ cleaned_dir=${srcdir}_${cleanup_affix} if [ $stage -le 1 ]; then # This does the actual data cleanup. - steps/cleanup/clean_and_segment_data.sh --stage $cleanup_stage --nj $nj --cmd "$train_cmd" \ + steps/cleanup/clean_and_segment_data.sh --stage $cleanup_stage \ + --nj $nj --cmd "$train_cmd" \ $data $langdir $srcdir $dir $cleaned_data fi @@ -53,5 +54,5 @@ fi if [ $stage -le 3 ]; then steps/train_sat.sh --cmd "$train_cmd" \ - 5000 100000 $cleaned_data $langdir ${srcdir}_ali_${cleanup_affix} ${cleaned_dir} + 11500 200000 $cleaned_data $langdir ${srcdir}_ali_${cleanup_affix} ${cleaned_dir} fi diff --git a/egs/swbd/s5c/local/segmentation/combine_targets_dirs.sh b/egs/swbd/s5c/local/segmentation/combine_targets_dirs.sh deleted file mode 100755 index 48c4ce93db0..00000000000 --- a/egs/swbd/s5c/local/segmentation/combine_targets_dirs.sh +++ /dev/null @@ -1,83 +0,0 @@ -#!/bin/bash -# Copyright 2017 Nagendra Kumar Goel -# Apache 2.0. - -# This srcipt operates on targets directories, such as exp/segmentation_1a/train_whole_combined_targets_sub3 -# the output is a new targets dir which has targets from all the input targets dirs - -# Begin configuration section. -cmd=run.pl -extra_files= -num_jobs=4 -# End configuration section. -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -. parse_options.sh || exit 1; - -if [[ $# -lt 3 ]]; then - echo "Usage: $0 [options] ..." - echo "e.g.: $0 --num-jobs 32 data/train exp/targets_combined exp/targets_1 exp/targets_2" - echo "Options:" - echo " --extra-files # specify addtional files in 'src-targets-dir1' to copy" - echo " --num-jobs # number of jobs used to split the data directory." - echo " Note, files that don't appear in the first source dir will not be added even if they appear in later ones." - echo " Other than alignments, only files from the first src ali dir are copied." - exit 1; -fi - -data=$1; -shift; -dest=$1; -shift; -first_src=$1; - -mkdir -p $dest; -rm $dest/{targets.*.ark,frame_subsampling_factor} 2>/dev/null - -cp $first_src/frame_subsampling_factor $dest 2>/dev/null - -export LC_ALL=C - -for dir in $*; do - if [ ! -f $dir/targets.1.ark ]; then - echo "$0: check if targets (targets.*.ark) are present in $dir." - exit 1; - fi -done - -for dir in $*; do - for f in frame_subsampling_factor; do - diff $first_src/$f $dir/$f 1>/dev/null 2>&1 - if [ $? -ne 0 ]; then - echo "$0: Cannot combine alignment directories with different $f files." - fi - done -done - -for f in frame_subsampling_factor $extra_files; do - if [ ! -f $first_src/$f ]; then - echo "combine_targets_dir.sh: no such file $first_src/$f" - exit 1; - fi - cp $first_src/$f $dest/ -done - -src_id=0 -temp_dir=$dest/temp -[ -d $temp_dir ] && rm -r $temp_dir; -mkdir -p $temp_dir -echo "$0: dumping targets in each source directory as single archive and index." -for dir in $*; do - src_id=$((src_id + 1)) - cur_num_jobs=$(ls $dir/targets.*.ark | wc -l) || exit 1; - tgts=$(for n in $(seq $cur_num_jobs); do echo -n "$dir/targets.$n.ark "; done) - $cmd $dir/log/copy_targets.log \ - copy-matrix "ark:cat $tgts|" \ - ark,scp:$temp_dir/targets.$src_id.ark,$temp_dir/targets.$src_id.scp || exit 1; -done -sort -m $temp_dir/targets.*.scp > $dest/targets.scp || exit 1; - - -echo "Combined targets and stored in $dest" -exit 0 diff --git a/egs/swbd/s5c/local/segmentation/copy_targets_dir.sh b/egs/swbd/s5c/local/segmentation/copy_targets_dir.sh deleted file mode 100755 index 81c9193d22e..00000000000 --- a/egs/swbd/s5c/local/segmentation/copy_targets_dir.sh +++ /dev/null @@ -1,77 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Nagendra Kumar Goel -# 2014 Johns Hopkins University (author: Nagendra K Goel) -# Apache 2.0 - -# This script operates on a directory, such as in exp/segmentation_1a/train_whole_combined_targets_rev1, -# that contains some subset of the following files: -# targets.X.ark -# frame_subsampling_factor -# It copies to another directory, possibly adding a specified prefix or a suffix -# to the utterance names. - - -# begin configuration section -utt_prefix= -utt_suffix= -cmd=run.pl -# end configuration section - -. utils/parse_options.sh - -if [ $# != 2 ]; then - echo "Usage: " - echo " $0 [options] " - echo "e.g.:" - echo " $0 --utt-prefix=1- exp/segmentation_1a/train_whole_combined_targets_sub3 exp/segmentation_1a/train_whole_combined_targets_sub3_rev1" - echo "Options" - echo " --utt-prefix= # Prefix for utterance ids, default empty" - echo " --utt-suffix= # Suffix for utterance ids, default empty" - exit 1; -fi - - -export LC_ALL=C - -src_dir=$1 -dest_dir=$2 - -mkdir -p $dest_dir - -if [ ! -f $src_dir/targets.1.ark ]; then - echo "copy_targets_dir.sh: no such files $src_dir/targets.1.ark" - exit 1; -fi - -for f in frame_subsampling_factor; do - if [ ! -f $src_dir/$f ]; then - echo "$0: no such file $src_dir/$f this might be serious error." - continue - fi - cp $src_dir/$f $dest_dir/ -done - -nj=$(ls $src_dir/targets.*.ark | wc -l) -mkdir -p $dest_dir/temp -cat << EOF > $dest_dir/temp/copy_targets.sh -set -e; -id=\$1 -echo "$src_dir/targets.\$id.ark" -copy-matrix ark:$src_dir/targets.\$id.ark ark,t:- | \ -python -c " -import sys -for line in sys.stdin: - parts = line.split() - if \"[\" not in line: - print line.rstrip() - else: - print '$utt_prefix{0}$utt_suffix {1}'.format(parts[0], ' '.join(parts[1:])) -" | \ - copy-matrix ark,t:- ark:$dest_dir/targets.\$id.ark || exit 1; -set +o pipefail; # unset the pipefail option. -EOF -chmod +x $dest_dir/temp/copy_targets.sh -$cmd -v PATH JOB=1:$nj $dest_dir/temp/copy_targets.JOB.log $dest_dir/temp/copy_targets.sh JOB || exit 1; - -echo "$0: copied targets from $src_dir to $dest_dir" diff --git a/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh b/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh index 74697df099f..13318756e43 100755 --- a/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh +++ b/egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh @@ -9,8 +9,6 @@ set -o pipefail set -u -. ./cmd.sh - # At this script level we don't support not running on GPU, as it would be painfully slow. # If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false, # --num-threads 16 and --minibatch-size 128. @@ -50,6 +48,7 @@ affix=1a data_dir=exp/segmentation_1a/train_whole_hires_bp targets_dir=exp/segmentation_1a/train_whole_combined_targets_sub3 +. ./cmd.sh if [ -f ./path.sh ]; then . ./path.sh; fi . ./utils/parse_options.sh diff --git a/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh b/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh index 3254929306f..96009c69374 100755 --- a/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh +++ b/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh @@ -33,8 +33,8 @@ relu_dim=256 num_epochs=1 initial_effective_lrate=0.0003 final_effective_lrate=0.00003 -num_jobs_initial=1 -num_jobs_final=1 +num_jobs_initial=3 +num_jobs_final=8 remove_egs=true max_param_change=0.2 # Small max-param change for small network @@ -49,7 +49,7 @@ affix=1a2 data_dir=exp/segmentation_1a/train_whole_rvb_hires targets_dir=exp/segmentation_1a/train_whole_combined_targets_sub3 -. cmd.sh +. ./cmd.sh if [ -f ./path.sh ]; then . ./path.sh; fi . ./utils/parse_options.sh @@ -135,9 +135,10 @@ if [ $stage -le 6 ]; then fi if [ $stage -le 7 ]; then - copy-feats scp:$targets_dir/targets.scp ark:- | \ - matrix-sum-rows ark:- ark:- | vector-sum --binary=false ark:- - | \ - awk '{print " [ "$2" "$3" "$4" ]"}' > $dir/post_output.vec + # Use a subset to compute prior over the output targets + $cmd $dir/log/get_priors.log \ + matrix-sum-rows "scp:utils/subset_scp.pl --quiet 1000 $targets_dir/targets.scp |" \ + ark:- \| vector-sum --binary=false ark:- $dir/post_output.vec || exit 1 echo 3 > $dir/frame_subsampling_factor fi diff --git a/egs/wsj/s5/steps/segmentation/combine_targets_dirs.sh b/egs/wsj/s5/steps/segmentation/combine_targets_dirs.sh new file mode 100755 index 00000000000..f6be21e16f3 --- /dev/null +++ b/egs/wsj/s5/steps/segmentation/combine_targets_dirs.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +# Copyright 2017 Nagendra Kumar Goel +# 2018 Vimal Manohar +# Apache 2.0. + +# This script combines targets directory into a new targets directory +# containing targets from all the input targets directories. + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# -lt 3 ]; then + echo "Usage: $0 [options] ..." + echo "e.g.: $0 data/train exp/targets_combined exp/targets_1 exp/targets_2" + exit 1; +fi + +export LC_ALL=C + +data=$1; +shift; +dest=$1; +shift; +first_src=$1; + +mkdir -p $dest; +rm -f $dest/{targets.*.ark,frame_subsampling_factor} 2>/dev/null + +frame_subsampling_factor=1 +if [ -f $first_src/frame_subsampling_factor ]; then + cp $first_src/frame_subsampling_factor $dest + frame_subsampling_factor=$(cat $dest/frame_subsampling_factor) +fi + +for d in $*; do + this_frame_subsampling_factor=1 + if [ -f $d/frame_subsampling_factor ]; then + this_frame_subsampling_factor=$(cat $d/frame_subsampling_factor) + fi + + if [ $this_frame_subsampling_factor != $frame_subsampling_factor ]; then + echo "$0: Cannot combine targets directories with different frame-subsampling-factors" 1>&2 + exit 1 + fi + + cat $d/targets.scp +done | sort -k1,1 > $dest/targets.scp || exit 1 + +steps/segmentation/verify_targets_dir.sh $data $dest || exit 1 + +echo "Combined targets and stored in $dest" +exit 0 diff --git a/egs/wsj/s5/steps/segmentation/copy_targets_dir.sh b/egs/wsj/s5/steps/segmentation/copy_targets_dir.sh new file mode 100755 index 00000000000..f15206b1f7d --- /dev/null +++ b/egs/wsj/s5/steps/segmentation/copy_targets_dir.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# Copyright 2017 Nagendra Kumar Goel +# 2014 Johns Hopkins University (author: Nagendra K Goel) +# Apache 2.0 + +# This script makes a copy of targets directory (by copying targets.scp), +# possibly adding a specified prefix or a suffix to the utterance names. + +# begin configuration section +utt_prefix= +utt_suffix= +# end configuration section + +if [ -f ./path.sh ]; then . ./path.sh; fi +. ./utils/parse_options.sh + +if [ $# != 2 ]; then + echo "Usage: " + echo " $0 [options] " + echo "e.g.:" + echo " $0 --utt-prefix=1- exp/segmentation_1a/train_whole_combined_targets_sub3 exp/segmentation_1a/train_whole_combined_targets_sub3_rev1" + echo "Options" + echo " --utt-prefix= # Prefix for utterance ids, default empty" + echo " --utt-suffix= # Suffix for utterance ids, default empty" + exit 1; +fi + +export LC_ALL=C + +srcdir=$1 +destdir=$2 + +mkdir -p $destdir + +if [ -f $srcdir/frame_subsampling_factor ]; then + cp $srcdir/frame_subsampling_factor $destdir +fi + +cat $srcdir/targets.scp | awk -v p=$utt_prefix -v s=$utt_suffix \ + '{printf("%s %s%s%s\n", $1, p, $1, s);}' > $destdir/utt_map + +cat $srcdir/targets.scp | utils/apply_map.pl -f 1 $destdir/utt_map | \ + sort -k1,1 > $destdir/targets.scp + +echo "$0: copied targets from $srcdir to $destdir" diff --git a/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh b/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh index 9bc8eea675c..60e3df20df2 100755 --- a/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh +++ b/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh @@ -13,17 +13,16 @@ set -e set -o pipefail set -u -. ./cmd.sh if [ -f ./path.sh ]; then . ./path.sh; fi affix= # Affix for the segmentation nj=32 -cmd=$decode_cmd +cmd=queue.pl stage=-1 # Feature options (Must match training) mfcc_config=conf/mfcc_hires.conf -feat_affix=hires # Affix for the type of feature used +feat_affix= # Affix for the type of feature used convert_data_dir_to_whole=true # If true, the input data directory is # first converted to whole data directory (i.e. whole recordings) diff --git a/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh b/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh index de19cfc6772..f8557a70177 100755 --- a/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh +++ b/egs/wsj/s5/steps/segmentation/prepare_targets_gmm.sh @@ -1,7 +1,6 @@ #! /bin/bash # Copyright 2017 Vimal Manohar -# 2017 Nagendra Kumar Goel # Apache 2.0 # This script prepares targets for training neural network for @@ -211,7 +210,7 @@ if [ $stage -le 5 ]; then # the speech / silence decisions, not the exact word sequences. steps/decode.sh --cmd "$decode_cmd --mem 2G" --nj $nj \ --max-active 1000 --beam 10.0 \ - --skip-scoring true \ + --decode-extra-opts "--word-determinize=false" --skip-scoring true \ $graph_dir $uniform_seg_data_dir $decode_dir fi From b43e5dcb87ed7d04725c8e69c82f9e1779ca20a0 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Fri, 26 Jan 2018 12:22:05 -0500 Subject: [PATCH 07/11] simplifying stuff --- egs/swbd/s5c/local/run_asr_segmentation.sh | 78 ++++++++++------------ 1 file changed, 37 insertions(+), 41 deletions(-) diff --git a/egs/swbd/s5c/local/run_asr_segmentation.sh b/egs/swbd/s5c/local/run_asr_segmentation.sh index 21c20b0a423..4bc43007aca 100755 --- a/egs/swbd/s5c/local/run_asr_segmentation.sh +++ b/egs/swbd/s5c/local/run_asr_segmentation.sh @@ -10,7 +10,7 @@ lang=data/lang # Must match the one used to train the models lang_test=data/lang_nosp_sw1_tg # Lang directory for decoding. -data_dir=data/train +data_dir=data/train_nodup # Model directory used to align the $data_dir to get target labels for training # SAD. This should typically be a speaker-adapted system. sat_model_dir=exp/tri4 @@ -18,15 +18,8 @@ sat_model_dir=exp/tri4 # get target labels for training SAD. This should typically be a # speaker-independent system like LDA+MLLT system. model_dir=exp/tri3 -graph_dir= # If not provided, a new one will be created using $lang_test - -# Uniform segmentation options for decoding whole recordings. All values are in -# seconds. -max_segment_duration=10 -overlap_duration=2.5 -max_remaining_duration=5 # If the last remaining piece when splitting uniformly - # is smaller than this duration, then the last piece - # is merged with the previous. +graph_dir= # Graph for decoding whole-recording version of $data_dir. + # If not provided, a new one will be created using $lang_test # List of weights on labels obtained from alignment, # labels obtained from decoding and default labels in out-of-segment regions @@ -37,7 +30,7 @@ nstage=-10 train_stage=-10 test_stage=-10 num_data_reps=2 -affix=_1a +affix=_1a # For segmentation stage=-1 nj=80 @@ -77,7 +70,10 @@ if ! cat $dir/garbage_phones.txt $dir/silence_phones.txt | \ fi whole_data_dir=${data_dir}_whole -rvb_data_dir=${whole_data_dir}_rvb +targets_dir=exp/segmentation${affix}/train_whole_combined_targets_sub3 + +rvb_data_dir=${whole_data_dir}_rvb_hires +rvb_targets_dir=${targets_dir}_rvb if [ $stage -le 0 ]; then utils/data/convert_data_dir_to_whole.sh $data_dir $whole_data_dir @@ -112,15 +108,16 @@ if [ $stage -le 3 ]; then --nj 80 --reco-nj 40 --lang-test $lang_test \ --garbage-phones-list $dir/garbage_phones.txt \ --silence-phones-list $dir/silence_phones.txt \ - --merge-weights $merge_weights \ + --merge-weights "$merge_weights" \ + --graph-dir "$graph_dir" \ $lang $data_dir $whole_data_dir $sat_model_dir $model_dir $dir fi if [ $stage -le 4 ]; then # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises if [ ! -f rirs_noises.zip ]; then - wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip - unzip rirs_noises.zip + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip fi rvb_opts=() @@ -131,7 +128,6 @@ if [ $stage -le 4 ]; then foreground_snrs="20:10:15:5:0" background_snrs="20:10:15:5:0" - num_data_reps=1 # corrupt the data to generate multi-condition data # for data_dir in train dev test; do python steps/data/reverberate_data_dir.py \ @@ -147,70 +143,70 @@ if [ $stage -le 4 ]; then --source-sampling-rate 8000 \ $whole_data_dir $rvb_data_dir - rvb_dirs=() + rvb_targets_dirs=() for i in `seq 1 $num_data_reps`; do steps/segmentation/copy_targets_dir.sh --utt-prefix "rev${i}_" \ - exp/segmentation_1a/train_whole_combined_targets_sub3 \ - exp/segmentation_1a/train_whole_combined_targets_sub3_temp_$i || exit 1; - rvb_dirs+=(exp/segmentation_1a/train_whole_combined_targets_sub3_temp_$i) + $targets_dir ${targets_dir}_temp_$i || exit 1 + rvb_targets_dirs+=(${targets_dir}_temp_$i) done steps/segmentation/combine_targets_dirs.sh \ - $rvb_data_dir exp/segmentation_1a/train_whole_combined_targets_sub3_rvb \ - $rvb_dirs || exit 1; + $rvb_data_dir ${rvb_targets_dir} \ + ${rvb_targets_dirs[@]} || exit 1; + + rm -r ${rvb_targets_dirs[@]} fi if [ $stage -le 5 ]; then - utils/copy_data_dir.sh ${rvb_data_dir} ${rvb_data_dir}_hires steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 80 \ - ${rvb_data_dir}_hires - steps/compute_cmvn_stats.sh ${rvb_data_dir}_hires + ${rvb_data_dir} + steps/compute_cmvn_stats.sh ${rvb_data_dir} fi if [ $stage -le 6 ]; then # Train a STATS-pooling network for SAD - local/segmentation/tuning/train_stats_asr_sad_1a.sh \ + local/segmentation/tuning/train_stats_asr_sad_1a.sh \ --stage $nstage --train-stage $train_stage \ - --targets-dir exp/segmentation_1a/train_whole_combined_targets_sub3_rvb \ - --data-dir ${rvb_data_dir}_hires + --targets-dir ${rvb_targets_dir} \ + --data-dir ${rvb_data_dir} --affix "1a" || exit 1 fi if [ $stage -le 7 ]; then # The options to this script must match the options used in the # nnet training script. - # e.g. extra-left-context is 70, because the model is an LSTM trained with a - # chunk-left-context of 60. + # e.g. extra-left-context is 79, because the model is an stats pooling network + # trained with a chunk-left-context of 79 and chunk-right-context of 21. # Note: frames-per-chunk is 150 even though the model was trained with # chunk-width of 20. This is just for speed. # See the script for details of the options. steps/segmentation/detect_speech_activity.sh \ - --extra-left-context 70 --extra-right-context 0 --frames-per-chunk 150 \ + --extra-left-context 79 --extra-right-context 21 --frames-per-chunk 150 \ --extra-left-context-initial 0 --extra-right-context-final 0 \ --nj 32 --acwt 0.3 --stage $test_stage \ data/eval2000 \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a \ + exp/segmentation${affix}/tdnn_stats_asr_sad_1a \ mfcc_hires \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a/{,eval2000} + exp/segmentation${affix}/tdnn_stats_asr_sad_1a/{,eval2000} fi if [ $stage -le 8 ]; then # Do some diagnostics steps/segmentation/evaluate_segmentation.pl data/eval2000/segments \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/segments &> \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/evalutate_segmentation.log + exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/segments &> \ + exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/evalutate_segmentation.log steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/utt2spk \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/segments \ - exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/sys.rttm + exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/utt2spk \ + exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/segments \ + exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/sys.rttm # export PATH=$PATH:$KALDI_ROOT/tools/sctk/bin # md-eval.pl -c 0.25 -r $eval2000_rttm_file \ -# -s exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/sys.rttm > \ -# exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg/md_eval.log +# -s exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/sys.rttm > \ +# exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/md_eval.log fi if [ $stage -le 9 ]; then - utils/copy_data_dir.sh exp/segmentation_1a/tdnn_stats_asr_sad_1a/eval2000_seg \ + utils/copy_data_dir.sh exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg \ data/eval2000.seg_asr_sad_1a fi From a1224eeea978174c46dfd4c57d9c8a122dbf4d49 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 5 Feb 2018 00:37:07 -0500 Subject: [PATCH 08/11] Minor bug fix --- egs/wsj/s5/steps/segmentation/combine_targets_dirs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/segmentation/combine_targets_dirs.sh b/egs/wsj/s5/steps/segmentation/combine_targets_dirs.sh index f6be21e16f3..8135d089f5b 100755 --- a/egs/wsj/s5/steps/segmentation/combine_targets_dirs.sh +++ b/egs/wsj/s5/steps/segmentation/combine_targets_dirs.sh @@ -49,7 +49,7 @@ for d in $*; do cat $d/targets.scp done | sort -k1,1 > $dest/targets.scp || exit 1 -steps/segmentation/verify_targets_dir.sh $data $dest || exit 1 +steps/segmentation/validate_targets_dir.sh $dest $data || exit 1 echo "Combined targets and stored in $dest" exit 0 From e5a454a1ac1fc10ce2a812c68a20e91036ac423e Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 5 Feb 2018 09:30:36 -0500 Subject: [PATCH 09/11] Minor bug fixes --- egs/swbd/s5c/local/run_asr_segmentation.sh | 106 +++++++++++---------- 1 file changed, 55 insertions(+), 51 deletions(-) diff --git a/egs/swbd/s5c/local/run_asr_segmentation.sh b/egs/swbd/s5c/local/run_asr_segmentation.sh index 4bc43007aca..6d935616225 100755 --- a/egs/swbd/s5c/local/run_asr_segmentation.sh +++ b/egs/swbd/s5c/local/run_asr_segmentation.sh @@ -52,14 +52,14 @@ garbage_phones="lau spn" silence_phones="sil" for p in $garbage_phones; do - for affix in "" "_B" "_E" "_I" "_S"; do - echo "$p$affix" + for a in "" "_B" "_E" "_I" "_S"; do + echo "$p$a" done done > $dir/garbage_phones.txt for p in $silence_phones; do - for affix in "" "_B" "_E" "_I" "_S"; do - echo "$p$affix" + for a in "" "_B" "_E" "_I" "_S"; do + echo "$p$a" done done > $dir/silence_phones.txt @@ -69,8 +69,9 @@ if ! cat $dir/garbage_phones.txt $dir/silence_phones.txt | \ exit 1 fi +data_id=$(basename $data_dir) whole_data_dir=${data_dir}_whole -targets_dir=exp/segmentation${affix}/train_whole_combined_targets_sub3 +targets_dir=exp/segmentation${affix}/${data_id}_whole_combined_targets_sub3 rvb_data_dir=${whole_data_dir}_rvb_hires rvb_targets_dir=${targets_dir}_rvb @@ -84,8 +85,8 @@ fi ############################################################################### if [ $stage -le 1 ]; then steps/make_mfcc.sh --nj 50 --cmd "$train_cmd" --write-utt2num-frames true \ - $whole_data_dir exp/make_mfcc/train_whole - steps/compute_cmvn_stats.sh $whole_data_dir exp/make_mfcc/train_whole + $whole_data_dir exp/make_mfcc/${data_id}_whole + steps/compute_cmvn_stats.sh $whole_data_dir exp/make_mfcc/${data_id}_whole utils/fix_data_dir.sh $whole_data_dir fi @@ -114,56 +115,59 @@ if [ $stage -le 3 ]; then fi if [ $stage -le 4 ]; then - # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises - if [ ! -f rirs_noises.zip ]; then - wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip - unzip rirs_noises.zip - fi - - rvb_opts=() - # This is the config for the system using simulated RIRs and point-source noises - rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") - rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") - rvb_opts+=(--noise-set-parameters RIRS_NOISES/pointsource_noises/noise_list) - - foreground_snrs="20:10:15:5:0" - background_snrs="20:10:15:5:0" - # corrupt the data to generate multi-condition data - # for data_dir in train dev test; do - python steps/data/reverberate_data_dir.py \ - "${rvb_opts[@]}" \ - --prefix "rev" \ - --foreground-snrs $foreground_snrs \ - --background-snrs $background_snrs \ - --speech-rvb-probability 0.5 \ - --pointsource-noise-addition-probability 0.5 \ - --isotropic-noise-addition-probability 0.7 \ - --num-replications $num_data_reps \ - --max-noises-per-minute 4 \ - --source-sampling-rate 8000 \ - $whole_data_dir $rvb_data_dir - - rvb_targets_dirs=() - for i in `seq 1 $num_data_reps`; do - steps/segmentation/copy_targets_dir.sh --utt-prefix "rev${i}_" \ - $targets_dir ${targets_dir}_temp_$i || exit 1 - rvb_targets_dirs+=(${targets_dir}_temp_$i) - done - - steps/segmentation/combine_targets_dirs.sh \ - $rvb_data_dir ${rvb_targets_dir} \ - ${rvb_targets_dirs[@]} || exit 1; - - rm -r ${rvb_targets_dirs[@]} + # Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises + if [ ! -f rirs_noises.zip ]; then + wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip + unzip rirs_noises.zip + fi + + rvb_opts=() + # This is the config for the system using simulated RIRs and point-source noises + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list") + rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list") + rvb_opts+=(--noise-set-parameters RIRS_NOISES/pointsource_noises/noise_list) + + foreground_snrs="20:10:15:5:0" + background_snrs="20:10:15:5:0" + # corrupt the data to generate multi-condition data + # for data_dir in train dev test; do + python steps/data/reverberate_data_dir.py \ + "${rvb_opts[@]}" \ + --prefix "rev" \ + --foreground-snrs $foreground_snrs \ + --background-snrs $background_snrs \ + --speech-rvb-probability 0.5 \ + --pointsource-noise-addition-probability 0.5 \ + --isotropic-noise-addition-probability 0.7 \ + --num-replications $num_data_reps \ + --max-noises-per-minute 4 \ + --source-sampling-rate 8000 \ + $whole_data_dir $rvb_data_dir fi if [ $stage -le 5 ]; then steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 80 \ ${rvb_data_dir} steps/compute_cmvn_stats.sh ${rvb_data_dir} + utils/fix_data_dir.sh $rvb_data_dir fi if [ $stage -le 6 ]; then + rvb_targets_dirs=() + for i in `seq 1 $num_data_reps`; do + steps/segmentation/copy_targets_dir.sh --utt-prefix "rev${i}_" \ + $targets_dir ${targets_dir}_temp_$i || exit 1 + rvb_targets_dirs+=(${targets_dir}_temp_$i) + done + + steps/segmentation/combine_targets_dirs.sh \ + $rvb_data_dir ${rvb_targets_dir} \ + ${rvb_targets_dirs[@]} || exit 1; + + rm -r ${rvb_targets_dirs[@]} +fi + +if [ $stage -le 7 ]; then # Train a STATS-pooling network for SAD local/segmentation/tuning/train_stats_asr_sad_1a.sh \ --stage $nstage --train-stage $train_stage \ @@ -171,7 +175,7 @@ if [ $stage -le 6 ]; then --data-dir ${rvb_data_dir} --affix "1a" || exit 1 fi -if [ $stage -le 7 ]; then +if [ $stage -le 8 ]; then # The options to this script must match the options used in the # nnet training script. # e.g. extra-left-context is 79, because the model is an stats pooling network @@ -189,7 +193,7 @@ if [ $stage -le 7 ]; then exp/segmentation${affix}/tdnn_stats_asr_sad_1a/{,eval2000} fi -if [ $stage -le 8 ]; then +if [ $stage -le 9 ]; then # Do some diagnostics steps/segmentation/evaluate_segmentation.pl data/eval2000/segments \ exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/segments &> \ @@ -206,7 +210,7 @@ if [ $stage -le 8 ]; then # exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/md_eval.log fi -if [ $stage -le 9 ]; then +if [ $stage -le 10 ]; then utils/copy_data_dir.sh exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg \ data/eval2000.seg_asr_sad_1a fi From fb19685f72364d70afdddff6d35b440e29efc09d Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Mon, 5 Feb 2018 15:50:08 -0500 Subject: [PATCH 10/11] Making the split per-spk instead of per-utt --- egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh b/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh index 29d52588807..d47daac1bc0 100755 --- a/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh +++ b/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh @@ -72,9 +72,9 @@ fi nj=$(cat $latdir/num_jobs) oov=$(cat $lang/oov.int) -utils/split_data.sh --per-utt $data $nj +utils/split_data.sh $data $nj -sdata=$data/split${nj}utt +sdata=$data/split${nj} if [ $stage -le 1 ]; then $cmd JOB=1:$nj $dir/log/get_oracle.JOB.log \ From b539e3d2972a7a6012d2f9d8290e321297e08c2e Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 6 Feb 2018 15:14:49 -0500 Subject: [PATCH 11/11] swbd_sad: Minor fix --- .../s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh b/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh index 96009c69374..05e5f4ded05 100755 --- a/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh +++ b/egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh @@ -136,7 +136,7 @@ fi if [ $stage -le 7 ]; then # Use a subset to compute prior over the output targets - $cmd $dir/log/get_priors.log \ + $train_cmd $dir/log/get_priors.log \ matrix-sum-rows "scp:utils/subset_scp.pl --quiet 1000 $targets_dir/targets.scp |" \ ark:- \| vector-sum --binary=false ark:- $dir/post_output.vec || exit 1