151 changes: 111 additions & 40 deletions egs/swbd/s5c/local/run_asr_segmentation.sh
@@ -1,31 +1,25 @@
#! /bin/bash
#!/bin/bash

# Copyright 2017 Vimal Manohar
# Copyright 2017 Nagendra Kumar Goel
# 2017 Vimal Manohar
# Apache 2.0

# Features configs (Must match the features used to train the models
# $sat_model_dir and $model_dir)
# We assume run.sh has been executed (because we are using model
# directories like exp/tri4)

lang=data/lang_nosp # Must match the one used to train the models
lang=data/lang # Must match the one used to train the models
lang_test=data/lang_nosp_sw1_tg # Lang directory for decoding.

data_dir=data/train_100k_nodup
data_dir=data/train_nodup
# Model directory used to align the $data_dir to get target labels for training
# SAD. This should typically be a speaker-adapted system.
sat_model_dir=exp/tri4
# Model directory used to decode the whole-recording version of the $data_dir to
# get target labels for training SAD. This should typically be a
# speaker-independent system like LDA+MLLT system.
model_dir=exp/tri3
graph_dir= # If not provided, a new one will be created using $lang_test

# Uniform segmentation options for decoding whole recordings. All values are in
# seconds.
max_segment_duration=10
overlap_duration=2.5
max_remaining_duration=5 # If the last remaining piece when splitting uniformly
# is smaller than this duration, then the last piece
# is merged with the previous.
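# For example (illustrative numbers): with the settings above, a 23.5 s
# recording is split at a stride of max_segment_duration - overlap_duration
# = 7.5 s, i.e. pieces starting at 0, 7.5, 15 and 22.5 s; the last piece
# would be only 1 s long (< max_remaining_duration), so it is merged with
# the previous one, giving a final piece of [15, 23.5].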
graph_dir= # Graph for decoding whole-recording version of $data_dir.
# If not provided, a new one will be created using $lang_test

# List of weights on labels obtained from alignment,
# labels obtained from decoding and default labels in out-of-segment regions
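# e.g. merge_weights="1.0,0.1,0.5" (hypothetical values) would trust the
# alignment-derived labels fully, strongly down-weight the decoding-derived
# labels, and give intermediate weight to the default out-of-segment labels.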
@@ -35,13 +29,13 @@ prepare_targets_stage=-10
nstage=-10
train_stage=-10
test_stage=-10

affix=_1a
num_data_reps=2
affix=_1a # For segmentation
stage=-1
nj=80

. ./path.sh
. ./cmd.sh
if [ -f ./path.sh ]; then . ./path.sh; fi

set -e -u -o pipefail
. utils/parse_options.sh
@@ -55,17 +49,17 @@ mkdir -p $dir

# See $lang/phones.txt and decide which should be garbage
garbage_phones="lau spn"
silence_phones="nsn SIL"
silence_phones="sil"

for p in $garbage_phones; do
for affix in "" "_B" "_E" "_I" "_S"; do
echo "$p$affix"
for a in "" "_B" "_E" "_I" "_S"; do
echo "$p$a"
done
done > $dir/garbage_phones.txt

for p in $silence_phones; do
for affix in "" "_B" "_E" "_I" "_S"; do
echo "$p$affix"
for a in "" "_B" "_E" "_I" "_S"; do
echo "$p$a"
done
done > $dir/silence_phones.txt
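# e.g. with silence_phones="sil" this writes the position-dependent variants
# sil, sil_B, sil_E, sil_I and sil_S, one per line, matching the _B/_E/_I/_S
# word-position suffixes used in $lang/phones.txt.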

@@ -75,7 +69,12 @@ if ! cat $dir/garbage_phones.txt $dir/silence_phones.txt | \
exit 1
fi

data_id=$(basename $data_dir)
whole_data_dir=${data_dir}_whole
targets_dir=exp/segmentation${affix}/${data_id}_whole_combined_targets_sub3

rvb_data_dir=${whole_data_dir}_rvb_hires
rvb_targets_dir=${targets_dir}_rvb

if [ $stage -le 0 ]; then
utils/data/convert_data_dir_to_whole.sh $data_dir $whole_data_dir
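# This produces a copy of $data_dir with one utterance per recording and no
# segments file; the per-segment transcripts are concatenated so that the
# whole recordings can be decoded and aligned in the stages below.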
@@ -85,8 +84,10 @@ fi
# Extract features for the whole data directory
###############################################################################
if [ $stage -le 1 ]; then
steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj --write-utt2num-frames true \
${whole_data_dir} || exit 1
steps/make_mfcc.sh --nj 50 --cmd "$train_cmd" --write-utt2num-frames true \
$whole_data_dir exp/make_mfcc/${data_id}_whole
steps/compute_cmvn_stats.sh $whole_data_dir exp/make_mfcc/${data_id}_whole
utils/fix_data_dir.sh $whole_data_dir
fi

###############################################################################
@@ -108,38 +109,108 @@ if [ $stage -le 3 ]; then
--nj 80 --reco-nj 40 --lang-test $lang_test \
--garbage-phones-list $dir/garbage_phones.txt \
--silence-phones-list $dir/silence_phones.txt \
--merge-weights "$merge_weights" \
--graph-dir "$graph_dir" \
$lang $data_dir $whole_data_dir $sat_model_dir $model_dir $dir
fi
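# The targets written here are per-frame soft labels with three columns
# (silence, speech, garbage), formed by merging the alignment of $data_dir,
# the decode of $whole_data_dir and the default labels for out-of-segment
# regions, weighted by --merge-weights.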

if [ $stage -le 4 ]; then
utils/copy_data_dir.sh ${whole_data_dir} ${whole_data_dir}_hires_bp
steps/make_mfcc.sh --mfcc-config conf/mfcc_hires_bp.conf --nj 40 \
${whole_data_dir}_hires_bp
steps/compute_cmvn_stats.sh ${whole_data_dir}_hires_bp
# Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises
if [ ! -f rirs_noises.zip ]; then
wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
unzip rirs_noises.zip
fi

rvb_opts=()
# This is the config for the system using simulated RIRs and point-source noises
rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list")
rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list")
rvb_opts+=(--noise-set-parameters RIRS_NOISES/pointsource_noises/noise_list)

foreground_snrs="20:10:15:5:0"
background_snrs="20:10:15:5:0"
# corrupt the data to generate multi-condition data
# for data_dir in train dev test; do
python steps/data/reverberate_data_dir.py \
"${rvb_opts[@]}" \
--prefix "rev" \
--foreground-snrs $foreground_snrs \
--background-snrs $background_snrs \
--speech-rvb-probability 0.5 \
--pointsource-noise-addition-probability 0.5 \
--isotropic-noise-addition-probability 0.7 \
--num-replications $num_data_reps \
--max-noises-per-minute 4 \
--source-sampling-rate 8000 \
$whole_data_dir $rvb_data_dir
fi
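# With --prefix "rev" and $num_data_reps replications, the corrupted copies
# get utterance and recording ids prefixed rev1_, rev2_, ...; stage 6 below
# relies on exactly these prefixes when it copies the targets with
# --utt-prefix "rev${i}_".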

if [ $stage -le 5 ]; then
# Train a TDNN-LSTM network for SAD
local/segmentation/tuning/train_lstm_asr_sad_1a.sh \
--stage $nstage --train-stage $train_stage \
--targets-dir $dir \
--data-dir ${whole_data_dir}_hires_bp
steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 80 \
${rvb_data_dir}
steps/compute_cmvn_stats.sh ${rvb_data_dir}
utils/fix_data_dir.sh $rvb_data_dir
fi

if [ $stage -le 6 ]; then
rvb_targets_dirs=()
for i in $(seq 1 $num_data_reps); do
steps/segmentation/copy_targets_dir.sh --utt-prefix "rev${i}_" \
$targets_dir ${targets_dir}_temp_$i || exit 1
rvb_targets_dirs+=(${targets_dir}_temp_$i)
done

steps/segmentation/combine_targets_dirs.sh \
$rvb_data_dir ${rvb_targets_dir} \
${rvb_targets_dirs[@]} || exit 1;

rm -r ${rvb_targets_dirs[@]}
fi
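# Note that the targets are only renamed and combined here, not recomputed:
# the labels derived from the clean audio are reused for the reverberated
# copies, on the assumption that the corruption does not move the
# speech/non-speech boundaries.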

if [ $stage -le 7 ]; then
# Train a STATS-pooling network for SAD
local/segmentation/tuning/train_stats_asr_sad_1a.sh \
--stage $nstage --train-stage $train_stage \
--targets-dir ${rvb_targets_dir} \
--data-dir ${rvb_data_dir} --affix "1a" || exit 1
fi

if [ $stage -le 8 ]; then
# The options to this script must match the options used in the
# nnet training script.
# e.g. extra-left-context is 70, because the model is an LSTM trained with a
# chunk-left-context of 60.
# e.g. extra-left-context is 79, because the model is a stats-pooling network
# trained with a chunk-left-context of 79 and chunk-right-context of 21.
# Note: frames-per-chunk is 150 even though the model was trained with
# chunk-width of 20. This is just for speed.
# See the script for details of the options.
steps/segmentation/detect_speech_activity.sh \
--extra-left-context 70 --extra-right-context 0 --frames-per-chunk 150 \
--extra-left-context 79 --extra-right-context 21 --frames-per-chunk 150 \
--extra-left-context-initial 0 --extra-right-context-final 0 \
--nj 32 --acwt 0.3 --stage $test_stage \
data/eval2000 \
exp/segmentation_1a/tdnn_lstm_asr_sad_1a \
mfcc_hires_bp \
exp/segmentation_1a/tdnn_lstm_asr_sad_1a/{,eval2000}
exp/segmentation${affix}/tdnn_stats_asr_sad_1a \
mfcc_hires \
exp/segmentation${affix}/tdnn_stats_asr_sad_1a/{,eval2000}
fi
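# Note the brace expansion in the last argument: it expands to the model
# directory and exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000, and
# the segmented output data directory is created as .../eval2000_seg, which
# the stages below consume.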

if [ $stage -le 9 ]; then
# Do some diagnostics
steps/segmentation/evaluate_segmentation.pl data/eval2000/segments \
exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/segments &> \
exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/evaluate_segmentation.log

steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \
exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/utt2spk \
exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/segments \
exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/sys.rttm
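# sys.rttm should contain one SPEAKER line per detected speech segment,
# approximately of the form
#   SPEAKER <recording-id> <channel> <start> <duration> <NA> <NA> <speaker> <NA>
# with start and duration in seconds.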

# export PATH=$PATH:$KALDI_ROOT/tools/sctk/bin
# md-eval.pl -c 0.25 -r $eval2000_rttm_file \
# -s exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/sys.rttm > \
# exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/md_eval.log
fi

if [ $stage -le 10 ]; then
utils/copy_data_dir.sh exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg \
data/eval2000.seg_asr_sad_1a
fi
58 changes: 58 additions & 0 deletions egs/swbd/s5c/local/run_cleanup_segmentation.sh
@@ -0,0 +1,58 @@
#!/bin/bash

# Copyright 2016 Vimal Manohar
# 2016 Johns Hopkins University (author: Daniel Povey)
# 2017 Nagendra Kumar Goel
# Apache 2.0

# This script demonstrates how to re-segment training data selecting only the
# "good" audio that matches the transcripts.
# The basic idea is to decode with an existing in-domain acoustic model, and a
# biased language model built from the reference, and then work out the
# segmentation from a ctm like file.
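# Roughly: for each utterance, build a biased LM from its reference
# transcript, decode the audio with $srcdir and that LM, align the resulting
# ctm against the reference, and keep only the well-matching regions as the
# new segments (correcting transcripts where the match is partial).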

# For nnet3 and chain results after cleanup, see the scripts in
# local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh

# GMM Results for speaker-independent (SI) and speaker adaptive training (SAT) systems on dev and test sets
# [will add these later].

set -e
set -o pipefail
set -u

stage=0
cleanup_stage=0
data=data/train_nodup
cleanup_affix=cleaned
srcdir=exp/tri4
langdir=data/lang_sw1_tg
nj=100
decode_nj=16
decode_num_threads=4

. ./cmd.sh
if [ -f ./path.sh ]; then . ./path.sh; fi
. utils/parse_options.sh

cleaned_data=${data}_${cleanup_affix}

dir=${srcdir}_${cleanup_affix}_work
cleaned_dir=${srcdir}_${cleanup_affix}

if [ $stage -le 1 ]; then
# This does the actual data cleanup.
steps/cleanup/clean_and_segment_data.sh --stage $cleanup_stage \
--nj $nj --cmd "$train_cmd" \
$data $langdir $srcdir $dir $cleaned_data
fi

if [ $stage -le 2 ]; then
steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
$cleaned_data $langdir $srcdir ${srcdir}_ali_${cleanup_affix}
fi

if [ $stage -le 3 ]; then
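# The two numeric arguments below are the number of context-dependent tree
# leaves (11500) and the total number of Gaussians (200000); these appear
# to match the tri4 configuration in the main run.sh.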
steps/train_sat.sh --cmd "$train_cmd" \
11500 200000 $cleaned_data $langdir ${srcdir}_ali_${cleanup_affix} ${cleaned_dir}
fi
egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh
@@ -1,13 +1,14 @@
#!/bin/bash

# Copyright 2017 Nagendra Kumar Goel
# Apache 2.0

# This is a script to train a TDNN-LSTM for speech activity detection (SAD)
# using LSTM for long-context information.

set -o pipefail
set -u

. ./cmd.sh

# At this script level we don't support running without a GPU, as it would be painfully slow.
# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false,
# --num-threads 16 and --minibatch-size 128.
@@ -48,7 +49,7 @@ data_dir=exp/segmentation_1a/train_whole_hires_bp
targets_dir=exp/segmentation_1a/train_whole_combined_targets_sub3

. ./cmd.sh
. ./path.sh
if [ -f ./path.sh ]; then . ./path.sh; fi
. ./utils/parse_options.sh

if [ -z "$dir" ]; then
18 changes: 12 additions & 6 deletions egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh
@@ -1,5 +1,8 @@
#!/bin/bash

# Copyright 2017 Nagendra Kumar Goel
# 2016 Vimal Manohar
# Apache 2.0
# This is a script to train a TDNN for speech activity detection (SAD)
# using statistics pooling for long-context information.

@@ -27,7 +30,7 @@ extra_right_context=21
relu_dim=256

# training options
num_epochs=4
num_epochs=1
initial_effective_lrate=0.0003
final_effective_lrate=0.00003
num_jobs_initial=3
@@ -43,11 +46,11 @@ config_dir=
dir=
affix=1a2

data_dir=exp/segmentation_1a/train_whole_hires_bp
data_dir=exp/segmentation_1a/train_whole_rvb_hires
targets_dir=exp/segmentation_1a/train_whole_combined_targets_sub3

. ./cmd.sh
. ./path.sh
if [ -f ./path.sh ]; then . ./path.sh; fi
. ./utils/parse_options.sh

if [ -z "$dir" ]; then
@@ -129,10 +132,13 @@ if [ $stage -le 6 ]; then
--targets-scp="$targets_dir/targets.scp" \
--egs.opts="--frame-subsampling-factor 3 --num-utts-subset $num_utts_subset" \
--dir=$dir || exit 1
fi

copy-feats scp:$targets_dir/targets.scp ark:- | \
matrix-sum-rows ark:- ark:- | vector-sum --binary=false ark:- - | \
awk '{print " [ "$2" "$3" ]"}' > $dir/post_output.vec
if [ $stage -le 7 ]; then
# Use a subset to compute prior over the output targets
$train_cmd $dir/log/get_priors.log \
matrix-sum-rows "scp:utils/subset_scp.pl --quiet 1000 $targets_dir/targets.scp |" \
ark:- \| vector-sum --binary=false ark:- $dir/post_output.vec || exit 1
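# matrix-sum-rows reduces each utterance's (frames x 3) target matrix to a
# 3-dimensional vector of per-class totals, and vector-sum adds these over
# the (up to) 1000 sampled utterances; post_output.vec then holds
# unnormalized class priors, used later to normalize the network posteriors
# (e.g. in detect_speech_activity.sh).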

echo 3 > $dir/frame_subsampling_factor
fi
2 changes: 1 addition & 1 deletion egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh
@@ -74,7 +74,7 @@ oov=$(cat $lang/oov.int)

utils/split_data.sh $data $nj

sdata=$data/split$nj;
sdata=$data/split${nj}

if [ $stage -le 1 ]; then
$cmd JOB=1:$nj $dir/log/get_oracle.JOB.log \