151 changes: 111 additions & 40 deletions egs/swbd/s5c/local/run_asr_segmentation.sh
@@ -1,31 +1,25 @@
#! /bin/bash
#!/bin/bash

# Copyright 2017 Vimal Manohar
# Copyright 2017 Nagendra Kumar Goel
# 2017 Vimal Manohar
# Apache 2.0

# Features configs (Must match the features used to train the models
# $sat_model_dir and $model_dir)
# We assume run.sh has been executed (because we are using model
# directories like exp/tri4)

lang=data/lang_nosp # Must match the one used to train the models
lang=data/lang # Must match the one used to train the models
lang_test=data/lang_nosp_sw1_tg # Lang directory for decoding.

data_dir=data/train_100k_nodup
data_dir=data/train_nodup
# Model directory used to align the $data_dir to get target labels for training
# SAD. This should typically be a speaker-adapted system.
sat_model_dir=exp/tri4
# Model directory used to decode the whole-recording version of the $data_dir to
# get target labels for training SAD. This should typically be a
# speaker-independent system like LDA+MLLT system.
model_dir=exp/tri3
graph_dir= # If not provided, a new one will be created using $lang_test

# Uniform segmentation options for decoding whole recordings. All values are in
# seconds.
max_segment_duration=10
overlap_duration=2.5
max_remaining_duration=5 # If the last remaining piece when splitting uniformly
# is smaller than this duration, then the last piece
# is merged with the previous.
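# For example (illustrative numbers): with the settings above, a 23.5 s
# recording is split at a stride of max_segment_duration - overlap_duration
# = 7.5 s, i.e. pieces starting at 0, 7.5, 15 and 22.5 s; the last piece
# would be only 1 s long (< max_remaining_duration), so it is merged with
# the previous one, giving a final piece of [15, 23.5].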
graph_dir= # Graph for decoding whole-recording version of $data_dir.
# If not provided, a new one will be created using $lang_test

# List of weights on labels obtained from alignment,
# labels obtained from decoding and default labels in out-of-segment regions
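# e.g. merge_weights="1.0,0.1,0.5" (hypothetical values) would trust the
# alignment-derived labels fully, strongly down-weight the decoding-derived
# labels, and give intermediate weight to the default out-of-segment labels.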
@@ -35,13 +29,13 @@ prepare_targets_stage=-10
nstage=-10
train_stage=-10
test_stage=-10

affix=_1a
num_data_reps=2
affix=_1a # For segmentation
stage=-1
nj=80

. ./path.sh
. ./cmd.sh
if [ -f ./path.sh ]; then . ./path.sh; fi

set -e -u -o pipefail
. utils/parse_options.sh
@@ -55,17 +49,17 @@ mkdir -p $dir

# See $lang/phones.txt and decide which should be garbage
garbage_phones="lau spn"
silence_phones="nsn SIL"
silence_phones="sil"

for p in $garbage_phones; do
for affix in "" "_B" "_E" "_I" "_S"; do
echo "$p$affix"
for a in "" "_B" "_E" "_I" "_S"; do
echo "$p$a"
done
done > $dir/garbage_phones.txt

for p in $silence_phones; do
for affix in "" "_B" "_E" "_I" "_S"; do
echo "$p$affix"
for a in "" "_B" "_E" "_I" "_S"; do
echo "$p$a"
done
done > $dir/silence_phones.txt
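# e.g. with silence_phones="sil" this writes the position-dependent variants
# sil, sil_B, sil_E, sil_I and sil_S, one per line, matching the _B/_E/_I/_S
# word-position suffixes used in $lang/phones.txt.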

@@ -75,7 +69,12 @@ if ! cat $dir/garbage_phones.txt $dir/silence_phones.txt | \
exit 1
fi

data_id=$(basename $data_dir)
whole_data_dir=${data_dir}_whole
targets_dir=exp/segmentation${affix}/${data_id}_whole_combined_targets_sub3

rvb_data_dir=${whole_data_dir}_rvb_hires
rvb_targets_dir=${targets_dir}_rvb

if [ $stage -le 0 ]; then
utils/data/convert_data_dir_to_whole.sh $data_dir $whole_data_dir
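# This produces a copy of $data_dir with one utterance per recording and no
# segments file; the per-segment transcripts are concatenated so that the
# whole recordings can be decoded and aligned in the stages below.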
@@ -85,8 +84,10 @@ fi
# Extract features for the whole data directory
###############################################################################
if [ $stage -le 1 ]; then
steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj --write-utt2num-frames true \
${whole_data_dir} || exit 1
steps/make_mfcc.sh --nj 50 --cmd "$train_cmd" --write-utt2num-frames true \
$whole_data_dir exp/make_mfcc/${data_id}_whole
steps/compute_cmvn_stats.sh $whole_data_dir exp/make_mfcc/${data_id}_whole
utils/fix_data_dir.sh $whole_data_dir
fi

###############################################################################
@@ -108,38 +109,108 @@ if [ $stage -le 3 ]; then
--nj 80 --reco-nj 40 --lang-test $lang_test \
--garbage-phones-list $dir/garbage_phones.txt \
--silence-phones-list $dir/silence_phones.txt \
--merge-weights "$merge_weights" \
--graph-dir "$graph_dir" \
$lang $data_dir $whole_data_dir $sat_model_dir $model_dir $dir
fi
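# The targets written here are per-frame soft labels with three columns
# (silence, speech, garbage), formed by merging the alignment of $data_dir,
# the decode of $whole_data_dir and the default labels for out-of-segment
# regions, weighted by --merge-weights.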

if [ $stage -le 4 ]; then
utils/copy_data_dir.sh ${whole_data_dir} ${whole_data_dir}_hires_bp
steps/make_mfcc.sh --mfcc-config conf/mfcc_hires_bp.conf --nj 40 \
${whole_data_dir}_hires_bp
steps/compute_cmvn_stats.sh ${whole_data_dir}_hires_bp
# Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises
if [ ! -f rirs_noises.zip ]; then
wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
unzip rirs_noises.zip
fi

rvb_opts=()
# This is the config for the system using simulated RIRs and point-source noises
rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/smallroom/rir_list")
rvb_opts+=(--rir-set-parameters "0.5, RIRS_NOISES/simulated_rirs/mediumroom/rir_list")
rvb_opts+=(--noise-set-parameters RIRS_NOISES/pointsource_noises/noise_list)

foreground_snrs="20:10:15:5:0"
background_snrs="20:10:15:5:0"
# corrupt the data to generate multi-condition data
# for data_dir in train dev test; do
python steps/data/reverberate_data_dir.py \
"${rvb_opts[@]}" \
--prefix "rev" \
--foreground-snrs $foreground_snrs \
--background-snrs $background_snrs \
--speech-rvb-probability 0.5 \
--pointsource-noise-addition-probability 0.5 \
--isotropic-noise-addition-probability 0.7 \
--num-replications $num_data_reps \
--max-noises-per-minute 4 \
--source-sampling-rate 8000 \
$whole_data_dir $rvb_data_dir
fi
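# With --prefix "rev" and $num_data_reps replications, the corrupted copies
# get utterance and recording ids prefixed rev1_, rev2_, ...; stage 6 below
# relies on exactly these prefixes when it copies the targets with
# --utt-prefix "rev${i}_".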

if [ $stage -le 5 ]; then
# Train a TDNN-LSTM network for SAD
local/segmentation/tuning/train_lstm_asr_sad_1a.sh \
--stage $nstage --train-stage $train_stage \
--targets-dir $dir \
--data-dir ${whole_data_dir}_hires_bp
steps/make_mfcc.sh --mfcc-config conf/mfcc_hires.conf --nj 80 \
${rvb_data_dir}
steps/compute_cmvn_stats.sh ${rvb_data_dir}
utils/fix_data_dir.sh $rvb_data_dir
fi

if [ $stage -le 6 ]; then
rvb_targets_dirs=()
for i in $(seq 1 $num_data_reps); do
steps/segmentation/copy_targets_dir.sh --utt-prefix "rev${i}_" \
$targets_dir ${targets_dir}_temp_$i || exit 1
rvb_targets_dirs+=(${targets_dir}_temp_$i)
done

steps/segmentation/combine_targets_dirs.sh \
$rvb_data_dir ${rvb_targets_dir} \
${rvb_targets_dirs[@]} || exit 1;

rm -r ${rvb_targets_dirs[@]}
fi
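# Note that the targets are only renamed and combined here, not recomputed:
# the labels derived from the clean audio are reused for the reverberated
# copies, on the assumption that the corruption does not move the
# speech/non-speech boundaries.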

if [ $stage -le 7 ]; then
# Train a STATS-pooling network for SAD
local/segmentation/tuning/train_stats_asr_sad_1a.sh \
--stage $nstage --train-stage $train_stage \
--targets-dir ${rvb_targets_dir} \
--data-dir ${rvb_data_dir} --affix "1a" || exit 1
fi

if [ $stage -le 8 ]; then
# The options to this script must match the options used in the
# nnet training script.
# e.g. extra-left-context is 70, because the model is an LSTM trained with a
# chunk-left-context of 60.
# e.g. extra-left-context is 79, because the model is a stats-pooling network
# trained with a chunk-left-context of 79 and chunk-right-context of 21.
# Note: frames-per-chunk is 150 even though the model was trained with
# chunk-width of 20. This is just for speed.
# See the script for details of the options.
steps/segmentation/detect_speech_activity.sh \
--extra-left-context 70 --extra-right-context 0 --frames-per-chunk 150 \
--extra-left-context 79 --extra-right-context 21 --frames-per-chunk 150 \
--extra-left-context-initial 0 --extra-right-context-final 0 \
--nj 32 --acwt 0.3 --stage $test_stage \
data/eval2000 \
exp/segmentation_1a/tdnn_lstm_asr_sad_1a \
mfcc_hires_bp \
exp/segmentation_1a/tdnn_lstm_asr_sad_1a/{,eval2000}
exp/segmentation${affix}/tdnn_stats_asr_sad_1a \
mfcc_hires \
exp/segmentation${affix}/tdnn_stats_asr_sad_1a/{,eval2000}
fi
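# Note the brace expansion in the last argument: it expands to the model
# directory and exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000, and
# the segmented output data directory is created as .../eval2000_seg, which
# the stages below consume.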

if [ $stage -le 9 ]; then
# Do some diagnostics
steps/segmentation/evaluate_segmentation.pl data/eval2000/segments \
exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/segments &> \
exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/evaluate_segmentation.log

steps/segmentation/convert_utt2spk_and_segments_to_rttm.py \
exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/utt2spk \
exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/segments \
exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/sys.rttm
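# sys.rttm should contain one SPEAKER line per detected speech segment,
# approximately of the form
#   SPEAKER <recording-id> <channel> <start> <duration> <NA> <NA> <speaker> <NA>
# with start and duration in seconds.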

# export PATH=$PATH:$KALDI_ROOT/tools/sctk/bin
# md-eval.pl -c 0.25 -r $eval2000_rttm_file \
# -s exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/sys.rttm > \
# exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg/md_eval.log
fi

if [ $stage -le 10 ]; then
utils/copy_data_dir.sh exp/segmentation${affix}/tdnn_stats_asr_sad_1a/eval2000_seg \
data/eval2000.seg_asr_sad_1a
fi
58 changes: 58 additions & 0 deletions egs/swbd/s5c/local/run_cleanup_segmentation.sh
@@ -0,0 +1,58 @@
#!/bin/bash

# Copyright 2016 Vimal Manohar
# 2016 Johns Hopkins University (author: Daniel Povey)
# 2017 Nagendra Kumar Goel
# Apache 2.0

# This script demonstrates how to re-segment training data selecting only the
# "good" audio that matches the transcripts.
# The basic idea is to decode with an existing in-domain acoustic model, and a
# biased language model built from the reference, and then work out the
# segmentation from a ctm like file.
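# Roughly: for each utterance, build a biased LM from its reference
# transcript, decode the audio with $srcdir and that LM, align the resulting
# ctm against the reference, and keep only the well-matching regions as the
# new segments (correcting transcripts where the match is partial).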

# For nnet3 and chain results after cleanup, see the scripts in
# local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh

# GMM Results for speaker-independent (SI) and speaker adaptive training (SAT) systems on dev and test sets
# [will add these later].

set -e
set -o pipefail
set -u

stage=0
cleanup_stage=0
data=data/train_nodup
cleanup_affix=cleaned
srcdir=exp/tri4
langdir=data/lang_sw1_tg
nj=100
decode_nj=16
decode_num_threads=4

. ./cmd.sh
if [ -f ./path.sh ]; then . ./path.sh; fi
. utils/parse_options.sh

cleaned_data=${data}_${cleanup_affix}

dir=${srcdir}_${cleanup_affix}_work
cleaned_dir=${srcdir}_${cleanup_affix}

if [ $stage -le 1 ]; then
# This does the actual data cleanup.
steps/cleanup/clean_and_segment_data.sh --stage $cleanup_stage \
--nj $nj --cmd "$train_cmd" \
$data $langdir $srcdir $dir $cleaned_data
fi

if [ $stage -le 2 ]; then
steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
$cleaned_data $langdir $srcdir ${srcdir}_ali_${cleanup_affix}
fi

if [ $stage -le 3 ]; then
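# The two numeric arguments below are the number of context-dependent tree
# leaves (11500) and the total number of Gaussians (200000); these appear
# to match the tri4 configuration in the main run.sh.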
steps/train_sat.sh --cmd "$train_cmd" \
11500 200000 $cleaned_data $langdir ${srcdir}_ali_${cleanup_affix} ${cleaned_dir}
fi
egs/swbd/s5c/local/segmentation/tuning/train_lstm_asr_sad_1a.sh
@@ -1,13 +1,14 @@
#!/bin/bash

# Copyright 2017 Nagendra Kumar Goel
# Apache 2.0

# This is a script to train a TDNN-LSTM for speech activity detection (SAD)
# using LSTM for long-context information.

set -o pipefail
set -u

. ./cmd.sh

# At this script level we don't support running without a GPU, as it would be painfully slow.
# If you want to run without GPU you'd have to call train_tdnn.sh with --gpu false,
# --num-threads 16 and --minibatch-size 128.
@@ -48,7 +49,7 @@ data_dir=exp/segmentation_1a/train_whole_hires_bp
targets_dir=exp/segmentation_1a/train_whole_combined_targets_sub3

. ./cmd.sh
. ./path.sh
if [ -f ./path.sh ]; then . ./path.sh; fi
. ./utils/parse_options.sh

if [ -z "$dir" ]; then
18 changes: 12 additions & 6 deletions egs/swbd/s5c/local/segmentation/tuning/train_stats_asr_sad_1a.sh
@@ -1,5 +1,8 @@
#!/bin/bash

# Copyright 2017 Nagendra Kumar Goel
# 2016 Vimal Manohar
# Apache 2.0
# This is a script to train a TDNN for speech activity detection (SAD)
# using statistics pooling for long-context information.

@@ -27,7 +30,7 @@ extra_right_context=21
relu_dim=256

# training options
num_epochs=4
num_epochs=1
initial_effective_lrate=0.0003
final_effective_lrate=0.00003
num_jobs_initial=3
@@ -43,11 +46,11 @@ config_dir=
dir=
affix=1a2

data_dir=exp/segmentation_1a/train_whole_hires_bp
data_dir=exp/segmentation_1a/train_whole_rvb_hires
targets_dir=exp/segmentation_1a/train_whole_combined_targets_sub3

. ./cmd.sh
. ./path.sh
if [ -f ./path.sh ]; then . ./path.sh; fi
. ./utils/parse_options.sh

if [ -z "$dir" ]; then
@@ -129,10 +132,13 @@ if [ $stage -le 6 ]; then
--targets-scp="$targets_dir/targets.scp" \
--egs.opts="--frame-subsampling-factor 3 --num-utts-subset $num_utts_subset" \
--dir=$dir || exit 1
fi

copy-feats scp:$targets_dir/targets.scp ark:- | \
matrix-sum-rows ark:- ark:- | vector-sum --binary=false ark:- - | \
awk '{print " [ "$2" "$3" ]"}' > $dir/post_output.vec
if [ $stage -le 7 ]; then
# Use a subset to compute prior over the output targets
$train_cmd $dir/log/get_priors.log \
matrix-sum-rows "scp:utils/subset_scp.pl --quiet 1000 $targets_dir/targets.scp |" \
ark:- \| vector-sum --binary=false ark:- $dir/post_output.vec || exit 1
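# matrix-sum-rows reduces each utterance's (frames x 3) target matrix to a
# 3-dimensional vector of per-class totals, and vector-sum adds these over
# the (up to) 1000 sampled utterances; post_output.vec then holds
# unnormalized class priors, used later to normalize the network posteriors
# (e.g. in detect_speech_activity.sh).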

echo 3 > $dir/frame_subsampling_factor
fi
2 changes: 1 addition & 1 deletion egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh
@@ -74,7 +74,7 @@ oov=$(cat $lang/oov.int)

utils/split_data.sh $data $nj

sdata=$data/split$nj;
sdata=$data/split${nj}

if [ $stage -le 1 ]; then
$cmd JOB=1:$nj $dir/log/get_oracle.JOB.log \