diff --git a/egs/sre08/v1/sid/compute_vad_decision.sh b/egs/sre08/v1/sid/compute_vad_decision.sh deleted file mode 100755 index 7099d063c7f..00000000000 --- a/egs/sre08/v1/sid/compute_vad_decision.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/bash - -# Copyright 2013 Daniel Povey -# Apache 2.0 -# To be run from .. (one directory up from here) -# see ../run.sh for example - -# Compute energy based VAD output -# We do this in just one job; it's fast. -# - -nj=2 -cmd=run.pl -vad_config=conf/vad.conf - -echo "$0 $@" # Print the command line for logging - -if [ -f path.sh ]; then . ./path.sh; fi -. parse_options.sh || exit 1; - -if [ $# != 3 ]; then - echo "Usage: $0 [options] "; - echo "e.g.: $0 data/train exp/make_vad mfcc" - echo " Options:" - echo " --vad-config # config passed to compute-vad-energy" - echo " --nj # number of parallel jobs" - echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." - exit 1; -fi - -data=$1 -logdir=$2 -vaddir=$3 - -# make $vaddir an absolute pathname. -vaddir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $vaddir ${PWD}` - -# use "name" as part of name of the archive. -name=`basename $data` - -mkdir -p $vaddir || exit 1; -mkdir -p $logdir || exit 1; - - -for f in $data/feats.scp "$vad_config"; do - if [ ! 
-f $f ]; then - echo "compute_vad_decision.sh: no such file $f" - exit 1; - fi -done - -utils/split_data.sh $data $nj || exit 1; -sdata=$data/split$nj; - -$cmd JOB=1:$nj $logdir/vad_${name}.JOB.log \ - compute-vad --config=$vad_config scp:$sdata/JOB/feats.scp ark,scp:$vaddir/vad_${name}.JOB.ark,$vaddir/vad_${name}.JOB.scp \ - || exit 1; - -for ((n=1; n<=nj; n++)); do - cat $vaddir/vad_${name}.$n.scp || exit 1; -done > $data/vad.scp - -nc=`cat $data/vad.scp | wc -l` -nu=`cat $data/feats.scp | wc -l` -if [ $nc -ne $nu ]; then - echo "**Warning it seems not all of the speakers got VAD output ($nc != $nu);" - echo "**validate_data_dir.sh will fail; you might want to use fix_data_dir.sh" - [ $nc -eq 0 ] && exit 1; -fi - - -echo "Created VAD output for $name" diff --git a/egs/sre08/v1/sid/compute_vad_decision.sh b/egs/sre08/v1/sid/compute_vad_decision.sh new file mode 120000 index 00000000000..174321b847e --- /dev/null +++ b/egs/sre08/v1/sid/compute_vad_decision.sh @@ -0,0 +1 @@ +../steps/compute_vad_decision.sh \ No newline at end of file diff --git a/egs/wsj/s5/steps/cleanup/clean_and_segment_data.sh b/egs/wsj/s5/steps/cleanup/clean_and_segment_data.sh index 670e6c2b714..9bf354b7891 100755 --- a/egs/wsj/s5/steps/cleanup/clean_and_segment_data.sh +++ b/egs/wsj/s5/steps/cleanup/clean_and_segment_data.sh @@ -6,9 +6,9 @@ # This script demonstrates how to re-segment training data selecting only the # "good" audio that matches the transcripts. -# The basic idea is to decode with an existing in-domain acoustic model, and a -# biased language model built from the reference, and then work out the -# segmentation from a ctm like file. +# The basic idea is to decode with an existing in-domain GMM acoustic model, and +# a biased language model built from the reference transcript, and then work out +# the segmentation from a ctm like file. 
set -e -o pipefail diff --git a/egs/wsj/s5/steps/cleanup/clean_and_segment_data_nnet3.sh b/egs/wsj/s5/steps/cleanup/clean_and_segment_data_nnet3.sh new file mode 100755 index 00000000000..306d6d3647a --- /dev/null +++ b/egs/wsj/s5/steps/cleanup/clean_and_segment_data_nnet3.sh @@ -0,0 +1,265 @@ +#!/bin/bash + +# Copyright 2016 Vimal Manohar +# 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +# This script demonstrates how to re-segment training data selecting only the +# "good" audio that matches the transcripts. +# This script is like clean_and_segment_data.sh, but uses nnet3 model instead of +# a GMM for decoding. +# The basic idea is to decode with an existing in-domain nnet3 acoustic model, +# and a biased language model built from the reference transcript, and then work +# out the segmentation from a ctm like file. + +set -e +set -o pipefail +set -u + +stage=0 + +cmd=run.pl +cleanup=true # remove temporary directories and files +nj=4 +# Decode options +graph_opts= +beam=15.0 +lattice_beam=1.0 + +# Contexts must ideally match training +extra_left_context=0 # Set to some large value, typically 40 for LSTM (must match training) +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +frames_per_chunk=150 + +# i-vector options +extractor= # i-Vector extractor. If provided, will extract i-vectors. + # Required if the network was trained with i-vector extractor. +use_vad= # Use energy-based VAD for i-vector extraction + +segmentation_opts= + +. ./path.sh +. utils/parse_options.sh + + +if [ $# -ne 5 ]; then + cat <] [options] + This script does data cleanup to remove bad portions of transcripts and + may do other minor modifications of transcripts such as allowing repetitions + for disfluencies, and adding or removing non-scored words (by default: + words that map to 'silence phones') + Note: is expected to contain a nnet3-based model. 
+ and decoding options like --extra-left-context must match + the appropriate options used for training. + + e.g. $0 data/train data/lang exp/tri3 exp/tri3_cleanup data/train_cleaned + main options (for others, see top of script file): + --stage # stage to run from, to enable resuming from partially + # completed run (default: 0) + --cmd '$cmd' # command to submit jobs with (e.g. run.pl, queue.pl) + --nj # number of parallel jobs to use in graph creation and + # decoding + --graph-opts 'opts' # Additional options to make_biased_lm_graphs.sh. + # Please run steps/cleanup/make_biased_lm_graphs.sh + # without arguments to see allowed options. + --segmentation-opts 'opts' # Additional options to segment_ctm_edits.py. + # Please run steps/cleanup/internal/segment_ctm_edits.py + # without arguments to see allowed options. + --cleanup # Clean up intermediate files afterward. Default true. + --extractor # i-vector extractor directory if i-vector is + # to be used during decoding. Must match + # the extractor used for training neural-network. + --use-vad # If true, uses energy-based VAD to apply frame weights + # for i-vector stats extraction +EOF + exit 1 +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 +data_out=$5 + + +extra_files= +if [ ! -z "$extractor" ]; then + extra_files="$extractor/final.ie" +fi + +for f in $srcdir/{final.mdl,tree,cmvn_opts} $data/utt2spk $data/feats.scp \ + $lang/words.txt $lang/oov.txt $extra_files; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist." + exit 1 + fi +done + +mkdir -p $dir +cp $srcdir/final.mdl $dir +cp $srcdir/tree $dir +cp $srcdir/cmvn_opts $dir +cp $srcdir/{splice_opts,delta_opts,final.mat,final.alimdl} $dir 2>/dev/null || true +cp $srcdir/frame_subsampling_factor $dir 2>/dev/null || true + +utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt +cp $lang/phones.txt $dir + +if [ $stage -le 1 ]; then + echo "$0: Building biased-language-model decoding graphs..." 
+ + + steps/cleanup/make_biased_lm_graphs.sh $graph_opts \ + --nj $nj --cmd "$cmd" \ + $data $lang $dir $dir/graphs +fi + +online_ivector_dir= +if [ ! -z "$extractor" ]; then + online_ivector_dir=$dir/ivectors_$(basename $data) + + if [ $stage -le 2 ]; then + # Compute energy-based VAD + if $use_vad; then + steps/compute_vad_decision.sh $data \ + $data/log $data/data + fi + + steps/online/nnet2/extract_ivectors_online.sh \ + --nj $nj --cmd "$cmd --mem 4G" --use-vad $use_vad \ + $data $extractor $online_ivector_dir + fi +fi + +if [ $stage -le 3 ]; then + echo "$0: Decoding with biased language models..." + + steps/cleanup/decode_segmentation_nnet3.sh \ + --beam $beam --lattice-beam $lattice_beam --nj $nj --cmd "$cmd --mem 4G" \ + --skip-scoring true --allow-partial false \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk $frames_per_chunk \ + ${online_ivector_dir:+--online-ivector-dir $online_ivector_dir} \ + $dir/graphs $data $dir/lats + + # the following is for diagnostics, e.g. it will give us the lattice depth. + steps/diagnostic/analyze_lats.sh --cmd "$cmd" $lang $dir/lats +fi + +frame_shift_opt= +if [ -f $srcdir/frame_subsampling_factor ]; then + frame_shift_opt="--frame-shift=0.0$(cat $srcdir/frame_subsampling_factor)" +fi + +if [ $stage -le 4 ]; then + echo "$0: Doing oracle alignment of lattices..." + steps/cleanup/lattice_oracle_align.sh --cmd "$cmd --mem 4G" $frame_shift_opt \ + $data $lang $dir/lats $dir/lattice_oracle +fi + + +if [ $stage -le 4 ]; then + echo "$0: using default values of non-scored words..." 
+ + # At the level of this script we just hard-code it that non-scored words are + # those that map to silence phones (which is what get_non_scored_words.py + # gives us), although this could easily be made user-configurable. This list + # of non-scored words affects the behavior of several of the data-cleanup + # scripts; essentially, we view the non-scored words as negotiable when it + # comes to the reference transcript, so we'll consider changing the reference + # to match the hyp when it comes to these words. + steps/cleanup/internal/get_non_scored_words.py $lang > $dir/non_scored_words.txt +fi + +if [ $stage -le 5 ]; then + echo "$0: modifying ctm-edits file to allow repetitions [for dysfluencies] and " + echo " ... to fix reference mismatches involving non-scored words. " + + $cmd $dir/log/modify_ctm_edits.log \ + steps/cleanup/internal/modify_ctm_edits.py --verbose=3 $dir/non_scored_words.txt \ + $dir/lattice_oracle/ctm_edits $dir/ctm_edits.modified + + echo " ... See $dir/log/modify_ctm_edits.log for details and stats, including" + echo " a list of commonly-repeated words." +fi + +if [ $stage -le 6 ]; then + echo "$0: applying 'taint' markers to ctm-edits file to mark silences and" + echo " ... non-scored words that are next to errors." + $cmd $dir/log/taint_ctm_edits.log \ + steps/cleanup/internal/taint_ctm_edits.py $dir/ctm_edits.modified $dir/ctm_edits.tainted + echo "... Stats, including global cor/ins/del/sub stats, are in $dir/log/taint_ctm_edits.log." +fi + + +if [ $stage -le 7 ]; then + echo "$0: creating segmentation from ctm-edits file." 
+ + $cmd $dir/log/segment_ctm_edits.log \ + steps/cleanup/internal/segment_ctm_edits.py \ + $segmentation_opts \ + --oov-symbol-file=$lang/oov.txt \ + --ctm-edits-out=$dir/ctm_edits.segmented \ + --word-stats-out=$dir/word_stats.txt \ + $dir/non_scored_words.txt \ + $dir/ctm_edits.tainted $dir/text $dir/segments + + echo "$0: contents of $dir/log/segment_ctm_edits.log are:" + cat $dir/log/segment_ctm_edits.log + echo "For word-level statistics on p(not-being-in-a-segment), with 'worst' words at the top," + echo "see $dir/word_stats.txt" + echo "For detailed utterance-level debugging information, see $dir/ctm_edits.segmented" +fi + +if [ $stage -le 8 ]; then + echo "$0: working out required segment padding to account for feature-generation edge effects." + # make sure $data/utt2dur exists. + utils/data/get_utt2dur.sh $data + # utt2dur.from_ctm contains lines of the form 'utt dur', e.g. + # AMI_EN2001a_H00_MEE068_0000557_0000594 0.35 + # where the times are ultimately derived from the num-frames in the features. + cat $dir/lattice_oracle/ctm_edits | \ + awk '{utt=$1; t=$3+$4; if (t > dur[$1]) dur[$1] = t; } END{for (k in dur) print k, dur[k];}' | \ + sort > $dir/utt2dur.from_ctm + # the apply_map command below gives us lines of the form 'utt dur-from-$data/utt2dur dur-from-utt2dur.from_ctm', + # e.g. AMI_EN2001a_H00_MEE068_0000557_0000594 0.37 0.35 + utils/apply_map.pl -f 1 <(awk '{print $1,$1,$2}' <$data/utt2dur) <$dir/utt2dur.from_ctm | \ + awk '{printf("%.3f\n", $2 - $3); }' | sort | uniq -c > $dir/padding_frequencies + # there are values other than the most-frequent one (0.02) in there because + # of wav files that were shorter than the segment info. 
+ padding=$(head -n 1 $dir/padding_frequencies | awk '{print $2}') + echo "$0: we'll pad segments with $padding seconds at segment ends to correct for feature-generation end effects" + echo $padding >$dir/segment_end_padding +fi + + +if [ $stage -le 8 ]; then + echo "$0: based on the segments and text file in $dir/segments and $dir/text, creating new data-dir in $data_out" + padding=$(cat $dir/segment_end_padding) # e.g. 0.02 + utils/data/subsegment_data_dir.sh --segment-end-padding $padding ${data} $dir/segments $dir/text $data_out + # utils/data/subsegment_data_dir.sh can output directories that have e.g. too many entries left in wav.scp + # Clean this up with the fix_data_dir.sh script + utils/fix_data_dir.sh $data_out +fi + +if [ $stage -le 9 ]; then + echo "$0: recomputing CMVN stats for the new data" + # Caution: this script puts the CMVN stats in $data_out/data, + # e.g. data/train_cleaned/data. This is not the general pattern we use. + steps/compute_cmvn_stats.sh $data_out $data_out/log $data_out/data +fi + +if $cleanup; then + echo "$0: cleaning up intermediate files" + rm -r $dir/fsts $dir/HCLG.fsts.scp || true + rm -r $dir/lats/lat.*.gz $dir/lats/split_fsts || true + rm $dir/lattice_oracle/lat.*.gz || true +fi + +echo "$0: done." diff --git a/egs/wsj/s5/steps/cleanup/decode_segmentation_nnet3.sh b/egs/wsj/s5/steps/cleanup/decode_segmentation_nnet3.sh new file mode 100755 index 00000000000..02a9d87d26b --- /dev/null +++ b/egs/wsj/s5/steps/cleanup/decode_segmentation_nnet3.sh @@ -0,0 +1,174 @@ +#!/bin/bash + +# Copyright 2014 Guoguo Chen, 2015 GoVivace Inc. (Nagendra Goel) + # 2017 Vimal Manohar + # Apache 2.0 + +# This script is similar to steps/cleanup/decode_segmentation.sh, but +# does decoding using nnet3 model. + +set -e +set -o pipefail + +# Begin configuration section. +stage=-1 +nj=4 # number of decoding jobs. +acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. 
+post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the + # regular scoring script works. +cmd=run.pl +beam=15.0 +frames_per_chunk=50 +max_active=7000 +min_active=200 +ivector_scale=1.0 +lattice_beam=8.0 # Beam we use in lattice generation. We can reduce this if + # we only need the best path +iter=final +num_threads=1 # if >1, will use gmm-latgen-faster-parallel +scoring_opts= +skip_scoring=false +allow_partial=true +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +online_ivector_dir= +minimize=false +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "$0: This is a special decoding script for segmentation where we" + echo "use one decoding graph per segment. We assume a file HCLG.fsts.scp exists" + echo "which is the scp file of the graphs for each segment." + echo "This will normally be obtained by steps/cleanup/make_biased_lm_graphs.sh." + echo "" + echo "Usage: $0 [options] " + echo " e.g.: $0 --online-ivector-dir exp/nnet3/ivectors_train_si284_split " + echo " exp/nnet3/tdnn/graph_train_si284_split \\" + echo " data/train_si284_split exp/nnet3/tdnn/decode_train_si284_split" + echo "" + echo "where is assumed to be a sub-directory of the directory" + echo "where the model is." + echo "" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --iter # Iteration of model to test." + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --acwt # acoustic scale used for lattice generation " + echo " --scoring-opts # options to local/score.sh" + echo " --num-threads # number of threads to use, default 1." 
+ exit 1; +fi + + +graphdir=$1 +data=$2 +dir=$3 + +mkdir -p $dir/log + +if [ -e $dir/$iter.mdl ]; then + srcdir=$dir +elif [ -e $dir/../$iter.mdl ]; then + srcdir=$(dirname $dir) +else + echo "$0: expected either $dir/$iter.mdl or $dir/../$iter.mdl to exist" + exit 1 +fi +model=$srcdir/$iter.mdl + + +extra_files= +if [ ! -z "$online_ivector_dir" ]; then + steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1 + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" +fi + +utils/lang/check_phones_compatible.sh $graphdir/phones.txt $srcdir/phones.txt || exit 1 + +for f in $graphdir/HCLG.fsts.scp $data/feats.scp $model $extra_files; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +sdata=$data/split$nj; +cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1; +thread_string= +[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" + +mkdir -p $dir/log +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + +# Split HCLG.fsts.scp by input utterance +n1=$(cat $graphdir/HCLG.fsts.scp | wc -l) +n2=$(cat $data/feats.scp | wc -l) +if [ $n1 != $n2 ]; then + echo "$0: expected $n2 graphs in $graphdir/HCLG.fsts.scp, got $n1" +fi + +mkdir -p $dir/split_fsts +sort -k1,1 $graphdir/HCLG.fsts.scp > $dir/HCLG.fsts.sorted.scp +utils/filter_scps.pl --no-warn -f 1 JOB=1:$nj \ + $sdata/JOB/feats.scp $dir/HCLG.fsts.sorted.scp $dir/split_fsts/HCLG.fsts.JOB.scp +HCLG=scp:$dir/split_fsts/HCLG.fsts.JOB.scp + +## Set up features. +echo "$0: feature type is raw" + +feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" + +if [ ! 
-z "$online_ivector_dir" ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +fi + +if [ "$post_decode_acwt" == 1.0 ]; then + lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz" +else + lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" +fi + +frame_subsampling_opt= +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. for 'chain' systems + frame_subsampling_opt="--frame-subsampling-factor=$(cat $srcdir/frame_subsampling_factor)" +fi + +if [ $stage -le 1 ]; then + if [ -f "$graphdir/num_pdfs" ]; then + [ "`cat $graphdir/num_pdfs`" -eq `am-info --print-args=false $model | grep pdfs | awk '{print $NF}'` ] || \ + { echo "Mismatch in number of pdfs with $model"; exit 1; } + fi + $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \ + nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=$allow_partial \ + --word-symbol-table=$graphdir/words.txt "$model" \ + "$HCLG" "$feats" "$lat_wspecifier" || exit 1; +fi + + +if [ $stage -le 2 ]; then + if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "$0: Not scoring because local/score.sh does not exist or not executable." && exit 1; + iter_opt= + [ "$iter" != "final" ] && iter_opt="--iter $iter" + local/score.sh $iter_opt $scoring_opts --cmd "$cmd" $data $graphdir $dir || + { echo "$0: Scoring failed. (ignore by '--skip-scoring true')"; exit 1; } + fi +fi +echo "Decoding done." 
+exit 0; diff --git a/egs/wsj/s5/steps/cleanup/segment_long_utterances.sh b/egs/wsj/s5/steps/cleanup/segment_long_utterances.sh index 16350fdb032..c7e50ea165e 100755 --- a/egs/wsj/s5/steps/cleanup/segment_long_utterances.sh +++ b/egs/wsj/s5/steps/cleanup/segment_long_utterances.sh @@ -4,6 +4,23 @@ # 2016 Vimal Manohar # Apache 2.0 +# This script performs segmentation of the input data based on the transcription +# and outputs segmented data along with the corresponding aligned transcription. +# The purpose of this script is to divide up the input data (which may consist +# of long recordings such as television shows or audiobooks) into segments which +# are of manageable length for further processing, along with the portion of the +# transcript that seems to match (aligns with) each segment. +# This is for the light-supervised training scenario where the input transcription is +# not expected to be completely clean and may have significant errors. +# See "JHU Kaldi System for Arabic MGB-3 ASR Challenge using Diarization, +# Audio-transcript Alignment and Transfer Learning": Vimal Manohar, Daniel +# Povey, Sanjeev Khudanpur, ASRU 2017 +# (http://www.danielpovey.com/files/2017_asru_mgb3.pdf) for details. +# The output data is not necessarily particularly clean; you can run +# steps/cleanup/clean_and_segment_data.sh on the output in order to +# further clean it and eliminate data where the transcript doesn't seem to +# match. + . 
./path.sh set -e @@ -380,7 +397,8 @@ if [ $stage -le 9 ]; then fi if [ $stage -le 10 ]; then - steps/cleanup/internal/resolve_ctm_edits_overlaps.py \ + $cmd $dir/log/resolve_ctm_edits.log \ + steps/cleanup/internal/resolve_ctm_edits_overlaps.py \ ${data_uniform_seg}/segments $decode_dir/ctm_$lmwt/ctm_edits $dir/ctm_edits fi diff --git a/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh b/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh new file mode 100755 index 00000000000..d21b94fc5fb --- /dev/null +++ b/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh @@ -0,0 +1,523 @@ +#!/bin/bash + +# Copyright 2014 Guoguo Chen + # 2016 Vimal Manohar + # Apache 2.0 + +# This script is similar to steps/cleanup/segment_long_utterances.sh, but +# uses nnet3 acoustic model instead of GMM acoustic model for decoding. +# This script performs segmentation of the input data based on the transcription +# and outputs segmented data along with the corresponding aligned transcription. +# The purpose of this script is to divide up the input data (which may consist +# of long recordings such as television shows or audiobooks) into segments which +# are of manageable length for further processing, along with the portion of the +# transcript that seems to match (aligns with) each segment. +# This is for the light-supervised training scenario where the input transcription is +# not expected to be completely clean and may have significant errors. +# See "JHU Kaldi System for Arabic MGB-3 ASR Challenge using Diarization, +# Audio-transcript Alignment and Transfer Learning": Vimal Manohar, Daniel +# Povey, Sanjeev Khudanpur, ASRU 2017 +# (http://www.danielpovey.com/files/2017_asru_mgb3.pdf) for details. +# The output data is not necessarily particularly clean; you can run +# steps/cleanup/clean_and_segment_data_nnet3.sh on the output in order to +# further clean it and eliminate data where the transcript doesn't seem to +# match. 
+ + +set -e +set -o pipefail +set -u + +stage=-1 +cmd=run.pl +nj=4 + +# Uniform segmentation options +max_segment_duration=30 +overlap_duration=5 +seconds_per_spk_max=30 + +# Decode options +graph_opts= +beam=15.0 +lattice_beam=1.0 +lmwt=10 + +# Contexts must ideally match training +extra_left_context=0 # Set to some large value, typically 40 for LSTM (must match training) +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +frames_per_chunk=150 + +# i-vector options +extractor= # i-Vector extractor. If provided, will extract i-vectors. + # Required if the network was trained with i-vector extractor. +use_vad= # Use energy-based VAD for i-vector extraction + +# TF-IDF similarity search options +max_words=1000 +num_neighbors_to_search=1 # Number of neighboring documents to search around the one retrieved based on maximum tf-idf similarity. +neighbor_tfidf_threshold=0.5 + +align_full_hyp=false # Align full hypothesis i.e. trackback from the end to get the alignment. + +# First-pass segmentation opts +# These options are passed to the script +# steps/cleanup/internal/segment_ctm_edits_mild.py +segmentation_extra_opts= +min_split_point_duration=0.1 +max_deleted_words_kept_when_merging=1 +max_wer=50 +max_segment_length_for_merging=60 +max_bad_proportion=0.75 +max_intersegment_incorrect_words_length=1 +max_segment_length_for_splitting=10 +hard_max_segment_length=15 +min_silence_length_to_split_at=0.3 +min_non_scored_length_to_split_at=0.3 + + +. ./path.sh +. utils/parse_options.sh + +if [ $# -ne 5 ] && [ $# -ne 7 ]; then + cat <] [options] [ ] + e.g.: $0 exp/wsj_tri2b data/lang_nosp data/train_long data/train_long/text data/train_reseg exp/segment_wsj_long_utts_train +This script performs segmentation of the data in and writes out the +segmented data (with a segments file) to + along with the corresponding aligned transcription. 
+Note: If is not provided, the "text" file in is used as the +raw transcripts to train biased LM for the utterances. +If is provided, then it should be a mapping from the utterance-ids in + to the transcript-keys in the file , which will be +used to train biased LMs for the utterances. +The purpose of this script is to divide up the input data (which may consist of +long recordings such as television shows or audiobooks) into segments which are +of manageable length for further processing, along with the portion of the +transcript that seems to match each segment. +The output data is not necessarily particularly clean; you are advised to run +steps/cleanup/clean_and_segment_data.sh on the output in order to further clean +it and eliminate data where the transcript doesn't seem to match. + main options (for others, see top of script file): + --stage # stage to run from, to enable resuming from partially + # completed run (default: 0) + --cmd '$cmd' # command to submit jobs with (e.g. run.pl, queue.pl) + --nj # number of parallel jobs to use in graph creation and + # decoding + --graph-opts 'opts' # Additional options to make_biased_lm_graphs.sh. + # Please run steps/cleanup/make_biased_lm_graphs.sh + # without arguments to see allowed options. + --segmentation-extra-opts 'opts' # Additional options to segment_ctm_edits_mild.py. + # Please run steps/cleanup/internal/segment_ctm_edits_mild.py + # without arguments to see allowed options. + --align-full-hyp # If true, align full hypothesis + i.e. trackback from the end to get the alignment. + This is different from the normal + Smith-Waterman alignment, where the + traceback will be from the maximum score. + --extractor # i-vector extractor directory if i-vector is + # to be used during decoding. Must match + # the extractor used for training neural-network. 
+ --use-vad # If true, uses energy-based VAD to apply frame weights + # for i-vector stats extraction +EOF + exit 1 +fi + +srcdir=$1 +lang=$2 +data=$3 + +extra_files= +utt2text= +text=$data/text +if [ $# -eq 7 ]; then + text=$4 + utt2text=$5 + out_data=$6 + dir=$7 + extra_files="$utt2text" +else + out_data=$4 + dir=$5 +fi + +if [ ! -z "$extractor" ]; then + extra_files="$extra_files $extractor/final.ie" +fi + +for f in $data/feats.scp $text $extra_files $srcdir/tree \ + $srcdir/final.mdl $srcdir/cmvn_opts; do + if [ ! -f $f ]; then + echo "$0: Could not find file $f" + exit 1 + fi +done + +data_id=`basename $data` +mkdir -p $dir +cp $srcdir/final.mdl $dir +cp $srcdir/tree $dir +cp $srcdir/cmvn_opts $dir +cp $srcdir/{splice_opts,delta_opts,final.mat,final.alimdl} $dir 2>/dev/null || true +cp $srcdir/frame_subsampling_factor $dir 2>/dev/null || true + +utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt +cp $lang/phones.txt $dir + +data_uniform_seg=$dir/${data_id}_uniform_seg + +# First we split the data into segments of around 30s long, on which +# it would be possible to do a decoding. +# A diarization step will be added in the future. +if [ $stage -le 1 ]; then + echo "$0: Stage 1 (Splitting data directory $data into uniform segments)" + + utils/data/get_utt2dur.sh $data + if [ ! -f $data/segments ]; then + utils/data/get_segments_for_data.sh $data > $data/segments + fi + + utils/data/get_uniform_subsegments.py \ + --max-segment-duration=$max_segment_duration \ + --overlap-duration=$overlap_duration \ + --max-remaining-duration=$(perl -e "print $max_segment_duration / 2.0") \ + $data/segments > $dir/uniform_sub_segments +fi + +if [ $stage -le 2 ]; then + echo "$0: Stage 2 (Prepare uniform sub-segmented data directory)" + rm -r $data_uniform_seg || true + + if [ ! 
-z "$seconds_per_spk_max" ]; then + utils/data/subsegment_data_dir.sh \ + $data $dir/uniform_sub_segments $dir/${data_id}_uniform_seg.temp + + utils/data/modify_speaker_info.sh --seconds-per-spk-max $seconds_per_spk_max \ + $dir/${data_id}_uniform_seg.temp $data_uniform_seg + else + utils/data/subsegment_data_dir.sh \ + $data $dir/uniform_sub_segments $data_uniform_seg + fi + + utils/fix_data_dir.sh $data_uniform_seg + + # Compute new cmvn stats for the segmented data directory + steps/compute_cmvn_stats.sh $data_uniform_seg/ +fi + +graph_dir=$dir/graphs_uniform_seg + +if [ $stage -le 3 ]; then + echo "$0: Stage 3 (Building biased-language-model decoding graphs)" + + mkdir -p $graph_dir + + # Make graphs w.r.t. to the original text (usually recording-level) + steps/cleanup/make_biased_lm_graphs.sh $graph_opts \ + --nj $nj --cmd "$cmd" $text \ + $lang $dir $dir/graphs + if [ -z "$utt2text" ]; then + # and then copy it to the sub-segments. + cat $dir/uniform_sub_segments | awk '{print $1" "$2}' | \ + utils/apply_map.pl -f 2 $dir/graphs/HCLG.fsts.scp | \ + sort -k1,1 > \ + $graph_dir/HCLG.fsts.scp + else + # and then copy it to the sub-segments. + cat $dir/uniform_sub_segments | awk '{print $1" "$2}' | \ + utils/apply_map.pl -f 2 $utt2text | \ + utils/apply_map.pl -f 2 $dir/graphs/HCLG.fsts.scp | \ + sort -k1,1 > \ + $graph_dir/HCLG.fsts.scp + fi + + cp $lang/words.txt $graph_dir + cp -r $lang/phones $graph_dir + [ -f $dir/graphs/num_pdfs ] && cp $dir/graphs/num_pdfs $graph_dir/ +fi + +decode_dir=$dir/lats +mkdir -p $decode_dir + +online_ivector_dir= +if [ ! 
-z "$extractor" ]; then + online_ivector_dir=$dir/ivectors_$(basename $data_uniform_seg) + + if [ $stage -le 4 ]; then + # Compute energy-based VAD + if $use_vad; then + steps/compute_vad_decision.sh $data_uniform_seg \ + $data_uniform_seg/log $data_uniform_seg/data + fi + + steps/online/nnet2/extract_ivectors_online.sh \ + --nj $nj --cmd "$cmd --mem 4G" --use-vad $use_vad \ + $data_uniform_seg $extractor $online_ivector_dir + fi +fi + +if [ $stage -le 5 ]; then + echo "$0: Decoding with biased language models..." + + steps/cleanup/decode_segmentation_nnet3.sh \ + --beam $beam --lattice-beam $lattice_beam --nj $nj --cmd "$cmd --mem 4G" \ + --skip-scoring true --allow-partial false \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial $extra_left_context_initial \ + --extra-right-context-final $extra_right_context_final \ + --frames-per-chunk $frames_per_chunk \ + ${online_ivector_dir:+--online-ivector-dir $online_ivector_dir} \ + $graph_dir $data_uniform_seg $decode_dir +fi + +frame_shift_opt= +if [ -f $srcdir/frame_subsampling_factor ]; then + frame_shift_opt="--frame-shift=0.0$(cat $srcdir/frame_subsampling_factor)" +fi + +if [ $stage -le 6 ]; then + steps/get_ctm_fast.sh --lmwt $lmwt --cmd "$cmd --mem 4G" \ + --print-silence true $frame_shift_opt \ + $data_uniform_seg $lang $decode_dir $decode_dir/ctm_$lmwt +fi + +# Split the original text into documents, over which we can do +# searching reasonably efficiently. Also get a mapping from the original +# text to the created documents (i.e. text2doc) +# Since the Smith-Waterman alignment is linear in the length of the +# text, we want to keep it reasonably small (a few thousand words). + +if [ $stage -le 7 ]; then + # Split the reference text into documents. + mkdir -p $dir/docs + + # text2doc is a mapping from the original transcript to the documents + # it is split into. + # The format is + # ... 
+ steps/cleanup/internal/split_text_into_docs.pl --max-words $max_words \ + $text $dir/docs/doc2text $dir/docs/docs.txt + utils/utt2spk_to_spk2utt.pl $dir/docs/doc2text > $dir/docs/text2doc +fi + +if [ $stage -le 8 ]; then + # Get TF-IDF for the reference documents. + echo $nj > $dir/docs/num_jobs + + utils/split_data.sh $data_uniform_seg $nj + + mkdir -p $dir/docs/split$nj/ + + # First compute IDF stats + $cmd $dir/log/compute_source_idf_stats.log \ + steps/cleanup/internal/compute_tf_idf.py \ + --tf-weighting-scheme="raw" \ + --idf-weighting-scheme="log" \ + --output-idf-stats=$dir/docs/idf_stats.txt \ + $dir/docs/docs.txt $dir/docs/src_tf_idf.txt + + # Split documents so that they can be accessed easily by parallel jobs. + mkdir -p $dir/docs/split$nj/ + sdir=$dir/docs/split$nj + for n in `seq $nj`; do + + # old2new_utts is a mapping from the original segments to the + # new segments created by uniformly segmenting. + # The format is ... + utils/filter_scp.pl $data_uniform_seg/split$nj/$n/utt2spk $dir/uniform_sub_segments | \ + cut -d ' ' -f 1,2 | utils/utt2spk_to_spk2utt.pl > $sdir/old2new_utts.$n.txt + + if [ ! -z "$utt2text" ]; then + # utt2text, if provided, is a mapping from the to + # . + # Since text2doc is mapping from to documents, we + # first have to find the original-transcripts that are in the current + # split. + utils/filter_scp.pl $sdir/old2new_utts.$n.txt $utt2text | \ + cut -d ' ' -f 2 | sort -u | \ + utils/filter_scp.pl /dev/stdin $dir/docs/text2doc > $sdir/text2doc.$n + else + utils/filter_scp.pl $sdir/old2new_utts.$n.txt \ + $dir/docs/text2doc > $sdir/text2doc.$n + fi + + utils/spk2utt_to_utt2spk.pl $sdir/text2doc.$n | \ + utils/filter_scp.pl /dev/stdin $dir/docs/docs.txt > \ + $sdir/docs.$n.txt + done + + # Compute TF-IDF for the source documents. 
+ $cmd JOB=1:$nj $dir/docs/log/get_tfidf_for_source_texts.JOB.log \ + steps/cleanup/internal/compute_tf_idf.py \ + --tf-weighting-scheme="raw" \ + --idf-weighting-scheme="log" \ + --input-idf-stats=$dir/docs/idf_stats.txt \ + $sdir/docs.JOB.txt $sdir/src_tf_idf.JOB.txt + + sdir=$dir/docs/split$nj + # Make $sdir an absolute pathname. + sdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $sdir ${PWD}` + + for n in `seq $nj`; do + awk -v f="$sdir/src_tf_idf.$n.txt" '{print $1" "f}' \ + $sdir/text2doc.$n + done | perl -ane 'BEGIN { %tfidfs = (); } + { + if (!defined $tfidfs{$F[0]}) { + $tfidfs{$F[0]} = $F[1]; + } + } + END { + while(my ($k, $v) = each %tfidfs) { + print "$k $v\n"; + } }' > $dir/docs/source2tf_idf.scp +fi + +if [ $stage -le 9 ]; then + echo "$0: using default values of non-scored words..." + + # At the level of this script we just hard-code it that non-scored words are + # those that map to silence phones (which is what get_non_scored_words.py + # gives us), although this could easily be made user-configurable. This list + # of non-scored words affects the behavior of several of the data-cleanup + # scripts; essentially, we view the non-scored words as negotiable when it + # comes to the reference transcript, so we'll consider changing the reference + # to match the hyp when it comes to these words. + steps/cleanup/internal/get_non_scored_words.py $lang > $dir/non_scored_words.txt +fi + +if [ $stage -le 10 ]; then + sdir=$dir/query_docs/split$nj + mkdir -p $sdir + + # Compute TF-IDF for the query documents (decode hypotheses). + # The output is an archive of TF-IDF indexed by the query. 
+ $cmd JOB=1:$nj $decode_dir/ctm_$lmwt/log/compute_query_tf_idf.JOB.log \ + steps/cleanup/internal/ctm_to_text.pl --non-scored-words $dir/non_scored_words.txt \ + $decode_dir/ctm_$lmwt/ctm.JOB \| \ + steps/cleanup/internal/compute_tf_idf.py \ + --tf-weighting-scheme="normalized" \ + --idf-weighting-scheme="log" \ + --input-idf-stats=$dir/docs/idf_stats.txt \ + --accumulate-over-docs=false \ + - $sdir/query_tf_idf.JOB.ark.txt + + # The relevant documents can be found using TF-IDF similarity and nearby + # documents can also be picked for the Smith-Waterman alignment stage. + + # Get a mapping from the new utterance-ids to original transcripts + if [ -z "$utt2text" ]; then + awk '{print $1" "$2}' $dir/uniform_sub_segments > \ + $dir/new2orig_utt + else + awk '{print $1" "$2}' $dir/uniform_sub_segments | \ + utils/apply_map.pl -f 2 $utt2text > \ + $dir/new2orig_utt + fi + + # The query TF-IDFs are all indexed by the utterance-id of the sub-segments. + # The source TF-IDFs use the document-ids created by splitting the reference + # text into documents. + # For each query, we need to retrieve the documents that were created from + # the same original utterance that the sub-segment was from. For this, + # we have to load the source TF-IDF that has those documents. This + # information is provided using the option --source-text-id2tf-idf-file. + # The output of this script is a file where the first column is the + # query-id (i.e. sub-segment-id) and the remaining columns, which is at least + # one in number and a maxmium of (1 + 2 * num-neighbors-to-search) columns + # is the document-ids for the retrieved documents. 
+ $cmd JOB=1:$nj $dir/log/retrieve_similar_docs.JOB.log \ + steps/cleanup/internal/retrieve_similar_docs.py \ + --query-tfidf=$dir/query_docs/split$nj/query_tf_idf.JOB.ark.txt \ + --source-text-id2tfidf=$dir/docs/source2tf_idf.scp \ + --source-text-id2doc-ids=$dir/docs/text2doc \ + --query-id2source-text-id=$dir/new2orig_utt \ + --num-neighbors-to-search=$num_neighbors_to_search \ + --neighbor-tfidf-threshold=$neighbor_tfidf_threshold \ + --relevant-docs=$dir/query_docs/split$nj/relevant_docs.JOB.txt + + $cmd JOB=1:$nj $decode_dir/ctm_$lmwt/log/get_ctm_edits.JOB.log \ + steps/cleanup/internal/stitch_documents.py \ + --query2docs=$dir/query_docs/split$nj/relevant_docs.JOB.txt \ + --input-documents=$dir/docs/split$nj/docs.JOB.txt \ + --output-documents=- \| \ + steps/cleanup/internal/align_ctm_ref.py --eps-symbol='""' \ + --oov-word="'`cat $lang/oov.txt`'" --symbol-table=$lang/words.txt \ + --hyp-format=CTM --align-full-hyp=$align_full_hyp \ + --hyp=$decode_dir/ctm_$lmwt/ctm.JOB --ref=- \ + --output=$decode_dir/ctm_$lmwt/ctm_edits.JOB + + for n in `seq $nj`; do + cat $decode_dir/ctm_$lmwt/ctm_edits.$n + done > $decode_dir/ctm_$lmwt/ctm_edits + +fi + +if [ $stage -le 11 ]; then + $cmd $dir/log/resolve_ctm_edits.log \ + steps/cleanup/internal/resolve_ctm_edits_overlaps.py \ + ${data_uniform_seg}/segments $decode_dir/ctm_$lmwt/ctm_edits $dir/ctm_edits +fi + +if [ $stage -le 12 ]; then + echo "$0: modifying ctm-edits file to allow repetitions [for dysfluencies] and " + echo " ... to fix reference mismatches involving non-scored words. " + + $cmd $dir/log/modify_ctm_edits.log \ + steps/cleanup/internal/modify_ctm_edits.py --verbose=3 $dir/non_scored_words.txt \ + $dir/ctm_edits $dir/ctm_edits.modified + + echo " ... See $dir/log/modify_ctm_edits.log for details and stats, including" + echo " a list of commonly-repeated words." +fi + +if [ $stage -le 13 ]; then + echo "$0: applying 'taint' markers to ctm-edits file to mark silences and" + echo " ... 
non-scored words that are next to errors." + $cmd $dir/log/taint_ctm_edits.log \ + steps/cleanup/internal/taint_ctm_edits.py --remove-deletions=false \ + $dir/ctm_edits.modified $dir/ctm_edits.tainted + echo "... Stats, including global cor/ins/del/sub stats, are in $dir/log/taint_ctm_edits.log." +fi + +if [ $stage -le 14 ]; then + echo "$0: creating segmentation from ctm-edits file." + + segmentation_opts=( + --min-split-point-duration=$min_split_point_duration + --max-deleted-words-kept-when-merging=$max_deleted_words_kept_when_merging + --merging.max-wer=$max_wer + --merging.max-segment-length=$max_segment_length_for_merging + --merging.max-bad-proportion=$max_bad_proportion + --merging.max-intersegment-incorrect-words-length=$max_intersegment_incorrect_words_length + --splitting.max-segment-length=$max_segment_length_for_splitting + --splitting.hard-max-segment-length=$hard_max_segment_length + --splitting.min-silence-length=$min_silence_length_to_split_at + --splitting.min-non-scored-length=$min_non_scored_length_to_split_at + ) + + $cmd $dir/log/segment_ctm_edits.log \ + steps/cleanup/internal/segment_ctm_edits_mild.py \ + ${segmentation_opts[@]} $segmentation_extra_opts \ + --oov-symbol-file=$lang/oov.txt \ + --ctm-edits-out=$dir/ctm_edits.segmented \ + --word-stats-out=$dir/word_stats.txt \ + $dir/non_scored_words.txt \ + $dir/ctm_edits.tainted $dir/text $dir/segments + + echo "$0: contents of $dir/log/segment_ctm_edits.log are:" + cat $dir/log/segment_ctm_edits.log + echo "For word-level statistics on p(not-being-in-a-segment), with 'worst' words at the top," + echo "see $dir/word_stats.txt" + echo "For detailed utterance-level debugging information, see $dir/ctm_edits.segmented" +fi + +mkdir -p $out_data +if [ $stage -le 15 ]; then + utils/data/subsegment_data_dir.sh $data_uniform_seg \ + $dir/segments $dir/text $out_data +fi diff --git a/egs/wsj/s5/steps/compute_vad_decision.sh b/egs/wsj/s5/steps/compute_vad_decision.sh new file mode 100755 index 
00000000000..4cf3c5b2b79 --- /dev/null +++ b/egs/wsj/s5/steps/compute_vad_decision.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# Apache 2.0 + +# To be run from .. (one directory up from here) +# see ../run.sh for example + +# Compute energy based VAD output + +nj=4 +cmd=run.pl +vad_config=conf/vad.conf + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# -lt 1 ] || [ $# -gt 3 ]; then + echo "Usage: $0 [options] [ []]"; + echo "e.g.: $0 data/train exp/make_vad mfcc" + echo "Note: defaults to /log, and defaults to /data" + echo " Options:" + echo " --vad-config # config passed to compute-vad-energy" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +data=$1 +if [ $# -ge 2 ]; then + logdir=$2 +else + logdir=$data/log +fi +if [ $# -ge 3 ]; then + vaddir=$3 +else + vaddir=$data/data +fi + + +# make $vaddir an absolute pathname. +vaddir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $vaddir ${PWD}` + +# use "name" as part of name of the archive. +name=`basename $data` + +mkdir -p $vaddir || exit 1; +mkdir -p $logdir || exit 1; + +if [ -f $data/vad.scp ]; then + mkdir -p $data/.backup + echo "$0: moving $data/vad.scp to $data/.backup" + mv $data/vad.scp $data/.backup +fi + +for f in $data/feats.scp "$vad_config"; do + if [ ! 
-f $f ]; then + echo "compute_vad_decision.sh: no such file $f" + exit 1; + fi +done + +utils/split_data.sh $data $nj || exit 1; +sdata=$data/split$nj; + +$cmd JOB=1:$nj $logdir/vad_${name}.JOB.log \ + compute-vad --config=$vad_config scp:$sdata/JOB/feats.scp \ + ark,scp:$vaddir/vad_${name}.JOB.ark,$vaddir/vad_${name}.JOB.scp || exit 1 + +for ((n=1; n<=nj; n++)); do + cat $vaddir/vad_${name}.$n.scp || exit 1; +done > $data/vad.scp + +nc=`cat $data/vad.scp | wc -l` +nu=`cat $data/feats.scp | wc -l` +if [ $nc -ne $nu ]; then + echo "**Warning it seems not all of the speakers got VAD output ($nc != $nu);" + echo "**validate_data_dir.sh will fail; you might want to use fix_data_dir.sh" + [ $nc -eq 0 ] && exit 1; +fi + + +echo "Created VAD output for $name" diff --git a/egs/wsj/s5/steps/data/reverberate_data_dir.py b/egs/wsj/s5/steps/data/reverberate_data_dir.py index 71e64d9e680..f6be7a286ec 100755 --- a/egs/wsj/s5/steps/data/reverberate_data_dir.py +++ b/egs/wsj/s5/steps/data/reverberate_data_dir.py @@ -413,13 +413,7 @@ def CreateReverberatedCopy(input_dir, wav_scp = ParseFileToDict(input_dir + "/wav.scp", value_processor = lambda x: " ".join(x)) if not os.path.isfile(input_dir + "/reco2dur"): print("Getting the duration of the recordings..."); - read_entire_file="false" - for value in wav_scp.values(): - # we will add more checks for sox commands which modify the header as we come across these cases in our data - if "sox" in value and "speed" in value: - read_entire_file="true" - break - data_lib.RunKaldiCommand("wav-to-duration --read-entire-file={1} scp:{0}/wav.scp ark,t:{0}/reco2dur".format(input_dir, read_entire_file)) + data_lib.RunKaldiCommand("utils/data/get_reco2dur.sh {}".format(input_dir)) durations = ParseFileToDict(input_dir + "/reco2dur", value_processor = lambda x: float(x[0])) foreground_snr_array = map(lambda x: float(x), foreground_snr_string.split(':')) background_snr_array = map(lambda x: float(x), background_snr_string.split(':')) diff --git 
a/egs/wsj/s5/steps/nnet3/align.sh b/egs/wsj/s5/steps/nnet3/align.sh index cf1cc9124d3..92780f76480 100755 --- a/egs/wsj/s5/steps/nnet3/align.sh +++ b/egs/wsj/s5/steps/nnet3/align.sh @@ -24,6 +24,7 @@ extra_right_context=0 extra_left_context_initial=-1 extra_right_context_final=-1 online_ivector_dir= +graphs_scp= # End configuration options. echo "$0 $@" # Print the command line for logging @@ -97,8 +98,6 @@ fi echo "$0: aligning data in $data using model from $srcdir, putting alignments in $dir" -tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; - frame_subsampling_opt= if [ -f $srcdir/frame_subsampling_factor ]; then # e.g. for 'chain' systems @@ -114,9 +113,20 @@ if [ -f $srcdir/frame_subsampling_factor ]; then fi fi +if [ ! -z "$graphs_scp" ]; then + if [ ! -f $graphs_scp ]; then + echo "Could not find graphs $graphs_scp" && exit 1 + fi + tra="scp:utils/filter_scp.pl $sdata/JOB/utt2spk $graphs_scp |" + prog=compile-train-graphs-fsts +else + tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; + prog=compile-train-graphs +fi $cmd $queue_opt JOB=1:$nj $dir/log/align.JOB.log \ - compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $srcdir/${iter}.mdl $lang/L.fst "$tra" ark:- \| \ + $prog --read-disambig-syms=$lang/phones/disambig.int $dir/tree \ + $srcdir/${iter}.mdl $lang/L.fst "$tra" ark:- \| \ nnet3-align-compiled $scale_opts $ivector_opts $frame_subsampling_opt \ --frames-per-chunk=$frames_per_chunk \ --extra-left-context=$extra_left_context \ diff --git a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh index 7dade75a0ed..3b6371168ce 100755 --- a/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh +++ b/egs/wsj/s5/steps/nnet3/chain/make_weighted_den_fst.sh @@ -86,37 +86,44 @@ else fi fi -if [ $stage -le 1 ]; then - all_phones="" # will contain the names of the .gz files containing phones, - # with some members 
possibly repeated per the --num-repeats - # option - for n in `seq 0 $[num_alignments-1]`; do - this_num_repeats=${num_repeats_array[$n]} - this_alignment_dir=${ali_dirs[$n]} - num_jobs=$(cat $this_alignment_dir/num_jobs) - if ! [ "$this_num_repeats" -gt 0 ]; then - echo "Expected comma-separated list of integers for --num-repeats option, got '$num_repeats'" - exit 1 - fi +all_phones="" # will contain the names of the .gz files containing phones, + # with some members possibly repeated per the --num-repeats + # option +for n in `seq 0 $[num_alignments-1]`; do + this_num_repeats=${num_repeats_array[$n]} + this_alignment_dir=${ali_dirs[$n]} + num_jobs=$(cat $this_alignment_dir/num_jobs) + if ! [ "$this_num_repeats" -ge 0 ]; then + echo "Expected comma-separated list of integers for --num-repeats option, got '$num_repeats'" + exit 1 + fi + if [ $stage -le 1 ]; then for j in $(seq $num_jobs); do gunzip -c $this_alignment_dir/ali.$j.gz; done | \ ali-to-phones $this_alignment_dir/final.mdl ark:- "ark:|gzip -c >$dir/phones.$n.gz" || exit 1; + fi - all_phones="$all_phones $(for r in $(seq $this_num_repeats); do echo $dir/phones.$n.gz; done)" - done + if [ ! 
-s $dir/phones.$n.gz ]; then + echo "$dir/phones.$n.gz is empty or does not exist" + exit 1 + fi + all_phones="$all_phones $(for r in $(seq $this_num_repeats); do echo $dir/phones.$n.gz; done)" +done + +if [ $stage -le 2 ]; then $cmd $dir/log/make_phone_lm_fst.log \ gunzip -c $all_phones \| \ chain-est-phone-lm $lm_opts ark:- $dir/phone_lm.fst || exit 1; rm $dir/phones.*.gz fi -if [ $stage -le 2 ]; then +if [ $stage -le 3 ]; then copy-transition-model ${ali_dirs[0]}/final.mdl $dir/0.trans_mdl || exit 1; fi -if [ $stage -le 3 ]; then +if [ $stage -le 4 ]; then $cmd $dir/log/make_den_fst.log \ chain-make-den-fst $dir/tree $dir/0.trans_mdl \ $dir/phone_lm.fst \ diff --git a/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh b/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh index 0a5eb340a34..ddbc1a74266 100755 --- a/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh +++ b/egs/wsj/s5/steps/online/nnet2/extract_ivectors_online.sh @@ -42,6 +42,7 @@ max_count=0 # The use of this option (e.g. --max-count 100) can make # posterior-scaling, so assuming the posterior-scale is 0.1, # --max-count 100 starts having effect after 1000 frames, or # 10 seconds of data. +use_vad=false # End configuration section. @@ -69,8 +70,13 @@ data=$1 srcdir=$2 dir=$3 +extra_files= +if $use_vad; then + extra_files=$data/vad.scp +fi + for f in $data/feats.scp $srcdir/final.ie $srcdir/final.dubm $srcdir/global_cmvn.stats $srcdir/splice_opts \ - $srcdir/online_cmvn.conf $srcdir/final.mat; do + $srcdir/online_cmvn.conf $srcdir/final.mat $extra_files; do [ ! 
-f $f ] && echo "$0: No such file $f" && exit 1; done @@ -117,9 +123,15 @@ done if [ $stage -le 0 ]; then echo "$0: extracting iVectors" + extra_opts= + if $use_vad; then + extra_opts="--frame-weights-rspecifier=scp:$data/vad.scp" + fi + $cmd JOB=1:$nj $dir/log/extract_ivectors.JOB.log \ - ivector-extract-online2 --config=$ieconf ark:$sdata/JOB/spk2utt scp:$sdata/JOB/feats.scp ark:- \| \ - copy-feats --compress=$compress ark:- \ + ivector-extract-online2 --config=$ieconf $extra_opts \ + ark:$sdata/JOB/spk2utt scp:$sdata/JOB/feats.scp ark:- \| \ + copy-feats --compress=$compress ark:- \ ark,scp:$absdir/ivector_online.JOB.ark,$absdir/ivector_online.JOB.scp || exit 1; fi diff --git a/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh b/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh index 60e3df20df2..f71a14aebf1 100755 --- a/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh +++ b/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh @@ -99,14 +99,14 @@ data_id=`basename $data_dir` sad_dir=${dir}/${sad_name}${affix}_${data_id}_whole${feat_affix} seg_dir=${dir}/${segmentation_name}${affix}_${data_id}_whole${feat_affix} -test_data_dir=data/${data_id}${feat_affix}_hires - if $convert_data_dir_to_whole; then + test_data_dir=data/${data_id}_whole${feat_affix}_hires if [ $stage -le 0 ]; then rm -r ${test_data_dir} || true utils/data/convert_data_dir_to_whole.sh $src_data_dir ${test_data_dir} fi else + test_data_dir=data/${data_id}${feat_affix}_hires if [ $stage -le 0 ]; then rm -r ${test_data_dir} || true utils/copy_data_dir.sh $src_data_dir $test_data_dir @@ -170,7 +170,8 @@ fi ## Prepare FST we search to make speech/silence decisions. 
############################################################################### -frame_shift=$(utils/data/get_frame_shift.sh $test_data_dir) +utils/data/get_utt2dur.sh --nj $nj --cmd "$cmd" $test_data_dir || exit 1 +frame_shift=$(utils/data/get_frame_shift.sh $test_data_dir) || exit 1 graph_dir=${dir}/graph_${output_name} if [ $stage -le 5 ]; then diff --git a/src/online2bin/ivector-extract-online2.cc b/src/online2bin/ivector-extract-online2.cc index 4d71c2923ab..33aa990d1c3 100644 --- a/src/online2bin/ivector-extract-online2.cc +++ b/src/online2bin/ivector-extract-online2.cc @@ -55,6 +55,8 @@ int main(int argc, char *argv[]) { g_num_threads = 8; bool repeat = false; + int32 length_tolerance = 0; + std::string frame_weights_rspecifier; po.Register("num-threads", &g_num_threads, "Number of threads to use for computing derived variables " @@ -62,6 +64,12 @@ int main(int argc, char *argv[]) { po.Register("repeat", &repeat, "If true, output the same number of iVectors as input frames " "(including repeated data)."); + po.Register("frame-weights-rspecifier", &frame_weights_rspecifier, + "Archive of frame weights to scale stats"); + po.Register("length-tolerance", &length_tolerance, + "Tolerance on the difference in number of frames " + "for feats and frame weights"); + po.Read(argc, argv); if (po.NumArgs() != 3) { @@ -82,6 +90,7 @@ int main(int argc, char *argv[]) { SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier); RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier); + RandomAccessBaseFloatVectorReader frame_weights_reader(frame_weights_rspecifier); BaseFloatMatrixWriter ivector_writer(ivectors_wspecifier); @@ -106,6 +115,31 @@ int main(int argc, char *argv[]) { ivector_feature.SetAdaptationState(adaptation_state); + if (!frame_weights_rspecifier.empty()) { + if (!frame_weights_reader.HasKey(utt)) { + KALDI_WARN << "Did not find weights for utterance " << utt; + num_err++; + continue; + } + const Vector &weights = 
frame_weights_reader.Value(utt); + + if (std::abs(weights.Dim() - feats.NumRows()) > length_tolerance) { + num_err++; + continue; + } + + std::vector > frame_weights; + for (int32 i = 0; i < feats.NumRows(); i++) { + if (i < weights.Dim()) + frame_weights.push_back(std::make_pair(i, weights(i))); + else + frame_weights.push_back(std::make_pair(i, 0.0)); + } + + + ivector_feature.UpdateFrameWeights(frame_weights); + } + int32 T = feats.NumRows(), n = (repeat ? 1 : ivector_config.ivector_period), num_ivectors = (T + n - 1) / n;