diff --git a/egs/material/README b/egs/material/README
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/egs/material/s5/README b/egs/material/s5/README
new file mode 100644
index 00000000000..0eb112493a4
--- /dev/null
+++ b/egs/material/s5/README
@@ -0,0 +1,35 @@
+About the MATERIAL corpus:
+
+The MATERIAL project:
+https://www.iarpa.gov/index.php/research-programs/material
+https://www.nist.gov/itl/iad/mig/openclir-evaluation
+
+The speech data in the MATERIAL corpus consist of four data sets for each
+language: train (BUILD), development (BUILD-dev), test (ANALYSIS1 and ANALYSIS2),
+and unlabeled evaluation audio (EVAL{1,2,3}). The train, development, test, and
+evaluation data contain around 40, 10, 20, and 250 hours of audio respectively.
+The train set is transcribed conversational audio that can be used for training
+an ASR system. It consists partly of 8-bit a-law .sph (Sphere) files and partly
+of .wav files with 24-bit samples. The development set is transcribed
+conversational audio that can be used during training to tune model
+parameters. The test data come in long unsegmented files. The reference
+transcripts for the test set are provided, so WER can be measured on the test
+set. The evaluation set is untranscribed audio that can be used for
+semi-supervised training of the acoustic model.
+Conversational speech data in the train and test sets are two-channel audio with
+the two channels temporally aligned. In the train set, each audio channel is
+provided and transcribed as a separate file, identified as the inLine or outLine
+channel. In the test sets, both audio channels are interleaved in a single file
+and there is a single interleaved transcript that reflects the temporal
+alignments. In addition to conversational speech, the test and evaluation sets
+also contain other genres of speech, namely news broadcast and topical
+broadcast, which are single-channel files.
+
+
+Running the recipe:
+
+In the s5 directory (<language_name> is one of swahili, tagalog, or somali):
+./run.sh --language <language_name>
+./local/chain/run_tdnn.sh
+./local/chain/decode_test.sh --language <language_name>
+./local/rnnlm/run_tdnn_lstm.sh
diff --git a/egs/material/s5/RESULTS b/egs/material/s5/RESULTS
new file mode 100644
index 00000000000..546f1630698
--- /dev/null
+++ b/egs/material/s5/RESULTS
@@ -0,0 +1,51 @@
+WER results for supervised and semi-supervised acoustic model training
+
+Baseline: GMM training to create alignments, followed by a lattice-free
+MMI-trained neural network with a factorized TDNN. The labeled audio of the
+BUILD package is used for supervised acoustic model training; the unlabeled
+EVAL audio is added for semi-supervised acoustic model training.
+
+Source-side bitext from the BUILD package and crawled monolingual data are used
+for building the n-gram LM, for RNNLM re-scoring, and for extending the baseline lexicon.
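The tables below report word error rates in percent. As a usage note, after a
finished run the same numbers can be re-collected from the scoring directories
with a small sketch like the one below; exp/chain/tdnn1b_sp is the default
model directory in local/chain/decode_test.sh, and the exact decode-directory
names depend on which test sets were decoded, so adjust the glob as needed:

  # Sketch: print the best WER line for each scored decode directory
  # (the same grep | best_wer.sh idiom used at the end of local/postprocess_test.sh).
  for d in exp/chain/tdnn1b_sp/decode_*; do
    grep -H Sum $d/score*/*.sys | utils/best_wer.sh
  done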
+
+
+Results for *supervised* acoustic model training:
+
+Swahili
+          Baseline +RNNLM +RNNLM-nbest
+BUILD-dev 36.8 36.7 38.9
+ANALYSIS1 42.5 41.3 41.4
+ANALYSIS2 38.1 36.8 36.9
+
+Tagalog
+          Baseline +RNNLM +RNNLM-nbest
+BUILD-dev 46.4 46.1 47.5
+ANALYSIS1 52.1 51.0 50.9
+ANALYSIS2 53.6 52.3 52.2
+
+Somali
+          Baseline +RNNLM +RNNLM-nbest
+BUILD-dev 57.4 56.5 57.8
+ANALYSIS1 61.6 57.8 57.7
+ANALYSIS2 59.3 55.5 55.3
+
+
+Results for *semi-supervised* acoustic model training:
+
+Swahili
+          Baseline +RNNLM +RNNLM-nbest
+BUILD-dev 35.3 35.1 36.7
+ANALYSIS1 35.2 34.5 34.7
+ANALYSIS2 30.8 30.0 30.1
+
+Tagalog
+          Baseline +RNNLM +RNNLM-nbest
+BUILD-dev 45.0 45.2 46.6
+ANALYSIS1 40.8 40.1 40.1
+ANALYSIS2 41.1 40.6 40.6
+
+Somali
+          Baseline +RNNLM +RNNLM-nbest
+BUILD-dev 56.8 56.3 57.7
+ANALYSIS1 50.6 48.8 48.6
+ANALYSIS2 49.8 48.2 48.2
diff --git a/egs/material/s5/cmd.sh b/egs/material/s5/cmd.sh
new file mode 100644
index 00000000000..2bb1c6d24f5
--- /dev/null
+++ b/egs/material/s5/cmd.sh
@@ -0,0 +1,14 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances 'queue.pl' to run.pl (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine). queue.pl works with GridEngine (qsub). slurm.pl works
+# with slurm. Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration. Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+export train_cmd="queue.pl --mem 2G"
+export decode_cmd="retry.pl --num-tries 3 queue.pl --mem 8G"
diff --git a/egs/material/s5/conf/decode.config b/egs/material/s5/conf/decode.config
new file mode 100644
index 00000000000..7ba966f2b83
--- /dev/null
+++ b/egs/material/s5/conf/decode.config
@@ -0,0 +1 @@
+# empty config, just use the defaults.
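As the comments in cmd.sh note, sites without a GridEngine queue can point the
two exports at run.pl or slurm.pl instead. A minimal sketch, keeping the memory
options from the defaults above (for slurm.pl, a site-specific config along the
lines of conf/queue.conf may also be needed, as described at
http://kaldi-asr.org/doc/queue.html):

  # cmd.sh variant for a single machine with no queueing system; as warned
  # above, watch memory usage and run the stages one at a time.
  export train_cmd="run.pl --mem 2G"
  export decode_cmd="run.pl --mem 8G"

  # or, on a SLURM cluster:
  # export train_cmd="slurm.pl --mem 2G"
  # export decode_cmd="slurm.pl --mem 8G"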
diff --git a/egs/material/s5/conf/lang/somali.conf b/egs/material/s5/conf/lang/somali.conf new file mode 100755 index 00000000000..999c4c0ef14 --- /dev/null +++ b/egs/material/s5/conf/lang/somali.conf @@ -0,0 +1,26 @@ +# speech corpora files location +# the user should replace the values with the ones that work for their location +corpus=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/BUILD/ +# test audio files to decode +audio_path_analysis1=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/ANALYSIS1/audio/ +audio_path_analysis2=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/ANALYSIS2/audio/ +audio_path_dev=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/DEV/audio/ +audio_path_eval1=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/EVAL1/audio/ +audio_path_eval2=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/EVAL2/audio/ +audio_path_eval3=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/EVAL3/audio/ +# bitext file location +bitext=$corpus/bitext/MATERIAL_BASE-1S-BUILD_bitext.txt +mono=/home/pkoehn/statmt/data/site-crawl/corpus/paracrawl-release3.2018-11-05.en-so.zipporah-20-dedup.lang-filtered.so +mono2=/home/pkoehn/statmt/data/data.statmt.org/lm/so.filtered.tok.gz +# number_mapping is a 2-column file consisting of the numbers written as digits (1st column) and letters (2nd column) +number_mapping=/home/pkoehn/experiment/material-asr-so-en/scripts/somali_1_9999.txt +# Acoustic model parameters +numShorestUtts=40000 +numLeavesTri1=2000 +numGaussTri1=30000 +numLeavesTri2=3000 +numGaussTri2=60000 +numLeavesTri3=6000 +numGaussTri3=80000 + + diff --git a/egs/material/s5/conf/lang/swahili.conf b/egs/material/s5/conf/lang/swahili.conf new file mode 100755 index 00000000000..d90f4c2abd7 --- /dev/null +++ b/egs/material/s5/conf/lang/swahili.conf @@ -0,0 +1,26 @@ +# speech corpora files location +# the user should replace the values with the ones that work for their location +corpus=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A-BUILD_v1.0/ +# test audio files to decode +audio_path_analysis1=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A/ANALYSIS1/audio/ +audio_path_analysis2=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A/ANALYSIS2/audio/ +audio_path_dev=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A/DEV/audio/ +audio_path_eval1=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A/EVAL1/audio/ +audio_path_eval2=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A/EVAL2/audio/ +audio_path_eval3=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A/EVAL3/audio/ +# bitext file location +bitext=$corpus/bitext/MATERIAL_BASE-1A-BUILD_bitext.txt +mono=/home/pkoehn/statmt/data/site-crawl/mono-corpus/mono.2018-04-24.sw +mono2= +# number_mapping is a 2-column file consisting of the numbers written as digits (1st column) and letters (2nd column) +number_mapping=/home/pkoehn/experiment/material-asr-so-en/scripts/swahili_1_9999.txt +# Acoustic model parameters +numShorestUtts=40000 +numLeavesTri1=2000 +numGaussTri1=30000 +numLeavesTri2=3000 +numGaussTri2=60000 +numLeavesTri3=6000 +numGaussTri3=80000 + + diff --git a/egs/material/s5/conf/lang/tagalog.conf b/egs/material/s5/conf/lang/tagalog.conf new file mode 100644 index 00000000000..238979feb3f --- /dev/null +++ b/egs/material/s5/conf/lang/tagalog.conf @@ -0,0 +1,26 @@ +# speech corpora files location +# the user should replace the values with the ones that work for their location +corpus=/home/pkoehn/experiment/material-asr-so-en/scripts/swahili_1_9999.txt +# test audio files to decode 
+audio_path_analysis1=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1B/ANALYSIS1/audio/ +audio_path_analysis2=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1B/ANALYSIS2/audio/ +audio_path_dev=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1B/DEV/audio/ +audio_path_eval1=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1B/EVAL1/audio/ +audio_path_eval2=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1B/EVAL2/audio/ +audio_path_eval3=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1B/EVAL3/audio/ +# bitext file location +bitext=$corpus/bitext/MATERIAL_BASE-1B-BUILD_bitext.txt +mono=/home/pkoehn/statmt/data/site-crawl/mono-corpus/mono.2018-04-24.tl +mono2= +# number_mapping is a 2-column file consisting of the numbers written as digits (1st column) and letters (2nd column) +number_mapping= +# Acoustic model parameters +numShorestUtts=45000 +numLeavesTri1=4000 +numGaussTri1=60000 +numLeavesTri2=5000 +numGaussTri2=80000 +numLeavesTri3=7000 +numGaussTri3=100000 + + diff --git a/egs/material/s5/conf/mfcc.conf b/egs/material/s5/conf/mfcc.conf new file mode 100644 index 00000000000..e6defc10078 --- /dev/null +++ b/egs/material/s5/conf/mfcc.conf @@ -0,0 +1,2 @@ +--use-energy=false +--sample-frequency=8000 diff --git a/egs/material/s5/conf/mfcc_hires.conf b/egs/material/s5/conf/mfcc_hires.conf new file mode 100644 index 00000000000..f218143e78a --- /dev/null +++ b/egs/material/s5/conf/mfcc_hires.conf @@ -0,0 +1,10 @@ +# config for high-resolution MFCC features, intended for neural network training. +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +--sample-frequency=8000 # most of the files are 8kHz +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=40 # low cutoff frequency for mel bins +--high-freq=-200 # high cutoff frequently, relative to Nyquist of 4000 (=3800) diff --git a/egs/material/s5/conf/online_cmvn.conf b/egs/material/s5/conf/online_cmvn.conf new file mode 100644 index 00000000000..7748a4a4dd3 --- /dev/null +++ b/egs/material/s5/conf/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/egs/material/s5/conf/plp.conf b/egs/material/s5/conf/plp.conf new file mode 100644 index 00000000000..926bcfca92a --- /dev/null +++ b/egs/material/s5/conf/plp.conf @@ -0,0 +1 @@ +--sample-frequency=8000 diff --git a/egs/material/s5/local/audio2wav_scp.pl b/egs/material/s5/local/audio2wav_scp.pl new file mode 100755 index 00000000000..f051c2714d2 --- /dev/null +++ b/egs/material/s5/local/audio2wav_scp.pl @@ -0,0 +1,55 @@ +#!/usr/bin/env perl +#=============================================================================== +# Copyright 2017 (Author: Yenda Trmal ) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +use strict; +use warnings; +use utf8; + + +my $sox = `which sox` or die "The sox binary does not exist"; +chomp $sox; +my $sph2pipe = `which sph2pipe` or die "The sph2pipe binary does not exist"; +chomp $sph2pipe; + +while() { + chomp; + my $full_path = $_; + (my $basename = $full_path) =~ s/.*\///g; + + die "The filename $basename does not match the expected naming pattern!" unless $basename =~ /.*\.(wav|sph)$/; + (my $ext = $basename) =~ s/.*\.(wav|sph)$/$1/g; + (my $name = $basename) =~ s/(.*)\.(wav|sph)$/$1/g; + + + # name looks like this: + # MATERIAL_BASE-1A-BUILD_10002_20131130_011225_inLine.sph + # Please note that the naming pattern must match + # the pattern in create_datafiles.pl + $name =~ s/inLine.*/0/g; + $name =~ s/outLine.*/1/g; + $name =~ s/_BASE//g; + $name =~ s/-BUILD//g; + + if ($ext eq "wav") { + print "$name $sox $full_path -r 8000 -c 1 -b 16 -t wav - downsample|\n"; + } else { + print "$name $sph2pipe -f wav -p -c 1 $full_path|\n"; + } +} + + diff --git a/egs/material/s5/local/chain/decode_test.sh b/egs/material/s5/local/chain/decode_test.sh new file mode 100755 index 00000000000..40115a04cf6 --- /dev/null +++ b/egs/material/s5/local/chain/decode_test.sh @@ -0,0 +1,171 @@ +#!/bin/bash + +# Copyright 2018 Johns Hopkins University (author: Daniel Povey) +# 2018 Mahsa Yarmohammadi +# 2018 Yiming Wang + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +language=swahili +stage=0 +datadev="data/analysis1 data/analysis2 data/test_dev data/eval1 data/eval2 data/eval3" +dir=exp/chain/tdnn1b_sp +lang=data/lang_combined_chain +tree_dir=exp/chain/tree_sp +cmd=queue.pl +graph_affix=_combined + +# training options +chunk_width=140,100,160 +chunk_left_context=0 +chunk_right_context=0 + +# ivector options +max_count=75 # parameter for extract_ivectors.sh +sub_speaker_frames=600 +filter_ctm=true +weights_file= +silence_weight=0.00001 +nj=30 + +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +[ ! -f ./conf/lang/${language}.conf ] && \ + echo "Language configuration conf/lang/${language}.conf does not exist!" && exit 1 +ln -sf ./conf/lang/${language}.conf lang.conf +. ./lang.conf + +if ! cuda-compiled; then + cat </dev/null || true + +if [ $stage -le 3 ]; then + # do the 1st pass decoding + for datadir in $datadev; do + ( + data=$(basename $datadir) + nspk=$(wc -l ${dir}/decode_${data}_segmented/ali.JOB.gz" || exit 1; + + cp $lang/phones.txt ${dir}/decode_${data}_segmented || exit 1; + + steps/resegment_data.sh --segmentation-opts "$segmentation_opts" ${datadir}_segmented_hires $lang \ + ${dir}/decode_${data}_segmented ${datadir}_segmented_reseg_hires_tmp exp/resegment_${data}_segmented + + utils/data/subsegment_data_dir.sh ${datadir}_segmented_hires ${datadir}_segmented_reseg_hires_tmp/segments \ + ${datadir}_segmented_reseg_hires + + rm -rf ${datadir}_segmented_reseg_hires_tmp 2>/dev/null || true + + echo "Extracting i-vectors, stage 2" + # this does offline decoding, except we estimate the iVectors per + # speaker, excluding silence (based on alignments from a DNN decoding), with a + # different script. This is just to demonstrate that script. + # the --sub-speaker-frames is optional; if provided, it will divide each speaker + # up into "sub-speakers" of at least that many frames... can be useful if + # acoustic conditions drift over time within the speaker's data. 
+ steps/online/nnet2/extract_ivectors.sh --cmd "$train_cmd" --nj $nj \ + --silence-weight $silence_weight \ + --sub-speaker-frames $sub_speaker_frames --max-count $max_count \ + ${datadir}_segmented_reseg_hires $lang exp/nnet3/extractor \ + exp/nnet3/ivectors_${data}_segmented_reseg_hires; + done +fi + +if [ $stage -le 5 ]; then + # 2nd-pass decoding on the resegmented data + for datadir in $datadev; do + ( + data=$(basename $datadir) + nspk=$(wc -l 1792 xent:train/valid[65,98,final]=(-1.93,-1.66,-1.68/-2.05,-1.84,-1.83) logprob:train/valid[65,98,final]=(-0.199,-0.166,-0.167/-0.225,-0.208,-0.206) +# [for tagalog] +# exp/chain/tdnn1a_sp: num-iters=96 nj=2..12 num-params=12.3M dim=40+100->1952 combine=-0.165->-0.165 (over 2) xent:train/valid[63,95,final]=(-1.89,-1.66,-1.65/-2.06,-1.89,-1.89) logprob:train/valid[63,95,final]=(-0.186,-0.158,-0.157/-0.231,-0.219,-0.218) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train +test_sets="dev" +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +tree_affix= +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +get_egs_stage=-10 +xent_regularize=0.1 + +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +num_epochs=7 + +# training options +srand=0 +remove_egs=true + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang_combined/topo + fi +fi + +if [ $stage -le 8 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang_combined $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 6000 ${lores_train_data_dir} \ + $lang_combined $ali_dir $tree_dir +fi + + +if [ $stage -le 10 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.01 dropout-per-dim=true dropout-per-dim-continuous=true" + linear_opts="orthonormal-constraint=1.0" + output_opts="l2-regularize=0.005" + + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=768 + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn2 $opts input=Append(0,1) dim=768 + linear-component name=tdnn3l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=768 + linear-component name=tdnn4l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn4 $opts input=Append(0,1) dim=768 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=768 input=Append(0, tdnn3l) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn6 $opts input=Append(0,3) dim=1024 + linear-component name=tdnn7l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=768 + linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn8 $opts input=Append(0,3) dim=1024 + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn5l) dim=768 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn10 $opts input=Append(0,3) dim=1024 + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn9l,tdnn7l) dim=768 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1024 + output-layer name=output include-log-softmax=false dim=$num_targets bottleneck-dim=256 $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1024 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor bottleneck-dim=256 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/material-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=$num_epochs \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=0 \ + --egs.chunk-right-context=0 \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $tree_dir $tree_dir/graph || exit 1; + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_combined_test \ + $tree_dir ${tree_dir}/graph_combined || exit 1; +fi + +if [ $stage -le 13 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 1816 +# combine=-0.127->-0.127 (over 2) xent:train/valid[65,98,final]=(-1.74,-1.44,-1.43/-1.80,-1.62,-1.61) +# logprob:train/valid[65,98,final]=(-0.175,-0.136,-0.135/-0.194,-0.182,-0.180) + +# [for tagalog] +# exp/chain/tdnn1b_sp/: num-iters=96 nj=2..12 num-params=17.2M dim=40+100->1928 combine=-0.124->-0.123 +# (over 2) xent:train/valid[63,95,final]=(-1.69,-1.43,-1.42/-1.75,-1.62,-1.60) +# logprob:train/valid[63,95,final]=(-0.168,-0.128,-0.127/-0.193,-0.187,-0.187) + +# [for somali] +# exp/chain/tdnn1b_sp/: num-iters=84 nj=2..12 num-params=17.9M dim=40+100->3240 combine=-0.162->-0.160 +# (over 2) xent:train/valid[55,83,final]=(-2.31,-2.02,-2.00/-2.27,-2.13,-2.10) +# logprob:train/valid[55,83,final]=(-0.218,-0.157,-0.154/-0.268,-0.263,-0.263) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train +test_sets="dev" +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1b #affix for TDNN directory e.g. "1a" or "1b", in case we change the configuration. 
+tree_affix= +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +get_egs_stage=-10 +xent_regularize=0.1 + +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +dropout_schedule='0,0@0.20,0.3@0.50,0' +num_epochs=7 + +# training options +srand=0 +remove_egs=true + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang_combined/topo + fi +fi + +if [ $stage -le 8 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang_combined $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 6000 ${lores_train_data_dir} \ + $lang_combined $ali_dir $tree_dir +fi + + +if [ $stage -le 10 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.004 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + linear_opts="orthonormal-constraint=-1.0 l2-regularize=0.004" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=1024 + linear-component name=tdnn2l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn2 $opts input=Append(0,1) dim=1024 + linear-component name=tdnn3l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=1024 input=Append(0,1) + linear-component name=tdnn4l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn4l dim=256 $linear_opts input=Append(0,1) + relu-batchnorm-dropout-layer name=tdnn4 $opts input=Append(0,1) dim=1024 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=1024 input=Append(0, tdnn3l) + linear-component name=tdnn6l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn7l0 dim=256 
$linear_opts input=Append(-3,0) + linear-component name=tdnn7l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1024 + linear-component name=tdnn8l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn8l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn9l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn5l) dim=1024 + linear-component name=tdnn10l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn10l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn11l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn9l,tdnn7l) dim=1024 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280 + linear-component name=prefinal-chain-l dim=256 $linear_opts + batchnorm-component name=prefinal-chain-batchnorm + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280 + linear-component name=prefinal-xent-l dim=256 $linear_opts + batchnorm-component name=prefinal-xent-batchnorm + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/material-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=$num_epochs \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=0 \ + --egs.chunk-right-context=0 \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_combined_test \ + $tree_dir ${tree_dir}/graph_combined || exit 1; +fi + +if [ $stage -le 13 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 1792 combine=-0.176->-0.174 (over 6) xent:train/valid[45,69,final]=(-1.71,-1.52,-1.50/-1.81,-1.69,-1.67) logprob:train/valid[45,69,final]=(-0.185,-0.160,-0.159/-0.213,-0.208,-0.205) + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=30 +train_set=train +test_sets=dev +gmm=tri3 +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +tlstm_affix=1a # affix for the TDNN-LSTM directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang_combined/topo + fi +fi + +if [ $stage -le 8 ]; then + # Get the alignments as lattices (gives the chain training more freedom). 
+ # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang_combined $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 6000 ${lores_train_data_dir} \ + $lang_combined $ali_dir $tree_dir +fi + +if [ $stage -le 10 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + tdnn_opts="l2-regularize=0.02" + lstm_opts="l2-regularize=0.005" + output_opts="l2-regularize=0.004" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $tdnn_opts dim=512 + relu-batchnorm-layer name=tdnn2 $tdnn_opts input=Append(-1,0,1) dim=512 + relu-batchnorm-layer name=tdnn3 $tdnn_opts input=Append(-1,0,1) dim=512 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=lstm1 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstm_opts + relu-batchnorm-layer name=tdnn4 $tdnn_opts input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn5 $tdnn_opts input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn6 $tdnn_opts input=Append(-3,0,3) dim=512 + fast-lstmp-layer name=lstm2 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstm_opts + relu-batchnorm-layer name=tdnn7 $tdnn_opts input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn8 $tdnn_opts input=Append(-3,0,3) dim=512 + relu-batchnorm-layer name=tdnn9 $tdnn_opts input=Append(-3,0,3) dim=512 + fast-lstmp-layer name=lstm3 cell-dim=512 recurrent-projection-dim=128 non-recurrent-projection-dim=128 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/material-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=5 \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --trainer.deriv-truncate-margin=8 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test \ + $tree_dir $tree_dir/graph || exit 1; + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_combined_test \ + $tree_dir ${tree_dir}/graph_combined || exit 1; +fi + +if [ $stage -le 13 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l ) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +use strict; +use warnings; +use utf8; + +binmode STDIN, "utf8"; +binmode STDOUT, "utf8"; +binmode STDERR, "utf8"; + +# replacement of the smart-match operator (apparently not supported anymore) +sub is_elem { + my $word = shift; + my $array = shift; + foreach my $other_word (@{$array}) { + return 1 if $word eq $other_word; + } + return 0; +} + +my $unk = ""; +my $noise = ""; +my $spnoise = ""; +my $sil = ""; + +my @ignore_events = ("", ""); +#as per the BABEL docs, ~ means truncation of the word/utterance +my @ignore_utt_events = ("", "", "", "~"); +my @sil_events = (""); +my @noise_events = ("", "", "" ); +my @spnoise_events = ("", "", "", "", "", ""); + + + +UTT: while(<>) { + chomp; + my @line = split " ", $_; + my $file = shift @line; + my $begin = shift @line; + my $end = shift @line; + + next if (@line == 1) and ($line[0] eq ""); + next if (@line == 1) and ($line[0] =~ "<.*>"); #skip the utterance if all + #it contains is a non-speech event + + my @out_line; + foreach my $word (@line) { + if ($word =~ /.*-$/) { + push @out_line, $unk; + } elsif ($word =~ /^-.*/) { + push @out_line, $unk; + } elsif ($word =~ /^\*.*\*$/) { + push @out_line, $unk; + } elsif ($word eq "(())") { + push @out_line, $unk; + } elsif (is_elem $word, \@ignore_events) { + next; + } elsif (is_elem $word, \@ignore_utt_events) { + next UTT; + } elsif (is_elem $word, \@sil_events) { + push @out_line, $sil; + } elsif (is_elem $word, \@noise_events) { + push @out_line, $noise; + } elsif (is_elem $word, \@spnoise_events) { + push @out_line, $spnoise; + } else { + push @out_line, $word; + } + } + print "$file\t$begin\t$end\t" . join(" ", @out_line) . "\n" if @out_line; + +} + + diff --git a/egs/material/s5/local/convert_lexicon.pl b/egs/material/s5/local/convert_lexicon.pl new file mode 100755 index 00000000000..1fe7e90ac1f --- /dev/null +++ b/egs/material/s5/local/convert_lexicon.pl @@ -0,0 +1,76 @@ +#!/usr/bin/env perl +#=============================================================================== +# Copyright 2017 (Author: Yenda Trmal ) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +use strict; +use warnings; +use utf8; + +binmode STDIN, ":utf8"; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; + +my $lexicon_name = $ARGV[0]; +open(my $lexicon_file, "<:encoding(UTF-8)", $lexicon_name) or + die "Cannot open $lexicon_name: $!\n"; + +my $wordlist_name = $ARGV[1]; +open(my $wordlist_file, "<:encoding(UTF-8)", $wordlist_name) or + die "Cannot open $wordlist_name: $!\n"; + + +my %lexicon; +while (<$lexicon_file>) { + chomp; + (my $word, my $prons) = split " ", $_, 2; + $lexicon{uc $word} = $prons; +} + +while (<$wordlist_file>) { + chomp; + my $word = $_; + print STDERR "Cannot find word $word in lexicon\n" unless defined($lexicon{uc $word}); + + #print "$word $lexicon{$word}\n"; + + my @prons = split "\t", $lexicon{uc $word}; + foreach my $pron (@prons) { + my @phones = split " ", $pron; + my $stress_mark = 0; + my @out_phones = (); + foreach my $phone (@phones) { + if ($phone eq "\"") { + $stress_mark = 1 + } elsif ( $phone eq "." ) { + $stress_mark = 0; + push @out_phones, '.'; + } elsif ( $phone eq "#" ) { + $stress_mark = 0; + push @out_phones, '.'; + } else { + $phone =~ s/_/+/g; + #let's just ignore stress for now + #$phone = "${phone}_\"" if $stress_mark; + push @out_phones, $phone; + } + } + my $out_pron = join(" ", @out_phones); + $out_pron =~ s/ *\. */\t/g; + print "$word\t$out_pron\n"; + } +} + diff --git a/egs/material/s5/local/count_oovs.pl b/egs/material/s5/local/count_oovs.pl new file mode 100755 index 00000000000..228399f99e3 --- /dev/null +++ b/egs/material/s5/local/count_oovs.pl @@ -0,0 +1,81 @@ +#!/usr/bin/perl -W + +# (c) 2014 Korbinian Riedhammer + +# Count the number of OOV per turn (or speaker, if utt2spk is provided). Use +# the --split-words option to split non-ascii words into characters (syllable +# based languages). + + +use strict; +use warnings; +use Getopt::Long; +use open qw(:std :utf8); + + +my $utt2spkf = ""; +my $split_words = 0; + +GetOptions( + 'utt2spk=s' => \$utt2spkf, + 'split-words' => \$split_words +); + +if (scalar @ARGV lt 1) { + print STDERR "usage: $0 [--utt2spk=utt2spk] words.txt [input]\n"; + exit 1; +} + +my $lexf = shift @ARGV; + +my %lex = map { my ($a, $b) = split /\s+/; $a => $b; } `cat $lexf`; + +my %utt2spk = (); +if (length $utt2spkf gt 0) { + %utt2spk = map { my ($a, $b) = split /\s+/; $a => $b; } `cat $utt2spkf`; #read_file($utt2spkf, binmode => ':utf8'); +} + +my %num_words = (); +my %num_oovs = (); +my %oov_string = (); + +while (<>) { + my ($id, @trl) = split /\s+/; + + if (length $utt2spkf gt 0) { + if (defined $utt2spk{$id}) { + $id = $utt2spk{$id}; + } else { + printf STDERR "Warning: $id not specified in $utt2spkf\n"; + } + } + + $num_words{$id} = 0 unless defined $num_words{$id}; + $num_oovs{$id} = 0 unless defined $num_oovs{$id}; + $oov_string{$id} = "" unless defined $oov_string{$id}; + + + if ($split_words) { + for (my $i = 0; $i < scalar @trl; $i++) { + my $w = $trl[$i]; + unless ($w =~ m/[a-zA-Z_\-]/) { + my @sw = split //, $w; + splice @trl, $i, 1, @sw; + $i += (scalar @sw) - 1; + } + } + } + + $num_words{$id} += scalar @trl; + for my $w (@trl) { + $num_oovs{$id} += 1 unless defined $lex{$w}; + $oov_string{$id} .= "$w " unless defined $lex{$w}; + } + +} + +for my $i (sort keys %num_words) { + printf "%s %d %d %s\n", $i, $num_words{$i}, $num_oovs{$i}, + ( defined $oov_string{$i} ? 
$oov_string{$i} : ""); +} + diff --git a/egs/material/s5/local/create_datafiles.pl b/egs/material/s5/local/create_datafiles.pl new file mode 100755 index 00000000000..d8e692524a1 --- /dev/null +++ b/egs/material/s5/local/create_datafiles.pl @@ -0,0 +1,69 @@ +#!/usr/bin/env perl +#=============================================================================== +# Copyright 2017 (Author: Yenda Trmal ) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. +#=============================================================================== + +use strict; +use warnings; +use utf8; + +binmode STDIN, "utf8"; +binmode STDOUT, "utf8"; +binmode STDERR, "utf8"; + +my $output = $ARGV[0]; +open(my $utt2spk, ">:utf8", "$output/utt2spk") or + die "Cannot open $output/utt2spk: $!\n"; +open(my $text, ">:utf8", "$output/text") or + die "Cannot open $output/text: $!\n"; +open(my $segments, ">:utf8", "$output/segments") or + die "Cannot open $output/segments: $!\n"; +open(my $wav, ">:utf8", "$output/wav2file") or + die "Cannot open $output/wav2file: $!\n"; + +my %text2id; +while() { + chomp; + my @line = split (" ", $_, 4); + my $name = shift @line; + my $begin = shift @line; + my $end = shift @line; + my $words = shift @line; + my $name_raw = $name; + + my $begin_text = sprintf("%07d", $begin * 1000); + my $end_text = sprintf("%07d", $end * 1000); + + # name looks like this: + # MATERIAL_BASE-1A-BUILD_10002_20131130_011225_inLine.txt + # Please note that the naming pattern must match + # the pattern in audio2wav_scp.pl + $name =~ s/inLine.*/0/g; + $name =~ s/outLine.*/1/g; + $name =~ s/_BASE//g; + $name =~ s/-BUILD//g; + + my $utt_name = join("_", $name, $begin_text, $end_text); + print $segments "$utt_name $name $begin $end\n"; + print $utt2spk "$utt_name $name\n"; + print $text "$utt_name $words\n"; + if (defined $text2id{$name}) { + die "" if $text2id{$name} ne $name_raw; + } else { + print $wav "$name $name_raw\n"; + $text2id{$name} = $name_raw; + } +} diff --git a/egs/material/s5/local/ctm_filter b/egs/material/s5/local/ctm_filter new file mode 100755 index 00000000000..fa0f749c92a --- /dev/null +++ b/egs/material/s5/local/ctm_filter @@ -0,0 +1,7 @@ +#!/usr/bin/perl + +while (<>) { + if ($_ !~ m/<(noise|unk|spnoise|sil)>/i) { + print $_; + } +} diff --git a/egs/material/s5/local/g2p/apply_g2p.sh b/egs/material/s5/local/g2p/apply_g2p.sh new file mode 100755 index 00000000000..704a1a906bb --- /dev/null +++ b/egs/material/s5/local/g2p/apply_g2p.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# Copyright 2016 Allen Guo +# 2017 Xiaohui Zhang +# Apache License 2.0 + +# This script applies a trained Phonetisarus G2P model to +# synthesize pronunciations for missing words (i.e., words in +# transcripts but not the lexicon), and output the expanded lexicon. + +var_counts=1 + +. ./path.sh || exit 1 +. 
parse_options.sh || exit 1; + +if [ $# -ne "4" ]; then + echo "Usage: $0 " + exit 1 +fi + +model=$1 +workdir=$2 +lexicon=$3 +outlexicon=$4 + +mkdir -p $workdir + +echo 'Synthesizing pronunciations for missing words...' +phonetisaurus-apply --nbest $var_counts --model $model --thresh 5 --accumulate --word_list $workdir/missing_onlywords.txt > $workdir/missing_g2p_${var_counts}.txt + +echo "Adding new pronunciations to $lexicon" +cat "$lexicon" $workdir/missing_g2p_${var_counts}.txt | sort | uniq > $outlexicon diff --git a/egs/material/s5/local/g2p/train_g2p.sh b/egs/material/s5/local/g2p/train_g2p.sh new file mode 100755 index 00000000000..43e75f6608d --- /dev/null +++ b/egs/material/s5/local/g2p/train_g2p.sh @@ -0,0 +1,67 @@ +#!/bin/bash + +# Copyright 2017 Intellisist, Inc. (Author: Navneeth K) +# 2017 Xiaohui Zhang +# Apache License 2.0 + +# This script trains a g2p model using Phonetisaurus and SRILM. + +stage=0 +silence_phones= + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; + + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + exit 1; +fi + +lexicondir=$1 +outdir=$2 + +[ ! -f $lexicondir/lexicon.txt ] && echo "Cannot find $lexicondir/lexicon.txt" && exit + +isuconv=`which uconv` +if [ -z $isuconv ]; then + echo "uconv was not found. You must install the icu4c package." + exit 1; +fi + +mkdir -p $outdir + + +# For input lexicon, remove pronunciations containing non-utf-8-encodable characters, +# and optionally remove words that are mapped to a single silence phone from the lexicon. +if [ $stage -le 0 ]; then + lexicon=$lexicondir/lexicon.txt + if [ ! -z "$silence_phones" ]; then + awk 'NR==FNR{a[$1] = 1; next} {s=$2;for(i=3;i<=NF;i++) s=s" "$i; if(!(s in a)) print $1" "s}' \ + $silence_phones $lexicon | \ + awk '{printf("%s\t",$1); for (i=2;i 0'> $outdir/lexicon_tab_separated.txt + else + awk '{printf("%s\t",$1); for (i=2;i 0'> $outdir/lexicon_tab_separated.txt + fi +fi + +if [ $stage -le 1 ]; then + # Align lexicon stage. Lexicon is assumed to have first column tab separated + phonetisaurus-align --input=$outdir/lexicon_tab_separated.txt --ofile=${outdir}/aligned_lexicon.corpus || exit 1; +fi + +if [ $stage -le 2 ]; then + # Convert aligned lexicon to arpa using srilm. + ngram-count -order 7 -kn-modify-counts-at-end -gt1min 0 -gt2min 0 \ + -gt3min 0 -gt4min 0 -gt5min 0 -gt6min 0 -gt7min 0 -ukndiscount \ + -text ${outdir}/aligned_lexicon.corpus -lm ${outdir}/aligned_lexicon.arpa +fi + +if [ $stage -le 3 ]; then + # Convert the arpa file to FST. + phonetisaurus-arpa2wfst --lm=${outdir}/aligned_lexicon.arpa --ofile=${outdir}/model.fst +fi diff --git a/egs/material/s5/local/nnet3/run_ivector_common.sh b/egs/material/s5/local/nnet3/run_ivector_common.sh new file mode 100755 index 00000000000..a56b3bf67d8 --- /dev/null +++ b/egs/material/s5/local/nnet3/run_ivector_common.sh @@ -0,0 +1,149 @@ +#!/bin/bash + +set -euo pipefail + +# This script is called from local/nnet3/run_tdnn.sh and +# local/chain/run_tdnn.sh (and may eventually be called by more +# scripts). It contains the common feature preparation and +# iVector-related parts of the script. See those scripts for examples +# of usage. + +stage=0 +train_set=train +test_sets="dev" +nj=30 +gmm=tri3 + +nnet3_affix= + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! 
-f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment _sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc.sh --cmd "$train_cmd" --nj $nj data/${train_set}_sp || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + data/${train_set}_sp data/lang_test $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. + echo "$0: creating high-resolution MFCC features" + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/b1{5,6,7,8}/$USER/kaldi-data/egs/material-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1; + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj $nj --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires || exit 1; + done +fi + +if [ $stage -le 4 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + # We'll use about a quarter of the data. 
+ mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l ", "--", ".", "?", "~"]: + parts[i] = "" + elif w == "%incomplete": + parts[i] = "" + elif w in ["", "", "", ""]: + parts[i] = "" + elif w in ["", ""]: + parts[i] = "" + elif w in ["", "(())", "", "", ""]: + parts[i] = "" + + # change *word* into word + parts[i] = re.sub(r"^[*](\S+)[*]$", r"\1", parts[i]) + + return re.sub(r"\s+", " ", " ".join(parts)) + + +def write_segment(start_time, end_time, text, reco_id, + segments_fh, utt2spk_fh, text_fh): + assert end_time > start_time + + text = normalize_text(text) + + utt_id = "{reco_id}-{st:06d}-{end:06d}".format( + reco_id=reco_id, + st=int(start_time * 100), end=int(end_time * 100)) + + print ("{utt_id} {reco_id} {st} {end}" + "".format(utt_id=utt_id, reco_id=reco_id, + st=start_time, end=end_time), + file=segments_fh) + print ("{utt_id} {reco_id}" + "".format(utt_id=utt_id, reco_id=reco_id), + file=utt2spk_fh) + print ("{utt_id} {text}" + "".format(utt_id=utt_id, text=text), + file=text_fh) + + +def parse_calls_transcript_file(transcript_file, segments_fh, + utt2spk_fh, text_fh): + base_name = os.path.basename(transcript_file) + file_id = re.sub(".transcription.txt", "", base_name) + + inline_start_time = -1 + outline_start_time = -1 + + i = 0 + + for line in open(transcript_file): + parts = line.strip().split() + + if i == 0 and not parts[0].startswith('0'): + raise Exception("Transcript file {0} does not start with 0.000" + "".format(transcript_file)) + i += 1 + + start_time = float(parts[0]) + if len(parts) == 1: + # Last line in the file + write_segment(inline_start_time, start_time, inline_text, file_id + "_inLine", + segments_fh, utt2spk_fh, text_fh) + write_segment(outline_start_time, start_time, outline_text, file_id + "_outLine", + segments_fh, utt2spk_fh, text_fh) + break + + assert parts[1] in ["inLine", "outLine"] + + if parts[1] == "inLine": + reco_id = file_id + "_inLine" + if inline_start_time >= 0: + write_segment(inline_start_time, start_time, inline_text, reco_id, + segments_fh, utt2spk_fh, text_fh) + inline_text = " ".join(parts[2:]) + inline_start_time = start_time + else: + reco_id = file_id + "_outLine" + if outline_start_time >= 0: + write_segment(outline_start_time, start_time, outline_text, reco_id, + segments_fh, utt2spk_fh, text_fh) + outline_text = " ".join(parts[2:]) + outline_start_time = start_time + + +def parse_non_calls_transcript_file(transcript_file, segments_fh, + utt2spk_fh, text_fh): + base_name = os.path.basename(transcript_file) + file_id = re.sub(".transcription.txt", "", base_name) + + start_time = -1 + i = 0 + + with open(transcript_file) as fh: + line = fh.readline().strip() + if not line.startswith('['): + raise Exception("Transcript file {0} does not start with [0.000" + "".format(transcript_file)) + try: + start_time = float(re.sub(r"\[([^\]]+)\]", r"\1", line)) + except Exception: + print("Could not parse line {0}".format(line), file=sys.stderr) + raise + + text = fh.readline() + while text != '': + text = text.strip() + line = fh.readline().strip() + if not line.startswith('['): + raise Exception("Time-stamp in transcript file {0} does not start with [; error parsing line {1} after text {2}" + "".format(transcript_file, line, text)) + try: + end_time = float(re.sub(r"\[([^\]]+)\]", r"\1", line)) + except Exception: + print("Could not parse line {0}".format(line), file=sys.stderr) + raise + + write_segment(start_time, end_time, text, file_id, + segments_fh, utt2spk_fh, 
text_fh) + start_time = end_time + text = fh.readline() + + +if __name__ == "__main__": + if len(sys.argv) != 5: + print ("Usage: {0} ", + file=sys.stderr) + raise SystemExit(1) + + root_path = sys.argv[1] + calls_list = open(sys.argv[2]).readlines() + non_calls_list = open(sys.argv[3]).readlines() + data_dir = sys.argv[4] + + wav_scp_fh = open("{0}/wav.scp".format(data_dir), 'w') + utt2spk_fh = open("{0}/utt2spk".format(data_dir), 'w') + reco2file_and_channel_fh = open( + "{0}/reco2file_and_channel".format(data_dir), 'w') + text_fh = open("{0}/text".format(data_dir), 'w') + segments_fh = open("{0}/segments".format(data_dir), 'w') + + for line in calls_list: + file_id = line.strip() + transcript_file = ( + "{root_path}/transcription/{file_id}.transcription.txt" + "".format(root_path=root_path, file_id=file_id)) + wav_file = "{root_path}/src/{file_id}.wav".format( + root_path=root_path, file_id=file_id) + + for channel in [1, 2]: + reco_id = file_id + ("_inLine" if channel == 1 else "_outLine") + print ("{reco_id} {file_id} {channel}" + "".format(reco_id=reco_id, file_id=file_id, + channel="A" if channel == 1 else "B"), + file=reco2file_and_channel_fh) + print ("{reco_id} sox {wav_file} -r 8000 -b 16 -c 1 -t wav - remix {channel} |" + "".format(reco_id=reco_id, wav_file=wav_file, channel=channel), + file=wav_scp_fh) + + parse_calls_transcript_file(transcript_file, segments_fh, + utt2spk_fh, text_fh) + + for line in non_calls_list: + file_id = line.strip() + transcript_file = ( + "{root_path}/transcription/{file_id}.transcription.txt" + "".format(root_path=root_path, file_id=file_id)) + wav_file = "{root_path}/src/{file_id}.wav".format( + root_path=root_path, file_id=file_id) + + print ("{file_id} {file_id} 1" + "".format(file_id=file_id), + file=reco2file_and_channel_fh) + print ("{reco_id} sox {wav_file} -r 8000 -b 16 -c 1 -t wav - |" + "".format(reco_id=file_id, wav_file=wav_file), + file=wav_scp_fh) + + parse_non_calls_transcript_file(transcript_file, segments_fh, + utt2spk_fh, text_fh) + + wav_scp_fh.close() + utt2spk_fh.close() + reco2file_and_channel_fh.close() + text_fh.close() + segments_fh.close() diff --git a/egs/material/s5/local/parse_transcripts.pl b/egs/material/s5/local/parse_transcripts.pl new file mode 100755 index 00000000000..06c18a30c6c --- /dev/null +++ b/egs/material/s5/local/parse_transcripts.pl @@ -0,0 +1,53 @@ +#!/usr/bin/env perl +#=============================================================================== +# Copyright 2017 (Author: Yenda Trmal ) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +use strict; +use warnings; +use utf8; + +binmode STDIN, "utf8"; +binmode STDOUT, "utf8"; +binmode STDERR, "utf8"; + +my $file = $ARGV[0]; + +open(my $transcript, "<:utf8", $file) or + die "Cannot open file $file: $!\n"; + +(my $basename = $file) =~ s/(.*\/)?([^\/]+)/$2/g; + +my $sentence = undef; +my $begin_time = undef; +my $end_time = undef; +while(<$transcript>) { + chomp; + if (/^\[([0-9.]+)\]$/) { + $begin_time = $end_time; + $end_time = $1; + if ($sentence) { + print "$basename\t$begin_time\t$end_time\t$sentence\n"; + $sentence = undef; + } + } else { + die "Invalid format of the transcription in $basename\n" if defined($sentence); + $sentence = $_; + } +} + +die "Invalid format of the transcription in $basename\n" if defined($sentence); + diff --git a/egs/material/s5/local/postprocess_test.sh b/egs/material/s5/local/postprocess_test.sh new file mode 100755 index 00000000000..950c1191d4d --- /dev/null +++ b/egs/material/s5/local/postprocess_test.sh @@ -0,0 +1,56 @@ +#!/bin/sh +set -euo pipefail +echo "$0 $@" + +stage=0 + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if [ $# -ne 3 ]; then + echo "Usage: $0 " + echo " e.g.: $0 analysis1 exp/chain/tdnn/graph exp/chain/tdnn/decode_analysis1_segmented" + exit 1 +fi + +data=$1 +graph_dir=$2 +decode_dir=$3 + +# get recording-level CTMs from the lattice by resolving the overlapping +# regions + +if [ $stage -le 0 ]; then + steps/get_ctm_fast.sh --cmd "$decode_cmd" --frame-shift 0.03 \ + data/${data}_hires/ ${graph_dir} \ + ${decode_dir} ${decode_dir}/score_10_0.0 +fi + +if [ $stage -le 1 ]; then + utils/ctm/resolve_ctm_overlaps.py data/${data}_hires/segments \ + ${decode_dir}/score_10_0.0/ctm \ + - | utils/convert_ctm.pl data/${data}_hires/segments data/${data}_hires/reco2file_and_channel > \ + ${decode_dir}/score_10_0.0/${data}_hires.ctm +fi + +if [ $stage -le 2 ]; then + # extract n-best lists from archive.* files + if [[ ${decode_dir} == *_rescore_nbest ]]; then + hyp_filtering_cmd="cat" + [ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter" + [ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter" + mkdir -p ${decode_dir}/output_nbest + for f in ${decode_dir}/archives.*; do + docid=$(head -1 $f/words_text | awk '{print $1}' | cut -f1,2 -d'-') + $hyp_filtering_cmd $f/words_text > \ + ${decode_dir}/output_nbest/$docid".n.txt" || exit 1; + done + fi + + # compute WER + local/score_stm.sh --min-lmwt 10 --max-lmwt 10 --word-ins-penalty 0.0 \ + --cmd "$decode_cmd" data/${data}_hires $graph_dir ${decode_dir} + + grep -H Sum ${decode_dir}/score*/*.sys | utils/best_wer.sh +fi diff --git a/egs/material/s5/local/prepare_audio_data.sh b/egs/material/s5/local/prepare_audio_data.sh new file mode 100755 index 00000000000..2bf9283f435 --- /dev/null +++ b/egs/material/s5/local/prepare_audio_data.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Copyright (c) 2017, Johns Hopkins University (Jan "Yenda" Trmal) +# License: Apache 2.0 + +# Begin configuration section. +# End configuration section +set -e -o pipefail +set -o nounset # Treat unset variables as an error +echo "$0 " "$@" + +if [ $# -ne 1 ] ; then + echo "Invalid number of script parameters. " + echo " $0 " + echo "e.g." + echo " $0 /export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A-BUILD_v1.0/" + exit +fi +data=$1 + +conversational_train=$data/conversational/training/ +audio=$conversational_train/audio/ +[ ! -d $audio ] && \ + echo "The directory $audio does not exist!" 
&& exit 1 + +find $audio -type f \( -name "*.wav" -o -name "*.sph" \) | \ + local/audio2wav_scp.pl > data/train/wav.scp + + +conversational_dev=$data/conversational/dev +audio=$conversational_dev/audio/ +[ ! -d $audio ] && \ + echo "The directory $audio does not exist!" && exit 1 + +find $audio -type f \( -name "*.wav" -o -name "*.sph" \) | \ + local/audio2wav_scp.pl > data/dev/wav.scp + diff --git a/egs/material/s5/local/prepare_dict.sh b/egs/material/s5/local/prepare_dict.sh new file mode 100755 index 00000000000..710f1a66e2e --- /dev/null +++ b/egs/material/s5/local/prepare_dict.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# Copyright (c) 2017, Johns Hopkins University (Jan "Yenda" Trmal) +# License: Apache 2.0 + +# Begin configuration section. +# End configuration section +set -e -o pipefail +set -o nounset # Treat unset variables as an error +echo "$0 " "$@" + +language=swahili + +. ./utils/parse_options.sh + +if [ $# -ne 1 ] ; then + echo "Invalid number of script parameters. " + echo " $0 [options] " + echo "e.g." + echo " $0 --language swahili /export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A-BUILD_v1.0/" + exit +fi +data=$1 + +lexicon=$data/conversational/reference_materials/lexicon.txt + +mkdir -p data/local +cat $lexicon | awk '{print $1}' > data/local/lexicon_words +cat $lexicon | cut -f2- > data/local/lexicon_phns + +if [ "$language" == "swahili" ]; then + language_affix="sw" +elif [ "$language" == "tagalog" ]; then + language_affix="tl" +elif [ "$language" == "somali" ]; then + language_affix="so" +fi +MOSES=/home/pkoehn/moses +SOURCE_TC_MODEL=/home/pkoehn/experiment/material-asr-${language_affix}-en/truecaser/truecase-model.1.${language_affix} + $MOSES/scripts/recaser/truecase.perl -model $SOURCE_TC_MODEL \ + < data/local/lexicon_words > data/local/lexicon_words_tc + +paste data/local/lexicon_words_tc data/local/lexicon_phns | sort > data/local/lexicon_tc + +lexicon=data/local/lexicon_tc + +[ ! -f $lexicon ] && echo "Lexicon $lexicon does not exist!" && exit 1; +echo $0: using lexicon $lexicon +mkdir -p data/local/dict_nosp/ +cat data/train/text | cut -f 2- -d ' ' | \ + sed 's/ /\n/g' | grep . | sort -u > data/local/dict_nosp/wordlist + +local/convert_lexicon.pl <(echo -e "\t\n\t\n\t\n\t" | cat - $lexicon ) data/local/dict_nosp/wordlist | sort -u > data/local/dict_nosp/lexicon.txt +[ -f data/local/dict_nosp/lexiconp.txt ] && rm data/local/dict_nosp/lexiconp.txt + +cat data/local/dict_nosp/lexicon.txt | sed 's/\t/ /g' | \ + cut -f 2- -d ' ' | sed 's/ /\n/g' | grep . | sort -u > data/local/dict_nosp/phones.txt + + +grep "^<.*>$" data/local/dict_nosp/phones.txt > data/local/dict_nosp/silence_phones.txt +grep -v "^<.*>$" data/local/dict_nosp/phones.txt > data/local/dict_nosp/nonsilence_phones.txt +echo "" > data/local/dict_nosp/optional_silence.txt +echo "" > data/local/dict_nosp/oov.txt + + + +utils/validate_dict_dir.pl data/local/dict_nosp/ + diff --git a/egs/material/s5/local/prepare_text_data.sh b/egs/material/s5/local/prepare_text_data.sh new file mode 100755 index 00000000000..4200a55ed9d --- /dev/null +++ b/egs/material/s5/local/prepare_text_data.sh @@ -0,0 +1,66 @@ +#!/bin/bash +# Copyright (c) 2017, Johns Hopkins University (Jan "Yenda" Trmal) +# License: Apache 2.0 + +# Begin configuration section. +# End configuration section +set -e -o pipefail +set -o nounset # Treat unset variables as an error +echo "$0 " "$@" + +language=swahili + +. ./utils/parse_options.sh + +if [ $# -ne 1 ] ; then + echo "Invalid number of script parameters. " + echo " $0 [options] " + echo "e.g." 
+ echo " $0 --language swahili /export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A-BUILD_v1.0/" + exit +fi +data=$1; +conversational_train=$data/conversational/training/ +mkdir -p data/train/ +for file in $conversational_train/transcription/*txt ; do + ./local/parse_transcripts.pl $file +done > data/train/transcripts.txt + + +conversational_dev=$data/conversational/dev/ +mkdir -p data/dev +for file in $conversational_dev/transcription/*txt ; do + ./local/parse_transcripts.pl $file +done > data/dev/transcripts.txt + + +cat data/train/transcripts.txt | \ + local/cleanup_transcripts.pl | \ + local/create_datafiles.pl data/train/ + +cat data/dev/transcripts.txt | \ + local/cleanup_transcripts.pl | \ + local/create_datafiles.pl data/dev/ + +if [ "$language" == "swahili" ]; then + language_affix="sw" +elif [ "$language" == "tagalog" ]; then + language_affix="tl" +elif [ "$language" == "somali" ]; then + language_affix="so" +fi +MOSES=/home/pkoehn/moses +SOURCE_TC_MODEL=/home/pkoehn/experiment/material-asr-${language_affix}-en/truecaser/truecase-model.1.${language_affix} + +for i in train dev; do + cat data/$i/text | cut -d " " -f2- > data/$i/text.notruecase + cat data/$i/text | cut -d " " -f1 > data/$i/uttids + # Truecase + $MOSES/scripts/recaser/truecase.perl -model $SOURCE_TC_MODEL \ + < data/$i/text.notruecase | sed "s=<= <=g" > data/$i/text.truecase +# cat data/$i/text.truecase | sed 's/' //g' | sed 's/&apos//g' | sed 's/[//g' | sed 's/]//g' | sed 's/" //g' | sed 's/" //g' | sed 's/& //g' | sed 's/@-@ //g' | sed 's/://g' | sed 's/\///g' | sed 's/%//g' | sed 's/+//g' | sed 's/( //g' | sed 's/) //g' | sed 's/\, //g' | sed 's/ \.//g' | sed 's/\?//g' | sed 's/\!//g' | sed 's/\;//g' > data/$i/text.nopunc + cat data/$i/text.truecase | tr 'A-Z' 'a-z' > data/$i/text.nopunc + paste -d " " data/$i/uttids data/$i/text.nopunc > data/$i/text +done + + diff --git a/egs/material/s5/local/preprocess_external_text.sh b/egs/material/s5/local/preprocess_external_text.sh new file mode 100755 index 00000000000..4cbc457310e --- /dev/null +++ b/egs/material/s5/local/preprocess_external_text.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +set -euo pipefail +set -e -o pipefail +set -o nounset # Treat unset variables as an error +echo "$0 $@" + +language=swahili +srctext_bitext=data/bitext/text + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh + +output=$1 + +set -e -o pipefail +set -o nounset # Treat unset variables as an error + +if [ "$language" == "swahili" ]; then + language_affix="sw" +elif [ "$language" == "tagalog" ]; then + language_affix="tl" +elif [ "$language" == "somali" ]; then + language_affix="so" +fi +MOSES=/home/pkoehn/moses + +# Normalize punctuation and tokenize input +$MOSES/scripts/tokenizer/normalize-punctuation.perl ${language_affix} < ${srctext_bitext} \ + | $MOSES/scripts/tokenizer/tokenizer.perl -a -l ${language_affix} > ${srctext_bitext}.tok + +# convert to lower cases +cat ${srctext_bitext}.tok | tr 'A-Z' 'a-z' > ${srctext_bitext}.tc + +# Remove punctuation +cat ${srctext_bitext}.tc | sed 's/' //g' | sed 's/&apos//g' | sed 's/[//g' | sed 's/]//g' | sed 's/" //g' | sed 's/" //g' | sed 's/& //g' | sed 's/@-@ //g' | sed 's/-//g' | sed 's/://g' | sed 's/\///g' | sed 's/%//g' | sed 's/+//g' | sed 's/( //g' | sed 's/) //g' | sed 's/\, //g' | sed 's/ \.//g' | sed 's/\?//g' | sed 's/\!//g' | sed 's/\;//g' > $output + diff --git a/egs/material/s5/local/preprocess_test.sh b/egs/material/s5/local/preprocess_test.sh new file mode 100755 index 00000000000..fbc868d3f7c --- /dev/null +++ b/egs/material/s5/local/preprocess_test.sh @@ -0,0 +1,135 @@ +#!/bin/sh +set -euo pipefail +set -e -o pipefail +set -o nounset # Treat unset variables as an error +echo "$0 $@" + +stage=0 + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh +. ./lang.conf + +datadev=$1 + +mkdir -p $datadev + +# 1. create the reference transcript $datadev/reftext + +dataset=$(basename $datadev) + +audio_path= +if [ $dataset == "analysis1" ]; then + audio_path=${audio_path_analysis1} +elif [ $dataset == "analysis2" ]; then + audio_path=${audio_path_analysis2} +elif [ $(basename $datadev) == 'test_dev' ]; then + audio_path=${audio_path_dev} +elif [ $(basename $datadev) == 'eval1' ]; then + audio_path=${audio_path_eval1} +elif [ $(basename $datadev) == 'eval2' ]; then + audio_path=${audio_path_eval2} +elif [ $(basename $datadev) == 'eval3' ]; then + audio_path=${audio_path_eval3} +fi + +[ -z ${audio_path} ] && echo "$0: test data should be either analysis1, analysis2, test_dev, eval1 or eval2." 
&& exit 1 + +metadata_file=${audio_path}/metadata/metadata.tsv + +if [ $stage -le 0 ]; then + mkdir -p data/local/$dataset + + tail -n +2 $metadata_file | \ + perl -ane '$F[0] =~ s/.wav//; print "$F[0] $F[1]\n";' > \ + data/local/$dataset/all_list + + awk '{if ($2 == "CS") { print $1 } }' data/local/$dataset/all_list > data/local/$dataset/call_list + awk '{if ($2 != "CS") { print $1 } }' data/local/$dataset/all_list > data/local/$dataset/non_call_list +fi + +if [ $stage -le 2 ]; then + rm data/local/$dataset/{wav.scp,reco2file_and_channel} 2>/dev/null || true + + if [ $dataset == "analysis1" ] || [ $dataset == "analysis2" ]; then + local/parse_dev_transcripts.py $audio_path \ + data/local/$dataset/call_list \ + data/local/$dataset/non_call_list \ + data/local/$dataset + else + for f in $(cat data/local/$dataset/call_list); do + wav_file="$audio_path/src/$f.wav" + + echo "${f}_inLine sox $wav_file -r 8000 -b 16 -c 1 -t wav - remix 1 |" >> data/local/$dataset/wav.scp + echo "${f}_outLine sox $wav_file -r 8000 -b 16 -c 1 -t wav - remix 2 |" >> data/local/$dataset/wav.scp + echo "${f}_inLine ${f} A" >> data/local/$dataset/reco2file_and_channel + echo "${f}_outLine ${f} B" >> data/local/$dataset/reco2file_and_channel + done + + for f in $(cat data/local/$dataset/non_call_list); do + wav_file="$audio_path/src/$f.wav" + + echo "${f} sox $wav_file -r 8000 -b 16 -c 1 -t wav - |" >> data/local/$dataset/wav.scp + echo "${f} ${f} 1" >> data/local/$dataset/reco2file_and_channel + done + + awk '{print $1" "$1}' data/local/$dataset/wav.scp > data/local/$dataset/utt2spk + fi + utils/utt2spk_to_spk2utt.pl data/local/$dataset/utt2spk > data/local/$dataset/spk2utt + utils/fix_data_dir.sh data/local/$dataset + + utils/copy_data_dir.sh data/local/$dataset $datadev +fi + +if [ $stage -le 3 ]; then + if [ $dataset == "analysis1" ] || [ $dataset == "analysis2" ]; then + cat data/local/$dataset/all_list | awk '{print $1" <"$2",O>"}' > \ + data/local/$dataset/all_list_labels + + awk '{print $2" "$1" "$3" "$4" "$1}' $datadev/segments | \ + utils/apply_map.pl -f 1 $datadev/reco2file_and_channel | \ + utils/apply_map.pl -f 3 $datadev/utt2spk | \ + awk '{print $1" "$2" "$3" "$4" "$5" "$1" "$6}' | \ + utils/apply_map.pl -f 7 $datadev/text | \ + utils/apply_map.pl -f 6 data/local/$dataset/all_list_labels | \ + sort +0 -1 +1 -2 +3nb -4 > \ + $datadev/stm + + touch $datadev/glm + fi +fi + +# 3. segment .wav files + +# 3.1. create a trivial segments file: + +if [ $stage -le 4 ]; then + utils/data/get_utt2dur.sh --nj 4 --cmd "$train_cmd" ${datadev} + + if [ ! -f $datadev/segments ]; then + utils/data/get_segments_for_data.sh $datadev/ > $datadev/segments + fi + + # 3.2. 
create uniform segmented directory using: (The durations are in seconds) + + if [ $dataset == "analysis1" ] || [ $dataset == "analysis2" ]; then + utils/data/convert_data_dir_to_whole.sh $datadev ${datadev}_whole + utils/data/get_utt2dur.sh --nj 4 --cmd "$train_cmd" ${datadev}_whole + + utils/data/get_segments_for_data.sh ${datadev}_whole > ${datadev}_whole/segments + utils/data/get_uniform_subsegments.py --max-segment-duration=30 \ + --overlap-duration=5 --max-remaining-duration=15 ${datadev}_whole/segments > \ + ${datadev}_whole/uniform_sub_segments + + utils/data/subsegment_data_dir.sh ${datadev}_whole/ \ + ${datadev}_whole/uniform_sub_segments ${datadev}_segmented + else + utils/data/get_uniform_subsegments.py --max-segment-duration=30 \ + --overlap-duration=5 --max-remaining-duration=15 ${datadev}/segments > \ + ${datadev}/uniform_sub_segments + + utils/data/subsegment_data_dir.sh ${datadev}/ \ + ${datadev}/uniform_sub_segments ${datadev}_segmented + fi +fi diff --git a/egs/material/s5/local/rnnlm/run_tdnn_lstm.sh b/egs/material/s5/local/rnnlm/run_tdnn_lstm.sh new file mode 100755 index 00000000000..3f5c7e547b1 --- /dev/null +++ b/egs/material/s5/local/rnnlm/run_tdnn_lstm.sh @@ -0,0 +1,217 @@ +#!/bin/bash + +# Copyright 2017-2018 Johns Hopkins University (author: Daniel Povey) +# 2017 Hainan Xu +# 2018 Ke Li +# 2018 Yiming Wang + + +# [for swahili] +# rnnlm/train_rnnlm.sh: best iteration (out of 40) was 38, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 140.6 / 1019.4. +# Train objf: -6.28 -5.90 -5.70 -5.56 -5.47 -5.40 -5.34 -5.29 -5.25 -5.22 -5.17 -5.16 -5.13 -5.10 -5.07 -5.06 -5.04 -5.01 -4.99 -4.98 -4.97 -4.96 -4.93 -4.93 -4.91 -4.91 -4.89 -4.88 -4.87 -4.86 -4.84 -4.85 -4.81 -4.79 -4.78 -4.76 -4.75 -4.74 -4.73 +# Dev objf: -8.69 -7.76 -7.31 -7.03 -6.98 -7.00 -6.96 -6.96 -6.93 -6.94 + +# %WER 36.75 [ 22836 / 62144, 2758 ins, 6307 del, 13771 sub ] exp/chain/tdnn1b_sp/decode_dev_rnnlm_rescore/wer_10_0.0 +# %WER 38.91 [ 24181 / 62144, 2750 ins, 6579 del, 14852 sub ] exp/chain/tdnn1b_sp/decode_dev_rnnlm_rescore_nbest/wer_10_0.0 +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis1_segmented_rnnlm_rescore/score_10_0.0/analysis1_segmented_hires.ctm.sys +# | Sum/Avg | 9906 59164 | 62.2 23.8 14.0 3.5 41.3 49.1 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis1_segmented_rnnlm_rescore_nbest/score_10_0.0/analysis1_segmented_hires.ctm.sys +# | Sum/Avg | 9906 59164 | 61.9 23.6 14.6 3.2 41.4 49.5 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis2_segmented_rnnlm_rescore/score_10_0.0/analysis2_segmented_hires.ctm.sys +# | Sum/Avg | 5322 37120 | 66.2 21.2 12.6 2.9 36.8 49.8 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis2_segmented_rnnlm_rescore_nbest/score_10_0.0/analysis2_segmented_hires.ctm.sys +# | Sum/Avg | 5322 37120 | 65.8 21.1 13.1 2.7 36.9 49.9 | + +# [for tagalog] +# rnnlm/train_rnnlm.sh: best iteration (out of 320) was 125, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 141.2 / 259.6. 
+# Train objf: -6.08 -5.78 -5.62 -5.52 -5.45 -5.40 -5.36 -5.32 -5.28 -5.26 -5.23 -5.20 -5.18 -5.16 -5.14 -5.13 -5.11 -5.10 -5.09 -5.07 -5.06 -5.05 -5.03 -5.03 -5.02 -5.01 -5.00 -4.99 -4.99 -4.97 -4.97 -4.97 -4.96 -4.94 -4.94 -4.93 -4.93 -4.92 -4.91 -4.92 -4.91 -4.90 -4.89 -4.89 -4.89 -4.88 -4.87 -4.87 -4.87 -4.86 -4.86 -4.85 -4.85 -4.84 -4.84 -4.84 -4.83 -4.83 -4.83 -4.82 -4.82 -4.82 -4.81 -4.82 -4.81 -4.81 -4.80 -4.79 -4.79 -4.79 -4.79 -4.80 -4.79 -4.79 -4.79 -4.80 -4.79 -4.78 -4.78 -4.79 -4.77 -4.79 -4.79 -4.78 -4.78 -4.78 -4.78 -4.78 -4.77 -4.77 -4.79 -4.79 -4.78 -4.78 -4.78 -4.78 -4.78 -4.79 -4.78 -4.80 -4.79 -4.78 -4.79 -4.80 -4.80 -4.79 -4.79 -4.77 -4.78 -4.77 -4.77 -4.78 -4.75 -4.80 -4.78 -4.77 -4.76 -4.77 -4.76 -4.76 -4.75 -4.75 -4.76 -4.76 -4.77 -4.75 -4.75 -4.75 -4.76 -4.75 -4.76 -4.74 -4.75 -4.75 -4.76 -4.75 -4.75 -4.75 -4.74 -4.76 -4.75 -4.74 -4.78 -4.74 -4.73 -4.77 -4.76 -4.75 -4.74 -4.73 -4.73 -4.75 -4.75 -4.74 -4.76 -4.73 -4.72 -4.76 -4.72 -4.72 -4.73 -4.72 -4.73 -4.75 -4.72 -4.73 -4.76 -4.75 -4.72 -4.72 -4.74 -4.75 -4.73 -4.72 -4.74 -4.74 -4.73 -4.74 -4.74 -4.74 -4.72 -4.70 -4.72 -4.75 -4.74 -4.75 -4.74 -4.76 -4.72 -4.72 -4.74 -4.75 -4.71 -4.74 -4.73 -4.73 -4.73 -4.73 -4.74 -4.75 -4.73 -4.73 -4.72 -4.71 -4.72 -4.71 -4.72 -4.75 -4.72 -4.71 -4.74 -4.71 -4.70 -4.73 -4.73 -4.75 -4.75 -4.72 -4.72 -4.73 -4.75 -4.73 -4.72 -4.72 -4.72 -4.73 -4.76 -4.73 -4.76 -4.74 -4.73 -4.74 -4.74 -4.74 -4.73 -4.73 -4.73 -4.70 -4.73 -4.74 -4.72 -4.73 -4.73 -4.75 -4.72 -4.73 -4.73 -4.75 -4.73 -4.75 -4.75 -4.73 -4.75 -4.74 -4.75 -4.77 -4.74 -4.75 -4.74 -4.73 -4.77 -4.75 -4.74 -4.75 -4.74 -4.77 -4.76 -4.75 -4.79 -4.78 -4.76 -4.76 -4.77 -4.76 -4.75 -4.74 -4.74 -4.78 -4.77 -4.77 -4.78 -4.79 -4.79 -4.79 -4.76 -4.77 -4.76 -4.79 -4.76 -4.77 -4.76 -4.78 -4.80 -4.79 -4.78 -4.82 -4.82 -4.79 -4.80 -4.81 -4.79 -4.77 -4.79 -4.82 -4.81 -4.82 -4.83 -4.85 -4.84 -4.83 -4.85 -4.88 -4.85 -4.87 -4.86 -4.84 -4.87 -4.85 -4.84 +# Dev objf: -8.70 -7.03 -60340.00 -6.61 -6.45 -6.54 -60340.00 -6.34 -60340.00 -60340.00 -6.15 -6.12 -6.03 -6.03 -60340.00 -60340.00 -6.64 -60340.00 -6.01 -5.91 -5.93 -6.06 -5.92 -5.95 -6.00 -6.17 -6.06 -5.92 -5.92 -60340.00 -6.03 -5.93 -5.98 -60340.00 -6.00 -5.90 -5.84 -6.00 -60340.00 -5.95 -5.89 -60340.00 -5.90 -6.14 -5.84 -5.92 -5.83 -5.86 -5.89 -5.84 -60340.00 -5.90 -5.80 -5.87 -5.87 -60340.00 -5.79 -60340.00 -60340.00 -60340.00 -6.56 -5.88 -5.94 -60340.00 -5.84 -60340.00 -5.84 -5.81 -5.77 -60340.00 -60340.00 -60340.00 -5.81 -5.90 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -5.72 -5.79 -60340.00 -60340.00 -60340.00 -60340.00 -5.72 -5.80 -60340.00 -60340.00 -5.68 -5.73 -5.74 -60340.00 -5.67 -5.63 -60340.00 -5.75 -60340.00 -5.66 -5.71 -5.73 -5.73 -5.75 -60340.00 -5.77 -60340.00 -5.70 -5.70 -5.82 -60340.00 -60340.00 -5.77 -5.72 -5.75 -60340.00 -5.56 -60340.00 -5.73 -60340.00 -60340.00 -5.99 -5.77 -60340.00 -5.65 -5.80 -60340.00 -60340.00 -5.64 -5.67 -5.73 -5.59 -60340.00 -60340.00 -5.73 -60340.00 -60340.00 -5.83 -5.58 -5.64 -5.75 -60340.00 -5.77 -5.68 -60340.00 -60340.00 -5.70 -5.85 -60340.00 -60340.00 -5.82 -6.15 -5.74 -5.73 -5.75 -60340.00 -60340.00 -5.86 -60340.00 -5.80 -5.79 -5.81 -60340.00 -5.89 -60340.00 -5.81 -5.71 -60340.00 -60340.00 -5.65 -5.87 -60340.00 -60340.00 -60340.00 -5.83 -60340.00 -5.94 -5.74 -5.75 -5.75 -60340.00 -5.76 -5.73 -5.76 -60340.00 -60340.00 -5.85 -5.91 -5.98 -60340.00 -5.88 -5.86 -60340.00 -60340.00 -60340.00 -60340.00 -5.91 -5.81 -5.86 -60340.00 -6.10 -6.17 -60340.00 -60340.00 -5.82 -5.82 -60340.00 -60340.00 -6.78 -5.71 -5.87 
-60340.00 -60340.00 -5.98 -5.94 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -5.81 -60340.00 -60340.00 -60340.00 -5.74 -60340.00 -5.83 -60340.00 -5.96 -5.80 -60340.00 -60340.00 -60340.00 -5.82 -60340.00 -60340.00 -60340.00 -60340.00 -5.80 -60340.00 -60340.00 -60340.00 -60340.00 -5.79 -60340.00 -6.13 -5.97 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -5.97 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -5.98 -60340.00 -60340.00 -60340.00 -5.85 -5.92 -5.85 -5.82 -6.04 -60340.00 -60340.00 -60340.00 -60340.00 -5.93 -60340.00 -5.85 -5.87 -5.77 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -5.89 -60340.00 -60340.00 -60340.00 -60340.00 -6.18 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -60340.00 -5.92 -6.01 + +# %WER 46.07 [ 29664 / 64382, 3133 ins, 9896 del, 16635 sub ] exp/chain/tdnn1b_sp/decode_dev_rnnlm_rescore/wer_10_0.5 +# %WER 47.47 [ 30563 / 64382, 3568 ins, 8934 del, 18061 sub ] exp/chain/tdnn1b_sp/decode_dev_rnnlm_rescore_nbest/wer_10_0.5 +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis1_segmented_rnnlm_rescore/score_10_0.0/analysis1_segmented_hires.ctm.sys +# | Sum/Avg | 10551 87329 | 53.7 25.3 21.0 4.6 51.0 65.6 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis1_segmented_rnnlm_rescore_nbest/score_10_0.0/analysis1_segmented_hires.ctm.sys +# | Sum/Avg | 10551 87329 | 53.4 24.9 21.6 4.3 50.9 65.6 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis2_segmented_rnnlm_rescore/score_10_0.0/analysis2_segmented_hires.ctm.sys +# | Sum/Avg | 5933 56887 | 52.6 25.0 22.4 4.9 52.3 73.8 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis2_segmented_rnnlm_rescore_nbest/score_10_0.0/analysis2_segmented_hires.ctm.sys +# | Sum/Avg | 5933 56887 | 52.3 24.5 23.1 4.5 52.2 73.9 | + +# [for somali] +# rnnlm/train_rnnlm.sh: best iteration (out of 800) was 133, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 414.5 / 860.9. + +# %WER 56.54 [ 46160 / 81637, 4654 ins, 13070 del, 28436 sub ] exp/chain/tdnn1b_sp/decode_dev_rnnlm_rescore/wer_10_0.0 +# %WER 57.85 [ 47226 / 81637, 5002 ins, 12287 del, 29937 sub ] exp/chain/tdnn1b_sp/decode_dev_rnnlm_rescore_nbest/wer_10_0.0 +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis1_segmented_rnnlm_rescore/score_10_0.0/analysis1_segmented_hires.ctm.sys +# | Sum/Avg | 9852 90609 | 50.4 33.3 16.3 8.2 57.8 74.8 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis1_segmented_rnnlm_rescore_nbest/score_10_0.0/analysis1_segmented_hires.ctm.sys +# | Sum/Avg | 9852 90609 | 50.4 33.2 16.4 8.1 57.7 74.9 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis2_segmented_rnnlm_rescore/score_10_0.0/analysis2_segmented_hires.ctm.sys +# | Sum/Avg | 8275 67640 | 53.0 32.8 14.2 8.5 55.5 69.3 | +# grep 'Sum' exp/chain/tdnn1b_sp/decode_analysis2_segmented_rnnlm_rescore_nbest/score_10_0.0/analysis2_segmented_hires.ctm.sys +# | Sum/Avg | 8275 67640 | 53.0 32.7 14.3 8.3 55.3 69.2 | + + +# Begin configuration section. 
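+# Note on the options below (summary added for clarity, not from the original
+# script): embedding_dim is the word-embedding size; lstm_rpd and lstm_nrpd are
+# the recurrent and non-recurrent projection dims of the LSTM layers defined in
+# the xconfig further down; epochs is the number of RNNLM training epochs,
+# presumably passed on to rnnlm/train_rnnlm.sh.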
+ +embedding_dim=512 +lstm_rpd=128 +lstm_nrpd=128 +stage=0 +train_stage=-10 +epochs=40 + +# variables for lattice rescoring +run_rescore=true +decode_dir_suffix=rnnlm +ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order + # if it's set, it merges histories in the lattice if they share + # the same ngram history and this prevents the lattice from + # exploding exponentially +pruned_rescore=true + +ac_model_dir=exp/chain/tdnn1b_sp +decode_sets="dev analysis1_segmented analysis2_segmented test_dev_segmented eval1_segmented eval2_segmented eval3_segmented" + +dir=exp/rnnlm_lstm_1a +text_dir=data/rnnlm/text +train_text=data/lm/train.txt +dev_text=data/lm/dev.txt +bitext=data/bitext/text.txt +monotext=data/mono/text.txt + +lang=data/lang_combined_chain +tree_dir=exp/chain/tree_sp + +. ./cmd.sh +. ./utils/parse_options.sh + + +mkdir -p $dir/config +set -e + +for f in ${train_text} ${dev_text} $bitext $monotext; do + + [ ! -f $f ] && \ + echo "$0: expected file $f to exist; look at stage 12 in run.sh" && exit 1 +done + +if [ $stage -le 0 ]; then + mkdir -p $text_dir + cat $train_text > $text_dir/train.txt + cat $dev_text > $text_dir/dev.txt + cat $bitext > $text_dir/bitext.txt + cat $monotext > $text_dir/monotext.txt + +fi + +if [ $stage -le 1 ]; then + cp $lang/words.txt $dir/config/ + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. + echo "" >$dir/config/oov.txt + + cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --special-words=',,,,,,' \ + $dir/config/words.txt > $dir/config/features.txt + + cat >$dir/config/xconfig </dev/null || true + for decode_set in ${decode_sets}; do + ( + decode_dir=${ac_model_dir}/decode_${decode_set} + skip_scoring=false + if [ ${decode_set} != "dev" ]; then skip_scoring=true; fi + + # Lattice rescoring + rnnlm/lmrescore$pruned.sh \ + --cmd "$decode_cmd" \ + --weight 0.5 --max-ngram-order $ngram_order --max-arcs 20000 \ + --skip-scoring ${skip_scoring} \ + data/lang_$LM $dir data/${decode_set}_hires \ + ${decode_dir} ${decode_dir}_${decode_dir_suffix}_rescore || exit 1 + + if [ ${decode_set} != "dev" ]; then + local/postprocess_test.sh ${decode_set} ${tree_dir}/graph_combined \ + ${decode_dir}_${decode_dir_suffix}_rescore + fi + ) || touch $dir/.error & + done +fi +wait +#[ -f $dir/.error ] && echo "$0: there was a problem while rescoring" && exit 1 + +if [ $stage -le 5 ]; then + echo "$0: Perform nbest-rescoring on $ac_model_dir" + + rm $dir/.error 2>/dev/null || true + for decode_set in ${decode_sets}; do + ( + decode_dir=${ac_model_dir}/decode_${decode_set} + skip_scoring=false + if [ ${decode_set} != "dev" ]; then skip_scoring=true; fi + + # Lattice rescoring + rnnlm/lmrescore_nbest.sh \ + --N 20 \ + --cmd "$decode_cmd" \ + --skip-scoring ${skip_scoring} \ + 0.5 data/lang_$LM $dir data/${decode_set}_hires \ + ${decode_dir}_${decode_dir_suffix}_rescore ${decode_dir}_${decode_dir_suffix}_rescore_nbest || exit 1 + + if [ ${decode_set} != "dev" ]; then + local/postprocess_test.sh ${decode_set} ${tree_dir}/graph_combined \ + ${decode_dir}_${decode_dir_suffix}_rescore_nbest + fi + ) || touch $dir/.error + done +fi + +exit 0 diff --git a/egs/material/s5/local/rnnlm/run_tdnn_lstm_2.sh b/egs/material/s5/local/rnnlm/run_tdnn_lstm_2.sh new file mode 
100755 index 00000000000..13cf0bde44c --- /dev/null +++ b/egs/material/s5/local/rnnlm/run_tdnn_lstm_2.sh @@ -0,0 +1,181 @@ +#!/bin/bash + +# Copyright 2017-2018 Johns Hopkins University (author: Daniel Povey) +# 2017 Hainan Xu +# 2018 Ke Li +# 2018 Yiming Wang + + +# [for swahili] +# rnnlm/train_rnnlm.sh: best iteration (out of 10) was 5, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 59.1 / 273.1. +# Train objf: -5.48 -4.75 -4.47 -4.30 -4.17 -4.06 -3.96 -3.87 -3.77 -3.68 +# Dev objf: -10.79 -6.00 -5.75 -5.69 -5.62 -5.61 -5.62 -5.66 -5.66 + +# %WER 35.84 [ 22270 / 62144, 2573 ins, 6961 del, 12736 sub ] exp/chain/tdnn1b_sp/decode_dev_rnnlm_rescore/wer_11_0.5 +# %WER 48.49 [ 28692 / 59166, 2310 ins, 9200 del, 17182 sub ] exp/chain/tdnn1b_sp/decode_analysis1_segmented_reseg_rnnlm_rescore + +# [for tagalog] +# rnnlm/train_rnnlm.sh: best iteration (out of 10) was 4, linking it to final iteration. +# rnnlm/train_rnnlm.sh: train/dev perplexity was 73.6 / 106.2. +# Train objf: -5.55 -4.83 -4.58 -4.41 -4.28 -4.17 -4.06 -3.96 -3.86 +# Dev objf: -10.54 -4.87 -4.72 -4.67 -4.67 -4.69 -4.71 -4.74 -4.78 + +# %WER 42.91 [ 27628 / 64382, 3624 ins, 8301 del, 15703 sub ] exp/chain/tdnn1b_sp/decode_dev_rnnlm_rescore/wer_10_0.0 +# %WER 55.55 [ 48530 / 87362, 4030 ins, 19326 del, 25174 sub ] exp/chain/tdnn1b_sp/decode_analysis1_segmented_reseg_rnnlm_rescore + +# Begin configuration section. + +embedding_dim=512 +lstm_rpd=128 +lstm_nrpd=128 +stage=0 +train_stage=-10 +epochs=40 + +# variables for lattice rescoring +run_rescore=true +decode_dir_suffix=rnnlm +ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order + # if it's set, it merges histories in the lattice if they share + # the same ngram history and this prevents the lattice from + # exploding exponentially +pruned_rescore=true + +ac_model_dir=exp/chain/tdnn1b_sp +#decode_sets="dev analysis1_segmented_reseg test_dev_segmented_reseg eval1_segmented_reseg eval2_segmented_reseg" +decode_sets="dev analysis1_segmented test_dev_segmented eval1_segmented eval2_segmented eval3_segmented" +decode_sets="analysis2_segmented" +#decode_sets="dev eval1_segmented eval2_segmented" +dir=exp/rnnlm_lstm_1a +text_dir=data/rnnlm/text +train_text=data/lm/train.txt +dev_text=data/lm/dev.txt +bitext=data/bitext/text.txt +lang=data/lang_combined_chain +tree_dir=exp/chain/tree_sp + +. ./cmd.sh +. ./utils/parse_options.sh + + +mkdir -p $dir/config +set -e + +for f in ${train_text} ${dev_text} $bitext; do + [ ! -f $f ] && \ + echo "$0: expected file $f to exist; look at stage 12 in run.sh" && exit 1 +done + +if [ $stage -le 0 ]; then + mkdir -p $text_dir + cat $train_text > $text_dir/train.txt + cat $dev_text > $text_dir/dev.txt + cat $bitext > $text_dir/bitext.txt +fi + +if [ $stage -le 1 ]; then + cp $lang/words.txt $dir/config/ + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. 
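+  # (the symbol written to oov.txt below is the OOV word, conventionally <unk>,
+  #  and should match an entry in $lang/words.txt)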
+ echo "" >$dir/config/oov.txt + + cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --special-words=',,,,,,' \ + $dir/config/words.txt > $dir/config/features.txt + + cat >$dir/config/xconfig </dev/null || true + for decode_set in ${decode_sets}; do + ( + decode_dir=${ac_model_dir}/decode_${decode_set} + skip_scoring=false + if [ ${decode_set} != "dev" ]; then skip_scoring=true; fi + + # Lattice rescoring + rnnlm/lmrescore$pruned.sh \ + --cmd "$decode_cmd" \ + --weight 0.5 --max-ngram-order $ngram_order --max-arcs 20000 \ + --skip-scoring ${skip_scoring} \ + data/lang_$LM $dir data/${decode_set}_hires \ + ${decode_dir} ${decode_dir}_${decode_dir_suffix}_rescore || exit 1 + + if [ ${decode_set} != "dev" ]; then + local/postprocess_test.sh ${decode_set} ${tree_dir}/graph_combined \ + ${decode_dir}_${decode_dir_suffix}_rescore + fi + ) || touch $dir/.error & + done +fi +wait +#[ -f $dir/.error ] && echo "$0: there was a problem while rescoring" && exit 1 + +if [ $stage -le 5 ]; then + echo "$0: Perform nbest-rescoring on $ac_model_dir" + + rm $dir/.error 2>/dev/null || true + for decode_set in ${decode_sets}; do + ( + decode_dir=${ac_model_dir}/decode_${decode_set} + skip_scoring=false + if [ ${decode_set} != "dev" ]; then skip_scoring=true; fi + + # Lattice rescoring + rnnlm/lmrescore_nbest.sh \ + --N 20 \ + --cmd "$decode_cmd" \ + --skip-scoring ${skip_scoring} \ + 0.5 data/lang_$LM $dir data/${decode_set}_hires \ + ${decode_dir}_${decode_dir_suffix}_rescore ${decode_dir}_${decode_dir_suffix}_rescore_nbest || exit 1 + + if [ ${decode_set} != "dev" ]; then + local/postprocess_test.sh ${decode_set} ${tree_dir}/graph_combined \ + ${decode_dir}_${decode_dir_suffix}_rescore_nbest + fi + ) || touch $dir/.error + done +fi + +exit 0 diff --git a/egs/material/s5/local/score.sh b/egs/material/s5/local/score.sh new file mode 100755 index 00000000000..c7da00fba32 --- /dev/null +++ b/egs/material/s5/local/score.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# Copyright (c) 2017, Johns Hopkins University (Jan "Yenda" Trmal) +# License: Apache 2.0 + +# Begin configuration section. +# End configuration section +set -e -o pipefail +set -o nounset # Treat unset variables as an error + +echo "$0" "$@" +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" + diff --git a/egs/material/s5/local/score_segments.sh b/egs/material/s5/local/score_segments.sh new file mode 100755 index 00000000000..064e15ae40d --- /dev/null +++ b/egs/material/s5/local/score_segments.sh @@ -0,0 +1,8 @@ +#!/bin/bash +set -e -o pipefail +set -o nounset # Treat unset variables as an error + +echo "$0" "$@" +local/score_wer_segments.sh "$@" +#local/score_cer_segment.sh --stage 2 "$@" + diff --git a/egs/material/s5/local/score_stm.sh b/egs/material/s5/local/score_stm.sh new file mode 100755 index 00000000000..7e1236ce92e --- /dev/null +++ b/egs/material/s5/local/score_stm.sh @@ -0,0 +1,110 @@ +#!/bin/bash +# Copyright 2013 Johns Hopkins University (authors: Yenda Trmal) +# 2018 Vimal Manohar + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# This scoring script is copied from Babel and modified. +# This is a scoring script for the CTMS in /score_/${name}.ctm +# it tries to mimic the NIST scoring setup as much as possible (and usually does a good job) + +# begin configuration section. +cmd=run.pl +cer=0 +min_lmwt=7 +max_lmwt=17 +model= +stage=0 +ctm_name= +word_ins_penalty=0.0,0.5,1.0 +case_insensitive=true +use_icu=true +icu_transform='Any-Lower' +#end configuration section. + +echo $0 $@ + +[ -f ./path.sh ] && . ./path.sh +[ -f ./cmd.sh ] && . ./cmd.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [options] " && exit; + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --cer (0|1) # compute CER in addition to WER" + exit 1; +fi + +data=$1 +lang=$2 # This parameter is not used -- kept only for backwards compatibility +dir=$3 + +set -e +set -o pipefail +set -u + +ScoringProgram=`which sclite` || ScoringProgram=$KALDI_ROOT/tools/sctk/bin/sclite +[ ! -x $ScoringProgram ] && echo "Cannot find scoring program at $ScoringProgram" && exit 1; +SortingProgram=`which hubscr.pl` || SortingProgram=$KALDI_ROOT/tools/sctk/bin/hubscr.pl +[ ! -x $ScoringProgram ] && echo "Cannot find scoring program at $ScoringProgram" && exit 1; + +stm_filter_cmd=cat +[ -x local/stm_filter ] && stm_filter_cmd=local/stm_filter +ctm_filter_cmd=cat +[ -x local/ctm_filter ] && ctm_filter_cmd=local/ctm_filter + +for f in $data/stm ; do + [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; +done + +if [ -z $ctm_name ] ; then + name=`basename $data`; # e.g. 
eval2000 +else + name=$ctm_name +fi + +if [ $stage -le 0 ] ; then + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + mkdir -p $dir/scoring/penalty_$wip/log + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/penalty_$wip/log/score.LMWT.log \ + set -e';' set -o pipefail';' \ + cat $dir/score_LMWT_${wip}/${name}.ctm \| $ctm_filter_cmd '>' $dir/score_LMWT_${wip}/${name}.ctm.unsorted '&&' \ + cat $data/stm \| $stm_filter_cmd '>' $dir/score_LMWT_${wip}/stm.unsorted '&&' \ + $SortingProgram sortSTM \<$dir/score_LMWT_${wip}/stm.unsorted \>$dir/score_LMWT_${wip}/stm.sorted '&&' \ + $SortingProgram sortCTM \<$dir/score_LMWT_${wip}/${name}.ctm.unsorted \>$dir/score_LMWT_${wip}/${name}.ctm.sorted '&&' \ + paste -d ' ' \<\(cut -f 1-5 -d ' ' $dir/score_LMWT_${wip}/stm.sorted \) \ + \<\(cut -f 6- -d ' ' $dir/score_LMWT_${wip}/stm.sorted \| uconv -f utf8 -t utf8 -x "$icu_transform" \) \ + \> $dir/score_LMWT_${wip}/stm '&&' \ + paste -d ' ' \<\(cut -f 1-4 -d ' ' $dir/score_LMWT_${wip}/${name}.ctm.sorted \) \ + \<\(cut -f 5- -d ' ' $dir/score_LMWT_${wip}/${name}.ctm.sorted \| uconv -f utf8 -t utf8 -x "$icu_transform" \) \ + \> $dir/score_LMWT_${wip}/${name}.ctm.sorted2 '&&' \ + utils/fix_ctm.sh $dir/score_LMWT_${wip}/stm $dir/score_LMWT_${wip}/${name}.ctm.sorted2 '&&' \ + $SortingProgram sortCTM \<$dir/score_LMWT_${wip}/${name}.ctm.sorted2 \>$dir/score_LMWT_${wip}/${name}.ctm '&&' \ + $ScoringProgram -s -r $dir/score_LMWT_${wip}/stm stm -h $dir/score_LMWT_${wip}/${name}.ctm ctm \ + -n "$name.ctm" -f 0 -D -F -o sum rsum prf dtl sgml -e utf-8 || exit 1 + done +fi + +if [ $stage -le 1 ]; then + if [ $cer -eq 1 ]; then + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/penalty_$wip/log/score.LMWT.char.log \ + $ScoringProgram -s -r $dir/score_LMWT_${wip}/stm stm -h $dir/score_LMWT_${wip}/${name}.ctm ctm \ + -n "$name.char.ctm" -o sum rsum prf dtl sgml -f 0 -D -F -c NOASCII DH -e utf-8 || exit 1 + fi +fi + + +echo "Finished scoring on" `date` +exit 0 diff --git a/egs/material/s5/local/score_wer_segments.sh b/egs/material/s5/local/score_wer_segments.sh new file mode 100755 index 00000000000..555ec5056d9 --- /dev/null +++ b/egs/material/s5/local/score_wer_segments.sh @@ -0,0 +1,100 @@ +#!/bin/bash + + +[ -f ./path.sh ] && . ./path.sh + +# begin configuration section. +cmd=run.pl +stage=0 +stats=true +#end configuration section. + +echo "$0 $@" # Print the command line for logging +[ -f ./path.sh ] && . ./path.sh +. 
parse_options.sh || exit 1; + +data=$1 +dir=$2 + +ref_filtering_cmd="cat" +[ -x local/wer_output_filter ] && ref_filtering_cmd="local/wer_output_filter" +[ -x local/wer_ref_filter ] && ref_filtering_cmd="local/wer_ref_filter" +hyp_filtering_cmd="cat" +[ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter" +[ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter" + +mkdir -p $dir/scoring_kaldi +if [ -f $data/reftext ]; then + cat $data/reftext | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt.txt || exit 1; +else + echo "$0: No reference text to compute WER" +fi + +if [ $stage -le 0 ]; then + + mkdir -p $dir/scoring_kaldi/log + # begin building hypothesis hyp.txt + # in the same format as $data/reftext + awk '{a[$1]=a[$1]" "$5;}END{for(i in a)print i""a[i];}' \ + $dir/score_10/ctm_out > tmpconcat + if [ -f $data/reftext ]; then + awk -F" " '{print $1}' $data/reftext > tmporder + awk 'FNR==NR {x2[$1] = $0; next} $1 in x2 {print x2[$1]}' \ + tmpconcat tmporder > "$dir/score_10/ctm_out.concat" + $hyp_filtering_cmd $dir/score_10/ctm_out.concat > \ + $dir/scoring_kaldi/hyp.txt || exit 1; + # end building hypothesis hyp.txt + + $cmd $dir/scoring_kaldi/log/score.hyp.log \ + cat $dir/scoring_kaldi/hyp.txt \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring_kaldi/test_filt.txt ark:- ">&" $dir/wer || exit 1; + + cat $dir/wer + else + cat tmpconcat > "$dir/score_10/ctm_out.concat" + awk -F" " '{print $1}' $dir/score_10/ctm_out.concat > tmporder + $hyp_filtering_cmd $dir/score_10/ctm_out.concat > \ + $dir/scoring_kaldi/hyp.txt || exit 1; + #exit 0; + #end building hypothesis hyp.txt + + fi + + # building hyp.segmentedXms.txt + for dur in {700,800,900,1000}; do + dursec=`echo $dur' / 1000' | bc -l` + awk '{if ($4 < '$dursec') a[$1]=a[$1]" "$5; else a[$1]=a[$1]" "$5"\n"$1"";}END\ + {for(i in a)print i""a[i];}' $dir/score_10/ctm_out > tmpconcat + rm -rf $dir/score_10/ctm_out.concat.$dur + while read LINE; do + grep "$LINE" "tmpconcat" >> "$dir/score_10/ctm_out.concat."$dur + done < "tmporder" + + $hyp_filtering_cmd $dir/score_10/ctm_out.concat.$dur > $dir/scoring_kaldi/hyp.segmented${dur}ms.txt || exit 1; + done + rm -rf tmpconcat + rm -rf tmporder +fi + +if [ $stage -le 1 ]; then + if $stats; then + mkdir -p $dir/scoring_kaldi/wer_details + + $cmd $dir/scoring_kaldi/log/stats1.log \ + cat $dir/scoring_kaldi/hyp.txt \| \ + align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \ + utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \| \ + utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1; + + $cmd $dir/scoring_kaldi/log/stats2.log \ + cat $dir/scoring_kaldi/wer_details/per_utt \| \ + utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \ + sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1; + + $cmd $dir/scoring_kaldi/log/wer_bootci.log \ + compute-wer-bootci --mode=present \ + ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/hyp.txt \ + '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1; + fi +fi diff --git a/egs/material/s5/local/semisup/chain/decode_test.sh b/egs/material/s5/local/semisup/chain/decode_test.sh new file mode 100755 index 00000000000..3d9a1eda1f5 --- /dev/null +++ b/egs/material/s5/local/semisup/chain/decode_test.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +# Copyright 2018 Johns Hopkins University (author: Daniel Povey) +# 2018 Mahsa 
Yarmohammadi +# 2018 Yiming Wang + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +language=swahili +stage=0 +datadev="data/analysis1 data/analysis2 data/test_dev data/eval1 data/eval2 data/eval3" +dir=exp/semisup/chain/tdnn_semisup_1a +lang=data/lang_combined_chain +tree_dir=exp/semisup/chain/tree_sp +cmd=queue.pl +graph_affix=_combined + +# training options +chunk_width=140,100,160 +chunk_left_context=0 +chunk_right_context=0 + +# ivector options +max_count=75 # parameter for extract_ivectors.sh +sub_speaker_frames=600 +filter_ctm=true +weights_file= +silence_weight=0.00001 +nj=30 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +[ ! -f ./conf/lang/${language}.conf ] && \ + echo "Language configuration conf/lang/${language}.conf does not exist!" && exit 1 +ln -sf ./conf/lang/${language}.conf lang.conf +. ./lang.conf + +if ! cuda-compiled; then + cat </dev/null || true + +if [ $stage -le 3 ]; then + # do the 1st pass decoding + for datadir in $datadev; do + ( + data=$(basename $datadir) + nspk=$(wc -l /dev/null || true + cp -r data/lang_combined_test $lang_combined + silphonelist=$(cat $lang_combined/phones/silence.csl) || exit 1; + nonsilphonelist=$(cat $lang_combined/phones/nonsilence.csl) || exit 1; + # Use our special topology... note that later on may have to tune this + # topology. + steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang_combined/topo + fi +fi + +if [ $stage -le 8 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \ + --generate-ali-from-lats true ${lores_train_data_dir} \ + data/lang_combined $gmm_dir $lat_dir || exit 1; + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 9 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 6000 ${lores_train_data_dir} \ + $lang_combined $lat_dir $tree_dir || exit 1 +fi + + +if [ $stage -le 10 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.004 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + linear_opts="orthonormal-constraint=-1.0 l2-regularize=0.004" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=1024 + linear-component name=tdnn2l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn2 $opts input=Append(0,1) dim=1024 + linear-component name=tdnn3l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=1024 input=Append(0,1) + linear-component name=tdnn4l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn4l dim=256 $linear_opts input=Append(0,1) + relu-batchnorm-dropout-layer name=tdnn4 $opts input=Append(0,1) dim=1024 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=1024 input=Append(0, tdnn3l) + linear-component name=tdnn6l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn7l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn7l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1024 + linear-component name=tdnn8l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn8l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn9l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn9 $opts input=Append(0,3,tdnn8l,tdnn6l,tdnn5l) dim=1024 + linear-component name=tdnn10l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn10l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn11l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn9l,tdnn7l) dim=1024 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280 + linear-component name=prefinal-chain-l 
dim=256 $linear_opts + batchnorm-component name=prefinal-chain-batchnorm + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280 + linear-component name=prefinal-xent-l dim=256 $linear_opts + batchnorm-component name=prefinal-xent-batchnorm + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 11 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/material-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + mkdir -p $dir/egs + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=$num_epochs \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=0 \ + --egs.chunk-right-context=0 \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.stage=$get_egs_stage \ + --egs.opts="--frames-overlap-per-eg 0 --generate-egs-scp true" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 12 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_combined_test \ + $tree_dir ${tree_dir}/graph_combined || exit 1; +fi + +if [ $stage -le 13 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l \ +# $sup_chain_dir/best_path_${unsupervised_set_perturbed}/frame_subsampling_factor +# +# # This should be 1 if using a different source for supervised data alignments. +# # However alignments in seed tree directory have already been sub-sampled. 
+# echo $frame_subsampling_factor > \ +# $sup_tree_dir/frame_subsampling_factor +# +# # Build a new tree using stats from both supervised and unsupervised data +# steps/nnet3/chain/build_tree_multiple_sources.sh \ +# --use-fmllr false --context-opts "--context-width=2 --central-position=1" \ +# --frame-subsampling-factor $frame_subsampling_factor \ +# 7000 $lang \ +# data/${supervised_set_perturbed} \ +# ${sup_tree_dir} \ +# data/${unsupervised_set_perturbed} \ +# $chaindir/best_path_${unsupervised_set_perturbed} \ +# $treedir || exit 1 +# fi +# +# sup_tree_dir=$treedir # Use the new tree dir for further steps + +# Train denominator FST using phone alignments from +# supervised and unsupervised data +if [ $stage -le 7 ]; then + steps/nnet3/chain/make_weighted_den_fst.sh --num-repeats $lm_weights --cmd "$train_cmd" \ + ${sup_tree_dir} ${sup_chain_dir}/best_path_${unsupervised_set_perturbed} \ + $dir +fi + +if [ $stage -le 8 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $sup_tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.004 dropout-proportion=0.0 dropout-per-dim=true dropout-per-dim-continuous=true" + linear_opts="orthonormal-constraint=-1.0 l2-regularize=0.004" + output_opts="l2-regularize=0.002" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-dropout-layer name=tdnn1 $opts dim=1024 + linear-component name=tdnn2l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn2l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn2 $opts input=Append(0,1) dim=1024 + linear-component name=tdnn3l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-dropout-layer name=tdnn3 $opts dim=1024 input=Append(0,1) + linear-component name=tdnn4l0 dim=256 $linear_opts input=Append(-1,0) + linear-component name=tdnn4l dim=256 $linear_opts input=Append(0,1) + relu-batchnorm-dropout-layer name=tdnn4 $opts input=Append(0,1) dim=1024 + linear-component name=tdnn5l dim=256 $linear_opts + relu-batchnorm-dropout-layer name=tdnn5 $opts dim=1024 input=Append(0, tdnn3l) + linear-component name=tdnn6l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn7l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn7l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn7 $opts input=Append(0,3,tdnn6l,tdnn4l,tdnn2l) dim=1024 + linear-component name=tdnn8l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn8l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn9l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn9 $opts 
input=Append(0,3,tdnn8l,tdnn6l,tdnn5l) dim=1024 + linear-component name=tdnn10l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn10l dim=256 $linear_opts input=Append(0,3) + relu-batchnorm-dropout-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn11l0 dim=256 $linear_opts input=Append(-3,0) + linear-component name=tdnn11l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-dropout-layer name=tdnn11 $opts input=Append(0,3,tdnn10l,tdnn9l,tdnn7l) dim=1024 + linear-component name=prefinal-l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=prefinal-l $opts dim=1280 + linear-component name=prefinal-chain-l dim=256 $linear_opts + batchnorm-component name=prefinal-chain-batchnorm + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=prefinal-l $opts dim=1280 + linear-component name=prefinal-xent-l dim=256 $linear_opts + batchnorm-component name=prefinal-xent-batchnorm + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts + + # We use separate outputs for supervised and unsupervised data + # so we can properly track the train and valid objectives. + + output name=output-0 input=output.affine + output name=output-1 input=output.affine + + output name=output-0-xent input=output-xent.log-softmax + output name=output-1-xent input=output-xent.log-softmax +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +# Get values for $model_left_context, $model_right_context +. $dir/configs/vars + +left_context=$model_left_context +right_context=$model_right_context + +egs_left_context=$(perl -e "print int($left_context + $frame_subsampling_factor / 2)") +egs_right_context=$(perl -e "print int($right_context + $frame_subsampling_factor / 2)") + +if [ -z "$sup_egs_dir" ]; then + sup_egs_dir=$dir/egs_${supervised_set_perturbed} + frames_per_eg=$(cat $sup_chain_dir/egs/info/frames_per_eg) + + if [ $stage -le 9 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $sup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/material-$(date +'%m_%d_%H_%M')/s5/$sup_egs_dir/storage $sup_egs_dir/storage + fi + mkdir -p $sup_egs_dir/ + touch $sup_egs_dir/.nodelete # keep egs around when that run dies. 
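# Worked example of the egs context computed above (the model contexts here are
# illustrative, not taken from this configuration): with model_left_context=28,
# model_right_context=14 and frame_subsampling_factor=3,
#   egs_left_context  = int(28 + 3/2) = 29
#   egs_right_context = int(14 + 3/2) = 15
# i.e. roughly half the subsampling factor of extra input frames on each side.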
+ + echo "$0: generating egs from the supervised data" + steps/nnet3/chain/get_egs.sh --cmd "$decode_cmd" \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context --right-context-final $egs_right_context \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor $frame_subsampling_factor \ + --frames-per-eg $frames_per_eg \ + --frames-per-iter 1500000 \ + --cmvn-opts "$cmvn_opts" \ + --online-ivector-dir $sup_ivector_dir \ + --generate-egs-scp true \ + data/${supervised_set_perturbed}_hires $dir \ + $sup_lat_dir $sup_egs_dir + fi +else + frames_per_eg=$(cat $sup_egs_dir/info/frames_per_eg) +fi + +unsup_frames_per_eg=150 # Using a frames-per-eg of 150 for unsupervised data + # was found to be better than allowing smaller chunks + # (160,140,110,80) like for supervised system +lattice_lm_scale=0.5 # lm-scale for using the weights from unsupervised lattices when + # creating numerator supervision +lattice_prune_beam=4.0 # beam for pruning the lattices prior to getting egs + # for unsupervised data +tolerance=1 # frame-tolerance for chain training + +unsup_lat_dir=${sup_chain_dir}/decode_${unsupervised_set_perturbed} +if [ -z "$unsup_egs_dir" ]; then + unsup_egs_dir=$dir/egs_${unsupervised_set_perturbed} + + if [ $stage -le 10 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $unsup_egs_dir/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/material-$(date +'%m_%d_%H_%M')/s5/$unsup_egs_dir/storage $unsup_egs_dir/storage + fi + mkdir -p $unsup_egs_dir + touch $unsup_egs_dir/.nodelete # keep egs around when that run dies. + + cp $sup_chain_dir/final.mdl $unsup_lat_dir || exit 1; + + echo "$0: generating egs from the unsupervised data" + steps/nnet3/chain/get_egs.sh \ + --cmd "$decode_cmd" --alignment-subsampling-factor 1 \ + --left-tolerance $tolerance --right-tolerance $tolerance \ + --left-context $egs_left_context --right-context $egs_right_context \ + --left-context-initial $egs_left_context --right-context-final $egs_right_context \ + --frames-per-eg $unsup_frames_per_eg --frames-per-iter 1500000 \ + --frame-subsampling-factor $frame_subsampling_factor \ + --cmvn-opts "$cmvn_opts" --lattice-lm-scale $lattice_lm_scale \ + --lattice-prune-beam "$lattice_prune_beam" \ + --deriv-weights-scp $sup_chain_dir/best_path_${unsupervised_set_perturbed}/weights.scp \ + --online-ivector-dir $ivector_root_dir/ivectors_${unsupervised_set_perturbed}_hires \ + --generate-egs-scp true $unsup_egs_opts \ + data/${unsupervised_set_perturbed}_hires $dir \ + $unsup_lat_dir $unsup_egs_dir + fi +fi + +comb_egs_dir=$dir/comb_egs +if [ $stage -le 11 ]; then + steps/nnet3/chain/multilingual/combine_egs.sh --cmd "$train_cmd" \ + --block-size 128 \ + --lang2weight $supervision_weights 2 \ + $sup_egs_dir $unsup_egs_dir $comb_egs_dir + touch $comb_egs_dir/.nodelete # keep egs around when that run dies. +fi + +if [ $train_stage -le -4 ]; then + # This is to skip stages of den-fst creation, which was already done. 
+ train_stage=-4 +fi + +if [ $stage -le 12 ]; then + steps/nnet3/chain/train.py --stage $train_stage \ + --egs.dir="$comb_egs_dir" \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$sup_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=true \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --egs.chunk-width=$frames_per_eg \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.frames-per-iter=1500000 \ + --trainer.num-epochs=$num_epochs \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=12 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.momentum=0.0 \ + --trainer.max-param-change=2.0 \ + --cleanup.remove-egs=false \ + --feat-dir=data/${supervised_set_perturbed}_hires \ + --tree-dir=$sup_tree_dir \ + --lat-dir=$sup_lat_dir \ + --dir=$dir || exit 1; +fi + +test_graph_dir=$dir/graph_combined +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $test_graph_dir +fi + +if [ $stage -le 14 ]; then + frames_per_chunk=150 + rm -f $dir/.error 2>/dev/null || true + for data in $test_sets; do + ( + nspk=$(wc -l $text_dir/train.txt + cat $dev_text > $text_dir/dev.txt + cat $bitext > $text_dir/bitext.txt + cat $monotext > $text_dir/monotext.txt + +fi + +if [ $stage -le 1 ]; then + cp $lang/words.txt $dir/config/ + n=`cat $dir/config/words.txt | wc -l` + echo " $n" >> $dir/config/words.txt + + # words that are not present in words.txt but are in the training or dev data, will be + # mapped to during training. 
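# (Note: the symbol written to config/oov.txt below is the lexicon's OOV marker,
# conventionally "<unk>" in these recipes.)  Any such out-of-vocabulary word is
# rewritten to that symbol when the RNNLM examples are prepared.
# For reference, config/data_weights.txt lists one corpus per line in the form
#   <corpus-name> <num-repeats> <weight>
# e.g. "train 1 1.0"; the actual repeats and weights are set in the block below.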
+ echo "" >$dir/config/oov.txt + + cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --special-words=',,,,,,' \ + $dir/config/words.txt > $dir/config/features.txt + + cat >$dir/config/xconfig </dev/null || true + for decode_set in ${decode_sets}; do + ( + decode_dir=${ac_model_dir}/decode_${decode_set} + skip_scoring=false + if [ ${decode_set} != "dev" ]; then skip_scoring=true; fi + + # Lattice rescoring + rnnlm/lmrescore$pruned.sh \ + --cmd "$decode_cmd" \ + --weight 0.5 --max-ngram-order $ngram_order --max-arcs 20000 \ + --skip-scoring ${skip_scoring} \ + data/lang_$LM $dir data/${decode_set}_hires \ + ${decode_dir} ${decode_dir}_${decode_dir_suffix}_rescore || exit 1 + + if [ ${decode_set} != "dev" ]; then + local/postprocess_test.sh ${decode_set} ${tree_dir}/graph_combined \ + ${decode_dir}_${decode_dir_suffix}_rescore + fi + ) || touch $dir/.error & + done +fi +wait +#[ -f $dir/.error ] && echo "$0: there was a problem while rescoring" && exit 1 + +if [ $stage -le 5 ]; then + echo "$0: Perform nbest-rescoring on $ac_model_dir" + + rm $dir/.error 2>/dev/null || true + for decode_set in ${decode_sets}; do + ( + decode_dir=${ac_model_dir}/decode_${decode_set} + skip_scoring=false + if [ ${decode_set} != "dev" ]; then skip_scoring=true; fi + + # Lattice rescoring + rnnlm/lmrescore_nbest.sh \ + --N 20 \ + --cmd "$decode_cmd" \ + --skip-scoring ${skip_scoring} \ + 0.5 data/lang_$LM $dir data/${decode_set}_hires \ + ${decode_dir}_${decode_dir_suffix}_rescore ${decode_dir}_${decode_dir_suffix}_rescore_nbest || exit 1 + + if [ ${decode_set} != "dev" ]; then + local/postprocess_test.sh ${decode_set} ${tree_dir}/graph_combined \ + ${decode_dir}_${decode_dir_suffix}_rescore_nbest + fi + ) || touch $dir/.error + done +fi + +exit 0 diff --git a/egs/material/s5/local/semisup/run.sh b/egs/material/s5/local/semisup/run.sh new file mode 100755 index 00000000000..6b22cb1ad36 --- /dev/null +++ b/egs/material/s5/local/semisup/run.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +# Copyright 2017 Vimal Manohar +# 2019 Yiming Wang +# Apache 2.0 + +# This script demonstrates semi-supervised training using ~40 hours of +# supervised data and ~320 hours of unsupervised data. + +. ./cmd.sh +. ./path.sh + +set -o pipefail +exp_root=exp/semisup + +stage=0 + +. ./utils/parse_options.sh + +############################################################################### +# Train seed chain system using ~40 hours supervised data. +# Here we train i-vector extractor on only the supervised set. +############################################################################### + +if [ $stage -le 1 ]; then + local/semisup/chain/run_tdnn.sh \ + --train-set train \ + --nnet3-affix "" \ + --affix 1a --tree-affix "" \ + --gmm tri3 --exp-root $exp_root || exit 1 +fi + +if [ $stage -le 2 ]; then + utils/combine_data.sh data/eval1_2_3_segmented data/eval1_segmented data/eval2_segmented data/eval3_segmented || exit 1 +fi + +############################################################################### +# Semi-supervised training using ~40 hours supervised data and +# 320 hours unsupervised data. We use i-vector extractor, tree, lattices +# and seed chain system from the previous stage. 
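# In outline (matching the steps in run_tdnn_semisupervised.sh): the seed chain
# model decodes the unlabeled EVAL audio; best-path alignments and per-frame
# weights are taken from those lattices; a denominator FST is re-estimated from
# supervised plus unsupervised phone sequences; and egs from both sources are
# combined with the chosen supervision weights before training the final model.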
+############################################################################### + +if [ $stage -le 3 ]; then + local/semisup/chain/run_tdnn_semisupervised.sh \ + --supervised-set train \ + --unsupervised-set eval1_2_3_segmented \ + --sup-chain-dir $exp_root/chain/tdnn_1a_sp \ + --sup-lat-dir $exp_root/chain/tri3_train_sp_lats \ + --sup-tree-dir $exp_root/chain/tree_sp \ + --ivector-root-dir exp/nnet3 \ + --affix 1a \ + --exp-root $exp_root || exit 1 + + # [for swahili] + # %WER 35.2 | 9906 59164 | 67.8 18.4 13.8 3.0 35.2 47.1 | exp/semisup/chain/tdnn_semisup_1a/decode_analysis1_segmented/score_10_0.0/analysis1_segmented_hires.ctm.sys + # %WER 30.8 | 5322 37120 | 71.9 16.4 11.8 2.7 30.8 47.8 | exp/semisup/chain/tdnn_semisup_1a/decode_analysis2_segmented/score_10_0.0/analysis2_segmented_hires.ctm.sys + + # [for tagalog] + # %WER 40.8 | 10551 87329 | 64.0 21.4 14.6 4.8 40.8 63.9 | exp/semisup/chain/tdnn_semisup_1a/decode_analysis1_segmented/score_10_0.0/analysis1_segmented_hires.ctm.sys + # %WER 41.1 | 5933 56887 | 63.8 20.4 15.9 4.9 41.1 71.9 | exp/semisup/chain/tdnn_semisup_1a/decode_analysis2_segmented/score_10_0.0/analysis2_segmented_hires.ctm.sys +fi + diff --git a/egs/material/s5/local/stm_filter b/egs/material/s5/local/stm_filter new file mode 100755 index 00000000000..9409119a54f --- /dev/null +++ b/egs/material/s5/local/stm_filter @@ -0,0 +1,22 @@ +#!/usr/bin/perl + +while (<>) { + chomp; + my @F = split; + my @A = @F[6..$#F]; + for (my $i = 0; $i <= $#A; $i++) { + my $w = $A[$i]; + + # Make partial words optionally detectable + if ($w =~ m/^(\S+-)$/ || $w =~ m/^(-\S+)$/) { + $A[$i] = "(" . $w . ")"; + } + + # Remove filler words + if ($w =~ m/<(unk|noise|spnoise|sil)>/) { + $A[$i] = ""; + } + } + + print join(" ", @F[0..5]) . " " . join(" ", @A) . "\n"; +} diff --git a/egs/material/s5/local/train_lms_srilm.sh b/egs/material/s5/local/train_lms_srilm.sh new file mode 100755 index 00000000000..8160b060dc7 --- /dev/null +++ b/egs/material/s5/local/train_lms_srilm.sh @@ -0,0 +1,224 @@ +#!/bin/bash +export LC_ALL=C + +words_file= +train_text= +dev_text= +oov_symbol="" + +echo "$0 $@" + +[ -f path.sh ] && . ./path.sh +. ./utils/parse_options.sh || exit 1 + +echo "-------------------------------------" +echo "Building an SRILM language model " +echo "-------------------------------------" + +if [ $# -ne 2 ] ; then + echo "Incorrect number of parameters. " + echo "Script has to be called like this:" + echo " $0 [switches] " + echo "For example: " + echo " $0 data data/srilm" + echo "The allowed switches are: " + echo " words_file= word list file -- data/lang/words.txt by default" + echo " train_text= data/train/text is used in case when not specified" + echo " dev_text= last 10 % of the train text is used by default" + echo " oov_symbol=> symbol to use for oov modeling -- by default" + exit 1 +fi + +datadir=$1 +tgtdir=$2 +outlm=lm.gz + + +##End of configuration +loc=`which ngram-count`; +if [ -z $loc ]; then + if uname -a | grep 64 >/dev/null; then # some kind of 64 bit... + sdir=`pwd`/../../../tools/srilm/bin/i686-m64 + else + sdir=`pwd`/../../../tools/srilm/bin/i686 + fi + if [ -f $sdir/ngram-count ]; then + echo Using SRILM tools from $sdir + export PATH=$PATH:$sdir + else + echo You appear to not have SRILM tools installed, either on your path, + echo or installed in $sdir. See tools/install_srilm.sh for installation + echo instructions. + exit 1 + fi +fi + +# Prepare the destination directory +mkdir -p $tgtdir + +for f in $words_file $train_text $dev_text; do + [ ! 
-s $f ] && echo "No such file $f" && exit 1; +done + +[ -z $words_file ] && words_file=$datadir/lang/words.txt +if [ ! -z "$train_text" ] && [ ! -z "$dev_text" ] ; then + echo "Using words file: $words_file" + echo "Using train text: $train_text" + echo "Using dev text : $dev_text" + train_text=$train_text + dev_text=$dev_text +else + [ -z "$train_text" ] && train_text=$datadir/train/text + nr=`cat $train_text | wc -l` + nr_dev=$(($nr / 10 )) + nr_train=$(( $nr - $nr_dev )) + orig_train_text=$train_text + head -n $nr_train $train_text > $tgtdir/train_text + tail -n $nr_dev $train_text > $tgtdir/dev_text + + train_text=$tgtdir/train_text + dev_text=$tgtdir/dev_text + echo "Using words file: $words_file" + echo "Using train text: 9/10 of $orig_train_text" + echo "Using dev text : 1/10 of $orig_train_text" +fi + + + +# Extract the word list from the training dictionary; exclude special symbols +sort $words_file | awk '{print $1}' | grep -v '\#0' | grep -v '' | grep -v -F "$oov_symbol" > $tgtdir/vocab +if (($?)); then + echo "Failed to create vocab from $words_file" + exit 1 +else + # wc vocab # doesn't work due to some encoding issues + echo vocab contains `cat $tgtdir/vocab | perl -ne 'BEGIN{$l=$w=0;}{split; $w+=$#_; $w++; $l++;}END{print "$l lines, $w words\n";}'` +fi + +# Kaldi transcript files contain Utterance_ID as the first word; remove it +cat $train_text | cut -f2- -d' ' > $tgtdir/train.txt +if (($?)); then + echo "Failed to create $tgtdir/train.txt from $train_text" + exit 1 +else + echo "Removed first word (uid) from every line of $train_text" + # wc text.train train.txt # doesn't work due to some encoding issues + echo $train_text contains `cat $train_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'` + echo train.txt contains `cat $tgtdir/train.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'` +fi + +# Kaldi transcript files contain Utterance_ID as the first word; remove it +cat $dev_text | cut -f2- -d' ' > $tgtdir/dev.txt +if (($?)); then + echo "Failed to create $tgtdir/dev.txt from $dev_text" + exit 1 +else + echo "Removed first word (uid) from every line of $dev_text" + # wc text.train train.txt # doesn't work due to some encoding issues + echo $dev_text contains `cat $dev_text | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $w--; $s++;}END{print "$w words, $s sentences\n";}'` + echo $tgtdir/dev.txt contains `cat $tgtdir/dev.txt | perl -ane 'BEGIN{$w=$s=0;}{$w+=@F; $s++;}END{print "$w words, $s sentences\n";}'` +fi + +echo "-------------------" +echo "Good-Turing 2grams" +echo "-------------------" +ngram-count -lm $tgtdir/2gram.gt01.gz -gt1min 0 -gt2min 1 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/2gram.gt02.gz -gt1min 0 -gt2min 2 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 2grams" +echo "-------------------" +ngram-count -lm $tgtdir/2gram.kn01.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/2gram.kn02.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -order 2 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Good-Turing 3grams" +echo "-------------------" +ngram-count -lm $tgtdir/3gram.gt011.gz -gt1min 0 -gt2min 1 -gt3min 1 -order 3 -text $tgtdir/train.txt -vocab 
$tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt012.gz -gt1min 0 -gt2min 1 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt022.gz -gt1min 0 -gt2min 2 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.gt023.gz -gt1min 0 -gt2min 2 -gt3min 3 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 3grams" +echo "-------------------" +ngram-count -lm $tgtdir/3gram.kn011.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn012.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn022.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/3gram.kn023.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 3 -order 3 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + + +echo "-------------------" +echo "Good-Turing 4grams" +echo "-------------------" +ngram-count -lm $tgtdir/4gram.gt0111.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 1 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0112.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0122.gz -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0123.gz -gt1min 0 -gt2min 1 -gt3min 2 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0113.gz -gt1min 0 -gt2min 1 -gt3min 1 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0222.gz -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.gt0223.gz -gt1min 0 -gt2min 2 -gt3min 2 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +echo "-------------------" +echo "Kneser-Ney 4grams" +echo "-------------------" +ngram-count -lm $tgtdir/4gram.kn0111.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 1 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0112.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0113.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 1 -kndiscount4 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0122.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 
-text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0123.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 1 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0222.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 2 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" +ngram-count -lm $tgtdir/4gram.kn0223.gz -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 -kndiscount3 -gt3min 2 -kndiscount4 -gt4min 3 -order 4 -text $tgtdir/train.txt -vocab $tgtdir/vocab -unk -sort -map-unk "$oov_symbol" + +if [ ! -z ${LIBLBFGS} ]; then + #please note that if the switch -map-unk "$oov_symbol" is used with -maxent-convert-to-arpa, ngram-count will segfault + #instead of that, we simply output the model in the maxent format and convert it using the "ngram" tool + echo "-------------------" + echo "Maxent 2grams" + echo "-------------------" + sed 's/'${oov_symbol}'/<unk>/g' $tgtdir/train.txt | \ + ngram-count -lm - -order 2 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\ + sed 's/<unk>/'${oov_symbol}'/g' | gzip -c > $tgtdir/2gram.me.gz || exit 1 + + echo "-------------------" + echo "Maxent 3grams" + echo "-------------------" + sed 's/'${oov_symbol}'/<unk>/g' $tgtdir/train.txt | \ + ngram-count -lm - -order 3 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\ + sed 's/<unk>/'${oov_symbol}'/g' | gzip -c > $tgtdir/3gram.me.gz || exit 1 + + echo "-------------------" + echo "Maxent 4grams" + echo "-------------------" + sed 's/'${oov_symbol}'/<unk>/g' $tgtdir/train.txt | \ + ngram-count -lm - -order 4 -text - -vocab $tgtdir/vocab -unk -sort -maxent -maxent-convert-to-arpa|\ + sed 's/<unk>/'${oov_symbol}'/g' | gzip -c > $tgtdir/4gram.me.gz || exit 1 + +fi + + +echo "--------------------" +echo "Computing perplexity" +echo "--------------------" +( + for f in $tgtdir/3gram* ; do ( echo $f; ngram -order 3 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done + for f in $tgtdir/4gram* ; do ( echo $f; ngram -order 4 -lm $f -unk -map-unk "$oov_symbol" -ppl $tgtdir/dev.txt ) | paste -s -d ' ' ; done +) | sort -r -n -k 15,15g | column -t | tee $tgtdir/perplexities.txt + +echo "The perplexity scores report is stored in $tgtdir/perplexities.txt " + +#This will link the lowest perplexity LM as the output LM.
+#ln -sf $tgtdir/`head -n 1 $tgtdir/perplexities.txt | cut -f 1 -d ' '` $outlm + +#A slight modification of the previous approach: +#We look at the two lowest perplexity LMs and use a 3gram LM if one of the two, even if the 4gram is of lower ppl +nof_trigram_lm=`head -n 2 $tgtdir/perplexities.txt | grep 3gram | wc -l` +if [[ $nof_trigram_lm -eq 0 ]] ; then + lmfilename=`head -n 1 $tgtdir/perplexities.txt | cut -f 1 -d ' '` +elif [[ $nof_trigram_lm -eq 2 ]] ; then + lmfilename=`head -n 1 $tgtdir/perplexities.txt | cut -f 1 -d ' '` +else #exactly one 3gram LM + lmfilename=`head -n 2 $tgtdir/perplexities.txt | grep 3gram | cut -f 1 -d ' '` +fi +(cd $tgtdir; ln -sf `basename $lmfilename` $outlm ) + diff --git a/egs/material/s5/local/wer_output_filter b/egs/material/s5/local/wer_output_filter new file mode 100755 index 00000000000..5195bb9150d --- /dev/null +++ b/egs/material/s5/local/wer_output_filter @@ -0,0 +1,25 @@ +#!/usr/bin/env perl +# Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal) +# Apache 2.0 +use utf8; + +use open qw(:encoding(utf8)); +binmode STDIN, ":utf8"; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; + +while (<>) { + @F = split " "; + print $F[0] . " "; + foreach $s (@F[1..$#F]) { + if (($s =~ /\[.*\]/) || ($s =~ /\<.*\>/) || ($s =~ "!SIL") || ($s =~ /--|\.|\?|\(\(\)\)|%incomplete/)) { + print ""; + } else { + print "$s" + } + print " "; + } + print "\n"; +} + + diff --git a/egs/material/s5/path.sh b/egs/material/s5/path.sh new file mode 100644 index 00000000000..ffa108b6737 --- /dev/null +++ b/egs/material/s5/path.sh @@ -0,0 +1,7 @@ +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5/:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +[ ! -f $KALDI_ROOT/tools/env.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/env.sh is not present (this is uncommon but might be OK)" +. $KALDI_ROOT/tools/env.sh +export LC_ALL=C diff --git a/egs/material/s5/rnnlm b/egs/material/s5/rnnlm new file mode 120000 index 00000000000..72302c5e570 --- /dev/null +++ b/egs/material/s5/rnnlm @@ -0,0 +1 @@ +../../../scripts/rnnlm \ No newline at end of file diff --git a/egs/material/s5/run.sh b/egs/material/s5/run.sh new file mode 100755 index 00000000000..4ba518f53e0 --- /dev/null +++ b/egs/material/s5/run.sh @@ -0,0 +1,322 @@ +#!/bin/bash + +# Copyright 2017-2018 Johns Hopkins University (Jan "Yenda" Trmal) +# 2017-2018 Johns Hopkins University (author: Daniel Povey) +# 2018 Yiming Wang +# 2019 Mahsa Yarmohammadi +# License: Apache 2.0 + +. ./path.sh +. ./cmd.sh + +nj=30 # number of parallel jobs +stage=1 +language=swahili +. utils/parse_options.sh + +set -e -o pipefail +set -o nounset # Treat unset variables as an error + +[ ! -f ./conf/lang/${language}.conf ] && \ + echo "Language configuration conf/lang/${language}.conf does not exist!" && exit 1 +ln -sf ./conf/lang/${language}.conf lang.conf +. 
./lang.conf + +if [ $stage -le 1 ]; then + local/prepare_text_data.sh $corpus + local/prepare_audio_data.sh $corpus +fi + +if [ $stage -le 2 ]; then + local/prepare_dict.sh $corpus + utils/validate_dict_dir.pl data/local/dict_nosp + utils/prepare_lang.sh data/local/dict_nosp \ + "" data/local/lang_nosp data/lang_nosp + utils/validate_lang.pl data/lang_nosp +fi + +if [ $stage -le 3 ]; then + local/train_lms_srilm.sh --oov-symbol "" --words-file \ + data/lang_nosp/words.txt data data/lm + utils/format_lm.sh data/lang_nosp data/lm/lm.gz \ + data/local/dict_nosp/lexiconp.txt data/lang_nosp_test + utils/validate_lang.pl data/lang_nosp_test +fi + +if [ $stage -le 4 ]; then + for set in train dev; do + dir=data/$set + utils/fix_data_dir.sh $dir + steps/make_mfcc.sh --cmd "$train_cmd" --nj 16 $dir + steps/compute_cmvn_stats.sh $dir + utils/fix_data_dir.sh $dir + utils/validate_data_dir.sh $dir + done +fi + +# Create a subset with 40k short segments to make flat-start training easier +if [ $stage -le 5 ]; then + utils/subset_data_dir.sh --shortest data/train $numShorestUtts data/train_short +fi + +# monophone training +if [ $stage -le 6 ]; then + steps/train_mono.sh --nj $nj --cmd "$train_cmd" \ + data/train_short data/lang_nosp_test exp/mono + ( + utils/mkgraph.sh data/lang_nosp_test \ + exp/mono exp/mono/graph_nosp + for test in dev; do + steps/decode.sh --nj $nj --cmd "$decode_cmd" exp/mono/graph_nosp \ + data/$test exp/mono/decode_nosp_$test + done + )& + + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang_nosp_test exp/mono exp/mono_ali +fi + +# train a first delta + delta-delta triphone system on all utterances +if [ $stage -le 7 ]; then + steps/train_deltas.sh --cmd "$train_cmd" \ + $numLeavesTri1 $numGaussTri1 data/train data/lang_nosp_test exp/mono_ali exp/tri1 + + # decode using the tri1 model + ( + utils/mkgraph.sh data/lang_nosp_test exp/tri1 exp/tri1/graph_nosp + for test in dev; do + steps/decode.sh --nj $nj --cmd "$decode_cmd" exp/tri1/graph_nosp \ + data/$test exp/tri1/decode_nosp_$test + done + )& + + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang_nosp_test exp/tri1 exp/tri1_ali +fi + +# train an LDA+MLLT system. +if [ $stage -le 8 ]; then + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=3 --right-context=3" $numLeavesTri2 $numGaussTri2 \ + data/train data/lang_nosp_test exp/tri1_ali exp/tri2 + + # decode using the LDA+MLLT model + ( + utils/mkgraph.sh data/lang_nosp_test exp/tri2 exp/tri2/graph_nosp + for test in dev; do + steps/decode.sh --nj $nj --cmd "$decode_cmd" exp/tri2/graph_nosp \ + data/$test exp/tri2/decode_nosp_$test + done + )& + + steps/align_si.sh --nj $nj --cmd "$train_cmd" --use-graphs true \ + data/train data/lang_nosp_test exp/tri2 exp/tri2_ali +fi + +# Train tri3, which is LDA+MLLT+SAT +if [ $stage -le 9 ]; then + steps/train_sat.sh --cmd "$train_cmd" $numLeavesTri3 $numGaussTri3 \ + data/train data/lang_nosp_test exp/tri2_ali exp/tri3 + + # decode using the tri3 model + ( + utils/mkgraph.sh data/lang_nosp_test exp/tri3 exp/tri3/graph_nosp + for test in dev; do + steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" exp/tri3/graph_nosp \ + data/$test exp/tri3/decode_nosp_$test + done + )& +fi + +# Now we compute the pronunciation and silence probabilities from training data, +# and re-create the lang directory. 
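# For intuition (the word and pronunciation below are made up, format only):
# dict_dir_add_pronprobs.sh turns a lexicon.txt entry such as
#   kitabu  k i t a b u
# into a lexiconp.txt entry carrying a per-pronunciation probability, e.g.
#   kitabu  0.9  k i t a b u
# and the silence counts collected from tri3 are used to add per-word silence
# probabilities (silprob.txt) to the re-created dict and lang directories.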
+if [ $stage -le 10 ]; then + steps/get_prons.sh --cmd "$train_cmd" data/train data/lang_nosp_test exp/tri3 + utils/dict_dir_add_pronprobs.sh --max-normalize true \ + data/local/dict_nosp \ + exp/tri3/pron_counts_nowb.txt exp/tri3/sil_counts_nowb.txt \ + exp/tri3/pron_bigram_counts_nowb.txt data/local/dict + + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang + + utils/format_lm.sh data/lang data/lm/lm.gz \ + data/local/dict/lexiconp.txt data/lang_test + + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang_test exp/tri3 exp/tri3_ali +fi + +if [ $stage -le 11 ]; then + # Test the tri3 system with the silprobs and pron-probs. + + # decode using the tri3 model + utils/mkgraph.sh data/lang_test exp/tri3 exp/tri3/graph + for test in dev; do + steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" \ + exp/tri3/graph data/$test exp/tri3/decode_$test + done +fi + +mkdir -p data/bitext +mkdir -p data/mono + +srctext_bitext=data/bitext/text +srctext_mono=data/mono/text + +if [ $stage -le 12 ]; then + # Read the foreign part of the bitext as $srctext_bitext and preprocess the text + if [ "$number_mapping" != "" ]; then + echo Number mapping file Found. Converting numbers... + cat $bitext | awk -F"\t" '{print $2;}' | local/normalize_numbers.py $number_mapping > $srctext_bitext + if [[ $mono == *.gz ]]; then + gzip -cd $mono | local/normalize_numbers.py $number_mapping > $srctext_mono + else + cat $mono | local/normalize_numbers.py $number_mapping > $srctext_mono + fi + if [ "$mono2" != "" ]; then + if [[ $mono2 == *.gz ]]; then + gzip -cd $mono2 | local/normalize_numbers.py $number_mapping >> $srctext_mono + else + cat $mono2 | local/normalize_numbers.py $number_mapping >> $srctext_mono + fi + fi + else + cat $bitext | awk -F"\t" '{print $2;}' > $srctext_bitext + if [[ $mono == *.gz ]]; then + gzip -cd $mono > $srctext_mono + else + cat $mono > $srctext_mono + fi + if [ "$mono2" != "" ]; then + if [[ $mono2 == *.gz ]]; then + gzip -cd $mono2 >> $srctext_mono + else + cat $mono2 >> $srctext_mono + fi + fi + fi + + local/preprocess_external_text.sh --language $language \ + --srctext-bitext ${srctext_bitext} ${srctext_bitext}.txt + + local/preprocess_external_text.sh --language $language \ + --srctext-bitext ${srctext_mono} ${srctext_mono}.txt + + # Combine two sources of text + cat $bitext | awk '{print $1}' > ${srctext_bitext}.header + paste ${srctext_bitext}.header ${srctext_bitext}.txt > ${srctext_bitext}.processed + + if [[ $mono == *.gz ]]; then + gzip -cd $mono | awk '{printf("mono-%d\n",NR)}' > ${srctext_mono}.header + else + cat $mono | awk '{printf("mono-%d\n",NR)}' > ${srctext_mono}.header + fi + if [ "$mono2" != "" ]; then + if [[ $mono2 == *.gz ]]; then + gzip -cd $mono2 | awk '{printf("mono-%d\n",NR)}' >> ${srctext_mono}.header + else + cat $mono2 | awk '{printf("mono-%d\n",NR)}' >> ${srctext_mono}.header + fi + fi + paste ${srctext_mono}.header ${srctext_mono}.txt > ${srctext_mono}.processed +fi + +# The next 3 stages are to train g2p from the existing lexicon, +# apply g2p to expand the lexicon using oov words from bitext data +# as in ${dict_root}_nosp. +g2p_workdir=data/local/g2p_phonetisarus +if [ $stage -le 13 ]; then + echo 'Gathering missing words...' 
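# The steps below collect words from the processed bitext and monolingual text
# that the dict_nosp lexicon does not cover; only purely lowercase alphabetic
# tokens are kept (see the grep further down), so G2P is not applied to numbers
# or other artifacts.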
+ mkdir -p ${g2p_workdir} + cat ${srctext_bitext}.txt ${srctext_mono}.txt | \ + local/count_oovs.pl data/local/dict_nosp/lexicon.txt | \ + awk '{for(i=4; i ${g2p_workdir}/missing.txt + cat ${g2p_workdir}/missing.txt | \ + grep "^[a-z]*$" > ${g2p_workdir}/missing_onlywords.txt +fi + +if [ $stage -le 14 ]; then + local/g2p/train_g2p.sh --stage 0 --silence-phones \ + "data/local/dict/silence_phones.txt" data/local/dict_nosp exp/g2p || touch exp/g2p/.error +fi + +dict_root=data/local/dict_combined +if [ $stage -le 15 ]; then + if [ -f exp/g2p/.error ]; then + rm exp/g2p/.error || true + echo "Fail to train the G2P model." && exit 1; + fi + mkdir -p ${dict_root}_nosp + rm ${dict_root}_nosp/lexiconp.txt 2>/dev/null || true + cp data/local/dict_nosp/{phones,oov,nonsilence_phones,silence_phones,optional_silence}.txt ${dict_root}_nosp + local/g2p/apply_g2p.sh --var-counts 1 exp/g2p/model.fst ${g2p_workdir} \ + data/local/dict_nosp/lexicon.txt ${dict_root}_nosp/lexicon.txt || exit 1; + + utils/validate_dict_dir.pl ${dict_root}_nosp +fi + +lang_root=data/lang_combined +lmdir=data/lm_combined +if [ $stage -le 16 ]; then + utils/prepare_lang.sh ${dict_root}_nosp "" data/local/lang_combined_nosp ${lang_root}_nosp + utils/validate_lang.pl ${lang_root}_nosp +fi + +# prepare the new LM with bitext data and the new lexicon, +# as in the new test lang directory ${lang_root}_nosp_test + +datadev="data/analysis1 data/analysis2 data/test_dev data/eval1 data/eval2 data/eval3" + +if [ $stage -le 17 ]; then + for datadir in $datadev; do + local/preprocess_test.sh $datadir & + done + wait + + mkdir -p $lmdir + mkdir -p $lmdir/mono + mkdir -p $lmdir/bitext + + cat data/analysis1/text | awk '{for(i=2;i<=NF;i++) printf("%s ", $i); print""}' \ + | grep . | shuf | head -n 2000 > $lmdir/dev_text || echo done + + local/train_lms_srilm.sh --oov-symbol "" --words-file ${lang_root}_nosp/words.txt \ + --train-text ${srctext_bitext}.processed --dev-text $lmdir/dev_text \ + data $lmdir/bitext + + local/train_lms_srilm.sh --oov-symbol "" --words-file ${lang_root}_nosp/words.txt \ + --train-text ${srctext_mono}.processed --dev-text $lmdir/dev_text \ + data $lmdir/mono +fi + +if [ $stage -le 18 ]; then + ngram -order 4 -lm data/lm/lm.gz -mix-lm $lmdir/bitext/lm.gz \ + -mix-lm2 $lmdir/mono/lm.gz -lambda 0.3 -mix-lambda2 0.4 \ + -write-lm $lmdir/lm.gz + + utils/format_lm.sh ${lang_root}_nosp $lmdir/lm.gz \ + ${dict_root}_nosp/lexiconp.txt ${lang_root}_nosp_test + utils/validate_lang.pl ${lang_root}_nosp_test +fi + +# Now we compute the pronunciation and silence probabilities from training data, +# and re-create the lang directory ${lang_root}_test. 
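# A note on the interpolation in stage 18 above (following SRILM's ngram
# conventions as we understand them): -lambda 0.3 is the weight of the main LM
# built from the transcribed audio (data/lm/lm.gz), -mix-lambda2 0.4 is the
# weight of the crawled monolingual LM, and the bitext LM receives the
# remainder, 1 - 0.3 - 0.4 = 0.3.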
+if [ $stage -le 19 ]; then + steps/get_prons.sh --cmd "$train_cmd" data/train ${lang_root}_nosp_test exp/tri3 + utils/dict_dir_add_pronprobs.sh --max-normalize true \ + ${dict_root}_nosp \ + exp/tri3/pron_counts_nowb.txt exp/tri3/sil_counts_nowb.txt \ + exp/tri3/pron_bigram_counts_nowb.txt ${dict_root} + utils/prepare_lang.sh ${dict_root} "" data/local/lang_combined ${lang_root} + + utils/format_lm.sh ${lang_root} $lmdir/lm.gz \ + ${dict_root}/lexiconp.txt ${lang_root}_test +fi + +# After run.sh is finished, run the followings: +# ./local/chain/run_tdnn.sh +# ./local/chain/decode_test.sh --language +# ./local/rnnlm/run_tdnn_lstm.sh +exit 0; diff --git a/egs/material/s5/steps b/egs/material/s5/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/material/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/material/s5/utils b/egs/material/s5/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/material/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/wsj/s5/utils/data/subsegment_data_dir.sh b/egs/wsj/s5/utils/data/subsegment_data_dir.sh index 526fee0b4ef..1b399ba730a 100755 --- a/egs/wsj/s5/utils/data/subsegment_data_dir.sh +++ b/egs/wsj/s5/utils/data/subsegment_data_dir.sh @@ -222,8 +222,11 @@ fi if [ -f $srcdir/glm ]; then cp $srcdir/glm $dir fi +if [ -f $srcdir/stm ]; then + cp $srcdir/stm $dir +fi -for f in stm ctm; do +for f in ctm; do if [ -f $srcdir/$f ]; then echo "$0: not copying $srcdir/$f to $dir because sub-segmenting it is " echo " ... not implemented yet (and probably it's not needed.)" diff --git a/egs/wsj/s5/utils/validate_dict_dir.pl b/egs/wsj/s5/utils/validate_dict_dir.pl index 8f8534c329b..209f9fd40c1 100755 --- a/egs/wsj/s5/utils/validate_dict_dir.pl +++ b/egs/wsj/s5/utils/validate_dict_dir.pl @@ -35,7 +35,7 @@ sub get_utf8_or_bytestream { $is_utf_compatible = $is_utf_compatible && defined($decoded_text); push @unicode_lines, $decoded_text; } else { - #print STDERR "WARNING: the line $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; + #print STDERR "WARNING: the line($.) $raw_text cannot be interpreted as UTF-8: $decoded_text\n"; ; } push @raw_lines, $raw_text; diff --git a/scripts/rnnlm/lmrescore_nbest.sh b/scripts/rnnlm/lmrescore_nbest.sh index 58b19b9fa79..a22d43961ab 100755 --- a/scripts/rnnlm/lmrescore_nbest.sh +++ b/scripts/rnnlm/lmrescore_nbest.sh @@ -58,7 +58,7 @@ elif [ ! -f $oldlm ]; then exit 1; fi -for f in $rnndir/final.raw $data/feats.scp $indir/lat.1.gz; do +for f in $rnndir/final.raw $indir/lat.1.gz; do [ ! -f $f ] && echo "$0: expected file $f to exist." && exit 1; done @@ -174,6 +174,7 @@ if [ $stage -le 5 ]; then $adir.$n/lmwt.lmonly || exit 1; done fi + if [ $stage -le 6 ]; then echo "$0: invoking rnnlm/compute_sentence_scores.sh which calls rnnlm to get RNN LM scores." $cmd JOB=1:$nj $dir/log/rnnlm_compute_scores.JOB.log \ diff --git a/scripts/rnnlm/lmrescore_pruned.sh b/scripts/rnnlm/lmrescore_pruned.sh index 9ba78415708..b6ec694ffd4 100755 --- a/scripts/rnnlm/lmrescore_pruned.sh +++ b/scripts/rnnlm/lmrescore_pruned.sh @@ -26,7 +26,7 @@ normalize=false # If true, we add a normalization step to the output of the RNNL # as in our RNNLM setup, a properly trained network would automatically # have its normalization term close to 1. 
The details of this # could be found at http://www.danielpovey.com/files/2018_icassp_rnnlm.pdf -lattice_prune_beam=4 # Beam used in pruned lattice composition +lattice_prune_beam=8 # Beam used in pruned lattice composition # This option affects speed and how large the composed lattice may be # End configuration section. diff --git a/src/lat/compose-lattice-pruned.cc b/src/lat/compose-lattice-pruned.cc index 57a7432dca0..cc71db38eab 100644 --- a/src/lat/compose-lattice-pruned.cc +++ b/src/lat/compose-lattice-pruned.cc @@ -658,6 +658,7 @@ void PrunedCompactLatticeComposer::AddFirstState() { composed_state_queue_.push( std::pair<BaseFloat, int32>(expected_cost_offset, state_id)); // actually (0.0, 0). + }
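For reference, a typical invocation of rnnlm/lmrescore_pruned.sh in this recipe
(directories and weights below are illustrative, mirroring the calls made in
local/rnnlm/run_tdnn_lstm.sh) looks like:

  rnnlm/lmrescore_pruned.sh --cmd "$decode_cmd" --weight 0.5 \
    --max-ngram-order 4 --lattice-prune-beam 8 \
    data/lang_combined_test exp/rnnlm_lstm_1a data/dev_hires \
    exp/chain/tdnn1a_sp/decode_dev exp/chain/tdnn1a_sp/decode_dev_rnnlm_rescore

The five positional arguments are the old lang directory, the trained RNNLM
directory, the data directory, the first-pass decode directory, and the output
(rescored) decode directory.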