kaldi-asr · danpovey · Jun 6, 2019 · Oct 30, 2017 · Oct 31, 2017 · Oct 31, 2017
diff --git a/egs/material/README b/egs/material/README
diff --git a/egs/material/s5/README b/egs/material/s5/README
@@ -0,0 +1,35 @@
+About the MATERIAL corpus:
+
+The MATERIAL project:
+https://www.iarpa.gov/index.php/research-programs/material
+https://www.nist.gov/itl/iad/mig/openclir-evaluation
+
+The speech data in the MATERIAL corpus consist of four data sets for each
+language: train (BUILD), development (BUILD-dev), test (ANALYSIS1 and ANALYSIS2),
+and unlabeled evaluation audio (EVAL{1,2,3}). The train, development, test, and
+evaluation data contain around 40, 10, 20, and 250 hours of audio respectively.
+The train set is transcribed conversational audio that can be used for training
+an ASR system. It consists of some audio in 8-bit a-law .sph (Sphere) files and some
+in .wav files with 24-bit samples. The development set is transcribed
+conversational audio that can be used as development data for training to tune
+model parameters. The test data come in long unsegmented files. The reference
+transcripts for the test set are provided; hence, one can measure WER on the test
+set. The evaluation set is untranscribed audio that can be used for
+semi-supervised training of the acoustic model.
+Conversational speech data in the train and test sets are two-channel audio with
+the two channels temporally aligned. Each audio channel is provided and
+transcribed as a separate file, identified as inLine or outLine channel. Both
+audio channels are interleaved in a single file and there is a single
+interleaved transcript that reflects the temporal alignments. In addition to
+conversational speech, the test and evaluation sets also contain other
+genres of speech, namely news broadcast and topical broadcast, which are
+single channel files.
+
+
+Running the recipe:
+
+(In s5)
+./run.sh --language <swahili|tagalog|somali>
+./local/chain/run_tdnn.sh
+./local/chain/decode_test.sh --language <swahili|tagalog|somali>
+./local/rnnlm/run_tdnn_lstm.sh
diff --git a/egs/material/s5/RESULTS b/egs/material/s5/RESULTS
@@ -0,0 +1,51 @@
+WER results for supervised and semi-supervised acoustic model training
+
+Baseline: GMM training to create alignments and lattice-free MMI-trained neural
+network with factorized TDNN. The BUILD package labeled audio is used for
+supervised acoustic model training, the EVALs unlabeled audio is added for
+semi-supervised acoustic model training.
+
+Source-side bitext on the BUILD package and crawled monolingual data are used in
+building the n-gram LM, RNNLM re-scoring, as well as extending the baseline lexicon.
+
+
+Results for *supervised* acoustic model training:
+
+Swahili
+          Baseline +RNNLM +RNNLM-nbest
+BUILD-dev   36.8    36.7    38.9
+ANALYSIS1   42.5    41.3    41.4
+ANALYSIS2   38.1    36.8    36.9
+
+Tagalog
+          Baseline +RNNLM +RNNLM-nbest
+BUILD-dev   46.4    46.1    47.5
+ANALYSIS1   52.1    51.0    50.9
+ANALYSIS2   53.6    52.3    52.2
+
+Somali
+          Baseline +RNNLM +RNNLM-nbest
+BUILD-dev   57.4    56.5    57.8
+ANALYSIS1   61.6    57.8    57.7
+ANALYSIS2   59.3    55.5    55.3
+
+
+Results for *semi-supervised* acoustic model training:
+
+Swahili
+          Baseline +RNNLM +RNNLM-nbest
+BUILD-dev   35.3    35.1    36.7
+ANALYSIS1   35.2    34.5    34.7
+ANALYSIS2   30.8    30.0    30.1
+
+Tagalog
+          Baseline +RNNLM +RNNLM-nbest
+BUILD-dev   45.0    45.2    46.6
+ANALYSIS1   40.8    40.1    40.1
+ANALYSIS2   41.1    40.6    40.6
+
+Somali
+          Baseline +RNNLM +RNNLM-nbest
+BUILD-dev   56.8    56.3    57.7
+ANALYSIS1   50.6    48.8    48.6
+ANALYSIS2   49.8    48.2    48.2
diff --git a/egs/material/s5/cmd.sh b/egs/material/s5/cmd.sh
@@ -0,0 +1,14 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances of 'queue.pl' to run.pl (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
+# with slurm.  Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration.  Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+# Job launcher exported as train_cmd (presumably used by the training stages
+# of the recipe -- confirm in run.sh): queue.pl with a 2G memory request.
+export train_cmd="queue.pl --mem 2G"
+# Job launcher exported as decode_cmd: queue.pl with an 8G memory request,
+# wrapped in retry.pl with --num-tries 3 (presumably so that a failed job is
+# attempted up to three times -- confirm against utils/retry.pl).
+export decode_cmd="retry.pl --num-tries 3 queue.pl --mem 8G"
diff --git a/egs/material/s5/conf/decode.config b/egs/material/s5/conf/decode.config
@@ -0,0 +1 @@
+# empty config, just use the defaults.
diff --git a/egs/material/s5/conf/lang/somali.conf b/egs/material/s5/conf/lang/somali.conf
@@ -0,0 +1,26 @@
+# speech corpora files location
+# the user should replace the values with the ones that work for their location
+corpus=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/BUILD/
+# test audio files to decode
+audio_path_analysis1=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/ANALYSIS1/audio/
+audio_path_analysis2=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/ANALYSIS2/audio/
+audio_path_dev=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/DEV/audio/
+audio_path_eval1=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/EVAL1/audio/
+audio_path_eval2=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/EVAL2/audio/
+audio_path_eval3=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1S/EVAL3/audio/
+# bitext file location
+bitext=$corpus/bitext/MATERIAL_BASE-1S-BUILD_bitext.txt
+mono=/home/pkoehn/statmt/data/site-crawl/corpus/paracrawl-release3.2018-11-05.en-so.zipporah-20-dedup.lang-filtered.so
+mono2=/home/pkoehn/statmt/data/data.statmt.org/lm/so.filtered.tok.gz
+# number_mapping is a 2-column file consisting of the numbers written as digits (1st column) and letters (2nd column)
+number_mapping=/home/pkoehn/experiment/material-asr-so-en/scripts/somali_1_9999.txt
+# Acoustic model parameters
+numShorestUtts=40000
+numLeavesTri1=2000
+numGaussTri1=30000
+numLeavesTri2=3000
+numGaussTri2=60000
+numLeavesTri3=6000
+numGaussTri3=80000
+
+
diff --git a/egs/material/s5/conf/lang/swahili.conf b/egs/material/s5/conf/lang/swahili.conf
@@ -0,0 +1,26 @@
+# speech corpora files location
+# the user should replace the values with the ones that work for their location
+corpus=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A-BUILD_v1.0/
+# test audio files to decode
+audio_path_analysis1=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A/ANALYSIS1/audio/
+audio_path_analysis2=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A/ANALYSIS2/audio/
+audio_path_dev=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A/DEV/audio/
+audio_path_eval1=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A/EVAL1/audio/
+audio_path_eval2=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A/EVAL2/audio/
+audio_path_eval3=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1A/EVAL3/audio/
+# bitext file location
+bitext=$corpus/bitext/MATERIAL_BASE-1A-BUILD_bitext.txt
+mono=/home/pkoehn/statmt/data/site-crawl/mono-corpus/mono.2018-04-24.sw
+mono2=
+# number_mapping is a 2-column file consisting of the numbers written as digits (1st column) and letters (2nd column)
+number_mapping=/home/pkoehn/experiment/material-asr-so-en/scripts/swahili_1_9999.txt
+# Acoustic model parameters
+numShorestUtts=40000
+numLeavesTri1=2000
+numGaussTri1=30000
+numLeavesTri2=3000
+numGaussTri2=60000
+numLeavesTri3=6000
+numGaussTri3=80000
+
+
diff --git a/egs/material/s5/conf/lang/tagalog.conf b/egs/material/s5/conf/lang/tagalog.conf
@@ -0,0 +1,26 @@
+# speech corpora files location
+# the user should replace the values with the ones that work for their location
+# NOTE: corpus must be the BUILD package *directory* for Tagalog (BASE-1B),
+# because the bitext path below is resolved relative to it. The directory name
+# follows the layout used for the other languages -- confirm it on your system.
+corpus=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1B-BUILD_v1.0/
+# test audio files to decode
+audio_path_analysis1=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1B/ANALYSIS1/audio/
+audio_path_analysis2=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1B/ANALYSIS2/audio/
+audio_path_dev=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1B/DEV/audio/
+audio_path_eval1=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1B/EVAL1/audio/
+audio_path_eval2=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1B/EVAL2/audio/
+audio_path_eval3=/export/corpora5/MATERIAL/IARPA_MATERIAL_BASE-1B/EVAL3/audio/
+# bitext file location (inside the BUILD package given by $corpus)
+bitext=$corpus/bitext/MATERIAL_BASE-1B-BUILD_bitext.txt
+# monolingual text; mono2 is an optional second source and is left empty here
+mono=/home/pkoehn/statmt/data/site-crawl/mono-corpus/mono.2018-04-24.tl
+mono2=
+# number_mapping is a 2-column file consisting of the numbers written as digits (1st column) and letters (2nd column)
+# (no mapping file is configured for Tagalog, so the value is left empty)
+number_mapping=
+# Acoustic model parameters
+numShorestUtts=45000
+numLeavesTri1=4000
+numGaussTri1=60000
+numLeavesTri2=5000
+numGaussTri2=80000
+numLeavesTri3=7000
+numGaussTri3=100000
+
+
diff --git a/egs/material/s5/conf/mfcc.conf b/egs/material/s5/conf/mfcc.conf
@@ -0,0 +1,2 @@
+--use-energy=false
+--sample-frequency=8000 
diff --git a/egs/material/s5/conf/mfcc_hires.conf b/egs/material/s5/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
+# config for high-resolution MFCC features, intended for neural network training.
+# Note: we keep all cepstra, so it has the same info as filterbank features,
+# but MFCC is more easily compressible (because less correlated) which is why
+# we prefer this method.
+--use-energy=false   # use average of log energy, not energy.
+--sample-frequency=8000 # most of the files are 8kHz
+--num-mel-bins=40     # similar to Google's setup.
+--num-ceps=40     # there is no dimensionality reduction.
+--low-freq=40    # low cutoff frequency for mel bins
+--high-freq=-200 # high cutoff frequency, relative to Nyquist of 4000 (=3800)
diff --git a/egs/material/s5/conf/online_cmvn.conf b/egs/material/s5/conf/online_cmvn.conf
@@ -0,0 +1 @@
+# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
diff --git a/egs/material/s5/conf/plp.conf b/egs/material/s5/conf/plp.conf
@@ -0,0 +1 @@
+--sample-frequency=8000
diff --git a/egs/material/s5/local/audio2wav_scp.pl b/egs/material/s5/local/audio2wav_scp.pl
@@ -0,0 +1,55 @@
+#!/usr/bin/env perl
+#===============================================================================
+# Copyright 2017  (Author: Yenda Trmal <jtrmal@gmail.com>)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+use strict;
+use warnings;
+use utf8;
+
+
+# Resolve the two external converters through `which`.  An empty result (tool
+# not found) is false, so the script stops before reading any input.
+my $sox = `which sox`;
+die "The sox binary does not exist" unless $sox;
+chomp $sox;
+my $sph2pipe = `which sph2pipe`;
+die "The sph2pipe binary does not exist" unless $sph2pipe;
+chomp $sph2pipe;
+
+# Every STDIN line is the path of one audio file.  For each of them a line of
+# the form "<recording name> <conversion command>|" is written to STDOUT.
+while (my $full_path = <STDIN>) {
+  chomp $full_path;
+
+  # Strip the directory part, leaving only the file name.
+  my $basename = $full_path;
+  $basename =~ s/.*\///g;
+
+  # Only .wav and .sph inputs are accepted; anything else aborts the run.
+  $basename =~ /.*\.(wav|sph)$/
+    or die "The filename $basename does not match the expected naming pattern!";
+
+  # Split the file name into its extension and the part in front of it.
+  my $ext = $basename;
+  $ext =~ s/.*\.(wav|sph)$/$1/g;
+  my $name = $basename;
+  $name =~ s/(.*)\.(wav|sph)$/$1/g;
+
+  # A file name looks like this:
+  #   MATERIAL_BASE-1A-BUILD_10002_20131130_011225_inLine.sph
+  # The channel suffix is turned into 0 (inLine) or 1 (outLine) and the
+  # "_BASE" and "-BUILD" parts are removed.  Please note that the naming
+  # pattern must match the pattern in create_datafiles.pl
+  for ($name) {
+    s/inLine.*/0/g;
+    s/outLine.*/1/g;
+    s/_BASE//g;
+    s/-BUILD//g;
+  }
+
+  # Sphere files are piped through sph2pipe (first channel, wav output).
+  if ($ext ne "wav") {
+    print "$name $sph2pipe -f wav -p -c 1 $full_path|\n";
+    next;
+  }
+
+  # .wav files go through sox: 8000 Hz, one channel, 16-bit samples.
+  print "$name $sox $full_path -r 8000 -c 1 -b 16 -t wav - downsample|\n";
+}
+
+
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh