diff --git a/egs/tunisian_msa/s5/README b/egs/tunisian_msa/s5/README
new file mode 100644
index 00000000000..ae2aa2bc452
--- /dev/null
+++ b/egs/tunisian_msa/s5/README
@@ -0,0 +1,24 @@
+A Kaldi recipe for Arabic using the Tunisian_MSA corpus.
+
+Extra Requirements:
+This recipe uses the QCRI lexicon, which is in the Buckwalter encoding.
+In order to convert Buckwalter to UTF-8, the Encode::Arabic::Buckwalter Perl module is required.
+On Ubuntu, install the package libencode-arabic-perl.
+On Mac OS X, use cpanm (cpanminus) to install the Perl module.
+
+Description of the Tunisian_MSA Corpus
+The Tunisian_MSA corpus was originally collected to train acoustic models for pronunciation modeling in Arabic language learning applications.
+The data collection took place in 2003 at the Military Academy of Fondouk Jedied, near Tunis, the capital of the Republic of Tunisia.
+The Tunisian_MSA corpus is divided into recited and prompted speech subcorpora.
+The recited speech appears under the recordings directory and the prompted speech under the answers directory.
+Each of the 118 informants contributed to both subcorpora by reciting sentences and providing answers to prompted questions.
+The Tunisian_MSA corpus has 11.2 hours of speech.
+
+With the exception of speech from two speakers, the entire corpus was used for training.
+
+A small corpus was collected for testing.
+
+A pronunciation dictionary is also available from openslr.org.
+It covers all the words uttered in the Tunisian_MSA corpus and the test corpus.
+The QCRI lexicon was used as a starting point for writing this lexicon.
+The phones are the same as those used in the QCRI lexicon.
diff --git a/egs/tunisian_msa/s5/cmd.sh b/egs/tunisian_msa/s5/cmd.sh
new file mode 100644
index 00000000000..71dd849a93b
--- /dev/null
+++ b/egs/tunisian_msa/s5/cmd.sh
@@ -0,0 +1,15 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine). queue.pl works with GridEngine (qsub). slurm.pl works
+# with slurm. Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration. Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+export train_cmd="queue.pl --mem 2G"
+export decode_cmd="queue.pl --mem 4G"
+export mkgraph_cmd="queue.pl --mem 8G"
diff --git a/egs/tunisian_msa/s5/conf/mfcc.conf b/egs/tunisian_msa/s5/conf/mfcc.conf
new file mode 100644
index 00000000000..7361509099f
--- /dev/null
+++ b/egs/tunisian_msa/s5/conf/mfcc.conf
@@ -0,0 +1 @@
+--use-energy=false # only non-default option.
diff --git a/egs/tunisian_msa/s5/conf/mfcc_hires.conf b/egs/tunisian_msa/s5/conf/mfcc_hires.conf
new file mode 100644
index 00000000000..434834a6725
--- /dev/null
+++ b/egs/tunisian_msa/s5/conf/mfcc_hires.conf
@@ -0,0 +1,10 @@
+# config for high-resolution MFCC features, intended for neural network training
+# Note: we keep all cepstra, so it has the same info as filterbank features,
+# but MFCC is more easily compressible (because less correlated) which is why
+# we prefer this method.
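+# (40 cepstra over 40 mel bins below; the chain TDNN in
+# local/chain/tuning/run_tdnn_1a.sh declares its input as dim=40 to match.)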
+--use-energy=false # use average of log energy, not energy.
+--num-mel-bins=40 # similar to Google's setup.
+--num-ceps=40 # there is no dimensionality reduction.
+--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so
+ # there might be some information at the low end.
+--high-freq=-400 # high cutoff frequency, relative to Nyquist of 8000 (=7600)
diff --git a/egs/tunisian_msa/s5/conf/online_cmvn.conf b/egs/tunisian_msa/s5/conf/online_cmvn.conf
new file mode 100644
index 00000000000..7748a4a4dd3
--- /dev/null
+++ b/egs/tunisian_msa/s5/conf/online_cmvn.conf
@@ -0,0 +1 @@
+# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
diff --git a/egs/tunisian_msa/s5/conf/pitch.conf b/egs/tunisian_msa/s5/conf/pitch.conf
new file mode 100644
index 00000000000..e959a19d5b8
--- /dev/null
+++ b/egs/tunisian_msa/s5/conf/pitch.conf
@@ -0,0 +1 @@
+--sample-frequency=16000
diff --git a/egs/tunisian_msa/s5/conf/plp.conf b/egs/tunisian_msa/s5/conf/plp.conf
new file mode 100644
index 00000000000..e959a19d5b8
--- /dev/null
+++ b/egs/tunisian_msa/s5/conf/plp.conf
@@ -0,0 +1 @@
+--sample-frequency=16000
diff --git a/egs/tunisian_msa/s5/local/answers_make_lists.pl b/egs/tunisian_msa/s5/local/answers_make_lists.pl
new file mode 100755
index 00000000000..55ee5751d9b
--- /dev/null
+++ b/egs/tunisian_msa/s5/local/answers_make_lists.pl
@@ -0,0 +1,77 @@
+#!/usr/bin/env perl
+
+# Copyright 2018 John Morgan
+# Apache 2.0.
+
+# answers_make_lists.pl - make acoustic model training lists
+
+use strict;
+use warnings;
+use Carp;
+
+use File::Spec;
+use File::Copy;
+use File::Basename;
+
+my $tmpdir = 'data/local/tmp/tunis';
+
+system "mkdir -p $tmpdir/answers";
+
+# input wav file list
+my $wav_list = "$tmpdir/answers_wav.txt";
+
+# output temporary wav.scp files
+my $wav_scp = "$tmpdir/answers/wav.scp";
+
+# output temporary utt2spk files
+my $u = "$tmpdir/answers/utt2spk";
+
+# output temporary text files
+my $t = "$tmpdir/answers/text";
+
+# initialize hash for prompts
+my %prompt = ();
+
+# store prompts in hash
+LINEA: while ( my $line = <> ) {
+    chomp $line;
+    my ($num,$sent) = split /\t/sxm, $line, 2;
+
+    my ($machine,$s,$mode,$language,$i) = split /\_/sxm, $num;
+    # the utterance name
+    my $utt = $machine . '_' . $s . '_' . 'a' . '_' . $i;
+    $prompt{$utt} = $sent;
+}
+
+# Write wav.scp, utt2spk and text files.
+open my $W, '<', $wav_list or croak "problem with $wav_list $!";
+open my $O, '+>', $wav_scp or croak "problem with $wav_scp $!";
+open my $U, '+>', $u or croak "problem with $u";
+open my $T, '+>', $t or croak "problem with $t";
+
+LINE: while ( my $line = <$W> ) {
+    chomp $line;
+    next LINE if ( $line !~ /Answers/sxm );
+    next LINE if ( $line =~ /Recordings/sxm );
+    my ($volume,$directories,$file) = File::Spec->splitpath( $line );
+    my @dirs = split /\//sxm, $directories;
+    my $r = basename $line, '.wav';
+    my $machine = $dirs[-3];
+    my $s = $dirs[-1];
+    my $rid = $machine . '_' . $s . '_' . 'a' . '_' . 
$r; + if ( exists $prompt{$rid} ) { + print ${T} "$rid\t$prompt{$rid}\n" or croak; + } elsif ( defined $rid ) { + print STDERR "problem\t$rid" or croak; + next LINE; + } else { + croak "$line"; + } + + print ${O} "$rid sox $line -t wav - |\n" or croak; + print ${U} "$rid ${machine}_${s}_a\n" or croak; +} +close $U or croak; +close $T or croak; +close $W or croak; +close $O or croak; diff --git a/egs/tunisian_msa/s5/local/buckwalter2unicode.py b/egs/tunisian_msa/s5/local/buckwalter2unicode.py new file mode 100755 index 00000000000..94fec3225dd --- /dev/null +++ b/egs/tunisian_msa/s5/local/buckwalter2unicode.py @@ -0,0 +1,453 @@ +#!/usr/bin/python + +# buckwalter2unicode.py - A script to convert transliterated Arabic +# (using the Buckwalter system) to Unicode. +# +# Version 0.2 - 15th September 2004 +# +# Andrew Roberts (andyr [at] comp (dot) leeds [dot] ac (dot) uk) +# +# Project homepage: http://www.comp.leeds.ac.uk/andyr/software/ +# +# Now, listen carefully... +# +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# + +import sys, getopt, codecs, os, re + +# Declare a dictionary with Buckwalter's ASCII symbols as the keys, and +# their unicode equivalents as values. + +buck2uni = {"'": u"\u0621", # hamza-on-the-line + "|": u"\u0622", # madda + ">": u"\u0623", # hamza-on-'alif + "&": u"\u0624", # hamza-on-waaw + "<": u"\u0625", # hamza-under-'alif + "}": u"\u0626", # hamza-on-yaa' + "A": u"\u0627", # bare 'alif + "b": u"\u0628", # baa' + "p": u"\u0629", # taa' marbuuTa + "t": u"\u062A", # taa' + "v": u"\u062B", # thaa' + "j": u"\u062C", # jiim + "H": u"\u062D", # Haa' + "x": u"\u062E", # khaa' + "d": u"\u062F", # daal + "*": u"\u0630", # dhaal + "r": u"\u0631", # raa' + "z": u"\u0632", # zaay + "s": u"\u0633", # siin + "$": u"\u0634", # shiin + "S": u"\u0635", # Saad + "D": u"\u0636", # Daad + "T": u"\u0637", # Taa' + "Z": u"\u0638", # Zaa' (DHaa') + "E": u"\u0639", # cayn + "g": u"\u063A", # ghayn + "_": u"\u0640", # taTwiil + "f": u"\u0641", # faa' + "q": u"\u0642", # qaaf + "k": u"\u0643", # kaaf + "l": u"\u0644", # laam + "m": u"\u0645", # miim + "n": u"\u0646", # nuun + "h": u"\u0647", # haa' + "w": u"\u0648", # waaw + "Y": u"\u0649", # 'alif maqSuura + "y": u"\u064A", # yaa' + "F": u"\u064B", # fatHatayn + "N": u"\u064C", # Dammatayn + "K": u"\u064D", # kasratayn + "a": u"\u064E", # fatHa + "u": u"\u064F", # Damma + "i": u"\u0650", # kasra + "~": u"\u0651", # shaddah + "o": u"\u0652", # sukuun + "`": u"\u0670", # dagger 'alif + "{": u"\u0671", # waSla +} + +# For a reverse transliteration (Unicode -> Buckwalter), a dictionary +# which is the reverse of the above buck2uni is essential. + +uni2buck = {} + +# Iterate through all the items in the buck2uni dict. +for (key, value) in buck2uni.iteritems(): + # The value from buck2uni becomes a key in uni2buck, and vice + # versa for the keys. 
+ uni2buck[value] = key + +# Declare some global variables... + + +inFilename = "" # Name of filename containing input. +outFilename = "" # Name of filename to send the output +inEnc = "" # The text encoding of the input file +outEnc = "" # The text encoding for the output file +ignoreChars = "" # If lines begin with these symbols, ignore. +columnRange = "" # Holds columns numbers to transliterate. +delimiter = "" # Holds user-defined column delimiter. +reverse = 0 # When equal to 1, perform reverse transliteration, i.e., + # Unicode -> Buckwalter. + +# A function to print to screen the usage details of this script. + +def usage(): + print "Usage:", sys.argv[0], "-i INFILE -o OUTFILE [-g CHARS -c RANGE -d CHAR" + print " -r -e INPUT_ENCODING, -E OUTPUT ENCODING]" + print " ", sys.argv[0], "-l" + print " ", sys.argv[0], "-h" + print "" + print " -i INFILE, --input=INFILE:" + print " Path to text file to be transliterated to Unicode." + print " -o OUTFILE, --output=OUTFILE:" + print " Path of file to output the newly transliterated text." + print " -e ENC, --input-encoding=ENC:" + print " Specify the text encoding of the source file. Default: latin_1." + print " -E ENC, --output-encoding=ENC:" + print " Specify the text encoding of the target file. Default: utf_8." + print " -g CHARS, --ignore-lines=CHARS:" + print " Will not transliterate lines that start with any of the CHARS" + print " given. E.g., -g #; will not alter lines starting with # or ;." + print " (May need to be -g \#\; on some platforms. See README.txt.)" + print " -c RANGE, --columns=RANGE:" + print " If in columns, select columns to apply transliteration. Can be" + print " comma separated numbers, or a range. E.g., -c 1, -c 1-3, -c 1,3." + print " -d CHAR, --delimiter=CHAR:" + print " Specify the delimiter that defines the column if using the -c" + print " option above. Default is ' ' (space)." + print " -r, --reverse:" + print " Reverses the transliteration, i.e., Arabic to Buckwalter." + print " When used, it will change the default input encoding to utf_8 and" + print " output encoding to latin_1" + print " -l, --list-encodings:" + print " Displays all supported file encodings." + print " -h, --help:" + print " Displays this page." + print "" + +# A function to print to screen all the available encodings supported by +# Python. 
+
+def displayEncodings():
+    print "Codec Aliases Languages"
+    print "ascii 646, us-ascii English"
+    print "cp037 IBM037, IBM039 English"
+    print "cp424 EBCDIC-CP-HE, IBM424 Hebrew"
+    print "cp437 437, IBM437 English"
+    print "cp500 EBCDIC-CP-BE, EBCDIC-CP-CH, IBM500 Western Europe"
+    print "cp737 Greek"
+    print "cp775 IBM775 Baltic languages"
+    print "cp850 850, IBM850 Western Europe"
+    print "cp852 852, IBM852 Central and Eastern Europe"
+    print "cp855 855, IBM855 Bulgarian, Byelorussian, Macedonian, Russian, Serbian"
+    print "cp856 Hebrew"
+    print "cp857 857, IBM857 Turkish"
+    print "cp860 860, IBM860 Portuguese"
+    print "cp861 861, CP-IS, IBM861 Icelandic"
+    print "cp862 862, IBM862 Hebrew"
+    print "cp863 863, IBM863 Canadian"
+    print "cp864 IBM864 Arabic"
+    print "cp865 865, IBM865 Danish, Norwegian"
+    print "cp869 869, CP-GR, IBM869 Greek"
+    print "cp874 Thai"
+    print "cp875 Greek"
+    print "cp1006 Urdu"
+    print "cp1026 ibm1026 Turkish"
+    print "cp1140 ibm1140 Western Europe"
+    print "cp1250 windows-1250 Central and Eastern Europe"
+    print "cp1251 windows-1251 Bulgarian, Byelorussian, Macedonian, Russian, Serbian"
+    print "cp1252 windows-1252 Western Europe"
+    print "cp1253 windows-1253 Greek"
+    print "cp1254 windows-1254 Turkish"
+    print "cp1255 windows-1255 Hebrew"
+    print "cp1256 windows-1256 Arabic"
+    print "cp1257 windows-1257 Baltic languages"
+    print "cp1258 windows-1258 Vietnamese"
+    print "latin_1 iso-8859-1, iso8859-1, 8859, cp819, latin, latin1, L1 West Europe"
+    print "iso8859_2 iso-8859-2, latin2, L2 Central and Eastern Europe"
+    print "iso8859_3 iso-8859-3, latin3, L3 Esperanto, Maltese"
+    print "iso8859_4 iso-8859-4, latin4, L4 Baltic languages"
+    print "iso8859_5 iso-8859-5, cyrillic Bulgarian, Byelorussian, Macedonian, Russian, Serbian"
+    print "iso8859_6 iso-8859-6, arabic Arabic"
+    print "iso8859_7 iso-8859-7, greek, greek8 Greek"
+    print "iso8859_8 iso-8859-8, hebrew Hebrew"
+    print "iso8859_9 iso-8859-9, latin5, L5 Turkish"
+    print "iso8859_10 iso-8859-10, latin6, L6 Nordic languages"
+    print "iso8859_13 iso-8859-13 Baltic languages"
+    print "iso8859_14 iso-8859-14, latin8, L8 Celtic languages"
+    print "iso8859_15 iso-8859-15 Western Europe"
+    print "koi8_r Russian"
+    print "koi8_u Ukrainian"
+    print "mac_cyrillic maccyrillic Bulgarian, Byelorussian, Macedonian, Russian, Serbian"
+    print "mac_greek macgreek Greek"
+    print "mac_iceland maciceland Icelandic"
+    print "mac_latin2 maclatin2, maccentraleurope Central and Eastern Europe"
+    print "mac_roman macroman Western Europe"
+    print "mac_turkish macturkish Turkish"
+    print "utf_16 U16, utf16 all languages"
+    print "utf_16_be UTF-16BE all languages (BMP only)"
+    print "utf_16_le UTF-16LE all languages (BMP only)"
+    print "utf_7 U7 all languages"
+    print "utf_8 U8, UTF, utf8 all languages"
+
+def parseIgnoreString(string):
+
+    symbols = []
+
+    for char in string:
+        symbols.append(char)
+
+    return symbols
+
+# Begin parsing the command-line arguments...
+
+try:
+    (options, args) = getopt.getopt(sys.argv[1:], "i:o:e:E:g:c:d:rlh",
+        ["input=", "output=", "input-encoding=", "output-encoding=",
+        "ignore-lines=", "columns=", "delimiter=", "reverse", "list-encodings",
+        "help"])
+
+except getopt.GetoptError:
+    # print help information and exit:
+    usage()
+    sys.exit(1)
+
+# Loop over all arguments supplied by the user.
+for (x, y) in options:
+    if x in ("-h", "--help"):
+        usage()
+        sys.exit(0)
+
+    if x in ("-l", "--list-encodings"):
+        displayEncodings()
+        sys.exit(0)
+
+    if x in ("-i", "--input"): inFilename = y
+    if x in ("-o", "--output"): outFilename = y
+    if x in ("-e", "--input-encoding"): inEnc = y
+    if x in ("-E", "--output-encoding"): outEnc = y
+    if x in ("-r", "--reverse"): reverse = 1
+    if x in ("-g", "--ignore-lines"): ignoreChars = y
+    if x in ("-c", "--columns"): columnRange = y
+    if x in ("-d", "--delimiter"):
+        delimiter = y
+        # A tab typed on the command line arrives as the two characters
+        # "\\t", not as a real tab, from python's point of view.
+        # Therefore replace any such "tabs" with proper tabs before
+        # proceeding.
+        delimiter = delimiter.replace("\\t", "\t")
+        # Do some error checking
+        if len(delimiter) > 1:
+            print >>sys.stderr, "Delimiter should only be a single character. Using first character " + delimiter[0]
+            delimiter = delimiter[0]
+
+        if buck2uni.get(delimiter):
+            print >> sys.stderr, "Invalid delimiter. \"" + delimiter + "\" is part of the Buckwalter character set."
+            print >> sys.stderr, "This will obviously cause much confusion as a delimiter!"
+            print >> sys.stderr, "Please try again. Aborting..."
+            sys.exit(1)
+
+# If no delimiter was set, then set the default to " " (space)
+if not delimiter:
+    delimiter = " "
+
+# If the user didn't specify the encoding of the input file, then revert to
+# defaults. The defaults vary depending on the direction of
+# transliteration:
+#
+# Buckwalter -> Unicode, default = latin_1
+# Unicode -> Buckwalter, default = utf_8
+
+if not inEnc:
+    if reverse:
+        inEnc = "utf_8"
+    else:
+        inEnc = "latin_1"
+
+# Similarly, if the user didn't specify the encoding of the output file,
+# then revert to defaults. The defaults vary depending on the direction
+# of transliteration:
+#
+# Buckwalter -> Unicode, default = utf_8
+# Unicode -> Buckwalter, default = latin_1
+
+if not outEnc:
+    if reverse:
+        outEnc = "latin_1"
+    else:
+        outEnc = "utf_8"
+
+# Ok, let's get the files open!
+
+# Providing a file for output was specified...
+if outFilename:
+    try:
+        # Create a file object, set it to "write" mode using the
+        # specified output encoding.
+        outFile = codecs.open(outFilename, "w", outEnc)
+
+    except IOError, msg:
+        # A problem occurred when trying to open this file. Report to
+        # user...
+        print msg
+        sys.exit(1)
+
+# Script cannot work without somewhere to store the transliteration.
+# Exit.
+else:
+    print "Must specify a file to store the output! Aborting..."
+    sys.exit(1)
+
+# Providing a file for input was specified...
+if inFilename:
+    try:
+        # Create a file object, set it to "read" mode using the
+        # specified input encoding.
+        inFile = codecs.open(inFilename, "r", inEnc)
+
+    except IOError, msg:
+        # A problem occurred when trying to open this file. Report to
+        # user...
+        print msg
+        sys.exit(1)
+
+# This script requires a file to read from. Exit.
+else:
+    print "Must specify a file to use as input! Aborting..."
+    sys.exit(1)
+
+def getColsFromRange(cRange):
+
+    columns = []
+    hyphenSearch = re.compile(r'-')
+
+    rangeElements = cRange.split(",")
+
+    for i in rangeElements:
+        # If it contains a hyphen (e.g., 1-3)
+        if hyphenSearch.search(i):
+            [start, end] = i.split("-")
+            columns = columns + range(int(start)-1, int(end))
+        else:
+            columns.append(int(i)-1)
+
+    return columns
+
+# This function transliterates a given string. It checks the direction
+# of the transliteration and then uses the appropriate dictionary. A
+# transliterated string is returned.
A +# transliterated string is returned. + +def transliterate(inString, lineNumber): + out = "" + + if columnRange: + columns = getColsFromRange(columnRange) + + # Split the line on the delimiter + lineCols = inString.split(delimiter) + + # Iterate over each column. If it's one of the ones in the range + # specified, then transliterate, otherwise just output column + # unchanged. + + for i in range(len(lineCols)): + + # If first column, then don't prefix the delimiter + if i == 0: + if i in columns: + out = transliterateString(lineCols[i]) + else : + out = lineCols[i] + else : + if i in columns: + out = out + delimiter + transliterateString(lineCols[i]) + else : + out = out + delimiter + lineCols[i] + + else: + out = transliterateString(inString) + + + + return out + +def transliterateString(inString): + + out = "" + + # For normal Buckwalter -> Unicode transliteration.. + if not reverse: + + # Loop over each character in the string, inString. + for char in inString: + # Look up current char in the dictionary to get its + # respective value. If there is no match, e.g., chars like + # spaces, then just stick with the current char without any + # conversion. + out = out + buck2uni.get(char, char) + + # Same as above, just in the other direction. + else: + + for char in inString: + out = out + uni2buck.get(char, char) + + return out + +#while 1: +# line = inFile.readline().strip() +# line = line.decode(inEnc) +# if not line: +# break + + # process string +# outFile.write(transliterate(line) + os.linesep) + +# Read in the lines of the input file. +lines = inFile.readlines() + +currentLineNumber = 1 +# Loop over each line +for line in lines: + line = line.strip() + try: + # Transliterate the current line, and then write the output to + # file. + + if not ignoreChars: + outFile.write(transliterate(line, currentLineNumber) + " " + os.linesep) + else: + if line[0] in parseIgnoreString(ignoreChars): + outFile.write(line + " " + os.linesep) + else: + outFile.write(transliterate(line, currentLineNumber) + " " + os.linesep) + + currentLineNumber = currentLineNumber + 1 + + except UnicodeError, msg: + # A problem when writing occurred. Report to user... + print msg + sys.exit(1) + +# All done! Better close the files used before terminating... +inFile.close() +outFile.close() + +# ... and relax! :) diff --git a/egs/tunisian_msa/s5/local/buckwalter2utf8.pl b/egs/tunisian_msa/s5/local/buckwalter2utf8.pl new file mode 100755 index 00000000000..c952e554f86 --- /dev/null +++ b/egs/tunisian_msa/s5/local/buckwalter2utf8.pl @@ -0,0 +1,11 @@ +#!/usr/bin/env perl +# Input buckwalter encoded Arabic and print it out as utf-8 encoded Arabic. +use strict; +use warnings; +use Carp; + +use Encode::Arabic::Buckwalter; # imports just like 'use Encode' would, plus more + +while ( my $line = <>) { + print encode 'utf8', decode 'buckwalter', $line; +} diff --git a/egs/tunisian_msa/s5/local/chain/compare_wer.sh b/egs/tunisian_msa/s5/local/chain/compare_wer.sh new file mode 100755 index 00000000000..c6a3a91ea69 --- /dev/null +++ b/egs/tunisian_msa/s5/local/chain/compare_wer.sh @@ -0,0 +1,133 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... 
]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. +# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +test_sets=(devtest test) + +for t in ${test_sets[@]}; do + printf '# %%WER % 14s ' $t + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_$t/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_$t/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_$t/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
fi
+
+
+echo -n "# Final train prob "
+for x in $*; do
+  prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}')
+  printf "% 10s" $prob
+done
+echo
+
+echo -n "# Final valid prob "
+for x in $*; do
+  prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}')
+  printf "% 10s" $prob
+done
+echo
+
+echo -n "# Final train prob (xent)"
+for x in $*; do
+  prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}')
+  printf "% 10s" $prob
+done
+echo
+
+echo -n "# Final valid prob (xent)"
+for x in $*; do
+  prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}')
+  printf "% 10s" $prob
+done
+echo
+
+echo -n "# Num-params "
+for x in $*; do
+  printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}')
+done
+echo
diff --git a/egs/tunisian_msa/s5/local/chain/run_tdnn.sh b/egs/tunisian_msa/s5/local/chain/run_tdnn.sh
new file mode 120000
index 00000000000..34499362831
--- /dev/null
+++ b/egs/tunisian_msa/s5/local/chain/run_tdnn.sh
@@ -0,0 +1 @@
+tuning/run_tdnn_1a.sh
\ No newline at end of file
diff --git a/egs/tunisian_msa/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/tunisian_msa/s5/local/chain/tuning/run_tdnn_1a.sh
new file mode 100755
index 00000000000..d3c4a4ef11f
--- /dev/null
+++ b/egs/tunisian_msa/s5/local/chain/tuning/run_tdnn_1a.sh
@@ -0,0 +1,292 @@
+#!/bin/bash
+
+# Uses a resnet-style factored TDNN-F model.
+
+# ./local/chain/compare_wer.sh exp/chain/tdnn1a_sp
+# System tdnn1a_sp
+# %WER devtest 39.25
+# %WER test 49.74
+# Final train prob -0.0473
+# Final valid prob -0.0538
+# Final train prob (xent) -1.0935
+# Final valid prob (xent) -1.0817
+# Num-params 3466448
+
+# First the options that are passed through to run_ivector_common.sh
+# (some of which are also used in this script directly).
+stage=0
+decode_nj=10
+train_set=train
+test_sets="devtest test"
+gmm=tri3b
+nnet3_affix=
+
+# The rest are configs specific to this script. Most of the parameters
+# are just hardcoded at this level, in the commands below.
+affix=1a # affix for the TDNN directory name
+tree_affix=
+train_stage=22
+get_egs_stage=-10
+decode_iter=
+
+num_leaves=3500
+
+# training options
+# training chunk-options
+chunk_width=140,100,160
+# we don't need extra left/right context for TDNN systems.
+dropout_schedule='0,0@0.20,0.3@0.50,0'
+common_egs_dir=
+xent_regularize=0.1
+
+# training options
+srand=0
+remove_egs=true
+reporting_email=
+
+#decode options
+test_online_decoding=true # if true, it will run the last decoding stage.
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA.
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+# The iVector-extraction and feature-dumping parts are the same as the
+# standard nnet3 setup; you can skip them by setting "--stage 11" if you
+# have already run those things.
+local/nnet3/run_ivector_common.sh --stage $stage \
+  --train-set $train_set \
+  --gmm $gmm \
+  --nnet3-affix "$nnet3_affix" || exit 1;
+
+gmm_dir=exp/$gmm
+ali_dir=exp/${gmm}_ali_${train_set}_sp
+tree_dir=exp/chain${nnet3_affix}/tree_sp${tree_affix:+_$tree_affix}
+lang=data/lang_chain
+lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats
+dir=exp/chain${nnet3_affix}/tdnn${affix}_sp
+train_data_dir=data/${train_set}_sp_hires
+lores_train_data_dir=data/${train_set}_sp
+train_ivector_dir=exp/nnet3${nnet3_affix}/ivectors_${train_set}_sp_hires
+
+for f in $gmm_dir/final.mdl $train_data_dir/feats.scp \
+    $train_ivector_dir/ivector_online.scp $lores_train_data_dir/feats.scp; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
+done
+
+if [ $stage -le 10 ]; then
+  echo "$0: creating lang directory $lang with chain-type topology"
+  # Create a version of the lang/ directory with one state per phone in the
+  # topo file.
+  if [ -d $lang ]; then
+    if [ $lang/L.fst -nt data/lang/L.fst ]; then
+      echo "$0: $lang already exists, not overwriting it; continuing"
+    else
+      echo "$0: $lang already exists and seems to be older than data/lang; exiting."
+      exit 1;
+    fi
+  else
+    cp -r data/lang $lang
+    silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+    nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+    # Use our special topology; note that later on we may have to tune it.
+    steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+  fi
+fi
+
+if [ $stage -le 11 ]; then
+  # Get the alignments as lattices (gives the chain training more freedom).
+  # use the same num-jobs as the alignments
+  steps/align_fmllr_lats.sh --nj 20 --cmd "$train_cmd" ${lores_train_data_dir} \
+    data/lang $gmm_dir $lat_dir
+  rm $lat_dir/fsts.*.gz # save space
+fi
+
+if [ $stage -le 12 ]; then
+  # Build a tree using our new topology. We know we have alignments for the
+  # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use
+  # those. The num-leaves is always somewhat less than the num-leaves from
+  # the GMM baseline.
+  if [ -f $tree_dir/final.mdl ]; then
+    echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
+    exit 1;
+  fi
+  steps/nnet3/chain/build_tree.sh \
+    --cmd "$train_cmd" \
+    --frame-subsampling-factor 3 \
+    --context-opts "--context-width=2 --central-position=1" \
+    $num_leaves \
+    ${lores_train_data_dir} \
+    $lang $ali_dir $tree_dir
+fi
+
+
+if [ $stage -le 13 ]; then
+  mkdir -p $dir
+  echo "$0: creating neural net configs using the xconfig parser";
+
+  num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}')
+  learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python)
+  affine_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true"
+  tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66"
+  linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0"
+  prefinal_opts="l2-regularize=0.03"
+  output_opts="l2-regularize=0.015"
+
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=100 name=ivector
+  input dim=40 name=input
+
+  # please note that it is important to have input layer with the name=input
+  # as the layer immediately preceding the fixed-affine-layer to enable
+  # the use of short notation for the descriptor
+  fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat
+
+  # the first splicing is moved before the lda layer, so no splicing here
+  relu-batchnorm-dropout-layer name=tdnn1 $affine_opts dim=768
+  tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=1
+  tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=1
+  tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=1
+  tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=0
+  tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3
+  tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3
+  tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3
+  tdnnf-layer name=tdnnf9 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3
+  tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3
+  tdnnf-layer name=tdnnf11 $tdnnf_opts dim=768 bottleneck-dim=64 time-stride=3
+  linear-component name=prefinal-l dim=192 $linear_opts
+
+  ## adding the layers for chain branch
+  prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=768 small-dim=192
+  output-layer name=output include-log-softmax=false dim=$num_targets $output_opts
+
+  # adding the layers for xent branch
+  prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=768 small-dim=192
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+
+if [ $stage -le 14 ]; then
+  steps/nnet3/chain/train.py \
+    --stage=$train_stage \
+    --cmd="$decode_cmd" \
+    --feat.online-ivector-dir=$train_ivector_dir \
+    --feat.cmvn-opts="--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient=0.1 \
+    --chain.l2-regularize=0.0 \
+    --chain.apply-deriv-weights=false \
+    --chain.lm-opts="--num-extra-lm-states=2000" \
+    --trainer.dropout-schedule $dropout_schedule \
+    --trainer.add-option="--optimization.memory-compression-level=2" \
+    --trainer.srand=$srand \
+    --trainer.max-param-change=2.0 \
+    --trainer.num-epochs=8 \
+    --trainer.frames-per-iter=3000000 \
+    --trainer.optimization.num-jobs-initial=2 \
+    --trainer.optimization.num-jobs-final=5 \
+    --trainer.optimization.initial-effective-lrate=0.001 \
+    --trainer.optimization.final-effective-lrate=0.0001 \
+    --trainer.num-chunk-per-minibatch=128,64 \
+    --egs.chunk-width=$chunk_width \
+    --egs.dir="$common_egs_dir" \
+    --egs.opts="--frames-overlap-per-eg 0" \
+    --cleanup.remove-egs=$remove_egs \
+    --use-gpu=true \
+    --reporting.email="$reporting_email" \
+    --feat-dir=$train_data_dir \
+    --tree-dir=$tree_dir \
+    --lat-dir=$lat_dir \
+    --dir=$dir || exit 1;
+fi
+
+if [ $stage -le 15 ]; then
+  # Note: it's not important to give mkgraph.sh the lang directory with the
+  # matched topology (since it gets the topology file from the model).
+  utils/mkgraph.sh \
+    --self-loop-scale 1.0 \
+    data/lang_test \
+    $tree_dir \
+    $tree_dir/graph || exit 1;
+fi
+
+if [ $stage -le 16 ]; then
+  frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
+  rm $dir/.error 2>/dev/null || true
+
+  for data in $test_sets; do
+    (
+      nspk=$(wc -l <data/${data}_hires/spk2utt)
+      steps/nnet3/decode.sh \
+        --acwt 1.0 --post-decode-acwt 10.0 \
+        --frames-per-chunk $frames_per_chunk \
+        --nj $nspk --cmd "$decode_cmd" \
+        --online-ivector-dir exp/nnet3${nnet3_affix}/ivectors_${data}_hires \
+        $tree_dir/graph data/${data}_hires ${dir}/decode_${data} || exit 1
+    ) || touch $dir/.error &
+  done
+  wait
+  [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
+fi
+
+if $test_online_decoding && [ $stage -le 17 ]; then
+  # note: if the features change (e.g. you add pitch features), you will
+  # have to change the options of the following command line.
+  steps/online/nnet3/prepare_online_decoding.sh \
+    --mfcc-config conf/mfcc_hires.conf \
+    $lang exp/nnet3${nnet3_affix}/extractor ${dir} ${dir}_online
+
+  rm $dir/.error 2>/dev/null || true
+
+  for data in $test_sets; do
+    (
+      nspk=$(wc -l <data/${data}_hires/spk2utt)
+      steps/online/nnet3/decode.sh \
+        --acwt 1.0 --post-decode-acwt 10.0 \
+        --nj $nspk --cmd "$decode_cmd" \
+        $tree_dir/graph data/${data}_hires ${dir}_online/decode_${data} || exit 1
+    ) || touch $dir/.error &
+  done
+  wait
+  [ -f $dir/.error ] && echo "$0: there was a problem while decoding" && exit 1
+fi
+
+exit 0;
diff --git a/egs/tunisian_msa/s5/local/devtest_recordings_make_lists.pl b/egs/tunisian_msa/s5/local/devtest_recordings_make_lists.pl
new file mode 100755
--- /dev/null
+++ b/egs/tunisian_msa/s5/local/devtest_recordings_make_lists.pl
+#!/usr/bin/env perl
+
+# Copyright 2018 John Morgan
+# Apache 2.0.
+
+# devtest_recordings_make_lists.pl - make lists for devtest recordings
+
+use strict;
+use warnings;
+use Carp;
+
+use File::Spec;
+use File::Copy;
+use File::Basename;
+
+BEGIN {
+    @ARGV == 3 or croak "USAGE $0 <transcripts> <speaker> <language>
+example:
+$0 Tunisian_MSA/data/transcripts/devtest/recordings.tsv 6 tunisia
+";
+}
+
+my ($tr,$spk,$l) = @ARGV;
+
+open my $I, '<', $tr or croak "problems with $tr";
+
+my $tmp_dir = "data/local/tmp/$l/$spk";
+
+# input wav file list
+my $wav_list = "$tmp_dir/wav.txt";
+croak "$!" unless ( -f $wav_list );
+# output temporary wav.scp files
+my $wav_scp = "$tmp_dir/wav.scp";
+
+# output temporary utt2spk files
+my $u = "$tmp_dir/utt2spk";
+
+# output temporary text files
+my $t = "$tmp_dir/text";
+
+# initialize hash for prompts
+my %p = ();
+
+# store prompts in hash
+LINEA: while ( my $line = <$I> ) {
+    chomp $line;
+    my ($s,$sent) = split /\t/, $line, 2;
+    $p{$s} = $sent;
+}
+
+open my $W, '<', $wav_list or croak "problem with $wav_list $!";
+open my $O, '+>', $wav_scp or croak "problem with $wav_scp $!";
+open my $U, '+>', $u or croak "problem with $u $!";
+open my $T, '+>', $t or croak "problem with $t $!";
+
+LINE: while ( my $line = <$W> ) {
+    chomp $line;
+    next LINE if ($line =~ /answers/ );
+    next LINE unless ( $line =~ /Recordings/ );
+    my ($volume,$directories,$file) = File::Spec->splitpath( $line );
+    my @dirs = split /\//, $directories;
+    my $b = basename $line, ".wav";
+    my $s = $dirs[-1];
+    my $rid = $s . '_' . 'recording' . '_' . $b;
+    my $uid = $s . '_' . 'recording';
+    if ( exists $p{$b} ) {
+        print $T "$rid\t$p{$b}\n";
+    } elsif ( defined $s ) {
+        warn "problem\t$s";
+        next LINE;
+    } else {
+        croak "$line";
+    }
+
+    print $O "$rid sox $line -t wav - |\n";
+    print $U "$rid\t$uid\n";
+}
+close $T;
+close $O;
+close $U;
+close $W;
diff --git a/egs/tunisian_msa/s5/local/nnet3/run_ivector_common.sh b/egs/tunisian_msa/s5/local/nnet3/run_ivector_common.sh
new file mode 100755
index 00000000000..e8ff9a150ea
--- /dev/null
+++ b/egs/tunisian_msa/s5/local/nnet3/run_ivector_common.sh
@@ -0,0 +1,185 @@
+#!/bin/bash
+
+set -euo pipefail
+
+# This script is called from local/nnet3/run_tdnn.sh and
+# local/chain/run_tdnn.sh (and may eventually be called by more
+# scripts). It contains the common feature preparation and
+# iVector-related parts of the script. See those scripts for examples
+# of usage.
+
+stage=0
+train_set=train
+test_sets="devtest test"
+gmm=tri3b
+
+nnet3_affix=
+
+. ./cmd.sh
+. ./path.sh
+. utils/parse_options.sh
+
+gmm_dir=exp/${gmm}
+ali_dir=exp/${gmm}_ali_${train_set}_sp
+
+for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do
+  if [ ! 
-f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +if [ $stage -le 1 ]; then + # perturb data to get alignments + # nnet will be trained by high resolution data + # _sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh \ + data/${train_set} \ + data/${train_set}_sp + echo "$0: making mfcc features for low-resolution speed-perturbed data" + steps/make_mfcc.sh \ + --cmd "$train_cmd" \ + --nj 10 \ + data/${train_set}_sp + steps/compute_cmvn_stats.sh \ + data/${train_set}_sp + utils/fix_data_dir.sh \ + data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh \ + --nj 20 \ + --cmd "$train_cmd" \ + data/${train_set}_sp \ + data/lang \ + $gmm_dir \ + $ali_dir +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + + echo "$0: creating high-resolution MFCC features" + mfccdir=data/${train_set}_sp_hires/data + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh \ + data/$datadir \ + data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh \ + data/${train_set}_sp_hires + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh \ + --nj 10 \ + --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" \ + data/${datadir}_hires + steps/compute_cmvn_stats.sh \ + data/${datadir}_hires + utils/fix_data_dir.sh \ + data/${datadir}_hires + done +fi + +if [ $stage -le 4 ]; then + echo "$0: computing a subset of data to train the diagonal UBM." + # We'll use about a quarter of the data. + mkdir -p exp/nnet3${nnet3_affix}/diag_ubm + temp_data_root=exp/nnet3${nnet3_affix}/diag_ubm + + num_utts_total=$(wc -l $tmp_tunis/$s/wav.txt + + local/devtest_recordings_make_lists.pl \ + $data_dir/transcripts/devtest/recordings.tsv $s tunis + + mkdir -p data/devtest + + for x in wav.scp utt2spk text; do + cat $tmp_tunis/$s/$x | tr " " " " >> data/devtest/$x + done +done + +utils/utt2spk_to_spk2utt.pl data/devtest/utt2spk | sort > data/devtest/spk2utt + +utils/fix_data_dir.sh data/devtest + +# training data consists of 2 parts: answers and recordings (recited) +answers_transcripts=$data_dir/transcripts/train/answers.tsv +recordings_transcripts=$data_dir/transcripts/train/recordings.tsv + +# location of test data +cls_rec_tr=$libyan_src/cls/data/transcripts/recordings/cls_recordings.tsv +lfi_rec_tr=$libyan_src/lfi/data/transcripts/recordings/lfi_recordings.tsv +srj_rec_tr=$libyan_src/srj/data/transcripts/recordings/srj_recordings.tsv +mbt_rec_tr=$data_dir/transcripts/test/mbt/recordings/mbt_recordings.tsv + +# make acoustic model training lists +mkdir -p $tmp_tunis + +# get wav file names + +# for recited speech +# the data collection laptops had names like CTELLONE CTELLTWO ... 
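+# Under each machine directory, recited speech is stored in Recordings
+# and prompted answers in Answers; the greps below separate the two.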
+for machine in CTELLONE CTELLTWO CTELLTHREE CTELLFOUR CTELLFIVE; do
+  find $data_dir/speech/train/$machine -type f -name "*.wav" | grep Recordings \
+    >> $tmp_tunis/recordings_wav.txt
+done
+
+# get file names for Answers
+for machine in CTELLONE CTELLTWO CTELLTHREE CTELLFOUR CTELLFIVE; do
+  find $data_dir/speech/train/$machine -type f \
+    -name "*.wav" \
+    | grep Answers >> $tmp_tunis/answers_wav.txt
+done
+
+# make separate transcription lists for answers and recordings
+export LC_ALL=en_US.UTF-8
+local/answers_make_lists.pl $answers_transcripts
+
+utils/fix_data_dir.sh $tmp_tunis/answers
+
+local/recordings_make_lists.pl $recordings_transcripts
+
+utils/fix_data_dir.sh $tmp_tunis/recordings
+
+# consolidate lists
+# acoustic models will be trained on both recited and prompted speech
+mkdir -p $tmp_tunis/lists
+
+for x in wav.scp utt2spk text; do
+  cat $tmp_tunis/answers/$x $tmp_tunis/recordings/$x > $tmp_tunis/lists/$x
+done
+
+utils/fix_data_dir.sh $tmp_tunis/lists
+
+# get training lists
+mkdir -p data/train
+for x in wav.scp utt2spk text; do
+  sort $tmp_tunis/lists/$x | tr " " " " > data/train/$x
+done
+
+utils/utt2spk_to_spk2utt.pl data/train/utt2spk | sort > data/train/spk2utt
+
+utils/fix_data_dir.sh data/train
+
+# process the Libyan MSA data
+mkdir -p $tmp_libyan
+
+for s in cls lfi srj; do
+  mkdir -p $tmp_libyan/$s
+
+  # get list of wav files
+  find $libyan_src/$s -type f \
+    -name "*.wav" \
+    | grep recordings > $tmp_libyan/$s/recordings_wav.txt
+
+  echo "$0: making recordings list for $s"
+  local/test_recordings_make_lists.pl \
+    $libyan_src/$s/data/transcripts/recordings/${s}_recordings.tsv $s libyan
+done
+
+# process the Tunisian MSA test data
+
+mkdir -p $tmp_tunis/mbt
+
+# get list of wav files
+find $data_dir/speech/test/mbt -type f \
+  -name "*.wav" \
+  | grep recordings > $tmp_tunis/mbt/recordings_wav.txt
+
+echo "$0: making recordings list for mbt"
+local/test_recordings_make_lists.pl \
+  $data_dir/transcripts/test/mbt/recordings/mbt_recordings.tsv mbt tunis
+
+mkdir -p data/test
+# get the Libyan files
+for s in cls lfi srj; do
+  for x in wav.scp utt2spk text; do
+    cat $tmp_libyan/$s/recordings/$x | tr " " " " >> data/test/$x
+  done
+done
+
+for x in wav.scp utt2spk text; do
+  cat $tmp_tunis/mbt/recordings/$x | tr " " " " >> data/test/$x
+done
+
+utils/utt2spk_to_spk2utt.pl data/test/utt2spk | sort > data/test/spk2utt
+
+utils/fix_data_dir.sh data/test
diff --git a/egs/tunisian_msa/s5/local/prepare_dict.sh b/egs/tunisian_msa/s5/local/prepare_dict.sh
new file mode 100755
index 00000000000..f7d1ac3a619
--- /dev/null
+++ b/egs/tunisian_msa/s5/local/prepare_dict.sh
@@ -0,0 +1,43 @@
+#!/bin/bash -u
+
+# Copyright 2018 John Morgan
+# Apache 2.0.
+
+set -o errexit
+
+[ -f ./path.sh ] && . ./path.sh
+
+if [ ! -d data/local/dict ]; then
+  mkdir -p data/local/dict
+fi
+
+l=$1
+export LC_ALL=C
+
+cut -f2- -d " " $l | tr -s '[:space:]' '[\n*]' | grep -v SPN | \
+  sort -u | tail -n+2 > data/local/dict/nonsilence_phones.txt
+
+expand -t 1 $l | sort -u | \
+  sed "1d" > data/local/dict/lexicon.txt
+
+echo "<UNK> SPN" >> data/local/dict/lexicon.txt
+
+# silence phones, one per line.
+{
+  echo SIL;
+  echo SPN;
+} \
+  > \
+  data/local/dict/silence_phones.txt
+
+echo SIL > data/local/dict/optional_silence.txt
+
+# get the phone list from the lexicon file
+(
+  tr '\n' ' ' < data/local/dict/silence_phones.txt;
+  echo;
+  tr '\n' ' ' < data/local/dict/nonsilence_phones.txt;
+  echo;
+) >data/local/dict/extra_questions.txt
+
+echo "$0: Finished dictionary preparation."
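+# data/local/dict now has the layout utils/prepare_lang.sh expects:
+#   lexicon.txt            word followed by its pronunciation(s)
+#   nonsilence_phones.txt  one phone per line
+#   silence_phones.txt     SIL and SPN
+#   optional_silence.txt   SIL
+#   extra_questions.txt    the silence and nonsilence phone sets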
diff --git a/egs/tunisian_msa/s5/local/prepare_lm.sh b/egs/tunisian_msa/s5/local/prepare_lm.sh
new file mode 100755
index 00000000000..4fc50b84d11
--- /dev/null
+++ b/egs/tunisian_msa/s5/local/prepare_lm.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+# Copyright 2018 John Morgan
+# Apache 2.0.
+
+. ./cmd.sh
+set -e
+. ./path.sh
+. $KALDI_ROOT/tools/env.sh
+stage=0
+nsegs=1000000; # limit the number of training segments
+
+. ./utils/parse_options.sh
+
+if [ ! -d data/local/lm ]; then
+  mkdir -p data/local/lm
+fi
+
+corpus=$1
+
+if [ ! -f $corpus ]; then
+  echo "$0: input data $corpus not found."
+  exit 1
+fi
+
+perl -MList::Util=shuffle -e 'print shuffle(<STDIN>);' < $corpus | \
+  head -n $nsegs > data/local/lm/train.txt
+
+if ! command -v ngram-count >/dev/null; then
+  if uname -a | grep darwin >/dev/null; then # For MACOSX...
+    sdir=$KALDI_ROOT/tools/srilm/bin/macosx
+  elif uname -a | grep 64 >/dev/null; then # some kind of 64 bit...
+    sdir=$KALDI_ROOT/tools/srilm/bin/i686-m64
+  else
+    sdir=$KALDI_ROOT/tools/srilm/bin/i686
+  fi
+  if [ -f $sdir/ngram-count ]; then
+    echo Using SRILM tools from $sdir
+    export PATH=$PATH:$sdir
+  else
+    echo You appear to not have SRILM tools installed, either on your path,
+    echo or installed in $sdir. See tools/install_srilm.sh for installation
+    echo instructions.
+    exit 1
+  fi
+fi
+
+
+ngram-count -order 3 -interpolate -unk -map-unk "<UNK>" \
+  -limit-vocab -text data/local/lm/train.txt -lm data/local/lm/trigram.arpa || exit 1;
+
+gzip -f data/local/lm/trigram.arpa
diff --git a/egs/tunisian_msa/s5/local/qcri_buckwalter2utf8.pl b/egs/tunisian_msa/s5/local/qcri_buckwalter2utf8.pl
new file mode 100755
index 00000000000..9074d4807c2
--- /dev/null
+++ b/egs/tunisian_msa/s5/local/qcri_buckwalter2utf8.pl
@@ -0,0 +1,21 @@
+#!/usr/bin/env perl
+# qcri_buckwalter2utf8.pl - convert the qcri dictionary to utf8
+
+use strict;
+use warnings;
+use Carp;
+
+use Encode::Arabic::Buckwalter; # imports just like 'use Encode' would, plus more
+
+my $bw_dict = "qcri.txt";
+
+open my $B, '<', $bw_dict or croak "Problem with $bw_dict $!";
+
+LINE: while ( my $line = <$B> ) {
+    chomp $line;
+    next LINE if ( $line =~ /^\#/ );
+    my ($w,$p) = split / /, $line, 2;
+    print encode 'utf8', decode 'buckwalter', $w;
+    print " $p\n";
+}
+
diff --git a/egs/tunisian_msa/s5/local/qcri_buckwalter2utf8.sh b/egs/tunisian_msa/s5/local/qcri_buckwalter2utf8.sh
new file mode 100755
index 00000000000..b8433967e14
--- /dev/null
+++ b/egs/tunisian_msa/s5/local/qcri_buckwalter2utf8.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+# write separate files for word and pronunciation fields
+cut -d " " -f 1 qcri.txt > qcri_words_buckwalter.txt
+cut -d " " -f 2- qcri.txt > qcri_prons.txt
+
+# convert words to utf8
+local/buckwalter2unicode.py -i qcri_words_buckwalter.txt -o qcri_words_utf8.txt
+
+paste qcri_words_utf8.txt qcri_prons.txt
+
+rm qcri_words_buckwalter.txt qcri_words_utf8.txt qcri_prons.txt
diff --git a/egs/tunisian_msa/s5/local/qcri_lexicon_download.sh b/egs/tunisian_msa/s5/local/qcri_lexicon_download.sh
new file mode 100755
index 00000000000..29a9ca1eed6
--- /dev/null
+++ b/egs/tunisian_msa/s5/local/qcri_lexicon_download.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+# Copyright 2018 John Morgan
+# Apache 2.0.
+
+# configuration variables
+lex=$1
+tmpdir=data/local/tmp
+# where to put the downloaded lexicon
+downloaddir=$(pwd)
+# Where to put the uncompressed file
+datadir=$(pwd)
+# end of configuration variable settings
+
+# download the lexicon
+if [ ! 
-f $downloaddir/qcri.txt.bz2 ]; then + wget -O $downloaddir/qcri.txt.bz2 $lex + ( + cd $downloaddir + bzcat qcri.txt.bz2 | tail -n+4 > $datadir/qcri.txt + ) +else + echo "$0: The corpus $lex was already downloaded." +fi diff --git a/egs/tunisian_msa/s5/local/recordings_make_lists.pl b/egs/tunisian_msa/s5/local/recordings_make_lists.pl new file mode 100755 index 00000000000..41fc15e0dd3 --- /dev/null +++ b/egs/tunisian_msa/s5/local/recordings_make_lists.pl @@ -0,0 +1,72 @@ +#!/usr/bin/env perl + +# Copyright 2018 John Morgan +# Apache 2.0. + +# recordings_make_lists.pl - make acoustic model training lists + +use strict; +use warnings; +use Carp; + +use File::Spec; +use File::Copy; +use File::Basename; + +my $tmpdir = "data/local/tmp/tunis"; + +system "mkdir -p $tmpdir/recordings"; + +# input wav file list +my $w = "$tmpdir/recordings_wav.txt"; + +# output temporary wav.scp files +my $o = "$tmpdir/recordings/wav.scp"; + +# output temporary utt2spk files +my $u = "$tmpdir/recordings/utt2spk"; + +# output temporary text files +my $t = "$tmpdir/recordings/text"; + +# initialize hash for prompts +my %p = (); + +# store prompts in hash +LINEA: while ( my $line = <> ) { + chomp $line; + my ($s,$sent) = split /\t/, $line, 2; + $p{$s} = $sent; +} + +open my $W, '<', $w or croak "problem with $w $!"; +open my $O, '+>', $o or croak "problem with $o $!"; +open my $U, '+>', $u or croak "problem with $u $!"; +open my $T, '+>', $t or croak "problem with $t $!"; + + LINE: while ( my $line = <$W> ) { + chomp $line; + next LINE if ($line =~ /Answers/ ); + next LINE unless ( $line =~ /Recordings/ ); + my ($volume,$directories,$file) = File::Spec->splitpath( $line ); + my @dirs = split /\//, $directories; + my $machine = $dirs[-3]; + my $r = basename $line, ".wav"; + my $s = $dirs[-1]; + my $rid = $machine . '_' . $s . '_r_' . $r; + if ( exists $p{$r} ) { + print $T "$rid\t$p{$r}\n"; + } elsif ( defined $rid ) { + warn "problem\t$rid"; + next LINE; + } else { + croak "$line"; + } + + print $O "$rid sox $line -t wav - |\n"; + print $U "$rid\t${machine}_${s}_r\n"; +} +close $T; +close $O; +close $U; +close $W; diff --git a/egs/tunisian_msa/s5/local/score.sh b/egs/tunisian_msa/s5/local/score.sh new file mode 120000 index 00000000000..0afefc3158c --- /dev/null +++ b/egs/tunisian_msa/s5/local/score.sh @@ -0,0 +1 @@ +../steps/score_kaldi.sh \ No newline at end of file diff --git a/egs/tunisian_msa/s5/local/subs_download.sh b/egs/tunisian_msa/s5/local/subs_download.sh new file mode 100755 index 00000000000..7e46fd255aa --- /dev/null +++ b/egs/tunisian_msa/s5/local/subs_download.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +# Copyright 2018 John Morgan +# Apache 2.0. + +# Begin configuration +subs_src=$1 +tmpdir=data/local/tmp +download_dir=$(pwd) +datadir=$(pwd) +# End configuration + +# download the subs corpus +if [ ! -f $download_dir/subs.txt.gz ]; then + wget -O $download_dir/subs.txt.gz $subs_src +else + echo "$0: The corpus $subs_src was already downloaded." +fi + +if [ ! -f $datadir/subs.txt ]; then + ( + cd $datadir + zcat < ./subs.txt.gz > subs.txt + ) + else + echo "$0: subs file already extracted." +fi diff --git a/egs/tunisian_msa/s5/local/subs_prepare_data.pl b/egs/tunisian_msa/s5/local/subs_prepare_data.pl new file mode 100755 index 00000000000..e39f77a25cb --- /dev/null +++ b/egs/tunisian_msa/s5/local/subs_prepare_data.pl @@ -0,0 +1,115 @@ +#!/usr/bin/env perl + +# Copyright 2018 John Morgan +# Apache 2.0. 
+
+# subs_prepare_data.pl - condition subs data for lm training
+
+use strict;
+use warnings;
+use Carp;
+
+use Encode;
+
+# set lower and upper bounds
+my $low_bound = 8;
+# only segments with at least $low_bound words will be written
+my $up_bound = 16;
+# only segments with fewer than $up_bound words will be written
+
+# input and output files
+my $corp = "subs.txt";
+my $symtab = "data/lang/words.txt";
+my $conditioned = "data/local/tmp/subs/lm/ar.txt";
+my $oo = "data/local/tmp/subs/lm/oovs.txt";
+my $iv = "data/local/tmp/subs/lm/in_vocabulary.txt";
+
+open my $CORP, '<', $corp or croak "problems with $corp $!";
+system "mkdir -p data/local/tmp/subs/lm";
+open my $COND, '+>:utf8', $conditioned or croak "problems with $conditioned $!";
+
+if ( -s $conditioned ) {
+    croak "$conditioned already exists.";
+} else {
+    LINE: while ( my $line = <$CORP> ) {
+        $line = decode_utf8 $line;
+        chomp $line;
+
+        my @tokens = split /\s+/, $line;
+
+        next LINE if ( ($#tokens < $low_bound) or ($#tokens > $up_bound ));
+
+        # remove punctuation
+        $line =~ s/(\p{Punctuation}+|\p{Dash_Punctuation}+|\p{Close_Punctuation}+|\p{Open_Punctuation}+|\p{Initial_Punctuation}+|\p{Final_Punctuation}+|\p{Connector_Punctuation}+|\p{Other_Punctuation}+|[ ]+)/ /msxg;
+        # convert tabs to white space
+        $line =~ s/\t/ /g;
+        # hard (non-breaking) space to soft space
+        $line =~ s/\x{00A0}/ /g;
+        # squeeze white space
+        $line =~ s/\s+/ /g;
+        # initial and final white space
+        $line =~ s/^\p{Separator}+//;
+        $line =~ s/\p{Separator}+$//;
+        # down case
+        $line = lc $line;
+
+        print $COND "$line\n";
+    }
+}
+close $CORP;
+close $COND;
+
+# find out of vocabulary words
+# $symtab points to a file containing a map of symbols to integers
+
+# hash for word to integer map
+my %sym2int = ();
+
+open my $F, '<', $symtab or croak "problem with $symtab $!";
+
+# store words to int map in hash
+while( my $line = <$F>) {
+    chomp $line;
+    my ($s,$i) = split /\s/, $line, 2;
+    $sym2int{$s} = $i;
+}
+close $F;
+
+open my $I, '<', $conditioned or croak "problem with $conditioned $!";
+open my $OO, '+>', $oo or croak "problems with $oo $!";
+
+while ( my $line = <$I>) {
+    chomp $line;
+    my @A = split /\s/, $line;
+    foreach my $a (@A) {
+        if (!defined ($sym2int{$a})) {
+            print $OO "$a\n";
+        }
+    }
+}
+close $OO;
+close $I;
+
+# remove segments with OOVs
+
+# store OOVS in hash
+my %oov = ();
+open my $V, '<', $oo or croak "problems with $oo $!";
+while ( my $line = <$V> ) {
+    chomp $line;
+    $oov{$line} = 1;
+}
+close $V;
+
+open my $L, '<', $conditioned or croak "problems with $conditioned $!";
+open my $IV, '+>', $iv or croak "problems with $iv $!";
+
+SEGMENT: while ( my $segment = <$L> ) {
+    chomp $segment;
+    my @words = split /\s+/, $segment;
+    foreach my $word ( sort @words ) {
+        next SEGMENT if ( $oov{$word} );
+    }
+    print $IV "$segment\n";
+}
+close $IV;
+close $L;
diff --git a/egs/tunisian_msa/s5/local/tamsa_download.sh b/egs/tunisian_msa/s5/local/tamsa_download.sh
new file mode 100755
index 00000000000..5e4666482ab
--- /dev/null
+++ b/egs/tunisian_msa/s5/local/tamsa_download.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# Copyright 2018 John Morgan
+# Apache 2.0.
+
+speech=$1
+
+# where to put the downloaded speech corpus
+download_dir=$(pwd)
+data_dir=$download_dir/Tunisian_MSA/data
+
+# download the corpus from openslr
+if [ ! -f $download_dir/tamsa.tar.gz ]; then
+  wget -O $download_dir/tamsa.tar.gz $speech
+else
+  echo "$0: The corpus $speech was already downloaded."
+fi
+
+if [ ! -d $download_dir/Tunisian_MSA ]; then
+  (
+    cd $download_dir
+    tar -xzf tamsa.tar.gz
+  )
+else
+  echo "$0: The corpus was already unzipped."
+fi
diff --git a/egs/tunisian_msa/s5/local/test_answers_make_lists.pl b/egs/tunisian_msa/s5/local/test_answers_make_lists.pl
new file mode 100755
index 00000000000..aa7d0e314f3
--- /dev/null
+++ b/egs/tunisian_msa/s5/local/test_answers_make_lists.pl
@@ -0,0 +1,83 @@
+#!/usr/bin/env perl
+
+# Copyright 2018 John Morgan
+# Apache 2.0.
+
+# test_answers_make_lists.pl - make acoustic model test lists
+
+use strict;
+use warnings;
+use Carp;
+
+use File::Spec;
+use File::Copy;
+use File::Basename;
+
+BEGIN {
+    @ARGV == 3 or croak "USAGE $0 <transcripts> <speaker> <language>
+example:
+$0 /home/zak76/Desktop/Kaldi/kaldi-master/tunisian_msa-master/Libyan_collected_test/TEST/Libyan_MSA/adel/data/transcripts/answers/adel_answers.tsv adel libyan
+";
+}
+
+my ($tr,$spk,$l) = @ARGV;
+
+open my $I, '<', $tr or croak "problems with $tr";
+
+my $tmp_dir = "data/local/tmp/$l/$spk";
+
+system "mkdir -p $tmp_dir/answers";
+
+# input wav file list
+my $w = "$tmp_dir/answers_wav.txt";
+
+# output temporary wav.scp files
+my $o = "$tmp_dir/answers/wav.scp";
+
+# output temporary utt2spk files
+my $u = "$tmp_dir/answers/utt2spk";
+
+# output temporary text files
+my $t = "$tmp_dir/answers/text";
+
+# initialize hash for prompts
+my %p = ();
+
+# store prompts in hash
+LINEA: while ( my $line = <$I> ) {
+    chomp $line;
+    my ($s,$sent) = split /\t/, $line, 2;
+    $p{$s} = $sent;
+}
+
+open my $W, '<', $w or croak "problem with $w $!";
+open my $O, '+>', $o or croak "problem with $o $!";
+open my $U, '+>', $u or croak "problem with $u $!";
+open my $T, '+>', $t or croak "problem with $t $!";
+
+LINE: while ( my $line = <$W> ) {
+    chomp $line;
+    next LINE if ($line =~ /recordings/ );
+    next LINE unless ( $line =~ /answers/ );
+    my ($volume,$directories,$file) = File::Spec->splitpath( $line );
+    my @dirs = split /\//, $directories;
+    my $b = basename $line, ".wav";
+    my ($sk,$r) = split /\_/, $b, 2;
+    my $s = $dirs[-1];
+    my $rid = $sk . '_' . $r;
+    if ( exists $p{$b} ) {
+        print $T "$rid\t$p{$b}\n";
+    } elsif ( defined $rid ) {
+        warn "problem\t$rid";
+        next LINE;
+    } else {
+        croak "$line";
+    }
+
+    print $O "$rid sox $line -t wav - |\n";
+    print $U "$rid\t${sk}_a\n";
+}
+close $T;
+close $O;
+close $U;
+close $W;
diff --git a/egs/tunisian_msa/s5/local/test_recordings_make_lists.pl b/egs/tunisian_msa/s5/local/test_recordings_make_lists.pl
new file mode 100755
index 00000000000..0b1323f2738
--- /dev/null
+++ b/egs/tunisian_msa/s5/local/test_recordings_make_lists.pl
@@ -0,0 +1,83 @@
+#!/usr/bin/env perl
+
+# Copyright 2018 John Morgan
+# Apache 2.0.
+
+# test_recordings_make_lists.pl - make acoustic model test lists
+
+use strict;
+use warnings;
+use Carp;
+
+use File::Spec;
+use File::Copy;
+use File::Basename;
+
+BEGIN {
+    @ARGV == 3 or croak "USAGE $0 <transcripts> <speaker> <language>
+example:
+$0 /mnt/disk01/Libyan_MSA/srj/data/transcripts/recordings/srj_recordings.tsv srj libyan
+";
+}
+
+my ($tr,$spk,$l) = @ARGV;
+
+open my $I, '<', $tr or croak "problems with $tr";
+
+my $tmp_dir = "data/local/tmp/$l/$spk";
+
+system "mkdir -p $tmp_dir/recordings";
+
+# input wav file list
+my $w = "$tmp_dir/recordings_wav.txt";
+
+# output temporary wav.scp files
+my $o = "$tmp_dir/recordings/wav.scp";
+
+# output temporary utt2spk files
+my $u = "$tmp_dir/recordings/utt2spk";
+
+# output temporary text files
+my $t = "$tmp_dir/recordings/text";
+
+# initialize hash for prompts
+my %p = ();
+
+# store prompts in hash
+LINEA: while ( my $line = <$I> ) {
+    chomp $line;
+    my ($s,$sent) = split /\t/, $line, 2;
+    $p{$s} = $sent;
+}
+
+open my $W, '<', $w or croak "problem with $w $!";
+open my $O, '+>', $o or croak "problem with $o $!";
+open my $U, '+>', $u or croak "problem with $u $!";
+open my $T, '+>', $t or croak "problem with $t $!";
+
+LINE: while ( my $line = <$W> ) {
+    chomp $line;
+    next LINE if ($line =~ /answers/ );
+    next LINE unless ( $line =~ /recordings/ );
+    my ($volume,$directories,$file) = File::Spec->splitpath( $line );
+    my @dirs = split /\//, $directories;
+    my $b = basename $line, ".wav";
+    my ($sk,$r) = split /\_/, $b, 2;
+    my $s = $dirs[-1];
+    my $rid = $sk . '_' . $r;
+    if ( exists $p{$b} ) {
+        print $T "$rid\t$p{$b}\n";
+    } elsif ( defined $rid ) {
+        warn "problem\t$rid";
+        next LINE;
+    } else {
+        croak "$line";
+    }
+
+    print $O "$rid sox $line -t wav - |\n";
+    print $U "$rid\t${sk}_r\n";
+}
+close $T;
+close $O;
+close $U;
+close $W;
diff --git a/egs/tunisian_msa/s5/path.sh b/egs/tunisian_msa/s5/path.sh
new file mode 100644
index 00000000000..705600ad47a
--- /dev/null
+++ b/egs/tunisian_msa/s5/path.sh
@@ -0,0 +1,8 @@
+export KALDI_ROOT=`pwd`/../../..
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+export LC_ALL=C
+
+# For now, don't include any of the optional dependencies of the main
+# librispeech recipe
diff --git a/egs/tunisian_msa/s5/run.sh b/egs/tunisian_msa/s5/run.sh
new file mode 100755
index 00000000000..107acdf271c
--- /dev/null
+++ b/egs/tunisian_msa/s5/run.sh
@@ -0,0 +1,190 @@
+#!/bin/bash
+
+# Trains on 11 hours of speech from CTELL{ONE,TWO,THREE,FOUR,FIVE}.
+# Uses the QCRI vowelized Arabic lexicon.
+# Converts the Buckwalter encoding to utf8.
+. ./cmd.sh
+. ./path.sh
+stage=0
+
+. ./utils/parse_options.sh
+
+set -e
+set -o pipefail
+set -u
+
+# Do not change tmpdir, other scripts under local depend on it
+tmpdir=data/local/tmp
+
+# The speech corpus is on openslr.org
+speech="http://www.openslr.org/resources/46/Tunisian_MSA.tar.gz"
+
+# We use the QCRI lexicon.
+lex="http://alt.qcri.org/resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2"
+
+# We train the lm on subtitles.
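+# The source is the Arabic monolingual OpenSubtitles2018 corpus from OPUS.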
+subs_src="http://opus.nlpl.eu/download.php?f=OpenSubtitles2018/mono/OpenSubtitles2018.ar.gz" + +if [ $stage -le 1 ]; then + # Downloads archive to this script's directory + local/tamsa_download.sh $speech + + local/qcri_lexicon_download.sh $lex + + local/subs_download.sh $subs_src +fi + +# preparation stages will store files under data/ +# Delete the entire data directory when restarting. +if [ $stage -le 2 ]; then + local/prepare_data.sh +fi + +if [ $stage -le 3 ]; then + mkdir -p $tmpdir/dict + local/qcri_buckwalter2utf8.sh > $tmpdir/dict/qcri_utf8.txt +fi + +if [ $stage -le 4 ]; then + local/prepare_dict.sh $tmpdir/dict/qcri_utf8.txt +fi + +if [ $stage -le 5 ]; then + # prepare the lang directory + utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang +fi + +if [ $stage -le 6 ]; then + echo "Preparing the subs data for lm training." + local/subs_prepare_data.pl +fi + +if [ $stage -le 7 ]; then + echo "lm training." + local/prepare_lm.sh $tmpdir/subs/lm/in_vocabulary.txt +fi + +if [ $stage -le 8 ]; then + echo "Making grammar fst." + utils/format_lm.sh \ + data/lang data/local/lm/trigram.arpa.gz data/local/dict/lexicon.txt \ + data/lang_test +fi + +if [ $stage -le 9 ]; then + # extract acoustic features + for fld in devtest train test; do + steps/make_mfcc.sh data/$fld exp/make_mfcc/$fld mfcc + utils/fix_data_dir.sh data/$fld + steps/compute_cmvn_stats.sh data/$fld exp/make_mfcc mfcc + utils/fix_data_dir.sh data/$fld + done +fi + +if [ $stage -le 10 ]; then + echo "$0: monophone training" + steps/train_mono.sh data/train data/lang exp/mono +fi + +if [ $stage -le 11 ]; then + # monophone evaluation + ( + # make decoding graph for monophones + utils/mkgraph.sh data/lang_test exp/mono exp/mono/graph + + # test monophones + for x in devtest test; do + nspk=$(wc -l < data/$x/spk2utt) + steps/decode.sh --nj $nspk exp/mono/graph data/$x exp/mono/decode_${x} + done + ) & +fi + +if [ $stage -le 12 ]; then + # align with monophones + steps/align_si.sh data/train data/lang exp/mono exp/mono_ali +fi + +if [ $stage -le 13 ]; then + echo "$0: Starting triphone training in exp/tri1" + steps/train_deltas.sh \ + --boost-silence 1.25 1000 6000 data/train data/lang exp/mono_ali exp/tri1 +fi + +wait + +if [ $stage -le 14 ]; then + # test cd gmm hmm models + # make decoding graphs for tri1 + ( + utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph + + # decode test data with tri1 models + for x in devtest test; do + nspk=$(wc -l < data/$x/spk2utt) + steps/decode.sh --nj $nspk exp/tri1/graph data/$x exp/tri1/decode_${x} + done + ) & +fi + +if [ $stage -le 15 ]; then + # align with triphones + steps/align_si.sh data/train data/lang exp/tri1 exp/tri1_ali +fi + +if [ $stage -le 16 ]; then + echo "$0: Starting (lda_mllt) triphone training in exp/tri2b" + steps/train_lda_mllt.sh \ + --splice-opts "--left-context=3 --right-context=3" 500 5000 \ + data/train data/lang exp/tri1_ali exp/tri2b +fi + +wait + +if [ $stage -le 17 ]; then + ( + # make decoding FSTs for tri2b models + utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph + + # decode test with tri2b models + for x in devtest test; do + nspk=$(wc -l < data/$x/spk2utt) + steps/decode.sh --nj $nspk exp/tri2b/graph data/$x exp/tri2b/decode_${x} + done + ) & +fi + +if [ $stage -le 18 ]; then + # align with lda and mllt adapted triphones + steps/align_si.sh \ + --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali +fi + +if [ $stage -le 19 ]; then + echo "$0: Starting (SAT) triphone training in exp/tri3b" + steps/train_sat.sh 800 
+
+if [ $stage -le 19 ]; then
+  echo "$0: Starting (SAT) triphone training in exp/tri3b"
+  steps/train_sat.sh --cmd "$train_cmd" \
+    800 8000 data/train data/lang exp/tri2b_ali exp/tri3b
+fi
+
+if [ $stage -le 20 ]; then
+  (
+    # make decoding graphs for SAT models
+    utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph
+
+    # decode test sets with tri3b models
+    for x in devtest test; do
+      nspk=$(wc -l < data/$x/spk2utt)
+      steps/decode_fmllr.sh --cmd "$decode_cmd" --nj $nspk exp/tri3b/graph data/$x exp/tri3b/decode_${x}
+    done
+  ) &
+fi
+
+if [ $stage -le 21 ]; then
+  # align with tri3b models
+  echo "$0: Starting exp/tri3b_ali"
+  steps/align_fmllr.sh --cmd "$train_cmd" data/train data/lang exp/tri3b exp/tri3b_ali
+fi
+
+if [ $stage -le 22 ]; then
+  # train and test chain models
+  local/chain/run_tdnn.sh
+fi
+
+# Wait for any remaining background decoding jobs before exiting.
+wait
diff --git a/egs/tunisian_msa/s5/steps b/egs/tunisian_msa/s5/steps
new file mode 120000
index 00000000000..6e99bf5b5ad
--- /dev/null
+++ b/egs/tunisian_msa/s5/steps
@@ -0,0 +1 @@
+../../wsj/s5/steps
\ No newline at end of file
diff --git a/egs/tunisian_msa/s5/utils b/egs/tunisian_msa/s5/utils
new file mode 120000
index 00000000000..b240885218f
--- /dev/null
+++ b/egs/tunisian_msa/s5/utils
@@ -0,0 +1 @@
+../../wsj/s5/utils
\ No newline at end of file
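To dispatch this recipe on a PBS/Torque cluster, cmd.sh can point at the pbs.pl wrapper patched below. A minimal sketch, assuming conf/pbs.conf maps the --mem option in the queue.pl-style "option" syntax (the qsub flags are site-specific assumptions; only the leading "command" line is required by the wrapper's config parser):

    export train_cmd="utils/parallel/pbs.pl --mem 2G"
    export decode_cmd="utils/parallel/pbs.pl --mem 4G"
    export mkgraph_cmd="utils/parallel/pbs.pl --mem 8G"

    # conf/pbs.conf (sketch)
    command qsub -V -S /bin/bash
    option mem=* -l mem=$0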
diff --git a/egs/wsj/s5/utils/parallel/pbs.pl b/egs/wsj/s5/utils/parallel/pbs.pl
index cbde8eb86d5..35a33ba2dca 100755
--- a/egs/wsj/s5/utils/parallel/pbs.pl
+++ b/egs/wsj/s5/utils/parallel/pbs.pl
@@ -18,12 +18,10 @@
 # names and the commands configurable, as similar problems can be expected
 # with Torque, Univa... and who knows what else
 #
-# queue.pl has the same functionality as run.pl, except that
-# it runs the job in question on the queue (Sun GridEngine).
-# This version of queue.pl uses the task array functionality
-# of the grid engine.  Note: it's different from the queue.pl
-# in the s4 and earlier scripts.
-
+# pbs.pl has the same functionality as run.pl, except that
+# it runs the job in question on the queue (PBS).
+# This version of pbs.pl uses the task array functionality
+# of PBS.
 # The script now supports configuring the queue system using a config file
 # (default in conf/pbs.conf; but can be passed specified with --config option)
 # and a set of command line options.
@@ -78,12 +76,12 @@ sub print_usage() {
   print STDERR
-   "Usage: pbs.pl [options] [JOB=1:n] log-file command-line arguments...\n" .
-   "e.g.: queue.pl foo.log echo baz\n" .
+   "Usage: pbs.pl [options] [JOB=1:n] log-file command-line arguments...\n" .
+   "e.g.: pbs.pl foo.log echo baz\n" .
    " (which will echo \"baz\", with stdout and stderr directed to foo.log)\n" .
-   "or: queue.pl -q all.q\@xyz foo.log echo bar \| sed s/bar/baz/ \n" .
+   "or: pbs.pl -q all.q\@xyz foo.log echo bar \| sed s/bar/baz/ \n" .
    " (which is an example of using a pipe; you can provide other escaped bash constructs)\n" .
-   "or: queue.pl -q all.q\@qyz JOB=1:10 foo.JOB.log echo JOB \n" .
+   "or: pbs.pl -q all.q\@qyz JOB=1:10 foo.JOB.log echo JOB \n" .
    " (which illustrates the mechanism to submit parallel jobs; note, you can use \n" .
    " another string other than JOB)\n" .
    "Note: if you pass the \"-sync y\" option to qsub, this script will take note\n" .
@@ -113,7 +111,7 @@ ()
   } else {
     my $argument = shift @ARGV;
     if ($argument =~ m/^--/) {
-      print STDERR "queue.pl: Warning: suspicious argument '$argument' to $switch; starts with '-'\n";
+      print STDERR "pbs.pl: Warning: suspicious argument '$argument' to $switch; starts with '-'\n";
     }
     if ($switch eq "-sync" && $argument =~ m/^[yY]/) {
       $sync = 1;
@@ -141,7 +139,7 @@ ()
     $jobend = $3;
     shift;
     if ($jobstart > $jobend) {
-      die "queue.pl: invalid job range $ARGV[0]";
+      die "pbs.pl: invalid job range $ARGV[0]";
     }
     if ($jobstart <= 0) {
       die "run.pl: invalid job range $ARGV[0], start must be strictly positive (this is a GridEngine limitation).";
@@ -153,7 +151,7 @@ ()
     $jobend = $2;
     shift;
   } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) {
-    print STDERR "queue.pl: Warning: suspicious first argument to queue.pl: $ARGV[0]\n";
+    print STDERR "pbs.pl: Warning: suspicious first argument to pbs.pl: $ARGV[0]\n";
   }
 }
@@ -248,7 +246,7 @@ ()
       $cli_options{$option} = $value;
     }
   } else {
-    print STDERR "queue.pl: unable to parse line '$line' in config file ($config)\n";
+    print STDERR "pbs.pl: unable to parse line '$line' in config file ($config)\n";
     exit(1);
   }
 }
@@ -256,7 +254,7 @@ ()
 close(CONFIG);
 
 if ($read_command != 1) {
-  print STDERR "queue.pl: config file ($config) does not contain the line \"command .*\"\n";
+  print STDERR "pbs.pl: config file ($config) does not contain the line \"command .*\"\n";
   exit(1);
 }
@@ -271,7 +269,7 @@ ()
       $qsub_opts .= "$cli_config_options{$option} ";
     } else {
       if ($opened_config_file == 0) { $config = "default config file"; }
-      die "queue.pl: Command line option $option not described in $config (or value '$value' not allowed)\n";
+      die "pbs.pl: Command line option $option not described in $config (or value '$value' not allowed)\n";
     }
   }
@@ -280,7 +278,7 @@ ()
 
 if ($array_job == 1 && $logfile !~ m/$jobname/
     && $jobend > $jobstart) {
-  print STDERR "queue.pl: you are trying to run a parallel job but "
+  print STDERR "pbs.pl: you are trying to run a parallel job but "
     . "you are putting the output into just one log file ($logfile)\n";
   exit(1);
 }
@@ -289,7 +287,7 @@ ()
 # Work out the command; quote escaping is done here.
 # Note: the rules for escaping stuff are worked out pretty
 # arbitrarily, based on what we want it to do.  Some things that
-# we pass as arguments to queue.pl, such as "|", we want to be
+# we pass as arguments to pbs.pl, such as "|", we want to be
 # interpreted by bash, so we don't escape them.  Other things,
 # such as archive specifiers like 'ark:gunzip -c foo.gz|', we want
 # to be passed, in quotes, to the Kaldi program.  Our heuristic
@@ -394,9 +392,9 @@ ()
 if ($ret != 0) {
   if ($sync && $ret == 256) { # this is the exit status when a job failed (bad exit status)
     if (defined $jobname) { $logfile =~ s/\$PBS_ARRAY_INDEX/*/g; }
-    print STDERR "queue.pl: job writing to $logfile failed\n";
+    print STDERR "pbs.pl: job writing to $logfile failed\n";
   } else {
-    print STDERR "queue.pl: error submitting jobs to queue (return status was $ret)\n";
+    print STDERR "pbs.pl: error submitting jobs to queue (return status was $ret)\n";
     print STDERR "queue log file is $queue_logfile, command was $qsub_cmd\n";
     print STDERR `tail $queue_logfile`;
   }
@@ -501,13 +499,13 @@ ()
         # time elapsed between file modification and the start of this
         # program], then we assume the program really finished OK,
         # and maybe something is up with the file system.
-        print STDERR "**queue.pl: syncfile $f was not created but job seems\n" .
+        print STDERR "**pbs.pl: syncfile $f was not created but job seems\n" .
"**to have finished OK. Probably your file-system has problems.\n" . "**This is just a warning.\n"; last; } else { chop $last_line; - print STDERR "queue.pl: Error, unfinished job no " . + print STDERR "pbs.pl: Error, unfinished job no " . "longer exists, log is in $logfile, last line is '$last_line', " . "syncfile is $f, return status of qstat was $ret\n" . "Possible reasons: a) Exceeded time limit? -> Use more jobs!" . @@ -515,7 +513,7 @@ () exit(1); } } elsif ($ret != 0) { - print STDERR "queue.pl: Warning: qstat command returned status $ret (qstat -t $sge_job_id,$!)\n"; + print STDERR "pbs.pl: Warning: qstat command returned status $ret (qstat -t $sge_job_id,$!)\n"; } } } @@ -574,14 +572,14 @@ () else { # we failed. if (@logfiles == 1) { if (defined $jobname) { $logfile =~ s/\$PBS_ARRAY_INDEX/$jobstart/g; } - print STDERR "queue.pl: job failed with status $status, log is in $logfile\n"; + print STDERR "pbs.pl: job failed with status $status, log is in $logfile\n"; if ($logfile =~ m/JOB/) { - print STDERR "queue.pl: probably you forgot to put JOB=1:\$nj in your script.\n"; + print STDERR "pbs.pl: probably you forgot to put JOB=1:\$nj in your script.\n"; } } else { if (defined $jobname) { $logfile =~ s/\$PBS_ARRAY_INDEX/*/g; } my $numjobs = 1 + $jobend - $jobstart; - print STDERR "queue.pl: $num_failed / $numjobs failed, log is in $logfile\n"; + print STDERR "pbs.pl: $num_failed / $numjobs failed, log is in $logfile\n"; } exit(1); }