Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 6 additions & 16 deletions egs/sprakbanken/s5/local/copy_dict.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#!/bin/bash

# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Copyright 2014 Mirsk Digital ApS (Author: Andreas Kirkedal)
# Copyright 2014-15 Mirsk Digital ApS (Author: Andreas Kirkedal)
# Copyright 2016 Andreas Kirkedal

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -16,9 +17,7 @@
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

KALDI_ROOT=$(pwd)/../../..

exproot=$(pwd)
lex=lexicon-da-nonorm.tgz
dir=data/local/dict
mkdir -p $dir

Expand All @@ -31,22 +30,13 @@ cp local/dictsrc/complexphones.txt $dir/nonsilence_phones.txt
cp local/dictsrc/extra_questions.txt $dir/extra_questions.txt

# Copy pre-made lexicon
wget http://www.openslr.org/resources/8/lexicon-da.tgz --directory-prefix=data/local/data/download
tar -xzf data/local/data/download/lexicon-da.tgz -C $dir
wget http://www.openslr.org/resources/8/$lex --directory-prefix=data/local/data/download
tar -xzf data/local/data/download/$lex -C $dir


# silence phones, one per line.
echo SIL > $dir/silence_phones.txt
echo -e "SIL\nSPN" > $dir/silence_phones.txt
echo SIL > $dir/optional_silence.txt





wait


## TODO: add cleanup commands

echo "Dictionary preparation succeeded"

2 changes: 1 addition & 1 deletion egs/sprakbanken/s5/local/create_datasets.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ fi
src=$1
dest=$2
mkdir $dest
python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $src/text.unnormalised $src/onlyids $src/transcripts.am
python local/normalize_transcript_prefixed.py local/norm_dk/numbersLow.tbl $src/text.unnormalised $src/onlyids $src/transcripts.am
local/norm_dk/format_text.sh am $src/transcripts.am > $src/onlytext
paste -d ' ' $src/onlyids $src/onlytext > $dest/text
for f in wav.scp utt2spk; do
Expand Down
129 changes: 35 additions & 94 deletions egs/sprakbanken/s5/local/dict_prep.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Copyright 2014 Mirsk Digital ApS (Author: Andreas Kirkedal)
# Copyright 2014-2016 Andreas Kirkedal

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -19,57 +20,24 @@
KALDI_ROOT=$(pwd)/../../..

exproot=$(pwd)
dir=data/local/dict
lmdir=data/local/transcript_lm
dictsrc=data/local/dictsrc
dictdir=data/local/dict
espeakdir='espeak-1.48.04-source'
mkdir -p $dir
mkdir -p $dictsrc $dictdir


# Dictionary preparation:


# Normalise transcripts and create a transcript file
# Removes '.,:;?' and removes '\' before '\Komma' (dictated ',')
# outputs a normalised transcript without utterance ids and a list of utterance ids
echo "Normalising"

# Create dir to hold lm files and other non-standard files, useful for debugging
trainsrc=data/local/trainsrc
rm -rf $trainsrc
mkdir $trainsrc
mv data/train/text1 $trainsrc/text1
python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $trainsrc/text1 $trainsrc/onlyids $dir/transcripts.tmp

# Additional normalisation, uppercasing, writing numbers etc.,
# and recombine the normalised text with the utterance ids
local/norm_dk/format_text.sh am $dir/transcripts.tmp > $dir/transcripts.am
cp $dir/transcripts.am $trainsrc/onlytext
paste $trainsrc/onlyids $trainsrc/onlytext > data/train/text
utils/validate_data_dir.sh --no-feat data/train || exit 1;



# lmsents is output by sprak_data_prep.sh and contains
# sentences that are disjoint from the test and dev set
python3 local/normalize_transcript.py local/norm_dk/numbersUp.tbl data/local/data/lmsents $dir/lmsents.norm
wait

# Create wordlist from the AM transcripts
cat $dir/transcripts.am | tr [:blank:] '\n' | sort -u > $dir/wlist.txt &

# Because training data is read aloud, there are many occurrences of the same
# sentence and bias towards the domain. Make a version where
# the sentences are unique to reduce bias.
local/norm_dk/format_text.sh lm $dir/lmsents.norm > $dir/transcripts.txt
sort -u $dir/transcripts.txt > $dir/transcripts.uniq

cat $lmdir/transcripts.uniq | tr [:blank:] '\n' | sort -u > $dictsrc/wlist.txt &

# Install eSpeak if it is not installed already

if hash espeak 2>/dev/null;
then
then
echo 'eSpeak installed'
else
cd $KALDI_ROOT/tools || exit 1;
else
cd $KALDI_ROOT/tools || exit 1;
wget http://sourceforge.net/projects/espeak/files/espeak/espeak-1.48/${espeakdir}.zip
wait
unzip -q $espeakdir.zip
Expand All @@ -81,87 +49,60 @@ if hash espeak 2>/dev/null;
cd $exproot || exit 1;
fi



# Wait for the wordlist to be fully created
wait

wait

# Run wordlist through espeak to get phonetics
# improvised parallelisation - simple call because 'split' often has different versions
split -l 10000 $dir/wlist.txt $dir/Wtemp_
for w in $dir/Wtemp_*; do
(cat $w | espeak -q -vda -x > $w.pho) &
split -l 10000 $dictsrc/wlist.txt $dictsrc/Wtemp_
for w in $dictsrc/Wtemp_*; do
(cat $w | espeak -q -vda -x > $w.pho) &
done

wait

cat $dir/Wtemp_*.pho > $dir/plist.txt
rm -f $dir/Wtemp_*
cat $dictsrc/Wtemp_*.pho > $dictsrc/plist.txt
rm -f $dictsrc/Wtemp_*


# Filter transcription
# Remove diacritics, language annotation ((da), (en), (fr) etc.), insert space between symbols, remove
# Remove diacritics, language annotation ((da), (en), (fr) etc.), insert space between symbols, remove
# initial and trailing spaces and collapse 2 or more spaces to one space

cat $dir/plist.txt | perl -pe 's/\([[a-z]{2}\)//g' | perl -pe 's// /g' | perl -pe 's/ a I / aI /g' | perl -pe 's/ d Z / dZ /g' | perl -pe 's/ \? / /g' | perl -pe 's/ ([\#]) /\+ /g' | perl -pe 's/([\@n3]) \- /\1\- /g' | perl -pe "s/[\_\:\!\'\,\|2]//g" | perl -pe 's/ \- / /g' | tr -s ' ' | perl -pe 's/^ +| +$//g' > $dir/plist2.txt
cat $dictsrc/plist.txt | perl -pe 's/\([[a-z]{2}\)//g' | perl -pe 's// /g' | perl -pe 's/ a I / aI /g' | perl -pe 's/ d Z / dZ /g' | perl -pe 's/ \? / /g' | perl -pe 's/ ([\#]) /\+ /g' | perl -pe 's/([\@n3]) \- /\1\- /g' | perl -pe "s/[\_\:\!\'\,\|2]//g" | perl -pe 's/ \- / /g' | tr -s ' ' | perl -pe 's/^ +| +$//g' > $dictsrc/plist2.txt

#Some question marks are not caught above
perl -pe 's/ \? / /g' $dir/plist2.txt > $dir/plist3.txt
perl -pe 's/ \? / /g' $dictsrc/plist2.txt > $dictsrc/plist3.txt

# Create lexicon.txt and put it in data/local/dict
paste $dir/wlist.txt $dir/plist3.txt > $dir/lexicon1.txt
paste $dictsrc/wlist.txt $dictsrc/plist3.txt > $dictsrc/lexicon1.txt

# Remove entries without transcription
grep -P "^.+\t.+$" $dir/lexicon1.txt > $dir/lexicon2.txt
grep -P "^.+\t.+$" $dictsrc/lexicon1.txt > $dictsrc/lexicon2.txt

# Copy the pre-made phone table (complexphones.txt) to nonsilence_phones.txt
cp local/dictsrc/complexphones.txt $dir/nonsilence_phones.txt
cp local/dictsrc/complexphones.txt $dictdir/nonsilence_phones.txt


# Prepend the "!SIL SIL" and "<UNK> SPN" entries to lexicon.txt
echo -e '!SIL\tSIL' > $dir/lex_first
echo -e '<UNK>\tSPN' >> $dir/lex_first
cat $dir/lexicon2.txt >> $dir/lex_first
mv $dir/lex_first $dir/lexicon.txt
echo -e '!SIL\tSIL' > $dictsrc/lex_first
echo -e '<UNK>\tSPN' >> $dictsrc/lex_first
cat $dictsrc/lexicon2.txt >> $dictsrc/lex_first
mv $dictsrc/lex_first $dictdir/lexicon.txt

# silence phones, one per line.
echo SIL > $dir/silence_phones.txt
echo SIL > $dir/optional_silence.txt

touch $dir/extra_questions.txt

# Repeat text preparation on test set, but do not add to dictionary
# Create dir to hold lm files and other non-standard files
testsrc=data/local/testsrc
rm -rf $testsrc
mkdir $testsrc
mv data/test/text1 $testsrc/text1
python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $testsrc/text1 $testsrc/onlyids $testsrc/transcripts.am
local/norm_dk/format_text.sh am $testsrc/transcripts.am > $testsrc/onlytext
paste $testsrc/onlyids $testsrc/onlytext > data/test/text
utils/validate_data_dir.sh --no-feat data/test || exit 1;

# Repeat text preparation on dev set, but do not add to dictionary
# Create dir to hold lm files and other non-standard files
devsrc=data/local/devsrc
rm -rf $devsrc
mkdir $devsrc
mv data/dev/text1 $devsrc/text1
python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $devsrc/text1 $devsrc/onlyids $devsrc/transcripts.tmp
local/norm_dk/format_text.sh am $devsrc/transcripts.tmp > $devsrc/onlytext
paste $devsrc/onlyids $devsrc/onlytext > data/dev/text &

# Also create a file that can be used for reranking using text features
local/norm_dk/format_text.sh lm $devsrc/transcripts.tmp > data/dev/transcripts.txt
sort -u data/dev/transcripts.txt > data/dev/transcripts.uniq


utils/validate_data_dir.sh --no-feat data/dev || exit 1;

if [ ! -f $dictdir/silence_phones.txt ]; then
echo SIL > $dictdir/silence_phones.txt
fi

if [ ! -f $dictdir/optional_silence.txt ]; then
echo SIL > $dictdir/optional_silence.txt
fi

## TODO: add cleanup commands
if [ ! -f $dictdir/extra_questions.txt ]; then
touch $dictdir/extra_questions.txt
fi

echo "Normalisation and dictionary preparation succeeded"

echo "Dictionary preparation succeeded"
11 changes: 6 additions & 5 deletions egs/sprakbanken/s5/local/norm_dk/format_text.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ nonum=$tmp/nonum.tmp

cat $2 | tr -d '\r' > $src

$dir/expand_abbr_medical.sh $src > $abbr;
$dir/remove_annotation.sh $abbr > $rem;
#$dir/expand_abbr_medical.sh $src > $abbr;
$dir/remove_annotation.sh $src > $rem;
if [ $mode != "am" ]; then
$dir/sent_split.sh $rem > $line;
else
Expand All @@ -45,10 +45,11 @@ fi
$dir/expand_dates.sh $line |\
$dir/format_punct.sh > $num;
#python3 $dir/writenumbers.py $dir/numbersUp.tbl $num $nonum;
cat $num | $dir/write_punct.sh | \
# $dir/write_punct.sh | \
cat $num | \
perl -pi -e "s/^\n//" | \
perl -pe 's/ (.{4}.*?)\./ \1/g' | \
PERLIO=:utf8 perl -pe '$_=uc'
perl -pe 's/ (.{4}.*?)\./ \1/g'
# | PERLIO=:utf8 perl -pe '$_=lc'

# Comment this line for debugging
wait
Expand Down
Loading