Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
207 changes: 207 additions & 0 deletions egs/wsj/s5/steps/compare_alignments.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
#!/bin/bash

# Copyright 2018 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0.

set -e
stage=0
cmd=run.pl # We use this only for get_ctm.sh, which can be a little slow.
num_to_sample=1000 # We sample this many utterances for human-readable display, starting from the worst and then
# starting from the middle.
cleanup=true

if [ -f ./path.sh ]; then . ./path.sh; fi

. ./utils/parse_options.sh

if [ $# -ne 5 ] && [ $# -ne 6 ]; then
cat <<EOF
This script compares two directories containing data alignments, and
creates statistics showing how much the phone and word alignments differ,
including breakdown by phones and words; and which utterances differ the
most. This is intended for diagnostic purposes. Both alignment directories
should be for the same data (or at least the data sets should overlap).
The word alignment stats may not be correctly obtained if the data-dirs are
not the same.

Usage: $0 [options] <lang-directory> (<data-directory> | <data-directory1> <data-directory2>) <ali-dir1> <ali-dir2> <work-dir>
e.g.: $0 data/lang data/train exp/tri2_ali exp/tri3_ali exp/compare_ali_2_3

Options:
--cmd (run.pl|queue.pl...) # specify how to run the sub-processes.
# (passed through to get_train_ctm.sh)
--cleanup <true|false> # Specify --cleanup false to prevent
# cleanup of temporary files.
--stage <n> # Enables you to run part of the script.

EOF
exit 1
fi

lang=$1
data1=$2
if [ $# -eq 5 ]; then
data2=$2
ali_dir1=$3
ali_dir2=$4
dir=$5
else
data2=$3
ali_dir1=$4
ali_dir2=$5
dir=$6
fi

for f in $lang/phones.txt $ali_dir1/ali.1.gz $ali_dir2/ali.2.gz; do
if [ ! -f $f ]; then
echo "$0: expected file $f to exist"
exit 1
fi
done

nj1=$(cat $ali_dir1/num_jobs)
nj2=$(cat $ali_dir2/num_jobs)

mkdir -p $dir/log


if [ $stage -le 0 ]; then
echo "$0: converting alignments to phones."

for j in $(seq $nj1); do gunzip -c $ali_dir1/ali.$j.gz; done | \
ali-to-phones --per-frame=true $ali_dir1/final.mdl ark:- ark:- | gzip -c > $dir/phones1.gz

for j in $(seq $nj2); do gunzip -c $ali_dir2/ali.$j.gz; done | \
ali-to-phones --per-frame=true $ali_dir2/final.mdl ark:- ark:- | gzip -c > $dir/phones2.gz
fi

if [ $stage -le 1 ]; then
echo "$0: getting comparison stats and utterance stats."
compare-int-vector --binary=false --write-confusion-matrix=$dir/conf.mat \
"ark:gunzip -c $dir/phones1.gz|" "ark:gunzip -c $dir/phones2.gz|" 2>$dir/log/compare_phones.log > $dir/utt_stats.phones
tail -n 8 $dir/log/compare_phones.log
fi

if [ $stage -le 3 ]; then
cat $dir/conf.mat | grep -v -F '[' | sed 's/]//' | awk '{n=NF; for (k=1;k<=n;k++) { conf[NR,k] = $k; row_tot[NR] += $k; col_tot[k] += $k; } } END{
for (row=1;row<=n;row++) for (col=1;col<=n;col++) {
val = conf[row,col]; this_row_tot = row_tot[row]; this_col_tot = col_tot[col];
rval=conf[col,row]
min_tot = (this_row_tot < this_col_tot ? this_row_tot : this_col_tot);
if (val != 0) {
phone1 = row-1; phone2 = col-1;
if (row == col) printf("COR %d %d %.2f%\n", phone1, val, (val * 100 / this_row_tot));
else {
norm_prob = val * val / min_tot; # heuristic for sorting.
printf("SUB %d %d %d %d %.2f%% %.2f%%\n",
norm_prob, phone1, phone2, val, (val * 100 / min_tot), (rval * 100 / min_tot)); }}}}' > $dir/phone_stats.all

(
echo "# Format: <phone> <frame-count> <percent-correct>"
grep '^COR' $dir/phone_stats.all | sort -n -k4,4 | awk '{print $2, $3, $4}' | utils/int2sym.pl -f 1 $lang/phones.txt
) > $dir/phones_correct.txt

(
echo "#Format: <phone1> <phone2> <num-frames> <prob-wrong%> <reverse-prob-wrong%>"
echo "# <num-frames> is the number of frames that were labeled <phone1> in the first"
echo "# set of alignments and <phone2> in the second."
echo "# <prob-wrong> is <num-frames> divided by the smaller of the total num-frames of"
echo "# phone1 or phone2, whichever is smaller; expressed as a percentage."
echo "#<reverse-prob-wrong> is the same but for the reverse substitution, from"
echo "#<phone2> to <phone1>; the comparison with <prob-wrong> the substitutions are)."
grep '^SUB' $dir/phone_stats.all | sort -nr -k2,2 | awk '{print $3,$4,$5,$6,$7}' | utils/int2sym.pl -f 1-2 $lang/phones.txt
) > $dir/phone_subs.txt
fi

if [ $stage -le 4 ]; then
echo "$0: getting CTMs"
steps/get_train_ctm.sh --use-segments false --print-silence true --cmd "$cmd" --frame-shift 1.0 $data1 $lang $ali_dir1 $dir/ctm1
steps/get_train_ctm.sh --use-segments false --print-silence true --cmd "$cmd" --frame-shift 1.0 $data2 $lang $ali_dir2 $dir/ctm2
fi

if [ $stage -le 5 ]; then
for n in 1 2; do
cat $dir/ctm${n}/ctm | utils/sym2int.pl -f 5 $lang/words.txt | \
awk 'BEGIN{utt_id="";} { if (utt_id != $1) { if (utt_id != "") printf("\n"); utt_id=$1; printf("%s ", utt_id); } t_start=int($3); t_end=t_start + int($4); word=$5; for (t=t_start; t<t_end; t++) printf("%s ", word); } END{printf("\n")}' | \
copy-int-vector ark:- ark:- | gzip -c >$dir/words${n}.gz
done
fi

if [ $stage -le 5 ]; then
compare-int-vector --binary=false --write-tot-counts=$dir/words_tot.vec --write-diff-counts=$dir/words_diff.vec \
"ark:gunzip -c $dir/words1.gz|" "ark:gunzip -c $dir/words2.gz|" 2>$dir/log/compare_words.log >$dir/utt_stats.words
tail -n 8 $dir/log/compare_words.log
fi

if [ $stage -le 6 ]; then

( echo "# Word stats. Format:";
echo "<proportion-of-wrong-frames> <num-wrong-frames> <num-correct-frames> <word>"

paste <(awk '{for (n=2;n<NF;n++) print $n;}' <$dir/words_diff.vec) \
<(awk '{for (n=2;n<NF;n++) print $n;}' <$dir/words_tot.vec) | \
awk '{ if($2 > 0) print $1*$1/$2, $1/$2, $1, $2, (NR-1)}' | utils/int2sym.pl -f 5 $lang/words.txt | \
sort -nr | awk '{print $2, $3, $4, $5;}'
) > $dir/word_stats.txt

fi

if [ $stage -le 7 ]; then
for type in phones words; do
num_utts=$(wc -l <$dir/utt_stats.$type)
cat $dir/utt_stats.$type | awk -v type=$type 'BEGIN{print "Utterance-id proportion-"type"-changed num-frames num-wrong-frames"; }
{print $1, $3 * 1.0 / $2, $2, $3; }' | sort -nr -k2,2 > $dir/utt_stats.$type.sorted
(
echo "$0: Percentiles 100, 90, .. 0 of proportion-$type-changed distribution (over utterances) are:"
cat $dir/utt_stats.$type.sorted | awk -v n=$num_utts 'BEGIN{k=int((n-1)/10);} {if (NR % k == 1) printf("%s ", $2); } END{print "";}'
) | tee $dir/utt_stats.$type.percentiles
done
fi


if [ $stage -le 8 ]; then
# Display the 1000 worst utterances, and 1000 utterances from the middle of the pack, in a readable format.
num_utts=$(wc -l <$dir/utt_stats.words.sorted)
half_num_utts=$[$num_utts/2];
if [ $num_to_sample -gt $half_num_utts ]; then
num_to_sample=$half_num_utts
fi
head -n $num_to_sample $dir/utt_stats.words.sorted | awk '{print $1}' > $dir/utt_ids.worst
tail -n +$half_num_utts $dir/utt_stats.words.sorted | head -n $num_to_sample | awk '{print $1}' > $dir/utt_ids.mid

for suf in worst mid; do
for n in 1 2; do
gunzip -c $dir/phones${n}.gz | copy-int-vector ark:- ark,t:- | utils/filter_scp.pl $dir/utt_ids.$suf >$dir/temp
# the next command reorders them, and duplicates the utterance-idwhich we'll later use
# that to display the word sequence.
awk '{print $1,$1,$1}' <$dir/utt_ids.$suf | utils/apply_map.pl -f 3 $dir/temp > $dir/phones${n}.$suf
rm $dir/temp
done
# the stuff with 0 and <eps> below is a kind of hack so that if the phones are the same, we end up
# with just the phone, but if different, we end up with p1/p2.
# The apply_map.pl stuff is to put the transcript there.

(
echo "# Format: <utterance-id> <word1> <word2> ... <wordN> <frame1-phone> ... <frameN-phone>"
echo "# If the two alignments have the same phone, just that phone will be printed;"
echo "# otherwise the two phones will be printed, as in 'phone1/phone2'. So '/' is present"
echo "# whenever there is a mismatch."

paste $dir/phones1.$suf $dir/phones2.$suf | perl -ane ' @A = split("\t", $_); @A1 = split(" ", $A[0]); @A2 = split(" ", $A[1]);
$utt = shift @A1; shift @A2; print $utt, " ";
for ($n = 0; $n < @A1 && $n < @A2; $n++) { $a1=$A1[$n]; $a2=$A2[$n]; if ($a1 eq $a2) { print "$a1 "; } else { print "$a1 0 $a2 "; }}
print "\n" ' | utils/int2sym.pl -f 3- $lang/phones.txt | sed 's: <eps> :/:g' | \
utils/apply_map.pl -f 2 $data1/text
) > $dir/compare_phones_${suf}.txt
done
fi


if [ $stage -le 9 ] && $cleanup; then
rm $dir/phones{1,2}.gz $dir/words{1,2}.gz $dir/ctm*/ctm $dir/*.vec $dir/conf.mat \
$dir/utt_ids.* $dir/phones{1,2}.{mid,worst} $dir/utt_stats.{phones,words} \
$dir/phone_stats.all
fi

# clean up
exit 0
24 changes: 14 additions & 10 deletions egs/wsj/s5/steps/get_train_ctm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ echo "$0 $@" # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

if [ $# -ne 3 ]; then
echo "Usage: $0 [options] <data-dir> <lang-dir> <ali-dir|model-dir>"
if [ $# -ne 3 ] && [ $# -ne 4 ]; then
echo "Usage: $0 [options] <data-dir> <lang-dir> <ali-dir|model-dir> [<output-dir>]"
echo "(<output-dir> defaults to <ali-dir|model-dir>.)"
echo " Options:"
echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
echo " --stage (0|1|2) # start scoring script from part-way through."
Expand All @@ -39,27 +40,31 @@ fi

data=$1
lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied.
dir=$3
ali_dir=$3
dir=$4
if [ -z $dir ]; then
dir=$ali_dir
fi


model=$dir/final.mdl # assume model one level up from decoding dir.
model=$ali_dir/final.mdl # assume model one level up from decoding dir.


for f in $lang/words.txt $model $dir/ali.1.gz $lang/oov.int; do
for f in $lang/words.txt $model $ali_dir/ali.1.gz $lang/oov.int; do
[ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
done

oov=`cat $lang/oov.int` || exit 1;
nj=`cat $dir/num_jobs` || exit 1;
nj=`cat $ali_dir/num_jobs` || exit 1;
split_data.sh $data $nj || exit 1;
sdata=$data/split$nj

mkdir -p $dir/log
mkdir -p $dir/log || exit 1;

if [ $stage -le 0 ]; then
if [ -f $lang/phones/word_boundary.int ]; then
$cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \
set -o pipefail '&&' linear-to-nbest "ark:gunzip -c $dir/ali.JOB.gz|" \
set -o pipefail '&&' linear-to-nbest "ark:gunzip -c $ali_dir/ali.JOB.gz|" \
"ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \
'' '' ark:- \| \
lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \
Expand All @@ -72,7 +77,7 @@ if [ $stage -le 0 ]; then
exit 1;
fi
$cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \
set -o pipefail '&&' linear-to-nbest "ark:gunzip -c $dir/ali.JOB.gz|" \
set -o pipefail '&&' linear-to-nbest "ark:gunzip -c $ali_dir/ali.JOB.gz|" \
"ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \
'' '' ark:- \| \
lattice-align-words-lexicon $lang/phones/align_lexicon.int $model ark:- ark:- \| \
Expand All @@ -94,4 +99,3 @@ if [ $stage -le 1 ]; then
fi
rm $dir/ctm.*.gz
fi

5 changes: 3 additions & 2 deletions src/bin/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ BINFILES = align-equal align-equal-compiled acc-tree-stats \
post-to-pdf-post logprob-to-post prob-to-post copy-post \
matrix-sum build-pfile-from-ali get-post-on-ali tree-info am-info \
vector-sum matrix-sum-rows est-pca sum-lda-accs sum-mllt-accs \
transform-vec align-text matrix-dim post-to-smat compile-graph
transform-vec align-text matrix-dim post-to-smat compile-graph \
compare-int-vector


OBJFILES =
Expand All @@ -30,7 +31,7 @@ ADDLIBS = ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../lm/kaldi-lm.a \
../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \
../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
../base/kaldi-base.a
../base/kaldi-base.a


TESTFILES =
Expand Down
4 changes: 1 addition & 3 deletions src/bin/ali-to-phones.cc
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ int main(int argc, char *argv[]) {
" ali-to-phones 1.mdl ark:1.ali ark:-\n"
"or:\n"
" ali-to-phones --ctm-output 1.mdl ark:1.ali 1.ctm\n"
"See also: show-alignments lattice-align-phones\n";
"See also: show-alignments lattice-align-phones, compare-int-vector\n";
ParseOptions po(usage);
bool per_frame = false;
bool write_lengths = false;
Expand Down Expand Up @@ -137,5 +137,3 @@ int main(int argc, char *argv[]) {
return -1;
}
}


Loading