kaldi-asr · danpovey · Oct 13, 2018 · Oct 7, 2018 · Oct 7, 2018
diff --git a/egs/wsj/s5/steps/compare_alignments.sh b/egs/wsj/s5/steps/compare_alignments.sh
@@ -0,0 +1,207 @@
+#!/bin/bash
+
+# Copyright 2018  Johns Hopkins University (author: Daniel Povey)
+# Apache 2.0.
+
+# Abort the script as soon as a command fails.
+set -e
+
+# Default values of the options.  They have to be assigned before
+# utils/parse_options.sh is sourced below (presumably that script maps
+# --stage, --cmd, --num-to-sample and --cleanup onto these variables -- verify).
+stage=0      # The blocks guarded by "if [ $stage -le N ]" run only when N >= stage.
+cmd=run.pl   # We use this only for steps/get_train_ctm.sh, which can be a little slow.
+num_to_sample=1000  # We sample this many utterances for human-readable display, starting from the worst and then
+                    # starting from the middle.
+cleanup=true # Executed as a command in the final stage: "true" enables the removal of temporary files.
+
+# Source the local path.sh when there is one.
+if [ -f ./path.sh ]; then . ./path.sh; fi
+
+. ./utils/parse_options.sh
+
+# Print the usage message and exit with status 1 unless exactly 5 or 6
+# positional arguments were given.
+if [ $# -ne 5 ] && [ $# -ne 6 ]; then
+  cat <<EOF
+  This script compares two directories containing data alignments, and
+  creates statistics showing how much the phone and word alignments differ,
+  including breakdown by phones and words; and which utterances differ the
+  most.  This is intended for diagnostic purposes.  Both alignment directories
+  should be for the same data (or at least the data sets should overlap).
+  The word alignment stats may not be correctly obtained if the data-dirs are
+  not the same.
+
+  Usage: $0 [options] <lang-directory> (<data-directory> | <data-directory1> <data-directory2>) <ali-dir1> <ali-dir2> <work-dir>
+   e.g.: $0 data/lang data/train exp/tri2_ali exp/tri3_ali exp/compare_ali_2_3
+
+  Options:
+              --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes.
+                                              # (passed through to get_train_ctm.sh)
+              --cleanup <true|false>          # Specify --cleanup false to prevent
+                                              # cleanup of temporary files.
+              --stage  <n>                    # Enables you to run part of the script.
+              --num-to-sample <n>             # Max. number of utterances shown in each of
+                                              # compare_phones_worst.txt and compare_phones_mid.txt.
+
+EOF
+  exit 1
+fi
+
+# Positional arguments.  The lang directory always comes first.  With five
+# arguments a single data directory is shared by both alignment directories;
+# otherwise each alignment directory has its own data directory.
+lang=$1
+data1=$2
+case $# in
+  5)
+    data2=$data1
+    ali_dir1=$3; ali_dir2=$4; dir=$5
+    ;;
+  *)
+    data2=$3
+    ali_dir1=$4; ali_dir2=$5; dir=$6
+    ;;
+esac
+
+# Verify the inputs that the stages below read, before doing any work.
+# Each alignment directory is probed for its own first archive (ali.1.gz), so
+# a directory that was produced with a single job is accepted too.
+for f in $lang/phones.txt $lang/words.txt \
+         $ali_dir1/ali.1.gz $ali_dir1/final.mdl $ali_dir1/num_jobs \
+         $ali_dir2/ali.1.gz $ali_dir2/final.mdl $ali_dir2/num_jobs; do
+  if [ ! -f "$f" ]; then
+    echo "$0: expected file $f to exist"
+    exit 1
+  fi
+done
+
+# Number of ali.JOB.gz archives to read from each alignment directory.
+nj1=$(cat $ali_dir1/num_jobs)
+nj2=$(cat $ali_dir2/num_jobs)
+
+# The work directory; log files go to its log/ subdirectory.
+mkdir -p $dir/log
+
+
+# Stage 0: produce phones1.gz and phones2.gz in the work directory.
+if [ $stage -le 0 ]; then
+  echo "$0: converting alignments to phones."
+
+  # Concatenate the archives ali.1.gz .. ali.$nj1.gz of the first alignment
+  # directory, run them through ali-to-phones --per-frame=true with that
+  # directory's final.mdl, and gzip the result.
+  for j in $(seq $nj1); do gunzip -c $ali_dir1/ali.$j.gz; done | \
+    ali-to-phones --per-frame=true $ali_dir1/final.mdl ark:- ark:- | gzip -c > $dir/phones1.gz
+
+  # The same for the second alignment directory (with its own job count and model).
+  for j in $(seq $nj2); do gunzip -c $ali_dir2/ali.$j.gz; done | \
+    ali-to-phones --per-frame=true $ali_dir2/final.mdl ark:- ark:- | gzip -c > $dir/phones2.gz
+fi
+
+# Stage 1: compare the two phone archives written by stage 0.
+if [ $stage -le 1 ]; then
+  echo "$0: getting comparison stats and utterance stats."
+  # compare-int-vector is asked to write a text-format (--binary=false)
+  # confusion matrix to conf.mat.  Its standard output is saved as
+  # utt_stats.phones and its standard error as log/compare_phones.log.
+  compare-int-vector --binary=false --write-confusion-matrix=$dir/conf.mat \
+            "ark:gunzip -c $dir/phones1.gz|" "ark:gunzip -c $dir/phones2.gz|" 2>$dir/log/compare_phones.log > $dir/utt_stats.phones
+  # Show the last lines of the log on the terminal.
+  tail -n 8 $dir/log/compare_phones.log
+fi
+
+# Stage 3 (there is no stage 2): per-phone statistics from the confusion matrix.
+if [ $stage -le 3 ]; then
+  # The grep drops every line containing "[" and the sed strips "]", leaving
+  # only the rows of numbers of conf.mat.  Row r / column c stand for integer
+  # phone-id r-1 in the first alignment / c-1 in the second.  For every
+  # non-zero cell one line is written to phone_stats.all:
+  #   COR <phone> <num-frames> <percent-of-row-total>%
+  #   SUB <sort-key> <phone1> <phone2> <num-frames> <prob-wrong>% <reverse-prob-wrong>%
+  # A literal percent sign in an awk printf format has to be written as "%%".
+  cat $dir/conf.mat | grep -v -F '[' | sed 's/]//' | awk '{n=NF; for (k=1;k<=n;k++) { conf[NR,k] = $k; row_tot[NR] += $k; col_tot[k] += $k; } } END{
+   for (row=1;row<=n;row++) for (col=1;col<=n;col++) {
+     val = conf[row,col]; this_row_tot = row_tot[row]; this_col_tot = col_tot[col];
+     rval=conf[col,row]
+     min_tot = (this_row_tot < this_col_tot ? this_row_tot : this_col_tot);
+     if (val != 0) {
+       phone1 = row-1; phone2 = col-1;
+       if (row == col) printf("COR %d %d %.2f%%\n", phone1, val, (val * 100 / this_row_tot));
+       else {
+         norm_prob = val * val / min_tot;  # heuristic for sorting.
+         printf("SUB %d %d %d %d %.2f%% %.2f%%\n",
+                 norm_prob, phone1, phone2, val, (val * 100 / min_tot), (rval * 100 / min_tot)); }}}}' > $dir/phone_stats.all
+
+   # Phones on the diagonal, sorted by increasing percent-correct, with the
+   # integer phone-id mapped back to its symbol.
+   (
+     echo "# Format: <phone> <frame-count> <percent-correct>"
+     grep '^COR' $dir/phone_stats.all | sort -n -k4,4 | awk '{print $2, $3, $4}' | utils/int2sym.pl -f 1 $lang/phones.txt
+   ) > $dir/phones_correct.txt
+
+   # Off-diagonal cells, sorted by decreasing sort-key (which is then dropped).
+   (
+     echo "#Format: <phone1> <phone2> <num-frames> <prob-wrong%> <reverse-prob-wrong%>"
+     echo "# <num-frames> is the number of frames that were labeled <phone1> in the first"
+     echo "# set of alignments and <phone2> in the second."
+     echo "# <prob-wrong> is <num-frames> divided by the smaller of the total num-frames of"
+     echo "#  phone1 or phone2, whichever is smaller; expressed as a percentage."
+     echo "#<reverse-prob-wrong> is the same but for the reverse substitution, from"
+     echo "#<phone2> to <phone1> (normalized by the same total as <prob-wrong>)."
+     grep '^SUB' $dir/phone_stats.all | sort -nr -k2,2 | awk '{print $3,$4,$5,$6,$7}' | utils/int2sym.pl -f 1-2 $lang/phones.txt
+   ) > $dir/phone_subs.txt
+fi
+
+# Stage 4: one CTM per alignment directory, written to ctm1/ and ctm2/ inside
+# the work directory (stage 5 reads ctm1/ctm and ctm2/ctm).
+if [ $stage -le 4 ]; then
+  echo "$0: getting CTMs"
+  # NOTE(review): --frame-shift 1.0 presumably makes the CTM times come out in
+  # units of frames; stage 5 truncates them with int() and loops over them as
+  # frame indexes.
+  steps/get_train_ctm.sh --use-segments false --print-silence true --cmd "$cmd" --frame-shift 1.0 $data1 $lang $ali_dir1 $dir/ctm1
+  steps/get_train_ctm.sh --use-segments false --print-silence true --cmd "$cmd" --frame-shift 1.0 $data2 $lang $ali_dir2 $dir/ctm2
+fi
+
+# Stage 5, first part: turn each CTM into an archive of per-frame word-ids.
+if [ $stage -le 5 ]; then
+  for n in 1 2; do
+    # sym2int.pl maps field 5 of the CTM (the word) to its integer id.  The awk
+    # program then writes one line per utterance: the utterance-id (field 1)
+    # followed by the word-id repeated once for every t from int(field 3) up to
+    # but excluding int(field 3) + int(field 4).  A new line is started
+    # whenever the utterance-id changes.
+    cat $dir/ctm${n}/ctm | utils/sym2int.pl -f 5 $lang/words.txt | \
+      awk 'BEGIN{utt_id="";} { if (utt_id != $1) { if (utt_id != "") printf("\n"); utt_id=$1; printf("%s ", utt_id); } t_start=int($3); t_end=t_start + int($4); word=$5; for (t=t_start; t<t_end; t++) printf("%s ", word); } END{printf("\n")}' | \
+      copy-int-vector ark:- ark:- | gzip -c >$dir/words${n}.gz
+  done
+fi
+
+# Stage 5, second part (guarded by the same stage number): compare the two word
+# archives.  Standard output goes to utt_stats.words, standard error to
+# log/compare_words.log, and the requested count vectors to words_tot.vec and
+# words_diff.vec.
+if [ $stage -le 5 ]; then
+  compare-int-vector --binary=false --write-tot-counts=$dir/words_tot.vec --write-diff-counts=$dir/words_diff.vec \
+         "ark:gunzip -c $dir/words1.gz|" "ark:gunzip -c $dir/words2.gz|" 2>$dir/log/compare_words.log >$dir/utt_stats.words
+  # Show the last lines of the log on the terminal.
+  tail -n 8 $dir/log/compare_words.log
+fi
+
+# Stage 6: per-word statistics, written to word_stats.txt.
+if [ $stage -le 6 ]; then
+
+  # Each .vec file is read as a single line whose first and last fields are
+  # skipped (presumably the "[" and "]" of a text-format vector); the remaining
+  # fields become one number per line, so that after the paste line k holds
+  # "<diff-count> <tot-count>" for integer word-id k-1.  Words with a zero
+  # total are dropped.  Lines are sorted by decreasing diff*diff/tot, and that
+  # sort key is removed again by the last awk, which leaves the columns
+  # diff/tot, diff, tot and the word.
+  ( echo "# Word stats.  Format:";
+    echo "# <proportion-of-wrong-frames> <num-wrong-frames> <num-frames> <word>"
+
+    paste <(awk '{for (n=2;n<NF;n++) print $n;}' <$dir/words_diff.vec) \
+      <(awk '{for (n=2;n<NF;n++) print $n;}' <$dir/words_tot.vec) | \
+       awk '{ if($2 > 0) print $1*$1/$2, $1/$2, $1, $2, (NR-1)}' | utils/int2sym.pl -f 5 $lang/words.txt | \
+      sort -nr | awk '{print $2, $3, $4, $5;}'
+  ) > $dir/word_stats.txt
+
+fi
+
+# Stage 7: for phones and for words, rank the utterances by the proportion of
+# frames on which the two alignments differ and report the distribution.
+if [ $stage -le 7 ]; then
+  for type in phones words; do
+    # Input lines are used as: <utterance-id> <num-frames> <num-wrong-frames>.
+    # An utterance with zero frames gets proportion 0 instead of making awk
+    # abort with a division by zero.  The header line printed in BEGIN goes
+    # through the sort together with the data lines.
+    cat $dir/utt_stats.$type | awk -v type=$type 'BEGIN{print "Utterance-id proportion-"type"-changed num-frames num-wrong-frames"; }
+          {print $1, ($2 > 0 ? $3 * 1.0 / $2 : 0), $2, $3; }' | sort -nr -k2,2 > $dir/utt_stats.$type.sorted
+    (
+      echo "$0: Percentiles 100, 90, .. 0 of proportion-$type-changed distribution (over utterances) are:"
+      # The sorted file is in decreasing order, so its first data line is the
+      # 100th percentile and its last one the 0th.  The header line is skipped.
+      # Exactly 11 values are printed as long as there is at least one
+      # utterance (so fewer than 11 utterances no longer cause a modulo by zero).
+      awk '$1 != "Utterance-id" { v[++c] = $2; }
+           END{ if (c > 0) for (p = 0; p <= 10; p++) printf("%s ", v[int(p * (c - 1) / 10 + 0.5) + 1]); print ""; }' <$dir/utt_stats.$type.sorted
+    ) | tee $dir/utt_stats.$type.percentiles
+  done
+fi
+
+
+# Stage 8: human-readable frame-by-frame phone comparison for two samples of
+# utterances, written to compare_phones_worst.txt and compare_phones_mid.txt.
+if [ $stage -le 8 ]; then
+  # Display the 1000 worst utterances, and 1000 utterances from the middle of the pack, in a readable format.
+  # (1000 is the default of num_to_sample; it is capped at half the number of utterances.)
+  # The sorted stats file also contains a header line starting with
+  # "Utterance-id"; it is not an utterance, so it is dropped before sampling.
+  awk '$1 != "Utterance-id" {print $1}' $dir/utt_stats.words.sorted > $dir/utt_ids.all
+  num_utts=$(wc -l <$dir/utt_ids.all)
+  half_num_utts=$((num_utts / 2))
+  if [ $num_to_sample -gt $half_num_utts ]; then
+    num_to_sample=$half_num_utts
+  fi
+  head -n $num_to_sample $dir/utt_ids.all > $dir/utt_ids.worst
+  tail -n +$half_num_utts $dir/utt_ids.all | head -n $num_to_sample > $dir/utt_ids.mid
+
+  for suf in worst mid; do
+    for n in 1 2; do
+      # Text-format per-frame phones of the sampled utterances only.
+      gunzip -c $dir/phones${n}.gz | copy-int-vector ark:- ark,t:- | utils/filter_scp.pl $dir/utt_ids.$suf  >$dir/temp
+      # The next command puts the utterances in the order of the id list and
+      # writes each as "<utt-id> <utt-id> <phones...>".  The duplicated
+      # utterance-id is replaced by the word sequence further below.
+      awk '{print $1,$1,$1}' <$dir/utt_ids.$suf | utils/apply_map.pl -f 3 $dir/temp > $dir/phones${n}.$suf
+      rm $dir/temp
+    done
+    # the stuff with 0 and <eps> below is a kind of hack so that if the phones are the same, we end up
+    # with just the phone, but if different, we end up with p1/p2.
+    # (The perl code prints "p1 0 p2" for a mismatch; int2sym.pl presumably maps 0 to <eps>,
+    # and the sed turns " <eps> " into "/".)
+    # The apply_map.pl stuff is to put the transcript there.
+
+    (
+      echo "# Format: <utterance-id> <word1> <word2> ... <wordN>  <frame1-phone> ... <frameN-phone>"
+      echo "# If the two alignments have the same phone, just that phone will be printed;"
+      echo "# otherwise the two phones will be printed, as in 'phone1/phone2'.  So '/' is present"
+      echo "# whenever there is a mismatch."
+
+      paste $dir/phones1.$suf $dir/phones2.$suf | perl -ane ' @A = split("\t", $_); @A1 = split(" ", $A[0]); @A2 = split(" ", $A[1]);
+            $utt = shift @A1; shift @A2; print $utt, " ";
+            for ($n = 0; $n < @A1 && $n < @A2; $n++) { $a1=$A1[$n]; $a2=$A2[$n];  if ($a1 eq $a2) { print "$a1 "; } else { print "$a1 0 $a2 "; }}
+            print "\n" ' | utils/int2sym.pl -f 3- $lang/phones.txt | sed 's: <eps> :/:g' | \
+        utils/apply_map.pl -f 2 $data1/text
+    )  > $dir/compare_phones_${suf}.txt
+  done
+fi
+
+
+# Stage 9: clean up.  $cleanup is executed as a command, so the value "true"
+# enables the removal and "false" skips it.  The files removed are
+# intermediate ones written by the stages above.
+if [ $stage -le 9 ] && $cleanup; then
+  rm $dir/phones{1,2}.gz $dir/words{1,2}.gz $dir/ctm*/ctm $dir/*.vec $dir/conf.mat \
+     $dir/utt_ids.*  $dir/phones{1,2}.{mid,worst} $dir/utt_stats.{phones,words} \
+     $dir/phone_stats.all
+fi
+
+# All stages are done; report success.
+exit 0
diff --git a/egs/wsj/s5/steps/get_train_ctm.sh b/egs/wsj/s5/steps/get_train_ctm.sh
@@ -20,8 +20,9 @@ echo "$0 $@"  # Print the command line for logging
 [ -f ./path.sh ] && . ./path.sh
 . parse_options.sh || exit 1;
 
-if [ $# -ne 3 ]; then
-  echo "Usage: $0 [options] <data-dir> <lang-dir> <ali-dir|model-dir>"
+if [ $# -ne 3 ] && [ $# -ne 4 ]; then
+  echo "Usage: $0 [options] <data-dir> <lang-dir> <ali-dir|model-dir> [<output-dir>]"
+  echo "(<output-dir> defaults to  <ali-dir|model-dir>.)"
   echo " Options:"
   echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
   echo "    --stage (0|1|2)                 # start scoring script from part-way through."
@@ -39,27 +40,31 @@ fi
 
 data=$1
 lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied.
-dir=$3
+ali_dir=$3
+dir=$4
+if [ -z $dir ]; then
+  dir=$ali_dir
+fi
 
 
-model=$dir/final.mdl # assume model one level up from decoding dir.
+model=$ali_dir/final.mdl # assume model one level up from decoding dir.
 
 
-for f in $lang/words.txt $model $dir/ali.1.gz $lang/oov.int; do
+for f in $lang/words.txt $model $ali_dir/ali.1.gz $lang/oov.int; do
   [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
 done
 
 oov=`cat $lang/oov.int` || exit 1;
-nj=`cat $dir/num_jobs` || exit 1;
+nj=`cat $ali_dir/num_jobs` || exit 1;
 split_data.sh $data $nj || exit 1;
 sdata=$data/split$nj
 
-mkdir -p $dir/log
+mkdir -p $dir/log || exit 1;
 
 if [ $stage -le 0 ]; then
   if [ -f $lang/phones/word_boundary.int ]; then
     $cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \
-      set -o pipefail '&&' linear-to-nbest "ark:gunzip -c $dir/ali.JOB.gz|" \
+      set -o pipefail '&&' linear-to-nbest "ark:gunzip -c $ali_dir/ali.JOB.gz|" \
       "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \
       '' '' ark:- \| \
       lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \
@@ -72,7 +77,7 @@ if [ $stage -le 0 ]; then
       exit 1;
     fi
     $cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \
-      set -o pipefail '&&' linear-to-nbest "ark:gunzip -c $dir/ali.JOB.gz|" \
+      set -o pipefail '&&' linear-to-nbest "ark:gunzip -c $ali_dir/ali.JOB.gz|" \
       "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \
       '' '' ark:- \| \
       lattice-align-words-lexicon $lang/phones/align_lexicon.int $model ark:- ark:- \| \
@@ -94,4 +99,3 @@ if [ $stage -le 1 ]; then
   fi
   rm $dir/ctm.*.gz
 fi
-
diff --git a/src/bin/Makefile b/src/bin/Makefile
@@ -21,7 +21,8 @@ BINFILES = align-equal align-equal-compiled acc-tree-stats \
         post-to-pdf-post logprob-to-post prob-to-post copy-post \
         matrix-sum build-pfile-from-ali get-post-on-ali tree-info am-info \
         vector-sum matrix-sum-rows est-pca sum-lda-accs sum-mllt-accs \
-        transform-vec align-text matrix-dim post-to-smat compile-graph
+        transform-vec align-text matrix-dim post-to-smat compile-graph \
+        compare-int-vector
 
 
 OBJFILES =
@@ -30,7 +31,7 @@ ADDLIBS = ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../lm/kaldi-lm.a \
           ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \
           ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
           ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
-          ../base/kaldi-base.a 
+          ../base/kaldi-base.a
 
 
 TESTFILES =

diff --git a/src/bin/ali-to-phones.cc b/src/bin/ali-to-phones.cc
@@ -38,7 +38,7 @@ int main(int argc, char *argv[]) {
         " ali-to-phones 1.mdl ark:1.ali ark:-\n"
         "or:\n"
         " ali-to-phones --ctm-output 1.mdl ark:1.ali 1.ctm\n"
-        "See also: show-alignments lattice-align-phones\n";
+        "See also: show-alignments lattice-align-phones, compare-int-vector\n";
     ParseOptions po(usage);
     bool per_frame = false;
     bool write_lengths = false;
@@ -137,5 +137,3 @@ int main(int argc, char *argv[]) {
     return -1;
   }
 }
-
-