kaldi-asr · danpovey · Feb 7, 2018 · Jan 9, 2018 · Jan 20, 2018 · Jan 23, 2018
diff --git a/egs/multi_en/s5/README.md b/egs/multi_en/s5/README.md
@@ -2,7 +2,7 @@ This is a WIP **English LVCSR recipe** that trains on data from multiple corpora
 * Fisher (1761 hours)
 * Switchboard (317 hours)
 * WSJ (81 hours)
-* HUB4 English Broadcast News (76 hours)
+* HUB4 (1996 & 1997) English Broadcast News (75 + 72 hours)
 * TED-LIUM (118 hours)
 * Librispeech (960 hours)
 

diff --git a/egs/multi_en/s5/RESULTS b/egs/multi_en/s5/RESULTS
@@ -36,21 +36,24 @@ exit 0
 # multi_a  tri5  tedlium_tg_tedlium.si    ||  %WER 29.0 | 1155 27512 | 75.8 20.3 3.9 4.8 29.0 93.3 | exp/multi_a/tri5/decode_tedlium_tg_tedlium.si/score_11_0.5/test.ctm.filt.sys
 
 # Results with the current data combination, lexicon preparation, and acoustic model training procedures.
-# On eval2000 the final GMM results is 24.3, which is better than the above result (24.9). 
-
-multi_a  tri1b  tg_eval2000        ||  %WER 40.3 | 4459 42989 | 63.7 26.1 10.2 4.0 40.3 72.9 | exp/multi_a/tri1b/decode_tg_eval2000/score_10_0.5/eval2000.ctm.filt.sys
-multi_a  tri1b  tg_eval2000.si     ||  %WER 45.3 | 4459 42989 | 59.2 29.3 11.4 4.6 45.3 75.4 | exp/multi_a/tri1b/decode_tg_eval2000.si/score_11_0.0/eval2000.ctm.filt.sys
-multi_a  tri3a  tg_eval2000        ||  %WER 33.3 | 4459 42989 | 70.4 21.0 8.6 3.7 33.3 69.6 | exp/multi_a/tri3a/decode_tg_eval2000/score_11_1.0/eval2000.ctm.filt.sys
-multi_a  tri3a  tg_eval2000.si     ||  %WER 38.5 | 4459 42989 | 65.9 24.7 9.5 4.4 38.5 72.5 | exp/multi_a/tri3a/decode_tg_eval2000.si/score_11_1.0/eval2000.ctm.filt.sys
-multi_a  tri3b  tg_eval2000        ||  %WER 27.9 | 4459 42989 | 75.8 17.9 6.3 3.7 27.9 67.1 | exp/multi_a/tri3b/decode_tg_eval2000/score_11_0.0/eval2000.ctm.filt.sys
-multi_a  tri3b  tg_eval2000.si     ||  %WER 31.6 | 4459 42989 | 71.9 20.3 7.8 3.5 31.6 68.8 | exp/multi_a/tri3b/decode_tg_eval2000.si/score_12_0.0/eval2000.ctm.filt.sys
-multi_a  tri3b  tg_sp_eval2000     ||  %WER 26.7 | 4459 42989 | 77.2 17.1 5.7 3.9 26.7 65.6 | exp/multi_a/tri3b/decode_tg_sp_eval2000/score_11_0.0/eval2000.ctm.filt.sys
-multi_a  tri3b  tg_sp_eval2000.si  ||  %WER 30.6 | 4459 42989 | 73.1 19.6 7.3 3.8 30.6 68.2 | exp/multi_a/tri3b/decode_tg_sp_eval2000.si/score_12_1.0/eval2000.ctm.filt.sys
-multi_a  tri4   tg_eval2000        ||  %WER 24.8 | 4459 42989 | 78.5 16.0 5.5 3.4 24.8 63.8 | exp/multi_a/tri4/decode_tg_eval2000/score_12_1.0/eval2000.ctm.filt.sys
-multi_a  tri4   tg_eval2000.si     ||  %WER 31.2 | 4459 42989 | 72.6 20.6 6.8 3.9 31.2 67.6 | exp/multi_a/tri4/decode_tg_eval2000.si/score_12_0.0/eval2000.ctm.filt.sys
-multi_a  tri5a  tg_eval2000        ||  %WER 24.3 | 4459 42989 | 78.8 15.6 5.6 3.2 24.3 63.3 | exp/multi_a/tri5a/decode_tg_eval2000/score_13_0.0/eval2000.ctm.filt.sys
-multi_a  tri5a  tg_eval2000.si     ||  %WER 30.6 | 4459 42989 | 73.7 20.2 6.1 4.3 30.6 67.9 | exp/multi_a/tri5a/decode_tg_eval2000.si/score_10_1.0/eval2000.ctm.filt.sys
-multi_a  tri5a  tg_sp_eval2000     ||  %WER 24.2 | 4459 42989 | 79.1 15.6 5.3 3.3 24.2 63.2 | exp/multi_a/tri5a/decode_tg_sp_eval2000/score_12_0.0/eval2000.ctm.filt.sys
-multi_a  tri5a  tg_sp_eval2000.si  ||  %WER 30.5 | 4459 42989 | 73.7 20.3 6.0 4.2 30.5 67.8 | exp/multi_a/tri5a/decode_tg_sp_eval2000.si/score_10_1.0/eval2000.ctm.filt.sys
-multi_a  tri5b  tg_eval2000        ||  %WER 24.3 | 4459 42989 | 79.3 15.7 5.0 3.6 24.3 63.5 | exp/multi_a/tri5b/decode_tg_eval2000/score_11_0.0/eval2000.ctm.filt.sys
-multi_a  tri5b  tg_eval2000.si     ||  %WER 30.7 | 4459 42989 | 73.6 20.4 6.0 4.3 30.7 68.1 | exp/multi_a/tri5b/decode_tg_eval2000.si/score_10_1.0/eval2000.ctm.filt.sys
+# On eval2000 the final GMM result is 24.5, which is better than the above result (24.9).
+multi_a  tri1b  tg_eval2000        ||  %WER 40.4 | 4459 42989 | 63.8 25.9 10.3 4.2 40.4 72.7 | exp/multi_a/tri1b/decode_tg_eval2000/score_10_0.5/eval2000.ctm.filt.sys
+multi_a  tri1b  tg_eval2000.si     ||  %WER 45.0 | 4459 42989 | 59.3 28.8 11.9 4.3 45.0 75.0 | exp/multi_a/tri1b/decode_tg_eval2000.si/score_12_0.0/eval2000.ctm.filt.sys
+multi_a  tri3a  tg_eval2000        ||  %WER 33.4 | 4459 42989 | 70.5 21.3 8.3 3.9 33.4 69.7 | exp/multi_a/tri3a/decode_tg_eval2000/score_11_0.0/eval2000.ctm.filt.sys
+multi_a  tri3a  tg_eval2000.si     ||  %WER 38.4 | 4459 42989 | 66.2 24.2 9.6 4.6 38.4 72.3 | exp/multi_a/tri3a/decode_tg_eval2000.si/score_11_1.0/eval2000.ctm.filt.sys
+multi_a  tri3b  tg_eval2000        ||  %WER 27.8 | 4459 42989 | 75.7 17.8 6.6 3.5 27.8 66.6 | exp/multi_a/tri3b/decode_tg_eval2000/score_12_0.0/eval2000.ctm.filt.sys
+multi_a  tri3b  tg_eval2000.si     ||  %WER 31.7 | 4459 42989 | 71.8 20.3 7.8 3.6 31.7 69.0 | exp/multi_a/tri3b/decode_tg_eval2000.si/score_12_0.5/eval2000.ctm.filt.sys
+multi_a  tri3b  tg_sp_eval2000     ||  %WER 26.8 | 4459 42989 | 77.0 17.3 5.7 3.8 26.8 65.2 | exp/multi_a/tri3b/decode_tg_sp_eval2000/score_11_1.0/eval2000.ctm.filt.sys
+multi_a  tri3b  tg_sp_eval2000.si  ||  %WER 30.5 | 4459 42989 | 73.7 19.7 6.7 4.2 30.5 68.0 | exp/multi_a/tri3b/decode_tg_sp_eval2000.si/score_11_0.0/eval2000.ctm.filt.sys
+multi_a  tri4   tg_eval2000        ||  %WER 24.8 | 4459 42989 | 78.6 15.8 5.5 3.5 24.8 64.1 | exp/multi_a/tri4/decode_tg_eval2000/score_12_1.0/eval2000.ctm.filt.sys
+multi_a  tri4   tg_eval2000.si     ||  %WER 31.3 | 4459 42989 | 73.1 20.8 6.2 4.4 31.3 68.7 | exp/multi_a/tri4/decode_tg_eval2000.si/score_10_1.0/eval2000.ctm.filt.sys
+multi_a  tri5a  tg_eval2000        ||  %WER 24.5 | 4459 42989 | 79.0 15.7 5.3 3.5 24.5 63.4 | exp/multi_a/tri5a/decode_tg_eval2000/score_12_0.0/eval2000.ctm.filt.sys
+multi_a  tri5a  tg_eval2000.si     ||  %WER 30.4 | 4459 42989 | 73.3 20.0 6.6 3.8 30.4 67.5 | exp/multi_a/tri5a/decode_tg_eval2000.si/score_12_0.5/eval2000.ctm.filt.sys
+multi_a  tri5a  tg_sp_eval2000     ||  %WER 24.5 | 4459 42989 | 78.9 15.7 5.4 3.4 24.5 63.4 | exp/multi_a/tri5a/decode_tg_sp_eval2000/score_12_0.5/eval2000.ctm.filt.sys
+multi_a  tri5a  tg_sp_eval2000.si  ||  %WER 30.5 | 4459 42989 | 73.5 20.1 6.5 4.0 30.5 67.8 | exp/multi_a/tri5a/decode_tg_sp_eval2000.si/score_11_1.0/eval2000.ctm.filt.sys
+multi_a  tri5b  tg_eval2000        ||  %WER 24.4 | 4459 42989 | 79.1 15.6 5.3 3.5 24.4 63.4 | exp/multi_a/tri5b/decode_tg_eval2000/score_12_0.0/eval2000.ctm.filt.sys
+multi_a  tri5b  tg_eval2000.si     ||  %WER 30.5 | 4459 42989 | 73.5 20.2 6.3 4.0 30.5 67.3 | exp/multi_a/tri5b/decode_tg_eval2000.si/score_11_1.0/eval2000.ctm.filt.sys
+multi_a  tri6a  tg_eval2000        ||  %WER 24.5 | 4459 42989 | 78.8 15.7 5.5 3.4 24.5 63.0 | exp/multi_a/tri6a/decode_tg_eval2000/score_13_0.5/eval2000.ctm.filt.sys
+multi_a  tri6a  tg_eval2000.si     ||  %WER 31.5 | 4459 42989 | 73.1 21.0 5.9 4.6 31.5 68.1 | exp/multi_a/tri6a/decode_tg_eval2000.si/score_10_1.0/eval2000.ctm.filt.sys
+multi_a  tri6a  tg_sp_eval2000     ||  %WER 24.6 | 4459 42989 | 78.9 15.8 5.3 3.5 24.6 63.3 | exp/multi_a/tri6a/decode_tg_sp_eval2000/score_12_1.0/eval2000.ctm.filt.sys
+multi_a  tri6a  tg_sp_eval2000.si  ||  %WER 31.5 | 4459 42989 | 72.6 21.0 6.4 4.2 31.5 67.9 | exp/multi_a/tri6a/decode_tg_sp_eval2000.si/score_11_1.0/eval2000.ctm.filt.sys
diff --git a/egs/multi_en/s5/conf/mfcc.conf b/egs/multi_en/s5/conf/mfcc.conf
@@ -2,3 +2,4 @@
 --sample-frequency=8000
 --low-freq=20
 --high-freq=3700
+--allow-downsample=true
diff --git a/egs/multi_en/s5/local/g2p/apply_g2p.sh b/egs/multi_en/s5/local/g2p/apply_g2p.sh
@@ -33,7 +33,7 @@ cat data/*/train/text | \
   perl -ape 's/\s/\n/g;' | \
   sort | uniq > $workdir/missing.txt
 cat $workdir/missing.txt | \
-  grep "^[a-z0-9.'_-]*$"  > $workdir/missing_onlywords.txt
+  grep "^[a-z]*$"  > $workdir/missing_onlywords.txt
 
 echo 'Synthesizing pronunciations for missing words...'
 phonetisaurus-apply --nbest $var_counts --model $model --thresh 5 --accumulate --word_list $workdir/missing_onlywords.txt > $workdir/missing_g2p_${var_counts}.txt 

diff --git a/egs/multi_en/s5/local/hub4_96_data_prep.sh b/egs/multi_en/s5/local/hub4_96_data_prep.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+###########################################################################################
+# This script was copied from egs/hub4_english/s5/local/data_prep/prepare_1996_bn_data.sh
+# The source commit was 191ae0a6e5db19d316c82a78c746bcd56cc2d7da
+# Changes in lower level script/dir names were made
+###########################################################################################
+
+#!/bin/bash
+# Copyright (c) 2017, Johns Hopkins University (Jan "Yenda" Trmal<[email protected]>)
+#               2017  Vimal Manohar
+# License: Apache 2.0
+
+# This script prepares the 1996 English Broadcast News (HUB4) corpus.
+# /export/corpora/LDC/LDC97S44 
+# /export/corpora/LDC/LDC97T22
+
+# Begin configuration section.
+# End configuration section
+set -e -o pipefail
+set -o nounset             # Treat unset variables as an error
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 <text-source> <speech-source> <out-dir>"
+  echo " e.g.: $0 /export/corpora/LDC/LDC97T22/hub4_eng_train_trans /export/corpora/LDC/LDC97S44/data data/local/data/train_bn96"
+  exit 1
+fi
+
+text_source_dir=$1    # /export/corpora/LDC/LDC97T22/hub4_eng_train_trans
+speech_source_dir=$2  # /export/corpora/LDC/LDC97S44/data
+out=$3
+
+mkdir -p "$out"
+# Path expansions are quoted so directories containing spaces are not word-split.
+ls "$text_source_dir"/*/*.txt > "$out/text.list"
+ls "$speech_source_dir"/*.sph > "$out/audio.list"
+
+if [ ! -s "$out/text.list" ] || [ ! -s "$out/audio.list" ]; then
+  echo "$0: Could not get text and audio files"
+  exit 1
+fi
+
+local/hub4_96_parse_sgm.pl "$out/text.list" > \
+  "$out/transcripts.txt" 2> "$out/parse_sgml.log" || exit 1
+
+if [ ! -s "$out/transcripts.txt" ]; then
+  echo "$0: Could not parse SGML files in $out/text.list"
+  exit 1
+fi
+
+echo "$0: 1996 English Broadcast News training data (HUB4) prepared in $out"
+exit 0
diff --git a/egs/multi_en/s5/local/hub4_96_parse_sgm.pl b/egs/multi_en/s5/local/hub4_96_parse_sgm.pl
@@ -0,0 +1,235 @@
+#!/usr/bin/env perl
+###########################################################################################
+# This script was copied from egs/hub4_english/s5/local/data_prep/parse_sgm_1996_hub4_eng.pl
+# The source commit was 9f61a1b0efa76f37fc29fa2dbeede6dc776a0203
+# No change was made
+###########################################################################################
+
+#===============================================================================
+# Copyright (c) 2017  Johns Hopkins University (Author: Jan "Yenda" Trmal <[email protected]>)
+#               2017  Vimal Manohar
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+use strict;
+use warnings;
+use utf8;
+
+require HTML::Parser or die "This script needs HTML::Parser from CPAN";
+HTML::Parser->import();
+
+binmode(STDOUT, ":utf8");
+
+sub trim { my ($str) = @_; $str =~ s/^\s+//; $str =~ s/\s+$//; return $str; }  # strip leading and trailing whitespace
+
+sub parse_sgml_tag {
+  # Parse the attributes of one SGML tag line into a key => value hash.
+  # A line containing no '=' yields an empty hash; values keep any quotes.
+  my ($markup) = @_;
+  my %attrs;
+
+  return %attrs unless $markup =~ /=/;
+
+  # Drop the leading "<name " and the closing ">" (plus trailing spaces).
+  $markup =~ s/<[a-zA-Z]+ //;
+  $markup =~ s/> *$//;
+
+  # Attributes are separated by spaces, optionally preceded by a comma.
+  foreach my $pair (split / *,? +/, $markup) {
+    my ($name, $val) = split '=', $pair, 2;
+    $attrs{$name} = $val;
+  }
+  return %attrs;
+}
+
+# Command line: exactly one argument, a file listing SGML transcripts (one path per line).
+if (@ARGV != 1) {
+  print STDERR "$0: This script needs exactly one parameter (list of SGML files)\n";
+  print STDERR "  Usage: $0 <transcripts>\n";
+  print STDERR "  where\n";
+  print STDERR "    <transcripts> is a file containing the official SGML format\n";
+  print STDERR "      transcripts. The files are parsed and the parsed representation\n";
+  print STDERR "      is dumped to STDOUT (one utterance + the additional data fields\n";
+  print STDERR "      per line (we dump all the fields, but not all fields are used\n";
+  print STDERR "      in the recipe).\n";
+  die;
+}
+my $filelist=$ARGV[0];
+
+my $p = HTML::Parser->new();
+
+# Read the list of transcript paths; $! (not $?) carries the reason open() failed.
+my @files=();
+open(my $list_fh, '<', $filelist) or die "Could not open file $filelist: $!\n";
+while (my $path = <$list_fh>) { chomp $path; push @files, $path; }
+close($list_fh);
+
+# Main loop: convert every listed SGML transcript into utterance lines on STDOUT.
+# Each line is:
+#   <sgml-id> <episode-filename> <speaker> <fidelity> <mode> <start> <end> <words...>
+# A line is emitted at each <sync> and </segment> for which text was collected.
+# Dies when a file cannot be opened or when a closing episode/section/segment
+# tag does not match the most recently opened tag.
+# Uses @files (the transcript paths) and the helpers trim/parse_sgml_tag.
+require File::Basename;  # core module; replaces shelling out to `basename`
+foreach my $file (@files) {
+  my $segment_start = -1;
+  my $segment_end = -1;
+  my $segment_speaker;
+  my $segment_fidelity = "XXX";
+  my $segment_mode = "XXX";
+  my $section_start = -1;
+  my $section_end = -1;
+  my $filename = "";
+  my @text = ();   # text lines collected since the last emitted span
+  my $time;        # start time of the span currently being collected
+  my @tagqueue;    # stack of [tag name, \%attributes] for the open tags
+
+  # Transcript id: basename of the path without ".txt"/".sgml" and a trailing "_".
+  my $sgml_file = trim(File::Basename::basename($file));
+  $sgml_file =~ s/\.txt$//g;
+  $sgml_file =~ s/\.sgml$//g;
+  $sgml_file =~ s/_$//g;
+
+  # $! (not $?) carries the reason a failed open() reports.
+  open(my $f, '<:encoding(iso-8859-1)', $file) or die "Could not open file $file: $!\n";
+
+  # One input line at a time; tags are recognised by substring match.
+  while(my $line = <$f>) {
+    chomp $line;
+    $line = trim $line;
+    $line = lc $line;
+    next unless $line;
+
+    if ($line =~ /<episode/) {
+      # Episode header: take the audio filename, minus quotes and ".sph".
+      my %tags = parse_sgml_tag $line;
+      $filename = $tags{'filename'};
+      $filename =~ s/"//g;
+      $filename =~ s/\.sph//g;
+
+      # A mismatch is only reported; parsing continues.
+      if ($sgml_file ne $filename) {
+        print STDERR "$0: WARNING: SGML filename does not match episode filename $filename in file $file\n";
+      }
+      push @tagqueue, ["episode", \%tags];
+    } elsif ($line =~ /<\/episode/) {
+      # Reduce "</name>" to "name" and compare it with the tag on top of the stack.
+      my $opened = pop @tagqueue;
+      $line =~ s/<\/(.*)( +.*)?>/$1/g;
+      $line = trim $line;
+      die "Unaligned tags: '" . $opened->[0] . "' vs '$line'" if $opened->[0] ne $line;
+    } elsif ($line =~ /<section/) {
+      my %tags = parse_sgml_tag $line;
+
+      # Section times come as s_time/e_time or, failing that, starttime/endtime.
+      # They are stored here but not used for the output lines.
+      if ($tags{'s_time'}) {
+        $section_start = $tags{'s_time'};
+      } else {
+        $section_start = $tags{'starttime'};
+      }
+
+      if ($tags{'e_time'}) {
+        $section_end = $tags{'e_time'};
+      } else {
+        $section_end = $tags{'endtime'};
+      }
+
+      push @tagqueue, ["section", \%tags];
+    } elsif ($line =~ /<\/section/) {
+      # Same stack check as for </episode>.
+      my $opened = pop @tagqueue;
+      $line =~ s/<\/(.*)( +.*)?>/$1/g;
+      $line = trim $line;
+      die "Unaligned tags: '" . $opened->[0] . "' vs '$line'" if $opened->[0] ne $line;
+    } elsif ($line =~ /<segment/) {
+      # Segment header: speaker, time span and optional fidelity/mode.
+      my %tags = parse_sgml_tag $line;
+      $segment_speaker = $tags{'speaker'};
+      $segment_speaker =~ s/"//g;
+      $segment_start = $tags{'s_time'};
+      $segment_end = $tags{'e_time'};
+      # fidelity/mode keep their current value when the attribute is absent or false.
+      $segment_fidelity = $tags{'fidelity'} if $tags{'fidelity'};
+      $segment_mode = $tags{'mode'} if $tags{'mode'};
+      $time = $segment_start;
+      push @tagqueue, ["segment", \%tags];
+    } elsif ($line =~ /<\/segment/) {
+      # Same stack check, then flush whatever text is still pending.
+      my $opened = pop @tagqueue;
+      $line =~ s/<\/(.*)( +.*)?>/$1/g;
+      $line = trim $line;
+      die "Unaligned tags: '" . $opened->[0] . "' vs '$line'" if $opened->[0] ne $line;
+
+      my $new_time = $segment_end;
+      if (@text > 0) {
+        print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time ";
+        print join(" ", @text) . "\n";
+      }
+      # Reset the per-segment state until the next <segment> header.
+      @text = ();
+      $time = 0;
+      $segment_speaker = "XXX";
+      $segment_start = "XXX";
+      $segment_end = "XXX";
+      $segment_fidelity = "XXX";
+      $segment_mode = "XXX";
+    } elsif ($line =~ /<sync/) {
+      # A sync point ends the current span: emit it and restart at the sync time.
+      my %tags = parse_sgml_tag $line;
+      my $new_time = $tags{'time'};
+      if (@text > 0) {
+        print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time ";
+        print join(" ", @text) . "\n";
+      }
+      @text = ();
+      $time = $new_time;
+    } elsif ($line =~ /<\/sync/) {
+      # nothing to do
+    } elsif ($line =~ /<overlap/) {
+      # overlap markers are dropped
+    } elsif ($line =~ /<\/overlap/) {
+      # overlap markers are dropped
+    } elsif ($line =~ /<background/ || $line =~ /<comment/) {
+      # ignore line;
+    } elsif ($line =~ /<foreign/) {
+      # <foreign>/<unclear> lines, opening and closing, are kept as text.
+      $line = trim $line;
+      push @text, $line;
+    } elsif ($line =~ /<\/foreign/) {
+      $line = trim $line;
+      push @text, $line;
+    } elsif ($line =~ /<unclear/) {
+      $line = trim $line;
+      push @text, $line;
+    } elsif ($line =~ /<\/unclear/) {
+      $line = trim $line;
+      push @text, $line;
+    } elsif ($line =~ /<[^\/]/) {
+      # Unrecognised opening tag: report it and drop the line.
+      parse_sgml_tag $line;
+      print STDERR "$0: INFO: Unknown tag $line in file $file\n";
+    } elsif ($line =~ /<\//) {
+      # any other closing tag is skipped silently
+    } else {
+      # Plain transcript text.
+      $line = trim $line;
+      push @text, $line if $line;
+    }
+
+  }
+  close($f);
+}