From 827ddbfff87c0bc5e9ed0ed71000e325268fa7b0 Mon Sep 17 00:00:00 2001
From: xiaohui-zhang
Date: Tue, 9 Jan 2018 13:07:06 -0500
Subject: [PATCH 1/7] multi_en: Fixed acronym normalization, swbd lexicon preparation, OOV pronunciation generation, acoustic data sub-sampling, etc.; Added hub4_97 data

---
 egs/multi_en/s5/README.md | 2 +-
 egs/multi_en/s5/RESULTS | 21 +-
 egs/multi_en/s5/conf/mfcc.conf | 1 +
 egs/multi_en/s5/local/g2p/apply_g2p.sh | 2 +-
 egs/multi_en/s5/local/hub4_96_data_prep.sh | 52 ++++
 egs/multi_en/s5/local/hub4_96_parse_sgm.pl | 235 +++++++++++++++++
 egs/multi_en/s5/local/hub4_97_data_prep.sh | 50 ++++
 egs/multi_en/s5/local/hub4_97_parse_sgm.pl | 235 +++++++++++++++++
 egs/multi_en/s5/local/hub4_data_prep.py | 242 ------------------
 egs/multi_en/s5/local/hub4_en_data_prep.sh | 62 +++++
 egs/multi_en/s5/local/hub4_format_data.pl | 138 ++++++++++
 .../local/hub4_normalize_bn96_transcripts.pl | 33 +++
 .../local/hub4_normalize_bn97_transcripts.pl | 42 +++
 egs/multi_en/s5/local/hub4_utils.py | 174 -------------
 .../s5/local/librispeech_data_prep.sh | 13 +-
 egs/multi_en/s5/local/make_partitions.sh | 8 +-
 egs/multi_en/s5/local/swbd1_data_prep.sh | 10 +-
 egs/multi_en/s5/local/tedlium_prepare_data.sh | 3 +-
 egs/multi_en/s5/local/wsj_data_prep.sh | 3 +-
 egs/multi_en/s5/run.sh | 64 ++---
 20 files changed, 915 insertions(+), 475 deletions(-)
 mode change 100644 => 100755 egs/multi_en/s5/RESULTS
 create mode 100755 egs/multi_en/s5/local/hub4_96_data_prep.sh
 create mode 100755 egs/multi_en/s5/local/hub4_96_parse_sgm.pl
 create mode 100755 egs/multi_en/s5/local/hub4_97_data_prep.sh
 create mode 100755 egs/multi_en/s5/local/hub4_97_parse_sgm.pl
 delete mode 100755 egs/multi_en/s5/local/hub4_data_prep.py
 create mode 100755 egs/multi_en/s5/local/hub4_en_data_prep.sh
 create mode 100755 egs/multi_en/s5/local/hub4_format_data.pl
 create mode 100755 egs/multi_en/s5/local/hub4_normalize_bn96_transcripts.pl
 create mode 100755 egs/multi_en/s5/local/hub4_normalize_bn97_transcripts.pl
 delete mode 100644 egs/multi_en/s5/local/hub4_utils.py

diff --git a/egs/multi_en/s5/README.md b/egs/multi_en/s5/README.md
index 0affcb9cf08..20505c5af6f 100755
--- a/egs/multi_en/s5/README.md
+++ b/egs/multi_en/s5/README.md
@@ -2,7 +2,7 @@
 This is a WIP **English LVCSR recipe** that trains on data from multiple corpora
 * Fisher (1761 hours)
 * Switchboard (317 hours)
 * WSJ (81 hours)
-* HUB4 English Broadcast News (76 hours)
+* HUB4 (1996 & 1997) English Broadcast News (75 + 72 hours)
 * TED-LIUM (118 hours)
 * Librispeech (960 hours)
diff --git a/egs/multi_en/s5/RESULTS b/egs/multi_en/s5/RESULTS
old mode 100644
new mode 100755
index 24b82755b94..2b7e5329f20
--- a/egs/multi_en/s5/RESULTS
+++ b/egs/multi_en/s5/RESULTS
@@ -37,17 +37,16 @@
 exit 0
 # Results with the current data combination, lexicon preparation, and acoustic model training procedures.
 # On eval2000 the final GMM result is 24.3, which is better than the above result (24.9).
-
-multi_a tri1b tg_eval2000 || %WER 40.3 | 4459 42989 | 63.7 26.1 10.2 4.0 40.3 72.9 | exp/multi_a/tri1b/decode_tg_eval2000/score_10_0.5/eval2000.ctm.filt.sys
-multi_a tri1b tg_eval2000.si || %WER 45.3 | 4459 42989 | 59.2 29.3 11.4 4.6 45.3 75.4 | exp/multi_a/tri1b/decode_tg_eval2000.si/score_11_0.0/eval2000.ctm.filt.sys
-multi_a tri3a tg_eval2000 || %WER 33.3 | 4459 42989 | 70.4 21.0 8.6 3.7 33.3 69.6 | exp/multi_a/tri3a/decode_tg_eval2000/score_11_1.0/eval2000.ctm.filt.sys
-multi_a tri3a tg_eval2000.si || %WER 38.5 | 4459 42989 | 65.9 24.7 9.5 4.4 38.5 72.5 | exp/multi_a/tri3a/decode_tg_eval2000.si/score_11_1.0/eval2000.ctm.filt.sys
-multi_a tri3b tg_eval2000 || %WER 27.9 | 4459 42989 | 75.8 17.9 6.3 3.7 27.9 67.1 | exp/multi_a/tri3b/decode_tg_eval2000/score_11_0.0/eval2000.ctm.filt.sys
-multi_a tri3b tg_eval2000.si || %WER 31.6 | 4459 42989 | 71.9 20.3 7.8 3.5 31.6 68.8 | exp/multi_a/tri3b/decode_tg_eval2000.si/score_12_0.0/eval2000.ctm.filt.sys
-multi_a tri3b tg_sp_eval2000 || %WER 26.7 | 4459 42989 | 77.2 17.1 5.7 3.9 26.7 65.6 | exp/multi_a/tri3b/decode_tg_sp_eval2000/score_11_0.0/eval2000.ctm.filt.sys
-multi_a tri3b tg_sp_eval2000.si || %WER 30.6 | 4459 42989 | 73.1 19.6 7.3 3.8 30.6 68.2 | exp/multi_a/tri3b/decode_tg_sp_eval2000.si/score_12_1.0/eval2000.ctm.filt.sys
-multi_a tri4 tg_eval2000 || %WER 24.8 | 4459 42989 | 78.5 16.0 5.5 3.4 24.8 63.8 | exp/multi_a/tri4/decode_tg_eval2000/score_12_1.0/eval2000.ctm.filt.sys
-multi_a tri4 tg_eval2000.si || %WER 31.2 | 4459 42989 | 72.6 20.6 6.8 3.9 31.2 67.6 | exp/multi_a/tri4/decode_tg_eval2000.si/score_12_0.0/eval2000.ctm.filt.sys
+multi_a tri1b tg_eval2000 || %WER 40.4 | 4459 42989 | 63.8 25.9 10.3 4.2 40.4 72.7 | exp/multi_a/tri1b/decode_tg_eval2000/score_10_0.5/eval2000.ctm.filt.sys
+multi_a tri1b tg_eval2000.si || %WER 45.0 | 4459 42989 | 59.3 28.8 11.9 4.3 45.0 75.0 | exp/multi_a/tri1b/decode_tg_eval2000.si/score_12_0.0/eval2000.ctm.filt.sys
+multi_a tri3a tg_eval2000 || %WER 33.4 | 4459 42989 | 70.5 21.3 8.3 3.9 33.4 69.7 | exp/multi_a/tri3a/decode_tg_eval2000/score_11_0.0/eval2000.ctm.filt.sys
+multi_a tri3a tg_eval2000.si || %WER 38.4 | 4459 42989 | 66.2 24.2 9.6 4.6 38.4 72.3 | exp/multi_a/tri3a/decode_tg_eval2000.si/score_11_1.0/eval2000.ctm.filt.sys
+multi_a tri3b tg_eval2000 || %WER 27.8 | 4459 42989 | 75.7 17.8 6.6 3.5 27.8 66.6 | exp/multi_a/tri3b/decode_tg_eval2000/score_12_0.0/eval2000.ctm.filt.sys
+multi_a tri3b tg_eval2000.si || %WER 31.7 | 4459 42989 | 71.8 20.3 7.8 3.6 31.7 69.0 | exp/multi_a/tri3b/decode_tg_eval2000.si/score_12_0.5/eval2000.ctm.filt.sys
+multi_a tri3b tg_sp_eval2000 || %WER 26.8 | 4459 42989 | 77.0 17.3 5.7 3.8 26.8 65.2 | exp/multi_a/tri3b/decode_tg_sp_eval2000/score_11_1.0/eval2000.ctm.filt.sys
+multi_a tri3b tg_sp_eval2000.si || %WER 30.5 | 4459 42989 | 73.7 19.7 6.7 4.2 30.5 68.0 | exp/multi_a/tri3b/decode_tg_sp_eval2000.si/score_11_0.0/eval2000.ctm.filt.sys
+multi_a tri4 tg_eval2000 || %WER 24.8 | 4459 42989 | 78.6 15.8 5.5 3.5 24.8 64.1 | exp/multi_a/tri4/decode_tg_eval2000/score_12_1.0/eval2000.ctm.filt.sys
+multi_a tri4 tg_eval2000.si || %WER 31.3 | 4459 42989 | 73.1 20.8 6.2 4.4 31.3 68.7 | exp/multi_a/tri4/decode_tg_eval2000.si/score_10_1.0/eval2000.ctm.filt.sys
 multi_a tri5a tg_eval2000 || %WER 24.3 | 4459 42989 | 78.8 15.6 5.6 3.2 24.3 63.3 | exp/multi_a/tri5a/decode_tg_eval2000/score_13_0.0/eval2000.ctm.filt.sys
 multi_a tri5a tg_eval2000.si || %WER 30.6 | 4459 42989 | 73.7 20.2 6.1 4.3 30.6 67.9 | exp/multi_a/tri5a/decode_tg_eval2000.si/score_10_1.0/eval2000.ctm.filt.sys
 multi_a tri5a tg_sp_eval2000 || %WER 24.2 | 4459 42989 | 79.1 15.6 5.3 3.3 24.2 63.2 | exp/multi_a/tri5a/decode_tg_sp_eval2000/score_12_0.0/eval2000.ctm.filt.sys
diff --git a/egs/multi_en/s5/conf/mfcc.conf b/egs/multi_en/s5/conf/mfcc.conf
index 4f780bf520c..9a17e801b3f 100644
--- a/egs/multi_en/s5/conf/mfcc.conf
+++ b/egs/multi_en/s5/conf/mfcc.conf
@@ -2,3 +2,4 @@
 --sample-frequency=8000
 --low-freq=20
 --high-freq=3700
+--allow-downsample=true
diff --git a/egs/multi_en/s5/local/g2p/apply_g2p.sh b/egs/multi_en/s5/local/g2p/apply_g2p.sh
index 88b37f21ad8..f8e50302c29 100755
--- a/egs/multi_en/s5/local/g2p/apply_g2p.sh
+++ b/egs/multi_en/s5/local/g2p/apply_g2p.sh
@@ -33,7 +33,7 @@
 cat data/*/train/text | \
   perl -ape 's/\s/\n/g;' | \
   sort | uniq > $workdir/missing.txt
 cat $workdir/missing.txt | \
-  grep "^[a-z0-9.'_-]*$" > $workdir/missing_onlywords.txt
+  grep "^[a-z]*$" > $workdir/missing_onlywords.txt
 echo 'Synthesizing pronunciations for missing words...'
 phonetisaurus-apply --nbest $var_counts --model $model --thresh 5 --accumulate --word_list $workdir/missing_onlywords.txt > $workdir/missing_g2p_${var_counts}.txt
diff --git a/egs/multi_en/s5/local/hub4_96_data_prep.sh b/egs/multi_en/s5/local/hub4_96_data_prep.sh
new file mode 100755
index 00000000000..f258ea7b7f5
--- /dev/null
+++ b/egs/multi_en/s5/local/hub4_96_data_prep.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+###########################################################################################
+# This script was copied from egs/hub4_english/s5/local/data_prep/prepare_1996_bn_data.sh
+# The source commit was 191ae0a6e5db19d316c82a78c746bcd56cc2d7da
+# Changes in lower level script/dir names were made
+###########################################################################################
+
+#!/bin/bash
+# Copyright (c) 2017, Johns Hopkins University (Jan "Yenda" Trmal)
+# 2017 Vimal Manohar
+# License: Apache 2.0
+
+# This script prepares the 1996 English Broadcast News (HUB4) corpus.
+# /export/corpora/LDC/LDC97S44
+# /export/corpora/LDC/LDC97T22
+
+# Begin configuration section.
+# End configuration section
+set -e -o pipefail
+set -o nounset  # Treat unset variables as an error
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 <text-source-dir> <speech-source-dir> <out-dir>"
+  echo " e.g.: $0 /export/corpora/LDC/LDC97T22/hub4_eng_train_trans /export/corpora/LDC/LDC97S44/data data/local/data/train_bn96"
+  exit 1
+fi
+
+text_source_dir=$1   # /export/corpora/LDC/LDC97T22/hub4_eng_train_trans
+speech_source_dir=$2 # /export/corpora/LDC/LDC97S44/data
+out=$3
+
+mkdir -p $out;
+
+ls $text_source_dir/*/*.txt > $out/text.list
+ls $speech_source_dir/*.sph > $out/audio.list
+
+if [ ! -s $out/text.list ] || [ ! -s $out/audio.list ]; then
+  echo "$0: Could not get text and audio files"
+  exit 1
+fi
+
+local/hub4_96_parse_sgm.pl $out/text.list > \
+  $out/transcripts.txt 2> $out/parse_sgml.log || exit 1
+
+if [ ! -s $out/transcripts.txt ]; then
+  echo "$0: Could not parse SGML files in $out/text.list"
+  exit 1
+fi
+
+echo "$0: 1996 English Broadcast News training data (HUB4) prepared in $out"
+exit 0
diff --git a/egs/multi_en/s5/local/hub4_96_parse_sgm.pl b/egs/multi_en/s5/local/hub4_96_parse_sgm.pl
new file mode 100755
index 00000000000..172ec5bb563
--- /dev/null
+++ b/egs/multi_en/s5/local/hub4_96_parse_sgm.pl
@@ -0,0 +1,235 @@
+#!/usr/bin/env perl
+###########################################################################################
+# This script was copied from egs/hub4_english/s5/local/data_prep/parse_sgm_1996_hub4_eng.pl
+# The source commit was 9f61a1b0efa76f37fc29fa2dbeede6dc776a0203
+# No change was made
+###########################################################################################
+
+#===============================================================================
+# Copyright (c) 2017 Johns Hopkins University (Author: Jan "Yenda" Trmal )
+# 2017 Vimal Manohar
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+use strict;
+use warnings;
+use utf8;
+
+require HTML::Parser or die "This script needs HTML::Parser from CPAN";
+HTML::Parser->import();
+
+binmode(STDOUT, ":utf8");
+
+sub trim { my $s = shift; $s =~ s/^\s+|\s+$//g; return $s };
+
+sub parse_sgml_tag {
+  my $tag = shift(@_);
+  my %ret;
+
+  if ($tag !~ /=/) {
+    return %ret;
+  }
+
+  $tag =~ s/<[a-zA-Z]+ //;
+  $tag =~ s/> *$//;
+  #print $tag . "\n";
+
+  my @key_value_pairs = split / *,? +/, $tag;
+  for my $entry(@key_value_pairs) {
+    (my $key, my $value) = split '=', $entry, 2;
+    $ret{$key}=$value;
+  }
+  return %ret;
+}
+
+if (@ARGV != 1) {
+  print STDERR "$0: This script needs exactly one parameter (list of SGML files)\n";
+  print STDERR "  Usage: $0 <sgml-file-list>\n";
+  print STDERR "  where\n";
+  print STDERR "    <sgml-file-list> is a file containing the official SGML format\n";
+  print STDERR "    transcripts. 
The files are parsed and the parsed representation\n"; + print STDERR " is dumped to STDOUT (one utterance + the additional data fields\n"; + print STDERR " per line (we dump all the fields, but not all fields are used\n"; + print STDERR " in the recipe).\n"; + die; +} +my $filelist=$ARGV[0]; + +my $p = HTML::Parser->new(); + +my @files=(); +open(F, '<', $filelist) or die "Could not open file $filelist: $?\n"; +while() { + chomp; + push @files, $_; +} + +foreach my $file (@files) { + my $reporter=""; + my $start = -1; + my $end = -1; + my $segment_start = -1; + my $segment_end = -1; + my $segment_speaker; + my $segment_fidelity = "XXX"; + my $segment_mode = "XXX"; + my $section_start = -1; + my $section_end = -1; + my $filename = ""; + my $seq = 0; + my @text = (); + my $time; + my @tagqueue; + + my $sgml_file = `basename $file`; + $sgml_file = trim $sgml_file; + $sgml_file =~ s/\.txt$//g; + $sgml_file =~ s/\.sgml$//g; + $sgml_file =~ s/_$//g; + + open(my $f, '<:encoding(iso-8859-1)', $file) or die "Could not open file $file: $?\n"; + + while(my $line = <$f>) { + chomp $line; + $line = trim $line; + $line = lc $line; + next unless $line; + + if ($line =~ //$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + #print "ES: $line\n"; + ; + } elsif ($line =~ /
/$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + #print "ES: $line\n"; + ; + } elsif ($line =~ //$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + + #print join(" ", @text) . "\n" if @text > 0; + my $new_time = $segment_end; + if (@text > 0) { + print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time "; + print join(" ", @text) . "\n"; + } + @text = (); + $time = 0; + $segment_speaker = "XXX"; + $segment_start = "XXX"; + $segment_end = "XXX"; + $segment_fidelity = "XXX"; + $segment_mode = "XXX"; + #print "ET: $line\n"; + ; + } elsif ($line =~ / 0) { + print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time "; + print join(" ", @text) . "\n"; + } + @text = (); + $time = $new_time; + ; + } elsif ($line =~ /<\/sync/) { + #print $line; + ; + } elsif ($line =~ /) +# 2017 Vimal Manohar +# License: Apache 2.0 + +# This script prepares the 1997 English Broadcast News (HUB4) corpus. +# /export/corpora/LDC/LDC98S71 +# /export/corpora/LDC/LDC98T28 + +# Begin configuration section. +# End configuration section +set -e -o pipefail +set -o nounset # Treat unset variables as an error + +if [ $# -ne 3 ]; then + echo "Usage: $0 " + echo " e.g.: $0 /export/corpora/LDC/LDC98T28/hub4e97_trans_980217 /export/corpora/LDC/LDC98S71/97_eng_bns_hub4 data/local/data/train_bn97" + exit 1 +fi + +text_source_dir=$1 # /export/corpora/LDC/LDC98T28/hub4e97_trans_980217 +speech_source_dir=$2 # /export/corpora/LDC/LDC98S71/97_eng_bns_hub4 +out=$3 + +mkdir -p $out; + +ls $text_source_dir/transcrp/*.sgml > $out/text.list +ls $speech_source_dir/*.sph > $out/audio.list + +if [ ! -s $out/text.list ] || [ ! -s $out/audio.list ]; then + echo "$0: Could not get text and audio files" + exit 1 +fi + +local/hub4_97_parse_sgm.pl $out/text.list > \ + $out/transcripts.txt 2> $out/parse_sgml.log || exit 1 + +if [ ! -s $out/transcripts.txt ]; then + echo "$0: Could not parse SGML files in $out/text.list" + exit 1 +fi + +echo "$0: 1997 English Broadcast News training data (HUB4) prepared in $out" +exit 0 diff --git a/egs/multi_en/s5/local/hub4_97_parse_sgm.pl b/egs/multi_en/s5/local/hub4_97_parse_sgm.pl new file mode 100755 index 00000000000..da2344df7c7 --- /dev/null +++ b/egs/multi_en/s5/local/hub4_97_parse_sgm.pl @@ -0,0 +1,235 @@ +#!/usr/bin/env perl +########################################################################################### +# This script was copied from egs/hub4_english/s5/local/data_prep/parse_sgm_1997_hub4_eng.pl +# The source commit was 191ae0a6e5db19d316c82a78c746bcd56cc2d7da +# No change was made +########################################################################################### + +#!/usr/bin/env perl +#=============================================================================== +# Copyright (c) 2017 Johns Hopkins University (Author: Jan "Yenda" Trmal ) +# 2017 Vimal Manohar +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. 
+# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. +#=============================================================================== + +use strict; +use warnings; +use utf8; + +require HTML::Parser or die "This script needs HTML::Parser from CPAN"; +HTML::Parser->import(); + +binmode(STDOUT, ":utf8"); + +sub trim { my $s = shift; $s =~ s/^\s+|\s+$//g; return $s }; + +sub parse_sgml_tag { + my $tag = shift(@_); + my %ret; + + if ($tag !~ /=/) { + return %ret; + } + + $tag =~ s/<[a-zA-Z]+ //; + $tag =~ s/> *$//; + #print $tag . "\n"; + + my @key_value_pairs = split / *,? +/, $tag; + for my $entry(@key_value_pairs) { + (my $key, my $value) = split '=', $entry, 2; + $ret{$key}=$value; + } + return %ret; +} + +if (@ARGV != 1) { + print STDERR "$0: This script needs exactly one parameter (list of SGML files)\n"; + print STDERR " Usage: $0 \n"; + print STDERR " where\n"; + print STDERR " is a file containing the official SGML format\n"; + print STDERR " transcripts. The files are parsed and the parsed representation\n"; + print STDERR " is dumped to STDOUT (one utterance + the additional data fields\n"; + print STDERR " per line (we dump all the fields, but not all fields are used\n"; + print STDERR " in the recipe).\n"; + die; +} +my $filelist=$ARGV[0]; + +my $p = HTML::Parser->new(); + +my @files=(); +open(F, '<', $filelist) or die "Could not open file $filelist: $?\n"; +while() { + chomp; + push @files, $_; +} + +foreach my $file (@files) { + my $reporter=""; + my $start = -1; + my $end = -1; + my $segment_start = -1; + my $segment_end = -1; + my $segment_speaker; + my $segment_fidelity = "XXX"; + my $segment_mode = "XXX"; + my $section_start = -1; + my $section_end = -1; + my $filename = ""; + my $seq = 0; + my @text = (); + my $time; + my @tagqueue; + + my $sgml_file = `basename $file`; + $sgml_file = trim $sgml_file; + $sgml_file =~ s/\.txt$//g; + $sgml_file =~ s/\.sgml$//g; + $sgml_file =~ s/_$//g; + + open(my $f, '<:encoding(iso-8859-1)', $file) or die "Could not open file $file: $?\n"; + + while(my $line = <$f>) { + chomp $line; + $line = trim $line; + $line = lc $line; + next unless $line; + + if ($line =~ //$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + #print "ES: $line\n"; + ; + } elsif ($line =~ /
/$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + #print "ES: $line\n"; + ; + } elsif ($line =~ //$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + + #print join(" ", @text) . "\n" if @text > 0; + my $new_time = $segment_end; + if (@text > 0) { + print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time "; + print join(" ", @text) . "\n"; + } + @text = (); + $time = 0; + $segment_speaker = "XXX"; + $segment_start = "XXX"; + $segment_end = "XXX"; + $segment_fidelity = "XXX"; + $segment_mode = "XXX"; + #print "ET: $line\n"; + ; + } elsif ($line =~ /