+
+ if(/[0-9]/o && !/ $40
+@@ -362,31 +363,31 @@
+ if($x =~ /\//)
+ { $x =~ s/^\D*//;
+ $x =~ s/\D*$//;
+- &printfrac($x);
++ if (! &printfrac($x)) {return 0;}
+ &pusho("of a $unit");
+ $x="";
+ $plural=0;
+ }
+
+ $x =~ s/^\D*([\d,]*)\D*.*$/$1/; # int part of string
+- if($x ne "") {&printint($x);} # print int part (eg. dollars)
++ if($x ne "") {if (! &printint($x)) {return 0;} } # print int part (eg. dollars)
+
+- if($next eq "and" && $next2 =~ /\d\/\d/ && next2 !~ /\/.*\//)
++ if($next eq "and" && $next2 =~ /\d\/\d/ && $next2 !~ /\/.*\//)
+ { if($unit && $x ne "") {&pusho("and");} # frac: eg 4 1/16
+ $z=$next2;
+ $z =~ s/\D*$//;
+- &printfrac($z);
++ if (! &printfrac($z)) {return 0;}
+ ($punct)=($next2 =~ /(\D*)$/);
+ $field+=2;
+ &pusho("${unit}s");
+
+- if($back) {&perr("money: back and 1 1/3");}
++ if($back) {&perr("money: back and 1 1/3"); return 0;}
+
+ if($punct) {&appendo($punct);} # punctuation from *illion
+- return;
++ return 1;
+ }
+
+ if($back eq "" && $next =~ /^(thousands?|[a-z]*illions?)(\W*)/i)
+- { &printdecfrac($_); # multiplier
++ { if (! &printdecfrac($_)) {return 0;} # multiplier
+ &pusho($1);
+ $punct=$2;
+@@ -395,7 +396,7 @@
+ $frac=1;
+ }
+ elsif(/\.\d$/ || /\.\d\D/ || /\.\d{3}/ ) # .d or .ddd+
+- { &printdecfrac($_);
++ { if (! &printdecfrac($_)) {return 0;}
+ $plural=1; # can be either
+ $frac=1;
+ }
+@@ -409,7 +410,7 @@
+ { $unit=""; # fix "$1 dollar" wsj typo
+ $subunit_sing="";
+ $subunit_pl="";
+- &printdecfrac($_);
++ if (! &printdecfrac($_)) {return 0;}
+ $frac=1;
+ }
+
+@@ -447,24 +448,26 @@
+ { $y=$_;
+ $y =~ s/^[^\.]*\.([\d]*)\D?.*$/$1/; # get fractional part
+ if($unit && $x ne "") {&pusho("and");}
+- &printint($y);
++ if (! &printint($y)) {return 0;}
+ if($sing || int($y)==1) {&pusho($subunit_sing);}
+ else {&pusho($subunit_pl);}
+ }
+
+ if($back) # punctuation from this field
+- { if($punct) {&perr("money: back and punct");}
++ { if($punct) {&perr("money: back and punct"); return 0;}
+
+ if($back =~ /^\w/) {&pusho($back);}
+ else {&appendo($back);}
+ }
+
+ if($punct) {&appendo($punct);} # punctuation from *illion
++
++ return 1;
+ }
+
+ sub printyear # &printyear(x)
+ { if($vflg) {print "printyear: $_[0]\n";}
+- &printnum($_[0]); # for now
++ return &printnum($_[0]); # for now
+ }
+
+ sub printtime # &printtime(x)
+@@ -475,7 +478,7 @@
+ local($front);
+ local($back);
+
+- if(/:{2,}/ || !/\d:\d/) {&perr("printtime: not a time");}
++ if(/:{2,}/ || !/\d:\d/) {&perr("printtime: not a time"); return 0;}
+
+ @x=split(/:/,$_);
+ ($front)=($x[0] =~ /^(\D*)/);
+@@ -487,20 +490,21 @@
+ { &pusho($front); # generally punctuation
+ if($front !~ /\w$/) {$appendflg=1;}
+ }
+- &printint($x[0]);
++ if (! &printint($x[0])) {return 0;}
+ if($x[1]==0)
+ { $_=$next;
+ if(!/^[aApP]\.?[nM]\.?$/) {&pusho("o'clock");}
+ }
+ elsif ($x[1]<10)
+ { &pusho("oh");
+- &printint($x[1]);
++ if (!&printint($x[1])) {return 0;}
+ }
+- else {&printint($x[1]);}
++ else {if (! &printint($x[1])) {return 0;} }
+ if($back)
+ { if($back =~ /^\w/) {&pusho($back);}
+ else {&appendo($back);} # generally punctuation
+ }
++ return 1;
+ }
+
+ sub printfrac
+@@ -530,8 +534,8 @@
+ }
+
+ @z=split(/\//,$x);
+- if($#z !=1) {&perr("printfrac: illegal fraction: $_[0]");}
+- if($z[1] <= 1) {&perr("printfrac: den too small: $_[0]");}
++ if($#z !=1) {&perr("printfrac: illegal fraction: $_[0]"); return 0;}
++ if($z[1] <= 1) {&perr("printfrac: den too small: $_[0]"); return 0;}
+
+ if($front)
+ { &pusho($front);
+@@ -541,22 +545,22 @@
+
+ if($sign) {&pusho($sign);}
+
+- &printint($z[0]); #numerator
++ if (! &printint($z[0])) { return 0;} #numerator
+ if($z[1] <= $#den) # small den from table (<20)
+ { &pusho($den[$z[1]]);
+- if($z[0]!=1) {&pluralize;}
++ if($z[0]!=1) {if (! &pluralize) {return 0;} }
+ }
+ else #large den
+ { $ones=int($z[1]%100);
+ $hun=100*int($z[1]/100);
+- if($hun>0) {&printint($hun);}
++ if($hun>0) {if (!&printint($hun)) {return 0;} }
+ if($ones==0)
+ { &appendo("th");
+- if($z[0]!=1) {&pluralize;}
++ if($z[0]!=1) {if (! &pluralize) {return 0;} }
+ }
+ elsif($ones<=$#largeden) # <20
+ { &pusho($largeden[$ones]);
+- if($z[0]!=1) {&pluralize;};
++ if($z[0]!=1) {if (!&pluralize) {return 0;} }
+ }
+ else
+ { $x=int($ones%10);
+@@ -569,11 +573,11 @@
+ }
+ if($x==0)
+ { &pusho("th");
+- if($z[0]!=1) {&pluralize;}
++ if($z[0]!=1) {if (! &pluralize) {return 0;} }
+ }
+ else
+ { &pusho($largeden[$x]);
+- if($z[0]!=1) {&pluralize;}
++ if($z[0]!=1) {if (! &pluralize) {return 0;} }
+ }
+ }
+ }
+@@ -585,6 +589,8 @@
+ &appendo($back);
+ }
+ }
++
++ return 1;
+ }
+
+ sub printnum # printnum(n)
+@@ -624,7 +630,7 @@
+ $x =~ s/\D*$//; # strip back: final . is punct
+ }
+
+- if($x =~ /[^\d\.,]/) {&perr("printnum: $_[0] is not a number");}
++ if($x =~ /[^\d\.,]/) {&perr("printnum: $_[0] is not a number"); return 0;}
+
+ if($x!=0 && $x =~ /^0/ && $x =~ /^\d*$/) # "oh" numbers
+ { if($front)
+@@ -641,7 +647,7 @@
+
+ if($back)
+ { if($back =~ /^s$/ || $back =~ /^s\W/) # back = s
+- { &pluralize; # eg. 1960s
++ { if (! &pluralize) {return 0;} # eg. 1960s
+ $back =~ s/^s//;
+ }
+ if($back)
+@@ -649,7 +655,7 @@
+ else {&appendo($back);} # back = punct or "'s"
+ }
+ }
+- return;
++ return 1;
+ }
+
+ if($x =~ /^\d/) # get integer part
+@@ -675,48 +681,48 @@
+ if($sign) { &pusho($sign); }
+
+ $ones=int($intpart%100);
+- if($comma) {&printint($intpart);}
++ if($comma) {if (! &printint($intpart)) {return 0;} }
+ elsif(($intpart>=1900 || $intpart>=1100 && $ones==0)
+ && $intpart<2000 && !$fracpart) #4 digit -> 2+2
+ { $hun=int($intpart/100);
+- &printint($hun);
+- if($ones>=10) {&printint($ones);}
++ if (! &printint($hun)) {return 0;}
++ if($ones>=10) {if (! &printint($ones)) {return 0;} }
+ elsif($ones>0)
+ { &pusho("oh");
+- &printint($ones);
++ if (! &printint($ones)) {return 0;}
+ }
+ else {&pusho("hundred");}
+ }
+ else
+- { &printint($intpart);
++ { if (! &printint($intpart)) {return 0;}
+ $y=$last;
+ $y =~ s/^\W*//; # thize dates: May 25th
+ if(length($intpart)<=2 && $months{$y})
+- { &thize("");
++ { if (! &thize("")) {return 0;}
+ $back =~ s/[a-z]//g;
+ }
+ }
+- if($fracpart) {&printdecfrac($fracpart);}
++ if($fracpart) {if (! &printdecfrac($fracpart)) {return 0;} }
+
+ if($back)
+ { if($back =~ /^s$/ || $back =~ /^s\W/) # back = s
+- { &pluralize; # eg. 1960s
++ { if (! &pluralize) {return 0;} # eg. 1960s
+ $back =~ s/^s//;
+ }
+ if($back =~ /^st$/ || $back =~ /^st\W/) # back= st
+- { &thize("st"); # eg. 1st
++ { if (! &thize("st")) {return 0;} # eg. 1st
+ $back =~ s/^st//;
+ }
+ if($back =~ /^nd$/ || $back =~ /^nd\W/) # back= nd
+- { &thize("nd"); # eg. 2nd
++ { if (! &thize("nd")) {return 0;} # eg. 2nd
+ $back =~ s/^nd//;
+ }
+ if($back =~ /^rd$/ || $back =~ /^rd\W/) # back= rd
+- { &thize("rd"); # eg. 3rd
++ { if (! &thize("rd")) {return 0;} # eg. 3rd
+ $back =~ s/^rd//;
+ }
+ if($back =~ /^th$/ || $back =~ /^th\W/) # back= th
+- { &thize("th"); # eg. 4th
++ { if (! &thize("th")) {return 0;} # eg. 4th
+ $back =~ s/^th//;
+ }
+ if($back)
+@@ -724,6 +730,7 @@
+ else {&appendo($back);} # back = punct or "'s"
+ }
+ }
++ return 1;
+ }
+
+ sub printdate # printdate(n): x/x/x format
+@@ -741,7 +748,7 @@
+ $back=$1;
+
+ if($x !~ /^\d{1,2}\/\d{1,2}\/(19)?\d{2}$/)
+- {&perr("printdate: $_[0] is not a date");}
++ {&perr("printdate: $_[0] is not a date"); return 0;}
+
+ @y=split(/\//,$x);
+ $y[2] =~ s/^19(\d{2})$/$1/;
+@@ -752,20 +759,21 @@
+ $appendflg=1;
+ }
+
+- &printint($y[0]);
++ if (! &printint($y[0])) {return 0;}
+ &appendo("/");
+
+ $appendflg=1;
+- &printint($y[1]);
++ if (! &printint($y[1])) {return 0;}
+ &appendo("/");
+
+ $appendflg=1;
+- &printint($y[2]);
++ if (! &printint($y[2])) {return 0;}
+
+ if($back)
+ { if($back =~ /^[a-zA-Z]/) {&appendo("-");}
+ &appendo($back);
+ }
++ return 1;
+ }
+
+ sub printserno # printserno(n): eg. B1, 3b2, 10W-40
+@@ -815,12 +823,12 @@
+ } # (should expand here unless in dictionary)
+ $x =~ s/^(\d*)//; # strip off dig
+ $y=$1;
+- if($y ne "") { &printdigstr($y); }
++ if($y ne "") { if (! &printdigstr($y)) {return 0;} }
+ }
+
+ if($back =~ /^s\b/) # back = s
+ { # eg. 2C60s
+- &pluralize;
++ if (! &pluralize) {return 0;}
+ $back =~ s/^s//;
+ }
+ if($back)
+@@ -828,6 +836,7 @@
+ else {&appendo($back);}
+ }
+ $appendflg=0;
++ return 1;
+ }
+
+ sub printdigstr # printdigstr(x)
+@@ -841,13 +850,12 @@
+ if($x =~ /^0/) # leading zero
+ { while($x ne "")
+ { $x =~ s/^(.)//;
+- if($1 !~ /\d/) {&perr("printdigstr: non-digit");}
++ if($1 !~ /\d/) {&perr("printdigstr: non-digit"); return 0;}
+ &pusho("$ones_z[$1]");
+ }
+- return;
++ return 1;
+ }
+ if($x =~ /^\d0*$/) # d, d0, d00, d000, etc
+- { &printint($x);
+- return;
++ { return &printint($x);
+ }
+
+@@ -857,30 +865,29 @@
+ for($k=0;$y[$k]==0;$k++) {} # k= nr following 0s
+
+ if($j==2) # 2 dig
+- { &printint($x);
+- return;
++ { return &printint($x);
+ }
+ if($j==3)
+- { &printint($y[2]);
++ { if (! &printint($y[2])) {return 0;}
+ if($y[1]==0) {&pusho("oh");}
+- &printint("$y[1]$y[0]");
+- return;
++ return &printint("$y[1]$y[0]");
+ }
+ if($j==5 && $k<=2)
+- { &printint("$y[4]");
++ { if (! &printint("$y[4]")) {return 0;}
+ $j=4;
+ }
+ if($j==4)
+- { &printint("$y[3]$y[2]");
++ { if (! &printint("$y[3]$y[2]")) {return 0;}
+ if($k==2) {&pusho("hundred");}
+ else
+ { if($y[1]==0) {&pusho("oh");}
+- &printint("$y[1]$y[0]");
++ return &printint("$y[1]$y[0]");
+ }
+- return;
++ return 1;
+ }
+ # >5 dig: just sequential dig
+ for($j--;$j>=0;$j--) {&pusho("$ones_oh[$y[$j]]");}
++ return 1;
+ }
+
+ sub printftin # printftin(n): eg. 6\'-4\"
+@@ -905,19 +912,19 @@
+
+ $x =~ s/^([\d\.]*)//; # strip off dig & .
+ $y=$1;
+- if(!$y) {&perr("printftin: bad feet");}
+- &printnum($y);
++ if(!$y) {&perr("printftin: bad feet"); return 0;}
++ if (! &printnum($y)) {return 0;}
+ if($y==1) {&appendo("-foot");}
+ else {&appendo("-feet");}
+
+ $x =~ s/^\'//; # strip off \'
+ $x =~ s/^-//; # strip off -
+- if(!$x) {&perr("printftin: bad intermed");}
++ if(!$x) {&perr("printftin: bad intermed"); return 0;}
+
+ $x =~ s/^([\d\.]*)//; # strip off dig & .
+ $y=$1;
+- if(!$y) {&perr("printftin: bad inches");}
+- &printnum($y);
++ if(!$y) {&perr("printftin: bad inches"); return 0;}
++ if (! &printnum($y)) {return 0;}
+ if($y==1) {&appendo("-inch");}
+ else {&appendo("-inches");}
+
+@@ -925,6 +932,7 @@
+ { if($back !~ /^[a-zA-Z]/) {&appendo($back);}
+ else {&pusho($back);}
+ }
++ return 1;
+ }
+
+ sub printint # printint(x)
+@@ -968,13 +976,14 @@
+ }
+ if(int($j/3)>0)
+ { if(int($j/3) > $#mult)
+- { &perr("printint: too big"); }
++ { &perr("printint: too big"); return 0;}
+ &pusho($mult[int($j/3)]);
+ }
+ $commanextflg=1;
+ }
+ }
+ $commanextflg=0;
++ return 1;
+ }
+
+ sub printdecfrac
+@@ -989,6 +998,8 @@
+ if($leadingzeroflg)
+ {for($j=0;$j<=$#y;$j++) { &pusho($ones_z[$y[$j]]);}}
+ else {for($j=0;$j<=$#y;$j++) { &pusho($ones_oh[$y[$j]]);}}
++
++ return 1;
+ }
+
+ sub pluralize # pluralize(): pluralize last entry on output stack
+@@ -1016,7 +1027,9 @@
+ $x =~ s/y$/ies/;
+ &pusho($x);
+ }
+- else {&perr("pluralize: unknown word: $_");}
++ else {&perr("pluralize: unknown word: $_"); return 0;}
++
++ return 1;
+ }
+
+ sub thize # thize(): add th to last entry on output stack
+@@ -1028,50 +1041,51 @@
+ $_=&geto;
+ if( /four$/ || /six$/ || /seven$/ || /ten$/ ||
+ /eleven$/ || /een$/ || /hundred$/ || /thousand$/ || /illion$/ )
+- { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n");} # xth
++ { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n"); return 0;} # xth
+ &appendo("th");
+ }
+ elsif( /one$/ ) # 1st
+- { if($y && $y ne "st") {&perr("thize: mismatch: $_ $y\n");}
++ { if($y && $y ne "st") {&perr("thize: mismatch: $_ $y\n"); return 0;}
+ $x=&popo();
+ $x =~ s/one$/first/;
+ &pusho($x);
+ }
+ elsif( /two$/ ) # 2nd
+- { if($y && $y ne "nd") {&perr("thize: mismatch: $_ $y\n");}
++ { if($y && $y ne "nd") {&perr("thize: mismatch: $_ $y\n"); return 0;}
+ $x=&popo();
+ $x =~ s/two$/second/;
+ &pusho($x);
+ }
+ elsif( /three$/ ) # 3rd
+- { if($y && $y ne "rd") {&perr("thize: mismatch: $_ $y\n");}
++ { if($y && $y ne "rd") {&perr("thize: mismatch: $_ $y\n"); return 0;}
+ $x=&popo();
+ $x =~ s/three$/third/;
+ &pusho($x);
+ }
+ elsif( /five$/ || /twelve$/ ) # 5th, 12th
+- { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n");}
++ { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n"); return 0;}
+ $x=&popo();
+ $x =~ s/ve$/fth/;
+ &pusho($x);
+ }
+ elsif(/eight$/)
+- { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n");} # 8th
++ { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n"); return 0;} # 8th
+ &appendo("h");
+ }
+ elsif( /nine$/ )
+- { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n");}
++ { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n"); return 0;}
+ $x=&popo();
+ $x =~ s/nine$/ninth/;
+ &pusho($x);
+ }
+ elsif( /ty$/ )
+- { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n");}
++ { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n"); return 0;}
+ $x=&popo();
+ $x =~ s/ty$/tieth/;
+ &pusho($x);
+ }
+- else {&perr("thize: unknown word: $_");}
++ else {&perr("thize: unknown word: $_"); return 0;}
++ return 1;
+ }
+
+ sub pusho # pusho($x): push output
+@@ -1089,17 +1103,17 @@
+ sub appendo # appendo($x): append to output
+ { $appendflg=0;
+ # if($#output < 0) {&pusho("");}
+- if($#output < 0) {&perr("appendo: output empty");}
++ if($#output < 0) {&perr("appendo: output empty"); return 0;}
+ $output[$#output] .= @_[0];
+ }
+
+ sub popo # popo(): pop last output
+-{ if($#output < 0) {&perr("popo: output empty");}
++{ if($#output < 0) {&perr("popo: output empty"); return 0;}
+ pop(@output);
+ }
+
+ sub geto # geto(): get last output
+-{ if($#output < 0) {&perr("geto: output empty");}
++{ if($#output < 0) {&perr("geto: output empty"); return 0;}
+ return $output[$#output];
+ }
+
+@@ -1111,8 +1125,6 @@
+ $appendflg=0;
+ $commanextflg=0;
+ &pusho($this);
+- $field++; # graceful error recovery
+- goto wloop;
+ }
+
+ sub perr2
+diff -Naur tools/csr4_utils/pare-sgml.perl local/data_prep/csr_hub4_utils/pare-sgml.perl
+--- tools/csr4_utils/pare-sgml.perl 1996-08-27 15:25:17.000000000 -0400
++++ local/data_prep/csr_hub4_utils/pare-sgml.perl 2017-11-03 13:22:09.486213159 -0400
+@@ -1,11 +1,14 @@
+-#!/usr/local/bin/perl
++#!/usr/bin/perl
+
+ # $Id: pare-sgml.perl,v 1.3 1996/08/15 02:51:17 robertm Rel $
+ # removes extraneous headers and other non-LM fields
+ # translates into LM-standard
+ # removes comments (enclosed in brackets)
+
+-$intext=0;
++use strict;
++use warnings;
++
++my $intext=0;
+ while (<>)
+ {
+ if ($intext == 0)
+diff -Naur tools/csr4_utils/process_filelist.sh local/data_prep/csr_hub4_utils/process_filelist.sh
+--- tools/csr4_utils/process_filelist.sh 1969-12-31 19:00:00.000000000 -0500
++++ local/data_prep/csr_hub4_utils/process_filelist.sh 2017-11-03 13:22:09.490213160 -0400
+@@ -0,0 +1,30 @@
++#! /bin/bash
++
++set -e
++set -o pipefail
++set -u
++set -x
++
++if [ $# -ne 2 ]; then
++ echo "Usage: $0 <filelist> <dir>"
++ exit 1
++fi
++
++filelist=$1
++dir=$2
++
++export PATH=$PATH:tools/csr4_utils
++
++for file in `cat $filelist`; do
++ BASENM=`basename $file`
++ name="${BASENM%.*}"
++
++ echo "Running LM pipeline for |$BASENM|..." 1>&2
++ gunzip -c $file | pare-sgml.perl | \
++ bugproc.perl | \
++ numhack.perl | \
++ numproc.perl -xtools/csr4_utils/num_excp | \
++ abbrproc.perl tools/csr4_utils/abbrlist | \
++ puncproc.perl -np | gzip -c > $dir/$name.txt.gz
++ echo "Done with $BASENM."
++done
+diff -Naur tools/csr4_utils/progsummary.perl local/data_prep/csr_hub4_utils/progsummary.perl
+--- tools/csr4_utils/progsummary.perl 1996-07-12 09:26:35.000000000 -0400
++++ local/data_prep/csr_hub4_utils/progsummary.perl 2017-11-03 13:22:09.494213160 -0400
+@@ -1,4 +1,4 @@
+-#!/usr/local/bin/perl
++#!/usr/bin/perl
+
+ # Program: progsummary.perl
+ # Written by: dave graff
+diff -Naur tools/csr4_utils/puncproc.perl local/data_prep/csr_hub4_utils/puncproc.perl
+--- tools/csr4_utils/puncproc.perl 1996-08-27 15:25:17.000000000 -0400
++++ local/data_prep/csr_hub4_utils/puncproc.perl 2017-11-03 13:22:09.494213160 -0400
+@@ -1,4 +1,4 @@
+-#!/usr/local/bin/perl
++#!/usr/bin/perl
+
+ # $Id: puncproc.perl,v 1.2 1996/08/05 16:12:42 robertm Rel $
+ ###############################################################################
+@@ -59,7 +59,7 @@
+ # forbidden symbols
+ if(/</) {&perr("<");} # <
+ if(/>/) {&perr(">");} # >
+- if(/\$/) {&perr("$");} # $
++ if(/\$/) {&perr("\$");} # $
+ if(/_/) {&perr("_");} # _
+ if(/\d/) {&perr("[0-9]");} # 0-9
+
+diff -Naur tools/csr4_utils/tr-bn-char.fast.perl local/data_prep/csr_hub4_utils/tr-bn-char.fast.perl
+--- tools/csr4_utils/tr-bn-char.fast.perl 1996-08-21 02:39:12.000000000 -0400
++++ local/data_prep/csr_hub4_utils/tr-bn-char.fast.perl 2017-11-03 13:22:09.502213160 -0400
+@@ -1,4 +1,4 @@
+-#!/usr/local/bin/perl -pi.old-char
++#!/usr/bin/perl -pi.old-char
+
+ # handles nonprinting characters in Broadcast News material, to the extent
+ # that they can be handled, and perhaps a bit beyond...
+diff -Naur tools/csr4_utils/tr-bn-char.slow.perl local/data_prep/csr_hub4_utils/tr-bn-char.slow.perl
+--- tools/csr4_utils/tr-bn-char.slow.perl 1996-08-21 01:30:18.000000000 -0400
++++ local/data_prep/csr_hub4_utils/tr-bn-char.slow.perl 2017-11-03 13:22:09.502213160 -0400
+@@ -1,4 +1,4 @@
+-#!/usr/local/bin/perl -p
++#!/usr/bin/perl -p
+
+ # handles nonprinting characters in Broadcast News material, to the extent
+ # that they can be handled, and perhaps a bit beyond...
diff --git a/egs/hub4_english/s5/local/data_prep/format_1996_bn_data.pl b/egs/hub4_english/s5/local/data_prep/format_1996_bn_data.pl
new file mode 100755
index 00000000000..84913e9a8b0
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/format_1996_bn_data.pl
@@ -0,0 +1,131 @@
+#!/usr/bin/env perl
+#===============================================================================
+# Copyright (c) 2017 Johns Hopkins University
+# (Author: Jan "Yenda" Trmal )
+# 2017 Vimal Manohar
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+use strict;
+use warnings;
+use utf8;
+use List::Util qw(max);
+
+my $audio_width=1;
+my $speaker_width=1;
+my $time_width=1;
+
+binmode(STDOUT, ":utf8");
+binmode(STDERR, ":utf8");
+
+if (@ARGV != 3) {
+ print STDERR "$0: Error: Unsupported number of arguments: " . scalar @ARGV ."\n";
+ print STDERR " Usage: $0 <audio-files> <transcripts> <out-dir>\n";
+ print STDERR " where\n";
+ print STDERR " <audio-files> is a file containing list of audio files\n";
+ print STDERR " (single absolute path name per line)\n";
+ print STDERR " <transcripts> is a file containing transcripts obtained\n";
+ print STDERR " by processing the official SGML format\n";
+ print STDERR " transcripts. See parse_sgm.pl for further info.\n";
+ print STDERR " <out-dir> target directory (should already exist)\n";
+ print STDERR " See also: local/parse_sgm.pl\n";
+ die;
+}
+
+my $audio_files = $ARGV[0];
+my $transcripts = $ARGV[1];
+my $out = $ARGV[2];
+
+my %AUDIO;
+open(my $audio_f, "<", $audio_files)
+ or die "$0: Error: Could not open $audio_files: $!\n";
+while(my $line = <$audio_f>) {
+ chomp $line;
+ (my $basename = $line) =~ s/.*\/([^\/]+)\.sph/$1/g;
+ $basename =~ s/_$//g;
+ $AUDIO{$basename} = $line;
+}
+close($audio_f);
+
+my %TRANSCRIPT;
+open(my $transcript_f, "<:encoding(utf-8)", $transcripts)
+ or die "$0: Error: Could not open $transcripts: $!\n";
+while(my $line = <$transcript_f>) {
+ chomp $line;
+ my @F = split / /, $line, 8;
+ push @{$TRANSCRIPT{$F[0]}}, \@F;
+
+ my $f1 = $F[0];
+ my $f2 = $F[1];
+ my $speaker = $F[2];
+ my $t1 = $F[5];
+ my $t2 = $F[6];
+
+ $time_width = max $time_width, length($t1), length($t2);
+ $speaker_width = max $speaker_width, length($speaker);
+ $audio_width = max $audio_width, length($f1);
+}
+close($transcript_f);
+#print Dumper(\%TRANSCRIPT);
+
+print STDERR $time_width . " " . $speaker_width . " " . $audio_width . "\n";
+
+my $sph2pipe = `which sph2pipe` or do {
+ die "$0: Error: sph2pipe is not installed. Did you run make in the tools/ directory?\n";
+};
+chomp $sph2pipe;
+
+open(my $wav_file, ">", "$out/wav.scp")
+ or die "$0: Error: Cannot create file $out/wav.scp: $!\n";
+open(my $text_file, ">:encoding(utf-8)", "$out/text")
+ or die "$0: Error: Cannot create file $out/text: $!\n";
+open(my $segments_file, ">", "$out/segments")
+ or die "$0: Error: Cannot create file $out/segments: $!\n";
+open(my $spk_file, ">", "$out/utt2spk")
+ or die "$0: Error: Cannot create file $out/utt2spk: $!\n";
+
+foreach my $file (sort keys %AUDIO) {
+ print STDERR "$0 Error: $file does not exist in transcripts!\n"
+ unless exists $TRANSCRIPT{$file};
+ my $transcripts = $TRANSCRIPT{$file};
+
+ #my $file_fmt = sprintf("%0${audio_width}s", $file);
+ my $file_fmt = sprintf("%s", $file);
+
+ print $wav_file "$file_fmt $sph2pipe -f wav $AUDIO{$file}|\n";
+
+ foreach my $utt (@{$transcripts}) {
+ my $start = $utt->[5] + 0.0;
+ my $end = $utt->[6] + 0.0;
+ if ($end - $start < 0.005) { # remove very short segments
+ next;
+ }
+ my $start_time = sprintf("%0${time_width}d", $utt->[5]*1000);
+ my $end_time = sprintf("%0${time_width}d", $utt->[6]*1000);
+ my $spk = sprintf("%0${speaker_width}s", $utt->[2]);
+ # my $spk = sprintf("%s", $utt->[2]);
+ my $spkid = "${file_fmt}_${spk}";
+ my $uttid = "${file_fmt}_${spk}_${start_time}_${end_time}";
+
+ print $text_file "$uttid $utt->[7]\n";
+ print $spk_file "$uttid $spkid\n";
+ print $segments_file "$uttid $file_fmt $start $end\n";
+ }
+}
+
+close($wav_file);
+close($text_file);
+close($segments_file);
+close($spk_file);
diff --git a/egs/hub4_english/s5/local/data_prep/format_1997_bn_data.pl b/egs/hub4_english/s5/local/data_prep/format_1997_bn_data.pl
new file mode 120000
index 00000000000..844c16bbe06
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/format_1997_bn_data.pl
@@ -0,0 +1 @@
+format_1996_bn_data.pl
\ No newline at end of file
diff --git a/egs/hub4_english/s5/local/data_prep/hub4_utils.py b/egs/hub4_english/s5/local/data_prep/hub4_utils.py
new file mode 100644
index 00000000000..4ee9eab1c7e
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/hub4_utils.py
@@ -0,0 +1,156 @@
+# Copyright 2016 Vimal Manohar
+# Apache 2.0.
+
+"""This module contains utilities for preparing the HUB4 broadcast news
+evaluation corpora.
+"""
+
+import os
+import re
+import sys
+
+
+def parse_uem_line(reco, line):
+ """This method parses a 'line' from the UEM for recording 'reco'
+ and returns the line converted to kaldi segments format.
+ The format of UEM is
+ <file-id> <channel> <start-time> <end-time>
+
+ We force the channel to be 1 and take the file-id to be the recording-id.
+ """
+ line = line.strip()
+ if len(line) == 0 or line[0:2] == ";;":
+ return None
+ parts = line.split()
+
+ if reco is None:
+ reco = parts[0]
+
+ # The channel ID is expected to be 1.
+ if parts[1] != "1":
+ raise TypeError("Invalid line {0}".format(line))
+
+ start_time = float(parts[2])
+ end_time = float(parts[3])
+
+ utt = "{0}-{1:06d}-{2:06d}".format(reco, int(start_time * 100),
+ int(end_time * 100))
+ return "{0} {1} {2} {3}".format(utt, reco, start_time, end_time)
+
+
+def parse_cmu_seg_line(line, prepend_reco_to_spk=False):
+ """This method parses a 'line' from the CMU automatic segmentation for a
+ recording.
+ The CMU segmentation has the following format:
+ <file-id> <channel> <speaker> <start-time> <end-time> ...
+
+ e.g.:
+ h4e_98_1 1 F0-0000 0.00 28.22 F0
+
+ We force the channel to be 1 and take the file-id to be the recording-id.
+ """
+ line = line.strip()
+ if len(line) == 0 or line[0:2] == ";;":
+ return None
+ parts = line.split()
+
+ # Actually a file, but we assume a 1-1 mapping to recording and force
+ # channel to be 1.
+ reco = parts[0]
+
+ # The channel ID is expected to be 1.
+ if parts[1] != "1":
+ raise TypeError("Invalid line {0}".format(line))
+ spk = parts[2]
+
+ start_time = float(parts[3])
+ end_time = float(parts[4])
+
+ if prepend_reco_to_spk:
+ spk = reco + '-' + spk
+ utt = "{spk}-{0:06d}-{1:06d}".format(int(start_time * 100),
+ int(end_time * 100), spk=spk)
+ else:
+ utt = "{spk}-{reco}-{0:06d}-{1:06d}".format(int(start_time * 100),
+ int(end_time * 100),
+ reco=reco, spk=spk)
+
+ segment_line = "{0} {1} {st:.3f} {end:.3f}".format(
+ utt, reco, st=start_time, end=end_time)
+ utt2spk_line = "{0} {1}".format(utt, spk)
+
+ return (segment_line, utt2spk_line)
+
+
+def normalize_csr_transcript(text, noise_word, spoken_noise_word):
+ """Normalize broadcast news transcript for audio."""
+ text = text.upper()
+
+ # Remove long event markings
+ text = re.sub(r"\[[^]/]+/\]|\[/[^]/]+\]", "", text)
+ # Remove comments
+ text = re.sub(r"\{\{[^}]*\}\}", "", text)
+ # Replace alternative words with a single one (second alternative)
+ text = re.sub(r"\{[^}/]+/([^}/]+)[^}]*\}", r"\1", text)
+ # Remove partial word completions
+ text = re.sub(r"\([^)]+\)-|-\([^)]+\)", "-", text)
+ # Remove accent marks and diacritics
+ text = re.sub(r"\\[3-8]", "", text)
+
+ # Remove unclear speech markings
+ text = re.sub(r"\(\(([^)]*)\)\)", r"\1", text)
+ text = re.sub(r"#", "", text) # Remove overlapped speech markings
+ # Remove invented word markings
+ text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text)
+ # Replace speaker-made noises with spoken_noise_word
+ text = re.sub(r"\[INHALING\]|\[COUGH\]|\[THROAT_CLEARING\]|\[SIGN\]",
+ spoken_noise_word, text)
+ # Replace noise with noise_word
+ text = re.sub(r"\[[^]]+\]", noise_word, text)
+ text = re.sub(r"\+([^+]+)\+", r"\1", text)
+
+ # Remove periods after letter.
+ text = re.sub(r"([A-Z])\.( |$)", r"\1 ", text)
+ # Replace \. with .
+ text = re.sub(r"\\\.", r".", text)
+
+ text1 = []
+ for word in text.split():
+ if word == spoken_noise_word or word == noise_word:
+ text1.append(word)
+ continue
+
+ # Remove mispronunciation brackets
+ word = re.sub(r"^@(\w+)$", r"\1", word)
+ # Remove everything other than the standard ASCII symbols
+ word = re.sub("[^A-Za-z0-9.' _-]", "", word)
+ text1.append(word)
+ return " ".join(text1)
+
+
+def remove_punctuations(text):
+ """Remove punctuations and some other processing for text sentence."""
+ # Remove HTML new lines that are not end of sentences
+ text1 = re.sub("\n", " ", text)
+
+ # Remove some markers like double dash that are normally used to separate
+ # name titles in newspapers.
+ text1 = re.sub(r"(&[^;]+;|--)", " ", text1)
+
+ # Remove quotation marks
+ text1 = re.sub(r"''|``|\(|\)", " ", text1)
+
+ # Remove everything other than the standard ASCII symbols
+ text1 = re.sub("[^A-Za-z0-9.' _-]", "", text1)
+
+ # Replace multiple .'s with single and then remove isolated '.'
+ text1 = re.sub(r"\.[.]+ ", ". ", text1)
+ text1 = re.sub(r" \. ", " ", text1)
+
+ # Remove isolated '-'
+ text1 = re.sub(r" - ", " ", text1)
+
+ # Replace multiple spaces with single.
+ text1 = re.sub(r"[ ]+", " ", text1)
+
+ return text1
diff --git a/egs/hub4_english/s5/local/data_prep/normalize_bn96_transcripts.pl b/egs/hub4_english/s5/local/data_prep/normalize_bn96_transcripts.pl
new file mode 100755
index 00000000000..3db0e1c71c3
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/normalize_bn96_transcripts.pl
@@ -0,0 +1,28 @@
+#!/usr/bin/env perl
+
+# Copyright 2017 Vimal Manohar
+# Apache 2.0
+
+@ARGV == 2 || die "usage: normalize_bn96_transcripts.pl noise_word spoken_noise_word < transcript > transcript2";
+$noise_word = shift @ARGV;
+$spoken_noise_word = shift @ARGV;
+
+while(<STDIN>) { # transcript lines are read from standard input (see usage)
+ $_ =~ m:^(\S+) (.+): || die "bad line $_";
+ $utt = $1;
+ $trans = $2;
+ print "$utt";
+
+ $trans =~ tr:a-z:A-Z:;
+ $trans =~ s:\(\(([^)]*)\)\):$1 :g; # Remove unclear speech markings
+ $trans =~ s:#: :g; # Remove overlapped speech markings
+ $trans =~ s:\*\*([^*]+)\*\*:$1 :g; # Remove invented word markings
+ $trans =~ s:\[[^]]+\]:$noise_word :g;
+ $trans =~ s:\{[^}]+\}:$spoken_noise_word :g;
+ $trans =~ s:^[+]([^+]+)[+]$:$1:; # Remove mispronunciation brackets
+ foreach $w (split (" ",$trans)) {
+ $w =~ s:^@(.*)$:$1:; # Remove best guess marking for proper nouns
+ print " $w";
+ }
+ print "\n";
+}
diff --git a/egs/hub4_english/s5/local/data_prep/normalize_bn97_transcripts.pl b/egs/hub4_english/s5/local/data_prep/normalize_bn97_transcripts.pl
new file mode 100755
index 00000000000..b27f8da65f8
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/normalize_bn97_transcripts.pl
@@ -0,0 +1,36 @@
+#!/usr/bin/env perl
+
+# Copyright 2017 Vimal Manohar
+# Apache 2.0
+
+@ARGV == 2 || die "usage: normalize_bn97_transcripts.pl noise_word spoken_noise_word < transcript > transcript2";
+$noise_word = shift @ARGV;
+$spoken_noise_word = shift @ARGV;
+
+while(<STDIN>) { # transcript lines are read from standard input (see usage)
+ $_ =~ m:^(\S+) (.+): || die "bad line $_";
+ $utt = $1;
+ $trans = $2;
+ print "$utt";
+
+ $trans =~ tr:a-z:A-Z:;
+ $trans =~ s:\(\(([^)]*)\)\):$1 :g; # Remove unclear speech markings
+ $trans =~ s:#: :g; # Remove overlapped speech markings
+ $trans =~ s:\*\*([^*]+)\*\*:$1 :g; # Remove invented word markings
+ $trans =~ s:\[[^]]+\]:$noise_word :g;
+ $trans =~ s:\{[^}]+\}:$spoken_noise_word :g;
+ $trans =~ s:^[+]([^+]+)[+]$:$1:; # Remove mispronunciation brackets
+ foreach $w (split (" ",$trans)) {
+ if ($w ne $noise_word && $w ne $spoken_noise_word) {
+ $w =~ s:[?.,!]+$::; # Remove punctuations
+ $w =~ s:^@(.*)$:$1:; # Remove best guess marking for proper nouns
+ $w =~ s:^[\^](.*)$:$1:; # Remove capitalization marks
+ $w =~ s:_([A-Z])'S$:$1.'S :g; # Normalize abbreviations from _f_b_i to f. b. i.
+ $w =~ s:_([A-Z]):$1. :g; # Normalize abbreviations from _f_b_i to f. b. i.
+ $w =~ s:[ ]+$::; # Remove trailing spaces
+ }
+
+ print " $w";
+ }
+ print "\n";
+}
diff --git a/egs/hub4_english/s5/local/data_prep/parse_sgm_1996_hub4_eng.pl b/egs/hub4_english/s5/local/data_prep/parse_sgm_1996_hub4_eng.pl
new file mode 100755
index 00000000000..37487296809
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/parse_sgm_1996_hub4_eng.pl
@@ -0,0 +1,229 @@
+#!/usr/bin/env perl
+#===============================================================================
+# Copyright (c) 2017 Johns Hopkins University (Author: Jan "Yenda" Trmal )
+# 2017 Vimal Manohar
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+use strict;
+use warnings;
+use utf8;
+
+require HTML::Parser or die "This script needs HTML::Parser from CPAN";
+HTML::Parser->import();
+
+binmode(STDOUT, ":utf8");
+
+# Return a copy of the argument with leading and trailing whitespace removed.
+sub trim {
+  my ($text) = @_;
+  $text =~ s/^\s+//;
+  $text =~ s/\s+$//;
+  return $text;
+}
+
+# Split the attributes of one SGML tag into a hash.
+#   argument: the tag text, e.g. "<name key1=val1 key2=val2>"
+#   returns:  a hash (as a list) mapping each key to the text after its first '=';
+#             empty when the tag contains no '=' at all.
+sub parse_sgml_tag {
+  my ($markup) = @_;
+  my %attr_of;
+
+  # No '=' anywhere means there is nothing to parse.
+  return %attr_of unless $markup =~ /=/;
+
+  # Strip the "<name " prefix and the closing ">" (plus trailing spaces).
+  $markup =~ s/<[a-zA-Z]+ //;
+  $markup =~ s/> *$//;
+
+  # Entries are separated by spaces, optionally preceded by a comma.
+  foreach my $pair (split / *,? +/, $markup) {
+    # Limit of 2 keeps any further '=' inside the value.
+    my ($name, $content) = split '=', $pair, 2;
+    $attr_of{$name} = $content;
+  }
+  return %attr_of;
+}
+
+# Command line: exactly one argument, the path of a list file whose lines are
+# read below as the SGML transcript files to process.
+if (@ARGV != 1) {
+ print STDERR "$0: This script needs exactly one parameter (list of SGML files)\n";
+ print STDERR " Usage: $0 <filelist>\n";
+ print STDERR " where\n";
+ print STDERR " <filelist> is a file containing the official SGML format\n";
+ print STDERR " transcripts. The files are parsed and the parsed representation\n";
+ print STDERR " is dumped to STDOUT (one utterance + the additional data fields\n";
+ print STDERR " per line (we dump all the fields, but not all fields are used\n";
+ print STDERR " in the recipe).\n";
+ die;
+}
+my $filelist=$ARGV[0];
+
+my $p = HTML::Parser->new();
+
+# Collect the file names, one per line of the list file.
+my @files=();
+# $! holds the OS error for a failed open; $? is the status of the last child process.
+open(F, '<', $filelist) or die "Could not open file $filelist: $!\n";
+# An empty while() condition is always true and reads nothing, so read F explicitly.
+while(<F>) {
+ chomp;
+ push @files, $_;
+}
+close(F);
+
+foreach my $file (@files) {
+ my $reporter="";
+ my $start = -1;
+ my $end = -1;
+ my $segment_start = -1;
+ my $segment_end = -1;
+ my $segment_speaker;
+ my $segment_fidelity = "XXX";
+ my $segment_mode = "XXX";
+ my $section_start = -1;
+ my $section_end = -1;
+ my $filename = "";
+ my $seq = 0;
+ my @text = ();
+ my $time;
+ my @tagqueue;
+
+ my $sgml_file = `basename $file`;
+ $sgml_file = trim $sgml_file;
+ $sgml_file =~ s/\.txt$//g;
+ $sgml_file =~ s/\.sgml$//g;
+ $sgml_file =~ s/_$//g;
+
+ open(my $f, '<:encoding(iso-8859-1)', $file) or die "Could not open file $file: $?\n";
+
+ while(my $line = <$f>) {
+ chomp $line;
+ $line = trim $line;
+ $line = lc $line;
+ next unless $line;
+
+ if ($line =~ //$1/g;
+ $line = trim $line;
+ die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line;
+ #print "ES: $line\n";
+ ;
+ } elsif ($line =~ //$1/g;
+ $line = trim $line;
+ die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line;
+ #print "ES: $line\n";
+ ;
+ } elsif ($line =~ //$1/g;
+ $line = trim $line;
+ die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line;
+
+ #print join(" ", @text) . "\n" if @text > 0;
+ my $new_time = $segment_end;
+ if (@text > 0) {
+ print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time ";
+ print join(" ", @text) . "\n";
+ }
+ @text = ();
+ $time = 0;
+ $segment_speaker = "XXX";
+ $segment_start = "XXX";
+ $segment_end = "XXX";
+ $segment_fidelity = "XXX";
+ $segment_mode = "XXX";
+ #print "ET: $line\n";
+ ;
+ } elsif ($line =~ / 0) {
+ print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time ";
+ print join(" ", @text) . "\n";
+ }
+ @text = ();
+ $time = $new_time;
+ ;
+ } elsif ($line =~ /<\/sync/) {
+ #print $line;
+ ;
+ } elsif ($line =~ /)
+# 2017 Vimal Manohar
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+use strict;
+use warnings;
+use utf8;
+
+require HTML::Parser or die "This script needs HTML::Parser from CPAN";
+HTML::Parser->import();
+
+binmode(STDOUT, ":utf8");
+
+# Strip whitespace from both ends of the argument and return the result.
+sub trim {
+  my ($value) = @_;
+  $value =~ s/^\s+//;
+  $value =~ s/\s+$//;
+  return $value;
+}
+
+# Turn the attribute list of one SGML tag into a key => value hash.
+#   argument: the tag text, e.g. "<name key1=val1, key2=val2>"
+#   returns:  the hash as a list; empty when the tag has no '=' in it.
+sub parse_sgml_tag {
+  my ($tag_text) = @_;
+  my %fields;
+
+  # Tags without '=' have no attributes to extract.
+  return %fields unless $tag_text =~ /=/;
+
+  # Remove the leading "<name " and the trailing ">" with any spaces after it.
+  $tag_text =~ s/<[a-zA-Z]+ //;
+  $tag_text =~ s/> *$//;
+
+  # Attributes are space-separated, with an optional comma before the spaces.
+  foreach my $item (split / *,? +/, $tag_text) {
+    # Split on the first '=' only, so the value may itself contain '='.
+    my ($field, $setting) = split '=', $item, 2;
+    $fields{$field} = $setting;
+  }
+  return %fields;
+}
+
+# Command line: exactly one argument, the path of a list file whose lines are
+# read below as the SGML transcript files to process.
+if (@ARGV != 1) {
+ print STDERR "$0: This script needs exactly one parameter (list of SGML files)\n";
+ print STDERR " Usage: $0 <filelist>\n";
+ print STDERR " where\n";
+ print STDERR " <filelist> is a file containing the official SGML format\n";
+ print STDERR " transcripts. The files are parsed and the parsed representation\n";
+ print STDERR " is dumped to STDOUT (one utterance + the additional data fields\n";
+ print STDERR " per line (we dump all the fields, but not all fields are used\n";
+ print STDERR " in the recipe).\n";
+ die;
+}
+my $filelist=$ARGV[0];
+
+my $p = HTML::Parser->new();
+
+# Collect the file names, one per line of the list file.
+my @files=();
+# $! holds the OS error for a failed open; $? is the status of the last child process.
+open(F, '<', $filelist) or die "Could not open file $filelist: $!\n";
+# An empty while() condition is always true and reads nothing, so read F explicitly.
+while(<F>) {
+ chomp;
+ push @files, $_;
+}
+close(F);
+
+foreach my $file (@files) {
+ my $reporter="";
+ my $start = -1;
+ my $end = -1;
+ my $segment_start = -1;
+ my $segment_end = -1;
+ my $segment_speaker;
+ my $segment_fidelity = "XXX";
+ my $segment_mode = "XXX";
+ my $section_start = -1;
+ my $section_end = -1;
+ my $filename = "";
+ my $seq = 0;
+ my @text = ();
+ my $time;
+ my @tagqueue;
+
+ my $sgml_file = `basename $file`;
+ $sgml_file = trim $sgml_file;
+ $sgml_file =~ s/\.txt$//g;
+ $sgml_file =~ s/\.sgml$//g;
+ $sgml_file =~ s/_$//g;
+
+ open(my $f, '<:encoding(iso-8859-1)', $file) or die "Could not open file $file: $?\n";
+
+ while(my $line = <$f>) {
+ chomp $line;
+ $line = trim $line;
+ $line = lc $line;
+ next unless $line;
+
+ if ($line =~ //$1/g;
+ $line = trim $line;
+ die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line;
+ #print "ES: $line\n";
+ ;
+ } elsif ($line =~ //$1/g;
+ $line = trim $line;
+ die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line;
+ #print "ES: $line\n";
+ ;
+ } elsif ($line =~ //$1/g;
+ $line = trim $line;
+ die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line;
+
+ #print join(" ", @text) . "\n" if @text > 0;
+ my $new_time = $segment_end;
+ if (@text > 0) {
+ print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time ";
+ print join(" ", @text) . "\n";
+ }
+ @text = ();
+ $time = 0;
+ $segment_speaker = "XXX";
+ $segment_start = "XXX";
+ $segment_end = "XXX";
+ $segment_fidelity = "XXX";
+ $segment_mode = "XXX";
+ #print "ET: $line\n";
+ ;
+ } elsif ($line =~ /