diff --git a/egs/hub4_english/s5/README b/egs/hub4_english/s5/README new file mode 100644 index 00000000000..7db319fe174 --- /dev/null +++ b/egs/hub4_english/s5/README @@ -0,0 +1,33 @@ +This is the English Broadcast News (HUB4) corpus. + +1996 English Broadcast News Train (HUB4) + Speech LDC97S44 + Transcripts LDC97T22 + +1997 English Broadcast News Train (HUB4) + Speech LDC98S71 + Transcripts LDC98T28 + +1995 English Broadcast News (CSR-IV HUB4) + LDC96S31 + +North American News Text Corpus + LDC95T21 + +North American News Text Supplement Corpus + LDC98T30 + +1996 CSR HUB4 Language Model + LDC98T31 + +1996 English Broadcast News Dev and Eval (HUB4) + LDC97S66 + +1997 HUB4 English Evaluation corpus + LDC2002S11 + +1998 HUB4 Broadcast News Evaluation English Test Material + LDC2000S86 + +1999 HUB4 Broadcast News Evaluation English Test Material + LDC2000S88 diff --git a/egs/hub4_english/s5/RESULTS b/egs/hub4_english/s5/RESULTS new file mode 100644 index 00000000000..c6c719f51fb --- /dev/null +++ b/egs/hub4_english/s5/RESULTS @@ -0,0 +1,9 @@ +for x in exp/*/decode*; do grep Sum $x/score*/*.ctm.*sys | utils/best_wer.sh ; done | sort -k2,2n +exit 0 + +%WER 17.8 | 728 32834 | 84.1 11.8 4.1 1.9 17.8 82.8 | exp/tri4/decode_nosp_eval97.pem_rescore/score_13_0.5/eval97.pem.ctm.filt.sys +%WER 19.0 | 728 32834 | 83.0 12.7 4.3 2.0 19.0 84.2 | exp/tri4/decode_nosp_eval97.pem/score_13_0.0/eval97.pem.ctm.filt.sys +%WER 19.4 | 728 32834 | 82.7 13.1 4.2 2.1 19.4 83.8 | exp/tri3/decode_nosp_eval97.pem_rescore/score_13_0.0/eval97.pem.ctm.filt.sys +%WER 20.5 | 728 32834 | 81.7 13.9 4.4 2.3 20.5 85.0 | exp/tri3/decode_nosp_eval97.pem/score_13_0.0/eval97.pem.ctm.filt.sys +%WER 23.7 | 728 32834 | 79.0 16.0 5.0 2.7 23.7 85.3 | exp/tri4/decode_nosp_eval97.pem.si/score_12_0.0/eval97.pem.ctm.filt.sys +%WER 25.7 | 728 32834 | 77.1 17.6 5.3 2.8 25.7 85.9 | exp/tri3/decode_nosp_eval97.pem.si/score_13_0.0/eval97.pem.ctm.filt.sys diff --git a/egs/hub4_english/s5/cmd.sh 
b/egs/hub4_english/s5/cmd.sh new file mode 100755 index 00000000000..43f7b21771a --- /dev/null +++ b/egs/hub4_english/s5/cmd.sh @@ -0,0 +1,14 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 1G" +export decode_cmd="queue.pl --mem 4G" diff --git a/egs/hub4_english/s5/conf/mfcc.conf b/egs/hub4_english/s5/conf/mfcc.conf new file mode 100644 index 00000000000..7361509099f --- /dev/null +++ b/egs/hub4_english/s5/conf/mfcc.conf @@ -0,0 +1 @@ +--use-energy=false # only non-default option. 
diff --git a/egs/hub4_english/s5/conf/vad.conf b/egs/hub4_english/s5/conf/vad.conf new file mode 100644 index 00000000000..a0ca2449b10 --- /dev/null +++ b/egs/hub4_english/s5/conf/vad.conf @@ -0,0 +1,2 @@ +--vad-energy-threshold=5.5 +--vad-energy-mean-scale=0.5 diff --git a/egs/hub4_english/s5/local/data_prep/csr4_utils.patch b/egs/hub4_english/s5/local/data_prep/csr4_utils.patch new file mode 100644 index 00000000000..1b7dcb4ec1b --- /dev/null +++ b/egs/hub4_english/s5/local/data_prep/csr4_utils.patch @@ -0,0 +1,793 @@ +diff -Naur tools/csr4_utils/abbrproc.perl local/data_prep/csr_hub4_utils/abbrproc.perl +--- tools/csr4_utils/abbrproc.perl 1996-08-27 15:25:15.000000000 -0400 ++++ local/data_prep/csr_hub4_utils/abbrproc.perl 2017-11-03 13:22:09.466213159 -0400 +@@ -1,4 +1,4 @@ +-#!/usr/local/bin/perl ++#!/usr/bin/perl + # $Id: abbrproc.perl,v 1.3 1996/08/21 20:05:09 robertm Rel $ + ############################################################################### + # This software is being provided to you, the LICENSEE, by the Massachusetts # +diff -Naur tools/csr4_utils/artfilter.perl local/data_prep/csr_hub4_utils/artfilter.perl +--- tools/csr4_utils/artfilter.perl 1996-01-04 11:31:57.000000000 -0500 ++++ local/data_prep/csr_hub4_utils/artfilter.perl 2017-11-03 13:22:09.470213159 -0400 +@@ -1,4 +1,4 @@ +-#!/usr/local/bin/perl ++#!/usr/bin/perl + + # artfilter.perl + +diff -Naur tools/csr4_utils/bugproc.perl local/data_prep/csr_hub4_utils/bugproc.perl +--- tools/csr4_utils/bugproc.perl 1996-08-27 15:25:15.000000000 -0400 ++++ local/data_prep/csr_hub4_utils/bugproc.perl 2017-11-03 13:22:09.474213159 -0400 +@@ -1,4 +1,4 @@ +-#!/usr/local/bin/perl ++#!/usr/bin/perl + # $Id: bugproc.perl,v 1.4 1996/08/21 23:55:40 robertm Rel $ + ############################################################################### + # This software is being provided to you, the LICENSEE, by the Massachusetts # +diff -Naur tools/csr4_utils/do-lm local/data_prep/csr_hub4_utils/do-lm +--- 
tools/csr4_utils/do-lm 1996-08-27 15:25:15.000000000 -0400 ++++ local/data_prep/csr_hub4_utils/do-lm 2017-11-27 14:21:15.965400509 -0500 +@@ -22,19 +22,22 @@ + exit 1 + fi + +-PATH=$PATH:./bin ; export PATH ++dir=$1 ++shift + + for file in $* + do + BASENM=`basename $file` ++ name="${BASENM%.*}" ++ + echo "Running LM pipeline for |$BASENM|..." 1>&2 + set -x +- pare-sgml.perl $file | +- bugproc.perl | +- numhack.perl | +- numproc.perl | +- abbrproc.perl | +- puncproc.perl > lm/$BASENM ++ gunzip -c $file | pare-sgml.perl | \ ++ bugproc.perl | \ ++ numhack.perl | \ ++ numproc.perl -xtools/csr4_utils/num_excp | \ ++ abbrproc.perl tools/csr4_utils/abbrlist | \ ++ puncproc.perl -np | gzip -c > $dir/$name.txt.gz + set +x + echo "Done with $BASENM." + done +diff -Naur tools/csr4_utils/numhack.perl local/data_prep/csr_hub4_utils/numhack.perl +--- tools/csr4_utils/numhack.perl 1996-08-27 15:25:16.000000000 -0400 ++++ local/data_prep/csr_hub4_utils/numhack.perl 2017-11-03 13:22:09.482213158 -0400 +@@ -1,4 +1,4 @@ +-#!/usr/local/bin/perl ++#!/usr/bin/perl + + # $Id: numhack.perl,v 1.4 1996/08/23 05:12:27 robertm Rel $ + # preprocessor for numproc, potentially specialized for Broadcast News material +diff -Naur tools/csr4_utils/numproc.perl local/data_prep/csr_hub4_utils/numproc.perl +--- tools/csr4_utils/numproc.perl 1996-08-27 15:25:16.000000000 -0400 ++++ local/data_prep/csr_hub4_utils/numproc.perl 2017-11-08 16:59:50.497562934 -0500 +@@ -1,4 +1,5 @@ +-#!/usr/local/bin/perl ++#! 
/usr/bin/perl ++# + # $Id: numproc.perl,v 1.7 1996/08/23 05:04:11 robertm Rel $ + ############################################################################### + # This software is being provided to you, the LICENSEE, by the Massachusetts # +@@ -74,7 +75,7 @@ + { if($ARGV[$i] =~ /^-/) + { if($ARGV[$i] =~ /^-v/) {$vflg=1;} + elsif($ARGV[$i] =~ /^-x/) +- { $exfile=$ARGV[i]; ++ { $exfile=$ARGV[$i]; + $exfile =~ s/^-x//; + } + else {&perr2("illegal flag: $ARGV[$i]");} +@@ -237,7 +238,7 @@ + if(/\d/ && !/^<\/?[spa]/) # opt and protect sgml + { @input = split(/\s+/o); + @output=(); +- wloop: for($field=0;$field<=$#input;$field++) # $field is global ++ for($field=0;$field<=$#input;$field++) # $field is global + { if($field>0) {$last=$input[$field-1];} + else {$last='';} + if($field<$#input) {$next=$input[$field+1];} +@@ -248,27 +249,27 @@ + $_=$input[$field]; + + if(/<[\w\.\/]*>/o && !/

/o) # pass only +- {&perr("spurious SGML: $_");} # ++ {&perr("spurious SGML: $_"); next; } # + + if(/[0-9]/o && !/

$40 +@@ -362,32 +363,32 @@ + if($x =~ /\//) + { $x =~ s/^\D*//; + $x =~ s/\D*$//; +- &printfrac($x); ++ if (! &printfrac($x)) {return 0;} + &pusho("of a $unit"); + $x=""; + $plural=0; + } + + $x =~ s/^\D*([\d,]*)\D*.*$/$1/; # int part of string +- if($x ne "") {&printint($x);} # print int part (eg. dollars) ++ if($x ne "") {if (! &printint($x)) {return 0;} } # print int part (eg. dollars) + + if($next eq "and" && $next2 =~ /\d\/\d/ && next2 !~ /\/.*\//) + { if($unit && $x ne "") {&pusho("and");} # frac: eg 4 1/16 + $z=$next2; + $z =~ s/\D*$//; +- &printfrac($z); ++ if (! &printfrac($z)) {return 0;} + ($punct)=($next2 =~ /(\D*)$/); + $field+=2; + &pusho("${unit}s"); + +- if($back) {&perr("money: back and 1 1/3");} ++ if($back) {&perr("money: back and 1 1/3"); return 0;} + + if($punct) {&appendo($punct);} # punctuation from *illion +- return; ++ return 1; + } + + if($back eq "" && $next =~ /^(thousands?|[a-z]*illions?)(\W*)/i) +- { &printdecfrac($_); # multiplier ++ { if (! &printdecfrac($_)) {return 0;} # multiplier + &pusho($1); + $punct=$2; + $plural=1; ### if adj '', if noun 's' +@@ -395,7 +396,7 @@ + $frac=1; + } + elsif(/\.\d$/ || /\.\d\D/ || /\.\d{3}/ ) # .d or .ddd+ +- { &printdecfrac($_); ++ { if (! &printdecfrac($_)) {return 0;} + $plural=1; # can be either + $frac=1; + } +@@ -409,7 +410,7 @@ + { $unit=""; # fix "$1 dollar" wsj typo + $subunit_sing=""; + $subunit_pl=""; +- &printdecfrac($_); ++ if (! &printdecfrac($_)) {return 0;} + $frac=1; + } + +@@ -447,24 +448,26 @@ + { $y=$_; + $y =~ s/^[^\.]*\.([\d]*)\D?.*$/$1/; # get fractional part + if($unit && $x ne "") {&pusho("and");} +- &printint($y); ++ if (! 
&printint($y)) {return 0;} + if($sing || int($y)==1) {&pusho($subunit_sing);} + else {&pusho($subunit_pl);} + } + + if($back) # punctuation from this field +- { if($punct) {&perr("money: back and punct");} ++ { if($punct) {&perr("money: back and punct"); return 0;} + + if($back =~ /^\w/) {&pusho($back);} + else {&appendo($back);} + } + + if($punct) {&appendo($punct);} # punctuation from *illion ++ ++ return 1; + } + + sub printyear # &printyear(x) + { if($vflg) {print "printyear: $_[0]\n";} +- &printnum($_[0]); # for now ++ return &printnum($_[0]); # for now + } + + sub printtime # &printtime(x) +@@ -475,7 +478,7 @@ + local($front); + local($back); + +- if(/:{2,}/ || !/\d:\d/) {&perr("printtime: not a time");} ++ if(/:{2,}/ || !/\d:\d/) {&perr("printtime: not a time"); return 0;} + + @x=split(/:/,$_); + ($front)=($x[0] =~ /^(\D*)/); +@@ -487,20 +490,21 @@ + { &pusho($front); # generally punctuation + if($front !~ /\w$/) {$appendflg=1;} + } +- &printint($x[0]); ++ if (! &printint($x[0])) {return 0;} + if($x[1]==0) + { $_=$next; + if(!/^[aApP]\.?[nM]\.?$/) {&pusho("o'clock");} + } + elsif ($x[1]<10) + { &pusho("oh"); +- &printint($x[1]); ++ if (!&printint($x[1])) {return 0;} + } +- else {&printint($x[1]);} ++ else {if (! &printint($x[1])) {return 0;} } + if($back) + { if($back =~ /^\w/) {&pusho($back);} + else {&appendo($back);} # generally punctuation + } ++ return 1; + } + + sub printfrac +@@ -530,8 +534,8 @@ + } + + @z=split(/\//,$x); +- if($#z !=1) {&perr("printfrac: illegal fraction: $_[0]");} +- if($z[1] <= 1) {&perr("printfrac: den too small: $_[0]");} ++ if($#z !=1) {&perr("printfrac: illegal fraction: $_[0]"); return 0;} ++ if($z[1] <= 1) {&perr("printfrac: den too small: $_[0]"); return 0;} + + if($front) + { &pusho($front); +@@ -541,22 +545,22 @@ + + if($sign) {&pusho($sign);} + +- &printint($z[0]); #numerator ++ if (! 
&printint($z[0])) { return 0;} #numerator + if($z[1] <= $#den) # small den from table (<20) + { &pusho($den[$z[1]]); +- if($z[0]!=1) {&pluralize;} ++ if($z[0]!=1) {if (! &pluralize) {return 0;} } + } + else #large den + { $ones=int($z[1]%100); + $hun=100*int($z[1]/100); +- if($hun>0) {&printint($hun);} ++ if($hun>0) {if (!&printint($hun)) {return 0;} } + if($ones==0) + { &appendo("th"); +- if($z[0]!=1) {&pluralize;} ++ if($z[0]!=1) {if (! &pluralize) {return 0;} } + } + elsif($ones<=$#largeden) # <20 + { &pusho($largeden[$ones]); +- if($z[0]!=1) {&pluralize;}; ++ if($z[0]!=1) {if (!&pluralize) {return 0;} } + } + else + { $x=int($ones%10); +@@ -569,11 +573,11 @@ + } + if($x==0) + { &pusho("th"); +- if($z[0]!=1) {&pluralize;} ++ if($z[0]!=1) {if (! &pluralize) {return 0;} } + } + else + { &pusho($largeden[$x]); +- if($z[0]!=1) {&pluralize;} ++ if($z[0]!=1) {if (! &pluralize) {return 0;} } + } + } + } +@@ -585,6 +589,8 @@ + &appendo($back); + } + } ++ ++ return 1; + } + + sub printnum # printnum(n) +@@ -624,7 +630,7 @@ + $x =~ s/\D*$//; # strip back: final . is punct + } + +- if($x =~ /[^\d\.,]/) {&perr("printnum: $_[0] is not a number");} ++ if($x =~ /[^\d\.,]/) {&perr("printnum: $_[0] is not a number"); return 0;} + + if($x!=0 && $x =~ /^0/ && $x =~ /^\d*$/) # "oh" numbers + { if($front) +@@ -641,7 +647,7 @@ + + if($back) + { if($back =~ /^s$/ || $back =~ /^s\W/) # back = s +- { &pluralize; # eg. 1960s ++ { if (! &pluralize) {return 0;} # eg. 1960s + $back =~ s/^s//; + } + if($back) +@@ -649,7 +655,7 @@ + else {&appendo($back);} # back = punct or "'s" + } + } +- return; ++ return 1; + } + + if($x =~ /^\d/) # get integer part +@@ -675,48 +681,48 @@ + if($sign) { &pusho($sign); } + + $ones=int($intpart%100); +- if($comma) {&printint($intpart);} ++ if($comma) {if (! 
&printint($intpart)) {return 0;} } + elsif(($intpart>=1900 || $intpart>=1100 && $ones==0) + && $intpart<2000 && !$fracpart) #4 digit -> 2+2 + { $hun=int($intpart/100); +- &printint($hun); +- if($ones>=10) {&printint($ones);} ++ if (! &printint($hun)) {return 0;} ++ if($ones>=10) {if (! &printint($ones)) {return 0;} } + elsif($ones>0) + { &pusho("oh"); +- &printint($ones); ++ if (! &printint($ones)) {return 0;} + } + else {&pusho("hundred");} + } + else +- { &printint($intpart); ++ { if (! &printint($intpart)) {return 0;} + $y=$last; + $y =~ s/^\W*//; # thize dates: May 25th + if(length($intpart)<=2 && $months{$y}) +- { &thize(""); ++ { if (! &thize("")) {return 0;} + $back =~ s/[a-z]//g; + } + } +- if($fracpart) {&printdecfrac($fracpart);} ++ if($fracpart) {if (! &printdecfrac($fracpart)) {return 0;} } + + if($back) + { if($back =~ /^s$/ || $back =~ /^s\W/) # back = s +- { &pluralize; # eg. 1960s ++ { if (! &pluralize) {return 0;} # eg. 1960s + $back =~ s/^s//; + } + if($back =~ /^st$/ || $back =~ /^st\W/) # back= st +- { &thize("st"); # eg. 1st ++ { if (! &thize("st")) {return 0;} # eg. 1st + $back =~ s/^st//; + } + if($back =~ /^nd$/ || $back =~ /^nd\W/) # back= nd +- { &thize("nd"); # eg. 2nd ++ { if (! &thize("nd")) {return 0;} # eg. 2nd + $back =~ s/^nd//; + } + if($back =~ /^rd$/ || $back =~ /^rd\W/) # back= rd +- { &thize("rd"); # eg. 3rd ++ { if (! &thize("rd")) {return 0;} # eg. 3rd + $back =~ s/^rd//; + } + if($back =~ /^th$/ || $back =~ /^th\W/) # back= th +- { &thize("th"); # eg. 4th ++ { if (! &thize("th")) {return 0;} # eg. 
4th + $back =~ s/^th//; + } + if($back) +@@ -724,6 +730,7 @@ + else {&appendo($back);} # back = punct or "'s" + } + } ++ return 1; + } + + sub printdate # printdate(n): x/x/x format +@@ -741,7 +748,7 @@ + $back=$1; + + if($x !~ /^\d{1,2}\/\d{1,2}\/(19)?\d{2}$/) +- {&perr("printdate: $_[0] is not a date");} ++ {&perr("printdate: $_[0] is not a date"); return 0;} + + @y=split(/\//,$x); + $y[2] =~ s/^19(\d{2})$/$1/; +@@ -752,20 +759,21 @@ + $appendflg=1; + } + +- &printint($y[0]); ++ if (! &printint($y[0])) {return 0;} + &appendo("/"); + + $appendflg=1; +- &printint($y[1]); ++ if (! &printint($y[1])) {return 0;} + &appendo("/"); + + $appendflg=1; +- &printint($y[2]); ++ if (! &printint($y[2])) {return 0;} + + if($back) + { if($back =~ /^[a-zA-Z]/) {&appendo("-");} + &appendo($back); + } ++ return 1; + } + + sub printserno # printserno(n): eg. B1, 3b2, 10W-40 +@@ -815,12 +823,12 @@ + } # (should expand here unless in dictionary) + $x =~ s/^(\d*)//; # strip off dig + $y=$1; +- if($y ne "") { &printdigstr($y); } ++ if($y ne "") { if (! &printdigstr($y)) {return 0;} } + } + + if($back =~ /^s\b/) # back = s + { # eg. 2C60s +- &pluralize; ++ if (! &pluralize) {return 0;} + $back =~ s/^s//; + } + if($back) +@@ -828,6 +836,7 @@ + else {&appendo($back);} + } + $appendflg=0; ++ return 1; + } + + sub printdigstr # printdigstr(x) +@@ -841,14 +850,13 @@ + if($x =~ /^0/) # leading zero + { while($x ne "") + { $x =~ s/^(.)//; +- if($1 !~ /\d/) {&perr("printdigstr: non-digit");} ++ if($1 !~ /\d/) {&perr("printdigstr: non-digit"); return 0;} + &pusho("$ones_z[$1]"); + } +- return; ++ return 1; + } + if($x =~ /^\d0*$/) # d, d0, d00, d000, etc +- { &printint($x); +- return; ++ { return &printint($x); + } + + $_=$x; +@@ -857,30 +865,29 @@ + for($k=0;$y[$k]==0;$k++) {} # k= nr following 0s + + if($j==2) # 2 dig +- { &printint($x); +- return; ++ { return &printint($x); + } + if($j==3) +- { &printint($y[2]); ++ { if (! 
&printint($y[2])) {return 0;} + if($y[1]==0) {&pusho("oh");} +- &printint("$y[1]$y[0]"); +- return; ++ return &printint("$y[1]$y[0]"); + } + if($j==5 && $k<=2) +- { &printint("$y[4]"); ++ { if (! &printint("$y[4]")) {return 0;} + $j=4; + } + if($j==4) +- { &printint("$y[3]$y[2]"); ++ { if (! &printint("$y[3]$y[2]")) {return 0;} + if($k==2) {&pusho("hundred");} + else + { if($y[1]==0) {&pusho("oh");} +- &printint("$y[1]$y[0]"); ++ return &printint("$y[1]$y[0]"); + } +- return; ++ return 1; + } + # >5 dig: just sequential dig + for($j--;$j>=0;$j--) {&pusho("$ones_oh[$y[$j]]");} ++ return 1; + } + + sub printftin # printftin(n): eg. 6\'-4\" +@@ -905,19 +912,19 @@ + + $x =~ s/^([\d\.]*)//; # strip off dig & . + $y=$1; +- if(!$y) {&perr("printftin: bad feet");} +- &printnum($y); ++ if(!$y) {&perr("printftin: bad feet"); return 0;} ++ if (! &printnum($y)) {return 0;} + if($y==1) {&appendo("-foot");} + else {&appendo("-feet");} + + $x =~ s/^\'//; # strip off \' + $x =~ s/^-//; # strip off - +- if(!$x) {&perr("printftin: bad intermed");} ++ if(!$x) {&perr("printftin: bad intermed"); return 0;} + + $x =~ s/^([\d\.]*)//; # strip off dig & . + $y=$1; +- if(!$y) {&perr("printftin: bad inches");} +- &printnum($y); ++ if(!$y) {&perr("printftin: bad inches"); return 0;} ++ if (! 
&printnum($y)) {return 0;} + if($y==1) {&appendo("-inch");} + else {&appendo("-inches");} + +@@ -925,6 +932,7 @@ + { if($back !~ /^[a-zA-Z]/) {&appendo($back);} + else {&pusho($back);} + } ++ return 1; + } + + sub printint # printint(x) +@@ -968,13 +976,14 @@ + } + if(int($j/3)>0) + { if(int($j/3) > $#mult) +- { &perr("printint: too big"); } ++ { &perr("printint: too big"); return 0;} + &pusho($mult[int($j/3)]); + } + $commanextflg=1; + } + } + $commanextflg=0; ++ return 1; + } + + sub printdecfrac +@@ -989,6 +998,8 @@ + if($leadingzeroflg) + {for($j=0;$j<=$#y;$j++) { &pusho($ones_z[$y[$j]]);}} + else {for($j=0;$j<=$#y;$j++) { &pusho($ones_oh[$y[$j]]);}} ++ ++ return 1; + } + + sub pluralize # pluralize(): pluralize last entry on output stack +@@ -1016,7 +1027,9 @@ + $x =~ s/y$/ies/; + &pusho($x); + } +- else {&perr("pluralize: unknown word: $_");} ++ else {&perr("pluralize: unknown word: $_"); return 0;} ++ ++ return 1; + } + + sub thize # thize(): add th to last entry on output stack +@@ -1028,50 +1041,51 @@ + $_=&geto; + if( /four$/ || /six$/ || /seven$/ || /ten$/ || + /eleven$/ || /een$/ || /hundred$/ || /thousand$/ || /illion$/ ) +- { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n");} # xth ++ { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n"); return 0;} # xth + &appendo("th"); + } + elsif( /one$/ ) # 1st +- { if($y && $y ne "st") {&perr("thize: mismatch: $_ $y\n");} ++ { if($y && $y ne "st") {&perr("thize: mismatch: $_ $y\n"); return 0;} + $x=&popo(); + $x =~ s/one$/first/; + &pusho($x); + } + elsif( /two$/ ) # 2nd +- { if($y && $y ne "nd") {&perr("thize: mismatch: $_ $y\n");} ++ { if($y && $y ne "nd") {&perr("thize: mismatch: $_ $y\n"); return 0;} + $x=&popo(); + $x =~ s/two$/second/; + &pusho($x); + } + elsif( /three$/ ) # 3rd +- { if($y && $y ne "rd") {&perr("thize: mismatch: $_ $y\n");} ++ { if($y && $y ne "rd") {&perr("thize: mismatch: $_ $y\n"); return 0;} + $x=&popo(); + $x =~ s/three$/third/; + &pusho($x); + } + elsif( /five$/ || 
/twelve$/ ) # 5th, 12th +- { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n");} ++ { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n"); return 0;} + $x=&popo(); + $x =~ s/ve$/fth/; + &pusho($x); + } + elsif(/eight$/) +- { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n");} # 8th ++ { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n"); return 0;} # 8th + &appendo("h"); + } + elsif( /nine$/ ) +- { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n");} ++ { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n"); return 0;} + $x=&popo(); + $x =~ s/nine$/ninth/; + &pusho($x); + } + elsif( /ty$/ ) +- { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n");} ++ { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n"); return 0;} + $x=&popo(); + $x =~ s/ty$/tieth/; + &pusho($x); + } +- else {&perr("thize: unknown word: $_");} ++ else {&perr("thize: unknown word: $_"); return 0;} ++ return 1; + } + + sub pusho # pusho($x): push output +@@ -1089,17 +1103,17 @@ + sub appendo # appendo($x): append to output + { $appendflg=0; + # if($#output < 0) {&pusho("");} +- if($#output < 0) {&perr("appendo: output empty");} ++ if($#output < 0) {&perr("appendo: output empty"); return 0;} + $output[$#output] .= @_[0]; + } + + sub popo # popo(): pop last output +-{ if($#output < 0) {&perr("popo: output empty");} ++{ if($#output < 0) {&perr("popo: output empty"); return 0;} + pop(@output); + } + + sub geto # geto(): get last output +-{ if($#output < 0) {&perr("geto: output empty");} ++{ if($#output < 0) {&perr("geto: output empty"); return 0;} + return $output[$#output]; + } + +@@ -1111,8 +1125,6 @@ + $appendflg=0; + $commanextflg=0; + &pusho($this); +- $field++; # graceful error recovery +- goto wloop; + } + + sub perr2 +diff -Naur tools/csr4_utils/pare-sgml.perl local/data_prep/csr_hub4_utils/pare-sgml.perl +--- tools/csr4_utils/pare-sgml.perl 1996-08-27 15:25:17.000000000 -0400 ++++ local/data_prep/csr_hub4_utils/pare-sgml.perl 2017-11-03 13:22:09.486213159 
-0400 +@@ -1,11 +1,14 @@ +-#!/usr/local/bin/perl ++#!/usr/bin/perl + + # $Id: pare-sgml.perl,v 1.3 1996/08/15 02:51:17 robertm Rel $ + # removes extraneous headers and other non-LM fields + # translates into LM-standard + # removes comments (enclosed in brackets) + +-$intext=0; ++use strict; ++use warnings; ++ ++my $intext=0; + while (<>) + { + if ($intext == 0) +diff -Naur tools/csr4_utils/process_filelist.sh local/data_prep/csr_hub4_utils/process_filelist.sh +--- tools/csr4_utils/process_filelist.sh 1969-12-31 19:00:00.000000000 -0500 ++++ local/data_prep/csr_hub4_utils/process_filelist.sh 2017-11-03 13:22:09.490213160 -0400 +@@ -0,0 +1,30 @@ ++#! /bin/bash ++ ++set -e ++set -o pipefail ++set -u ++set -x ++ ++if [ $# -ne 2 ]; then ++ echo "Usage: $0

" ++ exit 1 ++fi ++ ++filelist=$1 ++dir=$2 ++ ++export PATH=$PATH:tools/csr4_utils ++ ++for file in `cat $filelist`; do ++ BASENM=`basename $file` ++ name="${BASENM%.*}" ++ ++ echo "Running LM pipeline for |$BASENM|..." 1>&2 ++ gunzip -c $file | pare-sgml.perl | \ ++ bugproc.perl | \ ++ numhack.perl | \ ++ numproc.perl -xtools/csr4_utils/num_excp | \ ++ abbrproc.perl tools/csr4_utils/abbrlist | \ ++ puncproc.perl -np | gzip -c > $dir/$name.txt.gz ++ echo "Done with $BASENM." ++done +diff -Naur tools/csr4_utils/progsummary.perl local/data_prep/csr_hub4_utils/progsummary.perl +--- tools/csr4_utils/progsummary.perl 1996-07-12 09:26:35.000000000 -0400 ++++ local/data_prep/csr_hub4_utils/progsummary.perl 2017-11-03 13:22:09.494213160 -0400 +@@ -1,4 +1,4 @@ +-#!/usr/local/bin/perl ++#!/usr/bin/perl + + # Program: progsummary.perl + # Written by: dave graff +diff -Naur tools/csr4_utils/puncproc.perl local/data_prep/csr_hub4_utils/puncproc.perl +--- tools/csr4_utils/puncproc.perl 1996-08-27 15:25:17.000000000 -0400 ++++ local/data_prep/csr_hub4_utils/puncproc.perl 2017-11-03 13:22:09.494213160 -0400 +@@ -1,4 +1,4 @@ +-#!/usr/local/bin/perl ++#!/usr/bin/perl + + # $Id: puncproc.perl,v 1.2 1996/08/05 16:12:42 robertm Rel $ + ############################################################################### +@@ -59,7 +59,7 @@ + # forbidden symbols + if(//) {&perr(">");} # > +- if(/\$/) {&perr("$");} # $ ++ if(/\$/) {&perr("\$");} # $ + if(/_/) {&perr("_");} # _ + if(/\d/) {&perr("[0-9]");} # 0-9 + +diff -Naur tools/csr4_utils/tr-bn-char.fast.perl local/data_prep/csr_hub4_utils/tr-bn-char.fast.perl +--- tools/csr4_utils/tr-bn-char.fast.perl 1996-08-21 02:39:12.000000000 -0400 ++++ local/data_prep/csr_hub4_utils/tr-bn-char.fast.perl 2017-11-03 13:22:09.502213160 -0400 +@@ -1,4 +1,4 @@ +-#!/usr/local/bin/perl -pi.old-char ++#!/usr/bin/perl -pi.old-char + + # handles nonprinting characters in Broadcast News material, to the extent + # that they can be handled, and perhaps a bit 
beyond... +diff -Naur tools/csr4_utils/tr-bn-char.slow.perl local/data_prep/csr_hub4_utils/tr-bn-char.slow.perl +--- tools/csr4_utils/tr-bn-char.slow.perl 1996-08-21 01:30:18.000000000 -0400 ++++ local/data_prep/csr_hub4_utils/tr-bn-char.slow.perl 2017-11-03 13:22:09.502213160 -0400 +@@ -1,4 +1,4 @@ +-#!/usr/local/bin/perl -p ++#!/usr/bin/perl -p + + # handles nonprinting characters in Broadcast News material, to the extent + # that they can be handled, and perhaps a bit beyond... diff --git a/egs/hub4_english/s5/local/data_prep/format_1996_bn_data.pl b/egs/hub4_english/s5/local/data_prep/format_1996_bn_data.pl new file mode 100755 index 00000000000..84913e9a8b0 --- /dev/null +++ b/egs/hub4_english/s5/local/data_prep/format_1996_bn_data.pl @@ -0,0 +1,131 @@ +#!/usr/bin/env perl +#=============================================================================== +# Copyright (c) 2017 Johns Hopkins University +# (Author: Jan "Yenda" Trmal ) +# 2017 Vimal Manohar +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. +#=============================================================================== + +use strict; +use warnings; +use utf8; +use List::Util qw(max); + +my $audio_width=1; +my $speaker_width=1; +my $time_width=1; + +binmode(STDOUT, ":utf8"); +binmode(STDERR, ":utf8"); + +if (@ARGV != 3) { + print STDERR "$0: Error: Unsupported number of arguments: " . 
scalar @ARGV ."\n"; + print STDERR " Usage: $0 \n"; + print STDERR " where\n"; + print STDERR " is a file containing list of audio files\n"; + print STDERR " (single absolute path name per line)\n"; + print STDERR " is a file containing transcripts obtained\n"; + print STDERR " obtained by processing the official SGML format\n"; + print STDERR " transcripts. See parse_sgm.pl for further info.\n"; + print STDERR " target directory (should already exist)\n"; + print STDERR " See also: local/parse_sgm.pl\n"; + die; +} + +my $audio_files = $ARGV[0]; +my $transcripts = $ARGV[1]; +my $out = $ARGV[2]; + +my %AUDIO; +open(my $audio_f, "<", $audio_files) + or die "$0: Error: Could not open $audio_files: $!\n"; +while(my $line = <$audio_f>) { + chomp $line; + (my $basename = $line) =~ s/.*\/([^\/]+).sph/$1/g; + $basename =~ s/_$//g; + $AUDIO{$basename} = $line; +} +close($audio_f); + +my %TRANSCRIPT; +open(my $transcript_f, "<:encoding(utf-8)", $transcripts) + or die "$0: Error: Could not open $transcripts: $!\n"; +while(my $line = <$transcript_f>) { + chomp $line; + my @F = split / /, $line, 8; + push @{$TRANSCRIPT{$F[0]}}, \@F; + + my $f1 = $F[0]; + my $f2 = $F[1]; + my $speaker = $F[2]; + my $t1 = $F[5]; + my $t2 = $F[6]; + + $time_width = max $time_width, length($t1), length($t2); + $speaker_width = max $speaker_width, length($speaker); + $audio_width = max $audio_width, length($f1); +} +close($transcript_f); +#print Dumper(\%TRANSCRIPT); + +print STDERR $time_width . " " . $speaker_width . " " . $audio_width . "\n"; + +my $sph2pipe = `which sph2pipe` or do { + die "$0: Error: sph2pipe is not installed. 
Did you run make in the tools/ directory?\n"; +}; +chomp $sph2pipe; + +open(my $wav_file, ">", "$out/wav.scp") + or die "$0: Error: Cannot create file $out/wav.scp: $!\n"; +open(my $text_file, ">:encoding(utf-8)", "$out/text") + or die "$0: Error: Cannot create file $out/text: $!\n"; +open(my $segments_file, ">", "$out/segments") + or die "$0: Error: Cannot create file $out/segments: $!\n"; +open(my $spk_file, ">", "$out/utt2spk") + or die "$0: Error: Cannot create file $out/utt2spk: $!\n"; + +foreach my $file (sort keys %AUDIO) { + print "$0 Error: $file does not exist in transcripts!\n" + unless exists $TRANSCRIPT{$file}; + my $transcripts = $TRANSCRIPT{$file}; + + #my $file_fmt = sprintf("%0${audio_width}s", $file); + my $file_fmt = sprintf("%s", $file); + + print $wav_file "$file_fmt $sph2pipe -f wav $AUDIO{$file}|\n"; + + foreach my $utt (@{$transcripts}) { + my $start = $utt->[5] + 0.0; + my $end = $utt->[6] + 0.0; + if ($end - $start < 0.005) { # remove very short segments + next; + } + my $start_time = sprintf("%0${time_width}d", $utt->[5]*1000); + my $end_time = sprintf("%0${time_width}d", $utt->[6]*1000); + my $spk = sprintf("%0${speaker_width}s", $utt->[2]); + # my $spk = sprintf("%s", $utt->[2]); + my $spkid = "${file_fmt}_${spk}"; + my $uttid = "${file_fmt}_${spk}_${start_time}_${end_time}"; + + print $text_file "$uttid $utt->[7]\n"; + print $spk_file "$uttid $spkid\n"; + print $segments_file "$uttid $file_fmt $start $end\n"; + } +} + +close($wav_file); +close($text_file); +close($segments_file); +close($spk_file); diff --git a/egs/hub4_english/s5/local/data_prep/format_1997_bn_data.pl b/egs/hub4_english/s5/local/data_prep/format_1997_bn_data.pl new file mode 120000 index 00000000000..844c16bbe06 --- /dev/null +++ b/egs/hub4_english/s5/local/data_prep/format_1997_bn_data.pl @@ -0,0 +1 @@ +format_1996_bn_data.pl \ No newline at end of file diff --git a/egs/hub4_english/s5/local/data_prep/hub4_utils.py b/egs/hub4_english/s5/local/data_prep/hub4_utils.py 
new file mode 100644
index 00000000000..4ee9eab1c7e
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/hub4_utils.py
@@ -0,0 +1,209 @@
+# Copyright 2016 Vimal Manohar
+# Apache 2.0.
+
+"""This module contains utilities for preparing the HUB4 broadcast news
+evaluation corpora.
+"""
+
+import os
+import re
+import sys
+
+
+def _to_centiseconds(seconds):
+    """Convert a time in seconds to an integer number of 1/100 s units.
+
+    The value is rounded rather than truncated, because a time such as
+    0.29 is stored as 0.28999... and int(0.29 * 100) would give 28.
+    """
+    return int(round(seconds * 100))
+
+
+def parse_uem_line(reco, line):
+    """Parse a 'line' from the UEM for recording 'reco' and return it
+    converted to kaldi segments format.
+
+    The fields read from a UEM line are
+        <file-id> <channel> <start-time> <end-time>
+
+    We force the channel to be 1 and take the file-id to be the recording-id
+    when 'reco' is None.
+
+    Returns None for empty lines and for comment lines starting with ";;",
+    otherwise the string "<utt-id> <reco-id> <start-time> <end-time>".
+    Raises TypeError if the line has fewer than 4 fields or if the channel
+    is not "1", and ValueError if a time is not a number.
+    """
+    line = line.strip()
+    if len(line) == 0 or line[0:2] == ";;":
+        return None
+    parts = line.split()
+
+    # Report truncated lines the same way as other malformed lines instead
+    # of failing with an IndexError below.
+    if len(parts) < 4:
+        raise TypeError("Invalid line {0}".format(line))
+
+    if reco is None:
+        reco = parts[0]
+
+    # The channel ID is expected to be 1.
+    if parts[1] != "1":
+        raise TypeError("Invalid line {0}".format(line))
+
+    start_time = float(parts[2])
+    end_time = float(parts[3])
+
+    utt = "{0}-{1:06d}-{2:06d}".format(reco, _to_centiseconds(start_time),
+                                       _to_centiseconds(end_time))
+    return "{0} {1} {2} {3}".format(utt, reco, start_time, end_time)
+
+
+def parse_cmu_seg_line(line, prepend_reco_to_spk=False):
+    """Parse a 'line' from the CMU automatic segmentation of a recording.
+
+    The fields read from a line of the CMU segmentation are
+        <file-id> <channel> <speaker-id> <start-time> <end-time>
+    and any further fields are ignored, e.g.:
+    h4e_98_1 1 F0-0000 0.00 28.22 F0
+
+    We force the channel to be 1 and take the file-id to be the recording-id.
+
+    If 'prepend_reco_to_spk' is true, the speaker-id becomes
+    <reco-id>-<speaker-id> and the utterance-id starts with it; otherwise
+    the utterance-id is <speaker-id>-<reco-id>-<start>-<end>.
+
+    Returns None for empty lines and for comment lines starting with ";;",
+    otherwise a tuple (segment_line, utt2spk_line).
+    Raises TypeError if the line has fewer than 5 fields or if the channel
+    is not "1", and ValueError if a time is not a number.
+    """
+    line = line.strip()
+    if len(line) == 0 or line[0:2] == ";;":
+        return None
+    parts = line.split()
+
+    # Report truncated lines the same way as other malformed lines instead
+    # of failing with an IndexError below.
+    if len(parts) < 5:
+        raise TypeError("Invalid line {0}".format(line))
+
+    # Actually a file, but we are assuming a 1-1 mapping to recording and
+    # force channel to be 1.
+    reco = parts[0]
+
+    # The channel ID is expected to be 1.
+    if parts[1] != "1":
+        raise TypeError("Invalid line {0}".format(line))
+    spk = parts[2]
+
+    start_time = float(parts[3])
+    end_time = float(parts[4])
+
+    start_cs = _to_centiseconds(start_time)
+    end_cs = _to_centiseconds(end_time)
+
+    if prepend_reco_to_spk:
+        spk = reco + '-' + spk
+        utt = "{spk}-{0:06d}-{1:06d}".format(start_cs, end_cs, spk=spk)
+    else:
+        utt = "{spk}-{reco}-{0:06d}-{1:06d}".format(start_cs, end_cs,
+                                                    reco=reco, spk=spk)
+
+    segment_line = "{0} {1} {st:.3f} {end:.3f}".format(
+        utt, reco, st=start_time, end=end_time)
+    utt2spk_line = "{0} {1}".format(utt, spk)
+
+    return (segment_line, utt2spk_line)
+
+
+def normalize_csr_transcript(text, noise_word, spoken_noise_word):
+    """Normalize broadcast news transcript for audio.
+
+    'text' is upper-cased and stripped of the transcription markup handled
+    below; bracketed events are replaced by 'spoken_noise_word' or
+    'noise_word'. Returns the normalized words joined by single spaces.
+    """
+    text = text.upper()
+
+    # Remove long event markings
+    text = re.sub(r"\[[^]/]+/\]|\[/[^]/]+\]", "", text)
+    # Remove comments
+    text = re.sub(r"\{\{[^}]*\}\}", "", text)
+    # Replace alternative words with a single one (second alternative)
+    text = re.sub(r"\{[^}/]+/([^}/]+)[^}]*\}", r"\1", text)
+    # Remove partial word completions
+    text = re.sub(r"\([^)]+\)-|-\([^)]+\)", "-", text)
+    # Remove accent marks and diacritics
+    text = re.sub(r"\\[3-8]", "", text)
+
+    # Remove unclear speech markings
+    text = re.sub(r"\(\(([^)]*)\)\)", r"\1", text)
+    text = re.sub(r"#", "", text)   # Remove overlapped speech markings
+    # Remove invented word markings
+    text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text)
+    # Replace speaker-made noises with spoken_noise_word.
+    # NOTE(review): "[SIGN]" may be a typo for "[SIGH]" -- confirm against
+    # the transcripts before changing it.
+    text = re.sub(r"\[INHALING\]|\[COUGH\]|\[THROAT_CLEARING\]|\[SIGN\]",
+                  spoken_noise_word, text)
+    # Replace all remaining bracketed events with noise_word
+    text = re.sub(r"\[[^]]+\]", noise_word, text)
+    # Remove the + markers around a word, keeping the word
+    text = re.sub(r"\+([^+]+)\+", r"\1", text)
+
+    # Remove periods after letter.
+    text = re.sub(r"([A-Z])\.( |$)", r"\1 ", text)
+    # Replace \. with . (the period itself must be escaped in the pattern,
+    # otherwise a backslash followed by any character becomes ".")
+    text = re.sub(r"\\\.", r".", text)
+
+    text1 = []
+    for word in text.split():
+        if word == spoken_noise_word or word == noise_word:
+            text1.append(word)
+            continue
+
+        # Remove mispronunciation brackets
+        word = re.sub(r"^@(\w+)$", r"\1", word)
+        # Remove everything other than the standard ASCII symbols
+        word = re.sub("[^A-Za-z0-9.' _-]", "", word)
+        # Skip words that are empty after cleaning so that the output
+        # does not contain consecutive spaces.
+        if word:
+            text1.append(word)
+    return " ".join(text1)
+
+
+def remove_punctuations(text):
+    """Remove punctuations and some other processing for text sentence.
+
+    Returns the cleaned text; runs of spaces are collapsed to one, but
+    leading or trailing spaces are not stripped.
+    """
+    # Remove HTML new lines that are not end of sentences
+    text1 = re.sub("\n", " ", text)
+
+    # Remove some markers like double dash that are normally used to separate
+    # name titles in newspapers.
+    text1 = re.sub(r"(&[^;]+;|--)", " ", text1)
+
+    # Remove quotation marks
+    text1 = re.sub(r"''|``|\(|\)", " ", text1)
+
+    # Remove everything other than the standard ASCII symbols
+    text1 = re.sub("[^A-Za-z0-9.' _-]", "", text1)
+
+    # Replace multiple .'s with single and then remove isolated '.'
+    # The space that follows the dots is kept so that the next word is not
+    # glued onto the previous one ("a... b" must not become "a.b").
+    text1 = re.sub(r"\.[.]+ ", ". ", text1)
+    text1 = re.sub(r" \. ", " ", text1)
+
+    # Remove isolated '-'
+    text1 = re.sub(r" - ", " ", text1)
+
+    # Replace multiple spaces with single.
+    text1 = re.sub(r"[ ]+", " ", text1)
+
+    return text1
diff --git a/egs/hub4_english/s5/local/data_prep/normalize_bn96_transcripts.pl b/egs/hub4_english/s5/local/data_prep/normalize_bn96_transcripts.pl
new file mode 100755
index 00000000000..3db0e1c71c3
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/normalize_bn96_transcripts.pl
@@ -0,0 +1,38 @@
+#!/usr/bin/env perl
+
+# Copyright 2017 Vimal Manohar
+# Apache 2.0
+
+# Normalizes transcripts read from the standard input, one
+# "utterance-id transcript" pair per line, and writes them to the standard
+# output with the annotation markup removed or replaced by noise words.
+
+use strict;
+use warnings;
+
+@ARGV == 2 || die "usage: normalize_bn96_transcripts.pl noise_word spoken_noise_word < transcript > transcript2";
+my $noise_word = shift @ARGV;
+my $spoken_noise_word = shift @ARGV;
+
+while (my $line = <STDIN>) {
+  $line =~ m:^(\S+) (.+): || die "bad line $line";
+  my $utt = $1;
+  my $trans = $2;
+  print "$utt";
+
+  $trans =~ tr:a-z:A-Z:;
+  $trans =~ s:\(\(([^)]*)\)\):$1 :g; # Remove unclear speech markings
+  $trans =~ s:#: :g;                 # Remove overlapped speech markings
+  $trans =~ s:\*\*([^*]+)\*\*:$1 :g; # Remove invented word markings
+  $trans =~ s:\[[^]]+\]:$noise_word :g;        # [...] events become the noise word
+  $trans =~ s:\{[^}]+\}:$spoken_noise_word :g; # {...} events become the spoken noise word
+  # Remove mispronunciation brackets.  The pattern is applied to every
+  # +...+ span; anchoring it to the whole transcript would only work when
+  # the transcript consists of a single bracketed span.
+  $trans =~ s:[+]([^+]+)[+]:$1:g;
+  foreach my $w (split (" ",$trans)) {
+    $w =~ s:^@(.*)$:$1:;    # Remove best guess marking for proper nouns
+    print " $w";
+  }
+  print "\n";
+}
diff --git a/egs/hub4_english/s5/local/data_prep/normalize_bn97_transcripts.pl b/egs/hub4_english/s5/local/data_prep/normalize_bn97_transcripts.pl
new file mode 100755
index 00000000000..b27f8da65f8
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/normalize_bn97_transcripts.pl
@@ -0,0 +1,48 @@
+#!/usr/bin/env perl
+
+# Copyright 2017 Vimal Manohar
+# Apache 2.0
+
+# Normalizes transcripts read from the standard input, one
+# "utterance-id transcript" pair per line, and writes them to the standard
+# output with the annotation markup and punctuation removed and spelled-out
+# abbreviations (_F_B_I) rewritten as letters followed by periods (F. B. I.).
+
+use strict;
+use warnings;
+
+@ARGV == 2 || die "usage: normalize_bn97_transcripts.pl noise_word spoken_noise_word < transcript > transcript2";
+my $noise_word = shift @ARGV;
+my $spoken_noise_word = shift @ARGV;
+
+while (my $line = <STDIN>) {
+  $line =~ m:^(\S+) (.+): || die "bad line $line";
+  my $utt = $1;
+  my $trans = $2;
+  print "$utt";
+
+  $trans =~ tr:a-z:A-Z:;
+  $trans =~ s:\(\(([^)]*)\)\):$1 :g; # Remove unclear speech markings
+  $trans =~ s:#: :g;                 # Remove overlapped speech markings
+  $trans =~ s:\*\*([^*]+)\*\*:$1 :g; # Remove invented word markings
+  $trans =~ s:\[[^]]+\]:$noise_word :g;        # [...] events become the noise word
+  $trans =~ s:\{[^}]+\}:$spoken_noise_word :g; # {...} events become the spoken noise word
+  # Remove mispronunciation brackets.  The pattern is applied to every
+  # +...+ span; anchoring it to the whole transcript would only work when
+  # the transcript consists of a single bracketed span.
+  $trans =~ s:[+]([^+]+)[+]:$1:g;
+  foreach my $w (split (" ",$trans)) {
+    # The noise words are printed as they are.
+    if ($w ne $noise_word && $w ne $spoken_noise_word) {
+      $w =~ s:[?.,!]+$::;  # Remove punctuations
+      $w =~ s:^@(.*)$:$1:;    # Remove best guess marking for proper nouns
+      $w =~ s:^[\^](.*)$:$1:;    # Remove capitalization marks
+      $w =~ s:_([A-Z])'S$:$1.'S :g;    # Normalize abbreviations from _f_b_i to f. b. i.
+      $w =~ s:_([A-Z]):$1. :g;    # Normalize abbreviations from _f_b_i to f. b. i.
+      $w =~ s:[ ]+$::;    # Remove trailing spaces
+    }
+
+    print " $w";
+  }
+  print "\n";
+}
diff --git a/egs/hub4_english/s5/local/data_prep/parse_sgm_1996_hub4_eng.pl b/egs/hub4_english/s5/local/data_prep/parse_sgm_1996_hub4_eng.pl
new file mode 100755
index 00000000000..37487296809
--- /dev/null
+++ b/egs/hub4_english/s5/local/data_prep/parse_sgm_1996_hub4_eng.pl
@@ -0,0 +1,229 @@
+#!/usr/bin/env perl
+#===============================================================================
+# Copyright (c) 2017 Johns Hopkins University (Author: Jan "Yenda" Trmal )
+# 2017 Vimal Manohar
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================== + +use strict; +use warnings; +use utf8; + +require HTML::Parser or die "This script needs HTML::Parser from CPAN"; +HTML::Parser->import(); + +binmode(STDOUT, ":utf8"); + +sub trim { my $s = shift; $s =~ s/^\s+|\s+$//g; return $s }; + +sub parse_sgml_tag { + my $tag = shift(@_); + my %ret; + + if ($tag !~ /=/) { + return %ret; + } + + $tag =~ s/<[a-zA-Z]+ //; + $tag =~ s/> *$//; + #print $tag . "\n"; + + my @key_value_pairs = split / *,? +/, $tag; + for my $entry(@key_value_pairs) { + (my $key, my $value) = split '=', $entry, 2; + $ret{$key}=$value; + } + return %ret; +} + +if (@ARGV != 1) { + print STDERR "$0: This script needs exactly one parameter (list of SGML files)\n"; + print STDERR " Usage: $0 \n"; + print STDERR " where\n"; + print STDERR " is a file containing the official SGML format\n"; + print STDERR " transcripts. The files are parsed and the parsed representation\n"; + print STDERR " is dumped to STDOUT (one utterance + the additional data fields\n"; + print STDERR " per line (we dump all the fields, but not all fields are used\n"; + print STDERR " in the recipe).\n"; + die; +} +my $filelist=$ARGV[0]; + +my $p = HTML::Parser->new(); + +my @files=(); +open(F, '<', $filelist) or die "Could not open file $filelist: $?\n"; +while() { + chomp; + push @files, $_; +} + +foreach my $file (@files) { + my $reporter=""; + my $start = -1; + my $end = -1; + my $segment_start = -1; + my $segment_end = -1; + my $segment_speaker; + my $segment_fidelity = "XXX"; + my $segment_mode = "XXX"; + my $section_start = -1; + my $section_end = -1; + my $filename = ""; + my $seq = 0; + my @text = (); + my $time; + my @tagqueue; + + my $sgml_file = `basename $file`; + $sgml_file = trim $sgml_file; + $sgml_file =~ s/\.txt$//g; + $sgml_file =~ s/\.sgml$//g; + $sgml_file =~ s/_$//g; + + open(my $f, '<:encoding(iso-8859-1)', $file) or die "Could not open file $file: $?\n"; + + while(my $line = <$f>) 
{ + chomp $line; + $line = trim $line; + $line = lc $line; + next unless $line; + + if ($line =~ //$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + #print "ES: $line\n"; + ; + } elsif ($line =~ /
/$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + #print "ES: $line\n"; + ; + } elsif ($line =~ //$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + + #print join(" ", @text) . "\n" if @text > 0; + my $new_time = $segment_end; + if (@text > 0) { + print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time "; + print join(" ", @text) . "\n"; + } + @text = (); + $time = 0; + $segment_speaker = "XXX"; + $segment_start = "XXX"; + $segment_end = "XXX"; + $segment_fidelity = "XXX"; + $segment_mode = "XXX"; + #print "ET: $line\n"; + ; + } elsif ($line =~ / 0) { + print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time "; + print join(" ", @text) . "\n"; + } + @text = (); + $time = $new_time; + ; + } elsif ($line =~ /<\/sync/) { + #print $line; + ; + } elsif ($line =~ /) +# 2017 Vimal Manohar +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +use strict; +use warnings; +use utf8; + +require HTML::Parser or die "This script needs HTML::Parser from CPAN"; +HTML::Parser->import(); + +binmode(STDOUT, ":utf8"); + +sub trim { my $s = shift; $s =~ s/^\s+|\s+$//g; return $s }; + +sub parse_sgml_tag { + my $tag = shift(@_); + my %ret; + + if ($tag !~ /=/) { + return %ret; + } + + $tag =~ s/<[a-zA-Z]+ //; + $tag =~ s/> *$//; + #print $tag . "\n"; + + my @key_value_pairs = split / *,? +/, $tag; + for my $entry(@key_value_pairs) { + (my $key, my $value) = split '=', $entry, 2; + $ret{$key}=$value; + } + return %ret; +} + +if (@ARGV != 1) { + print STDERR "$0: This script needs exactly one parameter (list of SGML files)\n"; + print STDERR " Usage: $0 \n"; + print STDERR " where\n"; + print STDERR " is a file containing the official SGML format\n"; + print STDERR " transcripts. The files are parsed and the parsed representation\n"; + print STDERR " is dumped to STDOUT (one utterance + the additional data fields\n"; + print STDERR " per line (we dump all the fields, but not all fields are used\n"; + print STDERR " in the recipe).\n"; + die; +} +my $filelist=$ARGV[0]; + +my $p = HTML::Parser->new(); + +my @files=(); +open(F, '<', $filelist) or die "Could not open file $filelist: $?\n"; +while() { + chomp; + push @files, $_; +} + +foreach my $file (@files) { + my $reporter=""; + my $start = -1; + my $end = -1; + my $segment_start = -1; + my $segment_end = -1; + my $segment_speaker; + my $segment_fidelity = "XXX"; + my $segment_mode = "XXX"; + my $section_start = -1; + my $section_end = -1; + my $filename = ""; + my $seq = 0; + my @text = (); + my $time; + my @tagqueue; + + my $sgml_file = `basename $file`; + $sgml_file = trim $sgml_file; + $sgml_file =~ s/\.txt$//g; + $sgml_file =~ s/\.sgml$//g; + $sgml_file =~ s/_$//g; + + open(my $f, '<:encoding(iso-8859-1)', $file) or die "Could not open file $file: $?\n"; + + while(my $line = <$f>) 
{ + chomp $line; + $line = trim $line; + $line = lc $line; + next unless $line; + + if ($line =~ //$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + #print "ES: $line\n"; + ; + } elsif ($line =~ /
/$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + #print "ES: $line\n"; + ; + } elsif ($line =~ //$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + + #print join(" ", @text) . "\n" if @text > 0; + my $new_time = $segment_end; + if (@text > 0) { + print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time "; + print join(" ", @text) . "\n"; + } + @text = (); + $time = 0; + $segment_speaker = "XXX"; + $segment_start = "XXX"; + $segment_end = "XXX"; + $segment_fidelity = "XXX"; + $segment_mode = "XXX"; + #print "ET: $line\n"; + ; + } elsif ($line =~ /