From ebe5e8d264d5084096ddece7750744a854369d2b Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 26 Sep 2017 12:37:35 -0400 Subject: [PATCH 01/38] [egs] Bug fix in train_raw_dnn.py --- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 030be1ad8b8..38396f0b4e7 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -303,19 +303,19 @@ def train(args, run_opts): else: models_to_combine = None - if os.path.exists('{0}/valid_diagnostic.scp'.format(args.egs_dir)): - if os.path.exists('{0}/valid_diagnostic.egs'.format(args.egs_dir)): + if os.path.exists('{0}/valid_diagnostic.scp'.format(egs_dir)): + if os.path.exists('{0}/valid_diagnostic.egs'.format(egs_dir)): raise Exception('both {0}/valid_diagnostic.egs and ' '{0}/valid_diagnostic.scp exist.' 'This script expects only one of them to exist.' - ''.format(args.egs_dir)) + ''.format(egs_dir)) use_multitask_egs = True else: - if not os.path.exists('{0}/valid_diagnostic.egs'.format(args.egs_dir)): + if not os.path.exists('{0}/valid_diagnostic.egs'.format(egs_dir)): raise Exception('neither {0}/valid_diagnostic.egs nor ' '{0}/valid_diagnostic.scp exist.' 'This script expects one of them.' - ''.format(args.egs_dir)) + ''.format(egs_dir)) use_multitask_egs = False logger.info("Training will run for {0} epochs = " From fbedee05b61f479b9e58d2699cc85656ed234a0e Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Wed, 1 Nov 2017 14:59:54 -0400 Subject: [PATCH 02/38] steps/cleanup: Fixed corner case in resolve_ctm_edits_overlaps.py --- .../cleanup/internal/resolve_ctm_edits_overlaps.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/egs/wsj/s5/steps/cleanup/internal/resolve_ctm_edits_overlaps.py b/egs/wsj/s5/steps/cleanup/internal/resolve_ctm_edits_overlaps.py index be58ccac855..09cc90c4b60 100755 --- a/egs/wsj/s5/steps/cleanup/internal/resolve_ctm_edits_overlaps.py +++ b/egs/wsj/s5/steps/cleanup/internal/resolve_ctm_edits_overlaps.py @@ -155,8 +155,8 @@ def resolve_overlaps(ctm_edits, segments): Returns new lines of CTM for the recording. Arguments: - ctms - The CTM lines for a single recording. This is one value stored - in the dictionary read by read_ctm(). Assumes that the lines + ctm_edits - The CTM lines for a single recording. This is one value + stored in the dictionary read by read_ctm(). Assumes that the lines are sorted by the utterance-ids. The format is the following: [[(utteranceA, channelA, start_time1, duration1, hyp_word1, conf1), @@ -171,13 +171,12 @@ def resolve_overlaps(ctm_edits, segments): [... (utteranceZ, channelZ, start_timeN, durationN, hyp_wordN, confN)] ] + Expects this to be non-empty. segments - Dictionary containing the output of read_segments() { utterance_id: (recording_id, start_time, end_time) } """ total_ctm_edits = [] - if len(ctm_edits) == 0: - raise RuntimeError('CTMs for recording is empty. 
' - 'Something wrong with the input ctms') + assert len(ctm_edits) > 0 # First column of first line in CTM for first utterance next_utt = ctm_edits[0][0][0] @@ -306,6 +305,11 @@ def run(args): if (reco, utt) in ctm_edits: ctm_edits_for_reco.append(ctm_edits[(reco, utt)]) try: + if len(ctm_edits_for_reco) == 0: + logger.warn('CTMs for recording %s is empty.', + reco) + break # Go to the next recording + # Process CTMs in the recordings ctm_edits_for_reco = resolve_overlaps(ctm_edits_for_reco, segments) write_ctm_edits(ctm_edits_for_reco, args.ctm_edits_out) From f0627cf122112062f4841805528df414aa124a6a Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Fri, 6 Jan 2017 14:26:48 -0500 Subject: [PATCH 03/38] bn: Adding BN recipe --- egs/bn/s5/README | 6 + egs/bn/s5/cmd.sh | 14 + egs/bn/s5/conf/merge_vad_map.txt | 16 + egs/bn/s5/conf/mfcc.conf | 6 + egs/bn/s5/conf/vad.conf | 2 + .../local/data_prep/csr_hub4_utils/INVENTORY | 56 + .../s5/local/data_prep/csr_hub4_utils/README | 34 + .../local/data_prep/csr_hub4_utils/abbrlist | 2403 +++++++++++++++++ .../data_prep/csr_hub4_utils/abbrproc.perl | 465 ++++ .../data_prep/csr_hub4_utils/addressforms | 38 + .../data_prep/csr_hub4_utils/artfilter.perl | 83 + .../data_prep/csr_hub4_utils/bugproc.perl | 69 + .../s5/local/data_prep/csr_hub4_utils/do-lm | 43 + .../csr_hub4_utils/eval-material.ptrns | 4 + .../local/data_prep/csr_hub4_utils/num_excp | 528 ++++ .../data_prep/csr_hub4_utils/numhack.perl | 80 + .../data_prep/csr_hub4_utils/numproc.perl | 1134 ++++++++ .../data_prep/csr_hub4_utils/pare-sgml.perl | 36 + .../csr_hub4_utils/process_filelist.py | 164 ++ .../csr_hub4_utils/process_filelist.sh | 30 + .../data_prep/csr_hub4_utils/progsummary.perl | 44 + .../data_prep/csr_hub4_utils/puncproc.perl | 196 ++ .../data_prep/csr_hub4_utils/sent-init.vocab | 411 +++ .../local/data_prep/csr_hub4_utils/sentag.c | 674 +++++ .../csr_hub4_utils/tr-bn-char.fast.perl | 13 + .../csr_hub4_utils/tr-bn-char.slow.perl | 46 + egs/bn/s5/local/data_prep/do-lm-csr96 | 40 + .../data_prep/prepare_1996_csr_hub4_corpus.sh | 51 + .../prepare_1998_hub4_bn_eng_eval.sh | 87 + egs/bn/s5/local/data_prep/prepare_bn_data.py | 208 ++ .../data_prep/prepare_na_news_text_corpus.sh | 51 + .../local/data_prep/process_na_news_text.py | 91 + egs/bn/s5/local/dict | 1 + egs/bn/s5/local/format_data.sh | 28 + egs/bn/s5/local/format_lms.sh | 47 + egs/bn/s5/local/lm/merge_word_counts.py | 30 + egs/bn/s5/local/lm/text_normalization.py | 42 + egs/bn/s5/local/normalize_transcripts.pl | 47 + egs/bn/s5/local/prepare_dict.sh | 191 ++ egs/bn/s5/local/run_cleanup_segmentation.sh | 93 + egs/bn/s5/local/score.sh | 1 + egs/bn/s5/local/score_sclite.sh | 94 + egs/bn/s5/local/train_lm.sh | 149 + egs/bn/s5/path.sh | 6 + egs/bn/s5/run.sh | 90 + egs/bn/s5/steps | 1 + egs/bn/s5/utils | 1 + 47 files changed, 7944 insertions(+) create mode 100644 egs/bn/s5/README create mode 100755 egs/bn/s5/cmd.sh create mode 100644 egs/bn/s5/conf/merge_vad_map.txt create mode 100644 egs/bn/s5/conf/mfcc.conf create mode 100644 egs/bn/s5/conf/vad.conf create mode 100644 egs/bn/s5/local/data_prep/csr_hub4_utils/INVENTORY create mode 100644 egs/bn/s5/local/data_prep/csr_hub4_utils/README create mode 100644 egs/bn/s5/local/data_prep/csr_hub4_utils/abbrlist create mode 100755 egs/bn/s5/local/data_prep/csr_hub4_utils/abbrproc.perl create mode 100644 egs/bn/s5/local/data_prep/csr_hub4_utils/addressforms create mode 100755 egs/bn/s5/local/data_prep/csr_hub4_utils/artfilter.perl create mode 100755 
egs/bn/s5/local/data_prep/csr_hub4_utils/bugproc.perl create mode 100755 egs/bn/s5/local/data_prep/csr_hub4_utils/do-lm create mode 100644 egs/bn/s5/local/data_prep/csr_hub4_utils/eval-material.ptrns create mode 100644 egs/bn/s5/local/data_prep/csr_hub4_utils/num_excp create mode 100755 egs/bn/s5/local/data_prep/csr_hub4_utils/numhack.perl create mode 100755 egs/bn/s5/local/data_prep/csr_hub4_utils/numproc.perl create mode 100755 egs/bn/s5/local/data_prep/csr_hub4_utils/pare-sgml.perl create mode 100755 egs/bn/s5/local/data_prep/csr_hub4_utils/process_filelist.py create mode 100755 egs/bn/s5/local/data_prep/csr_hub4_utils/process_filelist.sh create mode 100755 egs/bn/s5/local/data_prep/csr_hub4_utils/progsummary.perl create mode 100755 egs/bn/s5/local/data_prep/csr_hub4_utils/puncproc.perl create mode 100644 egs/bn/s5/local/data_prep/csr_hub4_utils/sent-init.vocab create mode 100644 egs/bn/s5/local/data_prep/csr_hub4_utils/sentag.c create mode 100755 egs/bn/s5/local/data_prep/csr_hub4_utils/tr-bn-char.fast.perl create mode 100755 egs/bn/s5/local/data_prep/csr_hub4_utils/tr-bn-char.slow.perl create mode 100755 egs/bn/s5/local/data_prep/do-lm-csr96 create mode 100755 egs/bn/s5/local/data_prep/prepare_1996_csr_hub4_corpus.sh create mode 100755 egs/bn/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh create mode 100755 egs/bn/s5/local/data_prep/prepare_bn_data.py create mode 100755 egs/bn/s5/local/data_prep/prepare_na_news_text_corpus.sh create mode 100755 egs/bn/s5/local/data_prep/process_na_news_text.py create mode 120000 egs/bn/s5/local/dict create mode 100755 egs/bn/s5/local/format_data.sh create mode 100755 egs/bn/s5/local/format_lms.sh create mode 100755 egs/bn/s5/local/lm/merge_word_counts.py create mode 100644 egs/bn/s5/local/lm/text_normalization.py create mode 100755 egs/bn/s5/local/normalize_transcripts.pl create mode 100755 egs/bn/s5/local/prepare_dict.sh create mode 100755 egs/bn/s5/local/run_cleanup_segmentation.sh create mode 120000 egs/bn/s5/local/score.sh create mode 100755 egs/bn/s5/local/score_sclite.sh create mode 100755 egs/bn/s5/local/train_lm.sh create mode 100755 egs/bn/s5/path.sh create mode 100755 egs/bn/s5/run.sh create mode 120000 egs/bn/s5/steps create mode 120000 egs/bn/s5/utils diff --git a/egs/bn/s5/README b/egs/bn/s5/README new file mode 100644 index 00000000000..8a8ae65108d --- /dev/null +++ b/egs/bn/s5/README @@ -0,0 +1,6 @@ + The MUSAN corpus is required for system training. It is available at: + http://www.openslr.org/17/ + + The test requires Broadcast News data. The LDC Catalog numbers are: + Speech LDC97S44 + Transcripts LDC97T22 diff --git a/egs/bn/s5/cmd.sh b/egs/bn/s5/cmd.sh new file mode 100755 index 00000000000..43f7b21771a --- /dev/null +++ b/egs/bn/s5/cmd.sh @@ -0,0 +1,14 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. 
Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 1G" +export decode_cmd="queue.pl --mem 4G" diff --git a/egs/bn/s5/conf/merge_vad_map.txt b/egs/bn/s5/conf/merge_vad_map.txt new file mode 100644 index 00000000000..216dee78b65 --- /dev/null +++ b/egs/bn/s5/conf/merge_vad_map.txt @@ -0,0 +1,16 @@ +# This table defines the mapping used by the binary merge-vads to +# combine the output of compute-vad and compute-vad-from-frame-likes. +# The first column corresponds to VAD decisions from compute-vad +# and the second corresponds to VAD decisions from +# compute-vad-from-frame-likes. The labels "0" and "1" in the +# first column represent (approximately) silence and nonsilence +# respectively. The labels "0," "1," and "2" in the second column +# represent noise, speech, and music, respectively. The third +# column lists the resulting output labels: "0," "1," and "2" +# corresponding to silence/noise, speech, and music. +0 0 0 +1 0 0 +0 1 0 +1 1 1 +0 2 0 +1 2 2 diff --git a/egs/bn/s5/conf/mfcc.conf b/egs/bn/s5/conf/mfcc.conf new file mode 100644 index 00000000000..a4be40be454 --- /dev/null +++ b/egs/bn/s5/conf/mfcc.conf @@ -0,0 +1,6 @@ +--sample-frequency=16000 +--frame-length=25 # the default is 25 +--low-freq=20 # the default. +--high-freq=3700 # the default is zero meaning use the Nyquist (4k in this case). +--num-ceps=20 # higher than the default which is 12. +--snip-edges=false diff --git a/egs/bn/s5/conf/vad.conf b/egs/bn/s5/conf/vad.conf new file mode 100644 index 00000000000..a0ca2449b10 --- /dev/null +++ b/egs/bn/s5/conf/vad.conf @@ -0,0 +1,2 @@ +--vad-energy-threshold=5.5 +--vad-energy-mean-scale=0.5 diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/INVENTORY b/egs/bn/s5/local/data_prep/csr_hub4_utils/INVENTORY new file mode 100644 index 00000000000..73229812231 --- /dev/null +++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/INVENTORY @@ -0,0 +1,56 @@ +INVENTORY + This file, a short description of included tools. +README + Introduction to the tools. +abbrlist +abbrproc.perl + Part of LM conditioning pipeline. + Spells out abbreviations and such. + "abbrlist" is an auxiliary data file for abbrproc. +addressforms + auxiliary file used by "sentag.c" +artfilter.perl + Selects articles based on content of SGML tags. +bugproc.perl + Part of LM conditioning pipeline. + Corrects a few common typos and non-standard spellings. +do-lm + Bourne shell script that executes language modeling conditioning + pipeline. +eval-material.ptrns + Pattern file used to separate reserved "test" (evaluation) articles + from "train" articles (training material). Used with "artfilter" + program along the following lines: + foreach $file + artfilter.perl -t program -f eval-material.ptrns -v -r \ + $file.test $file > $file.train +num_excp +numhack.perl +numproc.perl + Part of LM conditioning pipeline. + Spells out numberical expressions. + "num_excp" is an auxiliary data file for numproc. + "numhack.perl" is a new module for phone numbers and zip codes. +pare-sgml.perl + Part of LM conditioning pipeline. + Removes extraneous SGML tagging and transcriber comments enclosed + in brackets. +progsummary.perl + extracts program information from sgml-ized PSM texts +puncproc.perl + Part of LM conditioning pipeline. + Verbalizes punctuation (or removes, with -np switch). +sent-init.vocab +sentag.c + Program used to tag sentences in "raw" version. 
+ Revised since last CSR_LM95 to handle over-long + sentences/paragraphs and to pass material lacking any obvious + end-of-sentence markers or alphabetic characters, since the + transcriptions are more likely to contain such text. Uses + auxiliary "sent-init.vocab" file. +tr-bn-char.fast.perl +tr-bn-char.slow.perl + Program used to translate 8-bit character encoding occasionally + found in the documents. The two versions should be identical in + output; the "slow" version is more readable while the "fast" + version is more efficient. diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/README b/egs/bn/s5/local/data_prep/csr_hub4_utils/README new file mode 100644 index 00000000000..fa73f3a4dc3 --- /dev/null +++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/README @@ -0,0 +1,34 @@ +SOFTWARE ACCOMPANYING CSR LM DATA +--------------------------------- + +The files in this directory fall into three categories: + +(1) C source code (*.c) + +(2) Perl source code (*.perl) + +(3) shell scripts and auxiliary data files + +In general, program summaries are provided within the source files, and are +often printed on stderr by the programs when the user enters some +unsuitable command line option (e.g. -h). The "do-lm" shell script shows +the components that were used in the LM-conditioning pipeline; other perl +programs were used for data summaries and correction of minor glitches. +See INVENTORY for more information. + +For further information on these programs, please contact Robert MacIntyre +or David Graff at the Linguistic Data Consortium: + robertm@ldc.upenn.edu, (215) 573-5491 + graff@ldc.upenn.edu, (215) 898-0887 + +While disclaimers have not been systematically placed in all source +code files, users are expected to understand that the following +applies to all source code files in this directory, unless otherwise +noted in particular files: + +This software is being provided by the Linguistic Data Consortium, and +the University of Pennsylvania, without any guarantee, warrantee or +implication about its correctness, usefulness or suitability to any +purpose. You may copy, modify and redistribute it, but you may not +hold the LDC or Univ. of Penn. responsible for any damages resulting +from its use. diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/abbrlist b/egs/bn/s5/local/data_prep/csr_hub4_utils/abbrlist new file mode 100644 index 00000000000..0c15bbd2eb5 --- /dev/null +++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/abbrlist @@ -0,0 +1,2403 @@ +############################################################################### +# This software is being provided to you, the LICENSEE, by the Massachusetts # +# Institute of Technology (M.I.T.) under the following license. By # +# obtaining, using and/or copying this software, you agree that you have # +# read, understood, and will comply with these terms and conditions: # +# # +# Permission to use, copy, modify and distribute, including the right to # +# grant others the right to distribute at any tier, this software and its # +# documentation for any purpose and without fee or royalty is hereby granted, # +# provided that you agree to comply with the following copyright notice and # +# statements, including the disclaimer, and that the same appear on ALL # +# copies of the software and documentation, including modifications that you # +# make for internal use or for distribution: # +# # +# Copyright 1991-4 by the Massachusetts Institute of Technology. All rights # +# reserved. # +# # +# THIS SOFTWARE IS PROVIDED "AS IS", AND M.I.T. 
MAKES NO REPRESENTATIONS OR # +# WARRANTIES, EXPRESS OR IMPLIED. By way of example, but not limitation, # +# M.I.T. MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS # +# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR # +# DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, # +# TRADEMARKS OR OTHER RIGHTS. # +# # +# The name of the Massachusetts Institute of Technology or M.I.T. may NOT be # +# used in advertising or publicity pertaining to distribution of the # +# software. Title to copyright in this software and any associated # +# documentation shall at all times remain with M.I.T., and USER agrees to # +# preserve same. # +############################################################################### + +# abbreviation list +# derived from unigram file 29 Aug 91 mods to 17 Sept 91 +# x.y. mapped to x. y. in program + +# true abbreviations (must end with .) +# if key includes lower case, an upper case version will be created +Adm. Admiral +Ala. Alabama +Alex. Alexander +Apr. April +Ariz. Arizona +Ark. Arkansas +AUG. AUGUST +Aug. August +Ave. Avenue +Bancorp. Bancorp +Bhd. B. H. D. +Blvd. Boulevard +Brig. Brigadeer +Bros. Brothers +Cal. Calorie +Ca. California +Calif. California +Capt. Captain +Cie. Company +Cmdr. Commander +Co. Company +co. Company +Col. Colonel +Colo. Colorado +Conn. Connecticut +Corp. Corporation +Cos. Companies +Cpl. Corporal +Dec. December +Del. Delaware +Dept. Department +Dr. Doctor +Drs. Doctors +Feb. February +Fla. Florida +Fr. Friar +Fri. Friday +Ft. Fort +Ga. Georgia +Gen. General +Gov. Governor +Ill. Illinois +Inc. Incorporated +Ind. Indiana +InfoCorp. InfoCorp +Infocorp. InfoCorp +Intercorp. Intercorp +Jan. January +Jr. Junior +Jul. July +Jun. June +Kan. Kansas +Ky. Kentucky +La. Louisiana +lb. pound +lbs. pounds +Lt. Lieutenant +Ltd. Limited +Ltda. Company +Maj. Major +Mar. March +Mass. Massachusetts +MCorp. M. Corporation +Md. Maryland +Me. Maine +# Some Italian company +Me.T.A. M. E. T. A. +Mfg. Manufacturing +Mich. Michigan +Minn. Minnesota +Miss. Mississippi +Mo. Missouri +Mt. Mountain +Mont. Montana +# meaning of mistress has changed + symmetry +#Mr. Mister +#Mrs. Mistress +#Ms. Miz +#Messrs. +# +Neb. Nebraska +Nev. Nevada +No. Number +Nos. Numbers +Nov. November +Oct. October +Okla. Oklahoma +Ont. Ontario +Op. Opus +Ore. Oregon +Pa. Pennsylvania +PacifiCorp. PacifiCorp +Penn. Pennsylvania +PHLCorp. P. H. L. Corporation +Ph.D. P. H. D. +PhD. P. H. D. +Prof. Professor +Prop. Proposition +Pte. Point +Pty. Party +Pvt. Private +Rep. Representative +Reps. Representatives +Rev. Reverend +Sen. Senator +Sens. Senators +Sept. September +Sgt. Sargent +S.p.A. Company +Sr. Senior +#St. Street or Saint Context dependent (in abbrevproc) +Ste. Saint +Tel. Telephone +Tenn. Tennessee +Tex. Texas +Va. Virginia +Vt. Vermont +W.Va. West Virginia +Wash. Washington +Wis. Wisconsin +Wyo. Wyoming +Yr. Year +etc. et-cetera +Etc. Et-cetera +ft. feet +inc. incorporated +mfg. manufacturing +vs. 
versus + +# left contexts for roman cardinal numerals +# case independent comparisons +*r Act +*r Advantage +*r amendment +*r angiotensin +*r Antrim +*r Appendix +*r Apple +*r Arrow +*r Article +*r Associates +*r Astros +*r Bank +*r Bowl +*r Bronco +*r Busch +*r CSPAN +*r Canada +*r Century +*r Class +*r Cleopatra +*r Concepts +*r Cop +*r dBase +*r database +*r Delta +*r Detente +*r Dundee +*r Esprit +*r Explorer +*r Express +*r Eyes +*r Factor +*r Ford +*r Freaks +*r Fund +*r Funding +*r Funds +*r Future +*r GOD +*r GSTAR +*r Gemini +*r Ghostbusters +*r Global +*r Group +*r Gulfstream +*r Hybrid +*r Intelsat +*r Investment +*r Investments +*r Iron +*r Jets +*r Journalism +*r Kong +# LaSalle nuclear plant +*r LaSalle +*r LaserWriter +*r Lighthouse +*r Linen +*r Mark +*r Mac +*r MacDraw +*r MacProject +*r Macintosh +*r Management +*r Mark +*r Metro +*r MicroVAX +*r Minuteman +*r Monopoly +*r Notes +*r numeral +*r OPEC +*r Officer +*r Overseas +*r Part +*r Partners +*r Pershing +*r Phantasm +*r Phase +*r Phobos +*r Pioneer +*r Pirate +*r Play +*r Plus +*r Point +*r Portable +*r Quick +*r Rambo +*r Ransom +*r Resorts +*r SALT +*r Screen +*r Series +*r Stage +*r Superman +*r System +*r TIAA +*r Titan +*r Title +*r Toxic +*r Trac +*r Trek +*r Trident +*r Trooper +*r Trust +*r Ultima +*r Vatican +*r Ventures +*r Volume +*r WW +*r War +*r Weapon +*r Wespac +*r Westar +*r Wrestlemania + +# Roman ordinals (I, V, and X not included: too many false hits) +II the second +III the third +IV the fourth +VI the sixth +VII the seventh +VIII the eighth +IX the ninth +XI the eleventh +XII the twelfth +XIII the thirteenth +XIV the fourteenth +XV the fifteenth +XVI the sixteenth +XVII the seventeenth +XVIII the eighteenth +XIX the nineteenth +XX the twentieth +XXI the twenty-first +XXII the twenty-second +XXIII the twenty-third +XXIV the twenty-fourth +XXV the twenty-fifth + +# acronyms (not ending in .) needing translation +# if key includes lower case, an upper case version will be created +# keys can include - / & . +AA Double A. +AAA Triple A. +AAI A. A. I. +AAP A. A. P. +AAR A. A. R. +AARP A. A. R. P. +AAS A. A. S. +AB A. B. +ABA A. B. A. +ABB A. B. B. +ABC A. B. C. +ABD A. B. D. +ABF A. B. F. +ABI A. B. I. +ABM A. B. M. +ABN A. B. N. +ABS A. B. S. +ABT A. B. T. +AC A. C. +ACA A. C. A. +ACC A. C. C. +ACCT A. C. C. T. +ACEC A. C. E. C. +ACF A. C. F. +ACI A. C. I. +ACLI A. C. L. I. +ACLU A. C. L. U. +ACM A. C. M. +ACO A. C. O. +ACP A. C. P. +ACSH A. C. S. H. +ACTV A. C. T. V. +ADB A. D. B. +ADC A. D. C. +ADI A. D. I. +ADIA A. D. I. A. +ADM A. D. M. +ADN A. D. N. +ADP A. D. P. +ADR A. D. R. +ADT A. D. T. +ADV A. D. V. +adv A. D. V. +AD&P A. D. & P. +AD/SAT AD / SAT +AE A. E. +AEA A. E. A. +AEC A. E. C. +AEG A. E. G. +AEI A. E. I. +AEL A. E. L. +AEP A. E. P. +AER A. E. R. +AES A. E. S. +AEU A. E. U. +AEW A. E. W. +AFA A. F. A. +AFC A. F. C. +AFCO A. F. C. O. +AFDC A. F. D. C. +AFG A. F. G. +AFGE A. F. G. E. +AFIS A. F. I. S. +AFL A. F. L. +AFP A. F. P. +AFSCME A. F. S. C. M. E. +AG A. G. +AGA A. G. A. +AGB A. G. B. +AGEF A. G. E. F. +AGF A. G. F. +AGI A. G. I. +AGIP A. G. I. P. +AGS A. G. S. +AGT A. G. T. +AHA A. H. A. +AHL A. H. L. +AI A. I. +AIBD A. I. B. D. +AIC A. I. C. +AICPA A. I. C. P. A. +AIFS A. I. F. S. +AIG A. I. G. +AIL A. I. L. +AIME A. I. M. E. +AIT A. I. T. +AIW A. I. W. +AIX A. I. X. +AK A. K. +AKA A. K. A. +ALC A. L. C. +ALQ A. L. Q. +ALR A. L. R. +AM A. M. +AMA A. M. A. +AMC A. M. C. +AMCA A. M. C. A. +AMCC A. M. C. C. +AMD A. M. D. +AME A. M. E. +AMF A. M. F. +AMG A. M. G. +AMI A. M. I. 
+AML A. M. L. +AMO A. M. O. +AMP A. M. P. +AMR A. M. R. +AMT A. M. T. +ANB A. N. B. +ANC A. N. C. +ANF A. N. F. +ANMC A. N. M. C. +ANR A. N. R. +ANWR A. N. W. R. +ANZ A. N. Z. +AO A. O. +AOC A. O. C. +AOI A. O. I. +AOK A. O. K. +AON A. O. N. +AP A. P. +A&P A. & P. +APA A. P. A. +APAC A. P. A. C. +API A. P. I. +APL A. P. L. +APMA A. P. M. A. +APN A. P. N. +APPWP A. P. P. W. P. +APR A. P. R. +APS A. P. S. +APSAC A. P. S. A. C. +APV A. P. V. +APW A. P. W. +ARA A. R. A. +ARB A. R. B. +ARD A. R. D. +ARX A. R. X. +ASA A. S. A. +ASB A. S. B. +ASC A. S. C. +ASEA A. S. E. A. +ASI A. S. I. +ASPCA A. S. P. C. A. +AST A. S. T. +AT A. T. +ATA A. T. A. +ATC A. T. C. +ATF A. T. F. +ATI A. T. I. +ATM A. T. M. +ATN A. T. N. +ATR A. T. R. +ATS A. T. S. +AT&T A. T. & T. +ATV A. T. V. +AUS A. U. S. +AV A. V. +AVAQ A. V. A. Q. +AVC A. V. C. +AVX A. V. X. +AWA A. W. A. +AWD A. W. D. +AWOL A. W. O. L. +AWSJ A. W. S. J. +AWT A. W. T. +AXA A. X. A. +AXP A. X. P. +AY A. Y. +AZL A. Z. L. +AZP A. Z. P. +AZT A. Z. T. +BA B. A. +Ba B. a. +BAA B. A. A. +Baa B. a. a. +BAC B. A. C. +BAII B. A. I. I. +B.A.IT B. A. IT +BASF B. A. S. F. +B.A.T B. A. T. +BB Double B. +BBA B. B. A. +BBB Triple B. +BBC B. B. C. +BBDO B. B. D. O. +BBN B. B. N. +BC B. C. +BCA B. C. A. +BCCI B. C. C. I. +BCE B. C. E. +BCEAO B. C. E. A. O. +BCG B. C. G. +BCI B. C. I. +BCM B. C. M. +BCOA B. C. O. A. +BCS B. C. S. +BCV B. C. V. +BCW B. C. W. +BDC B. D. C. +BDDP B. D. D. P. +BDM B. D. M. +BDO B. D. O. +BDR B. D. R. +BEC B. E. C. +BEI B. E. I. +BF B. F. +BFEA B. F. E. A. +BFS B. F. S. +BGH B. G. H. +BGS B. G. S. +BHC B. H. C. +Bhd B. H. D. +BHF B. H. F. +BHP B. H. P. +BHS B. H. S. +BHW B. H. W. +BI B. I. +BIA B. I. A. +BICC B. I. C. C. +BiiN B. i. i. N. +BIP B. I. P. +BIR B. I. R. +BIS B. I. S. +BIW B. I. W. +BJ B. J. +BJF B. J. F. +BK B. K. +BL B. L. +BLM B. L. M. +BLS B. L. S. +BM B. M. +BMA B. M. A. +BMC B. M. C. +BMI B. M. I. +BMP B. M. P. +BMW B. M. W. +BMY B. M. Y. +BN B. N. +BNL B. N. L. +BNP B. N. P. +BNS B. N. S. +BNY B. N. Y. +BOC B. O. C. +BOJ B. O. J. +BOT B. O. T. +BP B. P. +bpd B. P. D. +BPB B. P. B. +BPC B. P. C. +BPCA B. P. C. A. +BPCC B. P. C. C. +BPD B. P. D. +BPI B. P. I. +BR B. R. +BRE B. R. E. +BRNF B. R. N. F. +BRT B. R. T. +BRZ B. R. Z. +BS B. S. +BSB B. S. B. +BSD B. S. D. +BSE B. S. E. +BSI B. S. I. +BSN B. S. N. +BSO B. S. O. +BST B. S. T. +BT B. T. +BTL B. T. L. +BTR B. T. R. +BTU B. T. U. +BV B. V. +BVI B. V. I. +BVL B. V. L. +BW B. W. +BWA B. W. A. +BWAC B. W. A. C. +BZ B. Z. +BZW B. Z. W. +CA C. A. +Ca C. a. +CAA C. A. A. +Caa C. a. a. +CAAC C. A. A. C. +CAC C. A. C. +CACI C. A. C. I. +CAD C. A. D. +CAE C. A. E. +CAID C. A. I. D. +CAMI C. A. M. I. +CARU C. A. R. U. +CATV C. A. T. V. +CAV C. A. V. +CAW C. A. W. +CB C. B. +CBC C. B. C. +CBI C. B. I. +CBN C. B. N. +CBO C. B. O. +CBOE C. B. O. E. +CBOT C. B. O. T. +CBS C. B. S. +CBT C. B. T. +CBW C. B. W. +CCA C. C. A. +CCC C. C. C. +CCD C. C. D. +CCE C. C. E. +CCH C. C. H. +CCK C. C. K. +CCL C. C. L. +CCX C. C. X. +CD C. D. +CDA C. D. A. +CDC C. D. C. +CDF C. D. F. +CDI C. D. I. +CDL C. D. L. +CDS C. D. S. +CDT C. D. T. +CDU C. D. U. +CDW C. D. W. +CE C. E. +CEA C. E. A. +CED C. E. D. +CEE C. E. E. +CEI C. E. I. +CEL C. E. L. +CEO C. E. O. +CEP C. E. P. +CES C. E. S. +CF C. F. +CFA C. F. A. +CFC C. F. C. +CFM C. F. M. +CFO C. F. O. +CFP C. F. P. +CFS C. F. S. +CFTC C. F. T. C. +CFTR C. F. T. R. +CGB C. G. B. +CGCT C. G. C. T. +CGE C. G. E. +CGM C. G. M. +CGS C. G. S. +CGT C. G. T. +CH C. H. +CHC C. H. C. +CHG C. H. G. +CI C. I. +CIA C. I. A. +CIBC C. I. B. C. +CIC C. I. C. 
+CID C. I. D. +CIE C. I. E. +CIGS C. I. G. S. +CIM C. I. M. +CIO C. I. O. +CIP C. I. P. +CIR C. I. R. +CIS C. I. S. +CIT C. I. T. +CJ C. J. +CJI C. J. I. +CJM C. J. M. +CK C. K. +CL C. L. +CLC C. L. C. +CLS C. L. S. +CLU C. L. U. +CLX C. L. X. +CM C. M. +CMA C. M. A. +CMB C. M. B. +CMC C. M. C. +CME C. M. E. +CMF C. M. F. +CMI C. M. I. +CML C. M. L. +CMO C. M. O. +CMQ C. M. Q. +CMS C. M. S. +CMV C. M. V. +CMS C. M. X. +CN C. N. +CNA C. N. A. +CNB C. N. B. +CNBC C. N. B. C. +CNCL C. N. C. L. +CNCP C. N. C. P. +CNFR C. N. F. R. +CNG C. N. G. +CNN C. N. N. +CNOOC C. N. O. O. C. +CNW C. N. W. +Corp Corporation +CP C. P. +CPA C. P. A. +CPAC C. P. A. C. +CPB C. P. B. +CPC C. P. C. +CPE C. P. E. +CPI C. P. I. +CPL C. P. L. +CPM C. P. M. +CPP C. P. P. +CPR C. P. R. +CPSC C. P. S. C. +CPT C. P. T. +CQ C. Q. +CR C. R. +CRA C. R. A. +CRB C. R. B. +CRC C. R. C. +CRI C. R. I. +CRL C. R. L. +CRS C. R. S. +CRT C. R. T. +CRTC C. R. T. C. +CRX C. R. X. +CS C. S. +CSA C. S. A. +CSB C. S. B. +CSC C. S. C. +CSF C. S. F. +CSFB C. S. F. B. +CSI C. S. I. +CSIS C. S. I. S. +CSK C. S. K. +CSO C. S. O. +CSR C. S. R. +CSS C. S. S. +CST C. S. T. +CSU C. S. U. +CSV C. S. V. +CSX C. S. X. +CT C. T. +CTA C. T. A. +CTB C. T. B. +CTBS C. T. B. S. +CTC C. T. C. +CTG C. T. G. +CTI C. T. I. +CTK C. T. K. +CTM C. T. M. +CTS C. T. S. +CTV C. T. V. +CU C. U. +CUC C. U. C. +CVB C. V. B. +CVG C. V. G. +CVN C. V. N. +CVNY C. V. N. Y. +CVS C. V. S. +CW C. W. +CWA C. W. A. +CWB C. W. B. +CWT C. W. T. +CX C. X. +CXR C. X. R. +DAF D. A. F. +DAP D. A. P. +DAX D. A. X. +DB D. B. +DBA D. B. A. +DBI D. B. I. +DBL D. B. L. +DBS D. B. S. +DC D. C. +DCCC D. C. C. C. +DCI D. C. I. +DCNY D. C. N. Y. +DD D. D. +DDA D. D. A. +DDB D. D. B. +DDC D. D. C. +DDG D. D. G. +DDI D. D. I. +DDR D. D. R. +DDT D. D. T. +DEA D. E. A. +DEC D. E. C. +DES D. E. S. +DFA D. F. A. +DFC D. F. C. +DFMO D. F. M. O. +DFS D. F. S. +DG D. G. +DGA D. G. A. +DGPT D. G. P. T. +DH D. H. +DHB D. H. B. +DHL D. H. L. +DIA D. I. A. +DIW D. I. W. +DJ D. J. +DJIA D. J. I. A. +DJP D. J. P. +DJS D. J. S. +DKB D. K. B. +DKM D. K. M. +DL D. L. +DLC D. L. C. +DLJ D. L. J. +DM D. M. +DMA D. M. A. +DMB D. M. B. +DMC D. M. C. +DMD D. M. D. +DME D. M. E. +DMI D. M. I. +DMS D. M. S. +DMW D. M. W. +DMZ D. M. Z. +DN D. N. +DNA D. N. A. +DNC D. N. C. +DNX D. N. X. +DOC D. O. C. +DOD D. O. D. +DOE D. O. E. +DOS D. O. S. +DOT D. O. T. +DP D. P. +DPC D. P. C. +DPG D. P. G. +DPL D. P. L. +DPP D. P. P. +DPS D. P. S. +DPT D. P. T. +Dr Doctor +DRG D. R. G. +DRI D. R. I. +DS D. S. +DSA D. S. A. +DSC D. S. C. +DSL D. S. L. +DSLT D. S. L. T. +DSM D. S. M. +DSP D. S. P. +DST D. S. T. +DTC D. T. C. +DTH D. T. H. +DTI D. T. I. +DV D. V. +DVFA D. V. F. A. +DWG D. W. G. +DX D. X. +DYR D. Y. R. +EA E. A. +EAC E. A. C. +EAL E. A. L. +EAS E. A. S. +EB E. B. +EBDC E. B. D. C. +EBRD E. B. R. D. +EBS E. B. S. +EC E. C. +ECC E. C. C. +ECD E. C. D. +ECI E. C. I. +ECL E. C. L. +ECPA E. C. P. A. +ECU E. C. U. +EDA E. D. A. +EDB E. D. B. +EDC E. D. C. +EDI E. D. I. +EDM E. D. M. +EDP E. D. P. +EDS E. D. S. +EDT E. D. T. +EEC E. E. C. +EECO E. E. C. O. +EEI E. E. I. +EEOC E. E. O. C. +EEP E. E. P. +EES E. E. S. +EESP E. E. S. P. +EF E. F. +EFA E. F. A. +EFC E. F. C. +EG E. G. +EGA E. G. A. +EI E. I. +EIA E. I. A. +EIB E. I. B. +EIC E. I. C. +EIP E. I. P. +EITC E. I. T. C. +EIU E. I. U. +ELN E. L. N. +EMC E. M. C. +EMEA E. M. E. A. +EMI E. M. I. +EMS E. M. S. +EMT E. M. T. +ENI E. N. I. +ENSR E. N. S. R. +EP E. P. +EPA E. P. A. +EPLF E. P. L. F. +EPO E. M. O. +EPO E. P. O. +EPRI E. P. R. I. +ERC E. R. C. +ERG E. R. G. 
+ERIS E. R. I. S. +ERM E. R. M. +ERO E. R. O. +ERS E. R. S. +ES E. S. +ESA E. S. A. +ESB E. S. B. +ESI E. S. I. +ESL E. S. L. +ESOP E. S. O. P. +ESP E. S. P. +ESPN E. S. P. N. +ESS E. S. S. +EST E. S. T. +ET E. T. +ETA E. T. A. +ETBE E. T. B. E. +ETS E. T. S. +EU E. U. +EUA E. U. A. +EWE E. W. E. +EXL E. X. L. +EXP E. X. P. +EZ E. Z. +FA F. A. +FAA F. A. A. +FAC F. A. C. +FADA F. A. D. A. +FAI F. A. I. +FAO F. A. O. +FARC F. A. R. C. +FAS F. A. S. +FASB F. A. S. B. +FAZ F. A. Z. +FBI F. B. I. +FBS F. B. S. +FC F. C. +FCA F. C. A. +FCB F. C. B. +FCC F. C. C. +FCD F. C. D. +FCMI F. C. M. I. +FDA F. D. A. +FDC F. D. C. +FDIC F. D. I. C +FDIC F. D. I. C. +FDN F. D. N. +FDP F. D. P. +FDR F. D. R. +FEA F. E. A. +FEC F. E. C. +FEMA F. E. M. A. +FERC F. E. R. C. +FF F. F. +FFA F. F. A. +FFB F. F. B. +FFP F. F. P. +FGH F. G. H. +FGIC F. G. I. C. +FH F. H. +FHA F. H. A. +FHAA F. H. A. A. +FHFB F. H. F. B. +FHLB F. H. L. B. +FHLBB F. H. L. B. B. +FHP F. H. P. +FIA F. I. A. +FIAC F. I. A. C. +FICA F. I. C. A. +FICO F. I. C. O. +FIFA F. I. F. A. +FII F. I. I. +FIP F. I. P. +FK F. K. +FKB F. K. B. +FKI F. K. I. +FL F. L. +FLA F. L. A. +FLX F. L. X. +FM F. M. +FMC F. M. C. +FMHA F. M. H. A. +FmHA F. M. H. A. +FMI F. M. I. +FMLN F. M. L. N. +FMR F. M. R. +FMS F. M. S. +FN F. N. +FNN F. N. N. +FNS F. N. S. +FOMC F. O. M. C. +FP F. P. +FPA F. P. A. +FPC F. P. C. +FPCO F. P. C. O. +FPL F. P. L. +FR F. R. +FRA F. R. A. +FS F. S. +FSA F. S. A. +FSB F. S. B. +FSC F. S. C. +FSD F. S. D. +FSIA F. S. I. A. +FSLIC F. S. L. I. C. +FSLN F. S. L. N. +FSX F. S. X. +FT F. T. +FTC F. T. C. +FTS F. T. S. +FTSE F. T. S. E. +FX F. X. +FYI F. Y. I. +GA G. A. +GAAP G. A. A. P. +GAC G. A. C. +GAF G. A. F. +GAO G. A. O. +GASB G. A. S. B. +GATT G. A. T. T. +GATX G. A. T. X. +GB G. B. +GBL G. B. L. +GBM G. B. M. +GBS G. B. S. +GC G. C. +GCA G. C. A. +GCC G. C. C. +GCI G. C. I. +GDM G. D. M. +GDP G. D. P. +GDR G. D. R. +GE G. E. +GEC G. E. C. +GECC G. E. C. C. +GF G. F. +GFI G. F. I. +GFT G. F. T. +GGK G. G. K. +GHF G. H. F. +GHKM G. H. K. M. +GHR G. H. R. +GHS G. H. S. +GHRF G. H. R. F. +GI G. I. +GIA G. I. A. +GIC G. I. C. +GIS G. I. S. +GK G. K. +GKN G. K. N. +GL G. L. +GLCM G. L. C. M. +GLI G. L. I. +GM G. M. +GMA G. M. A. +GMAC G. M. A. C. +GMBH G. M. B. H. +GMC G. M. C. +GMF G. M. F. +GMHC G. M. H. C. +GMN G. M. N. +GMT G. M. T. +GMTV G. M. T. V. +GNB G. N. B. +GNI G. N. I. +GNMA G. N. M. A. +GNP G. N. P. +GOP G. O. P. +GP G. P. +GPA G. P. A. +GPD G. P. D. +GPG G. P. G. +GPO G. P. O. +GPS G. P. S. +GPT G. P. T. +GPU G. P. U. +GQ G. Q. +GR G. R. +GRE G. R. E. +GRI G. R. I. +GRU G. R. U. +GS G. S. +GSA G. S. A. +GSD G. S. D. +GSI G. S. I. +GSL G. S. L. +GSP G. S. P. +GSS G. S. S. +GST G. S. T. +GSX G. S. X. +GT G. T. +GTA G. T. A. +GTC G. T. C. +GTE G. T. E. +GTECH G. Tech +GTG G. T. G. +GTI G. T. I. +GTS G. T. S. +GV G. V. +GW G. W. +GWC G. W. C. +GXE G. X. E. +HBJ H. B. J. +HBM H. B. M. +HBO H. B. O. +HCA H. C. A. +HCC H. C. C. +HCI H. C. I. +HCFA H. C. F. A. +HCFC H. C. F. C. +HCS H. C. S. +HD H. D. +HDL H. D. L. +HDM H. D. M. +HDTV H. D. T. V. +HEI H. E. I. +HF H. F. +HFC H. F. C. +HG H. G. +HGTV H. G. T. V. +HH H. H. +HHB H. H. B. +HHS H. H. S. +HILB H. I. L. B. +HIV H. I. V. +HK H. K. +HKSAR H. K. S. A. R. +HL H. L. +HLM H. L. M. +HLX H. L. X. +HMA H. M. A. +HMDA H. M. D. A. +HMG H. M. G. +HMO H. M. O. +HMS H. M. S. +HMSS H. M. S. S. +HN H. N. +HNSX H. N. S. X. +HNV H. N. V. +HP H. P. +HPB H. P. B. +HQ H. Q. +HR H. R. +HRB H. R. B. +HRE H. R. E. +HRI H. R. I. +HRS H. R. S. +HSA H. S. A. +HSBC H. S. B. C. +HSH H. S. H. 
+HSST H. S. S. T. +HSV H. S. V. +HT H. T. +HTLV H. T. L. V. +HWC H. W. C. +HZN H. Z. N. +IADB I. A. D. B. +IAE I. A. E. +IAEA I. A. E. A. +IAEC I. A. E. C. +IAFP I. A. F. P. +IAM I. A. M. +IATA I. A. T. A. +IB I. B. +IBA I. B. A. +IBAA I. B. A. A. +IBC I. B. C. +IBCA I. B. C. A. +IBES I. B. E. S. +IBEW I. B. E. W. +IBH I. B. H. +IBI I. B. I. +IBJ I. B. J. +IBM I. B. M. +IBP I. B. P. +IC I. C. +ICA I. C. A. +ICAO I. C. A. O. +ICBM I. C. B. M. +ICC I. C. C. +ICCO I. C. C. O. +ICEE I. C. E. E. +ICF I. C. F. +ICG I. C. G. +ICH I. C. H. +ICI I. C. I. +ICL I. C. L. +ICM I. C. M. +ICN I. C. N. +ICO I. C. O. +ICRP I. C. R. P. +ICSL I. C. S. L. +ID I. D. +IDA I. D. A. +IDB I. D. B. +IDC I. D. C. +IDD I. D. D. +IDF I. D. F. +IDG I. D. G. +IDI I. D. I. +IDS I. D. S. +IEA I. E. A. +IEC I. E. C. +IEJW I. E. J. W. +IFA I. F. A. +IFAR I. F. A. R. +IFB I. F. B. +IFC I. F. C. +IFE I. F. E. +IFF I. F. F. +IFI I. F. I. +IFO I. F. O. +IFR I. F. R. +IFRB I. F. R. B. +IG I. G. +IGB I. G. B. +IgG I. g. G. +IGI I. G. I. +IGT I. G. T. +IGX I. G. X. +IH I. H. +IHI I. H. I. +IIGS I. I. G. S. +IIS I. I. S. +IIT I. I. T. +IJ I. J. +IKEA I. K. E. A. +IL I. L. +ILA I. L. A. +ILC I. L. C. +ILGWU I. L. G. W. U. +ILO I. L. O. +ILS I. L. S. +IM I. M. +IMA I. M. A. +IMC I. M. C. +IMD I. M. D. +IMF I. M. F. +IMG I. M. G. +IMI I. M. I. +IMM I. M. M. +IMO I. M. O. +IMS I. M. S. +IMT I. M. T. +IMU I. M. U. +INA I. N. A. +INB I. N. B. +Inc Incorporated +IND I. N. D. +INF I. N. F. +ING I. N. G. +INI I. N. I. +INPO I. N. P. O. +INR I. N. R. +INS I. N. S. +Intl International +Intercorp Intercorporation +IOC I. O. C. +IOR I. O. R. +IOS I. O. S. +IOU I. O. U. +IP I. P. +IPC I. P. C. +IPE I. P. E. +IPFA I. P. F. A. +IPM I. P. M. +IPO I. P. O. +IPS I. P. S. +IQ I. Q. +IRA I. R. A. +IRI I. R. I. +IRNA I. R. N. A. +IROC I. R. O. C. +IRS I. R. S. +IRT I. R. T. +ISC I. S. C. +ISDN I. S. D. N. +ISE I. S. E. +ISI I. S. I. +ISL I. S. L. +ISM I. S. M. +ISO I. S. O. +ISS I. S. S. +ITA I. T. A. +ITC I. T. C. +ITG I. T. G. +ITN I. T. N. +ITT I. T. T. +ITV I. T. V. +IU I. U. +IUD I. U. D. +IUE I. U. E. +IUR I. U. R. +IVF I. V. F. +IVI I. V. I. +IVIG I. V. I. G. +IXL I. X. L. +IWA I. W. A. +JAL J. A. L. +JAMA J. A. M. A. +JATP J. A. T. P. +JBA J. B. A. +JC J. C. +JCB J. C. B. +JCP J. C. P. +JCS J. C. S. +JCT J. C. T. +JDS J. D. S. +JEC J. E. C. +JFA J. F. A. +JFK J. F. K. +JGC J. G. C. +JHM J. H. M. +JIT J. I. T. +JLG J. L. G. +JMB J. M. B. +JMR J. M. R. +JOA J. O. A. +JP J. P. +JPL J. P. L. +JPM J. P. M. +JR J. R. +JRA J. R. A. +JSP J. S. P. +JT J. T. +JTL J. T. L. +JTM J. T. M. +JTPA J. T. P. A. +JVC J. V. C. +JVP J. V. P. +JWD J. W. D. +JWP J. W. P. +JWT J. W. T. +KAL K. A. L. +KB K. B. +KBA K. B. A. +KBGS K. B. G. S. +KBS K. B. S. +KC K. C. +KCBS K. C. B. S. +KCP K. C. P. +KCS K. C. S. +KCST K. C. S. T. +KD K. D. +KDD K. D. D. +KDI K. D. I. +KETV K. E. T. V. +KF K. F. +KFC K. F. C. +KFF K. F. F. +KFW K. F. W. +KG K. G. +KGaA K. G. a. A. +KGB K. G. B. +KGF K. G. F. +KGMC K. G. M. C. +KH K. H. +KHD K. H. D. +KHJ K. H. J. +KIC K. I. C. +KIO K. I. O. +KK K. K. +KKB K. K. B. +KKR K. K. R. +KLA K. L. A. +KLM K. L. M. +KLP K. L. P. +KLUC K. L. U. C. +KMA K. M. A. +KMET K. M. E. T. +KMG K. M. G. +KMS K. M. S. +KMT K. M. T. +KMW K. M. W. +KN K. N. +KNON K. N. O. N. +KOP K. O. P. +KPAX K. P. A. X. +KPC K. P. C. +KPFK K. P. F. K. +KPMG K. P. M. G. +KPRC K. P. R. C. +KSI K. S. I. +KSZ K. S. Z. +KTF K. T. F. +KTM K. T. M. +KTWV K. T. W. V. +KV K. V. +KVIL K. V. I. L. +KW K. W. +KWU K. W. U. +KZKC K. Z. K. C. +LA L. A. +LB L. B. +LBJ L. B. J. +LBO L. B. O. 
+LBS L. B. S. +LCA L. C. A. +LCD L. C. D. +LCG L. C. G. +LCI L. C. I. +LCP L. C. P. +LDC L. D. C. +LDDS L. D. D. S. +LDI L. D. I. +LDL L. D. L. +LDP L. D. P. +LDS L. D. S. +LDX L. D. X. +LFB L. F. B. +LFC L. F. C. +LG L. G. +LGP L. G. P. +LH L. H. +LHS L. H. S. +LHX L. H. X. +LIC L. I. C. +LiFeS L. i. F. e. S. +LIG L. I. G. +LIN L. I. N. +LIPA L. I. P. A. +LISC L. I. S. C. +LJN L. J. N. +LL L. L. +LLC L. L. C. +LME L. M. E. +LMT L. M. T. +LN L. N. +LNG L. N. G. +LNR L. N. R. +LNS L. N. S. +LOF L. O. F. +LOR L. O. R. +LOT L. O. T. +LP L. P. +LPC L. P. C. +LPGA L. P. G. A. +LPL L. P. L. +LPP L. P. P. +LS L. S. +LSB L. S. B. +LSC L. S. C. +LSD L. S. D. +LSI L. S. I. +LSU L. S. U. +LT L. T. +LTCB L. T. C. B. +LTD L. T. D. +LTV L. T. V. +LTX L. T. X. +LVI L. V. I. +LVMH L. V. M. H. +LX L. X. +LY L. Y. +MAI M. A. I. +MB M. B. +MBA M. B. A. +MBAA M. B. A. A. +MBB M. B. B. +MBE M. B. E. +MBF M. B. F. +MBFR M. B. F. R. +MBH M. B. H. +MBI M. B. I. +MBIA M. B. I. A. +MBS M. B. S. +MC M. C. +MCA M. C. A. +MCC M. C. C. +MCCP M. C. C. P. +MCEG M. C. E. G. +MCI M. C. I. +MCM M. C. M. +MCN M. C. N. +MCO M. C. O. +MCP M. C. P. +MCS M. C. S. +MD M. D. +MDA M. D. A. +MDB M. D. B. +MDC M. D. C. +MDI M. D. I. +MDM M. D. M. +MDT M. D. T. +MEBA M. E. B. A. +MEI M. E. I. +MEK M. E. K. +MEM M. E. M. +MEPC M. E. P. C. +MFA M. F. A. +MFI M. F. I. +MFL M. F. L. +MFN M. F. N. +MFS M. F. S. +MGC M. G. C. +MGI M. G. I. +MGM M. G. M. +MH M. H. +MHA M. H. A. +MHC M. H. C. +MHI M. H. I. +MHP M. H. P. +MHQ M. H. Q. +MI M. I. +MIA M. I. A. +MICC M. I. C. C. +MIGA M. I. G. A. +MIM M. I. M. +MIP M. I. P. +MIPS M. I. P. S. +MIS M. I. S. +MIT M. I. T. +MITI M. I. T. I. +MK M. K. +MKI M. K. I. +ML M. L. +MLP M. L. P. +MLPI M. L. P. I. +MLS M. L. S. +MLX M. L. X. +MMAC M. M. A. C. +MMC M. M. C. +MMI M. M. I. +MMPI M. M. P. I. +MMR M. M. R. +MMS M. M. S. +MMWEC M. M. W. E. C. +MNC M. N. C. +MNet M. Net +MNX M. N. X. +MP M. P. +MPAA M. P. A. A. +MPB M. P. B. +MPLA M. P. L. A. +MPS M. P. S. +MPT M. P. T. +MPTP M. P. T. P. +MPV M. P. V. +MRC M. R. C. +MRCA M. R. C. A. +MRI M. R. I. +MRP M. R. P. +MRTA M. R. T. A. +MS M. S. +MSA M. S. A. +MSHA M. S. H. A. +MSI M. S. I. +MSL M. S. L. +MSM M. S. M. +MSOE M. S. O. E. +MSP M. S. P. +MSRB M. S. R. B. +MSU M. S. U. +MSX M. S. X. +MTA M. T. A. +MTB M. T. B. +MTBE M. T. B. E. +MTech M. Tech +MTI M. T. I. +MTM M. T. M. +MTR M. T. R. +MTS M. T. S. +MTU M. T. U. +MTV M. T. V. +MV M. V. +MVP M. V. P. +MVS M. V. S. +MX M. X. +NA N. A. +NAACP N. Double A. C. P. +NAC N. A. C. +NACA N. A. C. A. +NACM N. A. C. M. +NAD N. A. D. +NAEIR N. A. E. I. R. +NAEP N. A. E. P. +NAHB N. A. H. B. +NAIC N. A. I. C. +NAL N. A. L. +NALU N. A. L. U. +NAM N. A. M. +NAPAP N. A. P. A. P. +NAPM N. A. P. M. +NAR N. A. R. +NARFE N. A. R. F. E. +NAS N. A. S. +#NASA N. A. S. A. +NASD N. A. S. D. +NASSA N. A. S. S. A. +NATCA N. A. T. C. A. +NAV N. A. V. +NBA N. B. A. +NBC N. B. C. +NBD N. B. D. +NBER N. B. E. R. +NBI N. B. I. +NBO N. B. O. +NBS N. B. S. +NC N. C. +NCA N. C. A. +NCAA N. C. A. A. +NCB N. C. B. +NCC N. C. C. +NCI N. C. I. +NCIF N. C. I. F. +NCMS N. C. M. S. +NCNB N. C. N. B. +NCR N. C. R. +NCTA N. C. T. A. +NDF N. D. F. +NDI N. D. I. +NDP N. D. P. +NEA N. E. A. +NEC N. E. C. +NEH N. E. H. +NEI N. E. I. +NESB N. E. S. B. +NETAAC N. E. T. A. A. C. +NFA N. F. A. +NFC N. F. C. +NFIB N. F. I. B. +NFIC N. F. I. C. +NFL N. F. L. +NFPA N. F. P. A. +NFS N. F. S. +NFSW N. F. S. W. +NGL N. G. L. +NH N. H. +NHK N. H. K. +NHL N. H. L. +NHS N. H. S. +NHTSA N. H. T. S. A. +NI N. I. +NIA N. I. A. +NIC N. I. C. +NIDA N. I. D. A. 
+NIH N. I. H. +NIMH N. I. M. H. +NIOSH N. I. O. S. H. +NIS N. I. S. +NJ N. J. +NKF N. K. F. +NKK N. K. K. +NKVD N. K. V. D. +NL N. L. +NLD N. L. D. +NLI N. L. I. +NLM N. L. M. +NLO N. L. O. +NLRB N. L. R. B. +NM N. M. +NME N. M. E. +NMP N. M. P. +NMS N. M. S. +NMTBA N. M. T. B. A. +NMU N. M. U. +NOAA N. O. A. A. +NOX N. O. X. +NPA N. P. A. +NPC N. P. C. +NPD N. P. D. +NPM N. P. M. +NRA N. R. A. +NRC N. R. C. +NRDC N. R. D. C. +NRECA N. R. E. C. A. +NRM N. R. M. +NS N. S. +NSA N. S. A. +NSC N. S. C. +NSF N. S. F. +NSM N. S. M. +NSPA N. S. P. A. +NT N. T. +NTC N. T. C. +NTG N. T. G. +NTIA N. T. I. A. +NTN N. T. N. +NTSB N. T. S. B. +NTT N. T. T. +NTX N. T. X. +NUI N. U. I. +NUM N. U. M. +NUS N. U. S. +NV N. V. +NVF N. V. F. +NW N. W. +NWA N. W. A. +NWQ N. W. Q. +NX N. X. +NY N. Y. +NYCB N. Y. C. B. +NYCE N. Y. C. E. +NYFE N. Y. F. E. +NYSE N. Y. S. E. +NYT N. Y. T. +NYU N. Y. U. +NZI N. Z. I. +OAG O. A. G. +OAS O. A. S. +OASDI O. A. S. D. I. +OAT O. A. T. +OCC O. C. C. +OCE O. C. E. +OCR O. C. R. +OCS O. C. S. +OCU O. C. U. +ODS O. D. S. +OEC O. E. C. +OECD O. E. C. D. +OED O. E. D. +OEL O. E. L. +OEM O. E. M. +OEX O. E. X. +OG O. G. +OIRA O. I. R. A. +OIS O. I. S. +OK O. K. +OKC O. K. C. +OMB O. M. B. +OMI O. M. I. +OMV O. M. V. +ONG O. N. G. +OPIC O. P. I. C. +OPM O. P. M. +ORI O. R. I. +ORS O. R. S. +OS O. S. +OSF O. S. F. +OSI O. S. I. +OSS O. S. S. +OTA O. T. A. +OTC O. T. C. +OTF O. T. F. +OTN O. T. N. +OTS O. T. S. +OTV O. T. V. +OV O. V. +PA P. A. +PAE P. A. E. +PAK P. A. K. +PATC P. A. T. C. +PB P. B. +PBA P. B. A. +PBGC P. B. G. C. +PBHG P. B. H. G. +PBI P. B. I. +PBR P. B. R. +PBS P. B. S. +PBX P. B. X. +PC P. C. +PCA P. C. A. +PCB P. C. B. +PCC P. C. C. +PCE P. C. E. +PCI P. C. I. +PCjr P. C. Junior +PCL P. C. L. +PCM P. C. M. +PCMCIA P. C. M. C. I. A. +PCN P. C. N. +PCP P. C. P. +PCR P. C. R. +PCS P. C. S. +PCW P. C. W. +PD P. D. +PDA P. D. A. +PDF P. D. F. +PDI P. D. I. +PDLA P. D. L. A. +PDR P. D. R. +PDT P. D. T. +PE P. E. +PECC P. E. C. C. +PF P. F. +PFM P. F. M. +PG P. G. +PGA P. G. A. +PGH P. G. H. +PhD P. H. D. +Ph.D P. H. D. +Ph.D.s P. H. D.s +Ph.Ds P. H. D.s +PhDs P. H. D.s +PHH P. H. H. +PHLCorp P. H. L. Corporation +PHM P. H. M. +PHP P. H. P. +PHPO P. H. P. O. +PI P. I. +PIK P. I. K. +PIP P. I. P. +PIR P. I. R. +PIW P. I. W. +PL P. L. +PLC P. L. C. +PLE P. L. E. +PLM P. L. M. +PLO P. L. O. +PM P. M. +PMA P. M. A. +PMC P. M. C. +PMDB P. M. D. B. +PMI P. M. I. +PMS P. M. S. +PMT P. M. T. +PNB P. N. B. +PNC P. N. C. +PNG P. N. G. +PNM P. N. M. +PNOC P. N. O. C. +POW P. O. W. +PP P. P. +PPD P. P. D. +PPG P. P. G. +PPI P. P. I. +PPM P. P. M. +PPO P. P. O. +PPP P. P. P. +PQQ P. Q. Q. +PR P. R. +PRB P. R. B. +PRC P. R. C. +PRD P. R. D. +PRI P. R. I. +PRSA P. R. S. A. +Pvt Private +PRK P. R. K. +PRP P. R. P. +PS P. S. +PSA P. S. A. +PSC P. S. C. +PSE P. S. E. +PSG P. S. G. +PSI P. S. I. +PSNH P. S. N. H. +PSR P. S. R. +PST P. S. T. +PSUM P. S. U. M. +PT P. T. +PTA P. T. A. +PTI P. T. I. +PTL P. T. L. +PTT P. T. T. +PUC P. U. C. +PV P. V. +PVC P. V. C. +PW P. W. +PWA P. W. A. +PWS P. W. S. +PX P. X. +PYA P. Y. A. +QB Q. B. +QDE Q. D. E. +QE Q. E. +QFB Q. F. B. +QMS Q. M. S. +QO Q. O. +QVC Q. V. C. +RAC R. A. C. +RAF R. A. F. +RAI R. A. I. +RB R. B. +RBC R. B. C. +RC R. C. +RCA R. C. A. +RCI R. C. I. +RCM R. C. M. +RD R. D. +RDF R. D. F. +RDP R. D. P. +REIT R. E. I. T. +RF R. F. +RFC R. F. C. +RFD R. F. D. +RFE R. F. E. +RFI R. F. I. +RFTV R. F. T. V. +RG R. G. +RHI R. H. I. +RHM R. H. M. +RI R. I. +RJ R. J. +RJR R. J. R. +RKO R. K. O. +RL R. L. +RLC R. L. C. +RLI R. L. I. 
+RLR R. L. R. +RMC R. M. C. +RMI R. M. I. +RMJ R. M. J. +RMS R. M. S. +RMV R. M. V. +RNA R. N. A. +RNC R. N. C. +RO R. O. +ROA R. O. A. +ROC R. O. C. +ROTC R. O. T. C. +RPA R. P. A. +RPM R. P. M. +RREEF R. R. E. E. F. +RS R. S. +RSC R. S. C. +RSCG R. S. C. G. +RSI R. S. I. +RSO R. S. O. +RSV R. S. V. +RT R. T. +RTBF R. T. B. F. +RTC R. T. C. +RTE R. T. E. +RTHK R. T. H. K. +RTL R. T. L. +RTM R. T. M. +RTS R. T. S. +RTZ R. T. Z. +RU R. U. +RUC R. U. C. +RV R. V. +RWE R. W. E. +RX R. X. +SA S. A. +SAA S. A. A. +SAB S. A. B. +SACC S. A. C. C. +SACP S. A. C. P. +SAI S. A. I. +SAL S. A. L. +SALP S. A. L. P. +SAO S. A. O. +SAPC S. A. P. C. +SAS S. A. S. +SAT S. A. T. +SB S. B. +SBA S. B. A. +SBC S. B. C. +SBCI S. B. C. I. +SBIC S. B. I. C. +SBIR S. B. I. R. +SBK S. B. K. +SBS S. B. S. +SC S. C. +SCA S. C. A. +SCE S. C. E. +SCEcorp S. C. E. Corporation +SCI S. C. I. +SCM S. C. M. +SD S. D. +SDA S. D. A. +SDC S. D. C. +SDG S. D. G. +SDI S. D. I. +SDP S. D. P. +SDR S. D. R. +SDRC S. D. R. C. +SDS S. D. S. +SE S. E. +SEC S. E. C. +SEEQ S. E. E. Q. +SEI S. E. I. +SEL S. E. L. +SEM S. E. M. +SES S. E. S. +SF S. F. +SFC S. F. C. +SFE S. F. E. +SFN S. F. N. +SFO S. F. O. +SGB S. G. B. +SGC S. G. C. +SGI S. G. I. +SGS S. G. S. +SH S. H. +SHL S. H. L. +SHV S. H. V. +SI S. I. +SIA S. I. A. +SIB S. I. B. +SIBV S. I. B. V. +SIPC S. I. P. C. +SIV S. I. V. +SJNB S. J. N. B. +SK S. K. +SKF S. K. F. +SKK S. K. K. +SL S. L. +SLA S. L. A. +SLH S. L. H. +SLM S. L. M. +SLR S. L. R. +SMC S. M. C. +SME S. M. E. +SMES S. M. E. S. +SMR S. M. R. +SMS S. M. S. +SMU S. M. U. +SMUD S. M. U. D. +SNC S. N. C. +SNCF S. N. C. F. +SNET S. N. E. T. +SNIA S. N. I. A. +SNL S. N. L. +SNPE S. N. P. E. +SOES S. O. E. S. +SOS S. O. S. +SP S. P. +SPD S. P. D. +SPE S. P. E. +SPEP S. Pep +SPG S. P. G. +SPI S. P. I. +SPS S. P. S. +SPSF S. P. S. F. +SPX S. P. X. +SpA Company +S.p.A Company +SQL S. Q. L. +SR S. R. +SRI S. R. I. +SRK S. R. K. +SRL S. R. L. +SRO S. R. O. +SRS S. R. S. +SS S. S. +SSA S. S. A. +SSB S. S. B. +SSBI S. S. B. I. +SSC S. S. C. +SSI S. S. I. +SSMC S. S. M. C. +SSN S. S. N. +SSP S. S. P. +SST S. S. T. +STC S. T. C. +Ste Saint +STS S. T. S. +SVP S. V. P. +SX S. X. +TA T. A. +TB T. B. +TBA T. B. A. +TBC T. B. C. +TBF T. B. F. +TBG T. B. G. +TBK T. B. K. +TBN T. B. N. +TBS T. B. S. +TBWA T. B. W. A. +TC T. C. +TCA T. C. A. +TCBY T. C. B. Y. +TCC T. C. C. +TCF T. C. F. +TCI T. C. I. +TCMP T. C. M. P. +TCP T. C. P. +TCS T. C. S. +TCU T. C. U. +TCW T. C. W. +TD T. D. +TDD T. D. D. +TDK T. D. K. +TDU T. D. U. +TE T. E. +TEC T. E. C. +TEP T. E. P. +TF T. F. +TFBA T. F. B. A. +TFD T. F. D. +TFF T. F. F. +TFR T. F. R. +TGI T. G. I. +TGL T. G. L. +TGWU T. G. W. U. +THA T. H. A. +THI T. H. I. +THT T. H. T. +TI T. I. +TIAA T. I. A. A. +TII T. I. I. +TIL T. I. L. +TIMI T. I. M. I. +TJ T. J. +TJX T. J. X. +TKR T. K. R. +TLC T. L. C. +TM T. M. +TMC T. M. C. +TMI T. M. I. +TMIC T. M. I. C. +TMK T. M. K. +TMOC T. M. O. C. +TNA T. N. A. +TNF T. N. F. +TNM T. N. M. +TNP T. N. P. +TNT T. N. T. +TOA T. O. A. +TPA T. P. A. +tPA t. P. A. +TPF T. P. F. +TPI T. P. I. +TPS T. P. S. +TR T. R. +TRC T. R. C. +TRE T. R. E. +TRO T. R. O. +TRS T. R. S. +TRT T. R. T. +TRW T. R. W. +TS T. S. +TSA T. S. A. +TSB T. S. B. +TSE T. S. E. +TSF T. S. F. +TSI T. S. I. +TSO T. S. O. +TSSU T. S. S. U. +TTAC T. T. A. C. +TTAPS T. T. A. P. S. +TU T. U. +TV T. V. +TVA T. V. A. +TVI T. V. I. +TVS T. V. S. +TVSM T. V. S. M. +TVX T. V. X. +TW T. W. +TWA T. W. A. +TX T. X. +TXI T. X. I. +TXL T. X. L. +TXO T. X. O. +UA U. A. +UAE U. A. E. +UAL U. A. L. +UAP U. A. P. 
+UAW U. A. W. +UBAF U. B. A. F. +UBS U. B. S. +UC U. C. +UCLA U. C. L. A. +UCLAF U. C. L. A. F. +UCSD U. C. S. D. +UCSF U. C. S. F. +UD U. D. +UDAG U. D. A. G. +UDC U. D. C. +UDF U. D. F. +UEI U. E. I. +UFO U. F. O. +UFT U. F. T. +UFW U. F. W. +UGI U. G. I. +UH U. H. +UHF U. H. F. +UHL U. H. L. +UI U. I. +UIC U. I. C. +UIS U. I. S. +UJA U. J. A. +UK U. K. +UKI U. K. I. +ULI U. L. I. +UMBC U. M. B. C. +UMC U. M. C. +UMNO U. M. N. O. +UMTA U. M. T. A. +UMW U. M. W. +UNAM U. N. A. M. +UNC U. N. C. +UNCF U. N. C. F. +UNDP U. N. D. P. +UNHCR U. N. H. C. R. +UNLV U. N. L. V. +UNR U. N. R. +UOP U. O. P. +UPC U. P. C. +UPI U. P. I. +UPS U. P. S. +URS U. R. S. +URW U. R. W. +US U. S. +USA U. S. A. +U.S.A U. S. A. +USAA U. S. A. A. +USACafes U. S. A. Cafes +USADirect U. S. A. Direct +USAir U. S. Air +USC U. S. C. +USCB U. S. C. B. +USDA U. S. D. A. +USF U. S. F. +USFL U. S. F. L. +USG U. S. G. +USH U. S. H. +USI U. S. I. +USIA U. S. I. A. +USLico U. S. Lico +USLife U. S. Life +USO U. S. O. +USOC U. S. O. C. +Uspci U. S. P. C. I. +USPS U. S. P. S. +USS U. S. S. +USSC U. S. S. C. +USSR U. S. S. R. +UST U. S. T. +USW U. S. W. +USX U. S. X. +UT U. T. +UTA U. T. A. +UTC U. T. C. +UTL U. T. L. +UTU U. T. U. +UV U. V. +UX U. X. +VA V. A. +VAAP V. A. A. P. +VAD V. A. D. +VAN V. A. N. +VBI V. B. I. +VC V. C. +VCI V. C. I. +VCR V. C. R. +VCRS V. C. R. S. +VCS V. C. S. +VD V. D. +VDT V. D. T. +VF V. F. +VFW V. F. W. +VG V. G. +VGA V. G. A. +VH V. H. +VHA V. H. A. +VHF V. H. F. +VHL V. H. L. +VHS V. H. S. +VIP V. I. P. +VIR V. I. R. +VISN V. I. S. N. +VJN V. J. N. +VLI V. L. I. +VLSI V. L. S. I. +VM V. M. +VMS V. M. S. +VMX V. M. X. +VNA V. N. A. +VNR V. N. R. +VNU V. N. U. +VO V. O. +VOA V. O. A. +VOR V. O. R. +VP V. P. +VPI V. P. I. +VPT V. P. T. +VQT V. Q. T. +VR V. R. +VRA V. R. A. +VS V. S. +VSAT V. S. A. T. +VSB V. S. B. +VTC V. T. C. +VTR V. T. R. +VTX V. T. X. +VW V. W. +VWR V. W. R. +WABC W. A. B. C. +WAFA W. A. F. A. +WASP W. A. S. P. +WATS W. A. T. S. +WB W. B. +WBA W. B. A. +WBAI W. B. A. I. +WBBM W. B. B. M. +WBZ W. B. Z. +WCBS W. C. B. S. +WCI W. C. I. +WCIX W. C. I. X. +WCK W. C. K. +WCRS W. C. R. S. +WCVB W. C. V. B. +WD W. D. +WDB W. D. B. +WEFA W. E. F. A. +WEG W. E. G. +WestLB West L. B. +WEU W. E. U. +WFAN W. F. A. N. +WFBQ W. F. B. Q. +WFC W. F. C. +WFIA W. F. I. A. +WFLA W. F. L. A. +WFRR W. F. R. R. +WFXT W. F. X. T. +WGA W. G. A. +WGBH W. G. B. H. +WGC W. G. C. +WGHP W. G. H. P. +WGMS W. G. M. S. +WGN W. G. N. +WHAS W. H. A. S. +WHBQ W. H. B. Q. +WIC W. I. C. +WITI W. I. T. I. +WJBK W. J. B. K. +WJW W. J. W. +WKYS W. K. Y. S. +WLR W. L. R. +WM W. M. +WMAQ W. M. A. Q. +WMG W. M. G. +WMMS W. M. M. S. +WMS W. M. S. +WNAC W. N. A. C. +WNBC W. N. B. C. +WNCN W. N. C. N. +WNET W. N. E. T. +WNEW W. N. E. W. +WNS W. N. S. +WNW W. N. W. +WNYC W. N. Y. C. +WNYW W. N. Y. W. +WOJB W. O. J. B. +WOMC W. O. M. C. +WOR W. O. R. +WPA W. P. A. +WPBF W. P. B. F. +WPGH W. P. G. H. +WPIX W. P. I. X. +WPP W. P. P. +WPPSS W. P. P. S. S. +WQHT W. Q. H. T. +WQTV W. Q. T. V. +WQUE W. Q. U. E. +WR W. R. +WRKO W. R. K. O. +WROR W. R. O. R. +WSBK W. S. B. K. +WSCV W. S. C. V. +WSGP W. S. G. P. +WSJ W. S. J. +WSVN W. S. V. N. +WTBS W. T. B. S. +WTC W. T. C. +WTD W. T. D. +WTI W. T. I. +WTLV W. T. L. V. +WTO W. T. O. +WTTV W. T. T. V. +WTVJ W. T. V. J. +WTVT W. T. V. T. +WTXF W. T. X. F. +WW W. W. +WWII W. W. I. I. +WWL W. W. L. +WWOR W. W. O. R. +WXRK W. X. R. K. +WYLD W. Y. L. D. +WYNY W. Y. N. Y. +WZTV W. Z. T. V. +XA X. A. +XE X. E. +XJ X. J. +XL X. L. +XMP X. M. P. +XP X. P. +XR X. R. +XT X. T. +XTC X. T. C. 
+XYZ X. Y. Z. +YMCA Y. M. C. A. +YSL Y. S. L. +YTT Y. T. T. +YWCA Y. W. C. A. +ZCB Z. C. B. +ZDF Z. D. F. +ZMI Z. M. I. +ZR Z. R. +ZTS Z. T. S. +ZX Z. X. +mm m. m. +mg m. g. diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/abbrproc.perl b/egs/bn/s5/local/data_prep/csr_hub4_utils/abbrproc.perl new file mode 100755 index 00000000000..f22684c5742 --- /dev/null +++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/abbrproc.perl @@ -0,0 +1,465 @@ +#!/usr/bin/perl +# $Id: abbrproc.perl,v 1.3 1996/08/21 20:05:09 robertm Rel $ +############################################################################### +# This software is being provided to you, the LICENSEE, by the Massachusetts # +# Institute of Technology (M.I.T.) under the following license. By # +# obtaining, using and/or copying this software, you agree that you have # +# read, understood, and will comply with these terms and conditions: # +# # +# Permission to use, copy, modify and distribute, including the right to # +# grant others the right to distribute at any tier, this software and its # +# documentation for any purpose and without fee or royalty is hereby granted, # +# provided that you agree to comply with the following copyright notice and # +# statements, including the disclaimer, and that the same appear on ALL # +# copies of the software and documentation, including modifications that you # +# make for internal use or for distribution: # +# # +# Copyright 1991-4 by the Massachusetts Institute of Technology. All rights # +# reserved. # +# # +# THIS SOFTWARE IS PROVIDED "AS IS", AND M.I.T. MAKES NO REPRESENTATIONS OR # +# WARRANTIES, EXPRESS OR IMPLIED. By way of example, but not limitation, # +# M.I.T. MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS # +# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR # +# DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, # +# TRADEMARKS OR OTHER RIGHTS. # +# # +# The name of the Massachusetts Institute of Technology or M.I.T. may NOT be # +# used in advertising or publicity pertaining to distribution of the # +# software. Title to copyright in this software and any associated # +# documentation shall at all times remain with M.I.T., and USER agrees to # +# preserve same. # +############################################################################### + +# abbreviation preprocessor for WSJ +# assumes 1 sentence per line +# +# 1. map "x.y." -> "x. y." +# 2. convert Roman numerals with appropriate left context into cardinal no.s +# 3. expand abbreviations and word translations +# expands remaining Roman numerals into ordinal no.s +# 4. map isolated letters: "x" -> "x." + +# Minor modifications by David Graff, Linguistic Data Consortium, in +# preparation for publishing on cdrom; Aug. 11, 1994. + +# Major modifications by Robert MacIntyre, LDC, attempting to improve +# performance (~50% speedup), in preparation of Broadcast News material, +# August 1996. 
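+# Summary of the tables loaded below: %romanlc holds the "*r" left-context +# words that license Roman-numeral expansion; %abbrev holds true +# abbreviations keyed without the final "." (e.g. "Adm" -> "Admiral"); +# %trans holds acronym translations (e.g. "ABC" -> "A. B. C."). Keys +# containing lower case are also entered in an all-upper-case variant.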
+ + +$file="$ENV{HOME}/bc-news/bin/abbrlist"; # default abbreviation file + +for($i=0,$j=0;$i<=$#ARGV;$i++) +{ if($ARGV[$i] =~ /^-/) + { if($ARGV[$i] =~ /^-v/) {$vflg=1;} + else {&perr("illegal flag: $ARGV[$i]");} + } + else + { # if($file) {&perr("multiple file arg");} + $file=$ARGV[i]; + } +} +@ARGV=(); +if(!file) {&perr("no abbreviation file specified"); } + +if(!open(FILE,$file)) {&perr("cannot open abbreviation file"); } +while() +{ if(/^#/) {next;} # comment + s/\n//; + if(!$_) {next;} # blank + $y=$_; + s/^(\S+)\s+//; # extract 1st word + $x=$1; + if(!$x) {&perr("no word: $y");} + if(!$_) {&perr("no value: $y");} + + if($x =~ /^\*r/) # left context for roman numeral + { if(!/^[a-zA-Z]{2,}$/) + {&perr("illegal roman: $x");} + tr/a-z/A-Z/; # map to UC + $romanlc{$_}=1; + } + elsif($x =~ /\.$/) # abbreviations + { if($x !~ /^[a-zA-Z][a-zA-Z\.]+\.$/) + {&perr("illegal abbreviation: $x");} + $x =~ s/\.$//; + $abbrev{$x}=$_; + if($x =~ /[a-z]/) + { $x =~ tr/a-z/A-Z/; #UC version + tr/a-z/A-Z/; + $abbrev{$x}=$_; + } + #if(length($x)>$maxabl) {$maxabl=length($x);} + } + else # translations + { if($x !~ /^[a-zA-Z\.&\/-]+[a-zA-Z]$/) + {&perr("illegal translation: $x");} + $trans{$x}=$_; + if($x =~ /[a-z]/) + { $x =~ tr/a-z/A-Z/; #UC version + tr/a-z/A-Z/; + $trans{$x}=$_; + } + #if(length($x)>$maxtrl) {$maxtrl=length($x);} + } + $n++; +} +#if($vflg) {print STDERR "$n lines read from file\n";} + +&setupRoman; + +while(<>) +{ ########################### abbrevproc #################################### + + # pass SGML as is + if (/^<\/?[spa]/) + { + print; + next; + } + chop; + + + s/&/ & /g; # & + s=/= / =g; # / + s/ - / -- /g; # save (long) dashes + s/\b(-+)\b/ $1 /g; # -, --, etc. in words + s/([^-\s])(-+)([^-\s])/$1 $2 $3/g; + + if(/_/) + { + &perr2("removing illegal underscores (_) in:\n $_\n"); + s/_//g; + } + + @input = split(/\s+/); + @output=(); + for($field=0;$field<=$#input;$field++) + { + $_ = $input[$field]; + # if($vflg) {print "in: $_\n";} + + s/^(\W*)//; # strip front + $front=$1; + + s/(\W*)$//; # strip back + $back=$1; + if(/\.?\'[sS]$/) # possessive + { + s/(\.?\'[sS])$//; + $back="$1$back"; + } + elsif (/^[A-Z]+s$/) # eg Cs or Xs + { + s/s$//; + $back="_s$back"; + } + + $ptbkflg = ($back =~ /^\./); + + #if($vflg) {print "f=$front, m=$_, b=$back\n";} + + + # Roman numerals + if(/^[IVX]{1,6}$/ && $front eq "" && $field>0 && + ($x=&geto())) + { + $x =~ tr/a-z/A-Z/; # map lc to UC + $x =~ s/^\W//; # strip initial punct from lc + if($romanlc{$x}) # left context check + { + if($front) + { + &pusho($front); + if($front !~ /[\w]$/) {$appendflg=1;} + } + + if ($x=$Roman{$_}) + { + &pusho($x); + } + else + { + &perr2("illegal roman: $_"); + &pusho($_); + } + + if($back) + { + if($back !~ /^[\w]/) {&appendo($back);} + else {&pusho($back);} + } + next; + } + + } + + + # St. or St ["Street" vs. 
"Saint"] + if($_ eq "St") + { $back =~ s/^\.//; + if($front ne "" && $back ne "") + { &perr2("Cannot resove St.: $input[$field-1] $input[$field] $input[$field+1]"); + $x=Street; # Wild guess + } + elsif($front) { $x="Saint"; } + elsif($back) { $x="Street"; } + elsif($input[$field-1] !~ /^[A-Z]/ + && $input[$field+1] =~ /^[A-Z]/) + { $x = "Saint"; } + elsif($input[$field-1] =~ /^[A-Z]/ + && $input[$field+1] !~ /^[A-Z]/) + { $x = "Street"; } + + elsif(!$back && $input[$field+1] =~ /^[A-Z]/) + { $x = "Saint"; } + elsif(!$back && $input[$field+1] eq '-' && + $input[$field+2] =~ /^[A-Z]/) + { $x = "Saint"; } + else + { &perr2("Cannot resove St.: $input[$field-1] $input[$field] $input[$field+1]"); + $x=Street; # Wild guess + } + + + if($front) + { &pusho($front); + if($front !~ /[\w]$/) {$appendflg=1;} + } + + &pusho($x); + + if($back) + { if($back !~ /^[\w]/) {&appendo($back);} + else {&pusho($back);} + } + next; + } + + # abbreviations (end with .) + if($ptbkflg && ($x=$abbrev{$_})) + { + if($front) + { &pusho($front); + if($front !~ /[\w]$/) + {$appendflg=1;} + } + + &pusho($x); + + if($field<$#input || $back =~ /[!?]/) + { $back =~ s/^\.//; } # rm . + else # end of sent + { $back =~ s/^\.(\'s)/$1./; + if($back =~ /\..*\./) # 2 dots + {$back=~s/\.([^\.]*)/$1/;} + } + + if($back) + { if($back !~ /^[\w]/) + {&appendo($back);} + else {&pusho($back);} + } + next; + + } + + # translations (do not end with .) + # first merge multi-token translations + if($input[$field+1] =~ /^[-\/&]$/ && $back eq "") + { $x=$input[$field+2]; + $x =~ s/(\W*)$//; + $xback=$1; + if($x =~ /\.?\'[sS]$/) # possessive + { $x =~ s/(\.?\'[sS])$//; + $xback="$1$xback"; + } + elsif ($x =~ /^[A-Z]+s$/) # eg Cs or Xs + { $x =~ s/s$//; + $xback="_s$xback"; + } + if($trans{"$_$input[$field+1]$x"}) # eg. AT&T + { $_="$_$input[$field+1]$x"; + $field+=2; + + $back=$xback; + $ptbkflg = ($back =~ /^\./); + } + } + # then see if we have a translation + if ($x=$trans{$_}) + { if($front) + { &pusho($front); + if($front !~ /[\w]$/) {$appendflg=1;} + } + + &pusho($x); + + if($x =~ /\.$/) { $back =~ s/^\.//; } # only 1 . + if($back) + { if($back !~ /^[\w]/) {&appendo($back);} + else {&pusho($back);} + } + next; + } + + # eg. Cs, but not As Is Ms Us + if(($back =~ /^_s/) && /^[B-HJ-LN-TV-Z]$/) + { if($front) + { &pusho($front); + if($front !~ /[\w]$/) {$appendflg=1;} + } + + &pusho("$_."); + + if($back) + { if($back !~ /^[\w]/) {&appendo($back);} + else {&pusho($back);} + } + next; + } + + # split x.y. + $_ .= '.' if $ptbkflg; # NOTE THIS CHANGES $_ FOR FUTURE MATCHES + # but it has no more uses in this loop, + # so this _should_ be okay. + if (/^([a-zA-Z]\.)+([sS]?)$/) + { + $sflag = $2; # remember if plural (as opposed to a.s.) + + chop if $ptbkflg; # trim period that we just added + + s/\./. /g; # x.y. -> x. y. + + s/ ([sS])$/$1/ if $sflag; # reattach final "s" + + if($front) + { &pusho($front); + if($front !~ /[\w]$/) {$appendflg=1;} + } + + &pusho($_); + + if($back) + { if($back !~ /^[\w]/) {&appendo($back);} + else {&pusho($back);} + } + next; + } + + # remaining tokens are passed "as is" + # [Below does "&pusho($input[$field]);" but faster, since we avoid + # the subroutine call for the most common case.] 
+ push(@output,$input[$field]); + } + + $_=join(" ",@output); + + # if($vflg) {print "ab:\t$_\n";} + + ######################### lettproc ###################################### + if (/\b[b-zB-HJ-Z]\b/) + { + @output = split(/\s+/); + + foreach(@output) + { + next unless /^\W*[b-zB-HJ-Z]\W*$/; + + #if($vflg) {print "le: $_\n";} + + # some cases to skip/pre-change. (Note that backslashing of + # quotes is for the sake of Emacs, not Perl.) + next if (/^[\'][nN]$/); # Spic \'n Span + + s/(^[\`\'][nN])[\`\']$/$1/ && next; # Rock 'n' Roll: 'n' -> \'n + + s/^[\`\'\"]R[\'\`\"]$/"R"/ && next; # Toys "R" Us + + next if (/^o\'$/); # Man o\' War + + # put . at end of remaining single-letter words + s/^(\W*)([b-zB-HJ-Z])([^.\w]\W*|[^\w.]*)$/$1$2.$3/; + } + + $_=join(" ",@output); + } + + s/\s+/ /g; + s/^ //; + s/ $//; + + s/ _//g; # attach final s for Cs or AFLs + s/_//g; # clear _ + s/ - /-/g; + + print $_,"\n" if $_; +} + +sub pusho # pusho($x): push output +{ if($appendflg) # global: used for fronts + { + &appendo(@_[0]); + } + else {push(@output,@_);} +} + +sub appendo # appendo($x): append to output +{ $appendflg=0; + if($#output < 0) {&perr("appendo: output empty");} + $output[$#output] .= @_[0]; +} + +sub geto # geto(): get last output +{ if($#output < 0) {print STDERR ("geto: output empty\n");} + return $output[$#output]; +} + +sub perr +{ print STDERR "abbrevproc: $_[0]\n"; + exit(1); +} + +sub perr2 +{ print STDERR "abbrevproc: $_[0]\n"; +} + +sub setupRoman +{ + $Roman{I}="one"; + $Roman{II}="two"; + $Roman{III}="three"; + $Roman{IV}="four"; + $Roman{V}="five"; + $Roman{VI}="six"; + $Roman{VII}="seven"; + $Roman{VIII}="eight"; + $Roman{IX}="nine"; + $Roman{X}="ten"; + $Roman{XI}="eleven"; + $Roman{XII}="twelve"; + $Roman{XIII}="thirteen"; + $Roman{XIV}="fourteen"; + $Roman{XV}="fifteen"; + $Roman{XVI}="sixteen"; + $Roman{XVII}="seventeen"; + $Roman{XVIII}="eighteen"; + $Roman{XIX}="nineteen"; + $Roman{XX}="twenty"; + $Roman{XXI}="twenty-one"; + $Roman{XXII}="twenty-two"; + $Roman{XXIII}="twenty-three"; + $Roman{XXIV}="twenty-four"; + $Roman{XXV}="twenty-five"; + $Roman{XXVI}="twenty-six"; + $Roman{XXVII}="twenty-seven"; + $Roman{XXVIII}="twenty-eight"; + $Roman{XXIX}="twenty-nine"; + $Roman{XXX}="thirty"; + $Roman{XXXI}="thirty-one"; + $Roman{XXXII}="thirty-two"; + $Roman{XXXIII}="thirty-three"; + $Roman{XXXIV}="thirty-four"; + $Roman{XXXV}="thirty-five"; +} diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/addressforms b/egs/bn/s5/local/data_prep/csr_hub4_utils/addressforms new file mode 100644 index 00000000000..f3dcdddea7b --- /dev/null +++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/addressforms @@ -0,0 +1,38 @@ +# abbreviation list used for WSJ0 (pilot) processing +# generated by Doug Paul, MIT/LL +# derived from unigram file 29 Aug 91 mods to 17 Sept 91 + +# true abbreviations (must end with .) +# if key includes lower case, an upper case version will be created +Adm. Admiral +Brig. Brigadeer +Capt. Captain +Cmdr. Commander +Col. Colonel +Cpl. Corporal +Dr. Doctor +Drs. Doctors +Fr. Friar +Ft. Fort +Gen. General +Gov. Governor +Lt. Lieutenant +Maj. Major +Mr. Mister +Mrs. Mistress +Ms. Miz +Messrs. Misters +Prof. Professor +Prop. Proposition +Pte. Point +Pvt. Private +Rep. Representative +Reps. Representatives +Rev. Reverend +Sen. Senator +Sens. Senators +Sgt. Sargent +St. Saint +Ste. Saint +vs. versus +v. 
versus
diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/artfilter.perl b/egs/bn/s5/local/data_prep/csr_hub4_utils/artfilter.perl
new file mode 100755
index 00000000000..ed464e4a31d
--- /dev/null
+++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/artfilter.perl
@@ -0,0 +1,83 @@
+#!/usr/bin/perl
+
+# artfilter.perl
+
+# This perl script can be used to (de)select articles from TIPSTER
+# format newswire data on the basis of the content of a specific
+# tagged element. This version allows a number of string patterns
+# (drawn from a separate input file) to be checked against the content
+# of a chosen tag, and allows residue articles to be sent to a
+# separate file (in addition to having selected articles written to
+# stdout).
+
+require "newgetopt.pl";
+$cmd_okay = &NGetOpt( 't=s', 'p=s', 'f=s', 'r=s', 'v', 'x' );
+$arg_okay = ( $opt_t ne "" && ( $opt_p ne "" || $opt_f ne "" ));
+
+if ( ! $cmd_okay || ! $arg_okay ) {
+ print
+"\nUsage: artfilter.perl -t tag [-p ptrn | -f ptrns] [-r resid] [-vx] [infile]\n";
+ print " writes DOCs with <tag> containing /ptrn(s)/ to stdout\n";
+ print " -v = select DOCs NOT containing /ptrn(s)/ in <tag>\n";
+ print " -x = exclude DOCs that do not contain <tag>\n";
+ print " -r = write residue DOCs to resid file\n";
+ exit;
+}
+
+@patrns = ();
+if ( $opt_f ne "" ) {
+ open( PATRNS, "<$opt_f" );
+ while (<PATRNS>) {
+ chop;
+ push( @patrns, $_ );
+ }
+} else {
+ push( @patrns, $opt_p );
+}
+close PATRNS;
+
+if ( $opt_r ) {
+ open( RESID, ">$opt_r" );
+}
+
+$outputOn = $foundtag = 0;
+
+while (<>)
+{
+ if ( /<DOC[ >]/ ) {
+ $artbuf = $_;
+ $outputOn = 1;
+ }
+ elsif ( /<\/DOC>/ ) {
+ if ( $outputOn ) {
+ $artbuf .= $_;
+ if ( $outputOn == 1 && ( ! $opt_x || $foundtag )) {
+ print $artbuf;
+ } elsif ( $opt_r && ( ! $opt_x || $foundtag )) {
+ print RESID $artbuf;
+ }
+ $outputOn = 0;
+ }
+ $foundtag = 0;
+ }
+ elsif ( $outputOn ) {
+ $artbuf .= $_;
+ if ( /\<$opt_t/ ) {
+ $foundtag = 1;
+ $tagdata = $_;
+ while ( $tagdata !~ /\<\/$opt_t/ ) {
+ $_ = <>;
+ $artbuf .= $_;
+ $tagdata .= $_;
+ }
+ foreach $ptn ( @patrns ) {
+ last if (( $i = ( $tagdata !~ /$ptn/ )) == 0 );
+ }
+ if ( $i ^ $opt_v ) { $outputOn = ( $opt_r ) ? 2 : 0; }
+ }
+ }
+}
+
+if ( $opt_r ) {
+ close RESID;
+}
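In rough Python terms the selection loop above does the following (a sketch that assumes the tagged element fits on one line; artfilter.perl additionally accumulates multi-line elements, supports the residue file, and the -x flag):

    import re

    def filter_docs(lines, tag, patterns, invert=False):
        buf, keep = [], False
        for line in lines:
            buf.append(line)
            if line.lstrip().startswith("<" + tag):
                keep = any(re.search(p, line) for p in patterns)
            if "</DOC>" in line:
                if keep != invert:          # -v selects non-matching DOCs
                    print("".join(buf), end="")
                buf, keep = [], False

    # Hypothetical tag and data, for illustration only:
    doc = ["<DOC>\n", "<source>CNNWorld View</source>\n", "text...\n", "</DOC>\n"]
    filter_docs(doc, "source", ["World View"])   # prints the whole DOC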
diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/bugproc.perl b/egs/bn/s5/local/data_prep/csr_hub4_utils/bugproc.perl
new file mode 100755
index 00000000000..48acad96c4e
--- /dev/null
+++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/bugproc.perl
@@ -0,0 +1,69 @@
+#!/usr/bin/perl
+# $Id: bugproc.perl,v 1.4 1996/08/21 23:55:40 robertm Rel $
+###############################################################################
+# This software is being provided to you, the LICENSEE, by the Massachusetts #
+# Institute of Technology (M.I.T.) under the following license. By #
+# obtaining, using and/or copying this software, you agree that you have #
+# read, understood, and will comply with these terms and conditions: #
+# #
+# Permission to use, copy, modify and distribute, including the right to #
+# grant others the right to distribute at any tier, this software and its #
+# documentation for any purpose and without fee or royalty is hereby granted, #
+# provided that you agree to comply with the following copyright notice and #
+# statements, including the disclaimer, and that the same appear on ALL #
+# copies of the software and documentation, including modifications that you #
+# make for internal use or for distribution: #
+# #
+# Copyright 1991-4 by the Massachusetts Institute of Technology. All rights #
+# reserved. #
+# #
+# THIS SOFTWARE IS PROVIDED "AS IS", AND M.I.T. MAKES NO REPRESENTATIONS OR #
+# WARRANTIES, EXPRESS OR IMPLIED. By way of example, but not limitation, #
+# M.I.T. MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS #
+# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR #
+# DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, #
+# TRADEMARKS OR OTHER RIGHTS. #
+# #
+# The name of the Massachusetts Institute of Technology or M.I.T. may NOT be #
+# used in advertising or publicity pertaining to distribution of the #
+# software. Title to copyright in this software and any associated #
+# documentation shall at all times remain with M.I.T., and USER agrees to #
+# preserve same. #
+###############################################################################
+
+# bugproc.comm
+# Removes some bugs common to all sources.
+# This script has no source-dependencies.
+
+while(<>)
+{
+ if ( /^</ ) # pass SGML as is
+ { print;
+ next;
+ }
+
+ s/\((\w)/( $1/g; # eg. (x -> ( x
+ s/\)(\w)/) $1/g; # eg. )x -> ) x;
+
+ s/(\d)\((\d)/$1 ($2/g; # \d(\d
+ s/(\d)\)(\d)/$1) $2/g; # \d)\d;
+ s/([a-zA-Z]{2,}\.)(\d)/$1 $2/g; # eg. Sept.30
+ s/,([a-zA-Z])/, $1/g; # eg. 20,Smith
+ s/(\W)milion(\W)/$1million$2/g; # spelling err
+
+ s/(\W&\s*)Co([^\w\.-])/$1Co.$2/g; # "& Co" -> "& Co."
+ s/(\WU\.S)([^\.\w])/$1.$2/g; # U.S -> U.S.
+
+ # next block added for Broadcast News archive processing
+ s/\$ +(\d)/\$$1/g; # e.g. "$ 5" -> "$5"
+ s/\$\#/\$/g; # e.g. "$#5" -> "$5" (typo??)
+ s/\#/number /g; # in bc-news, "#" = "number" not "pound"
+ s=([^\s<])/ =$1 / =g; # e.g. "2002/ " -> "2002 / "
+ s=([0-9])/1,000([^0-9,])=$1/1000$2=g; # e.g. "1/1,000" -> "1/1000"
+
+ s/\s{2,}/ /g;
+ s/^ //;
+ s/\s*$/ \n/;
+
+ print;
+}
diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/do-lm b/egs/bn/s5/local/data_prep/csr_hub4_utils/do-lm
new file mode 100755
index 00000000000..6a4f66eef4e
--- /dev/null
+++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/do-lm
@@ -0,0 +1,43 @@
+#!/bin/sh
+# $Id: do-lm,v 1.3 1996/08/23 22:43:23 robertm Rel $
+Usage()
+{
+cat << EOM 1>&2
+Usage: $0 dir file(s)
+ Runs LM pipeline on FILES, with output to the given DIR directory.
+ Expects to find LM conditioning tools in PATH or ./bin.
+EOM
+}
+
+# Excludes "fixvp" stage which has the main effect of killing off
+# any SGML tagging that contains a space, e.g. <art id=...>.
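For orientation, the per-file filter chain this script runs can be sketched in Python (a minimal sketch assuming the tools are on PATH or reachable by the relative paths used below; the same chain reappears in process_filelist.py later in this patch):

    import subprocess

    UTILS = "local/data_prep/csr_hub4_utils"
    STAGES = [
        "pare-sgml.perl",                       # strip non-LM SGML fields
        "bugproc.perl",                         # fix source-independent typos
        "numhack.perl",                         # pre-spell phone numbers / zips
        "numproc.perl -x%s/num_excp" % UTILS,   # expand numbers
        "abbrproc.perl %s/abbrlist" % UTILS,    # expand abbreviations
        "puncproc.perl -np",                    # remove punctuation
    ]

    def run_pipeline(gz_in, txt_gz_out):
        # mirrors: gunzip -c IN | stage1 | ... | stageN | gzip -c > OUT
        cmd = "gunzip -c %s | %s | gzip -c > %s" % (
            gz_in, " | ".join(STAGES), txt_gz_out)
        subprocess.check_call(cmd, shell=True)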
+
+# BBN used -np switch for puncproc, removing punctuation; this chooses the
+# "verbalize" option instead.
+
+# Includes new "numhack" module to deal with zip codes and phone numbers.
+
+if [ $# -eq 0 ] || [ $1 = "-h" ]; then
+ Usage
+ exit 1
+fi
+
+dir=$1
+shift
+
+for file in $*
+do
+ BASENM=`basename $file`
+ name="${BASENM%.*}"
+
+ echo "Running LM pipeline for |$BASENM|..." 1>&2
+ set -x
+ gunzip -c $file | pare-sgml.perl | \
+ bugproc.perl | \
+ numhack.perl | \
+ numproc.perl -xlocal/data_prep/csr_hub4_utils/num_excp | \
+ abbrproc.perl local/data_prep/csr_hub4_utils/abbrlist | \
+ puncproc.perl -np | gzip -c > $dir/$name.txt.gz
+ set +x
+ echo "Done with $BASENM."
+done
diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/eval-material.ptrns b/egs/bn/s5/local/data_prep/csr_hub4_utils/eval-material.ptrns
new file mode 100644
index 00000000000..d6e34eb7357
--- /dev/null
+++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/eval-material.ptrns
@@ -0,0 +1,4 @@
+ABCPrimetime Live
+CNNMorning News
+CNNWorld View
+NPRMorning Edition
diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/num_excp b/egs/bn/s5/local/data_prep/csr_hub4_utils/num_excp
new file mode 100644
index 00000000000..0f93e6ae51c
--- /dev/null
+++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/num_excp
@@ -0,0 +1,528 @@
+###############################################################################
+# This software is being provided to you, the LICENSEE, by the Massachusetts #
+# Institute of Technology (M.I.T.) under the following license. By #
+# obtaining, using and/or copying this software, you agree that you have #
+# read, understood, and will comply with these terms and conditions: #
+# #
+# Permission to use, copy, modify and distribute, including the right to #
+# grant others the right to distribute at any tier, this software and its #
+# documentation for any purpose and without fee or royalty is hereby granted, #
+# provided that you agree to comply with the following copyright notice and #
+# statements, including the disclaimer, and that the same appear on ALL #
+# copies of the software and documentation, including modifications that you #
+# make for internal use or for distribution: #
+# #
+# Copyright 1991-4 by the Massachusetts Institute of Technology. All rights #
+# reserved. #
+# #
+# THIS SOFTWARE IS PROVIDED "AS IS", AND M.I.T. MAKES NO REPRESENTATIONS OR #
+# WARRANTIES, EXPRESS OR IMPLIED. By way of example, but not limitation, #
+# M.I.T. MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS #
+# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR #
+# DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, #
+# TRADEMARKS OR OTHER RIGHTS. #
+# #
+# The name of the Massachusetts Institute of Technology or M.I.T. may NOT be #
+# used in advertising or publicity pertaining to distribution of the #
+# software. Title to copyright in this software and any associated #
+# documentation shall at all times remain with M.I.T., and USER agrees to #
+# preserve same.
# +############################################################################### + +# exceptions: list of numbers to be expanded in exceptional ways +# derived by manual scan of early unigram file +# executed BEFORE numproc +# +# comments indicated by "#" in the first column + +# years +'20s twenties +'30s thirties +'40s forties +'50s fifties +'60s sixties +'70s seventies +'80s eighties +'90s nineties + +# processors +8086 eighty eighty-six +186 one eighty-six +286 two eighty-six +386 three eighty-six +486 four eight-six +187 one eighty-seven +287 two eighty-seven +387 three eighty-seven +80286 eighty two eighty-six +80386 eighty three eighty-six +80486 eighty four eighty-six +3090 thirty ninety +68020 sixty-eight oh twenty +68030 sixty-eight oh thirty + +# aircraft +707 seven oh seven +707s seven oh sevens +707's seven oh seven's +727 seven twenty-seven +727s seven twenty-sevens +727's seven twenty-seven's +737 seven thirty-seven +737s seven thirty-sevens +737-100 seven thirty-seven -- one hundred +737-100s seven thirty-seven -- one hundreds +737-200 seven thirty-seven -- two hundred +737-200s seven thirty-seven -- two hundreds +737-205 seven thirty-seven -- two oh five +737-300 seven thirty-seven -- three hundred +737-300s seven thirty-seven -- three hundreds +737-400 seven thirty-seven -- four hundred +737-400s seven thirty-seven -- four hundreds +737-500 seven thirty-seven -- five hundred +737-500s seven thirty-seven -- five hundreds +737-500's seven thirty-seven -- five hundred's +747 seven forty-seven +747s seven forty-sevens +747's seven forty-seven's +747F seven forty-seven F. +747-100 seven forty-seven -- one hundred +747-100s seven forty-seven -- one hundreds +747-124SF seven forty-seven -- one twenty four S. F. +747-200 seven forty-seven -- two hundred +747-200s seven forty-seven -- two hundreds +747-200's seven forty-seven -- two hundred's +747-200B seven forty-seven -- two hundred B. +747-200F seven forty-seven -- two hundred F. +747-273 seven forty-seven -- two seventy-three +747-300 seven forty-seven -- three hundred +747-341B seven forty-seven -- three forty-one B. +747-400 seven forty-seven -- four hundred +747-400s seven forty-seven -- four hundreds +747-500 seven forty-seven -- five hundred +747-500s seven forty-seven -- five hundreds +747-500's seven forty-seven -- five hundred's +757 seven fifty-seven +757s seven fifty-sevens +757's seven fifty-seven's +757-200 seven fifty-seven -- two hundred +757-200s seven fifty-seven -- two hundreds +757-225 seven fifty-seven -- two two five +757-232s seven fifty-seven -- two three twos +757-767 seven fifty-seven - seven sixty-seven +767 seven sixty-seven +767s seven sixty-sevens +767-200 seven sixty-seven -- two hundred +767-200s seven sixty-seven -- two hundreds +767-200ER seven sixty-seven -- two hundred E R +767-300 seven sixty-seven -- three hundred +767-300s seven sixty-seven -- three hundreds +767-300ER seven sixty-seven -- three hundred E R +767-300-ER seven sixty-seven -- three hundred E R + +A310 A. three ten +A320 A. three twenty +A330 A. three thirty +A340 A. three forty +A-310 A. three ten +A-320 A. three twenty +A-330 A. three thirty +A-340 A. three forty +A310s A. three tens +A320s A. three twenties +A330s A. three thirties +A340s A. three forties +A-310s A. three tens +A-320s A. three twenties +A-330s A. three thirties +A-340s A. three forties + +1011 ten eleven +1011s ten elevens + +MD-80 M. D. eighty + +# misc +#8mm eight millimeter +#35mm 35 millimeter +gp120 g. p. one-twenty +240SX two forty S. X. +RU486 R. U. 
four eighty-six +RU-486 R. U. four eighty-six + +3Com three Com +3COM three COM +3Com's three Com's +3COM's three COM's + +# serial number mode words +# marked by initial * (stripped in numproc) + +*year +*VAX +*Up +*mm +*ish +*point +*May +*Station +*inch +*ers +*and +*mark +*sec +*stock +*mid +*pre +*dBase +*Co + +# right contexts for dollar +$accord +$account +$acquisition +$ad +$addition +$additional +$advance +$agreement +$aid +$Air +$airport +$allowance +$amount +$annual +$appropriation +# "apartment" and "apartments" should be fixed, but would alter v1.0 +#$apartment +#$apartments +$area +$arms +$Army +$arrangement +$asset +$Atari +$auction +$average +$award +$backlog +$bailout +$balance +$bank +$bankroll +$barrier +$base +$based +$benchmark +$bid +$bill +$bills +$bond +$bonds +$bonus +$book +$bridge +$budget +$building +$Burger +$business +$buyout +$campaign +$cap +$capital +$car +$ceiling +$charge +$check +$checks +$claim +$Clean +$coffeepot +$coffeepots +$company +$companies +$compensation +$complex +$computer +$consortium +$construction +$consulting +$contract +$contracts +$contribution +$contributions +$convertible +$cost +$costs +$court +$credit +$cumulative +$cut +$deal +$debenture +$debentures +$debt +$decline +$decrease +$deductible +$default +$Defense +$defense +$defensive +$deficit +$denominations +$deposit +$development +$difference +$disallowance +$Distillers +$dividend +$domestic +$donor +$donors +$drop +$effort +$emergency +$endowment +$energy +$equity +$estate +$estimate +$Eurobond +$exemption +$expansion +$expense +$face +$facility +$fare +$federal +$fee +$fence +$Fidelity +$figure +$financing +$fine +$fines +$First +$foreign +$FSLIC +$fund +$funds +$gain +$gains +$gap +$goal +$gold +$grant +$guarantee +$hammer +$hammers +$highway +$home +$Hong +$hostile +$house +$income +$increase +$industry +$infusion +$initial +$installment +$investment +$issue +$issues +$judgment +$junk +$Kansai +$laboratory +$lawsuit +$LBO +$legal +$letter +$level +$leveraged +$liability +$limit +$line +$litigation +$loan +$loans +$loss +$machine +$mark +$market +$maximum +$measure +$merger +$Midland +$minimum +$mortgage +$Navy +$net +$note +$notes +$obligation +$obligations +$offer +$offering +$offerings +$office +$order +$outlay +$package +$pact +$payout +$payment +$payments +$penalty +$Pennzoil +$pension +$Pentagon +$pipeline +$plan +$plant +$portion +$premium +$price +$principal +$prize +$proceeds +$production +$profit +$program +$project +$proposal +$provision +$purchase +$purse +$Putnam +$question +$range +$rate +$reactor +$reactors +$rebate +$rebates +$recapitalization +$record +$redemption +$reduction +$refund +$renovation +$request +$rescue +$research +$reserve +$restructuring +$retirement +$revolving +$rise +$River +$salary +$sale +$sales +$Saturn +$savings +$series +$settlement +$share +$shelf +$shortage +$shortfall +$software +$spacecraft +$special +$stake +$station +$stock +$study +$suit +$suits +$sum +$surge +$surplus +$system +$tab +$takeover +$takeover +$target +$tax +$Templeton +$tender +$threshold +$toilet +$total +$trade +$transaction +$trigger +$trust +$value +$venture +$verdict +$vessel +$Waterford +$windfall +$wine +$Winsor +$world +$World + +# skip before right context for dollar +$$advertising +$$asking +$$civil +$$closing +$$commercial +$$common +$$compensatory +$$Contra +$$corporate +$$damage +$$economic +$$energy +$$European +$$first +$$general +$$global +$$government +$$housing +$$insurance +$$interest +$$interim +$$international +$$junior +$$libel +$$marketing 
+$$municipal
+$$nationwide
+$$new
+$$nuclear
+$$omnibus
+$$outstanding
+$$personal
+$$pretax
+$$private
+$$projected
+$$proposed
+$$public
+$$punitive
+$$real
+$$retail
+$$refunding
+$$refinancing
+$$retirement
+$$revenue
+$$second
+$$secured
+$$security
+$$semi-annual
+$$senior
+$$space
+$$state
+$$State
+$$stated
+$$taxable
+$$term
+$$testing
+$$thrift
+$$trading
diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/numhack.perl b/egs/bn/s5/local/data_prep/csr_hub4_utils/numhack.perl
new file mode 100755
index 00000000000..be8e611a2b0
--- /dev/null
+++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/numhack.perl
@@ -0,0 +1,80 @@
+#!/usr/bin/perl
+
+# $Id: numhack.perl,v 1.4 1996/08/23 05:12:27 robertm Rel $
+# preprocessor for numproc, potentially specialized for Broadcast News material
+
+# tries to patch numproc's problems with:
+# - telephone numbers
+# - zip codes
+# for example:
+# 1-800-555-1212
+# => one - eight hundred - five five five - one two one two
+# (215) 555-1212
+# => two one five - five five five - one two one two
+# 212/285-9400
+# => two one two - two eight five - nine four zero zero
+# 1-(800)-CAR-CASH
+# => one - eight hundred -CAR-CASH
+# New York, NY 10007
+# => New York, NY one zero zero zero seven
+# Philadelphia, PA 19104-6789
+# => Philadelphia, PA one nine one oh four - six seven eight nine
+
+# may leave behind extra spaces here and there, but later processes ought
+# to correct that...
+
+@ones_oh=("oh","one","two","three","four",
+ "five","six","seven","eight","nine");
+
+while(<>)
+{
+ next unless /\d/; # skip lines without numbers
+ next if /^<\/?[aps]/; # skip SGML
+
+ # probable Zip codes
+ s/\b(\d{5}-\d{4})\b/&SpellDigits($1)/eg; # 12345-6789
+ s/\b(\d{5})\b/&SpellDigits($1)/eg; # 12345
+
+ # phone numbers
+ s=(^| )([1l][- ])?\(?([2-9]\d{2})\)?[-/]? ?(\d{3})-(\d{4})\b=&SpellTel($2,$3,$4,$5)=eg; # 215-555-1212 etc.
+ s/(^| )(\d{3}-\d{4})\b/&SpellDigits($2)/eg; # 555-1212
+ s/\b1-\(?800\)?(\W)/ one - eight hundred $1/g; # isolated 1-800
+ s/([Aa]rea code) (\d{3})(\W)/"$1 ".&SpellDigits($2)."$3"/eg;
+
+} continue {
+ print;
+}
+
+exit;
+
+sub SpellDigits
+{
+ local($num)=$_[0];
+ $num =~ s/(\d)(\D)(\d)/$1 $2 $3/g; # add space around non-digits
+ # isolated zeros become "oh", string of them become "zero ..."
+ $num =~ s/(00+)/" zero" x length($1)/eg;
+ $num =~ s/(\d)/" $ones_oh[$1]"/eg;
+ return $num;
+}
+
+sub SpellTel
+{
+ local($pre,$area,$exch,$rest)=@_;
+ $return = $pre ? " one -" : " ";
+ if ($area =~ /(\d)00/)
+ {
+ $return .= &SpellDigits($1);
+ $return .= " hundred";
+ }
+ else
+ {
+ $return .= &SpellDigits($area);
+ }
+ $return .= " - ";
+
+ $return .= &SpellDigits($exch);
+ $return .= " - ";
+ $return .= &SpellDigits($rest);
+
+ return $return;
+}
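The digit-spelling rule above (a lone zero is read "oh", a run of zeros is read "zero zero ...") can be stated compactly in Python; this sketch reproduces the two zip-code examples from the header comments:

    ONES_OH = ["oh", "one", "two", "three", "four",
               "five", "six", "seven", "eight", "nine"]

    def spell_digits(num):
        words, i = [], 0
        while i < len(num):
            if num[i] == "0":
                j = i
                while j < len(num) and num[j] == "0":
                    j += 1
                # single zero -> "oh"; a run of zeros -> "zero" repeated
                words += ["oh"] if j - i == 1 else ["zero"] * (j - i)
                i = j
            else:
                words.append(ONES_OH[int(num[i])])
                i += 1
        return " ".join(words)

    print(spell_digits("19104"))   # -> one nine one oh four
    print(spell_digits("10007"))   # -> one zero zero zero seven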
diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/numproc.perl b/egs/bn/s5/local/data_prep/csr_hub4_utils/numproc.perl
new file mode 100755
index 00000000000..e97d3ae51dd
--- /dev/null
+++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/numproc.perl
@@ -0,0 +1,1134 @@
+#! /usr/bin/perl
+#
+# $Id: numproc.perl,v 1.7 1996/08/23 05:04:11 robertm Rel $
+###############################################################################
+# This software is being provided to you, the LICENSEE, by the Massachusetts #
+# Institute of Technology (M.I.T.) under the following license. By #
+# obtaining, using and/or copying this software, you agree that you have #
+# read, understood, and will comply with these terms and conditions: #
+# #
+# Permission to use, copy, modify and distribute, including the right to #
+# grant others the right to distribute at any tier, this software and its #
+# documentation for any purpose and without fee or royalty is hereby granted, #
+# provided that you agree to comply with the following copyright notice and #
+# statements, including the disclaimer, and that the same appear on ALL #
+# copies of the software and documentation, including modifications that you #
+# make for internal use or for distribution: #
+# #
+# Copyright 1991-4 by the Massachusetts Institute of Technology. All rights #
+# reserved. #
+# #
+# THIS SOFTWARE IS PROVIDED "AS IS", AND M.I.T. MAKES NO REPRESENTATIONS OR #
+# WARRANTIES, EXPRESS OR IMPLIED. By way of example, but not limitation, #
+# M.I.T. MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS #
+# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR #
+# DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, #
+# TRADEMARKS OR OTHER RIGHTS. #
+# #
+# The name of the Massachusetts Institute of Technology or M.I.T. may NOT be #
+# used in advertising or publicity pertaining to distribution of the #
+# software. Title to copyright in this software and any associated #
+# documentation shall at all times remain with M.I.T., and USER agrees to #
+# preserve same. #
+###############################################################################
+
+# preprocessor for WSJ
+# assumes 1 sentence per line
+#
+# 1. expand numerical exceptions: eg. 386
+# 2. do regular numerical expansions
+
+# Minor modifications by David Graff, Linguistic Data Consortium, in preparation
+# for publishing on cdrom; Aug. 11, 1994.
+
+$POINT='.POINT'; # orthographic notation for .
+
+ # final s in name indicates plural version, otherwise just add s
+@ones_z=("zero","one","two","three","four",
+ "five","six","seven","eight","nine");
+@ones_oh=("oh","one","two","three","four",
+ "five","six","seven","eight","nine");
+@ten=("","ten","twenty","thirty","forty","fifty",
+ "sixty","seventy","eighty","ninety");
+@teen=("ten","eleven","twelve","thirteen","fourteen","fifteen",
+ "sixteen","seventeen","eighteen","nineteen");
+@mult=("","thousand","million","billion","trillion"
+ ,"quadrillion","quintillion","sextillion","septillion","octillion");
+@den=("","","half","third","quarter","fifth",
+ "sixth","seventh","eighth","ninth","tenth",
+ "eleventh","twelfth","thirteenth","fourteenth","fifteenth",
+ "sixteenth","seventeenth","eighteenth","nineteenth");
+@largeden=("","first","second","third","fourth","fifth",
+ "sixth","seventh","eighth","ninth","tenth",
+ "eleventh","twelfth","thirteenth","fourteenth","fifteenth",
+ "sixteenth","seventeenth","eighteenth","nineteenth");
+@ordnal=("","first","second","third","fourth","fifth",
+ "sixth","seventh","eighth","ninth","tenth",
+ "eleventh","twelfth","thirteenth","fourteenth","fifteenth","sixteenth");
+@months=("Jan.","Feb.","Mar.","Apr.","Jun.","Jul.","Aug.","Sept.","Oct.",
+ "Nov.","Dec.","January","February","March","April","May","June",
+ "July","August","September","October","November","December");
+
+$exfile="$ENV{HOME}/bc-news/bin/num_excp"; # default exceptions file name
+
+for($i=0,$j=0;$i<=$#ARGV;$i++)
+{ if($ARGV[$i] =~ /^-/)
+ { if($ARGV[$i] =~ /^-v/) {$vflg=1;}
+ elsif($ARGV[$i] =~ /^-x/)
+ { $exfile=$ARGV[$i];
+ $exfile =~ s/^-x//;
+ }
+ else {&perr2("illegal flag: $ARGV[$i]"); }
+ }
+ else { &perr2("no file args"); }
+}
+@ARGV=();
+
+if(!$exfile) {&perr2("no exceptions file specified"); }
+
+if(!open(EXFILE,$exfile)) {&perr2("cannot open $exfile"); }
+while(<EXFILE>)
+{ if(/^#/) {next;} # comment
+ s/\n//;
+ if(!$_) {next;} # blank
+ $y=$_;
+ s/^(\S+)\s*//; # extract 1st word
+ $x=$1;
+ if($x eq "") {&perr2("$exfile: no word: $y");}
+ if($x =~ /^\$\$/) # $$word => skip
+ { $x =~ s/^\$*//;
+ $sing_dollar{$x}=2;
+ }
+ elsif($x =~ /^\$/) # $word => singular right context
+ { $x =~ s/^\$*//;
+ $sing_dollar{$x}=1;
+ }
+ elsif($x =~ /^\*/)
+ { $x =~ s/\**//g;
+ if(!$x) {&perr2("$exfile: no serno word");}
+ $sernowd{$x}=1; # serial no words
+ }
+ else
+ { if($x !~ /\d/) {&perr2("$exfile: non-numerical key");}
+ if(!$_) {&perr2("$exfile: no value");}
+
+ $except{$x}=$_; # translations
+ }
+ $n++;
+}
+close(EXFILE);
+if($vflg) {print STDERR "$n lines read from exceptions file\n";}
+
+for($i=0;$i<=$#months;$i++) # make months hash
+{ $_=$months[$i];
+ $months{$_}=1; # mixed case
+ tr/a-z/A-Z/;
+ $months{$_}=1; # UC
+}
+
+while(<>)
+{ # removed local($front,$back,$x) to conserve memory RWM 8/96
+
+############################## exceptproc ##################################
+ s/^\s*//;
+ s/\n//o;
+ if($vflg) {print "input:\t$_\n";}
+ if(/\d/ && !/^<\/?[spa]/) # opt and protect sgml
+ { @input = split(/\s+/o);
+ @output=();
+ for($field=0;$field<=$#input;$field++) # $field is global
+ { $_=$input[$field];
+
+ if(!/\d/) # only processes numbers
+ { &pusho($input[$field]); # not processed
+ next;
+ }
+
+ s/^(\W*)//o; # strip front
+ $front=$1;
+ if($front =~ /\$$/ || $front =~ /#$/) # protect money
+ { &pusho($input[$field]); # not processed
+ next;
+ }
+
+ s/(\W*)$//o; # strip back
+ $back=$1;
+
+ if($front =~ /\'$/ && $except{"'$_"}) # eg "'20s"
+ { $front =~ s/\'$//;
+ if($front)
+ { &pusho($front);
+ if($front !~ /[\w]$/o) {$appendflg=1;}
+ } + + &pusho($except{"'$_"}); # translation + + if($back) + { if($back !~ /^[\w]/o) {&appendo($back);} + else {&pusho($back);} + } + } + elsif($except{$_}) + { if($front) + { &pusho($front); + if($front !~ /[\w]$/o) {$appendflg=1;} + } + + &pusho($except{$_}); # translation + + if($back) + { if($back !~ /^[\w]/o) {&appendo($back);} + else {&pusho($back);} + } + } + else {&pusho($input[$field]);} # not processed + } + $_=join(" ",@output); + } + s/\s+/ /g; + s/^ //o; + s/ $//o; + if($vflg) {print "ex:\t$_\n";} + +############################ numproc ######################################## + if(!/^<\/?[spa]/) # protect sgml, also art + { s/(\d+)-(\d+)-(\d+)/$1 - $2 - $3/g; # eg. 1-2-3 + s/(\d+)x(\d+)/$1 by $2/g; # eg. 2x4 + s/(\d+)\+(\d+)/$1 plus $2/g; # eg. 2+2 + s=(\d)-(\d)[/\\](\d)=$1 $2/$3=g; # e.g. 3-1/2 + s=(\d)\\(\d)=$1/$2=g; # e.g. 1\2 for 1/2 + s/\$(\d[\d,]*)-\$(\d)/$1 to \$$2/g; # $ range: eg. $1-$2 + s/\$(\d[\d,]*)-(\d)/$1 to \$$2/g; # $ range: eg. $1-2 + s/(\d)-(\'?)(\d)/$1 to $2$3/g; # range: eg. 1-2 + s/%-(\d)/% to $1/g; # % range: eg. 1%-2% + s/(\d)=(\d)/$1 equals $2/g; # equation: x=y + s/ - / -- /g; # recode dashes + s/([^-\d\s])-([^-\d\s])/$1 - $2/g; # split in-word hyphens + s/- +-/--/g; s/- +-/--/g; # close dashes + s/-{3,}/--/g; # map dashes to -- + s/--/ -- /g; # space around -- + s/(\d) +(\d+\/\d)/$1 and $2/g; # dig frac -> dig and frac + s/([a-zA-Z])\//$1 \/ /g; # text/* + s/\/([a-zA-Z])/ \/ $1/g; # */text + + s/([a-zA-Z]\d+)\/(\d+)/$1 \/ $2/g; # eg. a1/3 -> a1 / 3 + s/(\/\d*)th/$1/ig; # eg. 1/10th -> 1/10 + s/(\/\d*1)st/$1/ig; # eg. 1/x1st -> 1/x1 + s/(\/\d*2)nd/$1/ig; # eg. 1/x2nd -> 1/x2 + s/(\/\d*3)rd/$1/ig; # eg. 1/x3rd -> 1/x3 + s/(\d+)\/(\d+[a-zA-Z])/$1 \/ $2/g; # eg. 1/3a -> 1 / 3a + s/([a-zA-Z])-(19\d\d\D)/$1 - $2/g; # eg. mid-1990 -> mid - 1990 +# s/([a-zA-Z])-(\d)/$1 $2/g; # eg. a-1 -> a 1 +# s/(\d)-([a-zA-Z])/$1 $2/g; # eg. 1-a -> 1 a + s/([a-zA-Z])-(\d)/$1 - $2/g; # eg. a-1 -> a - 1 + s/(\d)-([a-zA-Z])/$1 - $2/g; # eg. 1-a -> 1 - a + + # fix common time typo (; for :) + s/\b([012]?\d);([0-5]\d)\b/$1:$2/g; # e.g. 11;00 -> 11:00 + + if(!/\d:\d\d$/o && !/\d:\d\d\D/o) # preprocess non-time \d:\d + { s/(\d):(\d)/$1 : $2/g; + s/(\S):(\d)/$1: $2/g; + } + } + + if($vflg) {print "num1:\t$_\n";} + + s/^\s*//; + if(/\d/ && !/^<\/?[spa]/) # opt and protect sgml + { @input = split(/\s+/o); + @output=(); + for($field=0;$field<=$#input;$field++) # $field is global + { if($field>0) {$last=$input[$field-1];} + else {$last='';} + if($field<$#input) {$next=$input[$field+1];} + else {$next='';} + if($field<$#input-1) {$next2=$input[$field+2];} + else {$next2='';} + $this=$input[$field]; + $_=$input[$field]; + + if(/<[\w\.\/]*>/o && !/
<art[\w\.\/]*>/o) # pass <art> tags only
+ {&perr("spurious SGML: $_"); next; } #
+
+ if(/[0-9]/o && !/

$40 + + if($front) + { &pusho($front); # generally punctuation + if($front !~ /\w$/) {$appendflg=1;} + } + + $x=$_; + if($x =~ /\//) + { $x =~ s/^\D*//; + $x =~ s/\D*$//; + if (! &printfrac($x)) {return 0;} + &pusho("of a $unit"); + $x=""; + $plural=0; + } + + $x =~ s/^\D*([\d,]*)\D*.*$/$1/; # int part of string + if($x ne "") {if (! &printint($x)) {return 0;} } # print int part (eg. dollars) + + if($next eq "and" && $next2 =~ /\d\/\d/ && next2 !~ /\/.*\//) + { if($unit && $x ne "") {&pusho("and");} # frac: eg 4 1/16 + $z=$next2; + $z =~ s/\D*$//; + if (! &printfrac($z)) {return 0;} + ($punct)=($next2 =~ /(\D*)$/); + $field+=2; + &pusho("${unit}s"); + + if($back) {&perr("money: back and 1 1/3"); return 0;} + + if($punct) {&appendo($punct);} # punctuation from *illion + return 1; + } + + if($back eq "" && $next =~ /^(thousands?|[a-z]*illions?)(\W*)/i) + { if (! &printdecfrac($_)) {return 0;} # multiplier + &pusho($1); + $punct=$2; + $plural=1; ### if adj '', if noun 's' + $field++; + $frac=1; + } + elsif(/\.\d$/ || /\.\d\D/ || /\.\d{3}/ ) # .d or .ddd+ + { if (! &printdecfrac($_)) {return 0;} + $plural=1; # can be either + $frac=1; + } + else + { $y=$x; + $y =~ s/,//g; # remove commas + if(int($y)!=1) {$plural=1;} + } + + if($back eq "" && $input[$field+1] =~ /dollar/i) + { $unit=""; # fix "$1 dollar" wsj typo + $subunit_sing=""; + $subunit_pl=""; + if (! &printdecfrac($_)) {return 0;} + $frac=1; + } + +#print "f=$front, m=$_, b=$back\n"; +#foo + $sing=0; + if($last =~ /^\W*[aA][nN]?\W*$/) {$sing=1;} # a $123, an $80 + elsif($input[$field+1] eq "-") {$sing=1;} # eg. $123-a-day + # next one is chancy + elsif($input[$field] !~ /\W$/ && $input[$field+1] !~ /^\W/ && + $input[$field+1] =~ /[a-zA-Z]$/ && $input[$field+2] eq "-" && + $input[$field+3] =~ /^[a-zA-Z]/) {$sing=1;} # $ after-tax + + elsif($back eq "" && !$punct) # right contexts with no intervening punct + { $j=$field+1; # includes *ly as a skip + $z=""; + for($i=0;$i<2;$i++,$j++) # skip ? + { $y=$input[$j]; # strip final punct + $y =~ s/\W*$//; + if($y !~ /\w*ly$/i && $sing_dollar{$y}!=2) {last;} + ($y)=($input[$j] =~ /(\W*)$/); # get final punct + $z .= $y; # accumulate + } + $y=$input[$j]; # strip final punct + $y =~ s/\W*$//; + if($z eq "" && $sing_dollar{$y}==1) {$sing=1;} + } + + if($unit) # print unit + { &pusho($unit); + if($plural && !$sing) {&appendo("s");} # just add s for plural + } + + if(!$frac && /\.\d{2}/) # .dd (eg. cents) + { $y=$_; + $y =~ s/^[^\.]*\.([\d]*)\D?.*$/$1/; # get fractional part + if($unit && $x ne "") {&pusho("and");} + if (! &printint($y)) {return 0;} + if($sing || int($y)==1) {&pusho($subunit_sing);} + else {&pusho($subunit_pl);} + } + + if($back) # punctuation from this field + { if($punct) {&perr("money: back and punct"); return 0;} + + if($back =~ /^\w/) {&pusho($back);} + else {&appendo($back);} + } + + if($punct) {&appendo($punct);} # punctuation from *illion + + return 1; +} + +sub printyear # &printyear(x) +{ if($vflg) {print "printyear: $_[0]\n";} + return &printnum($_[0]); # for now +} + +sub printtime # &printtime(x) +{ if($vflg) {print "printtime: $_[0]\n";} + $_=$_[0]; + + local(@x); + local($front); + local($back); + + if(/:{2,}/ || !/\d:\d/) {&perr("printtime: not a time"); return 0;} + + @x=split(/:/,$_); + ($front)=($x[0] =~ /^(\D*)/); + $x[0] =~ s/^(\D*)//; + ($back)=($x[1] =~ /(\D*)$/); + $x[1] =~ s/(\D*)$//; + + if($front) + { &pusho($front); # generally punctuation + if($front !~ /\w$/) {$appendflg=1;} + } + if (! 
&printint($x[0])) {return 0;} + if($x[1]==0) + { $_=$next; + if(!/^[aApP]\.?[nM]\.?$/) {&pusho("o'clock");} + } + elsif ($x[1]<10) + { &pusho("oh"); + if (!&printint($x[1])) {return 0;} + } + else {if (! &printint($x[1])) {return 0;} } + if($back) + { if($back =~ /^\w/) {&pusho($back);} + else {&appendo($back);} # generally punctuation + } + return 1; +} + +sub printfrac +{ if($vflg) {print "printfrac: $_[0]\n";} + local($x)=$_[0]; + + local(@z); #Perl BUG: lists do not seem to be local + local($sign); + local($front); + local($back); + local($sign); + + $x =~ s/^([^\d\.]*)//; # strip front + $front=$1; + if($front =~ /^\+$/) # get sign + { $sign="plus"; + $front =~ s/\+$//; + } + if($front =~ /^-$/) + { $sign="minus"; + $front =~ s/-$//; + } + + if($x =~ /\D$/) + { ($back)=( $x =~ /(\D*)$/ ); + $x =~ s/\D*$//; # strip back: final . is punct + } + + @z=split(/\//,$x); + if($#z !=1) {&perr("printfrac: illegal fraction: $_[0]"); return 0;} + if($z[1] <= 1) {&perr("printfrac: den too small: $_[0]"); return 0;} + + if($front) + { &pusho($front); + if($front =~ /[a-zA-Z]$/) {&appendo("-");} + $appendflg=1; + } + + if($sign) {&pusho($sign);} + + if (! &printint($z[0])) { return 0;} #numerator + if($z[1] <= $#den) # small den from table (<20) + { &pusho($den[$z[1]]); + if($z[0]!=1) {if (! &pluralize) {return 0;} } + } + else #large den + { $ones=int($z[1]%100); + $hun=100*int($z[1]/100); + if($hun>0) {if (!&printint($hun)) {return 0;} } + if($ones==0) + { &appendo("th"); + if($z[0]!=1) {if (! &pluralize) {return 0;} } + } + elsif($ones<=$#largeden) # <20 + { &pusho($largeden[$ones]); + if($z[0]!=1) {if (!&pluralize) {return 0;} } + } + else + { $x=int($ones%10); + if(int($ones/10)) + { &pusho($ten[int($ones/10)]); + if($x) + { &appendo("-"); # eg. twenty-five + $appendflg=1; + } + } + if($x==0) + { &pusho("th"); + if($z[0]!=1) {if (! &pluralize) {return 0;} } + } + else + { &pusho($largeden[$x]); + if($z[0]!=1) {if (! &pluralize) {return 0;} } + } + } + } + + if($back) + { $x=&geto; # in case of 1/10th etc ([stndrth]=st nd rd th) + if($back !~ /^[stndrth]{2}/ || $x !~ /$back$/) + { if($back =~ /^[a-zA-Z]/) {&appendo("-");} + &appendo($back); + } + } + + return 1; +} + +sub printnum # printnum(n) +{ if($vflg) {print "printnum: $_[0]\n";} + local($x)=$_[0]; # print ordinary numbers + + $leadingzeroflg=''; # global + local($front); + local($back); + local($intpart); + local($fracpart); + local($hun); + local($ones); + local($comma); + local($sign); + local($y); + + $x =~ s/^(\D*)//; # strip front + $front=$1; + if($front =~ /^\.$/ || $front =~ /\W\.$/ || + ($front =~ /\.$/ && $x =~ /^0/ )) # leading . + { $front =~ s/\.$//; + $x = "." . $x; + } + if($front =~ /^\+$/) # get sign + { $sign="plus"; + $front =~ s/\+$//; + } + if($front =~ /^-$/) + { $sign="minus"; + $front =~ s/-$//; + } + + if($x =~ /\D$/) + { $back=$x; + $back =~ s/^[\d\.,]*\d//; + $x =~ s/\D*$//; # strip back: final . is punct + } + + if($x =~ /[^\d\.,]/) {&perr("printnum: $_[0] is not a number"); return 0;} + + if($x!=0 && $x =~ /^0/ && $x =~ /^\d*$/) # "oh" numbers + { if($front) + { &pusho($front); + if($front !~ /[a-zA-Z]$/) {$appendflg=1;} + } + + if($sign) { &pusho($sign); } + + while($x ne '') + { $x =~ s/^(.)//; + &pusho($ones_oh[$1]); + } + + if($back) + { if($back =~ /^s$/ || $back =~ /^s\W/) # back = s + { if (! &pluralize) {return 0;} # eg. 
1960s + $back =~ s/^s//; + } + if($back) + { if($back =~ /^[a-zA-Z]/) {&pusho($back);} + else {&appendo($back);} # back = punct or "'s" + } + } + return 1; + } + + if($x =~ /^\d/) # get integer part + { if($x =~ /,/) + { $comma=1; + $x =~ s/,//g; # strip commas + } + $intpart=$x; + $intpart =~ s/\..*$//; + if($x =~ /^0/) {$leadingzeroflg=1;} + } + + if($x =~ /\./) # get fractional part + { $fracpart=$x; + $fracpart =~ s/^.*\././; + } + + if($front) + { &pusho($front); + if($front !~ /[a-zA-Z]$/) {$appendflg=1;} + } + + if($sign) { &pusho($sign); } + + $ones=int($intpart%100); + if($comma) {if (! &printint($intpart)) {return 0;} } + elsif(($intpart>=1900 || $intpart>=1100 && $ones==0) + && $intpart<2000 && !$fracpart) #4 digit -> 2+2 + { $hun=int($intpart/100); + if (! &printint($hun)) {return 0;} + if($ones>=10) {if (! &printint($ones)) {return 0;} } + elsif($ones>0) + { &pusho("oh"); + if (! &printint($ones)) {return 0;} + } + else {&pusho("hundred");} + } + else + { if (! &printint($intpart)) {return 0;} + $y=$last; + $y =~ s/^\W*//; # thize dates: May 25th + if(length($intpart)<=2 && $months{$y}) + { if (! &thize("")) {return 0;} + $back =~ s/[a-z]//g; + } + } + if($fracpart) {if (! &printdecfrac($fracpart)) {return 0;} } + + if($back) + { if($back =~ /^s$/ || $back =~ /^s\W/) # back = s + { if (! &pluralize) {return 0;} # eg. 1960s + $back =~ s/^s//; + } + if($back =~ /^st$/ || $back =~ /^st\W/) # back= st + { if (! &thize("st")) {return 0;} # eg. 1st + $back =~ s/^st//; + } + if($back =~ /^nd$/ || $back =~ /^nd\W/) # back= nd + { if (! &thize("nd")) {return 0;} # eg. 2nd + $back =~ s/^nd//; + } + if($back =~ /^rd$/ || $back =~ /^rd\W/) # back= rd + { if (! &thize("rd")) {return 0;} # eg. 3rd + $back =~ s/^rd//; + } + if($back =~ /^th$/ || $back =~ /^th\W/) # back= th + { if (! &thize("th")) {return 0;} # eg. 4th + $back =~ s/^th//; + } + if($back) + { if($back =~ /^[a-zA-Z]/) {&pusho($back);} + else {&appendo($back);} # back = punct or "'s" + } + } + return 1; +} + +sub printdate # printdate(n): x/x/x format +{ if($vflg) {print "printdate: $_[0]\n";} + local($x)=$_[0]; # print ordinary numbers + + local(@y); + local($front); + local($back); + + $x =~ s/^(\D*)//; # strip front + $front=$1; + + $x =~ s/(\D*)$//; # strip back + $back=$1; + + if($x !~ /^\d{1,2}\/\d{1,2}\/(19)?\d{2}$/) + {&perr("printdate: $_[0] is not a date"); return 0;} + + @y=split(/\//,$x); + $y[2] =~ s/^19(\d{2})$/$1/; + + if($front) + { &pusho($front); + if($front =~ /[a-zA-Z]$/) {&appendo("-");} + $appendflg=1; + } + + if (! &printint($y[0])) {return 0;} + &appendo("/"); + + $appendflg=1; + if (! &printint($y[1])) {return 0;} + &appendo("/"); + + $appendflg=1; + if (! &printint($y[2])) {return 0;} + + if($back) + { if($back =~ /^[a-zA-Z]/) {&appendo("-");} + &appendo($back); + } + return 1; +} + +sub printserno # printserno(n): eg. B1, 3b2, 10W-40 +{ if($vflg) {print "printserno: $_[0]\n";} + local($x)=$_[0]; # print mixed sequences of dig and let + + local($y); + local($z); + local($front); + local($back); + + $x =~ s/^(\W*)//; # strip front + $front=$1; + if($front) + { &pusho($front); + if($front !~ /[a-zA-Z]$/) {$appendflg=1;} + } + + $x =~ s/(\W*)$//; # strip back + $back=$1; + $x =~ s/(\d[a-zA-Z]+\d+)(\'?s)$/$1/ # strip "s" or "'s" + && ($back = $2 . 
$back); + + while($x) + { $x =~ s/^(\D*)//; # strip off non-dig + $y=$1; + if($y) + { $y =~ s/-//g; # remove - + if($y eq "") {} + elsif($sernowd{$y}) {&pusho($y);} # word + else + { while($y) # spell out + { if($y =~ /[a-zA-Z]\'s$/) + { &pusho($y); + $y =~ s/[a-zA-Z]\'s*$//; + } + elsif($y =~ /[A-Z]s$/) + { &pusho($y); + $y =~ s/[A-Z]s$//; + } + else + { $y =~ s/^(.\.?)//; + &pusho($1); + } + } + } + } # (should expand here unless in dictionary) + $x =~ s/^(\d*)//; # strip off dig + $y=$1; + if($y ne "") { if (! &printdigstr($y)) {return 0;} } + } + + if($back =~ /^s\b/) # back = s + { # eg. 2C60s + if (! &pluralize) {return 0;} + $back =~ s/^s//; + } + if($back) + { if($back =~ /^\w/) {&pusho($back);} + else {&appendo($back);} + } + $appendflg=0; + return 1; +} + +sub printdigstr # printdigstr(x) +{ if($vflg) {print "printdigstr: $_[0]\n";} + local($x)=$_[0]; + + local(@y); + local($j); + local($k); + + if($x =~ /^0/) # leading zero + { while($x ne "") + { $x =~ s/^(.)//; + if($1 !~ /\d/) {&perr("printdigstr: non-digit"); return 0;} + &pusho("$ones_z[$1]"); + } + return; + } + if($x =~ /^\d0*$/) # d, d0, d00, d000, etc + { return &printint($x); + } + + $_=$x; + @y=(); + for($j=0;$_ ne "";$j++) { $y[$j]=chop($_); } # j=no digits + for($k=0;$y[$k]==0;$k++) {} # k= nr following 0s + + if($j==2) # 2 dig + { return &printint($x); + } + if($j==3) + { if (! &printint($y[2])) {return 0;} + if($y[1]==0) {&pusho("oh");} + return &printint("$y[1]$y[0]"); + } + if($j==5 && $k<=2) + { if (! &printint("$y[4]")) {return 0;} + $j=4; + } + if($j==4) + { if (! &printint("$y[3]$y[2]")) {return 0;} + if($k==2) {&pusho("hundred");} + else + { if($y[1]==0) {&pusho("oh");} + return &printint("$y[1]$y[0]"); + } + return 1; + } + # >5 dig: just sequential dig + for($j--;$j>=0;$j--) {&pusho("$ones_oh[$y[$j]]");} + return 1; +} + +sub printftin # printftin(n): eg. 6\'-4\" +{ if($vflg) {print "printftin: $_[0]\n";} + local($x)=$_[0]; # print mixed sequences of dig and let + + local($y); + local($front); + local($back); + + $x =~ s/^(\D*)//; # strip front + $front=$1; + + $x =~ s/(\D*)$//; # strip back + $back=$1; + $back =~ s/^\"//; # remove \" + + if($front) + { &pusho($front); + if($front !~ /[a-zA-Z]$/) {$appendflg=1;} + } + + $x =~ s/^([\d\.]*)//; # strip off dig & . + $y=$1; + if(!$y) {&perr("printftin: bad feet"); return 0;} + if (! &printnum($y)) {return 0;} + if($y==1) {&appendo("-foot");} + else {&appendo("-feet");} + + $x =~ s/^\'//; # strip off \' + $x =~ s/^-//; # strip off - + if(!$x) {&perr("printftin: bad intermed"); return 0;} + + $x =~ s/^([\d\.]*)//; # strip off dig & . + $y=$1; + if(!$y) {&perr("printftin: bad inches"); return 0;} + if (! 
&printnum($y)) {return 0;} + if($y==1) {&appendo("-inch");} + else {&appendo("-inches");} + + if($back) + { if($back !~ /^[a-zA-Z]/) {&appendo($back);} + else {&pusho($back);} + } + return 1; +} + +sub printint # printint(x) +{ if($vflg) {print "printint: $_[0]\n";} + local($x)=$_[0]; + + local($comma); + local($leading_zero); + local($fractional); + local(@y); + + $fractional=$x =~ /\.\d/; + $x =~ s/^\D*([\d,]*)\D*.*$/$1/; # int part of string + $leading_zero=$x =~ /^0/; + $comma=$x =~ /,/; + $x =~ s/,//g; + if($x eq "") {return;} + + if($x == 0) + { &pusho("zero"); + $leadingzeroflg=1; + return; + } + + @y=(); + for($j=0;$x;$j++) { $y[$j]=chop($x); } + + if($comma || $fractional || 1) + { for($j=3*int($#y/3);$j>=0;$j-=3) + { if($y[$j+2]) { &pusho("$ones_z[$y[$j+2]] hundred");} + if($y[$j+1]==1) { &pusho($teen[$y[$j]]);} + else + { if($y[$j+1]>1) + { &pusho($ten[$y[$j+1]]); + if($y[$j]) + { &appendo("-"); # twenty-five + $appendflg=1; + } + } + if($y[$j]>0) { &pusho($ones_z[$y[$j]]);} + } + if(int($j/3)>0) + { if(int($j/3) > $#mult) + { &perr("printint: too big"); return 0;} + &pusho($mult[int($j/3)]); + } + $commanextflg=1; + } + } + $commanextflg=0; + return 1; +} + +sub printdecfrac +{ if($vflg) {print "printdecfrac: $_[0]\n";} + local($x)=@_[0]; + + if($x !~ /\.\d/) {return;} + $x =~ s/^[^\.]*\.([\d]*)\D?.*$/$1/; # get fractional part + + &pusho($POINT); + @y=split(//,$x); + if($leadingzeroflg) + {for($j=0;$j<=$#y;$j++) { &pusho($ones_z[$y[$j]]);}} + else {for($j=0;$j<=$#y;$j++) { &pusho($ones_oh[$y[$j]]);}} + + return 1; +} + +sub pluralize # pluralize(): pluralize last entry on output stack +{ if($vflg) {print "pluralize: $_[0]\n";} + local($x); + + $_=&geto; + if( /st$/ || /nd$/ || /rd$/ || /th$/ || /quarter$/ || /zero$/ || /oh/ || + /one$/ || /two$/ || /three$/ || /four$/ || /five$/ || + /seven$/ || /eight$/ || /nine$/ || + /ten$/ || /eleven$/ || /twelve$/ || /een$/ || + /hundred$/ || /thousand$/ || /illion$/ ) + { &appendo("s"); + } + elsif (/six$/) + { &appendo("es"); + } + elsif (/half$/) + { $x=&popo(); + $x =~ s/f$/ves/; + &pusho($x); + } + elsif (/ty$/) # fifty etc. 
+ { $x=&popo();
+ $x =~ s/y$/ies/;
+ &pusho($x);
+ }
+ else {&perr("pluralize: unknown word: $_"); return 0;}
+
+ return 1;
+}
+
+sub thize # thize(): add th to last entry on output stack
+{ if($vflg) {print "printthize: $_[0]\n";}
+ local($y)=$_[0];
+
+ local($x);
+
+ $_=&geto;
+ if( /four$/ || /six$/ || /seven$/ || /ten$/ ||
+ /eleven$/ || /een$/ || /hundred$/ || /thousand$/ || /illion$/ )
+ { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n"); return 0;} # xth
+ &appendo("th");
+ }
+ elsif( /one$/ ) # 1st
+ { if($y && $y ne "st") {&perr("thize: mismatch: $_ $y\n"); return 0;}
+ $x=&popo();
+ $x =~ s/one$/first/;
+ &pusho($x);
+ }
+ elsif( /two$/ ) # 2nd
+ { if($y && $y ne "nd") {&perr("thize: mismatch: $_ $y\n"); return 0;}
+ $x=&popo();
+ $x =~ s/two$/second/;
+ &pusho($x);
+ }
+ elsif( /three$/ ) # 3rd
+ { if($y && $y ne "rd") {&perr("thize: mismatch: $_ $y\n"); return 0;}
+ $x=&popo();
+ $x =~ s/three$/third/;
+ &pusho($x);
+ }
+ elsif( /five$/ || /twelve$/ ) # 5th, 12th
+ { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n"); return 0;}
+ $x=&popo();
+ $x =~ s/ve$/fth/;
+ &pusho($x);
+ }
+ elsif(/eight$/)
+ { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n"); return 0;} # 8th
+ &appendo("h");
+ }
+ elsif( /nine$/ )
+ { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n"); return 0;}
+ $x=&popo();
+ $x =~ s/nine$/ninth/;
+ &pusho($x);
+ }
+ elsif( /ty$/ )
+ { if($y && $y ne "th") {&perr("thize: mismatch: $_ $y\n"); return 0;}
+ $x=&popo();
+ $x =~ s/ty$/tieth/;
+ &pusho($x);
+ }
+ else {&perr("thize: unknown word: $_"); return 0;}
+ return 1;
+}
+
+sub pusho # pusho($x): push output
+{ if($commanextflg) # global: used for commas in printint
+ { $commanextflg=0;
+ &appendo(",");
+ }
+ if($appendflg) # global: used for fronts
+ { $appendflg=0;
+ &appendo(@_[0]);
+ }
+ else {push(@output,@_);}
+}
+
+sub appendo # appendo($x): append to output
+{ $appendflg=0;
+# if($#output < 0) {&pusho("");}
+ if($#output < 0) {&perr("appendo: output empty"); return 0;}
+ $output[$#output] .= @_[0];
+}
+
+sub popo # popo(): pop last output
+{ if($#output < 0) {&perr("popo: output empty"); return 0;}
+ pop(@output);
+}
+
+sub geto # geto(): get last output
+{ if($#output < 0) {&perr("geto: output empty"); return 0;}
+ return $output[$#output];
+}
+
+sub perr
+{ print STDERR "numproc: $_[0]\n";
+ print STDERR "line number=$.: fields=$last, $this, $next\n";
+# exit(1);
+
+ $appendflg=0;
+ $commanextflg=0;
+ &pusho($this);
+# $field++; # graceful error recovery
+}
+
+sub perr2
+{ print STDERR "numproc: $_[0]\n";
+ exit(1);
+}
diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/pare-sgml.perl b/egs/bn/s5/local/data_prep/csr_hub4_utils/pare-sgml.perl
new file mode 100755
index 00000000000..6caf474e3af
--- /dev/null
+++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/pare-sgml.perl
@@ -0,0 +1,36 @@
+#!/usr/bin/perl
+
+# $Id: pare-sgml.perl,v 1.3 1996/08/15 02:51:17 robertm Rel $
+# removes extraneous headers and other non-LM fields
+# translates into LM-standard
+# removes comments (enclosed in brackets)
+
+use strict;
+use warnings;
+
+my $intext=0;
+while (<>)
+{
+ if ($intext == 0)
+ {
+ print if (s=^<DOC id\=(\S+)>=<art id\=$1>=); # header -> LM-standard <art>
+ $intext = 1 if (/^<TEXT>/);
+ next;
+ }
+ if (/^<\/TEXT>/)
+ {
+ $intext = 0;
+ next;
+ }
+ next if /^<p>/;
+ next if /^</;
+
+ s/\[+[^\[\]]*\]+//g;
+ if (/[\[\]]/)
+ {
+ warn "pare-sgml: warning - unbalanced comment brackets at $ARGV line $.\n";
+ print STDERR " line=$_";
+ }
+ print;
+}
diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/process_filelist.py b/egs/bn/s5/local/data_prep/csr_hub4_utils/process_filelist.py
new file mode 100755 index 00000000000..3c8a50e3fe4 --- /dev/null +++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/process_filelist.py @@ -0,0 +1,164 @@ +#! /usr/bin/env python + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +"""Prepare CSR-IV 1996 Language model text corpus (LDC98T31).""" + +from __future__ import print_function +import argparse +import gzip +import logging +import os +import re +import subprocess +from bs4 import BeautifulSoup + + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) + + +def get_args(): + """Parses command-line arguments.""" + + parser = argparse.ArgumentParser("""Prepare CSR-IV 1996 Language model text + corpus (LDC98T31).""") + parser.add_argument("--verbose", choices=[0,1,2,3], type=int, default=0, + help="Set higher for more verbose logging.") + parser.add_argument("file_list", type=argparse.FileType('r'), + help="""List of compressed source files""") + parser.add_argument("dir", type=str, + help="Output directory to dump processed files to") + + args = parser.parse_args() + + if args.verbose > 2: + logger.setLevel(logging.DEBUG) + handler.setLevel(logging.DEBUG) + + return args + + +def normalize_text(text): + """Normalizes text and returns the normalized version. + The normalization involves converting text to upper case. + """ + text1 = text.strip() + # text2 = text_normalization.remove_punctuations(text1) + text2 = text1.upper() + text2 = re.sub(r" [ ]*", " ", text2) + return text2 + + +def process_file_lines(lines, out_file_handle): + """Processes input lines from a file by removing SGML tags and + writes normalized plain text to output stream.""" + + doc = re.sub(r"", "", ''.join(lines)) + soup = BeautifulSoup(doc, 'lxml') + + num_written = 0 + + for art in soup.html.body.children: + try: + if art.name != "art": + continue + for para in art.find_all('p'): + assert para.name == 'p' + + for x in para.contents: + try: + if x.name is None: + normalized_text = normalize_text(unicode(x)) + if len(normalized_text) == 0: + continue + out_file_handle.write("{0}\n".format( + normalized_text.encode('ascii'))) + num_written += 1 + except Exception: + logger.error("Failed to process content %s in para " + "%s", x, para) + raise + + except Exception: + try: + logger.error("Failed to process article %s", art['id']) + except AttributeError: + logger.error("Failed to process body content %s", art) + raise + if num_written == 0: + raise RuntimeError("0 sentences written.") + + +def run_command(*args, **kwargs): + if type(args[0]) is list: + command = ' '.join(args[0]) + else: + command = args[0] + + logger.debug("Running command '%s'", command) + p = subprocess.Popen(*args, **kwargs) + return p, command + + +def run(args): + """The one that does it all.""" + + for line in args.file_list.readlines(): + try: + file_ = line.strip() + base_name = os.path.basename(file_) + name = os.path.splitext(base_name)[0] + + out_file = gzip.open("{0}/{1}.txt.gz".format(args.dir, name), + 'w') + + logger.info("Running LM pipefile for |%s|...", base_name) + + p = run_command( + "gunzip -c {0} | " + "local/data_prep/csr_hub4_utils/pare-sgml.perl | " + "perl local/data_prep/csr_hub4_utils/bugproc.perl | " + "perl local/data_prep/csr_hub4_utils/numhack.perl | " + "perl local/data_prep/csr_hub4_utils/numproc.perl " + " 
-xlocal/data_prep/csr_hub4_utils/num_excp | "
+ "perl local/data_prep/csr_hub4_utils/abbrproc.perl "
+ " local/data_prep/csr_hub4_utils/abbrlist | "
+ "perl local/data_prep/csr_hub4_utils/puncproc.perl -np"
+ "".format(file_),
+ stdout=subprocess.PIPE, shell=True)
+
+ stdout = p[0].communicate()[0]
+ if p[0].returncode != 0:
+ logger.error(
+ "Command '%s' failed with return status %d",
+ p[1], p[0].returncode)
+ raise RuntimeError
+
+ process_file_lines(stdout, out_file)
+ out_file.close()
+ except Exception:
+ logger.error("Failed processing file %s", file_)
+ raise
+
+
+def main():
+ """The main function"""
+ args = get_args()
+ try:
+ run(args)
+ finally:
+ args.file_list.close()
+
+
+if __name__ == '__main__':
+ main()
diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/process_filelist.sh b/egs/bn/s5/local/data_prep/csr_hub4_utils/process_filelist.sh
new file mode 100755
index 00000000000..15249ae9a19
--- /dev/null
+++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/process_filelist.sh
@@ -0,0 +1,30 @@
+#! /bin/bash
+
+set -e
+set -o pipefail
+set -u
+set -x
+
+if [ $# -ne 2 ]; then
+ echo "Usage: $0
<file-list> <dir>
" + exit 1 +fi + +filelist=$1 +dir=$2 + +export PATH=local/data_prep/csr_hub4_utils:$PATH + +for file in `cat $filelist`; do + BASENM=`basename $file` + name="${BASENM%.*}" + + echo "Running LM pipeline for |$BASENM|..." 1>&2 + gunzip -c $file | pare-sgml.perl | \ + bugproc.perl | \ + numhack.perl | \ + numproc.perl -xlocal/data_prep/csr_hub4_utils/num_excp | \ + abbrproc.perl local/data_prep/csr_hub4_utils/abbrlist | \ + puncproc.perl -np | gzip -c > $dir/$name.txt.gz + echo "Done with $BASENM." +done diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/progsummary.perl b/egs/bn/s5/local/data_prep/csr_hub4_utils/progsummary.perl new file mode 100755 index 00000000000..891e26d5650 --- /dev/null +++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/progsummary.perl @@ -0,0 +1,44 @@ +#!/usr/bin/perl + +# Program: progsummary.perl +# Written by: dave graff +# Usage: [file.list] +# Purpose: extracts program information from sgml-ized PSM texts + +$degbug = 0; +if ( $ARGV[0] eq "-d" ) { + $debug = 1; + shift; +} + +while (<>) +{ + chop; + open( INP, "<$_" ); + $progdate = $progid = "unknown"; + while () { + if ( /^/ ) { + $_ = ; + print STDERR if ( $debug ); + $netwrk = substr( $_, 0, 3 ); + $rest = substr( $_, 3 ); + if ( $rest =~ /^(20\/20)/ ) { + $progid = $1; + } + elsif ( $rest =~ /^([A-Z a-z\&]+)/ ) { + $progid = $1; + } + } + elsif ( /^/ ) { + $_ = ; + print STDERR "$_===\n" if ( $debug ); + if ( /\d+\\(\d{6})\\\d+/ ) { + $progdate = $1; + } + } + elsif ( /^<\/art>/ ) { + print "$netwrk\t$progdate\t\"$progid\"\n"; + } + } + close INP; +} diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/puncproc.perl b/egs/bn/s5/local/data_prep/csr_hub4_utils/puncproc.perl new file mode 100755 index 00000000000..a6e1f19ba56 --- /dev/null +++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/puncproc.perl @@ -0,0 +1,196 @@ +#!/usr/bin/perl + +# $Id: puncproc.perl,v 1.2 1996/08/05 16:12:42 robertm Rel $ +############################################################################### +# This software is being provided to you, the LICENSEE, by the Massachusetts # +# Institute of Technology (M.I.T.) under the following license. By # +# obtaining, using and/or copying this software, you agree that you have # +# read, understood, and will comply with these terms and conditions: # +# # +# Permission to use, copy, modify and distribute, including the right to # +# grant others the right to distribute at any tier, this software and its # +# documentation for any purpose and without fee or royalty is hereby granted, # +# provided that you agree to comply with the following copyright notice and # +# statements, including the disclaimer, and that the same appear on ALL # +# copies of the software and documentation, including modifications that you # +# make for internal use or for distribution: # +# # +# Copyright 1991-4 by the Massachusetts Institute of Technology. All rights # +# reserved. # +# # +# THIS SOFTWARE IS PROVIDED "AS IS", AND M.I.T. MAKES NO REPRESENTATIONS OR # +# WARRANTIES, EXPRESS OR IMPLIED. By way of example, but not limitation, # +# M.I.T. MAKES NO REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS # +# FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR # +# DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, # +# TRADEMARKS OR OTHER RIGHTS. # +# # +# The name of the Massachusetts Institute of Technology or M.I.T. may NOT be # +# used in advertising or publicity pertaining to distribution of the # +# software. 
Title to copyright in this software and any associated # +# documentation shall at all times remain with M.I.T., and USER agrees to # +# preserve same. # +############################################################################### + +# punctuation preprocessor for WSJ +# assumes 1 sentence per line +# places spaces around punctuation and translates to IBM-like notation +# +# punctproc -np removes punctuation +# +# NOTE: wsj89 starts single quotes with ` or ' +# + +for($i=0,$j=0;$i<=$#ARGV;$i++) +{ if($ARGV[$i] =~ /^-/) + { if($ARGV[$i] =~ /^-np$/) {$npflg=1;} + else {&perr2("illegal flag: $ARGV[$i]");} + } + else { &perr2("no file args"); } +} +@ARGV=(); + +while(<>) +{ s/^/ /; + s/\n$/ /; + + next if (/<\/?[spa]/); # protect sgml + + # forbidden symbols + if(//) {&perr(">");} # > + if(/\$/) {&perr("\$");} # $ + if(/_/) {&perr("_");} # _ + if(/\d/) {&perr("[0-9]");} # 0-9 + + # protect contractions with _ + s/([a-zA-Z]in')([^a-zA-Z])/$1_ $2/g; # *in' e.g. Dunkin', singin' + # Rock 'n' Roll + s/(\W)['`]([nN])(\W)/$1 _'$2$3/g; # [`'][nN] -> _'[nN] + s/(\W)([nN]')(\W)/$1$2_ $3/g; # [nN]' + s/(\W)('[eE]m)(\W)/$1_$2$3/g; # '[eE]m + s/(\W)[`'"]R\.?['"](\W)/$1 _"R."_ $3/g; # toys "R" us + s/(\W)(Cos.')(\W)/$1$2_ $3/g; # Cos.' (companies') + s/(\W)(de.')(\W)/$1$2_ $3/g; # de' Imelda de' Lambertazzi + s/(\W)(Bros.')(\W)/$1$2_ $3/g; # Bros.' + s/(\W)(o')(\W)/$1$2_ $3/g; # o' Man o' War + s/(\W)(ol')(\W)/$1$2_ $3/g; # ol' old + s/(\W)maitre *d'(\W)/$1maitre_d'_ $2/g; # maitre d' + s/(\W)maitres *d'(\W)/$1maitres_d'_ $2/g; # maitres d' + s/(\W)('neath)(\W)/$1 _$2$3/g; # 'neath beneath + s/(\W)('Wadoo)(\W)/$1 _$2$3/g; + # 'Wadoo 'Wadoo , zim bam , boodleoo , hoodle ahdawam + s/(\W)('cause)(\W)/$1 _$2$3/g; # 'cause because + s/(\W)('burbs)(\W)/$1 _$2$3/g; # 'burbs suburbs + s/(\W)('[nN]uf)(\W)/$1 _$2$3/g; # 'Nuf enough + s/(\W)('til)(\W)/$1 _$2$3/g; # 'til + + + s/([^\w\.\'\`_ -])/ $1 /g; # SP around most punct + # but not .'`\_- + + if(!$npflg) + { s/ty-(one)/ty $1/g; # rm - from twenty-one + s/ty-(first)/ty $1/g; # rm - from twenty-first + s/ty-(two)/ty $1/g; # rm - from twenty-two + s/ty-(second)/ty $1/g; # rm - from twenty-second + s/ty-(three)/ty $1/g; # rm - from twenty-three + s/ty-(third)/ty $1/g; # rm - from twenty-third + s/ty-(four)/ty $1/g; # rm - from twenty-four + s/ty-(five)/ty $1/g; # rm - from twenty-five + s/ty-(six)/ty $1/g; # rm - from twenty-six + s/ty-(seven)/ty $1/g; # rm - from twenty-seven + s/ty-(eight)/ty $1/g; # rm - from twenty-eight + s/ty-(nin)/ty $1/g; # rm - from twenty-nine{th} + } + #s/([^-])-([^-])/$1 - $2/g; # - + #s/([^-])-([^-])/$1 - $2/g; # - + + s/([^\.]) *\. *\. *\. *\. *([^\.])/$1 _..._ . $2/g; # x ... . + s/([^\.]) *\. *\. *\. *([^\.])/$1 _..._ $2/g; # x ... + + s/([^\w'\.][b-zB-HJ-Z]\.)([^\.\w]*)$/$1 .$2/; # eg. S. at end -> S. . + s/(\s[a-z]\.\s[a-z]\.)([^\.\w]*)$/$1 .$2/i; #eg. S. I. at end -> S. I. . + s/(\WMr\.)(\W*)$/$1 . $2/i; # Mr. at end -> Mr. . + s/(\WMrs\.)(\W*)$/$1 . $2/i; # Mrs. at end -> Mrs. . + s/(\WMs\.)(\W*)$/$1 . $2/i; # Ms. at end -> Ms. . + s/(\WMessrs\.)(\W*)$/$1 . $2/i; # Messrs. at end -> Messrs. . + + s/\.([^.\w]*)$/ . $1/; # SP around . at end of sent + + s/([^\w\.])['`]([a-zA-Z]*)'(\W)/$1 ' $2 ' $3/g; # `word' + s/([^\w\.])['`]([a-zA-Z])/$1 ' $2/g; # 'word + s/([^sS])' /$1 ' /g; # non plural-possessives + + s/([^_])`/$1 ` /g; # SP around ` (should not need) + s/`/'/g; # ` -> ' (should not need) + + s/_/ /g; # clear _ + + if(!$npflg) + { s/ , / ,COMMA /g; # map punct to words + s/ \? 
/ ?QUESTION-MARK /g; + s/ : / :COLON /g; + s/ # / #SHARP-SIGN /g; + s/ @ / @AT-SIGN /g; + s/ ' / 'SINGLE-QUOTE /g; + s/ " / "DOUBLE-QUOTE /g; + s/ ; / ;SEMI-COLON /g; + s/ ! / !EXCLAMATION-POINT /g; + s/ & / &ERSAND /g; + s/ \+ / +PLUS /g; + s/ \{ / {LEFT-BRACE /g; + s/ \} / }RIGHT-BRACE /g; + s/ \( / (LEFT-PAREN /g; + s/ \) / )RIGHT-PAREN /g; + s/ \. / .PERIOD /g; + s/ \.{3} / ...ELLIPSIS /g; + s/ -- / --DASH /g; + # s/ - / -HYPHEN /g; + s/ = / =EQUALS /g; + s/ % / %PERCENT /g; + s/ \/ / \/SLASH /g; + s/ ([b-zB-HJ-Z]) / $1. /g; # restore . removed by elipsis err + } + else + { s/ , / /g; # map punct to words + s/ \? / /g; + s/ : / /g; + s/ # / /g; + s/ @ / at /g; + s/ ' / /g; + s/ " / /g; + s/ ; / /g; + s/ ! / /g; + s/ & / and /g; + s/ \+ / plus /g; + s/ \{ / /g; + s/ \} / /g; + s/ \( / /g; + s/ \) / /g; + s/ \. / /g; + s/ \.{3} / /g; + s/ -- / /g; + s/ ?- ?/ /g; + s/ = / equals /g; + s/ % / percent /g; + s/ \/ / slash /g; + s/\.POINT/point/g; + } +} continue { + # this block is executed even if we use "next" + s/ {2,}/ /g; + s/^ //; + s/ $//; + if($_) {print "$_\n";} +} + +sub perr #perr(error,line); +{ print STDERR "punctproc: line no=$.: $_[0]\n"; + print STDERR "line=$_\n"; +} + +sub perr2 +{ print STDERR "num: $_[0]\n"; + exit(1); +} diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/sent-init.vocab b/egs/bn/s5/local/data_prep/csr_hub4_utils/sent-init.vocab new file mode 100644 index 00000000000..375f5ddf99b --- /dev/null +++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/sent-init.vocab @@ -0,0 +1,411 @@ +A +ABORTION +ABOUT +ACCORDING +ACCORDINGLY +ACTORS +ADDED +ADDEDON +ADDING +ADDS +ADJUSTED +ADMITTEDLY +ADVISERS +ADVISORY +AFFLICTED +AFTER +AFTERTHOUGHTS +AGAIN +ALL +ALLIANCES +ALLOCATE +ALLTIME +ALMOST +ALONG +ALSO +ALTHOUGH +ALTOGETHER +AMID +AMONG +AN +AND +ANOTHER +ANY +APPROVED +ARCHRIVAL +ARE +ARRIVING +AS +ASIDE +ASKED +ASSUMPTIONS +AT +AUGUST +AVAILABLE +BACKERS +BANKERS +BARRING +BASED +BECAUSE +BEFORE +BEGINNING +BEHIND +BEING +BESIDES +BEYOND +BIG +BOTH +BROADLY +BURNING +BUT +BY +CALLABLE +CAN +CEASES +CHALLENGES +CHANCES +CHANGING +CHARGED +CHARGES +CLASSES +CLEANUPS +CLEARLY +COMMUNIST +COMPETITORS +COMPLEMENTARY +COMPLETION +CONSEQUENTLY +CONSIDER +CONSISTING +CONVERSELY +CONVICTED +COULD +COUNTING +CURFEWS +CURRENT +CURRENTLY +CUSTOMER +DEATH +DECEMBER +DEFENDERS +DESCRIBED +DETAILS +DIVERSITY +DO +DRACONIAN +DRAFTERS +DUMPING +EACH +EARLIER +EDUCATIONAL +EIGHT +EMBARGO +EUROPES +EVEN +EVENTUALLY +EVER +EVERY +EVERYBODYS +EVERYONE +EXAMPLE +EXCEPT +EXCLUDING +EXHAUSTED +EXPECT +EXPECTED +FAR +FARMERS +FATAL +FEW +FIRST +FIXED +FLOATING +FOLKS +FOR +FORMER +FROM +FURTHER +FURTHERMORE +GIVEN +HAVE +HAVING +HE +HEADING +HELPING +HER +HERE +HERES +HES +HIGHER +HIS +HOLDERS +HOLDING +HOW +HOWEVER +I +IF +ILLEGAL +IM +IMPOSED +IMPROVEMENT +IN +INCLUDING +INCREASINGLY +INDEED +INDEPENDENT +INDICTMENTS +INFORMING +INITIAL +INSTEAD +INSURERS +INTENDS +INTERESTINGLY +INTRODUCED +IS +IT +ITS +IVE +JANUARY +JUMPS +JUST +KNOWN +LAST +LATE +LATER +LEGALLY +LESS +LET +LIKE +LIKEWISE +LIMITS +LOCATED +LONGTERM +LONGTIME +LOOKING +LOOKS +LOSING +MADE +MANY +MARITAL +MAY +MAYBE +MEANWHILE +MEETING +MINIMUM +MONTHLY +MORE +MOREOVER +MOST +MOSTLY +MOUNTED +MR +MUCH +MY +NAMED +NATURAL +NATURALLY +NEARLY +NEGOTIATORS +NEITHER +NEVER +NEXT +NINETYDAY +NOBODY +NONE +NONETHELESS +NOR +NOT +NOTABLY +NOTES +NOTHING +NOTING +NOW +NOWADAYS +OBVIOUSLY +OCCUPATIONAL +OCTOBER +OF +OFFERED +OFTEN +ON +ONCE +ONE +ONEYEAR +ONLY +OPERATING +OPINION +OPPOSITION +OR +OTHER +OTHERS +OTHERWISE +OUR 
+OUTSIDE +OVER +PARENTS +PART +PARTICIPATION +PAYMENT +PEOPLE +PESSIMISTS +PLANTS +PLEDGED +PLURALISTIC +POINTING +POLICY +POLITICAL +POSITIVE +POSTPONED +POTENTIAL +PRESENCE +PRESSURED +PREVIOUSLY +PRODUCERS +PROFIT +PROTECTING +PROTECTIONISM +PROVISIONAL +PURELY +PUT +QUICK +QUITE +RATHER +REACHED +READIED +RECENTLY +RECOGNITION +RECOVERIES +REDEMPTION +REFERRING +RELYING +REMAINING +REMOVING +REOFFER +REPRESENTING +REQUEST +RESEARCHERS +RESULTS +REVIEWED +RIOTS +RIVAL +RUMORS +RUSSIAS +SAYS +SCORING +SECRETARIES +SECTION +SEEKING +SELFDEFENSE +SENDS +SEPARATELY +SEPTEMBER +SEVERAL +SEXUAL +SHE +SHELTER +SHES +SHOPKEEPERS +SHORTLY +SHOULD +SIMILARLY +SINCE +SLIGHTLY +SMALL +SMALLER +SO +SOLDIERS +SOME +SOON +SORRY +SOUGHT +STEPPED +STILL +STUDIES +SUBSCRIBERS +SUBSTANTIAL +SUCH +SUPPORT +SURELY +SWEETHEART +TALKS +TAXPAYERS +THAT +THATS +THE +THEIR +THEN +THERE +THEREAFTER +THEREFORE +THERES +THESE +THEY +THEYLL +THEYRE +THIS +THOSE +THOUGH +THREATENED +THROUGH +THROUGHOUT +THURSDAY +THUS +TO +TODAY +TOGETHER +TONIGHT +TOO +TRADITIONALLY +TRANSFERRED +TROTTING +TRUTH +TUMBLES +TWOFIFTHS +TWOTHIRDS +UNDER +UNFORTUNATELY +UNINSURED +UNLESS +UNLIKE +UNTIL +UPON +URGED +USERS +USING +USUALLY +VENTURE +VERSION +VIRTUALLY +WAS +WATCHING +WE +WEDNESDAY +WEIGHED +WELCOME +WELL +WERE +WEVE +WHAT +WHATS +WHEN +WHENEVER +WHETHER +WHICHEVER +WHILE +WHOEVER +WHY +WITH +WONDERS +WORST +WOULD +WRITTEN +YEARS +YES +YESTERDAY +YESTERDAYS +YET +YOU +YOULL +YOUR +YOURE +YOUVE diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/sentag.c b/egs/bn/s5/local/data_prep/csr_hub4_utils/sentag.c new file mode 100644 index 00000000000..af70504d1f1 --- /dev/null +++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/sentag.c @@ -0,0 +1,674 @@ +static char rcsid[] = "$Id: sentag.c,v 1.9 1996/08/13 15:57:35 robertm Rel $"; +/************************************************************* + * sentag.c + *------------------------------------------------------------ + * Intended to do the best possible sentence tagging of + * text data from journalistic sources. Input format is + * the typical TIPSTER-style SGML, in which the critical + * tags required are indicated below, and other tags are + * passed through without modifications: + * + * + * ... + * + *
<p>
+ * All text should be prepared with one paragraph on a line, regardless \ + * how long it is (up to 65536 chars). + *
<p>
+ * The sentag program will make changes within the "TEXT" region only. This \ + * is an example. + *
<p>
+ * In addition to putting one whole paragraph on one line, other cleaning up \ + * may be needed so that output sentences are tidy. This might include removing \ + * "datelines", etc. + *
<p>
+ * Note that closing tags are implicit for paragraphs. The same will apply to \ + * sentence tags in the output. + * + * ... + * + * + * Output format is: + * + * + * ... + * + *
<p id=...>
+ * + * All text should be prepared with one paragraph on a line, regardless \ + * how long it is (up to 65536 chars). + *
<p id=...>
+ * + * The sentag program will make changes within the "TEXT" region only. + * + * This is an example. + *
<p id=...>
+ * + * In addition to putting one whole paragraph on one line, other cleaning up \ + * may be needed so that output sentences are tidy. + * + * This might include removing "datelines", etc. + *
<p id=...>
+ * + * Note that closing tags are implicit for paragraphs. + * + * The same will apply to sentence tags in the output. + * + * ... + * + * + * In a nutshell, this program applies unique ID strings to all + * paragraph tags, inserts an initial tag at the start of each + * paragraph, and for each period "." character that marks the end of + * a sentence within a paragraph, it replaces the following space with + * "\n\n". + * + * This program operates as a pipeline filter. + * + * By default, it looks in "./addressforms" for a list of + * sentence-internal abbreviations, and in "./sent-init.vocab" for a + * list of words that would only be capitalized at the beginning of a + * sentence. The arguments "-a abbrevfile" and "-i sent-init.list" + * can override the defaults. + * + * If either "abbrev" or "sent-init" file is not found, the program exits. + * + * A "sent-init.candidate" file is created, containing all the cases + * in which a capitalized word following a period has been _assumed_ + * to be a continuation of an abbreviated proper noun phrase + * (e.g. U.S. Treasury). This "candidate" file (and a histogram of + * its tokens) should be reviewed to look for (classes of) possible + * missed boundaries. Sentence breaks are NOT applied to these cases, + * and a second pass over the same input data should be made if the + * "sent-init" file is updated to include any of these candidates. + * The argument "-t candidate.file" will override the default name. + */ + +#include +#include +#include +#include + +#define BUFSIZE 65536 +#define MAXABRV 2048 +#define MAXIVCB 1024 +#define MAXBRKS 256 +#define IDLEN 64 +#define MAXSENTLEN 4096 + +char *abbrevs[MAXABRV]; /* contains sentence-internal abbrevs */ +char idstr[IDLEN]; +struct si_word { + char *wd; +} si_node, s_init_wd[MAXIVCB]; /* contains non-capitalized words */ + +int n_abbrevs = 0; +int n_mid_abbrevs, n_s_init = 0, pid; + +FILE *tfp; + +/* -------------------------------------------------- + * w_compare() : comparison function for bsearch() + */ +int w_compare( w1, w2 ) + struct si_word *w1, *w2; +{ + return strcmp( w1->wd, w2->wd ); +} + + +main( ac, av ) + int ac; + char **av; +{ + FILE *afp, *ifp; + int c, i, j, inText; + char buf[BUFSIZE], *cp; + extern int optind, opterr; + extern char *optarg; + int w_compare(); + +/* Handle options or defaults + */ + afp = ifp = tfp = NULL; + while (( c = getopt( ac, av, "a:i:t:" )) != -1 ) + switch ( c ) + { + case 'a': + if (( afp = fopen( optarg, "r" )) == NULL ) { + fprintf( stderr, "Unable to open abbrev file %s\n", optarg ); + exit(1); + } + break; + case 'i': + if (( ifp = fopen( optarg, "r" )) == NULL ) { + fprintf( stderr, "Sent-init.vocab file %s not found.\n", optarg ); + exit(1); + } + break; + case 't': + if (( tfp = fopen( optarg, "w" )) == NULL ) { + fprintf( stderr, "Can't create %s -- quitting.\n", optarg ); + exit(1); + } + break; + default: + fprintf( stderr, "Usage: %s [-a abbrevs] [-i sent-init.vocab]\n", av[0] ); + fprintf( stderr, "version: %s\n", rcsid ); + exit(1); + } + +/* Always create a table of uncertain capitalized words + */ + if ( ! tfp && ( tfp = fopen( "sent-init.candidate", "a" )) == NULL ) { + fprintf( stderr, "Can't create/append-to ./sent-init.candidate\n" ); + exit(1); + } + +/* Load typical sentence-initial words (capitalized only when sentence-intial) + * -- input list file must be presorted alphabetically + */ + if ( ! 
ifp && ( ifp = fopen( "sent-init.vocab", "r" )) == NULL ) { + fprintf( stderr, "File ./sent-init.vocab not found.\n" ); + exit(1); + } + while ( n_s_init < MAXIVCB && fgets( buf, BUFSIZE, ifp ) != NULL ) + if ( buf[0] != '#' ) + s_init_wd[ n_s_init++ ].wd = strdup( strtok( buf, "\n" )); + fclose( ifp ); + +/* Load definite within-sentence abbrevs + */ + if ( ! afp && ( afp = fopen( "addressforms", "r" )) == NULL ) { + fprintf( stderr, "Unable to open file ./addressforms\n" ); + exit(1); + } + while ( n_abbrevs < MAXABRV && fgets( buf, BUFSIZE, afp ) != NULL ) + if ( buf[0] != '#' ) + abbrevs[ n_abbrevs++ ] = strdup( strtok( buf, "." )); + fclose( afp ); + n_mid_abbrevs = n_abbrevs; + +/* Add some special abbrevs to the list + */ + abbrevs[ n_abbrevs++ ] = strdup( "Dr" ); + abbrevs[ n_abbrevs++ ] = strdup( "St" ); + +/* Scan and tag text data + */ + inText = 0; + *idstr = 0; + while ( gets( buf )) + { + if (strlen(buf) > BUFSIZE) + { + fprintf( stderr, "input buffer size exceeded!!\n" ); + fprintf( stderr, "last input:\n%s\n", buf ); + exit(-1); + } + if ( !inText ) { + if ( buf[0] == '<' ) + switch ( buf[1] ) + { + case 'D': + if ( !strncmp( buf, "", 6 )) { + if ( ! *idstr ) { + fprintf( stderr, "No DOCID string -- quitting.\n" ); + exit(1); + } + inText = 1; + pid = 0; + } + break; + default: + break; + } + puts( buf ); + } + else { + if ( buf[0] == '<' ) + switch ( buf[1] ) + { + case 'p': + pid++; + printf( "
<p id=%s-%d>
\n", idstr, pid ); + break; + case '/': + if ( !strncmp( buf, "", 7 )) + inText = 0; + puts( buf ); + break; + default: + if (( !strncmp( buf, "", 9 )) + || ( !strncmp( buf, "", 9 ))) + { + puts( buf ); + } + else + { + fprintf( stderr, "Warning: passing odd markup in %s:\n\t%s\n", idstr, buf ); + puts( buf ); + } + } + else { + strcat( buf, " " ); + sentBreak( buf ); + } + } + } + exit(0); +} + + +char *ucs = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; +char *lcs = "abcdefghijklmnopqrstuvwxyz"; +char *crp_abbrv[] = { "CORP", "INC", "CO", "PLC", "LTD", "BHD", "CIE", + "DEPT", "LTDA", "MFG", "SPA" }; +int n_crp_abbrv = 11; +char *time_zone[] = { "EST", "EDT", "PST", "PDT", "CST", "CDT", "MST", "MDT", "GMT" }; +int n_time_zone = 9; + +#define MAXWDLEN 64 +#define DoNextPeriod continue + +sentBreak( buf ) + char *buf; +{ + char *period[MAXBRKS], *start, perchr, nxtwd[MAXWDLEN]; + char *nxtch, *nxtuc, *nxtsp, *prvch, *prvsp, *endwd, *prvwd, *endpg; + char *openbracketp; + int n_per, i, j, k; + + n_per = 0; + nxtuc = start = buf; + endpg = buf + strlen( buf ) -1; + + /* Locate all possible sentence terminations in this paragraph; + * if none, print what we have as a sentence. + */ + openbracketp=0; + for(nxtsp = buf; *nxtsp != NULL ; nxtsp++) + switch (*nxtsp) + { + case '[': + if ( strchr(nxtsp,']') != NULL ) + openbracketp=nxtsp; + break; + case ']': + if (openbracketp && n_per + && period[n_per-1]+4 > openbracketp + && strchr(".!?",*(nxtsp-1))) + period[n_per-1]=nxtsp-1; + openbracketp=0; + break; + case '.': + case '?': + case '!': + if (openbracketp) continue; + period[n_per++] = nxtsp; + if (n_per >= MAXBRKS) + { + fprintf(stderr, + "MAXBRKS exceeded - more than %d `periods' in\n%s\n", + MAXBRKS, buf); + exit(-1); + } + break; + default: + break; + } + + if ( ! n_per ) { + /* if ( endpg - buf > 3 && strchr(( endpg-2 ), ':' ) != NULL ) */ + tagSentence( buf, endpg ); + return; + } + + /* Check each possible sentence break, using a variety of + * heuristics... At each stage, if evidence indicates a + * clear decision, write the tagged sentence if appropriate, + * and continue on to the next candidate. + */ + for ( i=0; i period[i]; endwd-- ) + *(endwd+1) = *endwd; + *(++endwd) = ' '; + for ( j=i+1; j nxtsp + 3 || + ( nxtuc == nxtsp + 3 && *( nxtuc -1 ) == ' ' )) + && (! (( *(nxtsp+1) == '[' ) + && ( strchr( nxtsp, ']') +2 == nxtuc ) + && ( strchr( ".!?", *(nxtuc-3)) == NULL)))) + DoNextPeriod; + + /* If next token after period is a corporate abbrev, this is + * not a break + */ + j = k = 0; + while ( k < MAXWDLEN && nxtuc[j] != ' ' ) { + if ( isalpha( nxtuc[j] )) + nxtwd[k++] = toupper( nxtuc[j] ); + j++; + } + if ( k < MAXWDLEN ) { + nxtwd[k] = 0; + for ( j=0; j %s\n", idstr, pid, start ); + DoNextPeriod; + } + + /* Inspect the token that precedes the period + */ + perchr = *period[i]; + *period[i] = 0; + + if (( prvsp = strrchr( start, ' ' )) != NULL ) + { + + /* This block looks at a pre-break token that is not sentence-initial. + * Make sure we point to the first alphanumeric character, if any + */ + endwd = prvsp +1; + while ( *endwd && !isalnum( *endwd )) + *endwd++; + if ( ! *endwd ) { /* This was probably an ellipsis "..." 
*/ + *period[i] = perchr; + tagSentence( start, nxtsp ); + start = nxtsp + 1; + DoNextPeriod; + } + + /* - if token ends in a bracket or quote, this is a clear sentence break + */ + if ( strchr( "\")}]", *prvch )) + { + *period[i] = perchr; + tagSentence( start, nxtsp ); + start = nxtsp + 1; + DoNextPeriod; + } + + /* - if token does not begin with upper-case, and is not a time designation + * ("a.m" or "p.m") followed by a time-zone name, and is not "vs" or "excl", + * then this is a real break + */ + if ( !isupper( *endwd )) { + if ( strstr( endwd, ".m" )) { + for ( j=0; j %s\n", idstr, pid, start ); + DoNextPeriod; + } + + /* - if it is a single letter, this is almost certainly + * not a real break (it's a first or middle initial) + */ + if ( strlen( endwd ) == 1 ) { + *period[i] = perchr; + DoNextPeriod; + } + + /* At this point, we are looking at a non-initial multi-char token that + * begins with upper-case, is not a clear mid-sentence abbrev, and is + * followed by a capitalized word that is not a corporate abbrev. + * If the "period" character is actually "?" or "!", OR (the token + * contains lower case and, if a corp-abbrev, is not followed by "(") + * then this is almost certainly a real break (if it is a corp-abbrev + * followed by "(", this is most likely not a break) + */ + if ( perchr != '.' ) { + *period[i] = perchr; + tagSentence( start, nxtsp ); + start = nxtsp + 1; + DoNextPeriod; + } + if ( strpbrk( endwd, lcs )) { + for ( j=0; j\n", nxtwd, idstr, pid ); + } + DoNextPeriod; + + } /* prvsp != NULL */ + + else + + { /* prvsp == NULL */ + /* This block looks at a sentence-initial token preceding + * the period; if "period" is acually "?!", or if the token + * looks like any kind of abbreviation, this is not a real break + */ + if ( perchr != '.' ) { + *period[i] = perchr; + tagSentence( start, nxtsp ); + start = nxtsp + 1; + DoNextPeriod; + } + endwd = start; + while ( *endwd && !isalpha( *endwd )) + endwd++; + if ( ! *endwd ) { + *period[i] = perchr; + DoNextPeriod; + } + for ( j=0; j MAXSENTLEN ) + { + fprintf( stderr, "Warning: in %s, ", idstr ); + fprintf(stderr,"sentence length of %d exceeds MAXSENTLEN (%d)\n", + len,MAXSENTLEN); + strncpy(sent,start,75); + sent[75]=0; + fprintf(stderr,"ignoring `sentence' beginning with:\n %s\n", + sent); + return; + } + + si = start; + so = sent; + alpha = 0; + + while ( si < end ) { + alpha |= (! isspace( *si )); + *so++ = *si++; + } + *so = 0; + + if ( ! alpha ) + return; + + printf( "\n%s\n", sent ); +} + +/* +unpicky_tagSentence( start, end ) + char *start, *end; +{ + if ( start >= end ) { + fprintf( stderr, "ignoring bad sentence mark (%x !< %x) in %s\n", + start, end, idstr ); + fprintf( stderr, "`sentence' from start-pointer:\n%s\n", start ); + return; + } + printf("\n"); + while ( start < end ) + putchar(*start++); + putchar('\n'); +} +*/ diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/tr-bn-char.fast.perl b/egs/bn/s5/local/data_prep/csr_hub4_utils/tr-bn-char.fast.perl new file mode 100755 index 00000000000..947ee28e2dc --- /dev/null +++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/tr-bn-char.fast.perl @@ -0,0 +1,13 @@ +#!/usr/bin/perl -pi.old-char + +# handles nonprinting characters in Broadcast News material, to the extent +# that they can be handled, and perhaps a bit beyond... 
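+# (Byte-for-byte, this is the same mapping that tr-bn-char.slow.perl documents
+# one character at a time: \xc4 -> "-"; \x82, \x89, \x8a and \xe9 -> "e";
+# \x90 -> "E"; \xa4 -> "n"; \x87 -> "c".)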
+ +tr/\xc4\x82\x90\xa4\x89\x8a\x87\xe9/-eEneece/; + +s=\xae=<<=g; +s=\xaf=>>=g; +s=\xab= 1/2=g; +s=\xac= 1/4=g; +s=\xf8= degrees=g; +s=\xf1= plus or minus =g; diff --git a/egs/bn/s5/local/data_prep/csr_hub4_utils/tr-bn-char.slow.perl b/egs/bn/s5/local/data_prep/csr_hub4_utils/tr-bn-char.slow.perl new file mode 100755 index 00000000000..8dc87917c0c --- /dev/null +++ b/egs/bn/s5/local/data_prep/csr_hub4_utils/tr-bn-char.slow.perl @@ -0,0 +1,46 @@ +#!/usr/bin/perl -p + +# handles nonprinting characters in Broadcast News material, to the extent +# that they can be handled, and perhaps a bit beyond... + +s=\xc4=-=g; +s=\xae=<<=g; +s=\xaf=>>=g; +s=\x82=e=g; # e' (é) in IBMPC +s=\xab= 1/2=g; +# next most frequent, \xfa, appears to have various use as hard-space, +# hard-return, or noise +s=\x90=E=g; # E' (É) in IBMPC +s=\xa4=n=g; # n~ (ñ) in IBMPC +s=\xac= 1/4=g; +# ^G => noise +# ^A => noise +s=\xf8= degrees=g; +# \x1b => noise? +# \x02 => noise? + +# remainder occur 4 or fewer times each -- may be better to do by hand? +s=\x89=e=g; # e: or E: +s=\xf1= plus or minus =g; +# \xc9 = graphics character => ??? +# \x03 => noise? +# \x04 => noise? +s=\x8a=e=g; # e` (è) in IBMPC +s=\x87=c=g; # c, (ç) in IBMPC +s=\xe9=e=g; # e' (é) in ISO!! +# \xad => spanish inverted question mark (¡), appears (with Spanish) twice! +s=\xad==g; + +# remainder occur only once each -- probably best to check by hand +# \xff +# \xdd +# \xbb +# \xa1 +# \x8d +# \x81 +# \x1c +# \x1a +# \x16 +# \x11 +# \x10 +# \x0c diff --git a/egs/bn/s5/local/data_prep/do-lm-csr96 b/egs/bn/s5/local/data_prep/do-lm-csr96 new file mode 100755 index 00000000000..eec6791904f --- /dev/null +++ b/egs/bn/s5/local/data_prep/do-lm-csr96 @@ -0,0 +1,40 @@ +#!/bin/sh +# $Id: do-lm,v 1.3 1996/08/23 22:43:23 robertm Rel $ +Usage() +{ +cat << EOM 1>&2 +Usage: $0 file(s) + Runs LM pipeline on FILES, with output to "lm" subdirectory of cwd. + Expects to find LM conditioning tools in PATH or ./bin. +EOM +} + +# Excludes "fixvp" stage which has the main effect of killing off +# any SGML tagging that contains a space, e.g.
<art id=...>
. + +# BBN used -np switch for puncproc, removing punctuation; this chooses the +# "verbalize" option instead. + +# Includes new "numhack" module to deal with zip codes and phone numbers. + +if [ $# -eq 0 ] || [ $1 = "-h" ]; then + Usage + exit 1 +fi + +PATH=$PATH:./bin ; export PATH + +for file in $* +do + BASENM=`basename $file` + echo "Running LM pipeline for |$BASENM|..." 1>&2 + set -x + perl pare-sgml.perl $file | + perl bugproc.perl | + perl numhack.perl | + perl numproc.perl | + perl abbrproc.perl | + perl puncproc.perl > lm/$BASENM + set +x + echo "Done with $BASENM." +done diff --git a/egs/bn/s5/local/data_prep/prepare_1996_csr_hub4_corpus.sh b/egs/bn/s5/local/data_prep/prepare_1996_csr_hub4_corpus.sh new file mode 100755 index 00000000000..fc20758eec0 --- /dev/null +++ b/egs/bn/s5/local/data_prep/prepare_1996_csr_hub4_corpus.sh @@ -0,0 +1,51 @@ +#! /bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +set -e +set -o pipefail +set -u + +nj=4 +cmd=run.pl +stage=0 + +. path.sh +. utils/parse_options.sh + +if [ $# -ne 2 ]; then + echo "Usage: $0
<source-dir> <dir>
" + echo " e.g.: $0 /export/corpora/LDC/LDC98T31/ data/local/data/csr96_hub4" + exit 1 +fi + +SOURCE_DIR=$1 +dir=$2 + +mkdir -p $dir + +ls $SOURCE_DIR/1996_csr_hub4_model/st_train/*.stZ \ + $SOURCE_DIR/1996_csr_hub4_model/st_test/*.stZ | sort > \ + $dir/filelist + +mkdir -p $dir/split$nj/ + +if [ $stage -le 1 ]; then + eval utils/split_scp.pl $dir/filelist $dir/split$nj/filelist.{`seq -s, $nj`} + $cmd JOB=1:$nj $dir/log/process_text.JOB.log \ + local/data_prep/csr_hub4_utils/process_filelist.py \ + $dir/split$nj/filelist.JOB $dir +fi + +for x in `ls $SOURCE_DIR/1996_csr_hub4_model/st_train/*.stZ`; do + y=`basename $x` + name=${y%.stZ} + echo $dir/${name}.txt.gz +done > $dir/train.filelist + +for x in `ls $SOURCE_DIR/1996_csr_hub4_model/st_test/*.stZ`; do + y=`basename $x` + name=${y%.stZ} + echo $dir/${name}.txt.gz +done > $dir/test.filelist diff --git a/egs/bn/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh b/egs/bn/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh new file mode 100755 index 00000000000..a167c2cfee0 --- /dev/null +++ b/egs/bn/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh @@ -0,0 +1,87 @@ +#! /bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + echo "local/prepare_1998_hub4_bn_eng_eval.sh /export/corpora/LDC/LDC2000S86/ data/local/data/eval98" + exit 1 +fi + +SOURCE_DIR=$1 +dir=$2 + +mkdir -p $dir + +for uem in $SOURCE_DIR/h4e_evl/h4e_98_{1,2}.uem; do + python -c ' +import sys, os +uem = sys.argv[1] +reco, ext = os.path.splitext(os.path.basename(uem)) +for line in open(uem).readlines(): + line = line.strip() + if len(line) == 0 or line[0:2] == ";;": + continue + parts = line.split() + + assert parts[1] == "1" + start_time = float(parts[2]) + end_time = float(parts[3]) + + utt = "{0}-{1:06d}-{2:06d}".format(reco, int(start_time * 100), + int(end_time * 100)) + print ("{0} {1} {2} {3}".format(utt, reco, start_time, end_time))' $uem +done > $dir/segments + +cat $SOURCE_DIR/h4e_evl/h4e_98_{1,2}.seg | \ + python -c ' +from __future__ import print_function +import sys + +segments_handle = open(sys.argv[1], "w") +utt2spk_handle = open(sys.argv[2], "w") +for line in sys.stdin.readlines(): + line = line.strip() + if len(line) == 0 or line[0:2] == ";;": + continue + parts = line.split() + + reco = parts[0] + assert parts[1] == "1" + spk = parts[2] + start_time = float(parts[3]) + end_time = float(parts[4]) + + utt = "{spk}-{0}-{1:06d}-{2:06d}".format(reco, int(start_time * 100), + int(end_time * 100), spk=spk) + + print ("{0} {1} {2} {3}".format(utt, reco, start_time, end_time), + file=segments_handle) + print ("{0} {1}".format(utt, spk), + file=utt2spk_handle) +segments_handle.close() +utt2spk_handle.close() +' $dir/segments.pem $dir/utt2spk.pem + +export PATH=$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5 +sph2pipe=`which sph2pipe` || { echo "sph2pipe not found in PATH."; exit 1; } +for x in `ls $SOURCE_DIR/h4e_evl/*.sph`; do + y=`basename $x` + z=${y%.sph} + echo "$z $sph2pipe -f wav $x |"; +done > $dir/wav.scp + +awk '{print $1" "$1" 1"}' $dir/wav.scp > $dir/reco2file_and_channel + +cp $SOURCE_DIR/h4e_evl/h4e_98.glm $dir/glm +cp $SOURCE_DIR/h4e_evl/h4e_98.stm $dir/stm + +awk '{print $1" "$2}' $dir/segments > $dir/utt2spk + +utils/fix_data_dir.sh $dir +utils/copy_data_dir.sh $dir ${dir}.pem + +cp $dir/segments.pem ${dir}.pem/segments +cp $dir/utt2spk.pem ${dir}.pem/utt2spk +utils/fix_data_dir.sh ${dir}.pem diff --git a/egs/bn/s5/local/data_prep/prepare_bn_data.py b/egs/bn/s5/local/data_prep/prepare_bn_data.py new file mode 
100755 index 00000000000..b96d0503367 --- /dev/null +++ b/egs/bn/s5/local/data_prep/prepare_bn_data.py @@ -0,0 +1,208 @@ +#! /usr/bin/env python + +from __future__ import print_function +import argparse +import glob +import logging +import os +import re +from bs4 import BeautifulSoup + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) + + +def get_args(): + parser = argparse.ArgumentParser("Prepare BN corpus.") + parser.add_argument("--split-at-sync", type=str, + choices=["true", "false"], default="false", + help="If true, creates separate segments split " + "at each sync tag.") + parser.add_argument("audio_source_dir", type=str, + help="Source directory of audio of BN corpus " + "(LDC97S44)") + parser.add_argument("text_source_dir", type=str, + help="Source directory of text of BN corpus " + "(LDC97T22)") + parser.add_argument("dir", type=str, + help="Output directory to write the kaldi files") + + args = parser.parse_args() + + args.split_at_sync = bool(args.split_at_sync == "true") + return args + + +class Segment(object): + """A class to store a segment with start time, end time, recording id, + speaker, and the text. + """ + def __init__(self, reco_id, speaker=None): + self.reco_id = reco_id + self.text = None + self.start_time = -1 + self.end_time = -1 + if speaker is not None: + self.speaker = speaker + else: + self.speaker = reco_id + + def write_segment(self, out_file): + """writes segment in kaldi segments format""" + print("{0} {1} {2} {3}".format(self.utt_id(), self.reco_id, + self.start_time, self.end_time), + file=out_file) + + def write_utt2spk(self, out_file): + """writes speaker information in kaldi utt2spk format""" + print("{0} {1}".format(self.utt_id(), self.speaker), + file=out_file) + + def write_text(self, out_file): + print("{0} {1}".format(self.utt_id(), self.text), + file=out_file) + + def check(self): + """checks if this is a valid segment""" + assert self.end_time > self.start_time + + def utt_id(self): + """returns the utterance id created from the recording id and + the timing information""" + return ("{spkr}-{0}-{1:06d}-{2:06d}".format( + self.reco_id, int(self.start_time * 100), + int(self.end_time * 100), spkr=self.speaker)) + + def duration(self): + """returns the duration of the segment""" + return self.end_time - self.start_time + + +def process_segment_soup(reco_id, soup, split_at_sync=False): + """Processes the input segment soup into a list of objects of class + Segment. + If split_at_sync is False, then only a segment is created for the soup + without consideration to the sync tags. 
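+    For example (hypothetical times): a turn spanning 10.0-20.0s that
+    contains one sync tag at 14.5s yields a single 10.0-20.0s segment when
+    split_at_sync is False, and two segments, 10.0-14.5s and 14.5-20.0s,
+    when split_at_sync is True.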
+ """ + start_time = float(soup['s_time']) + end_time = float(soup['e_time']) + speaker = soup['speaker'] + + segments = [] + + create_new_segment = True + for x in soup.children: + try: + if x.name == "sync": + assert not create_new_segment + if not split_at_sync: + continue + start_time = float(x['time']) + segments[-1].end_time = start_time + create_new_segment = True + elif x.name == "background" or x.name == "comment": + continue + else: + if create_new_segment: + assert split_at_sync or len(segments) == 0 + segment = Segment(reco_id, speaker) + segment.text = x.encode('ascii').strip().replace('\n', ' ') + segment.start_time = start_time + segment.end_time = end_time + if segment.duration() > 0: + segments.append(segment) + create_new_segment = False + else: + segments[-1].text += ( + ' ' + x.encode('ascii').strip().replace('\n', ' ')) + except Exception: + logger.error("Error processing element %s", x) + raise + + return segments + + +def process_transcription(transcription_file, segments_handle, utt2spk_handle, + text_handle, split_at_sync=False): + """Processes transcription file into segments.""" + doc = ''.join(open(transcription_file).readlines()) + tag_matcher = re.compile(r"(<(Sync|Background)[^>]+>)") + doc_modified = tag_matcher.sub(r"\1", doc) + + soup = BeautifulSoup(doc_modified, 'lxml') + + reco_id, ext = os.path.splitext(os.path.basename(transcription_file)) + reco_id = reco_id.strip('_') # remove trailing underscores in the name + + for episode in soup.find_all("episode"): + for section in episode.find_all("section"): + s_time = section['s_time'] + e_time = section['e_time'] + section_type = section['type'] + + logger.debug("Processing section st = %d, end = %d, " + "type = %s", s_time, e_time, section_type) + + for seg in section.find_all("segment"): + try: + segments = process_segment_soup( + reco_id, seg, split_at_sync=split_at_sync) + for s in segments: + if s.duration() == 0: + continue + s.write_segment(segments_handle) + s.write_utt2spk(utt2spk_handle) + s.write_text(text_handle) + except Exception: + logger.error("Failed processing segment %s", seg) + raise + + +def _run(args): + if not os.path.isdir(args.dir): + os.makedirs(args.dir) + + with open(os.path.join(args.dir, "wav.scp"), 'w') as wav_scp_handle: + for file_ in glob.glob("{0}/{1}/*.sph".format(args.audio_source_dir, + "data")): + reco, ext = os.path.splitext(os.path.basename(file_)) + reco = reco.strip('_') + + print("{0} sox {1} -c 1 -r 16000 -t wav - |".format( + reco, file_), file=wav_scp_handle) + + segments_handle = open(os.path.join(args.dir, "segments"), 'w') + utt2spk_handle = open(os.path.join(args.dir, "utt2spk"), 'w') + text_handle = open(os.path.join(args.dir, "text"), 'w') + for dir_ in glob.glob("{0}/{1}/*/".format(args.text_source_dir, + "hub4_eng_train_trans")): + for x in glob.glob("{0}/*.txt".format(dir_)): + try: + process_transcription(x, segments_handle, utt2spk_handle, + text_handle, + split_at_sync=args.split_at_sync) + except Exception: + logger.error("Failed to process file %s", + x) + raise + segments_handle.close() + utt2spk_handle.close() + text_handle.close() + + +def main(): + try: + args = get_args() + _run(args) + except Exception: + raise + + +if __name__ == '__main__': + main() diff --git a/egs/bn/s5/local/data_prep/prepare_na_news_text_corpus.sh b/egs/bn/s5/local/data_prep/prepare_na_news_text_corpus.sh new file mode 100755 index 00000000000..44138e2a228 --- /dev/null +++ b/egs/bn/s5/local/data_prep/prepare_na_news_text_corpus.sh @@ -0,0 +1,51 @@ +#! 
/bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +. cmd.sh +. path.sh + +set -e +set -o pipefail +set -u + +nj=4 +cmd=run.pl + +. utils/parse_options.sh + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + echo " e.g.: $0 /export/corpora/LDC/LDC95T21 data/local/data/na_news" + exit 1 +fi + +SOURCE_DIR=$1 +dir=$2 + +for x in $SOURCE_DIR/*/*/*; do + year=`basename $x` + newspaper=`basename $(dirname $x)` + d=$dir/${newspaper}_${year} + + mkdir -p $d + + list_file=$d/articles.list + ls $x/*.gz > $list_file + + mkdir -p $d/split$nj + + eval utils/split_scp.pl $d/articles.list \ + $d/split$nj/articles.list.{`seq -s, $nj`} + + ( + $cmd JOB=1:$nj $d/log/get_processed_text.JOB.log \ + local/data_prep/process_na_news_text.py $d/split$nj/articles.list.JOB - \| \ + gzip -c '>' $d/corpus.JOB.gz || exit 1 + gunzip -c $d/corpus.*.gz | gzip -c > $d/corpus.gz || exit 1 + rm $d/corpus.*.gz + ) & +done + +wait diff --git a/egs/bn/s5/local/data_prep/process_na_news_text.py b/egs/bn/s5/local/data_prep/process_na_news_text.py new file mode 100755 index 00000000000..10941dd3186 --- /dev/null +++ b/egs/bn/s5/local/data_prep/process_na_news_text.py @@ -0,0 +1,91 @@ +#! /usr/bin/env python + +from __future__ import print_function +from bs4 import BeautifulSoup +import argparse +import gzip +import logging +import sys + +sys.path.insert(0, 'local/lm') +import text_normalization + + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) +handler = logging.StreamHandler() +handler.setLevel(logging.DEBUG) +formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) + + +def get_args(): + parser = argparse.ArgumentParser("Prepare NA News Text corpus (LDC95T21).") + parser.add_argument("file_list", type=argparse.FileType('r'), + help="List of compressed source files for NA News Text. 
" + "e.g: /export/corpora/LDC/LDC95T21/na_news_1/latwp/1994") + parser.add_argument("out_file", type=argparse.FileType('w'), + help="Output file to write to.") + + args = parser.parse_args() + + return args + + +def normalize_text(text): + text1 = text.strip() + text2 = text_normalization.remove_punctuations(text1) + text2 = text2.upper() + return text2 + + +def process_file(file_handle, out_file_handle): + doc = ' '.join(file_handle.readlines()) + soup = BeautifulSoup(doc, 'lxml') + + num_written = 0 + + for doc in soup.html.body.children: + try: + if doc.name != "doc": + continue + for para in doc.find_all('p'): + assert para.name == 'p' + text = ' '.join([unicode(x).strip() for x in para.contents]) + normalized_text = normalize_text(text) + out_file_handle.write("{0}\n".format( + normalized_text.encode('ascii'))) + num_written += 1 + except: + logger.error("Failed to process document %s", doc) + raise + if num_written == 0: + raise RuntimeError("0 sentences written.") + + +def _run(args): + for line in args.file_list.readlines(): + try: + file_ = line.strip() + with gzip.open(file_, 'r') as f: + process_file(f, args.out_file) + except Exception: + logger.error("Failed processing file %s", file_) + raise + + +def main(): + try: + args = get_args() + _run(args) + except Exception: + raise + finally: + args.out_file.close() + args.file_list.close() + + +if __name__ == '__main__': + main() diff --git a/egs/bn/s5/local/dict b/egs/bn/s5/local/dict new file mode 120000 index 00000000000..384304fdf2a --- /dev/null +++ b/egs/bn/s5/local/dict @@ -0,0 +1 @@ +../../../wsj/s5/local/dict/ \ No newline at end of file diff --git a/egs/bn/s5/local/format_data.sh b/egs/bn/s5/local/format_data.sh new file mode 100755 index 00000000000..b7d58f83718 --- /dev/null +++ b/egs/bn/s5/local/format_data.sh @@ -0,0 +1,28 @@ +#! /bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +echo "$0 $@" # Print the command line for logging +. utils/parse_options.sh || exit 1; + +. ./path.sh || exit 1; + +srcdir=data/local/data +tmpdir=data/local/ + +for t in train; do + utils/fix_data_dir.sh $srcdir/$t + utils/copy_data_dir.sh $srcdir/$t data/$t + cat $srcdir/$t/text | \ + local/normalize_transcripts.pl "" "" > \ + data/$t/text + utils/fix_data_dir.sh data/$t +done + +for t in eval98 eval98.pem; do + utils/copy_data_dir.sh $srcdir/$t data/$t + utils/fix_data_dir.sh data/$t +done + + diff --git a/egs/bn/s5/local/format_lms.sh b/egs/bn/s5/local/format_lms.sh new file mode 100755 index 00000000000..7d9e3b82bfb --- /dev/null +++ b/egs/bn/s5/local/format_lms.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# +# Copyright 2014 Nickolay V. Shmyrev +# Apache 2.0 + +if [ -f path.sh ]; then . path.sh; fi + +set -e -o pipefail -u + +lang_suffix=_test + +. utils/parse_options.sh + +#arpa_lm=data/local/local_lm/data/arpa/4gram.arpa.gz +small_arpa_lm=data/local/local_lm/data/arpa/4gram_small.arpa.gz +big_arpa_lm=data/local/local_lm/data/arpa/4gram_big.arpa.gz + +for f in $small_arpa_lm $big_arpa_lm data/lang_nosp/words.txt; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 +done + + +set -e + +cp -rT data/lang_nosp/ data/lang_nosp${lang_suffix} + +if [ -f data/lang_nosp${lang_suffix}/G.fst ] && [ data/lang_nosp${lang_suffix}/G.fst -nt $small_arpa_lm ]; then + echo "$0: not regenerating data/lang_nosp${lang_suffix}/G.fst as it already exists and " + echo ".. is newer than the source LM." 
+else + arpa2fst --disambig-symbol=#0 --read-symbol-table=data/lang_nosp/words.txt \ + "gunzip -c $small_arpa_lm|" data/lang_nosp${lang_suffix}/G.fst + echo "$0: Checking how stochastic G is (the first of these numbers should be small):" + fstisstochastic data/lang_nosp${lang_suffix}/G.fst || true + utils/validate_lang.pl --skip-determinization-check data/lang_nosp${lang_suffix} +fi + + +if [ -f data/lang_nosp${lang_suffix}_rescore/G.carpa ] && [ data/lang_nosp${lang_suffix}_rescore/G.carpa -nt $big_arpa_lm ] && \ + [ data/lang_nosp${lang_suffix}_rescore/G.carpa -nt data/lang_nosp/words.txt ]; then + echo "$0: not regenerating data/lang_nosp_rescore/ as it seems to already by up to date." +else + utils/build_const_arpa_lm.sh $big_arpa_lm data/lang_nosp \ + data/lang_nosp${lang_suffix}_rescore || exit 1; +fi + +exit 0; diff --git a/egs/bn/s5/local/lm/merge_word_counts.py b/egs/bn/s5/local/lm/merge_word_counts.py new file mode 100755 index 00000000000..6338cbbf875 --- /dev/null +++ b/egs/bn/s5/local/lm/merge_word_counts.py @@ -0,0 +1,30 @@ +#! /usr/bin/env python + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +"""This script merges pocolm word_counts and write a new word_counts file. +A min-count argument is required to only write counts that are above the +specified minimum count. +""" + +import sys + + +def main(): + if len(sys.argv) != 2: + sys.stderr.write("Usage: {0} \n".format(sys.argv[0])) + raise SystemExit(1) + + words = {} + for line in sys.stdin.readlines(): + parts = line.strip().split() + words[parts[1]] = words.get(parts[1], 0) + int(parts[0]) + + for word, count in words.iteritems(): + if count >= int(sys.argv[1]): + print ("{0} {1}".format(count, word)) + + +if __name__ == '__main__': + main() diff --git a/egs/bn/s5/local/lm/text_normalization.py b/egs/bn/s5/local/lm/text_normalization.py new file mode 100644 index 00000000000..f74da60a6ef --- /dev/null +++ b/egs/bn/s5/local/lm/text_normalization.py @@ -0,0 +1,42 @@ + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +"""This module contains methods for doing text normalization of broadcast news +and similar text corpora. +""" + +import re + + +def normalize_bn_transcript(text, noise_word, spoken_noise_word): + """Normalize broadcast news transcript for audio.""" + text.upper() + # Remove unclear speech markings + text = re.sub(r"\(\(([^)]*)\)\)", r"\1", text) + text = re.sub(r"#", "", text) # Remove overlapped speech markings + # Remove invented word markings + text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text) + text = re.sub(r"\[[^]]+\]", noise_word, text) + text = re.sub(r"\{[^}]+\}", spoken_noise_word, text) + text = re.sub(r"\+([^+]+)\+", r"\1", text) + + text1 = [] + for word in text.split(): + # Remove mispronunciation brackets + word = re.sub(r"^@(\w+)$", r"\1", word) + text1.append(word) + return " ".join(text1) + + +def remove_punctuations(text): + """Remove punctuations and some other processing for text sentence.""" + text1 = re.sub("\n", " ", text) + text1 = re.sub(r"(&[^;]+;|--)", " ", text1) + text1 = re.sub(r"''|``|\(|\)", " ", text1) + text1 = re.sub("[^A-Za-z0-9.' _-]", "", text1) + text1 = re.sub(r"\. 
", " ", text1) + text1 = re.sub(r"([^0-9$-])\.([^0-9]|$)", r"\1\2", text1) + text1 = re.sub(r" - ", " ", text1) + text1 = re.sub(r"[ ]+", " ", text1) + return text1 diff --git a/egs/bn/s5/local/normalize_transcripts.pl b/egs/bn/s5/local/normalize_transcripts.pl new file mode 100755 index 00000000000..cccf75def4a --- /dev/null +++ b/egs/bn/s5/local/normalize_transcripts.pl @@ -0,0 +1,47 @@ +#!/usr/bin/env perl +# Copyright 2010-2011 Microsoft Corporation + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + + +# This takes data from the standard input that's unnormalized transcripts in the format +# 4k2c0308 Of course there isn\'t any guarantee the company will keep its hot hand [misc_noise] +# 4k2c030a [loud_breath] And new hardware such as the set of personal computers I\. B\. M\. introduced last week can lead to unexpected changes in the software business [door_slam] +# and outputs normalized transcripts. +# c.f. /mnt/matylda2/data/WSJ0/11-10.1/wsj0/transcrp/doc/dot_spec.doc + +@ARGV == 2 || die "usage: normalize_transcript.pl noise_word < transcript > transcript2"; +$noise_word = shift @ARGV; +$spoken_noise_word = shift @ARGV; + +while() { + $_ =~ m:^(\S+) (.+): || die "bad line $_"; + $utt = $1; + $trans = $2; + print "$utt"; + + $trans =~ tr:a-z:A-Z:; + $trans =~ s:\(\(([^)]*)\)\):$1 :g; # Remove unclear speech markings + $trans =~ s:#: :g; # Remove overlapped speech markings + $trans =~ s:\*\*([^*]+)\*\*:$1 :g; # Remove invented word markings + $trans =~ s:\[[^]]+\]:$noise_word :g; + $trans =~ s:\{[^}]+\}:$spoken_noise_word :g; + foreach $w (split (" ",$trans)) { + $w =~ s:^[+](.+)[+]$:$1:; # Remove mispronunciation brackets + $w =~ s:^@(.*)$:$1:; # Remove best guesses for proper nouns + print " $w"; + } + print "\n"; +} + diff --git a/egs/bn/s5/local/prepare_dict.sh b/egs/bn/s5/local/prepare_dict.sh new file mode 100755 index 00000000000..441849329e1 --- /dev/null +++ b/egs/bn/s5/local/prepare_dict.sh @@ -0,0 +1,191 @@ +#!/bin/bash + +# Copyright 2010-2012 Microsoft Corporation +# 2012-2014 Johns Hopkins University (Author: Daniel Povey) +# 2015 Guoguo Chen +# 2016 Vimal Manohar + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +# Call this script from one level above, e.g. from the s3/ directory. It puts +# its output in data/local/. 
+ +# The parts of the output of this that will be needed are +# [in data/local/dict/ ] +# lexicon.txt +# extra_questions.txt +# nonsilence_phones.txt +# optional_silence.txt +# silence_phones.txt + +. path.sh +. cmd.sh + +set -e +set -o pipefail +set -u + +# run this from ../ +dict_suffix= + +echo "$0 $@" # Print the command line for logging +. utils/parse_options.sh || exit 1; + +if [ $# -ne 1 ]; then + echo "Usage: $0 " + echo "e.g. : $0 data/local/local_lm/data/work/wordlist" + exit 1 +fi + +wordlist=$1 + +dir=data/local/dict${dict_suffix} +mkdir -p $dir + +if [ ! -d $dir/cmudict ]; then + # (1) Get the CMU dictionary + svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict \ + $dir/cmudict || exit 1; +fi + +# can add -r 10966 for strict compatibility. + + +#(2) Dictionary preparation: + + +# Make phones symbol-table (adding in silence and verbal and non-verbal noises at this point). +# We are adding suffixes _B, _E, _S for beginning, ending, and singleton phones. + +# silence phones, one per line. +(echo SIL; echo SPN; echo NSN) > $dir/silence_phones.txt +echo SIL > $dir/optional_silence.txt + +# nonsilence phones; on each line is a list of phones that correspond +# really to the same base phone. +cat $dir/cmudict/cmudict.0.7a.symbols | perl -ane 's:\r::; print;' | \ + perl -e 'while(<>){ + chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_"; + $phones_of{$1} .= "$_ "; } + foreach $list (values %phones_of) {print $list . "\n"; } ' \ + > $dir/nonsilence_phones.txt || exit 1; + +# A few extra questions that will be added to those obtained by automatically clustering +# the "real" phones. These ask about stress; there's also one for silence. +cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1; +cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) { + $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \ + >> $dir/extra_questions.txt || exit 1; + +grep -v ';;;' $dir/cmudict/cmudict.0.7a | \ + perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \ + > $dir/dict.cmu || exit 1; + +# Add to cmudict the silences, noises etc. + +(echo '!SIL SIL'; echo ' SPN'; echo ' SPN'; echo ' NSN'; ) | \ + cat - $dir/dict.cmu > $dir/lexicon2_raw.txt +awk '{print $1}' $dir/lexicon2_raw.txt > $dir/orig_wordlist + +cat <$dir/silence_phones.txt +SIL +SPN +NSN +EOF + +if [ ! -f exp/g2p/.done ]; then + steps/dict/train_g2p.sh --cmd "$train_cmd" \ + --silence-phones $dir/silence_phones.txt \ + $dir/dict.cmu exp/g2p + touch exp/g2p/.done +fi + +cat $wordlist | python -c ' +import sys + +words = {} +for line in open(sys.argv[1]).readlines(): + words[line.strip()] = 1 + +oovs = {} +for line in sys.stdin.readlines(): + word = line.strip() + if word not in words: + oovs[word] = 1 + +for oov in oovs: + print (oov)' $dir/orig_wordlist | sort -u > $dir/oovlist + +export PATH=$PATH:`pwd`/local/dict + +cat $dir/oovlist | get_acronym_prons.pl $dir/lexicon2_raw.txt > $dir/dict.acronyms + +mkdir -p $dir/f $dir/b # forward, backward directions of rules... + # forward is normal suffix + # rules, backward is reversed (prefix rules). These + # dirs contain stuff we create while making the rule-based + # extensions to the dictionary. + +# Remove ; and , from words, if they are present; these +# might crash our scripts, as they are used as separators there. 
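+# (Illustrative example of what the rule mining below can learn: CMUdict has
+# "BAKE  B EY1 K" and "BAKED  B EY1 K T", so a suffix rule roughly of the
+# form "spelling +D => pronunciation +T" becomes a candidate; score_prons.pl
+# then vets such rules against the rest of the dictionary before they are
+# applied to the OOV list.)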
+filter_dict.pl $dir/dict.cmu > $dir/f/dict +cat $dir/oovlist | filter_dict.pl > $dir/f/oovs +reverse_dict.pl $dir/f/dict > $dir/b/dict +reverse_dict.pl $dir/f/oovs > $dir/b/oovs + +# The next stage takes a few minutes. +# Note: the forward stage takes longer, as English is +# mostly a suffix-based language, and there are more rules +# that it finds. +for d in $dir/f $dir/b; do + ( + cd $d + cat dict | get_rules.pl 2>get_rules.log >rules + get_rule_hierarchy.pl rules >hierarchy + awk '{print $1}' dict | get_candidate_prons.pl rules dict | \ + limit_candidate_prons.pl hierarchy | \ + score_prons.pl dict | \ + count_rules.pl >rule.counts + # the sort command below is just for convenience of reading. + score_rules.pl rules.with_scores + get_candidate_prons.pl rules.with_scores dict oovs | \ + limit_candidate_prons.pl hierarchy > oovs.candidates + ) & +done +wait + +# Merge the candidates. +reverse_candidates.pl $dir/b/oovs.candidates | cat - $dir/f/oovs.candidates | sort > $dir/oovs.candidates +select_candidate_prons.pl <$dir/oovs.candidates | awk -F';' '{printf("%s %s\n", $1, $2);}' \ + > $dir/dict.oovs + +cat $dir/dict.acronyms $dir/dict.oovs | sort | uniq > $dir/dict.oovs_merged +awk '{print $1}' $dir/dict.oovs_merged | uniq > $dir/oovlist.handled +sort $dir/oovlist | diff - $dir/oovlist.handled | grep -v 'd' | sed 's:< ::' > $dir/oovlist.not_handled + +steps/dict/apply_g2p.sh --cmd "$train_cmd" \ + $dir/oovlist.not_handled exp/g2p exp/g2p/oov_lex +cat exp/g2p/oov_lex/lexicon.lex | cut -f 1,3 | awk '{if (NF > 1) print $0}' > \ + $dir/dict.oovs_g2p + +# the sort | uniq is to remove a duplicated pron from cmudict. +cat $dir/lexicon2_raw.txt $dir/dict.oovs_merged $dir/dict.oovs_g2p | sort | uniq > \ + $dir/lexicon.txt || exit 1; +# lexicon.txt is without the _B, _E, _S, _I markers. + +rm $dir/lexiconp.txt 2>/dev/null || true + +echo "Dictionary preparation succeeded" + + diff --git a/egs/bn/s5/local/run_cleanup_segmentation.sh b/egs/bn/s5/local/run_cleanup_segmentation.sh new file mode 100755 index 00000000000..0927b9f9a7d --- /dev/null +++ b/egs/bn/s5/local/run_cleanup_segmentation.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +# Copyright 2016 Vimal Manohar +# 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 + +# This script demonstrates how to re-segment training data selecting only the +# "good" audio that matches the transcripts. +# The basic idea is to decode with an existing in-domain acoustic model, and a +# biased language model built from the reference, and then work out the +# segmentation from a ctm like file. + +# For nnet3 and chain results after cleanup, see the scripts in +# local/nnet3/run_tdnn.sh and local/chain/run_tdnn.sh + +# GMM Results for speaker-independent (SI) and speaker adaptive training (SAT) systems on dev and test sets +# [will add these later]. + +set -e +set -o pipefail +set -u + +stage=0 +cleanup_stage=0 +data=data/train +cleanup_affix=cleaned +srcdir=exp/tri3 +nj=100 +decode_nj=16 +decode_num_threads=4 + +. ./path.sh +. ./cmd.sh +. utils/parse_options.sh + +cleaned_data=${data}_${cleanup_affix} + +dir=${srcdir}_${cleanup_affix}_work +cleaned_dir=${srcdir}_${cleanup_affix} + +if [ $stage -le 1 ]; then + # This does the actual data cleanup. 
+ steps/cleanup/clean_and_segment_data.sh --stage $cleanup_stage --nj $nj --cmd "$train_cmd" \ + $data data/lang_nosp $srcdir $dir $cleaned_data +fi + +if [ $stage -le 2 ]; then + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + $cleaned_data data/lang_nosp $srcdir ${srcdir}_ali_${cleanup_affix} +fi + +if [ $stage -le 3 ]; then + steps/train_sat.sh --cmd "$train_cmd" \ + 4200 40000 $cleaned_data data/lang_nosp ${srcdir}_ali_${cleanup_affix} ${cleaned_dir} +fi + +if [ $stage -le 4 ]; then + # Test with the models trained on cleaned-up data. + utils/mkgraph.sh data/lang_nosp_test ${cleaned_dir} ${cleaned_dir}/graph_nosp + + for dset in eval98.pem; do + steps/decode_fmllr.sh --nj $decode_nj --num-threads $decode_num_threads \ + --cmd "$decode_cmd" --num-threads 4 \ + ${cleaned_dir}/graph_nosp data/${dset} ${cleaned_dir}/decode_nosp_${dset} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_nosp_test data/lang_nosp_test_rescore \ + data/${dset} ${cleaned_dir}/decode_nosp_${dset} ${cleaned_dir}/decode_nosp_${dset}_rescore + done +fi + +if [ $stage -le 5 ]; then + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + $cleaned_data data/lang_nosp ${cleaned_dir} ${cleaned_dir}_ali_${cleanup_affix} +fi + +if [ $stage -le 6 ]; then + steps/train_sat.sh --cmd "$train_cmd" \ + 5000 100000 $cleaned_data data/lang_nosp \ + ${cleaned_dir}_ali_${cleanup_affix} exp/tri4b_${cleanup_affix} +fi + +cleaned_dir=exp/tri4b_${cleanup_affix} +if [ $stage -le 7 ]; then + # Test with the models trained on cleaned-up data. + utils/mkgraph.sh data/lang_nosp_test ${cleaned_dir} ${cleaned_dir}/graph_nosp + + for dset in eval98.pem; do + steps/decode_fmllr.sh --nj $decode_nj --num-threads $decode_num_threads \ + --cmd "$decode_cmd" --num-threads 4 \ + ${cleaned_dir}/graph_nosp data/${dset} ${cleaned_dir}/decode_nosp_${dset} + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_nosp_test data/lang_nosp_test_rescore \ + data/${dset} ${cleaned_dir}/decode_nosp_${dset} ${cleaned_dir}/decode_nosp_${dset}_rescore + done +fi diff --git a/egs/bn/s5/local/score.sh b/egs/bn/s5/local/score.sh new file mode 120000 index 00000000000..d89286dc25a --- /dev/null +++ b/egs/bn/s5/local/score.sh @@ -0,0 +1 @@ +score_sclite.sh \ No newline at end of file diff --git a/egs/bn/s5/local/score_sclite.sh b/egs/bn/s5/local/score_sclite.sh new file mode 100755 index 00000000000..20045c2e96b --- /dev/null +++ b/egs/bn/s5/local/score_sclite.sh @@ -0,0 +1,94 @@ +#!/bin/bash +# Copyright Johns Hopkins University (Author: Daniel Povey) 2012. Apache 2.0. + +# begin configuration section. +cmd=run.pl +stage=0 +min_lmwt=5 +max_lmwt=17 +iter=final +word_ins_penalty=0.0,0.5,1.0 +#end configuration section. + +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: local/score_sclite.sh [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --stage (0|1|2) # start scoring script from part-way through." + echo " --min_lmwt # minumum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + exit 1; +fi + +data=$1 +lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied. +dir=$3 + +model=$dir/../$iter.mdl # assume model one level up from decoding dir. + +hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl +[ ! 
diff --git a/egs/bn/s5/local/score.sh b/egs/bn/s5/local/score.sh
new file mode 120000
index 00000000000..d89286dc25a
--- /dev/null
+++ b/egs/bn/s5/local/score.sh
@@ -0,0 +1 @@
+score_sclite.sh
\ No newline at end of file
diff --git a/egs/bn/s5/local/score_sclite.sh b/egs/bn/s5/local/score_sclite.sh
new file mode 100755
index 00000000000..20045c2e96b
--- /dev/null
+++ b/egs/bn/s5/local/score_sclite.sh
@@ -0,0 +1,94 @@
+#!/bin/bash
+# Copyright Johns Hopkins University (Author: Daniel Povey) 2012.  Apache 2.0.

+# begin configuration section.
+cmd=run.pl
+stage=0
+min_lmwt=5
+max_lmwt=17
+iter=final
+word_ins_penalty=0.0,0.5,1.0
+#end configuration section.

+[ -f ./path.sh ] && . ./path.sh
+. parse_options.sh || exit 1;

+if [ $# -ne 3 ]; then
+  echo "Usage: local/score_sclite.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
+  echo " Options:"
+  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
+  echo "    --stage (0|1|2)                 # start scoring script from part-way through."
+  echo "    --min_lmwt <int>                # minimum LM-weight for lattice rescoring "
+  echo "    --max_lmwt <int>                # maximum LM-weight for lattice rescoring "
+  exit 1;
+fi

+data=$1
+lang=$2 # Note: may be graph directory not lang directory, but has the necessary stuff copied.
+dir=$3

+model=$dir/../$iter.mdl # assume model one level up from decoding dir.

+hubscr=$KALDI_ROOT/tools/sctk/bin/hubscr.pl
+[ ! -f $hubscr ] && echo "Cannot find scoring program at $hubscr" && exit 1;
+hubdir=`dirname $hubscr`

+for f in $data/stm $data/glm $lang/words.txt $lang/phones/word_boundary.int \
+     $model $data/segments $data/reco2file_and_channel $dir/lat.1.gz; do
+  [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1;
+done

+if [ -f $dir/../frame_shift ]; then
+  frame_shift_opt="--frame-shift=$(cat $dir/../frame_shift)"
+  echo "$0: $dir/../frame_shift exists, using $frame_shift_opt"
+elif [ -f $dir/../frame_subsampling_factor ]; then
+  factor=$(cat $dir/../frame_subsampling_factor) || exit 1
+  frame_shift_opt="--frame-shift=0.0$factor"
+  echo "$0: $dir/../frame_subsampling_factor exists, using $frame_shift_opt"
+fi

+name=`basename $data`; # e.g. eval2000

+mkdir -p $dir/scoring/log

+if [ $stage -le 0 ]; then
+  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+    $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/get_ctm.LMWT.${wip}.log \
+      mkdir -p $dir/score_LMWT_${wip}/ '&&' \
+      lattice-scale --lm-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
+      lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
+      lattice-1best ark:- ark:- \| \
+      lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \
+      nbest-to-ctm $frame_shift_opt ark:- - \| \
+      utils/int2sym.pl -f 5 $lang/words.txt \| \
+      utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \
+      '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1;
+  done
+fi

+if [ $stage -le 1 ]; then
+  # Remove some stuff we don't want to score, from the ctm.
+  # the big expression in parentheses contains all the things that get mapped
+  # by the glm file, into hesitations.
+  # The -$ expression removes partial words.
+  # the aim here is to remove all the things that appear in the reference as optionally
+  # deletable (inside parentheses), as if we delete these there is no loss, while
+  # if we get them correct there is no gain.
+  for x in $dir/score_*/$name.ctm; do
+    cp $x $dir/tmpf;
+    cat $dir/tmpf | grep -i -v -E '<UNK>' | \
+      grep -i -v -E ' (UH|UM|EH|MM|HM|AH|HUH|HA|ER|OOF|HEE|ACH|EEE|EW)$' | \
+      grep -v -- '-$' > $x;
+  done
+fi
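+
+# For example, with the filters above a ctm line like "rec1 1 12.34 0.28 UM",
+# or one ending in a partial word such as "rec1 1 13.00 0.30 INTER-", would
+# be dropped before scoring (made-up lines, just to illustrate the greps).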
+
+# Score the set...
+if [ $stage -le 2 ]; then
+  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+    $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.${wip}.log \
+      cp $data/stm $dir/score_LMWT_${wip}/ '&&' \
+      $hubscr -p $hubdir -V -l english -h hub4 -g $data/glm -r $dir/score_LMWT_${wip}/stm $dir/score_LMWT_${wip}/${name}.ctm || exit 1;
+  done
+fi

+exit 0
diff --git a/egs/bn/s5/local/train_lm.sh b/egs/bn/s5/local/train_lm.sh
new file mode 100755
index 00000000000..d8523ca30f4
--- /dev/null
+++ b/egs/bn/s5/local/train_lm.sh
@@ -0,0 +1,149 @@
+#!/bin/bash

+# Copyright 2016  Vimal Manohar
+# Apache 2.0
+#
+# This script trains an LM on the Broadcast News transcripts.
+# It is based on the example scripts distributed with PocoLM.

+# It first checks that pocolm is installed; if it is not, it prints
+# installation instructions and exits.

+set -e
+set -o pipefail
+set -u

+stage=0

+echo "$0 $@"  # Print the command line for logging
+. utils/parse_options.sh || exit 1;

+dir=data/local/local_lm
+lm_dir=${dir}/data

+mkdir -p $dir
+. ./path.sh || exit 1; # for KALDI_ROOT
+export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
+( # First make sure the pocolm toolkit is installed.
+  cd $KALDI_ROOT/tools || exit 1;
+  if [ -d pocolm ]; then
+    echo Not installing the pocolm toolkit since it is already there.
+  else
+    echo "$0: Please install the PocoLM toolkit with: "
+    echo " cd ../../../tools; extras/install_pocolm.sh; cd -"
+    exit 1;
+  fi
+) || exit 1;

+num_dev_sentences=5000
+RANDOM=0

+if [ $stage -le 0 ]; then
+  mkdir -p ${dir}/data
+  mkdir -p ${dir}/data/text

+  echo "$0: Getting the data sources"

+  rm ${dir}/data/text/* 2>/dev/null || true

+  cat data/train/text | shuf > ${dir}/train_text
+  head -n $num_dev_sentences < ${dir}/train_text | cut -d ' ' -f 2- > ${dir}/data/text/dev.txt
+  tail -n +$[num_dev_sentences+1] < ${dir}/train_text | cut -d ' ' -f 2- > ${dir}/data/text/bn.txt

+  for x in data/local/data/na_news/*; do
+    y=`basename $x`
+    [ -f $x/corpus.gz ] && ln -sf `readlink -f $x/corpus.gz` ${dir}/data/text/${y}.txt.gz
+  done

+  # For reporting perplexities we'll use the "real" dev set (a subset of the
+  # training data is already held out as ${dir}/data/text/dev.txt to work
+  # out interpolation weights).
+  # Note: we can't put it in ${dir}/data/text/, because then pocolm would use
+  # it as one of the data sources.
+  cat data/eval98/stm | awk '!/^;;/ {if (NF > 6) print $0}' | cut -d ' ' -f 1,7- | \
+    local/normalize_transcripts.pl "<NOISE>" "<SPOKEN_NOISE>" | \
+    cut -d ' ' -f 2- > ${dir}/data/real_dev_set.txt
+fi

+if [ $stage -le 1 ]; then
+  mkdir -p $dir/data/work
+  if [ ! -f $dir/data/work/word_counts/.done ]; then
+    get_word_counts.py $dir/data/text $dir/data/work/word_counts
+    touch $dir/data/work/word_counts/.done
+  fi
+fi
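+
+# A sketch of what the merging in the next stage consumes and produces,
+# assuming "<count> <word>" lines (the perl command below also relies on
+# the word being the second field); merge_word_counts.py is given a
+# min-count below which merged entries are discarded:
+#   $ cat a.counts b.counts
+#   12 the
+#   9 the
+#   3 zymurgy
+#   $ cat a.counts b.counts | local/lm/merge_word_counts.py 15
+#   21 the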
+
+if [ $stage -le 2 ]; then
+  for x in data/local/data/na_news/*; do
+    y=$dir/data/work/word_counts/`basename $x`.counts
+    [ -f $y ] && cat $y
+  done | local/lm/merge_word_counts.py 15 > $dir/data/work/na_news.wordlist_counts

+  cat $dir/data/work/word_counts/{bn,dev}.counts | \
+    local/lm/merge_word_counts.py 2 > $dir/data/work/bn.wordlist_counts

+  cat $dir/data/work/na_news.wordlist_counts $dir/data/work/bn.wordlist_counts | \
+    perl -ane 'if ($F[1] =~ m/[A-Za-z]/) { print "$F[1]\n"; }' | \
+    sort -u > $dir/data/work/wordlist
+fi

+order=4
+wordlist=$dir/data/work/wordlist

+min_counts='default=5 bn=1'

+lm_name="`basename ${wordlist}`_${order}"
+if [ -n "${min_counts}" ]; then
+  lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`"
+fi
+unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm

+if [ $stage -le 3 ]; then
+  # We pass in the vocabulary (decided on above) via --wordlist; see the
+  # train_lm.py options if you need to restrict the max memory for 'sort'.
+  echo "$0: training the unpruned LM"
+  train_lm.py --wordlist=$wordlist --num-splits=10 --warm-start-ratio=20 \
+    --limit-unk-history=true \
+    --fold-dev-into=bn \
+    --min-counts="${min_counts}" \
+    ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir}

+  get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity'
+  #[perplexity = 157.87] over 18290.0 words

+  mkdir -p ${dir}/data/arpa
+  format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram.arpa.gz
+fi

+if [ $stage -le 4 ]; then
+  echo "$0: pruning the LM (to larger size)"
+  # Using 10 million n-grams for a big LM for rescoring purposes.
+  size=10000000
+  prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big

+  get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity'

+  # current results, after adding --limit-unk-history=true:
+  # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_big was -5.16562818753 per word [perplexity = 175.147449465] over 18290.0 words.

+  mkdir -p ${dir}/data/arpa
+  format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz
+fi

+if [ $stage -le 5 ]; then
+  echo "$0: pruning the LM (to smaller size)"
+  # Using 2 million n-grams for a smaller LM for graph building. Prune from the
+  # bigger-pruned LM; it'll be faster.
+  size=2000000
+  prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small

+  get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity'

+  # current results, after adding --limit-unk-history=true (needed for modeling OOVs and not blowing up LG.fst):
+  # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_small was -5.29432352378 per word [perplexity = 199.202824404] over 18290.0 words.

+  format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz
+fi
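+
+# The pruned LMs written above are the ones local/format_lms.sh expects:
+#   ${dir}/data/arpa/${order}gram_small.arpa.gz  (LM for graph building)
+#   ${dir}/data/arpa/${order}gram_big.arpa.gz    (const-arpa rescoring LM)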
+
diff --git a/egs/bn/s5/path.sh b/egs/bn/s5/path.sh
new file mode 100755
index 00000000000..da29adb7b2a
--- /dev/null
+++ b/egs/bn/s5/path.sh
@@ -0,0 +1,6 @@
+export KALDI_ROOT=`pwd`/../../..
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+. $KALDI_ROOT/tools/env.sh
+export LC_ALL=C
diff --git a/egs/bn/s5/run.sh b/egs/bn/s5/run.sh
new file mode 100755
index 00000000000..24c47cb90ba
--- /dev/null
+++ b/egs/bn/s5/run.sh
@@ -0,0 +1,90 @@
+#!/bin/bash

+# Copyright 2016  Vimal Manohar
+# Apache 2.0.

+# See README.txt for more info on data required.

+. cmd.sh
+. path.sh

+set -o pipefail

+mfccdir=`pwd`/mfcc
+nj=40

+local/data_prep/prepare_bn_data.py --split-at-sync=false \
+  /export/corpora5/LDC/LDC97S44 \
+  /export/corpora/LDC/LDC97T22 data/local/data/train

+local/data_prep/prepare_na_news_text_corpus.sh --nj 40 --cmd "$train_cmd" \
+  /export/corpora/LDC/LDC95T21 data/local/data/na_news

+local/data_prep/prepare_1996_csr_hub4_corpus.sh --nj 10 --cmd "$train_cmd" \
+  /export/corpora/LDC/LDC98T31 data/local/data/csr96_hub4

+local/data_prep/prepare_1998_hub4_bn_eng_eval.sh /export/corpora/LDC/LDC2000S86/ \
+  data/local/data/eval98

+local/format_data.sh

+local/train_lm.sh

+local/prepare_dict.sh --dict-suffix "_nosp" \
+  data/local/local_lm/data/work/wordlist

+utils/prepare_lang.sh data/local/dict_nosp \
+  "<unk>" data/local/lang_tmp_nosp data/lang_nosp

+local/format_lms.sh

+for x in train eval98 eval98.pem; do
+  this_nj=$(cat data/$x/utt2spk | wc -l)
+  if [ $this_nj -gt 30 ]; then
+    this_nj=30
+  fi

+  steps/make_mfcc.sh --mfcc-config conf/mfcc.conf --nj $this_nj \
+    --cmd "$train_cmd" \
+    data/$x exp/make_mfcc $mfccdir
+  steps/compute_cmvn_stats.sh data/$x exp/make_mfcc $mfccdir
+  utils/fix_data_dir.sh data/$x
+done

+utils/subset_data_dir.sh --shortest data/train 1000 data/train_1kshort
+utils/subset_data_dir.sh data/train 2000 data/train_2k

+# Note: the --boost-silence option should probably be omitted by default
+# for normal setups. It doesn't always help. [it's to discourage non-silence
+# models from modeling silence.]
+steps/train_mono.sh --boost-silence 1.25 --nj $nj --cmd "$train_cmd" \
+  data/train_1kshort data/lang_nosp exp/mono0a

+steps/align_si.sh --boost-silence 1.25 --nj $nj --cmd "$train_cmd" \
+  data/train_2k data/lang_nosp exp/mono0a exp/mono0a_ali

+steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" 2000 10000 \
+  data/train_2k data/lang_nosp exp/mono0a_ali exp/tri1

+steps/align_si.sh --nj $nj --cmd "$train_cmd" \
+  data/train data/lang_nosp exp/tri1 exp/tri1_ali

+steps/train_lda_mllt.sh --cmd "$train_cmd" 2500 15000 \
+  data/train data/lang_nosp exp/tri1_ali exp/tri2

+steps/align_si.sh --nj $nj --cmd "$train_cmd" \
+  data/train data/lang_nosp exp/tri2 exp/tri2_ali

+steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \
+  data/train data/lang_nosp exp/tri2_ali exp/tri3

+utils/mkgraph.sh data/lang_nosp_test exp/tri3 exp/tri3/graph_nosp

+steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
+  exp/tri3/graph_nosp data/eval98.pem exp/tri3/decode_nosp_eval98.pem
+steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+  data/lang_nosp_test data/lang_nosp_test_rescore \
+  data/eval98.pem exp/tri3/decode_nosp_eval98.pem \
+  exp/tri3/decode_rescore_nosp_eval98.pem

+exit 0
diff --git a/egs/bn/s5/steps b/egs/bn/s5/steps
new file mode 120000
index 00000000000..6e99bf5b5ad
--- /dev/null
+++ b/egs/bn/s5/steps
@@ -0,0 +1 @@
+../../wsj/s5/steps
\ No newline at end of file
diff --git a/egs/bn/s5/utils b/egs/bn/s5/utils
new file mode 120000
index 00000000000..b240885218f
--- /dev/null
+++ b/egs/bn/s5/utils
@@ -0,0 +1 @@
+../../wsj/s5/utils
\ No newline at end of file

From 6e73dec63bd19363b9e633d884ff9917b4b3e932 Mon Sep 17 00:00:00 2001
From: Vimal Manohar
Date: Fri, 6 Jan 2017 18:57:47 -0500
Subject: [PATCH 04/38] bn: Add 1999 BN eval preparation

---
 egs/bn/s5/local/data_prep/hub4_utils.py       | 68 +++++++++++++++++++
 .../prepare_1998_hub4_bn_eng_eval.sh          | 52 ++++----------
 .../prepare_1999_hub4_bn_eng_eval.sh          | 66 ++++++++++++++++++
 3 files changed, 148 insertions(+), 38 deletions(-)
 create mode 100644 
egs/bn/s5/local/data_prep/hub4_utils.py create mode 100644 egs/bn/s5/local/data_prep/prepare_1999_hub4_bn_eng_eval.sh diff --git a/egs/bn/s5/local/data_prep/hub4_utils.py b/egs/bn/s5/local/data_prep/hub4_utils.py new file mode 100644 index 00000000000..a5f11f67c31 --- /dev/null +++ b/egs/bn/s5/local/data_prep/hub4_utils.py @@ -0,0 +1,68 @@ +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +"""This module contains utilities for preparing the HUB4 broadcast news +evaluation corpora. +""" + +import sys +import os + + +def parse_uem_line(reco, line): + """This method parses a 'line' from the UEM for recording 'reco' + and returns the line converted to kaldi segments format. + The format of UEM is + + + We force the channel to be 1 and take the file-id to be the recording-id. + """ + line = line.strip() + if len(line) == 0 or line[0:2] == ";;": + continue + parts = line.split() + + # The channel ID is expected to be 1. + if parts[1] != "1": + raise TypeError("Invalid line {0}".format(line)) + + start_time = float(parts[2]) + end_time = float(parts[3]) + + utt = "{0}-{1:06d}-{2:06d}".format(reco, int(start_time * 100), + int(end_time * 100)) + return "{0} {1} {2} {3}".format(utt, reco, start_time, end_time) + + +def parse_cmu_seg_line(reco, line): + """This line parses a 'line' from the CMU automatic segmentation for + recording 'reco'. + The CMU segmentation has the following format: + + + We force the channel to be 1 and take the file-id to be the recording-id. + """ + line = line.strip() + if len(line) == 0 or line[0:2] == ";;": + continue + parts = line.split() + + # Actually a file, but we assuming 1-1 mapping to recording and force + # channel to be 1. + reco = parts[0] + + # The channel ID is expected to be 1. + if parts[1] != "1": + raise TypeError("Invalid line {0}".format(line)) + spk = parts[2] + start_time = float(parts[3]) + end_time = float(parts[4]) + + utt = "{spk}-{0}-{1:06d}-{2:06d}".format(reco, int(start_time * 100), + int(end_time * 100), spk=spk) + + segment_line = "{0} {1} {st:.3f} {end:.3f}".format( + utt, reco, st=start_time, end=end_time) + utt2spk_line = "{0} {1}".format(utt, spk) + + return (segment_line, utt2spk_line) diff --git a/egs/bn/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh b/egs/bn/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh index a167c2cfee0..f990adbd74a 100755 --- a/egs/bn/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh +++ b/egs/bn/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh @@ -5,7 +5,7 @@ if [ $# -ne 2 ]; then echo "Usage: $0 " - echo "local/prepare_1998_hub4_bn_eng_eval.sh /export/corpora/LDC/LDC2000S86/ data/local/data/eval98" + echo "$0 /export/corpora/LDC/LDC2000S86/ data/local/data/eval98" exit 1 fi @@ -14,55 +14,31 @@ dir=$2 mkdir -p $dir +if [ ! 
-d $SOURCE_DIR/h4e_evl/ ]; then + echo "$0: Invalid SOURCE-DIR for LDC2000S86 corpus" + exit 1 +fi + for uem in $SOURCE_DIR/h4e_evl/h4e_98_{1,2}.uem; do python -c ' import sys, os +import hub4_utils uem = sys.argv[1] reco, ext = os.path.splitext(os.path.basename(uem)) for line in open(uem).readlines(): line = line.strip() - if len(line) == 0 or line[0:2] == ";;": - continue - parts = line.split() - - assert parts[1] == "1" - start_time = float(parts[2]) - end_time = float(parts[3]) - - utt = "{0}-{1:06d}-{2:06d}".format(reco, int(start_time * 100), - int(end_time * 100)) - print ("{0} {1} {2} {3}".format(utt, reco, start_time, end_time))' $uem + print (parse_uem_line(line))' $uem done > $dir/segments cat $SOURCE_DIR/h4e_evl/h4e_98_{1,2}.seg | \ python -c ' -from __future__ import print_function import sys - -segments_handle = open(sys.argv[1], "w") -utt2spk_handle = open(sys.argv[2], "w") -for line in sys.stdin.readlines(): - line = line.strip() - if len(line) == 0 or line[0:2] == ";;": - continue - parts = line.split() - - reco = parts[0] - assert parts[1] == "1" - spk = parts[2] - start_time = float(parts[3]) - end_time = float(parts[4]) - - utt = "{spk}-{0}-{1:06d}-{2:06d}".format(reco, int(start_time * 100), - int(end_time * 100), spk=spk) - - print ("{0} {1} {2} {3}".format(utt, reco, start_time, end_time), - file=segments_handle) - print ("{0} {1}".format(utt, spk), - file=utt2spk_handle) -segments_handle.close() -utt2spk_handle.close() -' $dir/segments.pem $dir/utt2spk.pem +with open(sys.argv[1], "w") as s_f, open(sys.argv[2], "w") as u_f: + for line in sys.stdin.readlines(): + segments_line, utt2spk_line = parse_cmu_seg_line(reco, line) + s_f.write("{0}\n".format(segments_line)) + u_f.write("{0}\n".format(utt2spk_line))' \ + $dir/segments.pem $dir/utt2spk.pem export PATH=$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5 sph2pipe=`which sph2pipe` || { echo "sph2pipe not found in PATH."; exit 1; } diff --git a/egs/bn/s5/local/data_prep/prepare_1999_hub4_bn_eng_eval.sh b/egs/bn/s5/local/data_prep/prepare_1999_hub4_bn_eng_eval.sh new file mode 100644 index 00000000000..133b56b5b36 --- /dev/null +++ b/egs/bn/s5/local/data_prep/prepare_1999_hub4_bn_eng_eval.sh @@ -0,0 +1,66 @@ +#! /bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + echo "$0 /export/corpora5/LDC/LDC2000S88/ data/local/data/eval99" + exit 1 +fi + +SOURCE_DIR=$1 +dir=$2 + +mkdir -p $dir + +if [ ! 
-d $SOURCE_DIR/hub4_1999/ ]; then + echo "$0: Invalid SOURCE-DIR for LDC2000S88 corpus" + exit 1 +fi + +for uem in $SOURCE_DIR/hub4_1999/bnews_99/bn99en_{1,2}.uem; do + python -c ' +import sys, os +import hub4_utils +uem = sys.argv[1] +reco, ext = os.path.splitext(os.path.basename(uem)) +for line in open(uem).readlines(): + print (parse_uem_line(line))' $uem +done > $dir/segments + +awk '{print $1" "$2}' $dir/segments > $dir/utt2spk + +cat $SOURCE_DIR/hub4_1999/bnews_99/bn99en_{1,2}.seg | \ + python -c ' +import sys +with open(sys.argv[1], "w") as s_f, open(sys.argv[2], "w") as u_f: + for line in sys.stdin.readlines(): + segments_line, utt2spk_line = parse_cmu_seg_line(reco, line) + s_f.write("{0}\n".format(segments_line)) + u_f.write("{0}\n".format(utt2spk_line))' \ + $dir/segments.pem $dir/utt2spk.pem + +export PATH=$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5 +sph2pipe=`which sph2pipe` || { echo "sph2pipe not found in PATH."; exit 1; } +for x in `ls $SOURCE_DIR/h4e_evl/*.sph`; do + y=`basename $x` + z=${y%.sph} + echo "$z $sph2pipe -f wav $x |"; +done > $dir/wav.scp + +awk '{print $1" "$1" 1"}' $dir/wav.scp > $dir/reco2file_and_channel + +cp $SOURCE_DIR/hub4_1999/bnews99/en981118.glm $dir/en981118.glm +cp $SOURCE_DIR/hub4_1999/bnews99/bn99en_1.stm $dir/bn99en_1.stm + +cp $SOURCE_DIR/hub4_1999/bnews99/en991231.glm $dir/en991231.glm +cp $SOURCE_DIR/hub4_1999/bnews99/bn99en_2.stm $dir/bn99en_2.stm + +utils/fix_data_dir.sh $dir +utils/copy_data_dir.sh $dir ${dir}.pem +cp $dir/*.stm ${dir}.pem/ + +cp $dir/segments.pem ${dir}.pem/segments +cp $dir/utt2spk.pem ${dir}.pem/utt2spk +utils/fix_data_dir.sh ${dir}.pem From 917a67059070f1c4e35b295830583c9bd36ada54 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 10 Jan 2017 20:22:00 -0500 Subject: [PATCH 05/38] bn: Add more data preparation scripts --- egs/bn/s5/local/data_prep/hub4_utils.py | 120 +++++++- .../data_prep/prepare_1995_csr_hub4_corpus.sh | 59 ++++ ...are_bn_data.py => prepare_1996_bn_data.py} | 70 +++-- ....sh => prepare_1996_csr_hub4_lm_corpus.sh} | 19 +- .../prepare_1996_hub4_bn_eng_dev_and_eval.sh | 99 +++++++ .../local/data_prep/prepare_1997_bn_data.py | 2 + .../prepare_1997_hub4_bn_eng_eval.sh | 64 ++++ .../prepare_1998_hub4_bn_eng_eval.sh | 18 +- .../prepare_1999_hub4_bn_eng_eval.sh | 74 ++--- .../data_prep/prepare_na_news_text_corpus.sh | 3 + .../prepare_na_news_text_supplement.sh | 61 ++++ .../data_prep/process_1995_bn_annotation.py | 273 ++++++++++++++++++ .../local/data_prep/process_na_news_text.py | 37 ++- 13 files changed, 816 insertions(+), 83 deletions(-) create mode 100755 egs/bn/s5/local/data_prep/prepare_1995_csr_hub4_corpus.sh rename egs/bn/s5/local/data_prep/{prepare_bn_data.py => prepare_1996_bn_data.py} (74%) rename egs/bn/s5/local/data_prep/{prepare_1996_csr_hub4_corpus.sh => prepare_1996_csr_hub4_lm_corpus.sh} (59%) create mode 100755 egs/bn/s5/local/data_prep/prepare_1996_hub4_bn_eng_dev_and_eval.sh create mode 100755 egs/bn/s5/local/data_prep/prepare_1997_bn_data.py create mode 100755 egs/bn/s5/local/data_prep/prepare_1997_hub4_bn_eng_eval.sh mode change 100644 => 100755 egs/bn/s5/local/data_prep/prepare_1999_hub4_bn_eng_eval.sh create mode 100644 egs/bn/s5/local/data_prep/prepare_na_news_text_supplement.sh create mode 100755 egs/bn/s5/local/data_prep/process_1995_bn_annotation.py diff --git a/egs/bn/s5/local/data_prep/hub4_utils.py b/egs/bn/s5/local/data_prep/hub4_utils.py index a5f11f67c31..b43de80c73b 100644 --- a/egs/bn/s5/local/data_prep/hub4_utils.py +++ b/egs/bn/s5/local/data_prep/hub4_utils.py @@ -5,8 
+5,9 @@ evaluation corpora. """ -import sys import os +import re +import sys def parse_uem_line(reco, line): @@ -19,9 +20,12 @@ def parse_uem_line(reco, line): """ line = line.strip() if len(line) == 0 or line[0:2] == ";;": - continue + return None parts = line.split() + if reco is None: + reco = parts[0] + # The channel ID is expected to be 1. if parts[1] != "1": raise TypeError("Invalid line {0}".format(line)) @@ -34,17 +38,17 @@ def parse_uem_line(reco, line): return "{0} {1} {2} {3}".format(utt, reco, start_time, end_time) -def parse_cmu_seg_line(reco, line): +def parse_cmu_seg_line(line, prepend_reco_to_spk=False): """This line parses a 'line' from the CMU automatic segmentation for - recording 'reco'. + recording. The CMU segmentation has the following format: - + We force the channel to be 1 and take the file-id to be the recording-id. """ line = line.strip() if len(line) == 0 or line[0:2] == ";;": - continue + return None parts = line.split() # Actually a file, but we assuming 1-1 mapping to recording and force @@ -55,14 +59,116 @@ def parse_cmu_seg_line(reco, line): if parts[1] != "1": raise TypeError("Invalid line {0}".format(line)) spk = parts[2] + start_time = float(parts[3]) end_time = float(parts[4]) - utt = "{spk}-{0}-{1:06d}-{2:06d}".format(reco, int(start_time * 100), + if prepend_reco_to_spk: + spk = reco + '-' + spk + utt = "{spk}-{0:06d}-{1:06d}".format(int(start_time * 100), int(end_time * 100), spk=spk) + else: + utt = "{spk}-{reco}-{0:06d}-{1:06d}".format(int(start_time * 100), + int(end_time * 100), + reco=reco, spk=spk) segment_line = "{0} {1} {st:.3f} {end:.3f}".format( utt, reco, st=start_time, end=end_time) utt2spk_line = "{0} {1}".format(utt, spk) return (segment_line, utt2spk_line) + + +def normalize_bn_transcript(text, noise_word, spoken_noise_word): + """Normalize broadcast news transcript for audio.""" + text = text.upper() + # Remove unclear speech markings + text = re.sub(r"\(\(([^)]*)\)\)", r"\1", text) + text = re.sub(r"#", "", text) # Remove overlapped speech markings + # Remove invented word markings + text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text) + text = re.sub(r"\[[^]]+\]", noise_word, text) + text = re.sub(r"\{[^}]+\}", spoken_noise_word, text) + # Remove mispronunciation brackets + text = re.sub(r"\+([^+]+)\+", r"\1", text) + + text1 = [] + for word in text.split(): + # Remove best guesses for proper nouns + word = re.sub(r"^@(\w+)$", r"\1", word) + text1.append(word) + return " ".join(text1) + + +def normalize_csr_transcript(text, noise_word, spoken_noise_word): + """Normalize broadcast news transcript for audio.""" + text = text.upper() + + # Remove long event markings + text = re.sub(r"\[[^]/]+/\]|\[/[^]/]+\]", "", text) + # Remove comments + text = re.sub(r"\{\{[^}]*\}\}", "", text) + # Replace alternative words with a single one (second alternative) + text = re.sub(r"\{[^}/]+/([^}/]+)[^}]*\}", r"\1", text) + # Remove partial word completions + text = re.sub(r"\([^)]+\)-|-\([^)]+\)", "-", text) + # Remove accent marks and diacritics + text = re.sub(r"\\[3-8]", "", text) + + # Remove unclear speech markings + text = re.sub(r"\(\(([^)]*)\)\)", r"\1", text) + text = re.sub(r"#", "", text) # Remove overlapped speech markings + # Remove invented word markings + text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text) + # Replace speaker-made noises with + text = re.sub(r"\[INHALING\]|\[COUGH\]|\[THROAT_CLEARING\]|\[SIGN\]", + spoken_noise_word, text) + # Replace noise with + text = re.sub(r"\[[^]]+\]", noise_word, text) + text = re.sub(r"\+([^+]+)\+", 
r"\1", text) + + # Remove periods after letter. + text = re.sub(r"([A-Z])\.( |$)", r"\1 ", text) + # Replace \. with . + text = re.sub(r"\\.", r".", text) + + text1 = [] + for word in text.split(): + if word == spoken_noise_word or word == noise_word: + text1.append(word) + continue + + # Remove mispronunciation brackets + word = re.sub(r"^@(\w+)$", r"\1", word) + # Remove everything other than the standard ASCII symbols + word = re.sub("[^A-Za-z0-9.' _-]", "", word) + text1.append(word) + return " ".join(text1) + + +def remove_punctuations(text): + """Remove punctuations and some other processing for text sentence.""" + # Remove HTML new lines that are not end of sentences + text1 = re.sub("\n", " ", text) + + # Remove some markers like double dash that are normally used to separate + # name titles in newspapers. + text1 = re.sub(r"(&[^;]+;|--)", " ", text1) + + # Remove quotation marks + text1 = re.sub(r"''|``|\(|\)", " ", text1) + + # Remove everything other than the standard ASCII symbols + text1 = re.sub("[^A-Za-z0-9.' _-]", "", text1) + + # Replace multiple .'s with single and then remove isolated '.' + text1 = re.sub(r"\.[.]+ ", ".", text1) + text1 = re.sub(r" \. ", " ", text1) + + # Remove isolated '-' + text1 = re.sub(r" - ", " ", text1) + + # Replace multiple spaces with single. + text1 = re.sub(r"[ ]+", " ", text1) + + return text1 diff --git a/egs/bn/s5/local/data_prep/prepare_1995_csr_hub4_corpus.sh b/egs/bn/s5/local/data_prep/prepare_1995_csr_hub4_corpus.sh new file mode 100755 index 00000000000..b199fdc8a48 --- /dev/null +++ b/egs/bn/s5/local/data_prep/prepare_1995_csr_hub4_corpus.sh @@ -0,0 +1,59 @@ +#! /bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +# This script prepares the 1995 CSR-IV HUB4 corpus +# https://catalog.ldc.upenn.edu/LDC96S31 + +set -e +set -o pipefail +set -u + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + echo " e.g.: $0 /export/corpora5/LDC/LDC96S31/csr95_hub4 data/local/data/csr95_hub4" + exit 1 +fi + +SOURCE_DIR=$1 +dir=$2 + +for d in $SOURCE_DIR/csr95/h4/devtst $SOURCE_DIR/csr95/h4/evltst \ + $SOURCE_DIR/csr95/h4/train; do + if [ ! -d $d ]; then + echo "$0: Invalid SOURCE-DIR $SOURCE_DIR for LDC96S31 corpus" + exit 1 + fi +done + +mkdir -p $dir + +for x in `ls $SOURCE_DIR/csr95/h4/*/*.wav`; do + y=`basename $x` + z=${y%.wav} + echo "$z $x" +done > $dir/wav_scp + +cat $dir/wav_scp | grep "csr95/h4/train" > $dir/train95_wav_scp +cat $dir/wav_scp | grep "csr95/h4/devtst" > $dir/dev95_wav_scp +cat $dir/wav_scp | grep "csr95/h4/evltst" > $dir/eval95_wav_scp + +rm $dir/*_{segments,utt2spk,text} || true + +for x in `ls $SOURCE_DIR/csr95/h4/*/*.txt`; do + if [[ $x =~ "csr95/h4/train" ]]; then + local/data_prep/process_1995_bn_annotation.py $x \ + $dir/train95_segments $dir/train95_utt2spk $dir/train95_text + fi + + if [[ $x =~ "csr95/h4/devtst" ]]; then + local/data_prep/process_1995_bn_annotation.py $x \ + $dir/dev95_segments $dir/dev95_utt2spk $dir/dev95_text + fi + + if [[ $x =~ "csr95/h4/evltst" ]]; then + local/data_prep/process_1995_bn_annotation.py $x \ + $dir/eval95_segments $dir/eval95_utt2spk $dir/eval95_text + fi +done diff --git a/egs/bn/s5/local/data_prep/prepare_bn_data.py b/egs/bn/s5/local/data_prep/prepare_1996_bn_data.py similarity index 74% rename from egs/bn/s5/local/data_prep/prepare_bn_data.py rename to egs/bn/s5/local/data_prep/prepare_1996_bn_data.py index b96d0503367..26bc69f572b 100755 --- a/egs/bn/s5/local/data_prep/prepare_bn_data.py +++ b/egs/bn/s5/local/data_prep/prepare_1996_bn_data.py @@ -1,5 +1,13 @@ #! 
/usr/bin/env python +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +"""This script prepares the 1996 English Broadcast News (HUB4) corpus. +https://catalog.ldc.upenn.edu/LDC97S44 +https://catalog.ldc.upenn.edu/LDC97T22 +""" + from __future__ import print_function import argparse import glob @@ -7,6 +15,7 @@ import os import re from bs4 import BeautifulSoup +import hub4_utils logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -20,6 +29,13 @@ def get_args(): parser = argparse.ArgumentParser("Prepare BN corpus.") + parser.add_argument("--noise-word", type=str, default="", + help="""Replace all noise words in transcript + with this noise_word""") + parser.add_argument("--spoken-noise-word", type=str, + default="", + help="""Replace all speaker noise words in transcript + with this spoken_noise_word""") parser.add_argument("--split-at-sync", type=str, choices=["true", "false"], default="false", help="If true, creates separate segments split " @@ -48,36 +64,49 @@ def __init__(self, reco_id, speaker=None): self.text = None self.start_time = -1 self.end_time = -1 - if speaker is not None: - self.speaker = speaker - else: - self.speaker = reco_id + self.speaker = speaker def write_segment(self, out_file): """writes segment in kaldi segments format""" - print("{0} {1} {2} {3}".format(self.utt_id(), self.reco_id, + print("{0} {1} {2} {3}".format(self.get_utt_id(), self.reco_id, self.start_time, self.end_time), file=out_file) def write_utt2spk(self, out_file): """writes speaker information in kaldi utt2spk format""" - print("{0} {1}".format(self.utt_id(), self.speaker), + print("{0} {1}".format(self.get_utt_id(), self.get_spk_id()), file=out_file) - def write_text(self, out_file): - print("{0} {1}".format(self.utt_id(), self.text), - file=out_file) + def write_text(self, out_file, noise_word="", + spoken_noise_word=""): + text = hub4_utils.normalize_bn_transcript( + self.text, noise_word, spoken_noise_word) + if len(text) == 0 or re.match(r"^\s*$", text): + return + print("{0} {1}".format(self.get_utt_id(), text), file=out_file) def check(self): """checks if this is a valid segment""" assert self.end_time > self.start_time - def utt_id(self): + def get_utt_id(self): """returns the utterance id created from the recording id and the timing information""" - return ("{spkr}-{0}-{1:06d}-{2:06d}".format( - self.reco_id, int(self.start_time * 100), - int(self.end_time * 100), spkr=self.speaker)) + if self.speaker is None: + return ("{0}-{1:06d}-{2:06d}".format( + self.reco_id, int(self.start_time * 100), + int(self.end_time * 100))) + else: + return ("{0}-{1:06d}-{2:06d}".format( + self.get_spk_id(), int(self.start_time * 100), + int(self.end_time * 100))) + + def get_spk_id(self): + if self.speaker is None: + return ("{0}-{1:06d}-{2:06d}".format( + self.reco_id, int(self.start_time * 100), + int(self.end_time * 100))) + return "{0}-{1}".format(self.reco_id, self.speaker) def duration(self): """returns the duration of the segment""" @@ -129,7 +158,9 @@ def process_segment_soup(reco_id, soup, split_at_sync=False): def process_transcription(transcription_file, segments_handle, utt2spk_handle, - text_handle, split_at_sync=False): + text_handle, split_at_sync=False, + noise_word="", + spoken_noise_word=""): """Processes transcription file into segments.""" doc = ''.join(open(transcription_file).readlines()) tag_matcher = re.compile(r"(<(Sync|Background)[^>]+>)") @@ -158,13 +189,14 @@ def process_transcription(transcription_file, segments_handle, utt2spk_handle, continue 
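+                # write_text below passes the transcript through
+                # hub4_utils.normalize_bn_transcript, so an annotated line
+                # like "he said ((the)) **word** [laugh]" comes out as
+                # "HE SAID THE WORD" plus the configured noise word
+                # (a made-up line, just to illustrate the markup handling).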
s.write_segment(segments_handle) s.write_utt2spk(utt2spk_handle) - s.write_text(text_handle) + s.write_text(text_handle, noise_word, + spoken_noise_word) except Exception: logger.error("Failed processing segment %s", seg) raise -def _run(args): +def run(args): if not os.path.isdir(args.dir): os.makedirs(args.dir) @@ -186,7 +218,9 @@ def _run(args): try: process_transcription(x, segments_handle, utt2spk_handle, text_handle, - split_at_sync=args.split_at_sync) + split_at_sync=args.split_at_sync, + noise_word=args.noise_word, + spoken_noise_word=args.spoken_noise_word) except Exception: logger.error("Failed to process file %s", x) @@ -199,7 +233,7 @@ def _run(args): def main(): try: args = get_args() - _run(args) + run(args) except Exception: raise diff --git a/egs/bn/s5/local/data_prep/prepare_1996_csr_hub4_corpus.sh b/egs/bn/s5/local/data_prep/prepare_1996_csr_hub4_lm_corpus.sh similarity index 59% rename from egs/bn/s5/local/data_prep/prepare_1996_csr_hub4_corpus.sh rename to egs/bn/s5/local/data_prep/prepare_1996_csr_hub4_lm_corpus.sh index fc20758eec0..444a491c7b8 100755 --- a/egs/bn/s5/local/data_prep/prepare_1996_csr_hub4_corpus.sh +++ b/egs/bn/s5/local/data_prep/prepare_1996_csr_hub4_lm_corpus.sh @@ -3,6 +3,9 @@ # Copyright 2016 Vimal Manohar # Apache 2.0. +# This script prepares the 1996 CSR HUB4 Language Model corpus +# https://catalog.ldc.upenn.edu/LDC98T31 + set -e set -o pipefail set -u @@ -16,7 +19,7 @@ stage=0 if [ $# -ne 2 ]; then echo "Usage: $0 " - echo " e.g.: $0 /export/corpora/LDC/LDC98T31/ data/local/data/csr96_hub4" + echo " e.g.: $0 /export/corpora/LDC/LDC98T31/1996_csr_hub4_model data/local/data/csr96_hub4" exit 1 fi @@ -25,9 +28,13 @@ dir=$2 mkdir -p $dir -ls $SOURCE_DIR/1996_csr_hub4_model/st_train/*.stZ \ - $SOURCE_DIR/1996_csr_hub4_model/st_test/*.stZ | sort > \ - $dir/filelist +for d in $SOURCE_DIR/st_train/ $SOURCE_DIR/st_test/; do + if [ ! -d $d ]; then + echo "$0: Invalid SOURCE-DIR $SOURCE_DIR for LDC98T31 corpus" + exit 1 + fi + ls $d/*.stZ +done | sort > $dir/filelist mkdir -p $dir/split$nj/ @@ -38,13 +45,13 @@ if [ $stage -le 1 ]; then $dir/split$nj/filelist.JOB $dir fi -for x in `ls $SOURCE_DIR/1996_csr_hub4_model/st_train/*.stZ`; do +for x in `ls $SOURCE_DIR/st_train/*.stZ`; do y=`basename $x` name=${y%.stZ} echo $dir/${name}.txt.gz done > $dir/train.filelist -for x in `ls $SOURCE_DIR/1996_csr_hub4_model/st_test/*.stZ`; do +for x in `ls $SOURCE_DIR/st_test/*.stZ`; do y=`basename $x` name=${y%.stZ} echo $dir/${name}.txt.gz diff --git a/egs/bn/s5/local/data_prep/prepare_1996_hub4_bn_eng_dev_and_eval.sh b/egs/bn/s5/local/data_prep/prepare_1996_hub4_bn_eng_dev_and_eval.sh new file mode 100755 index 00000000000..7c11531dda5 --- /dev/null +++ b/egs/bn/s5/local/data_prep/prepare_1996_hub4_bn_eng_dev_and_eval.sh @@ -0,0 +1,99 @@ +#! /bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +# This script prepares 1996 English Broadcast News Dev and Eval (HUB4) +# https://catalog.ldc.upenn.edu/LDC97S66 + +set -e +set -o pipefail + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + echo "$0 /export/corpora/LDC/LDC97S66/1996_eng_bcast_dev_eval data/local/data/hub4_96_dev_eval" + exit 1 +fi + +SOURCE_DIR=$1 +dir=$2 + +mkdir -p $dir + +for d in $SOURCE_DIR/dev/devdata $SOURCE_DIR/eval/evaldata; do + if [ ! 
-d $d ]; then + echo "$0: Invalid SOURCE-DIR $SOURCE_DIR for LDC97S66 corpus" + exit 1 + fi +done + +for d in dev eval; do + if [ $d == "dev" ]; then + suffix=dt + else + suffix=ev + fi + + python -c ' +import sys, os +sys.path.insert(0, "local/data_prep") +import hub4_utils +uem = sys.argv[1] +for line in open(uem).readlines(): + line = hub4_utils.parse_uem_line(None, line) + if line is not None: + print (line)' $SOURCE_DIR/${d}/${d}data/h496${suffix}.uem > $dir/${d}96_uem_segments + awk '{print $1" "$2}' $dir/${d}96_uem_segments > $dir/${d}96_uem_utt2spk +done + +for d in dev eval; do + if [ $d == "dev" ]; then + suffix=dt + else + suffix=ev + fi + + cat $SOURCE_DIR/${d}/${d}data/h496${suffix}.pem | \ + python -c ' +import sys +sys.path.insert(0, "local/data_prep") +import hub4_utils +with open(sys.argv[1], "w") as s_f, open(sys.argv[2], "w") as u_f: + for line in sys.stdin.readlines(): + tup = hub4_utils.parse_cmu_seg_line(line, prepend_reco_to_spk=True) + if tup is not None: + segments_line, utt2spk_line = tup + s_f.write("{0}\n".format(segments_line)) + u_f.write("{0}\n".format(utt2spk_line))' \ + $dir/${d}96_pem_segments $dir/${d}96_pem_utt2spk +done + +export PATH=$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5 +sph2pipe=`which sph2pipe` || { echo "sph2pipe not found in PATH."; exit 1; } + +for x in `ls $SOURCE_DIR/dev/devdata/*.sph`; do + y=`basename $x` + z=${y%.sph} + echo "$z $sph2pipe -f wav $x |"; +done > $dir/dev96_wav_scp + +cat $dir/dev96_pem_segments | awk '{print $2}' | \ + utils/filter_scp.pl /dev/stdin $dir/dev96_wav_scp > $dir/dev96_pem_wav_scp +cat $dir/dev96_uem_segments | awk '{print $2}' | \ + utils/filter_scp.pl /dev/stdin $dir/dev96_wav_scp > $dir/dev96_uem_wav_scp + +for x in `ls $SOURCE_DIR/eval/evaldata/*.sph`; do + y=`basename $x` + z=${y%.sph} + echo "$z $sph2pipe -f wav $x |"; +done > $dir/eval96_wav_scp + +cp $SOURCE_DIR/eval/evaldata/et96_1.glm $dir/glm + +cp $SOURCE_DIR/eval/evaldata/et96_1.utm $dir/eval96_utm +cp $SOURCE_DIR/dev/devdata/et96_1.utm $dir/dev96_utm + +cp $SOURCE_DIR/eval/evaldata/h496ev.stm $dir/eval96_stm + +cp $SOURCE_DIR/dev/devdata/h496dtpe.stm $dir/dev96_pem_stm +cp $SOURCE_DIR/dev/devdata/h496dtue.stm $dir/dev96_uem_stm diff --git a/egs/bn/s5/local/data_prep/prepare_1997_bn_data.py b/egs/bn/s5/local/data_prep/prepare_1997_bn_data.py new file mode 100755 index 00000000000..0dd9b4dca58 --- /dev/null +++ b/egs/bn/s5/local/data_prep/prepare_1997_bn_data.py @@ -0,0 +1,2 @@ + +/export/corpora/LDC/LDC98T28 diff --git a/egs/bn/s5/local/data_prep/prepare_1997_hub4_bn_eng_eval.sh b/egs/bn/s5/local/data_prep/prepare_1997_hub4_bn_eng_eval.sh new file mode 100755 index 00000000000..8ef0817065f --- /dev/null +++ b/egs/bn/s5/local/data_prep/prepare_1997_hub4_bn_eng_eval.sh @@ -0,0 +1,64 @@ +#! /bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +# This script prepares 1997 HUB4 English Evaluation corpus +# https://catalog.ldc.upenn.edu/LDC2002S11 + +set -e +set -o pipefail + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + echo "$0 /export/corpora/LDC/LDC2002S11/hub4e_97 data/local/data/eval97" + exit 1 +fi + +SOURCE_DIR=$1 +dir=$2 + +mkdir -p $dir + +if [ ! 
-d $SOURCE_DIR/h4e_evl/ ]; then + echo "$0: Invalid SOURCE-DIR $SOURCE_DIR for LDC2002S11 corpus" + exit 1 +fi + +for uem in $SOURCE_DIR/h4e_evl/h4e_97.uem; do + python -c ' +import sys, os +sys.path.insert(0, "local/data_prep") +import hub4_utils +uem = sys.argv[1] +reco, ext = os.path.splitext(os.path.basename(uem)) +for line in open(uem).readlines(): + line = hub4_utils.parse_uem_line(reco, line) + if line is not None: + print (line)' $uem +done > $dir/segments +awk '{print $1" "$2}' $dir/segments > $dir/utt2spk + +cat $SOURCE_DIR/h4e_evl/h4e_97.seg | \ + python -c ' +import sys +sys.path.insert(0, "local/data_prep") +import hub4_utils +with open(sys.argv[1], "w") as s_f, open(sys.argv[2], "w") as u_f: + for line in sys.stdin.readlines(): + tup = hub4_utils.parse_cmu_seg_line(line) + if tup is not None: + segments_line, utt2spk_line = tup + s_f.write("{0}\n".format(segments_line)) + u_f.write("{0}\n".format(utt2spk_line))' $dir/segments.pem $dir/utt2spk.pem + +export PATH=$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5 +sph2pipe=`which sph2pipe` || { echo "sph2pipe not found in PATH."; exit 1; } +for x in `ls $SOURCE_DIR/h4e_evl/*.sph`; do + y=`basename $x` + z=${y%.sph} + echo "$z $sph2pipe -f wav $x |"; +done > $dir/wav.scp + +cp $SOURCE_DIR/h4e_evl/h4e_97_1.glm $dir/glm +cp $SOURCE_DIR/h4e_evl/h4e_97.stm $dir/stm diff --git a/egs/bn/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh b/egs/bn/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh index f990adbd74a..ccefc3dcd66 100755 --- a/egs/bn/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh +++ b/egs/bn/s5/local/data_prep/prepare_1998_hub4_bn_eng_eval.sh @@ -3,6 +3,12 @@ # Copyright 2016 Vimal Manohar # Apache 2.0. +# This script prepares 1998 HUB4 Broadcast News Evaluation English Test Material +# https://catalog.ldc.upenn.edu/LDC2000S86 + +set -e +set -o pipefail + if [ $# -ne 2 ]; then echo "Usage: $0 " echo "$0 /export/corpora/LDC/LDC2000S86/ data/local/data/eval98" @@ -29,6 +35,7 @@ for line in open(uem).readlines(): line = line.strip() print (parse_uem_line(line))' $uem done > $dir/segments +awk '{print $1" "$2}' $dir/segments > $dir/utt2spk cat $SOURCE_DIR/h4e_evl/h4e_98_{1,2}.seg | \ python -c ' @@ -48,16 +55,5 @@ for x in `ls $SOURCE_DIR/h4e_evl/*.sph`; do echo "$z $sph2pipe -f wav $x |"; done > $dir/wav.scp -awk '{print $1" "$1" 1"}' $dir/wav.scp > $dir/reco2file_and_channel - cp $SOURCE_DIR/h4e_evl/h4e_98.glm $dir/glm cp $SOURCE_DIR/h4e_evl/h4e_98.stm $dir/stm - -awk '{print $1" "$2}' $dir/segments > $dir/utt2spk - -utils/fix_data_dir.sh $dir -utils/copy_data_dir.sh $dir ${dir}.pem - -cp $dir/segments.pem ${dir}.pem/segments -cp $dir/utt2spk.pem ${dir}.pem/utt2spk -utils/fix_data_dir.sh ${dir}.pem diff --git a/egs/bn/s5/local/data_prep/prepare_1999_hub4_bn_eng_eval.sh b/egs/bn/s5/local/data_prep/prepare_1999_hub4_bn_eng_eval.sh old mode 100644 new mode 100755 index 133b56b5b36..8a6d4d4b8ae --- a/egs/bn/s5/local/data_prep/prepare_1999_hub4_bn_eng_eval.sh +++ b/egs/bn/s5/local/data_prep/prepare_1999_hub4_bn_eng_eval.sh @@ -3,9 +3,15 @@ # Copyright 2016 Vimal Manohar # Apache 2.0. +# This script prepares 1999 HUB4 Broadcast News Evaluation English Test Material +# https://catalog.ldc.upenn.edu/LDC2000S88 + +set -e +set -o pipefail + if [ $# -ne 2 ]; then echo "Usage: $0 " - echo "$0 /export/corpora5/LDC/LDC2000S88/ data/local/data/eval99" + echo "$0 /export/corpora5/LDC/LDC2000S88/hub4_1999 data/local/data/eval99" exit 1 fi @@ -14,53 +20,53 @@ dir=$2 mkdir -p $dir -if [ ! -d $SOURCE_DIR/hub4_1999/ ]; then +if [ ! 
-d $SOURCE_DIR/bnews_99/ ]; then
   echo "$0: Invalid SOURCE-DIR for LDC2000S88 corpus"
   exit 1
 fi
 
-for uem in $SOURCE_DIR/hub4_1999/bnews_99/bn99en_{1,2}.uem; do
+export PATH=$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5
+sph2pipe=`which sph2pipe` || { echo "sph2pipe not found in PATH."; exit 1; }
+
+for f in bn99en_1 bn99en_2; do
+  if [ $f == "bn99en_1" ]; then
+    affix=eval99_1
+  elif [ $f == "bn99en_2" ]; then
+    affix=eval99_2
+  fi
+
   python -c '
 import sys, os
+sys.path.insert(0, "local/data_prep")
 import hub4_utils
 uem = sys.argv[1]
 reco, ext = os.path.splitext(os.path.basename(uem))
 for line in open(uem).readlines():
-    print (parse_uem_line(line))' $uem
-done > $dir/segments
+    line = hub4_utils.parse_uem_line(reco, line)
+    if line is not None:
+        print (line)' $SOURCE_DIR/bnews_99/$f.uem > $dir/${affix}_uem_segments
 
-awk '{print $1" "$2}' $dir/segments > $dir/utt2spk
+  awk '{print $1" "$2}' $dir/${affix}_uem_segments > $dir/${affix}_uem_utt2spk
 
-cat $SOURCE_DIR/hub4_1999/bnews_99/bn99en_{1,2}.seg | \
-  python -c '
+  cat $SOURCE_DIR/bnews_99/$f.seg | \
+    python -c '
 import sys
+sys.path.insert(0, "local/data_prep")
+import hub4_utils
 with open(sys.argv[1], "w") as s_f, open(sys.argv[2], "w") as u_f:
     for line in sys.stdin.readlines():
-        segments_line, utt2spk_line = parse_cmu_seg_line(reco, line)
-        s_f.write("{0}\n".format(segments_line))
-        u_f.write("{0}\n".format(utt2spk_line))' \
-  $dir/segments.pem $dir/utt2spk.pem
-
-export PATH=$PATH:$KALDI_ROOT/tools/sph2pipe_v2.5
-sph2pipe=`which sph2pipe` || { echo "sph2pipe not found in PATH."; exit 1; }
-for x in `ls $SOURCE_DIR/h4e_evl/*.sph`; do
-  y=`basename $x`
-  z=${y%.sph}
-  echo "$z $sph2pipe -f wav $x |";
-done > $dir/wav.scp
-
-awk '{print $1" "$1" 1"}' $dir/wav.scp > $dir/reco2file_and_channel
-
-cp $SOURCE_DIR/hub4_1999/bnews99/en981118.glm $dir/en981118.glm
-cp $SOURCE_DIR/hub4_1999/bnews99/bn99en_1.stm $dir/bn99en_1.stm
-
-cp $SOURCE_DIR/hub4_1999/bnews99/en991231.glm $dir/en991231.glm
-cp $SOURCE_DIR/hub4_1999/bnews99/bn99en_2.stm $dir/bn99en_2.stm
+        tup = hub4_utils.parse_cmu_seg_line(line)
+        if tup is not None:
+            segments_line, utt2spk_line = tup
+            s_f.write("{0}\n".format(segments_line))
+            u_f.write("{0}\n".format(utt2spk_line))' \
+      $dir/${affix}_pem_segments $dir/${affix}_pem_utt2spk
+
+  echo "$f $sph2pipe -f wav $SOURCE_DIR/bnews_99/$f.sph |" > $dir/${affix}_wav_scp
+done
 
-utils/fix_data_dir.sh $dir
-utils/copy_data_dir.sh $dir ${dir}.pem
-cp $dir/*.stm ${dir}.pem/
+cp $SOURCE_DIR/bnews_99/en981118.glm $dir/eval99_1_glm
+cp $SOURCE_DIR/bnews_99/bn99en_1.stm $dir/eval99_1_stm
 
-cp $dir/segments.pem ${dir}.pem/segments
-cp $dir/utt2spk.pem ${dir}.pem/utt2spk
-utils/fix_data_dir.sh ${dir}.pem
+cp $SOURCE_DIR/bnews_99/en991231.glm $dir/eval99_2_glm
+cp $SOURCE_DIR/bnews_99/bn99en_2.stm $dir/eval99_2_stm
diff --git a/egs/bn/s5/local/data_prep/prepare_na_news_text_corpus.sh b/egs/bn/s5/local/data_prep/prepare_na_news_text_corpus.sh
index 44138e2a228..c32e48a3d7e 100755
--- a/egs/bn/s5/local/data_prep/prepare_na_news_text_corpus.sh
+++ b/egs/bn/s5/local/data_prep/prepare_na_news_text_corpus.sh
@@ -3,6 +3,9 @@
 # Copyright 2016  Vimal Manohar
 # Apache 2.0.
 
+# This script prepares the North American News Text Corpus
+# https://catalog.ldc.upenn.edu/LDC95T21
+
 . cmd.sh
 . 
path.sh diff --git a/egs/bn/s5/local/data_prep/prepare_na_news_text_supplement.sh b/egs/bn/s5/local/data_prep/prepare_na_news_text_supplement.sh new file mode 100644 index 00000000000..dd463df46fc --- /dev/null +++ b/egs/bn/s5/local/data_prep/prepare_na_news_text_supplement.sh @@ -0,0 +1,61 @@ +#! /bin/bash + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +# This script prepares the North American News Text Supplement Corpus +# https://catalog.ldc.upenn.edu/LDC98T30 + +. cmd.sh +. path.sh + +set -e +set -o pipefail +set -u + +nj=4 +cmd=run.pl + +. utils/parse_options.sh + +if [ $# -ne 2 ]; then + echo "Usage: $0 " + echo " e.g.: $0 /export/corpora/LDC/LDC98T30/northam_news_txt_sup data/local/data/na_news_supp" + exit 1 +fi + +SOURCE_DIR=$1 +dir=$2 + +for x in $SOURCE_DIR/nyt/*/ $SOURCE_DIR/latwp/ $SOURCE_DIR/apws/*/; do + year=`basename $x` + newspaper=`basename $(dirname $x)` + + d=$dir/${newspaper}_${year} + + if [ $year == latwp ]; then + d=$dir/latwp_1997 + elif [ $year == english ]; then + d=$dir/apws + fi + + mkdir -p $d + + list_file=$d/articles.list + ls $x/*.gz > $list_file + + mkdir -p $d/split$nj + + eval utils/split_scp.pl $d/articles.list \ + $d/split$nj/articles.list.{`seq -s, $nj`} + + ( + $cmd JOB=1:$nj $d/log/get_processed_text.JOB.log \ + local/data_prep/process_na_news_text.py $d/split$nj/articles.list.JOB - \| \ + gzip -c '>' $d/corpus.JOB.gz || exit 1 + gunzip -c $d/corpus.*.gz | gzip -c > $d/corpus.gz || exit 1 + rm $d/corpus.*.gz + ) & +done + +wait diff --git a/egs/bn/s5/local/data_prep/process_1995_bn_annotation.py b/egs/bn/s5/local/data_prep/process_1995_bn_annotation.py new file mode 100755 index 00000000000..be0c7ad8e0d --- /dev/null +++ b/egs/bn/s5/local/data_prep/process_1995_bn_annotation.py @@ -0,0 +1,273 @@ +#! /usr/bin/env python + +# Copyright 2016 Vimal Manohar +# Apache 2.0. + +"""This script process a 1995 CSR-IV annotation file and writes to +utt2spk, segments and text files. +""" + +from __future__ import print_function +import argparse +import os +import logging +import re +from bs4 import BeautifulSoup +import hub4_utils + + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) + + +def get_args(): + """Get command-line arguments""" + + parser = argparse.ArgumentParser("Process 1995 CSR-IV HUB4 transcripts") + + parser.add_argument("--noise-word", type=str, default="", + help="Word to add in-place of noise words") + parser.add_argument("--spoken-noise-word", type=str, + default="", + help="Word to add in-place of speaker noise words") + parser.add_argument("in_file", type=argparse.FileType('r'), + help="Input transcript file") + parser.add_argument("segments_file", type=argparse.FileType('a'), + help="Output segments file") + parser.add_argument("utt2spk_file", type=argparse.FileType('a'), + help="Output utt2spk file") + parser.add_argument("text_file", type=argparse.FileType('a'), + help="Output text file") + + args = parser.parse_args() + return args + + +class Segment(object): + """Class to store an utterance (segment)""" + + def __init__(self, reco_id, spk=None, start_time=-1, + end_time=-2, text=""): + """The arguments are straight-forward. + spk can be None if speaker is not known, in which case the utterance-id + and speaker-id are made the same. 
+        end_time can be -1 to mean the end of the recording.
+        """
+        self.reco_id = reco_id
+        self.spk = spk
+        self.start_time = float(start_time)
+        self.end_time = float(end_time)
+        self.text = text
+
+    def get_utt_id(self):
+        """Return the utterance-id, which is the recording-id with the
+        start and end times appended if spk is not known.
+        Otherwise the speaker-id is also added as a suffix to the
+        recording-id.
+        """
+        if self.spk is None:
+            return "{reco}-{0:06d}-{1:06d}".format(
+                int(self.start_time * 100), int(self.end_time * 100),
+                reco=self.reco_id)
+        return "{reco}-{spk}-{0:06d}-{1:06d}".format(
+            int(self.start_time * 100), int(self.end_time * 100),
+            reco=self.reco_id, spk=self.spk)
+
+    def get_spk_id(self):
+        """Returns the speaker-id appended to the recording-id if the speaker
+        is known; otherwise returns the utterance-id as the speaker-id.
+        """
+        if self.spk is None:
+            return "{reco}-{0:06d}-{1:06d}".format(
+                int(self.start_time * 100), int(self.end_time * 100),
+                reco=self.reco_id)
+        return "{reco}-{spk}".format(reco=self.reco_id, spk=self.spk)
+
+    def write_utt2spk(self, out_file):
+        """Writes this segment's entry into utt2spk file."""
+        print ("{0} {1}".format(self.get_utt_id(), self.get_spk_id()),
+               file=out_file)
+
+    def write_segment(self, out_file):
+        """Writes this segment's entry into segments file."""
+        print ("{0} {1} {2:.3f} {3:.3f}".format(
+            self.get_utt_id(), self.reco_id,
+            self.start_time, self.end_time),
+            file=out_file)
+
+    def write_text(self, out_file):
+        """Writes this segment's entry into kaldi text file."""
+        print ("{0} {1}".format(self.get_utt_id(), self.text),
+               file=out_file)
+
+
+def write_segments(segments, args):
+    """Write segments with non-empty transcripts."""
+    for segment in segments:
+        if len(segment.text) == 0:
+            continue
+        segment.write_utt2spk(args.utt2spk_file)
+        segment.write_segment(args.segments_file)
+        segment.write_text(args.text_file)
+
+
+def process_text(text, noise_word, spoken_noise_word):
+    """Returns normalized text"""
+    text = re.sub(r"\[pause\]", "", text)
+    text = hub4_utils.normalize_csr_transcript(text, noise_word,
+                                               spoken_noise_word)
+    return text
+
+
+test_spk_matcher = re.compile(r"(\S+)\(bt=(\S+)\set=(\S+)\):\s(.+)$")
+train_spk_matcher = re.compile(r"(\S+):\s(.+)$")
+
+
+def process_story_content(args, reco_id, content,
+                          start_time, end_time):
+    """Processes the contents of a story and converts them into a set of
+    segments.
+
+    Arguments:
+        args -- A reference to the CLI arguments
+        reco_id -- Recording id
+        content -- A string containing all the contents of a story (or the
+                   stuff before the story like the credits and announcements).
+                   It is split on double-newline characters.
+        start_time -- Start time of this 'story'.
+        end_time -- End time of this 'story'.
+    """
+
+    segments = []
+    segment_tmp = Segment(reco_id=reco_id, spk=None,
+                          start_time=start_time, end_time=-2, text="")
+
+    for line in content.split('\n\n'):
+        line = re.sub('\n', ' ', line)
+
+        if len(line) == 0 or re.match(r"\[[^]]+\]$|\s*$", line):
+            continue
+
+        m = test_spk_matcher.match(line)
+        if m:
+            # A line of story in test file that has start and end times
+            # and speaker name.
+            spk = m.group(1)
+            bt = float(m.group(2))
+            et = float(m.group(3))
+
+            # Once we know the end-time of the temporary segment, we can
+            # write that out (only if it is non-empty).
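+            # For example, a test-file line "SMITH(bt=12.30 et=15.60): GOOD
+            # EVENING" gives spk=SMITH, bt=12.30, et=15.60 and the raw
+            # transcript text (a made-up line matching test_spk_matcher).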
+ if len(segment_tmp.text) > 0: + segment_tmp.end_time = bt + segments.append(segment_tmp) + segment_tmp = Segment(reco_id, spk=None, start_time=et) + + text = process_text(m.group(4), args.noise_word, + args.spoken_noise_word) + if len(text) == 0 or re.match(r"\[[^]]+\]$|\s*$", text): + continue + segments.append(Segment(reco_id=reco_id, spk=spk, + start_time=bt, end_time=et, + text=text)) + continue + + m = train_spk_matcher.match(line) + if m: + # A line of story in train file that has no time segment + # information. So speaker information is not useful. + text = process_text(m.group(2), args.noise_word, + args.spoken_noise_word) + else: + # A line of story that does not have a speaker marking. + text = process_text(line, args.noise_word, args.spoken_noise_word) + if len(text) == 0 or re.match(r"\[[^]]+\]$|\s*$", text): + continue + segment_tmp.text += ' ' + text + + if len(segment_tmp.text) > 0: + segment_tmp.end_time = end_time + segments.append(segment_tmp) + + return segments + + +def process_float(string): + string = re.sub(r"'|\"", "", string) + return float(string) + + +def run(args): + base = os.path.basename(args.in_file.name) + reco_id = os.path.splitext(base)[0] + + doc = ''.join(args.in_file.readlines()) + + soup = BeautifulSoup(doc, 'lxml') + for broadcast in soup.find_all('broadcast'): + non_story_contents = [] + start_time = 0.0 + end_time = -1.0 + for s in broadcast.children: + try: + if s.name == 'story': + story_begin_time = process_float(s['bt']) + story_end_time = process_float(s['et']) + for x in s.find_all('language') + s.find_all('sung'): + x.replaceWithChildren() + if len(non_story_contents): + end_time = story_begin_time + segments = process_story_content( + args, reco_id, ' '.join(non_story_contents), + start_time=start_time, end_time=end_time) + write_segments(segments, args) + non_story_contents = [] + start_time = story_end_time + segments = process_story_content( + args, reco_id, + ' '.join([unicode(x) for x in s.children]), + start_time=story_begin_time, end_time=story_end_time) + write_segments(segments, args) + elif (s.name is not None and s.name != "language" + and s.name != 'sung'): + raise RuntimeError( + "Expected a NavigableString or " + "or or ; got {0}".format(s)) + elif s.name == "language" or s.name == "sung": + non_story_contents.append( + ' '.join([unicode(x) for x in s.children])) + else: + non_story_contents.append(unicode(s)) + except RuntimeError: + raise + except Exception: + logger.error("Failed to process broadcast children %s", s) + raise + # End for loop over broadcast children + if len(non_story_contents) > 0: + segments = process_story_content( + args, reco_id, ' '.join(non_story_contents), + start_time=start_time, end_time=-1) + write_segments(segments, args) + + +def main(): + try: + args = get_args() + run(args) + except Exception: + raise + finally: + for f in [args.in_file, args.segments_file, + args.utt2spk_file, args.text_file]: + if f is not None: + f.close() + + +if __name__ == '__main__': + main() diff --git a/egs/bn/s5/local/data_prep/process_na_news_text.py b/egs/bn/s5/local/data_prep/process_na_news_text.py index 10941dd3186..d7bb36aa3f7 100755 --- a/egs/bn/s5/local/data_prep/process_na_news_text.py +++ b/egs/bn/s5/local/data_prep/process_na_news_text.py @@ -41,17 +41,21 @@ def normalize_text(text): return text2 -def process_file(file_handle, out_file_handle): - doc = ' '.join(file_handle.readlines()) +def process_file_lines(lines, out_file_handle): + doc = '' + for line in lines: + line = re.sub(r"([^", "", line) + 
line = re.sub(r"

", "

", line) + doc += line soup = BeautifulSoup(doc, 'lxml') num_written = 0 - for doc in soup.html.body.children: + for art in soup.html.body.children: try: - if doc.name != "doc": + if art.name != "art": continue - for para in doc.find_all('p'): + for para in art.find_all('p'): assert para.name == 'p' text = ' '.join([unicode(x).strip() for x in para.contents]) normalized_text = normalize_text(text) @@ -69,8 +73,27 @@ def _run(args): for line in args.file_list.readlines(): try: file_ = line.strip() - with gzip.open(file_, 'r') as f: - process_file(f, args.out_file) + p = run_command( + "gunzip -c {0} | " + "local/data_prep/csr_hub4_utils/pare-sgml.perl | " + "perl local/data_prep/csr_hub4_utils/bugproc.perl | " + "perl local/data_prep/csr_hub4_utils/numhack.perl | " + "perl local/data_prep/csr_hub4_utils/numproc.perl " + " -xlocal/data_prep/csr_hub4_utils/num_excp | " + "perl local/data_prep/csr_hub4_utils/abbrproc.perl " + " local/data_prep/csr_hub4_utils/abbrlist | " + "perl local/data_prep/csr_hub4_utils/puncproc.perl -np" + "".format(file_), + stdout=subprocess.PIPE, shell=True) + + stdout = p[0].communicate()[0] + if p[0].returncode is not 0: + logger.error( + "Command '%s' failed with return status %d", + p[1], p[0].returncode) + raise RuntimeError + + process_file_lines(stdout, args.out_file) except Exception: logger.error("Failed processing file %s", file_) raise From d275480af7c4ae310a673dd0fd9b7a00be5aeb01 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 10 Jan 2017 20:22:27 -0500 Subject: [PATCH 06/38] bn: Fix MFCC config --- egs/bn/s5/conf/mfcc.conf | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/egs/bn/s5/conf/mfcc.conf b/egs/bn/s5/conf/mfcc.conf index a4be40be454..7361509099f 100644 --- a/egs/bn/s5/conf/mfcc.conf +++ b/egs/bn/s5/conf/mfcc.conf @@ -1,6 +1 @@ ---sample-frequency=16000 ---frame-length=25 # the default is 25 ---low-freq=20 # the default. ---high-freq=3700 # the default is zero meaning use the Nyquist (4k in this case). ---num-ceps=20 # higher than the default which is 12. ---snip-edges=false +--use-energy=false # only non-default option. 
From 4f94a5c563ed6bda7c18b537a15294bfb32eac85 Mon Sep 17 00:00:00 2001
From: Vimal Manohar
Date: Tue, 10 Jan 2017 20:22:57 -0500
Subject: [PATCH 07/38] bn: Clean and update recipe

---
 egs/bn/s5/local/format_data.sh | 90 ++++++++++++++--
 egs/bn/s5/local/format_lms.sh | 7 +-
 egs/bn/s5/local/prepare_dict.sh | 184 ++++++++++++++++++--------------
 egs/bn/s5/local/train_lm.sh | 154 ++++++++++++++++++--------
 4 files changed, 298 insertions(+), 137 deletions(-)

diff --git a/egs/bn/s5/local/format_data.sh b/egs/bn/s5/local/format_data.sh
index b7d58f83718..3f5bb29195d 100755
--- a/egs/bn/s5/local/format_data.sh
+++ b/egs/bn/s5/local/format_data.sh
@@ -11,18 +11,86 @@ echo "$0 $@" # Print the command line for logging
 srcdir=data/local/data
 tmpdir=data/local/
 
-for t in train; do
- utils/fix_data_dir.sh $srcdir/$t
- utils/copy_data_dir.sh $srcdir/$t data/$t
- cat $srcdir/$t/text | \
- local/normalize_transcripts.pl "<NOISE>" "<SPOKEN_NOISE>" > \
- data/$t/text
- utils/fix_data_dir.sh data/$t
-done
+###############################################################################
+# Format 1996 English Broadcast News Train (HUB4)
+###############################################################################
+mkdir -p data/train_bn96
+cp $srcdir/train_bn96/{wav.scp,segments,utt2spk} data/train_bn96
+
+###############################################################################
+# Format 1996 English Broadcast News Dev (HUB4)
+###############################################################################
+mkdir -p data/dev96pe
+mkdir -p data/dev96ue
+
+cp $srcdir/hub4_96_dev_eval/dev96_uem_segments data/dev96ue/segments
+cp $srcdir/hub4_96_dev_eval/dev96_uem_utt2spk data/dev96ue/utt2spk
+cp $srcdir/hub4_96_dev_eval/dev96_uem_wav_scp data/dev96ue/wav.scp
+cp $srcdir/hub4_96_dev_eval/dev96_uem_stm data/dev96ue/stm
+cp $srcdir/hub4_96_dev_eval/glm data/dev96ue/glm
+
+awk '{if ($4 > $3) print $0}' $srcdir/hub4_96_dev_eval/dev96_pem_segments \
+ > data/dev96pe/segments
+cp $srcdir/hub4_96_dev_eval/dev96_pem_utt2spk data/dev96pe/utt2spk
+cp $srcdir/hub4_96_dev_eval/dev96_pem_wav_scp data/dev96pe/wav.scp
+cp $srcdir/hub4_96_dev_eval/dev96_pem_stm data/dev96pe/stm
+cp $srcdir/hub4_96_dev_eval/glm data/dev96pe/glm
+
+###############################################################################
+# Format 1996 English Broadcast News Eval (HUB4)
+###############################################################################
+mkdir -p data/eval96
+mkdir -p data/eval96.pem
 
-for t in eval98 eval98.pem; do
- utils/copy_data_dir.sh $srcdir/$t data/$t
- utils/fix_data_dir.sh data/$t
+cp $srcdir/hub4_96_dev_eval/eval96_pem_segments data/eval96.pem/segments
+cp $srcdir/hub4_96_dev_eval/eval96_pem_utt2spk data/eval96.pem/utt2spk
+cp $srcdir/hub4_96_dev_eval/eval96_wav_scp data/eval96.pem/wav.scp
+cp $srcdir/hub4_96_dev_eval/eval96_stm data/eval96.pem/stm
+cp $srcdir/hub4_96_dev_eval/glm data/eval96.pem/glm
+
+cp $srcdir/hub4_96_dev_eval/eval96_uem_segments data/eval96/segments
+cp $srcdir/hub4_96_dev_eval/eval96_uem_utt2spk data/eval96/utt2spk
+cp $srcdir/hub4_96_dev_eval/eval96_wav_scp data/eval96/wav.scp
+cp $srcdir/hub4_96_dev_eval/eval96_stm data/eval96/stm
+cp $srcdir/hub4_96_dev_eval/glm data/eval96/glm
+
+###############################################################################
+# Format 1997-98 Hub4 Broadcast news evaluation
+###############################################################################
+for t in eval97 eval98; do
+ mkdir -p data/$t data/${t}.pem
+ cp $srcdir/$t/segments data/$t/segments
+ cp $srcdir/$t/utt2spk data/$t/utt2spk
+ cp $srcdir/$t/segments.pem data/${t}.pem/segments
+ cp $srcdir/$t/utt2spk.pem data/${t}.pem/utt2spk
+ cp $srcdir/$t/wav.scp data/$t/wav.scp
+ cp $srcdir/$t/wav.scp data/${t}.pem/wav.scp
+ cp $srcdir/$t/stm data/$t/stm
+ cp $srcdir/$t/stm data/${t}.pem/stm
+ cp $srcdir/$t/glm data/$t/glm
+ cp $srcdir/$t/glm data/${t}.pem/glm
 done
 
+###############################################################################
+# Format 1999 Hub4 Broadcast news evaluation
+###############################################################################
+for d in eval99_1 eval99_2; do
+ mkdir -p data/${d} data/${d}.pem
+ cp $srcdir/eval99/${d}_uem_segments data/${d}/segments
+ cp $srcdir/eval99/${d}_uem_utt2spk data/${d}/utt2spk
+ cp $srcdir/eval99/${d}_pem_segments data/${d}.pem/segments
+ cp $srcdir/eval99/${d}_pem_utt2spk data/${d}.pem/utt2spk
+ cp $srcdir/eval99/${d}_wav_scp data/${d}/wav.scp
+ cp $srcdir/eval99/${d}_wav_scp data/${d}.pem/wav.scp
+ cp $srcdir/eval99/${d}_stm data/${d}/stm
+ cp $srcdir/eval99/${d}_stm data/${d}.pem/stm
+ cp $srcdir/eval99/${d}_glm data/${d}/glm
+ cp $srcdir/eval99/${d}_glm data/${d}.pem/glm
+done
+
+for d in train_bn96 eval96 eval96.pem dev96pe dev96ue eval97 eval97.pem \
+ eval98 eval98.pem eval99_1 eval99_1.pem eval99_2 eval99_2.pem; do
+ utils/utt2spk_to_spk2utt.pl data/$d/utt2spk > data/$d/spk2utt
+ awk '{print $1" "$1" 1"}' data/${d}/wav.scp > data/${d}/reco2file_and_channel
+ utils/fix_data_dir.sh data/${d}
+done
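The awk filter on the dev96 PEM segments above drops entries whose end time is not strictly greater than the start time, which would otherwise produce zero-length utterances that downstream tools reject. The same check in Python form, as a sketch:

    def filter_segments(lines):
        # keep only 'utt reco start end' entries with positive duration,
        # like awk '{if ($4 > $3) print $0}'
        kept = []
        for line in lines:
            utt_id, reco_id, start, end = line.split()
            if float(end) > float(start):
                kept.append(line)
        return kept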
diff --git a/egs/bn/s5/local/format_lms.sh b/egs/bn/s5/local/format_lms.sh
index 7d9e3b82bfb..834e3d10d0a 100755
--- a/egs/bn/s5/local/format_lms.sh
+++ b/egs/bn/s5/local/format_lms.sh
@@ -8,12 +8,13 @@ if [ -f path.sh ]; then . path.sh; fi
 set -e -o pipefail -u
 
 lang_suffix=_test
+local_lm_dir=data/local/local_lm
 
 . utils/parse_options.sh
 
-#arpa_lm=data/local/local_lm/data/arpa/4gram.arpa.gz
-small_arpa_lm=data/local/local_lm/data/arpa/4gram_small.arpa.gz
-big_arpa_lm=data/local/local_lm/data/arpa/4gram_big.arpa.gz
+#arpa_lm=$local_lm_dir/data/arpa/4gram.arpa.gz
+small_arpa_lm=$local_lm_dir/data/arpa/4gram_small.arpa.gz
+big_arpa_lm=$local_lm_dir/data/arpa/4gram_big.arpa.gz
 
 for f in $small_arpa_lm $big_arpa_lm data/lang_nosp/words.txt; do
 [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
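The prepare_dict.sh changes below, besides adding stage gating, keep the CMUdict handling in which stress-marked variants of a phone (AH0, AH1, AH2) are grouped onto a single line of nonsilence_phones.txt. The perl one-liner that does this is easy to miss; its logic, restated as a Python sketch:

    import re
    from collections import defaultdict

    def group_phones_by_base(symbols):
        # AH0, AH1, AH2 share the base phone AH and go on one line
        phones_of = defaultdict(list)
        for sym in symbols:
            m = re.match(r'^([^\d]+)(\d*)$', sym)
            if m is None:
                raise ValueError('Bad phone {0}'.format(sym))
            phones_of[m.group(1)].append(sym)
        return [' '.join(group) for group in phones_of.values()]

    # group_phones_by_base(['AA0', 'AA1', 'AA2', 'B']) -> ['AA0 AA1 AA2', 'B']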
diff --git a/egs/bn/s5/local/prepare_dict.sh b/egs/bn/s5/local/prepare_dict.sh
index 441849329e1..d0a6a6703e4 100755
--- a/egs/bn/s5/local/prepare_dict.sh
+++ b/egs/bn/s5/local/prepare_dict.sh
@@ -38,6 +38,7 @@ set -u
 # run this from ../
 
 dict_suffix=
+stage=-1
 
 echo "$0 $@" # Print the command line for logging
 . utils/parse_options.sh || exit 1;
@@ -65,53 +66,63 @@ fi
 
 #(2) Dictionary preparation:
 
-# Make phones symbol-table (adding in silence and verbal and non-verbal noises at this point).
-# We are adding suffixes _B, _E, _S for beginning, ending, and singleton phones.
+if [ $stage -le 0 ]; then
+ # Make phones symbol-table (adding in silence and verbal and non-verbal noises at this point).
+ # We are adding suffixes _B, _E, _S for beginning, ending, and singleton phones.
 
-# silence phones, one per line.
-(echo SIL; echo SPN; echo NSN) > $dir/silence_phones.txt
-echo SIL > $dir/optional_silence.txt
+ # silence phones, one per line.
+ (echo SIL; echo SPN; echo NSN; echo UNK;) > $dir/silence_phones.txt
+ echo SIL > $dir/optional_silence.txt
 
-# nonsilence phones; on each line is a list of phones that correspond
-# really to the same base phone.
-cat $dir/cmudict/cmudict.0.7a.symbols | perl -ane 's:\r::; print;' | \
- perl -e 'while(<>){
+ # nonsilence phones; on each line is a list of phones that correspond
+ # really to the same base phone.
+ cat $dir/cmudict/cmudict.0.7a.symbols | perl -ane 's:\r::; print;' | \
+ perl -e 'while(<>){
 chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_";
 $phones_of{$1} .= "$_ "; }
 foreach $list (values %phones_of) {print $list . "\n"; } ' \
- > $dir/nonsilence_phones.txt || exit 1;
+ > $dir/nonsilence_phones.txt || exit 1;
 
-# A few extra questions that will be added to those obtained by automatically clustering
-# the "real" phones. These ask about stress; there's also one for silence.
-cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1;
-cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
+ # A few extra questions that will be added to those obtained by automatically clustering
+ # the "real" phones. These ask about stress; there's also one for silence.
+ cat $dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dir/extra_questions.txt || exit 1;
+ cat $dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
 $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
- >> $dir/extra_questions.txt || exit 1;
+ >> $dir/extra_questions.txt || exit 1;
 
-grep -v ';;;' $dir/cmudict/cmudict.0.7a | \
- perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \
- > $dir/dict.cmu || exit 1;
+ grep -v ';;;' $dir/cmudict/cmudict.0.7a | \
+ perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; print; }' \
+ > $dir/dict.cmu || exit 1;
 
-# Add to cmudict the silences, noises etc.
+ # Add to cmudict the silences, noises etc.
 
-(echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; echo '<NOISE> NSN'; ) | \
- cat - $dir/dict.cmu > $dir/lexicon2_raw.txt
-awk '{print $1}' $dir/lexicon2_raw.txt > $dir/orig_wordlist
+ (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> UNK'; echo '<NOISE> NSN'; ) | \
+ cat - $dir/dict.cmu > $dir/lexicon2_raw.txt
+ awk '{print $1}' $dir/lexicon2_raw.txt > $dir/orig_wordlist
 
-cat <<EOF >$dir/silence_phones.txt
+ cat <<EOF >$dir/silence_phones.txt
 SIL
 SPN
 NSN
+UNK
 EOF
-if [ ! -f exp/g2p/.done ]; then
- steps/dict/train_g2p.sh --cmd "$train_cmd" \
- --silence-phones $dir/silence_phones.txt \
- $dir/dict.cmu exp/g2p
- touch exp/g2p/.done
 fi
+
+if [ $stage -le 2 ]; then
+ if [ ! -f exp/g2p/.done ]; then
+ steps/dict/train_g2p.sh --cmd "$train_cmd" \
+ --silence-phones $dir/silence_phones.txt \
+ $dir/dict.cmu exp/g2p
+ touch exp/g2p/.done
+ fi
+fi
+
+export PATH=$PATH:`pwd`/local/dict
+
+if [ $stage -le 3 ]; then
+ cat $wordlist | python -c '
 import sys
 
 words = {}
 for line in sys.stdin.readlines():
@@ -126,65 +137,76 @@ for line in sys.stdin.readlines():
 for oov in oovs:
 print (oov)' $dir/orig_wordlist | sort -u > $dir/oovlist
-
-export PATH=$PATH:`pwd`/local/dict
-
-cat $dir/oovlist | get_acronym_prons.pl $dir/lexicon2_raw.txt > $dir/dict.acronyms
+
+ cat $dir/oovlist | \
+ get_acronym_prons.pl $dir/lexicon2_raw.txt > $dir/dict.acronyms
+fi
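Stage 3 above finds out-of-vocabulary words with an inline python -c snippet that the hunk only shows partially. The idea is a set difference between the LM word list and the words already covered by the raw lexicon; a self-contained sketch of that step (names are illustrative, not the recipe's exact code):

    def find_oovs(wordlist_path, lexicon_words_path):
        with open(lexicon_words_path) as f:
            known = set(line.split()[0] for line in f if line.strip())
        oovs = set()
        with open(wordlist_path) as f:
            for line in f:
                for word in line.split():
                    if word not in known:
                        oovs.add(word)
        return sorted(oovs)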
 
 mkdir -p $dir/f $dir/b # forward, backward directions of rules...
+
+if [ $stage -le 4 ]; then
 # forward is normal suffix
 # rules, backward is reversed (prefix rules). These
 # dirs contain stuff we create while making the rule-based
 # extensions to the dictionary.
-# Remove ; and , from words, if they are present; these
-# might crash our scripts, as they are used as separators there.
-filter_dict.pl $dir/dict.cmu > $dir/f/dict
-cat $dir/oovlist | filter_dict.pl > $dir/f/oovs
-reverse_dict.pl $dir/f/dict > $dir/b/dict
-reverse_dict.pl $dir/f/oovs > $dir/b/oovs
-
-# The next stage takes a few minutes.
-# Note: the forward stage takes longer, as English is
-# mostly a suffix-based language, and there are more rules
-# that it finds.
-for d in $dir/f $dir/b; do
- (
- cd $d
- cat dict | get_rules.pl 2>get_rules.log >rules
- get_rule_hierarchy.pl rules >hierarchy
- awk '{print $1}' dict | get_candidate_prons.pl rules dict | \
- limit_candidate_prons.pl hierarchy | \
- score_prons.pl dict | \
- count_rules.pl >rule.counts
- # the sort command below is just for convenience of reading.
- score_rules.pl <rule.counts | sort -t';' -k3,3 -nr >rules.with_scores
- get_candidate_prons.pl rules.with_scores dict oovs | \
- limit_candidate_prons.pl hierarchy > oovs.candidates
- ) &
-done
-wait
-
-# Merge the candidates.
-reverse_candidates.pl $dir/b/oovs.candidates | cat - $dir/f/oovs.candidates | sort > $dir/oovs.candidates
-select_candidate_prons.pl <$dir/oovs.candidates | awk -F';' '{printf("%s %s\n", $1, $2);}' \
- > $dir/dict.oovs
-
-cat $dir/dict.acronyms $dir/dict.oovs | sort | uniq > $dir/dict.oovs_merged
-awk '{print $1}' $dir/dict.oovs_merged | uniq > $dir/oovlist.handled
-sort $dir/oovlist | diff - $dir/oovlist.handled | grep -v 'd' | sed 's:< ::' > $dir/oovlist.not_handled
-
-steps/dict/apply_g2p.sh --cmd "$train_cmd" \
- $dir/oovlist.not_handled exp/g2p exp/g2p/oov_lex
-cat exp/g2p/oov_lex/lexicon.lex | cut -f 1,3 | awk '{if (NF > 1) print $0}' > \
- $dir/dict.oovs_g2p
-
-# the sort | uniq is to remove a duplicated pron from cmudict.
-cat $dir/lexicon2_raw.txt $dir/dict.oovs_merged $dir/dict.oovs_g2p | sort | uniq > \
- $dir/lexicon.txt || exit 1;
-# lexicon.txt is without the _B, _E, _S, _I markers.
-
-rm $dir/lexiconp.txt 2>/dev/null || true
+ # Remove ; and , from words, if they are present; these
+ # might crash our scripts, as they are used as separators there.
+ filter_dict.pl $dir/dict.cmu > $dir/f/dict
+ cat $dir/oovlist | filter_dict.pl > $dir/f/oovs
+ reverse_dict.pl $dir/f/dict > $dir/b/dict
+ reverse_dict.pl $dir/f/oovs > $dir/b/oovs
+fi
+
+if [ $stage -le 5 ]; then
+ # The next stage takes a few minutes.
+ # Note: the forward stage takes longer, as English is
+ # mostly a suffix-based language, and there are more rules
+ # that it finds.
+ for d in $dir/f $dir/b; do
+ (
+ cd $d
+ cat dict | get_rules.pl 2>get_rules.log >rules
+ get_rule_hierarchy.pl rules >hierarchy
+ awk '{print $1}' dict | get_candidate_prons.pl rules dict | \
+ limit_candidate_prons.pl hierarchy | \
+ score_prons.pl dict | \
+ count_rules.pl >rule.counts
+ # the sort command below is just for convenience of reading.
+ score_rules.pl <rule.counts | sort -t';' -k3,3 -nr >rules.with_scores
+ get_candidate_prons.pl rules.with_scores dict oovs | \
+ limit_candidate_prons.pl hierarchy > oovs.candidates
+ ) &
+ done
+ wait
+fi
+
+if [ $stage -le 6 ]; then
+ # Merge the candidates.
+ reverse_candidates.pl $dir/b/oovs.candidates | cat - $dir/f/oovs.candidates | sort > $dir/oovs.candidates + select_candidate_prons.pl <$dir/oovs.candidates | awk -F';' '{printf("%s %s\n", $1, $2);}' \ + > $dir/dict.oovs + + cat $dir/dict.acronyms $dir/dict.oovs | sort | uniq > $dir/dict.oovs_merged + awk '{print $1}' $dir/dict.oovs_merged | uniq > $dir/oovlist.handled + sort $dir/oovlist | { diff - $dir/oovlist.handled || true; } | grep -v 'd' | sed 's:< ::' > $dir/oovlist.not_handled +fi + +if [ $stage -le 7 ]; then + steps/dict/apply_g2p.sh --cmd "$train_cmd" \ + $dir/oovlist.not_handled exp/g2p exp/g2p/oov_lex + cat exp/g2p/oov_lex/lexicon.lex | cut -f 1,3 | awk '{if (NF > 1) print $0}' > \ + $dir/dict.oovs_g2p +fi + +if [ $stage -le 8 ]; then + # the sort | uniq is to remove a duplicated pron from cmudict. + cat $dir/lexicon2_raw.txt $dir/dict.oovs_merged $dir/dict.oovs_g2p | sort | uniq > \ + $dir/lexicon.txt || exit 1; + # lexicon.txt is without the _B, _E, _S, _I markers. + + rm $dir/lexiconp.txt 2>/dev/null || true +fi echo "Dictionary preparation succeeded" diff --git a/egs/bn/s5/local/train_lm.sh b/egs/bn/s5/local/train_lm.sh index d8523ca30f4..8f82fe397e0 100755 --- a/egs/bn/s5/local/train_lm.sh +++ b/egs/bn/s5/local/train_lm.sh @@ -14,11 +14,13 @@ set -o pipefail set -u stage=0 +dir=data/local/local_lm +cmd=run.pl +vocab_size= # Preferred vocabulary size echo "$0 $@" # Print the command line for logging . utils/parse_options.sh || exit 1; -dir=data/local/local_lm lm_dir=${dir}/data mkdir -p $dir @@ -46,23 +48,43 @@ if [ $stage -le 0 ]; then rm ${dir}/data/text/* 2>/dev/null || true - cat data/train/text | shuf > ${dir}/train_text - head -n $num_dev_sentences < ${dir}/train_text | cut -d ' ' -f 2- > ${dir}/data/text/dev.txt - tail -n +$[num_dev_sentences+1] < ${dir}/train_text | cut -d ' ' -f 2- > ${dir}/data/text/bn.txt + cat data/train_bn96/text | shuf > ${dir}/train_bn96_text + head -n $num_dev_sentences < ${dir}/train_bn96_text | cut -d ' ' -f 2- > \ + ${dir}/data/text/dev.txt + tail -n +$[num_dev_sentences+1] < ${dir}/train_text | cut -d ' ' -f 2- > \ + ${dir}/data/text/train_bn96.txt + # Get text from NA News corpus for x in data/local/data/na_news/*; do y=`basename $x` [ -f $x/corpus.gz ] && ln -sf `readlink -f $x/corpus.gz` ${dir}/data/text/${y}.txt.gz done + # Get text from 1996 CSR HUB4 LM corpus + for x in `cat data/local/data/csr96_hub4/{train,test}.filelist`; do + gunzip -c $x + done | gzip -c > ${dir}/data/text/csr96_hub4.txt.gz + + # Get text from 1995 CSR-IV HUB4 corpus + cat data/local/data/csr95_hub4/dev95_text \ + data/local/data/csr95_hub4/eval95_text \ + data/local/data/csr95_hub4/train95_text | cut -d ' ' -f 2- > \ + ${dir}/data/text/csr95_hub4.txt + + # # Get text from NA News supplement corpus + # for x in data/local/data/na_news/*; do + # y=`basename $x` + # [ -f $x/corpus.gz ] && ln -sf `readlink -f $x/corpus.gz` ${dir}/data/text/${y}.txt.gz + # done + # for reporting perplexities, we'll use the "real" dev set. - # (a subset of the training data is used as ${dir}/data/text/ted.txt to work - # out interpolation weights. # note, we can't put it in ${dir}/data/text/, because then pocolm would use # it as one of the data sources. 
- cat data/eval98/stm | awk '!/^;;/ {if (NF > 6) print $0}' | cut -d ' ' -f 1,7- | \
- local/normalize_transcripts.pl "<NOISE>" "<SPOKEN_NOISE>" | \
- cut -d ' ' -f 2- > ${dir}/data/real_dev_set.txt
+ for x in dev96pe dev96ue eval96 eval97 eval98 eval99_1 eval99_2; do
+ cat data/$x/stm | awk '!/^;;/ {if (NF > 6) print $0}' | cut -d ' ' -f 1,7- | \
+ local/normalize_transcripts.pl "<NOISE>" "<SPOKEN_NOISE>" | \
+ cut -d ' ' -f 2- > ${dir}/data/${x}.txt
+ done
 fi
 
 if [ $stage -le 1 ]; then
@@ -74,59 +96,97 @@ if [ $stage -le 1 ]; then
 fi
 
 if [ $stage -le 2 ]; then
- for x in data/local/data/na_news/*; do
- y=$dir/data/work/word_counts/`basename $x`.counts
- [ -f $y ] && cat $y
- done | local/lm/merge_word_counts.py 15 > $dir/data/work/na_news.wordlist_counts
-
- cat $dir/data/work/word_counts/{bn,dev}.counts | \
- local/lm/merge_word_counts.py 2 > $dir/data/work/bn.wordlist_counts
+ # decide on the vocabulary.
 
- cat $dir/data/work/na_news.wordlist_counts $dir/data/work/bn.wordlist_counts | \
- perl -ane 'if ($F[1] =~ m/[A-Za-z]/) { print "$F[1]\n"; }' | \
- sort -u > $dir/data/work/wordlist
+ # NA news corpus is not clean. So better not to get vocabulary from there.
+ # for x in data/local/data/na_news/*; do
+ # y=$dir/data/work/word_counts/`basename $x`.counts
+ # [ -f $y ] && cat $y
+ # done | local/lm/merge_word_counts.py 15 > $dir/data/work/na_news.wordlist_counts
+
+ cat $dir/data/work/word_counts/{train_bn96,dev}.counts | \
+ local/lm/merge_word_counts.py 2 > $dir/data/work/train_bn96.wordlist_counts
+
+ cat $dir/data/work/word_counts/csr96_hub4_{tr,ts}.counts | \
+ local/lm/merge_word_counts.py 5 > $dir/data/work/csr96_hub4.wordlist_counts
+
+ cat $dir/data/work/word_counts/csr95_hub4.counts | \
+ local/lm/merge_word_counts.py 5 > $dir/data/work/csr95_hub4.wordlist_counts
+
+ cat $dir/data/work/{train_bn96,csr96_hub4,csr95_hub4}.wordlist_counts | \
+ perl -ane 'if ($F[1] =~ m/[A-Za-z]/) { print "$F[0] $F[1]\n"; }' | \
+ local/lm/merge_word_counts.py 1 | sort -k 1,1nr > $dir/data/work/final.wordlist_counts
+
+ if [ ! -z "$vocab_size" ]; then
+ awk -v sz=$vocab_size 'BEGIN{count=-1;}
+ { i+=1;
+ if (i == int(sz)) {
+ count = $1;
+ };
+ if (count > 0 && count != $1) {
+ exit(0);
+ }
+ print $0;
+ }' $dir/data/work/final.wordlist_counts
+ else
+ cat $dir/data/work/final.wordlist_counts
+ fi | awk '{print $2}' > $dir/data/work/wordlist
 fi
 
 order=4
 wordlist=$dir/data/work/wordlist
 
-min_counts='default=5 bn=1'
+min_counts='default=5 train_bn96=1 csr96_hub4=2,3 csr95_hub4=2,3'
 
 lm_name="`basename ${wordlist}`_${order}"
 if [ -n "${min_counts}" ]; then
- lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`"
+ lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "," "." | tr "=" "-"`"
 fi
 unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm
 
+export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
+
 if [ $stage -le 3 ]; then
- # decide on the vocabulary.
- # Note: if you have more than one order, use a certain amount of words as the - # vocab and want to restrict max memory for 'sort', echo "$0: training the unpruned LM" - train_lm.py --wordlist=$wordlist --num-splits=10 --warm-start-ratio=20 \ - --limit-unk-history=true \ - --fold-dev-into=bn \ - --min-counts="${min_counts}" \ - ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} - - get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' - #[perplexity = 157.87] over 18290.0 words + + $cmd ${unpruned_lm_dir}/log/train.log \ + train_lm.py --wordlist=$wordlist --num-splits=10 --warm-start-ratio=20 \ + --limit-unk-history=true \ + --fold-dev-into=train_bn96 \ + --min-counts="${min_counts}" \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + for x in dev96ue dev96pe eval96 eval97 eval98 eval99_1 eval99_2; do + $cmd ${unpruned_lm_dir}/log/compute_data_prob_${x}_set.log \ + get_data_prob.py ${dir}/data/${x}_set.txt ${unpruned_lm_dir} + + cat ${unpruned_lm_dir}/log/compute_data_prob_${x}_set.log | grep -F '[perplexity' + done - mkdir -p ${dir}/data/arpa - format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram.arpa.gz + # train_lm.py: Ngram counts: 190742 + 31139856 + 14766071 + 13851899 = 59948568 + # train_lm.py: You can set --bypass-metaparameter-optimization='1.000,0.007,0.000,0.002,0.000,0.006,0.003,0.000,0.000,0.000,0.001,0.002,0.002,0.000,0.000,0.000,0.003,0.000,0.000,0.604,0.187,0.044,0.012,1.000,0.490,0.026,0.001,0.732,0.328,0.281,0.218' to get equivalent results + # get_data_prob.py: log-prob of data/local/local_lm_bn_nanews_csr96/data/real_dev_set.txt given model data/local/local_lm_bn_nanews_csr96/data/wordlist_4_default-5_bn-1.pocolm was -4.9927348506 per word [perplexity = 147.338822662] over 33180.0 words. fi if [ $stage -le 4 ]; then echo "$0: pruning the LM (to larger size)" # Using 10 million n-grams for a big LM for rescoring purposes. size=10000000 - prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + $cmd ${dir}/data/lm_${order}_prune_big/log/prune_lm.log \ + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 \ + ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big - get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + for x in dev96ue dev96pe eval96 eval97 eval98 eval99_1 eval99_2; do + $cmd ${dir}/data/lm_${order}_prune_big/log/compute_data_prob_${x}_set.log \ + get_data_prob.py ${dir}/data/${x}_set.txt ${dir}/data/lm_${order}_prune_big - # current results, after adding --limit-unk-history=true: - # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_big was -5.16562818753 per word [perplexity = 175.147449465] over 18290.0 words. + cat ${dir}/data/lm_${order}_prune_big/log/compute_data_prob_${x}_set.log | grep -F '[perplexity' + done + # get_data_prob.py data/local/local_lm_bn_nanews_csr96/data/real_dev_set.txt data/local/local_lm_bn_nanews_csr96/data/lm_4_prune_big + # grep -F '[perplexity' + # get_data_prob.py: log-prob of data/local/local_lm_bn_nanews_csr96/data/real_dev_set.txt given model data/local/local_lm_bn_nanews_csr96/data/lm_4_prune_big was -5.05700399638 per word [perplexity = 157.11908113] + # over 33180.0 words. 
mkdir -p ${dir}/data/arpa format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz @@ -137,12 +197,22 @@ if [ $stage -le 5 ]; then # Using 2 million n-grams for a smaller LM for graph building. Prune from the # bigger-pruned LM, it'll be faster. size=2000000 - prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + $cmd ${dir}/data/lm_${order}_prune_small/log/prune_lm.log \ + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big \ + ${dir}/data/lm_${order}_prune_small - get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + for x in dev96ue dev96pe eval96 eval97 eval98 eval99_1 eval99_2; do + $cmd ${dir}/data/lm_${order}_prune_small/log/compute_data_prob_${x}_set.log \ + get_data_prob.py ${dir}/data/${x}_set.txt ${dir}/data/lm_${order}_prune_big + + cat ${dir}/data/lm_${order}_prune_small/log/compute_data_prob_${x}_set.log | grep -F '[perplexity' + done - # current results, after adding --limit-unk-history=true (needed for modeling OOVs and not blowing up LG.fst): - # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_4_prune_small was -5.29432352378 per word [perplexity = 199.202824404 over 18290.0 words. + # get_data_prob.py data/local/local_lm_bn_nanews_csr96/data/real_dev_set.txt data/local/local_lm_bn_nanews_csr96/data/lm_4_prune_small + # grep -F '[perplexity' + # get_data_prob.py: log-prob of data/local/local_lm_bn_nanews_csr96/data/real_dev_set.txt given model data/local/local_lm_bn_nanews_csr96/data/lm_4_prune_small was -5.27172473478 per word [perplexity = 194.751567749] over 33180.0 words. + # float-counts-to-pre-arpa: output [ 190743 673670 802551 351512 ] n-grams format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz fi From eb6fccbbf14d95a61f1d47c941ede41504a09d2e Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 10 Jan 2017 20:23:59 -0500 Subject: [PATCH 08/38] bn: Remove local/lm/text_normalization.py --- egs/bn/s5/local/lm/text_normalization.py | 42 ------------------------ 1 file changed, 42 deletions(-) delete mode 100644 egs/bn/s5/local/lm/text_normalization.py diff --git a/egs/bn/s5/local/lm/text_normalization.py b/egs/bn/s5/local/lm/text_normalization.py deleted file mode 100644 index f74da60a6ef..00000000000 --- a/egs/bn/s5/local/lm/text_normalization.py +++ /dev/null @@ -1,42 +0,0 @@ - -# Copyright 2016 Vimal Manohar -# Apache 2.0. - -"""This module contains methods for doing text normalization of broadcast news -and similar text corpora. 
-""" - -import re - - -def normalize_bn_transcript(text, noise_word, spoken_noise_word): - """Normalize broadcast news transcript for audio.""" - text.upper() - # Remove unclear speech markings - text = re.sub(r"\(\(([^)]*)\)\)", r"\1", text) - text = re.sub(r"#", "", text) # Remove overlapped speech markings - # Remove invented word markings - text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text) - text = re.sub(r"\[[^]]+\]", noise_word, text) - text = re.sub(r"\{[^}]+\}", spoken_noise_word, text) - text = re.sub(r"\+([^+]+)\+", r"\1", text) - - text1 = [] - for word in text.split(): - # Remove mispronunciation brackets - word = re.sub(r"^@(\w+)$", r"\1", word) - text1.append(word) - return " ".join(text1) - - -def remove_punctuations(text): - """Remove punctuations and some other processing for text sentence.""" - text1 = re.sub("\n", " ", text) - text1 = re.sub(r"(&[^;]+;|--)", " ", text1) - text1 = re.sub(r"''|``|\(|\)", " ", text1) - text1 = re.sub("[^A-Za-z0-9.' _-]", "", text1) - text1 = re.sub(r"\. ", " ", text1) - text1 = re.sub(r"([^0-9$-])\.([^0-9]|$)", r"\1\2", text1) - text1 = re.sub(r" - ", " ", text1) - text1 = re.sub(r"[ ]+", " ", text1) - return text1 From 351b447291e4a128eccfe66a1f590781a67b222c Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Tue, 10 Jan 2017 20:24:48 -0500 Subject: [PATCH 09/38] bn: Fix normalize_transcripts --- egs/bn/s5/local/normalize_transcripts.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/bn/s5/local/normalize_transcripts.pl b/egs/bn/s5/local/normalize_transcripts.pl index cccf75def4a..069476cbc37 100755 --- a/egs/bn/s5/local/normalize_transcripts.pl +++ b/egs/bn/s5/local/normalize_transcripts.pl @@ -37,8 +37,8 @@ $trans =~ s:\*\*([^*]+)\*\*:$1 :g; # Remove invented word markings $trans =~ s:\[[^]]+\]:$noise_word :g; $trans =~ s:\{[^}]+\}:$spoken_noise_word :g; + $trans =~ s:^[+]([^+]+)[+]$:$1:; # Remove mispronunciation brackets foreach $w (split (" ",$trans)) { - $w =~ s:^[+](.+)[+]$:$1:; # Remove mispronunciation brackets $w =~ s:^@(.*)$:$1:; # Remove best guesses for proper nouns print " $w"; } From 643881e17ce01b8eb0bf3d5059759893eca5a948 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 12 Jan 2017 15:10:11 -0500 Subject: [PATCH 10/38] bn: Updated recipe to add more LM corpora --- egs/bn/s5/local/prepare_dict.sh | 14 +++---- egs/bn/s5/local/train_lm.sh | 66 ++++++++++++++++++++++++++------- 2 files changed, 59 insertions(+), 21 deletions(-) diff --git a/egs/bn/s5/local/prepare_dict.sh b/egs/bn/s5/local/prepare_dict.sh index d0a6a6703e4..c0b2e7c0174 100755 --- a/egs/bn/s5/local/prepare_dict.sh +++ b/egs/bn/s5/local/prepare_dict.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright 2010-2012 Microsoft Corporation +# Copyright 2010-2012 Microsoft Corporation # 2012-2014 Johns Hopkins University (Author: Daniel Povey) # 2015 Guoguo Chen # 2016 Vimal Manohar @@ -30,9 +30,9 @@ # silence_phones.txt . path.sh -. cmd.sh +. cmd.sh -set -e +set -e set -o pipefail set -u @@ -78,7 +78,7 @@ if [ $stage -le 0 ]; then # really to the same base phone. cat $dir/cmudict/cmudict.0.7a.symbols | perl -ane 's:\r::; print;' | \ perl -e 'while(<>){ - chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_"; + chop; m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $phones_of{$1} .= "$_ "; } foreach $list (values %phones_of) {print $list . "\n"; } ' \ > $dir/nonsilence_phones.txt || exit 1; @@ -97,10 +97,10 @@ if [ $stage -le 0 ]; then # Add to cmudict the silences, noises etc. 
 (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> UNK'; echo '<NOISE> NSN'; ) | \
- cat - $dir/dict.cmu > $dir/lexicon2_raw.txt
+ cat - $dir/dict.cmu > $dir/lexicon2_raw.txt
 awk '{print $1}' $dir/lexicon2_raw.txt > $dir/orig_wordlist
 
- cat <<EOF >$dir/silence_phones.txt
+ cat <<EOF >$dir/silence_phones.txt
 SIL
 SPN
 NSN
@@ -137,7 +137,7 @@ for line in sys.stdin.readlines():
 for oov in oovs:
 print (oov)' $dir/orig_wordlist | sort -u > $dir/oovlist
-
+
 cat $dir/oovlist | \
 get_acronym_prons.pl $dir/lexicon2_raw.txt > $dir/dict.acronyms
 fi
diff --git a/egs/bn/s5/local/train_lm.sh b/egs/bn/s5/local/train_lm.sh
index 8f82fe397e0..6522619cf77 100755
--- a/egs/bn/s5/local/train_lm.sh
+++ b/egs/bn/s5/local/train_lm.sh
@@ -37,7 +37,7 @@ export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
 fi
 ) || exit 1;
 
-num_dev_sentences=5000
+num_dev_sentences=4500
 
 RANDOM=0
 
 if [ $stage -le 0 ]; then
@@ -48,10 +48,15 @@ if [ $stage -le 0 ]; then
 
 rm ${dir}/data/text/* 2>/dev/null || true
 
- cat data/train_bn96/text | shuf > ${dir}/train_bn96_text
- head -n $num_dev_sentences < ${dir}/train_bn96_text | cut -d ' ' -f 2- > \
- ${dir}/data/text/dev.txt
- tail -n +$[num_dev_sentences+1] < ${dir}/train_text | cut -d ' ' -f 2- > \
- ${dir}/data/text/train_bn96.txt
+ # Take unique subset to make sure that the training text is not in the
+ # dev set.
+ cat data/train_bn96/text | cut -d ' ' -f 2- | sort | uniq -c | \
+ shuf > ${dir}/train_bn96_text
+ head -n $num_dev_sentences < ${dir}/train_bn96_text | \
+ awk '{str=$2; for (i=3;i<=NF;i++) {str = str" "$i;}; for (i=0; i<$1; i++) {print str;} }' > \
+ ${dir}/data/text/dev.txt
+ tail -n +$[num_dev_sentences+1] < ${dir}/train_bn96_text | \
+ awk '{str=$2; for (i=3;i<=NF;i++) {str = str" "$i;}; for (i=0; i<$1; i++) {print str;} }' > \
+ ${dir}/data/text/train_bn96.txt
 
 # Get text from NA News corpus
 for x in data/local/data/na_news/*; do
@@ -107,7 +112,7 @@ if [ $stage -le 2 ]; then
 cat $dir/data/work/word_counts/{train_bn96,dev}.counts | \
 local/lm/merge_word_counts.py 2 > $dir/data/work/train_bn96.wordlist_counts
 
- cat $dir/data/work/word_counts/csr96_hub4_{tr,ts}.counts | \
+ cat $dir/data/work/word_counts/csr96_hub4.counts | \
 local/lm/merge_word_counts.py 5 > $dir/data/work/csr96_hub4.wordlist_counts
 
 cat $dir/data/work/word_counts/csr95_hub4.counts | \
@@ -157,16 +162,33 @@ if [ $stage -le 3 ]; then
 ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir}
 
 for x in dev96ue dev96pe eval96 eval97 eval98 eval99_1 eval99_2; do
- $cmd ${unpruned_lm_dir}/log/compute_data_prob_${x}_set.log \
- get_data_prob.py ${dir}/data/${x}_set.txt ${unpruned_lm_dir}
+ $cmd ${unpruned_lm_dir}/log/compute_data_prob_${x}.log \
+ get_data_prob.py ${dir}/data/${x}.txt ${unpruned_lm_dir}
 
- cat ${unpruned_lm_dir}/log/compute_data_prob_${x}_set.log | grep -F '[perplexity'
+ cat ${unpruned_lm_dir}/log/compute_data_prob_${x}.log | grep -F '[perplexity'
 done
 
 # train_lm.py: Ngram counts: 190742 + 31139856 + 14766071 + 13851899 = 59948568
 # train_lm.py: You can set --bypass-metaparameter-optimization='1.000,0.007,0.000,0.002,0.000,0.006,0.003,0.000,0.000,0.000,0.001,0.002,0.002,0.000,0.000,0.000,0.003,0.000,0.000,0.604,0.187,0.044,0.012,1.000,0.490,0.026,0.001,0.732,0.328,0.281,0.218' to get equivalent results
 # get_data_prob.py: log-prob of data/local/local_lm_bn_nanews_csr96/data/real_dev_set.txt given model data/local/local_lm_bn_nanews_csr96/data/wordlist_4_default-5_bn-1.pocolm was -4.9927348506 per word [perplexity = 147.338822662] over 33180.0 words.
+ + # get_data_prob.py: log-prob of data/local/local_lm/data/dev96pe.txt given model data/local/local_lm/data/wordlist_4_default-5_train_bn96-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -4.92985727862 per word [perplexity = 138.359764034] over 23760.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/dev96ue.txt given model data/local/local_lm/data/wordlist_4_default-5_train_bn96-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -4.88171588624 per word [perplexity = 131.85672102] over 18821.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/eval96.txt given model data/local/local_lm/data/wordlist_4_default-5_train_bn96-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -4.85089075845 per word [perplexity = 127.85422637] over 20625.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/eval97.txt given model data/local/local_lm/data/wordlist_4_default-5_train_bn96-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -4.84370861758 per word [perplexity = 126.939248987] over 33340.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/eval98.txt given model data/local/local_lm/data/wordlist_4_default-5_train_bn96-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -4.91000862327 per word [perplexity = 135.640584068] over 33180.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/eval99_1.txt given model data/local/local_lm/data/wordlist_4_default-5_train_bn96-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -5.03738768271 per word [perplexity = 154.067016944] over 11529.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/eval99_2.txt given model data/local/local_lm/data/wordlist_4_default-5_train_bn96-1_csr96_hub4-2.3_csr95_hub4-2.3.pocolm was -5.02574438024 per word [perplexity = 152.283570813] over 16395.0 words. + fi + for x in dev96ue dev96pe eval96 eval97 eval98 eval99_1 eval99_2; do + $cmd ${unpruned_lm_dir}/log/compute_data_prob_${x}.log \ + get_data_prob.py ${dir}/data/${x}.txt ${unpruned_lm_dir} + + cat ${unpruned_lm_dir}/log/compute_data_prob_${x}.log | grep -F '[perplexity' + done + + if [ $stage -le 4 ]; then echo "$0: pruning the LM (to larger size)" @@ -177,10 +199,10 @@ if [ $stage -le 4 ]; then ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big for x in dev96ue dev96pe eval96 eval97 eval98 eval99_1 eval99_2; do - $cmd ${dir}/data/lm_${order}_prune_big/log/compute_data_prob_${x}_set.log \ - get_data_prob.py ${dir}/data/${x}_set.txt ${dir}/data/lm_${order}_prune_big + $cmd ${dir}/data/lm_${order}_prune_big/log/compute_data_prob_${x}.log \ + get_data_prob.py ${dir}/data/${x}.txt ${dir}/data/lm_${order}_prune_big - cat ${dir}/data/lm_${order}_prune_big/log/compute_data_prob_${x}_set.log | grep -F '[perplexity' + cat ${dir}/data/lm_${order}_prune_big/log/compute_data_prob_${x}.log | grep -F '[perplexity' done # get_data_prob.py data/local/local_lm_bn_nanews_csr96/data/real_dev_set.txt data/local/local_lm_bn_nanews_csr96/data/lm_4_prune_big @@ -188,6 +210,14 @@ if [ $stage -le 4 ]; then # get_data_prob.py: log-prob of data/local/local_lm_bn_nanews_csr96/data/real_dev_set.txt given model data/local/local_lm_bn_nanews_csr96/data/lm_4_prune_big was -5.05700399638 per word [perplexity = 157.11908113] # over 33180.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/dev96pe.txt given model data/local/local_lm/data/lm_4_prune_big was -5.00197658249 per word [perplexity = 148.706800062] over 23760.0 words. 
+ # get_data_prob.py: log-prob of data/local/local_lm/data/dev96ue.txt given model data/local/local_lm/data/lm_4_prune_big was -4.95522131024 per word [perplexity = 141.914009921] over 18821.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/eval96.txt given model data/local/local_lm/data/lm_4_prune_big was -4.91668501333 per word [perplexity = 136.54920329] over 20625.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/eval97.txt given model data/local/local_lm/data/lm_4_prune_big was -4.92810468806 per word [perplexity = 138.117488385] over 33340.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/eval98.txt given model data/local/local_lm/data/lm_4_prune_big was -4.98326999699 per word [perplexity = 145.950861062] over 33180.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/eval99_1.txt given model data/local/local_lm/data/lm_4_prune_big was -5.10923357186 per word [perplexity = 165.543429098] over 11529.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/eval99_2.txt given model data/local/local_lm/data/lm_4_prune_big was -5.10475193474 per word [perplexity = 164.803183515] over 16395.0 words. + mkdir -p ${dir}/data/arpa format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz fi @@ -203,10 +233,10 @@ if [ $stage -le 5 ]; then ${dir}/data/lm_${order}_prune_small for x in dev96ue dev96pe eval96 eval97 eval98 eval99_1 eval99_2; do - $cmd ${dir}/data/lm_${order}_prune_small/log/compute_data_prob_${x}_set.log \ - get_data_prob.py ${dir}/data/${x}_set.txt ${dir}/data/lm_${order}_prune_big + $cmd ${dir}/data/lm_${order}_prune_small/log/compute_data_prob_${x}.log \ + get_data_prob.py ${dir}/data/${x}.txt ${dir}/data/lm_${order}_prune_small - cat ${dir}/data/lm_${order}_prune_small/log/compute_data_prob_${x}_set.log | grep -F '[perplexity' + cat ${dir}/data/lm_${order}_prune_small/log/compute_data_prob_${x}.log | grep -F '[perplexity' done # get_data_prob.py data/local/local_lm_bn_nanews_csr96/data/real_dev_set.txt data/local/local_lm_bn_nanews_csr96/data/lm_4_prune_small @@ -214,6 +244,14 @@ if [ $stage -le 5 ]; then # get_data_prob.py: log-prob of data/local/local_lm_bn_nanews_csr96/data/real_dev_set.txt given model data/local/local_lm_bn_nanews_csr96/data/lm_4_prune_small was -5.27172473478 per word [perplexity = 194.751567749] over 33180.0 words. # float-counts-to-pre-arpa: output [ 190743 673670 802551 351512 ] n-grams + # get_data_prob.py: log-prob of data/local/local_lm/data/dev96pe.txt given model data/local/local_lm/data/lm_4_prune_small was -5.15402161616 per word [perplexity = 173.126339858] over 23760.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/dev96ue.txt given model data/local/local_lm/data/lm_4_prune_small was -5.10689797354 per word [perplexity = 165.157237313] over 18821.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/eval96.txt given model data/local/local_lm/data/lm_4_prune_small was -5.07740442667 per word [perplexity = 160.357296176] over 20625.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/eval97.txt given model data/local/local_lm/data/lm_4_prune_small was -5.09747614277 per word [perplexity = 163.608461382] over 33340.0 words. + # get_data_prob.py: log-prob of data/local/local_lm/data/eval98.txt given model data/local/local_lm/data/lm_4_prune_small was -5.13563068716 per word [perplexity = 169.971484911] over 33180.0 words. 
+ # get_data_prob.py: log-prob of data/local/local_lm/data/eval99_1.txt given model data/local/local_lm/data/lm_4_prune_small was -5.26596417642 per word [perplexity = 193.632915104] over 11529.0 words.
+ # get_data_prob.py: log-prob of data/local/local_lm/data/eval99_2.txt given model data/local/local_lm/data/lm_4_prune_small was -5.26092885453 per word [perplexity = 192.660361662] over 16395.0 words.
+
 format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz
 fi
 
From 6f316ef4a1c5d3aab6d4cdce238fa869b9f58b6a Mon Sep 17 00:00:00 2001
From: Vimal Manohar
Date: Thu, 12 Jan 2017 15:11:23 -0500
Subject: [PATCH 11/38] bn: Updating main recipe

---
 egs/bn/s5/run.sh | 91 +++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 75 insertions(+), 16 deletions(-)

diff --git a/egs/bn/s5/run.sh b/egs/bn/s5/run.sh
index 24c47cb90ba..6ed0a913b01 100755
--- a/egs/bn/s5/run.sh
+++ b/egs/bn/s5/run.sh
@@ -13,18 +13,46 @@ set -o pipefail
 mfccdir=`pwd`/mfcc
 nj=40
 
-local/data_prep/prepare_bn_data.py --split-at-sync=false \
- /export/corpora5/LDC/LDC97S44 \
- /export/corpora/LDC/LDC97T22 data/local/data/train
+false && {
+
+# Prepare 1996 English Broadcast News Train (HUB4)
+local/data_prep/prepare_1996_bn_data.py --noise-word="<NOISE>" \
+ --spoken-noise-word="<SPOKEN_NOISE>" \
+ /export/corpora/LDC/LDC97S44 /export/corpora/LDC/LDC97T22 \
+ data/local/data/train_bn96
+
+# Prepare 1995 CSR-IV HUB4 corpus
+local/data_prep/prepare_1995_csr_hub4_corpus.sh \
+ /export/corpora5/LDC/LDC96S31/csr95_hub4/ data/local/data/csr95_hub4
 
-local/data_prep/prepare_na_news_test_corpus.sh --nj 40 --cmd "$train_cmd" \
+# Prepare North American News Text Corpus
+local/data_prep/prepare_na_news_text_corpus.sh --nj 40 --cmd "$train_cmd" \
 /export/corpora/LDC/LDC95T21 data/local/data/na_news
 
-local/data_prep/prepare_1996_csr_hub4_corpus.sh --nj 10 --cmd "$train_cmd" \
- /export/corpora/LDC/LDC98T31 data/local/data/csr96_hub4
+# Prepare North American News Text Supplement Corpus
+local/data_prep/prepare_na_news_text_supplement.sh --nj 10 --cmd "$train_cmd" \
+ /export/corpora/LDC/LDC98T30/northam_news_txt_sup data/local/data/na_news_supp
+
+# Prepare 1996 CSR HUB4 Language Model
+local/data_prep/prepare_1996_csr_hub4_lm_corpus.sh --nj 10 --cmd "$train_cmd" \
+ /export/corpora/LDC/LDC98T31/1996_csr_hub4_model data/local/data/csr96_hub4
+
+# Prepare 1996 English Broadcast News Dev and Eval (HUB4)
+local/data_prep/prepare_1996_hub4_bn_eng_dev_and_eval.sh \
+ /export/corpora/LDC/LDC97S66/1996_eng_bcast_dev_eval \
+ data/local/data/hub4_96_dev_eval
 
-local/prepare_1998_hub4_bn_eng_eval.sh /export/corpora/LDC/LDC2000S86/ \
- data/local/data/eval98
+# Prepare 1997 HUB4 English Evaluation corpus
+local/data_prep/prepare_1997_hub4_bn_eng_eval.sh \
+ /export/corpora/LDC/LDC2002S11/hub4e_97 data/local/data/eval97
+
+# Prepare 1998 HUB4 Broadcast News Evaluation English Test Material
+local/data_prep/prepare_1998_hub4_bn_eng_eval.sh \
+ /export/corpora/LDC/LDC2000S86/ data/local/data/eval98
+
+# Prepare 1999 HUB4 Broadcast News Evaluation English Test Material
+local/data_prep/prepare_1999_hub4_bn_eng_eval.sh \
+ /export/corpora5/LDC/LDC2000S88/hub4_1999 data/local/data/eval99
 
 local/format_data.sh
 
@@ -36,9 +64,9 @@ local/prepare_dict.sh --dict-suffix "_nosp" \
 utils/prepare_lang.sh data/local/dict_nosp \
 "<UNK>" data/local/lang_tmp_nosp data/lang_nosp
 
-local/format_lms.sh
+local/format_lms.sh --local-lm-dir data/local/local_lm
 
-for x in train eval98 eval98.pem; do
+for x in train dev96ue dev96pe eval96 eval96.pem eval97 eval97.pem eval98 eval98.pem eval99_1 eval99_1.pem eval99_2 eval99_2.pem; do
 this_nj=$(cat data/$x/utt2spk | wc -l)
 if [ $this_nj -gt 30 ]; then
 this_nj=30
@@ -50,6 +78,7 @@ for x in train eval98 eval98.pem; do
 steps/compute_cmvn_stats.sh data/$x exp/make_mfcc $mfccdir
 utils/fix_data_dir.sh data/$x
 done
+}
 
 utils/subset_data_dir.sh --shortest data/train 1000 data/train_1kshort
 utils/subset_data_dir.sh data/train 2000 data/train_2k
@@ -80,11 +109,41 @@ steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \
 
 utils/mkgraph.sh data/lang_nosp_test exp/tri3 exp/tri3/graph_nosp
 
-steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \
- exp/tri3/graph_nosp data/eval98.pem exp/tri3/decode_nosp_eval98.pem
-steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
- data/lang_nosp_test data/lang_nosp_test_rescore \
- data/eval98.pem exp/tri3/decode_nosp_eval98.pem \
- exp/tri3/decode_rescore_nosp_eval98.pem
+(
+for dset in eval96.pem eval97.pem; do
+ this_nj=`cat data/$dset/spk2utt | wc -l`
+ if [ $this_nj -gt 20 ]; then
+ this_nj=20
+ fi
+ steps/decode_fmllr.sh --nj $this_nj --cmd "$decode_cmd" --num-threads 4 \
+ exp/tri3/graph_nosp data/$dset exp/tri3/decode_nosp_${dset}
+ steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+ data/lang_nosp_test data/lang_nosp_test_rescore \
+ data/${dset} exp/tri3/decode_nosp_${dset} \
+ exp/tri3/decode_nosp_${dset}_rescore
+done
+) &
+
+steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
+ data/train data/lang_nosp exp/tri3 exp/tri3_ali
+
+steps/train_sat.sh --cmd "$train_cmd" 5000 100000 \
+ data/train data/lang_nosp exp/tri3_ali exp/tri4
+
+utils/mkgraph.sh data/lang_nosp_test exp/tri4 exp/tri4/graph_nosp
+
+for dset in eval96.pem eval97.pem; do
+ this_nj=`cat data/$dset/spk2utt | wc -l`
+ if [ $this_nj -gt 20 ]; then
+ this_nj=20
+ fi
+ steps/decode_fmllr.sh --nj $this_nj --cmd "$decode_cmd" --num-threads 4 \
+ exp/tri4/graph_nosp data/$dset exp/tri4/decode_nosp_${dset}
+ steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \
+ data/lang_nosp_test data/lang_nosp_test_rescore \
+ data/${dset} exp/tri4/decode_nosp_${dset} \
+ exp/tri4/decode_nosp_${dset}_rescore
+done
+wait
 
 exit 0
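A recurring idiom in the run.sh above: the number of decoding jobs is capped at the number of speakers, because the decode scripts split the data by speaker and an empty split would fail. In Python form, as a small sketch:

    def decode_jobs(spk2utt_path, max_jobs=20):
        # one line per speaker in spk2utt; never use more jobs than that
        with open(spk2utt_path) as f:
            num_speakers = sum(1 for line in f if line.strip())
        return min(num_speakers, max_jobs)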
From cc9752c115d9cecc3d2dff254f9cd9dcf333c769 Mon Sep 17 00:00:00 2001
From: Vimal Manohar
Date: Thu, 23 Mar 2017 17:37:57 -0400
Subject: [PATCH 12/38] bn: Minor fixes in BN recipe

---
 egs/bn/s5/local/run_cleanup_segmentation.sh | 29 ++++++++++++++-------
 egs/bn/s5/local/score_sclite.sh | 26 +++++++++++++++---
 egs/bn/s5/path.sh | 6 +++--
 egs/bn/s5/run.sh | 7 +++--
 4 files changed, 48 insertions(+), 20 deletions(-)

diff --git a/egs/bn/s5/local/run_cleanup_segmentation.sh b/egs/bn/s5/local/run_cleanup_segmentation.sh
index 0927b9f9a7d..2a56884446c 100755
--- a/egs/bn/s5/local/run_cleanup_segmentation.sh
+++ b/egs/bn/s5/local/run_cleanup_segmentation.sh
@@ -16,10 +16,6 @@
 # GMM Results for speaker-independent (SI) and speaker adaptive training (SAT) systems on dev and test sets
 # [will add these later].
 
-set -e
-set -o pipefail
-set -u
-
 stage=0
 cleanup_stage=0
 data=data/train
@@ -31,6 +27,11 @@ decode_num_threads=4
 
 . ./path.sh
 . ./cmd.sh
+
+set -e
+set -o pipefail
+set -u
+
 . utils/parse_options.sh
 
 cleaned_data=${data}_${cleanup_affix}
@@ -55,12 +56,16 @@ if [ $stage -le 3 ]; then
 fi
 
 if [ $stage -le 4 ]; then
- # Test with the models trained on cleaned-up data.
+ # Test with the model trained on cleaned-up data.
utils/mkgraph.sh data/lang_nosp_test ${cleaned_dir} ${cleaned_dir}/graph_nosp - for dset in eval98.pem; do + for dset in eval97.pem eval98.pem eval99_1.pem eval99_2.pem; do + this_nj=`cat data/$dset/spk2utt | wc -l` + if [ $this_nj -gt $decode_nj ]; then + this_nj=$decode_nj + fi steps/decode_fmllr.sh --nj $decode_nj --num-threads $decode_num_threads \ - --cmd "$decode_cmd" --num-threads 4 \ + --cmd "$decode_cmd" \ ${cleaned_dir}/graph_nosp data/${dset} ${cleaned_dir}/decode_nosp_${dset} steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_nosp_test data/lang_nosp_test_rescore \ data/${dset} ${cleaned_dir}/decode_nosp_${dset} ${cleaned_dir}/decode_nosp_${dset}_rescore @@ -80,12 +85,16 @@ fi cleaned_dir=exp/tri4b_${cleanup_affix} if [ $stage -le 7 ]; then - # Test with the models trained on cleaned-up data. + # Test with the larger model trained on cleaned-up data. utils/mkgraph.sh data/lang_nosp_test ${cleaned_dir} ${cleaned_dir}/graph_nosp - for dset in eval98.pem; do + for dset in eval97.pem eval98.pem eval99_1.pem eval99_2.pem; do + this_nj=`cat data/$dset/spk2utt | wc -l` + if [ $this_nj -gt $decode_nj ]; then + this_nj=$decode_nj + fi steps/decode_fmllr.sh --nj $decode_nj --num-threads $decode_num_threads \ - --cmd "$decode_cmd" --num-threads 4 \ + --cmd "$decode_cmd" \ ${cleaned_dir}/graph_nosp data/${dset} ${cleaned_dir}/decode_nosp_${dset} steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang_nosp_test data/lang_nosp_test_rescore \ data/${dset} ${cleaned_dir}/decode_nosp_${dset} ${cleaned_dir}/decode_nosp_${dset}_rescore diff --git a/egs/bn/s5/local/score_sclite.sh b/egs/bn/s5/local/score_sclite.sh index 20045c2e96b..ae372b21f04 100755 --- a/egs/bn/s5/local/score_sclite.sh +++ b/egs/bn/s5/local/score_sclite.sh @@ -8,6 +8,7 @@ min_lmwt=5 max_lmwt=17 iter=final word_ins_penalty=0.0,0.5,1.0 +resolve_ctm_overlaps=false #end configuration section. [ -f ./path.sh ] && . ./path.sh @@ -60,13 +61,30 @@ if [ $stage -le 0 ]; then lattice-1best ark:- ark:- \| \ lattice-align-words $lang/phones/word_boundary.int $model ark:- ark:- \| \ nbest-to-ctm $frame_shift_opt ark:- - \| \ - utils/int2sym.pl -f 5 $lang/words.txt \| \ - utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ - '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1; + utils/int2sym.pl -f 5 $lang/words.txt '>' \ + $dir/score_LMWT_${wip}/$name.utt_ctm || exit 1; done fi +utils/data/get_reco2utt.sh $data if [ $stage -le 1 ]; then + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + if $resolve_ctm_overlaps; then + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/resolve_ctm_overlaps.LMWT.${wip}.log \ + steps/resolve_ctm_overlaps.py $data/segments $data/reco2utt \ + $dir/score_LMWT_${wip}/$name.utt_ctm - \| \ + utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ + '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1; + else + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/convert_ctm.LMWT.${wip}.log \ + cat $dir/score_LMWT_${wip}/$name.utt_ctm \| \ + utils/convert_ctm.pl $data/segments $data/reco2file_and_channel \ + '>' $dir/score_LMWT_${wip}/$name.ctm || exit 1; + fi + done +fi + +if [ $stage -le 2 ]; then # Remove some stuff we don't want to score, from the ctm. # the big expression in parentheses contains all the things that get mapped # by the glm file, into hesitations. @@ -83,7 +101,7 @@ if [ $stage -le 1 ]; then fi # Score the set... 
-if [ $stage -le 2 ]; then
+if [ $stage -le 3 ]; then
 for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
 $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.${wip}.log \
 cp $data/stm $dir/score_LMWT_${wip}/ '&&' \
diff --git a/egs/bn/s5/path.sh b/egs/bn/s5/path.sh
index da29adb7b2a..dc878dc9c45 100755
--- a/egs/bn/s5/path.sh
+++ b/egs/bn/s5/path.sh
@@ -1,6 +1,8 @@
-export KALDI_ROOT=`pwd`/../../..
+export KALDI_ROOT=/home/vmanoha1/kaldi-diarization-v2
 export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH
 [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
 . $KALDI_ROOT/tools/config/common_path.sh
-. $KALDI_ROOT/tools/env.sh
+export PATH=/home/vmanoha1/kaldi-diarization-v2/src/ivectorbin/:$PATH
+export PATH=/home/vmanoha1/kaldi-diarization-v2/src/segmenterbin/:$PATH
+export PATH=$KALDI_ROOT/tools/sctk/bin:$PATH
 export LC_ALL=C
diff --git a/egs/bn/s5/run.sh b/egs/bn/s5/run.sh
index 6ed0a913b01..c151c693493 100755
--- a/egs/bn/s5/run.sh
+++ b/egs/bn/s5/run.sh
@@ -14,7 +14,6 @@ mfccdir=`pwd`/mfcc
 nj=40
 
 false && {
-
 # Prepare 1996 English Broadcast News Train (HUB4)
 local/data_prep/prepare_1996_bn_data.py --noise-word="<NOISE>" \
 --spoken-noise-word="<SPOKEN_NOISE>" \
@@ -78,7 +77,6 @@ for x in train dev96ue dev96pe eval96 eval96.pem eval97 eval97.pem eval98 eval98
 steps/compute_cmvn_stats.sh data/$x exp/make_mfcc $mfccdir
 utils/fix_data_dir.sh data/$x
 done
-}
 
 utils/subset_data_dir.sh --shortest data/train 1000 data/train_1kshort
 utils/subset_data_dir.sh data/train 2000 data/train_2k
@@ -110,7 +108,7 @@ steps/train_sat.sh --cmd "$train_cmd" 4200 40000 \
 utils/mkgraph.sh data/lang_nosp_test exp/tri3 exp/tri3/graph_nosp
 
 (
-for dset in eval96.pem eval97.pem; do
+for dset in eval97.pem; do
 this_nj=`cat data/$dset/spk2utt | wc -l`
 if [ $this_nj -gt 20 ]; then
 this_nj=20
@@ -131,8 +129,9 @@ steps/train_sat.sh --cmd "$train_cmd" 5000 100000 \
 data/train data/lang_nosp exp/tri3_ali exp/tri4
 
 utils/mkgraph.sh data/lang_nosp_test exp/tri4 exp/tri4/graph_nosp
+}
 
-for dset in eval96.pem eval97.pem; do
+for dset in eval97.pem; do
 this_nj=`cat data/$dset/spk2utt | wc -l`
 if [ $this_nj -gt 20 ]; then
 this_nj=20
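Patch 12's score_sclite.sh changes (above) optionally route the per-utterance CTM through steps/resolve_ctm_overlaps.py before converting it to recording-level CTM, so that words hypothesized twice in the overlapped regions of adjacent segments are not scored twice. That script is not shown here; a much-simplified sketch of one usual strategy — cut at the midpoint of the overlap and keep each word on the side where its own midpoint falls — would be:

    def resolve_overlap(ctm_a, ctm_b, overlap_start, overlap_end):
        # ctm_* are lists of (start_time, duration, word) for two
        # overlapping segments; a simplified illustration only, not
        # the actual script's logic
        cut = (overlap_start + overlap_end) / 2.0
        keep_a = [e for e in ctm_a if e[0] + e[1] / 2.0 <= cut]
        keep_b = [e for e in ctm_b if e[0] + e[1] / 2.0 > cut]
        return keep_a + keep_b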
From 634d030bd88ff1533ef1a052c6b1e608cb11dadc Mon Sep 17 00:00:00 2001
From: Vimal Manohar
Date: Fri, 3 Nov 2017 12:49:07 -0400
Subject: [PATCH 13/38] HUB4 train preparation scripts

---
 .../s5/local/data_prep/format_1996_bn_data.pl | 131 +++++++++
 .../s5/local/data_prep/format_1997_bn_data.pl | 1 +
 .../data_prep/normalize_bn_transcript.py | 43 +++
 egs/bn/s5/local/data_prep/parse_sgm.pl | 275 ++++++++++++++++++
 .../local/data_prep/prepare_1996_bn_data.sh | 44 +++
 .../local/data_prep/prepare_1997_bn_data.sh | 44 +++
 .../prepare_na_news_text_supplement.sh | 0
 7 files changed, 538 insertions(+)
 create mode 100755 egs/bn/s5/local/data_prep/format_1996_bn_data.pl
 create mode 120000 egs/bn/s5/local/data_prep/format_1997_bn_data.pl
 create mode 100755 egs/bn/s5/local/data_prep/normalize_bn_transcript.py
 create mode 100755 egs/bn/s5/local/data_prep/parse_sgm.pl
 create mode 100755 egs/bn/s5/local/data_prep/prepare_1996_bn_data.sh
 create mode 100755 egs/bn/s5/local/data_prep/prepare_1997_bn_data.sh
 mode change 100644 => 100755 egs/bn/s5/local/data_prep/prepare_na_news_text_supplement.sh

diff --git a/egs/bn/s5/local/data_prep/format_1996_bn_data.pl b/egs/bn/s5/local/data_prep/format_1996_bn_data.pl
new file mode 100755
index 00000000000..84913e9a8b0
--- /dev/null
+++ b/egs/bn/s5/local/data_prep/format_1996_bn_data.pl
@@ -0,0 +1,131 @@
+#!/usr/bin/env perl
+#===============================================================================
+# Copyright (c) 2017 Johns Hopkins University
+# (Author: Jan "Yenda" Trmal )
+# 2017 Vimal Manohar
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+use strict;
+use warnings;
+use utf8;
+use List::Util qw(max);
+
+my $audio_width=1;
+my $speaker_width=1;
+my $time_width=1;
+
+binmode(STDOUT, ":utf8");
+binmode(STDERR, ":utf8");
+
+if (@ARGV != 3) {
+ print STDERR "$0: Error: Unsupported number of arguments: " . scalar @ARGV ."\n";
+ print STDERR " Usage: $0 <audio-files-list> <transcripts-file> <out-dir>\n";
+ print STDERR " where\n";
+ print STDERR " <audio-files-list> is a file containing list of audio files\n";
+ print STDERR " (single absolute path name per line)\n";
+ print STDERR " <transcripts-file> is a file containing transcripts\n";
+ print STDERR " obtained by processing the official SGML format\n";
+ print STDERR " transcripts. See parse_sgm.pl for further info.\n";
+ print STDERR " <out-dir> target directory (should already exist)\n";
+ print STDERR " See also: local/parse_sgm.pl\n";
+ die;
+}
+
+my $audio_files = $ARGV[0];
+my $transcripts = $ARGV[1];
+my $out = $ARGV[2];
+
+my %AUDIO;
+open(my $audio_f, "<", $audio_files)
+ or die "$0: Error: Could not open $audio_files: $!\n";
+while(my $line = <$audio_f>) {
+ chomp $line;
+ (my $basename = $line) =~ s/.*\/([^\/]+).sph/$1/g;
+ $basename =~ s/_$//g;
+ $AUDIO{$basename} = $line;
+}
+close($audio_f);
+
+my %TRANSCRIPT;
+open(my $transcript_f, "<:encoding(utf-8)", $transcripts)
+ or die "$0: Error: Could not open $transcripts: $!\n";
+while(my $line = <$transcript_f>) {
+ chomp $line;
+ my @F = split / /, $line, 8;
+ push @{$TRANSCRIPT{$F[0]}}, \@F;
+
+ my $f1 = $F[0];
+ my $f2 = $F[1];
+ my $speaker = $F[2];
+ my $t1 = $F[5];
+ my $t2 = $F[6];
+
+ $time_width = max $time_width, length($t1), length($t2);
+ $speaker_width = max $speaker_width, length($speaker);
+ $audio_width = max $audio_width, length($f1);
+}
+close($transcript_f);
+#print Dumper(\%TRANSCRIPT);
+
+print STDERR $time_width . " " . $speaker_width . " " . $audio_width . "\n";
+
+my $sph2pipe = `which sph2pipe` or do {
+ die "$0: Error: sph2pipe is not installed. Did you run make in the tools/ directory?\n";
+};
+chomp $sph2pipe;
+
+open(my $wav_file, ">", "$out/wav.scp")
+ or die "$0: Error: Cannot create file $out/wav.scp: $!\n";
+open(my $text_file, ">:encoding(utf-8)", "$out/text")
+ or die "$0: Error: Cannot create file $out/text: $!\n";
+open(my $segments_file, ">", "$out/segments")
+ or die "$0: Error: Cannot create file $out/segments: $!\n";
+open(my $spk_file, ">", "$out/utt2spk")
+ or die "$0: Error: Cannot create file $out/utt2spk: $!\n";
+
+foreach my $file (sort keys %AUDIO) {
+ unless (exists $TRANSCRIPT{$file}) {
+ print STDERR "$0: Error: $file does not exist in transcripts!\n";
+ next;
+ }
+ my $transcripts = $TRANSCRIPT{$file};
+
+ #my $file_fmt = sprintf("%0${audio_width}s", $file);
+ my $file_fmt = sprintf("%s", $file);
+
+ print $wav_file "$file_fmt $sph2pipe -f wav $AUDIO{$file}|\n";
+
+ foreach my $utt (@{$transcripts}) {
+ my $start = $utt->[5] + 0.0;
+ my $end = $utt->[6] + 0.0;
+ if ($end - $start < 0.005) { # remove very short segments
+ next;
+ }
+ my $start_time = sprintf("%0${time_width}d", $utt->[5]*1000);
+ my $end_time = sprintf("%0${time_width}d", $utt->[6]*1000);
+ my $spk = sprintf("%0${speaker_width}s", $utt->[2]);
+ # my $spk = sprintf("%s", $utt->[2]);
+ my $spkid = "${file_fmt}_${spk}";
+ my $uttid = "${file_fmt}_${spk}_${start_time}_${end_time}";
+
+ print $text_file "$uttid $utt->[7]\n";
+ print $spk_file "$uttid $spkid\n";
+ print $segments_file "$uttid $file_fmt $start $end\n";
+ }
+}
+
+close($wav_file);
+close($text_file);
+close($segments_file);
+close($spk_file);
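The sprintf calls in format_1996_bn_data.pl above zero-pad the millisecond start/end times (and the speaker label) to a fixed width before embedding them in the utterance id, so that the C-locale string sort used throughout Kaldi also orders utterances by time. The same construction in Python, as an illustration only (the width and separator are assumptions):

    def utterance_id(reco, spk, start, end, width=7):
        # times in zero-padded milliseconds, e.g. 12.34s -> '0012340'
        start_ms = '{0:0{1}d}'.format(int(start * 1000), width)
        end_ms = '{0:0{1}d}'.format(int(end * 1000), width)
        return '{0}_{1}_{2}_{3}'.format(reco, spk, start_ms, end_ms)

    # utterance_id('ed960528', 'spk1', 12.34, 15.6)
    # -> 'ed960528_spk1_0012340_0015600'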
diff --git a/egs/bn/s5/local/data_prep/format_1997_bn_data.pl b/egs/bn/s5/local/data_prep/format_1997_bn_data.pl
new file mode 120000
index 00000000000..844c16bbe06
--- /dev/null
+++ b/egs/bn/s5/local/data_prep/format_1997_bn_data.pl
@@ -0,0 +1 @@
+format_1996_bn_data.pl
\ No newline at end of file
diff --git a/egs/bn/s5/local/data_prep/normalize_bn_transcript.py b/egs/bn/s5/local/data_prep/normalize_bn_transcript.py
new file mode 100755
index 00000000000..1f7367438f4
--- /dev/null
+++ b/egs/bn/s5/local/data_prep/normalize_bn_transcript.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+import re, sys
+
+def main():
+ if len(sys.argv) != 3:
+ sys.stderr.write("{0} <noise-word> <spoken-noise-word> "
+ "< text_file > out_text_file\n".format(sys.argv[0]))
+ sys.exit(1)
+
+ noise_word = sys.argv[1]
+ spoken_noise_word = sys.argv[2]
+
+ for line in sys.stdin.readlines():
+ parts = line.strip().split()
+ normalized_text = normalize_bn_transcript(
+ ' '.join(parts[1:]), noise_word, spoken_noise_word)
+ print ("{0} {1}".format(parts[0], normalized_text))
+
+
+def normalize_bn_transcript(text, noise_word, spoken_noise_word):
+ """Normalize broadcast news transcript for audio."""
+ text = text.upper()
+ # Remove unclear speech markings
+ text = re.sub(r"\(\(([^)]*)\)\)", r"\1", text)
+ text = re.sub(r"#", "", text) # Remove overlapped speech markings
+ # Remove invented word markings
+ text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text)
+ text = re.sub(r"\[[^]]+\]", noise_word, text)
+ text = re.sub(r"\{[^}]+\}", spoken_noise_word, text)
+ # Remove mispronunciation brackets
+ text = re.sub(r"\+([^+]+)\+", r"\1", text)
+
+ text1 = []
+ for word in text.split():
+ # Remove best guesses for proper nouns
+ word = re.sub(r"^@(\w+)$", r"\1", word)
+ text1.append(word)
+ return " ".join(text1)
+
+
+if __name__ == "__main__":
+ main()
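A quick usage example for normalize_bn_transcript() as just defined, assuming the markup follows the LDC HUB4 transcription conventions (((...)) unclear speech, **...** invented words, [...] noise, {...} spoken noise, +...+ mispronunciations, @ best-guess proper nouns):

    line = "((i think)) **gorbachev** [laughter] he's +really+ @Moscow bound"
    print(normalize_bn_transcript(line, "<NOISE>", "<SPOKEN_NOISE>"))
    # -> I THINK GORBACHEV <NOISE> HE'S REALLY MOSCOW BOUND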
@@ -0,0 +1,275 @@
+#!/usr/bin/env perl
+#===============================================================================
+# Copyright (c) 2017  Johns Hopkins University (Author: Jan "Yenda" Trmal <jtrmal@gmail.com>)
+#                     2017  Vimal Manohar
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+use strict;
+use warnings;
+use utf8;
+
+# A bare "require ... or die" never reaches the die (require itself throws),
+# so probe for the CPAN module inside an eval.
+eval { require HTML::Parser; 1 } or die "This script needs HTML::Parser from CPAN\n";
+HTML::Parser->import();
+
+binmode(STDOUT, ":utf8");
+
+sub trim { my $s = shift; $s =~ s/^\s+|\s+$//g; return $s };
+
+sub parse_sgml_tag {
+  my $tag = shift(@_);
+  my %ret;
+
+  if ($tag !~ /=/) {
+    return %ret;
+  }
+
+  $tag =~ s/<[a-zA-Z]+ //;
+  $tag =~ s/> *$//;
+  #print $tag . "\n";
+
+  my @key_value_pairs = split / *,? +/, $tag;
+  for my $entry (@key_value_pairs) {
+    (my $key, my $value) = split '=', $entry, 2;
+    $ret{$key} = $value;
+  }
+  return %ret;
+}
+
+if (@ARGV != 1) {
+  print STDERR "$0: This script needs exactly one parameter (list of SGML files)\n";
+  print STDERR "  Usage: $0 <sgml-file-list>\n";
+  print STDERR "  where\n";
+  print STDERR "    <sgml-file-list> is a file containing the official SGML format\n";
+  print STDERR "    transcripts. The files are parsed and the parsed representation\n";
+  print STDERR "    is dumped to STDOUT, one utterance plus the additional data fields\n";
+  print STDERR "    per line (we dump all the fields, but not all fields are used\n";
+  print STDERR "    in the recipe).\n";
+  die;
+}
+my $filelist=$ARGV[0];
+
+my $p = HTML::Parser->new();
+
+my @files=();
+open(F, '<', $filelist) or die "Could not open file $filelist: $!\n";
+while(<F>) {
+  chomp;
+  push @files, $_;
+}
+
+foreach my $file (@files) {
+  my $reporter="";
+  my $start = -1;
+  my $end = -1;
+  my $segment_start = -1;
+  my $segment_end = -1;
+  my $segment_speaker;
+  my $segment_fidelity = "XXX";
+  my $segment_mode = "XXX";
+  my $section_start = -1;
+  my $section_end = -1;
+  my $filename = "";
+  my $seq = 0;
+  my @text = ();
+  my $time;
+  my @tagqueue;
+
+  my $sgml_file = `basename $file`;
+  $sgml_file = trim $sgml_file;
+  $sgml_file =~ s/\.txt$//g;
+  $sgml_file =~ s/\.sgml$//g;
+  $sgml_file =~ s/_$//g;
+
+  open(my $f, '<:encoding(iso-8859-1)', $file) or die "Could not open file $file: $!\n";
+
+  while(my $line = <$f>) {
+    chomp $line;
+    $line = trim $line;
+    $line = lc $line;
+    next unless $line;
+
+    if ($line =~ /<\/episode/) {
+      my $p = pop @tagqueue;
+      $line =~ s/<\/(.*)>/$1/g;
+      $line = trim $line;
+      die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line;
+      #print "ES: $line\n";
+      ;
+    } elsif ($line =~ /<\/section/) {
+      my $p = pop @tagqueue;
+      $line =~ s/<\/(.*)>/$1/g;
+      $line = trim $line;
+      die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line;
+      #print "ES: $line\n";
+      ;
+    } elsif ($line =~ /<\/turn/) {
+      my $p = pop @tagqueue;
+      $line =~ s/<\/(.*)>/$1/g;
+      $line = trim $line;
+      die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line;
+
+      #print join(" ", @text) . "\n" if @text > 0;
+      my $new_time = $segment_end;
+      if (@text > 0) {
+        print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time ";
+        print join(" ", @text) . "\n";
+      }
+      @text = ();
+      $time = 0;
+      $segment_speaker = "XXX";
+      $segment_start = "XXX";
+      $segment_end = "XXX";
+      $segment_fidelity = "XXX";
+      $segment_mode = "XXX";
+      #print "ET: $line\n";
+      ;
+    } elsif ($line =~ /<\/segment/) {
+      my $p = pop @tagqueue;
+      $line =~ s/<\/(.*)>/$1/g;
+      $line = trim $line;
+      die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line;
+
+      #print join(" ", @text) . "\n" if @text > 0;
+      my $new_time = $segment_end;
+      if (@text > 0) {
+        print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time ";
+        print join(" ", @text) . "\n";
+      }
+      @text = ();
+      $time = 0;
+      $segment_speaker = "XXX";
+      $segment_start = "XXX";
+      $segment_end = "XXX";
+      $segment_fidelity = "XXX";
+      $segment_mode = "XXX";
+      #print "ET: $line\n";
+      ;
+    } elsif ($line =~ /<sync/) {
+      # a sync tag marks a time boundary inside the current turn/segment;
+      # flush the words accumulated since the previous boundary
+      my %tags = parse_sgml_tag($line);
+      my $new_time = $tags{time};
+      if (@text > 0) {
+        print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time ";
+        print join(" ", @text) . "\n";
+      }
+      @text = ();
+      $time = $new_time;
+      ;
+    } elsif ($line =~ /<\/sync/) {
+      #print $line;
+      ;
+    } elsif ($line =~ /