Tn tts e (#3988)

* refactor serial class
  Signed-off-by: ekmb <[email protected]>
* add "and" for cardinal part
  Signed-off-by: ekmb <[email protected]>
* move "and" to the cardinal tagger
  Signed-off-by: ekmb <[email protected]>
* fix tests
  Signed-off-by: ekmb <[email protected]>
* grammar folder on jenkins updated
  Signed-off-by: ekmb <[email protected]>
* resolve merge conflicts, update test cases
  Signed-off-by: ekmb <[email protected]>
* jenkins
  Signed-off-by: ekmb <[email protected]>

Co-authored-by: Yang Zhang <[email protected]>
NVIDIA · Apr 15, 2022 · e4ee26b · e4ee26b
1 parent 0be1e94
commit e4ee26b
Show file tree

Hide file tree

Showing 36 changed files with 460 additions and 407 deletions.
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -124,18 +124,18 @@ pipeline {
       parallel {
         stage('En TN grammars') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py "1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py "1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1'
           }
         }
         stage('En ITN grammars') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en "twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en "twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1'
           }
         }
         stage('Test En non-deterministic TN & Run all En TN/ITN tests (restore grammars from cache)') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14'
-            sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1'
+            sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1'
           }
         }
       }
@@ -152,17 +152,17 @@ pipeline {
       parallel {
         stage('L2: Eng TN') {
           steps {
-            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14 --language=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1'
+            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1 --language=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1'
             sh 'cd nemo_text_processing/text_normalization/ &&  python run_predict.py --input=/home/TestData/nlp/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output=/home/TestData/nlp/text_norm/output/test.pynini.txt --verbose'
             sh 'cat /home/TestData/nlp/text_norm/output/test.pynini.txt'
-            sh 'cmp --silent /home/TestData/nlp/text_norm/output/test.pynini.txt /home/TestData/nlp/text_norm/ci/test_goal_py_12-10.txt || exit 1'
+            sh 'cmp --silent /home/TestData/nlp/text_norm/output/test.pynini.txt /home/TestData/nlp/text_norm/ci/test_goal_py_04-14.txt || exit 1'
             sh 'rm -rf /home/TestData/nlp/text_norm/output/*'
           }
         }
 
         stage('L2: Eng ITN export') {
           steps {
-            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14 --language=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1'
+            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1 --language=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1'
             sh 'cd nemo_text_processing/inverse_text_normalization/ &&  python run_predict.py --input=/home/TestData/nlp/text_denorm/ci/test.txt --language=en --output=/home/TestData/nlp/text_denorm/output/test.pynini.txt --verbose'
             sh 'cmp --silent /home/TestData/nlp/text_denorm/output/test.pynini.txt /home/TestData/nlp/text_denorm/ci/test_goal_py.txt || exit 1'
             sh 'rm -rf /home/TestData/nlp/text_denorm/output/*'
@@ -171,23 +171,23 @@ pipeline {
         stage('L2: TN with Audio (audio and raw text)') {
           steps {
             sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14 --text "The total amounts to \\$4.76." \
+            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1 --text "The total amounts to \\$4.76." \
             --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_raw.txt 2>&1 && \
             cmp --silent /tmp/out_raw.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
           }
         }
         stage('L2: TN with Audio (audio and text file)') {
           steps {
             sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14 --text /home/TestData/nlp/text_norm/audio_based/text.txt \
+            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1 --text /home/TestData/nlp/text_norm/audio_based/text.txt \
             --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_file.txt 2>&1 && \
             cmp --silent /tmp/out_file.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
           }
         }
         stage('L2: TN with Audio (manifest)') {
           steps {
             sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14'
+            python normalize_with_audio.py --language=en --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1'
           }
         }
       }

diff --git a/nemo_text_processing/text_normalization/data_loader_utils.py b/nemo_text_processing/text_normalization/data_loader_utils.py
@@ -237,7 +237,6 @@ def pre_process(text: str) -> str:
 
     # remove extra space
     text = re.sub(r' +', ' ', text)
-    text = re.sub(r'(^|\s)(&|#|@)(\w)', r'\1\2 \3', text)
     return text
 
 

diff --git a/nemo_text_processing/text_normalization/en/data/time_suffix.tsv b/nemo_text_processing/text_normalization/en/data/time_suffix.tsv
@@ -1,12 +1,12 @@
-p.m.	p m
-p.m	p m
-pm	p m
-P.M.	P M
-P.M	P M
-PM	P M
-a.m.	a m
-a.m	a m
-am	a m
-A.M.	A M
-A.M	A M
-AM	A M
+p.m.	PM
+p.m	PM
+pm	PM
+P.M.	PM
+P.M	PM
+PM	PM
+a.m.	AM
+a.m	AM
+am	AM
+A.M.	AM
+A.M	AM
+AM	AM
diff --git a/nemo_text_processing/text_normalization/en/data/time_zone.tsv b/nemo_text_processing/text_normalization/en/data/time_zone.tsv
@@ -1,14 +1,14 @@
-cst	c s t
-c.s.t	c s t
-cet	c e t
-c.e.t	c e t
-pst	p s t
-p.s.t	p s t
-est	e s t
-e.s.t	e s t
-pt	p t
-p.t	p t
-et	e t
-e.t	e t
-gmt	g m t
-g.m.t	g m t
+cst	CST
+c.s.t	CST
+cet	CET
+c.e.t	CET
+pst	PST
+p.s.t	PST
+est	EST
+e.s.t	EST
+pt	PT
+p.t	PT
+et	ET
+e.t	ET
+gmt	GMT
+g.m.t	GMT
diff --git a/nemo_text_processing/text_normalization/en/data/whitelist_symbols.tsv b/nemo_text_processing/text_normalization/en/data/whitelist_symbols.tsv
@@ -0,0 +1,21 @@
+&	and
+#	hash
+@	at
+§	section
+™	trademark
+®	registered trademark
+©	copyright
+_	underscore
+%	percent sign
+*	asterisk
++	plus
+/	slash
+=	equal sign
+^	circumflex
+|	vertical bar
+~	tilde
+$	dollar
+£	pound
+€	euro
+₩	won
+¥	yen
diff --git a/nemo_text_processing/text_normalization/en/data/whitelist_tts.tsv b/nemo_text_processing/text_normalization/en/data/whitelist_tts.tsv
@@ -1,12 +1,11 @@
 Ph.D.	PHD
 Hon.	honorable
-&	and
 Mt.	Mount
 Maj.	Major
 Rev.	Reverend
-#	hash
-@	at
-§	section
+w/o	without
+a/c number	account number
+c/o	care of
 Gov.	governor
 vs.	versus
 vs	versus
@@ -23,6 +22,9 @@ World War II	World War two
 etc.	etcetera.
 SnO2	tin four oxide
 dept	department
+HVAC	H-vac
+SPDR	spider
+ZIP	zip
 κ	kappa
 ω	omega
 α	alpha
@@ -74,18 +76,24 @@ dept	department
 Ψ	psi
 ζ	zeta
 Ζ	zeta
-_	underscore
-%	percent sign
-*	asterisk
-+	plus
-/	slash
-=	equal sign
-^	circumflex
-|	vertical bar
-~	tilde
 ltd	limited
 int'l	international
-$	dollar
+A. D	AD
+A.D	AD
+a. d	AD
+a.d	AD
+a. d.	AD
+a.d.	AD
+B. C	BC
+B.C	BC
+b. c	BC
+b.c	BC
+A. D.	AD
+A.D.	AD
+B. C.	BC
+B.C.	BC
+b. c.	BC
+b.c.	BC
 A. A.	a a
 A.A.	AA
 A&A	A and A
@@ -110,6 +118,7 @@ Abz	ABZ
 A&C	A and C
 A. C.	AC
 A.C.	AC
+A/C	AC
 acac	ACAC
 Acad	ACAD
 ACC&S	ACC and S
@@ -1462,6 +1471,7 @@ fx	FX
 Fyb	FYB
 F.Y.	FY
 Fyn	FYN
+fyi	FYI
 fyr	FYR
 Fyw	FYW
 F.Z.	FZ
@@ -3144,6 +3154,7 @@ T.A.	TA
 T&A	T and A
 TBCs	TBC's
 TBMs	TBM's
+tbh	TBH
 T. B.	TB
 T.B.	TB
 TCiAP	TCIAP

diff --git a/nemo_text_processing/text_normalization/en/data/year_suffix.tsv b/nemo_text_processing/text_normalization/en/data/year_suffix.tsv
@@ -1,8 +1,16 @@
-A. D	a d
-A.D	a d
-B. C	b c
-B.C	b c
-A. D.	a d
-A.D.	a d
-B. C.	b c
-B.C.	b c
+A. D	AD
+A.D	AD
+a. d	AD
+a.d	AD
+a. d.	AD
+a.d.	AD
+B. C	BC
+B.C	BC
+b. c	BC
+b.c	BC
+A. D.	AD
+A.D.	AD
+B. C.	BC
+B.C.	BC
+b. c.	BC
+b.c.	BC