Skip to content

Commit

Permalink
Tn tts e (#3988)
Browse files Browse the repository at this point in the history
* refactor serial class

Signed-off-by: ekmb <[email protected]>

* add "and" for cardinal part

Signed-off-by: ekmb <[email protected]>

* move "and" to the cardinal tagger

Signed-off-by: ekmb <[email protected]>

* fix tests

Signed-off-by: ekmb <[email protected]>

* grammar folder on jenkins updated

Signed-off-by: ekmb <[email protected]>

* resolve merge conflicts, update test cases

Signed-off-by: ekmb <[email protected]>

* jenkins

Signed-off-by: ekmb <[email protected]>

Co-authored-by: Yang Zhang <[email protected]>
  • Loading branch information
ekmb and yzhang123 authored Apr 15, 2022
1 parent 0be1e94 commit e4ee26b
Show file tree
Hide file tree
Showing 36 changed files with 460 additions and 407 deletions.
20 changes: 10 additions & 10 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -124,18 +124,18 @@ pipeline {
parallel {
stage('En TN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py "1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14'
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py "1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1'
}
}
stage('En ITN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en "twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14'
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en "twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1'
}
}
stage('Test En non-deterministic TN & Run all En TN/ITN tests (restore grammars from cache)') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14'
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14'
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1'
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1'
}
}
}
Expand All @@ -152,17 +152,17 @@ pipeline {
parallel {
stage('L2: Eng TN') {
steps {
sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14 --language=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1'
sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1 --language=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1'
sh 'cd nemo_text_processing/text_normalization/ && python run_predict.py --input=/home/TestData/nlp/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output=/home/TestData/nlp/text_norm/output/test.pynini.txt --verbose'
sh 'cat /home/TestData/nlp/text_norm/output/test.pynini.txt'
sh 'cmp --silent /home/TestData/nlp/text_norm/output/test.pynini.txt /home/TestData/nlp/text_norm/ci/test_goal_py_12-10.txt || exit 1'
sh 'cmp --silent /home/TestData/nlp/text_norm/output/test.pynini.txt /home/TestData/nlp/text_norm/ci/test_goal_py_04-14.txt || exit 1'
sh 'rm -rf /home/TestData/nlp/text_norm/output/*'
}
}

stage('L2: Eng ITN export') {
steps {
sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14 --language=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1'
sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1 --language=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1'
sh 'cd nemo_text_processing/inverse_text_normalization/ && python run_predict.py --input=/home/TestData/nlp/text_denorm/ci/test.txt --language=en --output=/home/TestData/nlp/text_denorm/output/test.pynini.txt --verbose'
sh 'cmp --silent /home/TestData/nlp/text_denorm/output/test.pynini.txt /home/TestData/nlp/text_denorm/ci/test_goal_py.txt || exit 1'
sh 'rm -rf /home/TestData/nlp/text_denorm/output/*'
Expand All @@ -171,23 +171,23 @@ pipeline {
stage('L2: TN with Audio (audio and raw text)') {
steps {
sh 'cd nemo_text_processing/text_normalization && \
python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14 --text "The total amounts to \\$4.76." \
python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1 --text "The total amounts to \\$4.76." \
--audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_raw.txt 2>&1 && \
cmp --silent /tmp/out_raw.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
}
}
stage('L2: TN with Audio (audio and text file)') {
steps {
sh 'cd nemo_text_processing/text_normalization && \
python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14 --text /home/TestData/nlp/text_norm/audio_based/text.txt \
python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1 --text /home/TestData/nlp/text_norm/audio_based/text.txt \
--audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_file.txt 2>&1 && \
cmp --silent /tmp/out_file.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
}
}
stage('L2: TN with Audio (manifest)') {
steps {
sh 'cd nemo_text_processing/text_normalization && \
python normalize_with_audio.py --language=en --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14'
python normalize_with_audio.py --language=en --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/4-14-1'
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,6 @@ def pre_process(text: str) -> str:

# remove extra space
text = re.sub(r' +', ' ', text)
text = re.sub(r'(^|\s)(&|#|@)(\w)', r'\1\2 \3', text)
return text


Expand Down
24 changes: 12 additions & 12 deletions nemo_text_processing/text_normalization/en/data/time_suffix.tsv
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
p.m. p m
p.m p m
pm p m
P.M. P M
P.M P M
PM P M
a.m. a m
a.m a m
am a m
A.M. A M
A.M A M
AM A M
p.m. PM
p.m PM
pm PM
P.M. PM
P.M PM
PM PM
a.m. AM
a.m AM
am AM
A.M. AM
A.M AM
AM AM
28 changes: 14 additions & 14 deletions nemo_text_processing/text_normalization/en/data/time_zone.tsv
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
cst c s t
c.s.t c s t
cet c e t
c.e.t c e t
pst p s t
p.s.t p s t
est e s t
e.s.t e s t
pt p t
p.t p t
et e t
e.t e t
gmt g m t
g.m.t g m t
cst CST
c.s.t CST
cet CET
c.e.t CET
pst PST
p.s.t PST
est EST
e.s.t EST
pt PT
p.t PT
et ET
e.t ET
gmt GMT
g.m.t GMT
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
& and
# hash
@ at
§ section
™ trademark
® registered trademark
© copyright
_ underscore
% percent sign
* asterisk
+ plus
/ slash
= equal sign
^ circumflex
| vertical bar
~ tilde
$ dollar
£ pound
€ euro
₩ won
¥ yen
39 changes: 25 additions & 14 deletions nemo_text_processing/text_normalization/en/data/whitelist_tts.tsv
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
Ph.D. PHD
Hon. honorable
& and
Mt. Mount
Maj. Major
Rev. Reverend
# hash
@ at
§ section
w/o without
a/c number account number
c/o care of
Gov. governor
vs. versus
vs versus
Expand All @@ -23,6 +22,9 @@ World War II World War two
etc. etcetera.
SnO2 tin four oxide
dept department
HVAC H-vac
SPDR spider
ZIP zip
κ kappa
ω omega
α alpha
Expand Down Expand Up @@ -74,18 +76,24 @@ dept department
Ψ psi
ζ zeta
Ζ zeta
_ underscore
% percent sign
* asterisk
+ plus
/ slash
= equal sign
^ circumflex
| vertical bar
~ tilde
ltd limited
int'l international
$ dollar
A. D AD
A.D AD
a. d AD
a.d AD
a. d. AD
a.d. AD
B. C BC
B.C BC
b. c BC
b.c BC
A. D. AD
A.D. AD
B. C. BC
B.C. BC
b. c. BC
b.c. BC
A. A. a a
A.A. AA
A&A A and A
Expand All @@ -110,6 +118,7 @@ Abz ABZ
A&C A and C
A. C. AC
A.C. AC
A/C AC
acac ACAC
Acad ACAD
ACC&S ACC and S
Expand Down Expand Up @@ -1462,6 +1471,7 @@ fx FX
Fyb FYB
F.Y. FY
Fyn FYN
fyi FYI
fyr FYR
Fyw FYW
F.Z. FZ
Expand Down Expand Up @@ -3144,6 +3154,7 @@ T.A. TA
T&A T and A
TBCs TBC's
TBMs TBM's
tbh TBH
T. B. TB
T.B. TB
TCiAP TCIAP
Expand Down
24 changes: 16 additions & 8 deletions nemo_text_processing/text_normalization/en/data/year_suffix.tsv
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
A. D a d
A.D a d
B. C b c
B.C b c
A. D. a d
A.D. a d
B. C. b c
B.C. b c
A. D AD
A.D AD
a. d AD
a.d AD
a. d. AD
a.d. AD
B. C BC
B.C BC
b. c BC
b.c BC
A. D. AD
A.D. AD
B. C. BC
B.C. BC
b. c. BC
b.c. BC
Loading

0 comments on commit e4ee26b

Please sign in to comment.