Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[TTS/TN/G2P] Remove Text Processing from NeMo, move G2P to TTS #5982

Merged
merged 26 commits into main from g2p_to_tts
Feb 15, 2023
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
2966c2c
remove TN
ekmb Feb 10, 2023
730b497
Merge branch 'main' into g2p_to_tts
ekmb Feb 10, 2023
c023aca
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 10, 2023
e7019b7
fix imports
ekmb Feb 10, 2023
3921020
Merge branch 'g2p_to_tts' of https://github.com/NVIDIA/NeMo into g2p_…
ekmb Feb 10, 2023
312b966
Merge branch 'main' into g2p_to_tts
ekmb Feb 10, 2023
dec4c0b
fix import
ekmb Feb 10, 2023
48683ab
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 10, 2023
09e881f
add missing init
ekmb Feb 10, 2023
71b0616
fix import
ekmb Feb 10, 2023
0cde4de
Merge branch 'g2p_to_tts' of https://github.com/NVIDIA/NeMo into g2p_…
ekmb Feb 10, 2023
1fc2f82
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 10, 2023
79193e0
Merge branch 'main' into g2p_to_tts
ekmb Feb 10, 2023
0399299
rename unit test
ekmb Feb 10, 2023
fcc8f08
fix import
ekmb Feb 11, 2023
6bcdbf1
fix modules test
ekmb Feb 11, 2023
bfead4b
fix imports
ekmb Feb 11, 2023
ee0bf93
Merge branch 'main' into g2p_to_tts
ekmb Feb 15, 2023
be645ed
remove whitelist from config
ekmb Feb 15, 2023
6cbc570
delete wordid file
ekmb Feb 15, 2023
2696d10
remove pynini_install from tutorials
ekmb Feb 15, 2023
b3c3ca6
update requirements
ekmb Feb 15, 2023
eb8adf2
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 15, 2023
9e8e0dc
add support warning
ekmb Feb 15, 2023
680b383
Merge branch 'g2p_to_tts' of https://github.com/NVIDIA/NeMo into g2p_…
ekmb Feb 15, 2023
e304858
review
ekmb Feb 15, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
7 changes: 1 addition & 6 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,6 @@ WORKDIR /tmp/nemo
COPY requirements .
RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-check --no-cache-dir -r $f; done

# install pynini
COPY nemo_text_processing/install_pynini.sh /tmp/nemo/
RUN /bin/bash /tmp/nemo/install_pynini.sh
rlangman marked this conversation as resolved.
Show resolved Hide resolved

# install k2, skip if installation fails
COPY scripts /tmp/nemo/scripts/
RUN /bin/bash /tmp/nemo/scripts/speech_recognition/k2/setup.sh || exit 0
Expand All @@ -81,8 +77,7 @@ RUN --mount=from=nemo-src,target=/tmp/nemo cd /tmp/nemo && pip install ".[all]"

# Check install
RUN python -c "import nemo.collections.nlp as nemo_nlp" && \
python -c "import nemo.collections.tts as nemo_tts" && \
python -c "import nemo_text_processing.text_normalization as text_normalization"
rlangman marked this conversation as resolved.
Show resolved Hide resolved
python -c "import nemo.collections.tts as nemo_tts"

# TODO: Update to newer numba 0.56.0RC1 for 22.03 container if possible
# install pinned numba version
Expand Down
92 changes: 3 additions & 89 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -102,92 +102,6 @@ pipeline {
}
}


stage('L0: TN/ITN Tests CPU') {
when {
anyOf {
branch 'main'
changeRequest target: 'main'
}
}
failFast true
parallel {
stage('En TN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
}
}
stage('En ITN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
}
}
stage('Test En non-deterministic TN & Run all En TN/ITN tests (restore grammars from cache)') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
}
}
stage('Test En Hybrid TN') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/hybrid/wfst_lm_rescoring.py --data /home/TestData/nlp/text_norm/hybrid_tn/test.txt --regenerate_pkl --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 | grep "all_correct: True" || exit 1'
}
}
}
}

stage('L2: NeMo text processing') {
when {
anyOf {
branch 'main'
changeRequest target: 'main'
}
}
failFast true
parallel {
stage('L2: Eng TN') {
steps {
sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 --language=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1'
sh 'cd nemo_text_processing/text_normalization/ && python normalize.py --input_file=/home/TestData/nlp/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output_file=/home/TestData/nlp/text_norm/output/test.pynini.txt --verbose'
sh 'cat /home/TestData/nlp/text_norm/output/test.pynini.txt'
sh 'cmp --silent /home/TestData/nlp/text_norm/output/test.pynini.txt /home/TestData/nlp/text_norm/ci/test_goal_py_05-25.txt || exit 1'
sh 'rm -rf /home/TestData/nlp/text_norm/output/*'
}
}

stage('L2: Eng ITN export') {
steps {
sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 --language=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1'
sh 'cd nemo_text_processing/inverse_text_normalization/ && python inverse_normalize.py --input_file=/home/TestData/nlp/text_denorm/ci/test.txt --language=en --output_file=/home/TestData/nlp/text_denorm/output/test.pynini.txt --verbose'
sh 'cmp --silent /home/TestData/nlp/text_denorm/output/test.pynini.txt /home/TestData/nlp/text_denorm/ci/test_goal_py.txt || exit 1'
sh 'rm -rf /home/TestData/nlp/text_denorm/output/*'
}
}
stage('L2: TN with Audio (audio and raw text)') {
steps {
sh 'cd nemo_text_processing/text_normalization && \
python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 --text "The total amounts to \\$4.76." \
--audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_raw.txt 2>&1 && \
cmp --silent /tmp/out_raw.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
}
}
stage('L2: TN with Audio (audio and text file)') {
steps {
sh 'cd nemo_text_processing/text_normalization && \
python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22 --text /home/TestData/nlp/text_norm/audio_based/text.txt \
--audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_file.txt 2>&1 && \
cmp --silent /tmp/out_file.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
}
}
stage('L2: TN with Audio (manifest)') {
steps {
sh 'cd nemo_text_processing/text_normalization && \
python normalize_with_audio.py --language=en --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/11-16-22'
}
}
}
}

stage('L2: ASR dev run') {
when {
anyOf {
Expand Down Expand Up @@ -1073,7 +987,7 @@ pipeline {
parallel {
stage('G2P Conformer training, evaluation and inference') {
steps {
sh 'cd examples/text_processing/g2p && \
sh 'cd examples/tts/g2p && \
TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_CONFORMER=output_ctc_${TIME} && \
python g2p_train_and_evaluate.py \
train_manifest=/home/TestData/g2p/g2p.json \
Expand All @@ -1097,7 +1011,7 @@ pipeline {
}
stage('ByT5G2P training, evaluation and inference') {
steps {
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/text_processing/g2p && \
sh 'TRANSFORMERS_OFFLINE=0 && cd examples/tts/g2p && \
TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR_T5=output_byt5_${TIME} && \
python g2p_train_and_evaluate.py \
train_manifest=/home/TestData/g2p/g2p.json \
Expand All @@ -1119,7 +1033,7 @@ pipeline {
}
stage('HeteronymClassificationModel training, evaluation and inference') {
steps {
sh 'cd examples/text_processing/g2p && \
sh 'cd examples/tts/g2p && \
TIME=`date +"%Y-%m-%d-%T"` && OUTPUT_DIR=output_${TIME} && \
python heteronym_classification_train_and_evaluate.py \
train_manifest=/home/TestData/g2p/manifest.json \
Expand Down
2 changes: 1 addition & 1 deletion docs/source/tts/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -96,4 +96,4 @@ Dataset Processing Classes

.. autoclass:: nemo.collections.tts.torch.data.VocoderDataset
:show-inheritance:
:members:
:members:
2 changes: 1 addition & 1 deletion docs/source/tts/checkpoints.rst
Original file line number Diff line number Diff line change
Expand Up @@ -150,4 +150,4 @@ End2End models
.. csv-table::
:file: data/ngc_models_e2e.csv
:align: left
:header-rows: 1
:header-rows: 1
5 changes: 2 additions & 3 deletions docs/source/tts/configs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ Text normalization (TN) converts text from written form into its verbalized form
_target_: nemo_text_processing.text_normalization.normalize.Normalizer
lang: en
input_case: cased
whitelist: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As discussed offline: in a future PR, please add a paragraph to the documentation describing how the default and non-default whitelist settings work.


text_normalizer_call_kwargs:
verbose: false
Expand All @@ -118,7 +117,7 @@ Tokenization converts input text string to a list of integer tokens. It may pad
apostrophe: true
pad_with_space: true
g2p:
_target_: nemo_text_processing.g2p.modules.EnglishG2p
_target_: nemo.collections.tts.g2p.modules.EnglishG2p
phoneme_dict: ${phoneme_dict_path}
heteronyms: ${heteronyms_path}
phoneme_probability: 0.5
Expand Down Expand Up @@ -260,4 +259,4 @@ Fine-tuning via a Pytorch Lightning checkpoint
trainer.devices=-1 \
trainer.accelerator='gpu' \
trainer.max_epochs=50 \
+init_from_ptl_ckpt="<name of pytorch lightning checkpoint>"
+init_from_ptl_ckpt="<name of pytorch lightning checkpoint>"
4 changes: 1 addition & 3 deletions docs/source/tts/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,7 @@ LJSpeech
.. code-block:: shell-session

$ python scripts/dataset_processing/tts/ljspeech/get_data.py \
--data-root <your_local_dataset_root> \
--whitelist-path <your_local_whitelist_filepath> \
or default nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv
--data-root <your_local_dataset_root>

$ python scripts/dataset_processing/tts/extract_sup_data.py \
--config-path ljspeech/ds_conf \
Expand Down
12 changes: 6 additions & 6 deletions examples/tts/aligner_heteronym_disambiguation.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@


"""
G2P disambiguation using an Aligner model's input embedding distances.
G2P_paper disambiguation using an Aligner model's input embedding distances.

Does not handle OOV and leaves them as graphemes.

Expand All @@ -46,7 +46,7 @@
def get_args():
"""Retrieve arguments for disambiguation.
"""
parser = argparse.ArgumentParser("G2P disambiguation using Aligner input embedding distances.")
parser = argparse.ArgumentParser("G2P_paper disambiguation using Aligner input embedding distances.")
# TODO(jocelynh): Make this required=False with default download from NGC once ckpt uploaded
parser.add_argument('--model', required=True, type=str, help="Path to Aligner model checkpoint (.nemo file).")
parser.add_argument(
Expand Down Expand Up @@ -108,7 +108,7 @@ def disambiguate_candidates(aligner, text, spec, spec_len, confidence, device, h

Note: This could be sped up if multiple words' candidates were batched, but this is conceptually easier.
"""
# Grab original G2P result
# Grab original G2P_paper result
aligner_g2p = aligner.tokenizer.g2p
base_g2p = aligner_g2p(text)

Expand All @@ -122,7 +122,7 @@ def disambiguate_candidates(aligner, text, spec, spec_len, confidence, device, h
has_heteronym = False

for word in words:
# Retrieve the length of the word in the default G2P conversion
# Retrieve the length of the word in the default G2P_paper conversion
g2p_default_len = len(aligner_g2p(word))

# Check if word needs to be disambiguated
Expand All @@ -134,7 +134,7 @@ def disambiguate_candidates(aligner, text, spec, spec_len, confidence, device, h
candidate_prons_and_lengths = []

for pron in aligner_g2p.phoneme_dict[word]:
# Replace graphemes in the base G2P result with the current variant
# Replace graphemes in the base G2P_paper result with the current variant
candidate = base_g2p[:word_start_idx] + pron + base_g2p[word_start_idx + g2p_default_len :]
candidate_tokens = aligner.tokenizer.encode_from_g2p(candidate)

Expand Down Expand Up @@ -247,7 +247,7 @@ def disambiguate_dataset(
count = 0

for line in f_in:
# Retrieve entry and base G2P conversion for full text
# Retrieve entry and base G2P_paper conversion for full text
entry = json.loads(line)
# Set punct_post_process=True in order to preserve words with apostrophes
text = aligner.normalizer.normalize(entry['text'], punct_post_process=True)
Expand Down
4 changes: 1 addition & 3 deletions examples/tts/conf/aligner.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ window: hann

phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"

model:
symbols_embedding_dim: 384
Expand All @@ -41,7 +40,6 @@ model:
_target_: nemo_text_processing.text_normalization.normalize.Normalizer
lang: en
input_case: cased
whitelist: ${whitelist_path}

text_normalizer_call_kwargs:
verbose: false
Expand All @@ -56,7 +54,7 @@ model:
apostrophe: true
pad_with_space: true
g2p:
_target_: nemo_text_processing.g2p.modules.EnglishG2p
_target_: nemo.collections.tts.g2p.modules.EnglishG2p
phoneme_dict: ${phoneme_dict_path}
heteronyms: ${heteronyms_path}

Expand Down
3 changes: 0 additions & 3 deletions examples/tts/conf/de/fastpitch_align_22050_grapheme.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@ lowfreq: 0
highfreq: null
window: hann

whitelist_path: "nemo_text_processing/text_normalization/de/data/whitelist.tsv"

model:
learn_alignment: true
bin_loss_warmup_epochs: 100
Expand Down Expand Up @@ -58,7 +56,6 @@ model:
_target_: nemo_text_processing.text_normalization.normalize.Normalizer
lang: de
input_case: cased
whitelist: ${whitelist_path}

text_normalizer_call_kwargs:
verbose: false
Expand Down
4 changes: 1 addition & 3 deletions examples/tts/conf/de/fastpitch_align_22050_mix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ window: hann

phoneme_dict_path: "scripts/tts_dataset_files/de/de_nv230119.dict"
heteronyms_path: "scripts/tts_dataset_files/de/de_nv230119.heteronyms"
whitelist_path: "nemo_text_processing/text_normalization/de/data/whitelist.tsv"

model:
learn_alignment: true
Expand Down Expand Up @@ -60,7 +59,6 @@ model:
_target_: nemo_text_processing.text_normalization.normalize.Normalizer
lang: de
input_case: cased
whitelist: ${whitelist_path}

text_normalizer_call_kwargs:
verbose: false
Expand All @@ -74,7 +72,7 @@ model:
apostrophe: true
pad_with_space: true
g2p:
_target_: nemo_text_processing.g2p.modules.IPAG2P
_target_: nemo.collections.tts.g2p.modules.IPAG2P
locale: 'de-DE'
phoneme_dict: ${phoneme_dict_path}
heteronyms: ${heteronyms_path}
Expand Down
3 changes: 0 additions & 3 deletions examples/tts/conf/de/fastpitch_align_44100_grapheme.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@ lowfreq: 0
highfreq: null
window: hann

whitelist_path: "nemo_text_processing/text_normalization/de/data/whitelist.tsv"

model:
learn_alignment: true
bin_loss_warmup_epochs: 100
Expand Down Expand Up @@ -58,7 +56,6 @@ model:
_target_: nemo_text_processing.text_normalization.normalize.Normalizer
lang: de
input_case: cased
whitelist: ${whitelist_path}

text_normalizer_call_kwargs:
verbose: false
Expand Down
3 changes: 0 additions & 3 deletions examples/tts/conf/de/fastpitch_align_44100_phoneme.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@ lowfreq: 0
highfreq: null
window: hann

whitelist_path: "nemo_text_processing/text_normalization/de/data/whitelist.tsv"

model:
learn_alignment: true
bin_loss_warmup_epochs: 100
Expand Down Expand Up @@ -58,7 +56,6 @@ model:
_target_: nemo_text_processing.text_normalization.normalize.Normalizer
lang: de
input_case: cased
whitelist: ${whitelist_path}

text_normalizer_call_kwargs:
verbose: false
Expand Down
2 changes: 1 addition & 1 deletion examples/tts/conf/es/fastpitch_align_44100_ipa.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ model:
apostrophe: true
pad_with_space: true
g2p:
_target_: nemo_text_processing.g2p.modules.IPAG2P
_target_: nemo.collections.tts.g2p.modules.IPAG2P
locale: es-ES
phoneme_dict: ${phoneme_dict_path}
phoneme_probability: 0.5
Expand Down
2 changes: 1 addition & 1 deletion examples/tts/conf/es/fastpitch_align_44100_ipa_multi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ model:
apostrophe: true
pad_with_space: true
g2p:
_target_: nemo_text_processing.g2p.modules.IPAG2P
_target_: nemo.collections.tts.g2p.modules.IPAG2P
locale: es-ES
phoneme_dict: ${phoneme_dict_path}
phoneme_probability: 0.5
Expand Down
4 changes: 1 addition & 3 deletions examples/tts/conf/fastpitch_align_44100.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ window: hann

phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
whitelist_path: "nemo_text_processing/text_normalization/en/data/whitelist/lj_speech.tsv"

model:
learn_alignment: true
Expand Down Expand Up @@ -59,7 +58,6 @@ model:
_target_: nemo_text_processing.text_normalization.normalize.Normalizer
lang: en
input_case: cased
whitelist: ${whitelist_path}

text_normalizer_call_kwargs:
verbose: false
Expand All @@ -74,7 +72,7 @@ model:
apostrophe: true
pad_with_space: true
g2p:
_target_: nemo_text_processing.g2p.modules.EnglishG2p
_target_: nemo.collections.tts.g2p.modules.EnglishG2p
phoneme_dict: ${phoneme_dict_path}
heteronyms: ${heteronyms_path}
phoneme_probability: 0.5
Expand Down
Loading