Merge pull request #1693 from yt605155624/fix_ss_NHWC

[TTS]change NLC to NCL in speedyspeech, test=tts
PaddlePaddle · Apr 14, 2022 · 98f6787 · 98f6787
2 parents 8f1335d + 8b801ca
commit 98f6787
Show file tree

Hide file tree

Showing 18 changed files with 324 additions and 109 deletions.
diff --git a/docs/source/released_model.md b/docs/source/released_model.md
@@ -37,7 +37,7 @@ Model Type | Dataset| Example Link | Pretrained Models|Static Models|Size (stati
 Tacotron2|LJSpeech|[tacotron2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts0)|[tacotron2_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip)|||
 Tacotron2|CSMSC|[tacotron2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts0)|[tacotron2_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip)|[tacotron2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_static_0.2.0.zip)|103MB|
 TransformerTTS| LJSpeech| [transformer-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts1)|[transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/transformer_tts/transformer_tts_ljspeech_ckpt_0.4.zip)|||
-SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2) |[speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip)|[speedyspeech_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_static_0.2.0.zip)|12MB|
+SpeedySpeech| CSMSC | [speedyspeech-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts2)|[speedyspeech_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_ckpt_0.2.0.zip)|[speedyspeech_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_static_0.2.0.zip)|12MB|
 FastSpeech2| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)|[fastspeech2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_static_0.2.0.zip)|157MB|
 FastSpeech2-Conformer| CSMSC |[fastspeech2-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3)|[fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip)|||
 FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3)|[fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip)|||

diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md
@@ -223,22 +223,28 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path}
 ## Pretrained Model
 Pretrained SpeedySpeech model with no silence in the edge of audios:
 - [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip)
+- [speedyspeech_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_ckpt_0.2.0.zip)
 
 The static model can be downloaded here:
 - [speedyspeech_nosil_baker_static_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_static_0.5.zip)
 - [speedyspeech_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_static_0.2.0.zip)
 
+The ONNX model can be downloaded here:
+- [speedyspeech_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_onnx_0.2.0.zip)
+
+
 Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/ssim_loss
 :-------------:| :------------:| :-----: | :-----: | :--------:|:--------:
-default| 1(gpu) x 11400|0.83655|0.42324|0.03211| 0.38119
+default| 1(gpu) x 11400|0.79532|0.400246|0.030259| 0.36482
 
 SpeedySpeech checkpoint contains files listed below.
+
 ```text
-speedyspeech_nosil_baker_ckpt_0.5
+speedyspeech_csmsc_ckpt_0.2.0
 ├── default.yaml            # default config used to train speedyspeech
 ├── feats_stats.npy         # statistics used to normalize spectrogram when training speedyspeech
 ├── phone_id_map.txt        # phone vocabulary file when training speedyspeech
-├── snapshot_iter_11400.pdz # model parameters and optimizer states
+├── snapshot_iter_30600.pdz # model parameters and optimizer states
 └── tone_id_map.txt         # tone vocabulary file when training speedyspeech
 ```
 You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained speedyspeech and parallel wavegan models.
@@ -249,9 +255,9 @@ FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/../synthesize_e2e.py \
   --am=speedyspeech_csmsc \
-  --am_config=speedyspeech_nosil_baker_ckpt_0.5/default.yaml \
-  --am_ckpt=speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz \
-  --am_stat=speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy \
+  --am_config=speedyspeech_csmsc_ckpt_0.2.0/default.yaml \
+  --am_ckpt=speedyspeech_csmsc_ckpt_0.2.0/snapshot_iter_30600.pdz \
+  --am_stat=speedyspeech_csmsc_ckpt_0.2.0/feats_stats.npy \
   --voc=pwgan_csmsc \
   --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
   --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
@@ -260,6 +266,6 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
   --text=${BIN_DIR}/../sentences.txt \
   --output_dir=exp/default/test_e2e \
   --inference_dir=exp/default/inference \
-  --phones_dict=speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt \
-  --tones_dict=speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
+  --phones_dict=speedyspeech_csmsc_ckpt_0.2.0/phone_id_map.txt \
+  --tones_dict=speedyspeech_csmsc_ckpt_0.2.0/tone_id_map.txt
 ```
diff --git a/examples/csmsc/tts2/local/ort_predict.sh b/examples/csmsc/tts2/local/ort_predict.sh
@@ -0,0 +1,32 @@
+train_output_path=$1
+
+stage=0
+stop_stage=0
+
+# only support default_fastspeech2/speedyspeech + hifigan/mb_melgan now!
+
+# synthesize from metadata
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${BIN_DIR}/../ort_predict.py \
+        --inference_dir=${train_output_path}/inference_onnx \
+        --am=speedyspeech_csmsc \
+        --voc=hifigan_csmsc \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/onnx_infer_out \
+        --device=cpu \
+        --cpu_threads=2
+fi
+
+# e2e, synthesize from text
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    python3 ${BIN_DIR}/../ort_predict_e2e.py \
+        --inference_dir=${train_output_path}/inference_onnx \
+        --am=speedyspeech_csmsc \
+        --voc=hifigan_csmsc \
+        --output_dir=${train_output_path}/onnx_infer_out_e2e \
+        --text=${BIN_DIR}/../csmsc_test.txt \
+        --phones_dict=dump/phone_id_map.txt \
+        --tones_dict=dump/tone_id_map.txt \
+        --device=cpu \
+        --cpu_threads=2
+fi
diff --git a/examples/csmsc/tts2/local/paddle2onnx.sh b/examples/csmsc/tts2/local/paddle2onnx.sh
@@ -0,0 +1 @@
+../../tts3/local/paddle2onnx.sh
diff --git a/examples/csmsc/tts2/run.sh b/examples/csmsc/tts2/run.sh
@@ -40,3 +40,25 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     # inference with static model
     CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1
 fi
+
+# paddle2onnx, please make sure the static models are in ${train_output_path}/inference first
+# we have only tested the following models so far
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+    # install paddle2onnx
+    version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
+    if [[ -z "$version" || ${version} != '0.9.4' ]]; then
+        pip install paddle2onnx==0.9.4
+    fi
+    ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx speedyspeech_csmsc
+    ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
+fi
+
+# inference with onnxruntime, use fastspeech2 + hifigan by default
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+    # install onnxruntime
+    version=$(echo `pip list |grep "onnxruntime"` |awk -F" " '{print $2}')
+    if [[ -z "$version" || ${version} != '1.10.0' ]]; then
+        pip install onnxruntime==1.10.0
+    fi
+    ./local/ort_predict.sh ${train_output_path}
+fi
diff --git a/examples/csmsc/tts3/local/ort_predict.sh b/examples/csmsc/tts3/local/ort_predict.sh
@@ -3,7 +3,7 @@ train_output_path=$1
 stage=0
 stop_stage=0
 
-# only support default_fastspeech2 + hifigan/mb_melgan now!
+# only support default_fastspeech2/speedyspeech + hifigan/mb_melgan now!
 
 # synthesize from metadata
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then

diff --git a/examples/csmsc/tts3/local/paddle2onnx.sh b/examples/csmsc/tts3/local/paddle2onnx.sh
@@ -19,4 +19,5 @@ paddle2onnx \
     --model_filename ${model}.pdmodel \
     --params_filename ${model}.pdiparams \
     --save_file ${train_output_path}/${output_dir}/${model}.onnx \
+    --opset_version 11 \
     --enable_dev_version ${enable_dev_version}
diff --git a/paddleaudio/paddleaudio/compliance/librosa.py b/paddleaudio/paddleaudio/compliance/librosa.py
@@ -341,7 +341,7 @@ def stft(x: np.ndarray,
         hop_length (Optional[int], optional): Number of steps to advance between adjacent windows. Defaults to None.
         win_length (Optional[int], optional): The size of window. Defaults to None.
         window (str, optional): A string of window specification. Defaults to "hann".
-        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
+        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
         dtype (type, optional): Data type of STFT results. Defaults to np.complex64.
         pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
 
@@ -509,7 +509,7 @@ def melspectrogram(x: np.ndarray,
         fmin (float, optional): Minimum frequency in Hz. Defaults to 50.0.
         fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
         window (str, optional): A string of window specification. Defaults to "hann".
-        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
+        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
         pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
         power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0.
         to_db (bool, optional): Enable db scale. Defaults to True.
@@ -564,7 +564,7 @@ def spectrogram(x: np.ndarray,
         window_size (int, optional): Size of FFT and window length. Defaults to 512.
         hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320.
         window (str, optional): A string of window specification. Defaults to "hann".
-        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
+        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
         pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect".
         power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0.
 

diff --git a/paddleaudio/paddleaudio/features/layers.py b/paddleaudio/paddleaudio/features/layers.py
@@ -42,7 +42,7 @@ class Spectrogram(nn.Layer):
         win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
         window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
         power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
-        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
+        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
         pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
         dtype (str, optional): Data type of input and window. Defaults to 'float32'.
     """
@@ -99,7 +99,7 @@ class MelSpectrogram(nn.Layer):
         win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
         window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
         power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
-        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
+        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
         pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
         n_mels (int, optional): Number of mel bins. Defaults to 64.
         f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
@@ -176,7 +176,7 @@ class LogMelSpectrogram(nn.Layer):
         win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
         window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
         power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
-        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
+        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
         pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
         n_mels (int, optional): Number of mel bins. Defaults to 64.
         f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
@@ -257,7 +257,7 @@ class MFCC(nn.Layer):
         win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
         window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
         power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
-        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
+        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True.
         pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
         n_mels (int, optional): Number of mel bins. Defaults to 64.
         f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.

diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py
@@ -43,13 +43,13 @@
     # speedyspeech
     "speedyspeech_csmsc-zh": {
         'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_nosil_baker_ckpt_0.5.zip',
+        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_ckpt_0.2.0.zip',
         'md5':
-        '9edce23b1a87f31b814d9477bf52afbc',
+        '6f6fa967b408454b6662c8c00c0027cb',
         'config':
         'default.yaml',
         'ckpt':
-        'snapshot_iter_11400.pdz',
+        'snapshot_iter_30600.pdz',
         'speech_stats':
         'feats_stats.npy',
         'phones_dict':

diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py
@@ -38,9 +38,7 @@ def get_predictor(args, filed='am'):
         config.enable_use_gpu(100, 0)
     elif args.device == "cpu":
         config.disable_gpu()
-    # This line must be commented for fastspeech2, if not, it will OOM
-    if model_name != 'fastspeech2':
-        config.enable_memory_optim()
+    config.enable_memory_optim()
     predictor = inference.create_predictor(config)
     return predictor
 

diff --git a/paddlespeech/t2s/exps/ort_predict.py b/paddlespeech/t2s/exps/ort_predict.py
@@ -70,8 +70,15 @@ def ort_predict(args):
 
     # am warmup
     for T in [27, 38, 54]:
-        data = np.random.randint(1, 266, size=(T, ))
-        am_sess.run(None, {"text": data})
+        am_input_feed = {}
+        if am_name == 'fastspeech2':
+            phone_ids = np.random.randint(1, 266, size=(T, ))
+            am_input_feed.update({'text': phone_ids})
+        elif am_name == 'speedyspeech':
+            phone_ids = np.random.randint(1, 92, size=(T, ))
+            tone_ids = np.random.randint(1, 5, size=(T, ))
+            am_input_feed.update({'phones': phone_ids, 'tones': tone_ids})
+        am_sess.run(None, input_feed=am_input_feed)
 
     # voc warmup
     for T in [227, 308, 544]:
@@ -81,14 +88,20 @@ def ort_predict(args):
 
     N = 0
     T = 0
+    am_input_feed = {}
     for example in test_dataset:
         utt_id = example['utt_id']
-        phone_ids = example["text"]
+        if am_name == 'fastspeech2':
+            phone_ids = example["text"]
+            am_input_feed.update({'text': phone_ids})
+        elif am_name == 'speedyspeech':
+            phone_ids = example["phones"]
+            tone_ids = example["tones"]
+            am_input_feed.update({'phones': phone_ids, 'tones': tone_ids})
         with timer() as t:
-            mel = am_sess.run(output_names=None, input_feed={'text': phone_ids})
+            mel = am_sess.run(output_names=None, input_feed=am_input_feed)
             mel = mel[0]
             wav = voc_sess.run(output_names=None, input_feed={'logmel': mel})
-
             N += len(wav[0])
             T += t.elapse
             speed = len(wav[0]) / t.elapse
@@ -110,9 +123,7 @@ def parse_args():
         '--am',
         type=str,
         default='fastspeech2_csmsc',
-        choices=[
-            'fastspeech2_csmsc',
-        ],
+        choices=['fastspeech2_csmsc', 'speedyspeech_csmsc'],
         help='Choose acoustic model type of tts task.')
 
     # voc