[R1.0][server] improve server code #1866

Merged: 2 commits, May 7, 2022
8 changes: 4 additions & 4 deletions demos/streaming_tts_server/conf/tts_online_application.yaml
@@ -43,12 +43,12 @@ tts_online:
device: 'cpu' # set 'gpu:id' or 'cpu'
# am_block and am_pad only for fastspeech2_cnndecoder_onnx model to streaming am infer,
# when am_pad set 12, streaming synthetic audio is the same as non-streaming synthetic audio
- am_block: 42
+ am_block: 72
am_pad: 12
# voc_pad and voc_block voc model to streaming voc infer,
# when voc model is mb_melgan_csmsc, voc_pad set 14, streaming synthetic audio is the same as non-streaming synthetic audio; The minimum value of pad can be set to 7, streaming synthetic audio sounds normal
# when voc model is hifigan_csmsc, voc_pad set 20, streaming synthetic audio is the same as non-streaming synthetic audio; voc_pad set 14, streaming synthetic audio sounds normal
- voc_block: 14
+ voc_block: 36
voc_pad: 14
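
For intuition, am_block/voc_block set how many frames each streaming step produces, while am_pad/voc_pad add extra context frames on both sides that are computed and then discarded. A minimal sketch of that block-with-padding scheme follows; the helper name and the trimming details are illustrative assumptions, not the PaddleSpeech implementation.

import numpy as np

def stream_in_blocks(features: np.ndarray, block: int, pad: int):
    """Yield (chunk, left_trim, keep) for block-wise streaming inference.

    Each chunk carries up to `pad` extra context frames on each side; after
    the model runs on the chunk, only `keep` frames starting at `left_trim`
    belong to this block and are emitted downstream.
    """
    n = features.shape[0]
    for start in range(0, n, block):
        end = min(start + block, n)
        ctx_start = max(0, start - pad)
        ctx_end = min(n, end + pad)
        yield features[ctx_start:ctx_end], start - ctx_start, end - start

# With voc_block: 36 and voc_pad: 14, a middle chunk spans 14 + 36 + 14 = 64
# input frames but contributes only 36 frames of output.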


@@ -91,12 +91,12 @@ tts_online-onnx:
lang: 'zh'
# am_block and am_pad only for fastspeech2_cnndecoder_onnx model to streaming am infer,
# when am_pad set 12, streaming synthetic audio is the same as non-streaming synthetic audio
- am_block: 42
+ am_block: 72
am_pad: 12
# voc_pad and voc_block voc model to streaming voc infer,
# when voc model is mb_melgan_csmsc_onnx, voc_pad set 14, streaming synthetic audio is the same as non-streaming synthetic audio; The minimum value of pad can be set to 7, streaming synthetic audio sounds normal
# when voc model is hifigan_csmsc_onnx, voc_pad set 20, streaming synthetic audio is the same as non-streaming synthetic audio; voc_pad set 14, streaming synthetic audio sounds normal
- voc_block: 14
+ voc_block: 36
voc_pad: 14
# voc_upsample should be same as n_shift on voc config.
voc_upsample: 300
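
Because voc_upsample matches the vocoder's n_shift, each mel frame expands to voc_upsample waveform samples, so the audio produced by one streaming step can be estimated directly. A quick back-of-the-envelope check; the 24 kHz rate below is an assumed value for the csmsc vocoders, not taken from this config:

voc_block, voc_upsample, sample_rate = 36, 300, 24000  # sample_rate is an assumption
samples_per_step = voc_block * voc_upsample            # 10800 samples per vocoder step
seconds_per_step = samples_per_step / sample_rate      # 0.45 s of audio per step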
32 changes: 29 additions & 3 deletions paddlespeech/server/bin/paddlespeech_client.py
@@ -31,6 +31,7 @@
from paddlespeech.cli.log import logger
from paddlespeech.server.utils.audio_handler import ASRWsAudioHandler
from paddlespeech.server.utils.audio_process import wav2pcm
+ from paddlespeech.server.utils.util import compute_delay
from paddlespeech.server.utils.util import wav2base64

__all__ = [
@@ -221,7 +222,7 @@ def execute(self, argv: List[str]) -> bool:
play = args.play

try:
- res = self(
+ self(
input=input_,
server_ip=server_ip,
port=port,
@@ -257,17 +258,42 @@ def __call__(self,
logger.info("tts http client start")
from paddlespeech.server.utils.audio_handler import TTSHttpHandler
handler = TTSHttpHandler(server_ip, port, play)
- handler.run(input, spk_id, speed, volume, sample_rate, output)
+ first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = handler.run(
+ input, spk_id, speed, volume, sample_rate, output)
+ delay_time_list = compute_delay(receive_time_list,
+ chunk_duration_list)

elif protocol == "websocket":
from paddlespeech.server.utils.audio_handler import TTSWsHandler
logger.info("tts websocket client start")
handler = TTSWsHandler(server_ip, port, play)
loop = asyncio.get_event_loop()
- loop.run_until_complete(handler.run(input, output))
+ first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = loop.run_until_complete(
+ handler.run(input, output))
+ delay_time_list = compute_delay(receive_time_list,
+ chunk_duration_list)

else:
logger.error("Please set correct protocol, http or websocket")
return False

logger.info(f"sentence: {input}")
logger.info(f"duration: {duration} s")
logger.info(f"first response: {first_response} s")
logger.info(f"final response: {final_response} s")
logger.info(f"RTF: {final_response/duration}")
if output is not None:
if save_audio_success:
logger.info(f"Audio successfully saved in {output}")
else:
logger.error("Audio save failed.")

if delay_time_list != []:
logger.info(
f"Delay situation: total number of packages: {len(receive_time_list)}, the number of delayed packets: {len(delay_time_list)}, minimum delay time: {min(delay_time_list)} s, maximum delay time: {max(delay_time_list)} s, average delay time: {sum(delay_time_list)/len(delay_time_list)} s, delay rate:{len(delay_time_list)/len(receive_time_list)}"
)
else:
logger.info("The sentence has no delay in streaming synthesis.")


@cli_client_register(
24 changes: 2 additions & 22 deletions paddlespeech/server/conf/application.yaml
@@ -1,4 +1,4 @@
- # This is the parameter configuration file for PaddleSpeech Serving.
+ # This is the parameter configuration file for PaddleSpeech Offline Serving..

#################################################################################
# SERVER SETTING #
@@ -7,9 +7,7 @@ host: 127.0.0.1
port: 8090

# The task format in the engin_list is: <speech task>_<engine type>
- # task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference']
- # protocol = ['websocket', 'http'] (only one can be selected).
- # http only support offline engine type.
+ # task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference', 'cls_python', 'cls_inference']
protocol: 'http'
engine_list: ['asr_python', 'tts_python', 'cls_python', 'text_python', 'vector_python']
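
Each engine_list entry follows <speech task>_<engine type>, so splitting on the last underscore recovers both parts; a quick illustration:

engine_list = ['asr_python', 'tts_python', 'cls_python', 'text_python', 'vector_python']
for entry in engine_list:
    task, _, engine_type = entry.rpartition('_')  # split on the last underscore
    print(task, engine_type)                      # asr python, tts python, ...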

@@ -50,24 +48,6 @@ asr_inference:
summary: True # False -> do not show predictor config


- ################### speech task: asr; engine_type: online #######################
- asr_online:
- model_type: 'deepspeech2online_aishell'
- am_model: # the pdmodel file of am static model [optional]
- am_params: # the pdiparams file of am static model [optional]
- lang: 'zh'
- sample_rate: 16000
- cfg_path:
- decode_method:
- force_yes: True
-
- am_predictor_conf:
- device: # set 'gpu:id' or 'cpu'
- switch_ir_optim: True
- glog_info: False # True -> print glog
- summary: True # False -> do not show predictor config


################################### TTS #########################################
################### speech task: tts; engine_type: python #######################
tts_python:
8 changes: 4 additions & 4 deletions paddlespeech/server/conf/tts_online_application.yaml
@@ -43,12 +43,12 @@ tts_online:
device: 'cpu' # set 'gpu:id' or 'cpu'
# am_block and am_pad only for fastspeech2_cnndecoder_onnx model to streaming am infer,
# when am_pad set 12, streaming synthetic audio is the same as non-streaming synthetic audio
- am_block: 42
+ am_block: 72
am_pad: 12
# voc_pad and voc_block voc model to streaming voc infer,
# when voc model is mb_melgan_csmsc, voc_pad set 14, streaming synthetic audio is the same as non-streaming synthetic audio; The minimum value of pad can be set to 7, streaming synthetic audio sounds normal
# when voc model is hifigan_csmsc, voc_pad set 20, streaming synthetic audio is the same as non-streaming synthetic audio; voc_pad set 14, streaming synthetic audio sounds normal
- voc_block: 14
+ voc_block: 36
voc_pad: 14


@@ -91,12 +91,12 @@ tts_online-onnx:
lang: 'zh'
# am_block and am_pad only for fastspeech2_cnndecoder_onnx model to streaming am infer,
# when am_pad set 12, streaming synthetic audio is the same as non-streaming synthetic audio
- am_block: 42
+ am_block: 72
am_pad: 12
# voc_pad and voc_block voc model to streaming voc infer,
# when voc model is mb_melgan_csmsc_onnx, voc_pad set 14, streaming synthetic audio is the same as non-streaming synthetic audio; The minimum value of pad can be set to 7, streaming synthetic audio sounds normal
# when voc model is hifigan_csmsc_onnx, voc_pad set 20, streaming synthetic audio is the same as non-streaming synthetic audio; voc_pad set 14, streaming synthetic audio sounds normal
- voc_block: 14
+ voc_block: 36
voc_pad: 14
# voc_upsample should be same as n_shift on voc config.
voc_upsample: 300
79 changes: 11 additions & 68 deletions paddlespeech/server/engine/asr/online/asr_engine.py
@@ -20,10 +20,9 @@
from numpy import float32
from yacs.config import CfgNode

+ from .pretrained_models import pretrained_models
from paddlespeech.cli.asr.infer import ASRExecutor
from paddlespeech.cli.asr.infer import model_alias
from paddlespeech.cli.log import logger
from paddlespeech.cli.utils import download_and_decompress
from paddlespeech.cli.utils import MODEL_HOME
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.speech import SpeechSegment
@@ -40,45 +39,6 @@

__all__ = ['ASREngine']

- pretrained_models = {
- "deepspeech2online_aishell-zh-16k": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz',
- 'md5':
- '98b87b171b7240b7cae6e07d8d0bc9be',
- 'cfg_path':
- 'model.yaml',
- 'ckpt_path':
- 'exp/deepspeech2_online/checkpoints/avg_1',
- 'model':
- 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel',
- 'params':
- 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams',
- 'lm_url':
- 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
- 'lm_md5':
- '29e02312deb2e59b3c8686c7966d4fe3'
- },
- "conformer_online_multicn-zh-16k": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.3.model.tar.gz',
- 'md5':
- '0ac93d390552336f2a906aec9e33c5fa',
- 'cfg_path':
- 'model.yaml',
- 'ckpt_path':
- 'exp/chunk_conformer/checkpoints/multi_cn',
- 'model':
- 'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
- 'params':
- 'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
- 'lm_url':
- 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
- 'lm_md5':
- '29e02312deb2e59b3c8686c7966d4fe3'
- },
- }


# ASR server connection process class
class PaddleASRConnectionHanddler:
@@ -626,24 +586,7 @@ def rescoring(self):
class ASRServerExecutor(ASRExecutor):
def __init__(self):
super().__init__()
- pass

- def _get_pretrained_path(self, tag: str) -> os.PathLike:
- """
- Download and returns pretrained resources path of current task.
- """
- support_models = list(pretrained_models.keys())
- assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
- tag, '\n\t\t'.join(support_models))
-
- res_path = os.path.join(MODEL_HOME, tag)
- decompressed_path = download_and_decompress(pretrained_models[tag],
- res_path)
- decompressed_path = os.path.abspath(decompressed_path)
- logger.info(
- 'Use pretrained model stored in: {}'.format(decompressed_path))
-
- return decompressed_path
+ self.pretrained_models = pretrained_models

def _init_from_path(self,
model_type: str='deepspeech2online_aishell',
@@ -659,20 +602,20 @@
"""
self.model_type = model_type
self.sample_rate = sample_rate
- sample_rate_str = '16k' if sample_rate == 16000 else '8k'
- tag = model_type + '-' + lang + '-' + sample_rate_str
if cfg_path is None or am_model is None or am_params is None:
+ sample_rate_str = '16k' if sample_rate == 16000 else '8k'
+ tag = model_type + '-' + lang + '-' + sample_rate_str
logger.info(f"Load the pretrained model, tag = {tag}")
res_path = self._get_pretrained_path(tag) # wenetspeech_zh
self.res_path = res_path

- self.cfg_path = os.path.join(res_path,
- pretrained_models[tag]['cfg_path'])
+ self.cfg_path = os.path.join(
+ res_path, self.pretrained_models[tag]['cfg_path'])

self.am_model = os.path.join(res_path,
- pretrained_models[tag]['model'])
+ self.pretrained_models[tag]['model'])
self.am_params = os.path.join(res_path,
- pretrained_models[tag]['params'])
+ self.pretrained_models[tag]['params'])
logger.info(res_path)
else:
self.cfg_path = os.path.abspath(cfg_path)
@@ -700,8 +643,8 @@ def _init_from_path(self,
self.text_feature = TextFeaturizer(
unit_type=self.config.unit_type, vocab=self.vocab)

- lm_url = pretrained_models[tag]['lm_url']
- lm_md5 = pretrained_models[tag]['lm_md5']
+ lm_url = self.pretrained_models[tag]['lm_url']
+ lm_md5 = self.pretrained_models[tag]['lm_md5']
logger.info(f"Start to load language model {lm_url}")
self.download_lm(
lm_url,
@@ -774,7 +717,7 @@ def _init_from_path(self,
model_name = model_type[:model_type.rindex(
'_')] # model_type: {model_name}_{dataset}
logger.info(f"model name: {model_name}")
- model_class = dynamic_import(model_name, model_alias)
+ model_class = dynamic_import(model_name, self.model_alias)
model_conf = self.config
model = model_class.from_config(model_conf)
self.model = model
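
The slice above strips the dataset suffix from model_type before the alias lookup; a quick check with the model tags used elsewhere in this PR:

# model_type follows {model_name}_{dataset}; rindex('_') drops the dataset part.
for model_type in ('deepspeech2online_aishell', 'conformer_online_multicn'):
    print(model_type[:model_type.rindex('_')])  # deepspeech2online, conformer_online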
52 changes: 52 additions & 0 deletions paddlespeech/server/engine/asr/online/pretrained_models.py
@@ -0,0 +1,52 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

pretrained_models = {
"deepspeech2online_aishell-zh-16k": {
'url':
'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz',
'md5':
'98b87b171b7240b7cae6e07d8d0bc9be',
'cfg_path':
'model.yaml',
'ckpt_path':
'exp/deepspeech2_online/checkpoints/avg_1',
'model':
'exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel',
'params':
'exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams',
'lm_url':
'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
'lm_md5':
'29e02312deb2e59b3c8686c7966d4fe3'
},
"conformer_online_multicn-zh-16k": {
'url':
'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.3.model.tar.gz',
'md5':
'0ac93d390552336f2a906aec9e33c5fa',
'cfg_path':
'model.yaml',
'ckpt_path':
'exp/chunk_conformer/checkpoints/multi_cn',
'model':
'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
'params':
'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
'lm_url':
'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
'lm_md5':
'29e02312deb2e59b3c8686c7966d4fe3'
},
}
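
A short sketch of how these entries are consumed after the refactor: the executor builds a tag from model type, language, and sample rate, then joins the relative paths stored here. The helper below and the model_home default are illustrative assumptions; only the key layout comes from this file and asr_engine.py.

import os

from paddlespeech.server.engine.asr.online.pretrained_models import pretrained_models

def resolve_paths(model_type='deepspeech2online_aishell', lang='zh', sample_rate=16000,
                  model_home='~/.paddlespeech/models'):  # model_home is an assumed default
    sample_rate_str = '16k' if sample_rate == 16000 else '8k'
    tag = model_type + '-' + lang + '-' + sample_rate_str  # e.g. deepspeech2online_aishell-zh-16k
    meta = pretrained_models[tag]
    res_path = os.path.join(os.path.expanduser(model_home), tag)
    return {
        'cfg_path': os.path.join(res_path, meta['cfg_path']),
        'am_model': os.path.join(res_path, meta['model']),
        'am_params': os.path.join(res_path, meta['params']),
        'lm_url': meta['lm_url'],
    }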