[R1.0][server] improve server code #1866

Merged: 2 commits, May 7, 2022
8 changes: 4 additions & 4 deletions demos/streaming_tts_server/conf/tts_online_application.yaml
@@ -43,12 +43,12 @@ tts_online:
device: 'cpu' # set 'gpu:id' or 'cpu'
# am_block and am_pad only for fastspeech2_cnndecoder_onnx model to streaming am infer,
# when am_pad set 12, streaming synthetic audio is the same as non-streaming synthetic audio
- am_block: 42
+ am_block: 72
am_pad: 12
# voc_pad and voc_block voc model to streaming voc infer,
# when voc model is mb_melgan_csmsc, voc_pad set 14, streaming synthetic audio is the same as non-streaming synthetic audio; The minimum value of pad can be set to 7, streaming synthetic audio sounds normal
# when voc model is hifigan_csmsc, voc_pad set 20, streaming synthetic audio is the same as non-streaming synthetic audio; voc_pad set 14, streaming synthetic audio sounds normal
- voc_block: 14
+ voc_block: 36
voc_pad: 14
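
For intuition, am_block/voc_block set how many frames each streaming step produces, while am_pad/voc_pad add extra context frames on both sides that are computed and then discarded. A minimal sketch of that block-with-padding scheme follows; the helper name and the trimming details are illustrative assumptions, not the PaddleSpeech implementation.

import numpy as np

def stream_in_blocks(features: np.ndarray, block: int, pad: int):
    """Yield (chunk, left_trim, keep) for block-wise streaming inference.

    Each chunk carries up to `pad` extra context frames on each side; after
    the model runs on the chunk, only `keep` frames starting at `left_trim`
    belong to this block and are emitted downstream.
    """
    n = features.shape[0]
    for start in range(0, n, block):
        end = min(start + block, n)
        ctx_start = max(0, start - pad)
        ctx_end = min(n, end + pad)
        yield features[ctx_start:ctx_end], start - ctx_start, end - start

# With voc_block: 36 and voc_pad: 14, a middle chunk spans 14 + 36 + 14 = 64
# input frames but contributes only 36 frames of output.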


@@ -91,12 +91,12 @@ tts_online-onnx:
lang: 'zh'
# am_block and am_pad only for fastspeech2_cnndecoder_onnx model to streaming am infer,
# when am_pad set 12, streaming synthetic audio is the same as non-streaming synthetic audio
- am_block: 42
+ am_block: 72
am_pad: 12
# voc_pad and voc_block voc model to streaming voc infer,
# when voc model is mb_melgan_csmsc_onnx, voc_pad set 14, streaming synthetic audio is the same as non-streaming synthetic audio; The minimum value of pad can be set to 7, streaming synthetic audio sounds normal
# when voc model is hifigan_csmsc_onnx, voc_pad set 20, streaming synthetic audio is the same as non-streaming synthetic audio; voc_pad set 14, streaming synthetic audio sounds normal
- voc_block: 14
+ voc_block: 36
voc_pad: 14
# voc_upsample should be same as n_shift on voc config.
voc_upsample: 300
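
Because voc_upsample matches the vocoder's n_shift, each mel frame expands to voc_upsample waveform samples, so the audio produced by one streaming step can be estimated directly. A quick back-of-the-envelope check; the 24 kHz rate below is an assumed value for the csmsc vocoders, not taken from this config:

voc_block, voc_upsample, sample_rate = 36, 300, 24000  # sample_rate is an assumption
samples_per_step = voc_block * voc_upsample            # 10800 samples per vocoder step
seconds_per_step = samples_per_step / sample_rate      # 0.45 s of audio per step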
32 changes: 29 additions & 3 deletions paddlespeech/server/bin/paddlespeech_client.py
@@ -31,6 +31,7 @@
from paddlespeech.cli.log import logger
from paddlespeech.server.utils.audio_handler import ASRWsAudioHandler
from paddlespeech.server.utils.audio_process import wav2pcm
+ from paddlespeech.server.utils.util import compute_delay
from paddlespeech.server.utils.util import wav2base64

__all__ = [
@@ -221,7 +222,7 @@ def execute(self, argv: List[str]) -> bool:
play = args.play

try:
- res = self(
+ self(
input=input_,
server_ip=server_ip,
port=port,
@@ -257,17 +258,42 @@ def __call__(self,
logger.info("tts http client start")
from paddlespeech.server.utils.audio_handler import TTSHttpHandler
handler = TTSHttpHandler(server_ip, port, play)
- handler.run(input, spk_id, speed, volume, sample_rate, output)
+ first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = handler.run(
+ input, spk_id, speed, volume, sample_rate, output)
+ delay_time_list = compute_delay(receive_time_list,
+ chunk_duration_list)

elif protocol == "websocket":
from paddlespeech.server.utils.audio_handler import TTSWsHandler
logger.info("tts websocket client start")
handler = TTSWsHandler(server_ip, port, play)
loop = asyncio.get_event_loop()
- loop.run_until_complete(handler.run(input, output))
+ first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = loop.run_until_complete(
+ handler.run(input, output))
+ delay_time_list = compute_delay(receive_time_list,
+ chunk_duration_list)

else:
logger.error("Please set correct protocol, http or websocket")
return False

logger.info(f"sentence: {input}")
logger.info(f"duration: {duration} s")
logger.info(f"first response: {first_response} s")
logger.info(f"final response: {final_response} s")
logger.info(f"RTF: {final_response/duration}")
if output is not None:
if save_audio_success:
logger.info(f"Audio successfully saved in {output}")
else:
logger.error("Audio save failed.")

if delay_time_list != []:
logger.info(
f"Delay situation: total number of packages: {len(receive_time_list)}, the number of delayed packets: {len(delay_time_list)}, minimum delay time: {min(delay_time_list)} s, maximum delay time: {max(delay_time_list)} s, average delay time: {sum(delay_time_list)/len(delay_time_list)} s, delay rate:{len(delay_time_list)/len(receive_time_list)}"
)
else:
logger.info("The sentence has no delay in streaming synthesis.")


@cli_client_register(
24 changes: 2 additions & 22 deletions paddlespeech/server/conf/application.yaml
@@ -1,4 +1,4 @@
- # This is the parameter configuration file for PaddleSpeech Serving.
+ # This is the parameter configuration file for PaddleSpeech Offline Serving..

#################################################################################
# SERVER SETTING #
@@ -7,9 +7,7 @@ host: 127.0.0.1
port: 8090

# The task format in the engin_list is: <speech task>_<engine type>
- # task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference']
- # protocol = ['websocket', 'http'] (only one can be selected).
- # http only support offline engine type.
+ # task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference', 'cls_python', 'cls_inference']
protocol: 'http'
engine_list: ['asr_python', 'tts_python', 'cls_python', 'text_python', 'vector_python']
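
Each engine_list entry follows <speech task>_<engine type>, so splitting on the last underscore recovers both parts; a quick illustration:

engine_list = ['asr_python', 'tts_python', 'cls_python', 'text_python', 'vector_python']
for entry in engine_list:
    task, _, engine_type = entry.rpartition('_')  # split on the last underscore
    print(task, engine_type)                      # asr python, tts python, ...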

@@ -50,24 +48,6 @@ asr_inference:
summary: True # False -> do not show predictor config


- ################### speech task: asr; engine_type: online #######################
- asr_online:
- model_type: 'deepspeech2online_aishell'
- am_model: # the pdmodel file of am static model [optional]
- am_params: # the pdiparams file of am static model [optional]
- lang: 'zh'
- sample_rate: 16000
- cfg_path:
- decode_method:
- force_yes: True
-
- am_predictor_conf:
- device: # set 'gpu:id' or 'cpu'
- switch_ir_optim: True
- glog_info: False # True -> print glog
- summary: True # False -> do not show predictor config


################################### TTS #########################################
################### speech task: tts; engine_type: python #######################
tts_python:
8 changes: 4 additions & 4 deletions paddlespeech/server/conf/tts_online_application.yaml
@@ -43,12 +43,12 @@ tts_online:
device: 'cpu' # set 'gpu:id' or 'cpu'
# am_block and am_pad only for fastspeech2_cnndecoder_onnx model to streaming am infer,
# when am_pad set 12, streaming synthetic audio is the same as non-streaming synthetic audio
- am_block: 42
+ am_block: 72
am_pad: 12
# voc_pad and voc_block voc model to streaming voc infer,
# when voc model is mb_melgan_csmsc, voc_pad set 14, streaming synthetic audio is the same as non-streaming synthetic audio; The minimum value of pad can be set to 7, streaming synthetic audio sounds normal
# when voc model is hifigan_csmsc, voc_pad set 20, streaming synthetic audio is the same as non-streaming synthetic audio; voc_pad set 14, streaming synthetic audio sounds normal
- voc_block: 14
+ voc_block: 36
voc_pad: 14


@@ -91,12 +91,12 @@ tts_online-onnx:
lang: 'zh'
# am_block and am_pad only for fastspeech2_cnndecoder_onnx model to streaming am infer,
# when am_pad set 12, streaming synthetic audio is the same as non-streaming synthetic audio
- am_block: 42
+ am_block: 72
am_pad: 12
# voc_pad and voc_block voc model to streaming voc infer,
# when voc model is mb_melgan_csmsc_onnx, voc_pad set 14, streaming synthetic audio is the same as non-streaming synthetic audio; The minimum value of pad can be set to 7, streaming synthetic audio sounds normal
# when voc model is hifigan_csmsc_onnx, voc_pad set 20, streaming synthetic audio is the same as non-streaming synthetic audio; voc_pad set 14, streaming synthetic audio sounds normal
- voc_block: 14
+ voc_block: 36
voc_pad: 14
# voc_upsample should be same as n_shift on voc config.
voc_upsample: 300
79 changes: 11 additions & 68 deletions paddlespeech/server/engine/asr/online/asr_engine.py
@@ -20,10 +20,9 @@
from numpy import float32
from yacs.config import CfgNode

+ from .pretrained_models import pretrained_models
from paddlespeech.cli.asr.infer import ASRExecutor
from paddlespeech.cli.asr.infer import model_alias
from paddlespeech.cli.log import logger
from paddlespeech.cli.utils import download_and_decompress
from paddlespeech.cli.utils import MODEL_HOME
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.speech import SpeechSegment
@@ -40,45 +39,6 @@

__all__ = ['ASREngine']

- pretrained_models = {
- "deepspeech2online_aishell-zh-16k": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz',
- 'md5':
- '98b87b171b7240b7cae6e07d8d0bc9be',
- 'cfg_path':
- 'model.yaml',
- 'ckpt_path':
- 'exp/deepspeech2_online/checkpoints/avg_1',
- 'model':
- 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel',
- 'params':
- 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams',
- 'lm_url':
- 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
- 'lm_md5':
- '29e02312deb2e59b3c8686c7966d4fe3'
- },
- "conformer_online_multicn-zh-16k": {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.3.model.tar.gz',
- 'md5':
- '0ac93d390552336f2a906aec9e33c5fa',
- 'cfg_path':
- 'model.yaml',
- 'ckpt_path':
- 'exp/chunk_conformer/checkpoints/multi_cn',
- 'model':
- 'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
- 'params':
- 'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
- 'lm_url':
- 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
- 'lm_md5':
- '29e02312deb2e59b3c8686c7966d4fe3'
- },
- }


# ASR server connection process class
class PaddleASRConnectionHanddler:
@@ -626,24 +586,7 @@ def rescoring(self):
class ASRServerExecutor(ASRExecutor):
def __init__(self):
super().__init__()
- pass

- def _get_pretrained_path(self, tag: str) -> os.PathLike:
- """
- Download and returns pretrained resources path of current task.
- """
- support_models = list(pretrained_models.keys())
- assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
- tag, '\n\t\t'.join(support_models))
-
- res_path = os.path.join(MODEL_HOME, tag)
- decompressed_path = download_and_decompress(pretrained_models[tag],
- res_path)
- decompressed_path = os.path.abspath(decompressed_path)
- logger.info(
- 'Use pretrained model stored in: {}'.format(decompressed_path))
-
- return decompressed_path
+ self.pretrained_models = pretrained_models

def _init_from_path(self,
model_type: str='deepspeech2online_aishell',
@@ -659,20 +602,20 @@
"""
self.model_type = model_type
self.sample_rate = sample_rate
- sample_rate_str = '16k' if sample_rate == 16000 else '8k'
- tag = model_type + '-' + lang + '-' + sample_rate_str
if cfg_path is None or am_model is None or am_params is None:
+ sample_rate_str = '16k' if sample_rate == 16000 else '8k'
+ tag = model_type + '-' + lang + '-' + sample_rate_str
logger.info(f"Load the pretrained model, tag = {tag}")
res_path = self._get_pretrained_path(tag) # wenetspeech_zh
self.res_path = res_path

- self.cfg_path = os.path.join(res_path,
- pretrained_models[tag]['cfg_path'])
+ self.cfg_path = os.path.join(
+ res_path, self.pretrained_models[tag]['cfg_path'])

self.am_model = os.path.join(res_path,
- pretrained_models[tag]['model'])
+ self.pretrained_models[tag]['model'])
self.am_params = os.path.join(res_path,
- pretrained_models[tag]['params'])
+ self.pretrained_models[tag]['params'])
logger.info(res_path)
else:
self.cfg_path = os.path.abspath(cfg_path)
@@ -700,8 +643,8 @@ def _init_from_path(self,
self.text_feature = TextFeaturizer(
unit_type=self.config.unit_type, vocab=self.vocab)

- lm_url = pretrained_models[tag]['lm_url']
- lm_md5 = pretrained_models[tag]['lm_md5']
+ lm_url = self.pretrained_models[tag]['lm_url']
+ lm_md5 = self.pretrained_models[tag]['lm_md5']
logger.info(f"Start to load language model {lm_url}")
self.download_lm(
lm_url,
@@ -774,7 +717,7 @@ def _init_from_path(self,
model_name = model_type[:model_type.rindex(
'_')] # model_type: {model_name}_{dataset}
logger.info(f"model name: {model_name}")
- model_class = dynamic_import(model_name, model_alias)
+ model_class = dynamic_import(model_name, self.model_alias)
model_conf = self.config
model = model_class.from_config(model_conf)
self.model = model
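
The slice above strips the dataset suffix from model_type before the alias lookup; a quick check with the model tags used elsewhere in this PR:

# model_type follows {model_name}_{dataset}; rindex('_') drops the dataset part.
for model_type in ('deepspeech2online_aishell', 'conformer_online_multicn'):
    print(model_type[:model_type.rindex('_')])  # deepspeech2online, conformer_online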
52 changes: 52 additions & 0 deletions paddlespeech/server/engine/asr/online/pretrained_models.py
@@ -0,0 +1,52 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

pretrained_models = {
"deepspeech2online_aishell-zh-16k": {
'url':
'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz',
'md5':
'98b87b171b7240b7cae6e07d8d0bc9be',
'cfg_path':
'model.yaml',
'ckpt_path':
'exp/deepspeech2_online/checkpoints/avg_1',
'model':
'exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel',
'params':
'exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams',
'lm_url':
'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
'lm_md5':
'29e02312deb2e59b3c8686c7966d4fe3'
},
"conformer_online_multicn-zh-16k": {
'url':
'https://paddlespeech.bj.bcebos.com/s2t/multi_cn/asr1/asr1_chunk_conformer_multi_cn_ckpt_0.2.3.model.tar.gz',
'md5':
'0ac93d390552336f2a906aec9e33c5fa',
'cfg_path':
'model.yaml',
'ckpt_path':
'exp/chunk_conformer/checkpoints/multi_cn',
'model':
'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
'params':
'exp/chunk_conformer/checkpoints/multi_cn.pdparams',
'lm_url':
'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
'lm_md5':
'29e02312deb2e59b3c8686c7966d4fe3'
},
}
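
A short sketch of how these entries are consumed after the refactor: the executor builds a tag from model type, language, and sample rate, then joins the relative paths stored here. The helper below and the model_home default are illustrative assumptions; only the key layout comes from this file and asr_engine.py.

import os

from paddlespeech.server.engine.asr.online.pretrained_models import pretrained_models

def resolve_paths(model_type='deepspeech2online_aishell', lang='zh', sample_rate=16000,
                  model_home='~/.paddlespeech/models'):  # model_home is an assumed default
    sample_rate_str = '16k' if sample_rate == 16000 else '8k'
    tag = model_type + '-' + lang + '-' + sample_rate_str  # e.g. deepspeech2online_aishell-zh-16k
    meta = pretrained_models[tag]
    res_path = os.path.join(os.path.expanduser(model_home), tag)
    return {
        'cfg_path': os.path.join(res_path, meta['cfg_path']),
        'am_model': os.path.join(res_path, meta['model']),
        'am_params': os.path.join(res_path, meta['params']),
        'lm_url': meta['lm_url'],
    }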