Skip to content

Commit

Permalink
Remove tts (blocking call) (#6869)
Browse files Browse the repository at this point in the history
  • Loading branch information
ic-xu authored Aug 1, 2024
1 parent f31142e commit a9cd6df
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 138 deletions.
46 changes: 1 addition & 45 deletions api/core/model_runtime/model_providers/azure_openai/tts/tts.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
import concurrent.futures
import copy
from functools import reduce
from io import BytesIO
from typing import Optional

from flask import Response
from openai import AzureOpenAI
from pydub import AudioSegment

from core.model_runtime.entities.model_entities import AIModelEntity
from core.model_runtime.errors.invoke import InvokeBadRequestError
Expand Down Expand Up @@ -51,7 +47,7 @@ def validate_credentials(self, model: str, credentials: dict) -> None:
:return: text translated to audio file
"""
try:
self._tts_invoke(
self._tts_invoke_streaming(
model=model,
credentials=credentials,
content_text='Hello Dify!',
Expand All @@ -60,45 +56,6 @@ def validate_credentials(self, model: str, credentials: dict) -> None:
except Exception as ex:
raise CredentialsValidateFailedError(str(ex))

def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
    """
    _tts_invoke text2speech model

    Splits the text into sentence chunks, synthesizes each chunk concurrently
    via _process_sentence, concatenates the audio and returns it as a single
    HTTP response.

    :param model: model name
    :param credentials: model credentials
    :param content_text: text content to be translated
    :param voice: model timbre
    :return: text translated to audio file
    :raises InvokeBadRequestError: if synthesis or audio assembly fails

    NOTE(review): if no sentence yields audio this method implicitly returns
    None (original behavior preserved) — confirm callers tolerate that.
    """
    audio_type = self._get_model_audio_type(model, credentials)
    word_limit = self._get_model_word_limit(model, credentials)
    max_workers = self._get_model_workers_limit(model, credentials)
    try:
        sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
        audio_bytes_list = []

        # Create a thread pool and map the function to the list of sentences
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(self._process_sentence, sentence=sentence, model=model, voice=voice,
                                       credentials=credentials) for sentence in sentences]
            for future in futures:
                try:
                    # Bind the result once: Future.result() re-raises worker
                    # exceptions, and the original called it twice per future.
                    audio_bytes = future.result()
                    if audio_bytes:
                        audio_bytes_list.append(audio_bytes)
                except Exception as ex:
                    raise InvokeBadRequestError(str(ex))

        if audio_bytes_list:
            audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type)
                              for audio_bytes in audio_bytes_list if audio_bytes]
            combined_segment = reduce(lambda x, y: x + y, audio_segments)
            buffer: BytesIO = BytesIO()
            combined_segment.export(buffer, format=audio_type)
            buffer.seek(0)
            return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
    except InvokeBadRequestError:
        # Already the right type — avoid double-wrapping the message.
        raise
    except Exception as ex:
        raise InvokeBadRequestError(str(ex))

def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
voice: str) -> any:
"""
Expand Down Expand Up @@ -144,7 +101,6 @@ def _process_sentence(self, sentence: str, model: str,
:param sentence: text content to be translated
:return: text translated to audio file
"""
# transform credentials to kwargs for model instance
credentials_kwargs = self._to_credential_kwargs(credentials)
client = AzureOpenAI(**credentials_kwargs)
response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip())
Expand Down
52 changes: 5 additions & 47 deletions api/core/model_runtime/model_providers/openai/tts/tts.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
import concurrent.futures
from functools import reduce
from io import BytesIO
from typing import Optional

from flask import Response
from openai import OpenAI
from pydub import AudioSegment

from core.model_runtime.errors.invoke import InvokeBadRequestError
from core.model_runtime.errors.validate import CredentialsValidateFailedError
Expand All @@ -32,7 +28,8 @@ def _invoke(self, model: str, tenant_id: str, credentials: dict,
:return: text translated to audio file
"""

if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]:
if not voice or voice not in [d['value'] for d in
self.get_tts_model_voices(model=model, credentials=credentials)]:
voice = self._get_model_default_voice(model, credentials)
# if streaming:
return self._tts_invoke_streaming(model=model,
Expand All @@ -50,7 +47,7 @@ def validate_credentials(self, model: str, credentials: dict, user: Optional[str
:return: text translated to audio file
"""
try:
self._tts_invoke(
self._tts_invoke_streaming(
model=model,
credentials=credentials,
content_text='Hello Dify!',
Expand All @@ -59,46 +56,6 @@ def validate_credentials(self, model: str, credentials: dict, user: Optional[str
except Exception as ex:
raise CredentialsValidateFailedError(str(ex))

def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
    """
    _tts_invoke text2speech model

    Splits the text into sentence chunks, synthesizes each chunk concurrently
    via _process_sentence, concatenates the audio and returns it as a single
    HTTP response.

    :param model: model name
    :param credentials: model credentials
    :param content_text: text content to be translated
    :param voice: model timbre
    :return: text translated to audio file
    :raises InvokeBadRequestError: if synthesis or audio assembly fails

    NOTE(review): if no sentence yields audio this method implicitly returns
    None (original behavior preserved) — confirm callers tolerate that.
    """
    audio_type = self._get_model_audio_type(model, credentials)
    word_limit = self._get_model_word_limit(model, credentials)
    max_workers = self._get_model_workers_limit(model, credentials)
    try:
        sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
        audio_bytes_list = []

        # Create a thread pool and map the function to the list of sentences
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(self._process_sentence, sentence=sentence, model=model, voice=voice,
                                       credentials=credentials) for sentence in sentences]
            for future in futures:
                try:
                    # Bind the result once: Future.result() re-raises worker
                    # exceptions, and the original called it twice per future.
                    audio_bytes = future.result()
                    if audio_bytes:
                        audio_bytes_list.append(audio_bytes)
                except Exception as ex:
                    raise InvokeBadRequestError(str(ex))

        if audio_bytes_list:
            audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type)
                              for audio_bytes in audio_bytes_list if audio_bytes]
            combined_segment = reduce(lambda x, y: x + y, audio_segments)
            buffer: BytesIO = BytesIO()
            combined_segment.export(buffer, format=audio_type)
            buffer.seek(0)
            return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
    except InvokeBadRequestError:
        # Already the right type — avoid double-wrapping the message.
        raise
    except Exception as ex:
        raise InvokeBadRequestError(str(ex))


def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
voice: str) -> any:
"""
Expand All @@ -114,7 +71,8 @@ def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str
# doc: https://platform.openai.com/docs/guides/text-to-speech
credentials_kwargs = self._to_credential_kwargs(credentials)
client = OpenAI(**credentials_kwargs)
model_support_voice = [x.get("value") for x in self.get_tts_model_voices(model=model, credentials=credentials)]
model_support_voice = [x.get("value") for x in
self.get_tts_model_voices(model=model, credentials=credentials)]
if not voice or voice not in model_support_voice:
voice = self._get_model_default_voice(model, credentials)
word_limit = self._get_model_word_limit(model, credentials)
Expand Down
47 changes: 1 addition & 46 deletions api/core/model_runtime/model_providers/tongyi/tts/tts.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,11 @@
import concurrent.futures
import threading
from functools import reduce
from io import BytesIO
from queue import Queue
from typing import Optional

import dashscope
from dashscope import SpeechSynthesizer
from dashscope.api_entities.dashscope_response import SpeechSynthesisResponse
from dashscope.audio.tts import ResultCallback, SpeechSynthesisResult
from flask import Response
from pydub import AudioSegment

from core.model_runtime.errors.invoke import InvokeBadRequestError
from core.model_runtime.errors.validate import CredentialsValidateFailedError
Expand Down Expand Up @@ -55,7 +50,7 @@ def validate_credentials(self, model: str, credentials: dict, user: Optional[str
:return: text translated to audio file
"""
try:
self._tts_invoke(
self._tts_invoke_streaming(
model=model,
credentials=credentials,
content_text='Hello Dify!',
Expand All @@ -64,46 +59,6 @@ def validate_credentials(self, model: str, credentials: dict, user: Optional[str
except Exception as ex:
raise CredentialsValidateFailedError(str(ex))

def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
    """
    _tts_invoke text2speech model

    Splits the text into sentence chunks, synthesizes each chunk concurrently
    via _process_sentence, concatenates the audio and returns it as a single
    HTTP response.

    :param model: model name
    :param credentials: model credentials
    :param voice: model timbre
    :param content_text: text content to be translated
    :return: text translated to audio file
    :raises InvokeBadRequestError: if synthesis or audio assembly fails

    NOTE(review): if no sentence yields audio this method implicitly returns
    None (original behavior preserved) — confirm callers tolerate that.
    """
    audio_type = self._get_model_audio_type(model, credentials)
    word_limit = self._get_model_word_limit(model, credentials)
    max_workers = self._get_model_workers_limit(model, credentials)
    try:
        sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
        audio_bytes_list = []

        # Create a thread pool and map the function to the list of sentences
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(self._process_sentence, sentence=sentence,
                                       credentials=credentials, voice=voice, audio_type=audio_type)
                       for sentence in sentences]
            for future in futures:
                try:
                    # Bind the result once: Future.result() re-raises worker
                    # exceptions, and the original called it twice per future.
                    audio_bytes = future.result()
                    if audio_bytes:
                        audio_bytes_list.append(audio_bytes)
                except Exception as ex:
                    raise InvokeBadRequestError(str(ex))

        if audio_bytes_list:
            audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type)
                              for audio_bytes in audio_bytes_list if audio_bytes]
            combined_segment = reduce(lambda x, y: x + y, audio_segments)
            buffer: BytesIO = BytesIO()
            combined_segment.export(buffer, format=audio_type)
            buffer.seek(0)
            return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
    except InvokeBadRequestError:
        # Already the right type — avoid double-wrapping the message.
        raise
    except Exception as ex:
        raise InvokeBadRequestError(str(ex))

def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
voice: str) -> any:
"""
Expand Down

0 comments on commit a9cd6df

Please sign in to comment.