Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove tts (blocking call) #6869

Merged
merged 2 commits into from
Aug 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 1 addition & 45 deletions api/core/model_runtime/model_providers/azure_openai/tts/tts.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
import concurrent.futures
import copy
from functools import reduce
from io import BytesIO
from typing import Optional

from flask import Response
from openai import AzureOpenAI
from pydub import AudioSegment

from core.model_runtime.entities.model_entities import AIModelEntity
from core.model_runtime.errors.invoke import InvokeBadRequestError
Expand Down Expand Up @@ -51,7 +47,7 @@ def validate_credentials(self, model: str, credentials: dict) -> None:
:return: text translated to audio file
"""
try:
self._tts_invoke(
self._tts_invoke_streaming(
model=model,
credentials=credentials,
content_text='Hello Dify!',
Expand All @@ -60,45 +56,6 @@ def validate_credentials(self, model: str, credentials: dict) -> None:
except Exception as ex:
raise CredentialsValidateFailedError(str(ex))

def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
"""
_tts_invoke text2speech model
:param model: model name
:param credentials: model credentials
:param content_text: text content to be translated
:param voice: model timbre
:return: text translated to audio file
"""
audio_type = self._get_model_audio_type(model, credentials)
word_limit = self._get_model_word_limit(model, credentials)
max_workers = self._get_model_workers_limit(model, credentials)
try:
sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
audio_bytes_list = []

# Create a thread pool and map the function to the list of sentences
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = [executor.submit(self._process_sentence, sentence=sentence, model=model, voice=voice,
credentials=credentials) for sentence in sentences]
for future in futures:
try:
if future.result():
audio_bytes_list.append(future.result())
except Exception as ex:
raise InvokeBadRequestError(str(ex))

if len(audio_bytes_list) > 0:
audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
audio_bytes_list if audio_bytes]
combined_segment = reduce(lambda x, y: x + y, audio_segments)
buffer: BytesIO = BytesIO()
combined_segment.export(buffer, format=audio_type)
buffer.seek(0)
return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
except Exception as ex:
raise InvokeBadRequestError(str(ex))

def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
voice: str) -> any:
"""
Expand Down Expand Up @@ -144,7 +101,6 @@ def _process_sentence(self, sentence: str, model: str,
:param sentence: text content to be translated
:return: text translated to audio file
"""
# transform credentials to kwargs for model instance
credentials_kwargs = self._to_credential_kwargs(credentials)
client = AzureOpenAI(**credentials_kwargs)
response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip())
Expand Down
52 changes: 5 additions & 47 deletions api/core/model_runtime/model_providers/openai/tts/tts.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
import concurrent.futures
from functools import reduce
from io import BytesIO
from typing import Optional

from flask import Response
from openai import OpenAI
from pydub import AudioSegment

from core.model_runtime.errors.invoke import InvokeBadRequestError
from core.model_runtime.errors.validate import CredentialsValidateFailedError
Expand All @@ -32,7 +28,8 @@ def _invoke(self, model: str, tenant_id: str, credentials: dict,
:return: text translated to audio file
"""

if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]:
if not voice or voice not in [d['value'] for d in
self.get_tts_model_voices(model=model, credentials=credentials)]:
voice = self._get_model_default_voice(model, credentials)
# if streaming:
return self._tts_invoke_streaming(model=model,
Expand All @@ -50,7 +47,7 @@ def validate_credentials(self, model: str, credentials: dict, user: Optional[str
:return: text translated to audio file
"""
try:
self._tts_invoke(
self._tts_invoke_streaming(
model=model,
credentials=credentials,
content_text='Hello Dify!',
Expand All @@ -59,46 +56,6 @@ def validate_credentials(self, model: str, credentials: dict, user: Optional[str
except Exception as ex:
raise CredentialsValidateFailedError(str(ex))

def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
"""
_tts_invoke text2speech model
:param model: model name
:param credentials: model credentials
:param content_text: text content to be translated
:param voice: model timbre
:return: text translated to audio file
"""
audio_type = self._get_model_audio_type(model, credentials)
word_limit = self._get_model_word_limit(model, credentials)
max_workers = self._get_model_workers_limit(model, credentials)
try:
sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
audio_bytes_list = []

# Create a thread pool and map the function to the list of sentences
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = [executor.submit(self._process_sentence, sentence=sentence, model=model, voice=voice,
credentials=credentials) for sentence in sentences]
for future in futures:
try:
if future.result():
audio_bytes_list.append(future.result())
except Exception as ex:
raise InvokeBadRequestError(str(ex))

if len(audio_bytes_list) > 0:
audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
audio_bytes_list if audio_bytes]
combined_segment = reduce(lambda x, y: x + y, audio_segments)
buffer: BytesIO = BytesIO()
combined_segment.export(buffer, format=audio_type)
buffer.seek(0)
return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
except Exception as ex:
raise InvokeBadRequestError(str(ex))


def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
voice: str) -> any:
"""
Expand All @@ -114,7 +71,8 @@ def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str
# doc: https://platform.openai.com/docs/guides/text-to-speech
credentials_kwargs = self._to_credential_kwargs(credentials)
client = OpenAI(**credentials_kwargs)
model_support_voice = [x.get("value") for x in self.get_tts_model_voices(model=model, credentials=credentials)]
model_support_voice = [x.get("value") for x in
self.get_tts_model_voices(model=model, credentials=credentials)]
if not voice or voice not in model_support_voice:
voice = self._get_model_default_voice(model, credentials)
word_limit = self._get_model_word_limit(model, credentials)
Expand Down
47 changes: 1 addition & 46 deletions api/core/model_runtime/model_providers/tongyi/tts/tts.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,11 @@
import concurrent.futures
import threading
from functools import reduce
from io import BytesIO
from queue import Queue
from typing import Optional

import dashscope
from dashscope import SpeechSynthesizer
from dashscope.api_entities.dashscope_response import SpeechSynthesisResponse
from dashscope.audio.tts import ResultCallback, SpeechSynthesisResult
from flask import Response
from pydub import AudioSegment

from core.model_runtime.errors.invoke import InvokeBadRequestError
from core.model_runtime.errors.validate import CredentialsValidateFailedError
Expand Down Expand Up @@ -55,7 +50,7 @@ def validate_credentials(self, model: str, credentials: dict, user: Optional[str
:return: text translated to audio file
"""
try:
self._tts_invoke(
self._tts_invoke_streaming(
model=model,
credentials=credentials,
content_text='Hello Dify!',
Expand All @@ -64,46 +59,6 @@ def validate_credentials(self, model: str, credentials: dict, user: Optional[str
except Exception as ex:
raise CredentialsValidateFailedError(str(ex))

def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
"""
_tts_invoke text2speech model
:param model: model name
:param credentials: model credentials
:param voice: model timbre
:param content_text: text content to be translated
:return: text translated to audio file
"""
audio_type = self._get_model_audio_type(model, credentials)
word_limit = self._get_model_word_limit(model, credentials)
max_workers = self._get_model_workers_limit(model, credentials)
try:
sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
audio_bytes_list = []

# Create a thread pool and map the function to the list of sentences
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = [executor.submit(self._process_sentence, sentence=sentence,
credentials=credentials, voice=voice, audio_type=audio_type) for sentence in
sentences]
for future in futures:
try:
if future.result():
audio_bytes_list.append(future.result())
except Exception as ex:
raise InvokeBadRequestError(str(ex))

if len(audio_bytes_list) > 0:
audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
audio_bytes_list if audio_bytes]
combined_segment = reduce(lambda x, y: x + y, audio_segments)
buffer: BytesIO = BytesIO()
combined_segment.export(buffer, format=audio_type)
buffer.seek(0)
return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
except Exception as ex:
raise InvokeBadRequestError(str(ex))

def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
voice: str) -> any:
"""
Expand Down