From ef871cf5741f1d7a4d94b2e49d88b20c34386059 Mon Sep 17 00:00:00 2001 From: TomasLiu Date: Mon, 21 Oct 2024 19:02:13 +0800 Subject: [PATCH 1/6] support non audio --- agents/property.json | 264 ++++++++++++++++++ .../extension/openai_v2v_python/conf.py | 4 +- .../extension/openai_v2v_python/extension.py | 115 ++++++-- 3 files changed, 358 insertions(+), 25 deletions(-) diff --git a/agents/property.json b/agents/property.json index 40bc5d04..080af142 100644 --- a/agents/property.json +++ b/agents/property.json @@ -2860,6 +2860,270 @@ ] } ] + }, + { + "name": "va.openai.v2v.fish", + "auto_start": false, + "nodes": [ + { + "type": "extension", + "extension_group": "rtc", + "addon": "agora_rtc", + "name": "agora_rtc", + "property": { + "app_id": "${env:AGORA_APP_ID}", + "token": "", + "channel": "ten_agent_test", + "stream_id": 1234, + "remote_stream_id": 123, + "subscribe_audio": true, + "publish_audio": true, + "publish_data": true, + "subscribe_audio_sample_rate": 24000 + } + }, + { + "type": "extension", + "extension_group": "llm", + "addon": "openai_v2v_python", + "name": "openai_v2v_python", + "property": { + "api_key": "${env:OPENAI_REALTIME_API_KEY}", + "temperature": 0.9, + "model": "gpt-4o-realtime-preview", + "max_tokens": 2048, + "audio_out": false, + "language": "en-US", + "server_vad": true, + "dump": true, + "history": 10 + } + }, + { + "type": "extension", + "extension_group": "tts", + "addon": "fish_audio_tts", + "name": "fish_audio_tts", + "property": { + "api_key": "${env:FISH_AUDIO_TTS_KEY}", + "model_id": "d8639b5cc95548f5afbcfe22d3ba5ce5", + "optimize_streaming_latency": true, + "request_timeout_seconds": 30, + "base_url": "https://api.fish.audio" + } + }, + { + "type": "extension", + "extension_group": "transcriber", + "addon": "message_collector", + "name": "message_collector" + }, + { + "type": "extension", + "extension_group": "tools", + "addon": "weatherapi_tool_python", + "name": "weatherapi_tool_python", + "property": { + "api_key": "${env:WEATHERAPI_API_KEY}" + } + }, + { + "type": "extension", + "extension_group": "tools", + "addon": "bingsearch_tool_python", + "name": "bingsearch_tool_python", + "property": { + "api_key": "${env:BING_API_KEY}" + } + } + ], + "connections": [ + { + "extension_group": "rtc", + "extension": "agora_rtc", + "audio_frame": [ + { + "name": "pcm_frame", + "dest": [ + { + "extension_group": "llm", + "extension": "openai_v2v_python" + } + ] + } + ] + }, + { + "extension_group": "tools", + "extension": "weatherapi_tool_python", + "cmd": [ + { + "name": "tool_register", + "dest": [ + { + "extension_group": "llm", + "extension": "openai_v2v_python" + } + ] + } + ] + }, + { + "extension_group": "tools", + "extension": "bingsearch_tool_python", + "cmd": [ + { + "name": "tool_register", + "dest": [ + { + "extension_group": "llm", + "extension": "openai_v2v_python" + } + ] + } + ] + }, + { + "extension_group": "llm", + "extension": "openai_v2v_python", + "data": [ + { + "name": "text_data", + "dest": [ + { + "extension_group": "transcriber", + "extension": "message_collector" + }, + { + "extension_group": "tts", + "extension": "fish_audio_tts" + } + ] + } + ], + "cmd": [ + { + "name": "flush", + "dest": [ + { + "extension_group": "rtc", + "extension": "agora_rtc" + } + ] + }, + { + "name": "tool_call_get_current_weather", + "dest": [ + { + "extension_group": "tools", + "extension": "weatherapi_tool_python", + "msg_conversion": { + "type": "per_property", + "keep_original": true, + "rules": [ + { + "path": "_ten.name", + "conversion_mode": "fixed_value", + "value": "tool_call" + } + ] + } + } + ] + }, + { + "name": "tool_call_get_past_weather", + "dest": [ + { + "extension_group": "tools", + "extension": "weatherapi_tool_python", + "msg_conversion": { + "type": "per_property", + "keep_original": true, + "rules": [ + { + "path": "_ten.name", + "conversion_mode": "fixed_value", + "value": "tool_call" + } + ] + } + } + ] + }, + { + "name": "tool_call_get_future_weather", + "dest": [ + { + "extension_group": "tools", + "extension": "weatherapi_tool_python", + "msg_conversion": { + "type": "per_property", + "keep_original": true, + "rules": [ + { + "path": "_ten.name", + "conversion_mode": "fixed_value", + "value": "tool_call" + } + ] + } + } + ] + }, + { + "name": "tool_call_bing_search", + "dest": [ + { + "extension_group": "tools", + "extension": "bingsearch_tool_python", + "msg_conversion": { + "type": "per_property", + "keep_original": true, + "rules": [ + { + "path": "_ten.name", + "conversion_mode": "fixed_value", + "value": "tool_call" + } + ] + } + } + ] + } + ] + }, + { + "extension_group": "tts", + "extension": "fish_audio_tts", + "audio_frame": [ + { + "name": "pcm_frame", + "dest": [ + { + "extension_group": "rtc", + "extension": "agora_rtc" + } + ] + } + ] + }, + { + "extension_group": "transcriber", + "extension": "message_collector", + "data": [ + { + "name": "data", + "dest": [ + { + "extension_group": "rtc", + "extension": "agora_rtc" + } + ] + } + ] + } + ] } ] } diff --git a/agents/ten_packages/extension/openai_v2v_python/conf.py b/agents/ten_packages/extension/openai_v2v_python/conf.py index d72d1766..ec997abe 100644 --- a/agents/ten_packages/extension/openai_v2v_python/conf.py +++ b/agents/ten_packages/extension/openai_v2v_python/conf.py @@ -26,7 +26,8 @@ def __init__( temperature: float = 0.5, max_tokens: int = 1024, voice: Voices = Voices.Alloy, - server_vad:bool=True + server_vad:bool=True, + audio_out:bool=True ): self.base_uri = base_uri self.api_key = api_key @@ -39,6 +40,7 @@ def __init__( self.max_tokens = max_tokens self.voice = voice self.server_vad = server_vad + self.audio_out = audio_out def build_ctx(self) -> dict: return { diff --git a/agents/ten_packages/extension/openai_v2v_python/extension.py b/agents/ten_packages/extension/openai_v2v_python/extension.py index 57ee3945..423f566e 100644 --- a/agents/ten_packages/extension/openai_v2v_python/extension.py +++ b/agents/ten_packages/extension/openai_v2v_python/extension.py @@ -41,6 +41,7 @@ PROPERTY_TEMPERATURE = "temperature" # Optional PROPERTY_MAX_TOKENS = "max_tokens" # Optional PROPERTY_VOICE = "voice" # Optional +PROPERTY_AUDIO_OUT = "audio_out" # Optional PROPERTY_SERVER_VAD = "server_vad" # Optional PROPERTY_STREAM_ID = "stream_id" PROPERTY_LANGUAGE = "language" @@ -249,9 +250,17 @@ def get_time_ms() -> int: logger.warning( f"On flushed transcript delta {message.response_id} {message.output_index} {message.content_index} {message.delta}") continue - self.transcript += message.delta self._send_transcript( - ten_env, self.transcript, Role.Assistant, False) + ten_env, message.delta, Role.Assistant, False) + case ResponseTextDelta(): + logger.info( + f"On response text delta {message.response_id} {message.output_index} {message.content_index} {message.delta}") + if message.response_id in flushed: + logger.warning( + f"On flushed text delta {message.response_id} {message.output_index} {message.content_index} {message.delta}") + continue + self._send_transcript( + ten_env, message.delta, Role.Assistant, False) case ResponseAudioTranscriptDone(): logger.info( f"On response transcript done {message.output_index} {message.content_index} {message.transcript}") @@ -261,7 +270,17 @@ def get_time_ms() -> int: continue self.transcript = "" self._send_transcript( - ten_env, message.transcript, Role.Assistant, True) + ten_env, "", Role.Assistant, True) + case ResponseTextDone(): + logger.info( + f"On response text done {message.output_index} {message.content_index} {message.text}") + if message.response_id in flushed: + logger.warning( + f"On flushed text done {message.response_id}") + continue + self.transcript = "" + self._send_transcript( + ten_env, "", Role.Assistant, True) case ResponseOutputItemDone(): logger.info(f"Output item done {message.item}") case ResponseOutputItemAdded(): @@ -391,6 +410,14 @@ def _fetch_properties(self, ten_env: TenEnv): f"GetProperty optional {PROPERTY_TEMPERATURE} failed, err: {err}" ) + try: + audio_out = ten_env.get_property_bool(PROPERTY_AUDIO_OUT) + self.config.audio_out = audio_out + except Exception as err: + logger.info( + f"GetProperty optional {PROPERTY_AUDIO_OUT} failed, err: {err}" + ) + try: max_tokens = ten_env.get_property_int(PROPERTY_MAX_TOKENS) if max_tokens > 0: @@ -459,15 +486,25 @@ def _update_session(self) -> SessionUpdate: self.ctx["tools"] = self.registry.to_prompt() prompt = self._replace(self.config.instruction) self.last_updated = datetime.now() - return SessionUpdate(session=SessionUpdateParams( - instructions=prompt, - model=self.config.model, - voice=self.config.voice, - input_audio_transcription=InputAudioTranscription( - model="whisper-1"), - tool_choice="auto", - tools=self.registry.get_tools() - )) + if self.config.audio_out: + return SessionUpdate(session=SessionUpdateParams( + instructions=prompt, + model=self.config.model, + voice=self.config.voice, + input_audio_transcription=InputAudioTranscription( + model="whisper-1"), + tool_choice="auto", + tools=self.registry.get_tools() + )) + else: + return SessionUpdate(session=SessionUpdateParams( + instructions=prompt, + model=self.config.model, + modalities=["text"], + # input_audio_transcription=InputAudioTranscription(model="whisper-1"), disable transcript for now. + tool_choice="auto", + tools=self.registry.get_tools() + )) ''' def _update_conversation(self) -> UpdateConversationConfig: @@ -506,20 +543,50 @@ def _on_audio_delta(self, ten_env: TenEnv, delta: bytes) -> None: f.unlock_buf(buff) ten_env.send_audio_frame(f) - def _send_transcript(self, ten_env: TenEnv, transcript: str, role: Role, is_final: bool) -> None: + def _send_transcript(self, ten_env: TenEnv, content: str, role: Role, is_final: bool) -> None: + def is_punctuation(char): + if char in [",", ",", ".", "。", "?", "?", "!", "!"]: + return True + return False + + def parse_sentences(sentence_fragment, content): + sentences = [] + current_sentence = sentence_fragment + for char in content: + current_sentence += char + if is_punctuation(char): + # Check if the current sentence contains non-punctuation characters + stripped_sentence = current_sentence + if any(c.isalnum() for c in stripped_sentence): + sentences.append(stripped_sentence) + current_sentence = "" # Reset for the next sentence + + remain = current_sentence # Any remaining characters form the incomplete sentence + return sentences, remain + + def send_data(ten_env: TenEnv, sentence: str, stream_id: int, is_final: bool): + try: + d = Data.create("text_data") + d.set_property_string("text", sentence) + d.set_property_bool("end_of_segment", is_final) + d.set_property_int("stream_id", stream_id) + logger.info( + f"send transcript text [{sentence}] stream_id {stream_id} is_final {is_final} end_of_segment {is_final} role {role}") + ten_env.send_data(d) + except: + logger.exception( + f"Error send text data {role}: {sentence} {is_final}") + + stream_id = self.remote_stream_id if role == Role.User else 0 try: - d = Data.create("text_data") - d.set_property_string("text", transcript) - d.set_property_bool("end_of_segment", is_final) - stream_id = self.remote_stream_id if role == Role.User else 0 - d.set_property_int("stream_id", stream_id) - d.set_property_bool("is_final", is_final) - logger.debug( - f"send transcript text [{transcript}] stream_id {stream_id} is_final {is_final} end_of_segment {is_final} role {role}") - ten_env.send_data(d) + if role == Role.Assistant and not is_final: + sentences, self.transcript = parse_sentences(self.transcript, content) + for s in sentences: + send_data(ten_env, s, stream_id, is_final) + else: + send_data(ten_env, content, stream_id, is_final) except: - logger.exception( - f"Error send text data {role}: {transcript} {is_final}") + logger.exception(f"Error send text data {role}: {content} {is_final}") def _flush(self, ten_env: TenEnv) -> None: try: From 73215065b84ca51abfee2bad41ed854157bcf380 Mon Sep 17 00:00:00 2001 From: TomasLiu Date: Mon, 21 Oct 2024 20:28:54 +0800 Subject: [PATCH 2/6] add input transcript switch --- agents/property.json | 8 ++++- .../extension/openai_v2v_python/conf.py | 6 ++-- .../extension/openai_v2v_python/extension.py | 31 +++++++++++-------- demo/src/app/api/agents/start/graph.tsx | 14 +++++++++ demo/src/common/constant.ts | 4 +++ 5 files changed, 47 insertions(+), 16 deletions(-) diff --git a/agents/property.json b/agents/property.json index 080af142..4295ce68 100644 --- a/agents/property.json +++ b/agents/property.json @@ -2879,7 +2879,13 @@ "subscribe_audio": true, "publish_audio": true, "publish_data": true, - "subscribe_audio_sample_rate": 24000 + "subscribe_audio_sample_rate": 24000, + "enable_agora_asr": true, + "agora_asr_vendor_name": "microsoft", + "agora_asr_language": "en-US", + "agora_asr_vendor_key": "${env:AZURE_STT_KEY}", + "agora_asr_vendor_region": "${env:AZURE_STT_REGION}", + "agora_asr_session_control_file_path": "session_control.conf" } }, { diff --git a/agents/ten_packages/extension/openai_v2v_python/conf.py b/agents/ten_packages/extension/openai_v2v_python/conf.py index ec997abe..bbd54b19 100644 --- a/agents/ten_packages/extension/openai_v2v_python/conf.py +++ b/agents/ten_packages/extension/openai_v2v_python/conf.py @@ -26,8 +26,9 @@ def __init__( temperature: float = 0.5, max_tokens: int = 1024, voice: Voices = Voices.Alloy, - server_vad:bool=True, - audio_out:bool=True + server_vad: bool = True, + audio_out: bool = True, + input_transcript: bool = True ): self.base_uri = base_uri self.api_key = api_key @@ -41,6 +42,7 @@ def __init__( self.voice = voice self.server_vad = server_vad self.audio_out = audio_out + self.input_transcript = input_transcript def build_ctx(self) -> dict: return { diff --git a/agents/ten_packages/extension/openai_v2v_python/extension.py b/agents/ten_packages/extension/openai_v2v_python/extension.py index 423f566e..9082165d 100644 --- a/agents/ten_packages/extension/openai_v2v_python/extension.py +++ b/agents/ten_packages/extension/openai_v2v_python/extension.py @@ -42,6 +42,7 @@ PROPERTY_MAX_TOKENS = "max_tokens" # Optional PROPERTY_VOICE = "voice" # Optional PROPERTY_AUDIO_OUT = "audio_out" # Optional +PROPERTY_INPUT_TRANSCRIPT = "input_transcript" PROPERTY_SERVER_VAD = "server_vad" # Optional PROPERTY_STREAM_ID = "stream_id" PROPERTY_LANGUAGE = "language" @@ -418,6 +419,14 @@ def _fetch_properties(self, ten_env: TenEnv): f"GetProperty optional {PROPERTY_AUDIO_OUT} failed, err: {err}" ) + try: + input_transcript = ten_env.get_property_bool(PROPERTY_INPUT_TRANSCRIPT) + self.config.input_transcript = input_transcript + except Exception as err: + logger.info( + f"GetProperty optional {PROPERTY_INPUT_TRANSCRIPT} failed, err: {err}" + ) + try: max_tokens = ten_env.get_property_int(PROPERTY_MAX_TOKENS) if max_tokens > 0: @@ -486,25 +495,21 @@ def _update_session(self) -> SessionUpdate: self.ctx["tools"] = self.registry.to_prompt() prompt = self._replace(self.config.instruction) self.last_updated = datetime.now() - if self.config.audio_out: - return SessionUpdate(session=SessionUpdateParams( + su = SessionUpdate(session=SessionUpdateParams( instructions=prompt, model=self.config.model, - voice=self.config.voice, - input_audio_transcription=InputAudioTranscription( - model="whisper-1"), tool_choice="auto", tools=self.registry.get_tools() )) + if self.config.audio_out: + su.session.voice=self.config.voice else: - return SessionUpdate(session=SessionUpdateParams( - instructions=prompt, - model=self.config.model, - modalities=["text"], - # input_audio_transcription=InputAudioTranscription(model="whisper-1"), disable transcript for now. - tool_choice="auto", - tools=self.registry.get_tools() - )) + su.session.modalities=["text"] + + if self.config.input_transcript: + su.session.input_audio_transcription=InputAudioTranscription( + model="whisper-1") + return su ''' def _update_conversation(self) -> UpdateConversationConfig: diff --git a/demo/src/app/api/agents/start/graph.tsx b/demo/src/app/api/agents/start/graph.tsx index f01d6d95..a1fda36c 100644 --- a/demo/src/app/api/agents/start/graph.tsx +++ b/demo/src/app/api/agents/start/graph.tsx @@ -116,6 +116,20 @@ export const getGraphProperties = ( "greeting": greeting, } } + } else if (graphName == "va.openai.v2v.fish") { + return { + "openai_v2v_python": { + "model": "gpt-4o-realtime-preview", + "voice": voiceNameMap[language]["openai"][voiceType], + "language": language, + ...localizationOptions, + "system_message": prompt, + "greeting": greeting, + }, + "agora_rtc": { + "agora_asr_language": language, + }, + } } else if (graphName == "va.openai.azure") { return { "agora_rtc": { diff --git a/demo/src/common/constant.ts b/demo/src/common/constant.ts index 00fe177d..6240fbd5 100644 --- a/demo/src/common/constant.ts +++ b/demo/src/common/constant.ts @@ -48,6 +48,10 @@ export const GRAPH_OPTIONS: GraphOptionItem[] = [ { label: "Voice Agent with OpenAI Realtime API (Beta)", value: "va.openai.v2v" + }, + { + label: "Voice Agent with OpenAI Realtime API (Beta) + FishAudio TTS", + value: "va.openai.v2v.fish" } ] From 83ad1ed5dff8c663324bf9dd6d0dc023f7a34644 Mon Sep 17 00:00:00 2001 From: TomasLiu Date: Mon, 21 Oct 2024 20:59:06 +0800 Subject: [PATCH 3/6] fix connection --- agents/property.json | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/agents/property.json b/agents/property.json index 4295ce68..7ef2c295 100644 --- a/agents/property.json +++ b/agents/property.json @@ -2957,6 +2957,17 @@ } ] } + ], + "data": [ + { + "name": "text_data", + "dest": [ + { + "extension_group": "transcriber", + "extension": "message_collector" + } + ] + } ] }, { From 3b76a392b56e9f53b7ca8a55098a03e952a40e7d Mon Sep 17 00:00:00 2001 From: TomasLiu Date: Mon, 21 Oct 2024 21:24:38 +0800 Subject: [PATCH 4/6] fix --- agents/property.json | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/agents/property.json b/agents/property.json index 7ef2c295..47ab8b88 100644 --- a/agents/property.json +++ b/agents/property.json @@ -3023,8 +3023,8 @@ "name": "flush", "dest": [ { - "extension_group": "rtc", - "extension": "agora_rtc" + "extension_group": "tts", + "extension": "fish_audio_tts" } ] }, @@ -3123,6 +3123,17 @@ } ] } + ], + "cmd": [ + { + "name": "flush", + "dest": [ + { + "extension_group": "rtc", + "extension": "agora_rtc" + } + ] + } ] }, { From bdf183d5bf1564a004791ebe9d81f3794ad2961b Mon Sep 17 00:00:00 2001 From: TomasLiu Date: Mon, 21 Oct 2024 21:31:43 +0800 Subject: [PATCH 5/6] fix --- agents/property.json | 1 + 1 file changed, 1 insertion(+) diff --git a/agents/property.json b/agents/property.json index 47ab8b88..5b6f46a9 100644 --- a/agents/property.json +++ b/agents/property.json @@ -2899,6 +2899,7 @@ "model": "gpt-4o-realtime-preview", "max_tokens": 2048, "audio_out": false, + "input_transcript": false, "language": "en-US", "server_vad": true, "dump": true, From 9cac4d768ca03626b7facd97322ce38ca7a9f541 Mon Sep 17 00:00:00 2001 From: TomasLiu Date: Mon, 21 Oct 2024 21:51:19 +0800 Subject: [PATCH 6/6] fix weather check failure --- .../extension/weatherapi_tool_python/extension.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/agents/ten_packages/extension/weatherapi_tool_python/extension.py b/agents/ten_packages/extension/weatherapi_tool_python/extension.py index c3a10486..d1a028ad 100644 --- a/agents/ten_packages/extension/weatherapi_tool_python/extension.py +++ b/agents/ten_packages/extension/weatherapi_tool_python/extension.py @@ -40,7 +40,7 @@ "properties": { "location": { "type": "string", - "description": "The city and state e.g. San Francisco, CA" + "description": "The city and state (use only English) e.g. San Francisco, CA" } }, "required": ["location"], @@ -54,7 +54,7 @@ "properties": { "location": { "type": "string", - "description": "The city and state e.g. San Francisco, CA" + "description": "The city and state (use only English) e.g. San Francisco, CA" }, "datetime": { "type": "string", @@ -72,7 +72,7 @@ "properties": { "location": { "type": "string", - "description": "The city and state e.g. San Francisco, CA" + "description": "The city and state (use only English) e.g. San Francisco, CA" } }, "required": ["location"],