diff --git a/agents/manifest-lock.json b/agents/manifest-lock.json index 67fe30ef..4a277f03 100644 --- a/agents/manifest-lock.json +++ b/agents/manifest-lock.json @@ -99,20 +99,21 @@ }, { "type": "extension", - "name": "azure_tts", - "version": "0.4.2", - "hash": "f1f3e58bd67ff45d537703690fe15f37f37b1e5754c29c483e74f15ee8fde7c8", + "name": "agora_sess_ctrl", + "version": "0.1.0", + "hash": "b789ec4864976c9c0ce512c118a3daedd4bb327cba00898816c546b6be4f8176", "dependencies": [ { "type": "system", "name": "ten_runtime" - }, - { - "type": "system", - "name": "azure_speech_sdk" } ], - "supports": [] + "supports": [ + { + "os": "linux", + "arch": "x64" + } + ] }, { "type": "system", diff --git a/agents/manifest.json b/agents/manifest.json index a2488fd2..a41cff9b 100644 --- a/agents/manifest.json +++ b/agents/manifest.json @@ -1,6 +1,6 @@ { "type": "app", - "name": "ten_agent", + "name": "astra_agents", "version": "0.4.0", "dependencies": [ { @@ -30,8 +30,8 @@ }, { "type": "extension", - "name": "azure_tts", - "version": "0.4.2" + "name": "agora_sess_ctrl", + "version": "0.1.0" } ] } \ No newline at end of file diff --git a/agents/ten_packages/extension/agora_sess_ctrl b/agents/ten_packages/extension/agora_sess_ctrl new file mode 160000 index 00000000..4ee79df9 --- /dev/null +++ b/agents/ten_packages/extension/agora_sess_ctrl @@ -0,0 +1 @@ +Subproject commit 4ee79df93cb2f3a2b63f143624730ae7443db118 diff --git a/agents/ten_packages/extension/minimax_v2v_python/BUILD.gn b/agents/ten_packages/extension/minimax_v2v_python/BUILD.gn new file mode 100644 index 00000000..bfc80b05 --- /dev/null +++ b/agents/ten_packages/extension/minimax_v2v_python/BUILD.gn @@ -0,0 +1,21 @@ +# +# +# Agora Real Time Engagement +# Created by Wei Hu in 2022-11. +# Copyright (c) 2024 Agora IO. All rights reserved. +# +# +import("//build/feature/ten_package.gni") + +ten_package("minimax_v2v_python") { + package_kind = "extension" + + resources = [ + "__init__.py", + "addon.py", + "extension.py", + "log.py", + "manifest.json", + "property.json", + ] +} diff --git a/agents/ten_packages/extension/minimax_v2v_python/README.md b/agents/ten_packages/extension/minimax_v2v_python/README.md new file mode 100644 index 00000000..3f1ece64 --- /dev/null +++ b/agents/ten_packages/extension/minimax_v2v_python/README.md @@ -0,0 +1,29 @@ +# minimax_v2v_python + + + +## Features + + + +- xxx feature + +## API + +Refer to `api` definition in [manifest.json] and default values in [property.json](property.json). + + + +## Development + +### Build + + + +### Unit test + + + +## Misc + + diff --git a/agents/ten_packages/extension/minimax_v2v_python/__init__.py b/agents/ten_packages/extension/minimax_v2v_python/__init__.py new file mode 100644 index 00000000..789f2990 --- /dev/null +++ b/agents/ten_packages/extension/minimax_v2v_python/__init__.py @@ -0,0 +1,11 @@ +# +# +# Agora Real Time Engagement +# Created by Wei Hu in 2024-08. +# Copyright (c) 2024 Agora IO. All rights reserved. +# +# +from . import addon +from .log import logger + +logger.info("minimax_v2v_python extension loaded") diff --git a/agents/ten_packages/extension/minimax_v2v_python/addon.py b/agents/ten_packages/extension/minimax_v2v_python/addon.py new file mode 100644 index 00000000..c8072a51 --- /dev/null +++ b/agents/ten_packages/extension/minimax_v2v_python/addon.py @@ -0,0 +1,22 @@ +# +# +# Agora Real Time Engagement +# Created by Wei Hu in 2024-08. +# Copyright (c) 2024 Agora IO. All rights reserved. +# +# +from ten import ( + Addon, + register_addon_as_extension, + TenEnv, +) +from .extension import MiniMaxExtension +from .log import logger + + +@register_addon_as_extension("minimax_v2v_python") +class MiniMaxExtensionAddon(Addon): + + def on_create_instance(self, ten_env: TenEnv, name: str, context) -> None: + logger.info("MiniMaxExtensionAddon on_create_instance") + ten_env.on_create_instance_done(MiniMaxExtension(name), context) diff --git a/agents/ten_packages/extension/minimax_v2v_python/extension.py b/agents/ten_packages/extension/minimax_v2v_python/extension.py new file mode 100644 index 00000000..86f33069 --- /dev/null +++ b/agents/ten_packages/extension/minimax_v2v_python/extension.py @@ -0,0 +1,300 @@ +# +# +# Agora Real Time Engagement +# Created by Tomas Liu in 2024-10. +# Copyright (c) 2024 Agora IO. All rights reserved. +# +# +import threading +from datetime import datetime +import base64 +import requests +import json + +from queue import Queue +from typing import Iterator, Any, List + +from ten import ( + AudioFrame, + AudioFrameDataFmt, + VideoFrame, + Extension, + TenEnv, + Cmd, + StatusCode, + CmdResult, + Data, +) +from .log import logger + +PROPERTY_TOKEN = "token" +PROPERTY_PROMPT = "prompt" +PROPERTY_SAMPLE_RATE = "sample_rate" +PROPERTY_MODEL = "model" +PROPERTY_VOICE_MODEL = "voice_model" +PROPERTY_VOICE_ID = "voice_id" + +class MiniMaxExtension(Extension): + ten_env:TenEnv = None + token:str = "" + thread:threading.Thread = None + queue = Queue() + + stopped: bool = False + outdate_ts = datetime.now() + mutex = threading.Lock() + history: List[str] = [] + max_history: int = 10 + + model:str = "abab6.5s-chat" + voice_model:str = "speech-01-turbo-240228" + voice_id:str = "female-tianmei" + prompt:str = "You are a voice assistant who talks in a conversational way and can chat with me like my friends. I will speak to you in English or Chinese, and you will answer in the corrected and improved version of my text with the language I use. Don’t talk like a robot, instead I would like you to talk like a real human with emotions. I will use your answer for text-to-speech, so don’t return me any meaningless characters. I want you to be helpful, when I’m asking you for advice, give me precise, practical and useful advice instead of being vague. When giving me a list of options, express the options in a narrative way instead of bullet points." + sample_rate:int = 32000 # in and out sample rate + + def on_init(self, ten_env: TenEnv) -> None: + logger.info("MiniMaxExtension on_init") + self.ten_env = ten_env + ten_env.on_init_done() + + def on_start(self, ten_env: TenEnv) -> None: + logger.info("MiniMaxExtension on_start") + + try: + self.token = ten_env.get_property_string(PROPERTY_TOKEN) + except Exception as err: + logger.info( + f"GetProperty required {PROPERTY_TOKEN} failed, err: {err}") + return + + try: + self.prompt = ten_env.get_property_string(PROPERTY_PROMPT) + except Exception as err: + logger.info( + f"GetProperty required {PROPERTY_PROMPT} failed, err: {err}") + + try: + self.sample_rate = ten_env.get_property_int(PROPERTY_SAMPLE_RATE) + except Exception as err: + logger.info( + f"GetProperty required {PROPERTY_SAMPLE_RATE} failed, err: {err}") + + try: + self.model = ten_env.get_property_string(PROPERTY_MODEL) + except Exception as err: + logger.info( + f"GetProperty required {PROPERTY_MODEL} failed, err: {err}") + + try: + self.voice_model = ten_env.get_property_string(PROPERTY_VOICE_MODEL) + except Exception as err: + logger.info( + f"GetProperty required {PROPERTY_VOICE_MODEL} failed, err: {err}") + + try: + self.voice_id = ten_env.get_property_string(PROPERTY_VOICE_ID) + except Exception as err: + logger.info( + f"GetProperty required {PROPERTY_VOICE_ID} failed, err: {err}") + + self.thread = threading.Thread(target=self.loop) + self.thread.start() + + ten_env.on_start_done() + + def on_stop(self, ten_env: TenEnv) -> None: + logger.info("MiniMaxExtension on_stop") + + self.stopped = True + self.queue.put(None) + self.thread.join() + + ten_env.on_stop_done() + + def loop(self) -> None: + while not self.stopped: + entry = self.queue.get() + if entry is None: + return + + try: + ts, buff = entry + if self._need_interrupt(ts): + continue + self._complete_with_history(ts, buff) + except: + logger.exception(f"Failed to handle entry") + + def on_deinit(self, ten_env: TenEnv) -> None: + logger.info("MiniMaxExtension on_deinit") + ten_env.on_deinit_done() + + def on_cmd(self, ten_env: TenEnv, cmd: Cmd) -> None: + cmd_name = cmd.get_name() + logger.info("on_cmd name {}".format(cmd_name)) + + cmd_result = CmdResult.create(StatusCode.OK) + ten_env.return_result(cmd_result, cmd) + + def on_data(self, ten_env: TenEnv, data: Data) -> None: + pass + + def on_audio_frame(self, ten_env: TenEnv, audio_frame: AudioFrame) -> None: + # Must be after vad + try: + ts = datetime.now() + with self.mutex: + self.outdate_ts = ts + + while not self.queue.empty(): + self.queue.get() + + frame_buf = audio_frame.get_buf() + logger.info(f"on audio frame {len(frame_buf)}") + self.queue.put((ts, frame_buf)) + + cmd = Cmd.create("flush") + ten_env.send_cmd( + cmd, lambda ten, result: logger.info("send_cmd flush done"), + ) + except: + logger.exception(f"MiniMaxExtension on audio frame failed") + + def on_video_frame(self, ten_env: TenEnv, video_frame: VideoFrame) -> None: + pass + + def _need_interrupt(self, ts: datetime.time) -> bool: + with self.mutex: + return self.outdate_ts > ts + + def _complete_with_history(self, ts: datetime, buff: bytearray) -> Iterator[bytes]: + messages = self._get_messages() + messages.append({ + "role": "user", + "content": [ + { + "type": "input_audio", + "input_audio": { + "data": base64.b64encode(buff).decode("utf-8"), + "format": "pcm", + "sample_rate": self.sample_rate, + "bit_depth": 16, + "channel": 1, + "encode": "base64" + } + } + ]}) + + url = "https://api.minimax.chat/v1/text/chatcompletion_v2" + headers = { + "Authorization": f"Bearer {self.token}", + "Content-Type": "application/json" + } + payload = { + "model": self.model, + "messages": messages, + "tools": [], + "tool_choice": "none", + "stream": True, + "stream_options": { # 开启语音输出 + "speech_output": True + }, + "voice_setting":{ + "model": self.voice_model, + "voice_id": self.voice_id + }, + "audio_setting": { + "sample_rate": self.sample_rate, + "format": "pcm", + "channel": 1, + "encode": "base64" + }, + "max_tokens": 1024, + "temperature": 0.8, + "top_p": 0.95 + } + + response = requests.post(url, headers=headers, json=payload, stream=True) + logger.info(f"Get response, trace-id: {response.headers.get('Trace-Id')}") + self.transcript = "" + for line in response.iter_lines(decode_unicode=True): + if self._need_interrupt(ts): + self.transcript += "[interrupted]" + self._append_message("assistant", self.transcript) + return + + if not line.startswith("data:"): + continue + resp = json.loads(line.strip("data:")) + if resp.get("choices") and resp["choices"][0].get("delta"): + delta = resp["choices"][0]["delta"] + if delta.get("role") == "assistant": + if delta.get("content"): + content = delta['content'] + self.transcript += content + logger.info(f"get transcript {content}") + self._send_transcript(content, "assistant", False) + elif delta.get("audio_content") and delta["audio_content"] != "": + buff = base64.b64decode(delta["audio_content"]) + self._send_audio_out(buff) + elif delta.get("tool_calls"): + logger.info(f"ignore tool call {delta}") + continue + else: + logger.warning(f"unknown delta {delta}") + + if self.transcript: + self._append_message("assistant", self.transcript) + self._send_transcript("", "assistant", True) + + def _get_messages(self) -> List[Any]: + messages = [] + if len(self.prompt) > 0: + messages.append({"role": "system", "content": self.prompt}) + self.mutex.acquire() + try: + for h in self.history: + messages.append(h) + finally: + self.mutex.release() + return messages + + def _append_message(self, role: str, content: str) -> None: + self.mutex.acquire() + try: + logger.info(f"append history {content}") + self.history.append({"role": role, "content": content}) + if len(self.history) > self.max_history: + self.history = self.history[1:] + finally: + self.mutex.release() + + def _send_audio_out(self, audio_data:bytearray) -> None: + try: + f = AudioFrame.create("pcm_frame") + f.set_sample_rate(self.sample_rate) + f.set_bytes_per_sample(2) + f.set_number_of_channels(1) + f.set_data_fmt(AudioFrameDataFmt.INTERLEAVE) + f.set_samples_per_channel(len(audio_data) // 2) + f.alloc_buf(len(audio_data)) + buff = f.lock_buf() + buff[:] = audio_data + f.unlock_buf(buff) + self.ten_env.send_audio_frame(f) + except: + logger.exception("Error send audio frame") + + def _send_transcript(self, content:str, role:str, is_final:bool) -> None: + try: + d = Data.create("text_data") + d.set_property_string("text", content) + d.set_property_bool("end_of_segment", is_final) + d.set_property_string("role", role) + d.set_property_bool("is_final", is_final) + logger.debug( + f"send transcript text [{content}] is_final {is_final} end_of_segment {is_final} role {role}") + self.ten_env.send_data(d) + except: + logger.exception( + f"Error send text data {role}: {content} {is_final}") \ No newline at end of file diff --git a/agents/ten_packages/extension/minimax_v2v_python/log.py b/agents/ten_packages/extension/minimax_v2v_python/log.py new file mode 100644 index 00000000..9a03c357 --- /dev/null +++ b/agents/ten_packages/extension/minimax_v2v_python/log.py @@ -0,0 +1,22 @@ +# +# +# Agora Real Time Engagement +# Created by Wei Hu in 2024-08. +# Copyright (c) 2024 Agora IO. All rights reserved. +# +# +import logging + +logger = logging.getLogger("minimax_v2v_python") +logger.setLevel(logging.INFO) + +formatter_str = ( + "%(asctime)s - %(name)s - %(levelname)s - %(process)d - " + "[%(filename)s:%(lineno)d] - %(message)s" +) +formatter = logging.Formatter(formatter_str) + +console_handler = logging.StreamHandler() +console_handler.setFormatter(formatter) + +logger.addHandler(console_handler) diff --git a/agents/ten_packages/extension/minimax_v2v_python/manifest.json b/agents/ten_packages/extension/minimax_v2v_python/manifest.json new file mode 100644 index 00000000..a122cb66 --- /dev/null +++ b/agents/ten_packages/extension/minimax_v2v_python/manifest.json @@ -0,0 +1,23 @@ +{ + "type": "extension", + "name": "minimax_v2v_python", + "version": "0.1.0", + "dependencies": [ + { + "type": "system", + "name": "ten_runtime_python", + "version": "0.2" + } + ], + "package": { + "include": [ + "manifest.json", + "property.json", + "BUILD.gn", + "**.tent", + "**.py", + "README.md" + ] + }, + "api": {} +} \ No newline at end of file diff --git a/agents/ten_packages/extension/minimax_v2v_python/property.json b/agents/ten_packages/extension/minimax_v2v_python/property.json new file mode 100644 index 00000000..9e26dfee --- /dev/null +++ b/agents/ten_packages/extension/minimax_v2v_python/property.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/demo/src/app/api/agents/start/graph.tsx b/demo/src/app/api/agents/start/graph.tsx index a1fda36c..c5005212 100644 --- a/demo/src/app/api/agents/start/graph.tsx +++ b/demo/src/app/api/agents/start/graph.tsx @@ -17,6 +17,10 @@ export const voiceNameMap: LanguageMap = { openai: { male: "alloy", female: "shimmer" + }, + minimax: { + male: "junlang_nanyou", + female: "female-yujie" } }, "en-US": { @@ -35,6 +39,10 @@ export const voiceNameMap: LanguageMap = { openai: { male: "alloy", female: "shimmer" + }, + minimax: { + male: "junlang_nanyou", + female: "female-yujie" } }, "ja-JP": { @@ -45,6 +53,10 @@ export const voiceNameMap: LanguageMap = { openai: { male: "alloy", female: "shimmer" + }, + minimax: { + male: "junlang_nanyou", + female: "female-yujie" } }, "ko-KR": { @@ -55,6 +67,10 @@ export const voiceNameMap: LanguageMap = { openai: { male: "alloy", female: "shimmer" + }, + minimax: { + male: "junlang_nanyou", + female: "female-yujie" } }, }; @@ -130,6 +146,17 @@ export const getGraphProperties = ( "agora_asr_language": language, }, } + } else if (graphName == "va.minimax.azure") { + return { + "minimax_v2v_python": { + "voice_id": voiceNameMap[language]["minimax"][voiceType], + ...localizationOptions, + "system_message": prompt, + }, + "agora_rtc": { + "agora_asr_language": language, + }, + } } else if (graphName == "va.openai.azure") { return { "agora_rtc": { diff --git a/demo/src/common/constant.ts b/demo/src/common/constant.ts index 6240fbd5..a84fb5d3 100644 --- a/demo/src/common/constant.ts +++ b/demo/src/common/constant.ts @@ -52,6 +52,10 @@ export const GRAPH_OPTIONS: GraphOptionItem[] = [ { label: "Voice Agent with OpenAI Realtime API (Beta) + FishAudio TTS", value: "va.openai.v2v.fish" + }, + { + label: "Voice Agent with Minimax", + value: "va.minimax.azure" } ]