From 616c55b7e9cfcec8ebbc2ec7f4d5ec02f7346904 Mon Sep 17 00:00:00 2001
From: sunshinexcode <24xinhui@163.com>
Date: Wed, 27 Nov 2024 17:06:06 +0800
Subject: [PATCH] refactor(): refactor cartesia tts (#444)

---
 agents/examples/experimental/property.json    |  10 +-
 .../extension/cartesia_tts/BUILD.gn           |  19 ++
 .../extension/cartesia_tts/README.md          |  29 +++
 .../extension/cartesia_tts/__init__.py        |  12 +-
 .../extension/cartesia_tts/addon.py           |  19 ++
 .../extension/cartesia_tts/cartesia_tts.py    |  42 ++++
 .../cartesia_tts/cartesia_tts_addon.py        |  24 ---
 .../cartesia_tts/cartesia_tts_extension.py    | 197 ------------------
 .../cartesia_tts/cartesia_wrapper.py          | 112 ----------
 .../extension/cartesia_tts/extension.py       |  53 ++++-
 .../extension/cartesia_tts/log.py             |  12 --
 .../extension/cartesia_tts/manifest.json      | 115 +++++-----
 .../extension/cartesia_tts/requirements.txt   |   1 +
 .../cartesia_tts/tests/test_basic.py          |  36 ++++
 .../deepgram_asr_python/manifest.json         | 166 +++++++--------
 15 files changed, 355 insertions(+), 492 deletions(-)
 create mode 100644 agents/ten_packages/extension/cartesia_tts/BUILD.gn
 create mode 100644 agents/ten_packages/extension/cartesia_tts/README.md
 create mode 100644 agents/ten_packages/extension/cartesia_tts/addon.py
 create mode 100644 agents/ten_packages/extension/cartesia_tts/cartesia_tts.py
 delete mode 100644 agents/ten_packages/extension/cartesia_tts/cartesia_tts_addon.py
 delete mode 100644 agents/ten_packages/extension/cartesia_tts/cartesia_tts_extension.py
 delete mode 100644 agents/ten_packages/extension/cartesia_tts/cartesia_wrapper.py
 delete mode 100644 agents/ten_packages/extension/cartesia_tts/log.py
 create mode 100644 agents/ten_packages/extension/cartesia_tts/requirements.txt
 create mode 100644 agents/ten_packages/extension/cartesia_tts/tests/test_basic.py

diff --git a/agents/examples/experimental/property.json b/agents/examples/experimental/property.json
index b532fea5..39dd74b6 100644
--- a/agents/examples/experimental/property.json
+++ b/agents/examples/experimental/property.json
@@ -3009,7 +3009,7 @@
               "api_key": "${env:DEEPGRAM_API_KEY}",
               "language": "en-US",
               "model": "nova-2",
-              "sample_rate": "16000"
+              "sample_rate": 16000
             }
           },
           {
@@ -3036,10 +3036,10 @@
             "name": "cartesia_tts",
             "property": {
               "api_key": "${env:CARTESIA_API_KEY}",
-              "cartesia_version": "2024-06-10",
+              "language": "en",
               "model_id": "sonic-english",
-              "voice_id": "f9836c6e-a0bd-460e-9d3c-f7299fa60f94",
-              "sample_rate": "16000"
+              "sample_rate": 16000,
+              "voice_id": "f9836c6e-a0bd-460e-9d3c-f7299fa60f94"
             }
           },
           {
@@ -3253,7 +3253,7 @@
               "api_key": "${env:DEEPGRAM_API_KEY}",
               "language": "en-US",
               "model": "nova-2",
-              "sample_rate": "16000"
+              "sample_rate": 16000
             }
           },
           {
diff --git a/agents/ten_packages/extension/cartesia_tts/BUILD.gn b/agents/ten_packages/extension/cartesia_tts/BUILD.gn
new file mode 100644
index 00000000..4e409853
--- /dev/null
+++ b/agents/ten_packages/extension/cartesia_tts/BUILD.gn
@@ -0,0 +1,21 @@
+#
+# This file is part of TEN Framework, an open source project.
+# Licensed under the Apache License, Version 2.0.
+# See the LICENSE file for more information.
+#
+import("//build/feature/ten_package.gni")
+
+ten_package("cartesia_tts") {
+  package_kind = "extension"
+
+  # cartesia_tts.py is imported by extension.py, so it has to ship with the package.
+  resources = [
+    "__init__.py",
+    "addon.py",
+    "cartesia_tts.py",
+    "extension.py",
+    "manifest.json",
+    "property.json",
+    "tests",
+  ]
+}
diff --git a/agents/ten_packages/extension/cartesia_tts/README.md b/agents/ten_packages/extension/cartesia_tts/README.md
new file mode 100644
index 00000000..931f0029
--- /dev/null
+++ b/agents/ten_packages/extension/cartesia_tts/README.md
@@ -0,0 +1,29 @@
+# cartesia_tts
+
+<!-- brief introduction for the extension -->
+
+## Features
+
+<!-- main features introduction -->
+
+- Streaming text-to-speech through the Cartesia API, output as raw `pcm_s16le` audio
+
+## API
+
+Refer to `api` definition in [manifest.json](manifest.json) and default values in [property.json](property.json).
+
+<!-- Additional API.md can be referred to if extra introduction needed -->
+
+## Development
+
+### Build
+
+<!-- build dependencies and steps -->
+
+### Unit test
+
+<!-- how to do unit test for the extension -->
+
+## Misc
+
+<!-- others if applicable -->
diff --git a/agents/ten_packages/extension/cartesia_tts/__init__.py b/agents/ten_packages/extension/cartesia_tts/__init__.py
index f6bb8f4c..72593ab2 100644
--- a/agents/ten_packages/extension/cartesia_tts/__init__.py
+++ b/agents/ten_packages/extension/cartesia_tts/__init__.py
@@ -1,6 +1,6 @@
-from . import cartesia_tts_addon
-from .extension import EXTENSION_NAME
-from .log import logger
-
-
-logger.info(f"{EXTENSION_NAME} extension loaded")
+#
+# This file is part of TEN Framework, an open source project.
+# Licensed under the Apache License, Version 2.0.
+# See the LICENSE file for more information.
+#
+from . import addon
diff --git a/agents/ten_packages/extension/cartesia_tts/addon.py b/agents/ten_packages/extension/cartesia_tts/addon.py
new file mode 100644
index 00000000..55d63435
--- /dev/null
+++ b/agents/ten_packages/extension/cartesia_tts/addon.py
@@ -0,0 +1,24 @@
+#
+# This file is part of TEN Framework, an open source project.
+# Licensed under the Apache License, Version 2.0.
+# See the LICENSE file for more information.
+#
+from ten import (
+    Addon,
+    register_addon_as_extension,
+    TenEnv,
+)
+
+
+@register_addon_as_extension("cartesia_tts")
+class CartesiaTTSExtensionAddon(Addon):
+    """Addon registered as ``cartesia_tts`` that creates the extension instance."""
+
+    def on_create_instance(self, ten_env: TenEnv, name: str, context) -> None:
+        """Build a ``CartesiaTTSExtension`` called ``name`` and report it as created."""
+        # The import is deliberately kept inside the callback.
+        from .extension import CartesiaTTSExtension
+
+        ten_env.log_info("CartesiaTTSExtensionAddon on_create_instance")
+        extension = CartesiaTTSExtension(name)
+        ten_env.on_create_instance_done(extension, context)
diff --git a/agents/ten_packages/extension/cartesia_tts/cartesia_tts.py b/agents/ten_packages/extension/cartesia_tts/cartesia_tts.py
new file mode 100644
index 00000000..4cc79b49
--- /dev/null
+++ b/agents/ten_packages/extension/cartesia_tts/cartesia_tts.py
@@ -0,0 +1,57 @@
+#
+#
+# Agora Real Time Engagement
+# Created by XinHui Li in 2024.
+# Copyright (c) 2024 Agora IO. All rights reserved.
+#
+#
+
+from dataclasses import dataclass
+from typing import AsyncIterator
+from cartesia import AsyncCartesia
+
+from ten_ai_base.config import BaseConfig
+
+
+@dataclass
+class CartesiaTTSConfig(BaseConfig):
+    """Settings for the Cartesia TTS client; every field has a default."""
+
+    api_key: str = ""
+    language: str = "en"
+    model_id: str = "sonic-english"
+    request_timeout_seconds: int = 10
+    sample_rate: int = 16000
+    voice_id: str = "f9836c6e-a0bd-460e-9d3c-f7299fa60f94"
+
+
+class CartesiaTTS:
+    """Thin wrapper that issues streaming TTS requests through ``AsyncCartesia``."""
+
+    def __init__(self, config: CartesiaTTSConfig) -> None:
+        """Keep ``config`` and create the SDK client from its key and timeout."""
+        self.config = config
+        self.client = AsyncCartesia(
+            api_key=config.api_key,
+            timeout=config.request_timeout_seconds,
+        )
+
+    async def text_to_speech_stream(self, text: str) -> AsyncIterator[dict]:
+        """Request streamed synthesis of ``text`` and return the chunk iterator.
+
+        The extension awaits this call and then reads ``chunk["audio"]`` from
+        each item, so this is a coroutine yielding an iterator of dict chunks
+        rather than a plain function returning an iterator of bytes.
+        """
+        return await self.client.tts.sse(
+            language=self.config.language,
+            model_id=self.config.model_id,
+            output_format={
+                "container": "raw",
+                "encoding": "pcm_s16le",
+                "sample_rate": self.config.sample_rate,
+            },
+            stream=True,
+            transcript=text,
+            voice_id=self.config.voice_id,
+        )
\ No newline at end of file
diff --git a/agents/ten_packages/extension/cartesia_tts/cartesia_tts_addon.py b/agents/ten_packages/extension/cartesia_tts/cartesia_tts_addon.py
deleted file mode 100644
index 1883fb5e..00000000
--- a/agents/ten_packages/extension/cartesia_tts/cartesia_tts_addon.py
+++ /dev/null
@@ -1,24 +0,0 @@
-#
-#
-# Agora Real Time Engagement
-# Created by XinHui Li in 2024-07.
-# Copyright (c) 2024 Agora IO. All rights reserved.
-#
-#
-
-from ten import (
-    Addon,
-    register_addon_as_extension,
-    TenEnv,
-)
-from .extension import EXTENSION_NAME
-
-
-@register_addon_as_extension(EXTENSION_NAME)
-class CartesiaTTSExtensionAddon(Addon):
-    def on_create_instance(self, ten: TenEnv, addon_name: str, context) -> None:
-        from .log import logger
-        logger.info("on_create_instance")
-        from .cartesia_tts_extension import CartesiaTTSExtension
-
-        ten.on_create_instance_done(CartesiaTTSExtension(addon_name), context)
diff --git a/agents/ten_packages/extension/cartesia_tts/cartesia_tts_extension.py b/agents/ten_packages/extension/cartesia_tts/cartesia_tts_extension.py
deleted file mode 100644
index f18c9af3..00000000
--- a/agents/ten_packages/extension/cartesia_tts/cartesia_tts_extension.py
+++ /dev/null
@@ -1,197 +0,0 @@
-# cartesia_tts_extension.py
-
-import queue
-import threading
-from datetime import datetime
-import asyncio
-import re
-from ten import (
-    Extension,
-    TenEnv,
-    Cmd,
-    AudioFrameDataFmt,
-    AudioFrame,
-    Data,
-    StatusCode,
-    CmdResult,
-)
-from .cartesia_wrapper import CartesiaWrapper, CartesiaConfig, CartesiaError
-from .log import logger
-
-class CartesiaCallback:
-    # Handles audio processing and interrupt checks
-    def __init__(self, ten: TenEnv, sample_rate: int, need_interrupt_callback):
-        self.ten = ten
-        self.sample_rate = sample_rate
-        self.need_interrupt_callback = need_interrupt_callback
-        self.ts = datetime.now()
-
-    def set_input_ts(self, ts: datetime):
-        # Updates timestamp for the current input
-        self.ts = ts
-
-    def need_interrupt(self) -> bool:
-        # Checks if current task should be interrupted
-        return self.need_interrupt_callback(self.ts)
-
-    def create_audio_frame(self, audio_data):
-        # Creates an AudioFrame from raw audio data
-        frame = AudioFrame.create("pcm_frame")
-        frame.set_sample_rate(self.sample_rate)
-        frame.set_bytes_per_sample(2)  # s16le is 2 bytes per sample
-        frame.set_number_of_channels(1)
-        frame.set_data_fmt(AudioFrameDataFmt.INTERLEAVE)
-        frame.set_samples_per_channel(len(audio_data) // 2)
-        frame.alloc_buf(len(audio_data))
-        buff = frame.lock_buf()
-        buff[:] = audio_data
-        frame.unlock_buf(buff)
-        return frame
-
-    def process_audio(self, audio_data):
-        # Processes audio data if not interrupted
-        if self.need_interrupt():
-            return
-        audio_frame = self.create_audio_frame(audio_data)
-        self.ten.send_audio_frame(audio_frame)
-
-class CartesiaTTSExtension(Extension):
-    def __init__(self, name: str):
-        super().__init__(name)
-        self.cartesia = None
-        self.loop = None
-        self.queue = queue.Queue()
-        self.outdate_ts = datetime.now()
-        self.stopped = False
-        self.thread = None
-        self.callback = None
-        self.skip_patterns = [r'\bssml_\w+\b']  # List of patterns to skip
-        self.ten = None
-
-    def on_start(self, ten: TenEnv) -> None:
-        self.ten = ten
-        try:
-            # Initialize Cartesia config and wrapper
-            cartesia_config = CartesiaConfig(
-                api_key=ten.get_property_string("api_key"),
-                model_id=ten.get_property_string("model_id"),
-                voice_id=ten.get_property_string("voice_id"),
-                sample_rate=int(ten.get_property_string("sample_rate")),
-                cartesia_version=ten.get_property_string("cartesia_version")
-            )
-            self.cartesia = CartesiaWrapper(cartesia_config)
-            self.callback = CartesiaCallback(ten, cartesia_config.sample_rate, self.need_interrupt)
-
-            # Set up asyncio event loop
-            self.loop = asyncio.new_event_loop()
-            asyncio.set_event_loop(self.loop)
-
-            # Connect to Cartesia API
-            self.loop.run_until_complete(self.cartesia.connect())
-            logger.info("Successfully connected to Cartesia API")
-
-            # Start async handling thread
-            self.thread = threading.Thread(target=self.async_handle, args=[ten])
-            self.thread.start()
-
-            ten.on_start_done()
-        except Exception as e:
-            logger.error(f"Failed to start CartesiaTTSExtension: {e}")
-            ten.on_start_done()
-
-    def on_stop(self, ten: TenEnv) -> None:
-        # Clean up resources and stop thread
-        self.stopped = True
-        self.flush()
-        self.queue.put(None)
-        if self.thread is not None:
-            self.thread.join()
-            self.thread = None
-
-        if self.cartesia:
-            self.loop.run_until_complete(self.cartesia.close())
-        if self.loop:
-            self.loop.close()
-        ten.on_stop_done()
-
-    def need_interrupt(self, ts: datetime) -> bool:
-        # Check if task is outdated
-        return self.outdate_ts > ts
-
-    def process_input_text(self, input_text: str) -> str:
-        # Process input text to remove parts that should be skipped
-        for pattern in self.skip_patterns:
-            input_text = re.sub(pattern, '', input_text, flags=re.IGNORECASE)
-        return input_text.strip()
-
-    def create_pause_text(self, duration_ms: int) -> str:
-        # Create pause text
-        return f"PAUSE_{duration_ms}_MS"
-
-    def on_data(self, ten: TenEnv, data: Data) -> None:
-        # Queue incoming text for processing
-        input_text = data.get_property_string("text")
-        if not input_text:
-            return
-
-        # Handle the case of just a period or comma
-        if input_text.strip() in ['.', ',']:
-            pause_duration = 150 if input_text.strip() == '.' else 150
-            pause_text = self.create_pause_text(pause_duration)
-            self.queue.put(("PAUSE", pause_text, datetime.now()))
-            return
-
-        processed_text = self.process_input_text(input_text)
-
-        if processed_text.strip():
-            self.queue.put(("TEXT", processed_text, datetime.now()))
-        else:
-            logger.info("Processed text is empty. Skipping synthesis.")
-
-    def async_handle(self, ten: TenEnv):
-        # Process queue items asynchronously
-        while not self.stopped:
-            try:
-                value = self.queue.get()
-                if value is None:
-                    break
-
-                item_type, content, ts = value
-
-                self.callback.set_input_ts(ts)
-
-                if self.callback.need_interrupt():
-                    logger.info("Drop outdated input")
-                    continue
-
-                try:
-                    audio_data = self.loop.run_until_complete(self.cartesia.synthesize(content))
-                    self.callback.process_audio(audio_data)
-                except CartesiaError as e:
-                    logger.error(f"Failed to synthesize: {str(e)}. Moving to next item.")
-                    # Optionally, you could add some fallback behavior here, like playing an error sound
-
-            except Exception as e:
-                logger.exception(f"Error in async_handle: {e}")
-                # Continue processing the next item instead of breaking the loop
-
-    def on_cmd(self, ten: TenEnv, cmd: Cmd) -> None:
-        # Handle incoming commands
-        cmd_name = cmd.get_name()
-
-        if cmd_name == "flush":
-            self.outdate_ts = datetime.now()
-            self.flush()
-            cmd_result = CmdResult.create(StatusCode.OK)
-            cmd_result.set_property_string("detail", "Flush command executed")
-        else:
-            logger.warning(f"Unknown command received: {cmd_name}")
-            cmd_result = CmdResult.create(StatusCode.ERROR)
-            cmd_result.set_property_string("detail", f"Unknown command: {cmd_name}")
-
-        ten.return_result(cmd_result, cmd)
-
-    def flush(self):
-        # Clear the queue
-        while not self.queue.empty():
-            self.queue.get()
diff --git a/agents/ten_packages/extension/cartesia_tts/cartesia_wrapper.py b/agents/ten_packages/extension/cartesia_tts/cartesia_wrapper.py
deleted file mode 100644
index 37a3d7c4..00000000
--- a/agents/ten_packages/extension/cartesia_tts/cartesia_wrapper.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# cartesia_wrapper.py
-
-import asyncio
-import websockets
-import json
-import base64
-import logging
-from urllib.parse import urlparse
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-class CartesiaError(Exception):
-    """Custom exception class for Cartesia-related errors."""
-    pass
-
-class CartesiaConfig:
-    # Configuration class for Cartesia API
-    def __init__(self, api_key, model_id, voice_id, sample_rate, cartesia_version):
-        self.api_key = api_key
-        self.model_id = model_id
-        self.voice_id = voice_id
-        self.sample_rate = sample_rate
-        self.cartesia_version = cartesia_version
-
-class CartesiaWrapper:
-    # Wrapper class for Cartesia API interactions
-    def __init__(self, config: CartesiaConfig):
-        self.config = config
-        self.websocket = None
-        self.context_id = 0
-
-    async def connect(self):
-        # Establish WebSocket connection to Cartesia API
-        ws_url = f"wss://api.cartesia.ai/tts/websocket?api_key={self.config.api_key}&cartesia_version={self.config.cartesia_version}"
-        try:
-            self.websocket = await websockets.connect(ws_url)
-            logger.info("Connected to Cartesia WebSocket")
-        except Exception as e:
-            logger.error(f"Failed to connect to Cartesia API: {str(e)}")
-            raise CartesiaError(f"Connection failed: {str(e)}")
-
-    async def synthesize(self, text: str):
-        # Synthesize speech from text using Cartesia API
-        if not self.websocket:
-            await self.connect()
-
-        if text.startswith("PAUSE_"):
-            # Handle custom pause marker
-            try:
-                duration_ms = int(text.split("_")[1])
-                return self.generate_silence(duration_ms)
-            except (IndexError, ValueError):
-                logger.error(f"Invalid pause format: {text}")
-                raise CartesiaError(f"Invalid pause format: {text}")
-
-        self.context_id += 1
-        request = {
-            "context_id": f"context_{self.context_id}",
-            "model_id": self.config.model_id,
-            "transcript": text,
-            "voice": {"mode": "id", "id": self.config.voice_id},
-            "output_format": {
-                "container": "raw",
-                "encoding": "pcm_s16le",
-                "sample_rate": int(self.config.sample_rate)
-            },
-            "language": "en",
-            "add_timestamps": False
-        }
-
-        try:
-            # Send synthesis request
-            await self.websocket.send(json.dumps(request))
-
-            # Receive and process audio chunks
-            audio_data = bytearray()
-            while True:
-                response = await self.websocket.recv()
-                message = json.loads(response)
-
-                if message['type'] == 'chunk':
-                    chunk_data = base64.b64decode(message['data'])
-                    audio_data.extend(chunk_data)
-                elif message['type'] == 'done':
-                    break
-                elif message['type'] == 'error':
-                    raise CartesiaError(f"Synthesis error: {message.get('error', 'Unknown error')}")
-                else:
-                    logger.warning(f"Unknown message type: {message['type']}")
-
-            return audio_data
-        except websockets.exceptions.ConnectionClosed:
-            # Handle connection errors and retry
-            logger.error("WebSocket connection closed unexpectedly. Attempting to reconnect...")
-            await self.connect()
-            return await self.synthesize(text)  # Retry the synthesis after reconnecting
-        except Exception as e:
-            logger.error(f"Error during synthesis: {str(e)}")
-            raise CartesiaError(f"Synthesis failed: {str(e)}")
-
-    def generate_silence(self, duration_ms: int) -> bytes:
-        # Generate silent audio data
-        sample_rate = self.config.sample_rate
-        num_samples = int(sample_rate * duration_ms / 1000)
-        return b"\x00" * (num_samples * 2)  # Assuming 16-bit audio
-
-    async def close(self):
-        # Close WebSocket connection
-        if self.websocket:
-            await self.websocket.close()
-            logger.info("Closed WebSocket connection to Cartesia API")
diff --git a/agents/ten_packages/extension/cartesia_tts/extension.py b/agents/ten_packages/extension/cartesia_tts/extension.py
index 4883c11c..5729cfe6 100644
--- a/agents/ten_packages/extension/cartesia_tts/extension.py
+++ b/agents/ten_packages/extension/cartesia_tts/extension.py
@@ -1 +1,69 @@
-EXTENSION_NAME = "cartesia_tts"
+#
+# This file is part of TEN Framework, an open source project.
+# Licensed under the Apache License, Version 2.0.
+# See the LICENSE file for more information.
+#
+import traceback
+
+from .cartesia_tts import CartesiaTTS, CartesiaTTSConfig
+from ten import (
+    AsyncTenEnv,
+)
+from ten_ai_base.tts import AsyncTTSBaseExtension
+
+
+class CartesiaTTSExtension(AsyncTTSBaseExtension):
+    """TTS extension that streams Cartesia audio for each text request."""
+
+    def __init__(self, name: str) -> None:
+        super().__init__(name)
+        # Both are populated in on_start; they stay None if start-up fails.
+        self.config = None
+        self.client = None
+
+    async def on_init(self, ten_env: AsyncTenEnv) -> None:
+        await super().on_init(ten_env)
+        ten_env.log_debug("on_init")
+
+    async def on_start(self, ten_env: AsyncTenEnv) -> None:
+        """Load the configuration and create the Cartesia client.
+
+        Failures are logged rather than raised, so ``self.client`` may still
+        be ``None`` afterwards.
+        """
+        try:
+            await super().on_start(ten_env)
+            ten_env.log_debug("on_start")
+            self.config = CartesiaTTSConfig.create(ten_env=ten_env)
+
+            if not self.config.api_key:
+                raise ValueError("api_key is required")
+
+            self.client = CartesiaTTS(self.config)
+        except Exception:
+            ten_env.log_error(f"on_start failed: {traceback.format_exc()}")
+
+    async def on_stop(self, ten_env: AsyncTenEnv) -> None:
+        await super().on_stop(ten_env)
+        ten_env.log_debug("on_stop")
+
+    async def on_deinit(self, ten_env: AsyncTenEnv) -> None:
+        await super().on_deinit(ten_env)
+        ten_env.log_debug("on_deinit")
+
+    async def on_request_tts(self, ten_env: AsyncTenEnv, input_text: str, end_of_segment: bool) -> None:
+        """Synthesize ``input_text`` and forward each audio chunk downstream."""
+        if self.client is None:
+            # on_start logs and swallows its errors, so the client can be
+            # missing here; skip the request instead of raising AttributeError.
+            ten_env.log_error("on_request_tts skipped: Cartesia client is not initialized")
+            return
+
+        audio_stream = await self.client.text_to_speech_stream(input_text)
+
+        async for audio_data in audio_stream:
+            self.send_audio_out(ten_env, audio_data["audio"])
+
+    async def on_cancel_tts(self, ten_env: AsyncTenEnv) -> None:
+        """Delegate cancellation to the base class."""
+        return await super().on_cancel_tts(ten_env)
\ No newline at end of file
diff --git a/agents/ten_packages/extension/cartesia_tts/log.py b/agents/ten_packages/extension/cartesia_tts/log.py
deleted file mode 100644
index fad21710..00000000
--- a/agents/ten_packages/extension/cartesia_tts/log.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import logging
-from .extension import EXTENSION_NAME
-
-logger = logging.getLogger(EXTENSION_NAME)
-logger.setLevel(logging.INFO)
-
-formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(process)d - [%(filename)s:%(lineno)d] - %(message)s")
-
-console_handler = logging.StreamHandler()
-console_handler.setFormatter(formatter)
-
-logger.addHandler(console_handler)
diff --git a/agents/ten_packages/extension/cartesia_tts/manifest.json b/agents/ten_packages/extension/cartesia_tts/manifest.json
index 2a5c3578..85912f14 100644
--- a/agents/ten_packages/extension/cartesia_tts/manifest.json
+++ b/agents/ten_packages/extension/cartesia_tts/manifest.json
@@ -1,56 +1,67 @@
 {
-    "type": "extension",
-    "name": "cartesia_tts",
-    "version": "0.1.0",
-    "dependencies": [
-        {
-            "type": "system",
-            "name": "ten_runtime_python",
-            "version": "0.4"
+  "type": "extension",
+  "name": "cartesia_tts",
+  "version": "0.4.2",
+  "dependencies": [
+    {
+      "type": "system",
+      "name": "ten_runtime_python",
+      "version": "0.4.2"
+    }
+  ],
+  "package": {
+    "include": [
+      "manifest.json",
+      "property.json",
+      "BUILD.gn",
+      "**.tent",
+      "**.py",
+      "README.md",
+      "tests/**"
+    ]
+  },
+  "api": {
+    "property": {
+      "api_key": {
+        "type": "string"
+      },
+      "language": {
+        "type": "string"
+      },
+      "model_id": {
+        "type": "string"
+      },
+      "sample_rate": {
+        "type": "int64"
+      },
+      "voice_id": {
+        "type": "string"
+      }
+    },
+    "data_in": [
+      {
+        "name": "text_data",
+        "property": {
+          "text": {
+            "type": "string"
+          }
         }
+      }
     ],
-    "api": {
-        "property": {
-            "api_key": {
-                "type": "string"
-            },
-            "cartesia_version": {
-                "type": "string"
-            },
-            "model_id": {
-                "type": "string"
-            },
-            "sample_rate": {
-                "type": "string"
-            },
-            "voice_id": {
-                "type": "string"
-            }
-        },
-        "data_in": [
-            {
-                "name": "text_data",
-                "property": {
-                    "text": {
-                        "type": "string"
-                    }
-                }
-            }
-        ],
-        "cmd_in": [
-            {
-                "name": "flush"
-            }
-        ],
-        "cmd_out": [
-            {
-                "name": "flush"
-            }
-        ],
-        "audio_frame_out": [
-            {
-                "name": "pcm_frame"
-            }
-        ]
-    }
+    "cmd_in": [
+      {
+        "name": "flush"
+      }
+    ],
+    "cmd_out": [
+      {
+        "name": "flush"
+      }
+    ],
+    "audio_frame_out": [
+      {
+        "name": "pcm_frame"
+      }
+    ]
+  }
 }
\ No newline at end of file
diff --git a/agents/ten_packages/extension/cartesia_tts/requirements.txt b/agents/ten_packages/extension/cartesia_tts/requirements.txt
new file mode 100644
index 00000000..59c3d54e
--- /dev/null
+++ b/agents/ten_packages/extension/cartesia_tts/requirements.txt
@@ -0,0 +1 @@
+cartesia
\ No newline at end of file
diff --git a/agents/ten_packages/extension/cartesia_tts/tests/test_basic.py b/agents/ten_packages/extension/cartesia_tts/tests/test_basic.py
new file mode 100644
index 00000000..c3755f44
--- /dev/null
+++ b/agents/ten_packages/extension/cartesia_tts/tests/test_basic.py
@@ -0,0 +1,45 @@
+#
+# Copyright © 2024 Agora
+# This file is part of TEN Framework, an open source project.
+# Licensed under the Apache License, Version 2.0, with certain conditions.
+# Refer to the "LICENSE" file in the root directory for more information.
+#
+from pathlib import Path
+from ten import ExtensionTester, TenEnvTester, Cmd, CmdResult, StatusCode
+
+
+class ExtensionTesterBasic(ExtensionTester):
+    """Tester that sends one ``hello_world`` command and stops on an OK result."""
+
+    def check_hello(self, ten_env: TenEnvTester, result: CmdResult):
+        """Print the result status and stop the test when it is ``StatusCode.OK``."""
+        statusCode = result.get_status_code()
+        print("receive hello_world, status:" + str(statusCode))
+
+        if statusCode == StatusCode.OK:
+            ten_env.stop_test()
+
+    def on_start(self, ten_env: TenEnvTester) -> None:
+        """Send the ``hello_world`` command, then signal that start-up is done."""
+        # NOTE(review): manifest.json only declares a "flush" cmd_in; confirm
+        # "hello_world" is answered, otherwise stop_test() is never reached.
+        new_cmd = Cmd.create("hello_world")
+
+        print("send hello_world")
+        ten_env.send_cmd(
+            new_cmd,
+            lambda ten_env, result: self.check_hello(ten_env, result),
+        )
+
+        print("tester on_start_done")
+        ten_env.on_start_done()
+
+
+def test_basic():
+    """Run the tester against the addon defined by this package."""
+    tester = ExtensionTesterBasic()
+    tester.add_addon_base_dir(str(Path(__file__).resolve().parent.parent))
+    # The addon in this package is registered as "cartesia_tts" (see addon.py),
+    # so that is the name the single-extension test mode has to target.
+    tester.set_test_mode_single("cartesia_tts")
+    tester.run()
diff --git a/agents/ten_packages/extension/deepgram_asr_python/manifest.json b/agents/ten_packages/extension/deepgram_asr_python/manifest.json
index e7914dd6..0ba17d06 100644
--- a/agents/ten_packages/extension/deepgram_asr_python/manifest.json
+++ b/agents/ten_packages/extension/deepgram_asr_python/manifest.json
@@ -1,88 +1,88 @@
 {
-    "type": "extension",
-    "name": "deepgram_asr_python",
-    "version": "0.1.0",
-    "dependencies": [
-        {
-            "type": "system",
-            "name": "ten_runtime_python",
-            "version": "0.4"
+  "type": "extension",
+  "name": "deepgram_asr_python",
+  "version": "0.1.0",
+  "dependencies": [
+    {
+      "type": "system",
+      "name": "ten_runtime_python",
+      "version": "0.4"
+    }
+  ],
+  "api": {
+    "property": {
+      "api_key": {
+        "type": "string"
+      },
+      "model": {
+        "type": "string"
+      },
+      "language": {
+        "type": "string"
+      },
+      "sample_rate": {
+        "type": "int64"
+      }
+    },
+    "audio_frame_in": [
+      {
+        "name": "pcm_frame",
+        "property": {}
+      }
+    ],
+    "cmd_in": [
+      {
+        "name": "on_user_joined",
+        "property": {
+          "user_id": {
+            "type": "string"
+          }
+        }
+      },
+      {
+        "name": "on_user_left",
+        "property": {
+          "user_id": {
+            "type": "string"
+          }
         }
+      },
+      {
+        "name": "on_connection_failure",
+        "property": {
+          "error": {
+            "type": "string"
+          }
+        }
+      }
     ],
-    "api": {
+    "data_out": [
+      {
+        "name": "text_data",
         "property": {
-            "api_key": {
-                "type": "string"
-            },
-            "model": {
-                "type": "string"
-            },
-            "language": {
-                "type": "string"
-            },
-            "sample_rate": {
-                "type": "string"
-            }
-        },
-        "audio_frame_in": [
-            {
-                "name": "pcm_frame",
-                "property": {}
-            }
-        ],
-        "cmd_in": [
-            {
-                "name": "on_user_joined",
-                "property": {
-                    "user_id": {
-                        "type": "string"
-                    }
-                }
-            },
-            {
-                "name": "on_user_left",
-                "property": {
-                    "user_id": {
-                        "type": "string"
-                    }
-                }
-            },
-            {
-                "name": "on_connection_failure",
-                "property": {
-                    "error": {
-                        "type": "string"
-                    }
-                }
-            }
-        ],
-        "data_out": [
-            {
-                "name": "text_data",
-                "property": {
-                    "time": {
-                        "type": "int64"
-                    },
-                    "duration_ms": {
-                        "type": "int64"
-                    },
-                    "language": {
-                        "type": "string"
-                    },
-                    "text": {
-                        "type": "string"
-                    },
-                    "is_final": {
-                        "type": "bool"
-                    },
-                    "stream_id": {
-                        "type": "uint32"
-                    },
-                    "end_of_segment": {
-                        "type": "bool"
-                    }
-                }
-            }
-        ]
-    }
+          "time": {
+            "type": "int64"
+          },
+          "duration_ms": {
+            "type": "int64"
+          },
+          "language": {
+            "type": "string"
+          },
+          "text": {
+            "type": "string"
+          },
+          "is_final": {
+            "type": "bool"
+          },
+          "stream_id": {
+            "type": "uint32"
+          },
+          "end_of_segment": {
+            "type": "bool"
+          }
+        }
+      }
+    ]
+  }
 }
\ No newline at end of file