Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 59 additions & 1 deletion examples/voice_agent/client/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion examples/voice_agent/client/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,17 @@
"devDependencies": {
"@types/node": "^22.15.30",
"@types/protobufjs": "^6.0.0",
"@types/react": "^19.2.2",
"@types/react-dom": "^19.2.2",
"@vitejs/plugin-react-swc": "^3.10.1",
"typescript": "^5.8.3",
"vite": "^6.3.5"
},
"dependencies": {
"@pipecat-ai/client-js": "^0.4.0",
"@pipecat-ai/websocket-transport": "^0.4.1",
"protobufjs": "^7.4.0"
"protobufjs": "^7.4.0",
"react": "^19.2.0",
"react-dom": "^19.2.0"
}
}
5 changes: 3 additions & 2 deletions examples/voice_agent/client/src/app.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,12 @@ class WebsocketClientApp {
private readonly serverConfigs = {
websocket: {
name: 'WebSocket Server',
baseUrl: 'http://localhost:7860',
baseUrl: `http://${window.location.hostname}:7860`,
port: 8765
},
fastapi: {
name: 'FastAPI Server',
baseUrl: 'http://localhost:8000',
baseUrl: `http://${window.location.hostname}:8000`,
port: 8000
}
};
Expand Down Expand Up @@ -257,6 +257,7 @@ class WebsocketClientApp {

this.log('Initializing devices...');
await this.rtviClient.initDevices();
this.log('Devices initialized successfully');

this.log('Connecting to bot...');
await this.rtviClient.connect();
Expand Down
16 changes: 8 additions & 8 deletions examples/voice_agent/client/tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@
// "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */

/* Language and Environment */
"target": "es2016", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
// "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
// "jsx": "preserve", /* Specify what JSX code is generated. */
"target": "ES2020", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
"lib": ["ES2020", "DOM", "DOM.Iterable"], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
"jsx": "react-jsx", /* Specify what JSX code is generated. */
// "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */
// "emitDecoratorMetadata": true, /* Emit design-type metadata for decorated declarations in source files. */
// "jsxFactory": "", /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */
Expand All @@ -25,9 +25,9 @@
// "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */

/* Modules */
"module": "commonjs", /* Specify what module code is generated. */
"module": "ESNext", /* Specify what module code is generated. */
// "rootDir": "./", /* Specify the root folder within your source files. */
// "moduleResolution": "node10", /* Specify how TypeScript looks up a file from a given module specifier. */
"moduleResolution": "bundler", /* Specify how TypeScript looks up a file from a given module specifier. */
// "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */
// "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
// "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */
Expand All @@ -41,7 +41,7 @@
// "resolvePackageJsonImports": true, /* Use the package.json 'imports' field when resolving imports. */
// "customConditions": [], /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */
// "noUncheckedSideEffectImports": true, /* Check side effect imports. */
// "resolveJsonModule": true, /* Enable importing .json files. */
"resolveJsonModule": true, /* Enable importing .json files. */
// "allowArbitraryExtensions": true, /* Enable importing files with any extension, provided a declaration file is present. */
// "noResolve": true, /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */

Expand Down Expand Up @@ -74,10 +74,10 @@
// "declarationDir": "./", /* Specify the output directory for generated declaration files. */

/* Interop Constraints */
// "isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */
"isolatedModules": true, /* Ensure that each file can be safely transpiled without relying on other imports. */
// "verbatimModuleSyntax": true, /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */
// "isolatedDeclarations": true, /* Require sufficient annotation on exports so other tools can trivially generate declaration files. */
// "allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */
"allowSyntheticDefaultImports": true, /* Allow 'import x from y' when a module doesn't have a default export. */
"esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */
// "preserveSymlinks": true, /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */
"forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */
Expand Down
1 change: 0 additions & 1 deletion examples/voice_agent/server/backchannel_phrases.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
- "great"
- "great thanks"
- "ha ha"
- "hi"
- "hmm"
- "humm"
- "huh"
Expand Down
48 changes: 42 additions & 6 deletions examples/voice_agent/server/bot_websocket_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,13 @@
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
from pipecat.processors.frameworks.rtvi import RTVIAction, RTVIConfig, RTVIObserver, RTVIProcessor
from pipecat.processors.frameworks.rtvi import RTVIAction, RTVIConfig, RTVIProcessor
from pipecat.serializers.protobuf import ProtobufFrameSerializer

from nemo.agents.voice_agent.pipecat.services.nemo.diar import NeMoDiarInputParams, NemoDiarService
from nemo.agents.voice_agent.pipecat.services.nemo.audio_logger import AudioLogger
from nemo.agents.voice_agent.pipecat.processors.frameworks.rtvi import RTVIObserver
from nemo.agents.voice_agent.pipecat.services.nemo.diar import NemoDiarService
from nemo.agents.voice_agent.pipecat.services.nemo.llm import get_llm_service_from_config
from nemo.agents.voice_agent.pipecat.services.nemo.stt import NeMoSTTInputParams, NemoSTTService
from nemo.agents.voice_agent.pipecat.services.nemo.stt import NemoSTTService
from nemo.agents.voice_agent.pipecat.services.nemo.tts import KokoroTTSService, NeMoFastPitchHiFiGANTTSService
from nemo.agents.voice_agent.pipecat.services.nemo.turn_taking import NeMoTurnTakingService
from nemo.agents.voice_agent.pipecat.transports.network.websocket_server import (
Expand Down Expand Up @@ -77,6 +78,8 @@ def setup_logging():

# Transport configuration
TRANSPORT_AUDIO_OUT_10MS_CHUNKS = config_manager.TRANSPORT_AUDIO_OUT_10MS_CHUNKS
RECORD_AUDIO_DATA = server_config.transport.get("record_audio_data", False)
AUDIO_LOG_DIR = server_config.transport.get("audio_log_dir", "./audio_logs")

# VAD configuration
vad_params = config_manager.get_vad_params()
Expand Down Expand Up @@ -127,6 +130,21 @@ async def run_bot_websocket_server():
- Server will run indefinitely until manually stopped (Ctrl+C)
"""

# Initialize AudioLogger if recording is enabled
audio_logger = None
if RECORD_AUDIO_DATA:
from datetime import datetime

session_id = f"session_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
audio_logger = AudioLogger(
log_dir=AUDIO_LOG_DIR,
session_id=session_id,
enabled=True,
)
logger.info(f"AudioLogger initialized for session: {session_id} at {AUDIO_LOG_DIR}")
else:
logger.info("Audio logging is disabled")

vad_analyzer = SileroVADAnalyzer(
sample_rate=SAMPLE_RATE,
params=vad_params,
Expand Down Expand Up @@ -161,6 +179,8 @@ async def run_bot_websocket_server():
has_turn_taking=True,
backend="legacy",
decoder_type="rnnt",
record_audio_data=RECORD_AUDIO_DATA,
audio_logger=audio_logger,
)
logger.info("STT service initialized")

Expand All @@ -183,6 +203,7 @@ async def run_bot_websocket_server():
max_buffer_size=TURN_TAKING_MAX_BUFFER_SIZE,
bot_stop_delay=TURN_TAKING_BOT_STOP_DELAY,
backchannel_phrases=TURN_TAKING_BACKCHANNEL_PHRASES_PATH,
audio_logger=audio_logger,
)
logger.info("Turn taking service initialized")

Expand All @@ -200,6 +221,8 @@ async def run_bot_websocket_server():
device=TTS_DEVICE,
text_aggregator=text_aggregator,
think_tokens=TTS_THINK_TOKENS,
record_audio_data=RECORD_AUDIO_DATA,
audio_logger=audio_logger,
)
elif TTS_TYPE == "kokoro":
tts = KokoroTTSService(
Expand All @@ -208,6 +231,8 @@ async def run_bot_websocket_server():
speed=config_manager.server_config.tts.speed,
text_aggregator=text_aggregator,
think_tokens=TTS_THINK_TOKENS,
record_audio_data=RECORD_AUDIO_DATA,
audio_logger=audio_logger,
)
else:
raise ValueError(f"Invalid TTS type: {TTS_TYPE}")
Expand Down Expand Up @@ -243,7 +268,9 @@ async def reset_context_handler(rtvi_processor: RTVIProcessor, service: str, arg
assistant_context_aggregator.reset()
user_context_aggregator.set_messages(copy.deepcopy(original_messages))
assistant_context_aggregator.set_messages(copy.deepcopy(original_messages))

text_aggregator.reset()
if diar is not None:
diar.reset()
logger.info("Conversation context reset successfully")
return True
except Exception as e:
Expand Down Expand Up @@ -276,6 +303,7 @@ async def reset_context_handler(rtvi_processor: RTVIProcessor, service: str, arg

pipeline = Pipeline(pipeline)

rtvi_text_aggregator = SimpleSegmentedTextAggregator("\n?!.", min_sentence_length=5)
task = PipelineTask(
pipeline,
params=PipelineParams(
Expand All @@ -286,7 +314,7 @@ async def reset_context_handler(rtvi_processor: RTVIProcessor, service: str, arg
report_only_initial_ttfb=True,
idle_timeout=None, # Disable idle timeout
),
observers=[RTVIObserver(rtvi)],
observers=[RTVIObserver(rtvi, text_aggregator=rtvi_text_aggregator)],
idle_timeout_secs=None,
cancel_on_idle_timeout=False,
)
Expand Down Expand Up @@ -317,6 +345,10 @@ async def on_client_connected(transport, client):
@ws_transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
logger.info(f"Pipecat Client disconnected from {client.remote_address}")
# Finalize audio logger session if enabled
if audio_logger:
audio_logger.finalize_session()
logger.info("Audio logger session finalized")
# Don't cancel the task immediately - let it handle the disconnection gracefully
# The task will continue running and can accept new connections
# Only send an EndTaskFrame to clean up the current session
Expand Down Expand Up @@ -349,6 +381,10 @@ async def on_session_timeout(transport, client):
logger.error(f"Pipeline runner error: {e}")
task_running = False
finally:
# Finalize audio logger on shutdown
if audio_logger:
audio_logger.finalize_session()
logger.info("Audio logger session finalized on shutdown")
logger.info("Pipeline runner stopped")


Expand Down
14 changes: 7 additions & 7 deletions examples/voice_agent/server/example_prompts/fast-bite.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Fast Bites Lunch Menu

Burgers and Sandwiches
Burgers and Sandwiches:
1. Classic Cheeseburger – $5.99
Juicy beef patty, cheddar cheese, pickles, ketchup & mustard on a toasted bun.
- Make it a double cheeseburger by adding another patty - $1.50
Expand All @@ -14,18 +14,18 @@ Combo Deals (includes small fries and fountain soda)
5. Chicken Sandwich Combo – $9.49
6. Veggie Wrap Combo – $8.49

Sides
Sides:
7. French Fries
- Small - $2.49
- Medium - $3.49
- Large - $4.49
8. Chicken Nuggets
- 4 pcs - $3.29
- 8 pcs - $5.99
- 12 pcs - $8.99
9. Side Salad - $2.99
- 4 pieces - $3.29
- 8 pieces - $5.99
- 12 pieces - $8.99
9. Side Salad - $2.99

Drinks
Drinks:
10. Fountain Soda (16 oz, choices: Coke, Diet Coke, Sprite, Fanta) – $1.99
11. Iced Tea or Lemonade – $2.29
12. Bottled Water – $1.49
Expand Down
7 changes: 5 additions & 2 deletions examples/voice_agent/server/server_configs/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

transport:
audio_out_10ms_chunks: 10 # the websocket default is 4, but a larger value may produce fewer glitches in TTS output
# record_audio_data: false
record_audio_data: true

vad:
type: silero
Expand All @@ -15,7 +17,8 @@ vad:

stt:
type: nemo # choices in ['nemo']; currently only NeMo is supported
model: "stt_en_fastconformer_hybrid_large_streaming_80ms"
# model: "stt_en_fastconformer_hybrid_large_streaming_80ms"
model: "nvidia/parakeet_realtime_eou_120m-v1"
model_config: "./server_configs/stt_configs/nemo_cache_aware_streaming.yaml"
device: "cuda"

Expand All @@ -41,12 +44,12 @@ llm:
# model_config: "./server_configs/llm_configs/qwen2.5-7B.yaml"
# model: "Qwen/Qwen3-8B"
# model_config: "./server_configs/llm_configs/qwen3-8B.yaml"
# model: meta-llama/Llama-3.1-8B-Instruct
device: "cuda"
enable_reasoning: false # it's best to turn-off reasoning for lowest latency
# `system_prompt` is used as the system prompt to the LLM; please refer to each LLM's webpage for special functions like enabling/disabling thinking
# system_prompt: /path/to/prompt.txt # or use the path to a txt file that contains a long prompt, for example `../example_prompts/fast-bite.txt`
system_prompt: "You are a helpful AI agent named Lisa. Start by greeting the user warmly and introducing yourself within one sentence. Your answer should be concise and to the point. You might also see speaker tags (<speaker_0>, <speaker_1>, etc.) in the user context. You should respond to the user based on the speaker tag and the context of that speaker. Do not include the speaker tags in your response, use them only to identify the speaker. Do not include any emoji in response."

tts:
type: kokoro # choices in ['nemo', 'kokoro']
model: "hexgrad/Kokoro-82M"
Expand Down
Loading
Loading