NVIDIA-NeMo · tango4j · Oct 9, 2025 · Oct 9, 2025 · Oct 9, 2025 · Oct 9, 2025
diff --git a/examples/voice_agent/client/package-lock.json b/examples/voice_agent/client/package-lock.json
diff --git a/examples/voice_agent/client/package.json b/examples/voice_agent/client/package.json
@@ -14,13 +14,17 @@
   "devDependencies": {
     "@types/node": "^22.15.30",
     "@types/protobufjs": "^6.0.0",
+    "@types/react": "^19.2.2",
+    "@types/react-dom": "^19.2.2",
     "@vitejs/plugin-react-swc": "^3.10.1",
     "typescript": "^5.8.3",
     "vite": "^6.3.5"
   },
   "dependencies": {
     "@pipecat-ai/client-js": "^0.4.0",
     "@pipecat-ai/websocket-transport": "^0.4.1",
-    "protobufjs": "^7.4.0"
+    "protobufjs": "^7.4.0",
+    "react": "^19.2.0",
+    "react-dom": "^19.2.0"
   }
 }
diff --git a/examples/voice_agent/client/src/app.ts b/examples/voice_agent/client/src/app.ts
@@ -46,12 +46,12 @@ class WebsocketClientApp {
   private readonly serverConfigs = {
     websocket: {
       name: 'WebSocket Server',
-      baseUrl: 'http://localhost:7860',
+      baseUrl: `http://${window.location.hostname}:7860`,
       port: 8765
     },
     fastapi: {
       name: 'FastAPI Server', 
-      baseUrl: 'http://localhost:8000',
+      baseUrl: `http://${window.location.hostname}:8000`,
       port: 8000
     }
   };
@@ -257,6 +257,7 @@ class WebsocketClientApp {
 
       this.log('Initializing devices...');
       await this.rtviClient.initDevices();
+      this.log('Devices initialized successfully');
 
       this.log('Connecting to bot...');
       await this.rtviClient.connect();

diff --git a/examples/voice_agent/client/tsconfig.json b/examples/voice_agent/client/tsconfig.json
@@ -11,9 +11,9 @@
     // "disableReferencedProjectLoad": true,             /* Reduce the number of projects loaded automatically by TypeScript. */
 
     /* Language and Environment */
-    "target": "es2016",                                  /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
-    // "lib": [],                                        /* Specify a set of bundled library declaration files that describe the target runtime environment. */
-    // "jsx": "preserve",                                /* Specify what JSX code is generated. */
+    "target": "ES2020",                                  /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
+    "lib": ["ES2020", "DOM", "DOM.Iterable"],            /* Specify a set of bundled library declaration files that describe the target runtime environment. */
+    "jsx": "react-jsx",                                  /* Specify what JSX code is generated. */
     // "experimentalDecorators": true,                   /* Enable experimental support for legacy experimental decorators. */
     // "emitDecoratorMetadata": true,                    /* Emit design-type metadata for decorated declarations in source files. */
     // "jsxFactory": "",                                 /* Specify the JSX factory function used when targeting React JSX emit, e.g. 'React.createElement' or 'h'. */
@@ -25,9 +25,9 @@
     // "moduleDetection": "auto",                        /* Control what method is used to detect module-format JS files. */
 
     /* Modules */
-    "module": "commonjs",                                /* Specify what module code is generated. */
+    "module": "ESNext",                                  /* Specify what module code is generated. */
     // "rootDir": "./",                                  /* Specify the root folder within your source files. */
-    // "moduleResolution": "node10",                     /* Specify how TypeScript looks up a file from a given module specifier. */
+    "moduleResolution": "bundler",                       /* Specify how TypeScript looks up a file from a given module specifier. */
     // "baseUrl": "./",                                  /* Specify the base directory to resolve non-relative module names. */
     // "paths": {},                                      /* Specify a set of entries that re-map imports to additional lookup locations. */
     // "rootDirs": [],                                   /* Allow multiple folders to be treated as one when resolving modules. */
@@ -41,7 +41,7 @@
     // "resolvePackageJsonImports": true,                /* Use the package.json 'imports' field when resolving imports. */
     // "customConditions": [],                           /* Conditions to set in addition to the resolver-specific defaults when resolving imports. */
     // "noUncheckedSideEffectImports": true,             /* Check side effect imports. */
-    // "resolveJsonModule": true,                        /* Enable importing .json files. */
+    "resolveJsonModule": true,                           /* Enable importing .json files. */
     // "allowArbitraryExtensions": true,                 /* Enable importing files with any extension, provided a declaration file is present. */
     // "noResolve": true,                                /* Disallow 'import's, 'require's or '<reference>'s from expanding the number of files TypeScript should add to a project. */
 
@@ -74,10 +74,10 @@
     // "declarationDir": "./",                           /* Specify the output directory for generated declaration files. */
 
     /* Interop Constraints */
-    // "isolatedModules": true,                          /* Ensure that each file can be safely transpiled without relying on other imports. */
+    "isolatedModules": true,                             /* Ensure that each file can be safely transpiled without relying on other imports. */
     // "verbatimModuleSyntax": true,                     /* Do not transform or elide any imports or exports not marked as type-only, ensuring they are written in the output file's format based on the 'module' setting. */
     // "isolatedDeclarations": true,                     /* Require sufficient annotation on exports so other tools can trivially generate declaration files. */
-    // "allowSyntheticDefaultImports": true,             /* Allow 'import x from y' when a module doesn't have a default export. */
+    "allowSyntheticDefaultImports": true,                /* Allow 'import x from y' when a module doesn't have a default export. */
     "esModuleInterop": true,                             /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */
     // "preserveSymlinks": true,                         /* Disable resolving symlinks to their realpath. This correlates to the same flag in node. */
     "forceConsistentCasingInFileNames": true,            /* Ensure that casing is correct in imports. */

diff --git a/examples/voice_agent/server/backchannel_phrases.yaml b/examples/voice_agent/server/backchannel_phrases.yaml
@@ -11,7 +11,6 @@
 - "great"
 - "great thanks"
 - "ha ha"
-- "hi"
 - "hmm"
 - "humm"
 - "huh"

diff --git a/examples/voice_agent/server/bot_websocket_server.py b/examples/voice_agent/server/bot_websocket_server.py
@@ -28,12 +28,13 @@
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
 from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
-from pipecat.processors.frameworks.rtvi import RTVIAction, RTVIConfig, RTVIObserver, RTVIProcessor
+from pipecat.processors.frameworks.rtvi import RTVIAction, RTVIConfig, RTVIProcessor
 from pipecat.serializers.protobuf import ProtobufFrameSerializer
-
-from nemo.agents.voice_agent.pipecat.services.nemo.diar import NeMoDiarInputParams, NemoDiarService
+from nemo.agents.voice_agent.pipecat.services.nemo.audio_logger import AudioLogger
+from nemo.agents.voice_agent.pipecat.processors.frameworks.rtvi import RTVIObserver
+from nemo.agents.voice_agent.pipecat.services.nemo.diar import NemoDiarService
 from nemo.agents.voice_agent.pipecat.services.nemo.llm import get_llm_service_from_config
-from nemo.agents.voice_agent.pipecat.services.nemo.stt import NeMoSTTInputParams, NemoSTTService
+from nemo.agents.voice_agent.pipecat.services.nemo.stt import NemoSTTService
 from nemo.agents.voice_agent.pipecat.services.nemo.tts import KokoroTTSService, NeMoFastPitchHiFiGANTTSService
 from nemo.agents.voice_agent.pipecat.services.nemo.turn_taking import NeMoTurnTakingService
 from nemo.agents.voice_agent.pipecat.transports.network.websocket_server import (
@@ -77,6 +78,8 @@ def setup_logging():
 
 # Transport configuration
 TRANSPORT_AUDIO_OUT_10MS_CHUNKS = config_manager.TRANSPORT_AUDIO_OUT_10MS_CHUNKS
+RECORD_AUDIO_DATA = server_config.transport.get("record_audio_data", False)
+AUDIO_LOG_DIR = server_config.transport.get("audio_log_dir", "./audio_logs")
 
 # VAD configuration
 vad_params = config_manager.get_vad_params()
@@ -127,6 +130,21 @@ async def run_bot_websocket_server():
     - Server will run indefinitely until manually stopped (Ctrl+C)
     """
 
+    # Initialize AudioLogger if recording is enabled
+    audio_logger = None
+    if RECORD_AUDIO_DATA:
+        from datetime import datetime
+
+        session_id = f"session_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+        audio_logger = AudioLogger(
+            log_dir=AUDIO_LOG_DIR,
+            session_id=session_id,
+            enabled=True,
+        )
+        logger.info(f"AudioLogger initialized for session: {session_id} at {AUDIO_LOG_DIR}")
+    else:
+        logger.info("Audio logging is disabled")
+
     vad_analyzer = SileroVADAnalyzer(
         sample_rate=SAMPLE_RATE,
         params=vad_params,
@@ -161,6 +179,8 @@ async def run_bot_websocket_server():
         has_turn_taking=True,
         backend="legacy",
         decoder_type="rnnt",
+        record_audio_data=RECORD_AUDIO_DATA,
+        audio_logger=audio_logger,
     )
     logger.info("STT service initialized")
 
@@ -183,6 +203,7 @@ async def run_bot_websocket_server():
         max_buffer_size=TURN_TAKING_MAX_BUFFER_SIZE,
         bot_stop_delay=TURN_TAKING_BOT_STOP_DELAY,
         backchannel_phrases=TURN_TAKING_BACKCHANNEL_PHRASES_PATH,
+        audio_logger=audio_logger,
     )
     logger.info("Turn taking service initialized")
 
@@ -200,6 +221,8 @@ async def run_bot_websocket_server():
             device=TTS_DEVICE,
             text_aggregator=text_aggregator,
             think_tokens=TTS_THINK_TOKENS,
+            record_audio_data=RECORD_AUDIO_DATA,
+            audio_logger=audio_logger,
         )
     elif TTS_TYPE == "kokoro":
         tts = KokoroTTSService(
@@ -208,6 +231,8 @@ async def run_bot_websocket_server():
             speed=config_manager.server_config.tts.speed,
             text_aggregator=text_aggregator,
             think_tokens=TTS_THINK_TOKENS,
+            record_audio_data=RECORD_AUDIO_DATA,
+            audio_logger=audio_logger,
         )
     else:
         raise ValueError(f"Invalid TTS type: {TTS_TYPE}")
@@ -243,7 +268,9 @@ async def reset_context_handler(rtvi_processor: RTVIProcessor, service: str, arg
             assistant_context_aggregator.reset()
             user_context_aggregator.set_messages(copy.deepcopy(original_messages))
             assistant_context_aggregator.set_messages(copy.deepcopy(original_messages))
-
+            text_aggregator.reset()
+            if diar is not None:
+                diar.reset()
             logger.info("Conversation context reset successfully")
             return True
         except Exception as e:
@@ -276,6 +303,7 @@ async def reset_context_handler(rtvi_processor: RTVIProcessor, service: str, arg
 
     pipeline = Pipeline(pipeline)
 
+    rtvi_text_aggregator = SimpleSegmentedTextAggregator("\n?!.", min_sentence_length=5)
     task = PipelineTask(
         pipeline,
         params=PipelineParams(
@@ -286,7 +314,7 @@ async def reset_context_handler(rtvi_processor: RTVIProcessor, service: str, arg
             report_only_initial_ttfb=True,
             idle_timeout=None,  # Disable idle timeout
         ),
-        observers=[RTVIObserver(rtvi)],
+        observers=[RTVIObserver(rtvi, text_aggregator=rtvi_text_aggregator)],
         idle_timeout_secs=None,
         cancel_on_idle_timeout=False,
     )
@@ -317,6 +345,10 @@ async def on_client_connected(transport, client):
     @ws_transport.event_handler("on_client_disconnected")
     async def on_client_disconnected(transport, client):
         logger.info(f"Pipecat Client disconnected from {client.remote_address}")
+        # Finalize audio logger session if enabled
+        if audio_logger:
+            audio_logger.finalize_session()
+            logger.info("Audio logger session finalized")
         # Don't cancel the task immediately - let it handle the disconnection gracefully
         # The task will continue running and can accept new connections
         # Only send an EndTaskFrame to clean up the current session
@@ -349,6 +381,10 @@ async def on_session_timeout(transport, client):
         logger.error(f"Pipeline runner error: {e}")
         task_running = False
     finally:
+        # Finalize audio logger on shutdown
+        if audio_logger:
+            audio_logger.finalize_session()
+            logger.info("Audio logger session finalized on shutdown")
         logger.info("Pipeline runner stopped")
 
 

diff --git a/examples/voice_agent/server/example_prompts/fast-bite.txt b/examples/voice_agent/server/example_prompts/fast-bite.txt
@@ -1,6 +1,6 @@
 Fast Bites Lunch Menu
 
-Burgers and Sandwiches
+Burgers and Sandwiches:
 1. Classic Cheeseburger – $5.99
    Juicy beef patty, cheddar cheese, pickles, ketchup & mustard on a toasted bun.
    - Make it a double cheeseburger by adding another patty - $1.50
@@ -14,18 +14,18 @@ Combo Deals (includes small fries and fountain soda)
 5. Chicken Sandwich Combo – $9.49
 6. Veggie Wrap Combo – $8.49
 
-Sides
+Sides:
 7. French Fries
  - Small - $2.49
  - Medium - $3.49
  - Large - $4.49
 8. Chicken Nuggets
- - 4 pcs - $3.29
- - 8 pcs - $5.99
- - 12 pcs - $8.99
-9. Side Salad - $2.99
+ - 4 pieces - $3.29
+ - 8 pieces - $5.99
+ - 12 pieces - $8.99
+9. Side Salad - $2.99
 
-Drinks
+Drinks:
 10. Fountain Soda (16 oz, choices: Coke, Diet Coke, Sprite, Fanta) – $1.99
 11. Iced Tea or Lemonade – $2.29
 12. Bottled Water – $1.49

diff --git a/examples/voice_agent/server/server_configs/default.yaml b/examples/voice_agent/server/server_configs/default.yaml
@@ -5,6 +5,8 @@
 
 transport:
   audio_out_10ms_chunks: 10  # 4 is the websocket default, but increasing to a larger number may produce fewer glitches in TTS output
+  # record_audio_data: false
+  record_audio_data: true
 
 vad:
   type: silero
@@ -15,7 +17,8 @@ vad:
 
 stt:
   type: nemo # choices in ['nemo'] currently only NeMo is supported
-  model: "stt_en_fastconformer_hybrid_large_streaming_80ms"
+  # model: "stt_en_fastconformer_hybrid_large_streaming_80ms"
+  model: "nvidia/parakeet_realtime_eou_120m-v1"
   model_config: "./server_configs/stt_configs/nemo_cache_aware_streaming.yaml"
   device: "cuda"
 
@@ -41,12 +44,12 @@ llm:
   # model_config: "./server_configs/llm_configs/qwen2.5-7B.yaml"
   # model: "Qwen/Qwen3-8B"
   # model_config: "./server_configs/llm_configs/qwen3-8B.yaml"
+  # model: meta-llama/Llama-3.1-8B-Instruct
   device: "cuda"
   enable_reasoning: false  # it's best to turn-off reasoning for lowest latency
   # `system_prompt` is used as the system prompt to the LLM; please refer to the respective LLM's webpage for special functions like enabling/disabling thinking
   # system_prompt: /path/to/prompt.txt  # or use the path to a txt file that contains a long prompt, for example `../example_prompts/fast-bite.txt`
   system_prompt: "You are a helpful AI agent named Lisa. Start by greeting the user warmly and introducing yourself within one sentence. Your answer should be concise and to the point. You might also see speaker tags (<speaker_0>, <speaker_1>, etc.) in the user context. You should respond to the user based on the speaker tag and the context of that speaker. Do not include the speaker tags in your response, use them only to identify the speaker. Do not include any emoji in response."
-
 tts:
   type: kokoro # choices in ['nemo', 'kokoro']
   model: "hexgrad/Kokoro-82M"
-Original file line number
+Diff line change
@@ Expand Up / @@ -11,7 +11,6 @@ @@
     - "great"
     - "great thanks"
     - "ha ha"
-    - "hi"
     - "hmm"
     - "humm"
     - "huh"
@@ Expand Down @@