1
1
import VoiceIcon from "@/app/icons/voice.svg" ;
2
2
import VoiceOffIcon from "@/app/icons/voice-off.svg" ;
3
3
import Close24Icon from "@/app/icons/close-24.svg" ;
4
+ import PowerIcon from "@/app/icons/power.svg" ;
5
+
4
6
import styles from "./realtime-chat.module.scss" ;
5
7
import clsx from "clsx" ;
6
8
7
- import { useState , useRef , useCallback } from "react" ;
9
+ import { useState , useRef , useCallback , useEffect } from "react" ;
8
10
9
11
import { useAccessStore , useChatStore , ChatMessage } from "@/app/store" ;
10
12
13
+ import { IconButton } from "@/app/components/button" ;
14
+
15
+ import {
16
+ Modality ,
17
+ RTClient ,
18
+ RTInputAudioItem ,
19
+ RTResponse ,
20
+ TurnDetection ,
21
+ } from "rt-client" ;
22
+ import { AudioHandler } from "@/app/lib/audio" ;
23
+
11
24
/** Callbacks accepted by the `RealtimeChat` component; each one is optional. */
interface RealtimeChatProps {
  /** Called by `handleClose` before the realtime client is disconnected. */
  onClose?: () => void;
  /** Called by `handleStartVoice` before a connection attempt is started. */
  onStartVoice?: () => void;
  /** Called by `handlePausedVoice`. */
  onPausedVoice?: () => void;
}
17
29
18
30
export function RealtimeChat ( {
19
31
onClose,
20
32
onStartVoice,
21
33
onPausedVoice,
22
- sampleRate = 24000 ,
23
34
} : RealtimeChatProps ) {
24
- const [ isVoicePaused , setIsVoicePaused ] = useState ( true ) ;
25
- const clientRef = useRef < null > ( null ) ;
26
35
// Refs for the current item id and the current bot/user chat messages.
// None of them is read or written by the code visible in this section.
const currentItemId = useRef<string>("");
const currentBotMessage = useRef<ChatMessage | null>();
const currentUserMessage = useRef<ChatMessage | null>();

// The access store is read through `getState()`, the chat store through its hook.
const accessStore = useAccessStore.getState();
const chatStore = useChatStore();

// UI state for the recording / connection buttons.
const [isRecording, setIsRecording] = useState(false);
const [isConnected, setIsConnected] = useState(false);
const [isConnecting, setIsConnecting] = useState(false);

// Session options consumed by `handleConnect`.
const [modality, setModality] = useState("audio");
const [isAzure, setIsAzure] = useState(false);
const [endpoint, setEndpoint] = useState("");
const [deployment, setDeployment] = useState("");
const [useVAD, setUseVAD] = useState(true);

// Long-lived handles: the realtime client and the audio capture/playback helper.
const clientRef = useRef<RTClient | null>(null);
const audioHandlerRef = useRef<AudioHandler | null>(null);

// Key passed to `RTClient` when connecting.
const apiKey = accessStore.openaiApiKey;
54
+
55
/**
 * Toggles the realtime session.
 *
 * When not connected, creates an `RTClient` (Azure endpoint + deployment when
 * `isAzure` is set, otherwise the fixed OpenAI realtime model), configures the
 * session, starts the response listener and marks the UI as connected.
 * When already connected, delegates to `disconnect`.
 *
 * `configure` is awaited so that a rejected configuration is reported through
 * the `catch` below instead of escaping as an unhandled rejection, and so the
 * UI is only marked connected once configuration has completed. A client
 * created during a failed attempt is closed and removed from `clientRef`.
 */
const handleConnect = async () => {
  if (isConnected) {
    await disconnect();
    return;
  }

  let client: RTClient | null = null;
  setIsConnecting(true);
  try {
    client = isAzure
      ? new RTClient(new URL(endpoint), { key: apiKey }, { deployment })
      : new RTClient(
          { key: apiKey },
          { model: "gpt-4o-realtime-preview-2024-10-01" },
        );
    clientRef.current = client;

    const modalities: Modality[] =
      modality === "audio" ? ["text", "audio"] : ["text"];
    const turnDetection: TurnDetection = useVAD
      ? { type: "server_vad" }
      : null;

    await client.configure({
      instructions: "Hi",
      input_audio_transcription: { model: "whisper-1" },
      turn_detection: turnDetection,
      tools: [],
      temperature: 0.9,
      modalities,
    });

    // Deliberately not awaited: the listener loops over the client's events
    // and catches its own errors.
    void startResponseListener();

    setIsConnected(true);
  } catch (error) {
    console.error("Connection failed:", error);
    // Drop the half-initialised client so the next attempt starts clean.
    if (client) {
      clientRef.current = null;
      await client.close().catch((closeError: unknown) => {
        console.error("Disconnect failed:", closeError);
      });
    }
  } finally {
    setIsConnecting(false);
  }
};
90
+
91
/**
 * Closes the active realtime client, if there is one, and marks the UI as
 * disconnected. Does nothing when no client is attached.
 *
 * The ref is cleared and the connected flag is reset even when `close()`
 * rejects, so a failed close cannot leave the component stuck in the
 * connected state.
 */
const disconnect = async () => {
  const client = clientRef.current;
  if (!client) return;

  // Detach before closing: `startResponseListener` only logs iteration errors
  // while `clientRef.current` is set, so errors triggered by this close are
  // not reported as failures.
  clientRef.current = null;
  try {
    await client.close();
  } catch (error) {
    console.error("Disconnect failed:", error);
  } finally {
    setIsConnected(false);
  }
};
102
+
103
/**
 * Iterates over the current client's server events, handing each "response"
 * event to `handleResponse` and each "input_audio" event to
 * `handleInputAudio`. Events are processed one at a time, in order.
 * Returns immediately when no client is attached.
 */
const startResponseListener = async () => {
  const client = clientRef.current;
  if (!client) {
    return;
  }

  try {
    for await (const event of client.events()) {
      switch (event.type) {
        case "response":
          await handleResponse(event);
          break;
        case "input_audio":
          await handleInputAudio(event);
          break;
      }
    }
  } catch (error) {
    // Report the failure only while a client is still attached.
    if (clientRef.current) {
      console.error("Response iteration error:", error);
    }
  }
};
120
+
121
/**
 * Drains a server response.
 *
 * For every assistant message item, text chunks and audio transcript chunks
 * are appended to a local `message.content`, while audio chunks are passed to
 * the audio handler for streaming playback. Transcript collection and audio
 * playback of one audio part run concurrently.
 *
 * @param response - The response event yielded by the client's event stream.
 */
const handleResponse = async (response: RTResponse) => {
  for await (const item of response) {
    // Only assistant messages are processed; everything else is skipped.
    if (item.type !== "message" || item.role !== "assistant") {
      continue;
    }

    // TODO: `message` is only accumulated locally; the `setMessages` wiring
    // that would surface it in the UI is currently disabled.
    const message = { type: item.role, content: "" };

    for await (const part of item) {
      if (part.type === "text") {
        for await (const chunk of part.textChunks()) {
          message.content += chunk;
        }
        continue;
      }
      if (part.type !== "audio") {
        continue;
      }

      const collectTranscript = async () => {
        for await (const chunk of part.transcriptChunks()) {
          message.content += chunk;
        }
      };
      const playAudio = async () => {
        audioHandlerRef.current?.startStreamingPlayback();
        for await (const chunk of part.audioChunks()) {
          audioHandlerRef.current?.playChunk(chunk);
        }
      };

      await Promise.all([collectTranscript(), playAudio()]);
    }
  }
};
161
+
162
/**
 * Handles a user audio item: stops streaming playback on the audio handler
 * (when one exists) and then waits for the item to complete.
 *
 * @param item - The input audio item to wait on.
 */
const handleInputAudio = async (item: RTInputAudioItem) => {
  const audioHandler = audioHandlerRef.current;
  audioHandler?.stopStreamingPlayback();

  await item.waitForCompletion();
  // TODO: the completed item's transcription is not yet added to the chat;
  // the `setMessages` call that would do so is currently disabled.
};
173
+
174
/**
 * Starts or stops microphone capture.
 *
 * Stopping: capture is stopped and the recording flag is cleared right away,
 * so a failure while committing audio or requesting a response cannot leave
 * the UI showing "recording" after capture has ended. When server VAD is off
 * and a client is attached, the buffered audio is committed and a response is
 * requested explicitly.
 *
 * Starting: requires an attached client. The audio handler is created on
 * demand and only stored once `initialize()` has succeeded; each recorded
 * chunk is forwarded to the current client.
 */
const toggleRecording = async () => {
  if (isRecording) {
    try {
      audioHandlerRef.current?.stopRecording();
      setIsRecording(false);

      const client = clientRef.current;
      if (!useVAD && client) {
        const inputAudio = await client.commitAudio();
        await handleInputAudio(inputAudio);
        await client.generateResponse();
      }
    } catch (error) {
      console.error("Failed to stop recording:", error);
    }
    return;
  }

  // Nothing to record for without a client; never fall into the stop path.
  if (!clientRef.current) return;

  try {
    let audioHandler = audioHandlerRef.current;
    if (!audioHandler) {
      audioHandler = new AudioHandler();
      await audioHandler.initialize();
      audioHandlerRef.current = audioHandler;
    }
    await audioHandler.startRecording(async (chunk) => {
      // Read the ref on every chunk so audio stops flowing after a disconnect.
      await clientRef.current?.sendAudio(chunk);
    });
    setIsRecording(true);
  } catch (error) {
    console.error("Failed to start recording:", error);
  }
};
202
+
203
// Mount/unmount lifecycle: prepare an audio handler on mount; on unmount
// disconnect the client and close the audio handler.
useEffect(() => {
  // Set by the cleanup so an initialisation that finishes late can tell that
  // the component is already gone.
  let disposed = false;

  const initAudioHandler = async () => {
    const handler = new AudioHandler();
    await handler.initialize();
    // Do not keep a handler that finished initialising after unmount, and do
    // not replace one that `toggleRecording` created in the meantime; close
    // the surplus instance instead of leaking it.
    if (disposed || audioHandlerRef.current) {
      await handler.close();
      return;
    }
    audioHandlerRef.current = handler;
  };

  initAudioHandler().catch(console.error);

  return () => {
    disposed = true;
    // `disconnect` logs its own failures, so the promise is not awaited.
    void disconnect();
    const handler = audioHandlerRef.current;
    audioHandlerRef.current = null;
    handler?.close().catch(console.error);
  };
  // Runs once per mount; `disconnect` only touches refs and a state setter.
}, []);
217
+
32
218
// useEffect(() => {
33
219
// if (
34
220
// clientRef.current?.getTurnDetectionType() === "server_vad" &&
@@ -223,12 +409,16 @@ export function RealtimeChat({
223
409
224
410
/**
 * Notifies the optional `onStartVoice` callback, then toggles the connection
 * through `handleConnect`.
 *
 * The dependency list names both functions the callback calls. With an empty
 * list the callback would keep the first render's `handleConnect`, whose
 * `isConnected` value is always the initial `false`, and the first render's
 * `onStartVoice` prop.
 */
const handleStartVoice = useCallback(() => {
  onStartVoice?.();
  // `handleConnect` reports its own failures; the promise is not awaited.
  void handleConnect();
}, [onStartVoice, handleConnect]);
228
414
229
415
/** Calls the optional `onPausedVoice` callback when one was provided. */
const handlePausedVoice = () => {
  if (onPausedVoice) {
    onPausedVoice();
  }
};
418
+
419
/**
 * Calls the optional `onClose` callback, then starts disconnecting the
 * realtime client without waiting for it to finish.
 */
const handleClose = () => {
  if (onClose) {
    onClose();
  }
  void disconnect();
};
233
423
234
424
return (
@@ -241,15 +431,39 @@ export function RealtimeChat({
241
431
< div className = { styles [ "icon-center" ] } > </ div >
242
432
</ div >
243
433
< div className = { styles [ "bottom-icons" ] } >
244
- < div className = { styles [ "icon-left" ] } >
245
- { isVoicePaused ? (
246
- < VoiceOffIcon onClick = { handleStartVoice } />
247
- ) : (
248
- < VoiceIcon onClick = { handlePausedVoice } />
249
- ) }
434
+ < div >
435
+ < IconButton
436
+ icon = { isRecording ? < VoiceOffIcon /> : < VoiceIcon /> }
437
+ onClick = { toggleRecording }
438
+ disabled = { ! isConnected }
439
+ bordered
440
+ shadow
441
+ />
442
+ </ div >
443
+ < div className = { styles [ "icon-center" ] } >
444
+ < IconButton
445
+ icon = { < PowerIcon /> }
446
+ text = {
447
+ isConnecting
448
+ ? "Connecting..."
449
+ : isConnected
450
+ ? "Disconnect"
451
+ : "Connect"
452
+ }
453
+ onClick = { handleConnect }
454
+ disabled = { isConnecting }
455
+ bordered
456
+ shadow
457
+ />
250
458
</ div >
251
- < div className = { styles [ "icon-right" ] } onClick = { onClose } >
252
- < Close24Icon />
459
+ < div onClick = { handleClose } >
460
+ < IconButton
461
+ icon = { < Close24Icon /> }
462
+ onClick = { handleClose }
463
+ disabled = { ! isConnected }
464
+ bordered
465
+ shadow
466
+ />
253
467
</ div >
254
468
</ div >
255
469
</ div >
0 commit comments