```
### How to use realtime audio with OpenAI

Use this example to have a conversation with OpenAI's realtime models.

We recommend getting a basic chat completion with OpenAI working before attempting realtime.
Realtime is a more involved integration (as you can see from the code snippet below), and
getting a basic integration working first narrows down the source of any problem.
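As a quick sanity check, something along these lines should print a reply before you move on to realtime. This is a minimal sketch, not the canonical snippet: the function name is illustrative, and it assumes the `OpenAIChatCompletionRequestBody` and `chatCompletionRequest(body:)` shapes used in the chat completion examples earlier in this README, which may differ slightly in your AIProxy version.

```swift
import AIProxy

// Illustrative sanity check; use the same service initialization that you plan to
// use for realtime below (direct BYOK service or the AIProxy-protected service).
func sanityCheckChatCompletion() async {
    let openAIService = AIProxy.openAIDirectService(
        unprotectedAPIKey: "your-openai-key"
    )
    do {
        // Assumed request-body shape; see the chat completion section of this README.
        let requestBody = OpenAIChatCompletionRequestBody(
            model: "gpt-4o-mini",
            messages: [.user(content: .text("hello world"))]
        )
        let response = try await openAIService.chatCompletionRequest(body: requestBody)
        print(response.choices.first?.message.content ?? "No content returned")
    } catch {
        print("Could not get a basic chat completion working: \(error.localizedDescription)")
    }
}
```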
Take these steps to build and run an OpenAI realtime example:

1. Generate a new SwiftUI Xcode project called `MyApp`
2. Add the `NSMicrophoneUsageDescription` key to your Info.plist file
3. If building for macOS, tap your project > your target > Signing & Capabilities and add the following:
    - App Sandbox > Outgoing Connections (client)
    - App Sandbox > Audio Input
    - Hardened Runtime > Audio Input
4. Replace the contents of `MyApp.swift` with the snippet below
5. Replace the placeholders in the snippet
    - If connecting directly to OpenAI, replace `your-openai-key`
    - If protecting your connection through AIProxy, replace `partial-key-from-your-developer-dashboard` and `service-url-from-your-developer-dashboard`
6. Set the `logLevel` argument of the `openAIService.realtimeSession` call to your desired level. If you leave
   it set at `.debug`, you'll see logs for all audio samples that we send to and receive from OpenAI.

**Important:** If you would like to protect your connection through AIProxy's backend, your
AIProxy project must be enabled for websocket use. Please reach out if you would like to be
added to the private beta.
```swift
import SwiftUI
import AIProxy

@main
struct MyApp: App {

    let realtimeManager = RealtimeManager()

    var body: some Scene {
        WindowGroup {
            Button("Start conversation") {
                Task {
                    try await realtimeManager.startConversation()
                }
            }
        }
    }
}

@RealtimeActor
final class RealtimeManager {
    private var realtimeSession: OpenAIRealtimeSession?
    private var microphonePCMSampleVendor: MicrophonePCMSampleVendor?
    private var audioPCMPlayer: AudioPCMPlayer?

    nonisolated init() {}

    func startConversation() async throws {
        /* Uncomment for BYOK use cases */
        // let openAIService = AIProxy.openAIDirectService(
        //     unprotectedAPIKey: "your-openai-key"
        // )

        /* Uncomment to protect your connection through AIProxy */
        // let openAIService = AIProxy.openAIService(
        //     partialKey: "partial-key-from-your-developer-dashboard",
        //     serviceURL: "service-url-from-your-developer-dashboard"
        // )

        // Set to false if you want your user to speak first
        let aiSpeaksFirst = true

        // Initialize an audio player to play PCM16 data that we receive from OpenAI:
        let audioPCMPlayer = try AudioPCMPlayer()

        // Initialize a microphone vendor to vend PCM16 audio samples that we'll send to OpenAI:
        let microphonePCMSampleVendor = MicrophonePCMSampleVendor()
        let audioStream = try microphonePCMSampleVendor.start()

        // Start the realtime session:
        let configuration = OpenAIRealtimeSessionConfiguration(
            inputAudioFormat: .pcm16,
            inputAudioTranscription: .init(model: "whisper-1"),
            instructions: "You are a tour guide of Yosemite national park",
            maxResponseOutputTokens: .int(4096),
            modalities: [.audio, .text],
            outputAudioFormat: .pcm16,
            temperature: 0.7,
            turnDetection: .init(
                prefixPaddingMs: 200,
                silenceDurationMs: 500,
                threshold: 0.5
            ),
            voice: "shimmer"
        )

        let realtimeSession = try await openAIService.realtimeSession(
            model: "gpt-4o-mini-realtime-preview-2024-12-17",
            configuration: configuration,
            logLevel: .debug
        )

        // Send audio from the microphone to OpenAI once OpenAI is ready for it:
        var isOpenAIReadyForAudio = false
        Task {
            for await buffer in audioStream {
                if isOpenAIReadyForAudio, let base64Audio = AIProxy.base64EncodeAudioPCMBuffer(from: buffer) {
                    await realtimeSession.sendMessage(
                        OpenAIRealtimeInputAudioBufferAppend(audio: base64Audio)
                    )
                }
            }
        }

        // Listen for messages from OpenAI:
        Task {
            for await message in realtimeSession.receiver {
                switch message {
                case .error(_):
                    realtimeSession.disconnect()
                case .sessionUpdated:
                    if aiSpeaksFirst {
                        await realtimeSession.sendMessage(OpenAIRealtimeResponseCreate())
                    } else {
                        isOpenAIReadyForAudio = true
                    }
                case .responseAudioDelta(let base64Audio):
                    audioPCMPlayer.playPCM16Audio(from: base64Audio)
                case .inputAudioBufferSpeechStarted:
                    audioPCMPlayer.interruptPlayback()
                case .responseCreated:
                    isOpenAIReadyForAudio = true
                default:
                    break
                }
            }
        }

        self.microphonePCMSampleVendor = microphonePCMSampleVendor
        self.audioPCMPlayer = audioPCMPlayer
        self.realtimeSession = realtimeSession
    }
}
```
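The manager stores the session, microphone vendor, and audio player in properties so they stay alive for the length of the conversation. To end a conversation, tear them down. The sketch below is illustrative rather than part of the snippet above: it assumes `MicrophonePCMSampleVendor` exposes a `stop()` method in your AIProxy version, and reuses the `disconnect()` call that already appears in the error case.

```swift
// Illustrative teardown; adjust to the API surface of your AIProxy version.
extension RealtimeManager {
    func stopConversation() {
        microphonePCMSampleVendor?.stop()  // Assumption: stops vending microphone samples
        realtimeSession?.disconnect()      // Same call used above when an error arrives
        microphonePCMSampleVendor = nil
        audioPCMPlayer = nil
        realtimeSession = nil
    }
}
```

You could add this method to `MyApp.swift` and call it from a second button, wrapped in a `Task` the same way `startConversation` is.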
### How to use OpenAI through an Azure deployment
You can use all of the OpenAI snippets above with one change. Initialize the OpenAI service with: