Skip to content

Commit 5956130

Browse files
authored
Add support for OpenAI's semantic VAD (#120)
1 parent 63c5edc commit 5956130

File tree

3 files changed

+53
-20
lines changed

3 files changed

+53
-20
lines changed

README.md

+5 -3
Original file line number | Diff line number | Diff line change
@@ -978,9 +978,11 @@ final class RealtimeManager {
978978
outputAudioFormat: .pcm16,
979979
temperature: 0.7,
980980
turnDetection: .init(
981-
prefixPaddingMs: 200,
982-
silenceDurationMs: 500,
983-
threshold: 0.5
981+
type: .serverVAD(
982+
prefixPaddingMs: 300,
983+
silenceDurationMs: 500,
984+
threshold: 0.5
985+
)
984986
),
985987
voice: "shimmer"
986988
)

Sources/AIProxy/AIProxy.swift

+1 -1
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@ import UIKit
88
public struct AIProxy {
99

1010
/// The current sdk version
11-
public static let sdkVersion = "0.78.0"
11+
public static let sdkVersion = "0.80.0"
1212

1313
/// - Parameters:
1414
/// - partialKey: Your partial key is displayed in the AIProxy dashboard when you submit your provider's key.

Sources/AIProxy/OpenAI/OpenAIRealtimeSessionConfiguration.swift

+47 -16
Original file line number | Diff line number | Diff line change
@@ -144,33 +144,37 @@ extension OpenAIRealtimeSessionConfiguration {
144144
// MARK: -
145145
extension OpenAIRealtimeSessionConfiguration {
146146
public struct TurnDetection: Encodable {
147-
/// Amount of audio to include before speech starts (in milliseconds).
148-
let prefixPaddingMs: Int?
149147

150-
/// Duration of silence to detect speech stop (in milliseconds).
151-
let silenceDurationMs: Int?
152-
153-
/// Activation threshold for VAD (0.0 to 1.0).
154-
let threshold: Double?
155-
156-
/// Type of turn detection, only "server_vad" is currently supported.
157-
let type = "server_vad"
148+
let type: DetectionType
158149

159150
private enum CodingKeys: String, CodingKey {
160151
case prefixPaddingMs = "prefix_padding_ms"
161152
case silenceDurationMs = "silence_duration_ms"
162153
case threshold
163154
case type
155+
case eagerness
164156
}
165157

166158
public init(
167-
prefixPaddingMs: Int? = nil,
168-
silenceDurationMs: Int? = nil,
169-
threshold: Double? = nil
159+
type: DetectionType
170160
) {
171-
self.prefixPaddingMs = prefixPaddingMs
172-
self.silenceDurationMs = silenceDurationMs
173-
self.threshold = threshold
161+
self.type = type
162+
}
163+
164+
public func encode(to encoder: any Encoder) throws {
165+
var container = encoder.container(keyedBy: CodingKeys.self)
166+
167+
switch type {
168+
case .serverVAD(let prefixPaddingMs, let silenceDurationMs, let threshold):
169+
try container.encode("server_vad", forKey: .type)
170+
try container.encode(prefixPaddingMs, forKey: .prefixPaddingMs)
171+
try container.encode(silenceDurationMs, forKey: .silenceDurationMs)
172+
try container.encode(threshold, forKey: .threshold)
173+
174+
case .semanticVAD(let eagerness):
175+
try container.encode("semantic_vad", forKey: .type)
176+
try container.encode(String(describing: eagerness), forKey: .eagerness)
177+
}
174178
}
175179
}
176180
}
@@ -193,3 +197,30 @@ extension OpenAIRealtimeSessionConfiguration {
193197
case text
194198
}
195199
}
200+
201+
extension OpenAIRealtimeSessionConfiguration.TurnDetection {
202+
public enum DetectionType: Encodable {
203+
public enum Eagerness: String, Encodable {
204+
case low
205+
case medium
206+
case high
207+
}
208+
209+
/// - Parameters:
210+
/// - prefixPaddingMs: Amount of audio to include before speech starts (in milliseconds).
211+
/// OpenAI's default is 300
212+
/// - silenceDurationMs: Duration of silence to detect speech stop (in milliseconds). With shorter values
213+
/// the model will respond more quickly, but may jump in on short pauses from the user.
214+
/// OpenAI's default is 500
215+
/// - threshold: Activation threshold for VAD (0.0 to 1.0). A higher threshold will require louder audio to
216+
/// activate the model, and thus might perform better in noisy environments.
217+
/// OpenAI's default is 0.5
218+
case serverVAD(prefixPaddingMs: Int, silenceDurationMs: Int, threshold: Double)
219+
220+
/// - Parameters:
221+
/// - eagerness: The eagerness of the model to respond. `low` will wait longer for the user to
222+
/// continue speaking, `high` will respond more quickly.
223+
/// OpenAI's default is medium
224+
case semanticVAD(eagerness: Eagerness)
225+
}
226+
}

0 commit comments

Comments
 (0)