@@ -144,33 +144,37 @@ extension OpenAIRealtimeSessionConfiguration {
 // MARK: -
 extension OpenAIRealtimeSessionConfiguration {
     public struct TurnDetection: Encodable {
-        /// Amount of audio to include before speech starts (in milliseconds).
-        let prefixPaddingMs: Int?

-        /// Duration of silence to detect speech stop (in milliseconds).
-        let silenceDurationMs: Int?
-
-        /// Activation threshold for VAD (0.0 to 1.0).
-        let threshold: Double?
-
-        /// Type of turn detection, only "server_vad" is currently supported.
-        let type = "server_vad"
+        let type: DetectionType

         private enum CodingKeys: String, CodingKey {
             case prefixPaddingMs = "prefix_padding_ms"
             case silenceDurationMs = "silence_duration_ms"
             case threshold
             case type
+            case eagerness
         }

         public init(
-            prefixPaddingMs: Int? = nil,
-            silenceDurationMs: Int? = nil,
-            threshold: Double? = nil
+            type: DetectionType
         ) {
-            self.prefixPaddingMs = prefixPaddingMs
-            self.silenceDurationMs = silenceDurationMs
-            self.threshold = threshold
+            self.type = type
+        }
+
+        public func encode(to encoder: any Encoder) throws {
+            var container = encoder.container(keyedBy: CodingKeys.self)
+
+            switch type {
+            case .serverVAD(let prefixPaddingMs, let silenceDurationMs, let threshold):
+                try container.encode("server_vad", forKey: .type)
+                try container.encode(prefixPaddingMs, forKey: .prefixPaddingMs)
+                try container.encode(silenceDurationMs, forKey: .silenceDurationMs)
+                try container.encode(threshold, forKey: .threshold)
+
+            case .semanticVAD(let eagerness):
+                try container.encode("semantic_vad", forKey: .type)
+                try container.encode(String(describing: eagerness), forKey: .eagerness)
+            }
         }
     }
 }
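
For reference, an illustrative sketch (not part of the diff) of what call sites look like with the new initializer; the numeric values are the OpenAI defaults cited in the DetectionType doc comments in the next hunk:

// Server VAD with explicit tuning parameters (previously passed as optional init arguments).
let serverVAD = OpenAIRealtimeSessionConfiguration.TurnDetection(
    type: .serverVAD(prefixPaddingMs: 300, silenceDurationMs: 500, threshold: 0.5)
)

// Semantic VAD, newly supported by this change.
let semanticVAD = OpenAIRealtimeSessionConfiguration.TurnDetection(
    type: .semanticVAD(eagerness: .medium)
)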
@@ -193,3 +197,30 @@ extension OpenAIRealtimeSessionConfiguration {
         case text
     }
 }
+
+extension OpenAIRealtimeSessionConfiguration.TurnDetection {
+    public enum DetectionType: Encodable {
+        public enum Eagerness: String, Encodable {
+            case low
+            case medium
+            case high
+        }
+
+        /// - Parameters:
+        ///   - prefixPaddingMs: Amount of audio to include before speech starts (in milliseconds).
+        ///     OpenAI's default is 300.
+        ///   - silenceDurationMs: Duration of silence to detect speech stop (in milliseconds). With shorter values
+        ///     the model will respond more quickly, but may jump in on short pauses from the user.
+        ///     OpenAI's default is 500.
+        ///   - threshold: Activation threshold for VAD (0.0 to 1.0). A higher threshold will require louder audio to
+        ///     activate the model, and thus might perform better in noisy environments.
+        ///     OpenAI's default is 0.5.
+        case serverVAD(prefixPaddingMs: Int, silenceDurationMs: Int, threshold: Double)
+
+        /// - Parameters:
+        ///   - eagerness: The eagerness of the model to respond. `low` will wait longer for the user to
+        ///     continue speaking, `high` will respond more quickly.
+        ///     OpenAI's default is `medium`.
+        case semanticVAD(eagerness: Eagerness)
+    }
+}
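
A sketch of the wire format produced by the custom encode(to:) above when run through Foundation's JSONEncoder (key order is not guaranteed); it assumes the TurnDetection and DetectionType definitions from this diff are in scope:

import Foundation

let encoder = JSONEncoder()
do {
    // Produces {"type":"server_vad","prefix_padding_ms":300,"silence_duration_ms":500,"threshold":0.5}
    let serverVAD = OpenAIRealtimeSessionConfiguration.TurnDetection(
        type: .serverVAD(prefixPaddingMs: 300, silenceDurationMs: 500, threshold: 0.5)
    )
    print(String(data: try encoder.encode(serverVAD), encoding: .utf8)!)

    // Produces {"type":"semantic_vad","eagerness":"medium"}
    let semanticVAD = OpenAIRealtimeSessionConfiguration.TurnDetection(
        type: .semanticVAD(eagerness: .medium)
    )
    print(String(data: try encoder.encode(semanticVAD), encoding: .utf8)!)
} catch {
    print("Encoding failed: \(error)")
}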