Add support for OpenAI's semantic VAD (#120)

lzell · web-flow · commit 59561309ed07 · 2025-03-25T11:00:15.000-07:00
diff --git a/README.md b/README.md
@@ -978,9 +978,11 @@ final class RealtimeManager {
             outputAudioFormat: .pcm16,
             temperature: 0.7,
             turnDetection: .init(
-                prefixPaddingMs: 200,
-                silenceDurationMs: 500,
-                threshold: 0.5
+                type: .serverVAD(
+                    prefixPaddingMs: 300,
+                    silenceDurationMs: 500,
+                    threshold: 0.5
+                )
             ),
             voice: "shimmer"
         )
diff --git a/Sources/AIProxy/AIProxy.swift b/Sources/AIProxy/AIProxy.swift
@@ -8,7 +8,7 @@ import UIKit
 public struct AIProxy {
 
     /// The current sdk version
-    public static let sdkVersion = "0.78.0"
+    public static let sdkVersion = "0.80.0"
 
     /// - Parameters:
     ///   - partialKey: Your partial key is displayed in the AIProxy dashboard when you submit your provider's key.
diff --git a/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionConfiguration.swift b/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionConfiguration.swift
@@ -144,33 +144,37 @@ extension OpenAIRealtimeSessionConfiguration {
 // MARK: -
 extension OpenAIRealtimeSessionConfiguration {
     public struct TurnDetection: Encodable {
-        /// Amount of audio to include before speech starts (in milliseconds).
-        let prefixPaddingMs: Int?
 
-        /// Duration of silence to detect speech stop (in milliseconds).
-        let silenceDurationMs: Int?
-
-        /// Activation threshold for VAD (0.0 to 1.0).
-        let threshold: Double?
-
-        /// Type of turn detection, only "server_vad" is currently supported.
-        let type = "server_vad"
+        let type: DetectionType
 
         private enum CodingKeys: String, CodingKey {
             case prefixPaddingMs = "prefix_padding_ms"
             case silenceDurationMs = "silence_duration_ms"
             case threshold
             case type
+            case eagerness
         }
 
         public init(
-            prefixPaddingMs: Int? = nil,
-            silenceDurationMs: Int? = nil,
-            threshold: Double? = nil
+            type: DetectionType
         ) {
-            self.prefixPaddingMs = prefixPaddingMs
-            self.silenceDurationMs = silenceDurationMs
-            self.threshold = threshold
+            self.type = type
+        }
+
+        /// Encodes the `type` discriminator and the selected mode's parameters as sibling keys.
+        public func encode(to encoder: any Encoder) throws {
+            var container = encoder.container(keyedBy: CodingKeys.self)
+            switch type {
+            case .serverVAD(let prefixPaddingMs, let silenceDurationMs, let threshold):
+                try container.encode("server_vad", forKey: .type)
+                try container.encode(prefixPaddingMs, forKey: .prefixPaddingMs)
+                try container.encode(silenceDurationMs, forKey: .silenceDurationMs)
+                try container.encode(threshold, forKey: .threshold)
+            case .semanticVAD(let eagerness):
+                try container.encode("semantic_vad", forKey: .type)
+                // Encode the enum itself so the wire value is its declared raw value, not a reflected case name.
+                try container.encode(eagerness, forKey: .eagerness)
+            }
+        }
     }
 }
@@ -193,3 +197,30 @@ extension OpenAIRealtimeSessionConfiguration {
         case text
     }
 }
+
+extension OpenAIRealtimeSessionConfiguration.TurnDetection {
+    /// The turn detection mode, carrying the settings that belong to that mode.
+    public enum DetectionType: Encodable {
+        /// How eagerly the model responds when `semanticVAD` is selected.
+        public enum Eagerness: String, Encodable {
+            case low, medium, high
+        }
+
+        /// Server VAD mode.
+        /// - Parameters:
+        ///   - prefixPaddingMs: Milliseconds of audio to include before speech starts. OpenAI's default is 300.
+        ///   - silenceDurationMs: Milliseconds of silence that mark the end of speech. Shorter values make the
+        ///                        model respond more quickly, but it may jump in on short pauses from the user.
+        ///                        OpenAI's default is 500.
+        ///   - threshold: Activation threshold for VAD (0.0 to 1.0). A higher threshold requires louder audio
+        ///                to activate the model, and thus might perform better in noisy environments.
+        ///                OpenAI's default is 0.5.
+        case serverVAD(prefixPaddingMs: Int, silenceDurationMs: Int, threshold: Double)
+
+        /// Semantic VAD mode.
+        /// - Parameters:
+        ///   - eagerness: How eager the model is to respond. `low` waits longer for the user to continue
+        ///                speaking, `high` responds more quickly. OpenAI's default is medium.
+        case semanticVAD(eagerness: Eagerness)
+    }
+}