Add Imagen 3 support through Gemini API (#118)

lzell · web-flow · commit 07ee2f9066c2 · 2025-03-18T22:40:42.000-07:00
diff --git a/README.md b/README.md
@@ -1853,6 +1853,52 @@ Use the file URL returned from the snippet above.
     }
 ```
 
+### How to generate an image with Imagen
+
+```swift
+    import AIProxy
+
+    /* Uncomment for BYOK use cases */
+    // let geminiService = AIProxy.geminiDirectService(
+    //     unprotectedAPIKey: "your-gemini-key"
+    // )
+
+    /* Uncomment for all other production use cases */
+    // let geminiService = AIProxy.geminiService(
+    //     partialKey: "partial-key-from-your-developer-dashboard",
+    //     serviceURL: "service-url-from-your-developer-dashboard"
+    // )
+
+    let requestBody = GeminiImagenRequestBody(
+        instances: [
+            .init(prompt: prompt)
+        ],
+        parameters: .init(
+            personGeneration: .allowAdult,
+            safetyLevel: .blockNone,
+            sampleCount: 1
+        )
+    )
+
+    do {
+        let response = try await geminiService.makeImagenRequest(
+            body: requestBody,
+            model: "imagen-3.0-generate-002"
+        )
+        if let base64Data = response.predictions.first?.bytesBase64Encoded,
+           let imageData = Data(base64Encoded: base64Data),
+           let image = UIImage(data: imageData) {
+            // Do something with image
+        } else {
+            print("Imagen response did not include base64 image data")
+        }
+    } catch AIProxyError.unsuccessfulRequest(let statusCode, let responseBody) {
+        print("Received \(statusCode) status code with response body: \(responseBody)")
+    } catch {
+        print("Could not create Imagen image: \(error.localizedDescription)")
+    }
+```
+
 
 ***
 
diff --git a/Sources/AIProxy/AIProxy.swift b/Sources/AIProxy/AIProxy.swift
@@ -8,7 +8,7 @@ import UIKit
 public struct AIProxy {
 
     /// The current sdk version
-    public static let sdkVersion = "0.77.0"
+    public static let sdkVersion = "0.78.0"
 
     /// - Parameters:
     ///   - partialKey: Your partial key is displayed in the AIProxy dashboard when you submit your provider's key.
diff --git a/Sources/AIProxy/Gemini/GeminiDirectService.swift b/Sources/AIProxy/Gemini/GeminiDirectService.swift
@@ -43,6 +43,25 @@ open class GeminiDirectService: GeminiService, DirectService {
         return try await self.makeRequestAndDeserializeResponse(request)
     }
 
+    /// Generates images with the Imagen API.
+    public func makeImagenRequest(
+        body: GeminiImagenRequestBody,
+        model: String
+    ) async throws -> GeminiImagenResponseBody {
+        let proxyPath = "/v1beta/models/\(model):predict"
+        let request = try AIProxyURLRequest.createDirect(
+            baseURL: "https://generativelanguage.googleapis.com",
+            path: proxyPath,
+            body:  body.serialize(),
+            verb: .post,
+            contentType: "application/json",
+            additionalHeaders: [
+                "X-Goog-Api-Key": self.unprotectedAPIKey
+            ]
+        )
+        return try await self.makeRequestAndDeserializeResponse(request)
+    }
+
     /// Uploads a file to Google's short term storage.
     ///
     /// The File API lets you store up to 20 GB of files per project, with a per-file maximum
diff --git a/Sources/AIProxy/Gemini/GeminiImagenRequestBody.swift b/Sources/AIProxy/Gemini/GeminiImagenRequestBody.swift
@@ -0,0 +1,157 @@
+//
+//  GeminiImagenRequestBody.swift
+//  AIProxy
+//
+//  Created by Lou Zell on 3/18/25.
+//
+
+import Foundation
+
+/// See the Imagen [prompt guide](https://ai.google.dev/gemini-api/docs/imagen-prompt-guide)
+///
+/// Imagen is described in a few places:
+/// - https://ai.google.dev/gemini-api/docs/image-generation#imagen
+/// - https://cloud.google.com/vertex-ai/generative-ai/docs/image/model-versioning
+/// - https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/imagen-api
+///
+/// It is unclear whether you must first request access through the forms linked from this page:
+/// https://cloud.google.com/vertex-ai/generative-ai/docs/image/overview
+///   - Request access: Imagen 3 Customization and Editing
+///   - Request access: Person and face generation
+public struct GeminiImagenRequestBody: Encodable {
+    public let instances: [Instance]
+    public let parameters: Parameters
+
+    public init(
+        instances: [GeminiImagenRequestBody.Instance],
+        parameters: GeminiImagenRequestBody.Parameters
+    ) {
+        self.instances = instances
+        self.parameters = parameters
+    }
+}
+
+extension GeminiImagenRequestBody {
+    public struct Instance: Encodable {
+        public let prompt: String
+        public let image: InputImage?
+
+        public init(prompt: String, image: GeminiImagenRequestBody.Instance.InputImage? = nil) {
+            self.prompt = prompt
+            self.image = image
+        }
+    }
+
+    public struct Parameters: Encodable {
+        public init(
+            aspectRatio: String? = nil,
+            mode: String? = nil,
+            personGeneration: GeminiImagenRequestBody.Parameters.PersonGeneration? = nil,
+            safetyLevel: GeminiImagenRequestBody.Parameters.SafetyLevel? = nil,
+            sampleCount: Int,
+            upscaleConfig: GeminiImagenRequestBody.Parameters.UpscaleConfig? = nil
+        ) {
+            self.aspectRatio = aspectRatio
+            self.mode = mode
+            self.personGeneration = personGeneration
+            self.safetyLevel = safetyLevel
+            self.sampleCount = sampleCount
+            self.upscaleConfig = upscaleConfig
+        }
+
+        /// Supported values are "1:1", "3:4", "4:3", "9:16", and "16:9". The default is "1:1".
+        public let aspectRatio: String?
+
+        /// The only mode known to be valid is "upscale"; other accepted values have not been confirmed.
+        public let mode: String?
+
+        /// "dont_allow": Disallow the inclusion of people or faces in images.
+        /// "allow_adult": Allow generation of adults only.
+        /// "allow_all": Allow generation of people of all ages.
+        /// The default value is "allow_adult".
+        public let personGeneration: PersonGeneration?
+
+        /// Adds a filter level to safety filtering. The following values are supported:
+        ///
+        ///    "block_low_and_above": Strongest filtering level, most strict blocking.
+        ///    "block_medium_and_above": Block some problematic prompts and responses.
+        ///    "block_only_high": Reduces the number of requests blocked due to safety filters. May increase objectionable content generated by Imagen.
+        ///    "block_none": Block very few problematic prompts and responses. Access to this feature is restricted.
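+        /// The default value is "block_medium_and_above".
+        /// NOTE(review): Google's Imagen reference names this parameter `safetySetting`; confirm that `safetyLevel` is honored on the wire.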
+        public let safetyLevel: SafetyLevel?
+
+        /// The number of images to create
+        public let sampleCount: Int
+
+        public let upscaleConfig: UpscaleConfig?
+    }
+}
+
+
+extension GeminiImagenRequestBody.Parameters {
+    /// Represents the safety filtering level for content moderation.
+    public enum SafetyLevel: String, Encodable {
+        /// Strongest filtering level, most strict blocking. Deprecated value: "block_most".
+        case blockLowAndAbove = "block_low_and_above"
+
+        /// Block some problematic prompts and responses. Deprecated value: "block_some".
+        case blockMediumAndAbove = "block_medium_and_above"
+
+        /// Reduces the number of requests blocked due to safety filters. May increase
+        /// objectionable content generated by Imagen. Deprecated value: "block_few".
+        case blockOnlyHigh = "block_only_high"
+
+        /// Block very few problematic prompts and responses. Access to this feature is
+        /// restricted. Deprecated value: "block_fewest".
+        case blockNone = "block_none"
+    }
+}
+
+extension GeminiImagenRequestBody.Parameters {
+    /// Controls the generation of people or faces in images.
+    public enum PersonGeneration: String, Encodable {
+        /// Disallow the inclusion of people or faces in images.
+        case dontAllow = "dont_allow"
+
+        /// Allow generation of adults only.
+        case allowAdult = "allow_adult"
+
+        /// Allow generation of people of all ages.
+        case allowAll = "allow_all"
+    }
+}
+
+extension GeminiImagenRequestBody.Instance {
+    public struct InputImage: Encodable {
+        public let data: Data
+
+        private enum CodingKeys: String, CodingKey {
+            case bytesBase64Encoded
+        }
+
+        public init(data: Data) {
+            self.data = data
+        }
+
+        public func encode(to encoder: any Encoder) throws {
+            var container = encoder.container(keyedBy: CodingKeys.self)
+            try container.encode(self.data.base64EncodedString(), forKey: .bytesBase64Encoded)
+        }
+    }
+}
+
+extension GeminiImagenRequestBody.Parameters {
+    public struct UpscaleConfig: Encodable {
+        public enum UpscaleFactor: String, Encodable {
+            case x2
+            case x4
+        }
+
+        public let upscaleFactor: UpscaleFactor
+
+        public init(upscaleFactor: GeminiImagenRequestBody.Parameters.UpscaleConfig.UpscaleFactor) {
+            self.upscaleFactor = upscaleFactor
+        }
+    }
+}
diff --git a/Sources/AIProxy/Gemini/GeminiImagenResponseBody.swift b/Sources/AIProxy/Gemini/GeminiImagenResponseBody.swift
@@ -0,0 +1,17 @@
+//
+//  GeminiImagenResponseBody.swift
+//  AIProxy
+//
+//  Created by Lou Zell on 3/18/25.
+//
+
+public struct GeminiImagenResponseBody: Decodable {
+    public let predictions: [Prediction]
+}
+
+extension GeminiImagenResponseBody {
+    public struct Prediction: Decodable {
+        public let mimeType: String?
+        public let bytesBase64Encoded: String
+    }
+}
diff --git a/Sources/AIProxy/Gemini/GeminiProxiedService.swift b/Sources/AIProxy/Gemini/GeminiProxiedService.swift
@@ -44,6 +44,24 @@ open class GeminiProxiedService: GeminiService, ProxiedService {
         return try await self.makeRequestAndDeserializeResponse(request)
     }
 
+    /// Generates images with the Imagen API.
+    public func makeImagenRequest(
+        body: GeminiImagenRequestBody,
+        model: String
+    ) async throws -> GeminiImagenResponseBody {
+        let proxyPath = "/v1beta/models/\(model):predict"
+        let request = try await AIProxyURLRequest.create(
+            partialKey: self.partialKey,
+            serviceURL: self.serviceURL,
+            clientID: self.clientID,
+            proxyPath: proxyPath,
+            body:  body.serialize(),
+            verb: .post,
+            contentType: "application/json"
+        )
+        return try await self.makeRequestAndDeserializeResponse(request)
+    }
+
     /// Uploads a file to Google's short term storage.
     ///
     /// The File API lets you store up to 20 GB of files per project, with a per-file maximum
diff --git a/Sources/AIProxy/Gemini/GeminiService.swift b/Sources/AIProxy/Gemini/GeminiService.swift
@@ -21,6 +21,13 @@ public protocol GeminiService {
         model: String
     ) async throws -> GeminiGenerateContentResponseBody
 
+    /// Generates images with the Imagen API.
+    func makeImagenRequest(
+        body: GeminiImagenRequestBody,
+        model: String
+    ) async throws -> GeminiImagenResponseBody
+
+
     /// Uploads a file to Google's short term storage.
     ///
     /// The File API lets you store up to 20 GB of files per project, with a per-file maximum
diff --git a/Tests/AIProxyTests/GeminiGenerateImageResponseTests.swift b/Tests/AIProxyTests/GeminiGenerateImageResponseTests.swift
@@ -54,8 +54,4 @@ final class GeminiGenerateImageResponseTests: XCTestCase {
             XCTFail()
         }
     }
-
 }
-
-
-

Original file line number	Diff line number	Diff line change
`@@ -54,8 +54,4 @@ final class GeminiGenerateImageResponseTests: XCTestCase {`
`54`	`54`	`XCTFail()`
`55`	`55`	`}`
`56`	`56`	`}`
`57`		`-`
`58`	`57`	`}`
`59`		`-`
`60`		`-`
`61`		`-`