@@ -10,60 +10,157 @@ import ExecuTorchLLM
1010import XCTest
1111
extension UIImage {
  /// Returns a square copy of the image, aspect-fill scaled and center-cropped to `sideSize`.
  ///
  /// The renderer is forced to scale 1 so the output bitmap is exactly
  /// `sideSize` x `sideSize` pixels regardless of the device's screen scale.
  /// - Parameter sideSize: Edge length in pixels of the square output; must be positive.
  func centerCropped(to sideSize: CGFloat) -> UIImage {
    precondition(sideSize > 0)
    let format = UIGraphicsImageRendererFormat.default()
    format.scale = 1
    format.opaque = false
    return UIGraphicsImageRenderer(size: CGSize(width: sideSize, height: sideSize), format: format)
      .image { _ in
        // Aspect-fill: scale so the smaller dimension reaches sideSize,
        // then center the overflow so the crop is symmetric.
        let scaleFactor = max(sideSize / size.width, sideSize / size.height)
        let scaledWidth = size.width * scaleFactor
        let scaledHeight = size.height * scaleFactor
        let originX = (sideSize - scaledWidth) / 2
        let originY = (sideSize - scaledHeight) / 2
        draw(in: CGRect(x: originX, y: originY, width: scaledWidth, height: scaledHeight))
      }
  }

  /// Returns the image's pixels as planar RGB bytes: all red values, then all
  /// green, then all blue (CHW layout, as consumed by the model runner).
  /// - Returns: `3 * width * height` bytes, or `nil` if the image has no
  ///   backing `CGImage` or the bitmap context could not be created.
  func rgbBytes() -> [UInt8]? {
    guard let cgImage else { return nil }
    // cgImage.width/height are already Int — no conversion needed.
    let pixelWidth = cgImage.width
    let pixelHeight = cgImage.height
    let pixelCount = pixelWidth * pixelHeight
    let bytesPerPixel = 4
    let bytesPerRow = pixelWidth * bytesPerPixel
    // Render into an interleaved RGBA buffer first; CGContext has no planar output.
    var rgbaBuffer = [UInt8](repeating: 0, count: pixelCount * bytesPerPixel)
    guard let context = CGContext(
      data: &rgbaBuffer,
      width: pixelWidth,
      height: pixelHeight,
      bitsPerComponent: 8,
      bytesPerRow: bytesPerRow,
      space: CGColorSpaceCreateDeviceRGB(),
      bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue | CGBitmapInfo.byteOrder32Big.rawValue
    ) else { return nil }

    context.draw(cgImage, in: CGRect(x: 0, y: 0, width: pixelWidth, height: pixelHeight))

    // De-interleave RGBA -> planar RGB, dropping alpha.
    var rgbBytes = [UInt8](repeating: 0, count: pixelCount * 3)
    for pixelIndex in 0..<pixelCount {
      let sourceIndex = pixelIndex * bytesPerPixel
      rgbBytes[pixelIndex] = rgbaBuffer[sourceIndex + 0]
      rgbBytes[pixelIndex + pixelCount] = rgbaBuffer[sourceIndex + 1]
      rgbBytes[pixelIndex + 2 * pixelCount] = rgbaBuffer[sourceIndex + 2]
    }
    return rgbBytes
  }

  /// Returns planar RGB floats scaled to [0, 1] and normalized per channel as
  /// `(value / 255 - mean[c]) / std[c]`.
  /// - Parameters:
  ///   - mean: Per-channel (R, G, B) means; must contain exactly 3 values.
  ///   - std: Per-channel (R, G, B) standard deviations; must contain exactly
  ///     3 non-zero values.
  /// - Returns: `3 * width * height` floats in CHW order, or `nil` if the
  ///   pixel data could not be extracted.
  func rgbBytesNormalized(mean: [Float] = [0, 0, 0], std: [Float] = [1, 1, 1]) -> [Float]? {
    precondition(mean.count == 3 && std.count == 3)
    precondition(std[0] != 0 && std[1] != 0 && std[2] != 0)
    guard let rgbBytes = rgbBytes() else { return nil }
    let pixelCount = rgbBytes.count / 3
    var rgbBytesNormalized = [Float](repeating: 0, count: pixelCount * 3)
    for pixelIndex in 0..<pixelCount {
      rgbBytesNormalized[pixelIndex] =
        (Float(rgbBytes[pixelIndex]) / 255.0 - mean[0]) / std[0]
      rgbBytesNormalized[pixelIndex + pixelCount] =
        (Float(rgbBytes[pixelIndex + pixelCount]) / 255.0 - mean[1]) / std[1]
      rgbBytesNormalized[pixelIndex + 2 * pixelCount] =
        (Float(rgbBytes[pixelIndex + 2 * pixelCount]) / 255.0 - mean[2]) / std[2]
    }
    return rgbBytesNormalized
  }

  /// Center-crops to `sideSize` and wraps the raw planar RGB bytes in an
  /// ExecuTorchLLM `Image`. Falls back to empty data if pixel extraction
  /// fails (the runner will then reject the input).
  func asImage(_ sideSize: CGFloat) -> Image {
    return Image(
      data: Data(centerCropped(to: sideSize).rgbBytes() ?? []),
      width: Int(sideSize),
      height: Int(sideSize),
      channels: 3
    )
  }

  /// Center-crops to `sideSize` and wraps normalized planar RGB floats in an
  /// ExecuTorchLLM `Image`. Defaults are the standard ImageNet mean/std.
  func asNormalizedImage(
    _ sideSize: CGFloat,
    mean: [Float] = [0.485, 0.456, 0.406],
    std: [Float] = [0.229, 0.224, 0.225]
  ) -> Image {
    return Image(
      float: (centerCropped(to: sideSize).rgbBytesNormalized(mean: mean, std: std) ?? []).withUnsafeBufferPointer { Data(buffer: $0) },
      width: Int(sideSize),
      height: Int(sideSize),
      channels: 3
    )
  }
}
5698
5799class MultimodalRunnerTest : XCTestCase {
58- let systemPrompt = " A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: "
59- let assistantPrompt = " ASSISTANT: "
100+ let systemPrompt = " A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. "
60101 let userPrompt = " What's on the picture? "
61- let sequenceLength = 768
102+
103+ func testGemma( ) {
104+ let chatTemplate = " <start_of_turn>user \n %@<end_of_turn> \n <start_of_turn>model "
105+ let sideSize : CGFloat = 896
106+ let sequenceLength = 768
107+ let bundle = Bundle ( for: type ( of: self ) )
108+ guard let modelPath = bundle. path ( forResource: " gemma3 " , ofType: " pte " ) ,
109+ let tokenizerPath = bundle. path ( forResource: " gemma3_tokenizer " , ofType: " model " ) ,
110+ let imagePath = bundle. path ( forResource: " IMG_0005 " , ofType: " jpg " ) ,
111+ let uiImage = UIImage ( contentsOfFile: imagePath) else {
112+ XCTFail ( " Couldn't find model or tokenizer files " )
113+ return
114+ }
115+ let runner = MultimodalRunner ( modelPath: modelPath, tokenizerPath: tokenizerPath)
116+ var text = " "
117+
118+ do {
119+ try runner. generate ( [
120+ MultimodalInput ( systemPrompt) ,
121+ MultimodalInput ( uiImage. asNormalizedImage ( sideSize) ) ,
122+ MultimodalInput ( String ( format: chatTemplate, userPrompt) ) ,
123+ ] , Config {
124+ $0. sequenceLength = sequenceLength
125+ } ) { token in
126+ text += token
127+ if token == " <end_of_turn> " {
128+ runner. stop ( )
129+ }
130+ }
131+ } catch {
132+ XCTFail ( " Failed to generate text with error \( error) " )
133+ }
134+ XCTAssertTrue ( text. lowercased ( ) . contains ( " waterfall " ) )
135+
136+ text = " "
137+ runner. reset ( )
138+ do {
139+ try runner. generate ( [
140+ MultimodalInput ( systemPrompt) ,
141+ MultimodalInput ( uiImage. asNormalizedImage ( sideSize) ) ,
142+ MultimodalInput ( String ( format: chatTemplate, userPrompt) ) ,
143+ ] , Config {
144+ $0. sequenceLength = sequenceLength
145+ } ) { token in
146+ text += token
147+ if token == " <end_of_turn> " {
148+ runner. stop ( )
149+ }
150+ }
151+ } catch {
152+ XCTFail ( " Failed to generate text with error \( error) " )
153+ }
154+ XCTAssertTrue ( text. lowercased ( ) . contains ( " waterfall " ) )
155+ }
62156
63157 func testLLaVA( ) {
158+ let chatTemplate = " USER: %@ ASSISTANT: "
159+ let sideSize : CGFloat = 336
160+ let sequenceLength = 768
64161 let bundle = Bundle ( for: type ( of: self ) )
65162 guard let modelPath = bundle. path ( forResource: " llava " , ofType: " pte " ) ,
66- let tokenizerPath = bundle. path ( forResource: " tokenizer " , ofType: " bin " ) ,
163+ let tokenizerPath = bundle. path ( forResource: " llava_tokenizer " , ofType: " bin " ) ,
67164 let imagePath = bundle. path ( forResource: " IMG_0005 " , ofType: " jpg " ) ,
68165 let uiImage = UIImage ( contentsOfFile: imagePath) else {
69166 XCTFail ( " Couldn't find model or tokenizer files " )
@@ -75,8 +172,8 @@ class MultimodalRunnerTest: XCTestCase {
75172 do {
76173 try runner. generate ( [
77174 MultimodalInput ( systemPrompt) ,
78- MultimodalInput ( uiImage. asImage ( ) ) ,
79- MultimodalInput ( " \( userPrompt) \( assistantPrompt ) " ) ,
175+ MultimodalInput ( uiImage. asImage ( sideSize ) ) ,
176+ MultimodalInput ( String ( format : chatTemplate , userPrompt) ) ,
80177 ] , Config {
81178 $0. sequenceLength = sequenceLength
82179 } ) { token in
@@ -92,8 +189,8 @@ class MultimodalRunnerTest: XCTestCase {
92189 do {
93190 try runner. generate ( [
94191 MultimodalInput ( systemPrompt) ,
95- MultimodalInput ( uiImage. asImage ( ) ) ,
96- MultimodalInput ( " \( userPrompt) \( assistantPrompt ) " ) ,
192+ MultimodalInput ( uiImage. asImage ( sideSize ) ) ,
193+ MultimodalInput ( String ( format : chatTemplate , userPrompt) ) ,
97194 ] , Config {
98195 $0. sequenceLength = sequenceLength
99196 } ) { token in
0 commit comments