diff --git a/packages/sdk/client/api/diffusion.ts b/packages/sdk/client/api/diffusion.ts index d790e405fb..2f95214feb 100644 --- a/packages/sdk/client/api/diffusion.ts +++ b/packages/sdk/client/api/diffusion.ts @@ -44,6 +44,30 @@ interface DiffusionResult { * // modelConfig: { prediction: "flux2_flow" } })`). * const { outputs } = diffusion({ modelId, prompt: "turn into watercolor", init_image: initImage }); * + * // FLUX.2 multi-reference fusion + * // IMPORTANT: requires the model loaded with `modelConfig: { prediction: "flux2_flow" }` + * // and a Qwen3 text encoder via `llmModelSrc` (same loadModel requirements as the + * // FLUX.2 img2img example above). `init_image` and `init_images` are mutually + * // exclusive — pass one or the other, not both. + * const refA = fs.readFileSync("scientist-a.jpg"); + * const refB = fs.readFileSync("scientist-b.jpg"); + * const { outputs } = diffusion({ + * modelId, + * prompt: "a portrait using most visual traits from @image1 and the eyes from @image2", + * init_images: [refA, refB], + * width: 768, + * height: 768, + * }); + * + * // LoRA adapter for this generation (absolute path required). + * // Persistence across subsequent diffusion() calls is controlled at + * // loadModel time via `modelConfig.lora_apply_mode`. + * const { outputs } = diffusion({ + * modelId, + * prompt: "a watercolor cat", + * lora: "/home/user/loras/watercolor.safetensors", + * }); + * * // With progress tracking * const { progressStream, outputs } = diffusion({ modelId, prompt: "a cat" }); * for await (const { step, totalSteps } of progressStream) { @@ -53,10 +77,14 @@ interface DiffusionResult { * ``` */ export function diffusion(params: DiffusionClientParams): DiffusionResult { - const { init_image, ...rest } = params; + const { init_image, init_images, ...rest } = params; + const request: DiffusionStreamRequest = { ...rest, ...(init_image !== undefined && { init_image: encodeBase64(init_image) }), + ...(init_images !== undefined && { + init_images: init_images.map(encodeBase64), + }), type: "diffusionStream", }; diff --git a/packages/sdk/package.json b/packages/sdk/package.json index e54eb963cf..0f74aa7b5c 100644 --- a/packages/sdk/package.json +++ b/packages/sdk/package.json @@ -173,7 +173,7 @@ }, "dependencies": { "@qvac/decoder-audio": "^0.3.7", - "@qvac/diffusion-cpp": "^0.3.0", + "@qvac/diffusion-cpp": "^0.5.0", "@qvac/embed-llamacpp": "^0.14.0", "@qvac/error": "^0.1.1", "@qvac/langdetect-text": "^0.1.2", diff --git a/packages/sdk/schemas/sdcpp-config.ts b/packages/sdk/schemas/sdcpp-config.ts index 168e02c0bd..ac0c5dfdb8 100644 --- a/packages/sdk/schemas/sdcpp-config.ts +++ b/packages/sdk/schemas/sdcpp-config.ts @@ -4,6 +4,8 @@ import { modelSrcInputSchema } from "./model-src-utils"; const BASE64_PATTERN = /^(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?$/; +const ABSOLUTE_PATH_PATTERN = /^(\/|[A-Za-z]:[\\/]|\\\\)/; + export const sdcppConfigSchema = z .object({ threads: z.number().optional(), @@ -26,6 +28,16 @@ export const sdcppConfigSchema = z vae_on_cpu: z.boolean().optional().describe("Force VAE decoder to run on CPU"), vae_tiling: z.boolean().optional().describe("Enable VAE tiling for large images on limited VRAM"), flash_attn: z.boolean().optional().describe("Enable flash attention to reduce memory usage"), + lora_apply_mode: z.enum(["auto", "immediately", "at_runtime"]).optional() + .describe( + "How LoRA adapters passed via diffusion({ lora }) are applied. " + + "'auto' (default): picked based on weight type — 'at_runtime' for " + + "quantized weights, 'immediately' for full-precision. " + + "'immediately': adapter is fused into the model on first use and " + + "persists across subsequent diffusion() calls until the model is " + + "unloaded. " + + "'at_runtime': adapter is applied per-call and not persisted.", + ), verbosity: z.number().optional(), clipLModelSrc: modelSrcInputSchema.optional() .describe("CLIP-L text encoder model — required for SD3"), @@ -210,7 +222,45 @@ export const diffusionRequestSchema = z.object({ .min(1) .regex(BASE64_PATTERN) .optional() - .describe("Base64-encoded image for img2img generation"), + .describe("Base64-encoded image for img2img generation. Mutually exclusive with init_images."), + init_images: z.array( + z.string().min(1).regex(BASE64_PATTERN), + ) + .min(1) + .optional() + .describe( + "FLUX.2-only multi-reference fusion: array of base64-encoded PNG/JPEG buffers. " + + "Each buffer becomes a separate reference image that the FLUX.2 transformer attends to. " + + "Mutually exclusive with init_image; requires the model to be loaded with " + + "config.prediction='flux2_flow' and a Qwen3 text encoder via llmModelSrc.", + ), + increase_ref_index: z.boolean().optional() + .describe( + "FLUX.2 fusion only. When omitted, the addon default (false) is used. When false, all " + + "reference latents share one RoPE index slot and blend via attention (recommended for " + + "FLUX.2-klein). When true, each reference gets its own RoPE index slot — use only with " + + "text encoders that receive per-image vision tokens.", + ), + auto_resize_ref_image: z.boolean().optional() + .describe( + "FLUX.2 only. When omitted, the addon default (true) is used. When true, every reference " + + "image (single or fusion) is auto-resized to the target width/height before VAE-encoding. " + + "Disable only if the buffers are already at the exact target dimensions.", + ), + lora: z + .string() + .min(1) + .regex(ABSOLUTE_PATH_PATTERN, { + message: + "lora must be an absolute path", + }) + .optional() + .describe( + "Optional local LoRA adapter path to apply for this generation. " + + "Must be an absolute filesystem path. " + + "Whether the adapter persists across subsequent diffusion() calls is controlled " + + "by sdcppConfigSchema.lora_apply_mode (set at loadModel time).", + ), strength: z .number() .min(0) @@ -219,7 +269,13 @@ export const diffusionRequestSchema = z.object({ .describe( "img2img denoising strength (0.0 = keep source, 1.0 = ignore source); used by the SD/SDXL SDEdit path. No-op for FLUX.2, which uses in-context conditioning and ignores this field.", ), -}); +}).refine( + (d) => d.init_image === undefined || d.init_images === undefined, + { + message: + "init_image and init_images are mutually exclusive — pass one or the other, not both.", + }, +); export type DiffusionRequest = z.input; @@ -231,6 +287,13 @@ export type DiffusionStreamRequest = z.input< typeof diffusionStreamRequestSchema >; -export type DiffusionClientParams = Omit & { - init_image?: Uint8Array; -}; +type DiffusionClientParamsBase = Omit< + DiffusionRequest, + "init_image" | "init_images" +>; + +export type DiffusionClientParams = DiffusionClientParamsBase & + ( + | { init_image?: Uint8Array; init_images?: never } + | { init_image?: never; init_images?: Uint8Array[] } + ); diff --git a/packages/sdk/server/bare/plugins/sdcpp-generation/ops/diffusion.ts b/packages/sdk/server/bare/plugins/sdcpp-generation/ops/diffusion.ts index f32cc9ac0a..4620689afc 100644 --- a/packages/sdk/server/bare/plugins/sdcpp-generation/ops/diffusion.ts +++ b/packages/sdk/server/bare/plugins/sdcpp-generation/ops/diffusion.ts @@ -18,6 +18,10 @@ export async function* diffusion( ? Buffer.from(request.init_image, "base64") : undefined; + const init_images = request.init_images + ? request.init_images.map((b64) => Buffer.from(b64, "base64")) + : undefined; + const response = await model.run({ prompt: request.prompt, negative_prompt: request.negative_prompt, @@ -34,7 +38,11 @@ export async function* diffusion( vae_tiling: request.vae_tiling, cache_preset: request.cache_preset, init_image, + init_images, + increase_ref_index: request.increase_ref_index, + auto_resize_ref_image: request.auto_resize_ref_image, strength: request.strength, + lora: request.lora, }); let outputIndex = 0; diff --git a/packages/sdk/test/unit/sdcpp-plugin.test.ts b/packages/sdk/test/unit/sdcpp-plugin.test.ts index f00fec6ae7..ff53b9f771 100644 --- a/packages/sdk/test/unit/sdcpp-plugin.test.ts +++ b/packages/sdk/test/unit/sdcpp-plugin.test.ts @@ -4,6 +4,7 @@ import { z } from "zod"; import { sdcppConfigSchema, diffusionRequestSchema, + diffusionStreamRequestSchema, diffusionStreamResponseSchema, diffusionStatsSchema, modelInfoSchema, @@ -320,6 +321,180 @@ test("diffusionRequestSchema: accepts strength at boundaries (0 and 1)", (t) => t.is(resultOne.success, true); }); +// ---- LoRA + multi-reference fusion (FLUX.2) ---- + +test("diffusionRequestSchema: accepts lora as POSIX absolute path", (t) => { + const result = diffusionRequestSchema.safeParse({ + modelId: "model-1", + prompt: "a cat", + lora: "/home/user/loras/watercolor.safetensors", + }); + t.is(result.success, true); +}); + +test("diffusionRequestSchema: accepts lora as Windows drive-letter path", (t) => { + const resultBackslash = diffusionRequestSchema.safeParse({ + modelId: "model-1", + prompt: "a cat", + lora: "C:\\models\\loras\\watercolor.safetensors", + }); + t.is(resultBackslash.success, true); + + const resultForwardSlash = diffusionRequestSchema.safeParse({ + modelId: "model-1", + prompt: "a cat", + lora: "C:/models/loras/watercolor.safetensors", + }); + t.is(resultForwardSlash.success, true); +}); + +test("diffusionRequestSchema: accepts lora as Windows UNC path", (t) => { + const result = diffusionRequestSchema.safeParse({ + modelId: "model-1", + prompt: "a cat", + lora: "\\\\server\\share\\loras\\watercolor.safetensors", + }); + t.is(result.success, true); +}); + +test("diffusionRequestSchema: rejects lora as bare filename", (t) => { + const result = diffusionRequestSchema.safeParse({ + modelId: "model-1", + prompt: "a cat", + lora: "my-lora.safetensors", + }); + t.is(result.success, false); + if (!result.success) { + t.ok( + /must be an absolute path/.test(result.error.issues[0]!.message), + "error message explains lora must be absolute", + ); + } +}); + +test("diffusionRequestSchema: rejects lora as relative path", (t) => { + const resultDot = diffusionRequestSchema.safeParse({ + modelId: "model-1", + prompt: "a cat", + lora: "./loras/watercolor.safetensors", + }); + t.is(resultDot.success, false); + + const resultParent = diffusionRequestSchema.safeParse({ + modelId: "model-1", + prompt: "a cat", + lora: "../loras/watercolor.safetensors", + }); + t.is(resultParent.success, false); + + const resultSubdir = diffusionRequestSchema.safeParse({ + modelId: "model-1", + prompt: "a cat", + lora: "loras/watercolor.safetensors", + }); + t.is(resultSubdir.success, false); +}); + +test("diffusionRequestSchema: accepts init_images with multiple base64 buffers", (t) => { + const result = diffusionRequestSchema.safeParse({ + modelId: "model-1", + prompt: "blend @image1 and @image2", + init_images: ["iVBORw0KGgoAAAANSUhEUg==", "/9j/4AAQSkZJRgABAQEASABIAAA="], + }); + t.is(result.success, true); +}); + +test("diffusionRequestSchema: accepts increase_ref_index boolean", (t) => { + const result = diffusionRequestSchema.safeParse({ + modelId: "model-1", + prompt: "a cat", + init_images: ["iVBORw0KGgoAAAANSUhEUg=="], + increase_ref_index: true, + }); + t.is(result.success, true); +}); + +test("diffusionRequestSchema: accepts auto_resize_ref_image boolean", (t) => { + const result = diffusionRequestSchema.safeParse({ + modelId: "model-1", + prompt: "a cat", + init_image: "iVBORw0KGgoAAAANSUhEUg==", + auto_resize_ref_image: false, + }); + t.is(result.success, true); +}); + +test("diffusionRequestSchema: rejects when init_image and init_images are both set", (t) => { + const result = diffusionRequestSchema.safeParse({ + modelId: "model-1", + prompt: "blend two refs", + init_image: "iVBORw0KGgoAAAANSUhEUg==", + init_images: ["iVBORw0KGgoAAAANSUhEUg==", "/9j/4AAQSkZJRgABAQEASABIAAA="], + }); + t.is(result.success, false); + if (!result.success) { + const messages = result.error.issues.map((i) => i.message).join(" | "); + t.ok( + messages.includes("mutually exclusive"), + `expected mutual-exclusion message, got: ${messages}`, + ); + } +}); + +test("diffusionRequestSchema: accepts init_image alone (mutual exclusion not triggered)", (t) => { + const result = diffusionRequestSchema.safeParse({ + modelId: "model-1", + prompt: "img2img", + init_image: "iVBORw0KGgoAAAANSUhEUg==", + }); + t.is(result.success, true); +}); + +test("diffusionRequestSchema: accepts init_images alone (mutual exclusion not triggered)", (t) => { + const result = diffusionRequestSchema.safeParse({ + modelId: "model-1", + prompt: "fusion", + init_images: ["iVBORw0KGgoAAAANSUhEUg=="], + }); + t.is(result.success, true); +}); + +// ---- diffusionStreamRequestSchema ---- + +test("diffusionStreamRequestSchema: accepts a valid stream request with type literal", (t) => { + const result = diffusionStreamRequestSchema.safeParse({ + type: "diffusionStream", + modelId: "model-1", + prompt: "a cat", + }); + t.is(result.success, true); +}); + +test("diffusionStreamRequestSchema: rejects when init_image and init_images are both set", (t) => { + const result = diffusionStreamRequestSchema.safeParse({ + type: "diffusionStream", + modelId: "model-1", + prompt: "a cat", + init_image: "iVBORw0KGgoAAAANSUhEUg==", + init_images: ["iVBORw0KGgoAAAANSUhEUg=="], + }); + t.is(result.success, false); + if (!result.success) { + const messages = result.error.issues.map((i) => i.message).join(" | "); + t.ok( + messages.includes("mutually exclusive"), + `expected mutual-exclusion message, got: ${messages}`, + ); + } +}); + +// ---- sdcppConfigSchema: lora_apply_mode ---- + +test("sdcppConfigSchema: accepts lora_apply_mode", (t) => { + const result = sdcppConfigSchema.safeParse({ lora_apply_mode: "auto" }); + t.is(result.success, true); +}); + // ============================================ // diffusionStreamResponseSchema // ============================================ diff --git a/packages/sdk/tests-qvac/tests/desktop/executors/diffusion-executor.ts b/packages/sdk/tests-qvac/tests/desktop/executors/diffusion-executor.ts index 1cbd49628e..1449fc01d0 100644 --- a/packages/sdk/tests-qvac/tests/desktop/executors/diffusion-executor.ts +++ b/packages/sdk/tests-qvac/tests/desktop/executors/diffusion-executor.ts @@ -2,14 +2,39 @@ import * as fs from "node:fs"; import * as path from "node:path"; import { DiffusionExecutor as SharedDiffusionExecutor } from "../../shared/executors/diffusion-executor.js"; +function readImageBytes(name: string): Uint8Array { + const fileName = name.split("/").pop()!; + const filePath = path.resolve(process.cwd(), "assets/images", fileName); + return new Uint8Array(fs.readFileSync(filePath)); +} + export class DesktopDiffusionExecutor extends SharedDiffusionExecutor { protected override async resolveParams( p: Record, ): Promise> { - if (typeof p.init_image !== "string") return p; + const out: Record = { ...p }; + + if (p.init_image !== undefined) { + if (typeof p.init_image !== "string") { + throw new Error( + `init_image in test params must be a string filename, got: ${typeof p.init_image}`, + ); + } + out.init_image = readImageBytes(p.init_image); + } + + if (p.init_images !== undefined) { + if ( + !Array.isArray(p.init_images) || + !p.init_images.every((v) => typeof v === "string") + ) { + throw new Error( + "init_images in test params must be a string[] of image filenames", + ); + } + out.init_images = (p.init_images as string[]).map(readImageBytes); + } - const fileName = p.init_image.split("/").pop()!; - const filePath = path.resolve(process.cwd(), "assets/images", fileName); - return { ...p, init_image: new Uint8Array(fs.readFileSync(filePath)) }; + return out; } } diff --git a/packages/sdk/tests-qvac/tests/diffusion-tests.ts b/packages/sdk/tests-qvac/tests/diffusion-tests.ts index 36af93ec83..eecd8e15c5 100644 --- a/packages/sdk/tests-qvac/tests/diffusion-tests.ts +++ b/packages/sdk/tests-qvac/tests/diffusion-tests.ts @@ -199,6 +199,23 @@ export const diffusionStatsPresent = createDiffusionTest( { validation: "type", expectedType: "string" }, ); +// ---- FLUX.2 multi-reference fusion ---- + +export const diffusionFusionFlux2Basic = createDiffusionTest( + "diffusion-fusion-flux2-basic", + { + prompt: "a portrait using most visual traits from @image1 and the eyes from @image2", + init_images: ["cat.jpg", "elephant.jpg"], + width: 256, + height: 256, + steps: 4, + seed: 42, + }, + // Required by TestDefinition but effectively ignored - DiffusionExecutor.fusionFlux2Basic gates the result. + { validation: "type", expectedType: "array" }, + 600000, +); + // ---- error cases ---- export const diffusionEmptyPrompt = createDiffusionTest( @@ -227,5 +244,6 @@ export const diffusionTests = [ diffusionStreaming, diffusionStreamingProgress, diffusionStatsPresent, + diffusionFusionFlux2Basic, diffusionEmptyPrompt, ]; diff --git a/packages/sdk/tests-qvac/tests/mobile/executors/diffusion-executor.ts b/packages/sdk/tests-qvac/tests/mobile/executors/diffusion-executor.ts index efb5f6f1ea..111a2f9c13 100644 --- a/packages/sdk/tests-qvac/tests/mobile/executors/diffusion-executor.ts +++ b/packages/sdk/tests-qvac/tests/mobile/executors/diffusion-executor.ts @@ -28,18 +28,44 @@ export class MobileDiffusionExecutor extends SharedDiffusionExecutor { return await new File(fileUri).bytes(); } - protected override async resolveParams( - p: Record, - ): Promise> { - if (typeof p.init_image !== "string") return p; - - const fileName = p.init_image.split("/").pop()!; + private async resolveImageByName(name: string): Promise { + const fileName = name.split("/").pop()!; const images = await this.loadImageAssets(); const assetModule = images[fileName]; if (!assetModule) { throw new Error(`Image file not found in assets: ${fileName}`); } - const bytes = await this.resolveAssetBytes(assetModule); - return { ...p, init_image: bytes }; + return await this.resolveAssetBytes(assetModule); + } + + protected override async resolveParams( + p: Record, + ): Promise> { + const out: Record = { ...p }; + + if (p.init_image !== undefined) { + if (typeof p.init_image !== "string") { + throw new Error( + `init_image in test params must be a string filename, got: ${typeof p.init_image}`, + ); + } + out.init_image = await this.resolveImageByName(p.init_image); + } + + if (p.init_images !== undefined) { + if ( + !Array.isArray(p.init_images) || + !p.init_images.every((v) => typeof v === "string") + ) { + throw new Error( + "init_images in test params must be a string[] of image filenames", + ); + } + out.init_images = await Promise.all( + (p.init_images as string[]).map((n) => this.resolveImageByName(n)), + ); + } + + return out; } } diff --git a/packages/sdk/tests-qvac/tests/shared/executors/diffusion-executor.ts b/packages/sdk/tests-qvac/tests/shared/executors/diffusion-executor.ts index 6c64515ced..6e5b4037f5 100644 --- a/packages/sdk/tests-qvac/tests/shared/executors/diffusion-executor.ts +++ b/packages/sdk/tests-qvac/tests/shared/executors/diffusion-executor.ts @@ -8,6 +8,11 @@ import { import { AbstractModelExecutor } from "./abstract-model-executor.js"; import { diffusionTests } from "../../diffusion-tests.js"; +// Minimum byte-level divergence between fusion output and the same-seed txt2img +// baseline. SDK output is bit-exact at fixed seed (see seedReproducibility), so +// 1% is well above noise and catches a silent fallback when init_images is dropped. +const MIN_FUSION_DIVERGENCE_RATIO = 0.01; + export class DiffusionExecutor extends AbstractModelExecutor { pattern = /^diffusion-/; @@ -30,6 +35,9 @@ export class DiffusionExecutor extends AbstractModelExecutor Promise>)[testId]; if (handler) { @@ -47,7 +55,7 @@ export class DiffusionExecutor extends AbstractModelExecutor, ): DiffusionClientParams { - const params: DiffusionClientParams = { + const params: Omit = { modelId, prompt: p.prompt as string, }; @@ -63,9 +71,23 @@ export class DiffusionExecutor extends AbstractModelExecutor { + const p = await this.resolveParams(params as Record); + const modelId = await this.resources.ensureLoaded("diffusion"); + + if (p.seed === undefined) { + return { + passed: false, + output: "fusion test requires a fixed seed to compare against baseline", + }; + } + + try { + const fusionParams = this.buildParams(modelId, p); + const baselineParams = this.buildParams(modelId, { + ...p, + init_images: undefined, + }); + + const { outputs: fusionOutputs } = diffusion(fusionParams); + const fusionBuffers = await fusionOutputs; + + const { outputs: baselineOutputs } = diffusion(baselineParams); + const baselineBuffers = await baselineOutputs; + + if (fusionBuffers.length === 0 || baselineBuffers.length === 0) { + return { + passed: false, + output: `Missing output(s): fusion=${fusionBuffers.length}, baseline=${baselineBuffers.length}`, + }; + } + + const diffRatio = this.byteDiffRatio( + fusionBuffers[0]!, + baselineBuffers[0]!, + ); + const passed = diffRatio > MIN_FUSION_DIVERGENCE_RATIO; + const deltaPct = (diffRatio * 100).toFixed(2); + + return { + passed, + output: passed + ? `Fusion output differs from txt2img baseline (${deltaPct}% byte delta)` + : `Fusion output matches txt2img baseline too closely (${deltaPct}% byte delta)`, + }; + } catch (error) { + const errorMsg = error instanceof Error ? error.message : String(error); + return { + passed: false, + output: `FLUX.2 fusion comparison failed: ${errorMsg}`, + }; + } + } + + private byteDiffRatio(left: Uint8Array, right: Uint8Array): number { + const maxLength = Math.max(left.length, right.length); + if (maxLength === 0) { + return 0; + } + + const minLength = Math.min(left.length, right.length); + let changed = Math.abs(left.length - right.length); + + for (let i = 0; i < minLength; i++) { + if (left[i] !== right[i]) { + changed++; + } + } + + return changed / maxLength; + } }