diff --git a/docs/website/content/docs/sdk/examples/ai-tasks/image-generation.mdx b/docs/website/content/docs/sdk/examples/ai-tasks/image-generation.mdx index bb074acf1a..34bb66324b 100644 --- a/docs/website/content/docs/sdk/examples/ai-tasks/image-generation.mdx +++ b/docs/website/content/docs/sdk/examples/ai-tasks/image-generation.mdx @@ -1,11 +1,13 @@ --- title: Image generation -description: Text-to-image generation using Stable Diffusion. +description: Text-to-image and image-to-image generation using Stable Diffusion. --- ## Overview -Image generation uses [`qvac-ext-stable-diffusion.cpp`](https://github.com/tetherto/qvac-ext-stable-diffusion.cpp) as the inference engine. Load a supported model using `modelType: "diffusion"`. Then, provide a text `prompt` describing the image to generate. +Image generation uses [`qvac-ext-stable-diffusion.cpp`](https://github.com/tetherto/qvac-ext-stable-diffusion.cpp) as the inference engine. Load a supported model using `modelType: "diffusion"`. Then, provide a text `prompt` describing the image to generate. + +For image-to-image, also pass `init_image` (a `Uint8Array` of PNG bytes) — the model transforms the input guided by the prompt instead of starting from noise. `diffusion()` returns one or more PNG images as `Uint8Array` buffers. Use `progressStream` to track generation progress step-by-step. @@ -22,14 +24,36 @@ For how to use each function, see [SDK — API reference](/sdk/api/). Supported model families and their file layouts: +- **FLUX.2-klein**: split layout — diffusion model `*.gguf` + LLM text encoder `*.gguf` (via `llmModelSrc`) + VAE `*.safetensors` (via `vaeModelSrc`). - **SD1.x, SD2.x**: single all-in-one `*.gguf` file. No companion files needed. - **SDXL, SD3**: may require separate CLIP/T5 text encoder files (`clipLModelSrc`, `clipGModelSrc`, `t5XxlModelSrc`) in `modelConfig` depending on the model variant. 
-- **FLUX.2-klein**: split layout — diffusion model `*.gguf` + LLM text encoder `*.gguf` (via `llmModelSrc`) + VAE `*.safetensors` (via `vaeModelSrc`). For models available as constants, see [SDK — Models](/sdk/getting-started#models). ## Examples +### FLUX.2-klein + +The following script shows text-to-image generation using FLUX.2-klein with its split-layout model (separate diffusion model, LLM text encoder, and VAE): + + + + + +```js file=/packages/sdk/dist/examples/diffusion-flux2-klein.js title="diffusion-flux2-klein.js" lineNumbers +``` + + + + + + +```ts file=/packages/sdk/examples/diffusion-flux2-klein.ts title="diffusion-flux2-klein.ts" lineNumbers +``` + + + + ### Stable Diffusion The following script shows a minimal text-to-image generation example using a single all-in-one SD 2.1 model: @@ -52,15 +76,22 @@ The following script shows a minimal text-to-image generation example using a si -### FLUX.2-klein -The following script shows text-to-image generation using FLUX.2-klein with its split-layout model (separate diffusion model, LLM text encoder, and VAE): + +### Image-to-image + +Pass `init_image` to transform an existing image guided by a text prompt. Behavior depends on the model family: + +- **FLUX.2**: in-context conditioning. Requires `prediction: "flux2_flow"` in `modelConfig` at `loadModel()` time; `strength` is ignored on this path. +- **SD / SDXL / SD3**: SDEdit-style. Use `strength` to control how much the source is preserved (`0` = keep source, `1` = ignore source). 
+ +The following script loads FLUX.2-klein in split-layout and transforms an input image using in-context conditioning (`prediction: "flux2_flow"`): -```js file=/packages/sdk/dist/examples/diffusion-flux2-klein.js title="diffusion-flux2-klein.js" lineNumbers +```js file=/packages/sdk/dist/examples/diffusion-flux2-klein-img2img.js title="diffusion-flux2-klein-img2img.js" lineNumbers ``` @@ -68,7 +99,7 @@ The following script shows text-to-image generation using FLUX.2-klein with its -```ts file=/packages/sdk/examples/diffusion-flux2-klein.ts title="diffusion-flux2-klein.ts" lineNumbers +```ts file=/packages/sdk/examples/diffusion-flux2-klein-img2img.ts title="diffusion-flux2-klein-img2img.ts" lineNumbers ``` diff --git a/docs/website/content/docs/sdk/getting-started/index.mdx b/docs/website/content/docs/sdk/getting-started/index.mdx index cc2c7f178a..51b878911b 100644 --- a/docs/website/content/docs/sdk/getting-started/index.mdx +++ b/docs/website/content/docs/sdk/getting-started/index.mdx @@ -51,7 +51,7 @@ The JS SDK is cross-platform, type-safe, and pluggable, exposing all QVAC capabi * [**Transcription:**](/sdk/examples/ai-tasks/transcription) automatic speech recognition (ASR) for speech-to-text via [`qvac-ext-lib-whisper.cpp`](https://github.com/tetherto/qvac-ext-lib-whisper.cpp) or [NVIDIA Parakeet](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2). * [**Text-to-Speech:**](/sdk/examples/ai-tasks/text-to-speech) speech synthesis for text-to-speech (TTS) via [ONNX Runtime](https://onnxruntime.ai). * [**OCR:**](/sdk/examples/ai-tasks/ocr) optical character recognition (OCR) for extracting text from images via ONNX runtime. -* [**Image generation:**](/sdk/examples/ai-tasks/image-generation) text-to-image generation via [`qvac-ext-stable-diffusion.cpp`](https://github.com/tetherto/qvac-ext-stable-diffusion.cpp). 
+* [**Image generation:**](/sdk/examples/ai-tasks/image-generation) text-to-image and image-to-image generation via [`qvac-ext-stable-diffusion.cpp`](https://github.com/tetherto/qvac-ext-stable-diffusion.cpp). * [**Multimodal:**](/sdk/examples/ai-tasks/multimodal) LLM inference over text, images, and other media within a single conversation context. * [**Fine-tuning:**](/sdk/examples/ai-tasks/fine-tuning) adapting LLMs to domain-specific tasks via LoRA. * [**RAG:**](/sdk/examples/ai-tasks/rag) out-of-the-box retrieval-augmented generation workflow. diff --git a/packages/sdk/examples/diffusion-flux2-klein-img2img.ts b/packages/sdk/examples/diffusion-flux2-klein-img2img.ts new file mode 100644 index 0000000000..41c8c06047 --- /dev/null +++ b/packages/sdk/examples/diffusion-flux2-klein-img2img.ts @@ -0,0 +1,77 @@ +import { + loadModel, + unloadModel, + diffusion, + FLUX_2_KLEIN_4B_Q4_0, + FLUX_2_KLEIN_4B_VAE, + QWEN3_4B_Q4_K_M, +} from "@qvac/sdk"; +import fs from "fs"; +import path from "path"; + +// img2img with FLUX.2 [klein] split-layout — uses in-context conditioning ("flux2_flow"). 
+
+const inputPath = process.argv[2]; // required: path to the source image to transform
+const prompt = process.argv[3] || "oil painting style, vibrant colors";
+const outputDir = process.argv[4] || ".";
+const diffusionModelSrc = process.argv[5] || FLUX_2_KLEIN_4B_Q4_0;
+const llmModelSrc = process.argv[6] || QWEN3_4B_Q4_K_M;
+const vaeModelSrc = process.argv[7] || FLUX_2_KLEIN_4B_VAE;
+
+if (!inputPath) {
+  console.error("❌ Error: input image path is required");
+  console.error(
+    "Usage: bun run bare:example dist/examples/diffusion-flux2-klein-img2img.js <inputImage> [prompt] [outputDir] [diffusionModelSrc] [llmModelSrc] [vaeModelSrc]",
+  );
+  process.exit(1);
+}
+
+try {
+  console.log("Loading FLUX.2 [klein] split-layout model...");
+  const modelId = await loadModel({
+    modelSrc: diffusionModelSrc,
+    modelType: "diffusion",
+    modelConfig: {
+      device: "gpu",
+      threads: 4,
+      llmModelSrc, // split layout: LLM text encoder is a separate file
+      vaeModelSrc, // split layout: VAE is a separate file
+      prediction: "flux2_flow", // set at load time; selects the in-context img2img path
+    },
+    onProgress: (p) => console.log(`Loading: ${p.percentage.toFixed(1)}%`),
+  });
+  console.log(`Model loaded: ${modelId}`);
+
+  const init_image = new Uint8Array(fs.readFileSync(inputPath)); // raw bytes of the input file
+  console.log(`\nTransforming "${inputPath}" with prompt: "${prompt}"`);
+
+  const { progressStream, outputs, stats } = diffusion({
+    modelId,
+    prompt,
+    init_image,
+    steps: 20,
+    guidance: 3.5,
+    cfg_scale: 1,
+    seed: -1,
+  });
+
+  for await (const { step, totalSteps } of progressStream) {
+    process.stdout.write(`\rStep ${step}/${totalSteps}`); // "\r" rewrites the same terminal line
+  }
+  console.log();
+
+  const buffers = await outputs;
+  for (let i = 0; i < buffers.length; i++) {
+    const outputPath = path.join(outputDir, `flux2_img2img_${i}.png`);
+    fs.writeFileSync(outputPath, buffers[i]!);
+    console.log(`Saved: ${outputPath}`);
+  }
+
+  console.log("\nStats:", await stats);
+  await unloadModel({ modelId, clearStorage: false });
+  console.log("Done.");
+  process.exit(0);
+} catch (error) {
+  console.error("❌ Error:", error);
+  process.exit(1);
+}