tetherto · aegioscy · Mar 1, 2026 · Mar 1, 2026 · Mar 1, 2026
diff --git a/packages/qvac-lib-infer-diffusion/addon/src/addon/AddonJs.hpp b/packages/qvac-lib-infer-diffusion/addon/src/addon/AddonJs.hpp
@@ -34,6 +34,7 @@ inline js_value_t* createInstance(js_env_t* env, js_callback_info_t* info) try {
   config.diffusionModelPath = args.getMapEntry(1, "diffusionModelPath");
   config.clipLPath          = args.getMapEntry(1, "clipLPath");
   config.clipGPath          = args.getMapEntry(1, "clipGPath");
+  config.t5XxlPath          = args.getMapEntry(1, "t5XxlPath");
   config.llmPath            = args.getMapEntry(1, "llmPath");
   config.vaePath            = args.getMapEntry(1, "vaePath");
 

diff --git a/packages/qvac-lib-infer-diffusion/addon/src/handlers/SdCtxHandlers.cpp b/packages/qvac-lib-infer-diffusion/addon/src/handlers/SdCtxHandlers.cpp
@@ -99,8 +99,9 @@ const SdCtxHandlersMap SD_CTX_HANDLERS = {
   // ── Prediction type ────────────────────────────────────────────────────────
   // SD1.x  → "eps"         (epsilon prediction)
   // SD2.x  → "v"           (v-prediction)
-  // FLUX.2 → "flux2_flow"  (flow matching)
-  // Leave unset / use "eps" to rely on model auto-detection.
+  // SD3    → "flow"        (flow matching)
+  // FLUX.2 → "flux2_flow"  (FLUX.2 flow matching)
+  // Leave unset (or "auto") to use PREDICTION_COUNT sentinel for auto-detection.
 
   {"prediction", [](SdCtxConfig& c, const std::string& v) {
     if      (v.empty() || v == "auto") c.prediction = PREDICTION_COUNT; // sentinel: auto-detect

diff --git a/packages/qvac-lib-infer-diffusion/addon/src/handlers/SdCtxHandlers.hpp b/packages/qvac-lib-infer-diffusion/addon/src/handlers/SdCtxHandlers.hpp
@@ -21,19 +21,22 @@ namespace qvac_lib_inference_addon_sd {
  * Consumed once in SdModel::load() where new_sd_ctx() is called.
  *
  * Supported models:
- *   SD1.x  — uses modelPath (all-in-one .ckpt / .safetensors)
- *   SD2.x  — same as SD1, add prediction="v" to the config
- *   SDXL   — uses modelPath, add clipGModel if split; set force_sdxl_vae_conv_scale if needed
- *   FLUX.2 [klein] — uses diffusionModelPath + llmPath (Qwen3) + vaeModel
+ *   SD1.x        — uses modelPath (all-in-one .ckpt / .safetensors / GGUF)
+ *   SD2.x        — same as SD1, add prediction="v" to the config
+ *   SDXL         — uses modelPath (all-in-one GGUF); set force_sdxl_vae_conv_scale if needed
+ *   SD3 Medium   — all-in-one checkpoint via modelPath (CLIP-L + CLIP-G baked in; T5-XXL optional)
+ *                  OR split layout: diffusionModelPath + clipLPath + clipGPath + t5XxlPath
+ *   FLUX.2 [klein] — uses diffusionModelPath + llmPath (Qwen3) + vaePath
  */
 struct SdCtxConfig {
   // ── Model file paths ───────────────────────────────────────────────────────
   // All paths are absolute; empty string = not used.
 
-  std::string modelPath;           // model_path            — SD1.x/SDXL all-in-one checkpoint
-  std::string diffusionModelPath;  // diffusion_model_path  — FLUX.2 [klein] standalone diffusion GGUF
-  std::string clipLPath;           // clip_l_path           — CLIP-L text encoder (SD1.x / SDXL)
-  std::string clipGPath;           // clip_g_path           — CLIP-G text encoder (SDXL)
+  std::string modelPath;           // model_path            — SD1.x/SD2.x/SDXL/SD3 all-in-one checkpoint
+  std::string diffusionModelPath;  // diffusion_model_path  — FLUX.2 [klein] or SD3 pure diffusion GGUF
+  std::string clipLPath;           // clip_l_path           — CLIP-L text encoder (SD3 split / SDXL)
+  std::string clipGPath;           // clip_g_path           — CLIP-G text encoder (SD3 split / SDXL)
+  std::string t5XxlPath;           // t5xxl_path            — T5-XXL text encoder (SD3 split)
   std::string llmPath;             // llm_path              — LLM text encoder (FLUX.2 → Qwen3)
   std::string vaePath;             // vae_path              — standalone VAE decoder weights
   std::string taesdPath;           // taesd_path            — Tiny AutoEncoder (optional fast preview)
@@ -61,8 +64,11 @@ struct SdCtxConfig {
 
   // ── Prediction type ───────────────────────────────────────────────────────
   // PREDICTION_COUNT = auto-detect from model GGUF metadata (recommended).
-  // Override only if model lacks metadata: EPS_PRED (SD1.x), V_PRED (SD2.x),
-  // FLUX2_FLOW_PRED (FLUX.2 klein).
+  // Override if the GGUF lacks metadata (community conversions often do):
+  //   EPS_PRED        → SD1.x
+  //   V_PRED          → SD2.x
+  //   FLOW_PRED       → SD3 (flow matching)
+  //   FLUX2_FLOW_PRED → FLUX.2 [klein]
   prediction_t prediction = PREDICTION_COUNT;  // auto
 
   // ── LoRA (Low-Rank Adaptation) apply mode ─────────────────────────────────

diff --git a/packages/qvac-lib-infer-diffusion/addon/src/model-interface/SdModel.cpp b/packages/qvac-lib-infer-diffusion/addon/src/model-interface/SdModel.cpp
@@ -86,6 +86,7 @@ void SdModel::load() {
   params.diffusion_model_path = config_.diffusionModelPath.empty() ? nullptr : config_.diffusionModelPath.c_str();
   params.clip_l_path          = config_.clipLPath.empty()          ? nullptr : config_.clipLPath.c_str();
   params.clip_g_path          = config_.clipGPath.empty()          ? nullptr : config_.clipGPath.c_str();
+  params.t5xxl_path           = config_.t5XxlPath.empty()          ? nullptr : config_.t5XxlPath.c_str();
   params.llm_path             = config_.llmPath.empty()            ? nullptr : config_.llmPath.c_str();
   params.vae_path             = config_.vaePath.empty()            ? nullptr : config_.vaePath.c_str();
   params.taesd_path           = config_.taesdPath.empty()          ? nullptr : config_.taesdPath.c_str();

diff --git a/packages/qvac-lib-infer-diffusion/examples/generate-image-sd3.js b/packages/qvac-lib-infer-diffusion/examples/generate-image-sd3.js
@@ -0,0 +1,143 @@
+'use strict'
+
+const path = require('bare-path')
+const process = require('bare-process')
+const fs = require('bare-fs')
+const FilesystemDL = require('@qvac/dl-filesystem')
+const ImgStableDiffusion = require('../index')
+
+// ---------------------------------------------------------------------------
+// Model file — downloaded via: ./scripts/download-model-sd3.sh
+//
+// sd3_medium_incl_clips.safetensors: official Stability AI safetensors from
+// adamo1139/stable-diffusion-3-medium-ungated (ungated public mirror).
+// Contains the diffusion model + CLIP-L + CLIP-G text encoders in one file.
+// No separate encoder paths needed.
+//
+// NOTE: The gpustack GGUF variants (stable-diffusion-v3-medium-*.gguf) have
+// zero KV metadata pairs and are NOT compatible with standard stable-diffusion.cpp.
+// ---------------------------------------------------------------------------
+const MODELS_DIR = path.resolve(__dirname, '../models')
+const OUTPUT_DIR = path.resolve(__dirname, '../output')
+
+// All-in-one safetensors — diffusion + CLIP-L + CLIP-G:
+const MODEL_NAME = 'sd3_medium_incl_clips.safetensors'
+
+// ---------------------------------------------------------------------------
+// Generation params
+// SD3 Medium uses flow-matching. cfg_scale 4.5–7.0 is the typical range.
+// 512×512 works fine; SD3 was trained at 1024×1024 but smaller is faster.
+// ---------------------------------------------------------------------------
+const PROMPT = [
+  'a majestic red fox standing in a snowy forest at dusk,',
+  'soft golden light through the pine trees,',
+  'photorealistic, 8k, detailed fur'
+].join(' ')
+
+const NEGATIVE_PROMPT = 'blurry, low quality, watermark, text, bad anatomy'
+
+const STEPS    = 28    // SD3 Medium typically 20–30 steps
+const WIDTH    = 512
+const HEIGHT   = 512
+const CFG      = 5.0   // SD3 flow-matching; lower than SD1/SD2 (4.5–7.0 range)
+const SEED     = 42    // -1 = random
+
+async function main () { // Load the SD3 model, run one text-to-image job, write each returned image to OUTPUT_DIR, then unload.
+  fs.mkdirSync(OUTPUT_DIR, { recursive: true })
+
+  console.log('Stable Diffusion 3 Medium — text-to-image inference')
+  console.log('=====================================================')
+  console.log('Model  :', MODEL_NAME)
+  console.log('Prompt :', PROMPT)
+  console.log('Steps  :', STEPS)
+  console.log('Size   :', `${WIDTH}x${HEIGHT}`)
+  console.log('CFG    :', CFG)
+  console.log('Seed   :', SEED)
+  console.log()
+
+  const loader = new FilesystemDL({ dirPath: MODELS_DIR })
+
+  const model = new ImgStableDiffusion(
+    {
+      loader,
+      logger: console,
+      diskPath: MODELS_DIR,
+      modelName: MODEL_NAME
+      // All-in-one safetensors: no clipLModel, clipGModel, t5XxlModel, or vaeModel.
+      // Optional T5-XXL (better text following; download via download-model-sd3.sh):
+      //   t5XxlModel: 't5xxl_fp8_e4m3fn.safetensors'
+      // NOTE(review): index.js routes modelName to diffusion_model_path whenever t5XxlModel is set — confirm that works for this all-in-one file.
+    },
+    {
+      threads: 4,
+      // SD3 uses flow-matching. The safetensors metadata allows auto-detection,
+      // but we set these explicitly as safety overrides.
+      prediction: 'flow',   // FLOW_PRED — SD3 flow-matching
+      flow_shift: '3.0'     // SD3 Medium default; overrides INFINITY sentinel
+    }
+  )
+
+  try {
+    // ── 1. Load weights ───────────────────────────────────────────────────────
+    console.log('Loading model weights...')
+    const tLoad = Date.now()
+    await model.load()
+    console.log(`Loaded in ${((Date.now() - tLoad) / 1000).toFixed(1)}s\n`)
+
+    // ── 2. Start generation ───────────────────────────────────────────────────
+    console.log('Starting generation...')
+    const tGen = Date.now()
+
+    const response = await model.run({
+      prompt: PROMPT,
+      negative_prompt: NEGATIVE_PROMPT,
+      steps: STEPS,
+      width: WIDTH,
+      height: HEIGHT,
+      cfg_scale: CFG,            // SD3 CFG — not the FLUX distilled 'guidance'
+      sampling_method: 'euler',  // SD3 flow-matching requires euler (not euler_a)
+      seed: SEED
+    })
+
+    // ── 3. Stream progress + collect image bytes ──────────────────────────────
+    const images = []
+
+    await response
+      .onUpdate((data) => {
+        if (data instanceof Uint8Array) {
+          images.push(data)
+        } else if (typeof data === 'string') {
+          try {
+            const tick = JSON.parse(data)
+            if ('step' in tick && 'total' in tick) {
+              const pct = Math.round((tick.step / tick.total) * 100)
+              const bar = '█'.repeat(Math.floor(pct / 5)).padEnd(20, '░') // 20-cell bar, one filled cell per 5%
+              process.stdout.write(`\r  [${bar}] ${tick.step}/${tick.total} steps`)
+            }
+          } catch (_) {} // best-effort progress: any string that fails to parse or render is silently ignored
+        }
+      })
+      .await()
+
+    process.stdout.write('\n')
+    console.log(`\nGenerated in ${((Date.now() - tGen) / 1000).toFixed(1)}s`)
+    console.log(`Got ${images.length} image(s)`)
+
+    // ── 4. Save each image to disk ────────────────────────────────────────────
+    for (let i = 0; i < images.length; i++) {
+      const outPath = path.join(OUTPUT_DIR, `sd3_seed${SEED}_${i}.png`)
+      fs.writeFileSync(outPath, images[i])
+      console.log(`Saved → ${outPath}`)
+    }
+  } finally {
+    console.log('\nUnloading model...')
+    await model.unload() // NOTE(review): if this rejects, loader.close() below is skipped
+    await loader.close()
+    console.log('Done.')
+  }
+}
+
+main().catch(err => { // top-level failure handler: print the error and exit with status 1
+  console.error('Fatal:', err.message || err)
+  process.exit(1)
+})
diff --git a/packages/qvac-lib-infer-diffusion/index.js b/packages/qvac-lib-infer-diffusion/index.js
@@ -81,16 +81,23 @@ class ImgStableDiffusion extends BaseInference {
       })
 
       // Route the primary model file to the correct stable-diffusion.cpp param:
-      //   FLUX.2 [klein] uses a split layout — diffusion weights have no SD
-      //   version metadata, so diffusion_model_path must be used.
-      //   SD1.x / SD2.x / SDXL use all-in-one checkpoints with metadata, so
-      //   model_path is correct.
-      // Heuristic: if llmModel is provided the caller is using FLUX.2 (which
-      // requires an LLM text encoder); otherwise assume an all-in-one SD model.
-      const isFluxLayout = !!this._llmModel
+      //
+      //   model_path           — all-in-one checkpoints that embed their own text
+      //                          encoders and version metadata (SD1.x, SD2.x, SDXL,
+      //                          SD3 all-in-one checkpoint).
+      //
+      //   diffusion_model_path — standalone diffusion-only weights that have no
+      //                          embedded SD metadata and require separate encoders:
+      //                            FLUX.2 [klein] → llmModel (Qwen3)
+      //                            SD3 pure GGUF  → t5XxlModel (T5-XXL) + clipLModel + clipGModel
+      //
+      // Heuristic: llmModel (FLUX.2) or t5XxlModel (SD3 split) marks the primary
+      // file as diffusion-only weights to be loaded via diffusion_model_path.
+      // clipLModel / clipGModel on their own do NOT switch the layout.
+      const isSplitLayout = !!this._llmModel || !!this._t5XxlModel
       const configurationParams = {
-        path: isFluxLayout ? '' : path.join(this._diskPath, this._modelName),
-        diffusionModelPath: isFluxLayout ? path.join(this._diskPath, this._modelName) : '',
+        path: isSplitLayout ? '' : path.join(this._diskPath, this._modelName),
+        diffusionModelPath: isSplitLayout ? path.join(this._diskPath, this._modelName) : '',
         clipLPath: this._clipLModel ? path.join(this._diskPath, this._clipLModel) : '',
         clipGPath: this._clipGModel ? path.join(this._diskPath, this._clipGModel) : '',
         t5XxlPath: this._t5XxlModel ? path.join(this._diskPath, this._t5XxlModel) : '',

diff --git a/packages/qvac-lib-infer-diffusion/package.json b/packages/qvac-lib-infer-diffusion/package.json
@@ -9,6 +9,7 @@
     "generate": "bare examples/generate-image.js",
     "generate:sd2": "bare examples/generate-image-sd2.js",
     "generate:sdxl": "bare examples/generate-image-sdxl.js",
+    "generate:sd3": "bare examples/generate-image-sd3.js",
     "build:pack": "mkdir -p dist && npm pack --pack-destination dist",
     "mobile:copy-prebuilds": "cp -r prebuilds/android-arm64 prebuilds/android-ia32 || echo 'Warning: Failed to copy sd prebuilds to android-ia32'; cp -r prebuilds/android-arm64 prebuilds/android-arm || echo 'Warning: Failed to copy sd prebuilds to android-arm'; cp -r prebuilds/android-arm64 prebuilds/android-x64 || echo 'Warning: Failed to copy sd prebuilds to android-x64'; cp -r prebuilds/ios-arm64 prebuilds/ios-arm64-simulator 2>/dev/null || echo 'iOS prebuilds already present'; cp -r prebuilds/ios-arm64 prebuilds/ios-x64-simulator 2>/dev/null || echo 'iOS prebuilds already present'",
     "lint": "standard --ignore \"addon/**\"",

diff --git a/packages/qvac-lib-infer-diffusion/scripts/download-model-sd3.sh b/packages/qvac-lib-infer-diffusion/scripts/download-model-sd3.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Stable Diffusion 3 Medium — official safetensors (ungated mirror).
+#
+# Source: adamo1139/stable-diffusion-3-medium-ungated
+#         Ungated re-upload of the official stabilityai/stable-diffusion-3-medium
+#         weights.  No HuggingFace account or token required.
+#
+# File downloaded:
+#   sd3_medium_incl_clips.safetensors    5.97 GB
+#     All-in-one: diffusion model + CLIP-L + CLIP-G text encoders.
+#     No T5-XXL — text-following quality is slightly lower but RAM usage is
+#     comfortable on 16 GB unified memory.
+#
+# Optional — better quality with T5-XXL (adds ~4.9 GB download + ~5 GB RAM):
+#   Uncomment the t5xxl download block below and set t5XxlModel as noted in examples/generate-image-sd3.js.
+#
+# Disk: ~6.0 GB    RAM: ~7–8 GB at runtime (without T5-XXL)
+# Minimum recommended: 12 GB unified memory
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+OUT="$(cd "$SCRIPT_DIR/.." && pwd)/models"
+HF="https://huggingface.co"
+REPO="adamo1139/stable-diffusion-3-medium-ungated"
+
+mkdir -p "$OUT"
+
+dl() {  # dl URL DEST: fetch URL into DEST unless DEST already exists; exits 1 if the download fails
+  local url="$1" dest="$2" part="$2.part"  # download into DEST.part and rename only on success, so DEST is never a truncated file
+  [[ -f "$dest" ]] && echo "exists: $(basename "$dest")" && return
+  echo "downloading: $(basename "$dest")"
+  curl -fL --progress-bar --retry 5 --retry-delay 3 --retry-connrefused -C - -o "$part" "$url" \
+    && mv -f "$part" "$dest" || { echo "failed: $(basename "$dest") (partial kept; rerun to resume)" >&2; exit 1; }
+}
+
+# All-in-one: diffusion model + CLIP-L + CLIP-G (no T5-XXL)
+dl "$HF/$REPO/resolve/main/sd3_medium_incl_clips.safetensors" \
+   "$OUT/sd3_medium_incl_clips.safetensors"
+
+# Optional: T5-XXL FP8 for much better prompt understanding (~4.89 GB)
+# Uncomment to download:
+# dl "$HF/$REPO/resolve/main/text_encoders/t5xxl_fp8_e4m3fn.safetensors" \
+#    "$OUT/t5xxl_fp8_e4m3fn.safetensors"
+
+echo "done → $OUT"