Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ inline js_value_t* createInstance(js_env_t* env, js_callback_info_t* info) try {
config.diffusionModelPath = args.getMapEntry(1, "diffusionModelPath");
config.clipLPath = args.getMapEntry(1, "clipLPath");
config.clipGPath = args.getMapEntry(1, "clipGPath");
config.t5XxlPath = args.getMapEntry(1, "t5XxlPath");
config.llmPath = args.getMapEntry(1, "llmPath");
config.vaePath = args.getMapEntry(1, "vaePath");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,9 @@ const SdCtxHandlersMap SD_CTX_HANDLERS = {
// ── Prediction type ────────────────────────────────────────────────────────
// SD1.x → "eps" (epsilon prediction)
// SD2.x → "v" (v-prediction)
// FLUX.2 → "flux2_flow" (flow matching)
// Leave unset / use "eps" to rely on model auto-detection.
// SD3 → "flow" (flow matching)
// FLUX.2 → "flux2_flow" (FLUX.2 flow matching)
// Leave unset (or "auto") to use PREDICTION_COUNT sentinel for auto-detection.

{"prediction", [](SdCtxConfig& c, const std::string& v) {
if (v.empty() || v == "auto") c.prediction = PREDICTION_COUNT; // sentinel: auto-detect
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,19 +21,22 @@ namespace qvac_lib_inference_addon_sd {
* Consumed once in SdModel::load() where new_sd_ctx() is called.
*
* Supported models:
* SD1.x — uses modelPath (all-in-one .ckpt / .safetensors)
* SD2.x — same as SD1, add prediction="v" to the config
* SDXL — uses modelPath, add clipGModel if split; set force_sdxl_vae_conv_scale if needed
* FLUX.2 [klein] — uses diffusionModelPath + llmPath (Qwen3) + vaeModel
* SD1.x — uses modelPath (all-in-one .ckpt / .safetensors / GGUF)
* SD2.x — same as SD1, add prediction="v" to the config
* SDXL — uses modelPath (all-in-one GGUF); set force_sdxl_vae_conv_scale if needed
* SD3 Medium — all-in-one GGUF via modelPath (CLIP-L, CLIP-G, T5-XXL baked in)
* OR split layout: diffusionModelPath + clipLPath + clipGPath + t5XxlPath
* FLUX.2 [klein] — uses diffusionModelPath + llmPath (Qwen3) + vaePath
*/
struct SdCtxConfig {
// ── Model file paths ───────────────────────────────────────────────────────
// All paths are absolute; empty string = not used.

std::string modelPath; // model_path — SD1.x/SDXL all-in-one checkpoint
std::string diffusionModelPath; // diffusion_model_path — FLUX.2 [klein] standalone diffusion GGUF
std::string clipLPath; // clip_l_path — CLIP-L text encoder (SD1.x / SDXL)
std::string clipGPath; // clip_g_path — CLIP-G text encoder (SDXL)
std::string modelPath; // model_path — SD1.x/SD2.x/SDXL/SD3 all-in-one checkpoint
std::string diffusionModelPath; // diffusion_model_path — FLUX.2 [klein] or SD3 pure diffusion GGUF
std::string clipLPath; // clip_l_path — CLIP-L text encoder (SD3 split / SDXL)
std::string clipGPath; // clip_g_path — CLIP-G text encoder (SD3 split / SDXL)
std::string t5XxlPath; // t5xxl_path — T5-XXL text encoder (SD3 split)
std::string llmPath; // llm_path — LLM text encoder (FLUX.2 → Qwen3)
std::string vaePath; // vae_path — standalone VAE decoder weights
std::string taesdPath; // taesd_path — Tiny AutoEncoder (optional fast preview)
Expand Down Expand Up @@ -61,8 +64,11 @@ struct SdCtxConfig {

// ── Prediction type ───────────────────────────────────────────────────────
// PREDICTION_COUNT = auto-detect from model GGUF metadata (recommended).
// Override only if model lacks metadata: EPS_PRED (SD1.x), V_PRED (SD2.x),
// FLUX2_FLOW_PRED (FLUX.2 klein).
// Override if the GGUF lacks metadata (community conversions often do):
// EPS_PRED → SD1.x
// V_PRED → SD2.x
// FLOW_PRED → SD3 (flow matching)
// FLUX2_FLOW_PRED → FLUX.2 [klein]
prediction_t prediction = PREDICTION_COUNT; // auto

// ── LoRA (Low-Rank Adaptation) apply mode ─────────────────────────────────
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ void SdModel::load() {
params.diffusion_model_path = config_.diffusionModelPath.empty() ? nullptr : config_.diffusionModelPath.c_str();
params.clip_l_path = config_.clipLPath.empty() ? nullptr : config_.clipLPath.c_str();
params.clip_g_path = config_.clipGPath.empty() ? nullptr : config_.clipGPath.c_str();
params.t5xxl_path = config_.t5XxlPath.empty() ? nullptr : config_.t5XxlPath.c_str();
params.llm_path = config_.llmPath.empty() ? nullptr : config_.llmPath.c_str();
params.vae_path = config_.vaePath.empty() ? nullptr : config_.vaePath.c_str();
params.taesd_path = config_.taesdPath.empty() ? nullptr : config_.taesdPath.c_str();
Expand Down
143 changes: 143 additions & 0 deletions packages/qvac-lib-infer-diffusion/examples/generate-image-sd3.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
'use strict'

const path = require('bare-path')
const process = require('bare-process')
const fs = require('bare-fs')
const FilesystemDL = require('@qvac/dl-filesystem')
const ImgStableDiffusion = require('../index')

// ---------------------------------------------------------------------------
// Model file — downloaded via: ./scripts/download-model-sd3.sh
//
// sd3_medium_incl_clips.safetensors: official Stability AI safetensors from
// adamo1139/stable-diffusion-3-medium-ungated (ungated public mirror).
// Contains the diffusion model + CLIP-L + CLIP-G text encoders in one file.
// No separate encoder paths needed.
//
// NOTE: The gpustack GGUF variants (stable-diffusion-v3-medium-*.gguf) have
// zero KV metadata pairs and are NOT compatible with standard stable-diffusion.cpp.
// ---------------------------------------------------------------------------
// Directory holding the model weights, resolved relative to this example file.
const MODELS_DIR = path.resolve(__dirname, '../models')
// Directory the generated PNGs are written to; main() creates it if missing.
const OUTPUT_DIR = path.resolve(__dirname, '../output')

// File name of the all-in-one safetensors checkpoint (diffusion + CLIP-L +
// CLIP-G); main() passes it as modelName together with diskPath = MODELS_DIR.
const MODEL_NAME = 'sd3_medium_incl_clips.safetensors'

// ---------------------------------------------------------------------------
// Generation settings
//   SD3 Medium is a flow-matching model; cfg_scale usually sits in 4.5–7.0.
//   It was trained at 1024×1024, but 512×512 also works and runs faster.
// ---------------------------------------------------------------------------
const PROMPT =
  'a majestic red fox standing in a snowy forest at dusk,' +
  ' soft golden light through the pine trees,' +
  ' photorealistic, 8k, detailed fur'

const NEGATIVE_PROMPT = 'blurry, low quality, watermark, text, bad anatomy'

const WIDTH = 512
const HEIGHT = 512
const STEPS = 28 // typical SD3 Medium range is 20–30 steps
const CFG = 5.0 // flow-matching wants less guidance than SD1/SD2 (4.5–7.0)
const SEED = 42 // use -1 for a random seed

/**
 * Runs one SD3 Medium text-to-image generation end to end.
 *
 * Steps: create OUTPUT_DIR, print the run settings, load MODEL_NAME from
 * MODELS_DIR, run a single generation while rendering a progress bar, write
 * every returned image to OUTPUT_DIR as a PNG, then unload the model and close
 * the loader.
 *
 * @returns {Promise<void>} resolves once the model is unloaded and the loader
 *   is closed; rejects with whatever load/run/unload/close rejected with.
 */
async function main () {
  fs.mkdirSync(OUTPUT_DIR, { recursive: true })

  console.log('Stable Diffusion 3 Medium — text-to-image inference')
  console.log('=====================================================')
  console.log('Model :', MODEL_NAME)
  console.log('Prompt :', PROMPT)
  console.log('Steps :', STEPS)
  console.log('Size :', `${WIDTH}x${HEIGHT}`)
  console.log('CFG :', CFG)
  console.log('Seed :', SEED)
  console.log()

  const loader = new FilesystemDL({ dirPath: MODELS_DIR })

  const model = new ImgStableDiffusion(
    {
      loader,
      logger: console,
      diskPath: MODELS_DIR,
      modelName: MODEL_NAME
      // All-in-one safetensors: no clipLModel, clipGModel, t5XxlModel, or vaeModel.
      //
      // To add T5-XXL (better text following) without redownloading the main file:
      // t5XxlModel: 't5xxl_fp8_e4m3fn.safetensors' // download via download-model-sd3.sh
      //
      // NOTE(review): index.js treats a set t5XxlModel as a split layout and
      // passes modelName as diffusion_model_path instead of model_path —
      // confirm the all-in-one checkpoint still loads that way before
      // enabling this option.
    },
    {
      threads: 4,
      // SD3 uses flow-matching. The safetensors metadata allows auto-detection,
      // but we set these explicitly as safety overrides.
      prediction: 'flow', // FLOW_PRED — SD3 flow-matching
      flow_shift: '3.0' // SD3 Medium default; overrides INFINITY sentinel
    }
  )

  try {
    // ── 1. Load weights ───────────────────────────────────────────────────────
    console.log('Loading model weights...')
    const tLoad = Date.now()
    await model.load()
    console.log(`Loaded in ${((Date.now() - tLoad) / 1000).toFixed(1)}s\n`)

    // ── 2. Start generation ───────────────────────────────────────────────────
    console.log('Starting generation...')
    const tGen = Date.now()

    const response = await model.run({
      prompt: PROMPT,
      negative_prompt: NEGATIVE_PROMPT,
      steps: STEPS,
      width: WIDTH,
      height: HEIGHT,
      cfg_scale: CFG, // SD3 CFG — not the FLUX distilled 'guidance'
      sampling_method: 'euler', // SD3 flow-matching requires euler (not euler_a)
      seed: SEED
    })

    // ── 3. Stream progress + collect image bytes ──────────────────────────────
    // Binary updates are image payloads; string updates are parsed as JSON
    // progress ticks of the form { step, total }.
    const images = []

    await response
      .onUpdate((data) => {
        if (data instanceof Uint8Array) {
          images.push(data)
        } else if (typeof data === 'string') {
          try {
            const tick = JSON.parse(data)
            if ('step' in tick && 'total' in tick) {
              // Clamp to [0, 1]: a zero total would otherwise yield Infinity,
              // make repeat() throw, and silently drop the progress line.
              const ratio = tick.total > 0 ? tick.step / tick.total : 0
              const pct = Math.round(Math.min(1, Math.max(0, ratio)) * 100)
              const bar = '█'.repeat(Math.floor(pct / 5)).padEnd(20, '░')
              process.stdout.write(`\r [${bar}] ${tick.step}/${tick.total} steps`)
            }
          } catch (_) {
            // Best effort: string updates that are not progress JSON are ignored.
          }
        }
      })
      .await()

    process.stdout.write('\n')
    console.log(`\nGenerated in ${((Date.now() - tGen) / 1000).toFixed(1)}s`)
    console.log(`Got ${images.length} image(s)`)

    // ── 4. Save each image to disk ────────────────────────────────────────────
    for (let i = 0; i < images.length; i++) {
      const outPath = path.join(OUTPUT_DIR, `sd3_seed${SEED}_${i}.png`)
      fs.writeFileSync(outPath, images[i])
      console.log(`Saved → ${outPath}`)
    }
  } finally {
    console.log('\nUnloading model...')
    try {
      await model.unload()
    } finally {
      // Close the loader even when unload() rejects, so it is never leaked.
      await loader.close()
    }
    console.log('Done.')
  }
}

// Entry point: run the example and turn any rejection into a non-zero exit.
main().catch(err => {
  // A rejection value is not guaranteed to be an Error (or even defined), so
  // guard the property access; otherwise the handler itself would throw and
  // the exit code below would never be set.
  console.error('Fatal:', (err && err.message) || err)
  process.exit(1)
})
25 changes: 16 additions & 9 deletions packages/qvac-lib-infer-diffusion/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -81,16 +81,23 @@ class ImgStableDiffusion extends BaseInference {
})

// Route the primary model file to the correct stable-diffusion.cpp param:
// FLUX.2 [klein] uses a split layout — diffusion weights have no SD
// version metadata, so diffusion_model_path must be used.
// SD1.x / SD2.x / SDXL use all-in-one checkpoints with metadata, so
// model_path is correct.
// Heuristic: if llmModel is provided the caller is using FLUX.2 (which
// requires an LLM text encoder); otherwise assume an all-in-one SD model.
const isFluxLayout = !!this._llmModel
//
// model_path — all-in-one checkpoints that embed their own text
// encoders and version metadata (SD1.x, SD2.x, SDXL,
// SD3 all-in-one GGUF).
//
// diffusion_model_path — standalone diffusion-only weights that have no
// embedded SD metadata and require separate encoders:
// FLUX.2 [klein] → llmModel (Qwen3)
// SD3 pure GGUF → t5XxlModel (T5-XXL) + clipLModel + clipGModel
//
// Heuristic: if any separate encoder is provided (LLM for FLUX.2, T5-XXL
// for SD3 split) the caller is using a pure diffusion GGUF that must be
// loaded via diffusion_model_path.
const isSplitLayout = !!this._llmModel || !!this._t5XxlModel
const configurationParams = {
path: isFluxLayout ? '' : path.join(this._diskPath, this._modelName),
diffusionModelPath: isFluxLayout ? path.join(this._diskPath, this._modelName) : '',
path: isSplitLayout ? '' : path.join(this._diskPath, this._modelName),
diffusionModelPath: isSplitLayout ? path.join(this._diskPath, this._modelName) : '',
clipLPath: this._clipLModel ? path.join(this._diskPath, this._clipLModel) : '',
clipGPath: this._clipGModel ? path.join(this._diskPath, this._clipGModel) : '',
t5XxlPath: this._t5XxlModel ? path.join(this._diskPath, this._t5XxlModel) : '',
Expand Down
1 change: 1 addition & 0 deletions packages/qvac-lib-infer-diffusion/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"generate": "bare examples/generate-image.js",
"generate:sd2": "bare examples/generate-image-sd2.js",
"generate:sdxl": "bare examples/generate-image-sdxl.js",
"generate:sd3": "bare examples/generate-image-sd3.js",
"build:pack": "mkdir -p dist && npm pack --pack-destination dist",
"mobile:copy-prebuilds": "cp -r prebuilds/android-arm64 prebuilds/android-ia32 || echo 'Warning: Failed to copy sd prebuilds to android-ia32'; cp -r prebuilds/android-arm64 prebuilds/android-arm || echo 'Warning: Failed to copy sd prebuilds to android-arm'; cp -r prebuilds/android-arm64 prebuilds/android-x64 || echo 'Warning: Failed to copy sd prebuilds to android-x64'; cp -r prebuilds/ios-arm64 prebuilds/ios-arm64-simulator 2>/dev/null || echo 'iOS prebuilds already present'; cp -r prebuilds/ios-arm64 prebuilds/ios-x64-simulator 2>/dev/null || echo 'iOS prebuilds already present'",
"lint": "standard --ignore \"addon/**\"",
Expand Down
46 changes: 46 additions & 0 deletions packages/qvac-lib-infer-diffusion/scripts/download-model-sd3.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env bash
set -euo pipefail

# Stable Diffusion 3 Medium — official safetensors (ungated mirror).
#
# Source: adamo1139/stable-diffusion-3-medium-ungated
# Ungated re-upload of the official stabilityai/stable-diffusion-3-medium
# weights. No HuggingFace account or token required.
#
# File downloaded:
# sd3_medium_incl_clips.safetensors 5.97 GB
# All-in-one: diffusion model + CLIP-L + CLIP-G text encoders.
# No T5-XXL — text-following quality is slightly lower but RAM usage is
# comfortable on 16 GB unified memory.
#
# Optional — better quality with T5-XXL (adds ~4.9 GB download + ~5 GB RAM):
# Uncomment the t5xxl download block below, then enable the commented-out
# t5XxlModel option in examples/generate-image-sd3.js.
#
# Disk: ~6.0 GB RAM: ~7–8 GB at runtime (without T5-XXL)
# Minimum recommended: 12 GB unified memory

# Absolute path of the directory containing this script, independent of the
# caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Download target: the models/ directory next to this script's parent directory.
OUT="$(cd "$SCRIPT_DIR/.." && pwd)/models"
# Base URL and repository used to build the download URLs below.
HF="https://huggingface.co"
REPO="adamo1139/stable-diffusion-3-medium-ungated"

# Create the target directory up front; -p makes this a no-op when it exists.
mkdir -p "$OUT"

# dl URL DEST — download URL to DEST unless DEST already exists.
#
# The transfer is written to "DEST.part" and renamed to DEST only after curl
# succeeds, so an interrupted run (Ctrl-C, kill, power loss) can never leave a
# truncated file that a later run would report as "exists". On failure the
# ".part" file is kept so the next run resumes it through curl's `-C -`.
# Exits the whole script with status 1 when the download fails.
dl() {
  local url="$1" dest="$2"
  local part="$dest.part"
  if [[ -f "$dest" ]]; then
    echo "exists: $(basename "$dest")"
    return
  fi
  echo "downloading: $(basename "$dest")"
  curl -fL --progress-bar --retry 5 --retry-delay 3 --retry-connrefused -C - -o "$part" "$url" \
    || { echo "download failed: $(basename "$dest") (partial data kept in $(basename "$part"); re-run to resume)" >&2; exit 1; }
  # Publish the finished file only once it is complete.
  mv -f "$part" "$dest"
}

# All-in-one: diffusion model + CLIP-L + CLIP-G (no T5-XXL)
dl "$HF/$REPO/resolve/main/sd3_medium_incl_clips.safetensors" \
"$OUT/sd3_medium_incl_clips.safetensors"

# Optional: T5-XXL FP8 for much better prompt understanding (~4.89 GB)
# Uncomment to download:
# dl "$HF/$REPO/resolve/main/text_encoders/t5xxl_fp8_e4m3fn.safetensors" \
# "$OUT/t5xxl_fp8_e4m3fn.safetensors"

echo "done → $OUT"