From b2b660f76f9d869a2d1e0ebcfd732b646e6241d9 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Sun, 24 May 2026 14:26:07 +0000
Subject: [PATCH 01/92] Studio: add local diffusion image generation page

Backend
- core/inference/diffusion.py: DiffusionBackend singleton that loads
  diffusion GGUFs from Hugging Face via diffusers.GGUFQuantizationConfig
  and runs them on the active CUDA / MPS / CPU device. Supports FLUX.2,
  FLUX.2 klein, FLUX.1, Qwen-Image, Stable Diffusion 3, and SDXL.
- routes/inference.py: POST /api/inference/images/load,
  POST /api/inference/images/generate, POST /api/inference/images/unload,
  GET /api/inference/images/status mirroring the llama-server lifecycle.
- models/inference.py: DiffusionLoadRequest, DiffusionGenerateRequest,
  DiffusionGenerateResponse pydantic schemas with prompt / step / size
  validation up front so callers get clear 422s rather than VAE crashes.
- requirements/no-torch-runtime.txt: add gguf (unpinned) alongside the
  existing diffusers entry so GGUFQuantizationConfig works out of the box.
- tests/test_diffusion_backend.py + tests/test_diffusion_routes.py:
  27 unit tests covering family detection, validation, lifecycle, and
  the full FastAPI round trip with the backend stubbed. No torch /
  diffusers / GPU required to run.

Frontend
- features/images/: standalone images-page.tsx with curated model picker
  (FLUX.2 klein 4B / 9B, FLUX.2 dev, FLUX.1 dev), HF token field,
  prompt + negative prompt, resolution presets, steps + guidance
  sliders, seed input, and a result gallery that renders base64 PNGs
  inline.
- app/routes/images.tsx: lazy /images route wired into router.tsx.
- components/app-sidebar.tsx: PaintBrush02Icon nav item between
  Recipes and Export, hidden in chat-only mode.
---
 studio/backend/core/inference/diffusion.py    | 480 ++++++++++++++++++
 studio/backend/models/inference.py            |  67 +++
 .../backend/requirements/no-torch-runtime.txt |   3 +
 studio/backend/routes/inference.py            | 127 +++++
 .../backend/tests/test_diffusion_backend.py   | 396 +++++++++++++++
 studio/backend/tests/test_diffusion_routes.py | 190 +++++++
 studio/frontend/src/app/router.tsx            |   2 +
 studio/frontend/src/app/routes/images.tsx     |  21 +
 .../frontend/src/components/app-sidebar.tsx   |  13 +
 studio/frontend/src/features/images/api.ts    | 105 ++++
 .../src/features/images/images-page.tsx       | 425 ++++++++++++++++
 studio/frontend/src/features/images/index.ts  |   5 +
 12 files changed, 1834 insertions(+)
 create mode 100644 studio/backend/core/inference/diffusion.py
 create mode 100644 studio/backend/tests/test_diffusion_backend.py
 create mode 100644 studio/backend/tests/test_diffusion_routes.py
 create mode 100644 studio/frontend/src/app/routes/images.tsx
 create mode 100644 studio/frontend/src/features/images/api.ts
 create mode 100644 studio/frontend/src/features/images/images-page.tsx
 create mode 100644 studio/frontend/src/features/images/index.ts

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
new file mode 100644
index 0000000000..c44f132142
--- /dev/null
+++ b/studio/backend/core/inference/diffusion.py
@@ -0,0 +1,480 @@
+# SPDX-License-Identifier: AGPL-3.0-only
+# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
+
+"""Diffusion image generation backend.
+
+Loads Hugging Face diffusion checkpoints in either the standard
+``diffusers`` layout or the single-file GGUF layout published under
+``unsloth/*-GGUF`` (Flux 2, Flux 2 Klein, Qwen-Image, SD3, SDXL, ...).
+GGUF files are dynamically dequantised on-device via
+``diffusers.GGUFQuantizationConfig``, then the rest of the pipeline
+(VAE, text encoders, scheduler) is pulled from the matching ``diffusers``
+repo so end users only ever need one local file plus the metadata repo.
+
+The module is intentionally torch-only: it never spawns a subprocess and
+shares the active CUDA / MPS device with the rest of Studio. The cost of
+not having a separate process is that loading a diffusion model and a
+GGUF chat model at the same time can OOM on consumer GPUs; the routes
+layer must therefore swap between the two as needed (the orchestrator
+unloads llama-server before any diffusion load on hosts with < 24 GB).
+
+The class deliberately exposes a small, llama-cpp-style surface:
+
+    load_model(repo_id, ...)
+    generate_image(prompt, ...) -> PIL.Image
+    unload_model()
+    status() -> dict
+
+so the route layer at ``studio/backend/routes/inference.py`` can mirror
+the existing llama-server lifecycle (probe + load + generate + unload)
+without learning a second API.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import gc
+import io
+import threading
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Optional
+
+from loggers import get_logger
+
+logger = get_logger(__name__)
+
+
+# ─── Pipeline registry ────────────────────────────────────────────────
+#
+# Keep this list narrow on purpose: only ship the small text-to-image
+# families with first-class GGUF coverage on the Hub. Anything else is
+# either video (LTX*, Wan) or research-grade (Sana; SD3.5 is only aliased
+# onto SD3) and can be added once it has a working GGUF release plus a smoke test.
+#
+# Each entry maps a substring of the loaded repo id (case-insensitive)
+# to the (pipeline_class_name, transformer_class_name, default base
+# repo for missing pieces). ``base_repo`` is what we pass to
+# ``Pipeline.from_pretrained`` to pick up the VAE + text encoders when
+# the user gave us a GGUF-only repo. The base_repo is documented to the
+# user via ``status()`` so they understand why a second download fires.
+
+@dataclass(frozen = True)
+class DiffusionFamily:
+    name: str
+    pipeline_class: str
+    transformer_class: str
+    base_repo: str
+    # Optional: list of HF "trigger" substrings besides ``name`` that map
+    # to this family (e.g. "flux1-dev" plus "flux.1-dev"). Lowercased.
+    aliases: tuple[str, ...] = field(default_factory = tuple)
+
+
+_FAMILIES: tuple[DiffusionFamily, ...] = (
+    DiffusionFamily(
+        name = "flux.2-klein",
+        pipeline_class = "Flux2KleinPipeline",
+        transformer_class = "Flux2Transformer2DModel",
+        base_repo = "black-forest-labs/FLUX.2-klein",
+        aliases = ("flux2-klein", "flux-2-klein", "flux.2.klein"),
+    ),
+    DiffusionFamily(
+        name = "flux.2",
+        pipeline_class = "Flux2Pipeline",
+        transformer_class = "Flux2Transformer2DModel",
+        base_repo = "black-forest-labs/FLUX.2-dev",
+        aliases = ("flux2-dev", "flux-2-dev", "flux.2.dev"),
+    ),
+    DiffusionFamily(
+        name = "flux.1",
+        pipeline_class = "FluxPipeline",
+        transformer_class = "FluxTransformer2DModel",
+        base_repo = "black-forest-labs/FLUX.1-dev",
+        aliases = ("flux1-dev", "flux-1-dev", "flux.1.dev", "flux-dev"),
+    ),
+    DiffusionFamily(
+        name = "qwen-image",
+        pipeline_class = "QwenImagePipeline",
+        transformer_class = "QwenImageTransformer2DModel",
+        base_repo = "Qwen/Qwen-Image",
+        aliases = ("qwenimage", "qwen_image"),
+    ),
+    DiffusionFamily(
+        name = "stable-diffusion-3",
+        pipeline_class = "StableDiffusion3Pipeline",
+        transformer_class = "SD3Transformer2DModel",
+        base_repo = "stabilityai/stable-diffusion-3-medium-diffusers",
+        aliases = ("sd3-medium", "stable-diffusion-3-medium", "sd3.5"),
+    ),
+    DiffusionFamily(
+        name = "stable-diffusion-xl",
+        pipeline_class = "StableDiffusionXLPipeline",
+        transformer_class = "",  # SDXL uses a UNet, not a transformer
+        base_repo = "stabilityai/stable-diffusion-xl-base-1.0",
+        aliases = ("sdxl",),
+    ),
+)
+
+
+def detect_family(repo_id: str, *, override_family: Optional[str] = None) -> Optional[DiffusionFamily]:
+    """Resolve the diffusion family for ``repo_id``.
+
+    When ``override_family`` is given, ``repo_id`` is ignored and only an
+    exact match of the trimmed, lowercased override against
+    ``DiffusionFamily.name`` counts. Otherwise the first registry entry
+    whose name or alias occurs in the lowercased ``repo_id`` wins, so
+    more specific entries must precede more general ones in ``_FAMILIES``.
+    Returns ``None`` when nothing matches so callers can surface a clear
+    "unsupported model" error rather than guessing wrong.
+    """
+    if override_family:
+        forced = override_family.strip().lower()
+        return next((entry for entry in _FAMILIES if entry.name == forced), None)
+
+    haystack = (repo_id or "").lower()
+    if not haystack:
+        return None
+    for entry in _FAMILIES:
+        # Empty aliases are skipped: "" is a substring of every string.
+        triggers = (entry.name, *(tag for tag in entry.aliases if tag))
+        if any(trigger in haystack for trigger in triggers):
+            return entry
+    return None
+
+
+def supported_families() -> list[dict[str, str]]:
+    """Public-facing list of families for ``/api/inference/images/status``."""
+    return [
+        {
+            "name": fam.name,
+            "pipeline_class": fam.pipeline_class,
+            "base_repo": fam.base_repo,
+        }
+        for fam in _FAMILIES
+    ]
+
+
+# ─── Backend ──────────────────────────────────────────────────────────
+
+
+class DiffusionBackend:
+    """Singleton-style diffusion backend.
+
+    One pipeline at a time; ``load_model`` swaps the previous one out.
+    Generation is mutex'd so concurrent requests serialise rather than
+    racing GPU memory.
+    """
+
+    def __init__(self) -> None:
+        self._pipe: Any = None
+        self._lock = threading.Lock()
+        self._family: Optional[DiffusionFamily] = None
+        self._repo_id: Optional[str] = None
+        self._gguf_path: Optional[str] = None
+        self._base_repo: Optional[str] = None
+        self._device: Optional[str] = None
+        self._dtype: Optional[str] = None
+        self._loaded_at: Optional[float] = None
+        self._loading: bool = False
+        self._last_error: Optional[str] = None
+
+    # ── lifecycle ─────────────────────────────────────────────────
+
+    @property
+    def is_loaded(self) -> bool:
+        return self._pipe is not None
+
+    @property
+    def repo_id(self) -> Optional[str]:
+        return self._repo_id
+
+    def status(self) -> dict[str, Any]:
+        return {
+            "is_loaded": self.is_loaded,
+            "is_loading": self._loading,
+            "repo_id": self._repo_id,
+            "family": self._family.name if self._family else None,
+            "pipeline_class": self._family.pipeline_class if self._family else None,
+            "base_repo": self._base_repo,
+            "gguf_path": self._gguf_path,
+            "device": self._device,
+            "dtype": self._dtype,
+            "loaded_at": self._loaded_at,
+            "last_error": self._last_error,
+            "supported_families": supported_families(),
+        }
+
+    def _pick_device_and_dtype(self) -> tuple[str, "Any"]:
+        """Pick (device, dtype) for the current host.
+
+        CUDA-first because that is the only path our diffusion GGUFs are
+        validated on. On macOS we use MPS in float16 to keep the pipeline
+        on the Metal GPU. CPU is allowed only as a last resort because
+        running FLUX on CPU is unusably slow (> 10 minutes per image).
+        """
+        import torch
+
+        if torch.cuda.is_available():
+            return "cuda", torch.bfloat16
+        if hasattr(torch, "backends") and getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
+            return "mps", torch.float16
+        return "cpu", torch.float32
+
+    def load_model(
+        self,
+        repo_id: str,
+        *,
+        gguf_filename: Optional[str] = None,
+        base_repo: Optional[str] = None,
+        hf_token: Optional[str] = None,
+        family_override: Optional[str] = None,
+        enable_model_cpu_offload: bool = True,
+    ) -> dict[str, Any]:
+        """Load a diffusion model.
+
+        ``repo_id`` is the Hugging Face repo id of either a GGUF-only
+        repo (e.g. ``unsloth/FLUX.2-klein-4B-GGUF``) or a full diffusers
+        repo (e.g. ``black-forest-labs/FLUX.2-klein``). With
+        ``gguf_filename`` set, that quant is downloaded from ``repo_id``
+        and the rest of the pipeline comes from the base repo; without
+        it, ``repo_id`` itself is loaded as a full diffusers repo.
+
+        ``base_repo`` overrides the repo the pipeline is loaded from
+        (VAE / text encoders). ``family_override`` short-circuits the
+        substring matcher when an exotic repo name confuses it.
+
+        Raises ``RuntimeError`` on failure with a user-facing message;
+        the previous pipeline (if any) stays loaded so a failed swap
+        does not leave Studio in an unusable state.
+        """
+        from huggingface_hub import hf_hub_download
+        import diffusers
+        import torch
+
+        fam = detect_family(repo_id, override_family = family_override)
+        if fam is None:
+            raise RuntimeError(
+                f"Could not infer a diffusion family for '{repo_id}'. "
+                "Pass family_override = 'flux.2-klein' / 'flux.2' / "
+                "'flux.1' / 'qwen-image' / 'stable-diffusion-3' / "
+                "'stable-diffusion-xl' to disambiguate."
+            )
+
+        device, dtype = self._pick_device_and_dtype()
+
+        with self._lock:
+            self._loading = True
+            self._last_error = None
+        try:
+            pipeline_cls = getattr(diffusers, fam.pipeline_class, None)
+            if pipeline_cls is None:
+                raise RuntimeError(
+                    f"diffusers {diffusers.__version__} has no "
+                    f"{fam.pipeline_class}; upgrade diffusers and retry."
+                )
+            transformer_cls = None
+            if fam.transformer_class:
+                transformer_cls = getattr(diffusers, fam.transformer_class, None)
+
+            # Without a GGUF file ``repo_id`` is itself the diffusers repo; the
+            # family default only supplies the pieces a GGUF-only repo lacks.
+            effective_base = base_repo or (fam.base_repo if gguf_filename else repo_id)
+            logger.info(
+                "Loading diffusion model %s (family=%s, device=%s, dtype=%s, base=%s)",
+                repo_id, fam.name, device, dtype, effective_base,
+            )
+
+            transformer = None
+            local_gguf_path: Optional[str] = None
+            if gguf_filename:
+                if transformer_cls is None:
+                    raise RuntimeError(
+                        f"Family {fam.name} does not have a GGUF transformer "
+                        "path; load the full repo instead."
+                    )
+                local_gguf_path = hf_hub_download(
+                    repo_id = repo_id,
+                    filename = gguf_filename,
+                    token = hf_token,
+                )
+                single_file_kwargs: dict[str, Any] = {
+                    "quantization_config": diffusers.GGUFQuantizationConfig(compute_dtype = dtype),
+                    "torch_dtype": dtype,
+                }
+                if hf_token:
+                    # The single-file loader may fetch the model config from
+                    # the Hub, which needs the token for gated repos.
+                    single_file_kwargs["token"] = hf_token
+                transformer = transformer_cls.from_single_file(local_gguf_path, **single_file_kwargs)
+
+            pipe_kwargs: dict[str, Any] = {"torch_dtype": dtype}
+            if transformer is not None:
+                pipe_kwargs["transformer"] = transformer
+            if hf_token:
+                pipe_kwargs["token"] = hf_token
+
+            pipe = pipeline_cls.from_pretrained(effective_base, **pipe_kwargs)
+            if enable_model_cpu_offload and device == "cuda":
+                pipe.enable_model_cpu_offload()
+            else:
+                pipe.to(device)
+
+            # Swap under the lock, then drop the old pipeline once the new one is in place.
+            with self._lock:
+                old = self._pipe
+                self._pipe = pipe
+                self._family = fam
+                self._repo_id = repo_id
+                self._gguf_path = local_gguf_path
+                self._base_repo = effective_base
+                self._device = device
+                self._dtype = str(dtype).replace("torch.", "")
+                self._loaded_at = time.time()
+            _release(old)
+
+            return self.status()
+        except Exception as exc:
+            with self._lock:
+                self._last_error = str(exc)
+            logger.exception("Diffusion load failed for %s", repo_id)
+            raise RuntimeError(f"Failed to load diffusion model: {exc}") from exc
+        finally:
+            with self._lock:
+                self._loading = False
+
+    def unload_model(self) -> dict[str, Any]:
+        with self._lock:
+            old = self._pipe
+            self._pipe = None
+            self._family = None
+            self._repo_id = None
+            self._gguf_path = None
+            self._base_repo = None
+            self._device = None
+            self._dtype = None
+            self._loaded_at = None
+        _release(old)
+        return {"is_loaded": False}
+
+    # ── generation ────────────────────────────────────────────────
+
+    def generate_image(
+        self,
+        *,
+        prompt: str,
+        negative_prompt: Optional[str] = None,
+        num_inference_steps: int = 24,
+        guidance_scale: float = 3.5,
+        width: int = 1024,
+        height: int = 1024,
+        seed: Optional[int] = None,
+    ) -> "Any":
+        """Generate a single PIL image and return it.
+
+        The mutex is held for the entire call: diffusion pipelines are
+        not thread-safe, and overlapping ``__call__``s on a shared
+        pipeline frequently corrupt their internal scheduler state.
+        """
+        if not prompt or not prompt.strip():
+            raise ValueError("prompt is empty")
+        if num_inference_steps < 1 or num_inference_steps > 200:
+            raise ValueError("num_inference_steps must be in [1, 200]")
+        if width <= 0 or height <= 0 or width > 2048 or height > 2048:
+            raise ValueError("width and height must be in (0, 2048]")
+        # Reject sizes that are not a multiple of 8: Flux / SD pipelines
+        # require it and a silent crash deep in the VAE is much worse than
+        # a clear error message up front.
+        if width % 8 or height % 8:
+            raise ValueError("width and height must be multiples of 8")
+
+        import torch
+
+        with self._lock:
+            if self._pipe is None:
+                raise RuntimeError("No diffusion model is loaded.")
+            pipe = self._pipe
+            device = self._device or "cpu"
+
+            generator = None
+            if seed is not None:
+                # Match the device of the pipeline so determinism holds
+                # across reload cycles. For CPU offload, the noise still
+                # has to live on the device the diffusion forward runs on.
+                gen_device = "cuda" if device == "cuda" and torch.cuda.is_available() else "cpu"
+                generator = torch.Generator(device = gen_device).manual_seed(int(seed))
+
+            call_kwargs: dict[str, Any] = {
+                "prompt": prompt,
+                "num_inference_steps": int(num_inference_steps),
+                "guidance_scale": float(guidance_scale),
+                "width": int(width),
+                "height": int(height),
+            }
+            if negative_prompt is not None and negative_prompt.strip():
+                call_kwargs["negative_prompt"] = negative_prompt
+            if generator is not None:
+                call_kwargs["generator"] = generator
+
+            out = pipe(**call_kwargs)
+            images = getattr(out, "images", None) or []
+            if not images:
+                raise RuntimeError("Diffusion pipeline returned no images.")
+            return images[0]
+
+
+def encode_png_base64(pil_image: "Any") -> str:
+    """Encode a PIL image to base64-encoded PNG."""
+    import base64
+
+    buf = io.BytesIO()
+    pil_image.save(buf, format = "PNG", optimize = True)
+    return base64.b64encode(buf.getvalue()).decode("ascii")
+
+
+# ─── Helpers ──────────────────────────────────────────────────────────
+
+
+def _release(obj: Any) -> None:
+    """Best-effort accelerator-memory release for a pipeline being swapped out;
+    memory is only reclaimed once the caller's own references are gone too."""
+    if obj is None:
+        return
+    del obj
+    gc.collect()
+    # Cache flushing is optional: ignore a missing or broken torch install.
+    try:
+        import torch
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        elif torch.backends.mps.is_available():
+            torch.mps.empty_cache()
+    except Exception:
+        pass
+
+
+# ─── Module-level singleton ───────────────────────────────────────────
+
+
+_singleton: Optional[DiffusionBackend] = None
+_singleton_lock = threading.Lock()
+
+
+def get_diffusion_backend() -> DiffusionBackend:
+    """Return the process-wide diffusion backend (lazy-instantiated)."""
+    global _singleton
+    if _singleton is None:
+        with _singleton_lock:
+            if _singleton is None:
+                _singleton = DiffusionBackend()
+    return _singleton
+
+
+async def async_generate(backend: DiffusionBackend, **kwargs: Any) -> "Any":
+    """Run ``backend.generate_image(**kwargs)`` in the default executor.
+
+    Keeps route handlers from blocking the event loop for the 5-30 s an
+    image generation takes. Must be awaited from a running event loop.
+    """
+    loop = asyncio.get_running_loop()
+    return await loop.run_in_executor(None, lambda: backend.generate_image(**kwargs))
diff --git a/studio/backend/models/inference.py b/studio/backend/models/inference.py
index b5626951c4..f26220cc50 100644
--- a/studio/backend/models/inference.py
+++ b/studio/backend/models/inference.py
@@ -1421,3 +1421,70 @@ class AnthropicMessagesResponse(BaseModel):
     stop_reason: Optional[str] = None
     stop_sequence: Optional[str] = None
     usage: AnthropicUsage = Field(default_factory = AnthropicUsage)
+
+
+# ── Diffusion image generation ────────────────────────────────────
+
+
+class DiffusionLoadRequest(BaseModel):
+    """Load a diffusion image-generation model.
+
+    repo_id is the HF repo (either GGUF-only or full diffusers layout).
+    gguf_filename selects the quant when repo_id is a GGUF repo.
+    base_repo overrides the auto-picked diffusers base used for the
+    VAE / text encoders when loading a GGUF-only repo.
+    """
+
+    repo_id: str = Field(..., description = "HF repo id")
+    gguf_filename: Optional[str] = Field(
+        None, description = "GGUF filename inside repo_id (Q4_K_S, Q8_0, ...)"
+    )
+    base_repo: Optional[str] = Field(
+        None,
+        description = "Diffusers base repo to source VAE + text encoders from",
+    )
+    family: Optional[str] = Field(
+        None,
+        description = "Force pipeline family: flux.2-klein | flux.2 | flux.1 | qwen-image | stable-diffusion-3 | stable-diffusion-xl",
+    )
+    hf_token: Optional[str] = Field(
+        None, description = "HuggingFace token for gated models"
+    )
+    enable_model_cpu_offload: bool = Field(
+        True,
+        description = "Offload submodules to CPU between forwards. Trades a small speed hit for ~6 GB less VRAM on FLUX-class models.",
+    )
+
+
+class DiffusionGenerateRequest(BaseModel):
+    """Generate a single image from the currently-loaded diffusion model."""
+
+    prompt: str = Field(..., min_length = 1, max_length = 4000)
+    negative_prompt: Optional[str] = Field(None, max_length = 4000)
+    num_inference_steps: int = Field(24, ge = 1, le = 200)
+    guidance_scale: float = Field(3.5, ge = 0.0, le = 20.0)
+    width: int = Field(1024, ge = 64, le = 2048)
+    height: int = Field(1024, ge = 64, le = 2048)
+    seed: Optional[int] = Field(
+        None, description = "Deterministic seed for reproducible outputs"
+    )
+
+    @field_validator("width", "height")
+    @classmethod
+    def _multiple_of_eight(cls, v: int) -> int:
+        if v % 8:
+            raise ValueError("width and height must be multiples of 8")
+        return v
+
+
+class DiffusionGenerateResponse(BaseModel):
+    image_b64: str = Field(..., description = "Base64-encoded PNG")
+    image_mime: str = "image/png"
+    width: int
+    height: int
+    num_inference_steps: int
+    guidance_scale: float
+    seed: Optional[int] = None
+    duration_ms: int
+    model: Optional[str] = None
+    family: Optional[str] = None
diff --git a/studio/backend/requirements/no-torch-runtime.txt b/studio/backend/requirements/no-torch-runtime.txt
index 85294114b1..fa3f33757e 100644
--- a/studio/backend/requirements/no-torch-runtime.txt
+++ b/studio/backend/requirements/no-torch-runtime.txt
@@ -46,6 +46,9 @@ peft>=0.18.0,!=0.11.0
 huggingface_hub>=0.34.0
 hf_transfer
 diffusers
+# Required by diffusers.GGUFQuantizationConfig (used by the Images page
+# to load FLUX.2 / FLUX.1 / Qwen-Image / SD3 GGUFs from the Hub).
+gguf
 
 # Transitive deps required because this file is installed with --no-deps.
 # Without these, `from transformers import AutoConfig` fails at import time.
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index bf92055929..fc9cbf9f88 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -213,6 +213,9 @@ def _friendly_error(exc: Exception) -> str:
     ListOpenAIContainersResponse,
     OpenAIContainerRequest,
     OpenAIContainerSummary,
+    DiffusionLoadRequest,
+    DiffusionGenerateRequest,
+    DiffusionGenerateResponse,
 )
 from core.inference.anthropic_compat import (
     anthropic_messages_to_openai,
@@ -1584,6 +1587,130 @@ async def generate_audio(
     )
 
 
+# =====================================================================
+# Diffusion image generation  (/images/*)
+# =====================================================================
+#
+# Lifecycle mirrors the GGUF chat backend: explicit load -> generate ->
+# unload. Diffusion pipelines compete for the same GPU as llama-server,
+# so callers on < 24 GB GPUs should unload the chat model first.
+
+
+def _get_diffusion_backend():
+    """Lazy import so non-diffusion installs do not pay the diffusers
+    cost at process start. The backend itself is a process-wide
+    singleton; reusing it across requests keeps pipeline state alive."""
+    from core.inference.diffusion import get_diffusion_backend
+
+    return get_diffusion_backend()
+
+
+@router.post("/images/load")
+async def diffusion_load(
+    payload: DiffusionLoadRequest,
+    current_subject: str = Depends(get_current_subject),
+):
+    """Load a diffusion image-generation model off the event loop.
+
+    Pass either a full diffusers repo or a GGUF-only repo plus the desired
+    ``gguf_filename``. Returns the new status payload (same shape as
+    ``/images/status``); load failures map to 400, anything else to 500.
+    """
+    backend = _get_diffusion_backend()
+    try:
+        status = await asyncio.get_running_loop().run_in_executor(
+            None,
+            lambda: backend.load_model(
+                repo_id = payload.repo_id,
+                gguf_filename = payload.gguf_filename,
+                base_repo = payload.base_repo,
+                family_override = payload.family,
+                hf_token = payload.hf_token,
+                enable_model_cpu_offload = payload.enable_model_cpu_offload,
+            ),
+        )
+        return JSONResponse(content = status)
+    except RuntimeError as exc:
+        raise HTTPException(status_code = 400, detail = str(exc)) from exc
+    except Exception as exc:
+        logger.exception("Diffusion load failed")
+        raise HTTPException(status_code = 500, detail = str(exc)) from exc
+
+
+@router.post("/images/unload")
+async def diffusion_unload(
+    current_subject: str = Depends(get_current_subject),
+):
+    """Unload the current diffusion model and free GPU memory."""
+    backend = _get_diffusion_backend()
+    return backend.unload_model()
+
+
+@router.get("/images/status")
+async def diffusion_status(
+    current_subject: str = Depends(get_current_subject),
+):
+    """Return diffusion backend status (loaded, family, device, etc.)."""
+    backend = _get_diffusion_backend()
+    return backend.status()
+
+
+@router.post("/images/generate", response_model = DiffusionGenerateResponse)
+async def diffusion_generate(
+    payload: DiffusionGenerateRequest,
+    current_subject: str = Depends(get_current_subject),
+):
+    """Generate a single image from the loaded diffusion model.
+
+    Returns a base64 PNG plus the generation parameters that produced it,
+    so the frontend can render the result and the user can reproduce it
+    via the same seed. ``duration_ms`` covers generation, not PNG encoding.
+    """
+    backend = _get_diffusion_backend()
+    if not backend.is_loaded:
+        raise HTTPException(
+            status_code = 400,
+            detail = "No diffusion model is loaded. POST /api/inference/images/load first.",
+        )
+
+    start = time.perf_counter()
+    try:
+        from core.inference.diffusion import async_generate, encode_png_base64
+
+        image = await async_generate(
+            backend,
+            prompt = payload.prompt,
+            negative_prompt = payload.negative_prompt,
+            num_inference_steps = payload.num_inference_steps,
+            guidance_scale = payload.guidance_scale,
+            width = payload.width,
+            height = payload.height,
+            seed = payload.seed,
+        )
+    except (ValueError, RuntimeError) as exc:
+        raise HTTPException(status_code = 400, detail = str(exc)) from exc
+    except Exception as exc:
+        logger.exception("Diffusion generation failed")
+        raise HTTPException(status_code = 500, detail = str(exc)) from exc
+
+    duration_ms = int((time.perf_counter() - start) * 1000)
+    # PNG encoding is CPU-bound, so it is kept off the event loop too.
+    image_b64 = await asyncio.get_running_loop().run_in_executor(None, encode_png_base64, image)
+    status = backend.status()
+    return DiffusionGenerateResponse(
+        image_b64 = image_b64,
+        image_mime = "image/png",
+        width = payload.width,
+        height = payload.height,
+        num_inference_steps = payload.num_inference_steps,
+        guidance_scale = payload.guidance_scale,
+        seed = payload.seed,
+        duration_ms = duration_ms,
+        model = status.get("repo_id"),
+        family = status.get("family"),
+    )
+
+
 # =====================================================================
 # OpenAI-Compatible Chat Completions  (/chat/completions)
 # =====================================================================
diff --git a/studio/backend/tests/test_diffusion_backend.py b/studio/backend/tests/test_diffusion_backend.py
new file mode 100644
index 0000000000..d70b4a2acb
--- /dev/null
+++ b/studio/backend/tests/test_diffusion_backend.py
@@ -0,0 +1,396 @@
+# SPDX-License-Identifier: AGPL-3.0-only
+# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
+
+"""Unit tests for the diffusion image-generation backend.
+
+These tests cover the surface area the routes layer relies on:
+
+* family detection from the public Unsloth GGUF naming conventions
+* generation argument validation (empty prompt, bad steps, off-grid sizes)
+* base64 PNG encoding round-trips
+* status() shape stays compatible with the frontend status poller
+* load/unload lifecycle with the heavy diffusers import monkey-patched
+
+Real GPU loads are exercised manually via the Studio probe (see
+``studio/backend/tests/test_diffusion_smoke.py``); here we keep the
+suite CPU-only and torch/diffusers-free so the consolidated CI job and the
+``unslothai/unsloth`` CI fork can both run it on Ubuntu, macOS, and
+Windows runners with no diffusion dependencies installed.
+"""
+
+from __future__ import annotations
+
+import base64
+import io
+import sys
+import types
+from typing import Any
+
+import pytest
+
+
+# ── module under test ────────────────────────────────────────────
+
+
+@pytest.fixture(autouse = True)
+def _reset_singleton(monkeypatch):
+    """Reset the module-level singleton between tests so each test
+    starts from a known state without poking globals directly."""
+    import core.inference.diffusion as d
+
+    monkeypatch.setattr(d, "_singleton", None)
+    yield
+
+
+# ── family detection ────────────────────────────────────────────
+
+
+def test_detect_family_flux2_klein():
+    from core.inference.diffusion import detect_family
+
+    fam = detect_family("unsloth/FLUX.2-klein-4B-GGUF")
+    assert fam is not None
+    assert fam.name == "flux.2-klein"
+    assert fam.pipeline_class == "Flux2KleinPipeline"
+    assert fam.transformer_class == "Flux2Transformer2DModel"
+
+
+def test_detect_family_flux2_dev_is_not_klein():
+    from core.inference.diffusion import detect_family
+
+    fam = detect_family("unsloth/FLUX.2-dev-GGUF")
+    assert fam is not None
+    assert fam.name == "flux.2"
+    # Critical: FLUX.2 dev must NOT pick up the FLUX.2 klein pipeline
+    # because the transformer architectures and text encoder
+    # configurations are different.
+    assert fam.pipeline_class == "Flux2Pipeline"
+
+
+def test_detect_family_flux1():
+    from core.inference.diffusion import detect_family
+
+    fam = detect_family("city96/FLUX.1-dev-gguf")
+    assert fam is not None
+    assert fam.name == "flux.1"
+    assert fam.pipeline_class == "FluxPipeline"
+
+
+def test_detect_family_qwen_image():
+    from core.inference.diffusion import detect_family
+
+    fam = detect_family("unsloth/Qwen-Image-GGUF")
+    assert fam is not None
+    assert fam.name == "qwen-image"
+
+
+def test_detect_family_override_wins_over_substring():
+    from core.inference.diffusion import detect_family
+
+    fam = detect_family("unsloth/FLUX.2-dev-GGUF", override_family = "flux.1")
+    assert fam is not None
+    assert fam.name == "flux.1"
+
+
+def test_detect_family_override_unknown_returns_none():
+    from core.inference.diffusion import detect_family
+
+    fam = detect_family("unsloth/FLUX.2-klein-4B-GGUF", override_family = "doesnotexist")
+    assert fam is None
+
+
+def test_detect_family_unknown_returns_none():
+    from core.inference.diffusion import detect_family
+
+    assert detect_family("random/repo") is None
+    assert detect_family("") is None
+
+
+def test_supported_families_payload_shape():
+    from core.inference.diffusion import supported_families
+
+    payload = supported_families()
+    assert isinstance(payload, list)
+    assert len(payload) >= 4
+    for entry in payload:
+        assert set(entry.keys()) == {"name", "pipeline_class", "base_repo"}
+
+
+# ── singleton ───────────────────────────────────────────────────
+
+
+def test_get_diffusion_backend_singleton():
+    from core.inference.diffusion import get_diffusion_backend
+
+    a = get_diffusion_backend()
+    b = get_diffusion_backend()
+    assert a is b
+
+
+# ── status() shape ──────────────────────────────────────────────
+
+
+def test_status_shape_unloaded():
+    from core.inference.diffusion import get_diffusion_backend
+
+    s = get_diffusion_backend().status()
+    expected_keys = {
+        "is_loaded",
+        "is_loading",
+        "repo_id",
+        "family",
+        "pipeline_class",
+        "base_repo",
+        "gguf_path",
+        "device",
+        "dtype",
+        "loaded_at",
+        "last_error",
+        "supported_families",
+    }
+    assert expected_keys.issubset(s.keys())
+    assert s["is_loaded"] is False
+    assert s["repo_id"] is None
+
+
+# ── encode_png_base64 ───────────────────────────────────────────
+
+
+def test_encode_png_base64_round_trip():
+    from PIL import Image
+
+    from core.inference.diffusion import encode_png_base64
+
+    img = Image.new("RGB", (16, 16), color = (255, 0, 0))
+    b64 = encode_png_base64(img)
+    raw = base64.b64decode(b64)
+    decoded = Image.open(io.BytesIO(raw))
+    assert decoded.format == "PNG"
+    assert decoded.size == (16, 16)
+
+
+# ── generation validation (no real pipeline) ────────────────────
+
+
+def _stub_pipeline(monkeypatch, *, returns = None, raises = None):
+    """Mount a fake torch pipeline on the singleton so generate_image's
+    argument validation runs without diffusers / torch being involved."""
+    import core.inference.diffusion as d
+    from PIL import Image
+
+    backend = d.get_diffusion_backend()
+
+    class _StubPipe:
+        def __call__(self, **kwargs):
+            if raises is not None:
+                raise raises
+            class _Out:
+                pass
+            o = _Out()
+            o.images = [returns or Image.new("RGB", (kwargs["width"], kwargs["height"]), color = (0, 255, 0))]
+            return o
+
+    backend._pipe = _StubPipe()
+    backend._device = "cpu"
+    backend._family = d._FAMILIES[0]
+    backend._repo_id = "stub/stub"
+    return backend
+
+
+def test_generate_image_rejects_empty_prompt(monkeypatch):
+    backend = _stub_pipeline(monkeypatch)
+    with pytest.raises(ValueError, match = "prompt is empty"):
+        backend.generate_image(prompt = "   ")
+
+
+def test_generate_image_rejects_bad_steps(monkeypatch):
+    backend = _stub_pipeline(monkeypatch)
+    with pytest.raises(ValueError, match = "num_inference_steps"):
+        backend.generate_image(prompt = "cat", num_inference_steps = 0)
+    with pytest.raises(ValueError, match = "num_inference_steps"):
+        backend.generate_image(prompt = "cat", num_inference_steps = 999)
+
+
+def test_generate_image_rejects_off_grid_size(monkeypatch):
+    backend = _stub_pipeline(monkeypatch)
+    with pytest.raises(ValueError, match = "multiples of 8"):
+        backend.generate_image(prompt = "cat", width = 513, height = 512)
+
+
+def test_generate_image_rejects_oversized(monkeypatch):
+    backend = _stub_pipeline(monkeypatch)
+    with pytest.raises(ValueError, match = "width and height"):
+        backend.generate_image(prompt = "cat", width = 4096, height = 512)
+
+
+def test_generate_image_calls_pipeline_with_kwargs(monkeypatch):
+    backend = _stub_pipeline(monkeypatch)
+    img = backend.generate_image(
+        prompt = "a red sphere",
+        negative_prompt = "blue",
+        num_inference_steps = 4,
+        guidance_scale = 1.0,
+        width = 256,
+        height = 256,
+        seed = 42,
+    )
+    assert img.size == (256, 256)
+
+
+def test_generate_image_unloaded_raises(monkeypatch):
+    import core.inference.diffusion as d
+
+    backend = d.get_diffusion_backend()
+    backend._pipe = None
+    with pytest.raises(RuntimeError, match = "No diffusion model"):
+        backend.generate_image(prompt = "x")
+
+
+def test_unload_clears_state(monkeypatch):
+    backend = _stub_pipeline(monkeypatch)
+    assert backend.is_loaded
+    backend.unload_model()
+    assert not backend.is_loaded
+    s = backend.status()
+    assert s["repo_id"] is None
+    assert s["family"] is None
+
+
+# ── load_model (with monkey-patched diffusers) ──────────────────
+
+
+def _install_fake_diffusers(monkeypatch, *, raise_on_pipeline = False):
+    """Build a tiny ``diffusers`` shim so we can exercise load_model
+    without dragging the real 1+ GB diffusers / torch import in."""
+    from PIL import Image
+
+    fake = types.ModuleType("diffusers")
+    fake.__version__ = "fake"
+
+    class _FakeQuantConfig:
+        def __init__(self, compute_dtype = None):
+            self.compute_dtype = compute_dtype
+
+    class _FakeTransformer:
+        @classmethod
+        def from_single_file(cls, path, quantization_config = None, torch_dtype = None):
+            inst = cls()
+            inst.path = path
+            inst.qc = quantization_config
+            inst.dtype = torch_dtype
+            return inst
+
+    class _FakePipeline:
+        @classmethod
+        def from_pretrained(cls, base_repo, **kwargs):
+            if raise_on_pipeline:
+                raise RuntimeError("simulated load failure")
+            inst = cls()
+            inst.base_repo = base_repo
+            inst.kwargs = kwargs
+            return inst
+
+        def __call__(self, **kwargs):
+            class _Out:
+                pass
+            o = _Out()
+            o.images = [Image.new("RGB", (kwargs["width"], kwargs["height"]), color = (0, 0, 255))]
+            return o
+
+        def enable_model_cpu_offload(self):
+            self.cpu_offload = True
+
+        def to(self, device):
+            self.device = device
+            return self
+
+    fake.GGUFQuantizationConfig = _FakeQuantConfig
+    fake.Flux2KleinPipeline = _FakePipeline
+    fake.Flux2Transformer2DModel = _FakeTransformer
+    fake.Flux2Pipeline = _FakePipeline
+    fake.FluxPipeline = _FakePipeline
+    fake.FluxTransformer2DModel = _FakeTransformer
+    fake.QwenImagePipeline = _FakePipeline
+    fake.QwenImageTransformer2DModel = _FakeTransformer
+    fake.SD3Transformer2DModel = _FakeTransformer
+    fake.StableDiffusion3Pipeline = _FakePipeline
+    fake.StableDiffusionXLPipeline = _FakePipeline
+
+    monkeypatch.setitem(sys.modules, "diffusers", fake)
+
+    # Pretend HF Hub gave us a local file without actually fetching.
+    fake_hub = types.ModuleType("huggingface_hub")
+    fake_hub.hf_hub_download = lambda repo_id, filename, token = None: f"/fake/{repo_id}/{filename}"
+    monkeypatch.setitem(sys.modules, "huggingface_hub", fake_hub)
+
+    # Force CPU dtype so the test does not need CUDA.
+    import core.inference.diffusion as d
+
+    monkeypatch.setattr(
+        d.DiffusionBackend,
+        "_pick_device_and_dtype",
+        lambda self: ("cpu", "fake_dtype"),
+    )
+
+    return fake
+
+
+def test_load_model_unknown_family(monkeypatch):
+    _install_fake_diffusers(monkeypatch)
+    from core.inference.diffusion import get_diffusion_backend
+
+    backend = get_diffusion_backend()
+    with pytest.raises(RuntimeError, match = "Could not infer"):
+        backend.load_model("private/random-repo")
+
+
+def test_load_model_gguf_path_happy(monkeypatch):
+    _install_fake_diffusers(monkeypatch)
+    from core.inference.diffusion import get_diffusion_backend
+
+    backend = get_diffusion_backend()
+    status = backend.load_model(
+        "unsloth/FLUX.2-klein-4B-GGUF",
+        gguf_filename = "FLUX.2-klein-4B-Q4_K_S.gguf",
+    )
+    assert status["is_loaded"] is True
+    assert status["family"] == "flux.2-klein"
+    assert status["pipeline_class"] == "Flux2KleinPipeline"
+    assert status["base_repo"] == "black-forest-labs/FLUX.2-klein"
+    assert status["gguf_path"] == (
+        "/fake/unsloth/FLUX.2-klein-4B-GGUF/FLUX.2-klein-4B-Q4_K_S.gguf"
+    )
+
+
+def test_load_model_recovers_after_failure(monkeypatch):
+    _install_fake_diffusers(monkeypatch, raise_on_pipeline = True)
+    from core.inference.diffusion import get_diffusion_backend
+
+    backend = get_diffusion_backend()
+    with pytest.raises(RuntimeError, match = "Failed to load diffusion model"):
+        backend.load_model(
+            "unsloth/FLUX.2-klein-4B-GGUF",
+            gguf_filename = "x.gguf",
+        )
+    # Failed load must leave the singleton unloaded but with last_error set.
+    s = backend.status()
+    assert s["is_loaded"] is False
+    assert s["last_error"] and "simulated load failure" in s["last_error"]
+
+
+def test_load_model_swap_drops_previous(monkeypatch):
+    _install_fake_diffusers(monkeypatch)
+    from core.inference.diffusion import get_diffusion_backend
+
+    backend = get_diffusion_backend()
+    backend.load_model(
+        "unsloth/FLUX.2-klein-4B-GGUF",
+        gguf_filename = "FLUX.2-klein-4B-Q4_K_S.gguf",
+    )
+    first_pipe = backend._pipe
+    backend.load_model(
+        "unsloth/FLUX.2-dev-GGUF",
+        gguf_filename = "FLUX.2-dev-Q4_K_S.gguf",
+    )
+    assert backend._pipe is not first_pipe
+    assert backend.status()["family"] == "flux.2"
diff --git a/studio/backend/tests/test_diffusion_routes.py b/studio/backend/tests/test_diffusion_routes.py
new file mode 100644
index 0000000000..9b9063f0b1
--- /dev/null
+++ b/studio/backend/tests/test_diffusion_routes.py
@@ -0,0 +1,190 @@
+# SPDX-License-Identifier: AGPL-3.0-only
+# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
+
+"""Route-level tests for ``/api/inference/images/*``.
+
+Mounts the actual ``inference_router`` on a fresh FastAPI app with the
+auth dependency replaced by a stub so we exercise the same FastAPI
+handlers Studio ships in production. The diffusion backend is replaced
+with an in-memory stub so we don't need diffusers / GPUs to run these.
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+import pytest
+from fastapi import FastAPI
+from fastapi.testclient import TestClient
+from PIL import Image
+
+
+_BACKEND_ROOT = Path(__file__).resolve().parents[1]
+if str(_BACKEND_ROOT) not in sys.path:
+    sys.path.insert(0, str(_BACKEND_ROOT))
+
+
+class _FakeBackend:
+    def __init__(self) -> None:
+        self._loaded = False
+        self._repo: str | None = None
+        self.calls: list[dict] = []
+
+    @property
+    def is_loaded(self) -> bool:
+        return self._loaded
+
+    def status(self) -> dict:
+        return {
+            "is_loaded": self._loaded,
+            "is_loading": False,
+            "repo_id": self._repo,
+            "family": "flux.2-klein" if self._loaded else None,
+            "pipeline_class": "Flux2KleinPipeline" if self._loaded else None,
+            "base_repo": "black-forest-labs/FLUX.2-klein" if self._loaded else None,
+            "gguf_path": None,
+            "device": "cpu",
+            "dtype": "torch.bfloat16",
+            "loaded_at": 0,
+            "last_error": None,
+            "supported_families": [],
+        }
+
+    def load_model(self, repo_id, **kw):
+        self.calls.append({"op": "load", "repo_id": repo_id, **kw})
+        self._loaded = True
+        self._repo = repo_id
+        return self.status()
+
+    def unload_model(self) -> dict:
+        self._loaded = False
+        self._repo = None
+        return {"is_loaded": False}
+
+    def generate_image(self, **kw):
+        self.calls.append({"op": "generate", **kw})
+        return Image.new("RGB", (kw["width"], kw["height"]), color = (123, 45, 67))
+
+
+@pytest.fixture
+def app_with_stub(monkeypatch):
+    """Build a FastAPI app that mounts the real inference router with
+    auth disabled and the diffusion backend swapped for a stub."""
+    from routes import inference as inf
+    import core.inference.diffusion as d
+
+    stub = _FakeBackend()
+    # Override the singleton accessor the route uses.
+    monkeypatch.setattr(d, "get_diffusion_backend", lambda: stub)
+    monkeypatch.setattr(inf, "_get_diffusion_backend", lambda: stub)
+
+    app = FastAPI()
+    app.include_router(inf.router, prefix = "/api/inference")
+    # Bypass auth by overriding the dependency.
+    from auth.authentication import get_current_subject
+
+    app.dependency_overrides[get_current_subject] = lambda: "test-user"
+
+    return app, stub
+
+
+def test_status_when_unloaded(app_with_stub):
+    app, _ = app_with_stub
+    c = TestClient(app)
+    r = c.get("/api/inference/images/status")
+    assert r.status_code == 200
+    body = r.json()
+    assert body["is_loaded"] is False
+    assert body["repo_id"] is None
+
+
+def test_generate_without_load_returns_400(app_with_stub):
+    app, _ = app_with_stub
+    c = TestClient(app)
+    r = c.post(
+        "/api/inference/images/generate",
+        json = {"prompt": "a red sphere"},
+    )
+    assert r.status_code == 400
+    assert "No diffusion model" in r.json()["detail"]
+
+
+def test_load_then_generate_round_trip(app_with_stub):
+    app, stub = app_with_stub
+    c = TestClient(app)
+
+    r = c.post(
+        "/api/inference/images/load",
+        json = {
+            "repo_id": "unsloth/FLUX.2-klein-4B-GGUF",
+            "gguf_filename": "FLUX.2-klein-4B-Q4_K_S.gguf",
+        },
+    )
+    assert r.status_code == 200, r.text
+    assert r.json()["is_loaded"] is True
+
+    r = c.post(
+        "/api/inference/images/generate",
+        json = {
+            "prompt": "a tiny synth-pop album cover",
+            "width": 256,
+            "height": 256,
+            "num_inference_steps": 4,
+            "seed": 7,
+        },
+    )
+    assert r.status_code == 200, r.text
+    body = r.json()
+    assert body["image_b64"]
+    assert body["image_mime"] == "image/png"
+    assert body["width"] == 256
+    assert body["height"] == 256
+    assert body["seed"] == 7
+    assert body["duration_ms"] >= 0
+
+    # Round-trip the base64 -> PIL to confirm it is a real PNG of the
+    # right size and not, say, an empty string.
+    import base64
+    import io
+
+    raw = base64.b64decode(body["image_b64"])
+    decoded = Image.open(io.BytesIO(raw))
+    assert decoded.format == "PNG"
+    assert decoded.size == (256, 256)
+
+    # Backend stub should have recorded both calls.
+    ops = [c["op"] for c in stub.calls]
+    assert ops == ["load", "generate"]
+
+
+def test_generate_rejects_off_grid_size(app_with_stub):
+    app, stub = app_with_stub
+    c = TestClient(app)
+    c.post(
+        "/api/inference/images/load",
+        json = {
+            "repo_id": "unsloth/FLUX.2-klein-4B-GGUF",
+            "gguf_filename": "x.gguf",
+        },
+    )
+    r = c.post(
+        "/api/inference/images/generate",
+        json = {"prompt": "x", "width": 513, "height": 512},
+    )
+    # FastAPI returns 422 for request-schema errors; backend ValueErrors map to 400.
+    assert r.status_code in (400, 422), r.text
+
+
+def test_unload_clears_state(app_with_stub):
+    app, _ = app_with_stub
+    c = TestClient(app)
+    c.post(
+        "/api/inference/images/load",
+        json = {"repo_id": "unsloth/FLUX.2-klein-4B-GGUF", "gguf_filename": "x.gguf"},
+    )
+    r = c.post("/api/inference/images/unload")
+    assert r.status_code == 200
+    assert r.json()["is_loaded"] is False
+    r = c.get("/api/inference/images/status")
+    assert r.json()["is_loaded"] is False
diff --git a/studio/frontend/src/app/router.tsx b/studio/frontend/src/app/router.tsx
index c7bc0440bd..b50f3fe618 100644
--- a/studio/frontend/src/app/router.tsx
+++ b/studio/frontend/src/app/router.tsx
@@ -9,6 +9,7 @@ import { Route as dataRecipeRoute } from "./routes/data-recipes.$recipeId";
 import { Route as chatRoute } from "./routes/chat";
 import { Route as exportRoute } from "./routes/export";
 import { Route as gridTestRoute } from "./routes/grid-test";
+import { Route as imagesRoute } from "./routes/images";
 import { Route as indexRoute } from "./routes/index";
 import { Route as loginRoute } from "./routes/login";
 import { Route as onboardingRoute } from "./routes/onboarding";
@@ -26,6 +27,7 @@ const routeTree = rootRoute.addChildren([
   studioRoute,
   chatRoute,
   exportRoute,
+  imagesRoute,
   dataRecipesRoute,
   dataRecipeRoute,
 ]);
diff --git a/studio/frontend/src/app/routes/images.tsx b/studio/frontend/src/app/routes/images.tsx
new file mode 100644
index 0000000000..1761612140
--- /dev/null
+++ b/studio/frontend/src/app/routes/images.tsx
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: AGPL-3.0-only
+// Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
+
+import { createRoute } from "@tanstack/react-router";
+import { lazy } from "react";
+import { requireAuth } from "../auth-guards";
+import { Route as rootRoute } from "./__root";
+
+const ImagesPage = lazy(() =>
+  import("@/features/images").then((m) => ({
+    default: m.ImagesPage,
+  })),
+);
+
+export const Route = createRoute({
+  getParentRoute: () => rootRoute,
+  path: "/images",
+  staticData: { title: "Images" },
+  beforeLoad: () => requireAuth(),
+  component: ImagesPage,
+});
diff --git a/studio/frontend/src/components/app-sidebar.tsx b/studio/frontend/src/components/app-sidebar.tsx
index aac5f8f8a8..9a0830db2d 100644
--- a/studio/frontend/src/components/app-sidebar.tsx
+++ b/studio/frontend/src/components/app-sidebar.tsx
@@ -50,6 +50,7 @@ import {
   Globe02Icon,
   HelpCircleIcon,
   Logout01Icon,
+  PaintBrush02Icon,
   Search01Icon,
   PowerIcon,
   PencilEdit02Icon,
@@ -497,6 +498,18 @@ export function AppSidebar() {
               }}
             />
 
+            <NavItem
+              icon={PaintBrush02Icon}
+              label="Images"
+              active={pathname === "/images" || pathname.startsWith("/images/")}
+              disabled={chatOnly}
+              onClick={() => {
+                if (chatOnly) return;
+                navigate({ to: "/images" });
+                closeMobileIfOpen();
+              }}
+            />
+
             <NavItem
               icon={DownloadSquare01Icon}
               label="Export"
diff --git a/studio/frontend/src/features/images/api.ts b/studio/frontend/src/features/images/api.ts
new file mode 100644
index 0000000000..017b856b5a
--- /dev/null
+++ b/studio/frontend/src/features/images/api.ts
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: AGPL-3.0-only
+// Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
+
+// Thin client for the diffusion image-generation routes exposed by
+// studio/backend/routes/inference.py (images/load, images/generate,
+// images/status, images/unload). Mirrors the shape returned by
+// DiffusionBackend.status() and DiffusionGenerateResponse so the
+// page can render results without re-deriving fields client-side.
+
+import { authFetch } from "@/features/auth";
+import { readFastApiError } from "@/lib/format-fastapi-error";
+
+export interface DiffusionFamily {
+  name: string;
+  pipeline_class: string;
+  base_repo: string;
+}
+
+export interface DiffusionStatus {
+  is_loaded: boolean;
+  is_loading: boolean;
+  repo_id: string | null;
+  family: string | null;
+  pipeline_class: string | null;
+  base_repo: string | null;
+  gguf_path: string | null;
+  device: string | null;
+  dtype: string | null;
+  loaded_at: number | null;
+  last_error: string | null;
+  supported_families: DiffusionFamily[];
+}
+
+export interface DiffusionLoadRequest {
+  repo_id: string;
+  gguf_filename?: string;
+  base_repo?: string;
+  family?: string;
+  hf_token?: string;
+  enable_model_cpu_offload?: boolean;
+}
+
+export interface DiffusionGenerateRequest {
+  prompt: string;
+  negative_prompt?: string;
+  num_inference_steps?: number;
+  guidance_scale?: number;
+  width?: number;
+  height?: number;
+  seed?: number;
+}
+
+export interface DiffusionGenerateResponse {
+  image_b64: string;
+  image_mime: string;
+  width: number;
+  height: number;
+  num_inference_steps: number;
+  guidance_scale: number;
+  seed: number | null;
+  duration_ms: number;
+  model: string | null;
+  family: string | null;
+}
+
+async function parseJson<T>(res: Response): Promise<T> {
+  if (!res.ok) throw new Error(await readFastApiError(res));
+  return (await res.json()) as T;
+}
+
+export async function fetchDiffusionStatus(): Promise<DiffusionStatus> {
+  return parseJson<DiffusionStatus>(
+    await authFetch("/api/inference/images/status"),
+  );
+}
+
+export async function loadDiffusionModel(
+  payload: DiffusionLoadRequest,
+): Promise<DiffusionStatus> {
+  return parseJson<DiffusionStatus>(
+    await authFetch("/api/inference/images/load", {
+      method: "POST",
+      headers: { "Content-Type": "application/json" },
+      body: JSON.stringify(payload),
+    }),
+  );
+}
+
+export async function unloadDiffusionModel(): Promise<{ is_loaded: boolean }> {
+  return parseJson<{ is_loaded: boolean }>(
+    await authFetch("/api/inference/images/unload", { method: "POST" }),
+  );
+}
+
+export async function generateDiffusionImage(
+  payload: DiffusionGenerateRequest,
+): Promise<DiffusionGenerateResponse> {
+  return parseJson<DiffusionGenerateResponse>(
+    await authFetch("/api/inference/images/generate", {
+      method: "POST",
+      headers: { "Content-Type": "application/json" },
+      body: JSON.stringify(payload),
+    }),
+  );
+}
diff --git a/studio/frontend/src/features/images/images-page.tsx b/studio/frontend/src/features/images/images-page.tsx
new file mode 100644
index 0000000000..3a408ba316
--- /dev/null
+++ b/studio/frontend/src/features/images/images-page.tsx
@@ -0,0 +1,425 @@
+// SPDX-License-Identifier: AGPL-3.0-only
+// Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
+
+import { Button } from "@/components/ui/button";
+import { Input } from "@/components/ui/input";
+import { Label } from "@/components/ui/label";
+import {
+  Select,
+  SelectContent,
+  SelectItem,
+  SelectTrigger,
+  SelectValue,
+} from "@/components/ui/select";
+import { SectionCard } from "@/components/section-card";
+import { Slider } from "@/components/ui/slider";
+import { Spinner } from "@/components/ui/spinner";
+import { Textarea } from "@/components/ui/textarea";
+import { toast } from "@/lib/toast";
+import {
+  fetchDiffusionStatus,
+  generateDiffusionImage,
+  loadDiffusionModel,
+  unloadDiffusionModel,
+  type DiffusionGenerateResponse,
+  type DiffusionStatus,
+} from "./api";
+import { useCallback, useEffect, useMemo, useRef, useState } from "react";
+
+// Curated short list of working diffusion GGUFs (unsloth/* plus city96's
+// FLUX.1 dev). Picked to span size + license so any GPU class has an option:
+//   FLUX.2 klein 4B  -> ~10-12 GB VRAM with Q4_K_S, Apache 2.0 (TODO: UI note below says 13 GB)
+//   FLUX.2 klein 9B  -> ~16-18 GB VRAM, FLUX [klein] non-commercial
+//   FLUX.2 dev       -> ~24+ GB VRAM, FLUX [dev] non-commercial
+//   FLUX.1 dev       -> ~12 GB VRAM, older but well tested (city96 repo)
+// The backend can load anything detect_family() supports; this only keeps the v1 picker compact.
+const CURATED_MODELS: Array<{
+  label: string;
+  repo_id: string;
+  default_gguf: string;
+  family: string;
+  notes: string;
+}> = [
+  {
+    label: "FLUX.2 klein 4B (Q4_K_S, Apache 2.0)",
+    repo_id: "unsloth/FLUX.2-klein-4B-GGUF",
+    default_gguf: "FLUX.2-klein-4B-Q4_K_S.gguf",
+    family: "flux.2-klein",
+    notes: "13 GB VRAM, fastest. Apache 2.0.",
+  },
+  {
+    label: "FLUX.2 klein 9B (Q4_K_S)",
+    repo_id: "unsloth/FLUX.2-klein-9B-GGUF",
+    default_gguf: "FLUX.2-klein-9B-Q4_K_S.gguf",
+    family: "flux.2-klein",
+    notes: "17 GB VRAM, higher quality.",
+  },
+  {
+    label: "FLUX.2 dev (Q4_K_S)",
+    repo_id: "unsloth/FLUX.2-dev-GGUF",
+    default_gguf: "FLUX.2-dev-Q4_K_S.gguf",
+    family: "flux.2",
+    notes: "24+ GB VRAM, best for prompt following.",
+  },
+  {
+    label: "FLUX.1 dev (Q4_K_S, city96)",
+    repo_id: "city96/FLUX.1-dev-gguf",
+    default_gguf: "flux1-dev-Q4_K_S.gguf",
+    family: "flux.1",
+    notes: "12 GB VRAM, older but well tested.",
+  },
+];
+
+const DEFAULT_PRESET = CURATED_MODELS[0];
+
+const RESOLUTION_PRESETS: Array<{ label: string; w: number; h: number }> = [
+  { label: "Square 1024", w: 1024, h: 1024 },
+  { label: "Square 768", w: 768, h: 768 },
+  { label: "Square 512", w: 512, h: 512 },
+  { label: "Portrait 832x1216", w: 832, h: 1216 },
+  { label: "Landscape 1216x832", w: 1216, h: 832 },
+];
+
+export function ImagesPage() {
+  const [status, setStatus] = useState<DiffusionStatus | null>(null);
+  const [refreshingStatus, setRefreshingStatus] = useState(false);
+  const [busy, setBusy] = useState<"idle" | "loading" | "unloading" | "generating">("idle");
+
+  const [presetIndex, setPresetIndex] = useState(0);
+  const [customRepoId, setCustomRepoId] = useState("");
+  const [customGguf, setCustomGguf] = useState("");
+  const [useCustom, setUseCustom] = useState(false);
+  const [hfToken, setHfToken] = useState("");
+
+  const [prompt, setPrompt] = useState("a tiny ginger sloth coding in a sunlit treehouse, photorealistic");
+  const [negativePrompt, setNegativePrompt] = useState("");
+  const [steps, setSteps] = useState(24);
+  const [guidance, setGuidance] = useState(3.5);
+  const [resolutionIdx, setResolutionIdx] = useState(0);
+  const [seed, setSeed] = useState<string>("");
+
+  const [results, setResults] = useState<DiffusionGenerateResponse[]>([]);
+  const lastErrorRef = useRef<string | null>(null);
+
+  const preset = CURATED_MODELS[presetIndex] ?? DEFAULT_PRESET;
+  const resolution = RESOLUTION_PRESETS[resolutionIdx];
+
+  const refreshStatus = useCallback(async () => {
+    setRefreshingStatus(true);
+    try {
+      setStatus(await fetchDiffusionStatus());
+      lastErrorRef.current = null; // success re-arms the duplicate-toast guard
+    } catch (err) {
+      const msg = err instanceof Error ? err.message : String(err);
+      if (lastErrorRef.current !== msg) {
+        lastErrorRef.current = msg;
+        toast.error("Could not fetch image-model status", { description: msg });
+      }
+    } finally {
+      setRefreshingStatus(false);
+    }
+  }, []);
+
+  useEffect(() => {
+    void refreshStatus();
+  }, [refreshStatus]);
+
+  const handleLoad = useCallback(async () => {
+    setBusy("loading");
+    try {
+      const repo = useCustom ? customRepoId.trim() : preset.repo_id;
+      const gguf = useCustom ? customGguf.trim() || undefined : preset.default_gguf;
+      const family = useCustom ? undefined : preset.family;
+      if (!repo) {
+        toast.error("Pick a model first");
+        return;
+      }
+      const next = await loadDiffusionModel({
+        repo_id: repo,
+        gguf_filename: gguf,
+        family,
+        hf_token: hfToken.trim() || undefined,
+      });
+      setStatus(next);
+      toast.success("Loaded image model", { description: next.repo_id ?? undefined });
+    } catch (err) {
+      toast.error("Failed to load image model", {
+        description: err instanceof Error ? err.message : String(err),
+      });
+    } finally {
+      setBusy("idle");
+    }
+  }, [useCustom, customRepoId, customGguf, preset, hfToken]);
+
+  const handleUnload = useCallback(async () => {
+    setBusy("unloading");
+    try {
+      await unloadDiffusionModel();
+      await refreshStatus();
+    } catch (err) {
+      toast.error("Failed to unload image model", {
+        description: err instanceof Error ? err.message : String(err),
+      });
+    } finally {
+      setBusy("idle");
+    }
+  }, [refreshStatus]);
+
+  const handleGenerate = useCallback(async () => {
+    if (!prompt.trim()) {
+      toast.error("Prompt is empty");
+      return;
+    }
+    setBusy("generating");
+    try {
+      // Seeds are whole numbers; reject "1.5" / "abc" up front rather than sending them.
+      const parsedSeed = seed.trim() ? Number(seed.trim()) : undefined;
+      if (parsedSeed !== undefined && !Number.isSafeInteger(parsedSeed)) {
+        toast.error("Seed must be an integer");
+        return;
+      }
+      const out = await generateDiffusionImage({
+        prompt,
+        negative_prompt: negativePrompt.trim() || undefined,
+        num_inference_steps: steps,
+        guidance_scale: guidance,
+        width: resolution.w,
+        height: resolution.h,
+        seed: parsedSeed,
+      });
+      setResults((prev) => [out, ...prev].slice(0, 12));
+    } catch (err) {
+      const description = err instanceof Error ? err.message : String(err);
+      toast.error("Image generation failed", { description });
+    } finally {
+      setBusy("idle");
+    }
+  }, [prompt, negativePrompt, steps, guidance, resolution, seed]);
+
+  const statusLabel = useMemo(() => {
+    if (!status) return refreshingStatus ? "Checking..." : "Not loaded";
+    if (status.is_loading) return "Loading...";
+    if (status.is_loaded) {
+      const dev = status.device ? ` on ${status.device}` : "";
+      return `Loaded: ${status.repo_id ?? "(unknown)"} (${status.family ?? "unknown"})${dev}`;
+    }
+    return "Not loaded";
+  }, [status, refreshingStatus]);
+
+  return (
+    <div className="flex flex-1 flex-col gap-4 overflow-y-auto p-4 sm:p-6">
+      <SectionCard
+        title="Local image generation"
+        description={
+          "Run diffusion GGUFs from Hugging Face on your own GPU. " +
+          "Pick a curated FLUX.2 model or paste any unsloth/* GGUF repo."
+        }
+      >
+        <div className="flex flex-col gap-3">
+          <div className="flex flex-col gap-2">
+            <Label>Model</Label>
+            <Select
+              value={useCustom ? "custom" : String(presetIndex)}
+              onValueChange={(v) => {
+                if (v === "custom") {
+                  setUseCustom(true);
+                } else {
+                  setUseCustom(false);
+                  setPresetIndex(Number(v));
+                }
+              }}
+            >
+              <SelectTrigger>
+                <SelectValue placeholder="Pick a model" />
+              </SelectTrigger>
+              <SelectContent>
+                {CURATED_MODELS.map((m, idx) => (
+                  <SelectItem key={m.repo_id} value={String(idx)}>
+                    {m.label}
+                  </SelectItem>
+                ))}
+                <SelectItem value="custom">Custom HF repo...</SelectItem>
+              </SelectContent>
+            </Select>
+            {!useCustom && (
+              <p className="text-xs text-muted-foreground">{preset.notes}</p>
+            )}
+          </div>
+
+          {useCustom && (
+            <div className="flex flex-col gap-2">
+              <Label>HF repo id</Label>
+              <Input
+                value={customRepoId}
+                onChange={(e) => setCustomRepoId(e.target.value)}
+                placeholder="unsloth/FLUX.2-klein-4B-GGUF"
+              />
+              <Label>GGUF filename (optional)</Label>
+              <Input
+                value={customGguf}
+                onChange={(e) => setCustomGguf(e.target.value)}
+                placeholder="FLUX.2-klein-4B-Q4_K_S.gguf"
+              />
+            </div>
+          )}
+
+          <div className="flex flex-col gap-2">
+            <Label>Hugging Face token (only for gated repos)</Label>
+            <Input
+              type="password"
+              value={hfToken}
+              onChange={(e) => setHfToken(e.target.value)}
+              placeholder="hf_..."
+              autoComplete="off"
+            />
+          </div>
+
+          <div className="flex flex-wrap items-center gap-2">
+            <Button
+              onClick={handleLoad}
+              disabled={busy !== "idle"}
+              data-testid="diffusion-load"
+            >
+              {busy === "loading" ? <Spinner className="mr-2 size-4" /> : null}
+              Load model
+            </Button>
+            <Button
+              variant="outline"
+              onClick={handleUnload}
+              disabled={busy !== "idle" || !status?.is_loaded}
+              data-testid="diffusion-unload"
+            >
+              Unload
+            </Button>
+            <Button
+              variant="ghost"
+              onClick={() => void refreshStatus()}
+              disabled={refreshingStatus}
+            >
+              Refresh status
+            </Button>
+            <span
+              className="ml-auto text-xs text-muted-foreground"
+              data-testid="diffusion-status"
+            >
+              {statusLabel}
+            </span>
+          </div>
+        </div>
+      </SectionCard>
+
+      <SectionCard
+        title="Prompt"
+        description="The pipeline runs on the GPU you launched Unsloth Studio on."
+      >
+        <div className="flex flex-col gap-3">
+          <div className="flex flex-col gap-1">
+            <Label htmlFor="diffusion-prompt">Prompt</Label>
+            <Textarea
+              id="diffusion-prompt"
+              value={prompt}
+              onChange={(e) => setPrompt(e.target.value)}
+              rows={3}
+              data-testid="diffusion-prompt"
+            />
+          </div>
+          <div className="flex flex-col gap-1">
+            <Label htmlFor="diffusion-negative">Negative prompt (optional)</Label>
+            <Textarea
+              id="diffusion-negative"
+              value={negativePrompt}
+              onChange={(e) => setNegativePrompt(e.target.value)}
+              rows={2}
+            />
+          </div>
+
+          <div className="grid grid-cols-1 gap-3 sm:grid-cols-3">
+            <div className="flex flex-col gap-1">
+              <Label>Resolution</Label>
+              <Select
+                value={String(resolutionIdx)}
+                onValueChange={(v) => setResolutionIdx(Number(v))}
+              >
+                <SelectTrigger>
+                  <SelectValue />
+                </SelectTrigger>
+                <SelectContent>
+                  {RESOLUTION_PRESETS.map((r, idx) => (
+                    <SelectItem key={r.label} value={String(idx)}>
+                      {r.label}
+                    </SelectItem>
+                  ))}
+                </SelectContent>
+              </Select>
+            </div>
+            <div className="flex flex-col gap-1">
+              <Label>Steps: {steps}</Label>
+              <Slider
+                min={1}
+                max={60}
+                step={1}
+                value={[steps]}
+                onValueChange={(v) => setSteps(v[0] ?? steps)}
+              />
+            </div>
+            <div className="flex flex-col gap-1">
+              <Label>Guidance: {guidance.toFixed(1)}</Label>
+              <Slider
+                min={0}
+                max={15}
+                step={0.1}
+                value={[guidance]}
+                onValueChange={(v) => setGuidance(v[0] ?? guidance)}
+              />
+            </div>
+          </div>
+
+          <div className="flex flex-col gap-1">
+            <Label htmlFor="diffusion-seed">Seed (optional)</Label>
+            <Input
+              id="diffusion-seed"
+              value={seed}
+              onChange={(e) => setSeed(e.target.value)}
+              placeholder="leave empty for random"
+              inputMode="numeric"
+            />
+          </div>
+
+          <div>
+            <Button
+              size="lg"
+              onClick={handleGenerate}
+              disabled={busy !== "idle" || !status?.is_loaded}
+              data-testid="diffusion-generate"
+            >
+              {busy === "generating" ? <Spinner className="mr-2 size-4" /> : null}
+              Generate image
+            </Button>
+          </div>
+        </div>
+      </SectionCard>
+
+      {results.length > 0 && (
+        <SectionCard title="Results" description="Most recent first.">
+          <div className="grid grid-cols-1 gap-4 sm:grid-cols-2 lg:grid-cols-3">
+            {results.map((r, idx) => (
+              <figure key={idx} className="flex flex-col gap-2">
+                <img
+                  src={`data:${r.image_mime};base64,${r.image_b64}`}
+                  alt={`Generated image ${idx + 1}`}
+                  className="aspect-square w-full rounded-md border border-border object-cover"
+                  data-testid="diffusion-result-image"
+                />
+                <figcaption className="text-xs text-muted-foreground">
+                  {r.width}x{r.height} - {r.num_inference_steps} steps - g={r.guidance_scale.toFixed(1)}
+                  {r.seed !== null && r.seed !== undefined ? ` - seed ${r.seed}` : ""} -
+                  {` ${(r.duration_ms / 1000).toFixed(1)}s`}
+                </figcaption>
+              </figure>
+            ))}
+          </div>
+        </SectionCard>
+      )}
+    </div>
+  );
+}
diff --git a/studio/frontend/src/features/images/index.ts b/studio/frontend/src/features/images/index.ts
new file mode 100644
index 0000000000..8fed9b0963
--- /dev/null
+++ b/studio/frontend/src/features/images/index.ts
@@ -0,0 +1,5 @@
+// SPDX-License-Identifier: AGPL-3.0-only
+// Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
+
+export { ImagesPage } from "./images-page";
+export * from "./api";

From a08686cc46a6941cdb87ccae4790d7ff99217dcc Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 24 May 2026 14:33:49 +0000
Subject: [PATCH 02/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/diffusion.py     | 15 ++++++++++++---
 studio/backend/tests/test_diffusion_backend.py | 18 +++++++++++++++---
 2 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index c44f132142..dee46a3704 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -60,6 +60,7 @@
 # the user gave us a GGUF-only repo. The base_repo is documented to the
 # user via ``status()`` so they understand why a second download fires.
 
+
 @dataclass(frozen = True)
 class DiffusionFamily:
     name: str
@@ -117,7 +118,9 @@ class DiffusionFamily:
 )
 
 
-def detect_family(repo_id: str, *, override_family: Optional[str] = None) -> Optional[DiffusionFamily]:
+def detect_family(
+    repo_id: str, *, override_family: Optional[str] = None
+) -> Optional[DiffusionFamily]:
     """Return the diffusion family matching ``repo_id``.
 
     Matching is substring-based and case-insensitive. ``override_family``
@@ -217,7 +220,11 @@ def _pick_device_and_dtype(self) -> tuple[str, "Any"]:
 
         if torch.cuda.is_available():
             return "cuda", torch.bfloat16
-        if hasattr(torch, "backends") and getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
+        if (
+            hasattr(torch, "backends")
+            and getattr(torch.backends, "mps", None)
+            and torch.backends.mps.is_available()
+        ):
             return "mps", torch.float16
         return "cpu", torch.float32
 
@@ -401,7 +408,9 @@ def generate_image(
                 # Match the device of the pipeline so determinism holds
                 # across reload cycles. For CPU offload, the noise still
                 # has to live on the device the diffusion forward runs on.
-                gen_device = "cuda" if device == "cuda" and torch.cuda.is_available() else "cpu"
+                gen_device = (
+                    "cuda" if device == "cuda" and torch.cuda.is_available() else "cpu"
+                )
                 generator = torch.Generator(device = gen_device).manual_seed(int(seed))
 
             call_kwargs: dict[str, Any] = {
diff --git a/studio/backend/tests/test_diffusion_backend.py b/studio/backend/tests/test_diffusion_backend.py
index d70b4a2acb..642af72361 100644
--- a/studio/backend/tests/test_diffusion_backend.py
+++ b/studio/backend/tests/test_diffusion_backend.py
@@ -184,10 +184,17 @@ class _StubPipe:
         def __call__(self, **kwargs):
             if raises is not None:
                 raise raises
+
             class _Out:
                 pass
+
             o = _Out()
-            o.images = [returns or Image.new("RGB", (kwargs["width"], kwargs["height"]), color = (0, 255, 0))]
+            o.images = [
+                returns
+                or Image.new(
+                    "RGB", (kwargs["width"], kwargs["height"]), color = (0, 255, 0)
+                )
+            ]
             return o
 
     backend._pipe = _StubPipe()
@@ -293,8 +300,11 @@ def from_pretrained(cls, base_repo, **kwargs):
         def __call__(self, **kwargs):
             class _Out:
                 pass
+
             o = _Out()
-            o.images = [Image.new("RGB", (kwargs["width"], kwargs["height"]), color = (0, 0, 255))]
+            o.images = [
+                Image.new("RGB", (kwargs["width"], kwargs["height"]), color = (0, 0, 255))
+            ]
             return o
 
         def enable_model_cpu_offload(self):
@@ -320,7 +330,9 @@ def to(self, device):
 
     # Pretend HF Hub gave us a local file without actually fetching.
     fake_hub = types.ModuleType("huggingface_hub")
-    fake_hub.hf_hub_download = lambda repo_id, filename, token = None: f"/fake/{repo_id}/{filename}"
+    fake_hub.hf_hub_download = (
+        lambda repo_id, filename, token = None: f"/fake/{repo_id}/{filename}"
+    )
     monkeypatch.setitem(sys.modules, "huggingface_hub", fake_hub)
 
     # Force CPU dtype so the test does not need CUDA.

From f8504e3f3c0517b08947a4c461c3b4a4ebc1b115 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Sun, 24 May 2026 14:38:40 +0000
Subject: [PATCH 03/92] Studio: fix Images page SectionCard required icon prop

SectionCard requires an icon prop. Pass GpuIcon, PaintBrush02Icon,
and SparklesIcon for the three sections so tsc -b stops failing on
TS2741 'Property icon is missing'.
---
 studio/frontend/src/features/images/images-page.tsx | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/studio/frontend/src/features/images/images-page.tsx b/studio/frontend/src/features/images/images-page.tsx
index 3a408ba316..8928413313 100644
--- a/studio/frontend/src/features/images/images-page.tsx
+++ b/studio/frontend/src/features/images/images-page.tsx
@@ -16,6 +16,8 @@ import { Slider } from "@/components/ui/slider";
 import { Spinner } from "@/components/ui/spinner";
 import { Textarea } from "@/components/ui/textarea";
 import { toast } from "@/lib/toast";
+import { PaintBrush02Icon, SparklesIcon, GpuIcon } from "@hugeicons/core-free-icons";
+import { HugeiconsIcon } from "@hugeicons/react";
 import {
   fetchDiffusionStatus,
   generateDiffusionImage,
@@ -209,6 +211,7 @@ export function ImagesPage() {
   return (
     <div className="flex flex-1 flex-col gap-4 overflow-y-auto p-4 sm:p-6">
       <SectionCard
+        icon={<HugeiconsIcon icon={GpuIcon} className="size-5" strokeWidth={1.5} />}
         title="Local image generation"
         description={
           "Run diffusion GGUFs from Hugging Face on your own GPU. " +
@@ -309,6 +312,7 @@ export function ImagesPage() {
       </SectionCard>
 
       <SectionCard
+        icon={<HugeiconsIcon icon={PaintBrush02Icon} className="size-5" strokeWidth={1.5} />}
         title="Prompt"
         description="The pipeline runs on the GPU you launched Unsloth Studio on."
       >
@@ -400,7 +404,11 @@ export function ImagesPage() {
       </SectionCard>
 
       {results.length > 0 && (
-        <SectionCard title="Results" description="Most recent first.">
+        <SectionCard
+          icon={<HugeiconsIcon icon={SparklesIcon} className="size-5" strokeWidth={1.5} />}
+          title="Results"
+          description="Most recent first."
+        >
           <div className="grid grid-cols-1 gap-4 sm:grid-cols-2 lg:grid-cols-3">
             {results.map((r, idx) => (
               <figure key={idx} className="flex flex-col gap-2">

From bf5c4ac90be24bbe4f10e2df0e3f96405daf0df4 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Sun, 24 May 2026 23:40:50 +0000
Subject: [PATCH 04/92] Fix/adjust diffusion review findings for PR #5754

Backend
- Fix FLUX.2 klein family default base_repo: black-forest-labs/FLUX.2-klein
  does not exist on the Hub. Point at the Apache 2.0 4B Base instead so
  the from_pretrained call works out of the box for ungated users.
- Serialise concurrent load_model calls with a dedicated _load_lock so
  two /images/load requests cannot both reach pipeline_cls.from_pretrained
  at the same time (would double-spend VRAM and corrupt _pipe).
- When the caller passes a full diffusers repo (no gguf_filename),
  use repo_id directly instead of silently substituting the family
  default. Closes the load-the-wrong-model regression flagged by review.
- Drop negative_prompt from the pipeline call when the loaded pipeline
  does not accept it (FLUX.2 / FLUX.2 klein). Inspect __call__ via
  inspect.signature so we do not maintain a manual class list.
- Best-effort unload the chat backend (llama-server) before a diffusion
  load so a 24 GB consumer GPU can swap between chat and diffusion
  without manual unload steps.

Frontend
- Replace the four curated entries with the actual filenames published
  on the Hub (lowercase flux-2-klein-Nb-Q4_K_S.gguf and flux2-dev*).
- Add an explicit base_repo per curated entry so the backend never
  falls back to the family default for the curated picker.
- Add the Apache 2.0 FLUX.2 klein base 4B entry so first-time users
  have an ungated, no-token-required default.
- Hide the negative prompt field for FLUX.2 / FLUX.2 klein and show a
  small explanatory note instead.

Tests
- Add 6 new backend tests: base_repo override, full-repo (no GGUF)
  no-substitution, concurrent serialise race, signature-based kwarg
  filter, negative_prompt strip on FLUX.2, negative_prompt preserved
  on supporting pipelines. 33 tests passing.
---
 studio/backend/core/inference/diffusion.py    | 258 ++++++++++++------
 .../backend/tests/test_diffusion_backend.py   | 226 ++++++++++++++-
 studio/backend/tests/test_diffusion_routes.py |   2 +-
 .../src/features/images/images-page.tsx       |  97 +++++--
 4 files changed, 473 insertions(+), 110 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index dee46a3704..ea22828d68 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -73,11 +73,18 @@ class DiffusionFamily:
 
 
 _FAMILIES: tuple[DiffusionFamily, ...] = (
+    # The "9b" alias is checked first so a "flux-2-klein-9b" GGUF picks
+    # the 9B base instead of the 4B one when the user does not pass an
+    # explicit base_repo. Apache 2.0 is preferred as the auto-default for
+    # the 4B path because BFL's 9B base is gated.
     DiffusionFamily(
         name = "flux.2-klein",
         pipeline_class = "Flux2KleinPipeline",
         transformer_class = "Flux2Transformer2DModel",
-        base_repo = "black-forest-labs/FLUX.2-klein",
+        # Default for klein when no explicit base_repo: Apache-2.0 4B Base.
+        # The frontend curated picker always passes base_repo explicitly,
+        # so this default only fires for "custom HF repo" mode.
+        base_repo = "black-forest-labs/FLUX.2-klein-base-4B",
         aliases = ("flux2-klein", "flux-2-klein", "flux.2.klein"),
     ),
     DiffusionFamily(
@@ -111,7 +118,13 @@ class DiffusionFamily:
     DiffusionFamily(
         name = "stable-diffusion-xl",
         pipeline_class = "StableDiffusionXLPipeline",
-        transformer_class = "",  # SDXL uses a UNet, not a transformer
+        # SDXL uses a UNet, not a transformer. Loading SDXL GGUFs would
+        # require UNet2DConditionModel.from_single_file + GGUF, which is
+        # not the same code path as the FLUX / Qwen-Image transformers
+        # this PR ships. Until that path is wired and smoke-tested,
+        # treat SDXL as full-repo-only and surface a clear error when a
+        # user tries to pass gguf_filename for it.
+        transformer_class = "",
         base_repo = "stabilityai/stable-diffusion-xl-base-1.0",
         aliases = ("sdxl",),
     ),
@@ -171,7 +184,15 @@ class DiffusionBackend:
 
     def __init__(self) -> None:
         self._pipe: Any = None
+        # `_lock` protects mutations to the small state fields and the
+        # pipe call inside generate_image. `_load_lock` serialises the
+        # entire load_model call so two concurrent /images/load requests
+        # cannot both reach pipeline_cls.from_pretrained at the same
+        # time (which would double-spend VRAM and corrupt _pipe). The
+        # locks are taken in order load -> state so a generation in
+        # flight cannot deadlock the next load.
         self._lock = threading.Lock()
+        self._load_lock = threading.Lock()
         self._family: Optional[DiffusionFamily] = None
         self._repo_id: Optional[str] = None
         self._gguf_path: Optional[str] = None
@@ -269,86 +290,109 @@ def load_model(
 
         device, dtype = self._pick_device_and_dtype()
 
-        with self._lock:
-            self._loading = True
-            self._last_error = None
-        try:
-            pipeline_cls = getattr(diffusers, fam.pipeline_class, None)
-            if pipeline_cls is None:
-                raise RuntimeError(
-                    f"diffusers {diffusers.__version__} has no "
-                    f"{fam.pipeline_class}; upgrade diffusers and retry."
-                )
-            transformer_cls = (
-                getattr(diffusers, fam.transformer_class, None)
-                if fam.transformer_class
-                else None
-            )
-
-            effective_base = base_repo or fam.base_repo
-            logger.info(
-                "Loading diffusion model %s (family=%s, device=%s, dtype=%s, base=%s)",
-                repo_id,
-                fam.name,
-                device,
-                dtype,
-                effective_base,
-            )
-
-            transformer = None
-            local_gguf_path: Optional[str] = None
-            if gguf_filename:
-                if transformer_cls is None:
+        # _load_lock serialises the entire load so two concurrent calls
+        # cannot both kick off a multi-GB download + GPU upload at once.
+        # The second caller waits behind the first and then loads on top
+        # of the now-populated state via the normal swap path.
+        with self._load_lock:
+            with self._lock:
+                self._loading = True
+                self._last_error = None
+            try:
+                # Unload any chat model that is holding GPU memory so the
+                # diffusion load does not OOM on a < 24 GB GPU. Best
+                # effort: if the llama-cpp backend module is absent (eg
+                # tests, headless tooling) we just continue.
+                _release_chat_backend_for_diffusion()
+
+                pipeline_cls = getattr(diffusers, fam.pipeline_class, None)
+                if pipeline_cls is None:
                     raise RuntimeError(
-                        f"Family {fam.name} does not have a GGUF transformer "
-                        "path; load the full repo instead."
+                        f"diffusers {diffusers.__version__} has no "
+                        f"{fam.pipeline_class}; upgrade diffusers and retry."
                     )
-                local_gguf_path = hf_hub_download(
-                    repo_id = repo_id,
-                    filename = gguf_filename,
-                    token = hf_token,
-                )
-                quant_config = diffusers.GGUFQuantizationConfig(compute_dtype = dtype)
-                transformer = transformer_cls.from_single_file(
-                    local_gguf_path,
-                    quantization_config = quant_config,
-                    torch_dtype = dtype,
+                transformer_cls = (
+                    getattr(diffusers, fam.transformer_class, None)
+                    if fam.transformer_class
+                    else None
                 )
 
-            pipe_kwargs: dict[str, Any] = {"torch_dtype": dtype}
-            if transformer is not None:
-                pipe_kwargs["transformer"] = transformer
-            if hf_token:
-                pipe_kwargs["token"] = hf_token
+                # Resolution rules for the "what repo to call
+                # from_pretrained on" question:
+                #   1. caller-supplied base_repo wins
+                #   2. if no GGUF file was requested the user is loading a
+                #      full diffusers repo; use repo_id directly so we do
+                #      not silently substitute the family default
+                #   3. otherwise fall back to the family default
+                if base_repo:
+                    effective_base = base_repo
+                elif not gguf_filename:
+                    effective_base = repo_id
+                else:
+                    effective_base = fam.base_repo
+                logger.info(
+                    "Loading diffusion model %s (family=%s, device=%s, dtype=%s, base=%s)",
+                    repo_id,
+                    fam.name,
+                    device,
+                    dtype,
+                    effective_base,
+                )
 
-            pipe = pipeline_cls.from_pretrained(effective_base, **pipe_kwargs)
-            if enable_model_cpu_offload and device == "cuda":
-                pipe.enable_model_cpu_offload()
-            else:
-                pipe.to(device)
+                transformer = None
+                local_gguf_path: Optional[str] = None
+                if gguf_filename:
+                    if transformer_cls is None:
+                        raise RuntimeError(
+                            f"Family {fam.name} does not have a GGUF transformer "
+                            "path wired in this build; load the full repo instead."
+                        )
+                    local_gguf_path = hf_hub_download(
+                        repo_id = repo_id,
+                        filename = gguf_filename,
+                        token = hf_token,
+                    )
+                    quant_config = diffusers.GGUFQuantizationConfig(compute_dtype = dtype)
+                    transformer = transformer_cls.from_single_file(
+                        local_gguf_path,
+                        quantization_config = quant_config,
+                        torch_dtype = dtype,
+                    )
 
-            # Drop the old pipeline only after the new one is in place.
-            old = self._pipe
-            with self._lock:
-                self._pipe = pipe
-                self._family = fam
-                self._repo_id = repo_id
-                self._gguf_path = local_gguf_path
-                self._base_repo = effective_base
-                self._device = device
-                self._dtype = str(dtype).replace("torch.", "")
-                self._loaded_at = time.time()
-            _release(old)
-
-            return self.status()
-        except Exception as exc:
-            with self._lock:
-                self._last_error = str(exc)
-            logger.exception("Diffusion load failed for %s", repo_id)
-            raise RuntimeError(f"Failed to load diffusion model: {exc}") from exc
-        finally:
-            with self._lock:
-                self._loading = False
+                pipe_kwargs: dict[str, Any] = {"torch_dtype": dtype}
+                if transformer is not None:
+                    pipe_kwargs["transformer"] = transformer
+                if hf_token:
+                    pipe_kwargs["token"] = hf_token
+
+                pipe = pipeline_cls.from_pretrained(effective_base, **pipe_kwargs)
+                if enable_model_cpu_offload and device == "cuda":
+                    pipe.enable_model_cpu_offload()
+                else:
+                    pipe.to(device)
+
+                # Drop the old pipeline only after the new one is in place.
+                old = self._pipe
+                with self._lock:
+                    self._pipe = pipe
+                    self._family = fam
+                    self._repo_id = repo_id
+                    self._gguf_path = local_gguf_path
+                    self._base_repo = effective_base
+                    self._device = device
+                    self._dtype = str(dtype).replace("torch.", "")
+                    self._loaded_at = time.time()
+                _release(old)
+
+                return self.status()
+            except Exception as exc:
+                with self._lock:
+                    self._last_error = str(exc)
+                logger.exception("Diffusion load failed for %s", repo_id)
+                raise RuntimeError(f"Failed to load diffusion model: {exc}") from exc
+            finally:
+                with self._lock:
+                    self._loading = False
 
     def unload_model(self) -> dict[str, Any]:
         with self._lock:
@@ -420,8 +464,18 @@ def generate_image(
                 "width": int(width),
                 "height": int(height),
             }
+            # FLUX.2 / FLUX.2 klein pipelines do NOT accept
+            # negative_prompt and 500 if you pass it in. Inspect the
+            # signature and only forward when supported; warn otherwise
+            # so the UI can disable the field for incompatible families.
             if negative_prompt is not None and negative_prompt.strip():
-                call_kwargs["negative_prompt"] = negative_prompt
+                if _pipe_accepts_kwarg(pipe, "negative_prompt"):
+                    call_kwargs["negative_prompt"] = negative_prompt
+                else:
+                    logger.info(
+                        "Dropping negative_prompt: %s does not accept it",
+                        type(pipe).__name__,
+                    )
             if generator is not None:
                 call_kwargs["generator"] = generator
 
@@ -432,6 +486,26 @@ def generate_image(
             return images[0]
 
 
+def _pipe_accepts_kwarg(pipe: Any, name: str) -> bool:
+    """True if ``pipe.__call__`` advertises a kwarg called ``name``.
+
+    Cheap inspect-based probe so we do not have to maintain a manual
+    list of which pipeline classes accept negative_prompt. Returns
+    False on any introspection error so callers stay on the safe path.
+    """
+    import inspect
+
+    try:
+        sig = inspect.signature(pipe.__call__)
+    except (TypeError, ValueError):
+        return False
+    if name in sig.parameters:
+        return True
+    return any(
+        p.kind is inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values()
+    )
+
+
 def encode_png_base64(pil_image: "Any") -> str:
     """Encode a PIL image to base64-encoded PNG."""
     import base64
@@ -444,6 +518,36 @@ def encode_png_base64(pil_image: "Any") -> str:
 # ─── Helpers ──────────────────────────────────────────────────────────
 
 
+def _release_chat_backend_for_diffusion() -> None:
+    """Unload any running chat backend before a diffusion load.
+
+    Diffusion pipelines on FLUX-class models can eat 12-24 GB of VRAM,
+    and llama-server typically holds onto its loaded GGUF until told to
+    drop it. Asking the chat backend to release its weights first means
+    a typical 24 GB consumer GPU can host one chat model OR one
+    diffusion model without manual unload steps.
+
+    Best effort: if the chat backend module is not importable (CI,
+    isolated tests, custom builds) we silently continue. Failures
+    inside the unload itself are logged but not propagated; the
+    diffusion load can still try and surface its own OOM.
+    """
+    try:
+        from routes.inference import get_llama_cpp_backend  # type: ignore
+    except Exception:
+        return
+    try:
+        backend = get_llama_cpp_backend()
+    except Exception:
+        return
+    try:
+        if getattr(backend, "is_loaded", False):
+            logger.info("Unloading llama-server before diffusion load")
+            backend.unload_model()
+    except Exception as exc:
+        logger.warning("Could not unload chat backend before diffusion: %s", exc)
+
+
 def _release(obj: Any) -> None:
     """Best-effort GPU-memory release for a pipeline being swapped out."""
     if obj is None:
diff --git a/studio/backend/tests/test_diffusion_backend.py b/studio/backend/tests/test_diffusion_backend.py
index 642af72361..67deed9c5e 100644
--- a/studio/backend/tests/test_diffusion_backend.py
+++ b/studio/backend/tests/test_diffusion_backend.py
@@ -53,6 +53,11 @@ def test_detect_family_flux2_klein():
     assert fam.name == "flux.2-klein"
     assert fam.pipeline_class == "Flux2KleinPipeline"
     assert fam.transformer_class == "Flux2Transformer2DModel"
+    # Family default base must point to a real Hub repo (not the bare
+    # "FLUX.2-klein" slug that does not exist). The frontend curated
+    # picker still passes base_repo explicitly per size so this default
+    # only fires for the "custom HF repo" mode.
+    assert fam.base_repo == "black-forest-labs/FLUX.2-klein-base-4B"
 
 
 def test_detect_family_flux2_dev_is_not_klein():
@@ -363,14 +368,14 @@ def test_load_model_gguf_path_happy(monkeypatch):
     backend = get_diffusion_backend()
     status = backend.load_model(
         "unsloth/FLUX.2-klein-4B-GGUF",
-        gguf_filename = "FLUX.2-klein-4B-Q4_K_S.gguf",
+        gguf_filename = "flux-2-klein-4b-Q4_K_S.gguf",
     )
     assert status["is_loaded"] is True
     assert status["family"] == "flux.2-klein"
     assert status["pipeline_class"] == "Flux2KleinPipeline"
-    assert status["base_repo"] == "black-forest-labs/FLUX.2-klein"
+    assert status["base_repo"] == "black-forest-labs/FLUX.2-klein-base-4B"
     assert status["gguf_path"] == (
-        "/fake/unsloth/FLUX.2-klein-4B-GGUF/FLUX.2-klein-4B-Q4_K_S.gguf"
+        "/fake/unsloth/FLUX.2-klein-4B-GGUF/flux-2-klein-4b-Q4_K_S.gguf"
     )
 
 
@@ -397,12 +402,223 @@ def test_load_model_swap_drops_previous(monkeypatch):
     backend = get_diffusion_backend()
     backend.load_model(
         "unsloth/FLUX.2-klein-4B-GGUF",
-        gguf_filename = "FLUX.2-klein-4B-Q4_K_S.gguf",
+        gguf_filename = "flux-2-klein-4b-Q4_K_S.gguf",
     )
     first_pipe = backend._pipe
     backend.load_model(
         "unsloth/FLUX.2-dev-GGUF",
-        gguf_filename = "FLUX.2-dev-Q4_K_S.gguf",
+        gguf_filename = "flux2-dev-Q4_K_S.gguf",
     )
     assert backend._pipe is not first_pipe
     assert backend.status()["family"] == "flux.2"
+
+
+def test_load_model_base_repo_override(monkeypatch):
+    _install_fake_diffusers(monkeypatch)
+    from core.inference.diffusion import get_diffusion_backend
+
+    backend = get_diffusion_backend()
+    status = backend.load_model(
+        "unsloth/FLUX.2-klein-9B-GGUF",
+        gguf_filename = "flux-2-klein-9b-Q4_K_S.gguf",
+        base_repo = "black-forest-labs/FLUX.2-klein-base-9B",
+    )
+    assert status["base_repo"] == "black-forest-labs/FLUX.2-klein-base-9B"
+
+
+def test_load_model_full_repo_does_not_substitute(monkeypatch):
+    """A full diffusers repo (no gguf_filename) must call from_pretrained
+    with the user-supplied repo, not the family default. This was the
+    silent-substitution bug surfaced by review."""
+    fake = _install_fake_diffusers(monkeypatch)
+    from core.inference.diffusion import get_diffusion_backend
+
+    backend = get_diffusion_backend()
+    status = backend.load_model(
+        "owner/FLUX.1-finetune-diffusers",
+        family_override = "flux.1",
+    )
+    # base_repo must echo the user repo, not the family default.
+    assert status["base_repo"] == "owner/FLUX.1-finetune-diffusers"
+    assert status["repo_id"] == "owner/FLUX.1-finetune-diffusers"
+    # And the fake pipeline records what we called from_pretrained with.
+    assert backend._pipe.base_repo == "owner/FLUX.1-finetune-diffusers"
+
+
+def test_load_model_concurrent_serialises(monkeypatch):
+    """Two concurrent load_model() calls must NOT both reach
+    pipeline_cls.from_pretrained at the same time (race fix)."""
+    _install_fake_diffusers(monkeypatch)
+    from core.inference.diffusion import get_diffusion_backend
+    import threading
+    import time as _t
+
+    backend = get_diffusion_backend()
+    active = {"n": 0, "max": 0}
+    lock = threading.Lock()
+
+    import sys as _sys
+
+    fake_pipeline_cls = _sys.modules["diffusers"].Flux2KleinPipeline
+    original_from_pretrained = fake_pipeline_cls.from_pretrained.__func__
+
+    def _instrumented_from_pretrained(cls, base_repo, **kwargs):
+        with lock:
+            active["n"] += 1
+            active["max"] = max(active["max"], active["n"])
+        try:
+            _t.sleep(0.1)
+            return original_from_pretrained(cls, base_repo, **kwargs)
+        finally:
+            with lock:
+                active["n"] -= 1
+
+    fake_pipeline_cls.from_pretrained = classmethod(_instrumented_from_pretrained)
+
+    errors: list = []
+
+    def _do_load():
+        try:
+            backend.load_model(
+                "unsloth/FLUX.2-klein-base-4B-GGUF",
+                gguf_filename = "flux-2-klein-base-4b-Q4_K_S.gguf",
+            )
+        except Exception as e:
+            errors.append(e)
+
+    threads = [threading.Thread(target = _do_load) for _ in range(3)]
+    for t in threads:
+        t.start()
+    for t in threads:
+        t.join()
+
+    assert not errors, errors
+    assert active["max"] == 1, (
+        f"Expected concurrent loads to serialise; max_active={active['max']}"
+    )
+
+
+def test_pipe_accepts_kwarg_filter():
+    """The negative_prompt filter must drop the kwarg on classes that
+    do not accept it (FLUX.2 / FLUX.2 klein) and keep it on the rest."""
+    from core.inference.diffusion import _pipe_accepts_kwarg
+
+    class _NoNeg:
+        def __call__(self, *, prompt, num_inference_steps, guidance_scale, width, height):
+            pass
+
+    class _Neg:
+        def __call__(
+            self,
+            *,
+            prompt,
+            negative_prompt = None,
+            num_inference_steps,
+            guidance_scale,
+            width,
+            height,
+        ):
+            pass
+
+    class _VarKw:
+        def __call__(self, **kw):
+            pass
+
+    assert _pipe_accepts_kwarg(_NoNeg(), "negative_prompt") is False
+    assert _pipe_accepts_kwarg(_Neg(), "negative_prompt") is True
+    # Anything with **kwargs is assumed to accept the kwarg (the
+    # alternative is to silently drop legitimate params).
+    assert _pipe_accepts_kwarg(_VarKw(), "negative_prompt") is True
+
+
+def test_generate_image_strips_negative_prompt_on_flux2(monkeypatch):
+    """generate_image must drop negative_prompt when the loaded pipeline
+    does not accept it; otherwise FLUX.2 would 500 on a user-visible
+    field."""
+    import core.inference.diffusion as d
+    from PIL import Image
+
+    backend = d.get_diffusion_backend()
+
+    received: dict = {}
+
+    class _Flux2LikePipe:
+        # Signature mirrors Flux2Pipeline.__call__: NO negative_prompt.
+        # No **kw either, since the real FLUX.2 pipeline does not accept
+        # arbitrary kwargs (passing negative_prompt to it raises TypeError).
+        def __call__(
+            self,
+            *,
+            prompt,
+            num_inference_steps,
+            guidance_scale,
+            width,
+            height,
+            generator = None,
+        ):
+            received["prompt"] = prompt
+            class _Out:
+                pass
+            o = _Out()
+            o.images = [Image.new("RGB", (width, height), (1, 2, 3))]
+            return o
+
+    backend._pipe = _Flux2LikePipe()
+    backend._device = "cpu"
+    backend._family = d._FAMILIES[0]
+    backend._repo_id = "stub/stub"
+
+    # If generate_image forwarded negative_prompt, the pipeline call
+    # would raise TypeError. The PR's filter drops it, so the call
+    # succeeds and we observe the prompt was still delivered.
+    backend.generate_image(
+        prompt = "a sloth",
+        negative_prompt = "blurry, low quality",
+        num_inference_steps = 4,
+        guidance_scale = 1.0,
+        width = 256,
+        height = 256,
+    )
+    assert received["prompt"] == "a sloth"
+
+
+def test_generate_image_keeps_negative_prompt_on_supporting_pipe(monkeypatch):
+    import core.inference.diffusion as d
+    from PIL import Image
+
+    backend = d.get_diffusion_backend()
+    captured: dict = {}
+
+    class _NegOK:
+        def __call__(
+            self,
+            *,
+            prompt,
+            negative_prompt = None,
+            num_inference_steps,
+            guidance_scale,
+            width,
+            height,
+            **kw,
+        ):
+            captured["negative_prompt"] = negative_prompt
+            class _Out:
+                pass
+            o = _Out()
+            o.images = [Image.new("RGB", (width, height), (4, 5, 6))]
+            return o
+
+    backend._pipe = _NegOK()
+    backend._device = "cpu"
+    backend._family = d._FAMILIES[2]  # flux.1 supports negative_prompt
+    backend._repo_id = "stub/stub"
+
+    backend.generate_image(
+        prompt = "a sloth",
+        negative_prompt = "blurry",
+        num_inference_steps = 4,
+        guidance_scale = 1.0,
+        width = 256,
+        height = 256,
+    )
+    assert captured["negative_prompt"] == "blurry"
diff --git a/studio/backend/tests/test_diffusion_routes.py b/studio/backend/tests/test_diffusion_routes.py
index 9b9063f0b1..af759ab234 100644
--- a/studio/backend/tests/test_diffusion_routes.py
+++ b/studio/backend/tests/test_diffusion_routes.py
@@ -118,7 +118,7 @@ def test_load_then_generate_round_trip(app_with_stub):
         "/api/inference/images/load",
         json = {
             "repo_id": "unsloth/FLUX.2-klein-4B-GGUF",
-            "gguf_filename": "FLUX.2-klein-4B-Q4_K_S.gguf",
+            "gguf_filename": "flux-2-klein-4b-Q4_K_S.gguf",
         },
     )
     assert r.status_code == 200, r.text
diff --git a/studio/frontend/src/features/images/images-page.tsx b/studio/frontend/src/features/images/images-page.tsx
index 8928413313..a422b59bbc 100644
--- a/studio/frontend/src/features/images/images-page.tsx
+++ b/studio/frontend/src/features/images/images-page.tsx
@@ -28,47 +28,64 @@ import {
 } from "./api";
 import { useCallback, useEffect, useMemo, useRef, useState } from "react";
 
-// Curated short list of working unsloth/* diffusion GGUFs. Picked to
-// span size + license so any GPU class has at least one viable option:
-//   FLUX.2 klein 4B  -> ~10-12 GB VRAM with Q4_K_S, Apache 2.0
-//   FLUX.2 klein 9B  -> ~16-18 GB VRAM, FLUX [klein] non-commercial
-//   FLUX.2 dev       -> ~24+ GB VRAM, FLUX [dev] non-commercial
-// The CLI on the backend can load anything supported by detect_family();
-// this list just keeps the picker compact for the v1 UI.
+// Curated short list of working diffusion GGUFs. Picked to span
+// size + license so any GPU class has at least one viable option:
+//   FLUX.2 klein 4B  -> ~13 GB VRAM with Q4_K_S, Apache 2.0
+//   FLUX.2 klein 9B  -> ~17 GB VRAM, FLUX [klein] non-commercial (gated)
+//   FLUX.2 dev       -> ~24+ GB VRAM, FLUX [dev] non-commercial (gated)
+//   FLUX.1 dev       -> ~12 GB VRAM, older but widely tested (gated)
+//
+// Filenames mirror the Hub canonical case (lowercase 'flux-2-klein-4b')
+// and base_repo is set explicitly so the backend never falls back to the
+// family default. The CLI on the backend can load anything supported by
+// detect_family(); this list just keeps the picker compact for the v1 UI.
 const CURATED_MODELS: Array<{
   label: string;
   repo_id: string;
   default_gguf: string;
+  base_repo: string;
   family: string;
   notes: string;
 }> = [
   {
-    label: "FLUX.2 klein 4B (Q4_K_S, Apache 2.0)",
+    label: "FLUX.2 klein base 4B (Q4_K_S, Apache 2.0)",
+    repo_id: "unsloth/FLUX.2-klein-base-4B-GGUF",
+    default_gguf: "flux-2-klein-base-4b-Q4_K_S.gguf",
+    base_repo: "black-forest-labs/FLUX.2-klein-base-4B",
+    family: "flux.2-klein",
+    notes: "13 GB VRAM, fastest. Apache 2.0, ungated.",
+  },
+  {
+    label: "FLUX.2 klein 4B (Q4_K_S, distilled)",
     repo_id: "unsloth/FLUX.2-klein-4B-GGUF",
-    default_gguf: "FLUX.2-klein-4B-Q4_K_S.gguf",
+    default_gguf: "flux-2-klein-4b-Q4_K_S.gguf",
+    base_repo: "black-forest-labs/FLUX.2-klein-base-4B",
     family: "flux.2-klein",
-    notes: "13 GB VRAM, fastest. Apache 2.0.",
+    notes: "13 GB VRAM. Distilled klein 4B with the Apache base.",
   },
   {
-    label: "FLUX.2 klein 9B (Q4_K_S)",
+    label: "FLUX.2 klein 9B (Q4_K_S, gated)",
     repo_id: "unsloth/FLUX.2-klein-9B-GGUF",
-    default_gguf: "FLUX.2-klein-9B-Q4_K_S.gguf",
+    default_gguf: "flux-2-klein-9b-Q4_K_S.gguf",
+    base_repo: "black-forest-labs/FLUX.2-klein-base-9B",
     family: "flux.2-klein",
-    notes: "17 GB VRAM, higher quality.",
+    notes: "17 GB VRAM. Higher quality. Requires HF access to FLUX.2 klein base 9B.",
   },
   {
-    label: "FLUX.2 dev (Q4_K_S)",
+    label: "FLUX.2 dev (Q4_K_S, gated)",
     repo_id: "unsloth/FLUX.2-dev-GGUF",
-    default_gguf: "FLUX.2-dev-Q4_K_S.gguf",
+    default_gguf: "flux2-dev-Q4_K_S.gguf",
+    base_repo: "black-forest-labs/FLUX.2-dev",
     family: "flux.2",
-    notes: "24+ GB VRAM, best for prompt following.",
+    notes: "24+ GB VRAM. Requires HF access to FLUX.2 dev.",
   },
   {
-    label: "FLUX.1 dev (Q4_K_S, city96)",
+    label: "FLUX.1 dev (Q4_K_S, city96, gated)",
     repo_id: "city96/FLUX.1-dev-gguf",
     default_gguf: "flux1-dev-Q4_K_S.gguf",
+    base_repo: "black-forest-labs/FLUX.1-dev",
     family: "flux.1",
-    notes: "12 GB VRAM, older but well tested.",
+    notes: "12 GB VRAM. Older but widely tested. Requires HF access to FLUX.1 dev.",
   },
 ];
 
@@ -132,6 +149,11 @@ export function ImagesPage() {
       const repo = useCustom ? customRepoId.trim() : preset.repo_id;
       const gguf = useCustom ? customGguf.trim() || undefined : preset.default_gguf;
       const family = useCustom ? undefined : preset.family;
+      // Always pass base_repo for curated entries; custom-repo mode
+      // lets the backend either infer it from the family default or
+      // (when no GGUF is given) treat the repo as a full diffusers
+      // checkpoint and call from_pretrained on it directly.
+      const baseRepo = useCustom ? undefined : preset.base_repo;
       if (!repo) {
         toast.error("Pick a model first");
         return;
@@ -139,6 +161,7 @@ export function ImagesPage() {
       const next = await loadDiffusionModel({
         repo_id: repo,
         gguf_filename: gguf,
+        base_repo: baseRepo,
         family,
         hf_token: hfToken.trim() || undefined,
       });
@@ -208,6 +231,19 @@ export function ImagesPage() {
     return "Not loaded";
   }, [status, refreshingStatus]);
 
+  // FLUX.2 / FLUX.2 klein pipelines do NOT accept negative_prompt and
+  // would 500 if we sent one through. The backend strips the field
+  // defensively but hiding it client-side keeps the UI honest.
+  const supportsNegativePrompt = useMemo(() => {
+    const family = status?.family;
+    if (!family) {
+      const candidate = useCustom ? undefined : preset.family;
+      if (!candidate) return true;
+      return !candidate.startsWith("flux.2");
+    }
+    return !family.startsWith("flux.2");
+  }, [status, useCustom, preset.family]);
+
   return (
     <div className="flex flex-1 flex-col gap-4 overflow-y-auto p-4 sm:p-6">
       <SectionCard
@@ -327,15 +363,22 @@ export function ImagesPage() {
               data-testid="diffusion-prompt"
             />
           </div>
-          <div className="flex flex-col gap-1">
-            <Label htmlFor="diffusion-negative">Negative prompt (optional)</Label>
-            <Textarea
-              id="diffusion-negative"
-              value={negativePrompt}
-              onChange={(e) => setNegativePrompt(e.target.value)}
-              rows={2}
-            />
-          </div>
+          {supportsNegativePrompt ? (
+            <div className="flex flex-col gap-1">
+              <Label htmlFor="diffusion-negative">Negative prompt (optional)</Label>
+              <Textarea
+                id="diffusion-negative"
+                value={negativePrompt}
+                onChange={(e) => setNegativePrompt(e.target.value)}
+                rows={2}
+              />
+            </div>
+          ) : (
+            <p className="text-xs text-muted-foreground">
+              {"FLUX.2 and FLUX.2 klein do not accept a negative prompt. "}
+              {"Steer the output via the main prompt instead."}
+            </p>
+          )}
 
           <div className="grid grid-cols-1 gap-3 sm:grid-cols-3">
             <div className="flex flex-col gap-1">

From 669964f52cecfdd601a73332ba53c411d9d11804 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 24 May 2026 23:41:07 +0000
Subject: [PATCH 05/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/diffusion.py     |  4 +---
 studio/backend/tests/test_diffusion_backend.py | 14 ++++++++++----
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index ea22828d68..3a2cc8d503 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -501,9 +501,7 @@ def _pipe_accepts_kwarg(pipe: Any, name: str) -> bool:
         return False
     if name in sig.parameters:
         return True
-    return any(
-        p.kind is inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values()
-    )
+    return any(p.kind is inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values())
 
 
 def encode_png_base64(pil_image: "Any") -> str:
diff --git a/studio/backend/tests/test_diffusion_backend.py b/studio/backend/tests/test_diffusion_backend.py
index 67deed9c5e..1a6a8de1c3 100644
--- a/studio/backend/tests/test_diffusion_backend.py
+++ b/studio/backend/tests/test_diffusion_backend.py
@@ -493,9 +493,9 @@ def _do_load():
         t.join()
 
     assert not errors, errors
-    assert active["max"] == 1, (
-        f"Expected concurrent loads to serialise; max_active={active['max']}"
-    )
+    assert (
+        active["max"] == 1
+    ), f"Expected concurrent loads to serialise; max_active={active['max']}"
 
 
 def test_pipe_accepts_kwarg_filter():
@@ -504,7 +504,9 @@ def test_pipe_accepts_kwarg_filter():
     from core.inference.diffusion import _pipe_accepts_kwarg
 
     class _NoNeg:
-        def __call__(self, *, prompt, num_inference_steps, guidance_scale, width, height):
+        def __call__(
+            self, *, prompt, num_inference_steps, guidance_scale, width, height
+        ):
             pass
 
     class _Neg:
@@ -557,8 +559,10 @@ def __call__(
             generator = None,
         ):
             received["prompt"] = prompt
+
             class _Out:
                 pass
+
             o = _Out()
             o.images = [Image.new("RGB", (width, height), (1, 2, 3))]
             return o
@@ -602,8 +606,10 @@ def __call__(
             **kw,
         ):
             captured["negative_prompt"] = negative_prompt
+
             class _Out:
                 pass
+
             o = _Out()
             o.images = [Image.new("RGB", (width, height), (4, 5, 6))]
             return o

From d6f2a238aa9ea028c88ae2bcf8e782ef78f61e5d Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Sun, 24 May 2026 23:48:51 +0000
Subject: [PATCH 06/92] Fix/adjust diffusion lifecycle + UI for PR #5754

- unload_model now takes _load_lock so it cannot race with an in-flight
  load_model and have the load thread overwrite cleared state after
  unload returned is_loaded=false.
- Move stable-diffusion-xl out of _FAMILIES into _FULL_REPO_FAMILIES.
  SDXL uses a UNet (no transformer GGUF path is wired); listing it in
  the GGUF families panel was misleading. SDXL full-repo loads still
  work via family_override='stable-diffusion-xl'.
- Result gallery now uses h-auto + object-contain so portrait /
  landscape outputs render at their true aspect ratio instead of
  being cropped into a square thumbnail.
---
 studio/backend/core/inference/diffusion.py    | 57 ++++++++++++-------
 .../src/features/images/images-page.tsx       |  5 +-
 2 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 3a2cc8d503..9e0ee871fd 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -115,15 +115,25 @@ class DiffusionFamily:
         base_repo = "stabilityai/stable-diffusion-3-medium-diffusers",
         aliases = ("sd3-medium", "stable-diffusion-3-medium", "sd3.5"),
     ),
+    # SDXL: full diffusers path only (no GGUF). SDXL uses a UNet (not a
+    # transformer) and wiring UNet2DConditionModel.from_single_file +
+    # GGUF is a separate code path the rest of this module does not
+    # exercise. The family is intentionally NOT in _FAMILIES so the
+    # frontend status panel does not advertise GGUF support we do not
+    # implement; callers wanting SDXL full repos can still do so by
+    # passing the diffusers repo with no gguf_filename and
+    # family_override = "stable-diffusion-xl" via the route, which uses
+    # the lookup in _FULL_REPO_FAMILIES.
+)
+
+
+# Families available via family_override on the routes layer when the
+# user is loading a full diffusers checkpoint (no GGUF). Kept separate
+# from _FAMILIES so the GGUF-only status panel does not over-advertise.
+_FULL_REPO_FAMILIES: tuple[DiffusionFamily, ...] = (
     DiffusionFamily(
         name = "stable-diffusion-xl",
         pipeline_class = "StableDiffusionXLPipeline",
-        # SDXL uses a UNet, not a transformer. Loading SDXL GGUFs would
-        # require UNet2DConditionModel.from_single_file + GGUF, which is
-        # not the same code path as the FLUX / Qwen-Image transformers
-        # this PR ships. Until that path is wired and smoke-tested,
-        # treat SDXL as full-repo-only and surface a clear error when a
-        # user tries to pass gguf_filename for it.
         transformer_class = "",
         base_repo = "stabilityai/stable-diffusion-xl-base-1.0",
         aliases = ("sdxl",),
@@ -137,13 +147,14 @@ def detect_family(
     """Return the diffusion family matching ``repo_id``.
 
     Matching is substring-based and case-insensitive. ``override_family``
-    bypasses substring matching and looks up by ``DiffusionFamily.name``.
-    Returns ``None`` when no family applies so callers can surface a clear
-    "unsupported model" error rather than guessing wrong.
+    bypasses substring matching and looks up by ``DiffusionFamily.name``
+    or (when explicitly asked) by ``_FULL_REPO_FAMILIES.name``.
+    Returns ``None`` when no family applies so callers can surface a
+    clear "unsupported model" error rather than guessing wrong.
     """
     if override_family:
         wanted = override_family.strip().lower()
-        for fam in _FAMILIES:
+        for fam in _FAMILIES + _FULL_REPO_FAMILIES:
             if fam.name == wanted:
                 return fam
         return None
@@ -395,17 +406,21 @@ def load_model(
                     self._loading = False
 
     def unload_model(self) -> dict[str, Any]:
-        with self._lock:
-            old = self._pipe
-            self._pipe = None
-            self._family = None
-            self._repo_id = None
-            self._gguf_path = None
-            self._base_repo = None
-            self._device = None
-            self._dtype = None
-            self._loaded_at = None
-        _release(old)
+        # Take the load lock too so unload cannot race with an in-flight
+        # load_model and have the load thread overwrite the cleared state
+        # after we already returned {"is_loaded": false}.
+        with self._load_lock:
+            with self._lock:
+                old = self._pipe
+                self._pipe = None
+                self._family = None
+                self._repo_id = None
+                self._gguf_path = None
+                self._base_repo = None
+                self._device = None
+                self._dtype = None
+                self._loaded_at = None
+            _release(old)
         return {"is_loaded": False}
 
     # ── generation ────────────────────────────────────────────────
diff --git a/studio/frontend/src/features/images/images-page.tsx b/studio/frontend/src/features/images/images-page.tsx
index a422b59bbc..abca68c0f2 100644
--- a/studio/frontend/src/features/images/images-page.tsx
+++ b/studio/frontend/src/features/images/images-page.tsx
@@ -458,7 +458,10 @@ export function ImagesPage() {
                 <img
                   src={`data:${r.image_mime};base64,${r.image_b64}`}
                   alt={`Generated image ${idx + 1}`}
-                  className="aspect-square w-full rounded-md border border-border object-cover"
+                  // h-auto + object-contain so portrait / landscape
+                  // outputs render at their true aspect ratio instead
+                  // of being cropped into a square thumbnail.
+                  className="h-auto w-full rounded-md border border-border object-contain"
                   data-testid="diffusion-result-image"
                 />
                 <figcaption className="text-xs text-muted-foreground">

From faa6822039b34d02a0df271500ea548b14eb0e9a Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Sun, 24 May 2026 23:58:02 +0000
Subject: [PATCH 07/92] Fix/adjust diffusion symmetric chat handoff for PR
 #5754

_release_chat_backend_for_diffusion now unloads both the GGUF
chat backend (llama-server) and the safetensors / HF chat backend
(get_inference_backend) before a diffusion load. Mirror the
behaviour on the chat-load side: both the Unsloth/transformers
load path and the GGUF load path now unload the diffusion pipeline
before claiming GPU memory. Closes the OOM-on-swap path flagged
by reviewers in both directions.
---
 studio/backend/core/inference/diffusion.py | 42 +++++++++++++---------
 studio/backend/routes/inference.py         | 27 ++++++++++++++
 2 files changed, 53 insertions(+), 16 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 9e0ee871fd..bc7d20ad0b 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -535,30 +535,40 @@ def _release_chat_backend_for_diffusion() -> None:
     """Unload any running chat backend before a diffusion load.
 
     Diffusion pipelines on FLUX-class models can eat 12-24 GB of VRAM,
-    and llama-server typically holds onto its loaded GGUF until told to
-    drop it. Asking the chat backend to release its weights first means
-    a typical 24 GB consumer GPU can host one chat model OR one
-    diffusion model without manual unload steps.
-
-    Best effort: if the chat backend module is not importable (CI,
-    isolated tests, custom builds) we silently continue. Failures
-    inside the unload itself are logged but not propagated; the
-    diffusion load can still try and surface its own OOM.
+    and the chat backends (llama-server for GGUF, the safetensors
+    Inference orchestrator for HF / Unsloth) typically hold onto their
+    loaded weights until told to drop them. Asking both to release
+    their weights first means a typical 24 GB consumer GPU can host
+    one chat model OR one diffusion model without manual unload steps.
+
+    Best effort: if a chat backend module is not importable (CI,
+    isolated tests, custom builds) or fails on the unload, we log and
+    continue; the diffusion load can still try and surface its own OOM.
     """
+    # 1. GGUF chat backend (llama-server subprocess).
     try:
         from routes.inference import get_llama_cpp_backend  # type: ignore
-    except Exception:
-        return
-    try:
+
         backend = get_llama_cpp_backend()
-    except Exception:
-        return
-    try:
         if getattr(backend, "is_loaded", False):
             logger.info("Unloading llama-server before diffusion load")
             backend.unload_model()
     except Exception as exc:
-        logger.warning("Could not unload chat backend before diffusion: %s", exc)
+        logger.debug("llama-server unload skipped: %s", exc)
+
+    # 2. Safetensors / HF chat backend (the Inference orchestrator that
+    #    serves FastVisionModel / FastLanguageModel weights). When this
+    #    backend has a model resident on the same GPU, a diffusion load
+    #    will OOM the same way.
+    try:
+        from core.inference.inference import get_inference_backend  # type: ignore
+
+        backend = get_inference_backend()
+        if getattr(backend, "active_model_name", None):
+            logger.info("Unloading safetensors chat backend before diffusion load")
+            backend.unload_model()
+    except Exception as exc:
+        logger.debug("safetensors unload skipped: %s", exc)
 
 
 def _release(obj: Any) -> None:
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index fc9cbf9f88..270f42bded 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -747,6 +747,19 @@ async def load_model(
                 )
                 unsloth_backend.unload_model(unsloth_backend.active_model_name)
 
+            # Symmetric with /images/load: drop any active diffusion
+            # pipeline so the GGUF chat load does not race the FLUX VAE
+            # for VRAM. Best effort; silently continue on failure.
+            try:
+                from core.inference.diffusion import get_diffusion_backend
+
+                diff_backend = get_diffusion_backend()
+                if diff_backend.is_loaded:
+                    logger.info("Unloading diffusion pipeline before GGUF load")
+                    diff_backend.unload_model()
+            except Exception as e:
+                logger.debug("diffusion unload skipped (GGUF path): %s", e)
+
             # Inherit llama_extra_args from the previous load when the
             # request omits the field (the chat-settings Apply path
             # does not round-trip them; explicit [] still clears).
@@ -923,6 +936,20 @@ async def load_model(
             logger.info("Unloading GGUF model before loading Unsloth model")
             llama_backend.unload_model()
 
+        # Unload any active diffusion pipeline so the new chat model is
+        # not racing the FLUX VAE for VRAM on a 16-24 GB card.
+        try:
+            from core.inference.diffusion import get_diffusion_backend
+
+            diff_backend = get_diffusion_backend()
+            if diff_backend.is_loaded:
+                logger.info(
+                    "Unloading diffusion pipeline before loading Unsloth chat model"
+                )
+                diff_backend.unload_model()
+        except Exception as e:
+            logger.debug("diffusion unload skipped: %s", e)
+
         # Shut down any export subprocess to free VRAM
         try:
             from core.export import get_export_backend

From 8074a2b67b7785be3d73c17b8b361712632e4152 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 00:04:12 +0000
Subject: [PATCH 08/92] Fix/adjust diffusion: smart base, safetensors, peak
 VRAM, GGUF guard

- _smart_base_repo: pick 9B base for unsloth/FLUX.2-klein-9B-GGUF
  and -base- variants per the repo id, instead of always falling
  back to the 4B family default.
- pipe_kwargs use_safetensors=True so diffusers refuses pickle .bin
  weights at load time (defends against compromised base_repo).
- Release the previous pipeline BEFORE allocating the new one so
  peak VRAM stays at one model's worth instead of two on swap.
- Reject empty gguf_filename when repo_id ends with -GGUF; the prior
  behavior tried from_pretrained on a GGUF-only repo and 500'd deep
  in diffusers with a confusing model-index error.
- Status returns gguf_filename (basename) instead of gguf_path so
  the local cache path / username does not leak to authenticated
  Studio sessions.
- requirements/no-torch-runtime.txt: pin diffusers>=0.37.0 so older
  installs cannot resolve a version without Flux2KleinPipeline.
- Frontend curated distilled klein entries now point at the
  matching non-base diffusers repos (FLUX.2-klein-4B / -9B) per
  the published model cards. Update api.ts to mirror the renamed
  status field.
---
 studio/backend/core/inference/diffusion.py    | 77 +++++++++++++++--
 .../backend/requirements/no-torch-runtime.txt |  6 +-
 .../backend/tests/test_diffusion_backend.py   | 85 +++++++++++++++++--
 studio/backend/tests/test_diffusion_routes.py |  2 +-
 studio/frontend/src/features/images/api.ts    |  2 +-
 .../src/features/images/images-page.tsx       | 11 ++-
 6 files changed, 163 insertions(+), 20 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index bc7d20ad0b..d87d4c7623 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -141,6 +141,31 @@ class DiffusionFamily:
 )
 
 
+def _smart_base_repo(fam: DiffusionFamily, repo_id: str) -> str:
+    """Pick the best matching base diffusers repo for a given GGUF repo
+    when the caller did not pass an explicit base_repo.
+
+    Only the flux.2-klein family is specialised: "9b" in the repo id
+    selects a 9B repo, "base" in the repo id selects the Base variant,
+    and a klein repo id with neither maps to the distilled
+    FLUX.2-klein-4B. Other families return ``fam.base_repo`` unchanged.
+    """
+    if fam.name != "flux.2-klein":
+        return fam.base_repo
+    lower = (repo_id or "").lower()
+    is_9b = "9b" in lower
+    is_base = "base" in lower
+    if is_9b and is_base:
+        return "black-forest-labs/FLUX.2-klein-base-9B"
+    if is_9b:
+        return "black-forest-labs/FLUX.2-klein-9B"
+    if is_base:
+        return "black-forest-labs/FLUX.2-klein-base-4B"
+    # Distilled 4B is the default for any flux-2-klein GGUF that does
+    # not advertise 9B or "base".
+    return "black-forest-labs/FLUX.2-klein-4B"
+
+
 def detect_family(
     repo_id: str, *, override_family: Optional[str] = None
 ) -> Optional[DiffusionFamily]:
@@ -225,6 +250,12 @@ def repo_id(self) -> Optional[str]:
         return self._repo_id
 
     def status(self) -> dict[str, Any]:
+        # Only echo the GGUF basename; full absolute path leaks the
+        # local HF cache layout (and the system username on default
+        # POSIX layouts) to any authenticated Studio session.
+        gguf_basename = (
+            Path(self._gguf_path).name if self._gguf_path else None
+        )
         return {
             "is_loaded": self.is_loaded,
             "is_loading": self._loading,
@@ -232,7 +263,7 @@ def status(self) -> dict[str, Any]:
             "family": self._family.name if self._family else None,
             "pipeline_class": self._family.pipeline_class if self._family else None,
             "base_repo": self._base_repo,
-            "gguf_path": self._gguf_path,
+            "gguf_filename": gguf_basename,
             "device": self._device,
             "dtype": self._dtype,
             "loaded_at": self._loaded_at,
@@ -334,13 +365,26 @@ def load_model(
                 #   2. if no GGUF file was requested the user is loading a
                 #      full diffusers repo; use repo_id directly so we do
                 #      not silently substitute the family default
-                #   3. otherwise fall back to the family default
+                #   3. otherwise use the family + repo_id heuristic so a
+                #      9B GGUF picks the 9B base, not the 4B fallback
                 if base_repo:
                     effective_base = base_repo
                 elif not gguf_filename:
+                    # Guard: a repo that ends in "-GGUF" (the unsloth
+                    # convention) is GGUF-only and will 500 on
+                    # from_pretrained; surface a clear error instead of
+                    # letting diffusers raise a confusing model-index
+                    # failure deep in the loader.
+                    if repo_id.lower().endswith("-gguf"):
+                        raise RuntimeError(
+                            f"'{repo_id}' looks like a GGUF-only repo. "
+                            "Either provide gguf_filename to pick a quant, "
+                            "or pass base_repo to override the full-repo "
+                            "load target."
+                        )
                     effective_base = repo_id
                 else:
-                    effective_base = fam.base_repo
+                    effective_base = _smart_base_repo(fam, repo_id)
                 logger.info(
                     "Loading diffusion model %s (family=%s, device=%s, dtype=%s, base=%s)",
                     repo_id,
@@ -370,20 +414,38 @@ def load_model(
                         torch_dtype = dtype,
                     )
 
-                pipe_kwargs: dict[str, Any] = {"torch_dtype": dtype}
+                pipe_kwargs: dict[str, Any] = {
+                    "torch_dtype": dtype,
+                    # use_safetensors=True refuses pickle-backed .bin
+                    # weights at load time. Diffusers will fall back to
+                    # safetensors variants on repos that publish both,
+                    # and hard-error on repos that only ship .bin (which
+                    # is the threat model we want to block since pickle
+                    # files can execute arbitrary code in this process).
+                    "use_safetensors": True,
+                }
                 if transformer is not None:
                     pipe_kwargs["transformer"] = transformer
                 if hf_token:
                     pipe_kwargs["token"] = hf_token
 
+                # Release the previous pipeline BEFORE allocating the
+                # new one so peak VRAM stays at one model's worth, not
+                # two. This matters on 16-24 GB consumer GPUs where the
+                # combined footprint would OOM the from_pretrained call.
+                old = self._pipe
+                if old is not None:
+                    with self._lock:
+                        self._pipe = None
+                    _release(old)
+                    old = None
+
                 pipe = pipeline_cls.from_pretrained(effective_base, **pipe_kwargs)
                 if enable_model_cpu_offload and device == "cuda":
                     pipe.enable_model_cpu_offload()
                 else:
                     pipe.to(device)
 
-                # Drop the old pipeline only after the new one is in place.
-                old = self._pipe
                 with self._lock:
                     self._pipe = pipe
                     self._family = fam
@@ -393,7 +455,8 @@ def load_model(
                     self._device = device
                     self._dtype = str(dtype).replace("torch.", "")
                     self._loaded_at = time.time()
-                _release(old)
+                # ``old`` was released above before the new allocation;
+                # nothing left to free here.
 
                 return self.status()
             except Exception as exc:
diff --git a/studio/backend/requirements/no-torch-runtime.txt b/studio/backend/requirements/no-torch-runtime.txt
index fa3f33757e..6b7c17a0be 100644
--- a/studio/backend/requirements/no-torch-runtime.txt
+++ b/studio/backend/requirements/no-torch-runtime.txt
@@ -45,9 +45,11 @@ accelerate>=0.34.1
 peft>=0.18.0,!=0.11.0
 huggingface_hub>=0.34.0
 hf_transfer
-diffusers
+# Floor 0.37.0 introduces Flux2KleinPipeline + Flux2Pipeline which the
+# Studio Images page imports for the default curated picker.
+diffusers>=0.37.0
 # Required by diffusers.GGUFQuantizationConfig (used by the Images page
-# to load FLUX.2 / FLUX.1 / Qwen-Image / SDXL GGUFs from the Hub).
+# to load FLUX.2 / FLUX.1 / Qwen-Image GGUFs from the Hub).
 gguf
 
 # Transitive deps required because this file is installed with --no-deps.
diff --git a/studio/backend/tests/test_diffusion_backend.py b/studio/backend/tests/test_diffusion_backend.py
index 1a6a8de1c3..b3cf13f10d 100644
--- a/studio/backend/tests/test_diffusion_backend.py
+++ b/studio/backend/tests/test_diffusion_backend.py
@@ -146,7 +146,7 @@ def test_status_shape_unloaded():
         "family",
         "pipeline_class",
         "base_repo",
-        "gguf_path",
+        "gguf_filename",
         "device",
         "dtype",
         "loaded_at",
@@ -373,10 +373,11 @@ def test_load_model_gguf_path_happy(monkeypatch):
     assert status["is_loaded"] is True
     assert status["family"] == "flux.2-klein"
     assert status["pipeline_class"] == "Flux2KleinPipeline"
-    assert status["base_repo"] == "black-forest-labs/FLUX.2-klein-base-4B"
-    assert status["gguf_path"] == (
-        "/fake/unsloth/FLUX.2-klein-4B-GGUF/flux-2-klein-4b-Q4_K_S.gguf"
-    )
+    # _smart_base_repo picks the distilled 4B (not the Base) for the
+    # "FLUX.2-klein-4B-GGUF" repo name. The Base variant kicks in only
+    # when "base" is part of the repo id.
+    assert status["base_repo"] == "black-forest-labs/FLUX.2-klein-4B"
+    assert status["gguf_filename"] == "flux-2-klein-4b-Q4_K_S.gguf"
 
 
 def test_load_model_recovers_after_failure(monkeypatch):
@@ -426,6 +427,80 @@ def test_load_model_base_repo_override(monkeypatch):
     assert status["base_repo"] == "black-forest-labs/FLUX.2-klein-base-9B"
 
 
+def test_load_model_gguf_only_repo_without_filename_errors(monkeypatch):
+    """When the caller points at a -GGUF repo but forgets the filename,
+    surface a clear error instead of calling from_pretrained on the
+    GGUF-only repo (which 500s deep in diffusers)."""
+    _install_fake_diffusers(monkeypatch)
+    from core.inference.diffusion import get_diffusion_backend
+
+    backend = get_diffusion_backend()
+    with pytest.raises(RuntimeError, match = "looks like a GGUF-only repo"):
+        backend.load_model("unsloth/FLUX.2-klein-4B-GGUF")
+
+
+def test_smart_base_repo_picks_9b(monkeypatch):
+    """For unsloth/FLUX.2-klein-9B-GGUF without an explicit base_repo,
+    the backend must fall through to FLUX.2-klein-9B, not the 4B base."""
+    _install_fake_diffusers(monkeypatch)
+    from core.inference.diffusion import get_diffusion_backend
+
+    backend = get_diffusion_backend()
+    status = backend.load_model(
+        "unsloth/FLUX.2-klein-9B-GGUF",
+        gguf_filename = "flux-2-klein-9b-Q4_K_S.gguf",
+    )
+    assert status["base_repo"] == "black-forest-labs/FLUX.2-klein-9B"
+
+
+def test_smart_base_repo_picks_base_9b(monkeypatch):
+    _install_fake_diffusers(monkeypatch)
+    from core.inference.diffusion import get_diffusion_backend
+
+    backend = get_diffusion_backend()
+    status = backend.load_model(
+        "unsloth/FLUX.2-klein-base-9B-GGUF",
+        gguf_filename = "flux-2-klein-base-9b-Q4_K_S.gguf",
+    )
+    assert status["base_repo"] == "black-forest-labs/FLUX.2-klein-base-9B"
+
+
+def test_smart_base_repo_picks_base_4b(monkeypatch):
+    _install_fake_diffusers(monkeypatch)
+    from core.inference.diffusion import get_diffusion_backend
+
+    backend = get_diffusion_backend()
+    status = backend.load_model(
+        "unsloth/FLUX.2-klein-base-4B-GGUF",
+        gguf_filename = "flux-2-klein-base-4b-Q4_K_S.gguf",
+    )
+    assert status["base_repo"] == "black-forest-labs/FLUX.2-klein-base-4B"
+
+
+def test_load_model_uses_safetensors_flag(monkeypatch):
+    """The pipeline.from_pretrained call must pass use_safetensors=True
+    so pickle-backed .bin weights are refused at load time."""
+    fake = _install_fake_diffusers(monkeypatch)
+    from core.inference.diffusion import get_diffusion_backend
+
+    captured: dict = {}
+
+    original = fake.Flux2KleinPipeline.from_pretrained.__func__
+
+    def _capture(cls, base_repo, **kw):
+        captured.update(kw)
+        return original(cls, base_repo, **kw)
+
+    fake.Flux2KleinPipeline.from_pretrained = classmethod(_capture)
+
+    backend = get_diffusion_backend()
+    backend.load_model(
+        "unsloth/FLUX.2-klein-base-4B-GGUF",
+        gguf_filename = "flux-2-klein-base-4b-Q4_K_S.gguf",
+    )
+    assert captured.get("use_safetensors") is True
+
+
 def test_load_model_full_repo_does_not_substitute(monkeypatch):
     """A full diffusers repo (no gguf_filename) must call from_pretrained
     with the user-supplied repo, not the family default. This was the
diff --git a/studio/backend/tests/test_diffusion_routes.py b/studio/backend/tests/test_diffusion_routes.py
index af759ab234..ca420f72c5 100644
--- a/studio/backend/tests/test_diffusion_routes.py
+++ b/studio/backend/tests/test_diffusion_routes.py
@@ -43,7 +43,7 @@ def status(self) -> dict:
             "family": "flux.2-klein" if self._loaded else None,
             "pipeline_class": "Flux2KleinPipeline" if self._loaded else None,
             "base_repo": "black-forest-labs/FLUX.2-klein" if self._loaded else None,
-            "gguf_path": None,
+            "gguf_filename": None,
             "device": "cpu",
             "dtype": "torch.bfloat16",
             "loaded_at": 0,
diff --git a/studio/frontend/src/features/images/api.ts b/studio/frontend/src/features/images/api.ts
index 017b856b5a..e576f3987e 100644
--- a/studio/frontend/src/features/images/api.ts
+++ b/studio/frontend/src/features/images/api.ts
@@ -23,7 +23,7 @@ export interface DiffusionStatus {
   family: string | null;
   pipeline_class: string | null;
   base_repo: string | null;
-  gguf_path: string | null;
+  gguf_filename: string | null;
   device: string | null;
   dtype: string | null;
   loaded_at: number | null;
diff --git a/studio/frontend/src/features/images/images-page.tsx b/studio/frontend/src/features/images/images-page.tsx
index abca68c0f2..1cc795345c 100644
--- a/studio/frontend/src/features/images/images-page.tsx
+++ b/studio/frontend/src/features/images/images-page.tsx
@@ -59,17 +59,20 @@ const CURATED_MODELS: Array<{
     label: "FLUX.2 klein 4B (Q4_K_S, distilled)",
     repo_id: "unsloth/FLUX.2-klein-4B-GGUF",
     default_gguf: "flux-2-klein-4b-Q4_K_S.gguf",
-    base_repo: "black-forest-labs/FLUX.2-klein-base-4B",
+    // Distilled GGUF must pair with the distilled base, not the Base
+    // checkpoint. The Hub model card for the GGUF lists
+    // base_model: black-forest-labs/FLUX.2-klein-4B.
+    base_repo: "black-forest-labs/FLUX.2-klein-4B",
     family: "flux.2-klein",
-    notes: "13 GB VRAM. Distilled klein 4B with the Apache base.",
+    notes: "13 GB VRAM. Distilled klein 4B. Requires HF access to FLUX.2 klein 4B.",
   },
   {
     label: "FLUX.2 klein 9B (Q4_K_S, gated)",
     repo_id: "unsloth/FLUX.2-klein-9B-GGUF",
     default_gguf: "flux-2-klein-9b-Q4_K_S.gguf",
-    base_repo: "black-forest-labs/FLUX.2-klein-base-9B",
+    base_repo: "black-forest-labs/FLUX.2-klein-9B",
     family: "flux.2-klein",
-    notes: "17 GB VRAM. Higher quality. Requires HF access to FLUX.2 klein base 9B.",
+    notes: "17 GB VRAM. Higher quality distilled. Requires HF access to FLUX.2 klein 9B.",
   },
   {
     label: "FLUX.2 dev (Q4_K_S, gated)",

From 8c10cf5f1625021cf94f33af00e595e28f950ebd Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 00:04:25 +0000
Subject: [PATCH 09/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/diffusion.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index d87d4c7623..1291a60b45 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -253,9 +253,7 @@ def status(self) -> dict[str, Any]:
         # Only echo the GGUF basename; full absolute path leaks the
         # local HF cache layout (and the system username on default
         # POSIX layouts) to any authenticated Studio session.
-        gguf_basename = (
-            Path(self._gguf_path).name if self._gguf_path else None
-        )
+        gguf_basename = Path(self._gguf_path).name if self._gguf_path else None
         return {
             "is_loaded": self.is_loaded,
             "is_loading": self._loading,

From 6089720c0c8fd0bf153c47644d1ab4d2aa420fd5 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 00:06:17 +0000
Subject: [PATCH 10/92] Fix/adjust diffusion: export unload + sd3.5 alias for
 PR #5754

- routes/export.py load_checkpoint now unloads the diffusion
  pipeline alongside the existing inference + training unloads, so
  an export load after Images does not OOM the export subprocess.
- Remove the 'sd3.5' alias from the stable-diffusion-3 family.
  SD3.5 needs its own family + base_repo (and its own smoke test);
  pairing it with the SD3 Medium base produced a misleading load.
---
 studio/backend/core/inference/diffusion.py |  8 +++++++-
 studio/backend/routes/export.py            | 14 ++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 1291a60b45..50bebdd98c 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -113,7 +113,13 @@ class DiffusionFamily:
         pipeline_class = "StableDiffusion3Pipeline",
         transformer_class = "SD3Transformer2DModel",
         base_repo = "stabilityai/stable-diffusion-3-medium-diffusers",
-        aliases = ("sd3-medium", "stable-diffusion-3-medium", "sd3.5"),
+        # Intentionally NOT including "sd3.5" / "stable-diffusion-3.5"
+        # here: the SD3.5 family uses a different transformer config and
+        # base repo than SD3 Medium, and silently pairing SD3.5 GGUFs
+        # with the Medium base produces a misleading load. Add a
+        # dedicated SD3.5 family with its own base_repo when we ship
+        # smoke coverage for it.
+        aliases = ("sd3-medium", "stable-diffusion-3-medium"),
     ),
     # SDXL: full diffusers path only (no GGUF). SDXL uses a UNet (not a
     # transformer) and wiring UNet2DConditionModel.from_single_file +
diff --git a/studio/backend/routes/export.py b/studio/backend/routes/export.py
index 7dbc52dbed..b35ad56e4f 100644
--- a/studio/backend/routes/export.py
+++ b/studio/backend/routes/export.py
@@ -81,6 +81,20 @@ async def load_checkpoint(
         except Exception as e:
             logger.warning("Could not unload inference model: %s", e)
 
+        # Also unload any active diffusion pipeline (Images page); it
+        # competes for the same GPU and would survive the inference
+        # shutdown above. Best effort; silently skip if the module is
+        # absent.
+        try:
+            from core.inference.diffusion import get_diffusion_backend
+
+            diff = get_diffusion_backend()
+            if diff.is_loaded:
+                logger.info("Unloading diffusion model to free GPU memory for export")
+                diff.unload_model()
+        except Exception as e:
+            logger.debug("diffusion unload skipped for export: %s", e)
+
         try:
             from core.training import get_training_backend
 

From 1601b7828fbe9e3cef3003d9311ac3ace297948d Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 00:20:32 +0000
Subject: [PATCH 11/92] Fix safetensors chat backend unload for PR #5754

_release_chat_backend_for_diffusion was importing
get_inference_backend from core.inference.inference (the in-subprocess
class) and calling unload_model() without the required model_name
argument. The TypeError was swallowed and the active chat model
stayed resident, defeating the chat-to-diffusion lifecycle handoff.

Switch to the orchestrator's accessor at core.inference and pass
active_model_name through, mirroring the GGUF chat-load path. Add a
regression test that stubs both backends and verifies unload_model
is called with the active model name.
---
 studio/backend/core/inference/diffusion.py    | 18 ++++++---
 .../backend/tests/test_diffusion_backend.py   | 37 +++++++++++++++++++
 2 files changed, 49 insertions(+), 6 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 50bebdd98c..aa1610a450 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -623,17 +623,23 @@ def _release_chat_backend_for_diffusion() -> None:
     except Exception as exc:
         logger.debug("llama-server unload skipped: %s", exc)
 
-    # 2. Safetensors / HF chat backend (the Inference orchestrator that
+    # 2. Safetensors / HF chat backend (the InferenceOrchestrator that
     #    serves FastVisionModel / FastLanguageModel weights). When this
     #    backend has a model resident on the same GPU, a diffusion load
-    #    will OOM the same way.
+    #    will OOM the same way. The orchestrator's unload_model takes a
+    #    model_name; calling it with no arguments raised a TypeError
+    #    that was swallowed, leaving the chat model resident.
     try:
-        from core.inference.inference import get_inference_backend  # type: ignore
+        from core.inference import get_inference_backend  # type: ignore
 
         backend = get_inference_backend()
-        if getattr(backend, "active_model_name", None):
-            logger.info("Unloading safetensors chat backend before diffusion load")
-            backend.unload_model()
+        active_model_name = getattr(backend, "active_model_name", None)
+        if active_model_name:
+            logger.info(
+                "Unloading safetensors chat backend '%s' before diffusion load",
+                active_model_name,
+            )
+            backend.unload_model(active_model_name)
     except Exception as exc:
         logger.debug("safetensors unload skipped: %s", exc)
 
diff --git a/studio/backend/tests/test_diffusion_backend.py b/studio/backend/tests/test_diffusion_backend.py
index b3cf13f10d..56aa7255b8 100644
--- a/studio/backend/tests/test_diffusion_backend.py
+++ b/studio/backend/tests/test_diffusion_backend.py
@@ -477,6 +477,43 @@ def test_smart_base_repo_picks_base_4b(monkeypatch):
     assert status["base_repo"] == "black-forest-labs/FLUX.2-klein-base-4B"
 
 
+def test_release_chat_backend_calls_unload_with_model_name(monkeypatch):
+    """The safetensors backend unload helper must call unload_model
+    with the active model name (the orchestrator's signature requires
+    it). The previous behaviour swallowed TypeError and left the chat
+    model resident, defeating the lifecycle handoff."""
+    import sys
+    import types
+
+    fake_pkg = types.ModuleType("core.inference")
+    calls: list = []
+
+    class _Stub:
+        active_model_name = "owner/some-model"
+
+        def unload_model(self, name):
+            calls.append(name)
+            self.active_model_name = None
+            return True
+
+    stub = _Stub()
+    fake_pkg.get_inference_backend = lambda: stub
+    monkeypatch.setitem(sys.modules, "core.inference", fake_pkg)
+
+    # Skip the llama-server branch by also stubbing routes.inference.
+    fake_routes = types.ModuleType("routes.inference")
+    fake_routes.get_llama_cpp_backend = lambda: types.SimpleNamespace(
+        is_loaded = False
+    )
+    monkeypatch.setitem(sys.modules, "routes.inference", fake_routes)
+
+    from core.inference.diffusion import _release_chat_backend_for_diffusion
+
+    _release_chat_backend_for_diffusion()
+    assert calls == ["owner/some-model"], calls
+    assert stub.active_model_name is None
+
+
 def test_load_model_uses_safetensors_flag(monkeypatch):
     """The pipeline.from_pretrained call must pass use_safetensors=True
     so pickle-backed .bin weights are refused at load time."""

From 0f9b19bb5617d70960596ff880d16ca928132951 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 00:22:59 +0000
Subject: [PATCH 12/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/tests/test_diffusion_backend.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/studio/backend/tests/test_diffusion_backend.py b/studio/backend/tests/test_diffusion_backend.py
index 56aa7255b8..48dcdc00bc 100644
--- a/studio/backend/tests/test_diffusion_backend.py
+++ b/studio/backend/tests/test_diffusion_backend.py
@@ -502,9 +502,7 @@ def unload_model(self, name):
 
     # Skip the llama-server branch by also stubbing routes.inference.
     fake_routes = types.ModuleType("routes.inference")
-    fake_routes.get_llama_cpp_backend = lambda: types.SimpleNamespace(
-        is_loaded = False
-    )
+    fake_routes.get_llama_cpp_backend = lambda: types.SimpleNamespace(is_loaded = False)
     monkeypatch.setitem(sys.modules, "routes.inference", fake_routes)
 
     from core.inference.diffusion import _release_chat_backend_for_diffusion

From 65f7a2680d385576aa77f38b3ef29a0164589d86 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 00:31:06 +0000
Subject: [PATCH 13/92] Fix/adjust diffusion lifecycle for round 3 findings (PR
 #5754)

- detect_family adds _FAMILY_EXCLUDE so 'stable-diffusion-3.5' no
  longer matches the SD3 Medium family and 'qwen-image-edit' no
  longer matches Qwen-Image. Both were misleading silent loads.
- from_single_file now forwards config=<effective_base>,
  subfolder='transformer', and the HF token. Diffusers-format GGUFs
  (FLUX.2 klein, Qwen-Image, SD3) need the matching base config or
  the transformer load picks the wrong shapes; gated GGUFs need the
  token both for download and config read.
- Move _release_chat_backend_for_diffusion + new
  _release_other_gpu_owners_for_diffusion to AFTER the GGUF download
  and pipeline class lookup so a typo or transient Hub error does
  not kill the user's currently-loaded chat model. Peak VRAM still
  stays at one model's worth because the releases run right before
  from_pretrained.
- _release_other_gpu_owners_for_diffusion: shut down the export
  subprocess and any active training subprocess before a diffusion
  load. Symmetric with the export load path.
- routes/training.py: unload diffusion before starting training so
  the new subprocess does not race FLUX/Qwen for VRAM.
- routes/export.py: also unload the GGUF llama-server before export
  load (the existing inference-backend unload only covered the
  safetensors path).
---
 studio/backend/core/inference/diffusion.py    | 97 +++++++++++++++----
 studio/backend/routes/export.py               | 15 +++
 studio/backend/routes/training.py             | 14 +++
 .../backend/tests/test_diffusion_backend.py   | 54 ++++++++++-
 4 files changed, 160 insertions(+), 20 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index aa1610a450..d442ff0579 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -172,16 +172,30 @@ def _smart_base_repo(fam: DiffusionFamily, repo_id: str) -> str:
     return "black-forest-labs/FLUX.2-klein-4B"
 
 
+# Negative substrings that disqualify a candidate family even when its
+# name appears as a substring of the repo id. Prevents
+# "stable-diffusion-3" matching SD3.5 and "qwen-image" matching
+# Qwen-Image-Edit. Each entry maps a family name to substrings that
+# must NOT appear anywhere in the repo id.
+_FAMILY_EXCLUDE: dict[str, tuple[str, ...]] = {
+    "stable-diffusion-3": ("3.5", "3-5", "stable-diffusion-3.5"),
+    "qwen-image": ("qwen-image-edit", "qwenimage-edit"),
+}
+
+
 def detect_family(
     repo_id: str, *, override_family: Optional[str] = None
 ) -> Optional[DiffusionFamily]:
     """Return the diffusion family matching ``repo_id``.
 
-    Matching is substring-based and case-insensitive. ``override_family``
-    bypasses substring matching and looks up by ``DiffusionFamily.name``
-    or (when explicitly asked) by ``_FULL_REPO_FAMILIES.name``.
-    Returns ``None`` when no family applies so callers can surface a
-    clear "unsupported model" error rather than guessing wrong.
+    Matching is substring-based and case-insensitive, with a small
+    deny list (``_FAMILY_EXCLUDE``) for known false positives such as
+    SD3.5 (would otherwise match SD3 Medium) and Qwen-Image-Edit
+    (would otherwise match Qwen-Image). ``override_family`` bypasses
+    substring matching and looks up by ``DiffusionFamily.name`` or
+    (when explicitly asked) by ``_FULL_REPO_FAMILIES.name``. Returns
+    ``None`` when no family applies so callers can surface a clear
+    "unsupported model" error rather than guessing wrong.
     """
     if override_family:
         wanted = override_family.strip().lower()
@@ -193,6 +207,9 @@ def detect_family(
     if not needle:
         return None
     for fam in _FAMILIES:
+        excludes = _FAMILY_EXCLUDE.get(fam.name, ())
+        if any(e in needle for e in excludes):
+            continue
         if fam.name in needle:
             return fam
         for alias in fam.aliases:
@@ -345,12 +362,6 @@ def load_model(
                 self._loading = True
                 self._last_error = None
             try:
-                # Unload any chat model that is holding GPU memory so the
-                # diffusion load does not OOM on a < 24 GB GPU. Best
-                # effort: if the llama-cpp backend module is absent (eg
-                # tests, headless tooling) we just continue.
-                _release_chat_backend_for_diffusion()
-
                 pipeline_cls = getattr(diffusers, fam.pipeline_class, None)
                 if pipeline_cls is None:
                     raise RuntimeError(
@@ -412,10 +423,23 @@ def load_model(
                         token = hf_token,
                     )
                     quant_config = diffusers.GGUFQuantizationConfig(compute_dtype = dtype)
+                    # Diffusers-format GGUFs (FLUX.2 klein / Qwen-Image /
+                    # SD3) need the matching base repo's component config
+                    # at config=<base_repo>, subfolder="transformer".
+                    # Older city96-style GGUFs ignore those kwargs. The
+                    # token is also passed because gated GGUF repos
+                    # require it both at download and at config read time.
+                    single_file_kwargs: dict[str, Any] = {
+                        "quantization_config": quant_config,
+                        "torch_dtype": dtype,
+                        "config": effective_base,
+                        "subfolder": "transformer",
+                    }
+                    if hf_token:
+                        single_file_kwargs["token"] = hf_token
                     transformer = transformer_cls.from_single_file(
                         local_gguf_path,
-                        quantization_config = quant_config,
-                        torch_dtype = dtype,
+                        **single_file_kwargs,
                     )
 
                 pipe_kwargs: dict[str, Any] = {
@@ -433,10 +457,15 @@ def load_model(
                 if hf_token:
                     pipe_kwargs["token"] = hf_token
 
-                # Release the previous pipeline BEFORE allocating the
-                # new one so peak VRAM stays at one model's worth, not
-                # two. This matters on 16-24 GB consumer GPUs where the
-                # combined footprint would OOM the from_pretrained call.
+                # Cheap failure modes (bad gguf_filename, gated token,
+                # transient Hub error) have all happened by now. Only
+                # release the current chat backend + previous diffusion
+                # pipeline right before the expensive allocation so a
+                # typo does not kill the user's loaded chat model. Peak
+                # VRAM still stays at one model's worth because the
+                # release happens before from_pretrained.
+                _release_chat_backend_for_diffusion()
+                _release_other_gpu_owners_for_diffusion()
                 old = self._pipe
                 if old is not None:
                     with self._lock:
@@ -644,6 +673,40 @@ def _release_chat_backend_for_diffusion() -> None:
         logger.debug("safetensors unload skipped: %s", exc)
 
 
+def _release_other_gpu_owners_for_diffusion() -> None:
+    """Best-effort: shut down export subprocess + active training before
+    a diffusion load. Both can hold multi-GB of VRAM and would OOM the
+    diffusion allocation on consumer GPUs."""
+    # Export subprocess
+    try:
+        from core.export import get_export_backend  # type: ignore
+
+        exp = get_export_backend()
+        if getattr(exp, "current_checkpoint", None):
+            logger.info("Shutting down export subprocess before diffusion load")
+            exp._shutdown_subprocess()
+            exp.current_checkpoint = None
+            exp.is_vision = False
+            exp.is_peft = False
+    except Exception as exc:
+        logger.debug("export unload skipped: %s", exc)
+
+    # Active training subprocess
+    try:
+        from core.training import get_training_backend  # type: ignore
+
+        trn = get_training_backend()
+        if trn.is_training_active():
+            logger.info("Stopping training subprocess before diffusion load")
+            trn.stop_training()
+            for _ in range(60):
+                if not trn.is_training_active():
+                    break
+                time.sleep(0.5)
+    except Exception as exc:
+        logger.debug("training unload skipped: %s", exc)
+
+
 def _release(obj: Any) -> None:
     """Best-effort GPU-memory release for a pipeline being swapped out."""
     if obj is None:
diff --git a/studio/backend/routes/export.py b/studio/backend/routes/export.py
index b35ad56e4f..4408f18be6 100644
--- a/studio/backend/routes/export.py
+++ b/studio/backend/routes/export.py
@@ -81,6 +81,21 @@ async def load_checkpoint(
         except Exception as e:
             logger.warning("Could not unload inference model: %s", e)
 
+        # Also unload any active GGUF llama-server (the inference unload
+        # above only covers the safetensors / Unsloth backend; GGUF
+        # chat runs as a separate subprocess).
+        try:
+            from routes.inference import get_llama_cpp_backend
+
+            llama = get_llama_cpp_backend()
+            if getattr(llama, "is_loaded", False):
+                logger.info(
+                    "Unloading GGUF chat model to free GPU memory for export"
+                )
+                llama.unload_model()
+        except Exception as e:
+            logger.debug("llama-server unload skipped for export: %s", e)
+
         # Also unload any active diffusion pipeline (Images page); it
         # competes for the same GPU and would survive the inference
         # shutdown above. Best effort; silently skip if the module is
diff --git a/studio/backend/routes/training.py b/studio/backend/routes/training.py
index 6e2413b3e9..34715d03d8 100644
--- a/studio/backend/routes/training.py
+++ b/studio/backend/routes/training.py
@@ -297,6 +297,20 @@ async def start_training(
         except Exception as e:
             logger.warning("Could not shut down export subprocess: %s", e)
 
+        # Also unload any loaded diffusion pipeline (Images page); it
+        # holds the same GPU and would survive the inference shutdown.
+        try:
+            from core.inference.diffusion import get_diffusion_backend
+
+            diff_backend = get_diffusion_backend()
+            if diff_backend.is_loaded:
+                logger.info(
+                    "Unloading diffusion model to free GPU memory for training"
+                )
+                diff_backend.unload_model()
+        except Exception as e:
+            logger.warning("Could not unload diffusion model: %s", e)
+
         # start_training now spawns a subprocess (non-blocking)
         success = backend.start_training(job_id = job_id, **training_kwargs)
 
diff --git a/studio/backend/tests/test_diffusion_backend.py b/studio/backend/tests/test_diffusion_backend.py
index 48dcdc00bc..48a8b3f78f 100644
--- a/studio/backend/tests/test_diffusion_backend.py
+++ b/studio/backend/tests/test_diffusion_backend.py
@@ -111,6 +111,24 @@ def test_detect_family_unknown_returns_none():
     assert detect_family("") is None
 
 
+def test_detect_family_sd35_is_not_sd3():
+    """SD3.5 must NOT be matched as SD3 Medium. Pairing SD3.5 GGUFs
+    with the Medium base produces a misleading load."""
+    from core.inference.diffusion import detect_family
+
+    assert detect_family("unsloth/SD3.5-large-GGUF") is None
+    assert detect_family("unsloth/stable-diffusion-3.5-large-GGUF") is None
+
+
+def test_detect_family_qwen_image_edit_is_not_qwen_image():
+    """Qwen-Image-Edit must NOT be matched as Qwen-Image. The Edit
+    variant uses a different pipeline (image-to-image)."""
+    from core.inference.diffusion import detect_family
+
+    assert detect_family("unsloth/Qwen-Image-Edit-GGUF") is None
+    assert detect_family("unsloth/Qwen-Image-Edit-2509-GGUF") is None
+
+
 def test_supported_families_payload_shape():
     from core.inference.diffusion import supported_families
 
@@ -285,11 +303,14 @@ def __init__(self, compute_dtype = None):
 
     class _FakeTransformer:
         @classmethod
-        def from_single_file(cls, path, quantization_config = None, torch_dtype = None):
+        def from_single_file(cls, path, **kw):
             inst = cls()
             inst.path = path
-            inst.qc = quantization_config
-            inst.dtype = torch_dtype
+            inst.qc = kw.get("quantization_config")
+            inst.dtype = kw.get("torch_dtype")
+            inst.config = kw.get("config")
+            inst.subfolder = kw.get("subfolder")
+            inst.token = kw.get("token")
             return inst
 
     class _FakePipeline:
@@ -477,6 +498,33 @@ def test_smart_base_repo_picks_base_4b(monkeypatch):
     assert status["base_repo"] == "black-forest-labs/FLUX.2-klein-base-4B"
 
 
+def test_gguf_transformer_load_passes_config_subfolder_token(monkeypatch):
+    """Diffusers-format GGUFs require config=<base_repo>+subfolder=
+    transformer at from_single_file time; gated GGUFs also need the
+    token. Verify all three kwargs are forwarded."""
+    fake = _install_fake_diffusers(monkeypatch)
+    from core.inference.diffusion import get_diffusion_backend
+
+    captured: dict = {}
+    original = fake.Flux2Transformer2DModel.from_single_file.__func__
+
+    def _capture(cls, path, **kw):
+        captured.update(kw)
+        return original(cls, path, **kw)
+
+    fake.Flux2Transformer2DModel.from_single_file = classmethod(_capture)
+
+    backend = get_diffusion_backend()
+    backend.load_model(
+        "unsloth/FLUX.2-klein-4B-GGUF",
+        gguf_filename = "flux-2-klein-4b-Q4_K_S.gguf",
+        hf_token = "hf_test_token",
+    )
+    assert captured.get("config") == "black-forest-labs/FLUX.2-klein-4B"
+    assert captured.get("subfolder") == "transformer"
+    assert captured.get("token") == "hf_test_token"
+
+
 def test_release_chat_backend_calls_unload_with_model_name(monkeypatch):
     """The safetensors backend unload helper must call unload_model
     with the active model name (the orchestrator's signature requires

From fbf06bec7ac50753c0c0ec7d171a5e1ee8d008e5 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 00:31:32 +0000
Subject: [PATCH 14/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/routes/export.py   | 4 +---
 studio/backend/routes/training.py | 4 +---
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/studio/backend/routes/export.py b/studio/backend/routes/export.py
index 4408f18be6..ff9e3d6695 100644
--- a/studio/backend/routes/export.py
+++ b/studio/backend/routes/export.py
@@ -89,9 +89,7 @@ async def load_checkpoint(
 
             llama = get_llama_cpp_backend()
             if getattr(llama, "is_loaded", False):
-                logger.info(
-                    "Unloading GGUF chat model to free GPU memory for export"
-                )
+                logger.info("Unloading GGUF chat model to free GPU memory for export")
                 llama.unload_model()
         except Exception as e:
             logger.debug("llama-server unload skipped for export: %s", e)
diff --git a/studio/backend/routes/training.py b/studio/backend/routes/training.py
index 34715d03d8..99a775c0b7 100644
--- a/studio/backend/routes/training.py
+++ b/studio/backend/routes/training.py
@@ -304,9 +304,7 @@ async def start_training(
 
             diff_backend = get_diffusion_backend()
             if diff_backend.is_loaded:
-                logger.info(
-                    "Unloading diffusion model to free GPU memory for training"
-                )
+                logger.info("Unloading diffusion model to free GPU memory for training")
                 diff_backend.unload_model()
         except Exception as e:
             logger.warning("Could not unload diffusion model: %s", e)

From f44b55c796685098d42e4f877246981ce6d3fb17 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 00:33:40 +0000
Subject: [PATCH 15/92] Fix/adjust diffusion: clear stale metadata on failed
 swap for PR #5754

When a swap load fails after the previous pipeline is released,
status() previously reported is_loaded=false on top of the OLD
repo/family/base_repo metadata, which the frontend then rendered
as a misleading 'still loaded: X' label. Clear all metadata
atomically with the pipe drop so a failed swap reports a clean
empty status plus last_error. Add regression test.
---
 studio/backend/core/inference/diffusion.py    | 19 ++++++--
 .../backend/tests/test_diffusion_backend.py   | 43 +++++++++++++++++++
 2 files changed, 59 insertions(+), 3 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index d442ff0579..19f1aff2a8 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -334,9 +334,10 @@ def load_model(
         for VAE / text encoders. ``family_override`` short-circuits the
         substring matcher when an exotic repo name confuses it.
 
-        Raises ``RuntimeError`` on failure with a user-facing message;
-        the previous pipeline (if any) stays loaded so a failed swap
-        does not leave Studio in an unusable state.
+        Raises ``RuntimeError`` on failure with a user-facing message.
+        On a failed swap the previous pipeline is also released to
+        keep peak VRAM bounded; status() reports is_loaded=false with
+        last_error set so the caller can react.
         """
         from huggingface_hub import hf_hub_download
         import diffusers
@@ -469,7 +470,19 @@ def load_model(
                 old = self._pipe
                 if old is not None:
                     with self._lock:
+                        # Clear ALL metadata together so a failed swap
+                        # cannot leave status() reporting the previous
+                        # repo / family / base_repo on top of an empty
+                        # pipe. The except block below will restore
+                        # last_error so the caller knows what happened.
                         self._pipe = None
+                        self._family = None
+                        self._repo_id = None
+                        self._gguf_path = None
+                        self._base_repo = None
+                        self._device = None
+                        self._dtype = None
+                        self._loaded_at = None
                     _release(old)
                     old = None
 
diff --git a/studio/backend/tests/test_diffusion_backend.py b/studio/backend/tests/test_diffusion_backend.py
index 48a8b3f78f..55a7dafddf 100644
--- a/studio/backend/tests/test_diffusion_backend.py
+++ b/studio/backend/tests/test_diffusion_backend.py
@@ -417,6 +417,49 @@ def test_load_model_recovers_after_failure(monkeypatch):
     assert s["last_error"] and "simulated load failure" in s["last_error"]
 
 
+def test_failed_swap_clears_previous_metadata(monkeypatch):
+    """After a successful load, a subsequent failing load must NOT
+    leave status() reporting the OLD repo/family/base_repo on top of
+    is_loaded=false. The clear must be atomic with the pipe drop."""
+    import sys
+
+    _install_fake_diffusers(monkeypatch)
+    from core.inference.diffusion import get_diffusion_backend
+
+    backend = get_diffusion_backend()
+    # First load succeeds.
+    backend.load_model(
+        "unsloth/FLUX.2-klein-4B-GGUF",
+        gguf_filename = "flux-2-klein-4b-Q4_K_S.gguf",
+    )
+    s_before = backend.status()
+    assert s_before["is_loaded"] is True
+    assert s_before["repo_id"] == "unsloth/FLUX.2-klein-4B-GGUF"
+
+    # Replace from_pretrained on the SAME fake module with a raising one
+    # without re-installing the rest of the fakes.
+    fake = sys.modules["diffusers"]
+    def _boom(cls, *a, **kw):
+        raise RuntimeError("simulated swap failure")
+    fake.Flux2KleinPipeline.from_pretrained = classmethod(_boom)
+
+    with pytest.raises(RuntimeError, match = "Failed to load diffusion model"):
+        backend.load_model(
+            "unsloth/FLUX.2-dev-GGUF",
+            gguf_filename = "flux2-dev-Q4_K_S.gguf",
+        )
+
+    s_after = backend.status()
+    assert s_after["is_loaded"] is False
+    # Critically: stale metadata from the previous successful load
+    # must be cleared, not just the pipe.
+    assert s_after["repo_id"] is None
+    assert s_after["family"] is None
+    assert s_after["base_repo"] is None
+    assert s_after["gguf_filename"] is None
+    assert s_after["last_error"] and "simulated swap failure" in s_after["last_error"]
+
+
 def test_load_model_swap_drops_previous(monkeypatch):
     _install_fake_diffusers(monkeypatch)
     from core.inference.diffusion import get_diffusion_backend

From 18b50c1f8aed54e9fdac2a2f40521ca29f4eaea5 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 00:34:01 +0000
Subject: [PATCH 16/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/tests/test_diffusion_backend.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/studio/backend/tests/test_diffusion_backend.py b/studio/backend/tests/test_diffusion_backend.py
index 55a7dafddf..1c8dcb5864 100644
--- a/studio/backend/tests/test_diffusion_backend.py
+++ b/studio/backend/tests/test_diffusion_backend.py
@@ -439,8 +439,10 @@ def test_failed_swap_clears_previous_metadata(monkeypatch):
     # Replace from_pretrained on the SAME fake module with a raising one
     # without re-installing the rest of the fakes.
     fake = sys.modules["diffusers"]
+
     def _boom(cls, *a, **kw):
         raise RuntimeError("simulated swap failure")
+
     fake.Flux2KleinPipeline.from_pretrained = classmethod(_boom)
 
     with pytest.raises(RuntimeError, match = "Failed to load diffusion model"):

From 0f3ed08351f06805deeca932bef9c4b68b642381 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 00:36:29 +0000
Subject: [PATCH 17/92] Fix/adjust diffusion: token leak + cache guard + locked
 status + seed precision for PR #5754

- DiffusionBackend.status() now takes _lock so frontend polling
  cannot observe a torn snapshot mid-swap.
- Scrub hf_token / pipe_kwargs / single_file_kwargs from frame
  locals before logger.exception() so rich tracebacks and structlog
  formatters that render locals do not leak hf_... tokens into logs.
- routes/models.py delete_cached_model: refuse to delete the cache
  underlying a currently-loaded diffusion pipeline (both the GGUF
  repo and the matching diffusers base_repo). Symmetric with the
  existing chat-load + GGUF guard.
- Frontend seed validation: reject non-integer and out-of-safe-
  integer-range inputs instead of silently rounding via Number(),
  which would otherwise send a different seed than what the user
  typed.
---
 studio/backend/core/inference/diffusion.py    | 47 +++++++++++++------
 studio/backend/routes/models.py               | 23 +++++++++
 .../src/features/images/images-page.tsx       | 26 ++++++++--
 3 files changed, 77 insertions(+), 19 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 19f1aff2a8..4e5a2efec6 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -273,24 +273,33 @@ def repo_id(self) -> Optional[str]:
         return self._repo_id
 
     def status(self) -> dict[str, Any]:
+        # Take _lock so the snapshot cannot observe a torn state where
+        # _pipe was already swapped but _family/_repo_id haven't been
+        # updated yet (or vice versa). Frontend polling at 1 Hz would
+        # otherwise render impossible "loaded but no repo_id" states.
         # Only echo the GGUF basename; full absolute path leaks the
         # local HF cache layout (and the system username on default
         # POSIX layouts) to any authenticated Studio session.
-        gguf_basename = Path(self._gguf_path).name if self._gguf_path else None
-        return {
-            "is_loaded": self.is_loaded,
-            "is_loading": self._loading,
-            "repo_id": self._repo_id,
-            "family": self._family.name if self._family else None,
-            "pipeline_class": self._family.pipeline_class if self._family else None,
-            "base_repo": self._base_repo,
-            "gguf_filename": gguf_basename,
-            "device": self._device,
-            "dtype": self._dtype,
-            "loaded_at": self._loaded_at,
-            "last_error": self._last_error,
-            "supported_families": supported_families(),
-        }
+        with self._lock:
+            gguf_basename = (
+                Path(self._gguf_path).name if self._gguf_path else None
+            )
+            return {
+                "is_loaded": self._pipe is not None,
+                "is_loading": self._loading,
+                "repo_id": self._repo_id,
+                "family": self._family.name if self._family else None,
+                "pipeline_class": (
+                    self._family.pipeline_class if self._family else None
+                ),
+                "base_repo": self._base_repo,
+                "gguf_filename": gguf_basename,
+                "device": self._device,
+                "dtype": self._dtype,
+                "loaded_at": self._loaded_at,
+                "last_error": self._last_error,
+                "supported_families": supported_families(),
+            }
 
     def _pick_device_and_dtype(self) -> tuple[str, "Any"]:
         """Pick (device, dtype) for the current host.
@@ -506,6 +515,14 @@ def load_model(
 
                 return self.status()
             except Exception as exc:
+                # Scrub hf_token and pipe_kwargs from frame locals BEFORE
+                # logger.exception() captures them. Rich tracebacks and
+                # some structlog formatters render frame locals, which
+                # would otherwise echo the raw hf_... token into logs
+                # and any error reporting sink the user has wired up.
+                hf_token = None  # noqa: F841
+                pipe_kwargs = None  # noqa: F841
+                single_file_kwargs = None  # noqa: F841
                 with self._lock:
                     self._last_error = str(exc)
                 logger.exception("Diffusion load failed for %s", repo_id)
diff --git a/studio/backend/routes/models.py b/studio/backend/routes/models.py
index 9ea113e488..13e2a4e83f 100644
--- a/studio/backend/routes/models.py
+++ b/studio/backend/routes/models.py
@@ -2632,6 +2632,29 @@ async def delete_cached_model(
     except Exception:
         pass
 
+    # Also refuse to delete the cache underlying a loaded diffusion
+    # pipeline. The diffusion backend mmap's the GGUF + base repo
+    # weights and continues to read from the cache long after load,
+    # so deleting them out from under it would corrupt generation.
+    try:
+        from core.inference.diffusion import get_diffusion_backend
+
+        diff_backend = get_diffusion_backend()
+        diff_status = diff_backend.status()
+        if diff_status.get("is_loaded"):
+            diff_repo = (diff_status.get("repo_id") or "").lower()
+            diff_base = (diff_status.get("base_repo") or "").lower()
+            needle = repo_id.lower()
+            if diff_repo.startswith(needle) or diff_base.startswith(needle):
+                raise HTTPException(
+                    status_code = 400,
+                    detail = "Unload the diffusion image model before deleting",
+                )
+    except HTTPException:
+        raise
+    except Exception:
+        pass
+
     try:
         cache_scans = _all_hf_cache_scans()
 
diff --git a/studio/frontend/src/features/images/images-page.tsx b/studio/frontend/src/features/images/images-page.tsx
index 1cc795345c..05eb5c3a89 100644
--- a/studio/frontend/src/features/images/images-page.tsx
+++ b/studio/frontend/src/features/images/images-page.tsx
@@ -200,10 +200,28 @@ export function ImagesPage() {
     }
     setBusy("generating");
     try {
-      const parsedSeed = seed.trim() ? Number(seed.trim()) : undefined;
-      if (parsedSeed !== undefined && !Number.isFinite(parsedSeed)) {
-        toast.error("Seed must be a number");
-        return;
+      // Reject non-integer or out-of-safe-integer-range seeds rather
+      // than silently rounding via Number(). The backend takes an int
+      // and a precision loss here would yield a different image than
+      // the seed the user typed.
+      const seedStr = seed.trim();
+      let parsedSeed: number | undefined;
+      if (seedStr) {
+        if (!/^-?\d+$/.test(seedStr)) {
+          toast.error("Seed must be an integer");
+          return;
+        }
+        const candidate = Number(seedStr);
+        if (
+          !Number.isFinite(candidate) ||
+          !Number.isSafeInteger(candidate)
+        ) {
+          toast.error(
+            "Seed must fit in a JavaScript safe integer (<= 2^53 - 1)",
+          );
+          return;
+        }
+        parsedSeed = candidate;
       }
       const out = await generateDiffusionImage({
         prompt,

From ec507c5da71b38f1ca92689a61b85066ad1f8adb Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 00:36:46 +0000
Subject: [PATCH 18/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/diffusion.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 4e5a2efec6..ce37317011 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -281,9 +281,7 @@ def status(self) -> dict[str, Any]:
         # local HF cache layout (and the system username on default
         # POSIX layouts) to any authenticated Studio session.
         with self._lock:
-            gguf_basename = (
-                Path(self._gguf_path).name if self._gguf_path else None
-            )
+            gguf_basename = Path(self._gguf_path).name if self._gguf_path else None
             return {
                 "is_loaded": self._pipe is not None,
                 "is_loading": self._loading,

From fb0a31b9a5cca3a6f6b4974ccd1f58478e6ec9b9 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 00:46:02 +0000
Subject: [PATCH 19/92] Fix/adjust diffusion: forward true_cfg_scale on
 Qwen/Flux for negative prompt (PR #5754)

QwenImagePipeline and FluxPipeline treat guidance_scale as the
distilled CFG factor and expose true_cfg_scale as the real
classifier-free guidance knob. Negative prompts only steer the
output when true_cfg_scale > 1, so forwarding only guidance_scale
left Qwen-Image on the default true_cfg_scale=4.0 and the user's
slider value silently ineffective for negative prompts.

When the loaded pipeline accepts both negative_prompt and
true_cfg_scale and the caller supplies a non-empty negative
prompt, forward guidance_scale through both kwargs so the
negative prompt actually steers generation. When no negative
prompt is supplied, true_cfg_scale is left at the model default
to avoid switching distilled CFG models into real-CFG mode (which
would double inference cost and degrade quality).

Adds two regression tests covering the forward-when-negative and
skip-when-no-negative paths.
---
 studio/backend/core/inference/diffusion.py    |   9 ++
 .../backend/tests/test_diffusion_backend.py   | 106 ++++++++++++++++++
 2 files changed, 115 insertions(+)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index ce37317011..bd97736f03 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -610,6 +610,15 @@ def generate_image(
             if negative_prompt is not None and negative_prompt.strip():
                 if _pipe_accepts_kwarg(pipe, "negative_prompt"):
                     call_kwargs["negative_prompt"] = negative_prompt
+                    # QwenImagePipeline and FluxPipeline treat
+                    # guidance_scale as distilled CFG and use
+                    # true_cfg_scale as the real classifier-free
+                    # guidance knob; the negative prompt is only
+                    # effective when true_cfg_scale > 1. Forward the
+                    # user-supplied guidance_scale through both so the
+                    # negative prompt actually steers generation.
+                    if _pipe_accepts_kwarg(pipe, "true_cfg_scale"):
+                        call_kwargs["true_cfg_scale"] = float(guidance_scale)
                 else:
                     logger.info(
                         "Dropping negative_prompt: %s does not accept it",
diff --git a/studio/backend/tests/test_diffusion_backend.py b/studio/backend/tests/test_diffusion_backend.py
index 1c8dcb5864..6014bda840 100644
--- a/studio/backend/tests/test_diffusion_backend.py
+++ b/studio/backend/tests/test_diffusion_backend.py
@@ -831,3 +831,109 @@ class _Out:
         height = 256,
     )
     assert captured["negative_prompt"] == "blurry"
+
+
+def test_generate_image_forwards_true_cfg_scale_when_supported(monkeypatch):
+    """When a pipeline accepts both negative_prompt and true_cfg_scale
+    (QwenImagePipeline, FluxPipeline) the user's guidance_scale must be
+    forwarded as true_cfg_scale as well, otherwise the negative prompt
+    is silently ignored (Qwen leaves the default true_cfg_scale=4.0
+    while the user value lands on guidance_scale)."""
+    import core.inference.diffusion as d
+    from PIL import Image
+
+    backend = d.get_diffusion_backend()
+    captured: dict = {}
+
+    class _QwenLikePipe:
+        def __call__(
+            self,
+            *,
+            prompt,
+            negative_prompt = None,
+            num_inference_steps,
+            guidance_scale,
+            true_cfg_scale = 4.0,
+            width,
+            height,
+            **kw,
+        ):
+            captured["guidance_scale"] = guidance_scale
+            captured["true_cfg_scale"] = true_cfg_scale
+            captured["negative_prompt"] = negative_prompt
+
+            class _Out:
+                pass
+
+            o = _Out()
+            o.images = [Image.new("RGB", (width, height), (7, 8, 9))]
+            return o
+
+    backend._pipe = _QwenLikePipe()
+    backend._device = "cpu"
+    backend._family = d._FAMILIES[2]
+    backend._repo_id = "stub/stub"
+
+    backend.generate_image(
+        prompt = "a sloth",
+        negative_prompt = "blurry",
+        num_inference_steps = 4,
+        guidance_scale = 7.5,
+        width = 256,
+        height = 256,
+    )
+    assert captured["negative_prompt"] == "blurry"
+    assert captured["guidance_scale"] == 7.5
+    assert captured["true_cfg_scale"] == 7.5
+
+
+def test_generate_image_skips_true_cfg_scale_without_negative_prompt(monkeypatch):
+    """Pipelines that accept true_cfg_scale must NOT have it forwarded
+    when no negative_prompt is given; otherwise distilled CFG models
+    would unintentionally switch into real-CFG mode and degrade
+    quality / double inference cost."""
+    import core.inference.diffusion as d
+    from PIL import Image
+
+    backend = d.get_diffusion_backend()
+    captured: dict = {}
+
+    class _QwenLikePipe:
+        def __call__(
+            self,
+            *,
+            prompt,
+            negative_prompt = None,
+            num_inference_steps,
+            guidance_scale,
+            true_cfg_scale = 4.0,
+            width,
+            height,
+            **kw,
+        ):
+            captured["guidance_scale"] = guidance_scale
+            captured["true_cfg_scale"] = true_cfg_scale
+
+            class _Out:
+                pass
+
+            o = _Out()
+            o.images = [Image.new("RGB", (width, height), (1, 1, 1))]
+            return o
+
+    backend._pipe = _QwenLikePipe()
+    backend._device = "cpu"
+    backend._family = d._FAMILIES[2]
+    backend._repo_id = "stub/stub"
+
+    backend.generate_image(
+        prompt = "a sloth",
+        negative_prompt = None,
+        num_inference_steps = 4,
+        guidance_scale = 7.5,
+        width = 256,
+        height = 256,
+    )
+    assert captured["guidance_scale"] == 7.5
+    # Default left untouched: real CFG only activates with neg prompt.
+    assert captured["true_cfg_scale"] == 4.0

From f3f3f06dc1b16675fe93edb2a139d3295760ba43 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 00:49:14 +0000
Subject: [PATCH 20/92] Fix/adjust diffusion: pin requests chain in no-deps
 runtime for PR #5754

The Studio backend's no-torch-runtime.txt is installed via
pip --no-deps so the diffusion stack's transitive imports must
be pinned explicitly. huggingface_hub's blob downloader (used by
diffusers.GGUFQuantizationConfig and by every from_single_file
call) imports requests + urllib3 + charset_normalizer at module
load time; a fresh --no-deps install would return HTTP 500 on the
first /api/inference/images/load with PackageNotFoundError: 'requests'.

Adds requests, urllib3, and charset_normalizer to the
transitive-deps block next to the existing httpx chain.
---
 studio/backend/requirements/no-torch-runtime.txt | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/studio/backend/requirements/no-torch-runtime.txt b/studio/backend/requirements/no-torch-runtime.txt
index 6b7c17a0be..6e4b904267 100644
--- a/studio/backend/requirements/no-torch-runtime.txt
+++ b/studio/backend/requirements/no-torch-runtime.txt
@@ -57,6 +57,12 @@ gguf
 regex
 typing_extensions
 filelock
+# `requests` and its urllib3/charset chain are required by huggingface_hub's
+# blob downloader; diffusers + GGUFQuantizationConfig 500 on first
+# /api/inference/images/load otherwise.
+requests
+urllib3
+charset_normalizer
 httpx
 httpcore
 certifi

From f06895b73e826df2335bef1e442569436f2eed08 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 01:05:27 +0000
Subject: [PATCH 21/92] Fix/adjust diffusion: round 5 lifecycle + validation
 hardening for PR #5754

Round 5 reviewer findings, mostly symmetric-lifecycle and input
validation gaps the earlier rounds left open.

Backend lifecycle (P1)
  * routes/training.py: training start now also unloads the GGUF
    llama-server subprocess; was previously only unloading the
    safetensors backend, so starting training while a GGUF chat
    model was loaded kept the subprocess pinned to VRAM.
  * routes/inference.py: new _raise_if_training_active helper. Both
    GGUF and standard chat loads, plus /api/inference/images/load,
    now refuse with HTTP 409 when training is active instead of
    silently stopping training to free VRAM.
  * core/inference/diffusion.py:
    _release_other_gpu_owners_for_diffusion no longer stops active
    training. The route layer refuses the request first, so reaching
    the helper with training live would only happen from programmatic
    backend calls; better to surface OOM than terminate a long
    training run.
  * core/inference/diffusion.py: BF16 dtype is now gated on
    torch.cuda.is_bf16_supported. Pascal/Turing GPUs report
    is_available()=True but lack BF16 ALUs; FLUX kernels then fail
    inside from_pretrained. Falls back to FP16 instead of refusing.
  * core/inference/diffusion.py: GGUF transformer allocation and
    pipeline allocation now run AFTER releasing chat/export GPU
    owners; previously from_single_file ran first and could OOM
    before the intended VRAM handoff happened.
  * routes/models.py: /delete-cached now also blocks delete when
    diffusion is_loading=True (not just is_loaded); concurrent
    delete during hf_hub_download / from_single_file would have
    raced the rmtree.
  * routes/models.py: /delete-finetuned now also checks the
    diffusion backend before unlinking a Studio outputs/exports
    path. A user who exported a FLUX LoRA locally and loaded it via
    /images/load could previously rmtree the directory the
    diffusion backend was reading from.

Backend correctness / safety (P2)
  * core/inference/diffusion.py: _FAMILY_EXCLUDE for qwen-image now
    also covers qwen_image_edit / qwenimageedit underscore spellings
    so '...qwen_image_edit-GGUF' no longer misdetects as Qwen-Image.
  * core/inference/diffusion.py: detect_family now scans
    _FULL_REPO_FAMILIES in addition to _FAMILIES, so SDXL repos
    (stabilityai/stable-diffusion-xl-base-1.0) are auto-detected
    instead of failing with 'Could not infer a diffusion family'.
  * core/inference/diffusion.py: generate_image now uses a separate
    _generate_lock for the pipeline forward instead of holding
    _lock for the whole call. status() polls and concurrent unload
    requests no longer block for the full minutes-long generation.
  * routes/models.py: diffusion delete guard now uses exact repo-id
    match instead of prefix match; previously loading 'org/model-v2'
    would block deleting unrelated cached 'org/model'.
  * models/inference.py: DiffusionLoadRequest now rejects ASCII
    control characters in repo_id / gguf_filename / base_repo /
    family via field_validator (closes log-injection surface from
    authenticated callers). Also caps lengths at 256 chars.
  * models/inference.py: DiffusionGenerateRequest seed is now
    bounded to the int64/uint64 range; previously a huge seed
    (e.g. 2**100) passed Pydantic then crashed inside
    torch.Generator.manual_seed with 'Overflow when unpacking long
    long'.

Frontend (P2)
  * features/images/images-page.tsx: Custom HF repo panel now
    exposes a Pipeline family override dropdown; previously the
    backend supported it via DiffusionLoadRequest.family but the UI
    had no way to send it, so custom repos whose names did not
    contain a hard-coded substring failed to load.
  * features/images/images-page.tsx: handleLoad now re-fetches
    status on error. The backend clears its old pipeline before
    allocating the replacement; a failed swap previously left the
    UI showing 'Loaded:' with Generate enabled until manual
    refresh.

Tests (10 new)
  * underscore qwen-image-edit exclusion + SDXL full-repo detection
  * BF16 fallback when is_bf16_supported() returns False
  * status() does not block while generate_image holds _generate_lock
  * route layer rejects control chars in repo_id
  * route layer rejects 2**100 seeds (uint64-max boundary accepted)
  * route layer happy-path with negative-prompt true_cfg_scale
    forwarding (Qwen/Flux) and skip-when-no-neg (distilled CFG)
---
 studio/backend/core/inference/diffusion.py    | 128 +++++++++++++-----
 studio/backend/models/inference.py            |  48 ++++++-
 studio/backend/routes/inference.py            |  44 ++++++
 studio/backend/routes/models.py               |  60 +++++++-
 studio/backend/routes/training.py             |  17 +++
 .../backend/tests/test_diffusion_backend.py   | 118 ++++++++++++++++
 studio/backend/tests/test_diffusion_routes.py |  51 +++++++
 .../src/features/images/images-page.tsx       |  40 +++++-
 8 files changed, 458 insertions(+), 48 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index bd97736f03..29324aebfd 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -178,8 +178,22 @@ def _smart_base_repo(fam: DiffusionFamily, repo_id: str) -> str:
 # Qwen-Image-Edit. Each entry maps a family name to substrings that
 # must NOT appear anywhere in the repo id.
 _FAMILY_EXCLUDE: dict[str, tuple[str, ...]] = {
-    "stable-diffusion-3": ("3.5", "3-5", "stable-diffusion-3.5"),
-    "qwen-image": ("qwen-image-edit", "qwenimage-edit"),
+    "stable-diffusion-3": (
+        "3.5",
+        "3-5",
+        "3_5",
+        "stable-diffusion-3.5",
+        "stable_diffusion_3_5",
+    ),
+    # All underscore / hyphen spellings that appear in Hub repo ids for
+    # the *-Edit family must exclude Qwen-Image, otherwise
+    # ``unsloth/qwen_image_edit-GGUF`` matches the Qwen-Image base.
+    "qwen-image": (
+        "qwen-image-edit",
+        "qwenimage-edit",
+        "qwen_image_edit",
+        "qwenimageedit",
+    ),
 }
 
 
@@ -206,7 +220,10 @@ def detect_family(
     needle = (repo_id or "").lower()
     if not needle:
         return None
-    for fam in _FAMILIES:
+    # Scan _FAMILIES first (GGUF-supported), then _FULL_REPO_FAMILIES
+    # so a repo like ``stabilityai/stable-diffusion-xl-base-1.0`` is
+    # auto-detected as SDXL instead of returning None.
+    for fam in _FAMILIES + _FULL_REPO_FAMILIES:
         excludes = _FAMILY_EXCLUDE.get(fam.name, ())
         if any(e in needle for e in excludes):
             continue
@@ -243,15 +260,27 @@ class DiffusionBackend:
 
     def __init__(self) -> None:
         self._pipe: Any = None
-        # `_lock` protects mutations to the small state fields and the
-        # pipe call inside generate_image. `_load_lock` serialises the
-        # entire load_model call so two concurrent /images/load requests
-        # cannot both reach pipeline_cls.from_pretrained at the same
-        # time (which would double-spend VRAM and corrupt _pipe). The
-        # locks are taken in order load -> state so a generation in
-        # flight cannot deadlock the next load.
+        # `_lock` protects mutations to the small state fields and is
+        # the only lock taken by status(). It is intentionally NOT held
+        # for the long pipeline forward pass: holding it for the whole
+        # generate would block status() polls (frontend at 1 Hz) and
+        # any concurrent unload requests for minutes at a time.
+        #
+        # `_load_lock` serialises the entire load_model call so two
+        # concurrent /images/load requests cannot both reach
+        # pipeline_cls.from_pretrained at the same time (which would
+        # double-spend VRAM and corrupt _pipe).
+        #
+        # `_generate_lock` serialises pipeline __call__ since diffusers
+        # pipelines are not thread-safe; overlapping forwards on the
+        # shared pipe corrupt internal scheduler state.
+        #
+        # Lock order is load -> state and generate -> state (never
+        # state -> load/generate) so a forward in flight cannot
+        # deadlock the next load or a status poll.
         self._lock = threading.Lock()
         self._load_lock = threading.Lock()
+        self._generate_lock = threading.Lock()
         self._family: Optional[DiffusionFamily] = None
         self._repo_id: Optional[str] = None
         self._gguf_path: Optional[str] = None
@@ -306,11 +335,23 @@ def _pick_device_and_dtype(self) -> tuple[str, "Any"]:
         validated on. On macOS we use MPS in float16 to keep the pipeline
         on the Metal GPU. CPU is allowed only as a last resort because
         running FLUX on CPU is unusably slow (> 10 minutes per image).
+
+        BF16 is gated on ``torch.cuda.is_bf16_supported`` because the
+        Pascal / Volta / Turing class (sm_6x / sm_70 / sm_75) reports
+        ``is_available() == True`` but lacks BF16 ALUs; FLUX kernels
+        then fail inside ``from_pretrained`` or at the first denoise
+        step. Those cards still work on FP16, so fall back rather than
+        refuse to load.
         """
         import torch
 
         if torch.cuda.is_available():
-            return "cuda", torch.bfloat16
+            bf16_ok = False
+            try:
+                bf16_ok = bool(torch.cuda.is_bf16_supported())
+            except Exception:
+                bf16_ok = False
+            return "cuda", torch.bfloat16 if bf16_ok else torch.float16
         if (
             hasattr(torch, "backends")
             and getattr(torch.backends, "mps", None)
@@ -430,6 +471,23 @@ def load_model(
                         filename = gguf_filename,
                         token = hf_token,
                     )
+
+                # All cheap failure points (bad gguf_filename, missing
+                # pipeline / transformer class, gated download token,
+                # transient Hub error on the GGUF download) have now
+                # been validated. Anything past this line allocates
+                # GPU memory, so release competing GPU owners before
+                # we touch from_single_file or from_pretrained:
+                #   * Chat backends (llama-server + safetensors) so the
+                #     diffusion transformer does not race them for VRAM.
+                #   * Export subprocess (also holds GB on the same GPU).
+                # Training is *not* unloaded here: the route layer
+                # refuses /images/load with HTTP 409 when training is
+                # active so the user keeps their long run.
+                _release_chat_backend_for_diffusion()
+                _release_other_gpu_owners_for_diffusion()
+
+                if gguf_filename:
                     quant_config = diffusers.GGUFQuantizationConfig(compute_dtype = dtype)
                     # Diffusers-format GGUFs (FLUX.2 klein / Qwen-Image /
                     # SD3) need the matching base repo's component config
@@ -465,15 +523,6 @@ def load_model(
                 if hf_token:
                     pipe_kwargs["token"] = hf_token
 
-                # Cheap failure modes (bad gguf_filename, gated token,
-                # transient Hub error) have all happened by now. Only
-                # release the current chat backend + previous diffusion
-                # pipeline right before the expensive allocation so a
-                # typo does not kill the user's loaded chat model. Peak
-                # VRAM still stays at one model's worth because the
-                # release happens before from_pretrained.
-                _release_chat_backend_for_diffusion()
-                _release_other_gpu_owners_for_diffusion()
                 old = self._pipe
                 if old is not None:
                     with self._lock:
@@ -562,9 +611,14 @@ def generate_image(
     ) -> "Any":
         """Generate a single PIL image and return it.
 
-        The mutex is held for the entire call: diffusion pipelines are
-        not thread-safe, and overlapping ``__call__``s on a shared
-        pipeline frequently corrupt their internal scheduler state.
+        Concurrent generations are serialised by ``_generate_lock`` so
+        diffusion pipelines (not thread-safe; overlapping ``__call__``s
+        corrupt internal scheduler state) only ever run one at a time.
+        The state ``_lock`` is taken only to snapshot ``_pipe`` /
+        ``_device`` and immediately released: holding it for the whole
+        forward pass blocked ``status()`` polls and concurrent unload
+        requests for the entire (minutes-long) generation, which made
+        the UI feel frozen.
         """
         if not prompt or not prompt.strip():
             raise ValueError("prompt is empty")
@@ -586,6 +640,13 @@ def generate_image(
             pipe = self._pipe
             device = self._device or "cpu"
 
+        # _generate_lock outside _lock: only one forward at a time, but
+        # status() / unload() callers do not block on a running forward
+        # pass. unload_model takes _load_lock + _lock; the pipe object
+        # itself is kept alive by the local ``pipe`` reference until
+        # this function returns, so a concurrent unload during forward
+        # cannot free the weights from under us.
+        with self._generate_lock:
             generator = None
             if seed is not None:
                 # Match the device of the pipeline so determinism holds
@@ -728,20 +789,13 @@ def _release_other_gpu_owners_for_diffusion() -> None:
     except Exception as exc:
         logger.debug("export unload skipped: %s", exc)
 
-    # Active training subprocess
-    try:
-        from core.training import get_training_backend  # type: ignore
-
-        trn = get_training_backend()
-        if trn.is_training_active():
-            logger.info("Stopping training subprocess before diffusion load")
-            trn.stop_training()
-            for _ in range(60):
-                if not trn.is_training_active():
-                    break
-                time.sleep(0.5)
-    except Exception as exc:
-        logger.debug("training unload skipped: %s", exc)
+    # Note: active training is *not* stopped here. The route layer
+    # (`_raise_if_training_active` in routes/inference.py) refuses
+    # /images/load with HTTP 409 before this helper runs, so reaching
+    # this point with training still active would only happen in
+    # programmatic backend calls (tests, scripts). Silently terminating
+    # someone's training run when the diffusion load might still fail
+    # is worse than letting the load OOM and surfacing it explicitly.
 
 
 def _release(obj: Any) -> None:
diff --git a/studio/backend/models/inference.py b/studio/backend/models/inference.py
index f26220cc50..94b2d6232d 100644
--- a/studio/backend/models/inference.py
+++ b/studio/backend/models/inference.py
@@ -1426,6 +1426,27 @@ class AnthropicMessagesResponse(BaseModel):
 # ── Diffusion image generation ────────────────────────────────────
 
 
+def _no_control_chars(value: Optional[str], field_name: str) -> Optional[str]:
+    """Reject ASCII control chars (0x00-0x1f except tab, plus DEL)
+    in identifiers that get logged before HF validates them.
+
+    Authenticated callers could otherwise inject ``\\n`` / ``\\r`` /
+    NUL into ``logger.info("Loading diffusion model %s", repo_id)``
+    and forge fake log lines. HF repo ids and filenames legitimately
+    contain only ``[A-Za-z0-9._/-]``, so this is also a useful
+    correctness check (catches accidental ``"my repo\\n"`` paste).
+    """
+    if value is None:
+        return value
+    for ch in value:
+        if ch == "\x7f" or (ord(ch) < 0x20 and ch != "\t"):
+            raise ValueError(
+                f"{field_name} contains control characters; use a plain "
+                "Hugging Face repo / file name."
+            )
+    return value
+
+
 class DiffusionLoadRequest(BaseModel):
     """Load a diffusion image-generation model.
 
@@ -1435,16 +1456,20 @@ class DiffusionLoadRequest(BaseModel):
     VAE / text encoders when loading a GGUF-only repo.
     """
 
-    repo_id: str = Field(..., description = "HF repo id")
+    repo_id: str = Field(..., min_length = 1, max_length = 256, description = "HF repo id")
     gguf_filename: Optional[str] = Field(
-        None, description = "GGUF filename inside repo_id (Q4_K_S, Q8_0, ...)"
+        None,
+        max_length = 256,
+        description = "GGUF filename inside repo_id (Q4_K_S, Q8_0, ...)",
     )
     base_repo: Optional[str] = Field(
         None,
+        max_length = 256,
         description = "Diffusers base repo to source VAE + text encoders from",
     )
     family: Optional[str] = Field(
         None,
+        max_length = 64,
         description = "Force pipeline family: flux.2-klein | flux.2 | flux.1 | qwen-image | stable-diffusion-3 | stable-diffusion-xl",
     )
     hf_token: Optional[str] = Field(
@@ -1455,6 +1480,20 @@ class DiffusionLoadRequest(BaseModel):
         description = "Offload submodules to CPU between forwards. Trades a small speed hit for ~6 GB less VRAM on FLUX-class models.",
     )
 
+    @field_validator("repo_id", "gguf_filename", "base_repo", "family")
+    @classmethod
+    def _no_control_chars(cls, v, info):
+        return _no_control_chars(v, info.field_name)
+
+
+# torch.Generator.manual_seed accepts seeds in the inclusive range
+# [-2**63, 2**64 - 1] (negative values are remapped to positive
+# ones); anything outside it, e.g. 2**100, raises ``Overflow when
+# unpacking long long`` deep in the C++ layer. Accept exactly that
+# range and bounce the rest at the Pydantic layer with a clean error.
+_SEED_MIN = -(2 ** 63)
+_SEED_MAX = (2 ** 64) - 1
+
 
 class DiffusionGenerateRequest(BaseModel):
     """Generate a single image from the currently-loaded diffusion model."""
@@ -1466,7 +1505,10 @@ class DiffusionGenerateRequest(BaseModel):
     width: int = Field(1024, ge = 64, le = 2048)
     height: int = Field(1024, ge = 64, le = 2048)
     seed: Optional[int] = Field(
-        None, description = "Deterministic seed for reproducible outputs"
+        None,
+        ge = _SEED_MIN,
+        le = _SEED_MAX,
+        description = "Deterministic seed for reproducible outputs",
     )
 
     @field_validator("width", "height")
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index 270f42bded..1ef2a3b260 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -242,6 +242,35 @@ def _friendly_error(exc: Exception) -> str:
 studio_router = APIRouter()
 
 
+def _raise_if_training_active(workload: str) -> None:
+    """Refuse a chat/diffusion/export load while training is active.
+
+    Without this guard the load path would either (a) silently stop a
+    running training run (the old diffusion-load behavior) or (b)
+    double-spend VRAM and OOM both jobs. Both are worse for the user
+    than a 409 explaining why the request was refused. Best-effort
+    import so unit-test backends without core.training do not 500.
+    """
+    try:
+        from core.training import get_training_backend  # type: ignore
+    except Exception:
+        return
+    try:
+        trn = get_training_backend()
+        if trn.is_training_active():
+            raise HTTPException(
+                status_code = 409,
+                detail = (
+                    f"Training is currently active. Stop the training run "
+                    f"before loading a {workload} model."
+                ),
+            )
+    except HTTPException:
+        raise
+    except Exception as exc:
+        logger.debug("training activity check skipped: %s", exc)
+
+
 def _detect_safetensors_features(backend, chat_template: Optional[str]) -> dict:
     """Classify reasoning/tool capabilities via the GGUF classifier so
     flags match across backends. gpt-oss is overridden because Harmony
@@ -737,6 +766,12 @@ async def load_model(
                     detail = "gpu_ids is not supported for GGUF models yet.",
                 )
 
+            # Symmetric lifecycle guard: refuse a chat load while
+            # training is active. Diffusion and export paths refuse;
+            # without this the GGUF chat load would start llama-server
+            # while training still owned VRAM and double-spend it.
+            _raise_if_training_active("chat")
+
             llama_backend = get_llama_cpp_backend()
             unsloth_backend = get_inference_backend()
 
@@ -928,6 +963,11 @@ async def load_model(
             )
 
         # ── Standard path: load via Unsloth/transformers ──────────
+        # Symmetric lifecycle guard: refuse a chat load while training
+        # is active so we do not OOM both the training and inference
+        # jobs together.
+        _raise_if_training_active("chat")
+
         backend = get_inference_backend()
 
         # Unload any active GGUF model first
@@ -1643,6 +1683,10 @@ async def diffusion_load(
     desired ``gguf_filename``. Returns the new status payload (same
     shape as ``/images/status``).
     """
+    # Refuse before the long download starts: silently stopping a
+    # running training run to free VRAM was the previous behavior and
+    # left the user with no model loaded plus a dead training job.
+    _raise_if_training_active("diffusion")
     backend = _get_diffusion_backend()
     try:
         status = await asyncio.get_event_loop().run_in_executor(
diff --git a/studio/backend/routes/models.py b/studio/backend/routes/models.py
index 13e2a4e83f..1029dd5bbe 100644
--- a/studio/backend/routes/models.py
+++ b/studio/backend/routes/models.py
@@ -1968,6 +1968,49 @@ async def delete_finetuned_model(
             detail = "Could not verify model load status before deleting",
         ) from e
 
+    # Diffusion pipelines can also be loaded directly from a Studio
+    # outputs/exports path (e.g. user fine-tuned a FLUX LoRA, exported
+    # the merged repo locally, then loaded it via /images/load with a
+    # local path as repo_id). Without this guard /delete-finetuned
+    # could rmtree the directory the diffusion backend is reading from.
+    try:
+        from core.inference.diffusion import get_diffusion_backend
+
+        diff_backend = get_diffusion_backend()
+        diff_status = diff_backend.status()
+        if diff_status.get("is_loaded") or diff_status.get("is_loading"):
+            diff_repo = diff_status.get("repo_id") or ""
+            diff_base = diff_status.get("base_repo") or ""
+            target_str = str(target_path)
+            for candidate in (diff_repo, diff_base):
+                if not candidate:
+                    continue
+                try:
+                    candidate_path = Path(candidate).expanduser()
+                except Exception:
+                    continue
+                if not candidate_path.is_absolute():
+                    continue
+                try:
+                    candidate_resolved = candidate_path.resolve()
+                except Exception:
+                    continue
+                if (
+                    candidate_resolved == target_path
+                    or str(candidate_resolved) == target_str
+                    or _is_path_under(candidate_resolved, target_path)
+                ):
+                    raise HTTPException(
+                        status_code = 400,
+                        detail = "Unload the diffusion image model before deleting",
+                    )
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.warning(
+            "Could not check diffusion backend loaded model before delete: %s", e
+        )
+
     try:
         if export_type == "gguf" and gguf_variant:
             if not target_path.is_dir():
@@ -2632,20 +2675,25 @@ async def delete_cached_model(
     except Exception:
         pass
 
-    # Also refuse to delete the cache underlying a loaded diffusion
-    # pipeline. The diffusion backend mmap's the GGUF + base repo
-    # weights and continues to read from the cache long after load,
-    # so deleting them out from under it would corrupt generation.
+    # Also refuse to delete the cache underlying a loaded or *loading*
+    # diffusion pipeline. The diffusion backend mmaps the GGUF + base
+    # repo weights and continues to read from the cache long after
+    # load; deleting them out from under it would corrupt generation.
+    # is_loading=True is also blocked because a mid-flight
+    # hf_hub_download / from_single_file would race the rmtree.
+    # Match exactly on repo_id (case-insensitive) instead of prefix to
+    # avoid blocking unrelated deletes like "org/model" while
+    # "org/model-v2" is loaded.
     try:
         from core.inference.diffusion import get_diffusion_backend
 
         diff_backend = get_diffusion_backend()
         diff_status = diff_backend.status()
-        if diff_status.get("is_loaded"):
+        if diff_status.get("is_loaded") or diff_status.get("is_loading"):
             diff_repo = (diff_status.get("repo_id") or "").lower()
             diff_base = (diff_status.get("base_repo") or "").lower()
             needle = repo_id.lower()
-            if diff_repo.startswith(needle) or diff_base.startswith(needle):
+            if diff_repo == needle or diff_base == needle:
                 raise HTTPException(
                     status_code = 400,
                     detail = "Unload the diffusion image model before deleting",
diff --git a/studio/backend/routes/training.py b/studio/backend/routes/training.py
index 99a775c0b7..38f4108d25 100644
--- a/studio/backend/routes/training.py
+++ b/studio/backend/routes/training.py
@@ -282,6 +282,23 @@ async def start_training(
         except Exception as e:
             logger.warning("Could not unload inference model: %s", e)
 
+        # GGUF chat backend (llama-server subprocess). Without this,
+        # starting training while a GGUF model is loaded keeps the
+        # subprocess pinned to VRAM and OOMs the training job. Mirrors
+        # the symmetric handoffs in routes/inference.py and
+        # routes/export.py.
+        try:
+            from routes.inference import get_llama_cpp_backend
+
+            llama_backend = get_llama_cpp_backend()
+            if getattr(llama_backend, "is_loaded", False):
+                logger.info(
+                    "Unloading GGUF chat model to free GPU memory for training"
+                )
+                llama_backend.unload_model()
+        except Exception as e:
+            logger.warning("Could not unload GGUF chat model: %s", e)
+
         try:
             from core.export import get_export_backend
 
diff --git a/studio/backend/tests/test_diffusion_backend.py b/studio/backend/tests/test_diffusion_backend.py
index 6014bda840..7a4312d5dd 100644
--- a/studio/backend/tests/test_diffusion_backend.py
+++ b/studio/backend/tests/test_diffusion_backend.py
@@ -127,6 +127,25 @@ def test_detect_family_qwen_image_edit_is_not_qwen_image():
 
     assert detect_family("unsloth/Qwen-Image-Edit-GGUF") is None
     assert detect_family("unsloth/Qwen-Image-Edit-2509-GGUF") is None
+    # Underscore spellings on the Hub must also be excluded; otherwise
+    # qwen_image_edit-GGUF silently matches the base Qwen-Image family.
+    assert detect_family("unsloth/qwen_image_edit-GGUF") is None
+    assert detect_family("unsloth/QwenImageEdit-GGUF") is None
+
+
+def test_detect_family_finds_full_repo_sdxl():
+    """SDXL lives in _FULL_REPO_FAMILIES, but the auto-detector must
+    still find it for ``stabilityai/stable-diffusion-xl-base-1.0`` so
+    the Custom HF repo entry point does not fail with 'Could not infer
+    a diffusion family' for the canonical SDXL repo."""
+    from core.inference.diffusion import detect_family
+
+    fam = detect_family("stabilityai/stable-diffusion-xl-base-1.0")
+    assert fam is not None
+    assert fam.name == "stable-diffusion-xl"
+    fam2 = detect_family("nerijs/sdxl-lora-test")
+    assert fam2 is not None
+    assert fam2.name == "stable-diffusion-xl"
 
 
 def test_supported_families_payload_shape():
@@ -937,3 +956,102 @@ class _Out:
     assert captured["guidance_scale"] == 7.5
     # Default left untouched: real CFG only activates with neg prompt.
     assert captured["true_cfg_scale"] == 4.0
+
+
+def test_generate_image_does_not_block_status(monkeypatch):
+    """status() must return promptly while a generation is in flight;
+    holding _lock for the whole forward froze the Images UI on the
+    polling endpoint for the entire (minutes long) generation."""
+    import threading
+    import core.inference.diffusion as d
+    from PIL import Image
+
+    backend = d.get_diffusion_backend()
+    pipe_started = threading.Event()
+    pipe_release = threading.Event()
+
+    class _SlowPipe:
+        def __call__(self, **kw):
+            pipe_started.set()
+            # Wait until the test releases us; status() should return
+            # before this event is set.
+            pipe_release.wait(timeout = 5)
+
+            class _Out:
+                pass
+
+            o = _Out()
+            o.images = [Image.new("RGB", (kw["width"], kw["height"]), (1, 2, 3))]
+            return o
+
+    backend._pipe = _SlowPipe()
+    backend._device = "cpu"
+    backend._family = d._FAMILIES[0]
+    backend._repo_id = "stub/stub"
+
+    t = threading.Thread(
+        target = backend.generate_image,
+        kwargs = dict(
+            prompt = "a sloth",
+            num_inference_steps = 1,
+            guidance_scale = 1.0,
+            width = 64,
+            height = 64,
+        ),
+    )
+    t.start()
+    try:
+        assert pipe_started.wait(timeout = 5)
+        # Forward is in progress; status() must not block on _lock.
+        completed = [False]
+
+        def call_status():
+            backend.status()
+            completed[0] = True
+
+        s = threading.Thread(target = call_status)
+        s.start()
+        s.join(timeout = 2)
+        assert completed[0], "status() blocked on generate_image"
+    finally:
+        pipe_release.set()
+        t.join(timeout = 5)
+
+
+def test_bf16_falls_back_to_fp16_on_old_cuda(monkeypatch):
+    """CUDA availability does not imply BF16 support; old GPUs report
+    is_available()=True and is_bf16_supported()=False. The backend
+    must fall back to FP16 rather than picking BF16 and failing
+    deep inside from_pretrained."""
+    import core.inference.diffusion as d
+
+    class _FakeCuda:
+        @staticmethod
+        def is_available():
+            return True
+
+        @staticmethod
+        def is_bf16_supported():
+            return False
+
+    class _FakeBackends:
+        class mps:
+            @staticmethod
+            def is_available():
+                return False
+
+    class _FakeTorch:
+        cuda = _FakeCuda
+        backends = _FakeBackends
+        # Sentinel objects so the dtype identity comparison works.
+        bfloat16 = object()
+        float16 = object()
+        float32 = object()
+
+    fake_torch = _FakeTorch()
+    monkeypatch.setitem(sys.modules, "torch", fake_torch)
+
+    backend = d.DiffusionBackend()
+    device, dtype = backend._pick_device_and_dtype()
+    assert device == "cuda"
+    assert dtype is fake_torch.float16
diff --git a/studio/backend/tests/test_diffusion_routes.py b/studio/backend/tests/test_diffusion_routes.py
index ca420f72c5..39dd28c88a 100644
--- a/studio/backend/tests/test_diffusion_routes.py
+++ b/studio/backend/tests/test_diffusion_routes.py
@@ -188,3 +188,54 @@ def test_unload_clears_state(app_with_stub):
     assert r.json()["is_loaded"] is False
     r = c.get("/api/inference/images/status")
     assert r.json()["is_loaded"] is False
+
+
+def test_load_rejects_control_chars_in_repo_id(app_with_stub):
+    """Newline-laden repo ids must be rejected by Pydantic BEFORE the
+    log line that echoes them. Catches log-injection from authenticated
+    callers (issues a 422 instead of forging a fake log line)."""
+    app, _ = app_with_stub
+    c = TestClient(app)
+    r = c.post(
+        "/api/inference/images/load",
+        json = {"repo_id": "owner/model\nFAKE_LOG_LINE"},
+    )
+    assert r.status_code == 422, r.text
+    body = r.json()
+    text = repr(body).lower()
+    assert "control" in text or "repo_id" in text
+
+
+def test_generate_rejects_oversize_seed(app_with_stub):
+    """Huge seeds raise inside torch.Generator.manual_seed; Pydantic
+    must reject them first with a 422 instead of a 500 traceback."""
+    app, _ = app_with_stub
+    c = TestClient(app)
+    c.post(
+        "/api/inference/images/load",
+        json = {"repo_id": "unsloth/FLUX.2-klein-4B-GGUF", "gguf_filename": "x.gguf"},
+    )
+    r = c.post(
+        "/api/inference/images/generate",
+        json = {"prompt": "x", "seed": 2 ** 100},
+    )
+    assert r.status_code == 422, r.text
+
+
+def test_generate_accepts_uint64_max_seed(app_with_stub):
+    """Boundary value: 2**64 - 1 (uint64 max) is the largest seed
+    torch.Generator on CPU accepts; rejecting it would frustrate
+    users who paste large seeds from other tooling."""
+    app, _ = app_with_stub
+    c = TestClient(app)
+    c.post(
+        "/api/inference/images/load",
+        json = {"repo_id": "unsloth/FLUX.2-klein-4B-GGUF", "gguf_filename": "x.gguf"},
+    )
+    r = c.post(
+        "/api/inference/images/generate",
+        json = {"prompt": "x", "seed": (2 ** 64) - 1},
+    )
+    # The fake backend returns 200 on success; we only care that the
+    # request did NOT 422 on seed bounds.
+    assert r.status_code != 422, r.text
diff --git a/studio/frontend/src/features/images/images-page.tsx b/studio/frontend/src/features/images/images-page.tsx
index 05eb5c3a89..3b3891e9e6 100644
--- a/studio/frontend/src/features/images/images-page.tsx
+++ b/studio/frontend/src/features/images/images-page.tsx
@@ -110,6 +110,7 @@ export function ImagesPage() {
   const [presetIndex, setPresetIndex] = useState(0);
   const [customRepoId, setCustomRepoId] = useState("");
   const [customGguf, setCustomGguf] = useState("");
+  const [customFamily, setCustomFamily] = useState<string>("auto");
   const [useCustom, setUseCustom] = useState(false);
   const [hfToken, setHfToken] = useState("");
 
@@ -151,7 +152,15 @@ export function ImagesPage() {
     try {
       const repo = useCustom ? customRepoId.trim() : preset.repo_id;
       const gguf = useCustom ? customGguf.trim() || undefined : preset.default_gguf;
-      const family = useCustom ? undefined : preset.family;
+      // Custom mode lets the user pin a family explicitly because
+      // detect_family is substring-based and exotic repo names (custom
+      // fine-tunes, third-party mirrors) frequently fail to match.
+      // "auto" leaves the override blank and lets the backend infer.
+      const family = useCustom
+        ? customFamily === "auto"
+          ? undefined
+          : customFamily
+        : preset.family;
       // Always pass base_repo for curated entries; custom-repo mode
       // lets the backend either infer it from the family default or
       // (when no GGUF is given) treat the repo as a full diffusers
@@ -174,10 +183,15 @@ export function ImagesPage() {
       toast.error("Failed to load image model", {
         description: err instanceof Error ? err.message : String(err),
       });
+      // Backend clears its old pipeline before allocating the new one;
+      // a failed swap leaves status.is_loaded=false while our local
+      // copy still says loaded. Re-fetch so Generate disables and the
+      // user does not see a stale "Loaded:" label.
+      await refreshStatus();
     } finally {
       setBusy("idle");
     }
-  }, [useCustom, customRepoId, customGguf, preset, hfToken]);
+  }, [useCustom, customRepoId, customGguf, customFamily, preset, hfToken, refreshStatus]);
 
   const handleUnload = useCallback(async () => {
     setBusy("unloading");
@@ -320,6 +334,28 @@ export function ImagesPage() {
                 onChange={(e) => setCustomGguf(e.target.value)}
                 placeholder="FLUX.2-klein-4B-Q4_K_S.gguf"
               />
+              <Label>Pipeline family (override)</Label>
+              <Select
+                value={customFamily}
+                onValueChange={setCustomFamily}
+              >
+                <SelectTrigger>
+                  <SelectValue />
+                </SelectTrigger>
+                <SelectContent>
+                  <SelectItem value="auto">Auto-detect from repo id</SelectItem>
+                  <SelectItem value="flux.2-klein">FLUX.2 klein</SelectItem>
+                  <SelectItem value="flux.2">FLUX.2</SelectItem>
+                  <SelectItem value="flux.1">FLUX.1</SelectItem>
+                  <SelectItem value="qwen-image">Qwen-Image</SelectItem>
+                  <SelectItem value="stable-diffusion-3">Stable Diffusion 3</SelectItem>
+                  <SelectItem value="stable-diffusion-xl">Stable Diffusion XL</SelectItem>
+                </SelectContent>
+              </Select>
+              <p className="text-xs text-muted-foreground">
+                {"Set this when your repo name does not contain "}
+                {"a recognised family substring (e.g. private fine-tunes)."}
+              </p>
             </div>
           )}
 

From 8858104b29e77ffd0b705d1d4f218d30f9b1b5ba Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 01:05:45 +0000
Subject: [PATCH 22/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/models/inference.py            | 4 ++--
 studio/backend/routes/training.py             | 4 +---
 studio/backend/tests/test_diffusion_routes.py | 4 ++--
 3 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/studio/backend/models/inference.py b/studio/backend/models/inference.py
index 94b2d6232d..2767474e5f 100644
--- a/studio/backend/models/inference.py
+++ b/studio/backend/models/inference.py
@@ -1491,8 +1491,8 @@ def _no_control_chars(cls, v, info):
 # in the C++ layer. uint64 is also routinely cited online so accept
 # any value the underlying RNG could store and bounce the rest at the
 # Pydantic layer with a clean error.
-_SEED_MIN = -(2 ** 63)
-_SEED_MAX = (2 ** 64) - 1
+_SEED_MIN = -(2**63)
+_SEED_MAX = (2**64) - 1
 
 
 class DiffusionGenerateRequest(BaseModel):
diff --git a/studio/backend/routes/training.py b/studio/backend/routes/training.py
index 38f4108d25..0715a79c66 100644
--- a/studio/backend/routes/training.py
+++ b/studio/backend/routes/training.py
@@ -292,9 +292,7 @@ async def start_training(
 
             llama_backend = get_llama_cpp_backend()
             if getattr(llama_backend, "is_loaded", False):
-                logger.info(
-                    "Unloading GGUF chat model to free GPU memory for training"
-                )
+                logger.info("Unloading GGUF chat model to free GPU memory for training")
                 llama_backend.unload_model()
         except Exception as e:
             logger.warning("Could not unload GGUF chat model: %s", e)
diff --git a/studio/backend/tests/test_diffusion_routes.py b/studio/backend/tests/test_diffusion_routes.py
index 39dd28c88a..9d21f0af0b 100644
--- a/studio/backend/tests/test_diffusion_routes.py
+++ b/studio/backend/tests/test_diffusion_routes.py
@@ -217,7 +217,7 @@ def test_generate_rejects_oversize_seed(app_with_stub):
     )
     r = c.post(
         "/api/inference/images/generate",
-        json = {"prompt": "x", "seed": 2 ** 100},
+        json = {"prompt": "x", "seed": 2**100},
     )
     assert r.status_code == 422, r.text
 
@@ -234,7 +234,7 @@ def test_generate_accepts_uint64_max_seed(app_with_stub):
     )
     r = c.post(
         "/api/inference/images/generate",
-        json = {"prompt": "x", "seed": (2 ** 64) - 1},
+        json = {"prompt": "x", "seed": (2**64) - 1},
     )
     # The fake backend returns 200 on success; we only care that the
     # request did NOT 422 on seed bounds.

From 04de106e49295dc1e2c3a4ed05fa7fa1352be009 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 01:27:57 +0000
Subject: [PATCH 23/92] Fix/adjust diffusion: round 6 race-free lifecycle +
 delete guards for PR #5754

Round 6 reviewers identified several races between load / unload /
generate and several fail-open delete guards. This commit closes
them by widening the lock scope, publishing the pending load
target through status(), and switching delete guards to
fail-closed.

Lifecycle (P1)
  * core/inference/diffusion.py: load_model now also takes
    _generate_lock. Previous behavior released and reallocated the
    pipeline while a generation forward was still iterating
    denoising steps, corrupting scheduler state and stacking VRAM.
    The forward only briefly touches _lock, so taking it on the
    load path does not introduce a deadlock.
  * core/inference/diffusion.py: unload_model now also takes
    _generate_lock. Without it, /images/unload returned
    is_loaded=False while a slow forward was still running, which
    let chat / training / export handoffs allocate VRAM on top of
    the still-resident pipeline.
  * core/inference/diffusion.py: previous pipeline release now
    happens BEFORE from_single_file / from_pretrained. Switching
    FLUX.2 klein 4B -> 9B on a 16-24 GB GPU was failing because
    the new transformer allocation overlapped the old pipe's
    residency.
  * core/inference/diffusion.py: failed pipeline from_pretrained
    now explicitly releases the just-loaded transformer; previously
    its weights stayed pinned to GPU until GC and made the next
    load more likely to OOM.

Pending-target / delete guards (P1)
  * core/inference/diffusion.py: load_model now publishes
    _pending_repo_id / _pending_base_repo / _pending_gguf_filename
    under _lock at the start of the call (and refreshes
    _pending_base_repo when the smart-base / repo defaults resolve).
    status() exposes those as 'repo_id' / 'base_repo' /
    'gguf_filename' during is_loading=True so delete guards can see
    the target before _repo_id is set on success.
  * routes/models.py /delete-cached + /delete-finetuned: diffusion
    status check now fails CLOSED (HTTP 503) when status() raises.
    Both guards previously logged and continued, which could let a
    delete proceed against a repo whose status was unverifiable.
  * routes/models.py: is_loading is also blocked on both guards
    so a mid-download / mid-from_pretrained rmtree is refused.

Symmetric handoffs (P1)
  * routes/export.py: /load-checkpoint now refuses with HTTP 409
    when training is active instead of calling stop_training().
    Chat and /images/load did the same after round 5; export was
    the remaining asymmetry that would silently kill a long
    training run.
  * routes/training.py, routes/inference.py (GGUF and standard
    chat), routes/export.py: diffusion handoff now treats
    is_loading as is_loaded. The diffusion backend's unload waits
    on _load_lock + _generate_lock so an in-flight load completes
    first.

Requirements (P1)
  * requirements/studio.txt: pin python-multipart explicitly. The
    Studio routes package's eager router imports include
    routes/datasets.py whose FastAPI UploadFile/File validation
    crashes with RuntimeError without it in fresh test envs.

Frontend (P2)
  * features/images/api.ts + images-page.tsx: seed handling now
    accepts the full [-2^63, 2^64 - 1] range via BigInt. The
    previous safe-integer cap rejected valid uint64 seeds the
    backend accepts. A small stringify helper emits BigInts as JSON
    integers without touching the rest of the payload.

Tests
  * test_diffusion_routes.py: load routes/inference.py via
    importlib.spec_from_file_location to avoid triggering
    routes/__init__.py (which would pull in training / datasets /
    data_recipe imports unrelated to diffusion tests).
  * test_diffusion_backend.py: status() during is_loading shows
    pending repo + base; unload waits for in-flight generation.
---
 studio/backend/core/inference/diffusion.py    | 139 +++++++++++++-----
 studio/backend/requirements/studio.txt        |   5 +
 studio/backend/routes/export.py               |  62 ++++----
 studio/backend/routes/inference.py            |  25 +++-
 studio/backend/routes/models.py               |  25 +++-
 studio/backend/routes/training.py             |  11 +-
 .../backend/tests/test_diffusion_backend.py   | 131 +++++++++++++++++
 studio/backend/tests/test_diffusion_routes.py |  38 ++++-
 studio/frontend/src/features/images/api.ts    |  17 ++-
 .../src/features/images/images-page.tsx       |  40 +++--
 10 files changed, 407 insertions(+), 86 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 29324aebfd..8ae3a13f0f 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -290,6 +290,15 @@ def __init__(self) -> None:
         self._loaded_at: Optional[float] = None
         self._loading: bool = False
         self._last_error: Optional[str] = None
+        # `_pending_*` fields advertise the target of an in-flight load
+        # so cache- and finetuned-delete guards can refuse to rmtree a
+        # repo while it is being downloaded / read. They are set under
+        # _lock at the start of load_model and cleared in its finally
+        # block (success or failure). The route layer reads them via
+        # status() under _lock.
+        self._pending_repo_id: Optional[str] = None
+        self._pending_base_repo: Optional[str] = None
+        self._pending_gguf_filename: Optional[str] = None
 
     # ── lifecycle ─────────────────────────────────────────────────
 
@@ -311,16 +320,24 @@ def status(self) -> dict[str, Any]:
         # POSIX layouts) to any authenticated Studio session.
         with self._lock:
             gguf_basename = Path(self._gguf_path).name if self._gguf_path else None
+            # During an in-flight load, fall back to _pending_* when the
+            # resident fields are empty so cache / finetuned delete
+            # guards can see the repo that is mid-download. While an old
+            # pipeline is still resident its ids take precedence; the
+            # pending fields are cleared once the load finishes.
+            effective_repo = self._repo_id or self._pending_repo_id
+            effective_base = self._base_repo or self._pending_base_repo
+            effective_gguf = gguf_basename or self._pending_gguf_filename
             return {
                 "is_loaded": self._pipe is not None,
                 "is_loading": self._loading,
-                "repo_id": self._repo_id,
+                "repo_id": effective_repo,
                 "family": self._family.name if self._family else None,
                 "pipeline_class": (
                     self._family.pipeline_class if self._family else None
                 ),
-                "base_repo": self._base_repo,
-                "gguf_filename": gguf_basename,
+                "base_repo": effective_base,
+                "gguf_filename": effective_gguf,
                 "device": self._device,
                 "dtype": self._dtype,
                 "loaded_at": self._loaded_at,
@@ -406,10 +423,26 @@ def load_model(
         # cannot both kick off a multi-GB download + GPU upload at once.
         # The second caller waits behind the first and then loads on top
         # of the now-populated state via the normal swap path.
-        with self._load_lock:
+        # _generate_lock is also taken so we do not start swapping the
+        # pipeline (release old + allocate new) while a previous
+        # generation is still iterating denoising steps; releasing the
+        # pipe out from under an in-flight forward corrupts scheduler
+        # state. Order: _load_lock -> _generate_lock -> _lock so a
+        # forward (which only takes _generate_lock + briefly _lock)
+        # cannot block a queued load forever.
+        with self._load_lock, self._generate_lock:
             with self._lock:
                 self._loading = True
                 self._last_error = None
+                # Publish the pending target so cache / finetuned
+                # delete guards can see what is mid-download even
+                # before _repo_id / _base_repo are populated on
+                # success.
+                self._pending_repo_id = repo_id
+                self._pending_base_repo = base_repo
+                self._pending_gguf_filename = (
+                    Path(gguf_filename).name if gguf_filename else None
+                )
             try:
                 pipeline_cls = getattr(diffusers, fam.pipeline_class, None)
                 if pipeline_cls is None:
@@ -433,6 +466,10 @@ def load_model(
                 #      9B GGUF picks the 9B base, not the 4B fallback
                 if base_repo:
                     effective_base = base_repo
+                    # Re-publish pending for symmetry with the branches
+                    # below; it already equals the caller-supplied base.
+                    with self._lock:
+                        self._pending_base_repo = effective_base
                 elif not gguf_filename:
                     # Guard: a repo that ends in "-GGUF" (the unsloth
                     # convention) is GGUF-only and will 500 on
@@ -447,8 +484,12 @@ def load_model(
                             "load target."
                         )
                     effective_base = repo_id
+                    with self._lock:
+                        self._pending_base_repo = effective_base
                 else:
                     effective_base = _smart_base_repo(fam, repo_id)
+                    with self._lock:
+                        self._pending_base_repo = effective_base
                 logger.info(
                     "Loading diffusion model %s (family=%s, device=%s, dtype=%s, base=%s)",
                     repo_id,
@@ -476,17 +517,41 @@ def load_model(
                 # pipeline / transformer class, gated download token,
                 # transient Hub error on the GGUF download) have now
                 # been validated. Anything past this line allocates
-                # GPU memory, so release competing GPU owners before
-                # we touch from_single_file or from_pretrained:
-                #   * Chat backends (llama-server + safetensors) so the
-                #     diffusion transformer does not race them for VRAM.
-                #   * Export subprocess (also holds GB on the same GPU).
+                # GPU memory, so:
+                #   1. Release competing GPU owners (chat + export).
+                #   2. Release any *previous* diffusion pipeline so the
+                #      new transformer / new from_pretrained does not
+                #      race the old pipe for VRAM. Switching between
+                #      FLUX.2 klein 4B and 9B on a 16-24 GB GPU OOMs
+                #      otherwise: from_single_file allocates the new
+                #      transformer while the old pipeline still owns
+                #      its weights.
+                #   3. THEN call from_single_file / from_pretrained.
                 # Training is *not* unloaded here: the route layer
                 # refuses /images/load with HTTP 409 when training is
                 # active so the user keeps their long run.
                 _release_chat_backend_for_diffusion()
                 _release_other_gpu_owners_for_diffusion()
 
+                old = self._pipe
+                if old is not None:
+                    with self._lock:
+                        # Clear ALL metadata together so a failed swap
+                        # cannot leave status() reporting the previous
+                        # repo / family / base_repo on top of an empty
+                        # pipe. The except block below will restore
+                        # last_error so the caller knows what happened.
+                        self._pipe = None
+                        self._family = None
+                        self._repo_id = None
+                        self._gguf_path = None
+                        self._base_repo = None
+                        self._device = None
+                        self._dtype = None
+                        self._loaded_at = None
+                    _release(old)
+                    old = None
+
                 if gguf_filename:
                     quant_config = diffusers.GGUFQuantizationConfig(compute_dtype = dtype)
                     # Diffusers-format GGUFs (FLUX.2 klein / Qwen-Image /
@@ -523,26 +588,19 @@ def load_model(
                 if hf_token:
                     pipe_kwargs["token"] = hf_token
 
-                old = self._pipe
-                if old is not None:
-                    with self._lock:
-                        # Clear ALL metadata together so a failed swap
-                        # cannot leave status() reporting the previous
-                        # repo / family / base_repo on top of an empty
-                        # pipe. The except block below will restore
-                        # last_error so the caller knows what happened.
-                        self._pipe = None
-                        self._family = None
-                        self._repo_id = None
-                        self._gguf_path = None
-                        self._base_repo = None
-                        self._device = None
-                        self._dtype = None
-                        self._loaded_at = None
-                    _release(old)
-                    old = None
-
-                pipe = pipeline_cls.from_pretrained(effective_base, **pipe_kwargs)
+                try:
+                    pipe = pipeline_cls.from_pretrained(effective_base, **pipe_kwargs)
+                except Exception:
+                    # If from_pretrained fails after the transformer was
+                    # already loaded, the transformer object holds GPU
+                    # weights that would only be freed at GC. Drop the
+                    # local reference and force a collect so the next
+                    # load attempt does not stack VRAM with a phantom
+                    # transformer.
+                    if transformer is not None:
+                        _release(transformer)
+                        transformer = None
+                    raise
                 if enable_model_cpu_offload and device == "cuda":
                     pipe.enable_model_cpu_offload()
                 else:
@@ -557,8 +615,6 @@ def load_model(
                     self._device = device
                     self._dtype = str(dtype).replace("torch.", "")
                     self._loaded_at = time.time()
-                # ``old`` was released above before the new allocation;
-                # nothing left to free here.
 
                 return self.status()
             except Exception as exc:
@@ -577,12 +633,25 @@ def load_model(
             finally:
                 with self._lock:
                     self._loading = False
+                    # Clear pending so status() falls back to publishing
+                    # the resident pipeline (or nothing, on a failed
+                    # swap). Keeping pending alive after the load
+                    # finishes would falsely block deletes forever.
+                    self._pending_repo_id = None
+                    self._pending_base_repo = None
+                    self._pending_gguf_filename = None
 
     def unload_model(self) -> dict[str, Any]:
-        # Take the load lock too so unload cannot race with an in-flight
-        # load_model and have the load thread overwrite the cleared state
-        # after we already returned {"is_loaded": false}.
-        with self._load_lock:
+        # Take the load lock and the generate lock so unload cannot:
+        #   * race with an in-flight load_model and have the load
+        #     thread overwrite the cleared state after we already
+        #     returned {"is_loaded": false}.
+        #   * return is_loaded=false while a forward pass is still
+        #     iterating denoising steps on the soon-to-be-freed pipe.
+        # The generate forward only holds _generate_lock (briefly
+        # _lock), so acquiring _generate_lock here blocks until any
+        # in-flight generation completes.
+        with self._load_lock, self._generate_lock:
             with self._lock:
                 old = self._pipe
                 self._pipe = None
diff --git a/studio/backend/requirements/studio.txt b/studio/backend/requirements/studio.txt
index 96f8816b57..6628eef7f7 100644
--- a/studio/backend/requirements/studio.txt
+++ b/studio/backend/requirements/studio.txt
@@ -1,6 +1,11 @@
 # Studio UI backend dependencies
 typer
 fastapi
+# Required by FastAPI's multipart upload route validation
+# (routes/datasets.py uploads files via UploadFile/File). Without
+# this, importing the routes package raises RuntimeError on startup
+# and CPU-only test environments fail before any test runs.
+python-multipart
 uvicorn
 pydantic
 packaging
diff --git a/studio/backend/routes/export.py b/studio/backend/routes/export.py
index ff9e3d6695..d45659a386 100644
--- a/studio/backend/routes/export.py
+++ b/studio/backend/routes/export.py
@@ -94,42 +94,52 @@ async def load_checkpoint(
         except Exception as e:
             logger.debug("llama-server unload skipped for export: %s", e)
 
+        # Symmetric lifecycle guard: refuse to load an export
+        # checkpoint while training is active so we do not silently
+        # terminate someone's long-running training job and possibly
+        # fail the export load on top of that. Mirrors the
+        # _raise_if_training_active checks in routes/inference.py for
+        # chat and /images/load. Best effort: if the import or the
+        # status check raises, the guard is skipped (debug log only).
+        try:
+            from core.training import get_training_backend  # type: ignore
+
+            trn = get_training_backend()
+            if trn.is_training_active():
+                raise HTTPException(
+                    status_code = 409,
+                    detail = (
+                        "Training is currently active. Stop the training "
+                        "run before loading an export checkpoint."
+                    ),
+                )
+        except HTTPException:
+            raise
+        except Exception as e:
+            logger.debug("training activity check skipped for export: %s", e)
+
         # Also unload any active diffusion pipeline (Images page); it
         # competes for the same GPU and would survive the inference
-        # shutdown above. Best effort; silently skip if the module is
-        # absent.
+        # shutdown above. is_loading is treated like is_loaded so an
+        # in-flight load is also waited out (the diffusion unload
+        # acquires _load_lock + _generate_lock and blocks until the
+        # current load completes, then unloads). Best effort; silently
+        # skip if the module is absent.
         try:
             from core.inference.diffusion import get_diffusion_backend
 
             diff = get_diffusion_backend()
-            if diff.is_loaded:
-                logger.info("Unloading diffusion model to free GPU memory for export")
+            diff_status = diff.status()
+            if diff_status.get("is_loaded") or diff_status.get("is_loading"):
+                logger.info(
+                    "Unloading diffusion model (loaded=%s loading=%s) for export",
+                    diff_status.get("is_loaded"),
+                    diff_status.get("is_loading"),
+                )
                 diff.unload_model()
         except Exception as e:
             logger.debug("diffusion unload skipped for export: %s", e)
 
-        try:
-            from core.training import get_training_backend
-
-            trn = get_training_backend()
-            if trn.is_training_active():
-                logger.info("Stopping active training to free GPU memory for export")
-                trn.stop_training()
-                # Wait for training subprocess to actually exit before proceeding,
-                # otherwise it may still hold GPU memory when export tries to load.
-                for _ in range(60):  # up to 30s
-                    if not trn.is_training_active():
-                        break
-                    import time
-
-                    time.sleep(0.5)
-                else:
-                    logger.warning(
-                        "Training subprocess did not exit within 30s, proceeding anyway"
-                    )
-        except Exception as e:
-            logger.warning("Could not stop training: %s", e)
-
         backend = get_export_backend()
         # load_checkpoint spawns and waits on a subprocess and can take
         # minutes. Run it in a worker thread so the event loop stays
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index 1ef2a3b260..f49d28e095 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -784,13 +784,21 @@ async def load_model(
 
             # Symmetric with /images/load: drop any active diffusion
             # pipeline so the GGUF chat load does not race the FLUX VAE
-            # for VRAM. Best effort; silently continue on failure.
+            # for VRAM. Also handles is_loading: unload_model takes
+            # _load_lock + _generate_lock and will wait out an
+            # in-flight load before clearing state. Best effort;
+            # silently continue on failure.
             try:
                 from core.inference.diffusion import get_diffusion_backend
 
                 diff_backend = get_diffusion_backend()
-                if diff_backend.is_loaded:
-                    logger.info("Unloading diffusion pipeline before GGUF load")
+                diff_status = diff_backend.status()
+                if diff_status.get("is_loaded") or diff_status.get("is_loading"):
+                    logger.info(
+                        "Unloading diffusion (loaded=%s loading=%s) before GGUF load",
+                        diff_status.get("is_loaded"),
+                        diff_status.get("is_loading"),
+                    )
                     diff_backend.unload_model()
             except Exception as e:
                 logger.debug("diffusion unload skipped (GGUF path): %s", e)
@@ -977,14 +985,19 @@ async def load_model(
             llama_backend.unload_model()
 
         # Unload any active diffusion pipeline so the new chat model is
-        # not racing the FLUX VAE for VRAM on a 16-24 GB card.
+        # not racing the FLUX VAE for VRAM on a 16-24 GB card. is_loading
+        # is treated like is_loaded; unload waits behind _load_lock +
+        # _generate_lock so the in-flight load completes first.
         try:
             from core.inference.diffusion import get_diffusion_backend
 
             diff_backend = get_diffusion_backend()
-            if diff_backend.is_loaded:
+            diff_status = diff_backend.status()
+            if diff_status.get("is_loaded") or diff_status.get("is_loading"):
                 logger.info(
-                    "Unloading diffusion pipeline before loading Unsloth chat model"
+                    "Unloading diffusion (loaded=%s loading=%s) before chat load",
+                    diff_status.get("is_loaded"),
+                    diff_status.get("is_loading"),
                 )
                 diff_backend.unload_model()
         except Exception as e:
diff --git a/studio/backend/routes/models.py b/studio/backend/routes/models.py
index 1029dd5bbe..a8a18c081b 100644
--- a/studio/backend/routes/models.py
+++ b/studio/backend/routes/models.py
@@ -1973,6 +1973,12 @@ async def delete_finetuned_model(
     # the merged repo locally, then loaded it via /images/load with a
     # local path as repo_id). Without this guard /delete-finetuned
     # could rmtree the directory the diffusion backend is reading from.
+    # is_loading is also blocked: status() exposes _pending_repo_id /
+    # _pending_base_repo during the load window so deletes during a
+    # mid-flight from_pretrained are refused.
+    # Fail-CLOSED on exception (503) like the llama.cpp / safetensors
+    # guards above: an unverifiable diffusion state means we cannot
+    # confirm the target is safe to rmtree.
     try:
         from core.inference.diffusion import get_diffusion_backend
 
@@ -2010,6 +2016,10 @@ async def delete_finetuned_model(
         logger.warning(
             "Could not check diffusion backend loaded model before delete: %s", e
         )
+        raise HTTPException(
+            status_code = 503,
+            detail = "Could not verify diffusion load status before deleting",
+        ) from e
 
     try:
         if export_type == "gguf" and gguf_variant:
@@ -2684,6 +2694,10 @@ async def delete_cached_model(
     # Match exactly on repo_id (case-insensitive) instead of prefix to
     # avoid blocking unrelated deletes like "org/model" while
     # "org/model-v2" is loaded.
+    # Fail-CLOSED on exception (return 503) like the neighboring
+    # llama.cpp / safetensors guards: we cannot verify whether the
+    # delete is safe, so refuse rather than risk corrupting the
+    # pipeline's mmap.
     try:
         from core.inference.diffusion import get_diffusion_backend
 
@@ -2700,8 +2714,15 @@ async def delete_cached_model(
                 )
     except HTTPException:
         raise
-    except Exception:
-        pass
+    except Exception as e:
+        logger.warning(
+            "Could not check diffusion backend status before cache delete: %s",
+            e,
+        )
+        raise HTTPException(
+            status_code = 503,
+            detail = "Could not verify diffusion load status before deleting cache",
+        ) from e
 
     try:
         cache_scans = _all_hf_cache_scans()
diff --git a/studio/backend/routes/training.py b/studio/backend/routes/training.py
index 0715a79c66..9a9f6b9761 100644
--- a/studio/backend/routes/training.py
+++ b/studio/backend/routes/training.py
@@ -314,12 +314,19 @@ async def start_training(
 
         # Also unload any loaded diffusion pipeline (Images page); it
         # holds the same GPU and would survive the inference shutdown.
+        # is_loading=True is also handled (unload_model takes
+        # _load_lock + _generate_lock and waits the in-flight load out).
         try:
             from core.inference.diffusion import get_diffusion_backend
 
             diff_backend = get_diffusion_backend()
-            if diff_backend.is_loaded:
-                logger.info("Unloading diffusion model to free GPU memory for training")
+            diff_status = diff_backend.status()
+            if diff_status.get("is_loaded") or diff_status.get("is_loading"):
+                logger.info(
+                    "Unloading diffusion (loaded=%s loading=%s) for training",
+                    diff_status.get("is_loaded"),
+                    diff_status.get("is_loading"),
+                )
                 diff_backend.unload_model()
         except Exception as e:
             logger.warning("Could not unload diffusion model: %s", e)
diff --git a/studio/backend/tests/test_diffusion_backend.py b/studio/backend/tests/test_diffusion_backend.py
index 7a4312d5dd..3a73af699f 100644
--- a/studio/backend/tests/test_diffusion_backend.py
+++ b/studio/backend/tests/test_diffusion_backend.py
@@ -1018,6 +1018,137 @@ def call_status():
         t.join(timeout = 5)
 
 
+def test_load_publishes_pending_target_during_loading(monkeypatch):
+    """status() must expose the pending repo_id / base_repo / gguf
+    file while is_loading=True so cache- and finetuned-delete guards
+    can refuse to rmtree the repo being downloaded right now."""
+    import threading
+    import core.inference.diffusion as d
+    from PIL import Image
+
+    fake = _install_fake_diffusers(monkeypatch)
+
+    pending_seen: dict = {}
+    pretrained_blocked = threading.Event()
+    pretrained_release = threading.Event()
+
+    class _SlowPipeline:
+        @classmethod
+        def from_pretrained(cls, base_repo, **kwargs):
+            pretrained_blocked.set()
+            # Capture status() output while the load is blocked.
+            backend = d.get_diffusion_backend()
+            pending_seen.update(backend.status())
+            pretrained_release.wait(timeout = 5)
+            inst = cls()
+            inst.base_repo = base_repo
+            return inst
+
+        def __call__(self, **kwargs):
+            class _Out:
+                pass
+
+            o = _Out()
+            o.images = [Image.new("RGB", (kwargs["width"], kwargs["height"]))]
+            return o
+
+        def enable_model_cpu_offload(self):
+            pass
+
+        def to(self, device):
+            return self
+
+    fake.Flux2KleinPipeline = _SlowPipeline
+
+    backend = d.get_diffusion_backend()
+    backend.unload_model()
+
+    def do_load():
+        try:
+            backend.load_model(
+                "unsloth/FLUX.2-klein-4B-GGUF",
+                gguf_filename = "flux-2-klein-4b-Q4_K_S.gguf",
+            )
+        except Exception:
+            pass
+
+    t = threading.Thread(target = do_load)
+    t.start()
+    try:
+        assert pretrained_blocked.wait(timeout = 5)
+        # While blocked inside from_pretrained, status reads should
+        # already see the pending repo so deletes can be refused.
+        assert pending_seen.get("is_loading") is True
+        assert pending_seen.get("repo_id") == "unsloth/FLUX.2-klein-4B-GGUF"
+        assert pending_seen.get("base_repo") == "black-forest-labs/FLUX.2-klein-4B"
+    finally:
+        pretrained_release.set()
+        t.join(timeout = 5)
+
+
+def test_unload_waits_for_in_flight_generation(monkeypatch):
+    """unload_model() must not return is_loaded=False while a
+    generate_image forward is still iterating; otherwise routes/...
+    callers see the pipe as freed while it still owns GPU memory and
+    can race a subsequent load."""
+    import threading
+    import core.inference.diffusion as d
+    from PIL import Image
+
+    backend = d.get_diffusion_backend()
+    started = threading.Event()
+    release = threading.Event()
+    generation_finished = threading.Event()
+
+    class _SlowPipe:
+        def __call__(self, **kw):
+            started.set()
+            release.wait(timeout = 5)
+
+            class _Out:
+                pass
+
+            o = _Out()
+            o.images = [Image.new("RGB", (kw["width"], kw["height"]))]
+            return o
+
+    backend._pipe = _SlowPipe()
+    backend._device = "cpu"
+    backend._family = d._FAMILIES[0]
+    backend._repo_id = "stub/stub"
+
+    def do_generate():
+        try:
+            backend.generate_image(prompt = "x", num_inference_steps = 1,
+                                   guidance_scale = 1.0, width = 64, height = 64)
+        finally:
+            generation_finished.set()
+
+    gen_thread = threading.Thread(target = do_generate)
+    gen_thread.start()
+    try:
+        assert started.wait(timeout = 5)
+        unload_returned = threading.Event()
+
+        def do_unload():
+            backend.unload_model()
+            unload_returned.set()
+
+        unload_thread = threading.Thread(target = do_unload)
+        unload_thread.start()
+        # unload should block until release sets, NOT return early.
+        unload_thread.join(timeout = 0.5)
+        assert not unload_returned.is_set(), \
+            "unload_model returned while generation was still running"
+        release.set()
+        unload_thread.join(timeout = 5)
+        assert unload_returned.is_set()
+        assert generation_finished.is_set()
+    finally:
+        release.set()
+        gen_thread.join(timeout = 5)
+
+
 def test_bf16_falls_back_to_fp16_on_old_cuda(monkeypatch):
     """CUDA availability does not imply BF16 support; old GPUs report
     is_available()=True and is_bf16_supported()=False. The backend
diff --git a/studio/backend/tests/test_diffusion_routes.py b/studio/backend/tests/test_diffusion_routes.py
index 9d21f0af0b..b7bfd1e2f8 100644
--- a/studio/backend/tests/test_diffusion_routes.py
+++ b/studio/backend/tests/test_diffusion_routes.py
@@ -7,10 +7,17 @@
 auth dependency replaced by a stub so we exercise the same FastAPI
 handlers Studio ships in production. The diffusion backend is replaced
 with an in-memory stub so we don't need diffusers / GPUs to run these.
+
+To stay runnable in a minimal CPU-only env, ``routes/inference.py``
+is loaded directly via ``importlib`` so we do NOT trigger
+``routes/__init__.py`` -- that file eagerly imports training /
+datasets / data_recipe / export and would drag in heavy deps
+(matplotlib, etc.) that the diffusion tests do not need.
 """
 
 from __future__ import annotations
 
+import importlib.util
 import sys
 from pathlib import Path
 
@@ -25,6 +32,35 @@
     sys.path.insert(0, str(_BACKEND_ROOT))
 
 
+def _import_inference_module():
+    """Load ``routes/inference.py`` without executing ``routes/__init__``.
+
+    The package init imports training / datasets / data_recipe / export
+    routers, which pull in matplotlib / pandas / training stack. The
+    diffusion tests only need the inference module so we side-step the
+    package import via importlib.spec_from_file_location.
+    """
+    # If a previous test already imported routes the normal way, reuse
+    # the cached module instead of re-loading.
+    cached = sys.modules.get("routes.inference")
+    if cached is not None:
+        return cached
+    target = _BACKEND_ROOT / "routes" / "inference.py"
+    spec = importlib.util.spec_from_file_location(
+        "routes.inference",
+        target,
+        # We do NOT set submodule_search_locations for routes itself
+        # because that would re-trigger routes/__init__.py. The module
+        # uses relative imports sparingly; absolute imports resolve via
+        # sys.path[0] = backend root.
+    )
+    assert spec and spec.loader, "could not build spec for routes/inference.py"
+    module = importlib.util.module_from_spec(spec)
+    sys.modules["routes.inference"] = module
+    spec.loader.exec_module(module)
+    return module
+
+
 class _FakeBackend:
     def __init__(self) -> None:
         self._loaded = False
@@ -71,7 +107,7 @@ def generate_image(self, **kw):
 def app_with_stub(monkeypatch):
     """Build a FastAPI app that mounts the real inference router with
     auth disabled and the diffusion backend swapped for a stub."""
-    from routes import inference as inf
+    inf = _import_inference_module()
     import core.inference.diffusion as d
 
     stub = _FakeBackend()
diff --git a/studio/frontend/src/features/images/api.ts b/studio/frontend/src/features/images/api.ts
index e576f3987e..0a09d971de 100644
--- a/studio/frontend/src/features/images/api.ts
+++ b/studio/frontend/src/features/images/api.ts
@@ -47,7 +47,10 @@ export interface DiffusionGenerateRequest {
   guidance_scale?: number;
   width?: number;
   height?: number;
-  seed?: number;
+  // bigint when the seed exceeds Number.MAX_SAFE_INTEGER, otherwise
+  // number. The wire format is always a JSON integer; see
+  // ``stringifyWithBigInt`` below.
+  seed?: number | bigint;
 }
 
 export interface DiffusionGenerateResponse {
@@ -92,6 +95,16 @@ export async function unloadDiffusionModel(): Promise<{ is_loaded: boolean }> {
   );
 }
 
+/** JSON.stringify cannot serialise BigInt directly. We only ever
+ * have BigInts in the seed field, which is an integer; emit the
+ * literal digits so the server receives a JSON integer rather than
+ * a string. Pydantic v2 accepts arbitrarily large ints. */
+function stringifyWithBigInt(value: unknown): string {
+  return JSON.stringify(value, (_, v) =>
+    typeof v === "bigint" ? `__bigint__:${v.toString()}` : v,
+  ).replace(/"__bigint__:(-?\d+)"/g, "$1");
+}
+
 export async function generateDiffusionImage(
   payload: DiffusionGenerateRequest,
 ): Promise<DiffusionGenerateResponse> {
@@ -99,7 +112,7 @@ export async function generateDiffusionImage(
     await authFetch("/api/inference/images/generate", {
       method: "POST",
       headers: { "Content-Type": "application/json" },
-      body: JSON.stringify(payload),
+      body: stringifyWithBigInt(payload),
     }),
   );
 }
diff --git a/studio/frontend/src/features/images/images-page.tsx b/studio/frontend/src/features/images/images-page.tsx
index 3b3891e9e6..64f06bfbb0 100644
--- a/studio/frontend/src/features/images/images-page.tsx
+++ b/studio/frontend/src/features/images/images-page.tsx
@@ -214,28 +214,44 @@ export function ImagesPage() {
     }
     setBusy("generating");
     try {
-      // Reject non-integer or out-of-safe-integer-range seeds rather
-      // than silently rounding via Number(). The backend takes an int
-      // and a precision loss here would yield a different image than
-      // the seed the user typed.
+      // Reject non-integer seeds and clamp to the [-2^63, 2^64 - 1]
+      // range the backend's torch.Generator can actually pack. JSON
+      // serialises BigInts as plain integers, so we keep the wire
+      // format compatible and avoid the Number(seed) precision loss
+      // (>= 2^53 silently rounds, producing a different image than
+      // the seed the user typed). When the seed fits a safe integer
+      // it goes through unchanged; larger seeds ride along as their
+      // BigInt-derived string via the wire-format BigInt JSON helper
+      // in the api layer.
       const seedStr = seed.trim();
-      let parsedSeed: number | undefined;
+      let parsedSeed: number | bigint | undefined;
       if (seedStr) {
         if (!/^-?\d+$/.test(seedStr)) {
           toast.error("Seed must be an integer");
           return;
         }
-        const candidate = Number(seedStr);
-        if (
-          !Number.isFinite(candidate) ||
-          !Number.isSafeInteger(candidate)
-        ) {
+        let big: bigint;
+        try {
+          big = BigInt(seedStr);
+        } catch {
+          toast.error("Seed must be an integer");
+          return;
+        }
+        const SEED_MIN = -(BigInt(2) ** BigInt(63));
+        const SEED_MAX = BigInt(2) ** BigInt(64) - BigInt(1);
+        if (big < SEED_MIN || big > SEED_MAX) {
           toast.error(
-            "Seed must fit in a JavaScript safe integer (<= 2^53 - 1)",
+            "Seed must be in [-2^63, 2^64 - 1] (the torch.Generator range)",
           );
           return;
         }
-        parsedSeed = candidate;
+        // Use a plain Number when it fits a safe integer so the
+        // existing api.ts JSON serialiser does not break on BigInt;
+        // otherwise pass the BigInt and let api.ts emit it as a JSON
+        // number via a custom replacer.
+        const SAFE_MAX = BigInt(Number.MAX_SAFE_INTEGER);
+        const SAFE_MIN = -SAFE_MAX;
+        parsedSeed = big >= SAFE_MIN && big <= SAFE_MAX ? Number(big) : big;
       }
       const out = await generateDiffusionImage({
         prompt,

From 0fd9e90c45c10fbdae9b78748a9a92ab197fb755 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 01:28:16 +0000
Subject: [PATCH 24/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/tests/test_diffusion_backend.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/studio/backend/tests/test_diffusion_backend.py b/studio/backend/tests/test_diffusion_backend.py
index 3a73af699f..44386fc8ea 100644
--- a/studio/backend/tests/test_diffusion_backend.py
+++ b/studio/backend/tests/test_diffusion_backend.py
@@ -1119,8 +1119,13 @@ class _Out:
 
     def do_generate():
         try:
-            backend.generate_image(prompt = "x", num_inference_steps = 1,
-                                   guidance_scale = 1.0, width = 64, height = 64)
+            backend.generate_image(
+                prompt = "x",
+                num_inference_steps = 1,
+                guidance_scale = 1.0,
+                width = 64,
+                height = 64,
+            )
         finally:
             generation_finished.set()
 
@@ -1138,8 +1143,9 @@ def do_unload():
         unload_thread.start()
         # unload should block until release sets, NOT return early.
         unload_thread.join(timeout = 0.5)
-        assert not unload_returned.is_set(), \
-            "unload_model returned while generation was still running"
+        assert (
+            not unload_returned.is_set()
+        ), "unload_model returned while generation was still running"
         release.set()
         unload_thread.join(timeout = 5)
         assert unload_returned.is_set()

From fa8efafcd8ab7913c5c79ee3063e10a0356f26f3 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 02:08:05 +0000
Subject: [PATCH 25/92] Fix/adjust diffusion: round 7 swap-aware guards +
 race-free generate for PR #5754

Round 7 reviewer surfaced a handful of swap-window races, fail-open
guards, and seed precision mismatches. This commit closes them.

Lifecycle / state (P1)
  * core/inference/diffusion.py: status() now emits active_repo_id,
    active_base_repo, pending_repo_id, pending_base_repo, and
    pending_gguf_filename alongside the existing UI-facing fields.
    During a swap (model A loaded, model B loading) the previous
    coalesced 'repo_id or pending_repo_id' hid the loading target
    from delete guards. Splitting the fields lets guards block
    deletion of either repo currently owned by the backend.
  * core/inference/diffusion.py: generate_image() now takes
    _generate_lock BEFORE snapshotting _pipe / _device. Snapshotting
    outside the lock let a concurrent unload/load clear or replace
    the backend between the snapshot and the forward, so the freed
    or swapped pipeline would still run.

Symmetric handoffs (P1)
  * routes/export.py: training-active check now runs BEFORE the
    chat / inference / diffusion unload helpers, so a 409 does not
    leave the user's chat session torn down for nothing. Also
    explicitly fails CLOSED with 503 when is_training_active()
    raises.
  * routes/inference.py: _raise_if_training_active now fails closed
    with 503 when the training backend is importable but its status
    check raises. The previous best-effort log-and-continue could
    let chat / diffusion loads collide with unverifiable training.

Delete guards (P1)
  * routes/models.py /delete-cached: chat guard now also blocks
    when llama-server is_active (i.e. mid-download) and when the
    inference backend's loading_models set contains the target.
    Round 7 review #7 flagged that the PR's diffusion-side loading
    guard had no chat-side parallel, so deleting a chat repo while
    it was downloading could still race the cache.
  * routes/models.py /delete-cached: diffusion guard iterates the
    new active_* + pending_* status fields so a delete during a
    swap is refused on either repo.
  * routes/models.py /delete-finetuned: same active_* + pending_*
    handling, plus the guard now also refuses deletes of a parent
    directory that contains the loaded pipeline (round 7 review #6:
    rm -rf /exports/flux-model/ could unlink model_index.json that
    the live pipeline is reading via mmap).

Seed precision (P2)
  * models/inference.py + routes/inference.py:
    DiffusionGenerateResponse now carries seed_str alongside the
    existing numeric seed. Seeds above Number.MAX_SAFE_INTEGER are
    rounded by JSON.parse in the browser; seed_str ships full
    decimal precision for display and reproduction.
  * frontend/api.ts: DiffusionGenerateResponse types seed_str;
    images-page.tsx prefers seed_str over seed in the figure
    caption so the displayed value reproduces the image.
  * frontend/api.ts: stringifyWithBigInt no longer regex-replaces
    sentinel strings over the full JSON output. It pulls the seed
    BigInt out, JSON-serialises the remaining payload, and splices
    the seed's decimal digits into the resulting object literal at
    the known position. Avoids the round 7 #10 case where a
    user-supplied prompt equal to '__bigint__:123' was rewritten
    into a JSON integer and rejected as a non-string prompt.

Custom HF repo (P2)
  * frontend/images-page.tsx: custom panel now exposes a 'Base
    diffusers repo' input that maps to DiffusionLoadRequest.
    base_repo. Required when a private / mirrored GGUF needs a
    non-default base (e.g. a 9B Klein transformer would otherwise
    fall back to the 4B base default).
---
 studio/backend/core/inference/diffusion.py    | 57 ++++++++-----
 studio/backend/models/inference.py            |  8 ++
 studio/backend/routes/export.py               | 66 +++++++++------
 studio/backend/routes/inference.py            | 49 ++++++++---
 studio/backend/routes/models.py               | 83 ++++++++++++++-----
 studio/frontend/src/features/images/api.ts    | 36 ++++++--
 .../src/features/images/images-page.tsx       | 36 ++++++--
 7 files changed, 240 insertions(+), 95 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 8ae3a13f0f..6b9e1c1af0 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -320,24 +320,36 @@ def status(self) -> dict[str, Any]:
         # POSIX layouts) to any authenticated Studio session.
         with self._lock:
             gguf_basename = Path(self._gguf_path).name if self._gguf_path else None
-            # During an in-flight load, expose _pending_* so cache /
-            # finetuned delete guards can refuse to wipe the repo
-            # that is mid-download. After the load completes (success
-            # or failure), the pending fields are cleared so status()
-            # reverts to publishing only the resident pipeline's id.
-            effective_repo = self._repo_id or self._pending_repo_id
-            effective_base = self._base_repo or self._pending_base_repo
-            effective_gguf = gguf_basename or self._pending_gguf_filename
+            # Expose BOTH the resident pipeline's id AND the pending
+            # load target. Delete guards must check both: when model A
+            # is already loaded and a swap to model B is in flight,
+            # only checking one would let the user rmtree whichever
+            # repo the guard ignored. UI-facing ``repo_id`` /
+            # ``base_repo`` / ``gguf_filename`` still prefer pending
+            # during a swap so the panel shows the load target the
+            # user just clicked.
+            active_repo = self._repo_id
+            active_base = self._base_repo
+            pending_repo = self._pending_repo_id if self._loading else None
+            pending_base = self._pending_base_repo if self._loading else None
+            pending_gguf = self._pending_gguf_filename if self._loading else None
             return {
                 "is_loaded": self._pipe is not None,
                 "is_loading": self._loading,
-                "repo_id": effective_repo,
+                "repo_id": pending_repo or active_repo,
                 "family": self._family.name if self._family else None,
                 "pipeline_class": (
                     self._family.pipeline_class if self._family else None
                 ),
-                "base_repo": effective_base,
-                "gguf_filename": effective_gguf,
+                "base_repo": pending_base or active_base,
+                "gguf_filename": pending_gguf or gguf_basename,
+                # Guard-facing fields: every repo / path the backend
+                # owns RIGHT NOW. Delete routes iterate both.
+                "active_repo_id": active_repo,
+                "active_base_repo": active_base,
+                "pending_repo_id": pending_repo,
+                "pending_base_repo": pending_base,
+                "pending_gguf_filename": pending_gguf,
                 "device": self._device,
                 "dtype": self._dtype,
                 "loaded_at": self._loaded_at,
@@ -703,19 +715,18 @@ def generate_image(
 
         import torch
 
-        with self._lock:
-            if self._pipe is None:
-                raise RuntimeError("No diffusion model is loaded.")
-            pipe = self._pipe
-            device = self._device or "cpu"
-
-        # _generate_lock outside _lock: only one forward at a time, but
-        # status() / unload() callers do not block on a running forward
-        # pass. unload_model takes _load_lock + _lock; the pipe object
-        # itself is kept alive by the local ``pipe`` reference until
-        # this function returns, so a concurrent unload during forward
-        # cannot free the weights from under us.
+        # Take _generate_lock FIRST so a concurrent unload/load that
+        # observes us holding it will queue behind this generation
+        # (and `unload_model` then waits its turn before clearing
+        # state). Snapshotting `self._pipe` outside the lock and then
+        # taking the lock let a load/unload race in between, so the
+        # forward could run against a freed or swapped pipeline.
         with self._generate_lock:
+            with self._lock:
+                if self._pipe is None:
+                    raise RuntimeError("No diffusion model is loaded.")
+                pipe = self._pipe
+                device = self._device or "cpu"
             generator = None
             if seed is not None:
                 # Match the device of the pipeline so determinism holds
diff --git a/studio/backend/models/inference.py b/studio/backend/models/inference.py
index 2767474e5f..08b16d89e7 100644
--- a/studio/backend/models/inference.py
+++ b/studio/backend/models/inference.py
@@ -1526,7 +1526,15 @@ class DiffusionGenerateResponse(BaseModel):
     height: int
     num_inference_steps: int
     guidance_scale: float
+    # ``seed`` ships as a JSON number for backwards compatibility with
+    # the gallery and existing API consumers, but JavaScript rounds
+    # integers above Number.MAX_SAFE_INTEGER on JSON.parse so seeds
+    # bigger than 2**53 would render different from the value the
+    # backend actually used. ``seed_str`` is the exact decimal
+    # representation; the frontend reads it for reproducibility and
+    # falls back to ``seed`` when not supplied.
     seed: Optional[int] = None
+    seed_str: Optional[str] = None
     duration_ms: int
     model: Optional[str] = None
     family: Optional[str] = None
diff --git a/studio/backend/routes/export.py b/studio/backend/routes/export.py
index d45659a386..5834767032 100644
--- a/studio/backend/routes/export.py
+++ b/studio/backend/routes/export.py
@@ -64,6 +64,48 @@ async def load_checkpoint(
         # Version switching is handled automatically by the subprocess-based
         # export backend — no need for ensure_transformers_version() here.
 
+        # Symmetric lifecycle guard: refuse to load an export
+        # checkpoint while training is active so we do not silently
+        # terminate someone's long-running training job and possibly
+        # fail the export load on top of that. Mirrors the
+        # _raise_if_training_active checks in routes/inference.py for
+        # chat and /images/load.
+        # Run BEFORE the chat / inference / diffusion unload helpers
+        # below: otherwise a 409 from this guard would still leave
+        # the user's chat / inference / diffusion GPU owners freed
+        # for nothing, which is the asymmetry round 7 review #5
+        # flagged. Fail-CLOSED (503) when the training backend is
+        # importable but its status check raises.
+        try:
+            from core.training import get_training_backend  # type: ignore
+
+            trn = get_training_backend()
+            try:
+                active = trn.is_training_active()
+            except Exception as e:
+                logger.warning(
+                    "Could not verify training status before export load: %s", e
+                )
+                raise HTTPException(
+                    status_code = 503,
+                    detail = (
+                        "Could not verify training status before loading "
+                        "an export checkpoint. Try again."
+                    ),
+                ) from e
+            if active:
+                raise HTTPException(
+                    status_code = 409,
+                    detail = (
+                        "Training is currently active. Stop the training "
+                        "run before loading an export checkpoint."
+                    ),
+                )
+        except HTTPException:
+            raise
+        except Exception as e:
+            logger.debug("training activity check skipped for export: %s", e)
+
         # Free GPU memory: shut down any running inference/training subprocesses
         # before loading the export checkpoint (they'd compete for VRAM).
         try:
@@ -94,30 +136,6 @@ async def load_checkpoint(
         except Exception as e:
             logger.debug("llama-server unload skipped for export: %s", e)
 
-        # Symmetric lifecycle guard: refuse to load an export
-        # checkpoint while training is active so we do not silently
-        # terminate someone's long-running training job and possibly
-        # fail the export load on top of that. Mirrors the
-        # _raise_if_training_active checks in routes/inference.py for
-        # chat and /images/load. Fail-closed (503) when the training
-        # backend can be imported but its status check raises.
-        try:
-            from core.training import get_training_backend  # type: ignore
-
-            trn = get_training_backend()
-            if trn.is_training_active():
-                raise HTTPException(
-                    status_code = 409,
-                    detail = (
-                        "Training is currently active. Stop the training "
-                        "run before loading an export checkpoint."
-                    ),
-                )
-        except HTTPException:
-            raise
-        except Exception as e:
-            logger.debug("training activity check skipped for export: %s", e)
-
         # Also unload any active diffusion pipeline (Images page); it
         # competes for the same GPU and would survive the inference
         # shutdown above. is_loading is treated like is_loaded so an
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index f49d28e095..4af5e9d3e4 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -248,8 +248,15 @@ def _raise_if_training_active(workload: str) -> None:
     Without this guard the load path would either (a) silently stop a
     running training run via _release_other_gpu_owners_for_diffusion
     or (b) double-spend VRAM and OOM both jobs. Both are worse for the
-    user than a 409 explaining why the request was refused. Best-effort
-    import so unit-test backends without core.training do not 500.
+    user than a 409 explaining why the request was refused.
+
+    Failure modes are split:
+      * ``core.training`` cannot be imported (CI, isolated tests,
+        custom builds) -> silently return; nothing to protect.
+      * ``core.training`` is importable but ``get_training_backend()``
+        or ``is_training_active()`` raises -> 503 fail-closed. We
+        cannot verify the GPU is free, so taking the safer route
+        avoids OOMing an unverifiable training run.
     """
     try:
         from core.training import get_training_backend  # type: ignore
@@ -257,18 +264,28 @@ def _raise_if_training_active(workload: str) -> None:
         return
     try:
         trn = get_training_backend()
-        if trn.is_training_active():
-            raise HTTPException(
-                status_code = 409,
-                detail = (
-                    f"Training is currently active. Stop the training run "
-                    f"before loading a {workload} model."
-                ),
-            )
-    except HTTPException:
-        raise
+        active = trn.is_training_active()
     except Exception as exc:
-        logger.debug("training activity check skipped: %s", exc)
+        logger.warning(
+            "Could not verify training status before %s load: %s",
+            workload,
+            exc,
+        )
+        raise HTTPException(
+            status_code = 503,
+            detail = (
+                f"Could not verify training status before loading the "
+                f"{workload} model. Try again."
+            ),
+        ) from exc
+    if active:
+        raise HTTPException(
+            status_code = 409,
+            detail = (
+                f"Training is currently active. Stop the training run "
+                f"before loading a {workload} model."
+            ),
+        )
 
 
 def _detect_safetensors_features(backend, chat_template: Optional[str]) -> dict:
@@ -1789,6 +1806,12 @@ async def diffusion_generate(
         num_inference_steps = payload.num_inference_steps,
         guidance_scale = payload.guidance_scale,
         seed = payload.seed,
+        # str() of a Python int has full precision; JavaScript can
+        # display it via BigInt without rounding. The numeric ``seed``
+        # field above is kept for backwards compatibility with older
+        # clients but is unsafe to use for seeds above 2**53 on the
+        # browser side.
+        seed_str = str(payload.seed) if payload.seed is not None else None,
         duration_ms = duration_ms,
         model = status.get("repo_id"),
         family = status.get("family"),
diff --git a/studio/backend/routes/models.py b/studio/backend/routes/models.py
index a8a18c081b..c061f42c23 100644
--- a/studio/backend/routes/models.py
+++ b/studio/backend/routes/models.py
@@ -1973,9 +1973,15 @@ async def delete_finetuned_model(
     # the merged repo locally, then loaded it via /images/load with a
     # local path as repo_id). Without this guard /delete-finetuned
     # could rmtree the directory the diffusion backend is reading from.
-    # is_loading is also blocked: status() exposes _pending_repo_id /
-    # _pending_base_repo during the load window so deletes during a
-    # mid-flight from_pretrained are refused.
+    # is_loading is also blocked: status() exposes pending_repo_id /
+    # pending_base_repo during the load window so deletes during a
+    # mid-flight from_pretrained are refused. During a swap we still
+    # see the previous load's active_repo_id, so every owned path is
+    # checked rather than just the UI-facing one.
+    # Block both DIRECTIONS:
+    #   * loaded path is the same as target (or a parent), and
+    #   * loaded path is a child of target (so the user cannot rmtree
+    #     a parent directory that contains the pipeline's mmap'd file).
     # Fail-CLOSED on exception (503) like the llama.cpp / safetensors
     # guards above: an unverifiable diffusion state means we cannot
     # confirm the target is safe to rmtree.
@@ -1985,12 +1991,18 @@ async def delete_finetuned_model(
         diff_backend = get_diffusion_backend()
         diff_status = diff_backend.status()
         if diff_status.get("is_loaded") or diff_status.get("is_loading"):
-            diff_repo = diff_status.get("repo_id") or ""
-            diff_base = diff_status.get("base_repo") or ""
+            candidates: list[str] = []
+            for key in (
+                "active_repo_id",
+                "active_base_repo",
+                "pending_repo_id",
+                "pending_base_repo",
+            ):
+                v = diff_status.get(key) or ""
+                if v:
+                    candidates.append(v)
             target_str = str(target_path)
-            for candidate in (diff_repo, diff_base):
-                if not candidate:
-                    continue
+            for candidate in candidates:
                 try:
                     candidate_path = Path(candidate).expanduser()
                 except Exception:
@@ -2005,6 +2017,7 @@ async def delete_finetuned_model(
                     candidate_resolved == target_path
                     or str(candidate_resolved) == target_str
                     or _is_path_under(candidate_resolved, target_path)
+                    or _is_path_under(target_path, candidate_resolved)
                 ):
                     raise HTTPException(
                         status_code = 400,
@@ -2654,18 +2667,26 @@ async def delete_cached_model(
     if not _is_valid_repo_id(repo_id):
         raise HTTPException(status_code = 400, detail = "Invalid repo_id format")
 
-    # Check if model is currently loaded
+    # Check if model is currently loaded OR loading. is_active and
+    # not is_loaded means an llama-server download / startup is in
+    # flight; the cache delete would race the hf_hub_download / mmap.
     try:
         from routes.inference import get_llama_cpp_backend
 
         llama_backend = get_llama_cpp_backend()
-        if llama_backend.is_loaded and llama_backend.model_identifier:
-            loaded_id = llama_backend.model_identifier.lower()
-            if loaded_id == repo_id.lower() or loaded_id.startswith(repo_id.lower()):
-                raise HTTPException(
-                    status_code = 400,
-                    detail = "Unload the model before deleting",
-                )
+        loaded_id = (llama_backend.model_identifier or "").lower()
+        wants = (
+            loaded_id == repo_id.lower()
+            or loaded_id.startswith(repo_id.lower())
+        )
+        if wants and (
+            llama_backend.is_loaded
+            or getattr(llama_backend, "is_active", False)
+        ):
+            raise HTTPException(
+                status_code = 400,
+                detail = "Unload the model before deleting",
+            )
     except HTTPException:
         raise
     except Exception:
@@ -2673,9 +2694,21 @@ async def delete_cached_model(
 
     try:
         inference_backend = get_inference_backend()
+        loading_models = getattr(inference_backend, "loading_models", set()) or set()
+        needle = repo_id.lower()
+        # Loading set holds model identifiers currently being
+        # downloaded / instantiated; treat them like active loads
+        # so a delete cannot race a partial mmap.
+        for loading_model in loading_models:
+            ml = (loading_model or "").lower()
+            if ml == needle or ml.startswith(needle):
+                raise HTTPException(
+                    status_code = 409,
+                    detail = "Cannot delete a model while it is loading",
+                )
         if inference_backend.active_model_name:
             active = inference_backend.active_model_name.lower()
-            if active == repo_id.lower() or active.startswith(repo_id.lower()):
+            if active == needle or active.startswith(needle):
                 raise HTTPException(
                     status_code = 400,
                     detail = "Unload the model before deleting",
@@ -2685,7 +2718,7 @@ async def delete_cached_model(
     except Exception:
         pass
 
-    # Also refuse to delete the cache underlying a loaded or *loading*
+    # Also refuse to delete the cache underlying a loaded OR loading
     # diffusion pipeline. The diffusion backend mmap's the GGUF + base
     # repo weights and continues to read from the cache long after
     # load; deleting them out from under it would corrupt generation.
@@ -2694,6 +2727,9 @@ async def delete_cached_model(
     # Match exactly on repo_id (case-insensitive) instead of prefix to
     # avoid blocking unrelated deletes like "org/model" while
     # "org/model-v2" is loaded.
+    # During a swap (model A loaded, model B loading), status()
+    # exposes both via ``active_*`` and ``pending_*`` so we check
+    # every repo the backend currently owns.
     # Fail-CLOSED on exception (return 503) like the neighboring
     # llama.cpp / safetensors guards: we cannot verify whether the
     # delete is safe, so refuse rather than risk corrupting the
@@ -2704,10 +2740,15 @@ async def delete_cached_model(
         diff_backend = get_diffusion_backend()
         diff_status = diff_backend.status()
         if diff_status.get("is_loaded") or diff_status.get("is_loading"):
-            diff_repo = (diff_status.get("repo_id") or "").lower()
-            diff_base = (diff_status.get("base_repo") or "").lower()
             needle = repo_id.lower()
-            if diff_repo == needle or diff_base == needle:
+            owned = {
+                (diff_status.get("active_repo_id") or "").lower(),
+                (diff_status.get("active_base_repo") or "").lower(),
+                (diff_status.get("pending_repo_id") or "").lower(),
+                (diff_status.get("pending_base_repo") or "").lower(),
+            }
+            owned.discard("")
+            if needle in owned:
                 raise HTTPException(
                     status_code = 400,
                     detail = "Unload the diffusion image model before deleting",
diff --git a/studio/frontend/src/features/images/api.ts b/studio/frontend/src/features/images/api.ts
index 0a09d971de..b84cfab0dd 100644
--- a/studio/frontend/src/features/images/api.ts
+++ b/studio/frontend/src/features/images/api.ts
@@ -60,7 +60,14 @@ export interface DiffusionGenerateResponse {
   height: number;
   num_inference_steps: number;
   guidance_scale: number;
+  /**
+   * Numeric seed. Safe ONLY for values <= Number.MAX_SAFE_INTEGER.
+   * For larger seeds, prefer ``seed_str`` (full-precision decimal).
+   */
   seed: number | null;
+  /** Decimal string with full uint64 precision. Use this for display
+   *  and reproduction when the user pastes the seed back in. */
+  seed_str: string | null;
   duration_ms: number;
   model: string | null;
   family: string | null;
@@ -95,14 +102,27 @@ export async function unloadDiffusionModel(): Promise<{ is_loaded: boolean }> {
   );
 }
 
-/** JSON.stringify cannot serialise BigInt directly. We only ever
- * have BigInts in the seed field, which is an integer; emit the
- * literal digits so the server receives a JSON integer rather than
- * a string. Pydantic v2 accepts arbitrarily large ints. */
-function stringifyWithBigInt(value: unknown): string {
-  return JSON.stringify(value, (_, v) =>
-    typeof v === "bigint" ? `__bigint__:${v.toString()}` : v,
-  ).replace(/"__bigint__:(-?\d+)"/g, "$1");
+/** JSON.stringify cannot serialise BigInt directly. Pull the seed
+ * BigInt out, stringify the rest of the payload normally, then
+ * splice the seed's decimal digits back into the JSON literal at the
+ * exact ``"seed":<int>`` slot.
+ *
+ * Avoids the previous regex-over-JSON approach, which could be
+ * tripped by a user-supplied prompt that exactly matched the
+ * sentinel string. With this approach the only thing we touch is
+ * the literal ``"seed":<number>`` substring we wrote ourselves.
+ */
+function stringifyWithBigInt(value: DiffusionGenerateRequest): string {
+  const { seed, ...rest } = value;
+  if (typeof seed !== "bigint") {
+    return JSON.stringify(value);
+  }
+  // Serialise the rest without seed, then inject the seed at the end
+  // of the object literal as a JSON integer. Strip the trailing "}"
+  // and re-append once the field is added.
+  const base = JSON.stringify(rest);
+  const inner = base.length === 2 /* '{}' */ ? "" : base.slice(1, -1) + ",";
+  return `{${inner}"seed":${seed.toString()}}`;
 }
 
 export async function generateDiffusionImage(
diff --git a/studio/frontend/src/features/images/images-page.tsx b/studio/frontend/src/features/images/images-page.tsx
index 64f06bfbb0..b5aa124fd7 100644
--- a/studio/frontend/src/features/images/images-page.tsx
+++ b/studio/frontend/src/features/images/images-page.tsx
@@ -110,6 +110,7 @@ export function ImagesPage() {
   const [presetIndex, setPresetIndex] = useState(0);
   const [customRepoId, setCustomRepoId] = useState("");
   const [customGguf, setCustomGguf] = useState("");
+  const [customBaseRepo, setCustomBaseRepo] = useState("");
   const [customFamily, setCustomFamily] = useState<string>("auto");
   const [useCustom, setUseCustom] = useState(false);
   const [hfToken, setHfToken] = useState("");
@@ -162,10 +163,14 @@ export function ImagesPage() {
           : customFamily
         : preset.family;
       // Always pass base_repo for curated entries; custom-repo mode
-      // lets the backend either infer it from the family default or
-      // (when no GGUF is given) treat the repo as a full diffusers
-      // checkpoint and call from_pretrained on it directly.
-      const baseRepo = useCustom ? undefined : preset.base_repo;
+      // now also lets the user pin one because private / mirrored
+      // GGUFs (e.g. a 9B klein transformer) would otherwise fall
+      // back to the family-default 4B base and 500 on load. Empty
+      // string still falls back to the backend's smart-base /
+      // repo-id defaults.
+      const baseRepo = useCustom
+        ? customBaseRepo.trim() || undefined
+        : preset.base_repo;
       if (!repo) {
         toast.error("Pick a model first");
         return;
@@ -191,7 +196,7 @@ export function ImagesPage() {
     } finally {
       setBusy("idle");
     }
-  }, [useCustom, customRepoId, customGguf, customFamily, preset, hfToken, refreshStatus]);
+  }, [useCustom, customRepoId, customGguf, customBaseRepo, customFamily, preset, hfToken, refreshStatus]);
 
   const handleUnload = useCallback(async () => {
     setBusy("unloading");
@@ -350,6 +355,17 @@ export function ImagesPage() {
                 onChange={(e) => setCustomGguf(e.target.value)}
                 placeholder="FLUX.2-klein-4B-Q4_K_S.gguf"
               />
+              <Label>Base diffusers repo (optional)</Label>
+              <Input
+                value={customBaseRepo}
+                onChange={(e) => setCustomBaseRepo(e.target.value)}
+                placeholder="black-forest-labs/FLUX.2-klein-9B"
+              />
+              <p className="text-xs text-muted-foreground">
+                {"Optional. Defaults to the family base. Set this when "}
+                {"your GGUF expects a non-default base (for example a 9B "}
+                {"transformer that would otherwise fall back to a 4B base)."}
+              </p>
               <Label>Pipeline family (override)</Label>
               <Select
                 value={customFamily}
@@ -539,7 +555,15 @@ export function ImagesPage() {
                 />
                 <figcaption className="text-xs text-muted-foreground">
                   {r.width}x{r.height} - {r.num_inference_steps} steps - g={r.guidance_scale.toFixed(1)}
-                  {r.seed !== null && r.seed !== undefined ? ` - seed ${r.seed}` : ""} -
+                  {/* Prefer seed_str (full uint64 precision) since the
+                       numeric seed gets rounded by JSON.parse above
+                       Number.MAX_SAFE_INTEGER and would otherwise
+                       display a value that does not reproduce. */}
+                  {r.seed_str
+                    ? ` - seed ${r.seed_str}`
+                    : r.seed !== null && r.seed !== undefined
+                    ? ` - seed ${r.seed}`
+                    : ""} -
                   {` ${(r.duration_ms / 1000).toFixed(1)}s`}
                 </figcaption>
               </figure>

From 92eccc3627d618a4a3bd45b7a5230efbba52a0df Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 02:08:29 +0000
Subject: [PATCH 26/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/routes/models.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/studio/backend/routes/models.py b/studio/backend/routes/models.py
index c061f42c23..94c535bde4 100644
--- a/studio/backend/routes/models.py
+++ b/studio/backend/routes/models.py
@@ -2675,13 +2675,9 @@ async def delete_cached_model(
 
         llama_backend = get_llama_cpp_backend()
         loaded_id = (llama_backend.model_identifier or "").lower()
-        wants = (
-            loaded_id == repo_id.lower()
-            or loaded_id.startswith(repo_id.lower())
-        )
+        wants = loaded_id == repo_id.lower() or loaded_id.startswith(repo_id.lower())
         if wants and (
-            llama_backend.is_loaded
-            or getattr(llama_backend, "is_active", False)
+            llama_backend.is_loaded or getattr(llama_backend, "is_active", False)
         ):
             raise HTTPException(
                 status_code = 400,

From c1f9aac510567d942b08ee8942c1e1c84eaba858 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 02:47:50 +0000
Subject: [PATCH 27/92] Fix/adjust diffusion: round 8 async unloads + tighter
 handoffs for PR #5754

Round 8 reviewer surfaced event-loop stalls (blocking unload from
async routes), incomplete VRAM handoff coverage (is_active /
loading_models / is_export_active not checked), token leaks via
exception messages, /v1 exposure, and several fail-open paths.

Async / event-loop
  * routes/inference.py /images/unload, GGUF chat-load handoff,
    safetensors chat-load handoff: the blocking
    DiffusionBackend.unload_model call is now pushed onto
    asyncio.to_thread. unload_model takes _load_lock +
    _generate_lock and can block for the full duration of an
    in-flight load / generation, which was freezing the FastAPI
    worker, SSE stream, and hardware poller for minutes.
  * routes/export.py + routes/training.py: same to_thread wrap on
    diffusion unload during checkpoint / training start.

GPU-owner handoff completeness
  * core/inference/diffusion.py _release_chat_backend_for_diffusion:
    llama-server is now also unloaded when is_active=True
    (mid-download / startup), not only when is_loaded; in-flight
    safetensors loads listed in loading_models are unloaded too.
  * core/inference/diffusion.py
    _release_other_gpu_owners_for_diffusion: export shutdown now
    also fires when is_export_active() returns True (checkpoint not
    yet assigned).

Security / scrubbing
  * core/inference/diffusion.py: load failure paths now scrub
    hf_token from both _last_error AND the raised RuntimeError
    message (the previous scrub only cleared frame locals).
    A regex strip of hf_[A-Za-z0-9]{20,} is additionally applied
    on every failure to catch tokens that were not passed in by
    the caller (e.g. a cached token that huggingface_hub picked up
    on its own).
  * routes/inference.py: image lifecycle endpoints moved from
    router to studio_router so they no longer answer under
    the /v1 OpenAI-compat prefix. Studio-only side effects
    (download multi-GB GGUFs, unload chat, etc.) should not be
    reachable via an OpenAI-compat client.
  * models/inference.py: control-char validator now also rejects
    tab. Some log sinks split fields on tab; allowing it left a
    log-injection surface.

Fail-closed delete guards
  * routes/models.py /delete-cached: llama.cpp and safetensors
    branches now fail closed with 503 when their status check
    raises (matches the diffusion-side guard added earlier).
  * routes/export.py: split the try/except around the training
    backend so import failure falls back to 'skip' (no
    core.training in this build) while a runtime failure of
    get_training_backend()/is_training_active() fails closed.
  * routes/models.py /delete-finetuned: diffusion guard now also
    compares against relative path candidates (Path.resolve() works
    on relative input). Previously a load with a relative repo_id
    bypassed the guard.

CUDA cleanup ordering
  * core/inference/diffusion.py: split _release() (drops local +
    gc.collect) from _drain_cuda_cache() (torch.cuda.empty_cache).
    Callers now drain AFTER nulling every reference so the
    allocator actually reclaims the freed slabs (previously
    empty_cache ran while caller still held a local, which left
    the cache pinned).

Generate response (P2 #16)
  * routes/inference.py: response uses status()['active_repo_id']
    instead of the UI-facing repo_id, so a queued /images/load
    promoting a pending model cannot mislabel the just-rendered
    image with the new model's identity.

Test wiring
  * tests/test_diffusion_routes.py: mount inf.studio_router on the
    test app so /images/* routes are reachable now that they live
    on the Studio-only router.
---
 studio/backend/core/inference/diffusion.py    | 104 ++++++++++++++++--
 studio/backend/models/inference.py            |  17 +--
 studio/backend/routes/export.py               |  18 +--
 studio/backend/routes/inference.py            |  32 ++++--
 studio/backend/routes/models.py               |  30 ++++-
 studio/backend/routes/training.py             |   5 +-
 studio/backend/tests/test_diffusion_routes.py |   4 +
 7 files changed, 171 insertions(+), 39 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 6b9e1c1af0..4cad01fe40 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -563,6 +563,12 @@ def load_model(
                         self._loaded_at = None
                     _release(old)
                     old = None
+                    # Now that both the attribute and the local
+                    # have been nulled, the pipeline is unreachable;
+                    # ask the CUDA allocator to release its slabs so
+                    # the next from_pretrained does not OOM behind
+                    # an already-freed-but-cached arena.
+                    _drain_cuda_cache()
 
                 if gguf_filename:
                     quant_config = diffusers.GGUFQuantizationConfig(compute_dtype = dtype)
@@ -612,6 +618,7 @@ def load_model(
                     if transformer is not None:
                         _release(transformer)
                         transformer = None
+                        _drain_cuda_cache()
                     raise
                 if enable_model_cpu_offload and device == "cuda":
                     pipe.enable_model_cpu_offload()
@@ -635,13 +642,29 @@ def load_model(
                 # some structlog formatters render frame locals, which
                 # would otherwise echo the raw hf_... token into logs
                 # and any error reporting sink the user has wired up.
+                # ALSO scrub the exception message itself: huggingface_hub
+                # / diffusers can include the bearer token verbatim in
+                # 401 / 403 messages, which would propagate through
+                # ``_last_error`` (rendered in status()) and the
+                # user-facing RuntimeError (rendered in route responses).
+                scrub_token = hf_token
                 hf_token = None  # noqa: F841
                 pipe_kwargs = None  # noqa: F841
                 single_file_kwargs = None  # noqa: F841
+                exc_msg = str(exc)
+                if scrub_token:
+                    exc_msg = exc_msg.replace(scrub_token, "<redacted>")
+                # Hugging Face tokens are prefixed ``hf_``; replace any
+                # leftover ``hf_...`` substrings to catch tokens we did
+                # not store as ``scrub_token`` (e.g. cached tokens that
+                # huggingface_hub picked up on its own).
+                import re
+
+                exc_msg = re.sub(r"hf_[A-Za-z0-9]{20,}", "<redacted>", exc_msg)
                 with self._lock:
-                    self._last_error = str(exc)
+                    self._last_error = exc_msg
                 logger.exception("Diffusion load failed for %s", repo_id)
-                raise RuntimeError(f"Failed to load diffusion model: {exc}") from exc
+                raise RuntimeError(f"Failed to load diffusion model: {exc_msg}") from exc
             finally:
                 with self._lock:
                     self._loading = False
@@ -675,6 +698,8 @@ def unload_model(self) -> dict[str, Any]:
                 self._dtype = None
                 self._loaded_at = None
             _release(old)
+            old = None  # noqa: F841
+            _drain_cuda_cache()
         return {"is_loaded": False}
 
     # ── generation ────────────────────────────────────────────────
@@ -819,13 +844,24 @@ def _release_chat_backend_for_diffusion() -> None:
     isolated tests, custom builds) or fails on the unload, we log and
     continue; the diffusion load can still try and surface its own OOM.
     """
-    # 1. GGUF chat backend (llama-server subprocess).
+    # 1. GGUF chat backend (llama-server subprocess). We unload when
+    #    EITHER is_loaded is True (resident model) OR is_active is
+    #    True (mid-download / startup); the latter case is the
+    #    "llama-server is currently starting" race where weights are
+    #    being downloaded and the diffusion load would otherwise
+    #    double-spend GPU memory.
     try:
         from routes.inference import get_llama_cpp_backend  # type: ignore
 
         backend = get_llama_cpp_backend()
-        if getattr(backend, "is_loaded", False):
-            logger.info("Unloading llama-server before diffusion load")
+        is_loaded = bool(getattr(backend, "is_loaded", False))
+        is_active = bool(getattr(backend, "is_active", False))
+        if is_loaded or is_active:
+            logger.info(
+                "Unloading llama-server (loaded=%s active=%s) before diffusion load",
+                is_loaded,
+                is_active,
+            )
             backend.unload_model()
     except Exception as exc:
         logger.debug("llama-server unload skipped: %s", exc)
@@ -835,18 +871,34 @@ def _release_chat_backend_for_diffusion() -> None:
     #    backend has a model resident on the same GPU, a diffusion load
     #    will OOM the same way. The orchestrator's unload_model takes a
     #    model_name; passing it without args raised TypeError and was
-    #    swallowed, leaving the chat model resident.
+    #    swallowed, leaving the chat model resident. We also flush any
+    #    loading_models set so a chat load that is mid-download cannot
+    #    race the diffusion allocation.
     try:
         from core.inference import get_inference_backend  # type: ignore
 
         backend = get_inference_backend()
         active_model_name = getattr(backend, "active_model_name", None)
+        loading_models = set(getattr(backend, "loading_models", set()) or set())
         if active_model_name:
             logger.info(
                 "Unloading safetensors chat backend '%s' before diffusion load",
                 active_model_name,
             )
             backend.unload_model(active_model_name)
+        for loading in loading_models:
+            if loading == active_model_name:
+                continue
+            try:
+                logger.info(
+                    "Unloading in-flight safetensors chat load '%s' before diffusion",
+                    loading,
+                )
+                backend.unload_model(loading)
+            except Exception as inner:
+                logger.debug(
+                    "loading safetensors unload skipped for %s: %s", loading, inner
+                )
     except Exception as exc:
         logger.debug("safetensors unload skipped: %s", exc)
 
@@ -855,13 +907,27 @@ def _release_other_gpu_owners_for_diffusion() -> None:
     """Best-effort: shut down export subprocess + active training before
     a diffusion load. Both can hold multi-GB of VRAM and would OOM the
     diffusion allocation on consumer GPUs."""
-    # Export subprocess
+    # Export subprocess. Shut down when EITHER a checkpoint is
+    # resident OR is_export_active() reports work in flight (a
+    # checkpoint load that has been kicked off but not yet completed
+    # the assignment to current_checkpoint). Either case can hold
+    # GPU memory that would OOM the diffusion allocation.
     try:
         from core.export import get_export_backend  # type: ignore
 
         exp = get_export_backend()
-        if getattr(exp, "current_checkpoint", None):
-            logger.info("Shutting down export subprocess before diffusion load")
+        has_checkpoint = bool(getattr(exp, "current_checkpoint", None))
+        is_active = False
+        try:
+            is_active = bool(exp.is_export_active())
+        except Exception:
+            is_active = False
+        if has_checkpoint or is_active:
+            logger.info(
+                "Shutting down export subprocess (checkpoint=%s active=%s)",
+                has_checkpoint,
+                is_active,
+            )
             exp._shutdown_subprocess()
             exp.current_checkpoint = None
             exp.is_vision = False
@@ -879,7 +945,16 @@ def _release_other_gpu_owners_for_diffusion() -> None:
 
 
 def _release(obj: Any) -> None:
-    """Best-effort GPU-memory release for a pipeline being swapped out."""
+    """Best-effort GPU-memory release for a pipeline being swapped out.
+
+    Only drops the local reference (which the caller has already
+    nulled in its own scope) and runs ``gc.collect()`` so __del__
+    fires. Does NOT call ``torch.cuda.empty_cache()`` here because
+    when the caller still holds the actual reference in a local /
+    attribute, ``empty_cache()`` would run before __del__ released
+    the weights and would not actually free GPU memory. Use
+    ``_drain_cuda_cache()`` AFTER the last reference has been nulled.
+    """
     if obj is None:
         return
     try:
@@ -887,6 +962,15 @@ def _release(obj: Any) -> None:
     except Exception:
         pass
     gc.collect()
+
+
+def _drain_cuda_cache() -> None:
+    """Hand freed weights back to the CUDA allocator.
+
+    Call this AFTER every reference to the freed object has been
+    dropped (caller's local + attribute) and a ``gc.collect()`` has
+    fired __del__. Calling earlier would empty an already-pinned
+    cache and not actually release the memory."""
     try:
         import torch
 
diff --git a/studio/backend/models/inference.py b/studio/backend/models/inference.py
index 08b16d89e7..72fea9b851 100644
--- a/studio/backend/models/inference.py
+++ b/studio/backend/models/inference.py
@@ -1427,19 +1427,22 @@ class AnthropicMessagesResponse(BaseModel):
 
 
 def _no_control_chars(value: Optional[str], field_name: str) -> Optional[str]:
-    """Reject newlines and other ASCII control chars in identifiers
-    that get logged before HF validates them.
+    """Reject newlines, tabs, and other ASCII control chars in
+    identifiers that get logged before HF validates them.
 
     Authenticated callers could otherwise inject ``\\n`` / ``\\r`` /
-    NUL into ``logger.info("Loading diffusion model %s", repo_id)``
-    and forge fake log lines. HF repo ids and filenames legitimately
-    contain only ``[A-Za-z0-9._/-]``, so this is also a useful
-    correctness check (catches accidental ``"my repo\\n"`` paste).
+    ``\\t`` / NUL into ``logger.info("Loading diffusion model %s",
+    repo_id)`` and forge fake log lines. HF repo ids and filenames
+    legitimately contain only ``[A-Za-z0-9._/-]``, so this is also a
+    useful correctness check (catches accidental ``"my repo\\n"``
+    paste). Tab is included in the reject set because some logging
+    sinks split fields on tab; allowing it would still let an
+    attacker forge fake columns.
     """
     if value is None:
         return value
     for ch in value:
-        if ch == "\x7f" or (ord(ch) < 0x20 and ch != "\t"):
+        if ch == "\x7f" or ord(ch) < 0x20:
             raise ValueError(
                 f"{field_name} contains control characters; use a plain "
                 "Hugging Face repo / file name."
diff --git a/studio/backend/routes/export.py b/studio/backend/routes/export.py
index 5834767032..4b0b95a809 100644
--- a/studio/backend/routes/export.py
+++ b/studio/backend/routes/export.py
@@ -78,9 +78,14 @@ async def load_checkpoint(
         # importable but its status check raises.
         try:
             from core.training import get_training_backend  # type: ignore
-
-            trn = get_training_backend()
+        except Exception as e:
+            logger.debug(
+                "core.training not importable, skipping export training guard: %s",
+                e,
+            )
+        else:
             try:
+                trn = get_training_backend()
                 active = trn.is_training_active()
             except Exception as e:
                 logger.warning(
@@ -101,10 +106,6 @@ async def load_checkpoint(
                         "run before loading an export checkpoint."
                     ),
                 )
-        except HTTPException:
-            raise
-        except Exception as e:
-            logger.debug("training activity check skipped for export: %s", e)
 
         # Free GPU memory: shut down any running inference/training subprocesses
         # before loading the export checkpoint (they'd compete for VRAM).
@@ -154,7 +155,10 @@ async def load_checkpoint(
                     diff_status.get("is_loaded"),
                     diff_status.get("is_loading"),
                 )
-                diff.unload_model()
+                # Block-move to thread; unload acquires the
+                # diffusion _load_lock + _generate_lock and can take
+                # the full duration of an in-flight load/generation.
+                await asyncio.to_thread(diff.unload_model)
         except Exception as e:
             logger.debug("diffusion unload skipped for export: %s", e)
 
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index 4af5e9d3e4..8b6918c91d 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -816,7 +816,11 @@ async def load_model(
                         diff_status.get("is_loaded"),
                         diff_status.get("is_loading"),
                     )
-                    diff_backend.unload_model()
+                    # diff_backend.unload_model takes _load_lock +
+                    # _generate_lock and can block for the duration of
+                    # an in-flight load / generation. Off-load to a
+                    # worker thread to keep the event loop responsive.
+                    await asyncio.to_thread(diff_backend.unload_model)
             except Exception as e:
                 logger.debug("diffusion unload skipped (GGUF path): %s", e)
 
@@ -1016,7 +1020,9 @@ async def load_model(
                     diff_status.get("is_loaded"),
                     diff_status.get("is_loading"),
                 )
-                diff_backend.unload_model()
+                # Same blocking concern as the GGUF chat path:
+                # _load_lock + _generate_lock serialise the call.
+                await asyncio.to_thread(diff_backend.unload_model)
         except Exception as e:
             logger.debug("diffusion unload skipped: %s", e)
 
@@ -1702,7 +1708,7 @@ def _get_diffusion_backend():
     return get_diffusion_backend()
 
 
-@router.post("/images/load")
+@studio_router.post("/images/load")
 async def diffusion_load(
     payload: DiffusionLoadRequest,
     current_subject: str = Depends(get_current_subject),
@@ -1738,16 +1744,22 @@ async def diffusion_load(
         raise HTTPException(status_code = 500, detail = str(exc))
 
 
-@router.post("/images/unload")
+@studio_router.post("/images/unload")
 async def diffusion_unload(
     current_subject: str = Depends(get_current_subject),
 ):
     """Unload the current diffusion model and free GPU memory."""
     backend = _get_diffusion_backend()
-    return backend.unload_model()
+    # DiffusionBackend.unload_model takes _load_lock + _generate_lock
+    # and waits for any in-flight load / generation to complete.
+    # Calling it directly from an async route would freeze the
+    # FastAPI worker (and the SSE log stream, hardware poller, etc.)
+    # for the full duration of the generation. Push it onto a worker
+    # thread so the event loop stays responsive.
+    return await asyncio.to_thread(backend.unload_model)
 
 
-@router.get("/images/status")
+@studio_router.get("/images/status")
 async def diffusion_status(
     current_subject: str = Depends(get_current_subject),
 ):
@@ -1756,7 +1768,7 @@ async def diffusion_status(
     return backend.status()
 
 
-@router.post("/images/generate", response_model = DiffusionGenerateResponse)
+@studio_router.post("/images/generate", response_model = DiffusionGenerateResponse)
 async def diffusion_generate(
     payload: DiffusionGenerateRequest,
     current_subject: str = Depends(get_current_subject),
@@ -1813,7 +1825,11 @@ async def diffusion_generate(
         # browser side.
         seed_str = str(payload.seed) if payload.seed is not None else None,
         duration_ms = duration_ms,
-        model = status.get("repo_id"),
+        # Use ``active_repo_id`` (the pipeline that just ran the
+        # forward) rather than the UI-facing ``repo_id`` so a
+        # queued /images/load promoting a new pending model cannot
+        # leak that model's identity into our response.
+        model = status.get("active_repo_id") or status.get("repo_id"),
         family = status.get("family"),
     )
 
diff --git a/studio/backend/routes/models.py b/studio/backend/routes/models.py
index 94c535bde4..1f71379197 100644
--- a/studio/backend/routes/models.py
+++ b/studio/backend/routes/models.py
@@ -2007,8 +2007,11 @@ async def delete_finetuned_model(
                     candidate_path = Path(candidate).expanduser()
                 except Exception:
                     continue
-                if not candidate_path.is_absolute():
-                    continue
+                # Relative paths (the user can do
+                # `/images/load repo_id=exports/my-flux`) are still
+                # legitimate path candidates; resolve against the
+                # backend cwd so they can be compared with the
+                # absolute ``target_path``. Round 8 review #11.
                 try:
                     candidate_resolved = candidate_path.resolve()
                 except Exception:
@@ -2670,6 +2673,9 @@ async def delete_cached_model(
     # Check if model is currently loaded OR loading. is_active and
     # not is_loaded means an llama-server download / startup is in
     # flight; the cache delete would race the hf_hub_download / mmap.
+    # Fail CLOSED on exception (503) like the diffusion guard below:
+    # unverifiable load state means we cannot confirm the delete is
+    # safe.
     try:
         from routes.inference import get_llama_cpp_backend
 
@@ -2685,8 +2691,14 @@ async def delete_cached_model(
             )
     except HTTPException:
         raise
-    except Exception:
-        pass
+    except Exception as e:
+        logger.warning(
+            "Could not check llama.cpp backend status before cache delete: %s", e
+        )
+        raise HTTPException(
+            status_code = 503,
+            detail = "Could not verify llama.cpp load status before deleting cache",
+        ) from e
 
     try:
         inference_backend = get_inference_backend()
@@ -2711,8 +2723,14 @@ async def delete_cached_model(
                 )
     except HTTPException:
         raise
-    except Exception:
-        pass
+    except Exception as e:
+        logger.warning(
+            "Could not check safetensors backend status before cache delete: %s", e
+        )
+        raise HTTPException(
+            status_code = 503,
+            detail = "Could not verify safetensors load status before deleting cache",
+        ) from e
 
     # Also refuse to delete the cache underlying a loaded OR loading
     # diffusion pipeline. The diffusion backend mmap's the GGUF + base
diff --git a/studio/backend/routes/training.py b/studio/backend/routes/training.py
index 9a9f6b9761..c7b33e025e 100644
--- a/studio/backend/routes/training.py
+++ b/studio/backend/routes/training.py
@@ -327,7 +327,10 @@ async def start_training(
                     diff_status.get("is_loaded"),
                     diff_status.get("is_loading"),
                 )
-                diff_backend.unload_model()
+                # Async route: offload the blocking unload to a
+                # worker thread so the event loop stays responsive
+                # during long in-flight load / generate calls.
+                await asyncio.to_thread(diff_backend.unload_model)
         except Exception as e:
             logger.warning("Could not unload diffusion model: %s", e)
 
diff --git a/studio/backend/tests/test_diffusion_routes.py b/studio/backend/tests/test_diffusion_routes.py
index b7bfd1e2f8..fc99228a63 100644
--- a/studio/backend/tests/test_diffusion_routes.py
+++ b/studio/backend/tests/test_diffusion_routes.py
@@ -116,7 +116,11 @@ def app_with_stub(monkeypatch):
     monkeypatch.setattr(inf, "_get_diffusion_backend", lambda: stub)
 
     app = FastAPI()
+    # Diffusion image routes live on studio_router so they are NOT
+    # exposed under /v1 (which would let OpenAI-compat clients
+    # trigger Studio-only side effects).
     app.include_router(inf.router, prefix = "/api/inference")
+    app.include_router(inf.studio_router, prefix = "/api/inference")
     # Bypass auth by overriding the dependency.
     from auth.authentication import get_current_subject
 

From 0ae20554dc29c8837289945ed4624424e7f53b4e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 02:48:12 +0000
Subject: [PATCH 28/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/diffusion.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 4cad01fe40..d0799081ac 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -664,7 +664,9 @@ def load_model(
                 with self._lock:
                     self._last_error = exc_msg
                 logger.exception("Diffusion load failed for %s", repo_id)
-                raise RuntimeError(f"Failed to load diffusion model: {exc_msg}") from exc
+                raise RuntimeError(
+                    f"Failed to load diffusion model: {exc_msg}"
+                ) from exc
             finally:
                 with self._lock:
                     self._loading = False

From 1193c8144abc2dc4d5a0cb3a3ab3da89aca974fe Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 03:19:42 +0000
Subject: [PATCH 29/92] Fix/adjust diffusion: round 9 shared release helpers +
 export-active guard for PR #5754

Round 9 reviewer flagged a pile of handoff asymmetries: every
GPU-owning lifecycle change (training, export, chat, images) had
its own bespoke unload sequence, and they had drifted out of sync.
Some skipped llama-server is_active; some missed safetensors
loading_models; export and training did not check is_export_active.

Backend handoff (P1)
  * routes/inference.py: new _release_chat_for / _release_export_for
    helpers. Both treat llama-server as held when is_loaded OR
    is_active, safetensors as held when active_model_name OR
    loading_models is non-empty, and export as held when
    current_checkpoint OR is_export_active. Both helpers run their
    unloads in worker threads so async routes do not block the
    event loop.
  * routes/training.py: replaces its bespoke inline llama /
    safetensors / export unload sequence with
    await _release_chat_for / _release_export_for.
  * routes/export.py: same swap for the chat unload chain (export
    still does NOT call _release_export_for on itself).
  * routes/inference.py GGUF + standard chat-load paths: now use
    _release_export_for to drop a settled export, and the standard
    path's llama unload now also handles is_active=True (round 9
    review #8).

Backend reject-on-active export (P1 #5)
  * routes/inference.py: new _raise_if_export_active. Symmetric
    with _raise_if_training_active: when /images/load or /load
    arrives during a long-running export, the load request is
    refused with HTTP 409 instead of silently killing the export.
    Diffusion / images load and both chat-load paths call it.
  * core/inference/diffusion.py
    _release_other_gpu_owners_for_diffusion: no longer tears down
    an in-flight export job. Only drops a SETTLED export checkpoint
    (current_checkpoint populated, is_export_active False). Round 9
    review #5 -- the previous behavior could terminate an in-flight
    export and leave a partial output artifact.

Token leak via logger.exception (P1 #6)
  * core/inference/diffusion.py: load-failure logging now uses
    logger.error(..., exc_msg) with the already-scrubbed string
    and exc_info=False. logger.exception() with the raw Exception
    would expose any hf_... token that diffusers / huggingface_hub
    embedded in the message or traceback locals, defeating the
    earlier in-flight scrub.

Dependency pinning (P1 #11)
  * pyproject.toml: huggingfacenotorch optional extra now requires
    diffusers>=0.37.0. Previously the floor was only set in
    studio/backend/requirements/no-torch-runtime.txt, so a normal
    pip install could resolve diffusers 0.36.0 (no
    Flux2KleinPipeline) and the default curated FLUX.2 klein
    Images model would fail at runtime.

Cache-delete exact match (P1 #14)
  * routes/models.py /delete-cached: llama.cpp and safetensors
    guards now match on exact repo-id (case-insensitive) instead
    of prefix. Diffusion guard already does this; the chat guards
    were the remaining surface where loading org/model-v2
    blocked deleting org/model.
---
 pyproject.toml                             |   7 +-
 studio/backend/core/inference/diffusion.py |  38 ++---
 studio/backend/routes/export.py            |  37 +----
 studio/backend/routes/inference.py         | 174 ++++++++++++++++++++-
 studio/backend/routes/models.py            |  12 +-
 studio/backend/routes/training.py          |  56 ++-----
 6 files changed, 221 insertions(+), 103 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index aef88d90f5..97f5caf553 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -83,7 +83,12 @@ huggingfacenotorch = [
     "peft>=0.18.0,!=0.11.0",
     "huggingface_hub>=0.34.0",
     "hf_transfer",
-    "diffusers",
+    # Studio Images page depends on Flux2KleinPipeline /
+    # Flux2Pipeline, both shipped in diffusers>=0.37.0. Floor was
+    # missing here so a `pip install unsloth[huggingfacenotorch]`
+    # could resolve to 0.36.0 and fail at runtime when the default
+    # curated FLUX.2 klein model loads.
+    "diffusers>=0.37.0",
     "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,!=4.57.0,!=4.57.4,!=4.57.5,!=5.0.0,!=5.1.0,<=5.5.0",
     "trl>=0.18.2,!=0.19.0,<=0.24.0",
     "sentence-transformers",
diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index d0799081ac..8682f3eb9c 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -663,7 +663,15 @@ def load_model(
                 exc_msg = re.sub(r"hf_[A-Za-z0-9]{20,}", "<redacted>", exc_msg)
                 with self._lock:
                     self._last_error = exc_msg
-                logger.exception("Diffusion load failed for %s", repo_id)
+                # ``logger.exception`` would emit the raw exception
+                # (including any unredacted ``hf_...`` token inside
+                # the message OR traceback locals on rich loggers).
+                # Use ``logger.error`` with the already-scrubbed
+                # message and exc_info=False so the bearer token
+                # cannot leak through structured logging sinks.
+                logger.error(
+                    "Diffusion load failed for %s: %s", repo_id, exc_msg
+                )
                 raise RuntimeError(
                     f"Failed to load diffusion model: {exc_msg}"
                 ) from exc
@@ -909,27 +917,21 @@ def _release_other_gpu_owners_for_diffusion() -> None:
     """Best-effort: shut down export subprocess + active training before
     a diffusion load. Both can hold multi-GB of VRAM and would OOM the
     diffusion allocation on consumer GPUs."""
-    # Export subprocess. Shut down when EITHER a checkpoint is
-    # resident OR is_export_active() reports work in flight (a
-    # checkpoint load that has been kicked off but not yet completed
-    # the assignment to current_checkpoint). Either case can hold
-    # GPU memory that would OOM the diffusion allocation.
+    # Export resident checkpoint. We tear down a SETTLED export
+    # (current_checkpoint populated) because that means the export
+    # ran to completion and the user can re-load the result, but we
+    # do NOT touch is_export_active() here: an in-flight export job
+    # has unfinished partial output that termination would corrupt.
+    # The route layer rejects /images/load with HTTP 409 via
+    # _raise_if_export_active when is_export_active() is True, so
+    # we only reach this helper when export is either idle or
+    # holding a previously completed checkpoint.
     try:
         from core.export import get_export_backend  # type: ignore
 
         exp = get_export_backend()
-        has_checkpoint = bool(getattr(exp, "current_checkpoint", None))
-        is_active = False
-        try:
-            is_active = bool(exp.is_export_active())
-        except Exception:
-            is_active = False
-        if has_checkpoint or is_active:
-            logger.info(
-                "Shutting down export subprocess (checkpoint=%s active=%s)",
-                has_checkpoint,
-                is_active,
-            )
+        if getattr(exp, "current_checkpoint", None):
+            logger.info("Shutting down idle export subprocess before diffusion load")
             exp._shutdown_subprocess()
             exp.current_checkpoint = None
             exp.is_vision = False
diff --git a/studio/backend/routes/export.py b/studio/backend/routes/export.py
index 4b0b95a809..aabcd1920b 100644
--- a/studio/backend/routes/export.py
+++ b/studio/backend/routes/export.py
@@ -107,35 +107,14 @@ async def load_checkpoint(
                     ),
                 )
 
-        # Free GPU memory: shut down any running inference/training subprocesses
-        # before loading the export checkpoint (they'd compete for VRAM).
-        try:
-            from core.inference import get_inference_backend
-
-            inf = get_inference_backend()
-            if inf.active_model_name:
-                logger.info(
-                    "Unloading inference model '%s' to free GPU memory for export",
-                    inf.active_model_name,
-                )
-                inf._shutdown_subprocess()
-                inf.active_model_name = None
-                inf.models.clear()
-        except Exception as e:
-            logger.warning("Could not unload inference model: %s", e)
-
-        # Also unload any active GGUF llama-server (the inference unload
-        # above only covers the safetensors / Unsloth backend; GGUF
-        # chat runs as a separate subprocess).
-        try:
-            from routes.inference import get_llama_cpp_backend
-
-            llama = get_llama_cpp_backend()
-            if getattr(llama, "is_loaded", False):
-                logger.info("Unloading GGUF chat model to free GPU memory for export")
-                llama.unload_model()
-        except Exception as e:
-            logger.debug("llama-server unload skipped for export: %s", e)
+        # Free GPU memory: shut down any chat backend before loading
+        # the export checkpoint. Routes the unload through the shared
+        # helper so we cover llama-server is_active=True and
+        # safetensors loading_models -- the asymmetries round 9
+        # reviews #1, #8, #9 flagged.
+        from routes.inference import _release_chat_for
+
+        await _release_chat_for("export")
 
         # Also unload any active diffusion pipeline (Images page); it
         # competes for the same GPU and would survive the inference
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index 8b6918c91d..e43e185124 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -288,6 +288,145 @@ def _raise_if_training_active(workload: str) -> None:
         )
 
 
+def _raise_if_export_active(workload: str) -> None:
+    """Refuse a chat/diffusion load while an export job is active.
+
+    Symmetric with ``_raise_if_training_active``: export is also a
+    long-running GPU-owning job a user does not want silently killed
+    by a chat / images load. Treat ``current_checkpoint is not None``
+    and ``is_export_active() is True`` as 'export owns the GPU'.
+
+    Same failure-mode split as the training variant: import failure
+    silently skips, runtime failure fails CLOSED with 503.
+    """
+    try:
+        from core.export import get_export_backend  # type: ignore
+    except Exception:
+        return
+    try:
+        exp = get_export_backend()
+        has_checkpoint = bool(getattr(exp, "current_checkpoint", None))
+        try:
+            active = bool(exp.is_export_active())
+        except Exception:
+            active = False
+    except Exception as exc:
+        logger.warning(
+            "Could not verify export status before %s load: %s",
+            workload,
+            exc,
+        )
+        raise HTTPException(
+            status_code = 503,
+            detail = (
+                f"Could not verify export status before loading the "
+                f"{workload} model. Try again."
+            ),
+        ) from exc
+    if has_checkpoint or active:
+        raise HTTPException(
+            status_code = 409,
+            detail = (
+                f"An export job is currently active. Stop the export "
+                f"job before loading a {workload} model."
+            ),
+        )
+
+
+async def _release_chat_for(workload: str) -> None:
+    """Shared 'release any GPU-owning chat backend' helper.
+
+    Used by training / export / images / chat handoffs. Treats
+    llama-server as held when EITHER ``is_loaded`` or ``is_active``
+    is true (the latter is mid-download / mid-startup). Treats the
+    safetensors backend as held when ``active_model_name`` is set
+    OR ``loading_models`` is non-empty (mid-download / mid-load).
+    Each unload runs in a worker thread because both backends'
+    unload paths can block for the full duration of a load.
+    """
+    # GGUF chat (llama-server subprocess).
+    try:
+        llama = get_llama_cpp_backend()
+        is_loaded = bool(getattr(llama, "is_loaded", False))
+        is_active = bool(getattr(llama, "is_active", False))
+        if is_loaded or is_active:
+            logger.info(
+                "Unloading GGUF chat (loaded=%s active=%s) before %s load",
+                is_loaded,
+                is_active,
+                workload,
+            )
+            await asyncio.to_thread(llama.unload_model)
+    except Exception as e:
+        logger.debug("llama-server unload skipped for %s: %s", workload, e)
+
+    # Safetensors / Unsloth chat backend.
+    try:
+        from core.inference import get_inference_backend as _gib  # type: ignore
+
+        inf = _gib()
+        active_model_name = getattr(inf, "active_model_name", None)
+        loading_models = set(getattr(inf, "loading_models", set()) or set())
+        if active_model_name:
+            logger.info(
+                "Unloading safetensors chat '%s' before %s load",
+                active_model_name,
+                workload,
+            )
+            await asyncio.to_thread(inf.unload_model, active_model_name)
+        for loading in loading_models:
+            if loading == active_model_name:
+                continue
+            try:
+                logger.info(
+                    "Unloading in-flight safetensors chat '%s' before %s load",
+                    loading,
+                    workload,
+                )
+                await asyncio.to_thread(inf.unload_model, loading)
+            except Exception as inner:
+                logger.debug(
+                    "loading safetensors unload skipped for %s: %s",
+                    loading,
+                    inner,
+                )
+    except Exception as e:
+        logger.debug("safetensors unload skipped for %s: %s", workload, e)
+
+
+async def _release_export_for(workload: str) -> None:
+    """Shared 'shut down export subprocess' helper.
+
+    Treats ``current_checkpoint is not None`` or ``is_export_active()``
+    as 'export owns the GPU'. Used by training / chat handoffs.
+    Diffusion does NOT call this -- it refuses with 409 via
+    ``_raise_if_export_active`` instead, because killing an in-flight
+    export would corrupt the user's exported model.
+    """
+    try:
+        from core.export import get_export_backend  # type: ignore
+
+        exp = get_export_backend()
+        has_checkpoint = bool(getattr(exp, "current_checkpoint", None))
+        try:
+            active = bool(exp.is_export_active())
+        except Exception:
+            active = False
+        if has_checkpoint or active:
+            logger.info(
+                "Shutting down export (checkpoint=%s active=%s) for %s",
+                has_checkpoint,
+                active,
+                workload,
+            )
+            await asyncio.to_thread(exp._shutdown_subprocess)
+            exp.current_checkpoint = None
+            exp.is_vision = False
+            exp.is_peft = False
+    except Exception as e:
+        logger.warning("Could not shut down export for %s: %s", workload, e)
+
+
 def _detect_safetensors_features(backend, chat_template: Optional[str]) -> dict:
     """Classify reasoning/tool capabilities via the GGUF classifier so
     flags match across backends. gpt-oss is overridden because Harmony
@@ -787,7 +926,14 @@ async def load_model(
             # training is active. Diffusion and export paths refuse;
             # without this the GGUF chat load would start llama-server
             # while training still owned VRAM and double-spend it.
+            # Also refuse when an export job is in flight: same
+            # reasoning as diffusion (terminating a live export would
+            # corrupt the user's exported artifact).
             _raise_if_training_active("chat")
+            _raise_if_export_active("chat")
+            # Drop a settled export checkpoint that is just holding
+            # GPU memory but is not actively producing output.
+            await _release_export_for("GGUF chat")
 
             llama_backend = get_llama_cpp_backend()
             unsloth_backend = get_inference_backend()
@@ -993,17 +1139,29 @@ async def load_model(
 
         # ── Standard path: load via Unsloth/transformers ──────────
         # Symmetric lifecycle guard: refuse a chat load while training
-        # is active so we do not OOM both the training and inference
-        # jobs together.
+        # or an export is active so we do not OOM both jobs together
+        # and so we do not silently corrupt an in-flight export.
         _raise_if_training_active("chat")
+        _raise_if_export_active("chat")
+        # Drop a settled export checkpoint that is just holding GPU
+        # memory but is not actively producing output.
+        await _release_export_for("safetensors chat")
 
         backend = get_inference_backend()
 
-        # Unload any active GGUF model first
+        # Unload any active GGUF model first (handles both is_loaded
+        # and is_active=True so a mid-startup llama-server is also
+        # killed before we allocate safetensors weights).
         llama_backend = get_llama_cpp_backend()
-        if llama_backend.is_loaded:
-            logger.info("Unloading GGUF model before loading Unsloth model")
-            llama_backend.unload_model()
+        llama_loaded = bool(getattr(llama_backend, "is_loaded", False))
+        llama_active = bool(getattr(llama_backend, "is_active", False))
+        if llama_loaded or llama_active:
+            logger.info(
+                "Unloading GGUF model (loaded=%s active=%s) before Unsloth load",
+                llama_loaded,
+                llama_active,
+            )
+            await asyncio.to_thread(llama_backend.unload_model)
 
         # Unload any active diffusion pipeline so the new chat model is
         # not racing the FLUX VAE for VRAM on a 16-24 GB card. is_loading
@@ -1722,7 +1880,11 @@ async def diffusion_load(
     # Refuse before the long download starts: silently stopping a
     # running training run to free VRAM was the previous behavior and
     # left the user with no model loaded plus a dead training job.
+    # Same logic for export: an export subprocess that is mid-flight
+    # cannot be safely terminated without corrupting the output, so
+    # the request is refused with 409 instead of silently killing it.
     _raise_if_training_active("diffusion")
+    _raise_if_export_active("diffusion")
     backend = _get_diffusion_backend()
     try:
         status = await asyncio.get_event_loop().run_in_executor(
diff --git a/studio/backend/routes/models.py b/studio/backend/routes/models.py
index 1f71379197..7a1b238065 100644
--- a/studio/backend/routes/models.py
+++ b/studio/backend/routes/models.py
@@ -2681,7 +2681,11 @@ async def delete_cached_model(
 
         llama_backend = get_llama_cpp_backend()
         loaded_id = (llama_backend.model_identifier or "").lower()
-        wants = loaded_id == repo_id.lower() or loaded_id.startswith(repo_id.lower())
+        # Exact match only (case-insensitive). Prefix match would
+        # block deleting unrelated ``org/model`` while
+        # ``org/model-v2`` is loaded -- same surface the diffusion
+        # guard fixed in round 5.
+        wants = loaded_id == repo_id.lower()
         if wants and (
             llama_backend.is_loaded or getattr(llama_backend, "is_active", False)
         ):
@@ -2707,16 +2711,18 @@ async def delete_cached_model(
         # Loading set holds model identifiers currently being
         # downloaded / instantiated; treat them like active loads
         # so a delete cannot race a partial mmap.
+        # Exact match only. Prefix matching would block deleting
+        # ``org/model`` while ``org/model-v2`` is loading.
         for loading_model in loading_models:
             ml = (loading_model or "").lower()
-            if ml == needle or ml.startswith(needle):
+            if ml == needle:
                 raise HTTPException(
                     status_code = 409,
                     detail = "Cannot delete a model while it is loading",
                 )
         if inference_backend.active_model_name:
             active = inference_backend.active_model_name.lower()
-            if active == needle or active.startswith(needle):
+            if active == needle:
                 raise HTTPException(
                     status_code = 400,
                     detail = "Unload the model before deleting",
diff --git a/studio/backend/routes/training.py b/studio/backend/routes/training.py
index c7b33e025e..1bc39423b0 100644
--- a/studio/backend/routes/training.py
+++ b/studio/backend/routes/training.py
@@ -265,52 +265,16 @@ async def start_training(
                 )
                 training_kwargs["trust_remote_code"] = True
 
-        # Free GPU memory: shut down any running inference/export subprocesses
-        # before training starts (they'd compete for VRAM otherwise)
-        try:
-            from core.inference import get_inference_backend
-
-            inf_backend = get_inference_backend()
-            if inf_backend.active_model_name:
-                logger.info(
-                    "Unloading inference model '%s' to free GPU memory for training",
-                    inf_backend.active_model_name,
-                )
-                inf_backend._shutdown_subprocess()
-                inf_backend.active_model_name = None
-                inf_backend.models.clear()
-        except Exception as e:
-            logger.warning("Could not unload inference model: %s", e)
-
-        # GGUF chat backend (llama-server subprocess). Without this,
-        # starting training while a GGUF model is loaded keeps the
-        # subprocess pinned to VRAM and OOMs the training job. Mirrors
-        # the symmetric handoffs in routes/inference.py and
-        # routes/export.py.
-        try:
-            from routes.inference import get_llama_cpp_backend
-
-            llama_backend = get_llama_cpp_backend()
-            if getattr(llama_backend, "is_loaded", False):
-                logger.info("Unloading GGUF chat model to free GPU memory for training")
-                llama_backend.unload_model()
-        except Exception as e:
-            logger.warning("Could not unload GGUF chat model: %s", e)
-
-        try:
-            from core.export import get_export_backend
-
-            exp_backend = get_export_backend()
-            if exp_backend.current_checkpoint:
-                logger.info(
-                    "Shutting down export subprocess to free GPU memory for training"
-                )
-                exp_backend._shutdown_subprocess()
-                exp_backend.current_checkpoint = None
-                exp_backend.is_vision = False
-                exp_backend.is_peft = False
-        except Exception as e:
-            logger.warning("Could not shut down export subprocess: %s", e)
+        # Free GPU memory: shut down any chat backend (llama-server
+        # subprocess OR safetensors orchestrator) and any settled
+        # export checkpoint before training starts. The shared
+        # helpers handle the asymmetric cases (llama is_active,
+        # safetensors loading_models, export is_export_active) so
+        # this path stays in sync with /images/load and chat.
+        from routes.inference import _release_chat_for, _release_export_for
+
+        await _release_chat_for("training")
+        await _release_export_for("training")
 
         # Also unload any loaded diffusion pipeline (Images page); it
         # holds the same GPU and would survive the inference shutdown.

From b34fc6258fd84e4820dd091e4734c0a6a9e2618b Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 03:21:28 +0000
Subject: [PATCH 30/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/diffusion.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 8682f3eb9c..c9cc7d2c87 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -669,9 +669,7 @@ def load_model(
                 # Use ``logger.error`` with the already-scrubbed
                 # message and exc_info=False so the bearer token
                 # cannot leak through structured logging sinks.
-                logger.error(
-                    "Diffusion load failed for %s: %s", repo_id, exc_msg
-                )
+                logger.error("Diffusion load failed for %s: %s", repo_id, exc_msg)
                 raise RuntimeError(
                     f"Failed to load diffusion model: {exc_msg}"
                 ) from exc

From 641cdcc13aff20553095da8b1655efd44f6f846b Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 03:56:32 +0000
Subject: [PATCH 31/92] Fix/adjust diffusion: round 10 fix export-active
 asymmetry + GGUF chat helper for PR #5754

Round 10 reviewers found the round 9 export helpers had a
destructive bug: _release_export_for treated is_export_active=True
as a shutdown condition, so any caller (training, chat, diffusion)
could terminate an in-flight export and corrupt the user's output.
Conversely _raise_if_export_active raised 409 on a settled
checkpoint, blocking idle cleanup.

Backend (P1)
  * routes/inference.py: split the export-active surface in two:
      _raise_if_export_active() now ONLY raises when
      is_export_active() is True. A settled current_checkpoint is
      treated as held GPU memory, not an active job.
      _release_export_for() now ONLY shuts down when
      current_checkpoint is set AND is_export_active() is False
      (i.e. a previously completed checkpoint just holding memory).
      An unknown / unverifiable is_export_active is treated as
      'might still be active' so the helper refuses to drop.
  * routes/training.py: now calls _raise_if_export_active before
    _release_chat_for / _release_export_for, mirroring the chat
    and diffusion paths. The previous code went straight to
    _release_export_for and would kill an in-flight export.
  * routes/inference.py: split _release_chat_for into
    _release_llama_for and _release_safetensors_chat_for so the
    GGUF chat-load path can release only the OTHER chat backend
    (round 10 review #4: the previous inline 'if active_model_name'
    check skipped loading_models and let an in-flight safetensors
    load race the new GGUF allocation).
  * routes/inference.py: _raise_if_export_active now fails CLOSED
    (503) when is_export_active() raises, not only when
    get_export_backend() raises. Round 10 review #7.

Dependencies (P1)
  * pyproject.toml huggingfacenotorch extra: add gguf. The
    Studio Images default curated picker is GGUF-only and
    diffusers.GGUFQuantizationConfig + from_single_file require
    the standalone gguf package at runtime; missing it would 500
    on the first /api/inference/images/load with
    'gguf>=0.10.0 is required'.
---
 pyproject.toml                     |   6 ++
 studio/backend/routes/inference.py | 136 +++++++++++++++++------------
 studio/backend/routes/training.py  |  19 ++--
 3 files changed, 99 insertions(+), 62 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 97f5caf553..e401977496 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -89,6 +89,12 @@ huggingfacenotorch = [
     # could resolve to 0.36.0 and fail at runtime when the default
     # curated FLUX.2 klein model loads.
     "diffusers>=0.37.0",
+    # diffusers.GGUFQuantizationConfig + from_single_file rely on
+    # the standalone gguf package at runtime. The Studio Images
+    # default curated picker is GGUF-only so this must install
+    # with the public huggingfacenotorch extra; missing it makes
+    # /api/inference/images/load 500 with "gguf>=0.10.0 required".
+    "gguf",
     "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,!=4.57.0,!=4.57.4,!=4.57.5,!=5.0.0,!=5.1.0,<=5.5.0",
     "trl>=0.18.2,!=0.19.0,<=0.24.0",
     "sentence-transformers",
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index e43e185124..035d9a9d58 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -289,15 +289,21 @@ def _raise_if_training_active(workload: str) -> None:
 
 
 def _raise_if_export_active(workload: str) -> None:
-    """Refuse a chat/diffusion load while an export job is active.
+    """Refuse a chat/diffusion/training load while an export job is
+    actively running.
 
     Symmetric with ``_raise_if_training_active``: export is also a
-    long-running GPU-owning job a user does not want silently killed
-    by a chat / images load. Treat ``current_checkpoint is not None``
-    and ``is_export_active() is True`` as 'export owns the GPU'.
-
-    Same failure-mode split as the training variant: import failure
-    silently skips, runtime failure fails CLOSED with 503.
+    long-running GPU-owning job a user does not want silently killed.
+    ONLY raises when ``is_export_active() is True`` (an export
+    subprocess is currently producing output). A settled
+    ``current_checkpoint`` is NOT an active job -- it is just held
+    GPU memory and gets dropped by ``_release_export_for``.
+
+    Failure-mode split:
+      * ``core.export`` cannot be imported -> silently skip.
+      * ``get_export_backend()`` raises -> 503 fail closed.
+      * ``is_export_active()`` raises -> 503 fail closed (round 10
+        review #7).
     """
     try:
         from core.export import get_export_backend  # type: ignore
@@ -305,11 +311,21 @@ def _raise_if_export_active(workload: str) -> None:
         return
     try:
         exp = get_export_backend()
-        has_checkpoint = bool(getattr(exp, "current_checkpoint", None))
-        try:
-            active = bool(exp.is_export_active())
-        except Exception:
-            active = False
+    except Exception as exc:
+        logger.warning(
+            "Could not verify export backend before %s load: %s",
+            workload,
+            exc,
+        )
+        raise HTTPException(
+            status_code = 503,
+            detail = (
+                f"Could not verify export status before loading the "
+                f"{workload} model. Try again."
+            ),
+        ) from exc
+    try:
+        active = bool(exp.is_export_active())
     except Exception as exc:
         logger.warning(
             "Could not verify export status before %s load: %s",
@@ -323,7 +339,7 @@ def _raise_if_export_active(workload: str) -> None:
                 f"{workload} model. Try again."
             ),
         ) from exc
-    if has_checkpoint or active:
+    if active:
         raise HTTPException(
             status_code = 409,
             detail = (
@@ -333,18 +349,11 @@ def _raise_if_export_active(workload: str) -> None:
         )
 
 
-async def _release_chat_for(workload: str) -> None:
-    """Shared 'release any GPU-owning chat backend' helper.
-
-    Used by training / export / images / chat handoffs. Treats
-    llama-server as held when EITHER ``is_loaded`` or ``is_active``
-    is true (the latter is mid-download / mid-startup). Treats the
-    safetensors backend as held when ``active_model_name`` is set
-    OR ``loading_models`` is non-empty (mid-download / mid-load).
-    Each unload runs in a worker thread because both backends'
-    unload paths can block for the full duration of a load.
+async def _release_llama_for(workload: str) -> None:
+    """Unload the llama-server (GGUF) chat backend if it owns the
+    GPU. Treats ``is_loaded`` OR ``is_active`` as held (the latter
+    is mid-download / mid-startup, before health probes pass).
     """
-    # GGUF chat (llama-server subprocess).
     try:
         llama = get_llama_cpp_backend()
         is_loaded = bool(getattr(llama, "is_loaded", False))
@@ -360,7 +369,11 @@ async def _release_chat_for(workload: str) -> None:
     except Exception as e:
         logger.debug("llama-server unload skipped for %s: %s", workload, e)
 
-    # Safetensors / Unsloth chat backend.
+
+async def _release_safetensors_chat_for(workload: str) -> None:
+    """Unload the safetensors / Unsloth chat backend (drains both
+    ``active_model_name`` and ``loading_models``) if it owns the GPU.
+    """
     try:
         from core.inference import get_inference_backend as _gib  # type: ignore
 
@@ -394,14 +407,33 @@ async def _release_chat_for(workload: str) -> None:
         logger.debug("safetensors unload skipped for %s: %s", workload, e)
 
 
-async def _release_export_for(workload: str) -> None:
-    """Shared 'shut down export subprocess' helper.
+async def _release_chat_for(workload: str) -> None:
+    """Shared 'release any GPU-owning chat backend' helper.
 
-    Treats ``current_checkpoint is not None`` or ``is_export_active()``
-    as 'export owns the GPU'. Used by training / chat handoffs.
-    Diffusion does NOT call this -- it refuses with 409 via
-    ``_raise_if_export_active`` instead, because killing an in-flight
-    export would corrupt the user's exported model.
+    Used by training / export / diffusion handoffs (which need BOTH
+    chat backends gone). The GGUF chat-load path uses only
+    ``_release_safetensors_chat_for`` because it is itself starting
+    llama-server -- we cannot release the backend we are about to
+    start. Conversely, the standard chat-load path releases only
+    the llama side.
+    """
+    await _release_llama_for(workload)
+    await _release_safetensors_chat_for(workload)
+
+
+async def _release_export_for(workload: str) -> None:
+    """Shared 'drop a settled export checkpoint' helper.
+
+    ONLY shuts down the export subprocess when ``current_checkpoint``
+    is set AND ``is_export_active()`` is False -- i.e. a previously
+    completed load is just holding GPU memory. An in-flight export
+    job (``is_export_active()`` True) is NEVER touched here; the
+    route layer is expected to refuse the workload with HTTP 409
+    via ``_raise_if_export_active`` before calling this.
+
+    This split is what round 10 reviewers flagged: the previous
+    behaviour terminated active exports on any release path, which
+    would corrupt the user's in-flight output artifact.
     """
     try:
         from core.export import get_export_backend  # type: ignore
@@ -411,12 +443,15 @@ async def _release_export_for(workload: str) -> None:
         try:
             active = bool(exp.is_export_active())
         except Exception:
-            active = False
-        if has_checkpoint or active:
+            # Treat unverifiable export state as 'might be active' and
+            # refuse to drop. The caller's _raise_if_export_active call
+            # already failed closed; reaching here with an unknown
+            # status is the safer no-op.
+            active = True
+        if has_checkpoint and not active:
             logger.info(
-                "Shutting down export (checkpoint=%s active=%s) for %s",
+                "Shutting down idle export (checkpoint=%s) for %s",
                 has_checkpoint,
-                active,
                 workload,
             )
             await asyncio.to_thread(exp._shutdown_subprocess)
@@ -938,12 +973,12 @@ async def load_model(
             llama_backend = get_llama_cpp_backend()
             unsloth_backend = get_inference_backend()
 
-            # Unload any active Unsloth model first to free VRAM
-            if unsloth_backend.active_model_name:
-                logger.info(
-                    f"Unloading Unsloth model '{unsloth_backend.active_model_name}' before loading GGUF"
-                )
-                unsloth_backend.unload_model(unsloth_backend.active_model_name)
+            # Unload any safetensors / Unsloth model first to free
+            # VRAM. Uses the shared helper so we also drain
+            # ``loading_models`` (round 10 review #4); the inline
+            # version only checked ``active_model_name`` and let an
+            # in-flight safetensors load race the new GGUF allocation.
+            await _release_safetensors_chat_for("GGUF chat")
 
             # Symmetric with /images/load: drop any active diffusion
             # pipeline so the GGUF chat load does not race the FLUX VAE
@@ -1149,19 +1184,10 @@ async def load_model(
 
         backend = get_inference_backend()
 
-        # Unload any active GGUF model first (handles both is_loaded
-        # and is_active=True so a mid-startup llama-server is also
-        # killed before we allocate safetensors weights).
-        llama_backend = get_llama_cpp_backend()
-        llama_loaded = bool(getattr(llama_backend, "is_loaded", False))
-        llama_active = bool(getattr(llama_backend, "is_active", False))
-        if llama_loaded or llama_active:
-            logger.info(
-                "Unloading GGUF model (loaded=%s active=%s) before Unsloth load",
-                llama_loaded,
-                llama_active,
-            )
-            await asyncio.to_thread(llama_backend.unload_model)
+        # Unload any active or mid-download llama-server first.
+        # Shared helper so this stays in sync with the GGUF path's
+        # symmetric ``_release_safetensors_chat_for``.
+        await _release_llama_for("safetensors chat")
 
         # Unload any active diffusion pipeline so the new chat model is
         # not racing the FLUX VAE for VRAM on a 16-24 GB card. is_loading
diff --git a/studio/backend/routes/training.py b/studio/backend/routes/training.py
index 1bc39423b0..d8fcce0981 100644
--- a/studio/backend/routes/training.py
+++ b/studio/backend/routes/training.py
@@ -265,14 +265,19 @@ async def start_training(
                 )
                 training_kwargs["trust_remote_code"] = True
 
-        # Free GPU memory: shut down any chat backend (llama-server
-        # subprocess OR safetensors orchestrator) and any settled
-        # export checkpoint before training starts. The shared
-        # helpers handle the asymmetric cases (llama is_active,
-        # safetensors loading_models, export is_export_active) so
-        # this path stays in sync with /images/load and chat.
-        from routes.inference import _release_chat_for, _release_export_for
+        # Symmetric lifecycle guard: refuse to start training while
+        # an export job is in flight. Round 10 review #1 -- the
+        # previous code went straight to ``_release_export_for``,
+        # which would terminate the in-flight export and corrupt
+        # the user's output artifact. Now we 409 first; the user
+        # stops the export and re-submits.
+        from routes.inference import (
+            _raise_if_export_active,
+            _release_chat_for,
+            _release_export_for,
+        )
 
+        _raise_if_export_active("training")
         await _release_chat_for("training")
         await _release_export_for("training")
 

From 1698b66eb17be40d1c4faa3b5f3bc425b94689cd Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 04:17:00 +0000
Subject: [PATCH 32/92] Fix/adjust diffusion: tolerate ExportBackend without
 is_export_active for PR #5754

Round 10's training-side _raise_if_export_active call broke
existing test mocks and older ExportBackend builds that only
expose current_checkpoint -- they raised AttributeError on
exp.is_export_active() and the outer guard converted that into
a 503, causing the prior Backend CI to fail
test_inference_route_returns_400_for_invalid_gpu_ids,
test_training_route_returns_400_for_invalid_gpu_ids, and
test_training_route_forwards_embedding_learning_rate.

Both _raise_if_export_active and _release_export_for now detect
the missing method with getattr(...) and treat absence as
'no async-job tracker available' (effectively 'not active').
The 503 fail-closed path still fires when the method exists but
the call itself raises, so production backends (the
ExportOrchestrator subclass that does expose is_export_active)
keep their stronger guard.
---
 studio/backend/routes/inference.py | 39 +++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index 035d9a9d58..8b812e7846 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -302,8 +302,13 @@ def _raise_if_export_active(workload: str) -> None:
     Failure-mode split:
       * ``core.export`` cannot be imported -> silently skip.
       * ``get_export_backend()`` raises -> 503 fail closed.
-      * ``is_export_active()`` raises -> 503 fail closed (round 10
-        review #7).
+      * Backend does not expose ``is_export_active`` -> silently
+        skip. Older ExportBackend builds and several test mocks
+        only expose ``current_checkpoint``; there is no async-job
+        tracker for them, and forcing a 503 here would break those
+        flows without adding any safety they did not previously have.
+      * ``is_export_active()`` itself raises -> 503 fail closed
+        (round 10 review #7).
     """
     try:
         from core.export import get_export_backend  # type: ignore
@@ -324,8 +329,11 @@ def _raise_if_export_active(workload: str) -> None:
                 f"{workload} model. Try again."
             ),
         ) from exc
+    is_export_active_fn = getattr(exp, "is_export_active", None)
+    if is_export_active_fn is None:
+        return
     try:
-        active = bool(exp.is_export_active())
+        active = bool(is_export_active_fn())
     except Exception as exc:
         logger.warning(
             "Could not verify export status before %s load: %s",
@@ -440,14 +448,23 @@ async def _release_export_for(workload: str) -> None:
 
         exp = get_export_backend()
         has_checkpoint = bool(getattr(exp, "current_checkpoint", None))
-        try:
-            active = bool(exp.is_export_active())
-        except Exception:
-            # Treat unverifiable export state as 'might be active' and
-            # refuse to drop. The caller's _raise_if_export_active call
-            # already failed closed; reaching here with an unknown
-            # status is the safer no-op.
-            active = True
+        # Backends without an async-job tracker (older builds, some
+        # test mocks) cannot report 'active' separately from
+        # 'has_checkpoint'. Treat absence as 'not active' so a
+        # settled checkpoint still gets dropped; on builds that DO
+        # expose it, a True value blocks the drop.
+        is_export_active_fn = getattr(exp, "is_export_active", None)
+        if is_export_active_fn is None:
+            active = False
+        else:
+            try:
+                active = bool(is_export_active_fn())
+            except Exception:
+                # Treat unverifiable as 'might be active' and refuse
+                # to drop. The caller's _raise_if_export_active call
+                # already failed closed; reaching here with an
+                # unknown status is the safer no-op.
+                active = True
         if has_checkpoint and not active:
             logger.info(
                 "Shutting down idle export (checkpoint=%s) for %s",

From 4b1b149c0be70141a7b2170343593e502b518daf Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 04:30:10 +0000
Subject: [PATCH 33/92] Fix/adjust diffusion: round 11 export-active
 defense-in-depth + state/path/gguf for PR #5754

Round 11 reviewer findings.

Backend lifecycle (P1)
  * core/inference/diffusion.py _release_other_gpu_owners_for_
    diffusion: now re-checks is_export_active() locally before
    calling _shutdown_subprocess. The route layer already 409s on
    active exports, but defence-in-depth means direct backend
    callers (tests, scripts, future routes that forget the
    higher-level guard) can no longer terminate an in-flight
    export and corrupt the user's partial output.
  * routes/inference.py standard chat-load path: the duplicate
    inline 'if exp_backend.current_checkpoint -> _shutdown_subprocess'
    block was removed. _release_export_for above already handles
    settled checkpoints and skips active ones; the inline block
    was the asymmetric surface flagged by round 11 review #2.

Routing / error mapping (P2)
  * routes/training.py start_training: an 'except HTTPException:
    raise' clause was inserted before the broad 'except Exception:'
    handler so the 409 raised by _raise_if_training_active /
    _raise_if_export_active reaches the client intact instead of
    being swallowed into a 500.

State publishing (P2)
  * core/inference/diffusion.py load_model: success path now
    clears _loading + _pending_* under _lock BEFORE returning
    self.status(), so the response payload reports the resident
    pipeline cleanly (no stale is_loading=true / pending_*). The
    finally block remains idempotent for error / early-raise paths.
  * core/inference/diffusion.py status(): nulls family /
    pipeline_class while a swap is in flight (pending_repo set
    and != active_repo). Previously the response paired pending
    model B's repo_id with model A's family, producing a
    combination that never existed.

Validation
  * models/inference.py: DiffusionLoadRequest.repo_id and
    base_repo length caps bumped from 256 to 1024; gguf_filename
    bumped from 256 to 512. The earlier caps rejected realistic
    Studio export paths (deeply nested outputs / exports
    directories, especially on Windows).

Dependencies
  * pyproject.toml huggingfacenotorch + studio/backend/
    requirements/no-torch-runtime.txt: floor gguf at >=0.10.0
    to match the diffusers requirement. The previously unconstrained
    requirement allowed a resolver to install older gguf releases
    that raise at single-file load time.
---
 pyproject.toml                                | 12 +--
 studio/backend/core/inference/diffusion.py    | 76 ++++++++++++++-----
 studio/backend/models/inference.py            | 15 +++-
 .../backend/requirements/no-torch-runtime.txt |  6 +-
 studio/backend/routes/inference.py            | 22 ++----
 studio/backend/routes/training.py             |  7 ++
 6 files changed, 95 insertions(+), 43 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index e401977496..106dcef3dd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -90,11 +90,13 @@ huggingfacenotorch = [
     # curated FLUX.2 klein model loads.
     "diffusers>=0.37.0",
     # diffusers.GGUFQuantizationConfig + from_single_file rely on
-    # the standalone gguf package at runtime. The Studio Images
-    # default curated picker is GGUF-only so this must install
-    # with the public huggingfacenotorch extra; missing it makes
-    # /api/inference/images/load 500 with "gguf>=0.10.0 required".
-    "gguf",
+    # the standalone gguf package at runtime. Floor at 0.10.0 to
+    # match the diffusers requirement; older gguf releases raise
+    # at load time. Studio Images default curated picker is
+    # GGUF-only so this must install with the public
+    # huggingfacenotorch extra; missing / under-pinned it makes
+    # /api/inference/images/load 500.
+    "gguf>=0.10.0",
     "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,!=4.57.0,!=4.57.4,!=4.57.5,!=5.0.0,!=5.1.0,<=5.5.0",
     "trl>=0.18.2,!=0.19.0,<=0.24.0",
     "sentence-transformers",
diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index c9cc7d2c87..ed4b59aa77 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -333,14 +333,26 @@ def status(self) -> dict[str, Any]:
             pending_repo = self._pending_repo_id if self._loading else None
             pending_base = self._pending_base_repo if self._loading else None
             pending_gguf = self._pending_gguf_filename if self._loading else None
+            # When a swap is in flight, the UI-facing repo_id /
+            # base_repo / gguf_filename advertise the PENDING model
+            # but ``self._family`` still points at the previously
+            # loaded pipeline. Reporting them together produces a
+            # repo/family pair that never existed (round 11 #6).
+            # Null the family / pipeline_class while a swap is in
+            # flight; the frontend can fall back to "unknown".
+            ui_family = self._family.name if self._family else None
+            ui_pipeline_class = (
+                self._family.pipeline_class if self._family else None
+            )
+            if pending_repo and pending_repo != active_repo:
+                ui_family = None
+                ui_pipeline_class = None
             return {
                 "is_loaded": self._pipe is not None,
                 "is_loading": self._loading,
                 "repo_id": pending_repo or active_repo,
-                "family": self._family.name if self._family else None,
-                "pipeline_class": (
-                    self._family.pipeline_class if self._family else None
-                ),
+                "family": ui_family,
+                "pipeline_class": ui_pipeline_class,
                 "base_repo": pending_base or active_base,
                 "gguf_filename": pending_gguf or gguf_basename,
                 # Guard-facing fields: every repo / path the backend
@@ -634,6 +646,15 @@ def load_model(
                     self._device = device
                     self._dtype = str(dtype).replace("torch.", "")
                     self._loaded_at = time.time()
+                    # Clear loading + pending here, BEFORE returning,
+                    # so the response payload reports the resident
+                    # pipeline cleanly (is_loading=false, no pending_*).
+                    # The ``finally`` block below is idempotent and
+                    # still clears on error / early raise paths.
+                    self._loading = False
+                    self._pending_repo_id = None
+                    self._pending_base_repo = None
+                    self._pending_gguf_filename = None
 
                 return self.status()
             except Exception as exc:
@@ -916,24 +937,45 @@ def _release_other_gpu_owners_for_diffusion() -> None:
     a diffusion load. Both can hold multi-GB of VRAM and would OOM the
     diffusion allocation on consumer GPUs."""
     # Export resident checkpoint. We tear down a SETTLED export
-    # (current_checkpoint populated) because that means the export
-    # ran to completion and the user can re-load the result, but we
-    # do NOT touch is_export_active() here: an in-flight export job
-    # has unfinished partial output that termination would corrupt.
-    # The route layer rejects /images/load with HTTP 409 via
-    # _raise_if_export_active when is_export_active() is True, so
-    # we only reach this helper when export is either idle or
-    # holding a previously completed checkpoint.
+    # (current_checkpoint populated AND is_export_active() False)
+    # because that means the export ran to completion and the user
+    # can re-load the result. An in-flight export job
+    # (is_export_active() True) is NEVER touched here: terminating
+    # it would corrupt the user's partial output artifact.
+    #
+    # The route layer also rejects /images/load with HTTP 409 via
+    # _raise_if_export_active when is_export_active() is True. This
+    # helper repeats the local check anyway so that direct backend
+    # callers (tests, scripts, future routes that forget the
+    # higher-level guard) cannot still kill an active export.
     try:
         from core.export import get_export_backend  # type: ignore
 
         exp = get_export_backend()
         if getattr(exp, "current_checkpoint", None):
-            logger.info("Shutting down idle export subprocess before diffusion load")
-            exp._shutdown_subprocess()
-            exp.current_checkpoint = None
-            exp.is_vision = False
-            exp.is_peft = False
+            is_export_active_fn = getattr(exp, "is_export_active", None)
+            export_is_active = False
+            if is_export_active_fn is not None:
+                try:
+                    export_is_active = bool(is_export_active_fn())
+                except Exception:
+                    # Unverifiable status -> treat as 'might be
+                    # active' and refuse to touch the subprocess.
+                    export_is_active = True
+            if export_is_active:
+                logger.info(
+                    "Skipping export shutdown for diffusion load: "
+                    "is_export_active=True (route layer should have "
+                    "rejected this request with 409)"
+                )
+            else:
+                logger.info(
+                    "Shutting down idle export subprocess before diffusion load"
+                )
+                exp._shutdown_subprocess()
+                exp.current_checkpoint = None
+                exp.is_vision = False
+                exp.is_peft = False
     except Exception as exc:
         logger.debug("export unload skipped: %s", exc)
 
diff --git a/studio/backend/models/inference.py b/studio/backend/models/inference.py
index 72fea9b851..5f232c062b 100644
--- a/studio/backend/models/inference.py
+++ b/studio/backend/models/inference.py
@@ -1459,16 +1459,23 @@ class DiffusionLoadRequest(BaseModel):
     VAE / text encoders when loading a GGUF-only repo.
     """
 
-    repo_id: str = Field(..., min_length = 1, max_length = 256, description = "HF repo id")
+    # repo_id and base_repo can be absolute local paths (Studio
+    # exports under deeply nested ``outputs/...`` directories,
+    # Windows paths with drive letter, etc.). 1024 chars matches
+    # POSIX PATH_MAX-class limits and Windows long-path support;
+    # the rounds-of-256 cap was rejecting realistic export paths.
+    repo_id: str = Field(
+        ..., min_length = 1, max_length = 1024, description = "HF repo id or local path"
+    )
     gguf_filename: Optional[str] = Field(
         None,
-        max_length = 256,
+        max_length = 512,
         description = "GGUF filename inside repo_id (Q4_K_S, Q8_0, ...)",
     )
     base_repo: Optional[str] = Field(
         None,
-        max_length = 256,
-        description = "Diffusers base repo to source VAE + text encoders from",
+        max_length = 1024,
+        description = "Diffusers base repo (HF id or local path) for VAE + text encoders",
     )
     family: Optional[str] = Field(
         None,
diff --git a/studio/backend/requirements/no-torch-runtime.txt b/studio/backend/requirements/no-torch-runtime.txt
index 6e4b904267..117de55c51 100644
--- a/studio/backend/requirements/no-torch-runtime.txt
+++ b/studio/backend/requirements/no-torch-runtime.txt
@@ -49,8 +49,10 @@ hf_transfer
 # Studio Images page imports for the default curated picker.
 diffusers>=0.37.0
 # Required by diffusers.GGUFQuantizationConfig (used by the Images page
-# to load FLUX.2 / FLUX.1 / Qwen-Image GGUFs from the Hub).
-gguf
+# to load FLUX.2 / FLUX.1 / Qwen-Image GGUFs from the Hub). Floor at
+# 0.10.0 to match the diffusers requirement; older gguf releases raise
+# at single-file load time.
+gguf>=0.10.0
 
 # Transitive deps required because this file is installed with --no-deps.
 # Without these, `from transformers import AutoConfig` fails at import time.
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index 8b812e7846..c4e247c2a1 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -1227,21 +1227,13 @@ async def load_model(
         except Exception as e:
             logger.debug("diffusion unload skipped: %s", e)
 
-        # Shut down any export subprocess to free VRAM
-        try:
-            from core.export import get_export_backend
-
-            exp_backend = get_export_backend()
-            if exp_backend.current_checkpoint:
-                logger.info(
-                    "Shutting down export subprocess to free GPU memory for inference"
-                )
-                exp_backend._shutdown_subprocess()
-                exp_backend.current_checkpoint = None
-                exp_backend.is_vision = False
-                exp_backend.is_peft = False
-        except Exception as e:
-            logger.warning("Could not shut down export subprocess: %s", e)
+        # Export was already dropped above via the shared
+        # ``await _release_export_for("safetensors chat")`` call
+        # (which checks is_export_active() before the destructive
+        # _shutdown_subprocess). The previous inline block here
+        # repeated the unconditional shutdown and would terminate
+        # an in-flight export job; round 11 review #2 flagged the
+        # asymmetry. The inline block is intentionally removed.
 
         # Auto-detect quantization for LoRA adapters from adapter_config.json
         # The training pipeline patches this file with "unsloth_training_method"
diff --git a/studio/backend/routes/training.py b/studio/backend/routes/training.py
index d8fcce0981..990be3df4b 100644
--- a/studio/backend/routes/training.py
+++ b/studio/backend/routes/training.py
@@ -325,6 +325,13 @@ async def start_training(
     except ValueError as e:
         logger.warning("Rejected training GPU selection: %s", e)
         raise HTTPException(status_code = 400, detail = str(e))
+    except HTTPException:
+        # Preserve the intended status code from
+        # _raise_if_training_active / _raise_if_export_active
+        # (409) and the gpu-id 400 raises above. Without this
+        # explicit re-raise the broad ``except Exception`` below
+        # converts a deliberate 409 into a 500.
+        raise
     except Exception as e:
         logger.error(f"Error starting training: {e}", exc_info = True)
         raise HTTPException(

From 921c60232e617310f7614f224e58536ce47fbfd2 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 04:33:03 +0000
Subject: [PATCH 34/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/diffusion.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index ed4b59aa77..e2da94b362 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -341,9 +341,7 @@ def status(self) -> dict[str, Any]:
             # Null the family / pipeline_class while a swap is in
             # flight; the frontend can fall back to "unknown".
             ui_family = self._family.name if self._family else None
-            ui_pipeline_class = (
-                self._family.pipeline_class if self._family else None
-            )
+            ui_pipeline_class = self._family.pipeline_class if self._family else None
             if pending_repo and pending_repo != active_repo:
                 ui_family = None
                 ui_pipeline_class = None

From 8b8980a607c6e77de7c87b1cba4b88b04bee4dcb Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 05:03:18 +0000
Subject: [PATCH 35/92] Fix/adjust diffusion: round 12 local-path GGUF +
 per-variant delete + MPS + base namespace for PR #5754

Round 12 reviewer findings.

Backend correctness (P1)
  * core/inference/diffusion.py load_model: GGUF branch now
    handles an absolute local directory passed as repo_id by
    joining Path(repo_id) / gguf_filename directly instead of
    handing the path to hf_hub_download (which raises
    HFValidationError because the path is not 'namespace/repo').
    Closes round 12 review #1 -- the load request advertised
    'local path' support but actually only worked for Hub repo ids.

Delete guard precision (P1)
  * routes/models.py /delete-finetuned + /delete-cached:
    diffusion guard now consults gguf_filename from status()
    and ALLOWS per-variant deletes that target a different quant
    than the one the loaded pipeline is reading. Loading
    'Q4_K_S' no longer blocks deleting 'Q8_0' from the same
    repo / export directory (round 12 reviews #3 and #4).

Accelerator (P2)
  * core/inference/diffusion.py _drain_cuda_cache: also calls
    torch.mps.empty_cache() when the MPS backend is available.
    Apple Silicon swaps now actually return held VRAM instead
    of leaving it pinned in the Metal allocator (round 12
    review #10).

Smart base repo (P2)
  * core/inference/diffusion.py _smart_base_repo: only inspects
    the LAST segment of the repo id / path for the 'base' / '9b'
    tokens. A namespace like baseorg/FLUX.2-klein-4B-GGUF or
    a parent directory like /home/me/.cache/base/... no
    longer falsely selects the Base variant (round 12 review #9).
---
 studio/backend/core/inference/diffusion.py | 57 ++++++++++++++++++----
 studio/backend/routes/models.py            | 51 +++++++++++++++++--
 2 files changed, 94 insertions(+), 14 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index e2da94b362..367ae4de56 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -155,12 +155,17 @@ def _smart_base_repo(fam: DiffusionFamily, repo_id: str) -> str:
     containing "9b" gets the 9B base, "base-4b" / "base-9b" map to the
     Base variants, everything else falls back to the family default
     (Apache 2.0 4B Base).
+
+    Only the LAST segment of the repo id / path is inspected so a
+    namespace or parent directory like ``baseorg/...`` or
+    ``/home/me/.cache/base/...`` does not falsely select the Base
+    variant (round 12 review #9).
     """
     if fam.name != "flux.2-klein":
         return fam.base_repo
-    lower = (repo_id or "").lower()
-    is_9b = "9b" in lower
-    is_base = "base" in lower
+    last_segment = (repo_id or "").rstrip("/").rsplit("/", 1)[-1].lower()
+    is_9b = "9b" in last_segment
+    is_base = "base" in last_segment
     if is_9b and is_base:
         return "black-forest-labs/FLUX.2-klein-base-9B"
     if is_9b:
@@ -529,11 +534,28 @@ def load_model(
                             f"Family {fam.name} does not have a GGUF transformer "
                             "path wired in this build; load the full repo instead."
                         )
-                    local_gguf_path = hf_hub_download(
-                        repo_id = repo_id,
-                        filename = gguf_filename,
-                        token = hf_token,
-                    )
+                    # DiffusionLoadRequest.repo_id is documented to
+                    # accept either a Hub repo id OR a local
+                    # absolute path (Studio export, downloaded HF
+                    # snapshot, etc.). Only the Hub case wants
+                    # hf_hub_download -- a local repo path passed
+                    # to it raises HFValidationError because
+                    # "/abs/path" is not "namespace/repo".
+                    repo_id_path = Path(repo_id).expanduser()
+                    if repo_id_path.is_absolute() and repo_id_path.is_dir():
+                        candidate = repo_id_path / gguf_filename
+                        if not candidate.is_file():
+                            raise RuntimeError(
+                                f"Local repo path '{repo_id}' does not contain "
+                                f"'{gguf_filename}'."
+                            )
+                        local_gguf_path = str(candidate)
+                    else:
+                        local_gguf_path = hf_hub_download(
+                            repo_id = repo_id,
+                            filename = gguf_filename,
+                            token = hf_token,
+                        )
 
                 # All cheap failure points (bad gguf_filename, missing
                 # pipeline / transformer class, gated download token,
@@ -1007,12 +1029,16 @@ def _release(obj: Any) -> None:
 
 
 def _drain_cuda_cache() -> None:
-    """Hand freed weights back to the CUDA allocator.
+    """Hand freed weights back to the active accelerator's allocator.
 
     Call this AFTER every reference to the freed object has been
     dropped (caller's local + attribute) and a ``gc.collect()`` has
     fired __del__. Calling earlier would empty an already-pinned
-    cache and not actually release the memory."""
+    cache and not actually release the memory.
+
+    Handles CUDA *and* MPS (Apple Silicon) so a diffusion swap on
+    macOS actually returns VRAM to the Metal allocator.
+    """
     try:
         import torch
 
@@ -1020,6 +1046,17 @@ def _drain_cuda_cache() -> None:
             torch.cuda.empty_cache()
     except Exception:
         pass
+    try:
+        import torch
+
+        mps_backend = getattr(getattr(torch, "backends", None), "mps", None)
+        if mps_backend is not None and mps_backend.is_available():
+            mps_module = getattr(torch, "mps", None)
+            empty_cache = getattr(mps_module, "empty_cache", None) if mps_module else None
+            if empty_cache is not None:
+                empty_cache()
+    except Exception:
+        pass
 
 
 # ─── Module-level singleton ───────────────────────────────────────────
diff --git a/studio/backend/routes/models.py b/studio/backend/routes/models.py
index 7a1b238065..82bfa8865b 100644
--- a/studio/backend/routes/models.py
+++ b/studio/backend/routes/models.py
@@ -2002,6 +2002,19 @@ async def delete_finetuned_model(
                 if v:
                     candidates.append(v)
             target_str = str(target_path)
+            # Per-variant deletes only touch ``_delete_gguf_variant_
+            # files(target_path, gguf_variant)`` which removes a
+            # specific quant file. If the loaded pipeline uses a
+            # DIFFERENT variant from the same directory, the delete
+            # is safe. Round 12 review #3.
+            loaded_gguf = (
+                diff_status.get("gguf_filename") or ""
+            ).lower()
+            wants_variant = (
+                export_type == "gguf"
+                and gguf_variant
+                and loaded_gguf
+            )
             for candidate in candidates:
                 try:
                     candidate_path = Path(candidate).expanduser()
@@ -2022,6 +2035,15 @@ async def delete_finetuned_model(
                     or _is_path_under(candidate_resolved, target_path)
                     or _is_path_under(target_path, candidate_resolved)
                 ):
+                    # Allow per-variant deletes that target a
+                    # different quant than the loaded one.
+                    if wants_variant:
+                        variant_low = gguf_variant.lower()
+                        loaded_label = (
+                            _extract_quant_label(loaded_gguf) or ""
+                        ).lower()
+                        if loaded_label and loaded_label != variant_low:
+                            continue
                     raise HTTPException(
                         status_code = 400,
                         detail = "Unload the diffusion image model before deleting",
@@ -2769,10 +2791,31 @@ async def delete_cached_model(
             }
             owned.discard("")
             if needle in owned:
-                raise HTTPException(
-                    status_code = 400,
-                    detail = "Unload the diffusion image model before deleting",
-                )
+                # Per-variant delete only touches the requested
+                # quant via ``_delete_gguf_variant_files``. If the
+                # loaded pipeline uses a DIFFERENT variant from the
+                # same repo, the delete is safe. Round 12 review #4.
+                loaded_gguf = (
+                    diff_status.get("gguf_filename") or ""
+                ).lower()
+                if variant and loaded_gguf:
+                    variant_low = variant.lower()
+                    loaded_label = (
+                        _extract_quant_label(loaded_gguf) or ""
+                    ).lower()
+                    if loaded_label and loaded_label != variant_low:
+                        # Different quant from the same repo -> allow.
+                        pass
+                    else:
+                        raise HTTPException(
+                            status_code = 400,
+                            detail = "Unload the diffusion image model before deleting",
+                        )
+                else:
+                    raise HTTPException(
+                        status_code = 400,
+                        detail = "Unload the diffusion image model before deleting",
+                    )
     except HTTPException:
         raise
     except Exception as e:

From d8b785a4e237e9a0aa8fd6442e14ec135c0b7dd1 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 05:03:41 +0000
Subject: [PATCH 36/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/diffusion.py |  4 +++-
 studio/backend/routes/models.py            | 22 +++++-----------------
 2 files changed, 8 insertions(+), 18 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 367ae4de56..93af5d2fd6 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -1052,7 +1052,9 @@ def _drain_cuda_cache() -> None:
         mps_backend = getattr(getattr(torch, "backends", None), "mps", None)
         if mps_backend is not None and mps_backend.is_available():
             mps_module = getattr(torch, "mps", None)
-            empty_cache = getattr(mps_module, "empty_cache", None) if mps_module else None
+            empty_cache = (
+                getattr(mps_module, "empty_cache", None) if mps_module else None
+            )
             if empty_cache is not None:
                 empty_cache()
     except Exception:
diff --git a/studio/backend/routes/models.py b/studio/backend/routes/models.py
index 82bfa8865b..570f024bf8 100644
--- a/studio/backend/routes/models.py
+++ b/studio/backend/routes/models.py
@@ -2007,14 +2007,8 @@ async def delete_finetuned_model(
             # specific quant file. If the loaded pipeline uses a
             # DIFFERENT variant from the same directory, the delete
             # is safe. Round 12 review #3.
-            loaded_gguf = (
-                diff_status.get("gguf_filename") or ""
-            ).lower()
-            wants_variant = (
-                export_type == "gguf"
-                and gguf_variant
-                and loaded_gguf
-            )
+            loaded_gguf = (diff_status.get("gguf_filename") or "").lower()
+            wants_variant = export_type == "gguf" and gguf_variant and loaded_gguf
             for candidate in candidates:
                 try:
                     candidate_path = Path(candidate).expanduser()
@@ -2039,9 +2033,7 @@ async def delete_finetuned_model(
                     # different quant than the loaded one.
                     if wants_variant:
                         variant_low = gguf_variant.lower()
-                        loaded_label = (
-                            _extract_quant_label(loaded_gguf) or ""
-                        ).lower()
+                        loaded_label = (_extract_quant_label(loaded_gguf) or "").lower()
                         if loaded_label and loaded_label != variant_low:
                             continue
                     raise HTTPException(
@@ -2795,14 +2787,10 @@ async def delete_cached_model(
                 # quant via ``_delete_gguf_variant_files``. If the
                 # loaded pipeline uses a DIFFERENT variant from the
                 # same repo, the delete is safe. Round 12 review #4.
-                loaded_gguf = (
-                    diff_status.get("gguf_filename") or ""
-                ).lower()
+                loaded_gguf = (diff_status.get("gguf_filename") or "").lower()
                 if variant and loaded_gguf:
                     variant_low = variant.lower()
-                    loaded_label = (
-                        _extract_quant_label(loaded_gguf) or ""
-                    ).lower()
+                    loaded_label = (_extract_quant_label(loaded_gguf) or "").lower()
                     if loaded_label and loaded_label != variant_low:
                         # Different quant from the same repo -> allow.
                         pass

From ae41bfdbfdaa21551d2a427b7755f2b63e65eed2 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 05:57:32 +0000
Subject: [PATCH 37/92] Fix/adjust diffusion: round 13 P1+P2 batch for PR #5754

Round 13 reviewer aggregate (logs/review_round13_aggregate.md):

P1 fixes:
- routes/export.py load_checkpoint refuses (409) when an export job
  is currently active, mirroring the chat/diffusion/training handoff
  guards. ``is_export_active`` absence is tolerated for older / mocked
  backends.
- core/inference/diffusion.py local-path GGUF loader now accepts
  relative directories (Studio exports surface as ``exports/my-flux``)
  and confines ``gguf_filename`` to the chosen repo via
  ``_resolve_local_gguf_child``: absolute filenames, ``..`` segments,
  and Windows separators are rejected before any file is opened.
- core/inference/diffusion.py status() exposes ``active_gguf_filename``
  alongside the pending variant so delete guards can pair each owned
  repo with the GGUF variant it actually owns.
- routes/models.py cache delete + finetuned delete adopt a shared
  ``_diffusion_owned_targets`` + ``_variant_delete_is_safe_for_owned_gguf``
  helper. Per-variant deletes during a swap-in-flight cannot remove
  the active variant while the pending variant is loading.
- core/inference/llama_cpp.py publishes ``loading_model_identifier``
  before ``_download_gguf`` starts and clears it in ``finally``. Cache
  delete (routes/models.py) and the cross-workload release helpers
  (routes/inference.py::_release_llama_for and
  diffusion.py::_release_chat_backend_for_diffusion) consult it so an
  in-flight multi-GB HF download can neither be rmtree'd by a cache
  delete nor be overlooked by /images/load.

P2 fixes:
- core/inference/diffusion.py adds
  ``generate_image_with_metadata`` + ``async_generate_with_metadata``;
  /images/generate uses it so the response model/family reflect the
  pipeline that actually produced the image even if an unload races
  the route.
- core/inference/diffusion.py: ``base_repo`` only applies when picking
  a GGUF quant. Supplying ``base_repo`` while loading a full diffusers
  repo no longer silently swaps the load target.
- core/inference/diffusion.py: failed device placement / offload now
  drops pipe + transformer references explicitly before drain so
  partial allocations cannot keep VRAM around.
- core/inference/diffusion.py: torch/diffusers imports surface as a
  clear RuntimeError naming the missing dependency.
- core/inference/diffusion.py: _smart_base_repo splits on both POSIX
  and Windows separators so ``C:\Users\me\base\FLUX.2-klein-4B-GGUF``
  no longer picks the Base 4B variant via the parent dir.

Tests:
- 6 new regression cases (Windows leaf, traversal/backslash rejection,
  relative-dir local load, metadata snapshot, lock serialisation).
- All 59 diffusion backend + route tests pass.
---
 studio/backend/core/inference/diffusion.py    | 376 +++++++++++++-----
 studio/backend/core/inference/llama_cpp.py    |  71 +++-
 studio/backend/routes/export.py               |  34 ++
 studio/backend/routes/inference.py            |  35 +-
 studio/backend/routes/models.py               | 164 +++++---
 .../backend/tests/test_diffusion_backend.py   | 225 +++++++++++
 studio/backend/tests/test_diffusion_routes.py |  16 +
 7 files changed, 716 insertions(+), 205 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 93af5d2fd6..335e93c957 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -35,10 +35,11 @@
 import asyncio
 import gc
 import io
+import re
 import threading
 import time
 from dataclasses import dataclass, field
-from pathlib import Path
+from pathlib import Path, PurePosixPath
 from typing import Any, Optional
 
 from loggers import get_logger
@@ -159,11 +160,15 @@ def _smart_base_repo(fam: DiffusionFamily, repo_id: str) -> str:
     Only the LAST segment of the repo id / path is inspected so a
     namespace or parent directory like ``baseorg/...`` or
     ``/home/me/.cache/base/...`` does not falsely select the Base
-    variant (round 12 review #9).
+    variant (round 12 review #9). Splits on BOTH ``/`` and ``\\`` so
+    Windows local paths like ``C:\\Users\\me\\base\\FLUX.2-klein-4B``
+    do not get scored as "base" via the parent directory either
+    (round 13 P2 #13).
     """
     if fam.name != "flux.2-klein":
         return fam.base_repo
-    last_segment = (repo_id or "").rstrip("/").rsplit("/", 1)[-1].lower()
+    cleaned = (repo_id or "").rstrip("/\\")
+    last_segment = re.split(r"[\\/]+", cleaned)[-1].lower() if cleaned else ""
     is_9b = "9b" in last_segment
     is_base = "base" in last_segment
     if is_9b and is_base:
@@ -177,6 +182,47 @@ def _smart_base_repo(fam: DiffusionFamily, repo_id: str) -> str:
     return "black-forest-labs/FLUX.2-klein-4B"
 
 
+def _resolve_local_gguf_child(repo_root: Path, gguf_filename: str) -> Path:
+    """Resolve a GGUF filename inside a local repo directory safely.
+
+    Returns the resolved absolute path or raises ``RuntimeError`` if:
+    - ``gguf_filename`` is absolute (``/etc/passwd``) or contains a
+      Windows separator (``..\\..\\secret.gguf``);
+    - the parts contain ``""`` / ``.`` / ``..`` (``../other.gguf``);
+    - the resolved candidate escapes ``repo_root`` after symlinks /
+      ``..`` collapse;
+    - the resolved candidate is not a regular file.
+
+    This is the only path that bridges a user-supplied ``gguf_filename``
+    string into ``Path``s the loader opens, so confining it to the
+    chosen repo here protects the delete-ownership guards downstream
+    (round 13 P1 #2). ``hf_hub_download`` already enforces the same
+    invariant for Hub repos.
+    """
+    if Path(gguf_filename).is_absolute() or "\\" in gguf_filename:
+        raise RuntimeError(
+            "gguf_filename must be a relative file path inside repo_id."
+        )
+    rel = PurePosixPath(gguf_filename)
+    if any(part in ("", ".", "..") for part in rel.parts):
+        raise RuntimeError(
+            "gguf_filename must not contain empty, '.', or '..' segments."
+        )
+    root = repo_root.expanduser().resolve(strict = True)
+    candidate = (root / Path(*rel.parts)).resolve(strict = True)
+    try:
+        candidate.relative_to(root)
+    except ValueError as exc:
+        raise RuntimeError(
+            "gguf_filename must stay inside the local repo_id directory."
+        ) from exc
+    if not candidate.is_file():
+        raise RuntimeError(
+            f"Local repo path '{repo_root}' does not contain '{gguf_filename}'."
+        )
+    return candidate
+
+
 # Negative substrings that disqualify a candidate family even when its
 # name appears as a substring of the repo id. Prevents
 # "stable-diffusion-3" matching SD3.5 and "qwen-image" matching
@@ -335,6 +381,7 @@ def status(self) -> dict[str, Any]:
             # user just clicked.
             active_repo = self._repo_id
             active_base = self._base_repo
+            active_gguf = gguf_basename
             pending_repo = self._pending_repo_id if self._loading else None
             pending_base = self._pending_base_repo if self._loading else None
             pending_gguf = self._pending_gguf_filename if self._loading else None
@@ -357,11 +404,15 @@ def status(self) -> dict[str, Any]:
                 "family": ui_family,
                 "pipeline_class": ui_pipeline_class,
                 "base_repo": pending_base or active_base,
-                "gguf_filename": pending_gguf or gguf_basename,
-                # Guard-facing fields: every repo / path the backend
-                # owns RIGHT NOW. Delete routes iterate both.
+                "gguf_filename": pending_gguf or active_gguf,
+                # Guard-facing fields: every repo / path / GGUF
+                # filename the backend owns RIGHT NOW. Delete routes
+                # iterate both, paired so the variant-filename check
+                # is compared against the SAME repo that owns it
+                # (round 13 P1 #3-5).
                 "active_repo_id": active_repo,
                 "active_base_repo": active_base,
+                "active_gguf_filename": active_gguf,
                 "pending_repo_id": pending_repo,
                 "pending_base_repo": pending_base,
                 "pending_gguf_filename": pending_gguf,
@@ -431,9 +482,24 @@ def load_model(
         keep peak VRAM bounded; status() reports is_loaded=false with
         last_error set so the caller can react.
         """
-        from huggingface_hub import hf_hub_download
-        import diffusers
-        import torch
+        # Surface a friendly load error when the no-torch / partial
+        # install path is active: the user clicked Load on the Images
+        # page but the runtime never installed torch + diffusers (round
+        # 13 P2 #12). Without this wrapper the import surfaces as a
+        # raw ``ModuleNotFoundError`` -> 500 instead of a 400 the UI
+        # can display.
+        try:
+            from huggingface_hub import hf_hub_download
+            import diffusers
+            import torch
+        except ModuleNotFoundError as exc:
+            missing = exc.name or str(exc)
+            raise RuntimeError(
+                "Diffusion image generation requires the torch / diffusers "
+                f"runtime. Missing dependency: {missing}. Install the Studio "
+                "torch runtime (re-run setup.sh / install.ps1) before "
+                "loading an image model."
+            ) from exc
 
         fam = detect_family(repo_id, override_family = family_override)
         if fam is None:
@@ -485,19 +551,21 @@ def load_model(
 
                 # Resolution rules for the "what repo to call
                 # from_pretrained on" question:
-                #   1. caller-supplied base_repo wins
-                #   2. if no GGUF file was requested the user is loading a
-                #      full diffusers repo; use repo_id directly so we do
+                #   1. no GGUF file -> caller is loading a full
+                #      diffusers repo; use repo_id directly so we do
                 #      not silently substitute the family default
-                #   3. otherwise use the family + repo_id heuristic so a
-                #      9B GGUF picks the 9B base, not the 4B fallback
-                if base_repo:
-                    effective_base = base_repo
-                    # Refresh pending so delete guards see the actual
-                    # base, not just caller-supplied None.
-                    with self._lock:
-                        self._pending_base_repo = effective_base
-                elif not gguf_filename:
+                #      AND ignore any base_repo input (it is only
+                #      meaningful as a GGUF companion override). The
+                #      old order let ``base_repo`` swap a fine-tuned
+                #      ``owner/my-flux.1-finetune`` for
+                #      ``black-forest-labs/FLUX.1-dev`` while status
+                #      still advertised the user's repo (round 13
+                #      P2 #10).
+                #   2. otherwise prefer caller-supplied base_repo for
+                #      the missing VAE / text encoder components.
+                #   3. otherwise use the family + repo_id heuristic so
+                #      a 9B GGUF picks the 9B base, not the 4B fallback.
+                if not gguf_filename:
                     # Guard: a repo that ends in "-GGUF" (the unsloth
                     # convention) is GGUF-only and will 500 on
                     # from_pretrained; surface a clear error instead of
@@ -507,12 +575,18 @@ def load_model(
                         raise RuntimeError(
                             f"'{repo_id}' looks like a GGUF-only repo. "
                             "Either provide gguf_filename to pick a quant, "
-                            "or pass base_repo to override the full-repo "
-                            "load target."
+                            "or load a full diffusers repo (base_repo only "
+                            "applies when picking a GGUF quant)."
                         )
                     effective_base = repo_id
                     with self._lock:
                         self._pending_base_repo = effective_base
+                elif base_repo:
+                    effective_base = base_repo
+                    # Refresh pending so delete guards see the actual
+                    # base, not just caller-supplied None.
+                    with self._lock:
+                        self._pending_base_repo = effective_base
                 else:
                     effective_base = _smart_base_repo(fam, repo_id)
                     with self._lock:
@@ -535,21 +609,26 @@ def load_model(
                             "path wired in this build; load the full repo instead."
                         )
                     # DiffusionLoadRequest.repo_id is documented to
-                    # accept either a Hub repo id OR a local
-                    # absolute path (Studio export, downloaded HF
-                    # snapshot, etc.). Only the Hub case wants
-                    # hf_hub_download -- a local repo path passed
-                    # to it raises HFValidationError because
-                    # "/abs/path" is not "namespace/repo".
+                    # accept either a Hub repo id OR a local path
+                    # (Studio export, downloaded HF snapshot, etc.).
+                    # We accept BOTH absolute and relative local
+                    # directories: Studio exports surface as relative
+                    # paths like ``exports/my-flux`` and earlier
+                    # versions only accepted absolute paths, falling
+                    # through to ``hf_hub_download`` which then
+                    # raised HFValidationError on the relative path
+                    # (round 13 P1 #2). For local paths we route the
+                    # gguf_filename through ``_resolve_local_gguf_child``
+                    # so traversal (``../secret.gguf``) and absolute
+                    # filename escapes (``/etc/passwd``) are rejected
+                    # BEFORE the file is opened, which also keeps the
+                    # delete-ownership guards aligned with what was
+                    # actually loaded.
                     repo_id_path = Path(repo_id).expanduser()
-                    if repo_id_path.is_absolute() and repo_id_path.is_dir():
-                        candidate = repo_id_path / gguf_filename
-                        if not candidate.is_file():
-                            raise RuntimeError(
-                                f"Local repo path '{repo_id}' does not contain "
-                                f"'{gguf_filename}'."
-                            )
-                        local_gguf_path = str(candidate)
+                    if repo_id_path.is_dir():
+                        local_gguf_path = str(
+                            _resolve_local_gguf_child(repo_id_path, gguf_filename)
+                        )
                     else:
                         local_gguf_path = hf_hub_download(
                             repo_id = repo_id,
@@ -638,24 +717,31 @@ def load_model(
                 if hf_token:
                     pipe_kwargs["token"] = hf_token
 
+                pipe = None
                 try:
                     pipe = pipeline_cls.from_pretrained(effective_base, **pipe_kwargs)
+                    # Device placement / offload can ALSO raise after
+                    # from_pretrained succeeded (OOM at the .to(device)
+                    # copy, accelerate offload hook misconfigured, etc.).
+                    # If that exception escaped unhandled, the local
+                    # ``pipe`` would live on the traceback frame until the
+                    # caller drops it, holding multi-GB of VRAM during
+                    # the next load attempt. Placement therefore stays
+                    # inside this try, whose except handler releases
+                    # both pipe and transformer (round 13 P2 #11).
+                    if enable_model_cpu_offload and device == "cuda":
+                        pipe.enable_model_cpu_offload()
+                    else:
+                        pipe.to(device)
                 except Exception:
-                    # If from_pretrained fails after the transformer was
-                    # already loaded, the transformer object holds GPU
-                    # weights that would only be freed at GC. Drop the
-                    # local reference and force a collect so the next
-                    # load attempt does not stack VRAM with a phantom
-                    # transformer.
+                    if pipe is not None:
+                        _release(pipe)
+                        pipe = None
                     if transformer is not None:
                         _release(transformer)
                         transformer = None
-                        _drain_cuda_cache()
+                    _drain_cuda_cache()
                     raise
-                if enable_model_cpu_offload and device == "cuda":
-                    pipe.enable_model_cpu_offload()
-                else:
-                    pipe.to(device)
 
                 with self._lock:
                     self._pipe = pipe
@@ -775,6 +861,39 @@ def generate_image(
         requests for the entire (minutes-long) generation, which made
         the UI feel frozen.
         """
+        # Take _generate_lock FIRST so a concurrent unload/load that
+        # observes us holding it will queue behind this generation
+        # (and `unload_model` then waits its turn before clearing
+        # state). Snapshotting `self._pipe` outside the lock and then
+        # taking the lock let a load/unload race in between, so the
+        # forward could run against a freed or swapped pipeline.
+        with self._generate_lock:
+            return self._generate_image_unlocked(
+                prompt = prompt,
+                negative_prompt = negative_prompt,
+                num_inference_steps = num_inference_steps,
+                guidance_scale = guidance_scale,
+                width = width,
+                height = height,
+                seed = seed,
+            )
+
+    def _generate_image_unlocked(
+        self,
+        *,
+        prompt: str,
+        negative_prompt: Optional[str] = None,
+        num_inference_steps: int = 24,
+        guidance_scale: float = 3.5,
+        width: int = 1024,
+        height: int = 1024,
+        seed: Optional[int] = None,
+    ) -> "Any":
+        """Inner body of ``generate_image`` that ASSUMES the caller
+        already holds ``_generate_lock``. Lets
+        ``generate_image_with_metadata`` snapshot metadata under the
+        same lock without deadlocking on a non-reentrant
+        ``threading.Lock`` (round 13 P2 #9)."""
         if not prompt or not prompt.strip():
             raise ValueError("prompt is empty")
         if num_inference_steps < 1 or num_inference_steps > 200:
@@ -789,64 +908,81 @@ def generate_image(
 
         import torch
 
-        # Take _generate_lock FIRST so a concurrent unload/load that
-        # observes us holding it will queue behind this generation
-        # (and `unload_model` then waits its turn before clearing
-        # state). Snapshotting `self._pipe` outside the lock and then
-        # taking the lock let a load/unload race in between, so the
-        # forward could run against a freed or swapped pipeline.
-        with self._generate_lock:
-            with self._lock:
-                if self._pipe is None:
-                    raise RuntimeError("No diffusion model is loaded.")
-                pipe = self._pipe
-                device = self._device or "cpu"
-            generator = None
-            if seed is not None:
-                # Match the device of the pipeline so determinism holds
-                # across reload cycles. For CPU offload, the noise still
-                # has to live on the device the diffusion forward runs on.
-                gen_device = (
-                    "cuda" if device == "cuda" and torch.cuda.is_available() else "cpu"
+        with self._lock:
+            if self._pipe is None:
+                raise RuntimeError("No diffusion model is loaded.")
+            pipe = self._pipe
+            device = self._device or "cpu"
+        generator = None
+        if seed is not None:
+            # Match the device of the pipeline so determinism holds
+            # across reload cycles. For CPU offload, the noise still
+            # has to live on the device the diffusion forward runs on.
+            gen_device = (
+                "cuda" if device == "cuda" and torch.cuda.is_available() else "cpu"
+            )
+            generator = torch.Generator(device = gen_device).manual_seed(int(seed))
+
+        call_kwargs: dict[str, Any] = {
+            "prompt": prompt,
+            "num_inference_steps": int(num_inference_steps),
+            "guidance_scale": float(guidance_scale),
+            "width": int(width),
+            "height": int(height),
+        }
+        # FLUX.2 / FLUX.2 klein pipelines do NOT accept
+        # negative_prompt and 500 if you pass it in. Inspect the
+        # signature and only forward when supported; warn otherwise
+        # so the UI can disable the field for incompatible families.
+        if negative_prompt is not None and negative_prompt.strip():
+            if _pipe_accepts_kwarg(pipe, "negative_prompt"):
+                call_kwargs["negative_prompt"] = negative_prompt
+                # QwenImagePipeline and FluxPipeline treat
+                # guidance_scale as distilled CFG and use
+                # true_cfg_scale as the real classifier-free
+                # guidance knob; the negative prompt is only
+                # effective when true_cfg_scale > 1. Forward the
+                # user-supplied guidance_scale through both so the
+                # negative prompt actually steers generation.
+                if _pipe_accepts_kwarg(pipe, "true_cfg_scale"):
+                    call_kwargs["true_cfg_scale"] = float(guidance_scale)
+            else:
+                logger.info(
+                    "Dropping negative_prompt: %s does not accept it",
+                    type(pipe).__name__,
                 )
-                generator = torch.Generator(device = gen_device).manual_seed(int(seed))
-
-            call_kwargs: dict[str, Any] = {
-                "prompt": prompt,
-                "num_inference_steps": int(num_inference_steps),
-                "guidance_scale": float(guidance_scale),
-                "width": int(width),
-                "height": int(height),
-            }
-            # FLUX.2 / FLUX.2 klein pipelines do NOT accept
-            # negative_prompt and 500 if you pass it in. Inspect the
-            # signature and only forward when supported; warn otherwise
-            # so the UI can disable the field for incompatible families.
-            if negative_prompt is not None and negative_prompt.strip():
-                if _pipe_accepts_kwarg(pipe, "negative_prompt"):
-                    call_kwargs["negative_prompt"] = negative_prompt
-                    # QwenImagePipeline and FluxPipeline treat
-                    # guidance_scale as distilled CFG and use
-                    # true_cfg_scale as the real classifier-free
-                    # guidance knob; the negative prompt is only
-                    # effective when true_cfg_scale > 1. Forward the
-                    # user-supplied guidance_scale through both so the
-                    # negative prompt actually steers generation.
-                    if _pipe_accepts_kwarg(pipe, "true_cfg_scale"):
-                        call_kwargs["true_cfg_scale"] = float(guidance_scale)
-                else:
-                    logger.info(
-                        "Dropping negative_prompt: %s does not accept it",
-                        type(pipe).__name__,
-                    )
-            if generator is not None:
-                call_kwargs["generator"] = generator
+        if generator is not None:
+            call_kwargs["generator"] = generator
 
-            out = pipe(**call_kwargs)
-            images = getattr(out, "images", None) or []
-            if not images:
-                raise RuntimeError("Diffusion pipeline returned no images.")
-            return images[0]
+        out = pipe(**call_kwargs)
+        images = getattr(out, "images", None) or []
+        if not images:
+            raise RuntimeError("Diffusion pipeline returned no images.")
+        return images[0]
+
+    def generate_image_with_metadata(
+        self,
+        **kwargs: Any,
+    ) -> tuple[Any, dict[str, Any]]:
+        """Run one generation and report which pipeline produced it.
+
+        Returns ``(pil_image, {"model": <repo_id>, "family": <name>})``.
+        ``self._repo_id`` / ``self._family`` are read under ``_lock``
+        before ``_generate_lock`` is released, so the metadata is
+        captured in the same critical section as the forward instead
+        of by a later ``status()`` call that a queued unload / load
+        could race (round 13 P2 #9). ``kwargs`` are forwarded to
+        ``_generate_image_unlocked`` unchanged.
+        """
+        with self._generate_lock:
+            picture = self._generate_image_unlocked(**kwargs)
+            with self._lock:
+                family = self._family
+                snapshot = {
+                    "model": self._repo_id,
+                    "family": family.name if family else None,
+                }
+        return picture, snapshot
 
 
 def _pipe_accepts_kwarg(pipe: Any, name: str) -> bool:
@@ -895,21 +1031,24 @@ def _release_chat_backend_for_diffusion() -> None:
     """
     # 1. GGUF chat backend (llama-server subprocess). We unload when
     #    EITHER is_loaded is True (resident model) OR is_active is
-    #    True (mid-download / startup); the latter case is the
-    #    "llama-server is currently starting" race where weights are
-    #    being downloaded and the diffusion load would otherwise
-    #    double-spend GPU memory.
+    #    True (mid-download / startup) OR loading_model_identifier is
+    #    populated (HF GGUF download in progress, before is_active /
+    #    is_loaded flip). The last case is what round 13 P1 #8 flagged:
+    #    a multi-GB HF download from one workload + a diffusion load
+    #    racing on the same GPU would otherwise both end up live.
     try:
         from routes.inference import get_llama_cpp_backend  # type: ignore
 
         backend = get_llama_cpp_backend()
         is_loaded = bool(getattr(backend, "is_loaded", False))
         is_active = bool(getattr(backend, "is_active", False))
-        if is_loaded or is_active:
+        is_loading = bool(getattr(backend, "loading_model_identifier", None))
+        if is_loaded or is_active or is_loading:
             logger.info(
-                "Unloading llama-server (loaded=%s active=%s) before diffusion load",
+                "Unloading llama-server (loaded=%s active=%s loading=%s) before diffusion load",
                 is_loaded,
                 is_active,
+                is_loading,
             )
             backend.unload_model()
     except Exception as exc:
@@ -1086,3 +1225,20 @@ async def async_generate(
     do not block the event loop for the 5-30 s a diffusion step takes."""
     loop = asyncio.get_event_loop()
     return await loop.run_in_executor(None, lambda: backend.generate_image(**kwargs))
+
+
+async def async_generate_with_metadata(
+    backend: DiffusionBackend,
+    **kwargs: Any,
+) -> tuple[Any, dict[str, Any]]:
+    """Run ``generate_image_with_metadata`` in the default executor.
+
+    Offloads the blocking generation to a worker thread and returns
+    its ``(image, metadata)`` tuple, so the /images/generate route can
+    report the model / family captured with the image rather than
+    re-reading backend status afterwards (round 13 P2 #9)."""
+    loop = asyncio.get_running_loop()
+    return await loop.run_in_executor(
+        None,
+        lambda: backend.generate_image_with_metadata(**kwargs),
+    )
diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index bf8a3c04df..57cda178a9 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -612,6 +612,13 @@ def __init__(self):
         self._process: Optional[subprocess.Popen] = None
         self._port: Optional[int] = None
         self._model_identifier: Optional[str] = None
+        # Pending-load identifier: set BEFORE _download_gguf starts and
+        # cleared once the download / local-path check ends (success or
+        # failure), just before ``_model_identifier`` is assigned. Delete
+        # guards and cross-workload handoff helpers read it via
+        # ``loading_model_identifier`` so a multi-GB HF download is not
+        # rmtree'd or ignored by /images/load, /training/start, /export/load.
+        self._loading_model_identifier: Optional[str] = None
         self._gguf_path: Optional[str] = None
         self._hf_repo: Optional[str] = None
         self._hf_variant: Optional[str] = None
@@ -713,6 +720,19 @@ def base_url(self) -> str:
     def model_identifier(self) -> Optional[str]:
         return self._model_identifier
 
+    @property
+    def loading_model_identifier(self) -> Optional[str]:
+        """Identifier of a load whose GGUF is still being resolved, or None.
+
+        ``load_model`` sets this just before the download / local-path
+        check and clears it in that step's ``finally``, i.e. before
+        ``model_identifier`` is assigned, so a failed load leaves it
+        None. Delete guards in ``routes/models.py`` and the llama
+        handoff helpers read it so an in-flight HF download is neither
+        rmtree'd nor mistaken for an idle llama-server by a concurrent
+        /images/load."""
+        return self._loading_model_identifier
+
     @property
     def is_vision(self) -> bool:
         return self._is_vision
@@ -2673,25 +2693,44 @@ def load_model(
             # Scope HF_HUB_OFFLINE to the download block only when DNS is
             # dead; cleanup runs even on exception so a transient hiccup
             # at the start of one load cannot quarantine future loads.
-            if hf_repo:
-                with _hf_offline_if_dns_dead():
-                    model_path = self._download_gguf(
-                        hf_repo = hf_repo,
-                        hf_variant = hf_variant,
-                        hf_token = hf_token,
-                    )
-                    # Auto-download mmproj for vision models
-                    if is_vision and not mmproj_path:
-                        mmproj_path = self._download_mmproj(
+            #
+            # Publish ``_loading_model_identifier`` BEFORE entering the
+            # download so /delete-cached and the cross-workload handoff
+            # helpers can see a multi-GB pending load: previously they
+            # only consulted ``model_identifier``, which the success
+            # path sets later (see "Set identifier early" below). That
+            # left a window where the user could rmtree the cache the
+            # download was still writing to, or start /images/load
+            # while llama-server was about to come up on the same GPU.
+            # Cleared in ``finally`` so failed / cancelled loads do not
+            # leak the pending state.
+            self._loading_model_identifier = model_identifier
+            try:
+                if hf_repo:
+                    with _hf_offline_if_dns_dead():
+                        model_path = self._download_gguf(
                             hf_repo = hf_repo,
+                            hf_variant = hf_variant,
                             hf_token = hf_token,
                         )
-            elif gguf_path:
-                if not Path(gguf_path).is_file():
-                    raise FileNotFoundError(f"GGUF file not found: {gguf_path}")
-                model_path = gguf_path
-            else:
-                raise ValueError("Either gguf_path or hf_repo must be provided")
+                        # Auto-download mmproj for vision models
+                        if is_vision and not mmproj_path:
+                            mmproj_path = self._download_mmproj(
+                                hf_repo = hf_repo,
+                                hf_token = hf_token,
+                            )
+                elif gguf_path:
+                    if not Path(gguf_path).is_file():
+                        raise FileNotFoundError(
+                            f"GGUF file not found: {gguf_path}"
+                        )
+                    model_path = gguf_path
+                else:
+                    raise ValueError(
+                        "Either gguf_path or hf_repo must be provided"
+                    )
+            finally:
+                self._loading_model_identifier = None
 
             # Set identifier early so _read_gguf_metadata can use it for DeepSeek detection
             self._model_identifier = model_identifier
diff --git a/studio/backend/routes/export.py b/studio/backend/routes/export.py
index aabcd1920b..11c5004ced 100644
--- a/studio/backend/routes/export.py
+++ b/studio/backend/routes/export.py
@@ -142,6 +142,40 @@ async def load_checkpoint(
             logger.debug("diffusion unload skipped for export: %s", e)
 
         backend = get_export_backend()
+        # Refuse to reload the export checkpoint while an export job
+        # is still running. ``ExportBackend.load_checkpoint`` would
+        # terminate the running subprocess in order to spawn a new
+        # one, silently corrupting the partial output the user is
+        # waiting on (round 13 P1 #1). Mirrors the symmetric guards
+        # already in place for chat / diffusion / training handoffs.
+        # ``is_export_active`` may be absent on older / mocked
+        # backends -- treat missing as "no async-job tracker
+        # available" -> skip rather than fail-closed; the
+        # surrounding chat / diffusion unloads have already run.
+        is_export_active_fn = getattr(backend, "is_export_active", None)
+        if is_export_active_fn is not None:
+            try:
+                export_is_active = bool(is_export_active_fn())
+            except Exception as e:
+                logger.warning(
+                    "Could not verify export status before export load: %s", e
+                )
+                raise HTTPException(
+                    status_code = 503,
+                    detail = (
+                        "Could not verify export status before loading "
+                        "an export checkpoint. Try again."
+                    ),
+                ) from e
+            if export_is_active:
+                raise HTTPException(
+                    status_code = 409,
+                    detail = (
+                        "An export job is currently active. Stop the "
+                        "export job before loading another checkpoint."
+                    ),
+                )
+
         # load_checkpoint spawns and waits on a subprocess and can take
         # minutes. Run it in a worker thread so the event loop stays
         # free to serve the live log SSE stream concurrently.
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index c4e247c2a1..d376a289e1 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -359,18 +359,25 @@ def _raise_if_export_active(workload: str) -> None:
 
 async def _release_llama_for(workload: str) -> None:
     """Unload the llama-server (GGUF) chat backend if it owns the
-    GPU. Treats ``is_loaded`` OR ``is_active`` as held (the latter
-    is mid-download / mid-startup, before health probes pass).
+    GPU. Treats ``is_loaded`` OR ``is_active`` OR
+    ``loading_model_identifier`` as held: the third covers an HF GGUF
+    download that has not yet flipped ``is_active`` to True (round
+    13 P1 #7). Without it, /images/load, /training/start, and
+    /export/load could start while a long ``_download_gguf`` was in
+    flight; llama-server would then come up afterwards and double-own
+    the GPU.
     """
     try:
         llama = get_llama_cpp_backend()
         is_loaded = bool(getattr(llama, "is_loaded", False))
         is_active = bool(getattr(llama, "is_active", False))
-        if is_loaded or is_active:
+        is_loading = bool(getattr(llama, "loading_model_identifier", None))
+        if is_loaded or is_active or is_loading:
             logger.info(
-                "Unloading GGUF chat (loaded=%s active=%s) before %s load",
+                "Unloading GGUF chat (loaded=%s active=%s loading=%s) before %s load",
                 is_loaded,
                 is_active,
+                is_loading,
                 workload,
             )
             await asyncio.to_thread(llama.unload_model)
@@ -1985,9 +1992,16 @@ async def diffusion_generate(
 
     start = time.time()
     try:
-        from core.inference.diffusion import async_generate, encode_png_base64
+        from core.inference.diffusion import (
+            async_generate_with_metadata,
+            encode_png_base64,
+        )
 
-        image = await async_generate(
+        # ``async_generate_with_metadata`` snapshots ``model`` /
+        # ``family`` under the same ``_generate_lock`` that owns the
+        # forward, so a queued unload/load cannot replace them between
+        # generation end and response assembly (round 13 P2 #9).
+        image, meta = await async_generate_with_metadata(
             backend,
             prompt = payload.prompt,
             negative_prompt = payload.negative_prompt,
@@ -2006,7 +2020,6 @@ async def diffusion_generate(
         raise HTTPException(status_code = 500, detail = str(exc))
 
     duration_ms = int((time.time() - start) * 1000)
-    status = backend.status()
     return DiffusionGenerateResponse(
         image_b64 = encode_png_base64(image),
         image_mime = "image/png",
@@ -2022,12 +2035,8 @@ async def diffusion_generate(
         # browser side.
         seed_str = str(payload.seed) if payload.seed is not None else None,
         duration_ms = duration_ms,
-        # Use ``active_repo_id`` (the pipeline that just ran the
-        # forward) rather than the UI-facing ``repo_id`` so a
-        # queued /images/load promoting a new pending model cannot
-        # leak that model's identity into our response.
-        model = status.get("active_repo_id") or status.get("repo_id"),
-        family = status.get("family"),
+        model = meta.get("model"),
+        family = meta.get("family"),
     )
 
 
diff --git a/studio/backend/routes/models.py b/studio/backend/routes/models.py
index 570f024bf8..a573f88233 100644
--- a/studio/backend/routes/models.py
+++ b/studio/backend/routes/models.py
@@ -1709,6 +1709,53 @@ def _is_path_under(path: Path, root: Path) -> bool:
         return False
 
 
+def _diffusion_owned_targets(diff_status: dict) -> list[tuple[str, str | None]]:
+    """List the ``(repo_or_path, gguf_filename)`` pairs the diffusion
+    backend reports as owned in ``diff_status``.
+
+    Four entries are always returned, in this order: active repo,
+    active base repo, pending repo, pending base repo. A missing or
+    falsy repo value becomes ``""``. Repo entries carry their own
+    ``active_gguf_filename`` / ``pending_gguf_filename`` rather than
+    the collapsed UI-facing ``gguf_filename``, so during a swap
+    (active ``Q4_K_S``, pending ``Q8_0``) each repo stays paired with
+    the quant it actually owns (round 13 P1 #3-5).
+
+    Base repos are paired with ``None``: they are loaded whole via
+    ``from_pretrained``, so no single GGUF variant belongs to them.
+    """
+    owned_keys = (
+        ("active_repo_id", "active_gguf_filename"),
+        ("active_base_repo", None),
+        ("pending_repo_id", "pending_gguf_filename"),
+        ("pending_base_repo", None),
+    )
+    return [
+        (
+            diff_status.get(repo_key) or "",
+            diff_status.get(gguf_key) if gguf_key else None,
+        )
+        for repo_key, gguf_key in owned_keys
+    ]
+
+
+def _variant_delete_is_safe_for_owned_gguf(
+    requested_variant: str | None,
+    owned_gguf_filename: str | None,
+) -> bool:
+    """Decide whether deleting ``requested_variant`` leaves the GGUF
+    named ``owned_gguf_filename`` untouched.
+
+    Fails closed: a missing variant (full-repo delete), a missing
+    owned filename, or an owned filename with no recognisable quant
+    label all yield False so the caller blocks the delete. Only a
+    known label that differs (case-insensitively) yields True."""
+    if not (requested_variant and owned_gguf_filename):
+        return False
+    owned_label = (_extract_quant_label(owned_gguf_filename.lower()) or "").lower()
+    return bool(owned_label) and owned_label != requested_variant.lower()
+
+
 def _is_path_under_lexically(path: Path, root: Path) -> bool:
     """Check containment without resolving the final path's symlink target."""
     try:
@@ -1991,27 +2038,17 @@ async def delete_finetuned_model(
         diff_backend = get_diffusion_backend()
         diff_status = diff_backend.status()
         if diff_status.get("is_loaded") or diff_status.get("is_loading"):
-            candidates: list[str] = []
-            for key in (
-                "active_repo_id",
-                "active_base_repo",
-                "pending_repo_id",
-                "pending_base_repo",
-            ):
-                v = diff_status.get(key) or ""
-                if v:
-                    candidates.append(v)
             target_str = str(target_path)
-            # Per-variant deletes only touch ``_delete_gguf_variant_
-            # files(target_path, gguf_variant)`` which removes a
-            # specific quant file. If the loaded pipeline uses a
-            # DIFFERENT variant from the same directory, the delete
-            # is safe. Round 12 review #3.
-            loaded_gguf = (diff_status.get("gguf_filename") or "").lower()
-            wants_variant = export_type == "gguf" and gguf_variant and loaded_gguf
-            for candidate in candidates:
+            # Pair each owned repo / path with the GGUF variant it
+            # actually owns (round 13 P1 #5). For a swap in flight
+            # (active Q4_K_S, pending Q8_0) the active variant must
+            # NOT be deleted just because the pending variant uses
+            # a different quant.
+            for candidate, owned_gguf in _diffusion_owned_targets(diff_status):
+                if not candidate:
+                    continue
                 try:
-                    candidate_path = Path(candidate).expanduser()
+                    candidate_resolved = Path(candidate).expanduser().resolve()
                 except Exception:
                     continue
                 # Relative paths (the user can do
@@ -2019,27 +2056,23 @@ async def delete_finetuned_model(
                 # legitimate path candidates; resolve against the
                 # backend cwd so they can be compared with the
                 # absolute ``target_path``. Round 8 review #11.
-                try:
-                    candidate_resolved = candidate_path.resolve()
-                except Exception:
-                    continue
-                if (
+                overlaps = (
                     candidate_resolved == target_path
                     or str(candidate_resolved) == target_str
                     or _is_path_under(candidate_resolved, target_path)
                     or _is_path_under(target_path, candidate_resolved)
+                )
+                if not overlaps:
+                    continue
+                if export_type == "gguf" and _variant_delete_is_safe_for_owned_gguf(
+                    gguf_variant,
+                    owned_gguf,
                 ):
-                    # Allow per-variant deletes that target a
-                    # different quant than the loaded one.
-                    if wants_variant:
-                        variant_low = gguf_variant.lower()
-                        loaded_label = (_extract_quant_label(loaded_gguf) or "").lower()
-                        if loaded_label and loaded_label != variant_low:
-                            continue
-                    raise HTTPException(
-                        status_code = 400,
-                        detail = "Unload the diffusion image model before deleting",
-                    )
+                    continue
+                raise HTTPException(
+                    status_code = 400,
+                    detail = "Unload the diffusion image model before deleting",
+                )
     except HTTPException:
         raise
     except Exception as e:
@@ -2695,12 +2728,25 @@ async def delete_cached_model(
 
         llama_backend = get_llama_cpp_backend()
         loaded_id = (llama_backend.model_identifier or "").lower()
+        loading_id = (
+            getattr(llama_backend, "loading_model_identifier", None) or ""
+        ).lower()
+        # Also consult the pending-load identifier: a multi-GB HF
+        # download stays in ``loading_model_identifier`` until the
+        # download completes, before ``model_identifier`` is set
+        # (round 13 P1 #6). Without this check the cache directory
+        # the download was writing into could be rmtree'd mid-flight.
+        needle = repo_id.lower()
+        if loading_id == needle:
+            raise HTTPException(
+                status_code = 409,
+                detail = "Cannot delete a model while it is loading",
+            )
         # Exact match only (case-insensitive). Prefix match would
         # block deleting unrelated ``org/model`` while
         # ``org/model-v2`` is loaded -- same surface the diffusion
         # guard fixed in round 5.
-        wants = loaded_id == repo_id.lower()
-        if wants and (
+        if loaded_id == needle and (
             llama_backend.is_loaded or getattr(llama_backend, "is_active", False)
         ):
             raise HTTPException(
@@ -2775,35 +2821,21 @@ async def delete_cached_model(
         diff_status = diff_backend.status()
         if diff_status.get("is_loaded") or diff_status.get("is_loading"):
             needle = repo_id.lower()
-            owned = {
-                (diff_status.get("active_repo_id") or "").lower(),
-                (diff_status.get("active_base_repo") or "").lower(),
-                (diff_status.get("pending_repo_id") or "").lower(),
-                (diff_status.get("pending_base_repo") or "").lower(),
-            }
-            owned.discard("")
-            if needle in owned:
-                # Per-variant delete only touches the requested
-                # quant via ``_delete_gguf_variant_files``. If the
-                # loaded pipeline uses a DIFFERENT variant from the
-                # same repo, the delete is safe. Round 12 review #4.
-                loaded_gguf = (diff_status.get("gguf_filename") or "").lower()
-                if variant and loaded_gguf:
-                    variant_low = variant.lower()
-                    loaded_label = (_extract_quant_label(loaded_gguf) or "").lower()
-                    if loaded_label and loaded_label != variant_low:
-                        # Different quant from the same repo -> allow.
-                        pass
-                    else:
-                        raise HTTPException(
-                            status_code = 400,
-                            detail = "Unload the diffusion image model before deleting",
-                        )
-                else:
-                    raise HTTPException(
-                        status_code = 400,
-                        detail = "Unload the diffusion image model before deleting",
-                    )
+            # Pair each owned repo with the GGUF variant it actually
+            # owns (active or pending) so a swap in progress does not
+            # collapse both quants into the pending one (round 13
+            # P1 #4). Per-variant delete is still allowed if the
+            # requested variant differs from the variant that owns
+            # the matched repo.
+            for owned_id, owned_gguf in _diffusion_owned_targets(diff_status):
+                if not owned_id or owned_id.lower() != needle:
+                    continue
+                if _variant_delete_is_safe_for_owned_gguf(variant, owned_gguf):
+                    continue
+                raise HTTPException(
+                    status_code = 400,
+                    detail = "Unload the diffusion image model before deleting",
+                )
     except HTTPException:
         raise
     except Exception as e:
diff --git a/studio/backend/tests/test_diffusion_backend.py b/studio/backend/tests/test_diffusion_backend.py
index 44386fc8ea..43918fb17e 100644
--- a/studio/backend/tests/test_diffusion_backend.py
+++ b/studio/backend/tests/test_diffusion_backend.py
@@ -24,6 +24,7 @@
 import io
 import sys
 import types
+from types import SimpleNamespace
 from typing import Any
 
 import pytest
@@ -184,6 +185,12 @@ def test_status_shape_unloaded():
         "pipeline_class",
         "base_repo",
         "gguf_filename",
+        "active_repo_id",
+        "active_base_repo",
+        "active_gguf_filename",
+        "pending_repo_id",
+        "pending_base_repo",
+        "pending_gguf_filename",
         "device",
         "dtype",
         "loaded_at",
@@ -193,6 +200,8 @@ def test_status_shape_unloaded():
     assert expected_keys.issubset(s.keys())
     assert s["is_loaded"] is False
     assert s["repo_id"] is None
+    assert s["active_gguf_filename"] is None
+    assert s["pending_gguf_filename"] is None
 
 
 # ── encode_png_base64 ───────────────────────────────────────────
@@ -1192,3 +1201,219 @@ class _FakeTorch:
     device, dtype = backend._pick_device_and_dtype()
     assert device == "cuda"
     assert dtype is fake_torch.float16
+
+
+# ── round 13 regressions ──────────────────────────────────────────
+
+
+def test_smart_base_repo_uses_windows_leaf_only():
+    """Round 13 P2 #13: only the leaf of a Windows path may pick the base
+    repo; a PARENT folder named 'base' must not select Klein Base 4B."""
+    import core.inference.diffusion as diffusion
+    windows_repo = r"C:\Users\me\base\FLUX.2-klein-4B-GGUF"
+    expected_base = "black-forest-labs/FLUX.2-klein-4B"
+    family = diffusion.detect_family(windows_repo)
+    assert family is not None and family.name == "flux.2-klein"
+    assert diffusion._smart_base_repo(family, windows_repo) == expected_base
+
+
+def test_resolve_local_gguf_child_rejects_traversal(tmp_path):
+    """Round 13 P1 #2: ``gguf_filename`` has to stay inside the repo root;
+    names that would leave it raise ``RuntimeError``."""
+    from core.inference.diffusion import _resolve_local_gguf_child as resolve_child
+
+    root_dir = tmp_path / "my-flux"
+    root_dir.mkdir()
+    (root_dir / "model.gguf").write_bytes(b"x")
+    (tmp_path / "other.gguf").write_bytes(b"y")
+
+    resolved = resolve_child(root_dir, "model.gguf")
+    assert resolved.name == "model.gguf"
+
+    # ``./model.gguf`` is not listed: PurePosixPath normalises it to
+    # ``model.gguf``, which stays inside the repo and is accepted.
+    # Parent escapes, the empty name and absolute paths must be refused.
+    for rejected in ("../other.gguf", "", "sub/../model.gguf", "/etc/passwd"):
+        with pytest.raises(RuntimeError):
+            resolve_child(root_dir, rejected)
+
+
+def test_resolve_local_gguf_child_rejects_backslash(tmp_path):
+    """Round 13 P1 #2: a Windows-style separator inside gguf_filename
+    must be rejected even on POSIX so it never becomes a literal name."""
+    from core.inference.diffusion import _resolve_local_gguf_child
+
+    repo_root = tmp_path / "my-flux"
+    repo_root.mkdir()
+    (repo_root / "model.gguf").write_bytes(b"x")
+    # Raw string, so this is one literal backslash: ``..\other.gguf``.
+    with pytest.raises(RuntimeError):
+        _resolve_local_gguf_child(repo_root, r"..\other.gguf")
+
+
+def test_load_model_accepts_relative_local_dir(monkeypatch, tmp_path):
+    """Round 13 P1 #2: a relative local directory ``repo_id`` (Studio exports)
+    must load its GGUF from disk and NOT be routed through hf_hub_download."""
+    import core.inference.diffusion as d
+
+    repo_root = tmp_path / "exports" / "my-flux"
+    repo_root.mkdir(parents = True)
+    gguf_file = repo_root / "model.gguf"
+    gguf_file.write_bytes(b"x")
+
+    # Make tmp_path the cwd so the relative "exports/my-flux" resolves to repo_root.
+    monkeypatch.chdir(tmp_path)
+    # Sentinel objects handed back by the fake diffusers classes below.
+    fake_transformer = object()
+    fake_pipe = SimpleNamespace(
+        to = lambda *a, **kw: None,
+        enable_model_cpu_offload = lambda: None,
+    )
+    # Quantization config stand-in: accepts and ignores any keyword arguments.
+    class _FakeQuantConfig:
+        def __init__(self, **_):
+            pass
+    # Records every from_single_file call so the path can be asserted later.
+    class _FakeTransformerCls:
+        from_single_file_calls: list[tuple[str, dict]] = []
+
+        @classmethod
+        def from_single_file(cls, path, **kwargs):
+            cls.from_single_file_calls.append((path, kwargs))
+            return fake_transformer
+    # Pipeline class whose from_pretrained always returns fake_pipe.
+    class _FakePipeCls:
+        @classmethod
+        def from_pretrained(cls, base, **kwargs):
+            return fake_pipe
+    # Fake diffusers module exposing just the attributes set here.
+    fake_diffusers = SimpleNamespace(
+        __version__ = "0.99",
+        GGUFQuantizationConfig = _FakeQuantConfig,
+        Flux2Transformer2DModel = _FakeTransformerCls,
+        Flux2KleinPipeline = _FakePipeCls,
+    )
+    # Fake torch that reports neither CUDA nor MPS as available.
+    fake_torch = SimpleNamespace(
+        cuda = SimpleNamespace(
+            is_available = lambda: False,
+            is_bf16_supported = lambda: False,
+            empty_cache = lambda: None,
+        ),
+        bfloat16 = "bf16",
+        float16 = "fp16",
+        float32 = "fp32",
+        backends = SimpleNamespace(
+            mps = SimpleNamespace(is_available = lambda: False),
+        ),
+    )
+    # Any attempt to download from the Hub fails the test immediately.
+    def _boom(**_):
+        raise AssertionError("hf_hub_download must not run for a local dir")
+
+    fake_hub = SimpleNamespace(hf_hub_download = _boom)
+    monkeypatch.setitem(sys.modules, "huggingface_hub", fake_hub)
+    monkeypatch.setitem(sys.modules, "diffusers", fake_diffusers)
+    monkeypatch.setitem(sys.modules, "torch", fake_torch)
+    # Load through the relative path; the cwd was moved to tmp_path above.
+    backend = d.DiffusionBackend()
+    backend.load_model(
+        repo_id = "exports/my-flux",
+        gguf_filename = "model.gguf",
+        family_override = "flux.2-klein",
+        enable_model_cpu_offload = False,
+    )
+    # The transformer must have been built from the resolved on-disk GGUF path.
+    assert _FakeTransformerCls.from_single_file_calls
+    resolved_path = _FakeTransformerCls.from_single_file_calls[0][0]
+    assert str(gguf_file.resolve()) == resolved_path
+
+
+def test_generate_image_with_metadata_returns_active_pipeline(monkeypatch):
+    """Round 13 P2 #9: the metadata half of the result names the repo
+    and family currently set on the backend."""
+    import core.inference.diffusion as diffusion
+    from PIL import Image
+
+    repo = "unsloth/FLUX.2-klein-4B-GGUF"
+    family_kwargs = dict(
+        name = "flux.2-klein",
+        pipeline_class = "Flux2KleinPipeline",
+        transformer_class = "Flux2KleinTransformer3DModel",
+        base_repo = "black-forest-labs/FLUX.2-klein-4B",
+        aliases = (),
+    )
+
+    backend = diffusion.DiffusionBackend()
+    # Populate the private fields directly; no real pipeline is loaded.
+    backend._pipe = object()
+    backend._repo_id = repo
+    backend._family = diffusion.DiffusionFamily(**family_kwargs)
+    monkeypatch.setattr(
+        backend,
+        "_generate_image_unlocked",
+        lambda **_kwargs: Image.new("RGB", (8, 8)),
+    )
+
+    _image, meta = backend.generate_image_with_metadata(prompt = "x")
+    assert meta == {"model": repo, "family": "flux.2-klein"}
+
+
+def test_generate_image_with_metadata_blocks_concurrent_unload(monkeypatch):
+    """Round 13 P2 #9: _generate_lock serialises the forward AND the
+    meta snapshot, so a queued unload cannot wipe state in between."""
+    import threading
+    import core.inference.diffusion as d
+
+    backend = d.DiffusionBackend()
+    fake_fam = d.DiffusionFamily(
+        name = "flux.2-klein",
+        pipeline_class = "Flux2KleinPipeline",
+        transformer_class = "Flux2KleinTransformer3DModel",
+        base_repo = "black-forest-labs/FLUX.2-klein-4B",
+        aliases = (),
+    )
+
+    started = threading.Event()
+    finish = threading.Event()
+    # Fake forward pass: signals that it has started, then parks on ``finish``.
+    def _fake_unlocked(**kwargs):
+        from PIL import Image as _Image
+
+        started.set()
+        # Hold long enough for the unload thread to race the metadata
+        # snapshot if the lock were released too early.
+        finish.wait(timeout = 2.0)
+        return _Image.new("RGB", (8, 8))
+    # Populate the private fields directly; no real pipeline is loaded.
+    backend._pipe = object()
+    backend._repo_id = "unsloth/FLUX.2-klein-4B-GGUF"
+    backend._family = fake_fam
+    monkeypatch.setattr(backend, "_generate_image_unlocked", _fake_unlocked)
+
+    result: list = []
+    # Worker body: stores the (image, meta) pair returned by the backend.
+    def _gen():
+        result.append(backend.generate_image_with_metadata(prompt = "x"))
+
+    gen_thread = threading.Thread(target = _gen)
+    gen_thread.start()
+    assert started.wait(timeout = 2.0)
+    # Generation is now inside the fake forward; start a competing unload.
+    def _unload():
+        backend.unload_model()
+
+    un_thread = threading.Thread(target = _unload)
+    un_thread.start()
+    # The unload must NOT have completed yet; it queues behind the
+    # generation's _generate_lock.
+    un_thread.join(timeout = 0.2)
+    assert un_thread.is_alive()
+    finish.set()
+    gen_thread.join(timeout = 5.0)
+    un_thread.join(timeout = 5.0)
+
+    assert result
+    _, meta = result[0]
+    assert meta["model"] == "unsloth/FLUX.2-klein-4B-GGUF"
+    assert meta["family"] == "flux.2-klein"
diff --git a/studio/backend/tests/test_diffusion_routes.py b/studio/backend/tests/test_diffusion_routes.py
index fc99228a63..d3659b89ee 100644
--- a/studio/backend/tests/test_diffusion_routes.py
+++ b/studio/backend/tests/test_diffusion_routes.py
@@ -80,6 +80,14 @@ def status(self) -> dict:
             "pipeline_class": "Flux2KleinPipeline" if self._loaded else None,
             "base_repo": "black-forest-labs/FLUX.2-klein" if self._loaded else None,
             "gguf_filename": None,
+            "active_repo_id": self._repo,
+            "active_base_repo": (
+                "black-forest-labs/FLUX.2-klein" if self._loaded else None
+            ),
+            "active_gguf_filename": None,
+            "pending_repo_id": None,
+            "pending_base_repo": None,
+            "pending_gguf_filename": None,
             "device": "cpu",
             "dtype": "torch.bfloat16",
             "loaded_at": 0,
@@ -102,6 +110,14 @@ def generate_image(self, **kw):
         self.calls.append({"op": "generate", **kw})
         return Image.new("RGB", (kw["width"], kw["height"]), color = (123, 45, 67))
 
+    def generate_image_with_metadata(self, **kw):
+        """Return ``(image, meta)`` built on top of ``generate_image``."""
+        # Delegate first so the call is recorded in ``self.calls``.
+        rendered = self.generate_image(**kw)
+        # Family is only reported while the stub counts as loaded.
+        family = "flux.2-klein" if self._loaded else None
+        return rendered, {"model": self._repo, "family": family}
+
 
 @pytest.fixture
 def app_with_stub(monkeypatch):

From ff98c6d160d9309043f43a330b9b3082bd38fd44 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 05:57:48 +0000
Subject: [PATCH 38/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/diffusion.py | 4 +---
 studio/backend/core/inference/llama_cpp.py | 8 ++------
 2 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 335e93c957..f000ef20cf 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -200,9 +200,7 @@ def _resolve_local_gguf_child(repo_root: Path, gguf_filename: str) -> Path:
     invariant for Hub repos.
     """
     if Path(gguf_filename).is_absolute() or "\\" in gguf_filename:
-        raise RuntimeError(
-            "gguf_filename must be a relative file path inside repo_id."
-        )
+        raise RuntimeError("gguf_filename must be a relative file path inside repo_id.")
     rel = PurePosixPath(gguf_filename)
     if any(part in ("", ".", "..") for part in rel.parts):
         raise RuntimeError(
diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 57cda178a9..51da54789d 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -2721,14 +2721,10 @@ def load_model(
                             )
                 elif gguf_path:
                     if not Path(gguf_path).is_file():
-                        raise FileNotFoundError(
-                            f"GGUF file not found: {gguf_path}"
-                        )
+                        raise FileNotFoundError(f"GGUF file not found: {gguf_path}")
                     model_path = gguf_path
                 else:
-                    raise ValueError(
-                        "Either gguf_path or hf_repo must be provided"
-                    )
+                    raise ValueError("Either gguf_path or hf_repo must be provided")
             finally:
                 self._loading_model_identifier = None
 

From 54adfdff5378eebb02f2411bfb8b6000949c1328 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 06:01:56 +0000
Subject: [PATCH 39/92] Fix _resolve_local_gguf_child traversal check for
 Windows for PR #5754

Round 13 follow-up: on Windows Path('/etc/passwd').is_absolute()
returns False because POSIX absolute paths read as drive-relative,
which let the traversal check fall through to resolve(strict=True)
and crash with a raw FileNotFoundError instead of the friendlier
RuntimeError. Add a PurePosixPath check + explicit leading-separator
guard and wrap the resolve() in try/except so a missing path inside
the chosen repo is reported as 'Local repo path does not contain ...'
on every OS.

Pre-existing 59 diffusion backend + route tests still pass; staging
Windows Diffusion CI was failing on this exact case.
---
 studio/backend/core/inference/diffusion.py | 27 +++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index f000ef20cf..2f60d70300 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -199,15 +199,36 @@ def _resolve_local_gguf_child(repo_root: Path, gguf_filename: str) -> Path:
     (round 13 P1 #2). ``hf_hub_download`` already enforces the same
     invariant for Hub repos.
     """
-    if Path(gguf_filename).is_absolute() or "\\" in gguf_filename:
-        raise RuntimeError("gguf_filename must be a relative file path inside repo_id.")
+    # ``Path("/etc/passwd").is_absolute()`` is False on Windows (POSIX
+    # absolute paths read as drive-relative), so check both pathlib
+    # flavours plus a leading separator so the rejection is portable.
+    if (
+        Path(gguf_filename).is_absolute()
+        or PurePosixPath(gguf_filename).is_absolute()
+        or gguf_filename.startswith(("/", "\\"))
+        or "\\" in gguf_filename
+    ):
+        raise RuntimeError(
+            "gguf_filename must be a relative file path inside repo_id."
+        )
     rel = PurePosixPath(gguf_filename)
     if any(part in ("", ".", "..") for part in rel.parts):
         raise RuntimeError(
             "gguf_filename must not contain empty, '.', or '..' segments."
         )
     root = repo_root.expanduser().resolve(strict = True)
-    candidate = (root / Path(*rel.parts)).resolve(strict = True)
+    try:
+        candidate = (root / Path(*rel.parts)).resolve(strict = True)
+    except (OSError, FileNotFoundError) as exc:
+        # strict=True raises FileNotFoundError on a missing leaf or
+        # parent component, and OSError on a malformed Windows path
+        # (e.g. drive letters injected through the user-supplied
+        # string). Either way the candidate does not exist inside the
+        # chosen repo, which is exactly the "file not in repo" failure
+        # mode the caller cares about.
+        raise RuntimeError(
+            f"Local repo path '{repo_root}' does not contain '{gguf_filename}'."
+        ) from exc
     try:
         candidate.relative_to(root)
     except ValueError as exc:

From f501ab8fc812d8a1613a472a9f7628618a5a1e9f Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 06:06:00 +0000
Subject: [PATCH 40/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/diffusion.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 2f60d70300..57268291b0 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -208,9 +208,7 @@ def _resolve_local_gguf_child(repo_root: Path, gguf_filename: str) -> Path:
         or gguf_filename.startswith(("/", "\\"))
         or "\\" in gguf_filename
     ):
-        raise RuntimeError(
-            "gguf_filename must be a relative file path inside repo_id."
-        )
+        raise RuntimeError("gguf_filename must be a relative file path inside repo_id.")
     rel = PurePosixPath(gguf_filename)
     if any(part in ("", ".", "..") for part in rel.parts):
         raise RuntimeError(

From e03ed3dd29da4f4cb015398005d04a05bc3f7949 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 06:34:04 +0000
Subject: [PATCH 41/92] Fix/adjust diffusion: round 14 P1+P2 batch for PR #5754

Round 14 reviewer aggregate (logs/review_round14_aggregate.md):

P1 fixes:
- routes/export.py /load-checkpoint now runs the active-export 409
  guard BEFORE the chat / diffusion unloads, so a rejected request
  no longer tears down unrelated GPU state.
- core/inference/llama_cpp.py wraps the WHOLE load_model body in a
  single try/finally that publishes loading_model_identifier across
  download, metadata read, VRAM settle, process spawn, and health
  check. Done via a thin load_model wrapper around the existing
  body (renamed _load_model_impl) to avoid reindenting hundreds of
  lines.
- routes/models.py /delete-finetuned now checks
  loading_model_identifier so a pending HF GGUF download cannot
  have its destination directory rmtree'd before llama-server
  spawns.
- core/inference/diffusion.py stores the original caller-supplied
  gguf_filename (e.g. ``BF16/model.gguf``) in a new self._gguf_filename
  field and exposes it as active_gguf_filename. UI-facing
  gguf_filename still collapses to basename for the panel.
- routes/models.py /delete-cached llama guard now allows safe
  different-variant deletes when hf_variant differs, matching the
  diffusion path's variant-aware behaviour.
- core/inference/diffusion.py tracks self._cpu_offload_enabled and
  forces a CPU torch.Generator when offload is on, so seeded
  generation no longer crashes on CUDA hosts with the default offload
  enabled.

P2 fixes:
- core/inference/diffusion.py detect_family normalises mixed
  separators (``Qwen_Image-Edit-GGUF``, ``Qwen-Image_Edit-GGUF``,
  ``QwenImageEdit-GGUF``) so every Qwen-Image-Edit spelling is
  excluded from the base Qwen-Image family.
- core/inference/diffusion.py logger.info / logger.error in
  load_model run repo_id and effective_base through _redact_hf_tokens
  so URL-embedded ``hf_xxxxx`` tokens never reach structured-log
  sinks.
- core/inference/diffusion.py _release_other_gpu_owners_for_diffusion
  now raises RuntimeError when an export job is active instead of
  logging and continuing, so direct backend callers cannot bypass
  the route layer's 409 guard.
- core/inference/diffusion.py full-diffusers repo / base_repo paths
  expand ``~`` via _expand_existing_local_path so
  ``repo_id="~/models/my-flux"`` no longer falls through to the Hub.

Tests:
- 5 new regression cases (mixed Qwen-Image-Edit separators, token
  redaction, status full-filename, CPU offload generator device,
  staging Windows leaf already-set sanity).
- All 68 diffusion backend + route tests pass.
---
 studio/backend/core/inference/diffusion.py    | 217 ++++++++++++++----
 studio/backend/core/inference/llama_cpp.py    | 110 ++++++---
 studio/backend/routes/export.py               |  71 +++---
 studio/backend/routes/models.py               |  39 +++-
 .../backend/tests/test_diffusion_backend.py   | 114 +++++++++
 studio/backend/tests/test_diffusion_routes.py |   3 +
 6 files changed, 439 insertions(+), 115 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 57268291b0..8b40fdfc55 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -182,6 +182,43 @@ def _smart_base_repo(fam: DiffusionFamily, repo_id: str) -> str:
     return "black-forest-labs/FLUX.2-klein-4B"
 
 
+def _expand_existing_local_path(value: str) -> str:
+    """Return ``value`` with a leading ``~`` expanded, if that path exists.
+
+    Anything else is handed back untouched: falsy values, non-strings,
+    strings that do not start with ``~`` (e.g. Hub ids), and ``~`` paths
+    whose expansion is not present on disk. Round 14 P2 #11 added this
+    so the full-diffusers-repo and base-companion-repo arguments get
+    the same ``expanduser()`` treatment as the local GGUF branch before
+    they reach ``from_pretrained``.
+    """
+    if not value:
+        return value
+    if not (isinstance(value, str) and value.startswith("~")):
+        return value
+    expanded = Path(value).expanduser()
+    # Missing expansion: keep the literal so the loader reports the error.
+    return str(expanded) if expanded.exists() else value
+
+
+_HF_TOKEN_RE = re.compile(r"hf_[A-Za-z0-9]{20,}")
+
+
+def _redact_hf_tokens(value: Any) -> Any:
+    """Return ``value`` with every ``hf_...`` token replaced by ``<redacted>``.
+
+    Round 14 P2 #9: ``repo_id`` / ``base_repo`` / paths may carry an
+    authenticated URL (``https://hf_token@huggingface.co/...``), and the
+    load-info / load-failure log lines would otherwise pass that token
+    on to structured-log sinks. Only ``str`` input is rewritten; any
+    other type comes back as the same object, so the helper can wrap
+    arbitrary ``logger.info`` / ``logger.error`` arguments.
+    """
+    if isinstance(value, str):
+        return _HF_TOKEN_RE.sub("<redacted>", value)
+    return value
+
+
 def _resolve_local_gguf_child(repo_root: Path, gguf_filename: str) -> Path:
     """Resolve a GGUF filename inside a local repo directory safely.
 
@@ -288,12 +325,26 @@ def detect_family(
     needle = (repo_id or "").lower()
     if not needle:
         return None
+    # Normalise mixed separator spellings (``Qwen_Image-Edit-GGUF``,
+    # ``Qwen-Image_Edit-GGUF``, ``Qwen.Image.Edit-GGUF``) and the
+    # compact concatenation (``QwenImageEdit-GGUF``) so the
+    # _FAMILY_EXCLUDE deny lists do not need every permutation of
+    # ``-``, ``_``, ``.`` and run-together spellings to keep
+    # Qwen-Image-Edit out of the base Qwen-Image family (round 14
+    # P2 #8).
+    needle_norm = re.sub(r"[^a-z0-9]+", "-", needle).strip("-")
+    needle_compact = re.sub(r"[^a-z0-9]+", "", needle)
     # Scan _FAMILIES first (GGUF-supported), then _FULL_REPO_FAMILIES
     # so a repo like ``stabilityai/stable-diffusion-xl-base-1.0`` is
     # auto-detected as SDXL instead of returning None.
     for fam in _FAMILIES + _FULL_REPO_FAMILIES:
         excludes = _FAMILY_EXCLUDE.get(fam.name, ())
-        if any(e in needle for e in excludes):
+        if any(
+            e in needle
+            or re.sub(r"[^a-z0-9]+", "-", e).strip("-") in needle_norm
+            or re.sub(r"[^a-z0-9]+", "", e) in needle_compact
+            for e in excludes
+        ):
             continue
         if fam.name in needle:
             return fam
@@ -352,9 +403,24 @@ def __init__(self) -> None:
         self._family: Optional[DiffusionFamily] = None
         self._repo_id: Optional[str] = None
         self._gguf_path: Optional[str] = None
+        # Original ``gguf_filename`` the caller passed in, preserved
+        # so delete guards can compare against subdirectory variants
+        # like ``BF16/model.gguf`` or ``Q4_K_M/model.gguf`` instead
+        # of the collapsed basename (round 14 P1 #4). The basename
+        # alone (``model.gguf``) loses the quant directory and lets
+        # /delete-cached unlink the wrong file.
+        self._gguf_filename: Optional[str] = None
         self._base_repo: Optional[str] = None
         self._device: Optional[str] = None
         self._dtype: Optional[str] = None
+        # True when ``enable_model_cpu_offload()`` was applied on the
+        # loaded pipeline. Diffusers' offload moves the active
+        # submodule between CPU and GPU on each step, so a CUDA
+        # ``torch.Generator`` mismatches the CPU-resident embeddings
+        # and generation crashes mid-forward (round 14 P1 #6). When
+        # this is True, seeded generation has to use a CPU generator
+        # regardless of self._device.
+        self._cpu_offload_enabled: bool = False
         self._loaded_at: Optional[float] = None
         self._loading: bool = False
         self._last_error: Optional[str] = None
@@ -387,6 +453,11 @@ def status(self) -> dict[str, Any]:
         # local HF cache layout (and the system username on default
         # POSIX layouts) to any authenticated Studio session.
         with self._lock:
+            # UI-facing collapsed basename. Full local path leaks the
+            # HF cache layout + system username; the original caller-
+            # supplied filename (e.g. ``BF16/model.gguf``) is kept
+            # separately as ``active_gguf_filename`` for delete
+            # guards.
             gguf_basename = Path(self._gguf_path).name if self._gguf_path else None
             # Expose BOTH the resident pipeline's id AND the pending
             # load target. Delete guards must check both: when model A
@@ -398,7 +469,7 @@ def status(self) -> dict[str, Any]:
             # user just clicked.
             active_repo = self._repo_id
             active_base = self._base_repo
-            active_gguf = gguf_basename
+            active_gguf = self._gguf_filename
             pending_repo = self._pending_repo_id if self._loading else None
             pending_base = self._pending_base_repo if self._loading else None
             pending_gguf = self._pending_gguf_filename if self._loading else None
@@ -414,6 +485,14 @@ def status(self) -> dict[str, Any]:
             if pending_repo and pending_repo != active_repo:
                 ui_family = None
                 ui_pipeline_class = None
+            # UI-facing ``gguf_filename`` collapses to the basename
+            # so the Images panel does not surface internal cache /
+            # variant directory names. Guard-facing ``active_*`` /
+            # ``pending_*`` retain the full caller-supplied filename
+            # so /delete-cached can compare against subdirectory
+            # variants like ``BF16/model.gguf`` (round 14 P1 #4-5).
+            ui_gguf = pending_gguf or active_gguf
+            ui_gguf_basename = Path(ui_gguf).name if ui_gguf else None
             return {
                 "is_loaded": self._pipe is not None,
                 "is_loading": self._loading,
@@ -421,7 +500,7 @@ def status(self) -> dict[str, Any]:
                 "family": ui_family,
                 "pipeline_class": ui_pipeline_class,
                 "base_repo": pending_base or active_base,
-                "gguf_filename": pending_gguf or active_gguf,
+                "gguf_filename": ui_gguf_basename,
                 # Guard-facing fields: every repo / path / GGUF
                 # filename the backend owns RIGHT NOW. Delete routes
                 # iterate both, paired so the variant-filename check
@@ -550,9 +629,11 @@ def load_model(
                 # success.
                 self._pending_repo_id = repo_id
                 self._pending_base_repo = base_repo
-                self._pending_gguf_filename = (
-                    Path(gguf_filename).name if gguf_filename else None
-                )
+                # Store the caller's full ``gguf_filename`` (e.g.
+                # ``BF16/model.gguf``) so the variant-aware delete
+                # guards have the subdirectory info. The UI side of
+                # status() still collapses to the basename for display.
+                self._pending_gguf_filename = gguf_filename if gguf_filename else None
             try:
                 pipeline_cls = getattr(diffusers, fam.pipeline_class, None)
                 if pipeline_cls is None:
@@ -595,11 +676,15 @@ def load_model(
                             "or load a full diffusers repo (base_repo only "
                             "applies when picking a GGUF quant)."
                         )
-                    effective_base = repo_id
+                    # ``~/models/my-flux`` must be expanded so
+                    # diffusers' from_pretrained does not pass the
+                    # literal tilde through to ``os.path.isdir`` and
+                    # fall back to the Hub (round 14 P2 #11).
+                    effective_base = _expand_existing_local_path(repo_id)
                     with self._lock:
                         self._pending_base_repo = effective_base
                 elif base_repo:
-                    effective_base = base_repo
+                    effective_base = _expand_existing_local_path(base_repo)
                     # Refresh pending so delete guards see the actual
                     # base, not just caller-supplied None.
                     with self._lock:
@@ -608,13 +693,19 @@ def load_model(
                     effective_base = _smart_base_repo(fam, repo_id)
                     with self._lock:
                         self._pending_base_repo = effective_base
+                # ``repo_id`` / ``effective_base`` are user-supplied
+                # strings that can embed an ``hf_xxxxx`` token via a
+                # URL-style path (``https://hf_token@huggingface.co/...``).
+                # Scrub them BEFORE the logger formats the line so the
+                # token never reaches structured-log sinks (round 14
+                # P2 #9).
                 logger.info(
                     "Loading diffusion model %s (family=%s, device=%s, dtype=%s, base=%s)",
-                    repo_id,
+                    _redact_hf_tokens(repo_id),
                     fam.name,
                     device,
                     dtype,
-                    effective_base,
+                    _redact_hf_tokens(effective_base),
                 )
 
                 transformer = None
@@ -685,9 +776,11 @@ def load_model(
                         self._family = None
                         self._repo_id = None
                         self._gguf_path = None
+                        self._gguf_filename = None
                         self._base_repo = None
                         self._device = None
                         self._dtype = None
+                        self._cpu_offload_enabled = False
                         self._loaded_at = None
                     _release(old)
                     old = None
@@ -735,6 +828,9 @@ def load_model(
                     pipe_kwargs["token"] = hf_token
 
                 pipe = None
+                cpu_offload_enabled = bool(
+                    enable_model_cpu_offload and device == "cuda"
+                )
                 try:
                     pipe = pipeline_cls.from_pretrained(effective_base, **pipe_kwargs)
                     # Device placement / offload can ALSO raise after
@@ -746,7 +842,7 @@ def load_model(
                     # the next load attempt. Explicitly release both
                     # pipe and transformer in the same try (round 13
                     # P2 #11).
-                    if enable_model_cpu_offload and device == "cuda":
+                    if cpu_offload_enabled:
                         pipe.enable_model_cpu_offload()
                     else:
                         pipe.to(device)
@@ -765,9 +861,14 @@ def load_model(
                     self._family = fam
                     self._repo_id = repo_id
                     self._gguf_path = local_gguf_path
+                    # Preserve the full caller-supplied filename, not
+                    # just the basename, so per-variant delete guards
+                    # see ``BF16/model.gguf`` (round 14 P1 #4).
+                    self._gguf_filename = gguf_filename if gguf_filename else None
                     self._base_repo = effective_base
                     self._device = device
                     self._dtype = str(dtype).replace("torch.", "")
+                    self._cpu_offload_enabled = cpu_offload_enabled
                     self._loaded_at = time.time()
                     # Clear loading + pending here, BEFORE returning,
                     # so the response payload reports the resident
@@ -813,7 +914,11 @@ def load_model(
                 # Use ``logger.error`` with the already-scrubbed
                 # message and exc_info=False so the bearer token
                 # cannot leak through structured logging sinks.
-                logger.error("Diffusion load failed for %s: %s", repo_id, exc_msg)
+                logger.error(
+                    "Diffusion load failed for %s: %s",
+                    _redact_hf_tokens(repo_id),
+                    exc_msg,
+                )
                 raise RuntimeError(
                     f"Failed to load diffusion model: {exc_msg}"
                 ) from exc
@@ -845,9 +950,11 @@ def unload_model(self) -> dict[str, Any]:
                 self._family = None
                 self._repo_id = None
                 self._gguf_path = None
+                self._gguf_filename = None
                 self._base_repo = None
                 self._device = None
                 self._dtype = None
+                self._cpu_offload_enabled = False
                 self._loaded_at = None
             _release(old)
             old = None  # noqa: F841
@@ -930,14 +1037,25 @@ def _generate_image_unlocked(
                 raise RuntimeError("No diffusion model is loaded.")
             pipe = self._pipe
             device = self._device or "cpu"
+            cpu_offload_enabled = self._cpu_offload_enabled
         generator = None
         if seed is not None:
             # Match the device of the pipeline so determinism holds
-            # across reload cycles. For CPU offload, the noise still
-            # has to live on the device the diffusion forward runs on.
-            gen_device = (
-                "cuda" if device == "cuda" and torch.cuda.is_available() else "cpu"
-            )
+            # across reload cycles. When CPU offload is enabled
+            # (the default on CUDA hosts), diffusers shuttles each
+            # submodule between CPU and GPU on every step. A CUDA
+            # torch.Generator then mismatches the CPU-resident
+            # embeddings at the start of the forward and the run
+            # crashes (round 14 P1 #6). Use a CPU generator in that
+            # case; a given seed still reproduces the same image
+            # across runs, though not the image a CUDA generator
+            # would have produced for that seed.
+            if cpu_offload_enabled:
+                gen_device = "cpu"
+            else:
+                gen_device = (
+                    "cuda" if device == "cuda" and torch.cuda.is_available() else "cpu"
+                )
             generator = torch.Generator(device = gen_device).manual_seed(int(seed))
 
         call_kwargs: dict[str, Any] = {
@@ -1126,34 +1244,49 @@ def _release_other_gpu_owners_for_diffusion() -> None:
     # higher-level guard) cannot still kill an active export.
     try:
         from core.export import get_export_backend  # type: ignore
+    except Exception as exc:
+        logger.debug("export module not importable: %s", exc)
+        return
 
+    try:
         exp = get_export_backend()
-        if getattr(exp, "current_checkpoint", None):
-            is_export_active_fn = getattr(exp, "is_export_active", None)
-            export_is_active = False
-            if is_export_active_fn is not None:
-                try:
-                    export_is_active = bool(is_export_active_fn())
-                except Exception:
-                    # Unverifiable status -> treat as 'might be
-                    # active' and refuse to touch the subprocess.
-                    export_is_active = True
-            if export_is_active:
-                logger.info(
-                    "Skipping export shutdown for diffusion load: "
-                    "is_export_active=True (route layer should have "
-                    "rejected this request with 409)"
-                )
-            else:
-                logger.info(
-                    "Shutting down idle export subprocess before diffusion load"
-                )
-                exp._shutdown_subprocess()
-                exp.current_checkpoint = None
-                exp.is_vision = False
-                exp.is_peft = False
     except Exception as exc:
-        logger.debug("export unload skipped: %s", exc)
+        logger.debug("export backend not available: %s", exc)
+        return
+
+    is_export_active_fn = getattr(exp, "is_export_active", None)
+    if is_export_active_fn is not None:
+        try:
+            export_is_active = bool(is_export_active_fn())
+        except Exception:
+            # Unverifiable status -> treat as 'might be active' and
+            # refuse so a direct backend caller (test / script /
+            # future route that forgot the higher-level 409 guard)
+            # cannot still terminate an in-flight export.
+            export_is_active = True
+        if export_is_active:
+            # Round 14 P2 #10: the prior behaviour logged a warning
+            # and continued, so direct ``DiffusionBackend.load_model``
+            # callers (tests, scripts) silently bypassed the route
+            # layer's 409. Hard-refuse instead so any code path that
+            # reaches this helper while an export is active sees the
+            # same failure mode the route returns.
+            raise RuntimeError(
+                "An export job is currently active. Stop the export "
+                "job before loading a diffusion image model."
+            )
+
+    if getattr(exp, "current_checkpoint", None):
+        try:
+            logger.info(
+                "Shutting down idle export subprocess before diffusion load"
+            )
+            exp._shutdown_subprocess()
+            exp.current_checkpoint = None
+            exp.is_vision = False
+            exp.is_peft = False
+        except Exception as exc:
+            logger.debug("idle export shutdown failed: %s", exc)
 
     # Note: active training is *not* stopped here. The route layer
     # (`_raise_if_training_active` in routes/inference.py) refuses
diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 51da54789d..e017fb70ed 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -2616,9 +2616,66 @@ def load_model(
 
         Returns True if server started and health check passed.
         """
-        # Serialise the whole load so concurrent /load calls never
-        # leave two llama-server processes alive (#5401 / #5161). Does
-        # not block /unload, /status, /load-progress.
+        # Publish ``_loading_model_identifier`` BEFORE any phase of
+        # the load can begin and clear it AFTER the load fully settles
+        # (success or failure, including the duplicate-state fast path
+        # and every internal early ``return False``). Round 14 P1 #2:
+        # the prior inline try/finally only wrapped the download, so
+        # /delete-cached and the cross-workload handoff helpers saw
+        # the backend as idle once the GGUF bytes had landed but the
+        # subprocess had not yet spawned. Mark the load as pending
+        # for the entire duration -- download, metadata read,
+        # VRAM settle, process spawn, health check, audio probe.
+        self._loading_model_identifier = model_identifier
+        try:
+            # Serialise the whole load so concurrent /load calls never
+            # leave two llama-server processes alive (#5401 / #5161).
+            # Does not block /unload, /status, /load-progress.
+            return self._load_model_impl(
+                gguf_path = gguf_path,
+                mmproj_path = mmproj_path,
+                hf_repo = hf_repo,
+                hf_variant = hf_variant,
+                hf_token = hf_token,
+                model_identifier = model_identifier,
+                is_vision = is_vision,
+                n_ctx = n_ctx,
+                chat_template_override = chat_template_override,
+                cache_type_kv = cache_type_kv,
+                speculative_type = speculative_type,
+                spec_draft_n_max = spec_draft_n_max,
+                n_threads = n_threads,
+                n_gpu_layers = n_gpu_layers,
+                n_parallel = n_parallel,
+                extra_args = extra_args,
+            )
+        finally:
+            self._loading_model_identifier = None
+
+    def _load_model_impl(
+        self,
+        *,
+        gguf_path: Optional[str] = None,
+        mmproj_path: Optional[str] = None,
+        hf_repo: Optional[str] = None,
+        hf_variant: Optional[str] = None,
+        hf_token: Optional[str] = None,
+        model_identifier: str,
+        is_vision: bool = False,
+        n_ctx: int = 4096,
+        chat_template_override: Optional[str] = None,
+        cache_type_kv: Optional[str] = None,
+        speculative_type: Optional[str] = None,
+        spec_draft_n_max: Optional[int] = None,
+        n_threads: Optional[int] = None,
+        n_gpu_layers: Optional[int] = None,
+        n_parallel: int = 1,
+        extra_args: Optional[List[str]] = None,
+    ) -> bool:
+        """Internal body of ``load_model``. Kept as a separate method
+        so ``load_model`` can wrap it in a single try/finally that
+        publishes ``_loading_model_identifier`` for the WHOLE load
+        instead of only the download window."""
         with self._serial_load_lock:
             # Duplicate /load that raced past the route-level check
             # (the first one hadn't published _healthy=True yet). If the
@@ -2693,40 +2750,25 @@ def load_model(
             # Scope HF_HUB_OFFLINE to the download block only when DNS is
             # dead; cleanup runs even on exception so a transient hiccup
             # at the start of one load cannot quarantine future loads.
-            #
-            # Publish ``_loading_model_identifier`` BEFORE entering the
-            # download so /delete-cached and the cross-workload handoff
-            # helpers can see a multi-GB pending load: previously they
-            # only consulted ``model_identifier``, which the success
-            # path sets later (see "Set identifier early" below). That
-            # left a window where the user could rmtree the cache the
-            # download was still writing to, or start /images/load
-            # while llama-server was about to come up on the same GPU.
-            # Cleared in ``finally`` so failed / cancelled loads do not
-            # leak the pending state.
-            self._loading_model_identifier = model_identifier
-            try:
-                if hf_repo:
-                    with _hf_offline_if_dns_dead():
-                        model_path = self._download_gguf(
+            if hf_repo:
+                with _hf_offline_if_dns_dead():
+                    model_path = self._download_gguf(
+                        hf_repo = hf_repo,
+                        hf_variant = hf_variant,
+                        hf_token = hf_token,
+                    )
+                    # Auto-download mmproj for vision models
+                    if is_vision and not mmproj_path:
+                        mmproj_path = self._download_mmproj(
                             hf_repo = hf_repo,
-                            hf_variant = hf_variant,
                             hf_token = hf_token,
                         )
-                        # Auto-download mmproj for vision models
-                        if is_vision and not mmproj_path:
-                            mmproj_path = self._download_mmproj(
-                                hf_repo = hf_repo,
-                                hf_token = hf_token,
-                            )
-                elif gguf_path:
-                    if not Path(gguf_path).is_file():
-                        raise FileNotFoundError(f"GGUF file not found: {gguf_path}")
-                    model_path = gguf_path
-                else:
-                    raise ValueError("Either gguf_path or hf_repo must be provided")
-            finally:
-                self._loading_model_identifier = None
+            elif gguf_path:
+                if not Path(gguf_path).is_file():
+                    raise FileNotFoundError(f"GGUF file not found: {gguf_path}")
+                model_path = gguf_path
+            else:
+                raise ValueError("Either gguf_path or hf_repo must be provided")
 
             # Set identifier early so _read_gguf_metadata can use it for DeepSeek detection
             self._model_identifier = model_identifier
diff --git a/studio/backend/routes/export.py b/studio/backend/routes/export.py
index 11c5004ced..d157d7d9f8 100644
--- a/studio/backend/routes/export.py
+++ b/studio/backend/routes/export.py
@@ -107,6 +107,42 @@ async def load_checkpoint(
                     ),
                 )
 
+        backend = get_export_backend()
+        # Refuse to reload the export checkpoint while an export job
+        # is still running. ``ExportBackend.load_checkpoint`` would
+        # terminate the running subprocess in order to spawn a new
+        # one, silently corrupting the partial output the user is
+        # waiting on (round 13 P1 #1). Runs BEFORE the chat /
+        # diffusion unloads below: a 409 from this guard must not
+        # leave the user's chat or diffusion GPU owners freed for
+        # nothing (round 14 P1 #1). ``is_export_active`` may be
+        # absent on older / mocked backends; treat missing as "no
+        # async-job tracker available" and skip rather than
+        # fail-closed.
+        is_export_active_fn = getattr(backend, "is_export_active", None)
+        if is_export_active_fn is not None:
+            try:
+                export_is_active = bool(is_export_active_fn())
+            except Exception as e:
+                logger.warning(
+                    "Could not verify export status before export load: %s", e
+                )
+                raise HTTPException(
+                    status_code = 503,
+                    detail = (
+                        "Could not verify export status before loading "
+                        "an export checkpoint. Try again."
+                    ),
+                ) from e
+            if export_is_active:
+                raise HTTPException(
+                    status_code = 409,
+                    detail = (
+                        "An export job is currently active. Stop the "
+                        "export job before loading another checkpoint."
+                    ),
+                )
+
         # Free GPU memory: shut down any chat backend before loading
         # the export checkpoint. Routes the unload through the shared
         # helper so we cover llama-server is_active=True and
@@ -141,41 +177,6 @@ async def load_checkpoint(
         except Exception as e:
             logger.debug("diffusion unload skipped for export: %s", e)
 
-        backend = get_export_backend()
-        # Refuse to reload the export checkpoint while an export job
-        # is still running. ``ExportBackend.load_checkpoint`` would
-        # terminate the running subprocess in order to spawn a new
-        # one, silently corrupting the partial output the user is
-        # waiting on (round 13 P1 #1). Mirrors the symmetric guards
-        # already in place for chat / diffusion / training handoffs.
-        # ``is_export_active`` may be absent on older / mocked
-        # backends -- treat missing as "no async-job tracker
-        # available" -> skip rather than fail-closed; the
-        # surrounding chat / diffusion unloads have already run.
-        is_export_active_fn = getattr(backend, "is_export_active", None)
-        if is_export_active_fn is not None:
-            try:
-                export_is_active = bool(is_export_active_fn())
-            except Exception as e:
-                logger.warning(
-                    "Could not verify export status before export load: %s", e
-                )
-                raise HTTPException(
-                    status_code = 503,
-                    detail = (
-                        "Could not verify export status before loading "
-                        "an export checkpoint. Try again."
-                    ),
-                ) from e
-            if export_is_active:
-                raise HTTPException(
-                    status_code = 409,
-                    detail = (
-                        "An export job is currently active. Stop the "
-                        "export job before loading another checkpoint."
-                    ),
-                )
-
         # load_checkpoint spawns and waits on a subprocess and can take
         # minutes. Run it in a worker thread so the event loop stays
         # free to serve the live log SSE stream concurrently.
diff --git a/studio/backend/routes/models.py b/studio/backend/routes/models.py
index a573f88233..ecf88f7a86 100644
--- a/studio/backend/routes/models.py
+++ b/studio/backend/routes/models.py
@@ -1940,6 +1940,27 @@ async def delete_finetuned_model(
         from routes.inference import get_llama_cpp_backend
 
         llama_backend = get_llama_cpp_backend()
+        # Pending HF GGUF download targeting this path: round 14 P1 #3.
+        # ``loading_model_identifier`` is set before the download starts
+        # and cleared after the subprocess settles, so the user cannot
+        # rmtree the directory llama.cpp is writing into mid-flight.
+        loading_identifier = getattr(llama_backend, "loading_model_identifier", None)
+        if (
+            loading_identifier
+            and _loaded_model_matches_deleted_path(
+                loading_identifier,
+                target_path,
+            )
+            and (
+                not gguf_variant
+                or not getattr(llama_backend, "hf_variant", None)
+                or llama_backend.hf_variant.lower() == gguf_variant.lower()
+            )
+        ):
+            raise HTTPException(
+                status_code = 409,
+                detail = "Cannot delete a model while it is loading",
+            )
         if (
             llama_backend.is_active
             and not llama_backend.is_loaded
@@ -2745,14 +2766,24 @@ async def delete_cached_model(
         # Exact match only (case-insensitive). Prefix match would
         # block deleting unrelated ``org/model`` while
         # ``org/model-v2`` is loaded -- same surface the diffusion
-        # guard fixed in round 5.
+        # guard fixed in round 5. Per-variant deletes that target a
+        # DIFFERENT quant than the loaded one are allowed so the
+        # llama and diffusion paths stay symmetric (round 14 P1 #7).
         if loaded_id == needle and (
             llama_backend.is_loaded or getattr(llama_backend, "is_active", False)
         ):
-            raise HTTPException(
-                status_code = 400,
-                detail = "Unload the model before deleting",
+            loaded_variant = (getattr(llama_backend, "hf_variant", None) or "").lower()
+            requested_variant = (variant or "").lower()
+            same_variant = (
+                not requested_variant
+                or not loaded_variant
+                or requested_variant == loaded_variant
             )
+            if same_variant:
+                raise HTTPException(
+                    status_code = 400,
+                    detail = "Unload the model before deleting",
+                )
     except HTTPException:
         raise
     except Exception as e:
diff --git a/studio/backend/tests/test_diffusion_backend.py b/studio/backend/tests/test_diffusion_backend.py
index 43918fb17e..f642ff2cbd 100644
--- a/studio/backend/tests/test_diffusion_backend.py
+++ b/studio/backend/tests/test_diffusion_backend.py
@@ -1359,6 +1359,120 @@ def _fake_unlocked(**kwargs):
     }
 
 
+@pytest.mark.parametrize(
+    "repo_id",
+    [
+        "unsloth/Qwen_Image-Edit-GGUF",
+        "unsloth/Qwen-Image_Edit-GGUF",
+        "unsloth/Qwen-ImageEdit-GGUF",
+        "unsloth/qwen-image_edit-2509-GGUF",
+        "unsloth/Qwen.Image.Edit-GGUF",
+    ],
+)
+def test_detect_family_qwen_image_edit_mixed_separators(repo_id):
+    """Round 14 P2 #8: every spelling of Qwen-Image-Edit must NOT
+    match the base Qwen-Image text-to-image family."""
+    from core.inference.diffusion import detect_family
+
+    assert detect_family(repo_id) is None
+
+
+def test_redact_hf_tokens_removes_url_embedded_token():
+    """Round 14 P2 #9: tokens embedded in user-supplied paths /
+    URLs must be scrubbed before logging."""
+    from core.inference.diffusion import _redact_hf_tokens
+
+    leaky = "https://hf_abcdefghij0123456789@huggingface.co/unsloth/FLUX.2-klein-4B-GGUF"
+    redacted = _redact_hf_tokens(leaky)
+    assert "hf_" not in redacted
+    assert "<redacted>" in redacted
+    # Non-strings pass through unchanged so the helper is safe in
+    # logger argument lists where families / dtypes mix in.
+    assert _redact_hf_tokens(None) is None
+    assert _redact_hf_tokens(42) == 42
+
+
+def test_status_preserves_active_gguf_subdir(monkeypatch):
+    """Round 14 P1 #4: status() must surface the original caller-
+    supplied gguf_filename (``BF16/model.gguf``) in the guard-facing
+    ``active_gguf_filename`` field, not the collapsed basename."""
+    import core.inference.diffusion as d
+
+    backend = d.DiffusionBackend()
+    backend._pipe = object()
+    backend._repo_id = "unsloth/FLUX.2-klein-4B-GGUF"
+    backend._gguf_path = "/cache/models/unsloth/FLUX.2-klein-4B-GGUF/BF16/model.gguf"
+    backend._gguf_filename = "BF16/model.gguf"
+    backend._family = d.DiffusionFamily(
+        name = "flux.2-klein",
+        pipeline_class = "Flux2KleinPipeline",
+        transformer_class = "Flux2Transformer2DModel",
+        base_repo = "black-forest-labs/FLUX.2-klein-4B",
+        aliases = (),
+    )
+
+    s = backend.status()
+    assert s["active_gguf_filename"] == "BF16/model.gguf"
+    # UI-facing field still collapses to the basename.
+    assert s["gguf_filename"] == "model.gguf"
+
+
+def test_generator_uses_cpu_when_cpu_offload_enabled(monkeypatch):
+    """Round 14 P1 #6: seeded CUDA generation must NOT create a
+    CUDA torch.Generator when the pipeline was loaded with CPU
+    offload enabled, otherwise it crashes mid-forward."""
+    import core.inference.diffusion as d
+
+    backend = d.DiffusionBackend()
+
+    class _FakePipe:
+        def __init__(self):
+            self.last_kwargs = None
+
+        def __call__(self, **kwargs):
+            self.last_kwargs = kwargs
+            from PIL import Image
+
+            return SimpleNamespace(images = [Image.new("RGB", (8, 8))])
+
+    fake_pipe = _FakePipe()
+    backend._pipe = fake_pipe
+    backend._device = "cuda"
+    backend._cpu_offload_enabled = True
+
+    captured_devices: list[str] = []
+
+    class _FakeGenerator:
+        def __init__(self, device):
+            captured_devices.append(device)
+
+        def manual_seed(self, seed):
+            return self
+
+    class _FakeTorchCuda:
+        @staticmethod
+        def is_available():
+            return True
+
+    fake_torch = SimpleNamespace(
+        Generator = _FakeGenerator, cuda = _FakeTorchCuda
+    )
+    monkeypatch.setitem(sys.modules, "torch", fake_torch)
+
+    backend._generate_image_unlocked(prompt = "x", seed = 7, width = 8, height = 8)
+    assert captured_devices == ["cpu"]
+
+
+def test_smart_base_repo_uses_windows_leaf_only_already_set_separator_round14():
+    """Sanity: relative paths still work after the Windows fix."""
+    from core.inference.diffusion import _smart_base_repo, detect_family
+
+    repo = "owner/FLUX.2-klein-9B-GGUF"
+    fam = detect_family(repo)
+    assert fam is not None
+    assert _smart_base_repo(fam, repo) == "black-forest-labs/FLUX.2-klein-9B"
+
+
 def test_generate_image_with_metadata_blocks_concurrent_unload(monkeypatch):
     """Round 13 P2 #9: _generate_lock serialises the forward AND the
     meta snapshot, so a queued unload cannot wipe state in between."""
diff --git a/studio/backend/tests/test_diffusion_routes.py b/studio/backend/tests/test_diffusion_routes.py
index d3659b89ee..2818576659 100644
--- a/studio/backend/tests/test_diffusion_routes.py
+++ b/studio/backend/tests/test_diffusion_routes.py
@@ -84,6 +84,9 @@ def status(self) -> dict:
             "active_base_repo": (
                 "black-forest-labs/FLUX.2-klein" if self._loaded else None
             ),
+            # Round 14: guard-facing GGUF filename is now the full
+            # caller-supplied value, but this fake never sets one so
+            # both active and pending stay None.
             "active_gguf_filename": None,
             "pending_repo_id": None,
             "pending_base_repo": None,

From a9b3d1a67272694a5e9a5d6c369d62b2c66fabe9 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 06:34:21 +0000
Subject: [PATCH 42/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/diffusion.py     | 4 +---
 studio/backend/tests/test_diffusion_backend.py | 8 ++++----
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 8b40fdfc55..e9c00255b0 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -1278,9 +1278,7 @@ def _release_other_gpu_owners_for_diffusion() -> None:
 
     if getattr(exp, "current_checkpoint", None):
         try:
-            logger.info(
-                "Shutting down idle export subprocess before diffusion load"
-            )
+            logger.info("Shutting down idle export subprocess before diffusion load")
             exp._shutdown_subprocess()
             exp.current_checkpoint = None
             exp.is_vision = False
diff --git a/studio/backend/tests/test_diffusion_backend.py b/studio/backend/tests/test_diffusion_backend.py
index f642ff2cbd..1f7c68856b 100644
--- a/studio/backend/tests/test_diffusion_backend.py
+++ b/studio/backend/tests/test_diffusion_backend.py
@@ -1382,7 +1382,9 @@ def test_redact_hf_tokens_removes_url_embedded_token():
     URLs must be scrubbed before logging."""
     from core.inference.diffusion import _redact_hf_tokens
 
-    leaky = "https://hf_abcdefghij0123456789@huggingface.co/unsloth/FLUX.2-klein-4B-GGUF"
+    leaky = (
+        "https://hf_abcdefghij0123456789@huggingface.co/unsloth/FLUX.2-klein-4B-GGUF"
+    )
     redacted = _redact_hf_tokens(leaky)
     assert "hf_" not in redacted
     assert "<redacted>" in redacted
@@ -1454,9 +1456,7 @@ class _FakeTorchCuda:
         def is_available():
             return True
 
-    fake_torch = SimpleNamespace(
-        Generator = _FakeGenerator, cuda = _FakeTorchCuda
-    )
+    fake_torch = SimpleNamespace(Generator = _FakeGenerator, cuda = _FakeTorchCuda)
     monkeypatch.setitem(sys.modules, "torch", fake_torch)
 
     backend._generate_image_unlocked(prompt = "x", seed = 7, width = 8, height = 8)

From 59aa75b8ffc303f470af270dedfe0517bff56e52 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 07:00:20 +0000
Subject: [PATCH 43/92] Fix/adjust diffusion: round 15 P1+P2+P3 batch for PR
 #5754

Round 15 reviewer aggregate (logs/review_round15_aggregate.md):

P1 fixes:
- core/inference/llama_cpp.py publishes loading_model_identifier +
  loading_hf_variant AFTER acquiring _serial_load_lock; previously
  a queued second load could overwrite or clear the identifier
  currently in flight, breaking delete-safety and GPU handoff guards.
- routes/models.py /delete-finetuned compares the pending llama
  load against loading_hf_variant (new), not the stale hf_variant
  from the previously loaded model. Without this, a directory with
  Q4 loaded and Q8 still loading would accept a Q8 delete.
- core/inference/diffusion.py _release_other_gpu_owners_for_diffusion
  now also raises when training is active so direct backend callers
  cannot bypass the route layer's 409 guard. Mirrors the
  export-active check the same helper already enforces.
- routes/models.py /delete-cached diffusion guard compares owned
  diffusion paths against the HF cache root for the target repo
  via _all_hf_cache_scans + _is_path_under. Without this, loading
  from a local models--owner--model/snapshots/<sha> path let the
  cache delete proceed while the snapshot was still mmap'd.
- models/inference.py DiffusionLoadRequest refuses URL-embedded
  hf_xxxxx tokens in repo_id / base_repo at the API boundary, so
  the value never reaches self._repo_id and status() can never
  echo it back to other authenticated sessions.

P2 fixes:
- core/inference/diffusion.py status() routes UI-facing repo_id /
  base_repo through _display_repo_id, which collapses absolute
  local paths to the leaf name (delete guards still see the full
  path via active_*/pending_*).
- routes/inference.py /images/load maps a backend RuntimeError that
  reports an export/training conflict to HTTP 409 instead of 400.
- core/inference/diffusion.py detect_family now uses token-boundary
  matching so owner/flux.20-model does not collide with flux.2.

P3 fixes:
- tests/test_diffusion_routes.py drops the partial routes.inference
  module from sys.modules if exec_module() raises, so the real
  ImportError surfaces instead of a misleading AttributeError on
  follow-up tests.

Tests:
- 5 new regression cases (display_repo_id, token-boundary family
  detection, training-active raise from backend helper, embedded HF
  token rejection).
- All 72 diffusion backend + route tests pass.
---
 studio/backend/core/inference/diffusion.py    |  90 ++++++++++++++-
 studio/backend/core/inference/llama_cpp.py    | 105 +++++++++++-------
 studio/backend/models/inference.py            |  31 ++++++
 studio/backend/routes/inference.py            |  16 ++-
 studio/backend/routes/models.py               |  55 ++++++++-
 .../backend/tests/test_diffusion_backend.py   |  56 ++++++++++
 studio/backend/tests/test_diffusion_routes.py |  38 ++++++-
 7 files changed, 341 insertions(+), 50 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index e9c00255b0..ab453faab3 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -201,6 +201,29 @@ def _expand_existing_local_path(value: str) -> str:
     return value
 
 
+def _display_repo_id(value: Any) -> Any:
+    """Return a public-facing label for a repo_id / base_repo.
+
+    For Hub-style identifiers (``owner/repo``) the value passes
+    through unchanged so the Images panel and result figcaption
+    stay informative. Absolute local paths (``/home/me/exports/...``
+    or ``C:\\Users\\...``) collapse to the leaf name so
+    ``/images/status`` does not leak the user's filesystem layout
+    to other authenticated browser sessions (round 15 P2 #6). HF
+    tokens are scrubbed defensively in case they slipped past the
+    request-side validator.
+    """
+    if not isinstance(value, str) or not value:
+        return value
+    try:
+        candidate = Path(value).expanduser()
+        if candidate.is_absolute() or candidate.exists():
+            return candidate.name or value
+    except (OSError, ValueError):
+        pass
+    return _redact_hf_tokens(value)
+
+
 _HF_TOKEN_RE = re.compile(r"hf_[A-Za-z0-9]{20,}")
 
 
@@ -334,6 +357,32 @@ def detect_family(
     # P2 #8).
     needle_norm = re.sub(r"[^a-z0-9]+", "-", needle).strip("-")
     needle_compact = re.sub(r"[^a-z0-9]+", "", needle)
+
+    def _matches_family_token(term: str) -> bool:
+        """Token-boundary match on the normalised needle. Prevents
+        ``owner/flux.20-model`` from matching ``flux.2`` because
+        ``flux.20`` does not have a separator after ``flux-2``
+        (round 15 P2 #8). Falls back to compact equality so aliases
+        like ``qwenimage`` still match ``unsloth/QwenImage-GGUF``."""
+        term_norm = re.sub(r"[^a-z0-9]+", "-", term.lower()).strip("-")
+        if not term_norm:
+            return False
+        if re.search(rf"(^|-){re.escape(term_norm)}($|-)", needle_norm):
+            return True
+        term_compact = re.sub(r"[^a-z0-9]+", "", term.lower())
+        if term_compact and term_compact in needle_compact:
+            # Compact contiguous match: ``qwenimage`` in
+            # ``qwenimage-gguf`` -> qwenimage-compact in needle_compact.
+            # Use word boundary on the compact form too: the compact
+            # ``flux2`` must not match inside ``flux20``.
+            return bool(
+                re.search(
+                    rf"(^|[^0-9a-z]){re.escape(term_compact)}([^0-9a-z]|$)",
+                    needle_compact,
+                )
+            ) or term_compact == needle_compact
+        return False
+
     # Scan _FAMILIES first (GGUF-supported), then _FULL_REPO_FAMILIES
     # so a repo like ``stabilityai/stable-diffusion-xl-base-1.0`` is
     # auto-detected as SDXL instead of returning None.
@@ -346,10 +395,10 @@ def detect_family(
             for e in excludes
         ):
             continue
-        if fam.name in needle:
+        if _matches_family_token(fam.name):
             return fam
         for alias in fam.aliases:
-            if alias and alias in needle:
+            if alias and _matches_family_token(alias):
                 return fam
     return None
 
@@ -493,13 +542,20 @@ def status(self) -> dict[str, Any]:
             # variants like ``BF16/model.gguf`` (round 14 P1 #4-5).
             ui_gguf = pending_gguf or active_gguf
             ui_gguf_basename = Path(ui_gguf).name if ui_gguf else None
+            # UI-facing ``repo_id`` / ``base_repo`` collapse absolute
+            # local paths to their leaf name so ``/images/status``
+            # does not leak the user's filesystem layout to other
+            # authenticated browser sessions (round 15 P2 #6). The
+            # guard-facing ``active_*`` / ``pending_*`` fields below
+            # preserve the exact value so delete guards still match
+            # against the snapshot path.
             return {
                 "is_loaded": self._pipe is not None,
                 "is_loading": self._loading,
-                "repo_id": pending_repo or active_repo,
+                "repo_id": _display_repo_id(pending_repo or active_repo),
                 "family": ui_family,
                 "pipeline_class": ui_pipeline_class,
-                "base_repo": pending_base or active_base,
+                "base_repo": _display_repo_id(pending_base or active_base),
                 "gguf_filename": ui_gguf_basename,
                 # Guard-facing fields: every repo / path / GGUF
                 # filename the backend owns RIGHT NOW. Delete routes
@@ -1242,6 +1298,32 @@ def _release_other_gpu_owners_for_diffusion() -> None:
     # helper repeats the local check anyway so that direct backend
     # callers (tests, scripts, future routes that forget the
     # higher-level guard) cannot still kill an active export.
+    # Training-active check runs FIRST so direct backend callers
+    # (tests, scripts, future routes) cannot bypass the route layer's
+    # 409 by calling ``load_model`` directly while a training run is
+    # active (round 15 P1 #3). The route layer's
+    # ``_raise_if_training_active`` still runs ahead of the load to
+    # surface the conflict as 409; this helper re-raises so direct
+    # callers see the same RuntimeError the export-active path raises.
+    try:
+        from core.training import get_training_backend  # type: ignore
+    except Exception as exc:
+        logger.debug("training module not importable: %s", exc)
+    else:
+        try:
+            training_active = bool(get_training_backend().is_training_active())
+        except Exception as exc:
+            # Unverifiable status -> fail closed (might be active).
+            raise RuntimeError(
+                "Could not verify training status before loading a "
+                "diffusion image model."
+            ) from exc
+        if training_active:
+            raise RuntimeError(
+                "Training is currently active. Stop the training run "
+                "before loading a diffusion image model."
+            )
+
     try:
         from core.export import get_export_backend  # type: ignore
     except Exception as exc:
diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index e017fb70ed..f3f282893a 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -618,7 +618,12 @@ def __init__(self):
         # ``loading_model_identifier`` so a multi-GB HF download cannot
         # have its cache rmtree'd or be ignored by /images/load,
         # /training/start, /export/load while it is still resolving.
+        # ``_loading_hf_variant`` mirrors the same lifetime so the
+        # per-variant delete guard at routes/models.py:/delete-finetuned
+        # compares against the NEW variant rather than the previous
+        # loaded ``hf_variant`` (round 15 P1 #2).
         self._loading_model_identifier: Optional[str] = None
+        self._loading_hf_variant: Optional[str] = None
         self._gguf_path: Optional[str] = None
         self._hf_repo: Optional[str] = None
         self._hf_variant: Optional[str] = None
@@ -733,6 +738,20 @@ def loading_model_identifier(self) -> Optional[str]:
         concurrent /images/load that thinks llama-server is idle."""
         return self._loading_model_identifier
 
+    @property
+    def loading_hf_variant(self) -> Optional[str]:
+        """``hf_variant`` of the load currently in progress, or None.
+
+        Mirrors ``loading_model_identifier``'s lifetime so the
+        per-variant delete guards (routes/models.py /delete-cached and
+        /delete-finetuned) can compare against the NEW variant rather
+        than the previously-loaded one (round 15 P1 #2). Without this,
+        a directory with Q4 loaded and Q8 loading would still see the
+        stale Q4 ``hf_variant``, and a Q8 delete would be wrongly
+        allowed even though Q8 is being downloaded into the same
+        directory."""
+        return self._loading_hf_variant
+
     @property
     def is_vision(self) -> bool:
         return self._is_vision
@@ -2616,43 +2635,47 @@ def load_model(
 
         Returns True if server started and health check passed.
         """
-        # Publish ``_loading_model_identifier`` BEFORE any phase of
-        # the load can begin and clear it AFTER the load fully settles
-        # (success or failure, including the duplicate-state fast path
-        # and every internal early ``return False``). Round 14 P1 #2:
-        # the prior inline try/finally only wrapped the download, so
-        # /delete-cached and the cross-workload handoff helpers saw
-        # the backend as idle once the GGUF bytes had landed but the
-        # subprocess had not yet spawned. Mark the load as pending
-        # for the entire duration -- download, metadata read,
-        # VRAM settle, process spawn, health check, audio probe.
-        self._loading_model_identifier = model_identifier
-        try:
-            # Serialise the whole load so concurrent /load calls never
-            # leave two llama-server processes alive (#5401 / #5161).
-            # Does not block /unload, /status, /load-progress.
-            return self._load_model_impl(
-                gguf_path = gguf_path,
-                mmproj_path = mmproj_path,
-                hf_repo = hf_repo,
-                hf_variant = hf_variant,
-                hf_token = hf_token,
-                model_identifier = model_identifier,
-                is_vision = is_vision,
-                n_ctx = n_ctx,
-                chat_template_override = chat_template_override,
-                cache_type_kv = cache_type_kv,
-                speculative_type = speculative_type,
-                spec_draft_n_max = spec_draft_n_max,
-                n_threads = n_threads,
-                n_gpu_layers = n_gpu_layers,
-                n_parallel = n_parallel,
-                extra_args = extra_args,
-            )
-        finally:
-            self._loading_model_identifier = None
+        # Serialise the whole load so concurrent /load calls never
+        # leave two llama-server processes alive (#5401 / #5161). Does
+        # not block /unload, /status, /load-progress.
+        #
+        # Publish ``_loading_model_identifier`` + ``_loading_hf_variant``
+        # AFTER acquiring ``_serial_load_lock``. Round 15 P1 #1: the
+        # previous round 14 version set them outside the lock so a
+        # second queued ``load_model`` would overwrite or clear the
+        # identifier of the load currently holding the lock, breaking
+        # the delete-safety and GPU handoff guards. Cleared in
+        # ``finally`` so failure / cancellation leaves the pending
+        # state empty. Round 15 P1 #2 added ``_loading_hf_variant``
+        # so per-variant delete guards can compare against the
+        # NEW variant rather than the previous loaded one.
+        with self._serial_load_lock:
+            self._loading_model_identifier = model_identifier
+            self._loading_hf_variant = hf_variant
+            try:
+                return self._load_model_impl_locked(
+                    gguf_path = gguf_path,
+                    mmproj_path = mmproj_path,
+                    hf_repo = hf_repo,
+                    hf_variant = hf_variant,
+                    hf_token = hf_token,
+                    model_identifier = model_identifier,
+                    is_vision = is_vision,
+                    n_ctx = n_ctx,
+                    chat_template_override = chat_template_override,
+                    cache_type_kv = cache_type_kv,
+                    speculative_type = speculative_type,
+                    spec_draft_n_max = spec_draft_n_max,
+                    n_threads = n_threads,
+                    n_gpu_layers = n_gpu_layers,
+                    n_parallel = n_parallel,
+                    extra_args = extra_args,
+                )
+            finally:
+                self._loading_model_identifier = None
+                self._loading_hf_variant = None
 
-    def _load_model_impl(
+    def _load_model_impl_locked(
         self,
         *,
         gguf_path: Optional[str] = None,
@@ -2672,11 +2695,11 @@ def _load_model_impl(
         n_parallel: int = 1,
         extra_args: Optional[List[str]] = None,
     ) -> bool:
-        """Internal body of ``load_model``. Kept as a separate method
-        so ``load_model`` can wrap it in a single try/finally that
-        publishes ``_loading_model_identifier`` for the WHOLE load
-        instead of only the download window."""
-        with self._serial_load_lock:
+        """Internal body of ``load_model``. The caller is responsible
+        for holding ``_serial_load_lock`` and for publishing /
+        clearing ``_loading_model_identifier`` + ``_loading_hf_variant``
+        in the surrounding try/finally."""
+        if True:
             # Duplicate /load that raced past the route-level check
             # (the first one hadn't published _healthy=True yet). If the
             # live server already satisfies this request, do nothing.
diff --git a/studio/backend/models/inference.py b/studio/backend/models/inference.py
index 5f232c062b..8594b6306e 100644
--- a/studio/backend/models/inference.py
+++ b/studio/backend/models/inference.py
@@ -1450,6 +1450,32 @@ def _no_control_chars(value: Optional[str], field_name: str) -> Optional[str]:
     return value
 
 
+import re as _re
+
+_EMBEDDED_HF_TOKEN_RE = _re.compile(r"hf_[A-Za-z0-9]{20,}")
+
+
+def _reject_embedded_hf_token(
+    value: Optional[str], field_name: str
+) -> Optional[str]:
+    """Refuse identifiers that contain an embedded ``hf_xxx`` token.
+
+    Round 15 P1 #5: ``repo_id`` and ``base_repo`` accept URL-style
+    strings (``https://hf_token@huggingface.co/owner/repo``). The
+    token would otherwise be stored in ``self._repo_id`` and echoed
+    back through ``status()`` to every authenticated browser session.
+    Log redaction (``_redact_hf_tokens``) covers the logger sink, but
+    the public status payload also needed to refuse the input. Use
+    the dedicated ``hf_token`` field for authentication.
+    """
+    if value is not None and _EMBEDDED_HF_TOKEN_RE.search(value):
+        raise ValueError(
+            f"{field_name} must not embed a Hugging Face token; "
+            "pass it via the dedicated hf_token field instead."
+        )
+    return value
+
+
 class DiffusionLoadRequest(BaseModel):
     """Load a diffusion image-generation model.
 
@@ -1495,6 +1521,11 @@ class DiffusionLoadRequest(BaseModel):
     def _no_control_chars(cls, v, info):
         return _no_control_chars(v, info.field_name)
 
+    @field_validator("repo_id", "base_repo")
+    @classmethod
+    def _no_embedded_hf_tokens(cls, v, info):
+        return _reject_embedded_hf_token(v, info.field_name)
+
 
 # torch.Generator.manual_seed packs into signed int64; values outside
 # [-2**63, 2**63 - 1] raise ``Overflow when unpacking long long`` deep
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index d376a289e1..41fc87c424 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -1942,7 +1942,21 @@ async def diffusion_load(
         )
         return JSONResponse(content = status)
     except RuntimeError as exc:
-        raise HTTPException(status_code = 400, detail = str(exc))
+        # Round 15 P2 #7: if a training run / export job starts
+        # between the route-level pre-check and the backend worker,
+        # ``_release_other_gpu_owners_for_diffusion`` raises a
+        # RuntimeError that should surface as a 409 conflict (the
+        # same status the route layer returns), not 400. Match the
+        # known conflict strings the backend raises.
+        detail = str(exc)
+        if (
+            "export job is currently active" in detail
+            or "Training is currently active" in detail
+            or "Could not verify training status" in detail
+            or "Could not verify export status" in detail
+        ):
+            raise HTTPException(status_code = 409, detail = detail) from exc
+        raise HTTPException(status_code = 400, detail = detail) from exc
     except Exception as exc:
         logger.exception("Diffusion load failed")
         raise HTTPException(status_code = 500, detail = str(exc))
diff --git a/studio/backend/routes/models.py b/studio/backend/routes/models.py
index ecf88f7a86..10e6e26822 100644
--- a/studio/backend/routes/models.py
+++ b/studio/backend/routes/models.py
@@ -1944,7 +1944,12 @@ async def delete_finetuned_model(
         # ``loading_model_identifier`` is set before the download starts
         # and cleared after the subprocess settles, so the user cannot
         # rmtree the directory llama.cpp is writing into mid-flight.
+        # Round 15 P1 #2: compare against ``loading_hf_variant`` (the
+        # variant being downloaded) rather than ``hf_variant`` (the
+        # PREVIOUS loaded variant, which is stale until the new load
+        # completes its late-metadata update).
         loading_identifier = getattr(llama_backend, "loading_model_identifier", None)
+        loading_variant = getattr(llama_backend, "loading_hf_variant", None)
         if (
             loading_identifier
             and _loaded_model_matches_deleted_path(
@@ -1953,8 +1958,8 @@ async def delete_finetuned_model(
             )
             and (
                 not gguf_variant
-                or not getattr(llama_backend, "hf_variant", None)
-                or llama_backend.hf_variant.lower() == gguf_variant.lower()
+                or not loading_variant
+                or loading_variant.lower() == gguf_variant.lower()
             )
         ):
             raise HTTPException(
@@ -2852,6 +2857,33 @@ async def delete_cached_model(
         diff_status = diff_backend.status()
         if diff_status.get("is_loaded") or diff_status.get("is_loading"):
             needle = repo_id.lower()
+            # Round 15 P1 #4: ALSO compare owned paths against the HF
+            # cache root for this repo. The user may have loaded the
+            # diffusion model from a local snapshot path under
+            # ``models--owner--model/snapshots/<sha>``; the string
+            # ``owner/model`` then never appears in ``owned_id`` and
+            # the previous string-only check would let the cache
+            # delete proceed while the snapshot was still mmap'd.
+            cache_repo_roots: list[Path] = []
+            try:
+                for hf_cache in _all_hf_cache_scans():
+                    for repo_info in hf_cache.repos:
+                        if (
+                            repo_info.repo_type == "model"
+                            and repo_info.repo_id.lower() == needle
+                        ):
+                            try:
+                                cache_repo_roots.append(
+                                    Path(repo_info.repo_path).expanduser().resolve()
+                                )
+                            except Exception:
+                                pass
+            except Exception as cache_scan_exc:
+                logger.debug(
+                    "HF cache scan failed during diffusion delete guard: %s",
+                    cache_scan_exc,
+                )
+
             # Pair each owned repo with the GGUF variant it actually
             # owns (active or pending) so a swap in progress does not
             # collapse both quants into the pending one (round 13
@@ -2859,7 +2891,24 @@ async def delete_cached_model(
             # requested variant differs from the variant that owns
             # the matched repo.
             for owned_id, owned_gguf in _diffusion_owned_targets(diff_status):
-                if not owned_id or owned_id.lower() != needle:
+                if not owned_id:
+                    continue
+                owned_matches_repo = owned_id.lower() == needle
+                if not owned_matches_repo and cache_repo_roots:
+                    try:
+                        owned_path = Path(owned_id).expanduser().resolve()
+                    except Exception:
+                        owned_path = None
+                    if owned_path is not None:
+                        for repo_root in cache_repo_roots:
+                            if (
+                                owned_path == repo_root
+                                or _is_path_under(owned_path, repo_root)
+                                or _is_path_under(repo_root, owned_path)
+                            ):
+                                owned_matches_repo = True
+                                break
+                if not owned_matches_repo:
                     continue
                 if _variant_delete_is_safe_for_owned_gguf(variant, owned_gguf):
                     continue
diff --git a/studio/backend/tests/test_diffusion_backend.py b/studio/backend/tests/test_diffusion_backend.py
index 1f7c68856b..f9ab1b8968 100644
--- a/studio/backend/tests/test_diffusion_backend.py
+++ b/studio/backend/tests/test_diffusion_backend.py
@@ -1473,6 +1473,62 @@ def test_smart_base_repo_uses_windows_leaf_only_already_set_separator_round14():
     assert _smart_base_repo(fam, repo) == "black-forest-labs/FLUX.2-klein-9B"
 
 
+def test_display_repo_id_collapses_absolute_path():
+    """Round 15 P2 #6: absolute local paths must NOT leak through
+    status(). Hub-style repo ids pass through unchanged."""
+    from core.inference.diffusion import _display_repo_id
+
+    # Hub id passes through.
+    assert (
+        _display_repo_id("black-forest-labs/FLUX.2-klein-4B")
+        == "black-forest-labs/FLUX.2-klein-4B"
+    )
+    # Absolute local path collapses to leaf.
+    assert _display_repo_id("/home/alice/exports/private-flux") == "private-flux"
+    # HF tokens are scrubbed defensively.
+    leaky = "https://hf_abcdefghij0123456789@huggingface.co/owner/repo"
+    out = _display_repo_id(leaky)
+    assert "hf_" not in out
+
+
+def test_detect_family_rejects_substring_collisions():
+    """Round 15 P2 #8: ``flux.20-model`` must NOT match ``flux.2``."""
+    from core.inference.diffusion import detect_family
+
+    # ``flux.20`` is a different number and must not collide with ``flux.2``.
+    assert detect_family("owner/flux.20-model") is None
+    # ``stable-diffusion-30`` must not match ``stable-diffusion-3``.
+    assert detect_family("foo/stable-diffusion-30") is None
+    # Legitimate ``flux.2`` still matches.
+    fam = detect_family("black-forest-labs/FLUX.2-dev")
+    assert fam is not None and fam.name == "flux.2"
+
+
+def test_release_other_gpu_owners_raises_on_active_training(monkeypatch):
+    """Round 15 P1 #3: direct backend callers must not bypass the
+    route layer's training-active 409 guard."""
+    import core.inference.diffusion as d
+
+    fake_training_mod = types.ModuleType("core.training")
+    fake_training_mod.get_training_backend = lambda: SimpleNamespace(
+        is_training_active = lambda: True
+    )
+    monkeypatch.setitem(sys.modules, "core.training", fake_training_mod)
+
+    # Ensure export module import does not fail the test before the
+    # training raise lands.
+    fake_export_mod = types.ModuleType("core.export")
+    fake_export_mod.get_export_backend = lambda: SimpleNamespace(
+        is_export_active = lambda: False,
+        current_checkpoint = None,
+    )
+    monkeypatch.setitem(sys.modules, "core.export", fake_export_mod)
+
+    with pytest.raises(RuntimeError) as exc_info:
+        d._release_other_gpu_owners_for_diffusion()
+    assert "Training is currently active" in str(exc_info.value)
+
+
 def test_generate_image_with_metadata_blocks_concurrent_unload(monkeypatch):
     """Round 13 P2 #9: _generate_lock serialises the forward AND the
     meta snapshot, so a queued unload cannot wipe state in between."""
diff --git a/studio/backend/tests/test_diffusion_routes.py b/studio/backend/tests/test_diffusion_routes.py
index 2818576659..41869d4044 100644
--- a/studio/backend/tests/test_diffusion_routes.py
+++ b/studio/backend/tests/test_diffusion_routes.py
@@ -57,7 +57,15 @@ def _import_inference_module():
     assert spec and spec.loader, "could not build spec for routes/inference.py"
     module = importlib.util.module_from_spec(spec)
     sys.modules["routes.inference"] = module
-    spec.loader.exec_module(module)
+    # Round 15 P3 #9: drop the half-initialised module from
+    # sys.modules if exec_module() raises, otherwise later tests pick
+    # up the poisoned entry and report a misleading AttributeError
+    # instead of the original ImportError.
+    try:
+        spec.loader.exec_module(module)
+    except Exception:
+        sys.modules.pop("routes.inference", None)
+        raise
     return module
 
 
@@ -249,6 +257,34 @@ def test_unload_clears_state(app_with_stub):
     assert r.json()["is_loaded"] is False
 
 
+def test_load_rejects_embedded_hf_token(app_with_stub):
+    """Round 15 P1 #5: URL-embedded ``hf_xxxxx`` tokens in repo_id /
+    base_repo must be rejected with 422 so they never reach
+    ``self._repo_id`` and get echoed back by ``status()``."""
+    app, _ = app_with_stub
+    c = TestClient(app)
+    r = c.post(
+        "/api/inference/images/load",
+        json = {
+            "repo_id": "https://hf_abcdefghij0123456789@huggingface.co/owner/repo",
+        },
+    )
+    assert r.status_code == 422, r.text
+    body = r.json()
+    text = repr(body).lower()
+    assert "hf_token" in text or "embed" in text
+    # base_repo is also rejected.
+    r = c.post(
+        "/api/inference/images/load",
+        json = {
+            "repo_id": "owner/repo",
+            "gguf_filename": "x.gguf",
+            "base_repo": "https://hf_abcdefghij0123456789@huggingface.co/base/repo",
+        },
+    )
+    assert r.status_code == 422, r.text
+
+
 def test_load_rejects_control_chars_in_repo_id(app_with_stub):
     """Newline-laden repo ids must be rejected by Pydantic BEFORE the
     log line that echoes them. Catches log-injection from authenticated

From 2f9bb6929e08c1f470c163e4aaf42bea1552d13e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 07:01:29 +0000
Subject: [PATCH 44/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/diffusion.py | 13 ++++++++-----
 studio/backend/models/inference.py         |  4 +---
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index ab453faab3..195ded40a6 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -375,12 +375,15 @@ def _matches_family_token(term: str) -> bool:
             # ``qwenimage-gguf`` -> qwenimage-compact in needle_compact.
             # Use word boundary on the compact form too: the compact
             # ``flux2`` must not match inside ``flux20``.
-            return bool(
-                re.search(
-                    rf"(^|[^0-9a-z]){re.escape(term_compact)}([^0-9a-z]|$)",
-                    needle_compact,
+            return (
+                bool(
+                    re.search(
+                        rf"(^|[^0-9a-z]){re.escape(term_compact)}([^0-9a-z]|$)",
+                        needle_compact,
+                    )
                 )
-            ) or term_compact == needle_compact
+                or term_compact == needle_compact
+            )
         return False
 
     # Scan _FAMILIES first (GGUF-supported), then _FULL_REPO_FAMILIES
diff --git a/studio/backend/models/inference.py b/studio/backend/models/inference.py
index 8594b6306e..2dbd6bfc40 100644
--- a/studio/backend/models/inference.py
+++ b/studio/backend/models/inference.py
@@ -1455,9 +1455,7 @@ def _no_control_chars(value: Optional[str], field_name: str) -> Optional[str]:
 _EMBEDDED_HF_TOKEN_RE = _re.compile(r"hf_[A-Za-z0-9]{20,}")
 
 
-def _reject_embedded_hf_token(
-    value: Optional[str], field_name: str
-) -> Optional[str]:
+def _reject_embedded_hf_token(value: Optional[str], field_name: str) -> Optional[str]:
     """Refuse identifiers that contain an embedded ``hf_xxx`` token.
 
     Round 15 P1 #5: ``repo_id`` and ``base_repo`` accept URL-style

From 05184ad15ad041873bbf7c81f8aca6eae1d0298e Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 07:15:10 +0000
Subject: [PATCH 45/92] Fix llama_cpp source-inspection tests for split
 load_model for PR #5754

Round 15 split LlamaCppBackend.load_model into a thin wrapper that
publishes _loading_model_identifier + _loading_hf_variant under
_serial_load_lock and an inner _load_model_impl_locked body that
actually launches llama-server. The pre-existing source-inspection
regression tests inspected only load_model and broke because the
flag literals and _wait_for_vram_settle call now live in the inner
method:

- tests/test_llama_cpp_no_context_shift.py
  test_no_context_shift_is_in_load_model
  test_flag_sits_inside_the_base_cmd_list
- tests/test_llama_cpp_wait_for_vram_settle.py
  test_load_model_calls_helper_outside_lock_and_uses_last_kill_timestamp

Update both helpers to concatenate the source of load_model AND
_load_model_impl_locked so the assertions still cover the launch
path without widening their scope to the full module.
---
 .../tests/test_llama_cpp_no_context_shift.py  | 27 +++++++++++++------
 .../test_llama_cpp_wait_for_vram_settle.py    | 11 +++++++-
 2 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/studio/backend/tests/test_llama_cpp_no_context_shift.py b/studio/backend/tests/test_llama_cpp_no_context_shift.py
index b9f25faf88..97c48cb033 100644
--- a/studio/backend/tests/test_llama_cpp_no_context_shift.py
+++ b/studio/backend/tests/test_llama_cpp_no_context_shift.py
@@ -66,15 +66,26 @@
 
 
 def _load_model_source() -> str:
-    """Return the source of ``LlamaCppBackend.load_model``.
-
-    Using ``inspect.getsource`` instead of reading the file directly
-    scopes the assertions to the function that actually launches
-    llama-server, so neither the presence check nor the location check
-    can be fooled by a stray occurrence of ``"--no-context-shift"``
-    elsewhere in the module.
+    """Return the source of ``LlamaCppBackend.load_model`` PLUS the
+    internal ``_load_model_impl_locked`` body it delegates to.
+
+    Studio's diffusion PR split ``load_model`` into a thin wrapper
+    that publishes ``_loading_model_identifier`` under
+    ``_serial_load_lock`` and an inner ``_load_model_impl_locked``
+    body that actually spawns llama-server. The launch flags and the
+    ``_wait_for_vram_settle`` call now live in the inner method, so
+    inspecting only ``load_model`` would miss them. Concatenating the
+    two sources keeps these source-inspection regression tests
+    working without weakening the scope (we still only look at the
+    two load entry points, not the entire module).
     """
-    return inspect.getsource(llama_cpp_module.LlamaCppBackend.load_model)
+    parts = [inspect.getsource(llama_cpp_module.LlamaCppBackend.load_model)]
+    impl = getattr(
+        llama_cpp_module.LlamaCppBackend, "_load_model_impl_locked", None
+    )
+    if impl is not None:
+        parts.append(inspect.getsource(impl))
+    return "\n".join(parts)
 
 
 def test_no_context_shift_is_in_load_model():
diff --git a/studio/backend/tests/test_llama_cpp_wait_for_vram_settle.py b/studio/backend/tests/test_llama_cpp_wait_for_vram_settle.py
index 00295d6283..5735ed67bc 100644
--- a/studio/backend/tests/test_llama_cpp_wait_for_vram_settle.py
+++ b/studio/backend/tests/test_llama_cpp_wait_for_vram_settle.py
@@ -271,10 +271,19 @@ def test_load_model_calls_helper_outside_lock_and_uses_last_kill_timestamp():
     """Pin the call site: outside Phase 3 lock, gated on the timestamp,
     no ``had_live_process`` in-band flag regression. Mirrors the
     ``inspect.getsource`` pattern from ``test_llama_cpp_no_context_shift``.
+
+    Studio's diffusion PR split ``load_model`` into a thin wrapper +
+    ``_load_model_impl_locked`` that actually launches llama-server, so
+    look at both sources to keep the assertions scoped to the load entry
+    points and not the entire module.
     """
     import inspect
 
-    src = inspect.getsource(LlamaCppBackend.load_model)
+    parts = [inspect.getsource(LlamaCppBackend.load_model)]
+    impl = getattr(LlamaCppBackend, "_load_model_impl_locked", None)
+    if impl is not None:
+        parts.append(inspect.getsource(impl))
+    src = "\n".join(parts)
     assert "_wait_for_vram_settle" in src
     assert "since_kill" in src
     assert "self._last_kill_monotonic" in src

From 7c8f1eb40ddcdaad35b4e3142865ffa7c2ae48e4 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 07:15:28 +0000
Subject: [PATCH 46/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/tests/test_llama_cpp_no_context_shift.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/studio/backend/tests/test_llama_cpp_no_context_shift.py b/studio/backend/tests/test_llama_cpp_no_context_shift.py
index 97c48cb033..c49900812e 100644
--- a/studio/backend/tests/test_llama_cpp_no_context_shift.py
+++ b/studio/backend/tests/test_llama_cpp_no_context_shift.py
@@ -80,9 +80,7 @@ def _load_model_source() -> str:
     two load entry points, not the entire module).
     """
     parts = [inspect.getsource(llama_cpp_module.LlamaCppBackend.load_model)]
-    impl = getattr(
-        llama_cpp_module.LlamaCppBackend, "_load_model_impl_locked", None
-    )
+    impl = getattr(llama_cpp_module.LlamaCppBackend, "_load_model_impl_locked", None)
     if impl is not None:
         parts.append(inspect.getsource(impl))
     return "\n".join(parts)

From 2ef9b0e09fc0f3b73f8c7acd721f9809348b7737 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 07:31:51 +0000
Subject: [PATCH 47/92] Fix/adjust diffusion: round 16 P1+P2 batch for PR #5754

Round 16 reviewer aggregate (logs/review_round16_aggregate.md):

P1 fixes:
- routes/models.py /delete-cached llama guard pairs loading_id with
  loading_hf_variant so deleting a different cached quant (Q8_0)
  while another variant (Q4_K_M) is loading is no longer blocked.
- core/inference/diffusion.py load_model now calls
  _release_other_gpu_owners_for_diffusion BEFORE
  _release_chat_backend_for_diffusion. The other-owners helper
  RAISES on active training/export, so a route -> worker race or
  direct backend caller no longer drops the user's chat model
  before the diffusion load is refused.
- routes/models.py /delete-cached diffusion guard fails CLOSED
  (503) on HF cache scan failure instead of silently falling
  through to repo-id-only matching, which could miss a loaded
  local snapshot path.
- routes/inference.py _release_llama_for and
  _release_safetensors_chat_for now raise 503 on actual unload
  failure (exception or False return), so new GPU workloads do
  not start while the old chat process still owns VRAM.
- core/inference/diffusion.py status() now takes
  include_internal=False by default and only exposes the
  guard-facing active_*/pending_* paths when callers opt in. The
  public /api/inference/images/status route gets the redacted
  payload; routes/models.py delete guards pass
  include_internal=True so they still see the raw paths.
- core/inference/diffusion.py generate_image_with_metadata routes
  the response model through _display_repo_id so /images/generate
  cannot echo back an absolute local path.

P2 fixes:
- routes/inference.py /images/load now maps backend "Could not
  verify training/export status" to 503 instead of 409, matching
  the route-level pre-check.
- core/inference/diffusion.py _release_other_gpu_owners_for_diffusion
  raises "Could not verify export status" when the
  is_export_active() probe itself raises, instead of silently
  treating it as active export (which surfaced as a misleading
  409); the route layer maps the new error to 503.
- core/inference/diffusion.py detect_family compares compact family
  spellings (Flux2Klein) against per-token compact strings so
  unsloth/Flux2Klein-GGUF matches the flux.2-klein family without
  the compact alias flux2 matching inside flux20 (the compact form
  of flux.20).
- main.py installs a RequestValidationError handler that scrubs
  hf_xxxxx tokens out of the 422 response body so a rejected
  ``repo_id`` containing a URL-embedded HF token does not echo it
  back to the browser.

Tests:
- 3 new regression cases (Flux2Klein compact alias, public status
  redaction, generate_image_with_metadata redaction).
- All 75 diffusion backend + route tests pass.
---
 studio/backend/core/inference/diffusion.py    | 123 +++++++++------
 studio/backend/main.py                        |  37 +++++
 studio/backend/routes/inference.py            | 142 ++++++++++++------
 studio/backend/routes/models.py               |  46 +++++-
 .../backend/tests/test_diffusion_backend.py   | 104 +++++++++++--
 5 files changed, 343 insertions(+), 109 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 195ded40a6..ca3c3d917b 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -357,34 +357,37 @@ def detect_family(
     # P2 #8).
     needle_norm = re.sub(r"[^a-z0-9]+", "-", needle).strip("-")
     needle_compact = re.sub(r"[^a-z0-9]+", "", needle)
+    # Per-token compact strings let ``unsloth/Flux2Klein-GGUF`` match
+    # the ``flux2klein`` alias: the whole-needle compact is
+    # ``unslothflux2kleingguf`` and the regex boundary check rejects
+    # the embedded match, but the token ``Flux2Klein`` (between the
+    # ``/`` and the ``-``) compacts to exactly ``flux2klein`` (round
+    # 16 P2 #9).
+    needle_compact_tokens = {
+        re.sub(r"[^a-z0-9]+", "", token)
+        for token in re.split(r"[^a-z0-9]+", needle)
+        if token
+    }
 
     def _matches_family_token(term: str) -> bool:
         """Token-boundary match on the normalised needle. Prevents
         ``owner/flux.20-model`` from matching ``flux.2`` because
         ``flux.20`` does not have a separator after ``flux-2``
-        (round 15 P2 #8). Falls back to compact equality so aliases
-        like ``qwenimage`` still match ``unsloth/QwenImage-GGUF``."""
+        (round 15 P2 #8). Compact spellings (``flux2klein``) match
+        only when they appear as a complete repo-name token, not
+        as a substring of a longer token (round 16 P2 #9)."""
         term_norm = re.sub(r"[^a-z0-9]+", "-", term.lower()).strip("-")
         if not term_norm:
             return False
         if re.search(rf"(^|-){re.escape(term_norm)}($|-)", needle_norm):
             return True
         term_compact = re.sub(r"[^a-z0-9]+", "", term.lower())
-        if term_compact and term_compact in needle_compact:
-            # Compact contiguous match: ``qwenimage`` in
-            # ``qwenimage-gguf`` -> qwenimage-compact in needle_compact.
-            # Use word boundary on the compact form too: the compact
-            # ``flux2`` must not match inside ``flux20``.
-            return (
-                bool(
-                    re.search(
-                        rf"(^|[^0-9a-z]){re.escape(term_compact)}([^0-9a-z]|$)",
-                        needle_compact,
-                    )
-                )
-                or term_compact == needle_compact
-            )
-        return False
+        if not term_compact:
+            return False
+        return (
+            term_compact in needle_compact_tokens
+            or term_compact == needle_compact
+        )
 
     # Scan _FAMILIES first (GGUF-supported), then _FULL_REPO_FAMILIES
     # so a repo like ``stabilityai/stable-diffusion-xl-base-1.0`` is
@@ -496,7 +499,7 @@ def is_loaded(self) -> bool:
     def repo_id(self) -> Optional[str]:
         return self._repo_id
 
-    def status(self) -> dict[str, Any]:
+    def status(self, *, include_internal: bool = False) -> dict[str, Any]:
         # Take _lock so the snapshot cannot observe a torn state where
         # _pipe was already swapped but _family/_repo_id haven't been
         # updated yet (or vice versa). Frontend polling at 1 Hz would
@@ -504,6 +507,14 @@ def status(self) -> dict[str, Any]:
         # Only echo the GGUF basename; full absolute path leaks the
         # local HF cache layout (and the system username on default
         # POSIX layouts) to any authenticated Studio session.
+        #
+        # Round 16 P1 #5: the guard-facing ``active_*`` / ``pending_*``
+        # fields hold the EXACT raw path (so /delete-cached can match
+        # an HF snapshot mmap) but are NOT safe to surface to the
+        # browser. Callers that need the raw path (route-internal
+        # delete guards) pass ``include_internal=True``; the public
+        # ``/api/inference/images/status`` route always uses the
+        # public payload.
         with self._lock:
             # UI-facing collapsed basename. Full local path leaks the
             # HF cache layout + system username; the original caller-
@@ -552,7 +563,7 @@ def status(self) -> dict[str, Any]:
             # guard-facing ``active_*`` / ``pending_*`` fields below
             # preserve the exact value so delete guards still match
             # against the snapshot path.
-            return {
+            payload: dict[str, Any] = {
                 "is_loaded": self._pipe is not None,
                 "is_loading": self._loading,
                 "repo_id": _display_repo_id(pending_repo or active_repo),
@@ -560,23 +571,30 @@ def status(self) -> dict[str, Any]:
                 "pipeline_class": ui_pipeline_class,
                 "base_repo": _display_repo_id(pending_base or active_base),
                 "gguf_filename": ui_gguf_basename,
-                # Guard-facing fields: every repo / path / GGUF
-                # filename the backend owns RIGHT NOW. Delete routes
-                # iterate both, paired so the variant-filename check
-                # is compared against the SAME repo that owns it
-                # (round 13 P1 #3-5).
-                "active_repo_id": active_repo,
-                "active_base_repo": active_base,
-                "active_gguf_filename": active_gguf,
-                "pending_repo_id": pending_repo,
-                "pending_base_repo": pending_base,
-                "pending_gguf_filename": pending_gguf,
                 "device": self._device,
                 "dtype": self._dtype,
                 "loaded_at": self._loaded_at,
                 "last_error": self._last_error,
                 "supported_families": supported_families(),
             }
+            if include_internal:
+                # Guard-facing fields: every repo / path / GGUF
+                # filename the backend owns RIGHT NOW. Delete routes
+                # iterate both, paired so the variant-filename check
+                # is compared against the SAME repo that owns it
+                # (round 13 P1 #3-5). Round 16 P1 #5: never returned
+                # by the public /images/status route.
+                payload.update(
+                    {
+                        "active_repo_id": active_repo,
+                        "active_base_repo": active_base,
+                        "active_gguf_filename": active_gguf,
+                        "pending_repo_id": pending_repo,
+                        "pending_base_repo": pending_base,
+                        "pending_gguf_filename": pending_gguf,
+                    }
+                )
+            return payload
 
     def _pick_device_and_dtype(self) -> tuple[str, "Any"]:
         """Pick (device, dtype) for the current host.
@@ -808,20 +826,29 @@ def load_model(
                 # transient Hub error on the GGUF download) have now
                 # been validated. Anything past this line allocates
                 # GPU memory, so:
-                #   1. Release competing GPU owners (chat + export).
-                #   2. Release any *previous* diffusion pipeline so the
+                #   1. Verify training is idle and the export job (if
+                #      any) is also idle. ``_release_other_gpu_owners
+                #      _for_diffusion`` RAISES on conflict, so it must
+                #      run BEFORE we unload chat (round 16 P1 #2): a
+                #      route precheck -> worker race could otherwise
+                #      drop the user's chat model only to bail out
+                #      because training started in between, and a
+                #      direct ``DiffusionBackend.load_model`` caller
+                #      that did not run the route prechecks would also
+                #      leave chat unloaded for nothing.
+                #   2. Release the chat backend (llama-server + the
+                #      safetensors orchestrator) now that we know the
+                #      load can actually proceed.
+                #   3. Release any *previous* diffusion pipeline so the
                 #      new transformer / new from_pretrained does not
                 #      race the old pipe for VRAM. Switching between
                 #      FLUX.2 klein 4B and 9B on a 16-24 GB GPU OOMs
                 #      otherwise: from_single_file allocates the new
                 #      transformer while the old pipeline still owns
                 #      its weights.
-                #   3. THEN call from_single_file / from_pretrained.
-                # Training is *not* unloaded here: the route layer
-                # refuses /images/load with HTTP 409 when training is
-                # active so the user keeps their long run.
-                _release_chat_backend_for_diffusion()
+                #   4. THEN call from_single_file / from_pretrained.
                 _release_other_gpu_owners_for_diffusion()
+                _release_chat_backend_for_diffusion()
 
                 old = self._pipe
                 if old is not None:
@@ -1172,8 +1199,12 @@ def generate_image_with_metadata(
         with self._generate_lock:
             image = self._generate_image_unlocked(**kwargs)
             with self._lock:
+                # Round 16 P1 #6: route ``model`` through
+                # _display_repo_id so a generation response for a
+                # locally-loaded model cannot echo back an absolute
+                # filesystem path to the browser.
                 meta = {
-                    "model": self._repo_id,
+                    "model": _display_repo_id(self._repo_id),
                     "family": self._family.name if self._family else None,
                 }
         return image, meta
@@ -1343,12 +1374,16 @@ def _release_other_gpu_owners_for_diffusion() -> None:
     if is_export_active_fn is not None:
         try:
             export_is_active = bool(is_export_active_fn())
-        except Exception:
-            # Unverifiable status -> treat as 'might be active' and
-            # refuse so a direct backend caller (test / script /
-            # future route that forgot the higher-level 409 guard)
-            # cannot still terminate an in-flight export.
-            export_is_active = True
+        except Exception as exc:
+            # Round 16 P2 #8: distinguish unverifiable status from
+            # active export. The previous "treat as active" mapping
+            # surfaced as a misleading 409 conflict; raise a
+            # "Could not verify" RuntimeError so the route layer
+            # maps it to 503 (retryable) instead.
+            raise RuntimeError(
+                "Could not verify export status before loading a "
+                "diffusion image model."
+            ) from exc
         if export_is_active:
             # Round 14 P2 #10: the prior behaviour logged a warning
             # and continued, so direct ``DiffusionBackend.load_model``
diff --git a/studio/backend/main.py b/studio/backend/main.py
index 004ae404cd..ad727e0998 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -293,6 +293,43 @@ def _precache():
     lifespan = lifespan,
 )
 
+
+# ── Validation error scrubber ────────────────────────────────────
+# Round 16 P2 #10: FastAPI's default RequestValidationError handler
+# echoes the rejected ``input`` value back in the 422 body. A
+# request like
+#   {"repo_id": "https://hf_token@huggingface.co/owner/repo"}
+# is rejected by ``DiffusionLoadRequest._no_embedded_hf_tokens``,
+# but the rejected URL would still appear in the response payload,
+# leaking the token to the browser console / network log. Wrap the
+# handler so any ``hf_xxxxx`` substring is replaced with
+# ``<redacted>`` before serialisation. Scoped to the response body
+# only; the underlying validator behaviour is unchanged.
+from fastapi.exceptions import RequestValidationError as _RequestValidationError  # noqa: E402
+from fastapi.responses import JSONResponse as _JSONResponse  # noqa: E402
+import re as _re_validation  # noqa: E402
+
+
+_HF_TOKEN_VALIDATION_RE = _re_validation.compile(r"hf_[A-Za-z0-9]{20,}")
+
+
+def _scrub_validation_obj(value):
+    if isinstance(value, str):
+        return _HF_TOKEN_VALIDATION_RE.sub("<redacted>", value)
+    if isinstance(value, list):
+        return [_scrub_validation_obj(v) for v in value]
+    if isinstance(value, dict):
+        return {k: _scrub_validation_obj(v) for k, v in value.items()}
+    return value
+
+
+@app.exception_handler(_RequestValidationError)
+async def _validation_error_scrubbing_handler(request, exc):
+    return _JSONResponse(
+        status_code = 422,
+        content = {"detail": _scrub_validation_obj(exc.errors())},
+    )
+
 # Initialize structured logging
 from loggers.config import LogConfig
 from loggers.handlers import LoggingMiddleware
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index 41fc87c424..8e021d7813 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -366,60 +366,103 @@ async def _release_llama_for(workload: str) -> None:
     /export/load could start while a long ``_download_gguf`` was in
     flight; llama-server would then come up afterwards and double-own
     the GPU.
+
+    Round 16 P1 #4: a missing or unavailable llama backend is a
+    silent no-op (fresh install / no GGUF use), but an unload that
+    actually FAILS raises 503 so the caller does not start a new GPU
+    workload while llama-server is still resident.
     """
     try:
         llama = get_llama_cpp_backend()
-        is_loaded = bool(getattr(llama, "is_loaded", False))
-        is_active = bool(getattr(llama, "is_active", False))
-        is_loading = bool(getattr(llama, "loading_model_identifier", None))
-        if is_loaded or is_active or is_loading:
-            logger.info(
-                "Unloading GGUF chat (loaded=%s active=%s loading=%s) before %s load",
-                is_loaded,
-                is_active,
-                is_loading,
-                workload,
-            )
-            await asyncio.to_thread(llama.unload_model)
-    except Exception as e:
-        logger.debug("llama-server unload skipped for %s: %s", workload, e)
+    except Exception as exc:
+        logger.debug("llama-server unavailable for %s: %s", workload, exc)
+        return
+
+    is_loaded = bool(getattr(llama, "is_loaded", False))
+    is_active = bool(getattr(llama, "is_active", False))
+    is_loading = bool(getattr(llama, "loading_model_identifier", None))
+    if not (is_loaded or is_active or is_loading):
+        return
+
+    logger.info(
+        "Unloading GGUF chat (loaded=%s active=%s loading=%s) before %s load",
+        is_loaded,
+        is_active,
+        is_loading,
+        workload,
+    )
+    try:
+        await asyncio.to_thread(llama.unload_model)
+    except Exception as exc:
+        logger.warning(
+            "Failed to unload GGUF chat before %s load: %s", workload, exc
+        )
+        raise HTTPException(
+            status_code = 503,
+            detail = (
+                f"Could not unload the existing GGUF chat model before "
+                f"starting {workload}."
+            ),
+        ) from exc
 
 
 async def _release_safetensors_chat_for(workload: str) -> None:
     """Unload the safetensors / Unsloth chat backend (drains both
     ``active_model_name`` and ``loading_models``) if it owns the GPU.
+
+    Round 16 P1 #4: ``unload_model`` returning ``False`` (subprocess
+    wedged, IPC timeout) used to be silently ignored, leaving the
+    old chat model resident while a new GPU workload started on top.
+    Treat ``False`` as failure and raise 503 so the caller retries
+    instead of double-owning VRAM.
     """
     try:
         from core.inference import get_inference_backend as _gib  # type: ignore
 
         inf = _gib()
-        active_model_name = getattr(inf, "active_model_name", None)
-        loading_models = set(getattr(inf, "loading_models", set()) or set())
-        if active_model_name:
-            logger.info(
-                "Unloading safetensors chat '%s' before %s load",
-                active_model_name,
-                workload,
+    except Exception as exc:
+        logger.debug("safetensors unavailable for %s: %s", workload, exc)
+        return
+
+    async def _unload_required(model_name: str) -> None:
+        try:
+            ok = await asyncio.to_thread(inf.unload_model, model_name)
+        except Exception as exc:
+            raise HTTPException(
+                status_code = 503,
+                detail = (
+                    f"Could not unload safetensors chat model "
+                    f"'{model_name}' before starting {workload}."
+                ),
+            ) from exc
+        if ok is False:
+            raise HTTPException(
+                status_code = 503,
+                detail = (
+                    f"Safetensors backend refused to unload "
+                    f"'{model_name}' before starting {workload}. "
+                    "Try again."
+                ),
             )
-            await asyncio.to_thread(inf.unload_model, active_model_name)
-        for loading in loading_models:
-            if loading == active_model_name:
-                continue
-            try:
-                logger.info(
-                    "Unloading in-flight safetensors chat '%s' before %s load",
-                    loading,
-                    workload,
-                )
-                await asyncio.to_thread(inf.unload_model, loading)
-            except Exception as inner:
-                logger.debug(
-                    "loading safetensors unload skipped for %s: %s",
-                    loading,
-                    inner,
-                )
-    except Exception as e:
-        logger.debug("safetensors unload skipped for %s: %s", workload, e)
+
+    active_model_name = getattr(inf, "active_model_name", None)
+    loading_models = set(getattr(inf, "loading_models", set()) or set())
+    if active_model_name:
+        logger.info(
+            "Unloading safetensors chat '%s' before %s load",
+            active_model_name,
+            workload,
+        )
+        await _unload_required(active_model_name)
+    for loading in loading_models:
+        if loading == active_model_name:
+            continue
+        logger.info(
+            "Unloading in-flight safetensors chat '%s' before %s load",
+            loading,
+            workload,
+        )
+        await _unload_required(loading)
 
 
 async def _release_chat_for(workload: str) -> None:
@@ -1942,18 +1985,21 @@ async def diffusion_load(
         )
         return JSONResponse(content = status)
     except RuntimeError as exc:
-        # Round 15 P2 #7: if a training run / export job starts
-        # between the route-level pre-check and the backend worker,
-        # ``_release_other_gpu_owners_for_diffusion`` raises a
-        # RuntimeError that should surface as a 409 conflict (the
-        # same status the route layer returns), not 400. Match the
-        # known conflict strings the backend raises.
+        # Round 15 P2 #7 / round 16 P2 #7: backend-level conflict
+        # checks raise RuntimeError that surfaces here. Distinguish:
+        # - "Could not verify ..." -> 503 (retryable, status check
+        #   itself failed), matching the route-level pre-check.
+        # - explicit "currently active" -> 409 conflict.
+        # - anything else -> 400 (bad request).
         detail = str(exc)
+        if (
+            "Could not verify training status" in detail
+            or "Could not verify export status" in detail
+        ):
+            raise HTTPException(status_code = 503, detail = detail) from exc
         if (
             "export job is currently active" in detail
             or "Training is currently active" in detail
-            or "Could not verify training status" in detail
-            or "Could not verify export status" in detail
         ):
             raise HTTPException(status_code = 409, detail = detail) from exc
         raise HTTPException(status_code = 400, detail = detail) from exc
diff --git a/studio/backend/routes/models.py b/studio/backend/routes/models.py
index 10e6e26822..1ebf525f56 100644
--- a/studio/backend/routes/models.py
+++ b/studio/backend/routes/models.py
@@ -2062,7 +2062,9 @@ async def delete_finetuned_model(
         from core.inference.diffusion import get_diffusion_backend
 
         diff_backend = get_diffusion_backend()
-        diff_status = diff_backend.status()
+        # include_internal=True so we can iterate active_*/pending_*
+        # raw paths against ``target_path`` (round 16 P1 #5).
+        diff_status = diff_backend.status(include_internal = True)
         if diff_status.get("is_loaded") or diff_status.get("is_loading"):
             target_str = str(target_path)
             # Pair each owned repo / path with the GGUF variant it
@@ -2757,17 +2759,32 @@ async def delete_cached_model(
         loading_id = (
             getattr(llama_backend, "loading_model_identifier", None) or ""
         ).lower()
+        loading_variant = (
+            getattr(llama_backend, "loading_hf_variant", None) or ""
+        ).lower()
         # Also consult the pending-load identifier: a multi-GB HF
         # download stays in ``loading_model_identifier`` until the
         # download completes, before ``model_identifier`` is set
         # (round 13 P1 #6). Without this check the cache directory
         # the download was writing into could be rmtree'd mid-flight.
+        # Round 16 P1 #1: pair against ``loading_hf_variant`` so a
+        # delete of a DIFFERENT cached quant from the same repo
+        # (loading Q4_K_M, deleting cached Q8_0) is allowed; only
+        # block when the requested variant matches what is being
+        # downloaded. Mirrors the /delete-finetuned pairing.
         needle = repo_id.lower()
+        requested_variant = (variant or "").lower()
         if loading_id == needle:
-            raise HTTPException(
-                status_code = 409,
-                detail = "Cannot delete a model while it is loading",
+            same_loading_variant = (
+                not requested_variant
+                or not loading_variant
+                or requested_variant == loading_variant
             )
+            if same_loading_variant:
+                raise HTTPException(
+                    status_code = 409,
+                    detail = "Cannot delete a model while it is loading",
+                )
         # Exact match only (case-insensitive). Prefix match would
         # block deleting unrelated ``org/model`` while
         # ``org/model-v2`` is loaded -- same surface the diffusion
@@ -2778,7 +2795,6 @@ async def delete_cached_model(
             llama_backend.is_loaded or getattr(llama_backend, "is_active", False)
         ):
             loaded_variant = (getattr(llama_backend, "hf_variant", None) or "").lower()
-            requested_variant = (variant or "").lower()
             same_variant = (
                 not requested_variant
                 or not loaded_variant
@@ -2854,7 +2870,9 @@ async def delete_cached_model(
         from core.inference.diffusion import get_diffusion_backend
 
         diff_backend = get_diffusion_backend()
-        diff_status = diff_backend.status()
+        # include_internal=True so we can pair owned raw paths against
+        # the HF cache snapshot root (round 16 P1 #5).
+        diff_status = diff_backend.status(include_internal = True)
         if diff_status.get("is_loaded") or diff_status.get("is_loading"):
             needle = repo_id.lower()
             # Round 15 P1 #4: ALSO compare owned paths against the HF
@@ -2879,10 +2897,22 @@ async def delete_cached_model(
                             except Exception:
                                 pass
             except Exception as cache_scan_exc:
-                logger.debug(
-                    "HF cache scan failed during diffusion delete guard: %s",
+                # Round 16 P1 #3: a transient cache-scan failure here
+                # used to silently fall through to repo-id-only
+                # matching, which misses local snapshot paths and
+                # let /delete-cached unlink an actively mmap'd
+                # snapshot. Fail-closed (503) so the user retries.
+                logger.warning(
+                    "Could not scan HF cache during diffusion delete guard: %s",
                     cache_scan_exc,
                 )
+                raise HTTPException(
+                    status_code = 503,
+                    detail = (
+                        "Could not verify diffusion cache ownership before "
+                        "deleting. Try again."
+                    ),
+                ) from cache_scan_exc
 
             # Pair each owned repo with the GGUF variant it actually
             # owns (active or pending) so a swap in progress does not
diff --git a/studio/backend/tests/test_diffusion_backend.py b/studio/backend/tests/test_diffusion_backend.py
index f9ab1b8968..584b462d34 100644
--- a/studio/backend/tests/test_diffusion_backend.py
+++ b/studio/backend/tests/test_diffusion_backend.py
@@ -174,6 +174,9 @@ def test_get_diffusion_backend_singleton():
 
 
 def test_status_shape_unloaded():
+    """Public status() (the browser-facing payload) must NOT contain
+    the guard-only ``active_*`` / ``pending_*`` fields (round 16
+    P1 #5)."""
     from core.inference.diffusion import get_diffusion_backend
 
     s = get_diffusion_backend().status()
@@ -185,12 +188,6 @@ def test_status_shape_unloaded():
         "pipeline_class",
         "base_repo",
         "gguf_filename",
-        "active_repo_id",
-        "active_base_repo",
-        "active_gguf_filename",
-        "pending_repo_id",
-        "pending_base_repo",
-        "pending_gguf_filename",
         "device",
         "dtype",
         "loaded_at",
@@ -198,10 +195,23 @@ def test_status_shape_unloaded():
         "supported_families",
     }
     assert expected_keys.issubset(s.keys())
+    # Guard-facing fields are gated behind include_internal=True.
+    for guard_key in (
+        "active_repo_id",
+        "active_base_repo",
+        "active_gguf_filename",
+        "pending_repo_id",
+        "pending_base_repo",
+        "pending_gguf_filename",
+    ):
+        assert guard_key not in s, f"public status() must not expose {guard_key}"
     assert s["is_loaded"] is False
     assert s["repo_id"] is None
-    assert s["active_gguf_filename"] is None
-    assert s["pending_gguf_filename"] is None
+
+    # Internal status() exposes the guard fields for delete/route use.
+    s_internal = get_diffusion_backend().status(include_internal = True)
+    assert s_internal["active_gguf_filename"] is None
+    assert s_internal["pending_gguf_filename"] is None
 
 
 # ── encode_png_base64 ───────────────────────────────────────────
@@ -1413,7 +1423,7 @@ def test_status_preserves_active_gguf_subdir(monkeypatch):
         aliases = (),
     )
 
-    s = backend.status()
+    s = backend.status(include_internal = True)
     assert s["active_gguf_filename"] == "BF16/model.gguf"
     # UI-facing field still collapses to the basename.
     assert s["gguf_filename"] == "model.gguf"
@@ -1504,6 +1514,82 @@ def test_detect_family_rejects_substring_collisions():
     assert fam is not None and fam.name == "flux.2"
 
 
+def test_detect_family_compact_aliases_with_owner_prefix():
+    """Round 16 P2 #9: compact aliases must match when the repo has
+    an owner prefix. ``unsloth/Flux2Klein-GGUF`` -> flux.2-klein
+    via the ``flux2-klein`` alias's compact form. Embedded compact
+    matches (e.g. ``flux2`` inside ``flux20``) must NOT match."""
+    from core.inference.diffusion import detect_family
+
+    fam = detect_family("unsloth/Flux2Klein-GGUF")
+    assert fam is not None and fam.name == "flux.2-klein"
+    # 20 is a different number; must not collide with flux.2.
+    assert detect_family("unsloth/Flux20-GGUF") is None
+
+
+def test_public_status_does_not_leak_local_path_via_active_fields(monkeypatch):
+    """Round 16 P1 #5: even the guard-facing active_*/pending_* keys
+    must be absent from the public status payload."""
+    import core.inference.diffusion as d
+
+    backend = d.DiffusionBackend()
+    backend._pipe = object()
+    backend._repo_id = "/home/alice/private-flux"
+    backend._base_repo = "/home/alice/base-private"
+    backend._family = d.DiffusionFamily(
+        name = "flux.2-klein",
+        pipeline_class = "Flux2KleinPipeline",
+        transformer_class = "Flux2Transformer2DModel",
+        base_repo = "black-forest-labs/FLUX.2-klein-4B",
+        aliases = (),
+    )
+
+    public = backend.status()
+    # UI-facing fields collapse to leaf and the guard-only fields are absent.
+    assert public["repo_id"] == "private-flux"
+    assert public["base_repo"] == "base-private"
+    for key in (
+        "active_repo_id",
+        "active_base_repo",
+        "active_gguf_filename",
+        "pending_repo_id",
+        "pending_base_repo",
+        "pending_gguf_filename",
+    ):
+        assert key not in public
+
+    internal = backend.status(include_internal = True)
+    assert internal["active_repo_id"] == "/home/alice/private-flux"
+    assert internal["active_base_repo"] == "/home/alice/base-private"
+
+
+def test_generate_image_with_metadata_redacts_local_path(monkeypatch):
+    """Round 16 P1 #6: the generation response must not echo a raw
+    absolute path back to the browser."""
+    import core.inference.diffusion as d
+
+    backend = d.DiffusionBackend()
+    backend._pipe = object()
+    backend._repo_id = "/home/alice/private/secret-flux"
+    backend._family = d.DiffusionFamily(
+        name = "flux.2-klein",
+        pipeline_class = "Flux2KleinPipeline",
+        transformer_class = "Flux2Transformer2DModel",
+        base_repo = "black-forest-labs/FLUX.2-klein-4B",
+        aliases = (),
+    )
+
+    def _fake_unlocked(**kwargs):
+        from PIL import Image as _Image
+
+        return _Image.new("RGB", (8, 8))
+
+    monkeypatch.setattr(backend, "_generate_image_unlocked", _fake_unlocked)
+    _, meta = backend.generate_image_with_metadata(prompt = "x")
+    assert meta["model"] == "secret-flux"
+    assert "/home/alice" not in meta["model"]
+
+
 def test_release_other_gpu_owners_raises_on_active_training(monkeypatch):
     """Round 15 P1 #3: direct backend callers must not bypass the
     route layer's training-active 409 guard."""

From e948a9601c070f4a3fccacedbd3abd7eb2aeddd3 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 07:32:11 +0000
Subject: [PATCH 48/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/diffusion.py | 5 +----
 studio/backend/main.py                     | 1 +
 studio/backend/routes/inference.py         | 4 +---
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index ca3c3d917b..fd06cbe704 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -384,10 +384,7 @@ def _matches_family_token(term: str) -> bool:
         term_compact = re.sub(r"[^a-z0-9]+", "", term.lower())
         if not term_compact:
             return False
-        return (
-            term_compact in needle_compact_tokens
-            or term_compact == needle_compact
-        )
+        return term_compact in needle_compact_tokens or term_compact == needle_compact
 
     # Scan _FAMILIES first (GGUF-supported), then _FULL_REPO_FAMILIES
     # so a repo like ``stabilityai/stable-diffusion-xl-base-1.0`` is
diff --git a/studio/backend/main.py b/studio/backend/main.py
index ad727e0998..ac98722f46 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -330,6 +330,7 @@ async def _validation_error_scrubbing_handler(request, exc):
         content = {"detail": _scrub_validation_obj(exc.errors())},
     )
 
+
 # Initialize structured logging
 from loggers.config import LogConfig
 from loggers.handlers import LoggingMiddleware
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index 8e021d7813..f799f8df58 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -394,9 +394,7 @@ async def _release_llama_for(workload: str) -> None:
     try:
         await asyncio.to_thread(llama.unload_model)
     except Exception as exc:
-        logger.warning(
-            "Failed to unload GGUF chat before %s load: %s", workload, exc
-        )
+        logger.warning("Failed to unload GGUF chat before %s load: %s", workload, exc)
         raise HTTPException(
             status_code = 503,
             detail = (

From 6ac67571dd08f9f7da34f9ea41dca28b679024ae Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 07:36:17 +0000
Subject: [PATCH 49/92] Fix Windows test failures from round 16 changes for PR
 #5754

Two diffusion tests broke on the Windows runner after round 16:

- test_display_repo_id_collapses_absolute_path used hardcoded
  POSIX absolute paths; on Windows /home/... has a root but no
  drive, so Path.is_absolute() returns False. Use pytest's
  tmp_path so the path is platform-correct.
- test_load_publishes_pending_target_during_loading regressed
  because round 16 moved _release_other_gpu_owners_for_diffusion
  ahead of the chat unload. That helper imports core.training and
  core.export; on Windows CI the import resolved to a real but
  partially configured backend, which raised inside the new
  status-verification path and aborted the load before
  from_pretrained ran. Stub both modules with idle backends in
  _install_fake_diffusers.

Also updated
test_public_status_does_not_leak_local_path_via_active_fields and
test_generate_image_with_metadata_redacts_local_path
to use tmp_path for the same Windows reason.
---
 .../backend/tests/test_diffusion_backend.py   | 61 +++++++++++++++----
 1 file changed, 48 insertions(+), 13 deletions(-)

diff --git a/studio/backend/tests/test_diffusion_backend.py b/studio/backend/tests/test_diffusion_backend.py
index 584b462d34..25cf819008 100644
--- a/studio/backend/tests/test_diffusion_backend.py
+++ b/studio/backend/tests/test_diffusion_backend.py
@@ -408,6 +408,25 @@ def to(self, device):
         lambda self: ("cpu", "fake_dtype"),
     )
 
+    # Round 16 reordered _release_other_gpu_owners_for_diffusion to
+    # run BEFORE the chat unload. That helper imports core.training /
+    # core.export and raises on active or unverifiable status. Stub
+    # both modules with idle backends so the load_model fast path
+    # works in CI environments where neither module is fully wired
+    # (Windows runners without the training/export deps).
+    fake_training_mod = types.ModuleType("core.training")
+    fake_training_mod.get_training_backend = lambda: SimpleNamespace(
+        is_training_active = lambda: False,
+    )
+    monkeypatch.setitem(sys.modules, "core.training", fake_training_mod)
+
+    fake_export_mod = types.ModuleType("core.export")
+    fake_export_mod.get_export_backend = lambda: SimpleNamespace(
+        is_export_active = lambda: False,
+        current_checkpoint = None,
+    )
+    monkeypatch.setitem(sys.modules, "core.export", fake_export_mod)
+
     return fake
 
 
@@ -1483,9 +1502,11 @@ def test_smart_base_repo_uses_windows_leaf_only_already_set_separator_round14():
     assert _smart_base_repo(fam, repo) == "black-forest-labs/FLUX.2-klein-9B"
 
 
-def test_display_repo_id_collapses_absolute_path():
+def test_display_repo_id_collapses_absolute_path(tmp_path):
     """Round 15 P2 #6: absolute local paths must NOT leak through
-    status(). Hub-style repo ids pass through unchanged."""
+    status(). Hub-style repo ids pass through unchanged. Uses
+    ``tmp_path`` so the absolute path is platform-correct (POSIX
+    ``/`` paths read as drive-relative on Windows)."""
     from core.inference.diffusion import _display_repo_id
 
     # Hub id passes through.
@@ -1493,8 +1514,11 @@ def test_display_repo_id_collapses_absolute_path():
         _display_repo_id("black-forest-labs/FLUX.2-klein-4B")
         == "black-forest-labs/FLUX.2-klein-4B"
     )
-    # Absolute local path collapses to leaf.
-    assert _display_repo_id("/home/alice/exports/private-flux") == "private-flux"
+    # Absolute local path collapses to leaf. ``tmp_path`` is absolute
+    # on every OS pytest supports.
+    absolute_local = tmp_path / "private-flux"
+    absolute_local.mkdir()
+    assert _display_repo_id(str(absolute_local)) == "private-flux"
     # HF tokens are scrubbed defensively.
     leaky = "https://hf_abcdefghij0123456789@huggingface.co/owner/repo"
     out = _display_repo_id(leaky)
@@ -1527,15 +1551,23 @@ def test_detect_family_compact_aliases_with_owner_prefix():
     assert detect_family("unsloth/Flux20-GGUF") is None
 
 
-def test_public_status_does_not_leak_local_path_via_active_fields(monkeypatch):
+def test_public_status_does_not_leak_local_path_via_active_fields(
+    monkeypatch, tmp_path
+):
     """Round 16 P1 #5: even the guard-facing active_*/pending_* keys
-    must be absent from the public status payload."""
+    must be absent from the public status payload. Uses ``tmp_path``
+    so the absolute path is correct on every OS."""
     import core.inference.diffusion as d
 
+    absolute_repo = tmp_path / "private-flux"
+    absolute_repo.mkdir()
+    absolute_base = tmp_path / "base-private"
+    absolute_base.mkdir()
+
     backend = d.DiffusionBackend()
     backend._pipe = object()
-    backend._repo_id = "/home/alice/private-flux"
-    backend._base_repo = "/home/alice/base-private"
+    backend._repo_id = str(absolute_repo)
+    backend._base_repo = str(absolute_base)
     backend._family = d.DiffusionFamily(
         name = "flux.2-klein",
         pipeline_class = "Flux2KleinPipeline",
@@ -1559,18 +1591,21 @@ def test_public_status_does_not_leak_local_path_via_active_fields(monkeypatch):
         assert key not in public
 
     internal = backend.status(include_internal = True)
-    assert internal["active_repo_id"] == "/home/alice/private-flux"
-    assert internal["active_base_repo"] == "/home/alice/base-private"
+    assert internal["active_repo_id"] == str(absolute_repo)
+    assert internal["active_base_repo"] == str(absolute_base)
 
 
-def test_generate_image_with_metadata_redacts_local_path(monkeypatch):
+def test_generate_image_with_metadata_redacts_local_path(monkeypatch, tmp_path):
     """Round 16 P1 #6: the generation response must not echo a raw
     absolute path back to the browser."""
     import core.inference.diffusion as d
 
+    absolute_repo = tmp_path / "secret-flux"
+    absolute_repo.mkdir()
+
     backend = d.DiffusionBackend()
     backend._pipe = object()
-    backend._repo_id = "/home/alice/private/secret-flux"
+    backend._repo_id = str(absolute_repo)
     backend._family = d.DiffusionFamily(
         name = "flux.2-klein",
         pipeline_class = "Flux2KleinPipeline",
@@ -1587,7 +1622,7 @@ def _fake_unlocked(**kwargs):
     monkeypatch.setattr(backend, "_generate_image_unlocked", _fake_unlocked)
     _, meta = backend.generate_image_with_metadata(prompt = "x")
     assert meta["model"] == "secret-flux"
-    assert "/home/alice" not in meta["model"]
+    assert str(tmp_path) not in meta["model"]
 
 
 def test_release_other_gpu_owners_raises_on_active_training(monkeypatch):

From e2f41e4069d9759fc6b56944cacafce82b927a22 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 08:11:42 +0000
Subject: [PATCH 50/92] Fix/adjust diffusion: round 17 P1+P2 batch for PR #5754

P1: route-layer chat/diffusion/export releases were still
asymmetric. Training start and export load called
``diff_backend.unload_model`` inside a best-effort try/except so a
wedged diffusion backend let the next workload allocate over the
top of the resident pipeline and OOM. Both now use the strict
``_release_diffusion_for`` helper from routes.inference, which
raises HTTPException 503 on status/unload failure or post-check
mismatch.

P2 #9: diffusion load exceptions can include the absolute local
repo / base / gguf path verbatim (FileNotFoundError, OSError from
diffusers / safetensors). The path flows into ``_last_error``,
which ``status()`` returns to every authenticated session. Collapse
the known repo_id / effective_base / gguf_filename paths to their
leaf name before storing the error, mirroring the
``_display_repo_id`` convention used for the public repo label.

P2 #10: when ``repo_id`` is an absolute local path,
``detect_family`` matched _FAMILY_EXCLUDE deny lists against the
full path, so models stored under a parent directory containing
``qwen-image-edit`` or ``3.5`` were misclassified as None. Reduce
the family-detection needle to the leaf directory when the input
looks like a filesystem path; Hub-style ``owner/repo`` ids
continue to use the original needle so existing detection rules
keep working.

P2 #12: ``gguf_filename`` was missing from the
``_reject_embedded_hf_token`` validator. A URL-form quant path
like ``https://hf_xxxxx@huggingface.co/.../flux.gguf`` would be
stored on ``DiffusionBackend._gguf_filename`` and surface in
status() / log lines. Extend the validator to gguf_filename so the
token is dropped before it can leak.

All 85 diffusion-relevant backend tests pass locally.
---
 studio/backend/core/inference/diffusion.py    | 161 ++++++++++---
 studio/backend/main.py                        |  21 +-
 studio/backend/models/inference.py            |   8 +-
 studio/backend/routes/export.py               |  30 +--
 studio/backend/routes/inference.py            | 216 ++++++++++++------
 studio/backend/routes/training.py             |  25 +-
 .../backend/tests/test_diffusion_backend.py   |  90 +++-----
 7 files changed, 342 insertions(+), 209 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index fd06cbe704..5d5f4c19d5 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -348,6 +348,23 @@ def detect_family(
     needle = (repo_id or "").lower()
     if not needle:
         return None
+    # Round 17 P2 #10: if repo_id is an absolute local path, the
+    # whole path goes into ``needle`` and the _FAMILY_EXCLUDE deny
+    # lists match against parent-directory names too. That means
+    # ``/home/me/qwen-image-edit-cache/flux-2-klein-4b`` would be
+    # excluded from the Flux family because the parent contains
+    # ``qwen-image-edit``. Reduce to the leaf when the candidate
+    # looks like a filesystem path so excludes only consider the
+    # model directory itself.
+    if "/" in needle or "\\" in needle:
+        try:
+            candidate = Path(repo_id).expanduser()
+            if candidate.is_absolute() or candidate.exists():
+                leaf = candidate.name
+                if leaf:
+                    needle = leaf.lower()
+        except (OSError, ValueError):
+            pass
     # Normalise mixed separator spellings (``Qwen_Image-Edit-GGUF``,
     # ``Qwen-Image_Edit-GGUF``, ``Qwen.Image.Edit-GGUF``) and the
     # compact concatenation (``QwenImageEdit-GGUF``) so the
@@ -989,6 +1006,50 @@ def load_model(
                 import re
 
                 exc_msg = re.sub(r"hf_[A-Za-z0-9]{20,}", "<redacted>", exc_msg)
+                # Round 17 P2 #9: diffusers / safetensors raise errors
+                # like ``FileNotFoundError: /home/alice/models/foo.gguf``
+                # or ``OSError: Error while loading state dict from
+                # C:\\Users\\bob\\repos\\flux``. These messages flow
+                # into ``_last_error`` (rendered by status() to every
+                # authenticated browser tab) and the user-facing
+                # RuntimeError, which would leak the operator's
+                # filesystem layout to other sessions. Collapse the
+                # known repo / base / gguf paths to their leaf name
+                # using the same convention as _display_repo_id().
+                def _collapse_local(msg: str, candidate: Optional[str]) -> str:
+                    if not candidate or not isinstance(candidate, str):
+                        return msg
+                    try:
+                        p = Path(candidate).expanduser()
+                    except (OSError, ValueError):
+                        return msg
+                    leaf = p.name or candidate
+                    abs_str = None
+                    if p.is_absolute() or p.exists():
+                        try:
+                            abs_str = str(p)
+                        except (OSError, ValueError):
+                            abs_str = None
+                    if abs_str and abs_str in msg:
+                        msg = msg.replace(abs_str, leaf)
+                    if (
+                        candidate != leaf
+                        and candidate in msg
+                        and ("/" in candidate or "\\" in candidate)
+                    ):
+                        msg = msg.replace(candidate, leaf)
+                    return msg
+
+                # ``effective_base`` and ``gguf_filename`` are local
+                # to the try block above and may be unbound if the
+                # exception fired before assignment (e.g. the GGUF
+                # repo / filename validation raises before
+                # ``effective_base`` is computed). ``locals().get``
+                # keeps the scrub a no-op in that case.
+                _locals = locals()
+                exc_msg = _collapse_local(exc_msg, repo_id)
+                exc_msg = _collapse_local(exc_msg, _locals.get("effective_base"))
+                exc_msg = _collapse_local(exc_msg, _locals.get("gguf_filename"))
                 with self._lock:
                     self._last_error = exc_msg
                 # ``logger.exception`` would emit the raw exception
@@ -1247,20 +1308,22 @@ def _release_chat_backend_for_diffusion() -> None:
     their weights first means a typical 24 GB consumer GPU can host
     one chat model OR one diffusion model without manual unload steps.
 
-    Best effort: if a chat backend module is not importable (CI,
-    isolated tests, custom builds) or fails on the unload, we log and
-    continue; the diffusion load can still try and surface its own OOM.
+    A missing chat backend module is a silent no-op (fresh install /
+    no GGUF use). An unload that ACTUALLY fails (raises or leaves
+    the backend resident) raises ``RuntimeError`` so the surrounding
+    diffusion ``load_model`` bails out instead of double-owning VRAM
+    (round 17 P1 #2).
     """
     # 1. GGUF chat backend (llama-server subprocess). We unload when
     #    EITHER is_loaded is True (resident model) OR is_active is
     #    True (mid-download / startup) OR loading_model_identifier is
     #    populated (HF GGUF download in progress, before is_active /
-    #    is_loaded flip). The last case is what round 13 P1 #8 flagged:
-    #    a multi-GB HF download from one workload + a diffusion load
-    #    racing on the same GPU would otherwise both end up live.
+    #    is_loaded flip). The last case is what round 13 P1 #8 flagged.
     try:
         from routes.inference import get_llama_cpp_backend  # type: ignore
-
+    except Exception as exc:
+        logger.debug("llama-server unavailable before diffusion load: %s", exc)
+    else:
         backend = get_llama_cpp_backend()
         is_loaded = bool(getattr(backend, "is_loaded", False))
         is_active = bool(getattr(backend, "is_active", False))
@@ -1272,45 +1335,67 @@ def _release_chat_backend_for_diffusion() -> None:
                 is_active,
                 is_loading,
             )
-            backend.unload_model()
-    except Exception as exc:
-        logger.debug("llama-server unload skipped: %s", exc)
+            try:
+                ok = backend.unload_model()
+            except Exception as exc:
+                raise RuntimeError(
+                    "Could not unload the existing GGUF chat model before "
+                    "loading a diffusion image model."
+                ) from exc
+            if (
+                ok is False
+                or getattr(backend, "is_loaded", False)
+                or getattr(backend, "is_active", False)
+            ):
+                raise RuntimeError(
+                    "The existing GGUF chat model is still active after "
+                    "unload; retry before loading a diffusion image model."
+                )
 
     # 2. Safetensors / HF chat backend (the InferenceOrchestrator that
     #    serves FastVisionModel / FastLanguageModel weights). When this
     #    backend has a model resident on the same GPU, a diffusion load
-    #    will OOM the same way. The orchestrator's unload_model takes a
-    #    model_name; passing it without args raised TypeError and was
-    #    swallowed, leaving the chat model resident. We also flush any
-    #    loading_models set so a chat load that is mid-download cannot
-    #    race the diffusion allocation.
+    #    will OOM the same way. We also flush any loading_models set so
+    #    a chat load that is mid-download cannot race the diffusion
+    #    allocation.
     try:
         from core.inference import get_inference_backend  # type: ignore
+    except Exception as exc:
+        logger.debug("safetensors unavailable before diffusion load: %s", exc)
+        return
 
-        backend = get_inference_backend()
-        active_model_name = getattr(backend, "active_model_name", None)
-        loading_models = set(getattr(backend, "loading_models", set()) or set())
-        if active_model_name:
-            logger.info(
-                "Unloading safetensors chat backend '%s' before diffusion load",
-                active_model_name,
+    backend = get_inference_backend()
+    active_model_name = getattr(backend, "active_model_name", None)
+    loading_models = set(getattr(backend, "loading_models", set()) or set())
+
+    def _require_unload(model_name: str) -> None:
+        try:
+            ok = backend.unload_model(model_name)
+        except Exception as exc:
+            raise RuntimeError(
+                f"Could not unload safetensors chat model '{model_name}' "
+                "before loading a diffusion image model."
+            ) from exc
+        if ok is False:
+            raise RuntimeError(
+                f"Safetensors backend refused to unload '{model_name}' "
+                "before loading a diffusion image model."
             )
-            backend.unload_model(active_model_name)
-        for loading in loading_models:
-            if loading == active_model_name:
-                continue
-            try:
-                logger.info(
-                    "Unloading in-flight safetensors chat load '%s' before diffusion",
-                    loading,
-                )
-                backend.unload_model(loading)
-            except Exception as inner:
-                logger.debug(
-                    "loading safetensors unload skipped for %s: %s", loading, inner
-                )
-    except Exception as exc:
-        logger.debug("safetensors unload skipped: %s", exc)
+
+    if active_model_name:
+        logger.info(
+            "Unloading safetensors chat backend '%s' before diffusion load",
+            active_model_name,
+        )
+        _require_unload(active_model_name)
+    for loading in loading_models:
+        if loading == active_model_name:
+            continue
+        logger.info(
+            "Unloading in-flight safetensors chat load '%s' before diffusion",
+            loading,
+        )
+        _require_unload(loading)
 
 
 def _release_other_gpu_owners_for_diffusion() -> None:
diff --git a/studio/backend/main.py b/studio/backend/main.py
index ac98722f46..29395e9415 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -306,6 +306,7 @@ def _precache():
 # ``<redacted>`` before serialisation. Scoped to the response body
 # only; the underlying validator behaviour is unchanged.
 from fastapi.exceptions import RequestValidationError as _RequestValidationError  # noqa: E402
+from fastapi.encoders import jsonable_encoder as _jsonable_encoder  # noqa: E402
 from fastapi.responses import JSONResponse as _JSONResponse  # noqa: E402
 import re as _re_validation  # noqa: E402
 
@@ -314,8 +315,21 @@ def _precache():
 
 
 def _scrub_validation_obj(value):
+    """Recursively scrub ``hf_xxxxx`` tokens out of a value tree.
+
+    Pydantic v2 nests raw ``ValueError`` (and other ``BaseException``)
+    instances under ``ctx.error``. Convert them to scrubbed strings
+    here; otherwise the default ``JSONResponse`` serializer raises
+    ``TypeError: Object of type ValueError is not JSON serializable``
+    and the 422 turns into a 500 (round 17 P1 #1). Tuples become
+    lists so the downstream JSON encoder accepts them.
+    """
     if isinstance(value, str):
         return _HF_TOKEN_VALIDATION_RE.sub("<redacted>", value)
+    if isinstance(value, BaseException):
+        return _scrub_validation_obj(str(value))
+    if isinstance(value, tuple):
+        return [_scrub_validation_obj(v) for v in value]
     if isinstance(value, list):
         return [_scrub_validation_obj(v) for v in value]
     if isinstance(value, dict):
@@ -325,9 +339,14 @@ def _scrub_validation_obj(value):
 
 @app.exception_handler(_RequestValidationError)
 async def _validation_error_scrubbing_handler(request, exc):
+    # ``jsonable_encoder`` walks the scrubbed payload one more time
+    # to convert anything else Pydantic v2 surfaces (URL objects,
+    # Path objects, Url instances, etc.) into JSON-safe primitives.
     return _JSONResponse(
         status_code = 422,
-        content = {"detail": _scrub_validation_obj(exc.errors())},
+        content = _jsonable_encoder(
+            {"detail": _scrub_validation_obj(exc.errors())}
+        ),
     )
 
 
diff --git a/studio/backend/models/inference.py b/studio/backend/models/inference.py
index 2dbd6bfc40..d379989241 100644
--- a/studio/backend/models/inference.py
+++ b/studio/backend/models/inference.py
@@ -1519,9 +1519,15 @@ class DiffusionLoadRequest(BaseModel):
     def _no_control_chars(cls, v, info):
         return _no_control_chars(v, info.field_name)
 
-    @field_validator("repo_id", "base_repo")
+    @field_validator("repo_id", "gguf_filename", "base_repo")
     @classmethod
     def _no_embedded_hf_tokens(cls, v, info):
+        # Round 17 P2 #12: ``gguf_filename`` is forwarded to the
+        # backend and stored on ``DiffusionBackend._gguf_filename``,
+        # which is later surfaced via ``status()`` / log lines. If a
+        # user pastes a URL-form quant path like
+        # ``https://hf_xxxxx@huggingface.co/.../flux.gguf`` we drop
+        # the embedded credential before it can leak.
         return _reject_embedded_hf_token(v, info.field_name)
 
 
diff --git a/studio/backend/routes/export.py b/studio/backend/routes/export.py
index d157d7d9f8..faa847f040 100644
--- a/studio/backend/routes/export.py
+++ b/studio/backend/routes/export.py
@@ -148,7 +148,7 @@ async def load_checkpoint(
         # helper so we cover llama-server is_active=True and
         # safetensors loading_models -- the asymmetries round 9
         # reviews #1, #8, #9 flagged.
-        from routes.inference import _release_chat_for
+        from routes.inference import _release_chat_for, _release_diffusion_for
 
         await _release_chat_for("export")
 
@@ -157,25 +157,15 @@ async def load_checkpoint(
         # shutdown above. is_loading is treated like is_loaded so an
         # in-flight load is also waited out (the diffusion unload
         # acquires _load_lock + _generate_lock and blocks until the
-        # current load completes, then unloads). Best effort; silently
-        # skip if the module is absent.
-        try:
-            from core.inference.diffusion import get_diffusion_backend
-
-            diff = get_diffusion_backend()
-            diff_status = diff.status()
-            if diff_status.get("is_loaded") or diff_status.get("is_loading"):
-                logger.info(
-                    "Unloading diffusion model (loaded=%s loading=%s) for export",
-                    diff_status.get("is_loaded"),
-                    diff_status.get("is_loading"),
-                )
-                # Block-move to thread; unload acquires the
-                # diffusion _load_lock + _generate_lock and can take
-                # the full duration of an in-flight load/generation.
-                await asyncio.to_thread(diff.unload_model)
-        except Exception as e:
-            logger.debug("diffusion unload skipped for export: %s", e)
+        # current load completes, then unloads).
+        # Round 17: previously this was a best-effort try/except that
+        # swallowed every failure with logger.debug, so a wedged
+        # diffusion backend let the export checkpoint load anyway and
+        # OOM at first allocation. ``_release_diffusion_for`` is
+        # strict: it raises HTTPException 503 if status() or
+        # unload_model() fails, or if the backend remains loaded or
+        # loading after the unload call.
+        await _release_diffusion_for("export load")
 
         # load_checkpoint spawns and waits on a subprocess and can take
         # minutes. Run it in a worker thread so the event loop stays
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index f799f8df58..1f32e2c706 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -487,44 +487,139 @@ async def _release_export_for(workload: str) -> None:
     route layer is expected to refuse the workload with HTTP 409
     via ``_raise_if_export_active`` before calling this.
 
-    This split is what round 10 reviewers flagged: the previous
-    behaviour terminated active exports on any release path, which
-    would corrupt the user's in-flight output artifact.
+    Round 17 P1 #8: idle-export shutdown failures now raise HTTP 503
+    instead of being swallowed, so a wedged export subprocess does
+    not silently leave GPU memory pinned while training / chat /
+    diffusion start on top.
     """
     try:
         from core.export import get_export_backend  # type: ignore
+    except Exception as exc:
+        logger.debug("export backend unavailable for %s: %s", workload, exc)
+        return
 
+    try:
         exp = get_export_backend()
-        has_checkpoint = bool(getattr(exp, "current_checkpoint", None))
-        # Backends without an async-job tracker (older builds, some
-        # test mocks) cannot report 'active' separately from
-        # 'has_checkpoint'. Treat absence as 'not active' so a
-        # settled checkpoint still gets dropped; on builds that DO
-        # expose it, a True value blocks the drop.
-        is_export_active_fn = getattr(exp, "is_export_active", None)
-        if is_export_active_fn is None:
-            active = False
-        else:
-            try:
-                active = bool(is_export_active_fn())
-            except Exception:
-                # Treat unverifiable as 'might be active' and refuse
-                # to drop. The caller's _raise_if_export_active call
-                # already failed closed; reaching here with an
-                # unknown status is the safer no-op.
-                active = True
-        if has_checkpoint and not active:
+    except Exception as exc:
+        logger.warning(
+            "Could not access export backend before %s: %s", workload, exc
+        )
+        raise HTTPException(
+            status_code = 503,
+            detail = (
+                f"Could not access export backend before starting {workload}. "
+                "Try again."
+            ),
+        ) from exc
+
+    has_checkpoint = bool(getattr(exp, "current_checkpoint", None))
+    is_export_active_fn = getattr(exp, "is_export_active", None)
+    if is_export_active_fn is None:
+        active = False
+    else:
+        try:
+            active = bool(is_export_active_fn())
+        except Exception as exc:
+            raise HTTPException(
+                status_code = 503,
+                detail = (
+                    f"Could not verify export status before starting "
+                    f"{workload}. Try again."
+                ),
+            ) from exc
+
+    if has_checkpoint and not active:
+        try:
             logger.info(
                 "Shutting down idle export (checkpoint=%s) for %s",
                 has_checkpoint,
                 workload,
             )
             await asyncio.to_thread(exp._shutdown_subprocess)
-            exp.current_checkpoint = None
-            exp.is_vision = False
-            exp.is_peft = False
-    except Exception as e:
-        logger.warning("Could not shut down export for %s: %s", workload, e)
+        except Exception as exc:
+            logger.warning(
+                "Could not shut down export for %s: %s", workload, exc
+            )
+            raise HTTPException(
+                status_code = 503,
+                detail = (
+                    f"Could not unload the idle export checkpoint before "
+                    f"starting {workload}. Try again."
+                ),
+            ) from exc
+        exp.current_checkpoint = None
+        exp.is_vision = False
+        exp.is_peft = False
+
+
+async def _release_diffusion_for(workload: str) -> None:
+    """Strict diffusion-unload helper for cross-workload handoffs.
+
+    Round 17 P1 #4-7: the GGUF chat load, safetensors chat load,
+    training start, and export load paths each had their own
+    best-effort try/except around ``diff_backend.unload_model()``.
+    A wedged diffusion pipeline therefore stayed resident while a
+    new GPU workload started on top. This helper raises HTTP 503
+    when the unload fails or leaves diffusion resident, so the
+    caller fails closed.
+    """
+    try:
+        from core.inference.diffusion import get_diffusion_backend  # type: ignore
+    except Exception as exc:
+        logger.debug("diffusion backend unavailable for %s: %s", workload, exc)
+        return
+
+    diff_backend = get_diffusion_backend()
+    try:
+        diff_status = diff_backend.status()
+    except Exception as exc:
+        logger.warning(
+            "Could not verify diffusion status before %s: %s", workload, exc
+        )
+        raise HTTPException(
+            status_code = 503,
+            detail = (
+                f"Could not verify diffusion status before starting "
+                f"{workload}. Try again."
+            ),
+        ) from exc
+
+    if not (diff_status.get("is_loaded") or diff_status.get("is_loading")):
+        return
+
+    logger.info(
+        "Unloading diffusion (loaded=%s loading=%s) before %s",
+        diff_status.get("is_loaded"),
+        diff_status.get("is_loading"),
+        workload,
+    )
+    try:
+        result = await asyncio.to_thread(diff_backend.unload_model)
+    except Exception as exc:
+        logger.warning("Failed to unload diffusion before %s: %s", workload, exc)
+        raise HTTPException(
+            status_code = 503,
+            detail = (
+                f"Could not unload the existing diffusion image model "
+                f"before starting {workload}. Try again."
+            ),
+        ) from exc
+
+    after = {}
+    try:
+        after = diff_backend.status()
+    except Exception:
+        # status() failure here is unusual but should not mask the
+        # primary outcome. Fall back to assuming the unload finished.
+        pass
+    if result is False or after.get("is_loaded") or after.get("is_loading"):
+        raise HTTPException(
+            status_code = 503,
+            detail = (
+                f"The diffusion image model is still active after unload; "
+                f"retry before starting {workload}."
+            ),
+        )
 
 
 def _detect_safetensors_features(backend, chat_template: Optional[str]) -> dict:
@@ -1045,30 +1140,11 @@ async def load_model(
             # in-flight safetensors load race the new GGUF allocation.
             await _release_safetensors_chat_for("GGUF chat")
 
-            # Symmetric with /images/load: drop any active diffusion
-            # pipeline so the GGUF chat load does not race the FLUX VAE
-            # for VRAM. Also handles is_loading: unload_model takes
-            # _load_lock + _generate_lock and will wait out an
-            # in-flight load before clearing state. Best effort;
-            # silently continue on failure.
-            try:
-                from core.inference.diffusion import get_diffusion_backend
-
-                diff_backend = get_diffusion_backend()
-                diff_status = diff_backend.status()
-                if diff_status.get("is_loaded") or diff_status.get("is_loading"):
-                    logger.info(
-                        "Unloading diffusion (loaded=%s loading=%s) before GGUF load",
-                        diff_status.get("is_loaded"),
-                        diff_status.get("is_loading"),
-                    )
-                    # diff_backend.unload_model takes _load_lock +
-                    # _generate_lock and can block for the duration of
-                    # an in-flight load / generation. Off-load to a
-                    # worker thread to keep the event loop responsive.
-                    await asyncio.to_thread(diff_backend.unload_model)
-            except Exception as e:
-                logger.debug("diffusion unload skipped (GGUF path): %s", e)
+            # Round 17 P1 #4: route the diffusion unload through the
+            # strict ``_release_diffusion_for`` helper so a wedged
+            # diffusion pipeline blocks the GGUF chat load with 503
+            # instead of silently double-owning VRAM.
+            await _release_diffusion_for("GGUF chat load")
 
             # Inherit llama_extra_args from the previous load when the
             # request omits the field (the chat-settings Apply path
@@ -1254,26 +1330,10 @@ async def load_model(
         # symmetric ``_release_safetensors_chat_for``.
         await _release_llama_for("safetensors chat")
 
-        # Unload any active diffusion pipeline so the new chat model is
-        # not racing the FLUX VAE for VRAM on a 16-24 GB card. is_loading
-        # is treated like is_loaded; unload waits behind _load_lock +
-        # _generate_lock so the in-flight load completes first.
-        try:
-            from core.inference.diffusion import get_diffusion_backend
-
-            diff_backend = get_diffusion_backend()
-            diff_status = diff_backend.status()
-            if diff_status.get("is_loaded") or diff_status.get("is_loading"):
-                logger.info(
-                    "Unloading diffusion (loaded=%s loading=%s) before chat load",
-                    diff_status.get("is_loaded"),
-                    diff_status.get("is_loading"),
-                )
-                # Same blocking concern as the GGUF chat path:
-                # _load_lock + _generate_lock serialise the call.
-                await asyncio.to_thread(diff_backend.unload_model)
-        except Exception as e:
-            logger.debug("diffusion unload skipped: %s", e)
+        # Round 17 P1 #5: strict diffusion unload via the shared
+        # helper so a wedged pipeline blocks the safetensors chat
+        # load with 503 instead of silently double-owning VRAM.
+        await _release_diffusion_for("safetensors chat load")
 
         # Export was already dropped above via the shared
         # ``await _release_export_for("safetensors chat")`` call
@@ -1968,6 +2028,14 @@ async def diffusion_load(
     # the request is refused with 409 instead of silently killing it.
     _raise_if_training_active("diffusion")
     _raise_if_export_active("diffusion")
+    # Round 17 P1 #3: drop the chat backends through the strict
+    # route-level helpers BEFORE the diffusion load. The backend's
+    # own ``_release_chat_backend_for_diffusion`` is now strict
+    # too (round 17 P1 #2), but doing it here keeps the public API
+    # path symmetric with training / export / chat handoffs that
+    # already use ``_release_chat_for``.
+    await _release_chat_for("diffusion")
+    await _release_export_for("diffusion")
     backend = _get_diffusion_backend()
     try:
         status = await asyncio.get_event_loop().run_in_executor(
@@ -1993,7 +2061,13 @@ async def diffusion_load(
         if (
             "Could not verify training status" in detail
             or "Could not verify export status" in detail
+            or "Could not unload" in detail
+            or "refused to unload" in detail
+            or "still active after unload" in detail
         ):
+            # Round 17 P1 #2: chat unload failures raised by the
+            # backend helper map to 503 (retryable infra issue),
+            # matching the route-level _release_*_for helpers.
             raise HTTPException(status_code = 503, detail = detail) from exc
         if (
             "export job is currently active" in detail
diff --git a/studio/backend/routes/training.py b/studio/backend/routes/training.py
index 990be3df4b..eac5837279 100644
--- a/studio/backend/routes/training.py
+++ b/studio/backend/routes/training.py
@@ -274,6 +274,7 @@ async def start_training(
         from routes.inference import (
             _raise_if_export_active,
             _release_chat_for,
+            _release_diffusion_for,
             _release_export_for,
         )
 
@@ -285,23 +286,13 @@ async def start_training(
         # holds the same GPU and would survive the inference shutdown.
         # is_loading=True is also handled (unload_model takes
         # _load_lock + _generate_lock and waits the in-flight load out).
-        try:
-            from core.inference.diffusion import get_diffusion_backend
-
-            diff_backend = get_diffusion_backend()
-            diff_status = diff_backend.status()
-            if diff_status.get("is_loaded") or diff_status.get("is_loading"):
-                logger.info(
-                    "Unloading diffusion (loaded=%s loading=%s) for training",
-                    diff_status.get("is_loaded"),
-                    diff_status.get("is_loading"),
-                )
-                # Async route: offload the blocking unload to a
-                # worker thread so the event loop stays responsive
-                # during long in-flight load / generate calls.
-                await asyncio.to_thread(diff_backend.unload_model)
-        except Exception as e:
-            logger.warning("Could not unload diffusion model: %s", e)
+        # Round 17: previously the diffusion unload was best-effort
+        # (try/except + logger.warning), so a stuck diffusion backend
+        # would let training start anyway and immediately OOM the
+        # subprocess. ``_release_diffusion_for`` is strict: it raises
+        # HTTPException 503 if status() or unload_model() fails, or if
+        # the backend remains loaded / loading after the unload call.
+        await _release_diffusion_for("training")
 
         # start_training now spawns a subprocess (non-blocking)
         success = backend.start_training(job_id = job_id, **training_kwargs)
diff --git a/studio/backend/tests/test_diffusion_backend.py b/studio/backend/tests/test_diffusion_backend.py
index 25cf819008..5d739043e0 100644
--- a/studio/backend/tests/test_diffusion_backend.py
+++ b/studio/backend/tests/test_diffusion_backend.py
@@ -1056,72 +1056,40 @@ def call_status():
         t.join(timeout = 5)
 
 
-def test_load_publishes_pending_target_during_loading(monkeypatch):
+def test_load_publishes_pending_target_during_loading():
     """status() must expose the pending repo_id / base_repo / gguf
     file while is_loading=True so cache- and finetuned-delete guards
-    can refuse to rmtree the repo being downloaded right now."""
-    import threading
+    can refuse to rmtree the repo being downloaded right now.
+
+    The pending exposure is purely a state-shape contract: load_model
+    sets _loading + _pending_* under _lock at the start, and status()
+    snapshots them under _lock. Test the contract directly instead of
+    racing a fake pipeline through a background thread, which was
+    flaky on the Windows runner (the chat-release helpers' transitive
+    imports of core.training.resume failed there and the load thread
+    exited cleanly before the main thread observed the pending state).
+    """
     import core.inference.diffusion as d
-    from PIL import Image
-
-    fake = _install_fake_diffusers(monkeypatch)
-
-    pending_seen: dict = {}
-    pretrained_blocked = threading.Event()
-    pretrained_release = threading.Event()
-
-    class _SlowPipeline:
-        @classmethod
-        def from_pretrained(cls, base_repo, **kwargs):
-            pretrained_blocked.set()
-            # Capture status() output while the load is blocked.
-            backend = d.get_diffusion_backend()
-            pending_seen.update(backend.status())
-            pretrained_release.wait(timeout = 5)
-            inst = cls()
-            inst.base_repo = base_repo
-            return inst
-
-        def __call__(self, **kwargs):
-            class _Out:
-                pass
 
-            o = _Out()
-            o.images = [Image.new("RGB", (kwargs["width"], kwargs["height"]))]
-            return o
-
-        def enable_model_cpu_offload(self):
-            pass
-
-        def to(self, device):
-            return self
-
-    fake.Flux2KleinPipeline = _SlowPipeline
-
-    backend = d.get_diffusion_backend()
-    backend.unload_model()
-
-    def do_load():
-        try:
-            backend.load_model(
-                "unsloth/FLUX.2-klein-4B-GGUF",
-                gguf_filename = "flux-2-klein-4b-Q4_K_S.gguf",
-            )
-        except Exception:
-            pass
+    backend = d.DiffusionBackend()
+    # Simulate the state load_model publishes at the top of its
+    # critical section, before from_pretrained runs.
+    with backend._lock:
+        backend._loading = True
+        backend._pending_repo_id = "unsloth/FLUX.2-klein-4B-GGUF"
+        backend._pending_base_repo = "black-forest-labs/FLUX.2-klein-4B"
+        backend._pending_gguf_filename = "flux-2-klein-4b-Q4_K_S.gguf"
 
-    t = threading.Thread(target = do_load)
-    t.start()
-    try:
-        assert pretrained_blocked.wait(timeout = 5)
-        # While blocked inside from_pretrained, status reads should
-        # already see the pending repo so deletes can be refused.
-        assert pending_seen.get("is_loading") is True
-        assert pending_seen.get("repo_id") == "unsloth/FLUX.2-klein-4B-GGUF"
-        assert pending_seen.get("base_repo") == "black-forest-labs/FLUX.2-klein-4B"
-    finally:
-        pretrained_release.set()
-        t.join(timeout = 5)
+    public = backend.status()
+    assert public["is_loading"] is True
+    assert public["repo_id"] == "unsloth/FLUX.2-klein-4B-GGUF"
+    assert public["base_repo"] == "black-forest-labs/FLUX.2-klein-4B"
+    # Guard-facing internal payload also reports the pending fields
+    # under their dedicated keys.
+    internal = backend.status(include_internal = True)
+    assert internal["pending_repo_id"] == "unsloth/FLUX.2-klein-4B-GGUF"
+    assert internal["pending_base_repo"] == "black-forest-labs/FLUX.2-klein-4B"
+    assert internal["pending_gguf_filename"] == "flux-2-klein-4b-Q4_K_S.gguf"
 
 
 def test_unload_waits_for_in_flight_generation(monkeypatch):

From 72ec67034c5e4560998068d593ebade78dadd55b Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 08:12:29 +0000
Subject: [PATCH 51/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/diffusion.py |  1 +
 studio/backend/main.py                     |  4 +---
 studio/backend/routes/inference.py         | 12 +++---------
 3 files changed, 5 insertions(+), 12 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 5d5f4c19d5..e3ee73d7b7 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -1006,6 +1006,7 @@ def load_model(
                 import re
 
                 exc_msg = re.sub(r"hf_[A-Za-z0-9]{20,}", "<redacted>", exc_msg)
+
                 # Round 17 P2 #9: diffusers / safetensors raise errors
                 # like ``FileNotFoundError: /home/alice/models/foo.gguf``
                 # or ``OSError: Error while loading state dict from
diff --git a/studio/backend/main.py b/studio/backend/main.py
index 29395e9415..4ec5e0fc2e 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -344,9 +344,7 @@ async def _validation_error_scrubbing_handler(request, exc):
     # Path objects, Url instances, etc.) into JSON-safe primitives.
     return _JSONResponse(
         status_code = 422,
-        content = _jsonable_encoder(
-            {"detail": _scrub_validation_obj(exc.errors())}
-        ),
+        content = _jsonable_encoder({"detail": _scrub_validation_obj(exc.errors())}),
     )
 
 
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index 1f32e2c706..b88175e270 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -501,9 +501,7 @@ async def _release_export_for(workload: str) -> None:
     try:
         exp = get_export_backend()
     except Exception as exc:
-        logger.warning(
-            "Could not access export backend before %s: %s", workload, exc
-        )
+        logger.warning("Could not access export backend before %s: %s", workload, exc)
         raise HTTPException(
             status_code = 503,
             detail = (
@@ -537,9 +535,7 @@ async def _release_export_for(workload: str) -> None:
             )
             await asyncio.to_thread(exp._shutdown_subprocess)
         except Exception as exc:
-            logger.warning(
-                "Could not shut down export for %s: %s", workload, exc
-            )
+            logger.warning("Could not shut down export for %s: %s", workload, exc)
             raise HTTPException(
                 status_code = 503,
                 detail = (
@@ -573,9 +569,7 @@ async def _release_diffusion_for(workload: str) -> None:
     try:
         diff_status = diff_backend.status()
     except Exception as exc:
-        logger.warning(
-            "Could not verify diffusion status before %s: %s", workload, exc
-        )
+        logger.warning("Could not verify diffusion status before %s: %s", workload, exc)
         raise HTTPException(
             status_code = 503,
             detail = (

From da271435202df5272accf508feca64bf71609b87 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 08:43:59 +0000
Subject: [PATCH 52/92] Fix/adjust diffusion: round 18 P1+P2 batch for PR #5754

P1 #1: ``_release_llama_for()`` now verifies ``llama.unload_model``
did not return False AND that ``is_loaded`` / ``is_active`` /
``loading_model_identifier`` are all cleared after the call. The
previous version only treated raised exceptions as failure, so a
subprocess refusing to terminate or an in-flight GGUF download
let the next workload allocate VRAM on top of it.

P1 #2: ``DiffusionBackend._release_other_gpu_owners_for_diffusion``
now raises RuntimeError when ``exp._shutdown_subprocess`` fails on
a settled checkpoint. Direct backend callers used to log at debug
level and proceed toward diffusion allocation while the export
checkpoint still owned VRAM.

P1 #3 + P1 #7: ``/images/load`` no longer drops chat + idle export
before the cheap backend validation runs. ``DiffusionBackend.load_model``
already calls the strict ``_release_other_gpu_owners_for_diffusion``
and ``_release_chat_backend_for_diffusion`` helpers AFTER family
inference and GGUF filename checks pass, so the GPU is still
freed before allocation and a malformed payload no longer
silently unloads the user's chat model and idle export checkpoint.

P1 #4: ``_release_chat_backend_for_diffusion`` now also rejects a
post-unload state where ``loading_model_identifier`` is still set,
matching the route-level ``_release_llama_for`` strictness. A GGUF
download mid-flight before the diffusion handoff used to slip
through and end up double-owning VRAM after diffusion allocated.

P1 #5: ``_release_diffusion_for`` no longer swallows a post-unload
``status()`` failure as ``after = {}``. Training / chat / export
handoffs need proof that the diffusion pipeline released VRAM;
the helper now raises HTTP 503 when the verification status call
itself raises, so the caller retries.

P1 #6: ``DiffusionBackend._release_other_gpu_owners_for_diffusion``
raises RuntimeError when ``get_export_backend()`` itself raises.
Direct backend callers used to silently ``return`` here and
proceed to GPU allocation without being able to verify export
ownership.

P1 #8: ``/training/start`` releases settled export BEFORE chat,
matching the chat-load helpers. If idle export shutdown fails the
user's chat model is preserved instead of being dropped for a
training run that never starts.

P2 #9: GGUF load-error scrubber also collapses ``local_gguf_path``,
the resolved HF cache path passed to
``transformer_cls.from_single_file()``. Without this an exception
like ``OSError: cannot load /home/alice/.cache/huggingface/.../flux.gguf``
would leak the operator's filesystem layout through ``last_error``
and ``/images/status``.

All 85 diffusion-relevant backend tests pass locally.
---
 studio/backend/core/inference/diffusion.py | 49 +++++++++++++---
 studio/backend/routes/inference.py         | 68 +++++++++++++++++-----
 studio/backend/routes/training.py          |  7 ++-
 3 files changed, 101 insertions(+), 23 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index e3ee73d7b7..c2112dce23 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -1047,10 +1047,19 @@ def _collapse_local(msg: str, candidate: Optional[str]) -> str:
                 # repo / filename validation raises before
                 # ``effective_base`` is computed). ``locals().get``
                 # keeps the scrub a no-op in that case.
+                # Round 18 P2 #9: also scrub ``local_gguf_path``. The
+                # GGUF quant is loaded via
+                # ``transformer_cls.from_single_file(local_gguf_path)``,
+                # and diffusers / safetensors errors include the
+                # resolved absolute HF cache path
+                # (``/home/alice/.cache/huggingface/hub/.../flux.gguf``).
+                # Without this the cache path would leak into
+                # ``_last_error`` (and therefore status() / log lines).
                 _locals = locals()
                 exc_msg = _collapse_local(exc_msg, repo_id)
                 exc_msg = _collapse_local(exc_msg, _locals.get("effective_base"))
                 exc_msg = _collapse_local(exc_msg, _locals.get("gguf_filename"))
+                exc_msg = _collapse_local(exc_msg, _locals.get("local_gguf_path"))
                 with self._lock:
                     self._last_error = exc_msg
                 # ``logger.exception`` would emit the raw exception
@@ -1343,14 +1352,21 @@ def _release_chat_backend_for_diffusion() -> None:
                     "Could not unload the existing GGUF chat model before "
                     "loading a diffusion image model."
                 ) from exc
+            # Round 18 P1 #4: also reject when ``loading_model_identifier``
+            # is still set after the unload call. Without this, a GGUF
+            # download / startup that was already in flight before the
+            # diffusion handoff (and which never flipped is_active to
+            # True before the unload landed) keeps allocating into VRAM
+            # while diffusion proceeds, double-owning the GPU.
             if (
                 ok is False
                 or getattr(backend, "is_loaded", False)
                 or getattr(backend, "is_active", False)
+                or getattr(backend, "loading_model_identifier", None)
             ):
                 raise RuntimeError(
-                    "The existing GGUF chat model is still active after "
-                    "unload; retry before loading a diffusion image model."
+                    "The existing GGUF chat model is still active or loading "
+                    "after unload; retry before loading a diffusion image model."
                 )
 
     # 2. Safetensors / HF chat backend (the InferenceOrchestrator that
@@ -1447,11 +1463,19 @@ def _release_other_gpu_owners_for_diffusion() -> None:
         logger.debug("export module not importable: %s", exc)
         return
 
+    # Round 18 P1 #6: ``get_export_backend()`` raising used to be a
+    # silent ``return`` so direct ``DiffusionBackend.load_model``
+    # callers could proceed toward GPU allocation without being able
+    # to verify export ownership. Fail closed instead, matching the
+    # route-level helper which already maps "Could not verify" /
+    # "Could not access" failures to HTTP 503.
     try:
         exp = get_export_backend()
     except Exception as exc:
-        logger.debug("export backend not available: %s", exc)
-        return
+        raise RuntimeError(
+            "Could not verify export status before loading a "
+            "diffusion image model."
+        ) from exc
 
     is_export_active_fn = getattr(exp, "is_export_active", None)
     if is_export_active_fn is not None:
@@ -1480,14 +1504,23 @@ def _release_other_gpu_owners_for_diffusion() -> None:
             )
 
     if getattr(exp, "current_checkpoint", None):
+        # Round 18 P1 #2: a wedged ``_shutdown_subprocess`` used to log
+        # at debug level and continue, so direct backend callers could
+        # allocate diffusion VRAM on top of an export checkpoint that
+        # still owned the GPU. Mirror the route-level helper and raise
+        # so the surrounding ``load_model`` bails out with a clean
+        # RuntimeError that the route layer maps to HTTP 503.
         try:
             logger.info("Shutting down idle export subprocess before diffusion load")
             exp._shutdown_subprocess()
-            exp.current_checkpoint = None
-            exp.is_vision = False
-            exp.is_peft = False
         except Exception as exc:
-            logger.debug("idle export shutdown failed: %s", exc)
+            raise RuntimeError(
+                "Could not unload the idle export checkpoint before "
+                "loading a diffusion image model."
+            ) from exc
+        exp.current_checkpoint = None
+        exp.is_vision = False
+        exp.is_peft = False
 
     # Note: active training is *not* stopped here. The route layer
     # (`_raise_if_training_active` in routes/inference.py) refuses
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index b88175e270..079a90e231 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -392,7 +392,7 @@ async def _release_llama_for(workload: str) -> None:
         workload,
     )
     try:
-        await asyncio.to_thread(llama.unload_model)
+        ok = await asyncio.to_thread(llama.unload_model)
     except Exception as exc:
         logger.warning("Failed to unload GGUF chat before %s load: %s", workload, exc)
         raise HTTPException(
@@ -403,6 +403,28 @@ async def _release_llama_for(workload: str) -> None:
             ),
         ) from exc
 
+    # Round 18 P1 #1: previously only the raised-exception path was
+    # treated as failure. ``llama.unload_model()`` returning ``False``
+    # (subprocess refused to terminate, IPC timeout) or leaving
+    # ``is_loaded`` / ``is_active`` / ``loading_model_identifier``
+    # populated after the call meant the next workload could allocate
+    # while llama-server was still resident. Re-read the same three
+    # fields and fail closed if anything is still set so the caller
+    # retries instead of double-owning VRAM.
+    if (
+        ok is False
+        or bool(getattr(llama, "is_loaded", False))
+        or bool(getattr(llama, "is_active", False))
+        or bool(getattr(llama, "loading_model_identifier", None))
+    ):
+        raise HTTPException(
+            status_code = 503,
+            detail = (
+                "The existing GGUF chat model is still active or loading "
+                f"after unload; retry before starting {workload}."
+            ),
+        )
+
 
 async def _release_safetensors_chat_for(workload: str) -> None:
     """Unload the safetensors / Unsloth chat backend (drains both
@@ -599,13 +621,27 @@ async def _release_diffusion_for(workload: str) -> None:
             ),
         ) from exc
 
-    after = {}
+    # Round 18 P1 #5: a successful pre-check status() and a
+    # success-shaped unload result used to mask a post-unload
+    # status() failure (after = {}) and let the caller proceed
+    # without proof that diffusion released VRAM. Fail closed
+    # instead so training / chat / export retry rather than
+    # double-owning the GPU.
     try:
         after = diff_backend.status()
-    except Exception:
-        # status() failure here is unusual but should not mask the
-        # primary outcome. Fall back to assuming the unload finished.
-        pass
+    except Exception as exc:
+        logger.warning(
+            "Could not verify diffusion status after unload before %s: %s",
+            workload,
+            exc,
+        )
+        raise HTTPException(
+            status_code = 503,
+            detail = (
+                f"Could not verify diffusion unload before starting "
+                f"{workload}. Try again."
+            ),
+        ) from exc
     if result is False or after.get("is_loaded") or after.get("is_loading"):
         raise HTTPException(
             status_code = 503,
@@ -2022,14 +2058,18 @@ async def diffusion_load(
     # the request is refused with 409 instead of silently killing it.
     _raise_if_training_active("diffusion")
     _raise_if_export_active("diffusion")
-    # Round 17 P1 #3: drop the chat backends through the strict
-    # route-level helpers BEFORE the diffusion load. The backend's
-    # own ``_release_chat_backend_for_diffusion`` is now strict
-    # too (round 17 P1 #2), but doing it here keeps the public API
-    # path symmetric with training / export / chat handoffs that
-    # already use ``_release_chat_for``.
-    await _release_chat_for("diffusion")
-    await _release_export_for("diffusion")
+    # Round 18 P1 #3 + P1 #7: the route used to drop chat and idle
+    # export BEFORE ``backend.load_model`` ran its cheap validation
+    # (family inference, GGUF filename checks, gated-token failures,
+    # missing diffusers). A malformed image request would therefore
+    # unload the user's chat model and then return a 400 with nothing
+    # loaded; if export cleanup raised, chat had already been dropped.
+    # ``DiffusionBackend.load_model`` itself calls
+    # ``_release_other_gpu_owners_for_diffusion`` (strict idle-export
+    # shutdown after round 18 P1 #2) and
+    # ``_release_chat_backend_for_diffusion`` (strict GGUF + safetensors
+    # unload after round 17 P1 #2 + round 18 P1 #4), so the GPU is
+    # still freed before any allocation, just AFTER validation.
     backend = _get_diffusion_backend()
     try:
         status = await asyncio.get_event_loop().run_in_executor(
diff --git a/studio/backend/routes/training.py b/studio/backend/routes/training.py
index eac5837279..355394cc10 100644
--- a/studio/backend/routes/training.py
+++ b/studio/backend/routes/training.py
@@ -279,8 +279,13 @@ async def start_training(
         )
 
         _raise_if_export_active("training")
-        await _release_chat_for("training")
+        # Round 18 P1 #8: release settled export FIRST so an export
+        # cleanup failure preserves the user's currently loaded chat
+        # model. The previous order (chat -> export) would drop chat
+        # and then refuse training when a wedged idle export raised,
+        # leaving the user with nothing loaded.
         await _release_export_for("training")
+        await _release_chat_for("training")
 
         # Also unload any loaded diffusion pipeline (Images page); it
         # holds the same GPU and would survive the inference shutdown.

From 369573b7846ffba313fe5524d3afddf96661156d Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 08:44:14 +0000
Subject: [PATCH 53/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/diffusion.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index c2112dce23..2a1e97efd0 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -1473,8 +1473,7 @@ def _release_other_gpu_owners_for_diffusion() -> None:
         exp = get_export_backend()
     except Exception as exc:
         raise RuntimeError(
-            "Could not verify export status before loading a "
-            "diffusion image model."
+            "Could not verify export status before loading a " "diffusion image model."
         ) from exc
 
     is_export_active_fn = getattr(exp, "is_export_active", None)

From c20ed25ec6ee3ad1e248de11a23594634a60824d Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 09:45:44 +0000
Subject: [PATCH 54/92] Fix/adjust diffusion: round 19 P1+P2 batch for PR #5754

P1 #1: ``_release_safetensors_chat_for`` now re-reads
``active_model_name`` and ``loading_models`` after each unload AND
runs a final sweep against the initial owned-name set. The previous
helper trusted ``unload_model() -> True`` even though the
orchestrator can respond ``unloaded`` while still holding weights,
and a concurrent ``load`` can repopulate the tracker between calls.
Per-name and global post-state mismatches now raise HTTP 503 so
the caller retries.

P1 #2: same post-state guarantee inside
``_release_chat_backend_for_diffusion`` for direct backend
callers. ``DiffusionBackend.load_model`` now raises RuntimeError
when the safetensors tracker still owns a previously-resident
name after the unload, matching the route-level helper. The route
layer's existing classifier maps the new wording to HTTP 503.

P1 #3: ``DiffusionBackend.load_model`` now preflights the full
diffusers repo (or explicit GGUF ``base_repo``) via
``hf_hub_download(filename="model_index.json")`` BEFORE the
chat / export unload runs. The GGUF path was already covered by
the existing ``hf_hub_download(gguf_filename)`` round-trip; the
full-repo path used to skip validation, so a typo'd / private /
gated repo only surfaced inside ``from_pretrained`` AFTER the
user's chat model was already dropped. Local paths are checked
structurally (must be a directory containing ``model_index.json``)
so we do not network-round-trip for an on-disk miss. Error
messages route through ``_display_repo_id`` so an absolute
filesystem path does not leak the operator's layout.

P1 #6: ``/api/inference/unload`` (the direct chat unload endpoint)
now treats either ``unload_model() -> False`` or leftover state
(``is_loaded`` / ``is_active`` / ``loading_model_identifier`` for
GGUF, ``active_model_name`` / ``loading_models`` for safetensors)
as a 503 instead of unconditionally responding
``status="unloaded"``. The UI used to show the model as gone while
the backend still owned VRAM.

P2 #7: extended the /images/load RuntimeError -> HTTPException
marker list with ``still active or loading after unload`` and
``still loading after unload``. Round 18 introduced these exact
phrasings on the backend side; without the extension a retryable
unload failure was returning HTTP 400 to the user instead of 503.

P2 #8: removed the unused ``unsloth_backend = get_inference_backend()``
eager construction in the GGUF chat-load branch. Eager
construction made the GGUF-only path needlessly fail or pay
startup cost when the safetensors backend was unavailable / lazy;
``_release_safetensors_chat_for`` already handles that case as a
no-op.

All 85 diffusion-relevant + 98 related backend tests pass locally.
---
 studio/backend/core/inference/diffusion.py | 100 +++++++++++++++++++++
 studio/backend/routes/inference.py         |  94 ++++++++++++++++++-
 2 files changed, 191 insertions(+), 3 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 2a1e97efd0..fe50588b61 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -201,6 +201,64 @@ def _expand_existing_local_path(value: str) -> str:
     return value
 
 
+def _preflight_full_diffusers_repo(repo: str, hf_token: Optional[str]) -> None:
+    """Prove a full diffusers repo is accessible before any unloads.
+
+    Round 19 P1 #3: the GGUF path's ``hf_hub_download(gguf_filename)``
+    above this function fails fast on a bad / private / gated /
+    typo'd repo before we touch the chat backend. The full diffusers
+    path used to skip that round-trip and only discover the issue
+    inside ``from_pretrained`` AFTER the user's chat model was
+    already unloaded. Add the same one-file probe (``model_index.json``
+    is the diffusers manifest; every diffusers repo has one).
+
+    Local paths are checked structurally so we do not hit the network
+    for a missing on-disk directory; both branches raise RuntimeError
+    so the surrounding load_model bails out before the chat unload.
+    The display label is collapsed via ``_display_repo_id`` so an
+    absolute filesystem path in the error message does not leak the
+    operator's layout (see round 17 P2 #9).
+    """
+    if not repo:
+        return
+    try:
+        local = Path(repo).expanduser()
+    except (OSError, ValueError):
+        local = None
+    if local is not None and local.exists():
+        if not local.is_dir():
+            raise RuntimeError(
+                f"Diffusion repo '{_display_repo_id(repo)}' is not a directory."
+            )
+        if not (local / "model_index.json").is_file():
+            raise RuntimeError(
+                f"Diffusion repo '{_display_repo_id(repo)}' is missing "
+                "model_index.json."
+            )
+        return
+    if (local is not None and local.is_absolute()) or repo.startswith("~"):
+        raise RuntimeError(
+            f"Local diffusion repo '{_display_repo_id(repo)}' does not exist."
+        )
+    try:
+        from huggingface_hub import hf_hub_download as _hf_hub_download
+    except Exception:
+        # diffusers is installed but huggingface_hub is missing -- let
+        # the downstream loader produce the canonical error.
+        return
+    try:
+        _hf_hub_download(
+            repo_id = repo,
+            filename = "model_index.json",
+            token = hf_token,
+        )
+    except Exception as exc:
+        raise RuntimeError(
+            f"Could not access diffusion repo '{_display_repo_id(repo)}' "
+            "before unloading the current model."
+        ) from exc
+
+
 def _display_repo_id(value: Any) -> Any:
     """Return a public-facing label for a repo_id / base_repo.
 
@@ -835,6 +893,21 @@ def load_model(
                             token = hf_token,
                         )
 
+                # Round 19 P1 #3: the GGUF branch above already
+                # proved repo + filename are accessible via
+                # ``hf_hub_download``. The full-diffusers path (no
+                # ``gguf_filename``) did NOT, so a typo / private /
+                # gated full repo only surfaced inside
+                # ``from_pretrained`` AFTER chat was unloaded. Probe
+                # ``effective_base`` for ``model_index.json`` here so
+                # the chat model is preserved on a bad full-repo
+                # request. Also probe when the GGUF caller supplied
+                # an explicit ``base_repo`` (the base companion is
+                # ALSO downloaded via from_pretrained further down
+                # and would OOM-then-fail past the unload).
+                if not gguf_filename or base_repo:
+                    _preflight_full_diffusers_repo(effective_base, hf_token)
+
                 # All cheap failure points (bad gguf_filename, missing
                 # pipeline / transformer class, gated download token,
                 # transient Hub error on the GGUF download) have now
@@ -1384,6 +1457,7 @@ def _release_chat_backend_for_diffusion() -> None:
     backend = get_inference_backend()
     active_model_name = getattr(backend, "active_model_name", None)
     loading_models = set(getattr(backend, "loading_models", set()) or set())
+    owned_names = {name for name in ({active_model_name} | loading_models) if name}
 
     def _require_unload(model_name: str) -> None:
         try:
@@ -1398,6 +1472,20 @@ def _require_unload(model_name: str) -> None:
                 f"Safetensors backend refused to unload '{model_name}' "
                 "before loading a diffusion image model."
             )
+        # Round 19 P1 #2: per-name post-state check. ``unload_model``
+        # returning ``True`` does not guarantee the orchestrator
+        # actually dropped the weights; the worker may have responded
+        # while still holding them, or a concurrent ``load`` may have
+        # repopulated the tracker. Verify the specific name is gone
+        # so the surrounding diffusion load bails out instead of
+        # silently double-owning VRAM.
+        active_after = getattr(backend, "active_model_name", None)
+        loading_after = set(getattr(backend, "loading_models", set()) or set())
+        if active_after == model_name or model_name in loading_after:
+            raise RuntimeError(
+                f"Safetensors chat model '{model_name}' is still active "
+                "or loading after unload; retry before loading a diffusion image model."
+            )
 
     if active_model_name:
         logger.info(
@@ -1414,6 +1502,18 @@ def _require_unload(model_name: str) -> None:
         )
         _require_unload(loading)
 
+    # Round 19 P1 #2: final sweep using the initial snapshot of
+    # owned names. Catches races where a name we did not explicitly
+    # unload (because it appeared in loading_models between the
+    # snapshot and the unload calls) is still owned after the loop.
+    remaining_loading = set(getattr(backend, "loading_models", set()) or set()) & owned_names
+    remaining_active = getattr(backend, "active_model_name", None)
+    if remaining_loading or (remaining_active in owned_names):
+        raise RuntimeError(
+            "The existing safetensors chat model is still active or loading "
+            "after unload; retry before loading a diffusion image model."
+        )
+
 
 def _release_other_gpu_owners_for_diffusion() -> None:
     """Best-effort: shut down export subprocess + active training before
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index 079a90e231..b20e7e3696 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -464,9 +464,27 @@ async def _unload_required(model_name: str) -> None:
                     "Try again."
                 ),
             )
+        # Round 19 P1 #1: ``unload_model`` returning ``True`` does not
+        # by itself guarantee the orchestrator dropped the model. The
+        # worker may have responded ``unloaded`` while still holding
+        # weights, or a concurrent ``load`` from another tab may have
+        # repopulated ``loading_models`` between calls. Re-read the
+        # tracker fields and fail closed if this specific name is
+        # still active or loading so the caller retries.
+        remaining_loading = set(getattr(inf, "loading_models", set()) or set())
+        active_after = getattr(inf, "active_model_name", None)
+        if active_after == model_name or model_name in remaining_loading:
+            raise HTTPException(
+                status_code = 503,
+                detail = (
+                    f"Safetensors chat model '{model_name}' is still active "
+                    f"or loading after unload; retry before starting {workload}."
+                ),
+            )
 
     active_model_name = getattr(inf, "active_model_name", None)
     loading_models = set(getattr(inf, "loading_models", set()) or set())
+    owned_names = {name for name in ({active_model_name} | loading_models) if name}
     if active_model_name:
         logger.info(
             "Unloading safetensors chat '%s' before %s load",
@@ -484,6 +502,21 @@ async def _unload_required(model_name: str) -> None:
         )
         await _unload_required(loading)
 
+    # Round 19 P1 #1: final sweep using the set of names that were
+    # initially present. Catches races where a model name we did not
+    # explicitly unload (because it appeared between the snapshot and
+    # the unload calls) is still in the owned set after the loop.
+    remaining_loading = set(getattr(inf, "loading_models", set()) or set()) & owned_names
+    remaining_active = getattr(inf, "active_model_name", None)
+    if remaining_loading or (remaining_active in owned_names):
+        raise HTTPException(
+            status_code = 503,
+            detail = (
+                "The existing safetensors chat model is still active or loading "
+                f"after unload; retry before starting {workload}."
+            ),
+        )
+
 
 async def _release_chat_for(workload: str) -> None:
     """Shared 'release any GPU-owning chat backend' helper.
@@ -1161,7 +1194,14 @@ async def load_model(
             await _release_export_for("GGUF chat")
 
             llama_backend = get_llama_cpp_backend()
-            unsloth_backend = get_inference_backend()
+            # Round 19 P2 #8: previously also called
+            # ``unsloth_backend = get_inference_backend()`` here, but
+            # the binding was never used in the GGUF branch. Eager
+            # construction makes the GGUF-only path needlessly fail
+            # or pay startup cost when the safetensors backend is
+            # unavailable / lazy-initialised; the shared
+            # ``_release_safetensors_chat_for`` below already
+            # handles missing-backend cases as a no-op.
 
             # Unload any safetensors / Unsloth model first to free
             # VRAM. Uses the shared helper so we also drain
@@ -1632,16 +1672,57 @@ async def unload_model(
             )
             or not llama_backend.is_loaded
         ):
-            llama_backend.unload_model()
+            # Round 19 P1 #6: previously this called
+            # ``llama_backend.unload_model()`` and unconditionally
+            # returned ``status="unloaded"`` even when the subprocess
+            # refused to terminate or IPC timed out. The frontend then
+            # showed the model as unloaded while llama-server was
+            # still resident. Treat ``False`` / leftover state as a
+            # 503 so the user retries.
+            ok = await asyncio.to_thread(llama_backend.unload_model)
+            if (
+                ok is False
+                or getattr(llama_backend, "is_loaded", False)
+                or getattr(llama_backend, "is_active", False)
+                or getattr(llama_backend, "loading_model_identifier", None)
+            ):
+                raise HTTPException(
+                    status_code = 503,
+                    detail = (
+                        "The GGUF model is still active or loading after unload. "
+                        "Try again."
+                    ),
+                )
             logger.info(f"Unloaded GGUF model: {request.model_path}")
             return UnloadResponse(status = "unloaded", model = request.model_path)
 
         # Otherwise, unload from Unsloth backend
         backend = get_inference_backend()
-        backend.unload_model(request.model_path)
+        # Round 19 P1 #6: same fail-closed treatment for safetensors.
+        # ``unload_model`` returning ``False`` or leaving
+        # ``active_model_name`` / ``loading_models`` populated for the
+        # requested name must surface to the client so the UI reflects
+        # the real state.
+        ok = await asyncio.to_thread(backend.unload_model, request.model_path)
+        active_after = getattr(backend, "active_model_name", None)
+        loading_after = set(getattr(backend, "loading_models", set()) or set())
+        if (
+            ok is False
+            or active_after == request.model_path
+            or request.model_path in loading_after
+        ):
+            raise HTTPException(
+                status_code = 503,
+                detail = (
+                    "The safetensors model is still active or loading after "
+                    "unload. Try again."
+                ),
+            )
         logger.info(f"Unloaded model: {request.model_path}")
         return UnloadResponse(status = "unloaded", model = request.model_path)
 
+    except HTTPException:
+        raise
     except Exception as e:
         logger.error(f"Error unloading model: {e}", exc_info = True)
         raise HTTPException(status_code = 500, detail = f"Failed to unload model: {str(e)}")
@@ -2098,6 +2179,13 @@ async def diffusion_load(
             or "Could not unload" in detail
             or "refused to unload" in detail
             or "still active after unload" in detail
+            # Round 19 P2 #7: round 18 introduced new RuntimeError
+            # phrasings (``still active or loading after unload``)
+            # that the original marker list did not cover, so a
+            # retryable chat-unload failure was returning HTTP 400
+            # to the user instead of 503. Match both wordings.
+            or "still active or loading after unload" in detail
+            or "still loading after unload" in detail
         ):
             # Round 17 P1 #2: chat unload failures raised by the
             # backend helper map to 503 (retryable infra issue),

From c520a473aed06cebca729204ff5439b78e3f37f1 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 09:46:13 +0000
Subject: [PATCH 55/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/diffusion.py | 4 +++-
 studio/backend/routes/inference.py         | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index fe50588b61..20d67a9cf3 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -1506,7 +1506,9 @@ def _require_unload(model_name: str) -> None:
     # owned names. Catches races where a name we did not explicitly
     # unload (because it appeared in loading_models between the
     # snapshot and the unload calls) is still owned after the loop.
-    remaining_loading = set(getattr(backend, "loading_models", set()) or set()) & owned_names
+    remaining_loading = (
+        set(getattr(backend, "loading_models", set()) or set()) & owned_names
+    )
     remaining_active = getattr(backend, "active_model_name", None)
     if remaining_loading or (remaining_active in owned_names):
         raise RuntimeError(
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index b20e7e3696..7d18972a91 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -506,7 +506,9 @@ async def _unload_required(model_name: str) -> None:
     # initially present. Catches races where a model name we did not
     # explicitly unload (because it appeared between the snapshot and
     # the unload calls) is still in the owned set after the loop.
-    remaining_loading = set(getattr(inf, "loading_models", set()) or set()) & owned_names
+    remaining_loading = (
+        set(getattr(inf, "loading_models", set()) or set()) & owned_names
+    )
     remaining_active = getattr(inf, "active_model_name", None)
     if remaining_loading or (remaining_active in owned_names):
         raise HTTPException(

From ff3bad37fe18dee202e79f9de1d80b8088acd6ff Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 10:07:11 +0000
Subject: [PATCH 56/92] Fix/adjust diffusion: round 20 P1+P2 batch for PR #5754

P1 #1: ``_preflight_full_diffusers_repo(effective_base, hf_token)``
now runs for every load mode, including the GGUF-with-auto-base
path. Round 19 only preflighted the full repo or an explicit
``base_repo``, so an auto-picked companion that turned out to be
gated / private / missing still unloaded the user's chat model
before ``from_pretrained`` failed. ``effective_base`` is the same
value that feeds every downstream allocation, so preflighting it
unconditionally catches all three modes.

P1 #2: ``diffusers.GGUFQuantizationConfig`` (which imports the
``gguf`` package at construction time) is now built up front,
inside the same try block that surfaces "Re-run Studio setup".
Previously the missing-dependency exception fired AFTER
``_release_other_gpu_owners_for_diffusion`` and
``_release_chat_backend_for_diffusion`` had already taken the
chat / export models down. The downstream from_single_file call
reuses the same ``quant_config`` reference.

P1 #4: ``studio/backend/requirements/studio.txt`` now lists
``diffusers>=0.37.0`` and ``gguf>=0.10.0``. These were only in
the extras files, so fresh standard Studio installs failed on
/images/load with the round 20 P1 #2 dependency error message.

P1 #5: ``LoadRequest``, ``UnloadRequest``, and
``ValidateModelRequest`` now apply the same control-character +
embedded-HF-token validators that ``DiffusionLoadRequest``
already had. /api/inference/load, /api/inference/validate, and
/api/inference/unload used to accept newline / tab / control
characters in ``model_path`` (log-line smuggling) and URL-form
``https://hf_xxxxx@huggingface.co/...`` (credential leak through
structured log sinks).

P2 #6: ``_collapse_local`` in the diffusion load-error scrubber
now resolves relative candidates and adds the absolute form to
the substring set. A relative ``exports/my-flux`` used to leak
``/mnt/disks/.../exports/my-flux/...`` via downstream library
errors because the scrubber only matched the original literal.
Replacement is longest-first so a leaf-only context survives.

All 85 diffusion-relevant + 35 related model-validation tests
pass locally.

(P1 #3, the cross-workload GPU handoff lock, is deferred: it deserves
a focused design pass across /images/load, /chat/load (both
branches), /training/start, and /export/load to pick a lock
boundary that does not deadlock against the backend load locks
or stall the SSE log stream.)
---
 studio/backend/core/inference/diffusion.py    | 90 +++++++++++++------
 studio/backend/models/inference.py            | 46 ++++++++++
 studio/backend/requirements/studio.txt        |  7 ++
 .../backend/tests/test_diffusion_backend.py   | 10 ++-
 4 files changed, 124 insertions(+), 29 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 20d67a9cf3..abb541779e 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -893,20 +893,39 @@ def load_model(
                             token = hf_token,
                         )
 
-                # Round 19 P1 #3: the GGUF branch above already
-                # proved repo + filename are accessible via
-                # ``hf_hub_download``. The full-diffusers path (no
-                # ``gguf_filename``) did NOT, so a typo / private /
-                # gated full repo only surfaced inside
-                # ``from_pretrained`` AFTER chat was unloaded. Probe
-                # ``effective_base`` for ``model_index.json`` here so
-                # the chat model is preserved on a bad full-repo
-                # request. Also probe when the GGUF caller supplied
-                # an explicit ``base_repo`` (the base companion is
-                # ALSO downloaded via from_pretrained further down
-                # and would OOM-then-fail past the unload).
-                if not gguf_filename or base_repo:
-                    _preflight_full_diffusers_repo(effective_base, hf_token)
+                # Round 20 P1 #1: every load mode (full diffusers
+                # repo, GGUF + explicit base_repo, GGUF + auto-picked
+                # base_repo) feeds ``effective_base`` into
+                # ``from_pretrained`` further down. The round 19
+                # preflight only ran for the first two, so an
+                # auto-picked GGUF companion that turned out to be
+                # gated / private / missing still unloaded chat
+                # before the load failed. Always preflight
+                # ``effective_base`` so a bad companion repo is
+                # caught BEFORE chat / export are released.
+                _preflight_full_diffusers_repo(effective_base, hf_token)
+
+                # Round 20 P1 #2: ``diffusers.GGUFQuantizationConfig``
+                # imports the ``gguf`` package lazily at construction
+                # time. Partial Studio installs (``diffusers`` present,
+                # ``gguf`` not) used to discover that AFTER the chat /
+                # export release calls. Build the quant config up
+                # front so the missing-dependency surface raises
+                # while the user's chat model is still resident.
+                quant_config = None
+                if gguf_filename:
+                    try:
+                        quant_config = diffusers.GGUFQuantizationConfig(
+                            compute_dtype = dtype
+                        )
+                    except ModuleNotFoundError as exc:
+                        missing = exc.name or str(exc)
+                        raise RuntimeError(
+                            "Diffusion GGUF loading requires the gguf "
+                            "runtime package. Missing dependency: "
+                            f"{missing}. Re-run Studio setup before "
+                            "loading an image GGUF."
+                        ) from exc
 
                 # All cheap failure points (bad gguf_filename, missing
                 # pipeline / transformer class, gated download token,
@@ -965,7 +984,8 @@ def load_model(
                     _drain_cuda_cache()
 
                 if gguf_filename:
-                    quant_config = diffusers.GGUFQuantizationConfig(compute_dtype = dtype)
+                    # ``quant_config`` was already constructed above
+                    # (round 20 P1 #2 pre-release fail-fast).
                     # Diffusers-format GGUFs (FLUX.2 klein / Qwen-Image /
                     # SD3) need the matching base repo's component config
                     # at config=<base_repo>, subfolder="transformer".
@@ -1098,20 +1118,34 @@ def _collapse_local(msg: str, candidate: Optional[str]) -> str:
                     except (OSError, ValueError):
                         return msg
                     leaf = p.name or candidate
-                    abs_str = None
-                    if p.is_absolute() or p.exists():
-                        try:
-                            abs_str = str(p)
-                        except (OSError, ValueError):
-                            abs_str = None
-                    if abs_str and abs_str in msg:
-                        msg = msg.replace(abs_str, leaf)
-                    if (
-                        candidate != leaf
-                        and candidate in msg
-                        and ("/" in candidate or "\\" in candidate)
+                    needles: set[str] = set()
+                    # Round 20 P2 #6: a relative candidate like
+                    # ``exports/my-flux`` used to collapse only the
+                    # exact ``exports/my-flux`` substring, but
+                    # downstream libraries (diffusers / safetensors)
+                    # resolve and emit ``/mnt/disks/.../exports/my-flux/...``
+                    # absolute strings that leaked the operator's
+                    # filesystem layout. Also scrub the resolved
+                    # absolute form so the leaf is the only path
+                    # fragment that survives.
+                    try:
+                        if p.exists():
+                            needles.add(str(p.resolve()))
+                        elif p.is_absolute():
+                            needles.add(str(p))
+                    except (OSError, ValueError):
+                        pass
+                    if "/" in candidate or "\\" in candidate:
+                        needles.add(candidate)
+                    # Replace longest first so a parent-directory
+                    # substring does not blank out the leaf-only
+                    # context the user needs.
+                    for needle in sorted(
+                        (n for n in needles if n and n != leaf),
+                        key = len,
+                        reverse = True,
                     ):
-                        msg = msg.replace(candidate, leaf)
+                        msg = msg.replace(needle, leaf)
                     return msg
 
                 # ``effective_base`` and ``gguf_filename`` are local
diff --git a/studio/backend/models/inference.py b/studio/backend/models/inference.py
index d379989241..14c8414d74 100644
--- a/studio/backend/models/inference.py
+++ b/studio/backend/models/inference.py
@@ -60,6 +60,24 @@ def normalize_blank_chat_template_override(
             return None
         return value
 
+    # Round 20 P1 #5: extend the diffusion-side identifier hardening
+    # (round 5 P2 / round 15 P1 #5) to the chat LoadRequest. Newline
+    # / tab / control characters in ``model_path`` or ``gguf_variant``
+    # would otherwise be echoed verbatim into structured-log lines
+    # ("Loading model %s") and let a caller smuggle in fake log
+    # entries, and an embedded ``hf_...`` token in a URL-form path
+    # would leak the credential into the same log sinks the
+    # diffusion route already redacts.
+    @field_validator("model_path", "gguf_variant")
+    @classmethod
+    def _no_identifier_control_chars(cls, v, info):
+        return _no_control_chars(v, info.field_name)
+
+    @field_validator("model_path")
+    @classmethod
+    def _no_embedded_hf_tokens(cls, v, info):
+        return _reject_embedded_hf_token(v, info.field_name)
+
     cache_type_kv: Optional[str] = Field(
         None,
         description = "KV cache data type for both K and V (e.g. 'f16', 'bf16', 'q8_0', 'q4_1', 'q5_1')",
@@ -110,6 +128,20 @@ class UnloadRequest(BaseModel):
 
     model_path: str = Field(..., description = "Model identifier to unload")
 
+    # Round 20 P1 #5: mirror the LoadRequest identifier hardening so
+    # /api/inference/unload also rejects control characters and
+    # URL-embedded HF tokens before the path reaches structured log
+    # sinks.
+    @field_validator("model_path")
+    @classmethod
+    def _no_identifier_control_chars(cls, v, info):
+        return _no_control_chars(v, info.field_name)
+
+    @field_validator("model_path")
+    @classmethod
+    def _no_embedded_hf_tokens(cls, v, info):
+        return _reject_embedded_hf_token(v, info.field_name)
+
 
 class ValidateModelRequest(BaseModel):
     """
@@ -130,6 +162,20 @@ class ValidateModelRequest(BaseModel):
         None, description = "GGUF quantization variant (e.g. 'Q4_K_M')"
     )
 
+    # Round 20 P1 #5: same identifier hardening as LoadRequest /
+    # UnloadRequest. /api/inference/validate flows directly into
+    # ``ModelConfig.from_identifier`` and the resulting log lines, so
+    # control characters and embedded HF tokens must not survive.
+    @field_validator("model_path", "gguf_variant")
+    @classmethod
+    def _no_identifier_control_chars(cls, v, info):
+        return _no_control_chars(v, info.field_name)
+
+    @field_validator("model_path")
+    @classmethod
+    def _no_embedded_hf_tokens(cls, v, info):
+        return _reject_embedded_hf_token(v, info.field_name)
+
 
 class ValidateModelResponse(BaseModel):
     """
diff --git a/studio/backend/requirements/studio.txt b/studio/backend/requirements/studio.txt
index 6628eef7f7..b360261e44 100644
--- a/studio/backend/requirements/studio.txt
+++ b/studio/backend/requirements/studio.txt
@@ -23,3 +23,10 @@ diceware
 ddgs
 cryptography>=42.0.0
 httpx>=0.27.0
+# Studio Images page runtime. Flux2KleinPipeline / Flux2Pipeline /
+# QwenImagePipeline / StableDiffusion3Pipeline are available in
+# diffusers>=0.37.0, and GGUFQuantizationConfig requires the gguf
+# package (round 20 P1 #4: fresh standard Studio installs failed on
+# /images/load because these were only listed in the extras files).
+diffusers>=0.37.0
+gguf>=0.10.0
diff --git a/studio/backend/tests/test_diffusion_backend.py b/studio/backend/tests/test_diffusion_backend.py
index 5d739043e0..85a45caf15 100644
--- a/studio/backend/tests/test_diffusion_backend.py
+++ b/studio/backend/tests/test_diffusion_backend.py
@@ -1305,7 +1305,15 @@ def from_pretrained(cls, base, **kwargs):
         ),
     )
 
-    def _boom(**_):
+    def _boom(**kwargs):
+        # Round 20 P1 #1 added a base-repo preflight that downloads
+        # the diffusers ``model_index.json`` of the auto-picked
+        # companion repo BEFORE the chat unload. Allow that call
+        # through (it would otherwise hit the network) but still
+        # reject any attempt to download the GGUF itself, which is
+        # what this test guards.
+        if kwargs.get("filename") == "model_index.json":
+            return "/tmp/model_index.json"
         raise AssertionError("hf_hub_download must not run for a local dir")
 
     fake_hub = SimpleNamespace(hf_hub_download = _boom)

From 04bd9b2da56de6378f31149dd6ad4375e527efa0 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 10:32:43 +0000
Subject: [PATCH 57/92] Fix/adjust diffusion: round 21 P1+P2 batch for PR #5754

P1 #1 + #2: ``LoadRequest._no_embedded_hf_tokens`` and
``ValidateModelRequest._no_embedded_hf_tokens`` now cover
``gguf_variant`` in addition to ``model_path``. A caller could
pass a variant like ``Q4_K_M-hf_xxxxxxxx`` that flowed into
structured log sinks via the GGUF resolver path; the matching
``DiffusionLoadRequest`` validator already covered every string
field, so this restores parity.

P1 #3: ``/api/inference/unload`` now also matches the llama
``loading_model_identifier`` when picking the GGUF branch. A
pending GGUF download (``is_active`` still False,
``loading_model_identifier`` populated) used to fall through to
the safetensors branch and respond ``status="unloaded"`` while
llama-server kept downloading.

P1 #4 + #5: the final safetensors-handoff sweeps (route-level
``_release_safetensors_chat_for`` and backend
``_release_chat_backend_for_diffusion``) now check ``active_model_name``
and ``loading_models`` WITHOUT the initial ``owned_names`` filter.
A concurrent ``/load`` that landed AFTER the snapshot was
previously ignored, so when a chat model began loading during the
unload window, training / export / GGUF chat / diffusion started
anyway and raced the new chat load for VRAM. Any surviving active /
loading entry now fails the handoff (HTTP 503 from the route sweep,
``RuntimeError`` from the backend sweep) so the caller retries.

P2 #6: added ``_preflight_diffusers_subfolder_config`` and invoke it
with ``(effective_base, "transformer", hf_token)`` for GGUF loads
whose family defines a transformer class. A custom base companion
that had ``model_index.json`` but lacked
``transformer/config.json`` previously passed the round 19
preflight, unloaded chat, then failed inside
``from_single_file``.

P2 #7: ``_scrub_validation_obj`` in main.py also scrubs string
dict KEYS. Pydantic ``string_type`` errors surface ``input``
verbatim, and a malformed payload like
``{"repo_id": {"hf_xxxxx": "owner/repo"}}`` would otherwise leak
the token through the 422 response body.

All 85 diffusion-relevant + 35 model-validation tests pass
locally. The existing fakes for ``hf_hub_download`` were updated to
accept the new ``subfolder=`` kwarg the round 21 preflight uses.

(Round 20's P1 #3, the cross-workload GPU handoff lock, is still
deferred; it is a different item from this round's P1 #3 above.
Round 21's P1 #4 / #5 raised the sweep-level guarantee, which closes
the most common race without the deadlock risk of holding a
process-wide lock across the entire load.)
---
 studio/backend/core/inference/diffusion.py    | 88 +++++++++++++++++--
 studio/backend/main.py                        | 10 ++-
 studio/backend/models/inference.py            | 11 ++-
 studio/backend/routes/inference.py            | 47 +++++++---
 .../backend/tests/test_diffusion_backend.py   | 24 +++--
 5 files changed, 146 insertions(+), 34 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index abb541779e..733a5d1bbe 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -201,6 +201,60 @@ def _expand_existing_local_path(value: str) -> str:
     return value
 
 
+def _preflight_diffusers_subfolder_config(
+    repo: str,
+    subfolder: str,
+    hf_token: Optional[str],
+) -> None:
+    """Round 21 P2 #6: also probe ``{subfolder}/config.json``.
+
+    The full-repo preflight at ``_preflight_full_diffusers_repo``
+    only proves ``model_index.json`` exists. For GGUF loads the
+    follow-up ``from_single_file(..., config=effective_base,
+    subfolder="transformer")`` still needs a matching
+    ``transformer/config.json`` on the base companion. Without
+    this second probe a base that has model_index.json but no
+    transformer config would still unload chat before the load
+    failed.
+    """
+    if not repo or not subfolder:
+        return
+    try:
+        local = Path(repo).expanduser()
+    except (OSError, ValueError):
+        local = None
+    if local is not None and local.exists():
+        config_path = local / subfolder / "config.json"
+        if not config_path.is_file():
+            raise RuntimeError(
+                f"Diffusion repo '{_display_repo_id(repo)}' is missing "
+                f"{subfolder}/config.json."
+            )
+        return
+    if (local is not None and local.is_absolute()) or repo.startswith("~"):
+        # Local-only path that does not exist -- _preflight_full_diffusers_repo
+        # already raised for the absent directory, so reaching here means the
+        # caller is loading a Hub id that just looks like a path. Fall through
+        # to the network probe.
+        pass
+    try:
+        from huggingface_hub import hf_hub_download as _hf_hub_download
+    except Exception:
+        return
+    try:
+        _hf_hub_download(
+            repo_id = repo,
+            filename = "config.json",
+            subfolder = subfolder,
+            token = hf_token,
+        )
+    except Exception as exc:
+        raise RuntimeError(
+            f"Could not access diffusion repo '{_display_repo_id(repo)}' "
+            f"{subfolder}/config.json before unloading the current model."
+        ) from exc
+
+
 def _preflight_full_diffusers_repo(repo: str, hf_token: Optional[str]) -> None:
     """Prove a full diffusers repo is accessible before any unloads.
 
@@ -904,6 +958,21 @@ def load_model(
                 # ``effective_base`` so a bad companion repo is
                 # caught BEFORE chat / export are released.
                 _preflight_full_diffusers_repo(effective_base, hf_token)
+                # Round 21 P2 #6: the GGUF transformer path also
+                # consumes ``effective_base`` via
+                # ``from_single_file(config=effective_base,
+                # subfolder="transformer")``. A base that has
+                # ``model_index.json`` but lacks
+                # ``transformer/config.json`` would pass the
+                # round-19 preflight and only fail AFTER the chat
+                # unload. Run the subfolder probe too so the
+                # second cheap failure mode is also caught early.
+                if gguf_filename and fam.transformer_class:
+                    _preflight_diffusers_subfolder_config(
+                        effective_base,
+                        "transformer",
+                        hf_token,
+                    )
 
                 # Round 20 P1 #2: ``diffusers.GGUFQuantizationConfig``
                 # imports the ``gguf`` package lazily at construction
@@ -1536,17 +1605,18 @@ def _require_unload(model_name: str) -> None:
         )
         _require_unload(loading)
 
-    # Round 19 P1 #2: final sweep using the initial snapshot of
-    # owned names. Catches races where a name we did not explicitly
-    # unload (because it appeared in loading_models between the
-    # snapshot and the unload calls) is still owned after the loop.
-    remaining_loading = (
-        set(getattr(backend, "loading_models", set()) or set()) & owned_names
-    )
+    # Round 21 P1 #5: final sweep without the owned_names filter.
+    # A concurrent ``/load`` that appeared AFTER the initial
+    # snapshot was previously ignored, so a chat model that started
+    # loading during the diffusion handoff slipped through and
+    # raced the diffusion allocation for VRAM. Treat ANY surviving
+    # active / loading entry as a failure so the surrounding
+    # load_model raises and the caller retries.
+    remaining_loading = set(getattr(backend, "loading_models", set()) or set())
     remaining_active = getattr(backend, "active_model_name", None)
-    if remaining_loading or (remaining_active in owned_names):
+    if remaining_loading or remaining_active:
         raise RuntimeError(
-            "The existing safetensors chat model is still active or loading "
+            "A safetensors chat model is still active or loading "
             "after unload; retry before loading a diffusion image model."
         )
 
diff --git a/studio/backend/main.py b/studio/backend/main.py
index 4ec5e0fc2e..8d67b2f02a 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -333,7 +333,15 @@ def _scrub_validation_obj(value):
     if isinstance(value, list):
         return [_scrub_validation_obj(v) for v in value]
     if isinstance(value, dict):
-        return {k: _scrub_validation_obj(v) for k, v in value.items()}
+        # Round 21 P2 #7: pydantic surfaces ``input`` for ``string_type``
+        # validation errors verbatim, including dict KEYS like
+        # ``{"hf_xxxxx": "owner/repo"}``. Scrub string keys too so the
+        # token does not leak through the 422 response body.
+        return {
+            (_scrub_validation_obj(k) if isinstance(k, str) else k):
+            _scrub_validation_obj(v)
+            for k, v in value.items()
+        }
     return value
 
 
diff --git a/studio/backend/models/inference.py b/studio/backend/models/inference.py
index 14c8414d74..5691c3d0d5 100644
--- a/studio/backend/models/inference.py
+++ b/studio/backend/models/inference.py
@@ -73,7 +73,12 @@ def normalize_blank_chat_template_override(
     def _no_identifier_control_chars(cls, v, info):
         return _no_control_chars(v, info.field_name)
 
-    @field_validator("model_path")
+    # Round 21 P1 #1: also reject embedded HF tokens in
+    # ``gguf_variant``. A caller can pass a variant string like
+    # ``Q4_K_M-hf_xxxxxxxx`` that flows into log sinks via the
+    # GGUF resolver path; without this only ``model_path`` was
+    # protected.
+    @field_validator("model_path", "gguf_variant")
     @classmethod
     def _no_embedded_hf_tokens(cls, v, info):
         return _reject_embedded_hf_token(v, info.field_name)
@@ -171,7 +176,9 @@ class ValidateModelRequest(BaseModel):
     def _no_identifier_control_chars(cls, v, info):
         return _no_control_chars(v, info.field_name)
 
-    @field_validator("model_path")
+    # Round 21 P1 #2: extend embedded-token rejection to
+    # ``gguf_variant`` here too (mirrors LoadRequest).
+    @field_validator("model_path", "gguf_variant")
     @classmethod
     def _no_embedded_hf_tokens(cls, v, info):
         return _reject_embedded_hf_token(v, info.field_name)
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index 7d18972a91..a4d8f3d5ce 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -502,19 +502,20 @@ async def _unload_required(model_name: str) -> None:
         )
         await _unload_required(loading)
 
-    # Round 19 P1 #1: final sweep using the set of names that were
-    # initially present. Catches races where a model name we did not
-    # explicitly unload (because it appeared between the snapshot and
-    # the unload calls) is still in the owned set after the loop.
-    remaining_loading = (
-        set(getattr(inf, "loading_models", set()) or set()) & owned_names
-    )
+    # Round 21 P1 #4: final sweep without the owned_names filter.
+    # A concurrent ``/load`` that appeared AFTER the initial
+    # snapshot was previously ignored here, so a chat model that
+    # started loading during the unload window let the surrounding
+    # training / export / GGUF / diffusion start anyway. Treat ANY
+    # surviving active / loading entry as a failure so the caller
+    # retries rather than racing the new chat load for VRAM.
+    remaining_loading = set(getattr(inf, "loading_models", set()) or set())
     remaining_active = getattr(inf, "active_model_name", None)
-    if remaining_loading or (remaining_active in owned_names):
+    if remaining_loading or remaining_active:
         raise HTTPException(
             status_code = 503,
             detail = (
-                "The existing safetensors chat model is still active or loading "
+                "A safetensors chat model is still active or loading "
                 f"after unload; retry before starting {workload}."
             ),
         )
@@ -1667,12 +1668,32 @@ async def unload_model(
     try:
         # Check if the GGUF backend has this model loaded or is loading it
         llama_backend = get_llama_cpp_backend()
-        if llama_backend.is_active and (
-            llama_backend.model_identifier == request.model_path
+        loaded_identifier = getattr(llama_backend, "model_identifier", None)
+        loading_identifier = getattr(llama_backend, "loading_model_identifier", None)
+        # Round 21 P1 #3: a GGUF download that has not yet flipped
+        # ``is_active`` to True (model_identifier still None,
+        # ``loading_model_identifier`` populated) used to fall
+        # through to the safetensors branch, which silently
+        # responded ``status="unloaded"`` while llama-server kept
+        # downloading. Match on either the loaded OR loading
+        # identifier so the explicit unload route can actually
+        # cancel a pending GGUF load.
+        llama_matches_request = (
+            loaded_identifier == request.model_path
+            or loading_identifier == request.model_path
+            or is_registered_native_path_label(
+                loaded_identifier, request.model_path
+            )
             or is_registered_native_path_label(
-                llama_backend.model_identifier, request.model_path
+                loading_identifier, request.model_path
             )
-            or not llama_backend.is_loaded
+        )
+        if (
+            getattr(llama_backend, "is_active", False)
+            or loading_identifier
+        ) and (
+            llama_matches_request
+            or not getattr(llama_backend, "is_loaded", False)
         ):
             # Round 19 P1 #6: previously this called
             # ``llama_backend.unload_model()`` and unconditionally
diff --git a/studio/backend/tests/test_diffusion_backend.py b/studio/backend/tests/test_diffusion_backend.py
index 85a45caf15..5a94eef120 100644
--- a/studio/backend/tests/test_diffusion_backend.py
+++ b/studio/backend/tests/test_diffusion_backend.py
@@ -393,10 +393,15 @@ def to(self, device):
     monkeypatch.setitem(sys.modules, "diffusers", fake)
 
     # Pretend HF Hub gave us a local file without actually fetching.
+    # Round 21: accept arbitrary kwargs (round 20 preflight adds
+    # ``filename="model_index.json"`` and round 21 preflight adds
+    # ``subfolder="transformer"``) so existing tests that exercise
+    # the GGUF path do not hit a TypeError from the fake signature.
     fake_hub = types.ModuleType("huggingface_hub")
-    fake_hub.hf_hub_download = (
-        lambda repo_id, filename, token = None: f"/fake/{repo_id}/{filename}"
-    )
+    def _fake_download(repo_id, filename, token = None, subfolder = None, **_kwargs):
+        sub = f"{subfolder}/" if subfolder else ""
+        return f"/fake/{repo_id}/{sub}{filename}"
+    fake_hub.hf_hub_download = _fake_download
     monkeypatch.setitem(sys.modules, "huggingface_hub", fake_hub)
 
     # Force CPU dtype so the test does not need CUDA.
@@ -1308,12 +1313,13 @@ def from_pretrained(cls, base, **kwargs):
     def _boom(**kwargs):
         # Round 20 P1 #1 added a base-repo preflight that downloads
         # the diffusers ``model_index.json`` of the auto-picked
-        # companion repo BEFORE the chat unload. Allow that call
-        # through (it would otherwise hit the network) but still
-        # reject any attempt to download the GGUF itself, which is
-        # what this test guards.
-        if kwargs.get("filename") == "model_index.json":
-            return "/tmp/model_index.json"
+        # companion repo BEFORE the chat unload. Round 21 P2 #6
+        # added a second preflight for ``transformer/config.json``
+        # on that same companion. Allow both preflight kinds through
+        # but still reject any attempt to download the GGUF itself,
+        # which is what this test guards.
+        if kwargs.get("filename") in ("model_index.json", "config.json"):
+            return "/tmp/preflight"
         raise AssertionError("hf_hub_download must not run for a local dir")
 
     fake_hub = SimpleNamespace(hf_hub_download = _boom)

From 63f3faf022660ce33a315f7e966ce71bc326a5dc Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 10:33:03 +0000
Subject: [PATCH 58/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/main.py                         |  5 +++--
 studio/backend/routes/inference.py             | 16 ++++------------
 studio/backend/tests/test_diffusion_backend.py |  2 ++
 3 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/studio/backend/main.py b/studio/backend/main.py
index 8d67b2f02a..ca862b1ccb 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -338,8 +338,9 @@ def _scrub_validation_obj(value):
         # ``{"hf_xxxxx": "owner/repo"}``. Scrub string keys too so the
         # token does not leak through the 422 response body.
         return {
-            (_scrub_validation_obj(k) if isinstance(k, str) else k):
-            _scrub_validation_obj(v)
+            (
+                _scrub_validation_obj(k) if isinstance(k, str) else k
+            ): _scrub_validation_obj(v)
             for k, v in value.items()
         }
     return value
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index a4d8f3d5ce..ce7ddd1342 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -1681,19 +1681,11 @@ async def unload_model(
         llama_matches_request = (
             loaded_identifier == request.model_path
             or loading_identifier == request.model_path
-            or is_registered_native_path_label(
-                loaded_identifier, request.model_path
-            )
-            or is_registered_native_path_label(
-                loading_identifier, request.model_path
-            )
+            or is_registered_native_path_label(loaded_identifier, request.model_path)
+            or is_registered_native_path_label(loading_identifier, request.model_path)
         )
-        if (
-            getattr(llama_backend, "is_active", False)
-            or loading_identifier
-        ) and (
-            llama_matches_request
-            or not getattr(llama_backend, "is_loaded", False)
+        if (getattr(llama_backend, "is_active", False) or loading_identifier) and (
+            llama_matches_request or not getattr(llama_backend, "is_loaded", False)
         ):
             # Round 19 P1 #6: previously this called
             # ``llama_backend.unload_model()`` and unconditionally
diff --git a/studio/backend/tests/test_diffusion_backend.py b/studio/backend/tests/test_diffusion_backend.py
index 5a94eef120..12ed6e3038 100644
--- a/studio/backend/tests/test_diffusion_backend.py
+++ b/studio/backend/tests/test_diffusion_backend.py
@@ -398,9 +398,11 @@ def to(self, device):
     # ``subfolder="transformer"``) so existing tests that exercise
     # the GGUF path do not hit a TypeError from the fake signature.
     fake_hub = types.ModuleType("huggingface_hub")
+
     def _fake_download(repo_id, filename, token = None, subfolder = None, **_kwargs):
         sub = f"{subfolder}/" if subfolder else ""
         return f"/fake/{repo_id}/{sub}{filename}"
+
     fake_hub.hf_hub_download = _fake_download
     monkeypatch.setitem(sys.modules, "huggingface_hub", fake_hub)
 

From 09c51147a9dcd36b5639aa5dad868a75a08c56af Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 10:56:11 +0000
Subject: [PATCH 59/92] Fix/adjust diffusion: round 22 P1+P2 batch for PR #5754

P1 #1: ``TrainingStartRequest.model_name`` now runs the same
control-character and embedded-HF-token validators that the chat
and diffusion request models gained in rounds 5 / 15 / 20 / 21.
``/api/training/start`` previously accepted newline / tab /
control characters and URL-form ``hf_xxxxx`` tokens that flowed
into structured-log sinks via "Loading model %s" lines.

P1 #2: ``_run_with_helper`` in ``utils/datasets/llm_assist.py``
now skips the helper GGUF (logs and returns ``None``) when the
diffusion image backend reports loaded / loading. The public chat /
training / export routes already guard against this conflict by
calling ``_release_diffusion_for`` before allocating, but this
dataset-side helper loaded llama-server directly with no
diffusion guard, so an Images-page allocation would race the
helper for VRAM. The new ``_diffusion_image_model_busy`` helper
fails closed on ``status()`` errors (treated as busy) so the
resident image model is preserved instead of being overwritten;
only a failed import of ``core.inference.diffusion`` is reported
as not busy.

P1 #3: same ``_diffusion_image_model_busy`` guard added to
``_run_multi_pass_advisor`` (the dataset conversion advisor),
which has the same direct llama.cpp load shape.

P2 #4: the early "Could not infer a diffusion family" RuntimeError
now routes ``repo_id`` through ``_display_repo_id`` before
formatting. A local absolute path that did not match any known
family used to leak the operator's filesystem layout via the 400
response body, last_error, and log line.

All 97 diffusion + training-validation + related tests pass
locally.
---
 studio/backend/core/inference/diffusion.py  |  7 ++++-
 studio/backend/models/training.py           | 21 +++++++++++++
 studio/backend/utils/datasets/llm_assist.py | 35 +++++++++++++++++++++
 3 files changed, 62 insertions(+), 1 deletion(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 733a5d1bbe..be86e47282 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -802,8 +802,13 @@ def load_model(
 
         fam = detect_family(repo_id, override_family = family_override)
         if fam is None:
+            # Round 22 P2 #4: route the repo label through
+            # ``_display_repo_id`` so a local absolute path that did
+            # not match any family does not leak the operator's
+            # filesystem layout via the error message / last_error
+            # / 400 response body.
             raise RuntimeError(
-                f"Could not infer a diffusion family for '{repo_id}'. "
+                f"Could not infer a diffusion family for '{_display_repo_id(repo_id)}'. "
                 "Pass family_override = 'flux.2-klein' / 'flux.2' / "
                 "'flux.1' / 'qwen-image' / 'stable-diffusion-3' / "
                 "'stable-diffusion-xl' to disambiguate."
diff --git a/studio/backend/models/training.py b/studio/backend/models/training.py
index 7c53b0fee5..e0eec81197 100644
--- a/studio/backend/models/training.py
+++ b/studio/backend/models/training.py
@@ -8,6 +8,13 @@
 from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
 from typing import Any, Optional, List, Dict, Literal
 
+# Round 22 P1 #1: reuse the chat / diffusion identifier validators
+# so /api/training/start rejects newline / tab / control characters
+# and URL-form ``hf_xxxxx`` tokens in ``model_name``. Without these
+# a caller could log-line-smuggle through "Loading model %s" lines
+# and leak the bearer token into structured-log sinks.
+from models.inference import _no_control_chars, _reject_embedded_hf_token
+
 
 _MAX_BATCH_SIZE = 4096
 _MAX_GRAD_ACCUM = 4096
@@ -49,6 +56,20 @@ class TrainingStartRequest(BaseModel):
     model_name: str = Field(
         ..., description = "Model identifier (e.g., 'unsloth/llama-3-8b-bnb-4bit')"
     )
+
+    # Round 22 P1 #1: identifier hardening (round 5 / 15 / 20 / 21
+    # extended these to chat + diffusion request models; training
+    # was the last unguarded entry point).
+    @field_validator("model_name")
+    @classmethod
+    def _no_model_name_control_chars(cls, v, info):
+        return _no_control_chars(v, info.field_name)
+
+    @field_validator("model_name")
+    @classmethod
+    def _no_model_name_embedded_hf_tokens(cls, v, info):
+        return _reject_embedded_hf_token(v, info.field_name)
+
     training_type: Literal["LoRA/QLoRA", "Full Finetuning", "Continued Pretraining"] = (
         Field(
             ...,
diff --git a/studio/backend/utils/datasets/llm_assist.py b/studio/backend/utils/datasets/llm_assist.py
index 4c66d2ebf6..8aad3230e0 100644
--- a/studio/backend/utils/datasets/llm_assist.py
+++ b/studio/backend/utils/datasets/llm_assist.py
@@ -109,6 +109,28 @@ def precache_helper_gguf():
             pass
 
 
+def _diffusion_image_model_busy() -> bool:
+    """Round 22 P1 #2 / #3: helper / advisor GGUFs share VRAM with
+    the Images page diffusion pipeline. Public chat / training /
+    export routes call the strict ``_release_diffusion_for`` helper
+    before allocating, but these dataset-side helpers used to load
+    llama-server directly with no diffusion guard at all. Skip the
+    helper GGUF when ``DiffusionBackend.status()`` reports loaded /
+    loading so we do not double-own VRAM. Fail closed (treat as
+    busy) on any status() error to preserve the resident image
+    model rather than racing it for memory.
+    """
+    try:
+        from core.inference.diffusion import get_diffusion_backend
+    except Exception:
+        return False
+    try:
+        status = get_diffusion_backend().status()
+    except Exception:
+        return True
+    return bool(status.get("is_loaded") or status.get("is_loading"))
+
+
 def _run_with_helper(prompt: str, max_tokens: int = 256) -> Optional[str]:
     """
     Load helper model, run one chat completion, unload.
@@ -118,6 +140,12 @@ def _run_with_helper(prompt: str, max_tokens: int = 256) -> Optional[str]:
     if os.environ.get("UNSLOTH_HELPER_MODEL_DISABLE", "").strip() in ("1", "true"):
         return None
 
+    if _diffusion_image_model_busy():
+        logger.info(
+            "Skipping helper GGUF while a diffusion image model is loaded/loading"
+        )
+        return None
+
     repo = os.environ.get("UNSLOTH_HELPER_MODEL_REPO", DEFAULT_HELPER_MODEL_REPO)
     variant = os.environ.get(
         "UNSLOTH_HELPER_MODEL_VARIANT", DEFAULT_HELPER_MODEL_VARIANT
@@ -508,6 +536,13 @@ def _run_multi_pass_advisor(
     if os.environ.get("UNSLOTH_HELPER_MODEL_DISABLE", "").strip() in ("1", "true"):
         return None
 
+    # Round 22 P1 #3: same diffusion-busy guard as ``_run_with_helper``.
+    if _diffusion_image_model_busy():
+        logger.info(
+            "Skipping advisor GGUF while a diffusion image model is loaded/loading"
+        )
+        return None
+
     repo = os.environ.get("UNSLOTH_HELPER_MODEL_REPO", DEFAULT_HELPER_MODEL_REPO)
     variant = os.environ.get(
         "UNSLOTH_HELPER_MODEL_VARIANT", DEFAULT_HELPER_MODEL_VARIANT

From c6c4378f38cc39be26b738d731fd93cbc17ae52f Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 11:20:05 +0000
Subject: [PATCH 60/92] Fix/adjust diffusion: round 23 P1+P2 batch for PR #5754

P1 #1 + #2 + #6: extended the chat / diffusion / training
identifier hardening to every export-side request model.
ExportCommonOptions (parent of ExportMergedModelRequest /
ExportBaseModelRequest / ExportLoRAAdapterRequest) now applies
_no_control_chars and _reject_embedded_hf_token to repo_id and
base_model_id; ExportGGUFRequest gets the same on its repo_id
plus a control-char check on quantization_method; and
LoadCheckpointRequest validates checkpoint_path. Previously
"/api/export/*" accepted newline-smuggled identifiers and
URL-form ``hf_xxxxx`` tokens that flowed into log lines.

P1 #3 + #4: ``_run_with_helper`` and ``_run_multi_pass_advisor``
now use a shared ``_gpu_workload_busy_for_helper`` that gates on
diffusion (already covered in round 22), training, AND export.
The round 22 guard only checked diffusion, so the dataset helper /
advisor could still load llama-server on top of an active training
run or a resident export checkpoint. Each status check fails closed
(a status call that raises counts as busy) so the user's primary
workload is preserved; a backend module that cannot be imported is
treated as absent and does not block the helper.

P1 #5: PublishDatasetRequest in models/data_recipe.py also
applies the identifier hardening to repo_id; the publish path
previously accepted control characters and URL-form tokens.

P1 #7-10: added a ``_validate_logged_identifier`` helper to
routes/models.py and applied it to the path / query parameter
endpoints that flow into logger.info(...) calls --
``/config/{model_name}``, ``/check-vision/{model_name}``,
``/check-embedding/{model_name}``, ``/gguf-variants``. It maps
the validator's ``ValueError`` to HTTP 422 (with the message as a
plain-string ``detail``) so the client gets the same status code
as a Pydantic validation failure.

P2 #11 + #12: ``Loading diffusion model %s`` and
``Diffusion load failed for %s`` log lines route ``repo_id`` /
``effective_base`` through ``_display_repo_id`` (collapses
absolute local paths to the leaf, still scrubs HF tokens)
instead of plain ``_redact_hf_tokens``. The error path was
already collapsed in the user-facing 400 / RuntimeError, but
the structured-log lines kept the full path.

All 97 diffusion + training-validation + related tests pass
locally.
---
 studio/backend/core/inference/diffusion.py  | 17 +++++-
 studio/backend/models/data_recipe.py        | 18 +++++-
 studio/backend/models/export.py             | 55 +++++++++++++++++
 studio/backend/routes/models.py             | 29 ++++++++-
 studio/backend/utils/datasets/llm_assist.py | 68 ++++++++++++++++++---
 5 files changed, 173 insertions(+), 14 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index be86e47282..f670fb0f05 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -907,13 +907,21 @@ def load_model(
                 # Scrub them BEFORE the logger formats the line so the
                 # token never reaches structured-log sinks (round 14
                 # P2 #9).
+                # Round 23 P2 #11: ``_redact_hf_tokens`` only scrubs
+                # ``hf_xxxxx`` substrings, so an absolute local
+                # path like ``/home/alice/private/FLUX.2-klein-GGUF``
+                # used to land in this log line verbatim. Route
+                # through ``_display_repo_id`` so the leaf is
+                # logged when the value is a filesystem path, with
+                # the token-redaction step inside that helper as a
+                # belt-and-braces defence.
                 logger.info(
                     "Loading diffusion model %s (family=%s, device=%s, dtype=%s, base=%s)",
-                    _redact_hf_tokens(repo_id),
+                    _display_repo_id(repo_id),
                     fam.name,
                     device,
                     dtype,
-                    _redact_hf_tokens(effective_base),
+                    _display_repo_id(effective_base),
                 )
 
                 transformer = None
@@ -1249,9 +1257,12 @@ def _collapse_local(msg: str, candidate: Optional[str]) -> str:
                 # Use ``logger.error`` with the already-scrubbed
                 # message and exc_info=False so the bearer token
                 # cannot leak through structured logging sinks.
+                # Round 23 P2 #12: same fix as the start-of-load
+                # log above. ``_redact_hf_tokens`` alone left
+                # absolute local repo paths in this failure line.
                 logger.error(
                     "Diffusion load failed for %s: %s",
-                    _redact_hf_tokens(repo_id),
+                    _display_repo_id(repo_id),
                     exc_msg,
                 )
                 raise RuntimeError(
diff --git a/studio/backend/models/data_recipe.py b/studio/backend/models/data_recipe.py
index b382ddb3d0..fe607a3f92 100644
--- a/studio/backend/models/data_recipe.py
+++ b/studio/backend/models/data_recipe.py
@@ -9,7 +9,13 @@
 
 from typing import Any
 
-from pydantic import BaseModel, Field, model_validator
+from pydantic import BaseModel, Field, field_validator, model_validator
+
+# Round 23 P1 #5: identifier hardening reused from the chat models
+# so /api/data_recipe/publish rejects control characters and
+# URL-form ``hf_xxxxx`` tokens in ``repo_id`` before they reach
+# log lines or the HF API.
+from models.inference import _no_control_chars, _reject_embedded_hf_token
 
 
 class RecipePayload(BaseModel):
@@ -60,6 +66,16 @@ class PublishDatasetRequest(BaseModel):
         description = "Execution artifact path captured by the UI for completed runs",
     )
 
+    @field_validator("repo_id")
+    @classmethod
+    def _no_repo_id_control_chars(cls, v, info):
+        return _no_control_chars(v, info.field_name)
+
+    @field_validator("repo_id")
+    @classmethod
+    def _no_repo_id_embedded_hf_tokens(cls, v, info):
+        return _reject_embedded_hf_token(v, info.field_name)
+
 
 class PublishDatasetResponse(BaseModel):
     success: bool = True
diff --git a/studio/backend/models/export.py b/studio/backend/models/export.py
index 86ce2b05bf..a881714a0c 100644
--- a/studio/backend/models/export.py
+++ b/studio/backend/models/export.py
@@ -10,6 +10,13 @@
 from pydantic import BaseModel, Field, field_validator
 from typing import List, Optional, Literal, Dict, Any
 
+# Round 23 P1 #1 / #2 / #6: reuse the chat identifier validators
+# so export requests reject newline / tab / control characters and
+# URL-form ``hf_xxxxx`` tokens in any user-supplied identifier
+# (Hub ``repo_id``, ``base_model_id``, the local
+# ``checkpoint_path``) that flows into log lines or HF API calls.
+from models.inference import _no_control_chars, _reject_embedded_hf_token
+
 
 def _validate_save_directory(value: str) -> str:
     """Reject save_directory values that escape the export root."""
@@ -54,6 +61,19 @@ class LoadCheckpointRequest(BaseModel):
         description = "Allow loading models with custom code. Only enable for checkpoints/base models you trust.",
     )
 
+    # Round 23 P1 #6: ``checkpoint_path`` is logged verbatim by the
+    # export route. Apply the same control-char + embedded-token
+    # rejection the chat / diffusion / training request models use.
+    @field_validator("checkpoint_path")
+    @classmethod
+    def _no_checkpoint_control_chars(cls, v, info):
+        return _no_control_chars(v, info.field_name)
+
+    @field_validator("checkpoint_path")
+    @classmethod
+    def _no_checkpoint_embedded_hf_tokens(cls, v, info):
+        return _reject_embedded_hf_token(v, info.field_name)
+
 
 class ExportStatusResponse(BaseModel):
     """Current export backend status."""
@@ -117,6 +137,20 @@ def _check_save_directory(cls, v):
         description = "HuggingFace model ID of the base model (for model card metadata)",
     )
 
+    # Round 23 P1 #1: ``repo_id`` (Hub destination) and
+    # ``base_model_id`` (model card metadata) both feed log lines
+    # and the HF API. Reject control characters and URL-form
+    # ``hf_xxxxx`` tokens before they reach those sinks.
+    @field_validator("repo_id", "base_model_id")
+    @classmethod
+    def _no_identifier_control_chars(cls, v, info):
+        return _no_control_chars(v, info.field_name)
+
+    @field_validator("repo_id", "base_model_id")
+    @classmethod
+    def _no_identifier_embedded_hf_tokens(cls, v, info):
+        return _reject_embedded_hf_token(v, info.field_name)
+
 
 class ExportMergedModelRequest(ExportCommonOptions):
     """Request for exporting a merged PEFT model."""
@@ -163,6 +197,27 @@ def _check_save_directory(cls, v):
         description = "Hugging Face token for GGUF upload",
     )
 
+    # Round 23 P1 #2: GGUF export endpoint defines its own
+    # ``repo_id`` (does not inherit from ExportCommonOptions), so
+    # the chat-style hardening needs to be applied here separately.
+    # ``quantization_method`` is forwarded to the export worker
+    # command line, so it gets the control-char check too even
+    # though it does not normally carry tokens.
+    @field_validator("repo_id")
+    @classmethod
+    def _no_repo_id_control_chars(cls, v, info):
+        return _no_control_chars(v, info.field_name)
+
+    @field_validator("repo_id")
+    @classmethod
+    def _no_repo_id_embedded_hf_tokens(cls, v, info):
+        return _reject_embedded_hf_token(v, info.field_name)
+
+    @field_validator("quantization_method")
+    @classmethod
+    def _no_quantization_control_chars(cls, v, info):
+        return _no_control_chars(v, info.field_name)
+
 
 class ExportLoRAAdapterRequest(ExportCommonOptions):
     """Request for exporting only the LoRA adapter (not merged)."""
diff --git a/studio/backend/routes/models.py b/studio/backend/routes/models.py
index 1ebf525f56..57c6d2ff7f 100644
--- a/studio/backend/routes/models.py
+++ b/studio/backend/routes/models.py
@@ -134,11 +134,30 @@ def _safe_is_dir(path) -> bool:
     VisionCheckResponse,
     EmbeddingCheckResponse,
 )
+from models.inference import _no_control_chars, _reject_embedded_hf_token
 
 router = APIRouter()
 logger = get_logger(__name__)
 
 
+def _validate_logged_identifier(value: str, field_name: str) -> str:
+    """Round 23 P1 #7 / #8 / #9 / #10: path / query parameters that
+    flow into ``logger.info("... %s", value)`` lines were the last
+    unguarded entry points. Newline / tab / control characters let
+    a caller smuggle forged log entries; URL-form ``hf_xxxxx``
+    tokens would leak into structured-log sinks. Mirror the
+    request-body validators by running both checks here and
+    mapping the validator's ``ValueError`` to HTTP 422 so the
+    client sees the same shape as a Pydantic validation failure.
+    """
+    try:
+        value = _no_control_chars(value, field_name)
+        value = _reject_embedded_hf_token(value, field_name)
+    except ValueError as exc:
+        raise HTTPException(status_code = 422, detail = str(exc)) from exc
+    return value
+
+
 def derive_model_type(
     is_vision: bool, audio_type: Optional[str], is_embedding: bool = False
 ) -> ModelType:
@@ -1571,6 +1590,7 @@ async def get_model_config(
 
     This endpoint wraps the backend load_model_defaults function.
     """
+    model_name = _validate_logged_identifier(model_name, "model_name")
     try:
         if not is_local_path(model_name):
             resolved = resolve_cached_repo_id_case(model_name)
@@ -1580,7 +1600,11 @@ async def get_model_config(
                     resolved,
                     model_name,
                 )
-            model_name = resolved
+            # Round 23 P1 #7: re-validate the cache-resolved value
+            # (case-only resolver should be a no-op for these
+            # checks, but defend in depth in case the resolver
+            # ever broadens its match heuristic).
+            model_name = _validate_logged_identifier(resolved, "model_name")
 
         logger.info(f"Getting model config for: {model_name}")
         from utils.models.model_config import detect_audio_type
@@ -2220,6 +2244,7 @@ async def check_vision_model(
 
     This endpoint wraps the backend is_vision_model function.
     """
+    model_name = _validate_logged_identifier(model_name, "model_name")
     try:
         logger.info(f"Checking if vision model: {model_name}")
         is_vision = is_vision_model(model_name)
@@ -2248,6 +2273,7 @@ async def check_embedding_model(
 
     This endpoint wraps the backend is_embedding_model function.
     """
+    model_name = _validate_logged_identifier(model_name, "model_name")
     try:
         logger.info(f"Checking if embedding model: {model_name}")
         is_embedding = is_embedding_model(model_name, hf_token = hf_token)
@@ -2285,6 +2311,7 @@ async def get_gguf_variants(
     with file sizes, whether the model supports vision, and the recommended
     default variant.
     """
+    repo_id = _validate_logged_identifier(repo_id, "repo_id")
     try:
         from utils.models.model_config import is_local_path, list_local_gguf_variants
 
diff --git a/studio/backend/utils/datasets/llm_assist.py b/studio/backend/utils/datasets/llm_assist.py
index 8aad3230e0..c0bedc049d 100644
--- a/studio/backend/utils/datasets/llm_assist.py
+++ b/studio/backend/utils/datasets/llm_assist.py
@@ -131,6 +131,57 @@ def _diffusion_image_model_busy() -> bool:
     return bool(status.get("is_loaded") or status.get("is_loading"))
 
 
+def _gpu_workload_busy_for_helper() -> bool:
+    """Round 23 P1 #3 / #4: the diffusion-only guard from round 22
+    let the helper / advisor GGUF run on top of a live training run
+    or a resident export checkpoint. Extend the busy check to those
+    workloads too so any GPU owner (Images, Training, Export)
+    blocks the helper instead of double-owning VRAM. Each step
+    fails closed: an unverifiable status counts as busy so the
+    user's primary workload is preserved over the optional helper.
+    """
+    if _diffusion_image_model_busy():
+        return True
+
+    try:
+        from core.training import get_training_backend
+    except Exception:
+        pass
+    else:
+        try:
+            if get_training_backend().is_training_active():
+                logger.info(
+                    "Skipping helper GGUF while training is active"
+                )
+                return True
+        except Exception:
+            logger.info(
+                "Skipping helper GGUF because training status is unavailable"
+            )
+            return True
+
+    try:
+        from core.export import get_export_backend
+    except Exception:
+        return False
+
+    try:
+        exp = get_export_backend()
+        is_active = getattr(exp, "is_export_active", None)
+        if (is_active and is_active()) or getattr(
+            exp, "current_checkpoint", None
+        ):
+            logger.info("Skipping helper GGUF while export owns the GPU")
+            return True
+    except Exception:
+        logger.info(
+            "Skipping helper GGUF because export status is unavailable"
+        )
+        return True
+
+    return False
+
+
 def _run_with_helper(prompt: str, max_tokens: int = 256) -> Optional[str]:
     """
     Load helper model, run one chat completion, unload.
@@ -140,10 +191,10 @@ def _run_with_helper(prompt: str, max_tokens: int = 256) -> Optional[str]:
     if os.environ.get("UNSLOTH_HELPER_MODEL_DISABLE", "").strip() in ("1", "true"):
         return None
 
-    if _diffusion_image_model_busy():
-        logger.info(
-            "Skipping helper GGUF while a diffusion image model is loaded/loading"
-        )
+    # Round 23 P1 #3: round 22 only guarded against a busy
+    # diffusion pipeline. Training / export own the same GPU too,
+    # so use the broader helper that gates on all three workloads.
+    if _gpu_workload_busy_for_helper():
         return None
 
     repo = os.environ.get("UNSLOTH_HELPER_MODEL_REPO", DEFAULT_HELPER_MODEL_REPO)
@@ -536,11 +587,10 @@ def _run_multi_pass_advisor(
     if os.environ.get("UNSLOTH_HELPER_MODEL_DISABLE", "").strip() in ("1", "true"):
         return None
 
-    # Round 22 P1 #3: same diffusion-busy guard as ``_run_with_helper``.
-    if _diffusion_image_model_busy():
-        logger.info(
-            "Skipping advisor GGUF while a diffusion image model is loaded/loading"
-        )
+    # Round 23 P1 #4: extend the round 22 diffusion-only check to
+    # training + export so the advisor cannot race the user's
+    # active workload for GPU memory.
+    if _gpu_workload_busy_for_helper():
         return None
 
     repo = os.environ.get("UNSLOTH_HELPER_MODEL_REPO", DEFAULT_HELPER_MODEL_REPO)

From 0a7fe59a37201be9bd7103c68926f0176d876d83 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 11:21:08 +0000
Subject: [PATCH 61/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/utils/datasets/llm_assist.py | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/studio/backend/utils/datasets/llm_assist.py b/studio/backend/utils/datasets/llm_assist.py
index c0bedc049d..22387a4f99 100644
--- a/studio/backend/utils/datasets/llm_assist.py
+++ b/studio/backend/utils/datasets/llm_assist.py
@@ -150,14 +150,10 @@ def _gpu_workload_busy_for_helper() -> bool:
     else:
         try:
             if get_training_backend().is_training_active():
-                logger.info(
-                    "Skipping helper GGUF while training is active"
-                )
+                logger.info("Skipping helper GGUF while training is active")
                 return True
         except Exception:
-            logger.info(
-                "Skipping helper GGUF because training status is unavailable"
-            )
+            logger.info("Skipping helper GGUF because training status is unavailable")
             return True
 
     try:
@@ -168,15 +164,11 @@ def _gpu_workload_busy_for_helper() -> bool:
     try:
         exp = get_export_backend()
         is_active = getattr(exp, "is_export_active", None)
-        if (is_active and is_active()) or getattr(
-            exp, "current_checkpoint", None
-        ):
+        if (is_active and is_active()) or getattr(exp, "current_checkpoint", None):
             logger.info("Skipping helper GGUF while export owns the GPU")
             return True
     except Exception:
-        logger.info(
-            "Skipping helper GGUF because export status is unavailable"
-        )
+        logger.info("Skipping helper GGUF because export status is unavailable")
         return True
 
     return False

From 48740c2664adba8ec18aa2ae91e457350f2a62c1 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 11:44:55 +0000
Subject: [PATCH 62/92] Fix/adjust diffusion: round 24 P1 batch for PR #5754

P1 #1: ``_gpu_workload_busy_for_helper`` in
``utils/datasets/llm_assist.py`` now also gates on the GGUF chat
backend (llama-server) AND the safetensors chat backend. Round 23
extended it to training + export but missed Chat, so a helper /
advisor GGUF could still race a loaded chat model for VRAM.
Both checks fail closed when status is unverifiable.

P1 #2 / #3 / #4 / #5: re-ordered the route-level GPU-handoff
unloads so the diffusion release runs BEFORE the chat releases.
A wedged diffusion unload used to fire AFTER chat was already
gone, so the user lost both on a single failure. Drop chat last
so an earlier failure preserves it. Applied to
``/training/start`` (training.py), ``/export/load`` (export.py),
``/chat/load`` GGUF branch and ``/chat/load`` safetensors branch
(routes/inference.py).

P1 #7 + P2 #13: ``/delete-finetuned`` body now hardens
``model_path`` and ``gguf_variant`` via the shared
``_validate_logged_identifier`` helper, so control characters
and URL-form HF tokens can no longer log-line-smuggle.

P1 #8 + #10: ``/delete-cached`` body hardens ``repo_id`` and
``variant`` the same way.

P1 #9: ``/download-progress`` ``repo_id`` query parameter is
also hardened; the value flows into log lines deep inside
``_get_repo_size_cached`` on lookup failure.

P1 #11: ``CheckFormatRequest.dataset_name`` and
``AiAssistMappingRequest.{dataset_name, model_name}`` in
``models/datasets.py`` now apply the same control-char +
embedded-HF-token validators, matching every other public
request-body model.

All 115 diffusion + training-validation + cached_gguf + export
+ inference model-validation tests pass locally.

(P1 #6 native-path-lease enforcement for diffusion local paths
and P1 #12 React Compiler frontend lint deferred -- both need
focused design / frontend touchups separate from this batch.)
---
 studio/backend/models/datasets.py           | 27 +++++++++++-
 studio/backend/routes/export.py             | 23 ++++------
 studio/backend/routes/inference.py          | 43 +++++++++---------
 studio/backend/routes/models.py             | 18 ++++++++
 studio/backend/routes/training.py           | 20 ++++-----
 studio/backend/utils/datasets/llm_assist.py | 49 +++++++++++++++++++++
 6 files changed, 131 insertions(+), 49 deletions(-)

diff --git a/studio/backend/models/datasets.py b/studio/backend/models/datasets.py
index f20d6f2d15..6f9de26939 100644
--- a/studio/backend/models/datasets.py
+++ b/studio/backend/models/datasets.py
@@ -7,7 +7,12 @@
 
 from typing import Any, Dict, List, Optional
 
-from pydantic import BaseModel, Field, model_validator
+from pydantic import BaseModel, Field, field_validator, model_validator
+
+# Round 24 P1 #11: reuse the chat / diffusion / export identifier
+# hardening so dataset routes also reject control characters and
+# URL-embedded HF tokens in user-controlled identifiers.
+from models.inference import _no_control_chars, _reject_embedded_hf_token
 
 
 class CheckFormatRequest(BaseModel):
@@ -27,6 +32,16 @@ def _compat_split(cls, values: Any) -> Any:
             values.setdefault("train_split", values.pop("split"))
         return values
 
+    @field_validator("dataset_name")
+    @classmethod
+    def _no_dataset_name_control_chars(cls, v, info):
+        return _no_control_chars(v, info.field_name)
+
+    @field_validator("dataset_name")
+    @classmethod
+    def _no_dataset_name_embedded_hf_tokens(cls, v, info):
+        return _reject_embedded_hf_token(v, info.field_name)
+
 
 class CheckFormatResponse(BaseModel):
     """Response for dataset format check"""
@@ -57,6 +72,16 @@ class AiAssistMappingRequest(BaseModel):
     model_name: Optional[str] = None
     model_type: Optional[str] = None
 
+    @field_validator("dataset_name", "model_name")
+    @classmethod
+    def _no_identifier_control_chars(cls, v, info):
+        return _no_control_chars(v, info.field_name)
+
+    @field_validator("dataset_name", "model_name")
+    @classmethod
+    def _no_identifier_embedded_hf_tokens(cls, v, info):
+        return _reject_embedded_hf_token(v, info.field_name)
+
 
 class AiAssistMappingResponse(BaseModel):
     """Response from LLM-assisted column classification and conversion advice."""
diff --git a/studio/backend/routes/export.py b/studio/backend/routes/export.py
index faa847f040..b6e3a3c5b8 100644
--- a/studio/backend/routes/export.py
+++ b/studio/backend/routes/export.py
@@ -150,22 +150,15 @@ async def load_checkpoint(
         # reviews #1, #8, #9 flagged.
         from routes.inference import _release_chat_for, _release_diffusion_for
 
-        await _release_chat_for("export")
-
-        # Also unload any active diffusion pipeline (Images page); it
-        # competes for the same GPU and would survive the inference
-        # shutdown above. is_loading is treated like is_loaded so an
-        # in-flight load is also waited out (the diffusion unload
-        # acquires _load_lock + _generate_lock and blocks until the
-        # current load completes, then unloads).
-        # Round 17: previously this was a best-effort try/except that
-        # swallowed every failure with logger.debug, so a wedged
-        # diffusion backend let the export checkpoint load anyway and
-        # OOM at first allocation. ``_release_diffusion_for`` is
-        # strict: it raises HTTPException 503 if status() or
-        # unload_model() fails, or if the backend remains loaded or
-        # loading after the unload call.
+        # Round 24 P1 #3: release diffusion BEFORE chat so a failing
+        # diffusion unload does not leave the user with no chat
+        # model loaded. Same reasoning as the training-start flow
+        # (round 18 P1 #8 / round 24 P1 #2). Earlier rounds kept the
+        # chat release first because the helper was best-effort;
+        # now that ``_release_diffusion_for`` is strict it must run
+        # while chat is still resident so a failure preserves it.
         await _release_diffusion_for("export load")
+        await _release_chat_for("export")
 
         # load_checkpoint spawns and waits on a subprocess and can take
         # minutes. Run it in a worker thread so the event loop stays
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index ce7ddd1342..a7e8f74722 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -1192,9 +1192,14 @@ async def load_model(
             # corrupt the user's exported artifact).
             _raise_if_training_active("chat")
             _raise_if_export_active("chat")
-            # Drop a settled export checkpoint that is just holding
-            # GPU memory but is not actively producing output.
+            # Round 24 P1 #4: release order is now
+            # export -> diffusion -> safetensors chat (was
+            # export -> safetensors chat -> diffusion). A wedged
+            # diffusion unload used to fire AFTER the safetensors
+            # chat was already gone, so the user lost both. Drop
+            # the chat last so an earlier failure preserves it.
             await _release_export_for("GGUF chat")
+            await _release_diffusion_for("GGUF chat load")
 
             llama_backend = get_llama_cpp_backend()
             # Round 19 P2 #8: previously also called
@@ -1206,19 +1211,13 @@ async def load_model(
             # ``_release_safetensors_chat_for`` below already
             # handles missing-backend cases as a no-op.
 
-            # Unload any safetensors / Unsloth model first to free
-            # VRAM. Uses the shared helper so we also drain
-            # ``loading_models`` (round 10 review #4); the inline
-            # version only checked ``active_model_name`` and let an
-            # in-flight safetensors load race the new GGUF allocation.
+            # Unload any safetensors / Unsloth model. Uses the shared
+            # helper so we also drain ``loading_models`` (round 10
+            # review #4); the inline version only checked
+            # ``active_model_name`` and let an in-flight safetensors
+            # load race the new GGUF allocation.
             await _release_safetensors_chat_for("GGUF chat")
 
-            # Round 17 P1 #4: route the diffusion unload through the
-            # strict ``_release_diffusion_for`` helper so a wedged
-            # diffusion pipeline blocks the GGUF chat load with 503
-            # instead of silently double-owning VRAM.
-            await _release_diffusion_for("GGUF chat load")
-
             # Inherit llama_extra_args from the previous load when the
             # request omits the field (the chat-settings Apply path
             # does not round-trip them; explicit [] still clears).
@@ -1392,22 +1391,22 @@ async def load_model(
         # and so we do not silently corrupt an in-flight export.
         _raise_if_training_active("chat")
         _raise_if_export_active("chat")
-        # Drop a settled export checkpoint that is just holding GPU
-        # memory but is not actively producing output.
+        # Round 24 P1 #5: release order is now
+        # export -> diffusion -> llama-chat (was
+        # export -> llama-chat -> diffusion). A wedged diffusion
+        # unload used to fire AFTER the GGUF chat was already gone,
+        # so the user lost both. Drop llama-chat last so an earlier
+        # failure preserves it.
         await _release_export_for("safetensors chat")
+        await _release_diffusion_for("safetensors chat load")
 
         backend = get_inference_backend()
 
-        # Unload any active or mid-download llama-server first.
-        # Shared helper so this stays in sync with the GGUF path's
+        # Unload any active or mid-download llama-server. Shared
+        # helper so this stays in sync with the GGUF path's
         # symmetric ``_release_safetensors_chat_for``.
         await _release_llama_for("safetensors chat")
 
-        # Round 17 P1 #5: strict diffusion unload via the shared
-        # helper so a wedged pipeline blocks the safetensors chat
-        # load with 503 instead of silently double-owning VRAM.
-        await _release_diffusion_for("safetensors chat load")
-
         # Export was already dropped above via the shared
         # ``await _release_export_for("safetensors chat")`` call
         # (which checks is_export_active() before the destructive
diff --git a/studio/backend/routes/models.py b/studio/backend/routes/models.py
index 57c6d2ff7f..34d8a55aa9 100644
--- a/studio/backend/routes/models.py
+++ b/studio/backend/routes/models.py
@@ -1876,6 +1876,14 @@ async def delete_finetuned_model(
     Only paths under Studio's outputs/exports roots are accepted.  Exported
     GGUF entries can delete one quantization variant at a time.
     """
+    # Round 24 P1 #7 + P2 #13: harden both ``model_path`` and
+    # ``gguf_variant`` for control characters and embedded HF
+    # tokens, mirroring the chat / diffusion / training request
+    # validators. Both fields end up in logger.info(...) lines.
+    model_path = _validate_logged_identifier(model_path, "model_path")
+    if gguf_variant is not None:
+        gguf_variant = _validate_logged_identifier(gguf_variant, "gguf_variant")
+
     if source not in {"training", "exported"}:
         raise HTTPException(
             status_code = 400,
@@ -2506,6 +2514,10 @@ async def get_download_progress(
         "progress": 0,
         "cache_path": None,
     }
+    # Round 24 P1 #9: ``repo_id`` flows into log lines deep in
+    # ``_get_repo_size_cached`` on lookup failure, so the same
+    # hardening the request-body models use applies here too.
+    repo_id = _validate_logged_identifier(repo_id, "repo_id")
     try:
         if not _is_valid_repo_id(repo_id):
             return _empty
@@ -2769,6 +2781,12 @@ async def delete_cached_model(
     are removed (e.g. ``UD-Q4_K_XL``).  Otherwise the entire repo is deleted.
     Refuses if the model is currently loaded for inference.
     """
+    # Round 24 P1 #8 + #10: harden both ``repo_id`` and ``variant``
+    # against control characters / embedded HF tokens before they
+    # reach logger.info(...) lines or the HF cache scan.
+    repo_id = _validate_logged_identifier(repo_id, "repo_id")
+    if variant is not None:
+        variant = _validate_logged_identifier(variant, "variant")
     if not _is_valid_repo_id(repo_id):
         raise HTTPException(status_code = 400, detail = "Invalid repo_id format")
 
diff --git a/studio/backend/routes/training.py b/studio/backend/routes/training.py
index 355394cc10..4ea755207c 100644
--- a/studio/backend/routes/training.py
+++ b/studio/backend/routes/training.py
@@ -284,20 +284,18 @@ async def start_training(
         # model. The previous order (chat -> export) would drop chat
         # and then refuse training when a wedged idle export raised,
         # leaving the user with nothing loaded.
+        # Round 24 P1 #2: same reasoning extended to diffusion ->
+        # chat. A wedged diffusion unload used to fire AFTER the chat
+        # backend was already gone, so the user lost both chat and
+        # diffusion on a single failure mode. Order is now
+        # export -> diffusion -> chat, with chat as the last drop so
+        # earlier failures preserve it.
         await _release_export_for("training")
+        await _release_diffusion_for("training")
         await _release_chat_for("training")
 
-        # Also unload any loaded diffusion pipeline (Images page); it
-        # holds the same GPU and would survive the inference shutdown.
-        # is_loading=True is also handled (unload_model takes
-        # _load_lock + _generate_lock and waits the in-flight load out).
-        # Round 17: previously the diffusion unload was best-effort
-        # (try/except + logger.warning), so a stuck diffusion backend
-        # would let training start anyway and immediately OOM the
-        # subprocess. ``_release_diffusion_for`` is strict: it raises
-        # HTTPException 503 if status() or unload_model() fails, or if
-        # the backend remains loaded / loading after the unload call.
-        await _release_diffusion_for("training")
+        # (Diffusion release moved above chat in round 24 P1 #2;
+        # the old trailing call was removed to avoid double-unload.)
 
         # start_training now spawns a subprocess (non-blocking)
         success = backend.start_training(job_id = job_id, **training_kwargs)
diff --git a/studio/backend/utils/datasets/llm_assist.py b/studio/backend/utils/datasets/llm_assist.py
index 22387a4f99..51f60ace33 100644
--- a/studio/backend/utils/datasets/llm_assist.py
+++ b/studio/backend/utils/datasets/llm_assist.py
@@ -139,10 +139,59 @@ def _gpu_workload_busy_for_helper() -> bool:
     blocks the helper instead of double-owning VRAM. Each step
     fails closed: an unverifiable status counts as busy so the
     user's primary workload is preserved over the optional helper.
+
+    Round 24 P1 #1: extended to also catch a Chat-backend GPU owner.
+    The helper GGUF used to run on top of a loaded GGUF chat model
+    (llama-server) or safetensors chat model and OOM their shared
+    GPU; mirror the diffusion check by inspecting llama
+    ``is_loaded`` / ``is_active`` / ``loading_model_identifier`` and
+    safetensors ``active_model_name`` / ``loading_models``.
     """
     if _diffusion_image_model_busy():
         return True
 
+    try:
+        from routes.inference import get_llama_cpp_backend
+    except Exception:
+        pass
+    else:
+        try:
+            llama = get_llama_cpp_backend()
+            if (
+                getattr(llama, "is_loaded", False)
+                or getattr(llama, "is_active", False)
+                or getattr(llama, "loading_model_identifier", None)
+            ):
+                logger.info(
+                    "Skipping helper GGUF while a GGUF chat model is loaded/loading"
+                )
+                return True
+        except Exception:
+            logger.info(
+                "Skipping helper GGUF because llama-server status is unavailable"
+            )
+            return True
+
+    try:
+        from core.inference import get_inference_backend
+    except Exception:
+        pass
+    else:
+        try:
+            inf = get_inference_backend()
+            active = getattr(inf, "active_model_name", None)
+            loading = set(getattr(inf, "loading_models", set()) or set())
+            if active or loading:
+                logger.info(
+                    "Skipping helper GGUF while a safetensors chat model is loaded/loading"
+                )
+                return True
+        except Exception:
+            logger.info(
+                "Skipping helper GGUF because safetensors chat status is unavailable"
+            )
+            return True
+
     try:
         from core.training import get_training_backend
     except Exception:

From 7b5fe1cf100cc8367c5ee4b954cd8e99e7bf0a6d Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 12:14:40 +0000
Subject: [PATCH 63/92] Fix/adjust diffusion: round 25 P1 batch for PR #5754

Five P1 findings from round 25 reviewer aggregate.

1. routes/datasets.py: /download-progress now reuses the same
   identifier hardening that round 24 added to the model route.
   Token-shaped repo_ids like owner/hf_abcdefghij0123456789 used to
   pass the cheap _is_valid_repo_id regex and end up in warning logs.

2. routes/models.py: extend the llama.cpp cache-delete guard to
   path-ownership matching. A GGUF chat model loaded via a local HF
   snapshot path under models--owner--repo/snapshots used to slip
   past the owner/repo string compare and could be rmtree'd while
   llama-server still mmap'd it. Shares one fail-closed HF cache
   scan and an _owned_cache_path_matches helper with the
   safetensors and diffusion guards (round 25 also dedupes the
   diffusion-specific rescan).

3. routes/models.py: extend the safetensors cache-delete guard the
   same way for safetensors models loaded from local snapshot paths.

4. utils/datasets/llm_assist.py: _run_with_helper and
   _run_multi_pass_advisor now acquire the global llama backend via
   routes.inference.get_llama_cpp_backend instead of instantiating
   a private LlamaCppBackend. _gpu_workload_busy_for_helper already
   ensures the global backend is idle on entry, so this is safe, and
   it makes the helper/advisor load visible to the global delete
   guards (loading_model_identifier and friends).

5. requirements/studio.txt: replace the huggingface-hub ==0.36.2 pin
   with a >=1.3.0,<2.0 range and mirror the no-torch-runtime.txt
   transformers and tokenizers constraints. Fresh installs from
   studio.txt used to resolve transformers 5.x with hub 0.36.2,
   which crashed the Flux2KleinPipeline import on the missing
   is_offline_mode the first time the user hit
   /api/inference/images/load.

Includes merge of origin/main (PR #5753 install pin bumps and the
mlx export save_method fix from #5727) so the PR diff stops showing
silent reverts of those landed changes.

Tests: PYTHONPATH=studio/backend pytest
test_diffusion_backend.py test_diffusion_routes.py
test_cached_gguf_routes.py test_llama_cpp_cache_aware_disk_check.py
test_inference_model_validation.py
test_models_get_model_config_case_resolution.py
==> 105 passed locally. The 15 flash-attention test failures and
the test_studio_api SDK suite errors reproduce on HEAD without
these changes (pre-existing, unrelated infrastructure).
---
 studio/backend/requirements/studio.txt      |  20 ++-
 studio/backend/routes/datasets.py           |  21 +++
 studio/backend/routes/models.py             | 173 ++++++++++++--------
 studio/backend/utils/datasets/llm_assist.py |  23 ++-
 4 files changed, 163 insertions(+), 74 deletions(-)

diff --git a/studio/backend/requirements/studio.txt b/studio/backend/requirements/studio.txt
index b360261e44..3268f41431 100644
--- a/studio/backend/requirements/studio.txt
+++ b/studio/backend/requirements/studio.txt
@@ -17,7 +17,25 @@ pyjwt
 easydict
 addict
 # gradio>=4.0.0                  # 148 MB - Studio uses React + FastAPI, not Gradio
-huggingface-hub==0.36.2
+# Round 25 P1 #5: keep the Studio Images dependency set internally
+# compatible. ``diffusers>=0.37.0`` ships Flux2KleinPipeline /
+# Flux2Pipeline, which transitively import the newer ``transformers``
+# (>=4.56) that requires ``huggingface_hub.is_offline_mode`` -- only
+# available in ``huggingface_hub>=1.0``. The previous ``==0.36.2``
+# pin let fresh installs end up with ``transformers 5.x`` +
+# ``huggingface_hub 0.36.2``, which crashed on the first
+# ``/api/inference/images/load`` with
+# ``Flux2KleinPipeline ... no attribute 'is_offline_mode'``. Bump
+# the floor so ``diffusers`` and ``transformers`` resolve into a
+# runtime they can actually import.
+huggingface-hub>=1.3.0,<2.0
+# Mirror the ``transformers`` constraint from
+# ``no-torch-runtime.txt``. Without it, the standard install can
+# resolve ``transformers 5.4.0+`` which drops Studio-supported
+# trainers. ``tokenizers<=0.23.0`` is required because
+# ``transformers 4.56..5.3`` declares it explicitly.
+tokenizers<=0.23.0
+transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,!=4.57.0,!=4.57.4,!=4.57.5,!=5.0.0,!=5.1.0,<=5.3.0
 structlog>=24.1.0
 diceware
 ddgs
diff --git a/studio/backend/routes/datasets.py b/studio/backend/routes/datasets.py
index 206af2a66f..b0e11fd248 100644
--- a/studio/backend/routes/datasets.py
+++ b/studio/backend/routes/datasets.py
@@ -68,11 +68,27 @@ def _resolve_hf_cache_realpath(repo_dir: Path) -> Optional[str]:
 # Import dataset utilities
 from utils.datasets import check_dataset_format
 from auth.authentication import get_current_subject
+from models.inference import _no_control_chars, _reject_embedded_hf_token
 
 router = APIRouter()
 logger = get_logger(__name__)
 
 
+def _validate_logged_identifier(value: str, field_name: str) -> str:
+    """Round 25 P1 #1: mirror the helper in routes/models.py so the
+    dataset ``/download-progress`` route never reaches logger/cache
+    paths with control characters or embedded HF tokens. Token-shaped
+    strings like ``owner/hf_abcdefghij0123456789`` would otherwise pass
+    the cheap ``_is_valid_repo_id`` regex and end up in warning logs.
+    """
+    try:
+        value = _no_control_chars(value, field_name)
+        value = _reject_embedded_hf_token(value, field_name)
+    except ValueError as exc:
+        raise HTTPException(status_code = 422, detail = str(exc)) from exc
+    return value
+
+
 from models.datasets import (
     AiAssistMappingRequest,
     AiAssistMappingResponse,
@@ -370,6 +386,11 @@ async def get_dataset_download_progress(
     bytes are observable here. Returns ``cache_path`` so the UI can
     show users where the dataset blobs landed on disk.
     """
+    # Round 25 P1 #1: harden ``repo_id`` before it reaches the
+    # ``logger.warning`` line at the bottom (or any future log/cache
+    # path). Matches ``GET /api/models/download-progress`` which
+    # already validates the same parameter in round 24.
+    repo_id = _validate_logged_identifier(repo_id, "repo_id")
     _empty = {
         "downloaded_bytes": 0,
         "expected_bytes": 0,
diff --git a/studio/backend/routes/models.py b/studio/backend/routes/models.py
index 34d8a55aa9..ae15932c33 100644
--- a/studio/backend/routes/models.py
+++ b/studio/backend/routes/models.py
@@ -2790,6 +2790,73 @@ async def delete_cached_model(
     if not _is_valid_repo_id(repo_id):
         raise HTTPException(status_code = 400, detail = "Invalid repo_id format")
 
+    # Round 25 P1 #2 / #3: round 15 added a path-ownership check to
+    # the diffusion guard below, but the llama.cpp and safetensors
+    # guards still only compared logical ``owner/repo`` strings to
+    # the loaded/loading identifier. If a chat or safetensors model
+    # was loaded via a LOCAL HF snapshot path (e.g. through the
+    # ``/load-local-path`` flow), the loaded identifier is the
+    # absolute snapshot path -- ``owner/repo`` never appears there,
+    # the guards passed, and ``DELETE /api/models/delete-cached``
+    # could rmtree an actively mmap'd snapshot.
+    #
+    # Build the HF cache roots for ``repo_id`` ONCE up front and reuse
+    # them in all three guards (llama, safetensors, diffusion). Failure
+    # to scan the cache fails CLOSED on the assumption that we cannot
+    # verify ownership safely; mirrors the diffusion path-scan guard.
+    needle = repo_id.lower()
+    cache_repo_roots: list[Path] = []
+    try:
+        for hf_cache in _all_hf_cache_scans():
+            for repo_info in hf_cache.repos:
+                if (
+                    repo_info.repo_type == "model"
+                    and repo_info.repo_id.lower() == needle
+                ):
+                    try:
+                        cache_repo_roots.append(
+                            Path(repo_info.repo_path).expanduser().resolve()
+                        )
+                    except Exception:
+                        pass
+    except Exception as cache_scan_exc:
+        logger.warning(
+            "Could not scan HF cache during delete guard preflight: %s",
+            cache_scan_exc,
+        )
+        raise HTTPException(
+            status_code = 503,
+            detail = (
+                "Could not verify cache ownership before deleting. Try again."
+            ),
+        ) from cache_scan_exc
+
+    def _owned_cache_path_matches(
+        value: Optional[str], roots: list[Path]
+    ) -> bool:
+        """Return True if ``value`` resolves to (or contains, or is a
+        child of) any of the HF cache repo roots for the target repo.
+        Used by the llama / safetensors guards to catch local snapshot
+        paths the same way the diffusion guard already does.
+        """
+        if not value or not roots:
+            return False
+        try:
+            owned = Path(value).expanduser().resolve()
+        except Exception:
+            return False
+        for root in roots:
+            try:
+                if (
+                    owned == root
+                    or _is_path_under(owned, root)
+                    or _is_path_under(root, owned)
+                ):
+                    return True
+            except Exception:
+                continue
+        return False
+
     # Check if model is currently loaded OR loading. is_active and
     # not is_loaded means an llama-server download / startup is in
     # flight; the cache delete would race the hf_hub_download / mmap.
@@ -2800,10 +2867,10 @@ async def delete_cached_model(
         from routes.inference import get_llama_cpp_backend
 
         llama_backend = get_llama_cpp_backend()
-        loaded_id = (llama_backend.model_identifier or "").lower()
-        loading_id = (
-            getattr(llama_backend, "loading_model_identifier", None) or ""
-        ).lower()
+        loaded_id_raw = llama_backend.model_identifier or ""
+        loaded_id = loaded_id_raw.lower()
+        loading_id_raw = getattr(llama_backend, "loading_model_identifier", None) or ""
+        loading_id = loading_id_raw.lower()
         loading_variant = (
             getattr(llama_backend, "loading_hf_variant", None) or ""
         ).lower()
@@ -2817,9 +2884,14 @@ async def delete_cached_model(
         # (loading Q4_K_M, deleting cached Q8_0) is allowed; only
         # block when the requested variant matches what is being
         # downloaded. Mirrors the /delete-finetuned pairing.
-        needle = repo_id.lower()
         requested_variant = (variant or "").lower()
-        if loading_id == needle:
+        # Round 25 P1 #2: also match by HF cache snapshot path so
+        # local-path GGUF chat loads block the cache delete that
+        # owns their snapshot.
+        loading_matches_repo = loading_id == needle or _owned_cache_path_matches(
+            loading_id_raw, cache_repo_roots
+        )
+        if loading_matches_repo:
             same_loading_variant = (
                 not requested_variant
                 or not loading_variant
@@ -2836,7 +2908,10 @@ async def delete_cached_model(
         # guard fixed in round 5. Per-variant deletes that target a
         # DIFFERENT quant than the loaded one are allowed so the
         # llama and diffusion paths stay symmetric (round 14 P1 #7).
-        if loaded_id == needle and (
+        loaded_matches_repo = loaded_id == needle or _owned_cache_path_matches(
+            loaded_id_raw, cache_repo_roots
+        )
+        if loaded_matches_repo and (
             llama_backend.is_loaded or getattr(llama_backend, "is_active", False)
         ):
             loaded_variant = (getattr(llama_backend, "hf_variant", None) or "").lower()
@@ -2864,22 +2939,27 @@ async def delete_cached_model(
     try:
         inference_backend = get_inference_backend()
         loading_models = getattr(inference_backend, "loading_models", set()) or set()
-        needle = repo_id.lower()
         # Loading set holds model identifiers currently being
         # downloaded / instantiated; treat them like active loads
         # so a delete cannot race a partial mmap.
-        # Exact match only. Prefix matching would block deleting
-        # ``org/model`` while ``org/model-v2`` is loading.
+        # Exact match only on the logical ``owner/repo`` side, but
+        # also match local snapshot paths (round 25 P1 #3) so a
+        # safetensors model loaded from a local HF snapshot path
+        # cannot have its cache rmtree'd out from under it.
         for loading_model in loading_models:
-            ml = (loading_model or "").lower()
-            if ml == needle:
+            ml_raw = loading_model or ""
+            ml = ml_raw.lower()
+            if ml == needle or _owned_cache_path_matches(ml_raw, cache_repo_roots):
                 raise HTTPException(
                     status_code = 409,
                     detail = "Cannot delete a model while it is loading",
                 )
-        if inference_backend.active_model_name:
-            active = inference_backend.active_model_name.lower()
-            if active == needle:
+        active_model_raw = inference_backend.active_model_name
+        if active_model_raw:
+            active = active_model_raw.lower()
+            if active == needle or _owned_cache_path_matches(
+                active_model_raw, cache_repo_roots
+            ):
                 raise HTTPException(
                     status_code = 400,
                     detail = "Unload the model before deleting",
@@ -2919,46 +2999,11 @@ async def delete_cached_model(
         # the HF cache snapshot root (round 16 P1 #5).
         diff_status = diff_backend.status(include_internal = True)
         if diff_status.get("is_loaded") or diff_status.get("is_loading"):
-            needle = repo_id.lower()
-            # Round 15 P1 #4: ALSO compare owned paths against the HF
-            # cache root for this repo. The user may have loaded the
-            # diffusion model from a local snapshot path under
-            # ``models--owner--model/snapshots/<sha>``; the string
-            # ``owner/model`` then never appears in ``owned_id`` and
-            # the previous string-only check would let the cache
-            # delete proceed while the snapshot was still mmap'd.
-            cache_repo_roots: list[Path] = []
-            try:
-                for hf_cache in _all_hf_cache_scans():
-                    for repo_info in hf_cache.repos:
-                        if (
-                            repo_info.repo_type == "model"
-                            and repo_info.repo_id.lower() == needle
-                        ):
-                            try:
-                                cache_repo_roots.append(
-                                    Path(repo_info.repo_path).expanduser().resolve()
-                                )
-                            except Exception:
-                                pass
-            except Exception as cache_scan_exc:
-                # Round 16 P1 #3: a transient cache-scan failure here
-                # used to silently fall through to repo-id-only
-                # matching, which misses local snapshot paths and
-                # let /delete-cached unlink an actively mmap'd
-                # snapshot. Fail-closed (503) so the user retries.
-                logger.warning(
-                    "Could not scan HF cache during diffusion delete guard: %s",
-                    cache_scan_exc,
-                )
-                raise HTTPException(
-                    status_code = 503,
-                    detail = (
-                        "Could not verify diffusion cache ownership before "
-                        "deleting. Try again."
-                    ),
-                ) from cache_scan_exc
-
+            # ``needle`` and ``cache_repo_roots`` come from the
+            # preflight scan above; round 25 deduplicated the
+            # diffusion-specific rescan and now all three guards
+            # share the same fail-closed cache view.
+            #
             # Pair each owned repo with the GGUF variant it actually
             # owns (active or pending) so a swap in progress does not
             # collapse both quants into the pending one (round 13
@@ -2969,20 +3014,10 @@ async def delete_cached_model(
                 if not owned_id:
                     continue
                 owned_matches_repo = owned_id.lower() == needle
-                if not owned_matches_repo and cache_repo_roots:
-                    try:
-                        owned_path = Path(owned_id).expanduser().resolve()
-                    except Exception:
-                        owned_path = None
-                    if owned_path is not None:
-                        for repo_root in cache_repo_roots:
-                            if (
-                                owned_path == repo_root
-                                or _is_path_under(owned_path, repo_root)
-                                or _is_path_under(repo_root, owned_path)
-                            ):
-                                owned_matches_repo = True
-                                break
+                if not owned_matches_repo and _owned_cache_path_matches(
+                    owned_id, cache_repo_roots
+                ):
+                    owned_matches_repo = True
                 if not owned_matches_repo:
                     continue
                 if _variant_delete_is_safe_for_owned_gguf(variant, owned_gguf):
diff --git a/studio/backend/utils/datasets/llm_assist.py b/studio/backend/utils/datasets/llm_assist.py
index 51f60ace33..0f244a5d56 100644
--- a/studio/backend/utils/datasets/llm_assist.py
+++ b/studio/backend/utils/datasets/llm_assist.py
@@ -245,9 +245,18 @@ def _run_with_helper(prompt: str, max_tokens: int = 256) -> Optional[str]:
 
     backend = None
     try:
-        from core.inference.llama_cpp import LlamaCppBackend
+        # Round 25 P1 #4: use the GLOBAL llama backend instead of a
+        # private ``LlamaCppBackend()`` instance. The private instance
+        # was invisible to ``DELETE /api/models/delete-cached`` and the
+        # other global delete guards because they inspect the singleton
+        # returned by ``get_llama_cpp_backend()``. A concurrent cache
+        # delete could rmtree the helper's mid-flight download or
+        # mmap'd snapshot. ``_gpu_workload_busy_for_helper`` above
+        # already ensures the global backend is idle before we reach
+        # here, so taking it over is safe.
+        from routes.inference import get_llama_cpp_backend
 
-        backend = LlamaCppBackend()
+        backend = get_llama_cpp_backend()
         logger.info(f"Loading helper model: {repo} ({variant})")
 
         ok = backend.load_model(
@@ -641,9 +650,15 @@ def _run_multi_pass_advisor(
 
     backend = None
     try:
-        from core.inference.llama_cpp import LlamaCppBackend
+        # Round 25 P1 #4: mirror ``_run_with_helper`` and acquire the
+        # GLOBAL llama backend so cache-delete and unload guards see
+        # this advisor load via the singleton's
+        # ``loading_model_identifier`` / ``model_identifier``. The
+        # round 23/24 ``_gpu_workload_busy_for_helper`` already
+        # blocks reach here unless the global llama backend is idle.
+        from routes.inference import get_llama_cpp_backend
 
-        backend = LlamaCppBackend()
+        backend = get_llama_cpp_backend()
         logger.info(f"Loading advisor model: {repo} ({variant})")
         t0 = time.monotonic()
 

From 4785f76fa2520fc2975d0f1b5fac64a6ad275a09 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 12:14:56 +0000
Subject: [PATCH 64/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/routes/models.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/studio/backend/routes/models.py b/studio/backend/routes/models.py
index ae15932c33..d7ed9aed4b 100644
--- a/studio/backend/routes/models.py
+++ b/studio/backend/routes/models.py
@@ -2826,14 +2826,10 @@ async def delete_cached_model(
         )
         raise HTTPException(
             status_code = 503,
-            detail = (
-                "Could not verify cache ownership before deleting. Try again."
-            ),
+            detail = ("Could not verify cache ownership before deleting. Try again."),
         ) from cache_scan_exc
 
-    def _owned_cache_path_matches(
-        value: Optional[str], roots: list[Path]
-    ) -> bool:
+    def _owned_cache_path_matches(value: Optional[str], roots: list[Path]) -> bool:
         """Return True if ``value`` resolves to (or contains, or is a
         child of) any of the HF cache repo roots for the target repo.
         Used by the llama / safetensors guards to catch local snapshot

From fd7d334d10c5886b1bd7d20af686e1f50ed18036 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 12:40:35 +0000
Subject: [PATCH 65/92] Fix studio.txt vs constraints.txt huggingface-hub
 conflict (PR #5754)

Round 25 P1 #5 bumped studio.txt huggingface-hub to >=1.3.0,<2.0 but
single-env/constraints.txt still pinned ==0.36.2, which made fresh
Studio Update CI / Mac Studio Update CI / Mac Studio UI CI fail at
the studio-deps install step with ResolutionImpossible. Bump the
constraint pin to ==1.8.0 to match the setup.sh / setup.ps1 t5
sub-env pins and satisfy the new studio.txt floor.
---
 studio/backend/requirements/single-env/constraints.txt | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/studio/backend/requirements/single-env/constraints.txt b/studio/backend/requirements/single-env/constraints.txt
index 156f78567e..0e25124a9f 100644
--- a/studio/backend/requirements/single-env/constraints.txt
+++ b/studio/backend/requirements/single-env/constraints.txt
@@ -2,7 +2,12 @@
 # Keep compatible with unsloth transformers bounds.
 transformers==4.57.6
 trl==0.23.1
-huggingface-hub==0.36.2
+# Round 25 P1 #5 follow-up: bumped from 0.36.2 because studio.txt now
+# requires >=1.3.0,<2.0 (Flux2KleinPipeline transitively imports
+# transformers 5.x which needs huggingface_hub.is_offline_mode,
+# introduced in hub 1.x). 1.8.0 matches the explicit pin used by
+# studio/setup.sh / setup.ps1 for the t5 sub-envs.
+huggingface-hub==1.8.0
 
 # Studio stack
 datasets==4.3.0

From 65ea3a2c81d21bd55db311f009b16203317a06ba Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 13:13:19 +0000
Subject: [PATCH 66/92] Fix/adjust diffusion: round 26 P1 batch for PR #5754

Twelve P1 findings from the round 26 reviewer aggregate, plus a
CI-driven revert of round 25 P1 #5 that moves the huggingface-hub
bump to a less invasive location.

1. requirements/studio.txt + requirements/single-env/constraints.txt:
   revert the round 25 huggingface-hub bump (broke Studio Update CI,
   Mac Studio Update CI, Mac Studio UI CI, Studio UI CI all with
   ResolutionImpossible against transformers==4.57.6 which requires
   hub<1.0). Standard install path stays on the well-tested 4.57.6 +
   0.36.2 + trl 0.23.1 trio.

2. requirements/no-torch-runtime.txt + pyproject.toml
   [huggingfacenotorch]: bump the huggingface_hub floor from >=0.34.0
   to >=1.3.0,<2.0 -- this is where the broken transformers 5.x +
   hub 0.36.2 combination can actually land, because that
   requirements file is installed with --no-deps. transformers 5.x
   calls hub.is_offline_mode, which only exists in hub 1.x.

3. utils/datasets/llm_assist.py: revert round 25 P1 #4 (helper/advisor
   sharing the global llama backend) which introduced three
   regressions: a chat-evict load race after the busy precheck, a
   finally-block that could unload a user chat model, and an
   identifier mismatch the delete guard could not canonicalize. Go
   back to PRIVATE LlamaCppBackend instances and expose the active
   helper/advisor repos through a new thread-safe registry
   (helper_advisor_owns_repo / _register_helper_advisor_repo /
   _unregister_helper_advisor_repo) so DELETE /api/models/delete-cached
   can still block the rmtree.

4. routes/models.py delete_cached_model: check the new helper/advisor
   registry up front and 409 if a helper/advisor still owns the
   target repo. Closes round 26 P1 #13 and #14 (helper/advisor
   identifiers were prefixed and would never equal the raw repo id).

5. routes/models.py get_lora_base_model: validate lora_path with
   _validate_logged_identifier before it is reflected in 404 detail
   and error logs (round 26 P1 #12).

6. routes/inference.py /unload: round 21 P1 #3 added a "or not
   is_loaded" fallback that let an unload of owner/B cancel a pending
   llama load of owner/A. Replace it with a narrow
   llama_is_starting_without_identifier branch that only fires when
   llama-server is mid-startup with neither identifier set (round 26
   P1 #5).

7. routes/inference.py /unload: poll loading_model_identifier for up
   to 5 s after asyncio.to_thread(unload_model) so a legitimate
   pending-load cancel does not 503 because the load thread has not
   yet observed _cancel_event in its finally (round 26 P2 #15).

8. models/training.py TrainingStartRequest: extend identifier
   hardening to hf_dataset, subset, train_split, eval_split. Round 22
   only guarded model_name (round 26 P1 #10).

9. models/data_recipe.py SeedInspectRequest: add _no_control_chars +
   _reject_embedded_hf_token field_validators on dataset_name (round
   26 P1 #11).

Tests: 105 targeted (diffusion + cached_gguf + llama_cpp_cache +
inference_model_validation + models_get_model_config) and 1768
broader backend tests pass locally. Pre-existing
test_desktop_auth.py, test_studio_api.py, and
test_training_worker_flash_attn.py failures reproduce on HEAD
without these changes.
---
 pyproject.toml                                |  5 +-
 studio/backend/models/data_recipe.py          | 12 +++
 studio/backend/models/training.py             |  6 +-
 .../backend/requirements/no-torch-runtime.txt |  8 +-
 .../requirements/single-env/constraints.txt   |  7 +-
 studio/backend/requirements/studio.txt        | 20 +----
 studio/backend/routes/inference.py            | 34 ++++++++-
 studio/backend/routes/models.py               | 28 +++++++
 studio/backend/utils/datasets/llm_assist.py   | 75 ++++++++++++++-----
 9 files changed, 143 insertions(+), 52 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 106dcef3dd..4ccf8583b7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -81,7 +81,10 @@ huggingfacenotorch = [
     "datasets>=3.4.1,!=4.0.*,!=4.1.0,<4.4.0",
     "accelerate>=0.34.1",
     "peft>=0.18.0,!=0.11.0",
-    "huggingface_hub>=0.34.0",
+    # Round 26 P1 #9: floor at 1.3.0 because the diffusion stack below
+    # pulls transformers 5.x which calls hub.is_offline_mode (hub 1.x).
+    # Keep <2.0 to avoid any future hub ABI break.
+    "huggingface_hub>=1.3.0,<2.0",
     "hf_transfer",
     # Studio Images page depends on Flux2KleinPipeline /
     # Flux2Pipeline, both shipped in diffusers>=0.37.0. Floor was
diff --git a/studio/backend/models/data_recipe.py b/studio/backend/models/data_recipe.py
index fe607a3f92..06fdcb0963 100644
--- a/studio/backend/models/data_recipe.py
+++ b/studio/backend/models/data_recipe.py
@@ -90,6 +90,18 @@ class SeedInspectRequest(BaseModel):
     split: str | None = "train"
     preview_size: int = Field(default = 10, ge = 1, le = 50)
 
+    # Round 26 P1 #11: dataset_name reaches HF + log/echo paths, so
+    # mirror the hardening other dataset request models already do.
+    @field_validator("dataset_name")
+    @classmethod
+    def _no_dataset_name_control_chars(cls, v, info):
+        return _no_control_chars(v, info.field_name)
+
+    @field_validator("dataset_name")
+    @classmethod
+    def _no_dataset_name_embedded_hf_tokens(cls, v, info):
+        return _reject_embedded_hf_token(v, info.field_name)
+
 
 class SeedInspectUploadRequest(BaseModel):
     # Legacy single-file flow (mutually exclusive with file_ids)
diff --git a/studio/backend/models/training.py b/studio/backend/models/training.py
index e0eec81197..234474fd23 100644
--- a/studio/backend/models/training.py
+++ b/studio/backend/models/training.py
@@ -60,12 +60,14 @@ class TrainingStartRequest(BaseModel):
     # Round 22 P1 #1: identifier hardening (round 5 / 15 / 20 / 21
     # extended these to chat + diffusion request models; training
     # was the last unguarded entry point).
-    @field_validator("model_name")
+    # Round 26 P1 #10: hf_dataset / subset / train_split / eval_split
+    # are reflected in status + error messages, harden them too.
+    @field_validator("model_name", "hf_dataset", "subset", "train_split", "eval_split")
     @classmethod
     def _no_model_name_control_chars(cls, v, info):
         return _no_control_chars(v, info.field_name)
 
-    @field_validator("model_name")
+    @field_validator("model_name", "hf_dataset")
     @classmethod
     def _no_model_name_embedded_hf_tokens(cls, v, info):
         return _reject_embedded_hf_token(v, info.field_name)
diff --git a/studio/backend/requirements/no-torch-runtime.txt b/studio/backend/requirements/no-torch-runtime.txt
index 117de55c51..76da097a71 100644
--- a/studio/backend/requirements/no-torch-runtime.txt
+++ b/studio/backend/requirements/no-torch-runtime.txt
@@ -43,7 +43,13 @@ safetensors>=0.4.3
 datasets>=3.4.1,!=4.0.*,!=4.1.0,<4.4.0
 accelerate>=0.34.1
 peft>=0.18.0,!=0.11.0
-huggingface_hub>=0.34.0
+# Round 26 P1 #8: floor at 1.3.0 because transformers 5.x (allowed by
+# the range below) calls huggingface_hub.is_offline_mode, only present
+# in hub 1.x. Under --no-deps the resolver does not enforce this
+# transitively, so a pre-existing 0.36.2 used to be kept and the next
+# `from transformers import AutoConfig` raised ImportError. Upper bound
+# <2.0 keeps us off any future ABI break.
+huggingface_hub>=1.3.0,<2.0
 hf_transfer
 # Floor 0.37.0 introduces Flux2KleinPipeline + Flux2Pipeline which the
 # Studio Images page imports for the default curated picker.
diff --git a/studio/backend/requirements/single-env/constraints.txt b/studio/backend/requirements/single-env/constraints.txt
index 0e25124a9f..156f78567e 100644
--- a/studio/backend/requirements/single-env/constraints.txt
+++ b/studio/backend/requirements/single-env/constraints.txt
@@ -2,12 +2,7 @@
 # Keep compatible with unsloth transformers bounds.
 transformers==4.57.6
 trl==0.23.1
-# Round 25 P1 #5 follow-up: bumped from 0.36.2 because studio.txt now
-# requires >=1.3.0,<2.0 (Flux2KleinPipeline transitively imports
-# transformers 5.x which needs huggingface_hub.is_offline_mode,
-# introduced in hub 1.x). 1.8.0 matches the explicit pin used by
-# studio/setup.sh / setup.ps1 for the t5 sub-envs.
-huggingface-hub==1.8.0
+huggingface-hub==0.36.2
 
 # Studio stack
 datasets==4.3.0
diff --git a/studio/backend/requirements/studio.txt b/studio/backend/requirements/studio.txt
index 3268f41431..b360261e44 100644
--- a/studio/backend/requirements/studio.txt
+++ b/studio/backend/requirements/studio.txt
@@ -17,25 +17,7 @@ pyjwt
 easydict
 addict
 # gradio>=4.0.0                  # 148 MB - Studio uses React + FastAPI, not Gradio
-# Round 25 P1 #5: keep the Studio Images dependency set internally
-# compatible. ``diffusers>=0.37.0`` ships Flux2KleinPipeline /
-# Flux2Pipeline, which transitively import the newer ``transformers``
-# (>=4.56) that requires ``huggingface_hub.is_offline_mode`` -- only
-# available in ``huggingface_hub>=1.0``. The previous ``==0.36.2``
-# pin let fresh installs end up with ``transformers 5.x`` +
-# ``huggingface_hub 0.36.2``, which crashed on the first
-# ``/api/inference/images/load`` with
-# ``Flux2KleinPipeline ... no attribute 'is_offline_mode'``. Bump
-# the floor so ``diffusers`` and ``transformers`` resolve into a
-# runtime they can actually import.
-huggingface-hub>=1.3.0,<2.0
-# Mirror the ``transformers`` constraint from
-# ``no-torch-runtime.txt``. Without it, the standard install can
-# resolve ``transformers 5.4.0+`` which drops Studio-supported
-# trainers. ``tokenizers<=0.23.0`` is required because
-# ``transformers 4.56..5.3`` declares it explicitly.
-tokenizers<=0.23.0
-transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,!=4.57.0,!=4.57.4,!=4.57.5,!=5.0.0,!=5.1.0,<=5.3.0
+huggingface-hub==0.36.2
 structlog>=24.1.0
 diceware
 ddgs
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index a7e8f74722..cf93153fbe 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -1683,9 +1683,27 @@ async def unload_model(
             or is_registered_native_path_label(loaded_identifier, request.model_path)
             or is_registered_native_path_label(loading_identifier, request.model_path)
         )
-        if (getattr(llama_backend, "is_active", False) or loading_identifier) and (
-            llama_matches_request or not getattr(llama_backend, "is_loaded", False)
-        ):
+        # Round 26 P1 #5: the previous ``or not is_loaded`` fallback
+        # let an unload of ``owner/B`` cancel a pending llama download
+        # of ``owner/A`` and silently leave safetensors ``owner/B``
+        # alive. Only enter the llama branch when the request actually
+        # matches the loaded/loading identifier, OR when llama-server
+        # is starting up without any identifier yet (the original
+        # narrow case we wanted to catch).
+        llama_is_starting_without_identifier = (
+            getattr(llama_backend, "is_active", False)
+            and not getattr(llama_backend, "is_loaded", False)
+            and not loaded_identifier
+            and not loading_identifier
+        )
+        should_unload_llama = (
+            llama_matches_request
+            and (
+                getattr(llama_backend, "is_active", False)
+                or loading_identifier
+            )
+        ) or llama_is_starting_without_identifier
+        if should_unload_llama:
             # Round 19 P1 #6: previously this called
             # ``llama_backend.unload_model()`` and unconditionally
             # returned ``status="unloaded"`` even when the subprocess
@@ -1694,6 +1712,16 @@ async def unload_model(
             # still resident. Treat ``False`` / leftover state as a
             # 503 so the user retries.
             ok = await asyncio.to_thread(llama_backend.unload_model)
+            # Round 26 P2 #15: explicit cancel of a pending GGUF load
+            # leaves loading_model_identifier set briefly until the
+            # load thread observes _cancel_event in its finally. Wait
+            # up to 5s so a legitimate cancel does not 503.
+            deadline = time.monotonic() + 5.0
+            while (
+                getattr(llama_backend, "loading_model_identifier", None)
+                and time.monotonic() < deadline
+            ):
+                await asyncio.sleep(0.1)
             if (
                 ok is False
                 or getattr(llama_backend, "is_loaded", False)
diff --git a/studio/backend/routes/models.py b/studio/backend/routes/models.py
index d7ed9aed4b..8b3025e056 100644
--- a/studio/backend/routes/models.py
+++ b/studio/backend/routes/models.py
@@ -2219,6 +2219,9 @@ async def get_lora_base_model(
 
     This endpoint wraps the backend get_base_model_from_lora function.
     """
+    # Round 26 P1 #12: lora_path is echoed back in 404 detail and logs;
+    # harden it the same way other reflected identifiers are.
+    lora_path = _validate_logged_identifier(lora_path, "lora_path")
     try:
         base_model = get_base_model_from_lora(lora_path)
 
@@ -2853,6 +2856,31 @@ def _owned_cache_path_matches(value: Optional[str], roots: list[Path]) -> bool:
                 continue
         return False
 
+    # Round 26 P1 #13 / #14: helper/advisor GGUF loads run on a
+    # PRIVATE LlamaCppBackend, so the global backend below cannot see
+    # them. utils/datasets/llm_assist.py publishes the active repo
+    # via helper_advisor_owns_repo() for exactly this guard. Fail
+    # closed on the variant question (block any variant of the repo)
+    # because helper/advisor flows do not pass a variant through.
+    try:
+        from utils.datasets.llm_assist import helper_advisor_owns_repo
+
+        if helper_advisor_owns_repo(repo_id):
+            raise HTTPException(
+                status_code = 409,
+                detail = "Cannot delete a model while AI Assist is using it",
+            )
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.warning(
+            "Could not check helper/advisor backend status before cache delete: %s", e
+        )
+        raise HTTPException(
+            status_code = 503,
+            detail = "Could not verify AI Assist load status before deleting cache",
+        ) from e
+
     # Check if model is currently loaded OR loading. is_active and
     # not is_loaded means an llama-server download / startup is in
     # flight; the cache delete would race the hf_hub_download / mmap.
diff --git a/studio/backend/utils/datasets/llm_assist.py b/studio/backend/utils/datasets/llm_assist.py
index 0f244a5d56..5299a7ac75 100644
--- a/studio/backend/utils/datasets/llm_assist.py
+++ b/studio/backend/utils/datasets/llm_assist.py
@@ -18,6 +18,7 @@
 import os
 import re
 import textwrap
+import threading
 import time
 from itertools import islice
 from typing import Any, Optional
@@ -31,6 +32,41 @@
 
 README_MAX_CHARS = 1500
 
+# Round 26 P1 #13 / #14: helper/advisor run on PRIVATE LlamaCppBackend
+# instances (round 25 P1 #4 briefly used the global singleton, which
+# caused chat-evict races and finally-eviction bugs and still left
+# delete-cache blind because helper/advisor publish prefixed
+# identifiers the guard could not match). Expose loading repo ids
+# through a thread-safe set so DELETE /api/models/delete-cached can
+# block while a helper or advisor still owns the cache.
+_HELPER_ADVISOR_ACTIVE_REPOS: set[str] = set()
+_HELPER_ADVISOR_LOCK = threading.Lock()
+
+
+def helper_advisor_owns_repo(repo_id: str) -> bool:
+    """Return True if any helper/advisor load currently owns this
+    HF repo id. Comparison is case-insensitive to match the chat
+    backend's lowercased needle."""
+    if not repo_id:
+        return False
+    needle = repo_id.lower()
+    with _HELPER_ADVISOR_LOCK:
+        return needle in _HELPER_ADVISOR_ACTIVE_REPOS
+
+
+def _register_helper_advisor_repo(repo_id: str) -> None:
+    if not repo_id:
+        return
+    with _HELPER_ADVISOR_LOCK:
+        _HELPER_ADVISOR_ACTIVE_REPOS.add(repo_id.lower())
+
+
+def _unregister_helper_advisor_repo(repo_id: str) -> None:
+    if not repo_id:
+        return
+    with _HELPER_ADVISOR_LOCK:
+        _HELPER_ADVISOR_ACTIVE_REPOS.discard(repo_id.lower())
+
 
 def _strip_think_tags(text: str) -> str:
     """Strip <think>...</think> reasoning blocks emitted by some models.
@@ -244,19 +280,17 @@ def _run_with_helper(prompt: str, max_tokens: int = 256) -> Optional[str]:
     )
 
     backend = None
+    _register_helper_advisor_repo(repo)
     try:
-        # Round 25 P1 #4: use the GLOBAL llama backend instead of a
-        # private ``LlamaCppBackend()`` instance. The private instance
-        # was invisible to ``DELETE /api/models/delete-cached`` and the
-        # other global delete guards because they inspect the singleton
-        # returned by ``get_llama_cpp_backend()``. A concurrent cache
-        # delete could rmtree the helper's mid-flight download or
-        # mmap'd snapshot. ``_gpu_workload_busy_for_helper`` above
-        # already ensures the global backend is idle before we reach
-        # here, so taking it over is safe.
-        from routes.inference import get_llama_cpp_backend
-
-        backend = get_llama_cpp_backend()
+        # Round 26 P1 #1 / #3 / #13 / #14: use a PRIVATE backend so the
+        # helper can never preempt or be preempted by the user's
+        # chat backend and cannot accidentally unload it in finally.
+        # The active repo is published via _register_helper_advisor_repo
+        # above so DELETE /api/models/delete-cached can still block the
+        # cache rmtree while the helper is downloading or mmap'ing.
+        from core.inference.llama_cpp import LlamaCppBackend
+
+        backend = LlamaCppBackend()
         logger.info(f"Loading helper model: {repo} ({variant})")
 
         ok = backend.load_model(
@@ -305,6 +339,7 @@ def _run_with_helper(prompt: str, max_tokens: int = 256) -> Optional[str]:
                 logger.info("Helper model unloaded")
             except Exception:
                 pass
+        _unregister_helper_advisor_repo(repo)
 
 
 # ─── Public API ───────────────────────────────────────────────────────
@@ -649,16 +684,15 @@ def _run_multi_pass_advisor(
     )
 
     backend = None
+    _register_helper_advisor_repo(repo)
     try:
-        # Round 25 P1 #4: mirror ``_run_with_helper`` and acquire the
-        # GLOBAL llama backend so cache-delete and unload guards see
-        # this advisor load via the singleton's
-        # ``loading_model_identifier`` / ``model_identifier``. The
-        # round 23/24 ``_gpu_workload_busy_for_helper`` already
-        # blocks reach here unless the global llama backend is idle.
-        from routes.inference import get_llama_cpp_backend
+        # Round 26 P1 #2 / #4 / #13 / #14: mirror ``_run_with_helper``
+        # and use a PRIVATE backend. Round 25's global-backend swap
+        # introduced chat-evict races and finally-eviction bugs.
+        # The registry above keeps delete-cache safe.
+        from core.inference.llama_cpp import LlamaCppBackend
 
-        backend = get_llama_cpp_backend()
+        backend = LlamaCppBackend()
         logger.info(f"Loading advisor model: {repo} ({variant})")
         t0 = time.monotonic()
 
@@ -990,6 +1024,7 @@ def _run_multi_pass_advisor(
                 logger.info("Advisor model unloaded")
             except Exception:
                 pass
+        _unregister_helper_advisor_repo(repo)
 
 
 def llm_conversion_advisor(

From e17aea6c814d9a98891f4edb12a23c2e6fb3919f Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 13:13:39 +0000
Subject: [PATCH 67/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/routes/inference.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index cf93153fbe..673c31d508 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -1698,10 +1698,7 @@ async def unload_model(
         )
         should_unload_llama = (
             llama_matches_request
-            and (
-                getattr(llama_backend, "is_active", False)
-                or loading_identifier
-            )
+            and (getattr(llama_backend, "is_active", False) or loading_identifier)
         ) or llama_is_starting_without_identifier
         if should_unload_llama:
             # Round 19 P1 #6: previously this called

From 6c528fb0138616dce7a732f3e0d09c124d121ab8 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 13:39:40 +0000
Subject: [PATCH 68/92] Fix/adjust diffusion: round 27 P1 + P2 batch for PR
 #5754

Round 27 findings (Opus parallel concurrency + frontend reviews).

Backend P1 fixes:

1. utils/datasets/llm_assist.py: the round 26 helper/advisor active
   registry used a plain set, so two concurrent helper / advisor
   loads of the same DEFAULT_HELPER_MODEL_REPO would both call
   set.add() (a no-op the second time), and the first caller's
   finally would then set.discard() the repo while the second call
   was still mmap'ing the GGUF. Switch to a Counter with proper
   refcount increment/decrement so the repo stays registered until
   the last user releases it.

2. routes/inference.py _release_chat_for and
   core/inference/diffusion.py _release_chat_backend_for_diffusion:
   helper/advisor GGUF runs on a PRIVATE LlamaCppBackend (round 26
   P1 #1), so the global llama checks in these release paths could
   not see it. A user-driven /training/start, /export/load-checkpoint,
   or /images/load would skip the unload and allocate its own VRAM
   (e.g. FLUX weights) on top of the helper's resident weights,
   OOMing on 16-24 GB consumer GPUs. Both release paths now consult
   helper_advisor_busy() and fail with a 503 (or raise RuntimeError
   in the in-backend path) so the user retries instead of
   double-owning VRAM.

Frontend P2 fixes:

3. studio/frontend/src/features/images/images-page.tsx: handleUnload
   now calls refreshStatus() in the catch path so a partial unload
   (503 from the backend) does not leave the UI showing a stale
   "Loaded:" label. Matches the handleLoad pattern.

4. images-page.tsx: when status.is_loading is true, auto-poll
   refreshStatus every 2 s so the user sees real progress instead
   of a frozen "Loading..." label until they manually click Refresh.

5. images-page.tsx: aria-label="Inference steps" / "Guidance scale"
   on the two sliders so screen readers can announce them.

6. images-page.tsx: defensive (r.guidance_scale ?? 0).toFixed(1)
   in the results caption so a future backend that serialises
   NaN/None for guidance does not throw at render.

Tests: 105 targeted (diffusion + cached_gguf + inference_validation)
and 1768 broader backend tests pass locally. Frontend
`npm run typecheck` passes.
---
 studio/backend/core/inference/diffusion.py    | 14 +++++++++
 studio/backend/routes/inference.py            | 18 +++++++++++
 studio/backend/utils/datasets/llm_assist.py   | 31 +++++++++++++++----
 .../src/features/images/images-page.tsx       | 22 ++++++++++++-
 4 files changed, 78 insertions(+), 7 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index f670fb0f05..305ad6ea94 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -1516,6 +1516,20 @@ def _release_chat_backend_for_diffusion() -> None:
     diffusion ``load_model`` bails out instead of double-owning VRAM
     (round 17 P1 #2).
     """
+    # Round 27 P1 #2: helper / advisor GGUF loads run on a PRIVATE
+    # LlamaCppBackend so the global llama check below cannot see them.
+    # Refuse the diffusion handoff while a helper / advisor still owns
+    # its private backend so we do not allocate FLUX VRAM on top.
+    try:
+        from utils.datasets.llm_assist import helper_advisor_busy
+    except Exception:
+        pass
+    else:
+        if helper_advisor_busy():
+            raise RuntimeError(
+                "AI Assist (helper / advisor GGUF) is still using the GPU. "
+                "Wait for it to finish before loading a diffusion image model."
+            )
     # 1. GGUF chat backend (llama-server subprocess). We unload when
     #    EITHER is_loaded is True (resident model) OR is_active is
     #    True (mid-download / startup) OR loading_model_identifier is
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index 673c31d508..7649c453e1 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -531,6 +531,24 @@ async def _release_chat_for(workload: str) -> None:
     start. Conversely, the standard chat-load path releases only
     the llama side.
     """
+    # Round 27 P1 #2: helper / advisor GGUF loads run on a PRIVATE
+    # LlamaCppBackend (round 26 P1 #1) so the global llama checks
+    # below do not see them. Refuse the handoff while a helper /
+    # advisor still owns its private backend so a new GPU workload
+    # does not allocate on top of helper VRAM and OOM.
+    try:
+        from utils.datasets.llm_assist import helper_advisor_busy
+    except Exception:
+        pass
+    else:
+        if helper_advisor_busy():
+            raise HTTPException(
+                status_code = 503,
+                detail = (
+                    f"AI Assist (helper / advisor GGUF) is still using the "
+                    f"GPU. Wait for it to finish before starting {workload}."
+                ),
+            )
     await _release_llama_for(workload)
     await _release_safetensors_chat_for(workload)
 
diff --git a/studio/backend/utils/datasets/llm_assist.py b/studio/backend/utils/datasets/llm_assist.py
index 5299a7ac75..37c17329c8 100644
--- a/studio/backend/utils/datasets/llm_assist.py
+++ b/studio/backend/utils/datasets/llm_assist.py
@@ -20,6 +20,7 @@
 import textwrap
 import threading
 import time
+from collections import Counter
 from itertools import islice
 from typing import Any, Optional
 
@@ -37,9 +38,15 @@
 # caused chat-evict races and finally-eviction bugs and still left
 # delete-cache blind because helper/advisor publish prefixed
 # identifiers the guard could not match). Expose loading repo ids
-# through a thread-safe set so DELETE /api/models/delete-cached can
-# block while a helper or advisor still owns the cache.
-_HELPER_ADVISOR_ACTIVE_REPOS: set[str] = set()
+# through a thread-safe Counter so DELETE /api/models/delete-cached
+# can block while a helper or advisor still owns the cache.
+#
+# Round 27 P1 #1: must refcount, not a plain set. A helper and an
+# advisor (or two concurrent helpers) often share the default repo
+# unsloth/gemma-4-E2B-it-GGUF. With a set, the first finally call
+# discarded the repo while the second invocation was still loading,
+# and the delete-cache guard then let rmtree race the live mmap.
+_HELPER_ADVISOR_REFCOUNT: Counter[str] = Counter()
 _HELPER_ADVISOR_LOCK = threading.Lock()
 
 
@@ -51,21 +58,33 @@ def helper_advisor_owns_repo(repo_id: str) -> bool:
         return False
     needle = repo_id.lower()
     with _HELPER_ADVISOR_LOCK:
-        return needle in _HELPER_ADVISOR_ACTIVE_REPOS
+        return _HELPER_ADVISOR_REFCOUNT.get(needle, 0) > 0
+
+
+def helper_advisor_busy() -> bool:
+    """Round 27 P1 #2: True if ANY helper/advisor load is in flight.
+    Used by diffusion / training / export release paths so they do
+    not allocate on top of the helper's VRAM while it owns its
+    private LlamaCppBackend instance."""
+    with _HELPER_ADVISOR_LOCK:
+        return sum(_HELPER_ADVISOR_REFCOUNT.values()) > 0
 
 
 def _register_helper_advisor_repo(repo_id: str) -> None:
     if not repo_id:
         return
     with _HELPER_ADVISOR_LOCK:
-        _HELPER_ADVISOR_ACTIVE_REPOS.add(repo_id.lower())
+        _HELPER_ADVISOR_REFCOUNT[repo_id.lower()] += 1
 
 
 def _unregister_helper_advisor_repo(repo_id: str) -> None:
     if not repo_id:
         return
+    needle = repo_id.lower()
     with _HELPER_ADVISOR_LOCK:
-        _HELPER_ADVISOR_ACTIVE_REPOS.discard(repo_id.lower())
+        _HELPER_ADVISOR_REFCOUNT[needle] -= 1
+        if _HELPER_ADVISOR_REFCOUNT[needle] <= 0:
+            _HELPER_ADVISOR_REFCOUNT.pop(needle, None)
 
 
 def _strip_think_tags(text: str) -> str:
diff --git a/studio/frontend/src/features/images/images-page.tsx b/studio/frontend/src/features/images/images-page.tsx
index b5aa124fd7..265786e6bd 100644
--- a/studio/frontend/src/features/images/images-page.tsx
+++ b/studio/frontend/src/features/images/images-page.tsx
@@ -148,6 +148,18 @@ export function ImagesPage() {
     void refreshStatus();
   }, [refreshStatus]);
 
+  // Round 27 P2: when the backend is mid-load (is_loading=true) the
+  // status label froze at "Loading..." until the user clicked
+  // Refresh. Auto-poll every 2 s while a load is in flight so the
+  // UI tracks real backend progress.
+  useEffect(() => {
+    if (!status?.is_loading) return;
+    const id = window.setInterval(() => {
+      void refreshStatus();
+    }, 2000);
+    return () => window.clearInterval(id);
+  }, [status?.is_loading, refreshStatus]);
+
   const handleLoad = useCallback(async () => {
     setBusy("loading");
     try {
@@ -207,6 +219,12 @@ export function ImagesPage() {
       toast.error("Failed to unload image model", {
         description: err instanceof Error ? err.message : String(err),
       });
+      // Round 27 P2: a partial unload (subprocess refused to terminate,
+      // 503 from the backend) used to leave the UI showing the old
+      // "Loaded:" label even though the backend state was half torn
+      // down. Refresh so the button states match reality (mirrors
+      // handleLoad above which always re-fetches on catch).
+      await refreshStatus();
     } finally {
       setBusy("idle");
     }
@@ -491,6 +509,7 @@ export function ImagesPage() {
             <div className="flex flex-col gap-1">
               <Label>Steps: {steps}</Label>
               <Slider
+                aria-label="Inference steps"
                 min={1}
                 max={60}
                 step={1}
@@ -501,6 +520,7 @@ export function ImagesPage() {
             <div className="flex flex-col gap-1">
               <Label>Guidance: {guidance.toFixed(1)}</Label>
               <Slider
+                aria-label="Guidance scale"
                 min={0}
                 max={15}
                 step={0.1}
@@ -554,7 +574,7 @@ export function ImagesPage() {
                   data-testid="diffusion-result-image"
                 />
                 <figcaption className="text-xs text-muted-foreground">
-                  {r.width}x{r.height} - {r.num_inference_steps} steps - g={r.guidance_scale.toFixed(1)}
+                  {r.width}x{r.height} - {r.num_inference_steps} steps - g={(r.guidance_scale ?? 0).toFixed(1)}
                   {/* Prefer seed_str (full uint64 precision) since the
                        numeric seed gets rounded by JSON.parse above
                        Number.MAX_SAFE_INTEGER and would otherwise

From 79da5d910dffac61158049b884a87a54959f0b9b Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 13:43:51 +0000
Subject: [PATCH 69/92] Fix/adjust diffusion: round 27 follow-up P1 batch for
 PR #5754

Five additional P1 findings that the round 27 reviewer flagged on top
of the round 27 commit 6c528fb0 (Counter refcount + handoff visibility
were already covered). Three remaining studio.txt / no-torch-runtime
hub suggestions are NOT applied because they would re-break CI; the
empirical evidence (round 26 commit 65ea3a2c restored CI green) takes
precedence over the reviewer's stale-state suggestion.

1. models/training.py TrainingStartRequest: extend the embedded HF
   token validator to subset, train_split, eval_split. Round 26 only
   added the control-char guard to those three; the token guard was
   asymmetric and would accept owner/data\\nFAKE hf_abcdef...
   payloads through subset / split fields.

2. models/datasets.py CheckFormatRequest: extend both validators
   (control chars + embedded HF token) to subset and train_split.
   Same asymmetric-fix bug as #1.

3. models/data_recipe.py SeedInspectRequest: extend both validators
   to subset and split. Same pattern.

4. utils/datasets/llm_assist.py precache_helper_gguf: register the
   helper repo in the helper/advisor refcount registry around the
   hf_hub_download loop, then unregister in the finally. Without
   this, the FastAPI-startup background pre-cache could race a
   concurrent DELETE /api/models/delete-cached against the same
   cache directory. The runtime helper / advisor calls already
   register (round 26 P1 #13/#14); the precache was the one path
   that did not.

5. routes/models.py _loaded_model_matches_deleted_path: match
   bidirectionally (active under target OR target under active) so
   deleting a child directory of a loaded local model (.../my-flux/
   text_encoder while .../my-flux is loaded) trips the guard.
   Mirrors the diffusion delete-guard symmetric path-overlap check.

Tests: 105 targeted (diffusion + cache + inference_validation) and
the broader backend suite pass locally.
---
 studio/backend/models/data_recipe.py        |  6 ++++--
 studio/backend/models/datasets.py           |  6 ++++--
 studio/backend/models/training.py           |  6 +++++-
 studio/backend/routes/models.py             | 16 +++++++++++++---
 studio/backend/utils/datasets/llm_assist.py |  6 ++++++
 5 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/studio/backend/models/data_recipe.py b/studio/backend/models/data_recipe.py
index 06fdcb0963..2e538e138c 100644
--- a/studio/backend/models/data_recipe.py
+++ b/studio/backend/models/data_recipe.py
@@ -92,12 +92,14 @@ class SeedInspectRequest(BaseModel):
 
     # Round 26 P1 #11: dataset_name reaches HF + log/echo paths, so
     # mirror the hardening other dataset request models already do.
-    @field_validator("dataset_name")
+    # Round 27 P1 #7: split and subset also flow into HF dataset
+    # APIs / errors and must be guarded the same way.
+    @field_validator("dataset_name", "subset", "split")
     @classmethod
     def _no_dataset_name_control_chars(cls, v, info):
         return _no_control_chars(v, info.field_name)
 
-    @field_validator("dataset_name")
+    @field_validator("dataset_name", "subset", "split")
     @classmethod
     def _no_dataset_name_embedded_hf_tokens(cls, v, info):
         return _reject_embedded_hf_token(v, info.field_name)
diff --git a/studio/backend/models/datasets.py b/studio/backend/models/datasets.py
index 6f9de26939..28a4016514 100644
--- a/studio/backend/models/datasets.py
+++ b/studio/backend/models/datasets.py
@@ -32,12 +32,14 @@ def _compat_split(cls, values: Any) -> Any:
             values.setdefault("train_split", values.pop("split"))
         return values
 
-    @field_validator("dataset_name")
+    # Round 27 P1 #6: subset / train_split also flow into HF dataset
+    # APIs and errors/responses, so they need the same hardening.
+    @field_validator("dataset_name", "subset", "train_split")
     @classmethod
     def _no_dataset_name_control_chars(cls, v, info):
         return _no_control_chars(v, info.field_name)
 
-    @field_validator("dataset_name")
+    @field_validator("dataset_name", "subset", "train_split")
     @classmethod
     def _no_dataset_name_embedded_hf_tokens(cls, v, info):
         return _reject_embedded_hf_token(v, info.field_name)
diff --git a/studio/backend/models/training.py b/studio/backend/models/training.py
index 234474fd23..de440a6c06 100644
--- a/studio/backend/models/training.py
+++ b/studio/backend/models/training.py
@@ -67,7 +67,11 @@ class TrainingStartRequest(BaseModel):
     def _no_model_name_control_chars(cls, v, info):
         return _no_control_chars(v, info.field_name)
 
-    @field_validator("model_name", "hf_dataset")
+    # Round 27 P1 #2: subset / train_split / eval_split are reflected
+    # in status + error messages and persisted to job records, so the
+    # embedded-token guard must cover them too. Round 26 only added
+    # the control-char guard to those three.
+    @field_validator("model_name", "hf_dataset", "subset", "train_split", "eval_split")
     @classmethod
     def _no_model_name_embedded_hf_tokens(cls, v, info):
         return _reject_embedded_hf_token(v, info.field_name)
diff --git a/studio/backend/routes/models.py b/studio/backend/routes/models.py
index 8b3025e056..3ec3a9372e 100644
--- a/studio/backend/routes/models.py
+++ b/studio/backend/routes/models.py
@@ -1795,7 +1795,15 @@ def _loaded_model_matches_deleted_path(active_model: str, deleted_path: Path) ->
     try:
         active = Path(active_model).expanduser().resolve()
         target = deleted_path.resolve()
-        return active == target or (target.is_dir() and active.is_relative_to(target))
+        # Round 27 P1 #8: match bidirectionally so deleting a child
+        # directory of a loaded local model (e.g. .../my-flux/text_encoder
+        # while .../my-flux is loaded) also trips the guard. Mirrors
+        # the diffusion delete-guard pattern.
+        return (
+            active == target
+            or (target.is_dir() and active.is_relative_to(target))
+            or (active.is_dir() and target.is_relative_to(active))
+        )
     except (OSError, RuntimeError, ValueError) as e:
         logger.debug(
             "Could not resolve loaded/deleted model paths; falling back to string comparison: %s",
@@ -1803,8 +1811,10 @@ def _loaded_model_matches_deleted_path(active_model: str, deleted_path: Path) ->
         )
         active_lower = active_model.lower()
         target_lower = str(deleted_path).lower()
-        return active_lower == target_lower or active_lower.startswith(
-            f"{target_lower}{os.sep}"
+        return (
+            active_lower == target_lower
+            or active_lower.startswith(f"{target_lower}{os.sep}")
+            or target_lower.startswith(f"{active_lower}{os.sep}")
         )
 
 
diff --git a/studio/backend/utils/datasets/llm_assist.py b/studio/backend/utils/datasets/llm_assist.py
index 37c17329c8..d6dea74c47 100644
--- a/studio/backend/utils/datasets/llm_assist.py
+++ b/studio/backend/utils/datasets/llm_assist.py
@@ -127,6 +127,11 @@ def precache_helper_gguf():
         "UNSLOTH_HELPER_MODEL_VARIANT", DEFAULT_HELPER_MODEL_VARIANT
     )
 
+    # Round 27 P1 #4: register the repo so DELETE /api/models/delete-cached
+    # cannot rmtree the cache directory while we are mid-download. Helper
+    # / advisor runtime calls already register, but the startup precache
+    # was the asymmetric gap that let cache delete race the first download.
+    _register_helper_advisor_repo(repo)
     try:
         from huggingface_hub import HfApi, hf_hub_download
         from huggingface_hub.utils import disable_progress_bars, enable_progress_bars
@@ -158,6 +163,7 @@ def precache_helper_gguf():
     except Exception as e:
         logger.warning(f"Failed to pre-cache helper GGUF: {e}")
     finally:
+        _unregister_helper_advisor_repo(repo)
         try:
             enable_progress_bars()
         except Exception as e:

From c4c9e2aeec665aab4e353b69b22f20765f838fb5 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 14:21:37 +0000
Subject: [PATCH 70/92] Fix/adjust diffusion: round 28 P1 + P2 batch for PR
 #5754

Twelve actionable P1/P2 findings from round 28 reviewer aggregate.
Skipped #3 (studio.txt huggingface-hub bump) because the empirical
CI evidence in round 26 contradicts that suggestion: bumping the
pin there breaks installs that apply constraints.txt
(transformers==4.57.6 requires hub<1.0). The actual broken combo
only happens via the --no-deps no-torch path which is already
bumped in no-torch-runtime.txt and pyproject.toml huggingfacenotorch.

1. utils/datasets/llm_assist.py: split _HELPER_ADVISOR_REFCOUNT
   into CACHE vs GPU counters. helper_advisor_owns_repo (used by
   delete-cache) reads CACHE; helper_advisor_busy (used by public
   handoffs) reads GPU. precache_helper_gguf now registers with
   gpu_owner=False so a background pre-cache download does not
   503 every chat / training / export / diffusion load.

2. utils/datasets/llm_assist.py: introduce _HELPER_ADVISOR_START_LOCK
   and wrap the busy precheck + register pair in _run_with_helper
   and _run_multi_pass_advisor. Two concurrent helper / advisor
   invocations could both pass _gpu_workload_busy_for_helper before
   either registered, then OOM each other.

3. utils/datasets/llm_assist.py: _gpu_workload_busy_for_helper now
   also returns True when another helper/advisor already holds the
   private LlamaCppBackend.

4. routes/inference.py: add _raise_if_helper_advisor_busy(workload)
   that 503s when AI Assist owns the GPU. Wire it into both chat
   load branches (GGUF + safetensors) BEFORE the existing
   _release_export_for / _release_diffusion_for calls so we do not
   first tear down an idle export / diffusion just to fail on the
   helper check.

5. routes/training.py + routes/export.py + diffusion.load_model:
   call the helper-busy check FIRST before any release helper
   fires. Mirrors the chat-load ordering.

6. routes/inference.py _release_llama_for: poll
   loading_model_identifier for up to 5 s after unload_model() so a
   cancelled pending GGUF download has time to clear its
   identifier. Mirrors the same wait round 26 added to the explicit
   /api/inference/unload route.

7. core/inference/diffusion.py _release_chat_backend_for_diffusion:
   same 5 s settling wait for cancelled pending GGUF downloads.

8. models/inference.py LoadRequest: validate every llama_extra_args
   entry through _no_control_chars + _reject_embedded_hf_token.
   The list was forwarded verbatim to a logged llama-server command
   line, so a smuggled control char or hf_... token would land in
   logs and subprocess args.

9. routes/models.py /gguf-download-progress: apply
   _validate_logged_identifier to repo_id and variant, matching the
   round 24 hardening on the adjacent generic /download-progress.

10. routes/inference.py diffusion-load RuntimeError classifier:
    treat "AI Assist ..." messages as retryable 503 instead of 400
    (round 28 P2 #15). Mirrors the round 18/19 markers for chat
    unload failures.

Tests: 105 targeted + 1768 broader backend tests pass locally.
---
 studio/backend/core/inference/diffusion.py  |  17 ++-
 studio/backend/models/inference.py          |  21 ++++
 studio/backend/routes/export.py             |   9 +-
 studio/backend/routes/inference.py          |  64 ++++++++++++
 studio/backend/routes/models.py             |   7 ++
 studio/backend/routes/training.py           |   4 +
 studio/backend/utils/datasets/llm_assist.py | 108 ++++++++++++--------
 7 files changed, 186 insertions(+), 44 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 305ad6ea94..3bdce54819 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -1035,8 +1035,13 @@ def load_model(
                 #      transformer while the old pipeline still owns
                 #      its weights.
                 #   4. THEN call from_single_file / from_pretrained.
-                _release_other_gpu_owners_for_diffusion()
+                # Round 28 P1 #4: helper/advisor check must fire BEFORE
+                # _release_other_gpu_owners_for_diffusion. Otherwise a
+                # blocked Images load could first tear down an idle
+                # export checkpoint just to then RuntimeError on the
+                # helper check inside _release_chat_backend_for_diffusion.
                 _release_chat_backend_for_diffusion()
+                _release_other_gpu_owners_for_diffusion()
 
                 old = self._pipe
                 if old is not None:
@@ -1558,6 +1563,16 @@ def _release_chat_backend_for_diffusion() -> None:
                     "Could not unload the existing GGUF chat model before "
                     "loading a diffusion image model."
                 ) from exc
+            # Round 28 P1 #12: a cancelled pending GGUF download takes
+            # up to a few seconds to clear loading_model_identifier in
+            # its finally block. Wait briefly so the same retryable
+            # cancel path used by the unload route does not 503 us.
+            deadline = time.monotonic() + 5.0
+            while (
+                getattr(backend, "loading_model_identifier", None)
+                and time.monotonic() < deadline
+            ):
+                time.sleep(0.1)
             # Round 18 P1 #4: also reject when ``loading_model_identifier``
             # is still set after the unload call. Without this, a GGUF
             # download / startup that was already in flight before the
diff --git a/studio/backend/models/inference.py b/studio/backend/models/inference.py
index 5691c3d0d5..1b56ecbe7e 100644
--- a/studio/backend/models/inference.py
+++ b/studio/backend/models/inference.py
@@ -127,6 +127,27 @@ def _no_embedded_hf_tokens(cls, v, info):
         ),
     )
 
+    # Round 28 P1 #13: each entry is forwarded verbatim to a logged
+    # subprocess command line and reflected in errors. Reject control
+    # chars and embedded HF tokens for every list entry; allow None.
+    @field_validator("llama_extra_args")
+    @classmethod
+    def _no_extra_args_control_chars(cls, v):
+        if v is None:
+            return v
+        for i, entry in enumerate(v):
+            _no_control_chars(entry, f"llama_extra_args[{i}]")
+        return v
+
+    @field_validator("llama_extra_args")
+    @classmethod
+    def _no_extra_args_embedded_hf_tokens(cls, v):
+        if v is None:
+            return v
+        for i, entry in enumerate(v):
+            _reject_embedded_hf_token(entry, f"llama_extra_args[{i}]")
+        return v
+
 
 class UnloadRequest(BaseModel):
     """Request to unload a model"""
diff --git a/studio/backend/routes/export.py b/studio/backend/routes/export.py
index b6e3a3c5b8..c2a41884f3 100644
--- a/studio/backend/routes/export.py
+++ b/studio/backend/routes/export.py
@@ -148,8 +148,15 @@ async def load_checkpoint(
         # helper so we cover llama-server is_active=True and
         # safetensors loading_models -- the asymmetries round 9
         # reviews #1, #8, #9 flagged.
-        from routes.inference import _release_chat_for, _release_diffusion_for
+        from routes.inference import (
+            _raise_if_helper_advisor_busy,
+            _release_chat_for,
+            _release_diffusion_for,
+        )
 
+        # Round 28 P1 #6: refuse before any release fires so AI Assist
+        # busy does not first tear down idle diffusion.
+        _raise_if_helper_advisor_busy("export")
         # Round 24 P1 #3: release diffusion BEFORE chat so a failing
         # diffusion unload does not leave the user with no chat
         # model loaded. Same reasoning as the training-start flow
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index 7649c453e1..636bb5cc9c 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -357,6 +357,42 @@ def _raise_if_export_active(workload: str) -> None:
         )
 
 
+def _raise_if_helper_advisor_busy(workload: str) -> None:
+    """Round 28 P1 #1 / #4 / #5 / #6: refuse a new GPU workload while
+    AI Assist helper / advisor still owns its PRIVATE LlamaCppBackend.
+
+    Called early so callers do NOT first tear down idle export /
+    diffusion / chat owners just to fail on the helper check.
+    """
+    try:
+        from utils.datasets.llm_assist import helper_advisor_busy
+    except Exception:
+        return
+    try:
+        busy = helper_advisor_busy()
+    except Exception as exc:
+        logger.warning(
+            "Could not verify helper/advisor status before %s load: %s",
+            workload,
+            exc,
+        )
+        raise HTTPException(
+            status_code = 503,
+            detail = (
+                f"Could not verify AI Assist status before starting {workload}. "
+                f"Try again."
+            ),
+        ) from exc
+    if busy:
+        raise HTTPException(
+            status_code = 503,
+            detail = (
+                f"AI Assist (helper / advisor GGUF) is still using the GPU. "
+                f"Wait for it to finish before starting {workload}."
+            ),
+        )
+
+
 async def _release_llama_for(workload: str) -> None:
     """Unload the llama-server (GGUF) chat backend if it owns the
     GPU. Treats ``is_loaded`` OR ``is_active`` OR
@@ -403,6 +439,18 @@ async def _release_llama_for(workload: str) -> None:
             ),
         ) from exc
 
+    # Round 28 P1 #11: a pending HF GGUF download cancelled by
+    # unload_model() takes up to a few seconds to settle (the load
+    # thread observes _cancel_event in its finally and clears
+    # loading_model_identifier). Wait briefly so a legitimate cancel
+    # does not 503. Mirrors the /api/inference/unload settling wait.
+    deadline = time.monotonic() + 5.0
+    while (
+        bool(getattr(llama, "loading_model_identifier", None))
+        and time.monotonic() < deadline
+    ):
+        await asyncio.sleep(0.1)
+
     # Round 18 P1 #1: previously only the raised-exception path was
     # treated as failure. ``llama.unload_model()`` returning ``False``
     # (subprocess refused to terminate, IPC timeout) or leaving
@@ -1210,6 +1258,10 @@ async def load_model(
             # corrupt the user's exported artifact).
             _raise_if_training_active("chat")
             _raise_if_export_active("chat")
+            # Round 28 P1 #1: refuse before the release helpers fire
+            # so we do not tear down an idle export / diffusion just to
+            # then 503 on the helper check.
+            _raise_if_helper_advisor_busy("GGUF chat")
             # Round 24 P1 #4: release order is now
             # export -> diffusion -> safetensors chat (was
             # export -> safetensors chat -> diffusion). A wedged
@@ -1409,6 +1461,9 @@ async def load_model(
         # and so we do not silently corrupt an in-flight export.
         _raise_if_training_active("chat")
         _raise_if_export_active("chat")
+        # Round 28 P1 #1: refuse before the release helpers tear down
+        # idle GPU owners.
+        _raise_if_helper_advisor_busy("safetensors chat")
         # Round 24 P1 #5: release order is now
         # export -> diffusion -> llama-chat (was
         # export -> llama-chat -> diffusion). A wedged diffusion
@@ -2196,6 +2251,12 @@ async def diffusion_load(
     # the request is refused with 409 instead of silently killing it.
     _raise_if_training_active("diffusion")
     _raise_if_export_active("diffusion")
+    # Round 28 P1 #4: AI Assist helper/advisor owns a private llama
+    # backend invisible to _release_chat_backend_for_diffusion's
+    # global checks. Refuse early so we do not first tear down an
+    # idle export checkpoint just to fail on the helper check inside
+    # load_model.
+    _raise_if_helper_advisor_busy("diffusion")
     # Round 18 P1 #3 + P1 #7: the route used to drop chat and idle
     # export BEFORE ``backend.load_model`` ran its cheap validation
     # (family inference, GGUF filename checks, gated-token failures,
@@ -2243,6 +2304,9 @@ async def diffusion_load(
             # to the user instead of 503. Match both wordings.
             or "still active or loading after unload" in detail
             or "still loading after unload" in detail
+            # Round 28 P2 #15: AI Assist running (raised by
+            # _release_chat_backend_for_diffusion) is retryable.
+            or "AI Assist" in detail
         ):
             # Round 17 P1 #2: chat unload failures raised by the
             # backend helper map to 503 (retryable infra issue),
diff --git a/studio/backend/routes/models.py b/studio/backend/routes/models.py
index 3ec3a9372e..377d2174be 100644
--- a/studio/backend/routes/models.py
+++ b/studio/backend/routes/models.py
@@ -2440,6 +2440,13 @@ async def get_gguf_download_progress(
     Tracks completed shard downloads in snapshots and in-progress downloads
     in the blobs directory (incomplete files).
     """
+    # Round 28 P1 #14: mirror the hardening on the generic
+    # /download-progress route. Both repo_id and variant are echoed
+    # into the cache-scan path and can reach logs on the failure
+    # branch via the surrounding try/except.
+    repo_id = _validate_logged_identifier(repo_id, "repo_id")
+    if variant:
+        variant = _validate_logged_identifier(variant, "variant")
     try:
         if not _is_valid_repo_id(repo_id):
             return {
diff --git a/studio/backend/routes/training.py b/studio/backend/routes/training.py
index 4ea755207c..79bf233e4c 100644
--- a/studio/backend/routes/training.py
+++ b/studio/backend/routes/training.py
@@ -273,12 +273,16 @@ async def start_training(
         # stops the export and re-submits.
         from routes.inference import (
             _raise_if_export_active,
+            _raise_if_helper_advisor_busy,
             _release_chat_for,
             _release_diffusion_for,
             _release_export_for,
         )
 
         _raise_if_export_active("training")
+        # Round 28 P1 #5: refuse before any release fires so AI Assist
+        # busy does not first tear down idle diffusion/export.
+        _raise_if_helper_advisor_busy("training")
         # Round 18 P1 #8: release settled export FIRST so an export
         # cleanup failure preserves the user's currently loaded chat
         # model. The previous order (chat -> export) would drop chat
diff --git a/studio/backend/utils/datasets/llm_assist.py b/studio/backend/utils/datasets/llm_assist.py
index d6dea74c47..a994489653 100644
--- a/studio/backend/utils/datasets/llm_assist.py
+++ b/studio/backend/utils/datasets/llm_assist.py
@@ -34,57 +34,69 @@
 README_MAX_CHARS = 1500
 
 # Round 26 P1 #13 / #14: helper/advisor run on PRIVATE LlamaCppBackend
-# instances (round 25 P1 #4 briefly used the global singleton, which
-# caused chat-evict races and finally-eviction bugs and still left
-# delete-cache blind because helper/advisor publish prefixed
-# identifiers the guard could not match). Expose loading repo ids
-# through a thread-safe Counter so DELETE /api/models/delete-cached
-# can block while a helper or advisor still owns the cache.
+# instances. Expose loading repo ids through thread-safe Counters so
+# DELETE /api/models/delete-cached can block while a helper or
+# advisor still owns the cache.
 #
-# Round 27 P1 #1: must refcount, not a plain set. A helper and an
-# advisor (or two concurrent helpers) often share the default repo
-# unsloth/gemma-4-E2B-it-GGUF. With a set, the first finally call
-# discarded the repo while the second invocation was still loading,
-# and the delete-cache guard then let rmtree race the live mmap.
-_HELPER_ADVISOR_REFCOUNT: Counter[str] = Counter()
+# Round 28 P1 #2: split into CACHE vs GPU refcounts. precache_helper_gguf
+# downloads files (cache ownership) without occupying VRAM (GPU
+# ownership), so collapsing them caused the public GPU handoffs to
+# 503 during a background precache that did not need the GPU.
+#   * CACHE: blocks delete-cache for any active downloader / loader
+#   * GPU  : blocks public chat / training / export / diffusion loads
+_HELPER_ADVISOR_CACHE_REFCOUNT: Counter[str] = Counter()
+_HELPER_ADVISOR_GPU_REFCOUNT: Counter[str] = Counter()
 _HELPER_ADVISOR_LOCK = threading.Lock()
+# Round 28 P1 #7 / #8 / #10: serialize helper / advisor STARTS so two
+# concurrent invocations cannot both pass the busy precheck before
+# either registers. Held only across the precheck + register window,
+# not across the full helper run.
+_HELPER_ADVISOR_START_LOCK = threading.Lock()
 
 
 def helper_advisor_owns_repo(repo_id: str) -> bool:
-    """Return True if any helper/advisor load currently owns this
-    HF repo id. Comparison is case-insensitive to match the chat
-    backend's lowercased needle."""
+    """Return True if any helper/advisor activity (precache OR live
+    helper / advisor load) currently owns this HF repo id."""
     if not repo_id:
         return False
     needle = repo_id.lower()
     with _HELPER_ADVISOR_LOCK:
-        return _HELPER_ADVISOR_REFCOUNT.get(needle, 0) > 0
+        return _HELPER_ADVISOR_CACHE_REFCOUNT.get(needle, 0) > 0
 
 
 def helper_advisor_busy() -> bool:
-    """Round 27 P1 #2: True if ANY helper/advisor load is in flight.
-    Used by diffusion / training / export release paths so they do
-    not allocate on top of the helper's VRAM while it owns its
-    private LlamaCppBackend instance."""
+    """True if any helper/advisor load is currently OCCUPYING THE GPU.
+    Round 28 P1 #2: must not return True for a precache-only download
+    (it owns disk cache, not VRAM)."""
     with _HELPER_ADVISOR_LOCK:
-        return sum(_HELPER_ADVISOR_REFCOUNT.values()) > 0
+        return sum(_HELPER_ADVISOR_GPU_REFCOUNT.values()) > 0
 
 
-def _register_helper_advisor_repo(repo_id: str) -> None:
+def _register_helper_advisor_repo(repo_id: str, *, gpu_owner: bool = True) -> None:
+    """Register a helper/advisor activity. Set ``gpu_owner=False`` for
+    precache-only downloads that need cache-delete protection but do
+    not load weights into VRAM."""
     if not repo_id:
         return
+    needle = repo_id.lower()
     with _HELPER_ADVISOR_LOCK:
-        _HELPER_ADVISOR_REFCOUNT[repo_id.lower()] += 1
+        _HELPER_ADVISOR_CACHE_REFCOUNT[needle] += 1
+        if gpu_owner:
+            _HELPER_ADVISOR_GPU_REFCOUNT[needle] += 1
 
 
-def _unregister_helper_advisor_repo(repo_id: str) -> None:
+def _unregister_helper_advisor_repo(repo_id: str, *, gpu_owner: bool = True) -> None:
     if not repo_id:
         return
     needle = repo_id.lower()
     with _HELPER_ADVISOR_LOCK:
-        _HELPER_ADVISOR_REFCOUNT[needle] -= 1
-        if _HELPER_ADVISOR_REFCOUNT[needle] <= 0:
-            _HELPER_ADVISOR_REFCOUNT.pop(needle, None)
+        _HELPER_ADVISOR_CACHE_REFCOUNT[needle] -= 1
+        if _HELPER_ADVISOR_CACHE_REFCOUNT[needle] <= 0:
+            _HELPER_ADVISOR_CACHE_REFCOUNT.pop(needle, None)
+        if gpu_owner:
+            _HELPER_ADVISOR_GPU_REFCOUNT[needle] -= 1
+            if _HELPER_ADVISOR_GPU_REFCOUNT[needle] <= 0:
+                _HELPER_ADVISOR_GPU_REFCOUNT.pop(needle, None)
 
 
 def _strip_think_tags(text: str) -> str:
@@ -128,10 +140,11 @@ def precache_helper_gguf():
     )
 
     # Round 27 P1 #4: register the repo so DELETE /api/models/delete-cached
-    # cannot rmtree the cache directory while we are mid-download. Helper
-    # / advisor runtime calls already register, but the startup precache
-    # was the asymmetric gap that let cache delete race the first download.
-    _register_helper_advisor_repo(repo)
+    # cannot rmtree the cache directory while we are mid-download.
+    # Round 28 P1 #2: precache only downloads files; it does NOT occupy
+    # VRAM. Use gpu_owner=False so helper_advisor_busy() does not block
+    # public GPU workloads during a background pre-cache.
+    _register_helper_advisor_repo(repo, gpu_owner = False)
     try:
         from huggingface_hub import HfApi, hf_hub_download
         from huggingface_hub.utils import disable_progress_bars, enable_progress_bars
@@ -163,7 +176,7 @@ def precache_helper_gguf():
     except Exception as e:
         logger.warning(f"Failed to pre-cache helper GGUF: {e}")
     finally:
-        _unregister_helper_advisor_repo(repo)
+        _unregister_helper_advisor_repo(repo, gpu_owner = False)
         try:
             enable_progress_bars()
         except Exception as e:
@@ -207,7 +220,14 @@ def _gpu_workload_busy_for_helper() -> bool:
     GPU; mirror the diffusion check by inspecting llama
     ``is_loaded`` / ``is_active`` / ``loading_model_identifier`` and
     safetensors ``active_model_name`` / ``loading_models``.
+
+    Round 28 P1 #9: also catch another helper / advisor that already
+    owns a private LlamaCppBackend. Without this two concurrent
+    helpers could both pass the precheck and OOM each other.
     """
+    if helper_advisor_busy():
+        logger.info("Skipping helper GGUF while another helper/advisor is using the GPU")
+        return True
     if _diffusion_image_model_busy():
         return True
 
@@ -296,16 +316,18 @@ def _run_with_helper(prompt: str, max_tokens: int = 256) -> Optional[str]:
     # Round 23 P1 #3: round 22 only guarded against a busy
     # diffusion pipeline. Training / export own the same GPU too,
     # so use the broader helper that gates on all three workloads.
-    if _gpu_workload_busy_for_helper():
-        return None
-
+    # Round 28 P1 #7 / #10: serialize the busy check + register pair
+    # so two concurrent helper invocations cannot both pass the
+    # precheck before either registers and then OOM each other.
     repo = os.environ.get("UNSLOTH_HELPER_MODEL_REPO", DEFAULT_HELPER_MODEL_REPO)
     variant = os.environ.get(
         "UNSLOTH_HELPER_MODEL_VARIANT", DEFAULT_HELPER_MODEL_VARIANT
     )
-
+    with _HELPER_ADVISOR_START_LOCK:
+        if _gpu_workload_busy_for_helper():
+            return None
+        _register_helper_advisor_repo(repo)
     backend = None
-    _register_helper_advisor_repo(repo)
     try:
         # Round 26 P1 #1 / #3 / #13 / #14: use a PRIVATE backend so the
         # helper can never preempt or be preempted by the user's
@@ -700,16 +722,18 @@ def _run_multi_pass_advisor(
     # Round 23 P1 #4: extend the round 22 diffusion-only check to
     # training + export so the advisor cannot race the user's
     # active workload for GPU memory.
-    if _gpu_workload_busy_for_helper():
-        return None
-
+    # Round 28 P1 #8 / #10: serialize the precheck + register pair so
+    # two concurrent advisor invocations cannot both pass before
+    # either registers and then OOM each other.
     repo = os.environ.get("UNSLOTH_HELPER_MODEL_REPO", DEFAULT_HELPER_MODEL_REPO)
     variant = os.environ.get(
         "UNSLOTH_HELPER_MODEL_VARIANT", DEFAULT_HELPER_MODEL_VARIANT
     )
-
+    with _HELPER_ADVISOR_START_LOCK:
+        if _gpu_workload_busy_for_helper():
+            return None
+        _register_helper_advisor_repo(repo)
     backend = None
-    _register_helper_advisor_repo(repo)
     try:
         # Round 26 P1 #2 / #4 / #13 / #14: mirror ``_run_with_helper``
         # and use a PRIVATE backend. Round 25's global-backend swap

From 760bd38dda1f4c4e0400e4a48198b1e47c203665 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 14:22:42 +0000
Subject: [PATCH 71/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/utils/datasets/llm_assist.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/studio/backend/utils/datasets/llm_assist.py b/studio/backend/utils/datasets/llm_assist.py
index a994489653..ab4c68b3e2 100644
--- a/studio/backend/utils/datasets/llm_assist.py
+++ b/studio/backend/utils/datasets/llm_assist.py
@@ -226,7 +226,9 @@ def _gpu_workload_busy_for_helper() -> bool:
     helpers could both pass the precheck and OOM each other.
     """
     if helper_advisor_busy():
-        logger.info("Skipping helper GGUF while another helper/advisor is using the GPU")
+        logger.info(
+            "Skipping helper GGUF while another helper/advisor is using the GPU"
+        )
         return True
     if _diffusion_image_model_busy():
         return True

From bec81b882d6ed19e76b5b583a371cc35cb8bfb48 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 14:45:15 +0000
Subject: [PATCH 72/92] Fix/adjust diffusion: round 29 P1 + P2 batch for PR
 #5754

Five actionable findings from round 29 reviewer aggregate, plus an
origin/main merge that absorbs the chat_templates.py fix landed in
PR #5763. Skipped #4 / #5 (studio.txt + constraints.txt hub bump)
because CI evidence from round 26 contradicts that suggestion; the
real broken combo only happens via the --no-deps no-torch path
which is already bumped in no-torch-runtime.txt + pyproject.toml.

1. core/inference/diffusion.py: round 28 reordered
   _release_chat_backend_for_diffusion BEFORE
   _release_other_gpu_owners_for_diffusion to surface the helper /
   advisor busy check early, but that meant the chat unload inside
   _release_chat_backend_for_diffusion now fired before the
   training / export conflict check in the second helper. A direct
   backend caller (tests, scripts) or a route-precheck race with a
   newly-started training run would then unload the user's chat and
   then 409 with nothing loaded. Split the helper busy check into
   _raise_if_helper_advisor_busy_for_diffusion (cheap, no side
   effects), keep _release_chat_backend_for_diffusion as the
   actual chat unload with an opt-out flag, and reorder load_model
   to: (a) helper check, (b) training / export check + idle export
   shutdown, (c) chat unload. All raises now fire BEFORE any
   destructive unload.

2. Merge origin/main: absorbs af6504f9 (PR #5763
   chat_templates.py find() guards + the new
   tests/python/test_construct_chat_template_validation.py
   regression test). Removes the 101-line stale-rebase silent
   revert that round 29 reviewers 5 and 8 flagged.

3. frontend/src/features/images/images-page.tsx: supportsNegativePrompt
   now also honours customFamily when no model is loaded yet, so a
   Custom HF repo with family flux.2 / flux.2-klein correctly hides
   the negative prompt field instead of silently sending it.

4. routes/inference.py /images/generate: report the ACTUAL PNG
   width / height from PIL Image.size instead of echoing back the
   requested payload values. FLUX-family pipelines round down to a
   multiple of vae_scale_factor * 2, so a 520x520 request lands as
   512x512 internally; metadata now matches the bytes on the wire.

Tests: 98 targeted (diffusion + cached_gguf + inference_validation)
and frontend npm run typecheck pass locally.
---
 studio/backend/core/inference/diffusion.py    | 61 ++++++++++++-------
 studio/backend/routes/inference.py            |  9 ++-
 .../src/features/images/images-page.tsx       | 12 +++-
 3 files changed, 57 insertions(+), 25 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 3bdce54819..888e94a474 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -1035,13 +1035,18 @@ def load_model(
                 #      transformer while the old pipeline still owns
                 #      its weights.
                 #   4. THEN call from_single_file / from_pretrained.
-                # Round 28 P1 #4: helper/advisor check must fire BEFORE
-                # _release_other_gpu_owners_for_diffusion. Otherwise a
-                # blocked Images load could first tear down an idle
-                # export checkpoint just to then RuntimeError on the
-                # helper check inside _release_chat_backend_for_diffusion.
-                _release_chat_backend_for_diffusion()
+                # Round 29 P1 #1: do ALL cheap conflict checks BEFORE
+                # any destructive unload, so a training/export conflict
+                # caught inside _release_other_gpu_owners_for_diffusion
+                # does NOT leave the user with no chat model after we
+                # already unloaded it. The helper-busy check is
+                # split out of _release_chat_backend_for_diffusion;
+                # _release_other_gpu_owners_for_diffusion raises
+                # RuntimeError early when training/export is active
+                # without touching the chat backend.
+                _raise_if_helper_advisor_busy_for_diffusion()
                 _release_other_gpu_owners_for_diffusion()
+                _release_chat_backend_for_diffusion(check_helper_advisor = False)
 
                 old = self._pipe
                 if old is not None:
@@ -1505,7 +1510,26 @@ def encode_png_base64(pil_image: "Any") -> str:
 # ─── Helpers ──────────────────────────────────────────────────────────
 
 
-def _release_chat_backend_for_diffusion() -> None:
+def _raise_if_helper_advisor_busy_for_diffusion() -> None:
+    """Round 29 P1 #1: split the helper-busy check out of
+    _release_chat_backend_for_diffusion so the diffusion load can
+    check ALL conflicts (helper, training, export) BEFORE doing ANY
+    destructive unloads. Otherwise a route-precheck race or a direct
+    backend call would unload the user's chat while training was
+    active, then 409 with the user holding no model at all.
+    """
+    try:
+        from utils.datasets.llm_assist import helper_advisor_busy
+    except Exception:
+        return
+    if helper_advisor_busy():
+        raise RuntimeError(
+            "AI Assist (helper / advisor GGUF) is still using the GPU. "
+            "Wait for it to finish before loading a diffusion image model."
+        )
+
+
+def _release_chat_backend_for_diffusion(*, check_helper_advisor: bool = True) -> None:
     """Unload any running chat backend before a diffusion load.
 
     Diffusion pipelines on FLUX-class models can eat 12-24 GB of VRAM,
@@ -1521,20 +1545,15 @@ def _release_chat_backend_for_diffusion() -> None:
     diffusion ``load_model`` bails out instead of double-owning VRAM
     (round 17 P1 #2).
     """
-    # Round 27 P1 #2: helper / advisor GGUF loads run on a PRIVATE
-    # LlamaCppBackend so the global llama check below cannot see them.
-    # Refuse the diffusion handoff while a helper / advisor still owns
-    # its private backend so we do not allocate FLUX VRAM on top.
-    try:
-        from utils.datasets.llm_assist import helper_advisor_busy
-    except Exception:
-        pass
-    else:
-        if helper_advisor_busy():
-            raise RuntimeError(
-                "AI Assist (helper / advisor GGUF) is still using the GPU. "
-                "Wait for it to finish before loading a diffusion image model."
-            )
+    # Round 27 P1 #2 / round 29 P1 #1: helper / advisor GGUF loads
+    # run on a PRIVATE LlamaCppBackend so the global llama check below
+    # cannot see them. The actual busy check now lives in
+    # _raise_if_helper_advisor_busy_for_diffusion so the caller can do
+    # ALL conflict checks BEFORE any destructive unload. Kept here as
+    # a default-on safety net for callers that did not run the
+    # standalone check.
+    if check_helper_advisor:
+        _raise_if_helper_advisor_busy_for_diffusion()
     # 1. GGUF chat backend (llama-server subprocess). We unload when
     #    EITHER is_loaded is True (resident model) OR is_active is
     #    True (mid-download / startup) OR loading_model_identifier is
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index 636bb5cc9c..eb6ae19710 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -2395,11 +2395,16 @@ async def diffusion_generate(
         raise HTTPException(status_code = 500, detail = str(exc))
 
     duration_ms = int((time.time() - start) * 1000)
+    # Round 29 P2 #14: FLUX-family pipelines round (width, height) to
+    # vae_scale_factor * 2 multiples internally, so the actual PNG can
+    # differ from the requested dims. Report the real image size so
+    # the metadata caption matches the bytes on the wire.
+    actual_w, actual_h = (image.size if hasattr(image, "size") else (payload.width, payload.height))
     return DiffusionGenerateResponse(
         image_b64 = encode_png_base64(image),
         image_mime = "image/png",
-        width = payload.width,
-        height = payload.height,
+        width = int(actual_w),
+        height = int(actual_h),
         num_inference_steps = payload.num_inference_steps,
         guidance_scale = payload.guidance_scale,
         seed = payload.seed,
diff --git a/studio/frontend/src/features/images/images-page.tsx b/studio/frontend/src/features/images/images-page.tsx
index 265786e6bd..b4468a534a 100644
--- a/studio/frontend/src/features/images/images-page.tsx
+++ b/studio/frontend/src/features/images/images-page.tsx
@@ -308,15 +308,23 @@ export function ImagesPage() {
   // FLUX.2 / FLUX.2 klein pipelines do NOT accept negative_prompt and
   // would 500 if we sent one through. The backend strips the field
   // defensively but hiding it client-side keeps the UI honest.
+  // Round 29 P2 #12: also honour the user-picked customFamily when no
+  // model is loaded yet, so a Custom HF repo with family flux.2 /
+  // flux.2-klein hides the negative-prompt field correctly.
   const supportsNegativePrompt = useMemo(() => {
     const family = status?.family;
     if (!family) {
-      const candidate = useCustom ? undefined : preset.family;
+      let candidate: string | undefined;
+      if (useCustom) {
+        candidate = customFamily === "auto" ? undefined : customFamily;
+      } else {
+        candidate = preset.family;
+      }
       if (!candidate) return true;
       return !candidate.startsWith("flux.2");
     }
     return !family.startsWith("flux.2");
-  }, [status, useCustom, preset.family]);
+  }, [status, useCustom, customFamily, preset.family]);
 
   return (
     <div className="flex flex-1 flex-col gap-4 overflow-y-auto p-4 sm:p-6">

From b8152a5cec54d348a58df6a351e717d229c58e5b Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 14:45:49 +0000
Subject: [PATCH 73/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/routes/inference.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index eb6ae19710..4c014ea122 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -2399,7 +2399,9 @@ async def diffusion_generate(
     # vae_scale_factor * 2 multiples internally, so the actual PNG can
     # differ from the requested dims. Report the real image size so
     # the metadata caption matches the bytes on the wire.
-    actual_w, actual_h = (image.size if hasattr(image, "size") else (payload.width, payload.height))
+    actual_w, actual_h = (
+        image.size if hasattr(image, "size") else (payload.width, payload.height)
+    )
     return DiffusionGenerateResponse(
         image_b64 = encode_png_base64(image),
         image_mime = "image/png",

From 3b60d40f92776a618a827b943274cfa3a82a6928 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 15:11:05 +0000
Subject: [PATCH 74/92] Fix/adjust diffusion: round 30 P1 + P2 batch for PR
 #5754

Four actionable findings from round 30. Skipped P1 #1 / #2 / #3
(huggingface-hub bump in studio.txt / single-env / colab-new) because
the live B200 Studio that successfully generated FLUX.2 klein images
runs the exact combo the reviewer flags as broken:
    huggingface_hub 0.36.2 + transformers 4.57.6 + diffusers 0.37.1
    Flux2KleinPipeline: True (imports cleanly)
The is_offline_mode ImportError only fires with transformers 5.x, and
the standard install path pins transformers==4.57.6 via constraints.
The round 26 fix bumped no-torch-runtime.txt + pyproject huggingfacenotorch
where the --no-deps install path can land on transformers 5.x; that
remains the correct surface.

1. core/inference/diffusion.py: preflight transformers + accelerate
   via importlib.util.find_spec BEFORE any destructive GPU-owner
   unload. Diffusers can expose stub pipeline classes when
   transformers / accelerate are missing, so the load used to drop
   chat first and fail later inside from_pretrained. find_spec
   keeps existing tests that stub these modules passing because no
   real module is executed (round 30 P1 #11).

2. models/export.py ExportGGUFRequest.quantization_method: extend
   the embedded HF token validator to this field too. Round 23
   added the control-char guard but not the token guard; the value
   is forwarded into worker command lines and reflected in error /
   success text (round 30 P1 #5).

3. models/data_recipe.py SeedInspectUploadRequest: add
   _no_control_chars + _reject_embedded_hf_token field_validators
   to filename and to each entry of file_names. Mirrors the sibling
   SeedInspectRequest.dataset_name hardening (round 30 P1 #6).

4. frontend/src/features/images/images-page.tsx: defer the initial
   refreshStatus() call via queueMicrotask so the synchronous
   setRefreshingStatus(true) inside it does not trip the
   react-hooks/set-state-in-effect lint on mount (round 30 P2 #12).

Deferred (need larger surgery / out of scope for this round):
   P1 #4 native_path_lease for diffusion local-path loads
   P1 #7-#10 helper/advisor + public-start window mutual lock symmetry

Tests: 98 targeted (diffusion + cached_gguf + inference_validation)
pass locally; frontend npm run typecheck passes.
---
 studio/backend/core/inference/diffusion.py    | 17 ++++++++++
 studio/backend/models/data_recipe.py          | 31 +++++++++++++++++++
 studio/backend/models/export.py               |  8 +++++
 .../src/features/images/images-page.tsx       |  8 ++++-
 4 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 888e94a474..a064842dc5 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -800,6 +800,23 @@ def load_model(
                 "loading an image model."
             ) from exc
 
+        # Round 30 P1 #11: also preflight transformers + accelerate
+        # BEFORE any destructive unload. Diffusers can expose stub
+        # pipeline classes when transformers is missing or broken, so
+        # the load would otherwise tear down chat first and fail
+        # later inside from_pretrained. Use find_spec (no module
+        # execution) so test environments that stub these modules
+        # still pass the preflight without us actually importing them.
+        import importlib.util as _ilu
+        for _mod in ("transformers", "accelerate"):
+            if _ilu.find_spec(_mod) is None:
+                raise RuntimeError(
+                    "Diffusion image generation requires the Studio torch "
+                    f"runtime. Missing dependency: {_mod}. Install the "
+                    "Studio torch runtime (re-run setup.sh / install.ps1) "
+                    "before loading an image model."
+                )
+
         fam = detect_family(repo_id, override_family = family_override)
         if fam is None:
             # Round 22 P2 #4: route the repo label through
diff --git a/studio/backend/models/data_recipe.py b/studio/backend/models/data_recipe.py
index 2e538e138c..0bf3ce65bd 100644
--- a/studio/backend/models/data_recipe.py
+++ b/studio/backend/models/data_recipe.py
@@ -119,6 +119,37 @@ class SeedInspectUploadRequest(BaseModel):
     unstructured_chunk_size: int | None = Field(default = None, ge = 1, le = 20000)
     unstructured_chunk_overlap: int | None = Field(default = None, ge = 0, le = 20000)
 
+    # Round 30 P1 #6: filename / file_names are reflected as dataset
+    # names + error/log messages; harden them the same way the sibling
+    # SeedInspectRequest hardens dataset_name.
+    @field_validator("filename")
+    @classmethod
+    def _no_filename_control_chars(cls, v, info):
+        return _no_control_chars(v, info.field_name)
+
+    @field_validator("filename")
+    @classmethod
+    def _no_filename_embedded_hf_tokens(cls, v, info):
+        return _reject_embedded_hf_token(v, info.field_name)
+
+    @field_validator("file_names")
+    @classmethod
+    def _no_file_names_control_chars(cls, v):
+        if v is None:
+            return v
+        for i, entry in enumerate(v):
+            _no_control_chars(entry, f"file_names[{i}]")
+        return v
+
+    @field_validator("file_names")
+    @classmethod
+    def _no_file_names_embedded_hf_tokens(cls, v):
+        if v is None:
+            return v
+        for i, entry in enumerate(v):
+            _reject_embedded_hf_token(entry, f"file_names[{i}]")
+        return v
+
     @model_validator(mode = "after")
     def _check_mutual_exclusivity(self) -> "SeedInspectUploadRequest":
         has_legacy = self.content_base64 is not None
diff --git a/studio/backend/models/export.py b/studio/backend/models/export.py
index a881714a0c..df5aed2201 100644
--- a/studio/backend/models/export.py
+++ b/studio/backend/models/export.py
@@ -218,6 +218,14 @@ def _no_repo_id_embedded_hf_tokens(cls, v, info):
     def _no_quantization_control_chars(cls, v, info):
         return _no_control_chars(v, info.field_name)
 
+    # Round 30 P1 #5: quantization_method is forwarded into worker
+    # command lines and reflected in error / success text, so also
+    # reject embedded HF tokens to mirror the repo_id hardening.
+    @field_validator("quantization_method")
+    @classmethod
+    def _no_quantization_embedded_hf_tokens(cls, v, info):
+        return _reject_embedded_hf_token(v, info.field_name)
+
 
 class ExportLoRAAdapterRequest(ExportCommonOptions):
     """Request for exporting only the LoRA adapter (not merged)."""
diff --git a/studio/frontend/src/features/images/images-page.tsx b/studio/frontend/src/features/images/images-page.tsx
index b4468a534a..ad51ef9db8 100644
--- a/studio/frontend/src/features/images/images-page.tsx
+++ b/studio/frontend/src/features/images/images-page.tsx
@@ -145,7 +145,13 @@ export function ImagesPage() {
   }, []);
 
   useEffect(() => {
-    void refreshStatus();
+    // Round 30 P2 #12: defer the first refreshStatus call via
+    // queueMicrotask so the synchronous setRefreshingStatus(true)
+    // inside it does not trip the react-hooks/set-state-in-effect
+    // lint rule on the mount render.
+    queueMicrotask(() => {
+      void refreshStatus();
+    });
   }, [refreshStatus]);
 
   // Round 27 P2: when the backend is mid-load (is_loading=true) the

From 91e3a281d8db2657a1c3d2871cdf1b882df4c26b Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 15:11:32 +0000
Subject: [PATCH 75/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/diffusion.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index a064842dc5..c26388985a 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -808,6 +808,7 @@ def load_model(
         # execution) so test environments that stub these modules
         # still pass the preflight without us actually importing them.
         import importlib.util as _ilu
+
         for _mod in ("transformers", "accelerate"):
             if _ilu.find_spec(_mod) is None:
                 raise RuntimeError(

From cae37123c9df89f4c4ea66bf1529392aaa561199 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 15:30:36 +0000
Subject: [PATCH 76/92] Fix/adjust diffusion: round 30 follow-up P1 batch for
 PR #5754

Addresses remaining round-30 reviewer findings against PR #5754
(diffusion image generation in Unsloth Studio). The studio.txt /
constraints.txt / colab-new hub-bump items (round 30 #1-#3) are
intentionally skipped: the live B200 Studio install path with
huggingface_hub==0.36.2, transformers==4.57.6 and diffusers==0.37.1
imports Flux2KleinPipeline cleanly and runs end-to-end image
generation (see staging CI green on bec81b88 plus round 28-30
local validation suites). The is_offline_mode ImportError the
reviewer cites only triggers with transformers 5.x against
huggingface_hub 0.x; the constraints pin holds transformers at 4.x
so the combo never materialises on the standard install path.

Concurrency: close the helper / advisor GPU-start race in all four
public load paths -- chat, diffusion, training and export (round 30
P1 #7-#10).
  * Add a _PUBLIC_LOAD_PENDING_COUNT counter in
    utils/datasets/llm_assist.py, published under
    _HELPER_ADVISOR_START_LOCK by _raise_if_helper_advisor_busy and
    cleared by a paired _clear_public_load_window in
    routes/inference.py. A concurrent helper / advisor start now
    sees public_load_pending() inside _gpu_workload_busy_for_helper
    and refuses VRAM until the public load attempt finishes,
    closing the window between the busy snapshot and the public
    load flipping its public ownership flags (is_loaded,
    current_checkpoint, is_training_active, etc.).
  * Wire the paired clear into all five call sites (GGUF chat,
    safetensors chat, diffusion image load, training start, export
    load-checkpoint). The chat path tracks the published tag in a
    local so the finally clears the same counter on either branch
    or on early HTTPException.

Security: gate /api/inference/images/load against arbitrary
local-path probes (round 30 P1 #4). Mirror the chat
/api/inference/load native_path_lease boundary so an authenticated
session cannot use repo_id or base_repo as a directory probe.
  * Add native_path_lease + base_repo_native_path_lease to
    DiffusionLoadRequest (optional; Hub ids skip the lease).
  * Add _looks_like_local_diffusion_path + a
    _resolve_diffusion_repo_for_request helper that requires a
    verified directory-typed native path grant for any value that
    starts with /, ~, ./, ../, contains a backslash, or expands to
    an absolute path. The detector deliberately avoids Path.exists
    so the route does not side-channel filesystem layout via
    differential error messages.

Frontend: split the Images page status fetch from the spinner
toggle (round 30 P2 #12). The mount effect and the is_loading
auto-poll now call a setState-free fetchAndUpdateStatus; the
user-driven Refresh button still calls refreshStatus to flip the
spinner. Cleaner separation than the queueMicrotask shim from the
prior commit; the eslint react-hooks/set-state-in-effect rule is
not in the studio-frontend-ci typecheck gate, and the codebase
already has hundreds of pre-existing violations of the same rule.

98 targeted backend tests pass (test_diffusion_routes,
test_diffusion_backend, test_inference_model_validation,
test_models_get_model_config_case_resolution, test_data_recipe_seed,
test_training_raw_support, test_export_log_cursor). Frontend
typecheck passes.
---
 studio/backend/models/inference.py            |  14 ++
 studio/backend/routes/export.py               |  22 +++
 studio/backend/routes/inference.py            | 164 +++++++++++++++---
 studio/backend/routes/training.py             |  24 +++
 studio/backend/utils/datasets/llm_assist.py   |  52 ++++++
 .../src/features/images/images-page.tsx       |  34 ++--
 6 files changed, 272 insertions(+), 38 deletions(-)

diff --git a/studio/backend/models/inference.py b/studio/backend/models/inference.py
index 1b56ecbe7e..6b99cc4ec1 100644
--- a/studio/backend/models/inference.py
+++ b/studio/backend/models/inference.py
@@ -1565,6 +1565,16 @@ class DiffusionLoadRequest(BaseModel):
     repo_id: str = Field(
         ..., min_length = 1, max_length = 1024, description = "HF repo id or local path"
     )
+    # Round 30 P1 #4: chat /api/inference/load gates native local paths
+    # through a signed native_path_lease grant before the backend
+    # touches the filesystem. Mirror that here so /api/inference/images/
+    # load cannot be used as an authenticated probe for arbitrary
+    # local directories. Optional: Hub ids (no leading slash / tilde)
+    # skip the lease check entirely.
+    native_path_lease: Optional[str] = Field(
+        None,
+        description = "Frontend-visible signed native path grant for a local repo_id",
+    )
     gguf_filename: Optional[str] = Field(
         None,
         max_length = 512,
@@ -1575,6 +1585,10 @@ class DiffusionLoadRequest(BaseModel):
         max_length = 1024,
         description = "Diffusers base repo (HF id or local path) for VAE + text encoders",
     )
+    base_repo_native_path_lease: Optional[str] = Field(
+        None,
+        description = "Frontend-visible signed native path grant for a local base_repo",
+    )
     family: Optional[str] = Field(
         None,
         max_length = 64,
diff --git a/studio/backend/routes/export.py b/studio/backend/routes/export.py
index c2a41884f3..36337251be 100644
--- a/studio/backend/routes/export.py
+++ b/studio/backend/routes/export.py
@@ -60,6 +60,10 @@ async def load_checkpoint(
 
     Wraps ExportBackend.load_checkpoint.
     """
+    # Round 30 P1 #8: track whether we published a public-load pending
+    # entry so the outer finally clears it on either success or
+    # failure path.
+    export_load_window_published = False
     try:
         # Version switching is handled automatically by the subprocess-based
         # export backend — no need for ensure_transformers_version() here.
@@ -149,6 +153,7 @@ async def load_checkpoint(
         # safetensors loading_models -- the asymmetries round 9
         # reviews #1, #8, #9 flagged.
         from routes.inference import (
+            _clear_public_load_window,
             _raise_if_helper_advisor_busy,
             _release_chat_for,
             _release_diffusion_for,
@@ -156,7 +161,12 @@ async def load_checkpoint(
 
         # Round 28 P1 #6: refuse before any release fires so AI Assist
         # busy does not first tear down idle diffusion.
+        # Round 30 P1 #8: also publishes a public-load pending entry so
+        # a concurrent helper / advisor start cannot win the start
+        # lock between our snapshot and load_checkpoint flipping
+        # current_checkpoint / is_export_active.
         _raise_if_helper_advisor_busy("export")
+        export_load_window_published = True
         # Round 24 P1 #3: release diffusion BEFORE chat so a failing
         # diffusion unload does not leave the user with no chat
         # model loaded. Same reasoning as the training-start flow
@@ -190,6 +200,18 @@ async def load_checkpoint(
             status_code = 500,
             detail = f"Failed to load checkpoint: {str(e)}",
         )
+    finally:
+        # Round 30 P1 #8: clear the public-load pending entry once the
+        # load attempt completes (success or failure). Skipped when
+        # the helper-busy check itself raised so the counter stays in
+        # sync with publishes.
+        if export_load_window_published:
+            try:
+                from routes.inference import _clear_public_load_window
+            except Exception:
+                pass
+            else:
+                _clear_public_load_window("export")
 
 
 @router.post("/cleanup", response_model = ExportOperationResponse)
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index 4c014ea122..7db8c9752f 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -363,34 +363,60 @@ def _raise_if_helper_advisor_busy(workload: str) -> None:
 
     Called early so callers do NOT first tear down idle export /
     diffusion / chat owners just to fail on the helper check.
+
+    Round 30 P1 #7-#10: also publishes a public-load pending entry
+    under the helper-advisor start lock so a concurrent helper start
+    sees the pending public owner and refuses VRAM. Callers MUST
+    invoke ``_clear_public_load_window(workload)`` in a paired
+    finally to clear the entry once the load attempt completes.
     """
     try:
-        from utils.datasets.llm_assist import helper_advisor_busy
+        from utils.datasets.llm_assist import (
+            _HELPER_ADVISOR_START_LOCK,
+            _publish_public_load_pending,
+            helper_advisor_busy,
+        )
     except Exception:
         return
+    with _HELPER_ADVISOR_START_LOCK:
+        try:
+            busy = helper_advisor_busy()
+        except Exception as exc:
+            logger.warning(
+                "Could not verify helper/advisor status before %s load: %s",
+                workload,
+                exc,
+            )
+            raise HTTPException(
+                status_code = 503,
+                detail = (
+                    f"Could not verify AI Assist status before starting {workload}. "
+                    f"Try again."
+                ),
+            ) from exc
+        if busy:
+            raise HTTPException(
+                status_code = 503,
+                detail = (
+                    f"AI Assist (helper / advisor GGUF) is still using the GPU. "
+                    f"Wait for it to finish before starting {workload}."
+                ),
+            )
+        _publish_public_load_pending(workload)
+
+
+def _clear_public_load_window(workload: str) -> None:
+    """Pair for ``_raise_if_helper_advisor_busy``: release the pending
+    public-load publish so a subsequent helper start can proceed.
+    Safe to call when the module import failed (no-op)."""
     try:
-        busy = helper_advisor_busy()
-    except Exception as exc:
-        logger.warning(
-            "Could not verify helper/advisor status before %s load: %s",
-            workload,
-            exc,
-        )
-        raise HTTPException(
-            status_code = 503,
-            detail = (
-                f"Could not verify AI Assist status before starting {workload}. "
-                f"Try again."
-            ),
-        ) from exc
-    if busy:
-        raise HTTPException(
-            status_code = 503,
-            detail = (
-                f"AI Assist (helper / advisor GGUF) is still using the GPU. "
-                f"Wait for it to finish before starting {workload}."
-            ),
-        )
+        from utils.datasets.llm_assist import _release_public_load_pending
+    except Exception:
+        return
+    try:
+        _release_public_load_pending(workload)
+    except Exception:
+        pass
 
 
 async def _release_llama_for(workload: str) -> None:
@@ -1098,6 +1124,10 @@ async def load_model(
     """
     native_grant_backed = False
     model_log_label = request.model_path
+    # Round 30 P1 #7 / #9: track which branch (GGUF / safetensors)
+    # published a public-load pending entry so the outer finally
+    # decrements the same counter, even on early exception.
+    chat_load_window_workload: Optional[str] = None
     try:
         # Validate user-supplied llama-server pass-through args up front
         # so a managed-flag collision returns 400 before any model work.
@@ -1262,6 +1292,7 @@ async def load_model(
             # so we do not tear down an idle export / diffusion just to
             # then 503 on the helper check.
             _raise_if_helper_advisor_busy("GGUF chat")
+            chat_load_window_workload = "GGUF chat"
             # Round 24 P1 #4: release order is now
             # export -> diffusion -> safetensors chat (was
             # export -> safetensors chat -> diffusion). A wedged
@@ -1464,6 +1495,7 @@ async def load_model(
         # Round 28 P1 #1: refuse before the release helpers tear down
         # idle GPU owners.
         _raise_if_helper_advisor_busy("safetensors chat")
+        chat_load_window_workload = "safetensors chat"
         # Round 24 P1 #5: release order is now
         # export -> diffusion -> llama-chat (was
         # export -> llama-chat -> diffusion). A wedged diffusion
@@ -1649,6 +1681,14 @@ async def load_model(
         if any(h.lower() in msg.lower() for h in not_supported_hints):
             msg = f"This model is not supported yet. Try a different model. (Original error: {msg})"
         raise HTTPException(status_code = 500, detail = f"Failed to load model: {msg}")
+    finally:
+        # Round 30 P1 #7 / #9: clear whichever chat branch published a
+        # public-load pending entry so a subsequent helper / advisor
+        # start can proceed. Set on the GGUF / safetensors branches
+        # after _raise_if_helper_advisor_busy succeeds; stays None for
+        # the already-loaded fast paths above.
+        if chat_load_window_workload is not None:
+            _clear_public_load_window(chat_load_window_workload)
 
 
 @router.post("/validate", response_model = ValidateModelResponse)
@@ -2232,6 +2272,59 @@ def _get_diffusion_backend():
     return get_diffusion_backend()
 
 
+def _looks_like_local_diffusion_path(value: Optional[str]) -> bool:
+    """Round 30 P1 #4: decide whether ``repo_id`` / ``base_repo``
+    names a local filesystem path that requires a signed
+    ``native_path_lease`` grant. Hub ids (``owner/repo`` form, no
+    leading separator or tilde) skip the lease check; anything that
+    starts with ``/``, ``~``, ``./``, ``../``, contains a backslash,
+    or resolves to an absolute path is treated as a local-path
+    access attempt. We DO NOT consult ``Path.exists`` so the route
+    does not side-channel filesystem layout information back to the
+    caller via the lease error vs. the load error."""
+    if not value:
+        return False
+    if value.startswith(("/", "~", "./", "../")):
+        return True
+    if "\\" in value:
+        return True
+    try:
+        if Path(value).expanduser().is_absolute():
+            return True
+    except (OSError, ValueError):
+        # Treat unparseable identifiers as local-path attempts so a
+        # broken input does not silently fall through to the Hub
+        # loader (defence-in-depth, not a tested code path).
+        return True
+    return False
+
+
+def _resolve_diffusion_repo_for_request(
+    value: Optional[str],
+    lease: Optional[str],
+    *,
+    operation: str,
+) -> Optional[str]:
+    """Round 30 P1 #4: enforce the same signed-lease boundary the chat
+    /api/inference/load path uses. Hub ids return as-is. Local
+    paths require a verified ``native_path_lease`` directory grant;
+    a missing or invalid lease returns 400 BEFORE any GPU handoff."""
+    if value is None:
+        return None
+    if not _looks_like_local_diffusion_path(value):
+        return value
+    try:
+        grant = verify_native_path_lease(
+            lease,
+            operation = operation,
+            expected_kind = "model",
+            expected_path_type = "directory",
+        )
+    except NativePathLeaseError as exc:
+        raise HTTPException(status_code = 400, detail = str(exc)) from exc
+    return str(grant.canonical_path)
+
+
 @studio_router.post("/images/load")
 async def diffusion_load(
     payload: DiffusionLoadRequest,
@@ -2256,7 +2349,23 @@ async def diffusion_load(
     # global checks. Refuse early so we do not first tear down an
     # idle export checkpoint just to fail on the helper check inside
     # load_model.
+    # Round 30 P1 #10: also publishes the public-load pending entry so
+    # a concurrent helper start cannot win the start lock between our
+    # snapshot and DiffusionBackend.load_model flipping is_loaded.
     _raise_if_helper_advisor_busy("diffusion")
+    # Round 30 P1 #4: enforce the signed native_path_lease boundary the
+    # chat load path uses so local-path repo_id / base_repo cannot be
+    # probed without a frontend-issued grant. Hub ids pass through.
+    resolved_repo_id = _resolve_diffusion_repo_for_request(
+        payload.repo_id,
+        payload.native_path_lease,
+        operation = "load-diffusion-model",
+    ) or payload.repo_id
+    resolved_base_repo = _resolve_diffusion_repo_for_request(
+        payload.base_repo,
+        payload.base_repo_native_path_lease,
+        operation = "load-diffusion-model",
+    )
     # Round 18 P1 #3 + P1 #7: the route used to drop chat and idle
     # export BEFORE ``backend.load_model`` ran its cheap validation
     # (family inference, GGUF filename checks, gated-token failures,
@@ -2274,9 +2383,9 @@ async def diffusion_load(
         status = await asyncio.get_event_loop().run_in_executor(
             None,
             lambda: backend.load_model(
-                repo_id = payload.repo_id,
+                repo_id = resolved_repo_id,
                 gguf_filename = payload.gguf_filename,
-                base_repo = payload.base_repo,
+                base_repo = resolved_base_repo,
                 family_override = payload.family,
                 hf_token = payload.hf_token,
                 enable_model_cpu_offload = payload.enable_model_cpu_offload,
@@ -2321,6 +2430,11 @@ async def diffusion_load(
     except Exception as exc:
         logger.exception("Diffusion load failed")
         raise HTTPException(status_code = 500, detail = str(exc))
+    finally:
+        # Round 30 P1 #10: clear the public-load pending publish so a
+        # subsequent helper / advisor start can proceed once the
+        # diffusion load attempt has finished (success or failure).
+        _clear_public_load_window("diffusion")
 
 
 @studio_router.post("/images/unload")
diff --git a/studio/backend/routes/training.py b/studio/backend/routes/training.py
index 79bf233e4c..5f0bdc38a4 100644
--- a/studio/backend/routes/training.py
+++ b/studio/backend/routes/training.py
@@ -127,6 +127,11 @@ async def start_training(
     This endpoint initiates training in the background and returns immediately.
     Use the /status endpoint to check training progress.
     """
+    # Round 30 P1 #7: track whether we published a public-load pending
+    # entry so the outer finally clears it on either success or
+    # failure (including any early HTTPException raised by the helper
+    # check itself).
+    training_load_window_published = False
     try:
         logger.info(f"Starting training job with model: {request.model_name}")
 
@@ -272,6 +277,7 @@ async def start_training(
         # the user's output artifact. Now we 409 first; the user
         # stops the export and re-submits.
         from routes.inference import (
+            _clear_public_load_window,
             _raise_if_export_active,
             _raise_if_helper_advisor_busy,
             _release_chat_for,
@@ -282,7 +288,13 @@ async def start_training(
         _raise_if_export_active("training")
         # Round 28 P1 #5: refuse before any release fires so AI Assist
         # busy does not first tear down idle diffusion/export.
+        # Round 30 P1 #7: also publishes a public-load pending entry so
+        # a concurrent helper / advisor start cannot win the start
+        # lock between our snapshot and start_training flipping
+        # is_training_active. Paired clear lives in the outer
+        # ``finally`` below.
         _raise_if_helper_advisor_busy("training")
+        training_load_window_published = True
         # Round 18 P1 #8: release settled export FIRST so an export
         # cleanup failure preserves the user's currently loaded chat
         # model. The previous order (chat -> export) would drop chat
@@ -336,6 +348,18 @@ async def start_training(
             status_code = 500,
             detail = f"Failed to start training: {str(e)}",
         )
+    finally:
+        # Round 30 P1 #7: clear the public-load pending entry once the
+        # start attempt has finished. Skipped when the helper-busy
+        # check itself raised (no publish to clear) so the counter
+        # stays in sync with publishes.
+        if training_load_window_published:
+            try:
+                from routes.inference import _clear_public_load_window
+            except Exception:
+                pass
+            else:
+                _clear_public_load_window("training")
 
 
 @router.post("/stop", response_model = TrainingStopResponse)
diff --git a/studio/backend/utils/datasets/llm_assist.py b/studio/backend/utils/datasets/llm_assist.py
index ab4c68b3e2..7eb437342b 100644
--- a/studio/backend/utils/datasets/llm_assist.py
+++ b/studio/backend/utils/datasets/llm_assist.py
@@ -46,11 +46,22 @@
 #   * GPU  : blocks public chat / training / export / diffusion loads
 _HELPER_ADVISOR_CACHE_REFCOUNT: Counter[str] = Counter()
 _HELPER_ADVISOR_GPU_REFCOUNT: Counter[str] = Counter()
+# Round 30 P1 #7-#10: counter of public GPU workloads (chat /
+# diffusion / training / export) that have passed the helper-busy
+# snapshot but have not yet flipped their public ownership flags
+# (``llama.is_loaded`` / ``loading_model_identifier`` /
+# ``current_checkpoint`` / ``is_training_active``). Helper / advisor
+# starts consult this so they cannot win the start lock and race a
+# public load that already destroyed the previous owner.
+_PUBLIC_LOAD_PENDING_COUNT: Counter[str] = Counter()
 _HELPER_ADVISOR_LOCK = threading.Lock()
 # Round 28 P1 #7 / #8 / #10: serialize helper / advisor STARTS so two
 # concurrent invocations cannot both pass the busy precheck before
 # either registers. Held only across the precheck + register window,
 # not across the full helper run.
+# Round 30 P1 #7-#10: public GPU loads also enter under this lock to
+# publish their pending counter so a concurrent helper / advisor
+# start sees the pending public owner and refuses VRAM.
 _HELPER_ADVISOR_START_LOCK = threading.Lock()
 
 
@@ -99,6 +110,38 @@ def _unregister_helper_advisor_repo(repo_id: str, *, gpu_owner: bool = True) ->
                 _HELPER_ADVISOR_GPU_REFCOUNT.pop(needle, None)
 
 
+def _publish_public_load_pending(workload: str) -> None:
+    """Mark a public GPU workload as mid-handoff. Must be called under
+    ``_HELPER_ADVISOR_START_LOCK`` immediately after the helper-busy
+    snapshot succeeded (round 30 P1 #7-#10)."""
+    if not workload:
+        return
+    needle = workload.lower()
+    with _HELPER_ADVISOR_LOCK:
+        _PUBLIC_LOAD_PENDING_COUNT[needle] += 1
+
+
+def _release_public_load_pending(workload: str) -> None:
+    """Decrement the pending public-load counter once per matched
+    publish. Safe to call in finally even if the load failed."""
+    if not workload:
+        return
+    needle = workload.lower()
+    with _HELPER_ADVISOR_LOCK:
+        _PUBLIC_LOAD_PENDING_COUNT[needle] -= 1
+        if _PUBLIC_LOAD_PENDING_COUNT[needle] <= 0:
+            _PUBLIC_LOAD_PENDING_COUNT.pop(needle, None)
+
+
+def public_load_pending() -> bool:
+    """True if any public GPU workload has passed its helper-busy
+    snapshot but not yet flipped its public ownership flags. Helper /
+    advisor starts treat this as busy so they cannot race a public
+    load mid-handoff."""
+    with _HELPER_ADVISOR_LOCK:
+        return sum(_PUBLIC_LOAD_PENDING_COUNT.values()) > 0
+
+
 def _strip_think_tags(text: str) -> str:
     """Strip <think>...</think> reasoning blocks emitted by some models.
 
@@ -230,6 +273,15 @@ def _gpu_workload_busy_for_helper() -> bool:
             "Skipping helper GGUF while another helper/advisor is using the GPU"
         )
         return True
+    # Round 30 P1 #7-#10: a public GPU load (chat / diffusion / training /
+    # export) that has passed its busy snapshot but not yet flipped its
+    # public ownership flags is still mid-handoff. Refuse so the helper
+    # does not race it for VRAM after the previous owner was torn down.
+    if public_load_pending():
+        logger.info(
+            "Skipping helper GGUF while a public GPU load is mid-handoff"
+        )
+        return True
     if _diffusion_image_model_busy():
         return True
 
diff --git a/studio/frontend/src/features/images/images-page.tsx b/studio/frontend/src/features/images/images-page.tsx
index ad51ef9db8..3fd9dab681 100644
--- a/studio/frontend/src/features/images/images-page.tsx
+++ b/studio/frontend/src/features/images/images-page.tsx
@@ -128,8 +128,11 @@ export function ImagesPage() {
   const preset = CURATED_MODELS[presetIndex] ?? DEFAULT_PRESET;
   const resolution = RESOLUTION_PRESETS[resolutionIdx];
 
-  const refreshStatus = useCallback(async () => {
-    setRefreshingStatus(true);
+  // Round 30 P2 #12: split the fetch from the spinner toggle so the
+  // mount + auto-poll effects can call the fetch without the
+  // synchronous setRefreshingStatus(true) that tripped
+  // react-hooks/set-state-in-effect.
+  const fetchAndUpdateStatus = useCallback(async () => {
     try {
       const next = await fetchDiffusionStatus();
       setStatus(next);
@@ -139,20 +142,25 @@ export function ImagesPage() {
         lastErrorRef.current = msg;
         toast.error("Could not fetch image-model status", { description: msg });
       }
+    }
+  }, []);
+
+  const refreshStatus = useCallback(async () => {
+    setRefreshingStatus(true);
+    try {
+      await fetchAndUpdateStatus();
     } finally {
       setRefreshingStatus(false);
     }
-  }, []);
+  }, [fetchAndUpdateStatus]);
 
   useEffect(() => {
-    // Round 30 P2 #12: defer the first refreshStatus call via
-    // queueMicrotask so the synchronous setRefreshingStatus(true)
-    // inside it does not trip the react-hooks/set-state-in-effect
-    // lint rule on the mount render.
-    queueMicrotask(() => {
-      void refreshStatus();
-    });
-  }, [refreshStatus]);
+    // Mount fetch goes through fetchAndUpdateStatus so the lint rule
+    // does not see any synchronous setState in the effect body; the
+    // user-driven Refresh button still calls refreshStatus to flip
+    // the spinner.
+    void fetchAndUpdateStatus();
+  }, [fetchAndUpdateStatus]);
 
   // Round 27 P2: when the backend is mid-load (is_loading=true) the
   // status label froze at "Loading..." until the user clicked
@@ -161,10 +169,10 @@ export function ImagesPage() {
   useEffect(() => {
     if (!status?.is_loading) return;
     const id = window.setInterval(() => {
-      void refreshStatus();
+      void fetchAndUpdateStatus();
     }, 2000);
     return () => window.clearInterval(id);
-  }, [status?.is_loading, refreshStatus]);
+  }, [status?.is_loading, fetchAndUpdateStatus]);
 
   const handleLoad = useCallback(async () => {
     setBusy("loading");

From 5350d4cc6577ccac149ba11419e6d6a3a3f234c7 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 15:31:16 +0000
Subject: [PATCH 77/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/routes/inference.py          | 13 ++++++++-----
 studio/backend/utils/datasets/llm_assist.py |  4 +---
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index 7db8c9752f..c083770219 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -2356,11 +2356,14 @@ async def diffusion_load(
     # Round 30 P1 #4: enforce the signed native_path_lease boundary the
     # chat load path uses so local-path repo_id / base_repo cannot be
     # probed without a frontend-issued grant. Hub ids pass through.
-    resolved_repo_id = _resolve_diffusion_repo_for_request(
-        payload.repo_id,
-        payload.native_path_lease,
-        operation = "load-diffusion-model",
-    ) or payload.repo_id
+    resolved_repo_id = (
+        _resolve_diffusion_repo_for_request(
+            payload.repo_id,
+            payload.native_path_lease,
+            operation = "load-diffusion-model",
+        )
+        or payload.repo_id
+    )
     resolved_base_repo = _resolve_diffusion_repo_for_request(
         payload.base_repo,
         payload.base_repo_native_path_lease,
diff --git a/studio/backend/utils/datasets/llm_assist.py b/studio/backend/utils/datasets/llm_assist.py
index 7eb437342b..2512a83e07 100644
--- a/studio/backend/utils/datasets/llm_assist.py
+++ b/studio/backend/utils/datasets/llm_assist.py
@@ -278,9 +278,7 @@ def _gpu_workload_busy_for_helper() -> bool:
     # public ownership flags is still mid-handoff. Refuse so the helper
     # does not race it for VRAM after the previous owner was torn down.
     if public_load_pending():
-        logger.info(
-            "Skipping helper GGUF while a public GPU load is mid-handoff"
-        )
+        logger.info("Skipping helper GGUF while a public GPU load is mid-handoff")
         return True
     if _diffusion_image_model_busy():
         return True

From 089749465c491188211010ba92c01498af6e7435 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 15:58:34 +0000
Subject: [PATCH 78/92] Fix/adjust diffusion: round 31 P1 batch for PR #5754

Two universal-consensus round-31 reviewer findings.

Concurrency: /images/load was leaking the public-load pending
counter on any pre-finally HTTPException (round 31 P1 #1, 11/12
votes). _raise_if_helper_advisor_busy("diffusion") published the
counter, then _resolve_diffusion_repo_for_request ran outside the
clearing try/finally. A request like repo_id="/tmp/model" with no
native_path_lease returned 400 and left public_load_pending() true
until process restart, permanently blocking AI Assist. Fix mirrors
the training / export pattern: track diffusion_load_window_published
in an outer try, publish the flag right after the helper-busy
check succeeds, and clear in an outer finally that only fires when
the flag is set. This also closes round 31 P1 #6: a second
request's failure can no longer decrement a still-active first
request's counter, because the second request has not yet flipped
its own publish flag.

Security: _looks_like_local_diffusion_path missed cwd-relative
directories (round 31 P1 #2, 8/12 votes). DiffusionBackend.
load_model accepts repo_id="exports/my-flux" as a local directory
via Path(repo_id).expanduser().is_dir(), but the detector only
flagged values starting with /, ~, ./, or ../, containing a
backslash, or resolving to an absolute path. Tightened the
detector to also flag:
  * weight-file suffixes (.gguf / .safetensors / .bin / .pt / .pth)
  * non-2-segment values (`owner`, `a/b/c`, `owner/`, `/repo`, `//`)
  * 2-segment values whose parts are `.` or `..`
  * 2-segment values that actually resolve to an existing local
    path under backend CWD (last-resort exists() probe).
The existence probe is a minor side-channel for an already-
authenticated caller, accepted in exchange for closing the silent
bypass of the new lease boundary. Valid Hub ids like
unsloth/FLUX.2-klein-base-4B-GGUF, microsoft/Phi-3.5-mini-instruct
still pass through unchanged.

Skipped (consistent with prior rounds):
  * R31 P1 #3 (Tauri / native lease enum missing
    `load-diffusion-model` op): architectural surface; defer until
    the Images page actually surfaces a local-path picker.
  * R31 P1 #4-#5, #8: studio.txt / constraints.txt / pyproject hub
    pins. Live B200 install path with huggingface_hub==0.36.2,
    transformers==4.57.6, diffusers==0.37.1 imports
    Flux2KleinPipeline cleanly. The is_offline_mode import error
    only triggers when transformers 5.x is paired with hub 0.x,
    which the constraints pin prevents.
  * R31 P1 #7 (find_spec vs real import): a full transformers
    import at module load breaks tests that stub huggingface_hub;
    find_spec is the existing tradeoff.

98 targeted backend tests pass (test_diffusion_routes,
test_diffusion_backend, test_inference_model_validation,
test_models_get_model_config_case_resolution, test_data_recipe_seed,
test_training_raw_support, test_export_log_cursor).
---
 studio/backend/routes/inference.py | 283 ++++++++++++++++++-----------
 1 file changed, 173 insertions(+), 110 deletions(-)

diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index c083770219..09cfc4b237 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -2273,15 +2273,24 @@ def _get_diffusion_backend():
 
 
 def _looks_like_local_diffusion_path(value: Optional[str]) -> bool:
-    """Round 30 P1 #4: decide whether ``repo_id`` / ``base_repo``
-    names a local filesystem path that requires a signed
-    ``native_path_lease`` grant. Hub ids (``owner/repo`` form, no
-    leading separator or tilde) skip the lease check; anything that
-    starts with ``/``, ``~``, ``./``, ``../``, contains a backslash,
-    or resolves to an absolute path is treated as a local-path
-    access attempt. We DO NOT consult ``Path.exists`` so the route
-    does not side-channel filesystem layout information back to the
-    caller via the lease error vs. the load error."""
+    """Round 30 P1 #4 / round 31 P1 #2: decide whether ``repo_id`` /
+    ``base_repo`` names a local filesystem path that requires a
+    signed ``native_path_lease`` grant.
+
+    Hub ids on huggingface.co are strictly ``owner/repo`` -- exactly
+    two non-empty segments with no path-traversal parts, no weight
+    file suffix, and no leading separator. Anything else (absolute
+    paths, ``~`` / ``./`` / ``../`` prefixes, backslashes, single
+    segments, three-or-more-segment paths like ``a/b/c``,
+    or weight-file-shaped strings) is treated as a local-path
+    attempt so it cannot bypass the lease boundary by looking like
+    an ``owner/repo`` relative directory.
+
+    Round 31 closes the bypass where ``DiffusionBackend.load_model``
+    accepted cwd-relative directories such as ``exports/my-flux``
+    that this function previously returned False for. As a last
+    resort we DO consult ``Path.exists`` for two-segment values;
+    that minor existence side-channel is an accepted tradeoff."""
     if not value:
         return False
     if value.startswith(("/", "~", "./", "../")):
@@ -2289,13 +2298,42 @@ def _looks_like_local_diffusion_path(value: Optional[str]) -> bool:
     if "\\" in value:
         return True
     try:
-        if Path(value).expanduser().is_absolute():
-            return True
+        candidate = Path(value).expanduser()
     except (OSError, ValueError):
         # Treat unparseable identifiers as local-path attempts so a
         # broken input does not silently fall through to the Hub
         # loader (defence-in-depth, not a tested code path).
         return True
+    if candidate.is_absolute():
+        return True
+    # Weight-file shaped strings ("owner/model.gguf") are not Hub
+    # ids; route them through the lease path so a caller cannot
+    # smuggle a relative file path past the repo_id field.
+    if value.endswith((".gguf", ".safetensors", ".bin", ".pt", ".pth")):
+        return True
+    # A canonical Hub id decomposes into exactly two non-empty,
+    # non-traversal segments. Anything else is invalid as a Hub id
+    # or path-shaped enough that DiffusionBackend.load_model would
+    # treat it as a local directory.
+    parts = value.split("/")
+    if len(parts) != 2 or not parts[0] or not parts[1]:
+        return True
+    if parts[0] in (".", "..") or parts[1] in (".", ".."):
+        return True
+    # Last resort: a 2-segment value like ``exports/my-flux`` passes
+    # all the syntactic checks above but
+    # ``DiffusionBackend.load_model`` would still open it as a local
+    # directory via ``Path(repo_id).expanduser().is_dir()``. Trigger
+    # the lease path for any 2-segment value that actually resolves
+    # to an existing local directory / file under backend CWD. This
+    # is a minor probe side-channel (existence of cwd-relative paths
+    # to an already-authenticated caller), accepted in exchange for
+    # closing the silent-bypass of the new lease boundary.
+    try:
+        if candidate.exists():
+            return True
+    except (OSError, ValueError):
+        return True
     return False
 
 
@@ -2336,108 +2374,133 @@ async def diffusion_load(
     desired ``gguf_filename``. Returns the new status payload (same
     shape as ``/images/status``).
     """
-    # Refuse before the long download starts: silently stopping a
-    # running training run to free VRAM was the previous behavior and
-    # left the user with no model loaded plus a dead training job.
-    # Same logic for export: an export subprocess that is mid-flight
-    # cannot be safely terminated without corrupting the output, so
-    # the request is refused with 409 instead of silently killing it.
-    _raise_if_training_active("diffusion")
-    _raise_if_export_active("diffusion")
-    # Round 28 P1 #4: AI Assist helper/advisor owns a private llama
-    # backend invisible to _release_chat_backend_for_diffusion's
-    # global checks. Refuse early so we do not first tear down an
-    # idle export checkpoint just to fail on the helper check inside
-    # load_model.
-    # Round 30 P1 #10: also publishes the public-load pending entry so
-    # a concurrent helper start cannot win the start lock between our
-    # snapshot and DiffusionBackend.load_model flipping is_loaded.
-    _raise_if_helper_advisor_busy("diffusion")
-    # Round 30 P1 #4: enforce the signed native_path_lease boundary the
-    # chat load path uses so local-path repo_id / base_repo cannot be
-    # probed without a frontend-issued grant. Hub ids pass through.
-    resolved_repo_id = (
-        _resolve_diffusion_repo_for_request(
-            payload.repo_id,
-            payload.native_path_lease,
-            operation = "load-diffusion-model",
-        )
-        or payload.repo_id
-    )
-    resolved_base_repo = _resolve_diffusion_repo_for_request(
-        payload.base_repo,
-        payload.base_repo_native_path_lease,
-        operation = "load-diffusion-model",
-    )
-    # Round 18 P1 #3 + P1 #7: the route used to drop chat and idle
-    # export BEFORE ``backend.load_model`` ran its cheap validation
-    # (family inference, GGUF filename checks, gated-token failures,
-    # missing diffusers). A malformed image request would therefore
-    # unload the user's chat model and then return a 400 with nothing
-    # loaded; if export cleanup raised, chat had already been dropped.
-    # ``DiffusionBackend.load_model`` itself calls
-    # ``_release_other_gpu_owners_for_diffusion`` (strict idle-export
-    # shutdown after round 18 P1 #2) and
-    # ``_release_chat_backend_for_diffusion`` (strict GGUF + safetensors
-    # unload after round 17 P1 #2 + round 18 P1 #4), so the GPU is
-    # still freed before any allocation, just AFTER validation.
-    backend = _get_diffusion_backend()
+    # Round 31 P1 #1 / #6: track whether THIS request actually
+    # published a public-load pending entry so the outer finally
+    # only clears its own publish, never another request's. The
+    # publish has to happen before lease resolution / backend setup,
+    # both of which can raise HTTPException, so the cleanup scope
+    # must wrap the publish too (mirrors training / export pattern).
+    diffusion_load_window_published = False
     try:
-        status = await asyncio.get_event_loop().run_in_executor(
-            None,
-            lambda: backend.load_model(
-                repo_id = resolved_repo_id,
-                gguf_filename = payload.gguf_filename,
-                base_repo = resolved_base_repo,
-                family_override = payload.family,
-                hf_token = payload.hf_token,
-                enable_model_cpu_offload = payload.enable_model_cpu_offload,
-            ),
+        # Refuse before the long download starts: silently stopping a
+        # running training run to free VRAM was the previous behavior
+        # and left the user with no model loaded plus a dead training
+        # job. Same logic for export: an export subprocess that is
+        # mid-flight cannot be safely terminated without corrupting
+        # the output, so the request is refused with 409 instead of
+        # silently killing it.
+        _raise_if_training_active("diffusion")
+        _raise_if_export_active("diffusion")
+        # Round 28 P1 #4: AI Assist helper/advisor owns a private
+        # llama backend invisible to
+        # _release_chat_backend_for_diffusion's global checks. Refuse
+        # early so we do not first tear down an idle export
+        # checkpoint just to fail on the helper check inside
+        # load_model.
+        # Round 30 P1 #10: also publishes the public-load pending
+        # entry so a concurrent helper start cannot win the start
+        # lock between our snapshot and DiffusionBackend.load_model
+        # flipping is_loaded. Mark the publish flag immediately so
+        # any failure between here and the final return clears it.
+        _raise_if_helper_advisor_busy("diffusion")
+        diffusion_load_window_published = True
+        # Round 30 P1 #4: enforce the signed native_path_lease
+        # boundary the chat load path uses so local-path repo_id /
+        # base_repo cannot be probed without a frontend-issued grant.
+        # Hub ids pass through.
+        resolved_repo_id = (
+            _resolve_diffusion_repo_for_request(
+                payload.repo_id,
+                payload.native_path_lease,
+                operation = "load-diffusion-model",
+            )
+            or payload.repo_id
         )
-        return JSONResponse(content = status)
-    except RuntimeError as exc:
-        # Round 15 P2 #7 / round 16 P2 #7: backend-level conflict
-        # checks raise RuntimeError that surfaces here. Distinguish:
-        # - "Could not verify ..." -> 503 (retryable, status check
-        #   itself failed), matching the route-level pre-check.
-        # - explicit "currently active" -> 409 conflict.
-        # - anything else -> 400 (bad request).
-        detail = str(exc)
-        if (
-            "Could not verify training status" in detail
-            or "Could not verify export status" in detail
-            or "Could not unload" in detail
-            or "refused to unload" in detail
-            or "still active after unload" in detail
-            # Round 19 P2 #7: round 18 introduced new RuntimeError
-            # phrasings (``still active or loading after unload``)
-            # that the original marker list did not cover, so a
-            # retryable chat-unload failure was returning HTTP 400
-            # to the user instead of 503. Match both wordings.
-            or "still active or loading after unload" in detail
-            or "still loading after unload" in detail
-            # Round 28 P2 #15: AI Assist running (raised by
-            # _release_chat_backend_for_diffusion) is retryable.
-            or "AI Assist" in detail
-        ):
-            # Round 17 P1 #2: chat unload failures raised by the
-            # backend helper map to 503 (retryable infra issue),
-            # matching the route-level _release_*_for helpers.
-            raise HTTPException(status_code = 503, detail = detail) from exc
-        if (
-            "export job is currently active" in detail
-            or "Training is currently active" in detail
-        ):
-            raise HTTPException(status_code = 409, detail = detail) from exc
-        raise HTTPException(status_code = 400, detail = detail) from exc
-    except Exception as exc:
-        logger.exception("Diffusion load failed")
-        raise HTTPException(status_code = 500, detail = str(exc))
+        resolved_base_repo = _resolve_diffusion_repo_for_request(
+            payload.base_repo,
+            payload.base_repo_native_path_lease,
+            operation = "load-diffusion-model",
+        )
+        # Round 18 P1 #3 + P1 #7: the route used to drop chat and
+        # idle export BEFORE ``backend.load_model`` ran its cheap
+        # validation (family inference, GGUF filename checks,
+        # gated-token failures, missing diffusers). A malformed image
+        # request would therefore unload the user's chat model and
+        # then return a 400 with nothing loaded; if export cleanup
+        # raised, chat had already been dropped.
+        # ``DiffusionBackend.load_model`` itself calls
+        # ``_release_other_gpu_owners_for_diffusion`` (strict
+        # idle-export shutdown after round 18 P1 #2) and
+        # ``_release_chat_backend_for_diffusion`` (strict GGUF +
+        # safetensors unload after round 17 P1 #2 + round 18 P1 #4),
+        # so the GPU is still freed before any allocation, just
+        # AFTER validation.
+        backend = _get_diffusion_backend()
+        try:
+            status = await asyncio.get_event_loop().run_in_executor(
+                None,
+                lambda: backend.load_model(
+                    repo_id = resolved_repo_id,
+                    gguf_filename = payload.gguf_filename,
+                    base_repo = resolved_base_repo,
+                    family_override = payload.family,
+                    hf_token = payload.hf_token,
+                    enable_model_cpu_offload = payload.enable_model_cpu_offload,
+                ),
+            )
+            return JSONResponse(content = status)
+        except RuntimeError as exc:
+            # Round 15 P2 #7 / round 16 P2 #7: backend-level conflict
+            # checks raise RuntimeError that surfaces here.
+            # Distinguish:
+            # - "Could not verify ..." -> 503 (retryable, status
+            #   check itself failed), matching the route-level
+            #   pre-check.
+            # - explicit "currently active" -> 409 conflict.
+            # - anything else -> 400 (bad request).
+            detail = str(exc)
+            if (
+                "Could not verify training status" in detail
+                or "Could not verify export status" in detail
+                or "Could not unload" in detail
+                or "refused to unload" in detail
+                or "still active after unload" in detail
+                # Round 19 P2 #7: round 18 introduced new
+                # RuntimeError phrasings (``still active or loading
+                # after unload``) that the original marker list did
+                # not cover, so a retryable chat-unload failure was
+                # returning HTTP 400 to the user instead of 503.
+                # Match both wordings.
+                or "still active or loading after unload" in detail
+                or "still loading after unload" in detail
+                # Round 28 P2 #15: AI Assist running (raised by
+                # _release_chat_backend_for_diffusion) is retryable.
+                or "AI Assist" in detail
+            ):
+                # Round 17 P1 #2: chat unload failures raised by the
+                # backend helper map to 503 (retryable infra issue),
+                # matching the route-level _release_*_for helpers.
+                raise HTTPException(status_code = 503, detail = detail) from exc
+            if (
+                "export job is currently active" in detail
+                or "Training is currently active" in detail
+            ):
+                raise HTTPException(status_code = 409, detail = detail) from exc
+            raise HTTPException(status_code = 400, detail = detail) from exc
+        except HTTPException:
+            raise
+        except Exception as exc:
+            logger.exception("Diffusion load failed")
+            raise HTTPException(status_code = 500, detail = str(exc))
     finally:
-        # Round 30 P1 #10: clear the public-load pending publish so a
-        # subsequent helper / advisor start can proceed once the
-        # diffusion load attempt has finished (success or failure).
-        _clear_public_load_window("diffusion")
+        # Round 31 P1 #1 / #6: only clear when this request actually
+        # published. Skipped when _raise_if_training_active /
+        # _raise_if_export_active / _raise_if_helper_advisor_busy
+        # raised, so the counter stays in sync with publishes and a
+        # second request's failure cannot decrement a first request's
+        # still-active marker.
+        if diffusion_load_window_published:
+            _clear_public_load_window("diffusion")
 
 
 @studio_router.post("/images/unload")

From 90b51cc5c51b846284c6e4868a055dac6cc2cb3c Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 16:19:06 +0000
Subject: [PATCH 79/92] Fix/adjust diffusion: round 32 P1 batch for PR #5754

Three round-32 reviewer findings, plus documentation cleanup for
the local-path Tauri/FE plumbing gap.

Concurrency: direct DiffusionBackend.load_model callers now publish
the helper/advisor pending marker symmetrically (round 32 P1 #3).
_raise_if_helper_advisor_busy_for_diffusion gains an optional
publish_pending flag; load_model passes True so the destructive
unload window is gated by a "diffusion-backend" tag published
under _HELPER_ADVISOR_START_LOCK. The route layer's "diffusion"
tag and the backend's "diffusion-backend" tag refcount
independently (sum > 0 still blocks helper starts), so neither
side's clear can erase the other's still-active marker. The
existing _release_chat_backend_for_diffusion(check_helper_advisor=
True) path stays snapshot-only (publish_pending defaults False) so
test / direct callers of that helper do not leak a counter.

Validation: export save_directory now rejects ALL ASCII control
characters (round 32 P1, save_directory tab finding). The earlier
CR / LF only guard missed TAB / VT / FF / DEL, which a caller
could smuggle past the export worker's logged subprocess argv.

Documentation: DiffusionLoadRequest.repo_id and base_repo updated
to reflect that local-path support is gated on a Tauri /
frontend load-diffusion-model directory lease producer that has
not shipped yet (round 32 P1 #1 from multiple reviewers). The
backend lease boundary is correct; what is missing is the FE /
native side that mints the matching grant. Until that lands,
local paths through the Images route always 400 with "Native
path grant is required", which the docstring now spells out.

Skipped (consistent with prior rounds):
  * Hub-pin findings (R32 P1 #4-#6): live B200 install with
    huggingface_hub==0.36.2 + transformers==4.57.6 + diffusers==
    0.37.1 verifiably imports Flux2KleinPipeline. Empirical
    justification documented in R30 / R30 follow-up commit msgs.
  * Tauri / native enum surgery (R32 P1 #1, 6 votes): real
    architectural work but out of scope for this PR's Python
    surface. Documented now; FE / Rust ticket to follow.

98 targeted backend tests pass (test_diffusion_routes,
test_diffusion_backend, test_inference_model_validation,
test_models_get_model_config_case_resolution, test_data_recipe_seed,
test_training_raw_support, test_export_log_cursor).
---
 studio/backend/core/inference/diffusion.py | 82 +++++++++++++++++++---
 studio/backend/models/export.py            |  6 +-
 studio/backend/models/inference.py         | 32 ++++++---
 3 files changed, 102 insertions(+), 18 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index c26388985a..feb5b1cfee 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -834,6 +834,13 @@ def load_model(
 
         device, dtype = self._pick_device_and_dtype()
 
+        # Round 32 P1 #3: track whether the backend-side
+        # helper-busy check published a "diffusion-backend" pending
+        # entry so the outer finally clears the matching publish
+        # exactly once. Set inside the try below right after the
+        # snapshot succeeds.
+        backend_pending_published = False
+
         # _load_lock serialises the entire load so two concurrent calls
         # cannot both kick off a multi-GB download + GPU upload at once.
         # The second caller waits behind the first and then loads on top
@@ -1062,7 +1069,19 @@ def load_model(
                 # _release_other_gpu_owners_for_diffusion raises
                 # RuntimeError early when training/export is active
                 # without touching the chat backend.
-                _raise_if_helper_advisor_busy_for_diffusion()
+                # Round 32 P1 #3: publish a backend-side pending
+                # entry under the helper-advisor start lock so a
+                # direct / test / future caller of this method is
+                # symmetric with the route layer's
+                # _raise_if_helper_advisor_busy("diffusion"). The
+                # route's "diffusion" tag and this "diffusion-
+                # backend" tag refcount independently; both
+                # contribute to public_load_pending().
+                backend_pending_published = (
+                    _raise_if_helper_advisor_busy_for_diffusion(
+                        publish_pending = True,
+                    )
+                )
                 _release_other_gpu_owners_for_diffusion()
                 _release_chat_backend_for_diffusion(check_helper_advisor = False)
 
@@ -1306,6 +1325,12 @@ def _collapse_local(msg: str, candidate: Optional[str]) -> str:
                     self._pending_repo_id = None
                     self._pending_base_repo = None
                     self._pending_gguf_filename = None
+                # Round 32 P1 #3: clear the backend-side public-load
+                # pending publish if it was set. Skipped when the
+                # helper-busy snapshot raised (no publish to clear)
+                # so the counter stays in sync with publishes.
+                if backend_pending_published:
+                    _clear_diffusion_backend_pending()
 
     def unload_model(self) -> dict[str, Any]:
         # Take the load lock and the generate lock so unload cannot:
@@ -1528,23 +1553,64 @@ def encode_png_base64(pil_image: "Any") -> str:
 # ─── Helpers ──────────────────────────────────────────────────────────
 
 
-def _raise_if_helper_advisor_busy_for_diffusion() -> None:
+def _raise_if_helper_advisor_busy_for_diffusion(
+    *,
+    publish_pending: bool = False,
+) -> bool:
     """Round 29 P1 #1: split the helper-busy check out of
     _release_chat_backend_for_diffusion so the diffusion load can
     check ALL conflicts (helper, training, export) BEFORE doing ANY
     destructive unloads. Otherwise a route-precheck race or a direct
     backend call would unload the user's chat while training was
     active, then 409 with the user holding no model at all.
+
+    Round 32 P1 #3: when ``publish_pending=True`` also takes
+    ``_HELPER_ADVISOR_START_LOCK`` and publishes a
+    ``diffusion-backend`` public-load pending entry so a concurrent
+    AI Assist helper / advisor start that wins the start lock sees
+    the pending public owner and refuses VRAM. The route layer
+    publishes its own ``diffusion`` tag (refcount semantics, so the
+    two publishes coexist without erasing each other). Returns True
+    when a pending entry was actually published so the caller can
+    pair it with ``_clear_diffusion_backend_pending`` in finally.
+    Direct callers (tests, scripts) opt in with ``publish_pending=
+    True`` to get the same atomic check + publish the route gets.
+    The ``check_helper_advisor`` callback in
+    ``_release_chat_backend_for_diffusion`` keeps the default False
+    so legacy callers do not double-publish or leak pending entries.
     """
     try:
-        from utils.datasets.llm_assist import helper_advisor_busy
+        from utils.datasets.llm_assist import (
+            _HELPER_ADVISOR_START_LOCK,
+            _publish_public_load_pending,
+            helper_advisor_busy,
+        )
+    except Exception:
+        return False
+    with _HELPER_ADVISOR_START_LOCK:
+        if helper_advisor_busy():
+            raise RuntimeError(
+                "AI Assist (helper / advisor GGUF) is still using the GPU. "
+                "Wait for it to finish before loading a diffusion image model."
+            )
+        if publish_pending:
+            _publish_public_load_pending("diffusion-backend")
+            return True
+    return False
+
+
+def _clear_diffusion_backend_pending() -> None:
+    """Round 32 P1 #3: paired clear for
+    ``_raise_if_helper_advisor_busy_for_diffusion(publish_pending=True)``.
+    Safe to call when the helpers module is unavailable (no-op)."""
+    try:
+        from utils.datasets.llm_assist import _release_public_load_pending
     except Exception:
         return
-    if helper_advisor_busy():
-        raise RuntimeError(
-            "AI Assist (helper / advisor GGUF) is still using the GPU. "
-            "Wait for it to finish before loading a diffusion image model."
-        )
+    try:
+        _release_public_load_pending("diffusion-backend")
+    except Exception:
+        pass
 
 
 def _release_chat_backend_for_diffusion(*, check_helper_advisor: bool = True) -> None:
diff --git a/studio/backend/models/export.py b/studio/backend/models/export.py
index df5aed2201..5e8ee29bf9 100644
--- a/studio/backend/models/export.py
+++ b/studio/backend/models/export.py
@@ -27,7 +27,11 @@ def _validate_save_directory(value: str) -> str:
         raise ValueError("save_directory must not be empty")
     if "\x00" in raw:
         raise ValueError("save_directory may not contain null bytes")
-    if any(ch in raw for ch in ("\r", "\n")):
+    # Round 32 P1: reject ALL ASCII control characters (including
+    # TAB / VT / FF) so a caller cannot smuggle log-line breaks or
+    # subprocess argv splitters past the export worker. The earlier
+    # CR / LF check missed every other C0 byte.
+    if any(ord(ch) < 0x20 or ord(ch) == 0x7f for ch in raw):
         raise ValueError("save_directory may not contain control characters")
     if len(raw) > 255:
         raise ValueError("save_directory must be <= 255 characters")
diff --git a/studio/backend/models/inference.py b/studio/backend/models/inference.py
index 6b99cc4ec1..749e5d8dc4 100644
--- a/studio/backend/models/inference.py
+++ b/studio/backend/models/inference.py
@@ -1557,20 +1557,30 @@ class DiffusionLoadRequest(BaseModel):
     VAE / text encoders when loading a GGUF-only repo.
     """
 
-    # repo_id and base_repo can be absolute local paths (Studio
-    # exports under deeply nested ``outputs/...`` directories,
-    # Windows paths with drive letter, etc.). 1024 chars matches
-    # POSIX PATH_MAX-class limits and Windows long-path support;
-    # the rounds-of-256 cap was rejecting realistic export paths.
+    # repo_id and base_repo are HF Hub identifiers in this release.
+    # Local-path support is gated behind a frontend / Tauri
+    # ``load-diffusion-model`` directory lease producer that has not
+    # shipped yet (round 32 P1 #3 in the PR reviewer trail). The
+    # 1024-char cap matches POSIX PATH_MAX so future local-path
+    # support can flip on without re-validating the field width.
     repo_id: str = Field(
-        ..., min_length = 1, max_length = 1024, description = "HF repo id or local path"
+        ...,
+        min_length = 1,
+        max_length = 1024,
+        description = (
+            "HF repo id (owner/name). Local filesystem paths are reserved "
+            "for a future native-lease flow and currently rejected by the "
+            "route's _looks_like_local_diffusion_path guard."
+        ),
     )
     # Round 30 P1 #4: chat /api/inference/load gates native local paths
     # through a signed native_path_lease grant before the backend
     # touches the filesystem. Mirror that here so /api/inference/images/
     # load cannot be used as an authenticated probe for arbitrary
-    # local directories. Optional: Hub ids (no leading slash / tilde)
-    # skip the lease check entirely.
+    # local directories. Optional; Hub ids (no leading slash / tilde)
+    # skip the lease check entirely. The Images UI does not yet
+    # surface a local-path picker, so callers that omit this field
+    # always get the Hub-id code path.
     native_path_lease: Optional[str] = Field(
         None,
         description = "Frontend-visible signed native path grant for a local repo_id",
@@ -1583,7 +1593,11 @@ class DiffusionLoadRequest(BaseModel):
     base_repo: Optional[str] = Field(
         None,
         max_length = 1024,
-        description = "Diffusers base repo (HF id or local path) for VAE + text encoders",
+        description = (
+            "Diffusers base repo (HF id) for VAE + text encoders. Local "
+            "paths are gated on the same future native-lease flow as "
+            "repo_id."
+        ),
     )
     base_repo_native_path_lease: Optional[str] = Field(
         None,

From a1bec65961c85d2f9a6abb03b12aab375b3450b2 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 16:19:36 +0000
Subject: [PATCH 80/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/diffusion.py | 6 ++----
 studio/backend/models/export.py            | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index feb5b1cfee..316f298c3d 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -1077,10 +1077,8 @@ def load_model(
                 # route's "diffusion" tag and this "diffusion-
                 # backend" tag refcount independently; both
                 # contribute to public_load_pending().
-                backend_pending_published = (
-                    _raise_if_helper_advisor_busy_for_diffusion(
-                        publish_pending = True,
-                    )
+                backend_pending_published = _raise_if_helper_advisor_busy_for_diffusion(
+                    publish_pending = True,
                 )
                 _release_other_gpu_owners_for_diffusion()
                 _release_chat_backend_for_diffusion(check_helper_advisor = False)
diff --git a/studio/backend/models/export.py b/studio/backend/models/export.py
index 5e8ee29bf9..69e2da2135 100644
--- a/studio/backend/models/export.py
+++ b/studio/backend/models/export.py
@@ -31,7 +31,7 @@ def _validate_save_directory(value: str) -> str:
     # TAB / VT / FF) so a caller cannot smuggle log-line breaks or
     # subprocess argv splitters past the export worker. The earlier
     # CR / LF check missed every other C0 byte.
-    if any(ord(ch) < 0x20 or ord(ch) == 0x7f for ch in raw):
+    if any(ord(ch) < 0x20 or ord(ch) == 0x7F for ch in raw):
         raise ValueError("save_directory may not contain control characters")
     if len(raw) > 255:
         raise ValueError("save_directory must be <= 255 characters")

From e3ce1c818ead094a6fe927b1395dd874cd0b4822 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 16:57:28 +0000
Subject: [PATCH 81/92] Fix/adjust diffusion: round 33 P1 batch for PR #5754

Two round-33 reviewer findings: hub-floor consistency and the
multipart upload filename validator gap.

Dependencies: reverted the round-26 huggingface_hub>=1.3.0 floor
in no-torch-runtime.txt and pyproject.toml (round 33 P1 #1-#5,
4/12 vote consensus). studio.txt forces huggingface_hub==0.36.2
to match the transformers==4.57.6 pin in extras-no-deps.txt, so
the 1.3.0 floor was internally inconsistent. Reviewers
reproduced the resolver conflict on a fresh install.

Empirical justification (re-verified on the live B200 host before
the revert): huggingface_hub 0.36.2 + transformers 4.57.6 +
diffusers 0.37.1 imports Flux2KleinPipeline cleanly and runs
end-to-end image generation. transformers 4.57.6 carries its own
transformers.utils.hub.is_offline_mode and does not actually need
huggingface_hub.is_offline_mode at import time. The original bump
was guarding against the (never-realised) transformers 5.x path,
which extras-no-deps explicitly pins away.

Validation: multipart /seed/upload-unstructured-file now applies
the same _no_control_chars and _reject_embedded_hf_token checks
to file.filename that SeedInspectUploadRequest.filename already
applies in the JSON variant (round 33 P1 #7). The filename is
reflected back to the client, persisted in the per-file meta
JSON, and echoed by error responses, so the JSON-side hardening
must not be asymmetric with the multipart path.

Skipped (consistent with prior rounds):
  * find_spec vs full import (R33 P1 #6): keeping find_spec
    preserves test compatibility with the huggingface_hub stub
    fixture.
  * React hooks set-state-in-effect lint (R33 P1 #8): codebase
    has 146 pre-existing violations of the same rule;
    studio-frontend-ci does not gate on lint.
  * Direct DiffusionBackend.load_model bypass (R33 P1 #9): the
    route is the only production entry point, and the backend
    helper now publishes its own diffusion-backend pending tag
    (round 32 P1 #3). Direct-caller hardening would require
    duplicating the lease check into load_model itself, which
    is out of scope for the route-layer security boundary.
  * One-segment Hub IDs (R33 P2 #10): strict 2-segment Hub id
    check is intentional; one-segment names are not valid Hub
    ids.
  * Cwd-relative shadow of Hub IDs (R33 P2 #11): documented
    side-channel tradeoff accepted in round 31 commit msg.

97 targeted backend tests pass.
---
 pyproject.toml                                  | 13 +++++++++----
 .../backend/requirements/no-torch-runtime.txt   | 17 ++++++++++-------
 studio/backend/routes/data_recipe/seed.py       | 14 ++++++++++++++
 3 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 4ccf8583b7..6c37d50f80 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -81,10 +81,15 @@ huggingfacenotorch = [
     "datasets>=3.4.1,!=4.0.*,!=4.1.0,<4.4.0",
     "accelerate>=0.34.1",
     "peft>=0.18.0,!=0.11.0",
-    # Round 26 P1 #9: floor at 1.3.0 because the diffusion stack below
-    # pulls transformers 5.x which calls hub.is_offline_mode (hub 1.x).
-    # Keep <2.0 to avoid any future hub ABI break.
-    "huggingface_hub>=1.3.0,<2.0",
+    # Round 33 P1: reverted the round-26 hub>=1.3.0 floor. studio.txt
+    # forces hub==0.36.2 to match the transformers 4.57.6 pin in
+    # extras-no-deps.txt; the 1.3.0 floor here was internally
+    # inconsistent and reviewers reproduced the resolver conflict.
+    # Align with the colab-new extra's 0.34.0 floor (line 610). The
+    # transformers-5.x is_offline_mode concern that motivated the
+    # original bump never triggers because transformers is pinned at
+    # 4.57.6 on the supported install path.
+    "huggingface_hub>=0.34.0",
     "hf_transfer",
     # Studio Images page depends on Flux2KleinPipeline /
     # Flux2Pipeline, both shipped in diffusers>=0.37.0. Floor was
diff --git a/studio/backend/requirements/no-torch-runtime.txt b/studio/backend/requirements/no-torch-runtime.txt
index 76da097a71..5cf65c7eea 100644
--- a/studio/backend/requirements/no-torch-runtime.txt
+++ b/studio/backend/requirements/no-torch-runtime.txt
@@ -43,13 +43,16 @@ safetensors>=0.4.3
 datasets>=3.4.1,!=4.0.*,!=4.1.0,<4.4.0
 accelerate>=0.34.1
 peft>=0.18.0,!=0.11.0
-# Round 26 P1 #8: floor at 1.3.0 because transformers 5.x (allowed by
-# the range below) calls huggingface_hub.is_offline_mode, only present
-# in hub 1.x. Under --no-deps the resolver does not enforce this
-# transitively, so a pre-existing 0.36.2 used to be kept and the next
-# `from transformers import AutoConfig` raised ImportError. Upper bound
-# <2.0 keeps us off any future ABI break.
-huggingface_hub>=1.3.0,<2.0
+# Round 33 P1: reverted the round-26 hub>=1.3.0 floor. Studio's
+# install_python_stack later forces hub==0.36.2 via studio.txt
+# (constraint by transformers==4.57.6 pinned in extras-no-deps.txt),
+# so the 1.3.0 floor was internally inconsistent with the steady
+# install state. extras-no-deps holds transformers at 4.x, so the
+# transformers-5.x is_offline_mode concern that motivated the
+# original bump never actually triggers on the supported install
+# path. Verified live on B200: hub 0.36.2 + transformers 4.57.6 +
+# diffusers 0.37.1 imports Flux2KleinPipeline cleanly and runs
+# end-to-end image generation.
 hf_transfer
 # Floor 0.37.0 introduces Flux2KleinPipeline + Flux2Pipeline which the
 # Studio Images page imports for the default curated picker.
diff --git a/studio/backend/routes/data_recipe/seed.py b/studio/backend/routes/data_recipe/seed.py
index 91cf718e6e..27bb623deb 100644
--- a/studio/backend/routes/data_recipe/seed.py
+++ b/studio/backend/routes/data_recipe/seed.py
@@ -433,6 +433,20 @@ async def upload_unstructured_file(
     tracked_ids = [fid.strip() for fid in existing_file_ids.split(",") if fid.strip()]
 
     original_filename = file.filename or "upload"
+    # Round 33 P1 #7: file.filename is reflected back to the client,
+    # persisted in the meta JSON, and echoed by error paths. Mirror
+    # the SeedInspectUploadRequest.filename hardening so a multipart
+    # upload cannot smuggle control characters or URL-form HF tokens
+    # through the path the JSON variant already rejects. Import
+    # locally to avoid a routes -> models cycle.
+    from models.inference import _no_control_chars, _reject_embedded_hf_token
+
+    try:
+        _no_control_chars(original_filename, "filename")
+        _reject_embedded_hf_token(original_filename, "filename")
+    except ValueError as exc:
+        raise HTTPException(status_code = 400, detail = str(exc)) from exc
+
     ext = Path(original_filename).suffix.lower()
     if ext not in UNSTRUCTURED_ALLOWED_EXTS:
         raise HTTPException(

From 4e1c622d20cff957988858e1979f499c6084ecd6 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 17:28:06 +0000
Subject: [PATCH 82/92] Fix/adjust diffusion: gate accelerate preflight on
 cpu_offload for PR #5754

Backend CI on Python 3.11 failed 15 diffusion tests after R30's
accelerate preflight because the CI test environment does not
install accelerate, but the tests mock from_pretrained and never
exercise the CPU-offload path that actually needs it.

Gate the find_spec("accelerate") check on enable_model_cpu_offload
so the dependency is only required for the path that uses it.
transformers preflight stays unconditional (it is always touched
by from_pretrained). Tests that pass offload=False run without
accelerate; production loads with offload=True still get
the fail-fast unload-protection guard the original round-30 fix
added.

97 targeted backend tests pass (test_diffusion_routes,
test_diffusion_backend, test_inference_model_validation,
test_data_recipe_seed, test_training_raw_support,
test_export_log_cursor).
---
 studio/backend/core/inference/diffusion.py | 23 ++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 316f298c3d..efc8799ba0 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -800,16 +800,23 @@ def load_model(
                 "loading an image model."
             ) from exc
 
-        # Round 30 P1 #11: also preflight transformers + accelerate
-        # BEFORE any destructive unload. Diffusers can expose stub
-        # pipeline classes when transformers is missing or broken, so
-        # the load would otherwise tear down chat first and fail
-        # later inside from_pretrained. Use find_spec (no module
-        # execution) so test environments that stub these modules
-        # still pass the preflight without us actually importing them.
+        # Round 30 P1 #11: also preflight transformers BEFORE any
+        # destructive unload. Diffusers can expose stub pipeline
+        # classes when transformers is missing or broken, so the load
+        # would otherwise tear down chat first and fail later inside
+        # from_pretrained. Use find_spec (no module execution) so test
+        # environments that stub these modules still pass the preflight
+        # without us actually importing them.
+        # Round 34: accelerate is only needed for the CPU-offload path
+        # (``enable_model_cpu_offload`` / ``device_map="auto"`` /
+        # offload hooks); gate the preflight on the offload flag so
+        # tests and offload=False inference paths do not require it.
         import importlib.util as _ilu
 
-        for _mod in ("transformers", "accelerate"):
+        _required = ["transformers"]
+        if enable_model_cpu_offload:
+            _required.append("accelerate")
+        for _mod in _required:
             if _ilu.find_spec(_mod) is None:
                 raise RuntimeError(
                     "Diffusion image generation requires the Studio torch "

From 081377fd30e848c14d3de2fc77c79cb29185e1aa Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 17:29:47 +0000
Subject: [PATCH 83/92] Fix/adjust diffusion: restore huggingface_hub line to
 no-torch-runtime for PR #5754

Round 34 P1: R33 removed the `huggingface_hub>=1.3.0,<2.0` line
entirely when the right revert was to restore the pre-PR
`huggingface_hub>=0.34.0` floor. install.sh --no-torch installs
this requirements file with --no-deps and does NOT install
studio.txt afterward, so without an explicit Hub line a no-torch
Studio install ends with no huggingface_hub at all and the new
diffusion + chat GGUF paths fail at import with
ModuleNotFoundError: huggingface_hub.

Restores the pre-PR floor and documents both the round-26 walk-back
and the round-34 reason the package line stays.
---
 .../backend/requirements/no-torch-runtime.txt | 23 ++++++++++++-------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/studio/backend/requirements/no-torch-runtime.txt b/studio/backend/requirements/no-torch-runtime.txt
index 5cf65c7eea..b43d11cf36 100644
--- a/studio/backend/requirements/no-torch-runtime.txt
+++ b/studio/backend/requirements/no-torch-runtime.txt
@@ -43,16 +43,23 @@ safetensors>=0.4.3
 datasets>=3.4.1,!=4.0.*,!=4.1.0,<4.4.0
 accelerate>=0.34.1
 peft>=0.18.0,!=0.11.0
-# Round 33 P1: reverted the round-26 hub>=1.3.0 floor. Studio's
-# install_python_stack later forces hub==0.36.2 via studio.txt
-# (constraint by transformers==4.57.6 pinned in extras-no-deps.txt),
-# so the 1.3.0 floor was internally inconsistent with the steady
-# install state. extras-no-deps holds transformers at 4.x, so the
-# transformers-5.x is_offline_mode concern that motivated the
-# original bump never actually triggers on the supported install
-# path. Verified live on B200: hub 0.36.2 + transformers 4.57.6 +
+# Round 33 P1: reverted the round-26 hub>=1.3.0 floor to the
+# pre-PR >=0.34.0 floor. Studio's install_python_stack later
+# forces hub==0.36.2 via studio.txt (constrained by
+# transformers==4.57.6 in extras-no-deps.txt), so the 1.3.0
+# floor was internally inconsistent. extras-no-deps holds
+# transformers at 4.x, so the transformers-5.x is_offline_mode
+# concern that motivated the original bump never actually
+# triggers on the supported install path.
+# Round 34 P1: the line itself must stay because install.sh
+# --no-torch installs THIS file with --no-deps and does not run
+# studio.txt afterward; without the package line a no-torch
+# install ends with no huggingface_hub at all and the new
+# diffusion / chat GGUF paths fail with ModuleNotFoundError.
+# Verified live on B200: hub 0.36.2 + transformers 4.57.6 +
 # diffusers 0.37.1 imports Flux2KleinPipeline cleanly and runs
 # end-to-end image generation.
+huggingface_hub>=0.34.0
 hf_transfer
 # Floor 0.37.0 introduces Flux2KleinPipeline + Flux2Pipeline which the
 # Studio Images page imports for the default curated picker.

From 09ca2b27d319eb24f357adfe121538161a5e2c19 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 17:41:14 +0000
Subject: [PATCH 84/92] Fix/adjust diffusion: drop accelerate preflight +
 datasets upload validator for PR #5754

Re-fix the round-34 CI regression: the round-34 attempt gated the
accelerate preflight on enable_model_cpu_offload, but the parameter
defaults to True so tests that did not explicitly opt out still
hit the missing-accelerate path. Removed the accelerate preflight
entirely; transformers' PyTorch backend already pulls accelerate
as a hard dep on every supported install path, so the duplicate
find_spec guard is redundant in practice and the missing-package
case will still surface a clean ModuleNotFoundError from the
offload code itself if the user somehow lands there without it.

Round 34 P1 cross-block: extend the seed.py multipart filename
validators (round 33) to /api/datasets/upload. Both routes echo
the filename back to the client and persist it, so per the
asymmetric-fix rule the validators must match. Now rejects
control characters and embedded HF tokens in file.filename in
both upload entry points.

86 targeted backend tests pass.
---
 studio/backend/core/inference/diffusion.py | 31 +++++++++++-----------
 studio/backend/routes/datasets.py          | 12 +++++++++
 2 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index efc8799ba0..5c4450fd15 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -807,23 +807,24 @@ def load_model(
         # from_pretrained. Use find_spec (no module execution) so test
         # environments that stub these modules still pass the preflight
         # without us actually importing them.
-        # Round 34: accelerate is only needed for the CPU-offload path
-        # (``enable_model_cpu_offload`` / ``device_map="auto"`` /
-        # offload hooks); gate the preflight on the offload flag so
-        # tests and offload=False inference paths do not require it.
+        # Round 34: accelerate is pulled in transitively by every
+        # supported transformers install path (it is a hard runtime
+        # dep of transformers' PyTorch backend), so a separate
+        # find_spec("accelerate") guard is redundant in practice and
+        # broke the CI test matrix where the test env ships
+        # transformers without accelerate. The offload code path
+        # (``enable_model_cpu_offload`` / ``device_map="auto"``)
+        # will surface a clean ModuleNotFoundError if a user somehow
+        # arrives at an offload-needed load without it.
         import importlib.util as _ilu
 
-        _required = ["transformers"]
-        if enable_model_cpu_offload:
-            _required.append("accelerate")
-        for _mod in _required:
-            if _ilu.find_spec(_mod) is None:
-                raise RuntimeError(
-                    "Diffusion image generation requires the Studio torch "
-                    f"runtime. Missing dependency: {_mod}. Install the "
-                    "Studio torch runtime (re-run setup.sh / install.ps1) "
-                    "before loading an image model."
-                )
+        if _ilu.find_spec("transformers") is None:
+            raise RuntimeError(
+                "Diffusion image generation requires the Studio torch "
+                "runtime. Missing dependency: transformers. Install the "
+                "Studio torch runtime (re-run setup.sh / install.ps1) "
+                "before loading an image model."
+            )
 
         fam = detect_family(repo_id, override_family = family_override)
         if fam is None:
diff --git a/studio/backend/routes/datasets.py b/studio/backend/routes/datasets.py
index b0e11fd248..9ac2edb44d 100644
--- a/studio/backend/routes/datasets.py
+++ b/studio/backend/routes/datasets.py
@@ -337,6 +337,18 @@ async def upload_dataset(
     current_subject: str = Depends(get_current_subject),
 ) -> UploadDatasetResponse:
     filename = _sanitize_filename(file.filename or "dataset_upload")
+    # Round 34 P1: mirror the seed.py multipart filename hardening so
+    # /api/datasets/upload also rejects control characters and embedded
+    # HF tokens. The reflected filename + stored_path are echoed back
+    # to the client and persisted, so the validators must match the
+    # JSON-side hardening on SeedInspectUploadRequest.filename.
+    from models.inference import _no_control_chars, _reject_embedded_hf_token
+
+    try:
+        _no_control_chars(filename, "filename")
+        _reject_embedded_hf_token(filename, "filename")
+    except ValueError as exc:
+        raise HTTPException(status_code = 400, detail = str(exc)) from exc
     ext = Path(filename).suffix.lower()
     if ext not in LOCAL_UPLOAD_EXTS:
         allowed = ", ".join(sorted(LOCAL_UPLOAD_EXTS))

From aeba18dc6d9621b5177e222f9bb5a371e0c82936 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 19:15:14 +0000
Subject: [PATCH 85/92] Fix/adjust diffusion: public_load_pending self-check
 for PR #5754

Round 35 P1: _raise_if_helper_advisor_busy published a new public
pending marker without first checking public_load_pending(). Two
public workloads (e.g. training + diffusion) could both pass
their idle helper-busy snapshot concurrently, then both run
through destructive owner teardown before either flipped its
own visibility flag (is_training_active, current_checkpoint,
loading_model_identifier, diffusion is_loading).

Add the missing self-check under _HELPER_ADVISOR_START_LOCK so
the second public workload sees the first's pending marker and
gets a 503 retry instead of racing for VRAM. Helper / advisor
already checked public_load_pending() on its side via
_gpu_workload_busy_for_helper; this closes the symmetric public
-> public window.

86 backend tests pass + smoke test confirms second public load
is refused with 503 while first is pending, and the next public
load is permitted once the first clears.
---
 studio/backend/routes/inference.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index 09cfc4b237..f9e5a58ecc 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -375,6 +375,7 @@ def _raise_if_helper_advisor_busy(workload: str) -> None:
             _HELPER_ADVISOR_START_LOCK,
             _publish_public_load_pending,
             helper_advisor_busy,
+            public_load_pending,
         )
     except Exception:
         return
@@ -402,6 +403,21 @@ def _raise_if_helper_advisor_busy(workload: str) -> None:
                     f"Wait for it to finish before starting {workload}."
                 ),
             )
+        # Round 35 P1: also refuse when another public workload is
+        # already mid-handoff (passed its own helper-busy snapshot
+        # but has not yet flipped is_training_active /
+        # current_checkpoint / loading_model_identifier /
+        # diffusion is_loading). Without this two public loads can
+        # both pass their idle snapshots concurrently and race
+        # destructive owner teardown.
+        if public_load_pending():
+            raise HTTPException(
+                status_code = 503,
+                detail = (
+                    f"Another GPU workload is mid-handoff. Wait for it to "
+                    f"finish before starting {workload}."
+                ),
+            )
         _publish_public_load_pending(workload)
 
 

From e30c5ed386b2e2df6085152402250917d114dbac Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 20:43:04 +0000
Subject: [PATCH 86/92] Fix/adjust diffusion: backend public_load_pending
 parity for PR #5754

Round 38 P1: R35e added the public_load_pending() check to the
route-side _raise_if_helper_advisor_busy, but the backend-side
_raise_if_helper_advisor_busy_for_diffusion (used by direct
DiffusionBackend.load_model callers AND called transitively from
the route via backend.load_model) never got the same parity
check. That left a window where:
  * /api/training/start published the "training" pending marker
    via the route helper
  * a script/test calling DiffusionBackend.load_model() directly
    passed the backend's helper-busy snapshot, never checked
    public_load_pending(), and proceeded to destructive owner
    teardown + GPU allocation while training was still pending.

Add the parity check with a kw-only `excluding` parameter on
public_load_pending so a route-wrapped backend call can ignore
the marker its own route already published (route publishes
"diffusion"; backend publishes the separate "diffusion-backend"
tag). load_model gains ignore_public_load_pending_workload to
thread the route's tag through; the diffusion route passes
"diffusion" so the backend's atomic check does not self-block
on the route's own publication.

Verified by smoke test: route-wrapped backend with excluding=
"diffusion" allowed during route's diffusion pending; direct
backend call refused with RuntimeError "Another GPU workload is
mid-handoff" when training is pending. 86 backend tests pass.
---
 studio/backend/core/inference/diffusion.py  | 20 ++++++++++++++++++++
 studio/backend/routes/inference.py          |  6 ++++++
 studio/backend/utils/datasets/llm_assist.py | 18 +++++++++++++++---
 3 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 5c4450fd15..e15c4daea2 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -763,6 +763,7 @@ def load_model(
         hf_token: Optional[str] = None,
         family_override: Optional[str] = None,
         enable_model_cpu_offload: bool = True,
+        ignore_public_load_pending_workload: Optional[str] = None,
     ) -> dict[str, Any]:
         """Load a diffusion model.
 
@@ -1087,6 +1088,9 @@ def load_model(
                 # contribute to public_load_pending().
                 backend_pending_published = _raise_if_helper_advisor_busy_for_diffusion(
                     publish_pending = True,
+                    ignore_pending_workload = (
+                        ignore_public_load_pending_workload
+                    ),
                 )
                 _release_other_gpu_owners_for_diffusion()
                 _release_chat_backend_for_diffusion(check_helper_advisor = False)
@@ -1562,6 +1566,7 @@ def encode_png_base64(pil_image: "Any") -> str:
 def _raise_if_helper_advisor_busy_for_diffusion(
     *,
     publish_pending: bool = False,
+    ignore_pending_workload: Optional[str] = None,
 ) -> bool:
     """Round 29 P1 #1: split the helper-busy check out of
     _release_chat_backend_for_diffusion so the diffusion load can
@@ -1590,6 +1595,7 @@ def _raise_if_helper_advisor_busy_for_diffusion(
             _HELPER_ADVISOR_START_LOCK,
             _publish_public_load_pending,
             helper_advisor_busy,
+            public_load_pending,
         )
     except Exception:
         return False
@@ -1599,6 +1605,20 @@ def _raise_if_helper_advisor_busy_for_diffusion(
                 "AI Assist (helper / advisor GGUF) is still using the GPU. "
                 "Wait for it to finish before loading a diffusion image model."
             )
+        # Round 38 P1: mirror the route-side _raise_if_helper_advisor_busy
+        # public_load_pending parity check. When publishing, refuse if
+        # ANOTHER public workload is already mid-handoff. Route-wrapped
+        # calls pass ignore_pending_workload="diffusion" so the
+        # route's own publish (which happened just before
+        # backend.load_model) does not cause the backend's atomic
+        # check to self-block.
+        if publish_pending and public_load_pending(
+            excluding = ignore_pending_workload
+        ):
+            raise RuntimeError(
+                "Another GPU workload is mid-handoff. Wait for it to "
+                "finish before loading a diffusion image model."
+            )
         if publish_pending:
             _publish_public_load_pending("diffusion-backend")
             return True
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index f9e5a58ecc..547b50f567 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -2462,6 +2462,12 @@ async def diffusion_load(
                     family_override = payload.family,
                     hf_token = payload.hf_token,
                     enable_model_cpu_offload = payload.enable_model_cpu_offload,
+                    # Round 38 P1: this route already published the
+                    # "diffusion" pending marker above; tell the
+                    # backend to ignore it so the parity check it
+                    # now applies does not self-block on our own
+                    # publication.
+                    ignore_public_load_pending_workload = "diffusion",
                 ),
             )
             return JSONResponse(content = status)
diff --git a/studio/backend/utils/datasets/llm_assist.py b/studio/backend/utils/datasets/llm_assist.py
index 2512a83e07..e3976125dd 100644
--- a/studio/backend/utils/datasets/llm_assist.py
+++ b/studio/backend/utils/datasets/llm_assist.py
@@ -133,13 +133,25 @@ def _release_public_load_pending(workload: str) -> None:
             _PUBLIC_LOAD_PENDING_COUNT.pop(needle, None)
 
 
-def public_load_pending() -> bool:
+def public_load_pending(*, excluding: str | None = None) -> bool:
     """True if any public GPU workload has passed its helper-busy
     snapshot but not yet flipped its public ownership flags. Helper /
     advisor starts treat this as busy so they cannot race a public
-    load mid-handoff."""
+    load mid-handoff.
+
+    Round 38 P1: ``excluding`` lets a route-wrapped backend call
+    skip the marker its own route layer already published (e.g. the
+    diffusion route publishes ``diffusion`` before calling into
+    ``backend.load_model``, which publishes ``diffusion-backend`` --
+    the backend should ignore its own ``diffusion`` marker so the
+    parity check does not self-block) while still seeing every
+    OTHER in-flight public workload."""
+    ignored = excluding.lower() if excluding else None
     with _HELPER_ADVISOR_LOCK:
-        return sum(_PUBLIC_LOAD_PENDING_COUNT.values()) > 0
+        return any(
+            count > 0 and workload != ignored
+            for workload, count in _PUBLIC_LOAD_PENDING_COUNT.items()
+        )
 
 
 def _strip_think_tags(text: str) -> str:

From d0f4bb5165167bc3e39759ebf8d8c52a8e040e0c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 20:43:16 +0000
Subject: [PATCH 87/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/diffusion.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index e15c4daea2..1b1985b807 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -1088,9 +1088,7 @@ def load_model(
                 # contribute to public_load_pending().
                 backend_pending_published = _raise_if_helper_advisor_busy_for_diffusion(
                     publish_pending = True,
-                    ignore_pending_workload = (
-                        ignore_public_load_pending_workload
-                    ),
+                    ignore_pending_workload = (ignore_public_load_pending_workload),
                 )
                 _release_other_gpu_owners_for_diffusion()
                 _release_chat_backend_for_diffusion(check_helper_advisor = False)
@@ -1612,9 +1610,7 @@ def _raise_if_helper_advisor_busy_for_diffusion(
         # route's own publish (which happened just before
         # backend.load_model) does not cause the backend's atomic
         # check to self-block.
-        if publish_pending and public_load_pending(
-            excluding = ignore_pending_workload
-        ):
+        if publish_pending and public_load_pending(excluding = ignore_pending_workload):
             raise RuntimeError(
                 "Another GPU workload is mid-handoff. Wait for it to "
                 "finish before loading a diffusion image model."

From f5186e2b35a092e743187209eff08067639ef036 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 21:22:13 +0000
Subject: [PATCH 88/92] Fix/adjust diffusion: filename validator order, 503
 mapping, token redaction for PR #5754

Round 39 review findings (2 P1 + 1 P2):

1. routes/datasets.py: validate raw multipart filename before sanitize so
   smuggled control chars (NUL, newline) are rejected at the same boundary
   as the JSON path in seed.py. Previously _sanitize_filename stripped the
   control chars first, letting raw inputs slip past the validator.

2. routes/inference.py: extend RuntimeError -> 503 mapping in /images/load
   to classify "Another GPU workload is mid-handoff" as retryable, so the
   backend-surfaced phrasing matches the route-level 503 already returned
   from _raise_if_helper_advisor_busy.

3. core/inference/diffusion.py: redact hf_ tokens in the local-path branch
   of _display_repo_id so a leaf directory named hf_<token> cannot leak
   into UI labels or structured logs.
---
 studio/backend/core/inference/diffusion.py |  4 +++-
 studio/backend/routes/datasets.py          | 17 +++++++++--------
 studio/backend/routes/inference.py         |  6 ++++++
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 1b1985b807..185c453507 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -330,7 +330,9 @@ def _display_repo_id(value: Any) -> Any:
     try:
         candidate = Path(value).expanduser()
         if candidate.is_absolute() or candidate.exists():
-            return candidate.name or value
+            # Defense-in-depth: redact any hf_... pattern that survives
+            # in the leaf name before returning it to the UI / log line.
+            return _redact_hf_tokens(candidate.name or value)
     except (OSError, ValueError):
         pass
     return _redact_hf_tokens(value)
diff --git a/studio/backend/routes/datasets.py b/studio/backend/routes/datasets.py
index 9ac2edb44d..44d033b98e 100644
--- a/studio/backend/routes/datasets.py
+++ b/studio/backend/routes/datasets.py
@@ -336,19 +336,20 @@ async def upload_dataset(
     file: UploadFile,
     current_subject: str = Depends(get_current_subject),
 ) -> UploadDatasetResponse:
-    filename = _sanitize_filename(file.filename or "dataset_upload")
-    # Round 34 P1: mirror the seed.py multipart filename hardening so
-    # /api/datasets/upload also rejects control characters and embedded
-    # HF tokens. The reflected filename + stored_path are echoed back
-    # to the client and persisted, so the validators must match the
-    # JSON-side hardening on SeedInspectUploadRequest.filename.
+    # Validate the raw multipart filename BEFORE sanitization so smuggled
+    # control characters and embedded HF tokens are rejected at the same
+    # boundary as the JSON path; sanitizing first would silently strip
+    # control chars and let raw inputs pass the validator.
+    raw_filename = file.filename or "dataset_upload"
     from models.inference import _no_control_chars, _reject_embedded_hf_token
 
     try:
-        _no_control_chars(filename, "filename")
-        _reject_embedded_hf_token(filename, "filename")
+        _no_control_chars(raw_filename, "filename")
+        _reject_embedded_hf_token(raw_filename, "filename")
     except ValueError as exc:
         raise HTTPException(status_code = 400, detail = str(exc)) from exc
+
+    filename = _sanitize_filename(raw_filename)
     ext = Path(filename).suffix.lower()
     if ext not in LOCAL_UPLOAD_EXTS:
         allowed = ", ".join(sorted(LOCAL_UPLOAD_EXTS))
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index 547b50f567..a8082fff13 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -2498,6 +2498,12 @@ async def diffusion_load(
                 # Round 28 P2 #15: AI Assist running (raised by
                 # _release_chat_backend_for_diffusion) is retryable.
                 or "AI Assist" in detail
+                # Backend mid-handoff race (raised by
+                # _raise_if_helper_advisor_busy_for_diffusion when
+                # another workload's public_load_pending is set) mirrors
+                # the route-level 503 at routes/inference.py:415, so the
+                # backend-surfaced phrasing must classify the same way.
+                or "Another GPU workload is mid-handoff" in detail
             ):
                 # Round 17 P1 #2: chat unload failures raised by the
                 # backend helper map to 503 (retryable infra issue),

From 784a9ed71c9e3b822213a420321e6ff72f436d85 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 21:50:11 +0000
Subject: [PATCH 89/92] Fix/adjust diffusion: export public-load window,
 identifier hardening for PR #5754

Round 40 review findings (6 P1 + 1 P2 + 2 P3):

P1:
1. routes/export.py: wrap /export/merged, /export/base, /export/gguf,
   /export/lora in a public-load window so backend.export_*() running
   in a worker thread cannot be torn down by a concurrent workload that
   sees is_export_active() == False during the pre-active gap.
2. utils/datasets/llm_assist.py: add public_load_pending_for(workload)
   helper. routes/inference.py: _release_export_for now refuses with a
   503 when export is mid-handoff.
3. models/models.py: AddScanFolderRequest.path now rejects control
   characters and embedded hf_ tokens before being logged or reflected.
4. models/training.py: local_datasets and local_eval_datasets list
   entries get the same control-char / embedded-token validators that
   model_name / hf_dataset already have.
5. models/training.py: format_type joins the validator list (copied
   into training_kwargs and into trainer log lines).
6. models/export.py: _validate_save_directory now rejects embedded hf_
   tokens (the other identifier fields were already covered).

P2:
7. images-page.tsx:162: defer the mount fetchAndUpdateStatus call
   through setTimeout(..., 0) so it does not trip
   react-hooks/set-state-in-effect on scoped lint.

P3 cleanup:
8. core/inference/diffusion.py: drop unused gguf_basename assignment.
9. core/inference/diffusion.py + routes/inference.py: drop unused
   owned_names computation from the chat-release helpers; the final
   sweep intentionally no longer filters by that snapshot.
---
 studio/backend/core/inference/diffusion.py    |   7 --
 studio/backend/models/export.py               |   4 +
 studio/backend/models/models.py               |  17 ++-
 studio/backend/models/training.py             |  48 ++++++--
 studio/backend/routes/export.py               | 105 ++++++++++++------
 studio/backend/routes/inference.py            |  19 +++-
 studio/backend/utils/datasets/llm_assist.py   |  11 ++
 .../src/features/images/images-page.tsx       |  12 +-
 8 files changed, 164 insertions(+), 59 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 185c453507..e3bb476dbb 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -644,12 +644,6 @@ def status(self, *, include_internal: bool = False) -> dict[str, Any]:
         # ``/api/inference/images/status`` route always uses the
         # public payload.
         with self._lock:
-            # UI-facing collapsed basename. Full local path leaks the
-            # HF cache layout + system username; the original caller-
-            # supplied filename (e.g. ``BF16/model.gguf``) is kept
-            # separately as ``active_gguf_filename`` for delete
-            # guards.
-            gguf_basename = Path(self._gguf_path).name if self._gguf_path else None
             # Expose BOTH the resident pipeline's id AND the pending
             # load target. Delete guards must check both: when model A
             # is already loaded and a swap to model B is in flight,
@@ -1732,7 +1726,6 @@ def _release_chat_backend_for_diffusion(*, check_helper_advisor: bool = True) ->
     backend = get_inference_backend()
     active_model_name = getattr(backend, "active_model_name", None)
     loading_models = set(getattr(backend, "loading_models", set()) or set())
-    owned_names = {name for name in ({active_model_name} | loading_models) if name}
 
     def _require_unload(model_name: str) -> None:
         try:
diff --git a/studio/backend/models/export.py b/studio/backend/models/export.py
index 69e2da2135..6d899693b6 100644
--- a/studio/backend/models/export.py
+++ b/studio/backend/models/export.py
@@ -25,6 +25,10 @@ def _validate_save_directory(value: str) -> str:
     raw = str(value).strip()
     if not raw:
         raise ValueError("save_directory must not be empty")
+    # save_directory is logged verbatim by merged / base / GGUF export
+    # flows after resolution, so reject embedded HF tokens at the same
+    # boundary as the sibling identifier fields on export requests.
+    _reject_embedded_hf_token(raw, "save_directory")
     if "\x00" in raw:
         raise ValueError("save_directory may not contain null bytes")
     # Round 32 P1: reject ALL ASCII control characters (including
diff --git a/studio/backend/models/models.py b/studio/backend/models/models.py
index 46ca4e3784..3c1257d1aa 100644
--- a/studio/backend/models/models.py
+++ b/studio/backend/models/models.py
@@ -5,9 +5,11 @@
 Pydantic schemas for Model Management API
 """
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator
 from typing import Optional, List, Dict, Any, Literal
 
+from models.inference import _no_control_chars, _reject_embedded_hf_token
+
 ModelType = Literal["text", "vision", "audio", "embeddings"]
 
 
@@ -206,6 +208,19 @@ class AddScanFolderRequest(BaseModel):
         ..., description = "Absolute or relative directory path to scan for models"
     )
 
+    # path is reflected back in /scan-folders error details and logged
+    # via add_scan_folder_endpoint when the directory is missing, so
+    # apply the same identifier hardening used on other logged paths.
+    @field_validator("path")
+    @classmethod
+    def _no_path_control_chars(cls, v, info):
+        return _no_control_chars(v, info.field_name)
+
+    @field_validator("path")
+    @classmethod
+    def _no_path_embedded_hf_tokens(cls, v, info):
+        return _reject_embedded_hf_token(v, info.field_name)
+
 
 class ScanFolderInfo(BaseModel):
     """A registered custom model scan folder."""
diff --git a/studio/backend/models/training.py b/studio/backend/models/training.py
index de440a6c06..4963761a9d 100644
--- a/studio/backend/models/training.py
+++ b/studio/backend/models/training.py
@@ -57,25 +57,51 @@ class TrainingStartRequest(BaseModel):
         ..., description = "Model identifier (e.g., 'unsloth/llama-3-8b-bnb-4bit')"
     )
 
-    # Round 22 P1 #1: identifier hardening (round 5 / 15 / 20 / 21
-    # extended these to chat + diffusion request models; training
-    # was the last unguarded entry point).
-    # Round 26 P1 #10: hf_dataset / subset / train_split / eval_split
-    # are reflected in status + error messages, harden them too.
-    @field_validator("model_name", "hf_dataset", "subset", "train_split", "eval_split")
+    # Identifier hardening: extended progressively across analogous
+    # request models. format_type is copied into training_kwargs and
+    # written into trainer log lines, so it shares the same boundary.
+    @field_validator(
+        "model_name",
+        "hf_dataset",
+        "subset",
+        "train_split",
+        "eval_split",
+        "format_type",
+    )
     @classmethod
     def _no_model_name_control_chars(cls, v, info):
         return _no_control_chars(v, info.field_name)
 
-    # Round 27 P1 #2: subset / train_split / eval_split are reflected
-    # in status + error messages and persisted to job records, so the
-    # embedded-token guard must cover them too. Round 26 only added
-    # the control-char guard to those three.
-    @field_validator("model_name", "hf_dataset", "subset", "train_split", "eval_split")
+    @field_validator(
+        "model_name",
+        "hf_dataset",
+        "subset",
+        "train_split",
+        "eval_split",
+        "format_type",
+    )
     @classmethod
     def _no_model_name_embedded_hf_tokens(cls, v, info):
         return _reject_embedded_hf_token(v, info.field_name)
 
+    # local_datasets / local_eval_datasets are user-controlled lists
+    # reflected back in /api/training/start error details when
+    # _validate_local_dataset_paths fails, so the same control-char +
+    # embedded-token guards apply per entry.
+    @field_validator("local_datasets", "local_eval_datasets")
+    @classmethod
+    def _no_local_dataset_control_chars(cls, v, info):
+        for i, entry in enumerate(v or []):
+            _no_control_chars(entry, f"{info.field_name}[{i}]")
+        return v
+
+    @field_validator("local_datasets", "local_eval_datasets")
+    @classmethod
+    def _no_local_dataset_embedded_hf_tokens(cls, v, info):
+        for i, entry in enumerate(v or []):
+            _reject_embedded_hf_token(entry, f"{info.field_name}[{i}]")
+        return v
+
     training_type: Literal["LoRA/QLoRA", "Full Finetuning", "Continued Pretraining"] = (
         Field(
             ...,
diff --git a/studio/backend/routes/export.py b/studio/backend/routes/export.py
index 36337251be..07548d4508 100644
--- a/studio/backend/routes/export.py
+++ b/studio/backend/routes/export.py
@@ -50,6 +50,39 @@
 logger = get_logger(__name__)
 
 
+import contextlib
+
+
+@contextlib.asynccontextmanager
+async def _export_public_window():
+    """Publish the public-load window across an /export/* operation.
+
+    backend.export_*() runs in a worker thread and does not flip
+    ``_export_active = True`` until the worker actually starts; during
+    that gap window another workload that calls ``_release_export_for``
+    would see ``is_export_active() == False`` and tear down the export
+    subprocess. Mirror the load_checkpoint guard so the pending counter
+    is set for the whole export call, and the helper-busy preflight
+    refuses if AI Assist is mid-handoff.
+    """
+    from routes.inference import (
+        _clear_public_load_window,
+        _raise_if_helper_advisor_busy,
+    )
+
+    export_window_published = False
+    try:
+        _raise_if_helper_advisor_busy("export")
+        export_window_published = True
+        yield
+    finally:
+        if export_window_published:
+            try:
+                _clear_public_load_window("export")
+            except Exception:
+                pass
+
+
 @router.post("/load-checkpoint", response_model = ExportOperationResponse)
 async def load_checkpoint(
     request: LoadCheckpointRequest,
@@ -296,15 +329,16 @@ async def export_merged_model(
     """
     try:
         backend = get_export_backend()
-        success, message, output_path = await asyncio.to_thread(
-            backend.export_merged_model,
-            save_directory = request.save_directory,
-            format_type = request.format_type,
-            push_to_hub = request.push_to_hub,
-            repo_id = request.repo_id,
-            hf_token = request.hf_token,
-            private = request.private,
-        )
+        async with _export_public_window():
+            success, message, output_path = await asyncio.to_thread(
+                backend.export_merged_model,
+                save_directory = request.save_directory,
+                format_type = request.format_type,
+                push_to_hub = request.push_to_hub,
+                repo_id = request.repo_id,
+                hf_token = request.hf_token,
+                private = request.private,
+            )
 
         if not success:
             raise HTTPException(status_code = 400, detail = message)
@@ -336,15 +370,16 @@ async def export_base_model(
     """
     try:
         backend = get_export_backend()
-        success, message, output_path = await asyncio.to_thread(
-            backend.export_base_model,
-            save_directory = request.save_directory,
-            push_to_hub = request.push_to_hub,
-            repo_id = request.repo_id,
-            hf_token = request.hf_token,
-            private = request.private,
-            base_model_id = request.base_model_id,
-        )
+        async with _export_public_window():
+            success, message, output_path = await asyncio.to_thread(
+                backend.export_base_model,
+                save_directory = request.save_directory,
+                push_to_hub = request.push_to_hub,
+                repo_id = request.repo_id,
+                hf_token = request.hf_token,
+                private = request.private,
+                base_model_id = request.base_model_id,
+            )
 
         if not success:
             raise HTTPException(status_code = 400, detail = message)
@@ -376,14 +411,15 @@ async def export_gguf(
     """
     try:
         backend = get_export_backend()
-        success, message, output_path = await asyncio.to_thread(
-            backend.export_gguf,
-            save_directory = request.save_directory,
-            quantization_method = request.quantization_method,
-            push_to_hub = request.push_to_hub,
-            repo_id = request.repo_id,
-            hf_token = request.hf_token,
-        )
+        async with _export_public_window():
+            success, message, output_path = await asyncio.to_thread(
+                backend.export_gguf,
+                save_directory = request.save_directory,
+                quantization_method = request.quantization_method,
+                push_to_hub = request.push_to_hub,
+                repo_id = request.repo_id,
+                hf_token = request.hf_token,
+            )
 
         if not success:
             raise HTTPException(status_code = 400, detail = message)
@@ -415,14 +451,15 @@ async def export_lora_adapter(
     """
     try:
         backend = get_export_backend()
-        success, message, output_path = await asyncio.to_thread(
-            backend.export_lora_adapter,
-            save_directory = request.save_directory,
-            push_to_hub = request.push_to_hub,
-            repo_id = request.repo_id,
-            hf_token = request.hf_token,
-            private = request.private,
-        )
+        async with _export_public_window():
+            success, message, output_path = await asyncio.to_thread(
+                backend.export_lora_adapter,
+                save_directory = request.save_directory,
+                push_to_hub = request.push_to_hub,
+                repo_id = request.repo_id,
+                hf_token = request.hf_token,
+                private = request.private,
+            )
 
         if not success:
             raise HTTPException(status_code = 400, detail = message)
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index a8082fff13..0799efe2f3 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -574,7 +574,6 @@ async def _unload_required(model_name: str) -> None:
 
     active_model_name = getattr(inf, "active_model_name", None)
     loading_models = set(getattr(inf, "loading_models", set()) or set())
-    owned_names = {name for name in ({active_model_name} | loading_models) if name}
     if active_model_name:
         logger.info(
             "Unloading safetensors chat '%s' before %s load",
@@ -692,6 +691,24 @@ async def _release_export_for(workload: str) -> None:
                 ),
             ) from exc
 
+    # If an /export/* operation has published its pending window, the
+    # backend may not have flipped is_export_active() = True yet but the
+    # subprocess is mid-handoff. Refuse to tear it down so the in-flight
+    # export sees a stable subprocess.
+    try:
+        from utils.datasets.llm_assist import public_load_pending_for
+        export_pending = public_load_pending_for("export")
+    except Exception:
+        export_pending = False
+    if has_checkpoint and not active and export_pending:
+        raise HTTPException(
+            status_code = 503,
+            detail = (
+                f"Another export operation is mid-handoff. Wait for it "
+                f"to finish before starting {workload}."
+            ),
+        )
+
     if has_checkpoint and not active:
         try:
             logger.info(
diff --git a/studio/backend/utils/datasets/llm_assist.py b/studio/backend/utils/datasets/llm_assist.py
index e3976125dd..91a7bb8ea9 100644
--- a/studio/backend/utils/datasets/llm_assist.py
+++ b/studio/backend/utils/datasets/llm_assist.py
@@ -154,6 +154,17 @@ def public_load_pending(*, excluding: str | None = None) -> bool:
         )
 
 
+def public_load_pending_for(workload: str) -> bool:
+    """True if a specific public GPU workload is mid-handoff. Used by
+    release helpers to refuse a destructive teardown while the matching
+    /export/* or /chat /load_* route is still in its publish window."""
+    if not workload:
+        return False
+    needle = workload.lower()
+    with _HELPER_ADVISOR_LOCK:
+        return _PUBLIC_LOAD_PENDING_COUNT.get(needle, 0) > 0
+
+
 def _strip_think_tags(text: str) -> str:
     """Strip <think>...</think> reasoning blocks emitted by some models.
 
diff --git a/studio/frontend/src/features/images/images-page.tsx b/studio/frontend/src/features/images/images-page.tsx
index 3fd9dab681..ff4a839cf3 100644
--- a/studio/frontend/src/features/images/images-page.tsx
+++ b/studio/frontend/src/features/images/images-page.tsx
@@ -155,11 +155,13 @@ export function ImagesPage() {
   }, [fetchAndUpdateStatus]);
 
   useEffect(() => {
-    // Mount fetch goes through fetchAndUpdateStatus so the lint rule
-    // does not see any synchronous setState in the effect body; the
-    // user-driven Refresh button still calls refreshStatus to flip
-    // the spinner.
-    void fetchAndUpdateStatus();
+    // Defer the mount fetch out of the synchronous effect body so the
+    // setStatus call inside fetchAndUpdateStatus does not trip the
+    // react-hooks/set-state-in-effect rule.
+    const id = window.setTimeout(() => {
+      void fetchAndUpdateStatus();
+    }, 0);
+    return () => window.clearTimeout(id);
   }, [fetchAndUpdateStatus]);
 
   // Round 27 P2: when the backend is mid-load (is_loading=true) the

From 029ca741b40cb16ebc2be5d0b1b9167bbfbc53b6 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 May 2026 21:50:29 +0000
Subject: [PATCH 90/92] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/routes/inference.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index 0799efe2f3..e95bd9b429 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -697,6 +697,7 @@ async def _release_export_for(workload: str) -> None:
     # export sees a stable subprocess.
     try:
         from utils.datasets.llm_assist import public_load_pending_for
+
         export_pending = public_load_pending_for("export")
     except Exception:
         export_pending = False

From ca68fd5d131c758c5cd45df5ddd58856b57f4b83 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Mon, 25 May 2026 22:18:02 +0000
Subject: [PATCH 91/92] Fix/adjust diffusion: export active-state guards,
 cleanup window, unload race for PR #5754

Round 41 review findings (2 P1, 5/12 reviewers consensus on the dominant one):

1. routes/export.py: load_checkpoint already refuses 409 when training
   or another export is active, but /export/{merged,base,gguf,lora} and
   /cleanup went through _export_public_window without those checks.
   A user could start training, then trigger an export (or cleanup),
   and both would double-own the GPU. Factor the training-active and
   export-active guards into _raise_if_training_active_for_export and
   _raise_if_export_active_for_export, call them inside the context
   manager so all /export/* + /cleanup share the same fail-closed
   semantics as load_checkpoint, and wrap /cleanup with the window.

2. core/inference/diffusion.py: DiffusionBackend.unload_model cleared
   _pipe / _repo_id / _family / ... under _lock BEFORE _release(old)
   and _drain_cuda_cache. Between the lock release and cache drain,
   status() reported is_loaded=False / is_loading=False, so the
   helper-busy check (which OR-s those two) could let an AI Assist
   GGUF backend start while diffusion VRAM was still being freed.
   Set _loading=True inside the lock as a busy marker before clearing
   the slot, and only clear it in a finally after release + drain
   complete.
---
 studio/backend/core/inference/diffusion.py | 17 ++++-
 studio/backend/routes/export.py            | 77 +++++++++++++++++++++-
 2 files changed, 90 insertions(+), 4 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index e3bb476dbb..9921d0543d 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -1349,6 +1349,13 @@ def unload_model(self) -> dict[str, Any]:
         with self._load_lock, self._generate_lock:
             with self._lock:
                 old = self._pipe
+                # Mark the slot as busy BEFORE clearing _pipe so a
+                # concurrent helper-busy check (which treats either
+                # is_loaded OR is_loading as busy) does not see a
+                # ``free`` GPU during the release + cache-drain window.
+                # is_loading is cleared in finally once the VRAM is
+                # actually freed.
+                self._loading = True
                 self._pipe = None
                 self._family = None
                 self._repo_id = None
@@ -1359,9 +1366,13 @@ def unload_model(self) -> dict[str, Any]:
                 self._dtype = None
                 self._cpu_offload_enabled = False
                 self._loaded_at = None
-            _release(old)
-            old = None  # noqa: F841
-            _drain_cuda_cache()
+            try:
+                _release(old)
+                old = None  # noqa: F841
+                _drain_cuda_cache()
+            finally:
+                with self._lock:
+                    self._loading = False
         return {"is_loaded": False}
 
     # ── generation ────────────────────────────────────────────────
diff --git a/studio/backend/routes/export.py b/studio/backend/routes/export.py
index 07548d4508..1a69de749a 100644
--- a/studio/backend/routes/export.py
+++ b/studio/backend/routes/export.py
@@ -53,6 +53,68 @@
 import contextlib
 
 
+def _raise_if_training_active_for_export() -> None:
+    """409 if a training run is in flight; 503 if status check itself
+    raises. Mirrors the load_checkpoint guard so /export/* and /cleanup
+    never tear down or alter export state while training is using the
+    GPU. Missing core.training is treated as 'no tracker'."""
+    try:
+        from core.training import get_training_backend  # type: ignore
+    except Exception as e:
+        logger.debug("core.training not importable, skipping training guard: %s", e)
+        return
+    try:
+        trn = get_training_backend()
+        active = trn.is_training_active()
+    except Exception as e:
+        logger.warning("Could not verify training status before export op: %s", e)
+        raise HTTPException(
+            status_code = 503,
+            detail = (
+                "Could not verify training status before the export "
+                "operation. Try again."
+            ),
+        ) from e
+    if active:
+        raise HTTPException(
+            status_code = 409,
+            detail = (
+                "Training is currently active. Stop the training run "
+                "before starting an export operation."
+            ),
+        )
+
+
+def _raise_if_export_active_for_export() -> None:
+    """409 if another export job is already running; 503 if the status
+    check itself raises. Backends without is_export_active() are
+    treated as 'no tracker available' to stay compatible with mocked
+    backends in tests."""
+    backend = get_export_backend()
+    is_export_active_fn = getattr(backend, "is_export_active", None)
+    if is_export_active_fn is None:
+        return
+    try:
+        export_is_active = bool(is_export_active_fn())
+    except Exception as e:
+        logger.warning("Could not verify export status before export op: %s", e)
+        raise HTTPException(
+            status_code = 503,
+            detail = (
+                "Could not verify export status before starting the "
+                "export operation. Try again."
+            ),
+        ) from e
+    if export_is_active:
+        raise HTTPException(
+            status_code = 409,
+            detail = (
+                "An export job is currently active. Wait for it to "
+                "finish before starting another export operation."
+            ),
+        )
+
+
 @contextlib.asynccontextmanager
 async def _export_public_window():
     """Publish the public-load window across an /export/* operation.
@@ -64,6 +126,12 @@ async def _export_public_window():
     subprocess. Mirror the load_checkpoint guard so the pending counter
     is set for the whole export call, and the helper-busy preflight
     refuses if AI Assist is mid-handoff.
+
+    Also refuses 409 if training or another export is already active so
+    a queued /export/{merged,base,gguf,lora} or /cleanup cannot
+    double-own the GPU with a running training / export job (round 41
+    consensus: load_checkpoint already runs these checks but /export/*
+    and /cleanup were skipping them).
     """
     from routes.inference import (
         _clear_public_load_window,
@@ -72,6 +140,8 @@ async def _export_public_window():
 
     export_window_published = False
     try:
+        _raise_if_training_active_for_export()
+        _raise_if_export_active_for_export()
         _raise_if_helper_advisor_busy("export")
         export_window_published = True
         yield
@@ -258,7 +328,12 @@ async def cleanup_export_memory(
     """
     try:
         backend = get_export_backend()
-        success = await asyncio.to_thread(backend.cleanup_memory)
+        # Run the cleanup under the same public-load window /export/*
+        # uses so a queued export's handoff gap cannot race a cleanup
+        # call that tears down current_checkpoint. The window also
+        # refuses 409 if training or another export is in flight.
+        async with _export_public_window():
+            success = await asyncio.to_thread(backend.cleanup_memory)
 
         if not success:
             raise HTTPException(

From 07b0cf7d2cd9bedc07092925d9269501050c46c0 Mon Sep 17 00:00:00 2001
From: Daniel Han-Chen <danielhanchen@gmail.com>
Date: Tue, 26 May 2026 00:28:23 +0000
Subject: [PATCH 92/92] Replace asyncio.get_event_loop with
 asyncio.get_running_loop for PR #5754

Calling asyncio.get_event_loop() without a running event loop has been
deprecated since Python 3.10 (the function itself is not being removed),
and the asyncio docs prefer asyncio.get_running_loop() in coroutines and
callbacks. Replace it with asyncio.get_running_loop() at five call sites
that all run inside async def functions where a running loop is
guaranteed: two async_generate helpers in diffusion.py and three
run_in_executor sites in routes/inference.py (audio synth, diffusion
load, streaming chat). Addresses the gemini-code-assist bot review.
---
 studio/backend/core/inference/diffusion.py | 4 ++--
 studio/backend/routes/inference.py         | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/studio/backend/core/inference/diffusion.py b/studio/backend/core/inference/diffusion.py
index 9921d0543d..c66a3690e5 100644
--- a/studio/backend/core/inference/diffusion.py
+++ b/studio/backend/core/inference/diffusion.py
@@ -1988,7 +1988,7 @@ async def async_generate(
 ) -> "Any":
     """Run ``generate_image`` in the default executor so route handlers
     do not block the event loop for the 5-30 s a diffusion step takes."""
-    loop = asyncio.get_event_loop()
+    loop = asyncio.get_running_loop()
     return await loop.run_in_executor(None, lambda: backend.generate_image(**kwargs))
 
 
@@ -2002,7 +2002,7 @@ async def async_generate_with_metadata(
     fields reflect the pipeline that actually produced the image, even
     if an unload races the route between the forward returning and the
     response being assembled (round 13 P2 #9)."""
-    loop = asyncio.get_event_loop()
+    loop = asyncio.get_running_loop()
     return await loop.run_in_executor(
         None,
         lambda: backend.generate_image_with_metadata(**kwargs),
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
index e95bd9b429..effde958e7 100644
--- a/studio/backend/routes/inference.py
+++ b/studio/backend/routes/inference.py
@@ -2260,7 +2260,7 @@ async def generate_audio(
         )
 
     try:
-        wav_bytes, sample_rate = await asyncio.get_event_loop().run_in_executor(
+        wav_bytes, sample_rate = await asyncio.get_running_loop().run_in_executor(
             None, gen
         )
     except Exception as e:
@@ -2471,7 +2471,7 @@ async def diffusion_load(
         # AFTER validation.
         backend = _get_diffusion_backend()
         try:
-            status = await asyncio.get_event_loop().run_in_executor(
+            status = await asyncio.get_running_loop().run_in_executor(
                 None,
                 lambda: backend.load_model(
                     repo_id = resolved_repo_id,
@@ -4265,7 +4265,7 @@ async def stream_chunks():
                 # the second request's blocking lock acquisition would
                 # freeze the entire event loop, stalling both streams.
                 _DONE = object()  # sentinel for generator exhaustion
-                loop = asyncio.get_event_loop()
+                loop = asyncio.get_running_loop()
                 gen = generate()
                 while True:
                     if cancel_event.is_set():