unslothai · BlackBox-Labs · May 8, 2026 · May 8, 2026 · May 8, 2026 · May 8, 2026
diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
@@ -517,6 +517,7 @@ async def load_model(
         else:
             if (
                 backend.active_model_name
+                and not backend.active_model_name.lower().endswith(".gguf")
                 and backend.active_model_name.lower() == model_identifier.lower()
             ):
                 logger.info(
@@ -573,6 +574,24 @@ async def load_model(
                     chat_template = _chat_template,
                 )
 
+        # ── Local GGUF already-loaded: kill before reload ──────────
+        # When a settings change triggers a reload of a local GGUF model
+        # (no gguf_variant), kill the existing llama-server so the GGUF
+        # reload path below can start fresh with new params. Without this,
+        # the reload falls through to the transformers path, which fails
+        # because GGUF files don't have a HuggingFace config.json.
+        if (
+            not request.gguf_variant
+            and llama_backend.is_loaded
+            and llama_backend.model_identifier
+            and llama_backend.model_identifier.lower() == model_identifier.lower()
+        ):
+            logger.info(
+                f"Local GGUF already loaded, reloading with updated settings: "
+                f"{model_log_label}"
+            )
+            llama_backend.unload_model()
+
         # Create config using clean factory method
         # is_lora is auto-detected from adapter_config.json on disk/HF
         config = ModelConfig.from_identifier(