diff --git a/.github/workflows/studio-windows-inference-smoke.yml b/.github/workflows/studio-windows-inference-smoke.yml
index 01bf4127a7..2acc782984 100644
--- a/.github/workflows/studio-windows-inference-smoke.yml
+++ b/.github/workflows/studio-windows-inference-smoke.yml
@@ -258,11 +258,26 @@ jobs:
 
       - name: Load the GGUF (HF repo + variant, served from HF_HOME cache)
         run: |
-          curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
-            -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
-            --max-time 600 \
-            -d "{\"model_path\":\"$GGUF_REPO\",\"gguf_variant\":\"$GGUF_VARIANT\",\"is_lora\":false,\"max_seq_length\":2048}" \
-            | jq '{status, display_name, is_gguf, context_length}'
+          # Retry the load so a transient TCP RST during llama-server
+          # warm-up (Windows runner image churn, windows-latest ->
+          # windows-2025-vs2026 rollout) doesn't fail the whole job.
+          # Studio's _wait_for_health now catches httpx.ReadError too;
+          # this layer covers what the backend can't recover from.
+          # `|| true`: a curl transport error must not trip errexit.
+          LOAD_OK=0
+          for attempt in 1 2 3; do
+            HTTP=$(curl -s -o /tmp/load.json -w '%{http_code}' \
+              -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
+              -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
+              --max-time 600 \
+              -d "{\"model_path\":\"$GGUF_REPO\",\"gguf_variant\":\"$GGUF_VARIANT\",\"is_lora\":false,\"max_seq_length\":2048}" || true)
+            if [ "$HTTP" = "200" ]; then LOAD_OK=1; break; fi
+            echo "::warning::/api/inference/load attempt $attempt returned $HTTP; response:"
+            cat /tmp/load.json 2>/dev/null || true; rm -f /tmp/load.json
+            sleep 10
+          done
+          [ "$LOAD_OK" = "1" ] || { echo "::error::/api/inference/load failed 3 attempts"; exit 22; }
+          jq '{status, display_name, is_gguf, context_length}' /tmp/load.json
 
       - name: Multi-turn determinism via OpenAI + Anthropic SDKs
         env:
@@ -350,6 +365,19 @@ jobs:
         shell: cmd
         run: echo Stop Studio (no-op; runner reclaims STUDIO_PID=%STUDIO_PID% at job end)
 
+      - name: Collect llama-server logs
+        if: always()
+        shell: bash
+        # Copy llama-server's own stdout/stderr (teed by Studio under
+        # ~/.unsloth/studio/logs/llama-server/) into the workspace so
+        # upload-artifact can pick it up. Crucial for diagnosing a
+        # subprocess crash where Studio's traceback only shows the
+        # symptom (httpx ReadError) but not the cause.
+        run: |
+          mkdir -p logs/llama-server
+          cp -v ~/.unsloth/studio/logs/llama-server/*.log logs/llama-server/ 2>/dev/null || \
+            echo "no llama-server logs to collect"
+
       - name: Upload logs
         if: always()
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
@@ -358,6 +386,7 @@ jobs:
           path: |
             logs/studio.log
             logs/install.log
+            logs/llama-server/*.log
           retention-days: 7
 
   # ─────────────────────────────────────────────────────────────────────
@@ -561,11 +590,21 @@ jobs:
           # a normal path.
           GGUF_PATH="${GITHUB_WORKSPACE//\\//}/gguf-cache/${GGUF_FILE}"
           ls -lh "$GGUF_PATH"
-          curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
-            -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
-            --max-time 600 \
-            -d "{\"model_path\":\"$GGUF_PATH\",\"is_lora\":false,\"max_seq_length\":2048}" \
-            | jq '{status, display_name}'
+          # Retry as in the OpenAI/Anthropic job; `|| true` keeps a curl failure from tripping errexit.
+          LOAD_OK=0
+          for attempt in 1 2 3; do
+            HTTP=$(curl -s -o /tmp/load.json -w '%{http_code}' \
+              -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
+              -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
+              --max-time 600 \
+              -d "{\"model_path\":\"$GGUF_PATH\",\"is_lora\":false,\"max_seq_length\":2048}" || true)
+            if [ "$HTTP" = "200" ]; then LOAD_OK=1; break; fi
+            echo "::warning::/api/inference/load attempt $attempt returned $HTTP; response:"
+            cat /tmp/load.json 2>/dev/null || true; rm -f /tmp/load.json
+            sleep 10
+          done
+          [ "$LOAD_OK" = "1" ] || { echo "::error::/api/inference/load failed 3 attempts"; exit 22; }
+          jq '{status, display_name}' /tmp/load.json
 
       - name: Tool calling, server-side tools, thinking on/off
         env:
@@ -768,6 +807,19 @@ jobs:
         shell: cmd
         run: echo Stop Studio (no-op; runner reclaims STUDIO_PID=%STUDIO_PID% at job end)
 
+      - name: Collect llama-server logs
+        if: always()
+        shell: bash
+        # Copy llama-server's own stdout/stderr (teed by Studio under
+        # ~/.unsloth/studio/logs/llama-server/) into the workspace so
+        # upload-artifact can pick it up. Crucial for diagnosing a
+        # subprocess crash where Studio's traceback only shows the
+        # symptom (httpx ReadError) but not the cause.
+        run: |
+          mkdir -p logs/llama-server
+          cp -v ~/.unsloth/studio/logs/llama-server/*.log logs/llama-server/ 2>/dev/null || \
+            echo "no llama-server logs to collect"
+
       - name: Upload logs
         if: always()
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
@@ -776,6 +828,7 @@ jobs:
           path: |
             logs/studio.log
             logs/install.log
+            logs/llama-server/*.log
           retention-days: 7
 
   # ─────────────────────────────────────────────────────────────────────
@@ -970,11 +1023,21 @@ jobs:
             -H 'content-type: application/json' \
             -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token)
           echo "API_KEY=$TOKEN" >> "$GITHUB_ENV"
-          curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
-            -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
-            --max-time 900 \
-            -d "{\"model_path\":\"$GGUF_REPO\",\"gguf_variant\":\"$GGUF_VARIANT\",\"is_lora\":false,\"max_seq_length\":2048}" \
-            | jq '{status, display_name, is_vision}'
+          # Retry as in the OpenAI/Anthropic and Tool calling jobs; `|| true` keeps a curl failure from tripping errexit.
+          LOAD_OK=0
+          for attempt in 1 2 3; do
+            HTTP=$(curl -s -o /tmp/load.json -w '%{http_code}' \
+              -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \
+              -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \
+              --max-time 900 \
+              -d "{\"model_path\":\"$GGUF_REPO\",\"gguf_variant\":\"$GGUF_VARIANT\",\"is_lora\":false,\"max_seq_length\":2048}" || true)
+            if [ "$HTTP" = "200" ]; then LOAD_OK=1; break; fi
+            echo "::warning::/api/inference/load attempt $attempt returned $HTTP; response:"
+            cat /tmp/load.json 2>/dev/null || true; rm -f /tmp/load.json
+            sleep 10
+          done
+          [ "$LOAD_OK" = "1" ] || { echo "::error::/api/inference/load failed 3 attempts"; exit 22; }
+          jq '{status, display_name, is_vision}' /tmp/load.json
 
       - name: JSON schema decoding + image input
         env:
@@ -1156,6 +1219,19 @@ jobs:
         shell: cmd
         run: echo Stop Studio (no-op; runner reclaims STUDIO_PID=%STUDIO_PID% at job end)
 
+      - name: Collect llama-server logs
+        if: always()
+        shell: bash
+        # Copy llama-server's own stdout/stderr (teed by Studio under
+        # ~/.unsloth/studio/logs/llama-server/) into the workspace so
+        # upload-artifact can pick it up. Crucial for diagnosing a
+        # subprocess crash where Studio's traceback only shows the
+        # symptom (httpx ReadError) but not the cause.
+        run: |
+          mkdir -p logs/llama-server
+          cp -v ~/.unsloth/studio/logs/llama-server/*.log logs/llama-server/ 2>/dev/null || \
+            echo "no llama-server logs to collect"
+
       - name: Upload logs
         if: always()
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
@@ -1164,4 +1240,5 @@ jobs:
           path: |
             logs/studio.log
             logs/install.log
+            logs/llama-server/*.log
           retention-days: 7
diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 3682f1dbbb..a4c28166b9 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -101,6 +101,51 @@
 _SWA_CACHE_LOCK = threading.Lock()
 
 
+def _probe_dns_dead(host: str = "huggingface.co", timeout: float = 2.0) -> bool:
+    """Report whether ``host`` fails to resolve within ``timeout`` seconds.
+    Bounded by joining a daemon thread, so socket.setdefaulttimeout stays untouched."""
+    outcome: dict[str, bool] = {}
+
+    def _resolve() -> None:
+        try:
+            socket.gethostbyname(host)
+            outcome["dead"] = False
+        except Exception:
+            outcome["dead"] = True
+
+    resolver = threading.Thread(target = _resolve, daemon = True)
+    resolver.start()
+    resolver.join(timeout)
+    # No verdict recorded -> lookup still wedged in the thread -> treat as dead.
+    return outcome.get("dead", True)
+
+
+@contextlib.contextmanager
+def _hf_offline_if_dns_dead():
+    """Export HF_HUB_OFFLINE=1 (and TRANSFORMERS_OFFLINE=1 if unset) for the
+    enclosed block when huggingface.co fails the DNS probe. Whatever was set
+    here is removed on exit so a transient resolver hiccup can't quarantine
+    the whole process. Yields True if applied, else False (incl. when preset)."""
+    # An explicit user setting wins; otherwise act only if the probe fails.
+    if "HF_HUB_OFFLINE" in os.environ or not _probe_dns_dead():
+        yield False
+        return
+
+    # Track exactly the variables introduced here so only those are removed.
+    introduced = ["HF_HUB_OFFLINE"]
+    if "TRANSFORMERS_OFFLINE" not in os.environ:
+        introduced.append("TRANSFORMERS_OFFLINE")
+    for name in introduced:
+        os.environ[name] = "1"
+    logger.warning("huggingface.co unreachable; using local HF cache for this load.")
+    try:
+        yield True
+    finally:
+        # Runs even when the body raises.
+        for name in introduced:
+            os.environ.pop(name, None)
+
+
 def _swa_cache_path() -> Path:
     home = os.environ.get("UNSLOTH_STUDIO_HOME") or os.environ.get("STUDIO_HOME")
     base = Path(home) if home else Path.home() / ".unsloth" / "studio"
@@ -483,6 +528,9 @@ def __init__(self):
         self._requested_n_ctx: int = 0
         self._stdout_lines: list[str] = []
         self._stdout_thread: Optional[threading.Thread] = None
+        # llama-server tee log (see _drain_stdout / _kill_process).
+        self._llama_log_fh = None
+        self._llama_log_path: Optional[Path] = None
         self._cancel_event = threading.Event()
         self._api_key: Optional[str] = None
 
@@ -1462,6 +1510,11 @@ def _drain_stdout(self):
         This prevents a pipe-buffer deadlock on Windows where the default
         pipe buffer is only ~4 KB.  Without draining, llama-server blocks
         on writes and never becomes healthy.
+
+        Each line is also teed to ``self._llama_log_fh`` when set so a
+        post-mortem (especially in CI) has the full subprocess output
+        even if the crash predates the drain-thread join in
+        ``_wait_for_health``.
         """
         try:
             for line in self._process.stdout:
@@ -1469,6 +1522,14 @@ def _drain_stdout(self):
                 if line:
                     self._stdout_lines.append(line)
                     logger.debug(f"[llama-server] {line}")
+                    fh = getattr(self, "_llama_log_fh", None)
+                    if fh is not None:
+                        try:
+                            fh.write(line + "\n")
+                            fh.flush()
+                        except (ValueError, OSError):
+                            # Log file closed under us; tee silently.
+                            pass
         except (ValueError, OSError):
             # Pipe closed — process is terminating
             pass
@@ -1804,6 +1865,55 @@ def _download_gguf(
             except Exception as e:
                 logger.warning(f"Could not list repo files: {e}")
 
+            # Offline: resolve variant -> filename from the local HF cache.
+            # The heuristic below assumes filenames echo the repo name,
+            # which breaks for e.g. Qwen3.6-27B-MTP-GGUF (no "MTP" in file).
+            # Match against the rel path (not just basename) so subdir
+            # layouts like ``BF16/foo.gguf`` are findable.
+            if not gguf_filename:
+                try:
+                    from utils.models.model_config import _iter_hf_cache_snapshots
+
+                    boundary = re.compile(
+                        r"(?<![a-zA-Z0-9])"
+                        + re.escape(hf_variant.lower())
+                        + r"(?![a-zA-Z0-9])"
+                    )
+                    for snap in _iter_hf_cache_snapshots(hf_repo):
+                        matches = sorted(
+                            p.relative_to(snap).as_posix()
+                            for p in snap.rglob("*.gguf")
+                            if "mmproj" not in p.name.lower()
+                            and boundary.search(p.relative_to(snap).as_posix().lower())
+                        )
+                        if not matches:
+                            continue
+                        gguf_filename = matches[0]
+                        m = _SHARD_FULL_RE.match(Path(gguf_filename).name)
+                        if m:
+                            prefix = m.group(1)
+                            total = m.group(3)
+                            sibling_pat = re.compile(
+                                r"^"
+                                + re.escape(prefix)
+                                + r"-\d{5}-of-"
+                                + re.escape(total)
+                                + r"\.gguf$"
+                            )
+                            gguf_extra_shards = [
+                                f
+                                for f in matches[1:]
+                                if sibling_pat.match(Path(f).name)
+                            ]
+                        logger.info(
+                            "Resolved variant %s -> %s from local HF cache",
+                            hf_variant,
+                            gguf_filename,
+                        )
+                        break
+                except Exception as e:
+                    logger.debug(f"Offline cache lookup for variant failed: {e}")
+
             if not gguf_filename:
                 repo_name = hf_repo.split("/")[-1].replace("-GGUF", "")
                 gguf_filename = f"{repo_name}-{hf_variant}.gguf"
@@ -1811,8 +1921,6 @@ def _download_gguf(
         # Check disk space and fall back to a smaller variant if needed
         all_gguf_files = [gguf_filename] + gguf_extra_shards
         try:
-            import os
-
             from huggingface_hub import get_paths_info, try_to_load_from_cache
 
             path_infos = list(get_paths_info(hf_repo, all_gguf_files, token = hf_token))
@@ -1946,24 +2054,50 @@ def _download_mmproj(
         Prefers mmproj-F16.gguf, falls back to any mmproj*.gguf file.
         Returns the local path, or None if no mmproj file exists.
         """
-        try:
-            from huggingface_hub import hf_hub_download, list_repo_files
 
-            files = list_repo_files(hf_repo, token = hf_token)
+        def _pick_mmproj(candidates: list[str]) -> Optional[str]:
             mmproj_files = sorted(
-                f for f in files if f.endswith(".gguf") and "mmproj" in f.lower()
+                f
+                for f in candidates
+                if f.lower().endswith(".gguf") and "mmproj" in Path(f).name.lower()
             )
             if not mmproj_files:
                 return None
-
-            # Prefer F16 variant
-            target = None
             for f in mmproj_files:
                 if f.lower().endswith("-f16.gguf"):
-                    target = f
-                    break
-            if target is None:
-                target = mmproj_files[0]
+                    return f
+            return mmproj_files[0]
+
+        target: Optional[str] = None
+        try:
+            from huggingface_hub import list_repo_files
+
+            target = _pick_mmproj(list_repo_files(hf_repo, token = hf_token))
+        except Exception as e:
+            logger.debug(f"Could not list repo files for mmproj: {e}")
+
+        # Offline: resolve mmproj from the local HF cache snapshot, same
+        # shape as _download_gguf's offline fallback above.
+        if target is None:
+            try:
+                from utils.models.model_config import _iter_hf_cache_snapshots
+
+                for snap in _iter_hf_cache_snapshots(hf_repo):
+                    rel_files = [
+                        p.relative_to(snap).as_posix() for p in snap.rglob("*.gguf")
+                    ]
+                    target = _pick_mmproj(rel_files)
+                    if target is not None:
+                        logger.info("Resolved mmproj %s from local HF cache", target)
+                        break
+            except Exception as e:
+                logger.debug(f"Offline cache lookup for mmproj failed: {e}")
+
+        if target is None:
+            return None
+
+        try:
+            from huggingface_hub import hf_hub_download
 
             logger.info(f"Downloading mmproj: {hf_repo}/{target}")
             local_path = hf_hub_download(
@@ -2052,18 +2186,22 @@ def load_model(
                 )
 
             # ── Phase 2: download (NO lock held, so cancel can proceed) ──
+            # Scope HF_HUB_OFFLINE to the download block only when DNS is
+            # dead; cleanup runs even on exception so a transient hiccup
+            # at the start of one load cannot quarantine future loads.
             if hf_repo:
-                model_path = self._download_gguf(
-                    hf_repo = hf_repo,
-                    hf_variant = hf_variant,
-                    hf_token = hf_token,
-                )
-                # Auto-download mmproj for vision models
-                if is_vision and not mmproj_path:
-                    mmproj_path = self._download_mmproj(
+                with _hf_offline_if_dns_dead():
+                    model_path = self._download_gguf(
                         hf_repo = hf_repo,
+                        hf_variant = hf_variant,
                         hf_token = hf_token,
                     )
+                    # Auto-download mmproj for vision models
+                    if is_vision and not mmproj_path:
+                        mmproj_path = self._download_mmproj(
+                            hf_repo = hf_repo,
+                            hf_token = hf_token,
+                        )
             elif gguf_path:
                 if not Path(gguf_path).is_file():
                     raise FileNotFoundError(f"GGUF file not found: {gguf_path}")
@@ -2603,6 +2741,30 @@ def load_model(
                 self._kill_process()
 
                 self._stdout_lines = []
+                # Tee llama-server output to a dedicated log file so a
+                # post-mortem in CI (or after a remote-debug session)
+                # has the full subprocess trail even when the parent
+                # only stored the last 50 lines. Path lives under the
+                # studio home so it ships in the same place all other
+                # Studio logs live.
+                self._llama_log_fh = None
+                try:
+                    log_dir = _swa_cache_path().parent / "logs" / "llama-server"
+                    log_dir.mkdir(parents = True, exist_ok = True)
+                    self._llama_log_path = (
+                        log_dir / f"llama-{int(time.time())}-port-{self._port}.log"
+                    )
+                    self._llama_log_fh = open(
+                        self._llama_log_path,
+                        "w",
+                        encoding = "utf-8",
+                        buffering = 1,
+                    )
+                    logger.info(f"llama-server stdout/stderr -> {self._llama_log_path}")
+                except OSError as e:
+                    # Best-effort; never block the load on logging.
+                    logger.debug(f"Could not open llama-server log file: {e}")
+                    self._llama_log_path = None
                 self._process = subprocess.Popen(
                     cmd,
                     stdout = subprocess.PIPE,
@@ -2899,6 +3061,13 @@ def _kill_process(self):
             if self._stdout_thread is not None:
                 self._stdout_thread.join(timeout = 2)
                 self._stdout_thread = None
+            fh = getattr(self, "_llama_log_fh", None)
+            if fh is not None:
+                try:
+                    fh.close()
+                except Exception:
+                    pass
+                self._llama_log_fh = None
 
     @staticmethod
     def _kill_orphaned_servers():
@@ -3110,7 +3279,17 @@ def _wait_for_health(self, timeout: float = 120.0, interval: float = 0.5) -> boo
                 resp = httpx.get(url, timeout = 2.0)
                 if resp.status_code == 200:
                     return True
-            except (httpx.ConnectError, httpx.TimeoutException):
+            except (
+                httpx.ConnectError,
+                httpx.TimeoutException,
+                # ReadError covers TCP RST mid-read while llama-server is
+                # still binding the port (Windows: WinError 10054). The
+                # crash-detection branch above catches a real exit; this
+                # one keeps a transient socket close from masking it.
+                httpx.ReadError,
+                httpx.RemoteProtocolError,
+                httpx.WriteError,
+            ):
                 pass
 
             time.sleep(interval)
diff --git a/studio/backend/core/inference/worker.py b/studio/backend/core/inference/worker.py
index 085a1ab899..cacede2d3e 100644
--- a/studio/backend/core/inference/worker.py
+++ b/studio/backend/core/inference/worker.py
@@ -648,6 +648,36 @@ def run_inference_process(
         os.environ["HF_HUB_DISABLE_XET"] = "1"
         logger.info("Xet transport disabled (HF_HUB_DISABLE_XET=1)")
 
+    # Offline auto-detect: skip 25s of hf_hub_download retries per file
+    # if DNS is dead; cached files resolve instantly under HF_HUB_OFFLINE=1.
+    # Scope is this subprocess only -- orchestrator spawns a fresh worker
+    # per load (see core/inference/orchestrator.py), so the env cannot
+    # persist across loads.
+    if "HF_HUB_OFFLINE" not in os.environ:
+        import socket as _socket
+        import threading as _threading
+
+        # Probe on a daemon thread so concurrent sockets in the parent
+        # interpreter are not affected by socket.setdefaulttimeout.
+        _result: list = [None]
+
+        def _probe() -> None:
+            try:
+                _socket.gethostbyname("huggingface.co")
+                _result[0] = False
+            except Exception:
+                _result[0] = True
+
+        _t = _threading.Thread(target = _probe, daemon = True)
+        _t.start()
+        _t.join(2.0)
+        if _result[0] is None or _result[0] is True:
+            os.environ["HF_HUB_OFFLINE"] = "1"
+            os.environ.setdefault("TRANSFORMERS_OFFLINE", "1")
+            logger.warning(
+                "huggingface.co unreachable; HF_HUB_OFFLINE=1 set for this worker."
+            )
+
     import warnings
     from loggers.config import LogConfig
 
diff --git a/studio/backend/tests/test_offline_gguf_cache_fallback.py b/studio/backend/tests/test_offline_gguf_cache_fallback.py
new file mode 100644
index 0000000000..d3b2f553a2
--- /dev/null
+++ b/studio/backend/tests/test_offline_gguf_cache_fallback.py
@@ -0,0 +1,828 @@
+# SPDX-License-Identifier: AGPL-3.0-only
+# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
+
+"""Regression tests for the offline GGUF cache fallback path (#5505).
+
+Three failure modes hit users when ``huggingface.co`` is unreachable
+but the requested GGUF repo is fully cached locally:
+
+* ``list_gguf_variants`` raised through ``HTTPException(500)`` so the
+  variant dropdown sat empty.
+* ``detect_gguf_model_remote`` returned ``None`` so a GGUF-only repo
+  was misrouted into the transformers/Unsloth backend (on macOS this
+  surfaced as a hardware error).
+* ``_download_gguf`` fell back to a synthetic ``{repo}-{variant}.gguf``
+  name that did not exist in cache when the in-repo filename did not
+  echo the repo name (e.g. ``unsloth/Qwen3.6-27B-MTP-GGUF`` ships
+  ``Qwen3.6-27B-UD-Q4_K_XL.gguf`` with no ``MTP`` token).
+
+Two follow-up regressions covered here:
+
+* P1 #1: the cache-side variant filter must match the snapshot-relative
+  path, not just the basename, so subdir layouts like
+  ``BF16/foo.gguf`` are findable.
+* P1 #2: the DNS auto-detect must scope ``HF_HUB_OFFLINE`` to one load
+  via try/finally so a transient resolver hiccup cannot lock the
+  long-lived ``LlamaCppBackend`` singleton offline forever.
+
+No GPU, no network, no subprocess. Linux, macOS, Windows compatible.
+"""
+
+from __future__ import annotations
+
+import os
+import socket
+import sys
+import types as _types
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+
+
+_BACKEND_DIR = str(Path(__file__).resolve().parent.parent)
+if _BACKEND_DIR not in sys.path:
+    sys.path.insert(0, _BACKEND_DIR)
+
+# Stub heavy/unavailable external deps before importing the modules
+# under test (same pattern as other studio backend tests).
+_loggers_stub = _types.ModuleType("loggers")
+_loggers_stub.get_logger = lambda name: __import__("logging").getLogger(name)
+sys.modules.setdefault("loggers", _loggers_stub)
+
+_structlog_stub = _types.ModuleType("structlog")
+sys.modules.setdefault("structlog", _structlog_stub)
+
+# Prefer real httpx if installed (CI installs it). Stub only as fallback.
+try:
+    import httpx  # noqa: F401
+except ImportError:
+    _httpx_stub = _types.ModuleType("httpx")
+    for _exc_name in (
+        "ConnectError",
+        "TimeoutException",
+        "ReadTimeout",
+        "ReadError",
+        "RemoteProtocolError",
+        "CloseError",
+        "HTTPError",
+        "RequestError",
+        "HTTPStatusError",
+    ):
+        setattr(_httpx_stub, _exc_name, type(_exc_name, (Exception,), {}))
+    _httpx_stub.Response = type("Response", (), {})
+    _httpx_stub.Request = type("Request", (), {})
+
+    class _FakeTimeout:
+        def __init__(self, *a, **kw):
+            pass
+
+    _httpx_stub.Timeout = _FakeTimeout
+    _httpx_stub.Client = type(
+        "Client",
+        (),
+        {
+            "__init__": lambda self, **kw: None,
+            "__enter__": lambda self: self,
+            "__exit__": lambda self, *a: None,
+        },
+    )
+    sys.modules.setdefault("httpx", _httpx_stub)
+
+
+from huggingface_hub import constants as hf_constants
+
+from core.inference.llama_cpp import (
+    LlamaCppBackend,
+    _hf_offline_if_dns_dead,
+    _probe_dns_dead,
+)
+from utils.models.model_config import (
+    _detect_gguf_from_hf_cache,
+    _extract_quant_label,
+    _iter_hf_cache_snapshots,
+    _list_gguf_variants_from_hf_cache,
+    detect_gguf_model_remote,
+    list_gguf_variants,
+)
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+
+def _build_cache(
+    root: Path,
+    repo_id: str,
+    files: dict[str, int],
+    *,
+    snapshot_sha: str = "a" * 40,
+) -> Path:
+    """Lay out ``$root/models--<repo>/snapshots/<sha>/<rel>`` per entry; return the snapshot dir."""
+    cache_repo = root / ("models--" + repo_id.replace("/", "--"))
+    snapshot_dir = cache_repo / "snapshots" / snapshot_sha
+    for directory in (cache_repo / "blobs", snapshot_dir):
+        directory.mkdir(parents = True, exist_ok = True)
+    for rel_path, n_bytes in files.items():
+        target = snapshot_dir / rel_path
+        target.parent.mkdir(parents = True, exist_ok = True)
+        target.write_bytes(b"\0" * n_bytes)
+    return snapshot_dir
+
+
+@pytest.fixture
+def hf_cache(tmp_path, monkeypatch):
+    """Patch ``hf_constants.HF_HUB_CACHE`` to ``str(tmp_path)``; return ``tmp_path``."""
+    monkeypatch.setattr(hf_constants, "HF_HUB_CACHE", str(tmp_path))
+    return tmp_path
+
+
+@pytest.fixture
+def clean_offline_env(monkeypatch):
+    """Unset ``HF_HUB_OFFLINE`` / ``TRANSFORMERS_OFFLINE`` (if present) for the test."""
+    monkeypatch.delenv("HF_HUB_OFFLINE", raising = False)
+    monkeypatch.delenv("TRANSFORMERS_OFFLINE", raising = False)
+
+
+def _siblings(items: dict[str, int]):
+    """Build a stand-in for a ``model_info`` result exposing only ``siblings``."""
+    entries = []
+    # Each sibling carries just the two attributes set here.
+    for filename, n_bytes in items.items():
+        entry = _types.SimpleNamespace(rfilename = filename, size = n_bytes)
+        entries.append(entry)
+    return _types.SimpleNamespace(siblings = entries)
+
+
+# ---------------------------------------------------------------------------
+# _iter_hf_cache_snapshots
+# ---------------------------------------------------------------------------
+
+
+class TestIterHfCacheSnapshots:
+    def test_returns_empty_when_cache_dir_missing(self, monkeypatch):
+        monkeypatch.setattr(hf_constants, "HF_HUB_CACHE", "/no/such/dir")
+        assert list(_iter_hf_cache_snapshots("unsloth/foo")) == []
+
+    def test_returns_empty_when_repo_not_cached(self, hf_cache):
+        assert list(_iter_hf_cache_snapshots("unsloth/not-here")) == []
+
+    def test_returns_empty_when_snapshots_dir_missing(self, hf_cache):
+        # Repo dir exists but no snapshots/ inside.
+        (hf_cache / "models--unsloth--bare").mkdir()
+        assert list(_iter_hf_cache_snapshots("unsloth/bare")) == []
+
+    def test_yields_newest_first(self, hf_cache):
+        old = _build_cache(
+            hf_cache, "unsloth/multi", {"x.gguf": 1}, snapshot_sha = "a" * 40
+        )
+        new = _build_cache(
+            hf_cache, "unsloth/multi", {"y.gguf": 1}, snapshot_sha = "b" * 40
+        )
+        os.utime(old, (1000, 1000))
+        os.utime(new, (2000, 2000))
+        out = list(_iter_hf_cache_snapshots("unsloth/multi"))
+        assert [p.name for p in out] == ["b" * 40, "a" * 40]
+
+    def test_repo_id_match_is_case_insensitive(self, hf_cache):
+        _build_cache(hf_cache, "unsloth/Foo-GGUF", {"Foo-Q4_K_M.gguf": 1})
+        # Lookup with a different casing of the org/name still resolves
+        out = list(_iter_hf_cache_snapshots("UNSLOTH/foo-gguf"))
+        assert len(out) == 1
+
+
+# ---------------------------------------------------------------------------
+# _list_gguf_variants_from_hf_cache / list_gguf_variants
+# ---------------------------------------------------------------------------
+
+
+class TestListGgufVariantsFromCache:
+    def test_returns_variants_when_cached(self, hf_cache):
+        _build_cache(
+            hf_cache,
+            "unsloth/Qwen3.5-4B-GGUF",
+            {
+                "Qwen3.5-4B-UD-Q4_K_XL.gguf": 100,
+                "Qwen3.5-4B-Q2_K.gguf": 50,
+            },
+        )
+        out = _list_gguf_variants_from_hf_cache("unsloth/Qwen3.5-4B-GGUF")
+        assert out is not None
+        variants, has_vision = out
+        assert sorted(v.quant for v in variants) == ["Q2_K", "UD-Q4_K_XL"]
+        assert has_vision is False
+
+    def test_returns_none_when_not_cached(self, hf_cache):
+        assert _list_gguf_variants_from_hf_cache("unsloth/absent") is None
+
+
+class TestListGgufVariantsOffline:
+    """Offline-env and API-failure paths of ``list_gguf_variants``."""
+
+    def test_offline_env_short_circuits_api(
+        self, hf_cache, clean_offline_env, monkeypatch
+    ):
+        _build_cache(hf_cache, "unsloth/a", {"a-UD-Q4_K_XL.gguf": 1})
+        monkeypatch.setenv("HF_HUB_OFFLINE", "1")
+
+        def forbidden(*args, **kwargs):
+            raise AssertionError("API must not be called when offline env set")
+
+        with patch("huggingface_hub.model_info", forbidden):
+            found, _vision = list_gguf_variants("unsloth/a")
+        # The single cached variant came back without touching the API.
+        assert [item.quant for item in found] == ["UD-Q4_K_XL"]
+
+    def test_api_exception_falls_back_to_cache(self, hf_cache, clean_offline_env):
+        _build_cache(hf_cache, "unsloth/a", {"a-Q4_K_M.gguf": 1})
+
+        def unreachable(*args, **kwargs):
+            raise OSError("network down")
+
+        with patch("huggingface_hub.model_info", unreachable):
+            found, _vision = list_gguf_variants("unsloth/a")
+        # The cached snapshot is served in place of the failed API call.
+        assert [item.quant for item in found] == ["Q4_K_M"]
+
+    def test_api_exception_with_no_cache_reraises(self, hf_cache, clean_offline_env):
+        def unreachable(*args, **kwargs):
+            raise OSError("network down")
+
+        # Nothing is cached for this repo, so the original error surfaces.
+        expect_error = pytest.raises(OSError, match = "network down")
+        with patch("huggingface_hub.model_info", unreachable), expect_error:
+            list_gguf_variants("unsloth/never-cached")
+
+    def test_online_path_unaffected(self, hf_cache, clean_offline_env):
+        """A successful API call supplies the variants directly."""
+        listing = _siblings({"a-UD-Q4_K_XL.gguf": 5, "a-Q2_K.gguf": 3})
+
+        def reachable(*args, **kwargs):
+            return listing
+
+        with patch("huggingface_hub.model_info", reachable):
+            found, _vision = list_gguf_variants("unsloth/a")
+        quants = sorted(item.quant for item in found)
+        assert quants == ["Q2_K", "UD-Q4_K_XL"]
+
+
+# ---------------------------------------------------------------------------
+# _detect_gguf_from_hf_cache / detect_gguf_model_remote
+# ---------------------------------------------------------------------------
+
+
+class TestDetectGgufFromCache:
+    """Cache-only GGUF detection via ``_detect_gguf_from_hf_cache``."""
+
+    def test_picks_best_quant(self, hf_cache):
+        cached_files = {"a-Q2_K.gguf": 1, "a-UD-Q4_K_XL.gguf": 1}
+        _build_cache(hf_cache, "unsloth/a", cached_files)
+        picked = _detect_gguf_from_hf_cache("unsloth/a")
+        assert picked == "a-UD-Q4_K_XL.gguf"
+
+    def test_subdir_only_quant_resolves(self, hf_cache):
+        """Regression for P1 #1: quant present only in the directory name.
+
+        With a ``BF16/foo.gguf`` layout the offline cache scan used to
+        match on basename, miss the file, and fall through to the
+        synthetic ``{repo}-{variant}.gguf`` heuristic.
+        """
+        repo = "unsloth/gpt-oss-20b-BF16"
+        _build_cache(hf_cache, repo, {"BF16/foo.gguf": 1})
+        out = _detect_gguf_from_hf_cache(repo)
+        # Expect the snapshot-relative path, directory included.
+        message = f"subdir-only layout must resolve to relative path, got {out}"
+        assert out == "BF16/foo.gguf", message
+
+    def test_returns_none_when_no_gguf(self, hf_cache):
+        """A snapshot without any ``.gguf`` file gives ``None``."""
+        _build_cache(hf_cache, "unsloth/a", {"README.md": 10})
+        assert _detect_gguf_from_hf_cache("unsloth/a") is None
+
+
+class TestDetectGgufModelRemoteOffline:
+    """Cache fallback rules for ``detect_gguf_model_remote``."""
+
+    def test_offline_env_short_circuits_retries(
+        self, hf_cache, clean_offline_env, monkeypatch
+    ):
+        """With the offline env set, the API must never be called."""
+        _build_cache(hf_cache, "unsloth/a", {"a-Q4_K_M.gguf": 1})
+        monkeypatch.setenv("HF_HUB_OFFLINE", "1")
+
+        def forbidden(*args, **kwargs):
+            raise AssertionError("API must not be called when offline env set")
+
+        with patch("huggingface_hub.model_info", forbidden):
+            detected = detect_gguf_model_remote("unsloth/a")
+        # The filename was resolved from the cache built above.
+        assert detected == "a-Q4_K_M.gguf"
+
+    def test_api_3x_failure_then_cache(self, hf_cache, clean_offline_env):
+        """Once every API attempt has failed, the cache supplies the file."""
+        _build_cache(hf_cache, "unsloth/a", {"a-Q4_K_M.gguf": 1})
+
+        def hub_down(*args, **kwargs):
+            raise OSError("hub down")
+
+        # Stub time.sleep so the 1s/2s/4s backoff doesn't slow the test.
+        no_sleep = patch("time.sleep", lambda *_: None)
+        with patch("huggingface_hub.model_info", hub_down), no_sleep:
+            detected = detect_gguf_model_remote("unsloth/a")
+        assert detected == "a-Q4_K_M.gguf"
+
+    def test_repository_not_found_does_not_consult_cache(
+        self, hf_cache, clean_offline_env
+    ):
+        # A file is cached, yet the API explicitly reports the repo as gone.
+        _build_cache(hf_cache, "unsloth/a", {"a-Q4_K_M.gguf": 1})
+
+        # Local stand-in named like the Hub's own exception class.
+        class RepositoryNotFoundError(Exception):
+            pass
+
+        def repo_gone(*args, **kwargs):
+            raise RepositoryNotFoundError("404")
+
+        with patch("huggingface_hub.model_info", repo_gone):
+            detected = detect_gguf_model_remote("unsloth/a")
+        # Early-return semantics preserved: the 404 wins over a stale cache.
+        assert detected is None
+
+
+# ---------------------------------------------------------------------------
+# _probe_dns_dead / _hf_offline_if_dns_dead
+# ---------------------------------------------------------------------------
+
+
+class _DnsState:
+    """Swaps ``socket.gethostbyname`` for a failing or a succeeding stub."""
+
+    def __init__(self, monkeypatch):
+        self._real = socket.gethostbyname  # resolver in place at construction
+        self._mp = monkeypatch
+
+    def fail(self):
+        def _raise_gaierror(*args, **kwargs):
+            raise socket.gaierror(-2, "Name or service not known")
+
+        self._mp.setattr(socket, "gethostbyname", _raise_gaierror)
+
+    def ok(self):
+        self._mp.setattr(socket, "gethostbyname", lambda *_a, **_k: "127.0.0.1")
+
+    def restore(self):
+        self._mp.setattr(socket, "gethostbyname", self._real)
+
+
+@pytest.fixture
+def dns(monkeypatch):  # stubs are installed through pytest's monkeypatch fixture
+    return _DnsState(monkeypatch = monkeypatch)
+
+
+class TestProbeDnsDead:
+    def test_returns_false_on_success(self, dns):
+        dns.ok()  # resolver stub answers "127.0.0.1"
+        assert _probe_dns_dead() is False
+
+    def test_returns_true_on_failure(self, dns):
+        dns.fail()  # resolver stub raises socket.gaierror
+        assert _probe_dns_dead() is True
+
+    def test_restores_prior_socket_timeout(self, dns):
+        dns.ok()
+        prior = socket.getdefaulttimeout()  # put back afterwards, not forced to None
+        socket.setdefaulttimeout(7.5)
+        try:
+            assert (_probe_dns_dead(), socket.getdefaulttimeout()) == (False, 7.5)
+        finally:
+            socket.setdefaulttimeout(prior)
+
+
+class TestHfOfflineIfDnsDead:
+    """Env handling of the ``_hf_offline_if_dns_dead`` context manager."""
+
+    def test_dns_fail_sets_env_inside_block_only(self, dns, clean_offline_env):
+        """Dead DNS sets both offline variables for the block only."""
+        offline_vars = ("HF_HUB_OFFLINE", "TRANSFORMERS_OFFLINE")
+        dns.fail()
+        assert "HF_HUB_OFFLINE" not in os.environ
+        with _hf_offline_if_dns_dead() as did_set:
+            assert did_set is True
+            for name in offline_vars:
+                assert os.environ.get(name) == "1"
+        # P1 #2: env must be restored after the block
+        for name in offline_vars:
+            assert name not in os.environ
+
+    def test_dns_ok_is_noop(self, dns, clean_offline_env):
+        """With DNS resolving, nothing is set and ``False`` is yielded."""
+        dns.ok()
+        with _hf_offline_if_dns_dead() as did_set:
+            assert did_set is False
+            assert os.environ.get("HF_HUB_OFFLINE") is None
+
+    def test_dns_recovers_between_calls(self, dns, clean_offline_env):
+        """A dead-DNS call followed by a healthy one leaves the env clean."""
+        # Pass 1: DNS dead -> env set inside the block, cleared on exit.
+        dns.fail()
+        with _hf_offline_if_dns_dead():
+            pass
+        assert os.environ.get("HF_HUB_OFFLINE") is None
+        # Pass 2: DNS healthy -> no env mutation.
+        dns.ok()
+        with _hf_offline_if_dns_dead() as did_set:
+            assert did_set is False
+            assert os.environ.get("HF_HUB_OFFLINE") is None
+
+    def test_user_set_hf_hub_offline_is_preserved(
+        self, dns, clean_offline_env, monkeypatch
+    ):
+        """A pre-existing ``HF_HUB_OFFLINE`` is neither claimed nor removed."""
+        # The user explicitly went offline before launching Studio.
+        monkeypatch.setenv("HF_HUB_OFFLINE", "1")
+        dns.fail()
+        with _hf_offline_if_dns_dead() as did_set:
+            assert did_set is False
+            assert os.environ.get("HF_HUB_OFFLINE") == "1"
+        # The helper must not pop a variable it did not set.
+        assert os.environ.get("HF_HUB_OFFLINE") == "1"
+
+    def test_user_set_transformers_offline_is_preserved(
+        self, dns, clean_offline_env, monkeypatch
+    ):
+        """Only the variable the helper itself set is removed on exit."""
+        monkeypatch.setenv("TRANSFORMERS_OFFLINE", "1")
+        dns.fail()
+        with _hf_offline_if_dns_dead():
+            # Inside the block both variables read "1".
+            for name in ("HF_HUB_OFFLINE", "TRANSFORMERS_OFFLINE"):
+                assert os.environ.get(name) == "1"
+        # HF_HUB_OFFLINE was set by the helper -> removed.
+        assert os.environ.get("HF_HUB_OFFLINE") is None
+        # TRANSFORMERS_OFFLINE pre-existed -> preserved.
+        assert os.environ.get("TRANSFORMERS_OFFLINE") == "1"
+
+    def test_exception_inside_block_still_restores_env(self, dns, clean_offline_env):
+        """An exception raised in the block must not skip the cleanup."""
+        dns.fail()
+        with pytest.raises(RuntimeError, match = "boom"):
+            with _hf_offline_if_dns_dead():
+                raise RuntimeError("boom")
+        # Cleanup must also have run on the exception path.
+        for name in ("HF_HUB_OFFLINE", "TRANSFORMERS_OFFLINE"):
+            assert name not in os.environ
+
+
+class TestExtractQuantLabelSubdir:
+    """With no quant token in the basename, ``_extract_quant_label`` must
+    look at the parent directories. Layouts like ``BF16/foo.gguf`` are
+    documented in this codebase and surface through the cache scan."""
+
+    def test_quant_in_basename_unchanged(self):
+        expected = {"BF16/foo-BF16.gguf": "BF16", "model-Q4_K_M.gguf": "Q4_K_M"}
+        assert {name: _extract_quant_label(name) for name in expected} == expected
+
+    def test_quant_only_in_parent_dir(self):
+        label = _extract_quant_label("BF16/foo.gguf")
+        assert label == "BF16"
+
+    def test_ud_prefix_in_parent_dir(self):
+        assert _extract_quant_label("UD-Q4_K_XL/weight.gguf") == "UD-Q4_K_XL"
+
+    def test_deeper_nesting_picks_nearest_quant_dir(self):
+        # If several parent segments could match, the innermost (closest to
+        # the file) wins, as in layouts like ``models/MXFP4_MOE/foo.gguf``.
+        assert _extract_quant_label("models/MXFP4_MOE/foo.gguf") == "MXFP4_MOE"
+
+
+class TestDownloadMmprojOfflineCacheFallback:
+    """Offline, ``LlamaCppBackend._download_mmproj`` must resolve cached
+    mmproj GGUFs the same way ``_download_gguf`` does. Otherwise the
+    offline vision GGUF load path returns ``None`` even though the mmproj
+    is sitting in the cache."""
+
+    def test_cache_lookup_returns_cached_mmproj_when_list_repo_files_fails(
+        self,
+        hf_cache,
+    ):
+        """A failing ``list_repo_files`` still resolves the cached mmproj."""
+        repo = "unsloth/vision-GGUF"
+        cached_files = {
+            "vision-Q4_K_M.gguf": 1,
+            "mmproj-vision-F16.gguf": 1,
+        }
+        _build_cache(hf_cache, repo, cached_files)
+        backend = LlamaCppBackend()
+
+        def listing_fails(*args, **kwargs):
+            raise OSError("offline")
+
+        def echo_download(*, repo_id, filename, token = None):
+            # Hand back a path embedding the requested filename, so the
+            # assertions can see which cached file was resolved.
+            return f"/fake/cache/{repo_id}/{filename}"
+
+        with (
+            patch("huggingface_hub.list_repo_files", listing_fails),
+            patch("huggingface_hub.hf_hub_download", echo_download),
+        ):
+            resolved = backend._download_mmproj(
+                hf_repo = repo,
+                hf_token = None,
+            )
+        assert resolved is not None, "mmproj must resolve from cache when offline"
+        assert "mmproj-vision-F16.gguf" in resolved
+
+    def test_prefers_f16_variant_when_multiple_mmproj_in_cache(self, hf_cache):
+        """Given a BF16 and an F16 mmproj in cache, the F16 one is fetched."""
+        repo = "unsloth/vision-GGUF"
+        cached_files = {
+            "mmproj-vision-BF16.gguf": 1,
+            "mmproj-vision-F16.gguf": 1,
+        }
+        _build_cache(hf_cache, repo, cached_files)
+        backend = LlamaCppBackend()
+        # Filled in by the download stub with the filename it was asked for.
+        requested = {}
+
+        def listing_fails(*args, **kwargs):
+            raise OSError("offline")
+
+        def recording_download(*, repo_id, filename, token = None):
+            requested["filename"] = filename
+            return f"/fake/{filename}"
+
+        with (
+            patch("huggingface_hub.list_repo_files", listing_fails),
+            patch("huggingface_hub.hf_hub_download", recording_download),
+        ):
+            backend._download_mmproj(
+                hf_repo = repo,
+                hf_token = None,
+            )
+        # The filename handed to the download stub must be the F16 projector.
+        assert requested.get("filename") == "mmproj-vision-F16.gguf"
+
+    def test_no_mmproj_in_cache_returns_none(self, hf_cache):
+        """Without a cached projector the lookup yields ``None``."""
+        repo = "unsloth/text-only-GGUF"
+        # The snapshot holds a single non-mmproj GGUF.
+        _build_cache(hf_cache, repo, {"text-Q4_K_M.gguf": 1})
+        backend = LlamaCppBackend()
+
+        def listing_fails(*args, **kwargs):
+            raise OSError("offline")
+
+        with patch("huggingface_hub.list_repo_files", listing_fails):
+            resolved = backend._download_mmproj(
+                hf_repo = repo,
+                hf_token = None,
+            )
+        # No repo listing and no cached projector -> nothing to return.
+        assert resolved is None
+
+
+class TestListLocalGgufVariantsSubdir:
+    """Quant-named subdirectories (``BF16/foo.gguf``, ``Q4_K_M/foo.gguf``)
+    must each yield their own quant label rather than merge on basename."""
+
+    def test_two_subdir_variants_do_not_collapse(self, tmp_path):
+        from utils.models.model_config import list_local_gguf_variants
+
+        (tmp_path / "config.json").write_text("{}")
+        # Same basename in both directories; only the byte counts differ.
+        for subdir, size in (("BF16", 100), ("Q4_K_M", 50)):
+            (tmp_path / subdir).mkdir()
+            (tmp_path / subdir / "foo.gguf").write_bytes(b"\0" * size)
+
+        variants, _vision = list_local_gguf_variants(str(tmp_path))
+        labels = {entry.quant for entry in variants}
+        assert "BF16" in labels, f"BF16 missing from {labels}"
+        assert "Q4_K_M" in labels, f"Q4_K_M missing from {labels}"
+        assert len(variants) == 2
+
+    def test_find_local_gguf_by_variant_locates_subdir(self, tmp_path):
+        from utils.models.model_config import _find_local_gguf_by_variant
+
+        quant_dir = tmp_path / "BF16"
+        (tmp_path / "config.json").write_text("{}")
+        quant_dir.mkdir()
+        (quant_dir / "foo.gguf").write_bytes(b"\0" * 10)
+
+        found = _find_local_gguf_by_variant(str(tmp_path), "BF16")
+        assert found is not None
+        assert Path(found).name == "foo.gguf"
+
+
+class TestListGgufVariantsPermanentErrors:
+    """Only transient failures may use the cache; permanent HF errors surface."""
+
+    def test_repository_not_found_re_raises(self, hf_cache, clean_offline_env):
+        from utils.models.model_config import list_gguf_variants
+
+        _build_cache(hf_cache, "u/repo-gguf", {"foo-Q4_K_M.gguf": 1})
+
+        # ``list_gguf_variants`` classifies errors by class name, so a
+        # local exception carrying the Hub's name is enough to trigger it.
+        fake_error = type("RepositoryNotFoundError", (Exception,), {})
+
+        def repo_missing(*args, **kwargs):
+            raise fake_error("repo deleted")
+
+        # A populated cache must not swallow the permanent error.
+        with patch("huggingface_hub.model_info", repo_missing):
+            with pytest.raises(Exception) as raised:
+                list_gguf_variants("u/repo-gguf")
+        assert type(raised.value).__name__ == "RepositoryNotFoundError"
+
+    def test_gated_repo_re_raises(self, hf_cache, clean_offline_env):
+        from utils.models.model_config import list_gguf_variants
+
+        _build_cache(hf_cache, "u/gated-gguf", {"foo-Q4_K_M.gguf": 1})
+
+        # Stand-in exception: only its class name has to read
+        # ``GatedRepoError`` for the permanent-error branch to fire.
+        fake_error = type("GatedRepoError", (Exception,), {})
+
+        def access_denied(*args, **kwargs):
+            raise fake_error("auth required")
+
+        # The cached snapshot built above must not mask the failure.
+        with patch("huggingface_hub.model_info", access_denied):
+            with pytest.raises(Exception) as raised:
+                list_gguf_variants("u/gated-gguf")
+        assert type(raised.value).__name__ == "GatedRepoError"
+
+    def test_transient_error_still_falls_back_to_cache(
+        self, hf_cache, clean_offline_env
+    ):
+        from utils.models.model_config import list_gguf_variants
+
+        _build_cache(hf_cache, "u/transient-gguf", {"foo-Q4_K_M.gguf": 1})
+
+        def network_down(*args, **kwargs):
+            raise OSError("network down")
+
+        with patch("huggingface_hub.model_info", network_down):
+            variants, _vision = list_gguf_variants("u/transient-gguf")
+        assert "Q4_K_M" in {entry.quant for entry in variants}
+
+
+class TestDetectGgufFromCacheExcludesMmproj:
+    """The vision projector is never routed as the main model, even when
+    a partial cache holds nothing but the projector."""
+
+    def test_mmproj_only_returns_none(self, hf_cache):
+        """A cache holding only the projector yields no main model."""
+        from utils.models.model_config import _detect_gguf_from_hf_cache
+
+        repo = "u/vision-only-mmproj"
+        _build_cache(hf_cache, repo, {"mmproj-vision-F16.gguf": 1})
+        # With the projector filtered out there is nothing left to pick.
+        detected = _detect_gguf_from_hf_cache(repo)
+        assert detected is None
+
+    def test_main_plus_mmproj_returns_main(self, hf_cache):
+        """With both files cached, the non-projector GGUF is returned."""
+        from utils.models.model_config import _detect_gguf_from_hf_cache
+
+        repo = "u/vision-full"
+        cached_files = {
+            "model-Q4_K_M.gguf": 1,
+            "mmproj-vision-F16.gguf": 1,
+        }
+        _build_cache(hf_cache, repo, cached_files)
+
+        detected = _detect_gguf_from_hf_cache(repo)
+        assert detected is not None
+        assert "mmproj" not in detected.lower()
+
+
+class TestProbeDnsDeadNoGlobalTimeoutMutation:
+    """``_probe_dns_dead`` must not change ``socket.setdefaulttimeout``
+    process-wide -- concurrent sockets without explicit timeout would
+    inherit it for the probe window."""
+
+    def test_default_timeout_unchanged_when_dns_up(self, monkeypatch):
+        import socket as _socket
+        from core.inference.llama_cpp import _probe_dns_dead
+
+        prev = _socket.getdefaulttimeout()
+        set_calls = []  # every value handed to setdefaulttimeout during the probe
+
+        original_set = _socket.setdefaulttimeout
+
+        def tracking_set(value):
+            # Record the call, then forward it to the real implementation.
+            set_calls.append(value)
+            original_set(value)
+
+        monkeypatch.setattr(_socket, "setdefaulttimeout", tracking_set)
+        monkeypatch.setattr(_socket, "gethostbyname", lambda h: "127.0.0.1")
+        try:
+            _probe_dns_dead("example.invalid", timeout = 0.5)
+        finally:
+            # Restore exact state regardless of any test-side mutation.
+            original_set(prev)
+
+        assert set_calls == [], (
+            f"_probe_dns_dead mutated socket.setdefaulttimeout {set_calls}; "
+            "must isolate timeout to the probe thread"
+        )
+
+    def test_returns_dead_when_resolver_wedges(self, monkeypatch):
+        import socket as _socket
+        import threading
+        from core.inference.llama_cpp import _probe_dns_dead
+
+        # Simulate a wedged resolver: the call blocks until released below.
+        release = threading.Event()
+        monkeypatch.setattr(_socket, "gethostbyname", lambda host: release.wait())
+        try:
+            assert _probe_dns_dead("example.invalid", timeout = 0.1) is True
+        finally:
+            release.set()  # unblock the stub so no thread stays parked forever
+
+
+class TestWaitForHealthRetriesOnReadError:
+    """A TCP RST mid-read while llama-server is still binding its port
+    (WinError 10054 on Windows) must not abort the health-poll loop;
+    that would report a 'still warming up' state as a fatal load."""
+
+    def test_read_error_then_success(self, monkeypatch):
+        import httpx
+
+        from core.inference.llama_cpp import LlamaCppBackend
+
+        backend = LlamaCppBackend()
+        backend._port = 65500
+
+        class _FakeProc:
+            returncode = None  # matches poll(): no exit code yet
+
+            def poll(self):
+                return None
+
+            def terminate(self):
+                pass
+
+            def kill(self):
+                pass
+
+            def wait(self, timeout = None):
+                return 0
+
+        backend._process = _FakeProc()
+        backend._stdout_thread = None
+        backend._stdout_lines = []
+
+        attempts = {"n": 0}
+        transient = [
+            httpx.ReadError("WinError 10054"),
+            httpx.RemoteProtocolError("short read"),
+            httpx.WriteError("peer dropped"),
+        ]
+
+        class _Healthy:
+            status_code = 200
+
+        def fake_get(url, timeout = None):
+            attempts["n"] += 1
+            if attempts["n"] <= len(transient):
+                raise transient[attempts["n"] - 1]
+            return _Healthy()
+
+        monkeypatch.setattr("core.inference.llama_cpp.httpx.get", fake_get)
+        assert backend._wait_for_health(timeout = 5.0, interval = 0.01) is True
+        assert attempts["n"] == 4, (
+            f"_wait_for_health should retry past ReadError/RemoteProtocol/Write; "
+            f"saw {attempts['n']} attempts"
+        )
+
+    def test_real_process_exit_still_short_circuits(self, monkeypatch):
+        from core.inference.llama_cpp import LlamaCppBackend
+
+        backend = LlamaCppBackend()
+        backend._port = 65501
+
+        class _DeadProc:
+            returncode = 137  # matches poll(): the process has already exited
+
+            def poll(self):
+                return 137
+
+            def terminate(self):
+                pass
+
+            def kill(self):
+                pass
+
+            def wait(self, timeout = None):
+                return 137
+
+        backend._process = _DeadProc()
+        backend._stdout_thread = None
+        backend._stdout_lines = ["fatal: out of memory"]
+        assert backend._wait_for_health(timeout = 5.0, interval = 0.01) is False
diff --git a/studio/backend/utils/models/model_config.py b/studio/backend/utils/models/model_config.py
index bf7f7a009b..2f3bd2431c 100644
--- a/studio/backend/utils/models/model_config.py
+++ b/studio/backend/utils/models/model_config.py
@@ -1259,12 +1259,10 @@ def _extract_quant_label(filename: str) -> str:
     """
     import re
 
-    # Use only the basename (rfilename may include directory)
     basename = filename.rsplit("/", 1)[-1]
     # Strip .gguf and any shard suffix (-00001-of-00010)
     stem = re.sub(r"-\d{3,}-of-\d{3,}", "", basename.rsplit(".", 1)[0])
-    # Match known quantization patterns
-    match = re.search(
+    quant_re = (
         r"(UD-)?"  # Optional UD- prefix (Ultra Discrete)
         r"(MXFP[0-9]+(?:_[A-Z0-9]+)*"  # MXFP variants: MXFP4, MXFP4_MOE
         r"|IQ[0-9]+_[A-Z]+(?:_[A-Z0-9]+)?"  # IQ variants: IQ4_XS, IQ4_NL, IQ1_S
@@ -1272,10 +1270,19 @@ def _extract_quant_label(filename: str) -> str:
         r"|Q[0-9]+_K_[A-Z]+"  # K-quant: Q4_K_M, Q3_K_S
         r"|Q[0-9]+_[0-9]+"  # Standard: Q8_0, Q5_1
         r"|Q[0-9]+_K"  # Short K-quant: Q6_K
-        r"|BF16|F16|F32)",  # Full precision
-        stem,
-        re.IGNORECASE,
+        r"|BF16|F16|F32)"  # Full precision
     )
+    match = re.search(quant_re, stem, re.IGNORECASE)
+    # Subdir layouts like ``BF16/foo.gguf`` keep the quant in the directory,
+    # not the basename. Look at the parent dirs too so the variant label
+    # matches the snapshot-relative path produced elsewhere.
+    if not match and "/" in filename:
+        parents = filename.rsplit("/", 1)[0]
+        for segment in reversed(parents.split("/")):
+            m = re.search(quant_re, segment, re.IGNORECASE)
+            if m:
+                match = m
+                break
     if match:
         prefix = match.group(1) or ""
         return f"{prefix}{match.group(2)}"
@@ -1283,6 +1290,57 @@ def _extract_quant_label(filename: str) -> str:
     return stem.split("-")[-1]
 
 
+def _iter_hf_cache_snapshots(repo_id: str):
+    """Yield HF cache snapshot dirs for *repo_id*, newest first.
+
+    Empty generator if HF_HUB_CACHE is missing, the repo isn't cached,
+    has no snapshots, or the cache can't be read. Repo name match is
+    case-insensitive to handle casing drift between download and lookup.
+    """
+    try:
+        from huggingface_hub import constants as hf_constants
+    except Exception:  # huggingface_hub not importable -> nothing to scan
+        return
+
+    cache_dir = Path(hf_constants.HF_HUB_CACHE)
+    if not cache_dir.is_dir():
+        return
+
+    target = f"models--{repo_id.replace('/', '--')}".lower()
+    repo_dir: Optional[Path] = None
+    try:
+        for entry in cache_dir.iterdir():
+            if entry.is_dir() and entry.name.lower() == target:
+                repo_dir = entry
+                break
+    except OSError:
+        return
+    if repo_dir is None:
+        return
+
+    snapshots = repo_dir / "snapshots"
+    if not snapshots.is_dir():
+        return
+
+    try:
+        snap_dirs = [s for s in snapshots.iterdir() if s.is_dir()]
+        snap_dirs.sort(key = lambda s: s.stat().st_mtime, reverse = True)
+    except OSError:  # covers stat() too: a snapshot may vanish mid-scan
+        return
+    yield from snap_dirs
+
+
+def _list_gguf_variants_from_hf_cache(
+    repo_id: str,
+) -> Optional[tuple[list[GgufVariantInfo], bool]]:
+    """Listing of the first cached snapshot with variants or vision set, else None."""
+    snapshot_paths = map(str, _iter_hf_cache_snapshots(repo_id))
+    for listing in map(list_local_gguf_variants, snapshot_paths):
+        if any(listing):
+            return listing[0], listing[1]
+    return None
+
+
 def list_gguf_variants(
     repo_id: str,
     hf_token: Optional[str] = None,
@@ -1298,7 +1356,40 @@ def list_gguf_variants(
     """
     from huggingface_hub import model_info as hf_model_info
 
-    info = hf_model_info(repo_id, token = hf_token, files_metadata = True)
+    # Offline: skip the API and serve from cache.
+    offline = os.environ.get("HF_HUB_OFFLINE", "").lower() in (
+        "1",
+        "true",
+        "yes",
+    ) or os.environ.get("TRANSFORMERS_OFFLINE", "").lower() in ("1", "true", "yes")
+    if offline:
+        cached = _list_gguf_variants_from_hf_cache(repo_id)
+        if cached is not None:
+            return cached
+
+    try:
+        info = hf_model_info(repo_id, token = hf_token, files_metadata = True)
+    except Exception as e:
+        # Permanent errors (deleted/gated/bad revision) must surface to
+        # the caller; serving stale cache here would mask the real cause.
+        # Matches the early-return in ``detect_gguf_model_remote``.
+        if type(e).__name__ in (
+            "RepositoryNotFoundError",
+            "GatedRepoError",
+            "RevisionNotFoundError",
+            "EntryNotFoundError",
+        ):
+            raise
+        # API failed transiently; fall back to local snapshot if fully downloaded.
+        cached = _list_gguf_variants_from_hf_cache(repo_id)
+        if cached is not None:
+            logger.warning(
+                "HF API unreachable for %s (%s); using local cache snapshot.",
+                repo_id,
+                e.__class__.__name__,
+            )
+            return cached
+        raise
     variants: list[GgufVariantInfo] = []
     has_vision = False
 
@@ -1392,16 +1483,13 @@ def list_local_gguf_variants(
             size = f.stat().st_size
         except OSError:
             size = 0
-        quant = _extract_quant_label(f.name)
+        # Pass the relative path so ``BF16/foo.gguf`` and ``Q4_K_M/foo.gguf``
+        # produce distinct quant labels instead of collapsing on basename.
+        rel = f.relative_to(p).as_posix()
+        quant = _extract_quant_label(rel)
         quant_totals[quant] = quant_totals.get(quant, 0) + size
-        # Only compute the (potentially expensive) relative path when this
-        # is the first file we've seen for this quant -- after that we'd
-        # discard the result anyway. Use posix-style separators so the
-        # filename matches what ``list_gguf_variants`` (the remote HF
-        # API path) returns on every platform; otherwise Windows would
-        # emit ``BF16\foo.gguf`` here.
         if quant not in quant_first_file:
-            quant_first_file[quant] = f.relative_to(p).as_posix()
+            quant_first_file[quant] = rel
 
     variants = [
         GgufVariantInfo(
@@ -1429,16 +1517,36 @@ def _find_local_gguf_by_variant(directory: str, variant: str) -> Optional[str]:
 
     # Recurse into subdirectories so variants stored under a quant-named
     # subdir (e.g. ``BF16/foo-BF16-00001-of-00002.gguf``) are found.
+    # Match against the relative path so the quant label can come from
+    # the directory name when the basename omits it.
     matches = sorted(
         f
         for f in _iter_gguf_files(p, recursive = True)
-        if not _is_mmproj(f.name) and _extract_quant_label(f.name) == variant
+        if not _is_mmproj(f.name)
+        and _extract_quant_label(f.relative_to(p).as_posix()) == variant
     )
     if matches:
         return str(matches[0].resolve())
     return None
 
 
+def _detect_gguf_from_hf_cache(repo_id: str) -> Optional[str]:
+    """Pick the best cached GGUF for *repo_id*; None when nothing qualifies.
+
+    mmproj (vision projector) files are filtered out first, so a cache
+    holding only the projector never reports it as the main model.
+    """
+    for snapshot in _iter_hf_cache_snapshots(repo_id):
+        candidates = []
+        for gguf in _iter_gguf_files(snapshot, recursive = True):
+            if _is_mmproj(gguf.name):
+                continue
+            candidates.append(gguf.relative_to(snapshot).as_posix())
+        if candidates:
+            return _pick_best_gguf(candidates)
+    return None
+
+
 def detect_gguf_model_remote(
     repo_id: str,
     hf_token: Optional[str] = None,
@@ -1455,10 +1563,23 @@ def detect_gguf_model_remote(
     through to the MLX backend, which then fails opening a non-existent
     config.json on the GGUF-only repo. Three attempts with 1s/2s/4s
     backoff covers the typical free-runner HF Hub flakiness.
+
+    When offline, falls back to the local HF cache so a downloaded
+    repo is still routed to llama-server (not MLX/Unsloth).
     """
     import time
     from huggingface_hub import model_info as hf_model_info
 
+    offline = os.environ.get("HF_HUB_OFFLINE", "").lower() in (
+        "1",
+        "true",
+        "yes",
+    ) or os.environ.get("TRANSFORMERS_OFFLINE", "").lower() in ("1", "true", "yes")
+    if offline:
+        cached = _detect_gguf_from_hf_cache(repo_id)
+        if cached is not None:
+            return cached
+
     last_err: Optional[Exception] = None
     for attempt in range(3):
         try:
@@ -1479,6 +1600,17 @@ def detect_gguf_model_remote(
                 return None
             if attempt < 2:
                 time.sleep(2**attempt)
+
+    # All attempts failed; fall back to local cache for offline users.
+    cached = _detect_gguf_from_hf_cache(repo_id)
+    if cached is not None:
+        logger.warning(
+            "HF API unreachable for '%s' (%s); using local cache to detect GGUF.",
+            repo_id,
+            type(last_err).__name__ if last_err else "unknown",
+        )
+        return cached
+
     logger.warning(
         f"Could not check GGUF files for '{repo_id}' after 3 attempts: {last_err}"
     )