diff --git a/.github/workflows/studio-windows-inference-smoke.yml b/.github/workflows/studio-windows-inference-smoke.yml index 01bf4127a7..2acc782984 100644 --- a/.github/workflows/studio-windows-inference-smoke.yml +++ b/.github/workflows/studio-windows-inference-smoke.yml @@ -258,11 +258,26 @@ jobs: - name: Load the GGUF (HF repo + variant, served from HF_HOME cache) run: | - curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \ - -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \ - --max-time 600 \ - -d "{\"model_path\":\"$GGUF_REPO\",\"gguf_variant\":\"$GGUF_VARIANT\",\"is_lora\":false,\"max_seq_length\":2048}" \ - | jq '{status, display_name, is_gguf, context_length}' + # Retry the load step a few times so a transient TCP RST during + # llama-server warm-up (Windows runner image churn, + # windows-latest -> windows-2025-vs2026 rollout) doesn't fail + # the whole job. The Studio backend's _wait_for_health now + # catches httpx.ReadError too; this retry layer covers the + # cases the backend can't recover from on its own. 
+ LOAD_OK=0 + for attempt in 1 2 3; do + HTTP=$(curl -s -o /tmp/load.json -w '%{http_code}' \ + -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \ + -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \ + --max-time 600 \ + -d "{\"model_path\":\"$GGUF_REPO\",\"gguf_variant\":\"$GGUF_VARIANT\",\"is_lora\":false,\"max_seq_length\":2048}") || true # curl transport errors print 000; keep errexit from aborting so the loop can retry + if [ "$HTTP" = "200" ]; then LOAD_OK=1; break; fi + echo "::warning::/api/inference/load attempt $attempt returned $HTTP; response:" + cat /tmp/load.json || true + sleep 10 + done + [ "$LOAD_OK" = "1" ] || { echo "::error::/api/inference/load failed 3 attempts"; exit 22; } + jq '{status, display_name, is_gguf, context_length}' /tmp/load.json - name: Multi-turn determinism via OpenAI + Anthropic SDKs env: @@ -350,6 +365,19 @@ jobs: shell: cmd run: echo Stop Studio (no-op; runner reclaims STUDIO_PID=%STUDIO_PID% at job end) + - name: Collect llama-server logs + if: always() + shell: bash + # Copy llama-server's own stdout/stderr (teed by Studio under + # ~/.unsloth/studio/logs/llama-server/) into the workspace so + # upload-artifact can pick it up. Crucial for diagnosing a + # subprocess crash where Studio's traceback only shows the + # symptom (httpx ReadError) but not the cause. + run: | + mkdir -p logs/llama-server + cp -v ~/.unsloth/studio/logs/llama-server/*.log logs/llama-server/ 2>/dev/null || \ + echo "no llama-server logs to collect" + - name: Upload logs if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -358,6 +386,7 @@ jobs: path: | logs/studio.log logs/install.log + logs/llama-server/*.log retention-days: 7 # ───────────────────────────────────────────────────────────────────── @@ -561,11 +590,21 @@ jobs: # a normal path. 
GGUF_PATH="${GITHUB_WORKSPACE//\\//}/gguf-cache/${GGUF_FILE}" ls -lh "$GGUF_PATH" - curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \ - -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \ - --max-time 600 \ - -d "{\"model_path\":\"$GGUF_PATH\",\"is_lora\":false,\"max_seq_length\":2048}" \ - | jq '{status, display_name}' + # Retry: same rationale as the OpenAI/Anthropic job. + LOAD_OK=0 + for attempt in 1 2 3; do + HTTP=$(curl -s -o /tmp/load.json -w '%{http_code}' \ + -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \ + -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \ + --max-time 600 \ + -d "{\"model_path\":\"$GGUF_PATH\",\"is_lora\":false,\"max_seq_length\":2048}") || true # curl transport errors print 000; keep errexit from aborting so the loop can retry + if [ "$HTTP" = "200" ]; then LOAD_OK=1; break; fi + echo "::warning::/api/inference/load attempt $attempt returned $HTTP; response:" + cat /tmp/load.json || true + sleep 10 + done + [ "$LOAD_OK" = "1" ] || { echo "::error::/api/inference/load failed 3 attempts"; exit 22; } + jq '{status, display_name}' /tmp/load.json - name: Tool calling, server-side tools, thinking on/off env: @@ -768,6 +807,19 @@ jobs: shell: cmd run: echo Stop Studio (no-op; runner reclaims STUDIO_PID=%STUDIO_PID% at job end) + - name: Collect llama-server logs + if: always() + shell: bash + # Copy llama-server's own stdout/stderr (teed by Studio under + # ~/.unsloth/studio/logs/llama-server/) into the workspace so + # upload-artifact can pick it up. Crucial for diagnosing a + # subprocess crash where Studio's traceback only shows the + # symptom (httpx ReadError) but not the cause. 
+ run: | + mkdir -p logs/llama-server + cp -v ~/.unsloth/studio/logs/llama-server/*.log logs/llama-server/ 2>/dev/null || \ + echo "no llama-server logs to collect" + - name: Upload logs if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -776,6 +828,7 @@ jobs: path: | logs/studio.log logs/install.log + logs/llama-server/*.log retention-days: 7 # ───────────────────────────────────────────────────────────────────── @@ -970,11 +1023,21 @@ jobs: -H 'content-type: application/json' \ -d "{\"username\":\"unsloth\",\"password\":\"$NEW\"}" | jq -r .access_token) echo "API_KEY=$TOKEN" >> "$GITHUB_ENV" - curl -fs -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \ - -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \ - --max-time 900 \ - -d "{\"model_path\":\"$GGUF_REPO\",\"gguf_variant\":\"$GGUF_VARIANT\",\"is_lora\":false,\"max_seq_length\":2048}" \ - | jq '{status, display_name, is_vision}' + # Retry: same rationale as the OpenAI/Anthropic and Tool calling jobs. 
+ LOAD_OK=0 + for attempt in 1 2 3; do + HTTP=$(curl -s -o /tmp/load.json -w '%{http_code}' \ + -X POST "http://127.0.0.1:${STUDIO_PORT}/api/inference/load" \ + -H "Authorization: Bearer $TOKEN" -H 'content-type: application/json' \ + --max-time 900 \ + -d "{\"model_path\":\"$GGUF_REPO\",\"gguf_variant\":\"$GGUF_VARIANT\",\"is_lora\":false,\"max_seq_length\":2048}") || true # curl transport errors print 000; keep errexit from aborting so the loop can retry + if [ "$HTTP" = "200" ]; then LOAD_OK=1; break; fi + echo "::warning::/api/inference/load attempt $attempt returned $HTTP; response:" + cat /tmp/load.json || true + sleep 10 + done + [ "$LOAD_OK" = "1" ] || { echo "::error::/api/inference/load failed 3 attempts"; exit 22; } + jq '{status, display_name, is_vision}' /tmp/load.json - name: JSON schema decoding + image input env: @@ -1156,6 +1219,19 @@ jobs: shell: cmd run: echo Stop Studio (no-op; runner reclaims STUDIO_PID=%STUDIO_PID% at job end) + - name: Collect llama-server logs + if: always() + shell: bash + # Copy llama-server's own stdout/stderr (teed by Studio under + # ~/.unsloth/studio/logs/llama-server/) into the workspace so + # upload-artifact can pick it up. Crucial for diagnosing a + # subprocess crash where Studio's traceback only shows the + # symptom (httpx ReadError) but not the cause. 
+ run: | + mkdir -p logs/llama-server + cp -v ~/.unsloth/studio/logs/llama-server/*.log logs/llama-server/ 2>/dev/null || \ + echo "no llama-server logs to collect" + - name: Upload logs if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -1164,4 +1240,5 @@ jobs: path: | logs/studio.log logs/install.log + logs/llama-server/*.log retention-days: 7 diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py index 3682f1dbbb..a4c28166b9 100644 --- a/studio/backend/core/inference/llama_cpp.py +++ b/studio/backend/core/inference/llama_cpp.py @@ -101,6 +101,51 @@ _SWA_CACHE_LOCK = threading.Lock() +def _probe_dns_dead(host: str = "huggingface.co", timeout: float = 2.0) -> bool: + """Quick DNS check. Runs on a daemon thread so concurrent sockets + in the same process are not affected by socket.setdefaulttimeout.""" + result: list[Optional[bool]] = [None] + + def _probe() -> None: + try: + socket.gethostbyname(host) + result[0] = False + except Exception: + result[0] = True + + t = threading.Thread(target = _probe, daemon = True) + t.start() + t.join(timeout) + # Thread still running -> resolver wedged -> treat as dead. + return True if result[0] is None else result[0] + + +@contextlib.contextmanager +def _hf_offline_if_dns_dead(): + """Set HF_HUB_OFFLINE for the body of this block only when DNS to + huggingface.co fails. Restores the env on exit so a transient + resolver hiccup at the start of one load can't quarantine the whole + process. 
Respects an explicit user setting (no-op if already set).""" + if "HF_HUB_OFFLINE" in os.environ: + yield False + return + if not _probe_dns_dead(): + yield False + return + + transformers_was_set = "TRANSFORMERS_OFFLINE" in os.environ + os.environ["HF_HUB_OFFLINE"] = "1" + if not transformers_was_set: + os.environ["TRANSFORMERS_OFFLINE"] = "1" + logger.warning("huggingface.co unreachable; using local HF cache for this load.") + try: + yield True + finally: + os.environ.pop("HF_HUB_OFFLINE", None) + if not transformers_was_set: + os.environ.pop("TRANSFORMERS_OFFLINE", None) + + def _swa_cache_path() -> Path: home = os.environ.get("UNSLOTH_STUDIO_HOME") or os.environ.get("STUDIO_HOME") base = Path(home) if home else Path.home() / ".unsloth" / "studio" @@ -483,6 +528,9 @@ def __init__(self): self._requested_n_ctx: int = 0 self._stdout_lines: list[str] = [] self._stdout_thread: Optional[threading.Thread] = None + # llama-server tee log (see _drain_stdout / _kill_process). + self._llama_log_fh = None + self._llama_log_path: Optional[Path] = None self._cancel_event = threading.Event() self._api_key: Optional[str] = None @@ -1462,6 +1510,11 @@ def _drain_stdout(self): This prevents a pipe-buffer deadlock on Windows where the default pipe buffer is only ~4 KB. Without draining, llama-server blocks on writes and never becomes healthy. + + Each line is also teed to ``self._llama_log_fh`` when set so a + post-mortem (especially in CI) has the full subprocess output + even if the crash predates the drain-thread join in + ``_wait_for_health``. """ try: for line in self._process.stdout: @@ -1469,6 +1522,14 @@ def _drain_stdout(self): if line: self._stdout_lines.append(line) logger.debug(f"[llama-server] {line}") + fh = getattr(self, "_llama_log_fh", None) + if fh is not None: + try: + fh.write(line + "\n") + fh.flush() + except (ValueError, OSError): + # Log file closed under us; tee silently. 
+ pass except (ValueError, OSError): # Pipe closed — process is terminating pass @@ -1804,6 +1865,55 @@ def _download_gguf( except Exception as e: logger.warning(f"Could not list repo files: {e}") + # Offline: resolve variant -> filename from the local HF cache. + # The heuristic below assumes filenames echo the repo name, + # which breaks for e.g. Qwen3.6-27B-MTP-GGUF (no "MTP" in file). + # Match against the rel path (not just basename) so subdir + # layouts like ``BF16/foo.gguf`` are findable. + if not gguf_filename: + try: + from utils.models.model_config import _iter_hf_cache_snapshots + + boundary = re.compile( + r"(? %s from local HF cache", + hf_variant, + gguf_filename, + ) + break + except Exception as e: + logger.debug(f"Offline cache lookup for variant failed: {e}") + if not gguf_filename: repo_name = hf_repo.split("/")[-1].replace("-GGUF", "") gguf_filename = f"{repo_name}-{hf_variant}.gguf" @@ -1811,8 +1921,6 @@ def _download_gguf( # Check disk space and fall back to a smaller variant if needed all_gguf_files = [gguf_filename] + gguf_extra_shards try: - import os - from huggingface_hub import get_paths_info, try_to_load_from_cache path_infos = list(get_paths_info(hf_repo, all_gguf_files, token = hf_token)) @@ -1946,24 +2054,50 @@ def _download_mmproj( Prefers mmproj-F16.gguf, falls back to any mmproj*.gguf file. Returns the local path, or None if no mmproj file exists. 
""" - try: - from huggingface_hub import hf_hub_download, list_repo_files - files = list_repo_files(hf_repo, token = hf_token) + def _pick_mmproj(candidates: list[str]) -> Optional[str]: mmproj_files = sorted( - f for f in files if f.endswith(".gguf") and "mmproj" in f.lower() + f + for f in candidates + if f.lower().endswith(".gguf") and "mmproj" in Path(f).name.lower() ) if not mmproj_files: return None - - # Prefer F16 variant - target = None for f in mmproj_files: if f.lower().endswith("-f16.gguf"): - target = f - break - if target is None: - target = mmproj_files[0] + return f + return mmproj_files[0] + + target: Optional[str] = None + try: + from huggingface_hub import list_repo_files + + target = _pick_mmproj(list_repo_files(hf_repo, token = hf_token)) + except Exception as e: + logger.debug(f"Could not list repo files for mmproj: {e}") + + # Offline: resolve mmproj from the local HF cache snapshot, same + # shape as _download_gguf's offline fallback above. + if target is None: + try: + from utils.models.model_config import _iter_hf_cache_snapshots + + for snap in _iter_hf_cache_snapshots(hf_repo): + rel_files = [ + p.relative_to(snap).as_posix() for p in snap.rglob("*.gguf") + ] + target = _pick_mmproj(rel_files) + if target is not None: + logger.info("Resolved mmproj %s from local HF cache", target) + break + except Exception as e: + logger.debug(f"Offline cache lookup for mmproj failed: {e}") + + if target is None: + return None + + try: + from huggingface_hub import hf_hub_download logger.info(f"Downloading mmproj: {hf_repo}/{target}") local_path = hf_hub_download( @@ -2052,18 +2186,22 @@ def load_model( ) # ── Phase 2: download (NO lock held, so cancel can proceed) ── + # Scope HF_HUB_OFFLINE to the download block only when DNS is + # dead; cleanup runs even on exception so a transient hiccup + # at the start of one load cannot quarantine future loads. 
if hf_repo: - model_path = self._download_gguf( - hf_repo = hf_repo, - hf_variant = hf_variant, - hf_token = hf_token, - ) - # Auto-download mmproj for vision models - if is_vision and not mmproj_path: - mmproj_path = self._download_mmproj( + with _hf_offline_if_dns_dead(): + model_path = self._download_gguf( hf_repo = hf_repo, + hf_variant = hf_variant, hf_token = hf_token, ) + # Auto-download mmproj for vision models + if is_vision and not mmproj_path: + mmproj_path = self._download_mmproj( + hf_repo = hf_repo, + hf_token = hf_token, + ) elif gguf_path: if not Path(gguf_path).is_file(): raise FileNotFoundError(f"GGUF file not found: {gguf_path}") @@ -2603,6 +2741,30 @@ def load_model( self._kill_process() self._stdout_lines = [] + # Tee llama-server output to a dedicated log file so a + # post-mortem in CI (or after a remote-debug session) + # has the full subprocess trail even when the parent + # only stored the last 50 lines. Path lives under the + # studio home so it ships in the same place all other + # Studio logs live. + self._llama_log_fh = None + try: + log_dir = _swa_cache_path().parent / "logs" / "llama-server" + log_dir.mkdir(parents = True, exist_ok = True) + self._llama_log_path = ( + log_dir / f"llama-{int(time.time())}-port-{self._port}.log" + ) + self._llama_log_fh = open( + self._llama_log_path, + "w", + encoding = "utf-8", + buffering = 1, + ) + logger.info(f"llama-server stdout/stderr -> {self._llama_log_path}") + except OSError as e: + # Best-effort; never block the load on logging. 
+ logger.debug(f"Could not open llama-server log file: {e}") + self._llama_log_path = None self._process = subprocess.Popen( cmd, stdout = subprocess.PIPE, @@ -2899,6 +3061,13 @@ def _kill_process(self): if self._stdout_thread is not None: self._stdout_thread.join(timeout = 2) self._stdout_thread = None + fh = getattr(self, "_llama_log_fh", None) + if fh is not None: + try: + fh.close() + except Exception: + pass + self._llama_log_fh = None @staticmethod def _kill_orphaned_servers(): @@ -3110,7 +3279,17 @@ def _wait_for_health(self, timeout: float = 120.0, interval: float = 0.5) -> boo resp = httpx.get(url, timeout = 2.0) if resp.status_code == 200: return True - except (httpx.ConnectError, httpx.TimeoutException): + except ( + httpx.ConnectError, + httpx.TimeoutException, + # ReadError covers TCP RST mid-read while llama-server is + # still binding the port (Windows: WinError 10054). The + # crash-detection branch above catches a real exit; this + # one keeps a transient socket close from masking it. + httpx.ReadError, + httpx.RemoteProtocolError, + httpx.WriteError, + ): pass time.sleep(interval) diff --git a/studio/backend/core/inference/worker.py b/studio/backend/core/inference/worker.py index 085a1ab899..cacede2d3e 100644 --- a/studio/backend/core/inference/worker.py +++ b/studio/backend/core/inference/worker.py @@ -648,6 +648,36 @@ def run_inference_process( os.environ["HF_HUB_DISABLE_XET"] = "1" logger.info("Xet transport disabled (HF_HUB_DISABLE_XET=1)") + # Offline auto-detect: skip 25s of hf_hub_download retries per file + # if DNS is dead; cached files resolve instantly under HF_HUB_OFFLINE=1. + # Scope is this subprocess only -- orchestrator spawns a fresh worker + # per load (see core/inference/orchestrator.py), so the env cannot + # persist across loads. 
+ if "HF_HUB_OFFLINE" not in os.environ: + import socket as _socket + import threading as _threading + + # Probe on a daemon thread so concurrent sockets in the parent + # interpreter are not affected by socket.setdefaulttimeout. + _result: list = [None] + + def _probe() -> None: + try: + _socket.gethostbyname("huggingface.co") + _result[0] = False + except Exception: + _result[0] = True + + _t = _threading.Thread(target = _probe, daemon = True) + _t.start() + _t.join(2.0) + if _result[0] is None or _result[0] is True: + os.environ["HF_HUB_OFFLINE"] = "1" + os.environ.setdefault("TRANSFORMERS_OFFLINE", "1") + logger.warning( + "huggingface.co unreachable; HF_HUB_OFFLINE=1 set for this worker." + ) + import warnings from loggers.config import LogConfig diff --git a/studio/backend/tests/test_offline_gguf_cache_fallback.py b/studio/backend/tests/test_offline_gguf_cache_fallback.py new file mode 100644 index 0000000000..d3b2f553a2 --- /dev/null +++ b/studio/backend/tests/test_offline_gguf_cache_fallback.py @@ -0,0 +1,828 @@ +# SPDX-License-Identifier: AGPL-3.0-only +# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0 + +"""Regression tests for the offline GGUF cache fallback path (#5505). + +Three failure modes hit users when ``huggingface.co`` is unreachable +but the requested GGUF repo is fully cached locally: + +* ``list_gguf_variants`` raised through ``HTTPException(500)`` so the + variant dropdown sat empty. +* ``detect_gguf_model_remote`` returned ``None`` so a GGUF-only repo + was misrouted into the transformers/Unsloth backend (on macOS this + surfaced as a hardware error). +* ``_download_gguf`` fell back to a synthetic ``{repo}-{variant}.gguf`` + name that did not exist in cache when the in-repo filename did not + echo the repo name (e.g. ``unsloth/Qwen3.6-27B-MTP-GGUF`` ships + ``Qwen3.6-27B-UD-Q4_K_XL.gguf`` with no ``MTP`` token). 
+ +Two follow-up regressions covered here: + +* P1 #1: the cache-side variant filter must match the snapshot-relative + path, not just the basename, so subdir layouts like + ``BF16/foo.gguf`` are findable. +* P1 #2: the DNS auto-detect must scope ``HF_HUB_OFFLINE`` to one load + via try/finally so a transient resolver hiccup cannot lock the + long-lived ``LlamaCppBackend`` singleton offline forever. + +No GPU, no network, no subprocess. Linux, macOS, Windows compatible. +""" + +from __future__ import annotations + +import os +import socket +import sys +import types as _types +from pathlib import Path +from unittest.mock import patch + +import pytest + + +_BACKEND_DIR = str(Path(__file__).resolve().parent.parent) +if _BACKEND_DIR not in sys.path: + sys.path.insert(0, _BACKEND_DIR) + +# Stub heavy/unavailable external deps before importing the modules +# under test (same pattern as other studio backend tests). +_loggers_stub = _types.ModuleType("loggers") +_loggers_stub.get_logger = lambda name: __import__("logging").getLogger(name) +sys.modules.setdefault("loggers", _loggers_stub) + +_structlog_stub = _types.ModuleType("structlog") +sys.modules.setdefault("structlog", _structlog_stub) + +# Prefer real httpx if installed (CI installs it). Stub only as fallback. 
+try: + import httpx # noqa: F401 +except ImportError: + _httpx_stub = _types.ModuleType("httpx") + for _exc_name in ( + "ConnectError", + "TimeoutException", + "ReadTimeout", + "ReadError", + "RemoteProtocolError", + "CloseError", + "HTTPError", + "RequestError", + "HTTPStatusError", + ): + setattr(_httpx_stub, _exc_name, type(_exc_name, (Exception,), {})) + _httpx_stub.Response = type("Response", (), {}) + _httpx_stub.Request = type("Request", (), {}) + + class _FakeTimeout: + def __init__(self, *a, **kw): + pass + + _httpx_stub.Timeout = _FakeTimeout + _httpx_stub.Client = type( + "Client", + (), + { + "__init__": lambda self, **kw: None, + "__enter__": lambda self: self, + "__exit__": lambda self, *a: None, + }, + ) + sys.modules.setdefault("httpx", _httpx_stub) + + +from huggingface_hub import constants as hf_constants + +from core.inference.llama_cpp import ( + LlamaCppBackend, + _hf_offline_if_dns_dead, + _probe_dns_dead, +) +from utils.models.model_config import ( + _detect_gguf_from_hf_cache, + _extract_quant_label, + _iter_hf_cache_snapshots, + _list_gguf_variants_from_hf_cache, + detect_gguf_model_remote, + list_gguf_variants, +) + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +def _build_cache( + root: Path, + repo_id: str, + files: dict[str, int], + *, + snapshot_sha: str = "a" * 40, +) -> Path: + """Create ``$root/models--/snapshots//`` for each entry.""" + repo_dir = root / f"models--{repo_id.replace('/', '--')}" + (repo_dir / "blobs").mkdir(parents = True, exist_ok = True) + snap = repo_dir / "snapshots" / snapshot_sha + snap.mkdir(parents = True, exist_ok = True) + for rel, size in files.items(): + full = snap / rel + full.parent.mkdir(parents = True, exist_ok = True) + full.write_bytes(b"\0" * size) + return snap + + +@pytest.fixture +def hf_cache(tmp_path, monkeypatch): + """Point 
``huggingface_hub.constants.HF_HUB_CACHE`` at a temp dir.""" + monkeypatch.setattr(hf_constants, "HF_HUB_CACHE", str(tmp_path)) + return tmp_path + + +@pytest.fixture +def clean_offline_env(monkeypatch): + """Strip ``HF_HUB_OFFLINE`` / ``TRANSFORMERS_OFFLINE`` for the test.""" + monkeypatch.delenv("HF_HUB_OFFLINE", raising = False) + monkeypatch.delenv("TRANSFORMERS_OFFLINE", raising = False) + + +def _siblings(items: dict[str, int]): + """Mock ``hf_model_info(...).siblings`` payload.""" + return _types.SimpleNamespace( + siblings = [ + _types.SimpleNamespace(rfilename = name, size = size) + for name, size in items.items() + ], + ) + + +# --------------------------------------------------------------------------- +# _iter_hf_cache_snapshots +# --------------------------------------------------------------------------- + + +class TestIterHfCacheSnapshots: + def test_returns_empty_when_cache_dir_missing(self, monkeypatch): + monkeypatch.setattr(hf_constants, "HF_HUB_CACHE", "/no/such/dir") + assert list(_iter_hf_cache_snapshots("unsloth/foo")) == [] + + def test_returns_empty_when_repo_not_cached(self, hf_cache): + assert list(_iter_hf_cache_snapshots("unsloth/not-here")) == [] + + def test_returns_empty_when_snapshots_dir_missing(self, hf_cache): + # Repo dir exists but no snapshots/ inside. 
+ (hf_cache / "models--unsloth--bare").mkdir() + assert list(_iter_hf_cache_snapshots("unsloth/bare")) == [] + + def test_yields_newest_first(self, hf_cache): + old = _build_cache( + hf_cache, "unsloth/multi", {"x.gguf": 1}, snapshot_sha = "a" * 40 + ) + new = _build_cache( + hf_cache, "unsloth/multi", {"y.gguf": 1}, snapshot_sha = "b" * 40 + ) + os.utime(old, (1000, 1000)) + os.utime(new, (2000, 2000)) + out = list(_iter_hf_cache_snapshots("unsloth/multi")) + assert [p.name for p in out] == ["b" * 40, "a" * 40] + + def test_repo_id_match_is_case_insensitive(self, hf_cache): + _build_cache(hf_cache, "unsloth/Foo-GGUF", {"Foo-Q4_K_M.gguf": 1}) + # Lookup with a different casing of the org/name still resolves + out = list(_iter_hf_cache_snapshots("UNSLOTH/foo-gguf")) + assert len(out) == 1 + + +# --------------------------------------------------------------------------- +# _list_gguf_variants_from_hf_cache / list_gguf_variants +# --------------------------------------------------------------------------- + + +class TestListGgufVariantsFromCache: + def test_returns_variants_when_cached(self, hf_cache): + _build_cache( + hf_cache, + "unsloth/Qwen3.5-4B-GGUF", + { + "Qwen3.5-4B-UD-Q4_K_XL.gguf": 100, + "Qwen3.5-4B-Q2_K.gguf": 50, + }, + ) + out = _list_gguf_variants_from_hf_cache("unsloth/Qwen3.5-4B-GGUF") + assert out is not None + variants, has_vision = out + assert sorted(v.quant for v in variants) == ["Q2_K", "UD-Q4_K_XL"] + assert has_vision is False + + def test_returns_none_when_not_cached(self, hf_cache): + assert _list_gguf_variants_from_hf_cache("unsloth/absent") is None + + +class TestListGgufVariantsOffline: + def test_offline_env_short_circuits_api( + self, hf_cache, clean_offline_env, monkeypatch + ): + _build_cache(hf_cache, "unsloth/a", {"a-UD-Q4_K_XL.gguf": 1}) + monkeypatch.setenv("HF_HUB_OFFLINE", "1") + + def boom(*a, **k): + raise AssertionError("API must not be called when offline env set") + + with patch("huggingface_hub.model_info", boom): + 
variants, _has = list_gguf_variants("unsloth/a") + assert len(variants) == 1 + assert variants[0].quant == "UD-Q4_K_XL" + + def test_api_exception_falls_back_to_cache( + self, + hf_cache, + clean_offline_env, + ): + _build_cache(hf_cache, "unsloth/a", {"a-Q4_K_M.gguf": 1}) + + def boom(*a, **k): + raise OSError("network down") + + with patch("huggingface_hub.model_info", boom): + variants, _has = list_gguf_variants("unsloth/a") + assert len(variants) == 1 + assert variants[0].quant == "Q4_K_M" + + def test_api_exception_with_no_cache_reraises(self, hf_cache, clean_offline_env): + def boom(*a, **k): + raise OSError("network down") + + with patch("huggingface_hub.model_info", boom): + with pytest.raises(OSError, match = "network down"): + list_gguf_variants("unsloth/never-cached") + + def test_online_path_unaffected(self, hf_cache, clean_offline_env): + # When the API succeeds, cache is not consulted. + api_payload = _siblings({"a-UD-Q4_K_XL.gguf": 5, "a-Q2_K.gguf": 3}) + + def hf_info(*a, **k): + return api_payload + + with patch("huggingface_hub.model_info", hf_info): + variants, _has = list_gguf_variants("unsloth/a") + assert sorted(v.quant for v in variants) == ["Q2_K", "UD-Q4_K_XL"] + + +# --------------------------------------------------------------------------- +# _detect_gguf_from_hf_cache / detect_gguf_model_remote +# --------------------------------------------------------------------------- + + +class TestDetectGgufFromCache: + def test_picks_best_quant(self, hf_cache): + _build_cache( + hf_cache, + "unsloth/a", + {"a-Q2_K.gguf": 1, "a-UD-Q4_K_XL.gguf": 1}, + ) + assert _detect_gguf_from_hf_cache("unsloth/a") == "a-UD-Q4_K_XL.gguf" + + def test_subdir_only_quant_resolves(self, hf_cache): + """P1 #1 regression: ``BF16/foo.gguf`` (quant only in directory). 
+ Before the fix, the offline cache scan matched on basename and + missed this layout, falling through to the synthetic + ``{repo}-{variant}.gguf`` heuristic.""" + _build_cache( + hf_cache, + "unsloth/gpt-oss-20b-BF16", + {"BF16/foo.gguf": 1}, + ) + out = _detect_gguf_from_hf_cache("unsloth/gpt-oss-20b-BF16") + assert ( + out == "BF16/foo.gguf" + ), f"subdir-only layout must resolve to relative path, got {out}" + + def test_returns_none_when_no_gguf(self, hf_cache): + _build_cache(hf_cache, "unsloth/a", {"README.md": 10}) + assert _detect_gguf_from_hf_cache("unsloth/a") is None + + +class TestDetectGgufModelRemoteOffline: + def test_offline_env_short_circuits_retries( + self, + hf_cache, + clean_offline_env, + monkeypatch, + ): + _build_cache(hf_cache, "unsloth/a", {"a-Q4_K_M.gguf": 1}) + monkeypatch.setenv("HF_HUB_OFFLINE", "1") + + def boom(*a, **k): + raise AssertionError("API must not be called when offline env set") + + with patch("huggingface_hub.model_info", boom): + assert detect_gguf_model_remote("unsloth/a") == "a-Q4_K_M.gguf" + + def test_api_3x_failure_then_cache(self, hf_cache, clean_offline_env): + _build_cache(hf_cache, "unsloth/a", {"a-Q4_K_M.gguf": 1}) + + def boom(*a, **k): + raise OSError("hub down") + + # Patch time.sleep so the 1s/2s/4s backoff doesn't slow the test. + with ( + patch("huggingface_hub.model_info", boom), + patch("time.sleep", lambda *_: None), + ): + out = detect_gguf_model_remote("unsloth/a") + assert out == "a-Q4_K_M.gguf" + + def test_repository_not_found_does_not_consult_cache( + self, + hf_cache, + clean_offline_env, + ): + # Cache has a file but the API explicitly says repo is gone. + _build_cache(hf_cache, "unsloth/a", {"a-Q4_K_M.gguf": 1}) + + class RepositoryNotFoundError(Exception): + pass + + def gone(*a, **k): + raise RepositoryNotFoundError("404") + + with patch("huggingface_hub.model_info", gone): + out = detect_gguf_model_remote("unsloth/a") + # Early-return semantics preserved: 404 wins over a stale cache. 
+ assert out is None + + +# --------------------------------------------------------------------------- +# _probe_dns_dead / _hf_offline_if_dns_dead +# --------------------------------------------------------------------------- + + +class _DnsState: + """Tiny helper that toggles ``socket.gethostbyname`` failure mode.""" + + def __init__(self, monkeypatch): + self._mp = monkeypatch + self._real = socket.gethostbyname + + def fail(self): + def _fail(*a, **k): + raise socket.gaierror(-2, "Name or service not known") + + self._mp.setattr(socket, "gethostbyname", _fail) + + def ok(self): + self._mp.setattr(socket, "gethostbyname", lambda *a, **k: "127.0.0.1") + + def restore(self): + self._mp.setattr(socket, "gethostbyname", self._real) + + +@pytest.fixture +def dns(monkeypatch): + return _DnsState(monkeypatch) + + +class TestProbeDnsDead: + def test_returns_false_on_success(self, dns): + dns.ok() + assert _probe_dns_dead() is False + + def test_returns_true_on_failure(self, dns): + dns.fail() + assert _probe_dns_dead() is True + + def test_restores_prior_socket_timeout(self, dns): + dns.ok() + socket.setdefaulttimeout(7.5) + try: + _probe_dns_dead() + assert socket.getdefaulttimeout() == 7.5 + finally: + socket.setdefaulttimeout(None) + + +class TestHfOfflineIfDnsDead: + def test_dns_fail_sets_env_inside_block_only(self, dns, clean_offline_env): + dns.fail() + assert "HF_HUB_OFFLINE" not in os.environ + with _hf_offline_if_dns_dead() as did_set: + assert did_set is True + assert os.environ.get("HF_HUB_OFFLINE") == "1" + assert os.environ.get("TRANSFORMERS_OFFLINE") == "1" + # P1 #2: env must be restored after the block + assert "HF_HUB_OFFLINE" not in os.environ + assert "TRANSFORMERS_OFFLINE" not in os.environ + + def test_dns_ok_is_noop(self, dns, clean_offline_env): + dns.ok() + with _hf_offline_if_dns_dead() as did_set: + assert did_set is False + assert "HF_HUB_OFFLINE" not in os.environ + + def test_dns_recovers_between_calls(self, dns, clean_offline_env): + # 
First call: DNS dead -> env set inside, cleared on exit. + dns.fail() + with _hf_offline_if_dns_dead(): + pass + assert "HF_HUB_OFFLINE" not in os.environ + # Second call: DNS healthy -> no env mutation. + dns.ok() + with _hf_offline_if_dns_dead() as did_set: + assert did_set is False + assert "HF_HUB_OFFLINE" not in os.environ + + def test_user_set_hf_hub_offline_is_preserved( + self, + dns, + clean_offline_env, + monkeypatch, + ): + # User explicitly set offline before launching Studio. + monkeypatch.setenv("HF_HUB_OFFLINE", "1") + dns.fail() + with _hf_offline_if_dns_dead() as did_set: + assert did_set is False + assert os.environ.get("HF_HUB_OFFLINE") == "1" + # Helper must not pop a variable it did not set. + assert os.environ.get("HF_HUB_OFFLINE") == "1" + + def test_user_set_transformers_offline_is_preserved( + self, + dns, + clean_offline_env, + monkeypatch, + ): + monkeypatch.setenv("TRANSFORMERS_OFFLINE", "1") + dns.fail() + with _hf_offline_if_dns_dead(): + assert os.environ.get("HF_HUB_OFFLINE") == "1" + assert os.environ.get("TRANSFORMERS_OFFLINE") == "1" + # HF_HUB_OFFLINE was set by helper -> removed. + assert "HF_HUB_OFFLINE" not in os.environ + # TRANSFORMERS_OFFLINE pre-existed -> preserved. + assert os.environ.get("TRANSFORMERS_OFFLINE") == "1" + + def test_exception_inside_block_still_restores_env( + self, + dns, + clean_offline_env, + ): + dns.fail() + with pytest.raises(RuntimeError, match = "boom"): + with _hf_offline_if_dns_dead(): + raise RuntimeError("boom") + # Cleanup must happen on exception as well. + assert "HF_HUB_OFFLINE" not in os.environ + assert "TRANSFORMERS_OFFLINE" not in os.environ + + +class TestExtractQuantLabelSubdir: + """``_extract_quant_label`` must consider the parent directories when + the basename has no quant token. 
Subdir layouts like ``BF16/foo.gguf`` + are documented in this codebase and surface through the cache scan.""" + + def test_quant_in_basename_unchanged(self): + assert _extract_quant_label("BF16/foo-BF16.gguf") == "BF16" + assert _extract_quant_label("model-Q4_K_M.gguf") == "Q4_K_M" + + def test_quant_only_in_parent_dir(self): + assert _extract_quant_label("BF16/foo.gguf") == "BF16" + + def test_ud_prefix_in_parent_dir(self): + assert _extract_quant_label("UD-Q4_K_XL/weight.gguf") == "UD-Q4_K_XL" + + def test_deeper_nesting_picks_nearest_quant_dir(self): + # When multiple parent segments could match, prefer the one closest + # to the file (innermost). This matches how repos like + # ``models/MXFP4_MOE/foo.gguf`` are laid out. + assert _extract_quant_label("models/MXFP4_MOE/foo.gguf") == "MXFP4_MOE" + + +class TestDownloadMmprojOfflineCacheFallback: + """``LlamaCppBackend._download_mmproj`` must resolve cached mmproj + GGUFs offline, same shape as ``_download_gguf``. Without this the + offline vision GGUF load path returns ``None`` even when the mmproj + is present in cache.""" + + def test_cache_lookup_returns_cached_mmproj_when_list_repo_files_fails( + self, + hf_cache, + ): + _build_cache( + hf_cache, + "unsloth/vision-GGUF", + { + "vision-Q4_K_M.gguf": 1, + "mmproj-vision-F16.gguf": 1, + }, + ) + backend = LlamaCppBackend() + + def boom_list(*a, **k): + raise OSError("offline") + + def fake_download(*, repo_id, filename, token = None): + # Echo back so the test can verify the cache-resolved filename + return f"/fake/cache/{repo_id}/{filename}" + + with ( + patch("huggingface_hub.list_repo_files", boom_list), + patch("huggingface_hub.hf_hub_download", fake_download), + ): + out = backend._download_mmproj( + hf_repo = "unsloth/vision-GGUF", + hf_token = None, + ) + assert out is not None, "mmproj must resolve from cache when offline" + assert "mmproj-vision-F16.gguf" in out + + def test_prefers_f16_variant_when_multiple_mmproj_in_cache(self, hf_cache): + 
_build_cache( + hf_cache, + "unsloth/vision-GGUF", + { + "mmproj-vision-BF16.gguf": 1, + "mmproj-vision-F16.gguf": 1, + }, + ) + backend = LlamaCppBackend() + + def boom_list(*a, **k): + raise OSError("offline") + + captured = {} + + def fake_download(*, repo_id, filename, token = None): + captured["filename"] = filename + return f"/fake/{filename}" + + with ( + patch("huggingface_hub.list_repo_files", boom_list), + patch("huggingface_hub.hf_hub_download", fake_download), + ): + backend._download_mmproj( + hf_repo = "unsloth/vision-GGUF", + hf_token = None, + ) + assert captured.get("filename") == "mmproj-vision-F16.gguf" + + def test_no_mmproj_in_cache_returns_none(self, hf_cache): + _build_cache( + hf_cache, + "unsloth/text-only-GGUF", + {"text-Q4_K_M.gguf": 1}, + ) + backend = LlamaCppBackend() + + def boom_list(*a, **k): + raise OSError("offline") + + with patch("huggingface_hub.list_repo_files", boom_list): + out = backend._download_mmproj( + hf_repo = "unsloth/text-only-GGUF", + hf_token = None, + ) + assert out is None + + +class TestListLocalGgufVariantsSubdir: + """Subdir layouts like ``BF16/foo.gguf`` and ``Q4_K_M/foo.gguf`` must + produce distinct quant labels, not collapse on basename.""" + + def test_two_subdir_variants_do_not_collapse(self, tmp_path): + from utils.models.model_config import list_local_gguf_variants + + (tmp_path / "config.json").write_text("{}") + (tmp_path / "BF16").mkdir() + (tmp_path / "BF16" / "foo.gguf").write_bytes(b"\0" * 100) + (tmp_path / "Q4_K_M").mkdir() + (tmp_path / "Q4_K_M" / "foo.gguf").write_bytes(b"\0" * 50) + + variants, _ = list_local_gguf_variants(str(tmp_path)) + quants = {v.quant for v in variants} + assert "BF16" in quants, f"BF16 missing from {quants}" + assert "Q4_K_M" in quants, f"Q4_K_M missing from {quants}" + assert len(variants) == 2 + + def test_find_local_gguf_by_variant_locates_subdir(self, tmp_path): + from utils.models.model_config import _find_local_gguf_by_variant + + (tmp_path / 
"config.json").write_text("{}") + (tmp_path / "BF16").mkdir() + target = tmp_path / "BF16" / "foo.gguf" + target.write_bytes(b"\0" * 10) + + out = _find_local_gguf_by_variant(str(tmp_path), "BF16") + assert out is not None + assert Path(out).name == "foo.gguf" + + +class TestListGgufVariantsPermanentErrors: + """Permanent HF errors must surface; cache fallback only on transient.""" + + def test_repository_not_found_re_raises(self, hf_cache, clean_offline_env): + from utils.models.model_config import list_gguf_variants + + _build_cache(hf_cache, "u/repo-gguf", {"foo-Q4_K_M.gguf": 1}) + + class _RepoNotFound(Exception): + pass + + _RepoNotFound.__name__ = "RepositoryNotFoundError" + + def boom(*a, **k): + raise _RepoNotFound("repo deleted") + + with patch("huggingface_hub.model_info", boom): + with pytest.raises(Exception) as exc_info: + list_gguf_variants("u/repo-gguf") + assert type(exc_info.value).__name__ == "RepositoryNotFoundError" + + def test_gated_repo_re_raises(self, hf_cache, clean_offline_env): + from utils.models.model_config import list_gguf_variants + + _build_cache(hf_cache, "u/gated-gguf", {"foo-Q4_K_M.gguf": 1}) + + class _GatedRepo(Exception): + pass + + _GatedRepo.__name__ = "GatedRepoError" + + def boom(*a, **k): + raise _GatedRepo("auth required") + + with patch("huggingface_hub.model_info", boom): + with pytest.raises(Exception) as exc_info: + list_gguf_variants("u/gated-gguf") + assert type(exc_info.value).__name__ == "GatedRepoError" + + def test_transient_error_still_falls_back_to_cache( + self, hf_cache, clean_offline_env + ): + from utils.models.model_config import list_gguf_variants + + _build_cache(hf_cache, "u/transient-gguf", {"foo-Q4_K_M.gguf": 1}) + + def boom(*a, **k): + raise OSError("network down") + + with patch("huggingface_hub.model_info", boom): + variants, _ = list_gguf_variants("u/transient-gguf") + assert any(v.quant == "Q4_K_M" for v in variants) + + +class TestDetectGgufFromCacheExcludesMmproj: + """A partial cache with 
only a vision projector must not route the + projector as the main model.""" + + def test_mmproj_only_returns_none(self, hf_cache): + from utils.models.model_config import _detect_gguf_from_hf_cache + + _build_cache( + hf_cache, + "u/vision-only-mmproj", + {"mmproj-vision-F16.gguf": 1}, + ) + assert _detect_gguf_from_hf_cache("u/vision-only-mmproj") is None + + def test_main_plus_mmproj_returns_main(self, hf_cache): + from utils.models.model_config import _detect_gguf_from_hf_cache + + _build_cache( + hf_cache, + "u/vision-full", + { + "model-Q4_K_M.gguf": 1, + "mmproj-vision-F16.gguf": 1, + }, + ) + out = _detect_gguf_from_hf_cache("u/vision-full") + assert out is not None + assert "mmproj" not in out.lower() + + +class TestProbeDnsDeadNoGlobalTimeoutMutation: + """``_probe_dns_dead`` must not change ``socket.setdefaulttimeout`` + process-wide -- concurrent sockets without explicit timeout would + inherit it for the probe window.""" + + def test_default_timeout_unchanged_when_dns_up(self, monkeypatch): + import socket as _socket + from core.inference.llama_cpp import _probe_dns_dead + + prev = _socket.getdefaulttimeout() + set_calls = [] + + original_set = _socket.setdefaulttimeout + + def tracking_set(value): + set_calls.append(value) + original_set(value) + + monkeypatch.setattr(_socket, "setdefaulttimeout", tracking_set) + monkeypatch.setattr(_socket, "gethostbyname", lambda h: "127.0.0.1") + + try: + _probe_dns_dead("example.invalid", timeout = 0.5) + finally: + # Restore exact state regardless of any test-side mutation. + original_set(prev) + + assert set_calls == [], ( + f"_probe_dns_dead mutated socket.setdefaulttimeout {set_calls}; " + "must isolate timeout to the probe thread" + ) + + def test_returns_dead_when_resolver_wedges(self, monkeypatch): + import socket as _socket + from core.inference.llama_cpp import _probe_dns_dead + + # Simulate a wedged resolver: thread blocks forever. 
+ def wedged(host): + import threading + + threading.Event().wait() + + monkeypatch.setattr(_socket, "gethostbyname", wedged) + assert _probe_dns_dead("example.invalid", timeout = 0.1) is True + + +class TestWaitForHealthRetriesOnReadError: + """A TCP RST mid-read while llama-server is still binding the port + (Windows: WinError 10054) must not abort the health-poll loop -- + that masks a legitimate 'still warming up' state as a fatal load.""" + + def test_read_error_then_success(self, monkeypatch): + import httpx + + from core.inference.llama_cpp import LlamaCppBackend + + backend = LlamaCppBackend() + backend._port = 65500 + + class _FakeProc: + returncode = None + + def poll(self): + return None + + def terminate(self): + pass + + def kill(self): + pass + + def wait(self, timeout = None): + return 0 + + backend._process = _FakeProc() + backend._stdout_thread = None + backend._stdout_lines = [] + + calls = {"n": 0} + + def fake_get(url, timeout = None): + calls["n"] += 1 + if calls["n"] == 1: + raise httpx.ReadError("WinError 10054") + if calls["n"] == 2: + raise httpx.RemoteProtocolError("short read") + if calls["n"] == 3: + raise httpx.WriteError("peer dropped") + + class _OK: + status_code = 200 + + return _OK() + + monkeypatch.setattr("core.inference.llama_cpp.httpx.get", fake_get) + assert backend._wait_for_health(timeout = 5.0, interval = 0.01) is True + assert calls["n"] == 4, ( + f"_wait_for_health should retry past ReadError/RemoteProtocol/Write; " + f"saw {calls['n']} attempts" + ) + + def test_real_process_exit_still_short_circuits(self, monkeypatch): + from core.inference.llama_cpp import LlamaCppBackend + + backend = LlamaCppBackend() + backend._port = 65501 + + class _DeadProc: + returncode = 137 + + def poll(self): + return 137 + + def terminate(self): + pass + + def kill(self): + pass + + def wait(self, timeout = None): + return 137 + + backend._process = _DeadProc() + backend._stdout_thread = None + backend._stdout_lines = ["fatal: out of 
memory"] + assert backend._wait_for_health(timeout = 5.0, interval = 0.01) is False diff --git a/studio/backend/utils/models/model_config.py b/studio/backend/utils/models/model_config.py index bf7f7a009b..2f3bd2431c 100644 --- a/studio/backend/utils/models/model_config.py +++ b/studio/backend/utils/models/model_config.py @@ -1259,12 +1259,10 @@ def _extract_quant_label(filename: str) -> str: """ import re - # Use only the basename (rfilename may include directory) basename = filename.rsplit("/", 1)[-1] # Strip .gguf and any shard suffix (-00001-of-00010) stem = re.sub(r"-\d{3,}-of-\d{3,}", "", basename.rsplit(".", 1)[0]) - # Match known quantization patterns - match = re.search( + quant_re = ( r"(UD-)?" # Optional UD- prefix (Ultra Discrete) r"(MXFP[0-9]+(?:_[A-Z0-9]+)*" # MXFP variants: MXFP4, MXFP4_MOE r"|IQ[0-9]+_[A-Z]+(?:_[A-Z0-9]+)?" # IQ variants: IQ4_XS, IQ4_NL, IQ1_S @@ -1272,10 +1270,19 @@ def _extract_quant_label(filename: str) -> str: r"|Q[0-9]+_K_[A-Z]+" # K-quant: Q4_K_M, Q3_K_S r"|Q[0-9]+_[0-9]+" # Standard: Q8_0, Q5_1 r"|Q[0-9]+_K" # Short K-quant: Q6_K - r"|BF16|F16|F32)", # Full precision - stem, - re.IGNORECASE, + r"|BF16|F16|F32)" # Full precision ) + match = re.search(quant_re, stem, re.IGNORECASE) + # Subdir layouts like ``BF16/foo.gguf`` keep the quant in the directory, + # not the basename. Look at the parent dirs too so the variant label + # matches the snapshot-relative path produced elsewhere. + if not match and "/" in filename: + parents = filename.rsplit("/", 1)[0] + for segment in reversed(parents.split("/")): + m = re.search(quant_re, segment, re.IGNORECASE) + if m: + match = m + break if match: prefix = match.group(1) or "" return f"{prefix}{match.group(2)}" @@ -1283,6 +1290,57 @@ def _extract_quant_label(filename: str) -> str: return stem.split("-")[-1] +def _iter_hf_cache_snapshots(repo_id: str): + """Yield HF cache snapshot dirs for *repo_id*, newest first. 
+ + Empty generator if HF_HUB_CACHE is missing, the repo isn't cached, + or has no snapshots. Repo name match is case-insensitive to handle + casing drift between download time and lookup. + """ + try: + from huggingface_hub import constants as hf_constants + except Exception: + return + + cache_dir = Path(hf_constants.HF_HUB_CACHE) + if not cache_dir.is_dir(): + return + + target = f"models--{repo_id.replace('/', '--')}".lower() + repo_dir: Optional[Path] = None + try: + for entry in cache_dir.iterdir(): + if entry.is_dir() and entry.name.lower() == target: + repo_dir = entry + break + except OSError: + return + if repo_dir is None: + return + + snapshots = repo_dir / "snapshots" + if not snapshots.is_dir(): + return + + try: + snap_dirs = [s for s in snapshots.iterdir() if s.is_dir()] + except OSError: + return + snap_dirs.sort(key = lambda s: s.stat().st_mtime, reverse = True) + yield from snap_dirs + + +def _list_gguf_variants_from_hf_cache( + repo_id: str, +) -> Optional[tuple[list[GgufVariantInfo], bool]]: + """Variants from the local HF cache snapshot, or None if not cached.""" + for snap in _iter_hf_cache_snapshots(repo_id): + variants, has_vision = list_local_gguf_variants(str(snap)) + if variants or has_vision: + return variants, has_vision + return None + + def list_gguf_variants( repo_id: str, hf_token: Optional[str] = None, @@ -1298,7 +1356,40 @@ def list_gguf_variants( """ from huggingface_hub import model_info as hf_model_info - info = hf_model_info(repo_id, token = hf_token, files_metadata = True) + # Offline: skip the API and serve from cache. 
+ offline = os.environ.get("HF_HUB_OFFLINE", "").lower() in ( + "1", + "true", + "yes", + ) or os.environ.get("TRANSFORMERS_OFFLINE", "").lower() in ("1", "true", "yes") + if offline: + cached = _list_gguf_variants_from_hf_cache(repo_id) + if cached is not None: + return cached + + try: + info = hf_model_info(repo_id, token = hf_token, files_metadata = True) + except Exception as e: + # Permanent errors (deleted/gated/bad revision) must surface to + # the caller; serving stale cache here would mask the real cause. + # Matches the early-return in ``detect_gguf_model_remote``. + if type(e).__name__ in ( + "RepositoryNotFoundError", + "GatedRepoError", + "RevisionNotFoundError", + "EntryNotFoundError", + ): + raise + # API failed transiently; fall back to local snapshot if fully downloaded. + cached = _list_gguf_variants_from_hf_cache(repo_id) + if cached is not None: + logger.warning( + "HF API unreachable for %s (%s); using local cache snapshot.", + repo_id, + e.__class__.__name__, + ) + return cached + raise variants: list[GgufVariantInfo] = [] has_vision = False @@ -1392,16 +1483,13 @@ def list_local_gguf_variants( size = f.stat().st_size except OSError: size = 0 - quant = _extract_quant_label(f.name) + # Pass the relative path so ``BF16/foo.gguf`` and ``Q4_K_M/foo.gguf`` + # produce distinct quant labels instead of collapsing on basename. + rel = f.relative_to(p).as_posix() + quant = _extract_quant_label(rel) quant_totals[quant] = quant_totals.get(quant, 0) + size - # Only compute the (potentially expensive) relative path when this - # is the first file we've seen for this quant -- after that we'd - # discard the result anyway. Use posix-style separators so the - # filename matches what ``list_gguf_variants`` (the remote HF - # API path) returns on every platform; otherwise Windows would - # emit ``BF16\foo.gguf`` here. 
if quant not in quant_first_file: - quant_first_file[quant] = f.relative_to(p).as_posix() + quant_first_file[quant] = rel variants = [ GgufVariantInfo( @@ -1429,16 +1517,36 @@ def _find_local_gguf_by_variant(directory: str, variant: str) -> Optional[str]: # Recurse into subdirectories so variants stored under a quant-named # subdir (e.g. ``BF16/foo-BF16-00001-of-00002.gguf``) are found. + # Match against the relative path so the quant label can come from + # the directory name when the basename omits it. matches = sorted( f for f in _iter_gguf_files(p, recursive = True) - if not _is_mmproj(f.name) and _extract_quant_label(f.name) == variant + if not _is_mmproj(f.name) + and _extract_quant_label(f.relative_to(p).as_posix()) == variant ) if matches: return str(matches[0].resolve()) return None +def _detect_gguf_from_hf_cache(repo_id: str) -> Optional[str]: + """Best GGUF filename for *repo_id* from the local HF cache, or None. + + Excludes mmproj (vision projector) files so a partial cache that + only has the projector cannot route the projector as the main model. + """ + for snap in _iter_hf_cache_snapshots(repo_id): + rel_files = [ + f.relative_to(snap).as_posix() + for f in _iter_gguf_files(snap, recursive = True) + if not _is_mmproj(f.name) + ] + if rel_files: + return _pick_best_gguf(rel_files) + return None + + def detect_gguf_model_remote( repo_id: str, hf_token: Optional[str] = None, @@ -1455,10 +1563,23 @@ def detect_gguf_model_remote( through to the MLX backend, which then fails opening a non-existent config.json on the GGUF-only repo. Three attempts with 1s/2s/4s backoff covers the typical free-runner HF Hub flakiness. + + When offline, falls back to the local HF cache so a downloaded + repo is still routed to llama-server (not MLX/Unsloth). 
""" import time from huggingface_hub import model_info as hf_model_info + offline = os.environ.get("HF_HUB_OFFLINE", "").lower() in ( + "1", + "true", + "yes", + ) or os.environ.get("TRANSFORMERS_OFFLINE", "").lower() in ("1", "true", "yes") + if offline: + cached = _detect_gguf_from_hf_cache(repo_id) + if cached is not None: + return cached + last_err: Optional[Exception] = None for attempt in range(3): try: @@ -1479,6 +1600,17 @@ def detect_gguf_model_remote( return None if attempt < 2: time.sleep(2**attempt) + + # All attempts failed; fall back to local cache for offline users. + cached = _detect_gguf_from_hf_cache(repo_id) + if cached is not None: + logger.warning( + "HF API unreachable for '%s' (%s); using local cache to detect GGUF.", + repo_id, + type(last_err).__name__ if last_err else "unknown", + ) + return cached + logger.warning( f"Could not check GGUF files for '{repo_id}' after 3 attempts: {last_err}" )