Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
159 changes: 138 additions & 21 deletions studio/backend/core/inference/llama_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,46 @@
_SWA_CACHE_LOCK = threading.Lock()


def _probe_dns_dead(host: str = "huggingface.co", timeout: float = 2.0) -> bool:
"""Quick blocking DNS check. Restores any prior default timeout."""
prev = socket.getdefaulttimeout()
socket.setdefaulttimeout(timeout)
try:

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Avoid mutating global socket timeout in DNS probe

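_probe_dns_dead sets socket.setdefaulttimeout(timeout) before calling gethostbyname, which changes the default timeout process-wide for the duration of the probe. During a load_model call this can affect unrelated concurrent requests in the same backend process: any path that opens sockets without an explicit timeout will inherit 2.0s and may fail spuriously. This creates cross-request, timing-dependent network failures that are hard to diagnose. The mutation also does not achieve its purpose: the default timeout applies only to newly created socket objects, whereas socket.gethostbyname is a blocking resolver call that creates none, so the lookup itself is not actually bounded by 2.0s.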

Useful? React with 👍 / 👎.

try:
socket.gethostbyname(host)
return False
except Exception:
return True
finally:
socket.setdefaulttimeout(prev)


@contextlib.contextmanager
def _hf_offline_if_dns_dead():
    """Run the wrapped block in Hugging Face offline mode when DNS is down.

    If ``HF_HUB_OFFLINE`` is already present in the environment, the
    existing setting wins: nothing is probed and nothing is changed.
    Otherwise ``_probe_dns_dead`` is consulted, and only when it reports a
    failure are ``HF_HUB_OFFLINE`` (and ``TRANSFORMERS_OFFLINE``, unless it
    was already set) exported as ``"1"`` for the duration of the block.
    Every variable exported here is removed again on exit, including when
    the block raises, so a single failed lookup does not leave the process
    in offline mode.

    Yields:
        bool: ``True`` if offline mode was switched on for this block,
        ``False`` if the environment was left untouched.
    """
    # The short-circuit keeps the probe from running at all when the
    # variable is already set.
    if "HF_HUB_OFFLINE" in os.environ or not _probe_dns_dead():
        yield False
        return

    # Track only the variables introduced here; a pre-existing
    # TRANSFORMERS_OFFLINE is neither overwritten nor removed.
    exported = ["HF_HUB_OFFLINE"]
    if "TRANSFORMERS_OFFLINE" not in os.environ:
        exported.append("TRANSFORMERS_OFFLINE")
    for name in exported:
        os.environ[name] = "1"
    logger.warning("huggingface.co unreachable; using local HF cache for this load.")
    try:
        yield True
    finally:
        for name in exported:
            os.environ.pop(name, None)


def _swa_cache_path() -> Path:
home = os.environ.get("UNSLOTH_STUDIO_HOME") or os.environ.get("STUDIO_HOME")
base = Path(home) if home else Path.home() / ".unsloth" / "studio"
Expand Down Expand Up @@ -1804,15 +1844,62 @@ def _download_gguf(
except Exception as e:
logger.warning(f"Could not list repo files: {e}")

# Offline: resolve variant -> filename from the local HF cache.
# The heuristic below assumes filenames echo the repo name,
# which breaks for e.g. Qwen3.6-27B-MTP-GGUF (no "MTP" in file).
# Match against the rel path (not just basename) so subdir
# layouts like ``BF16/foo.gguf`` are findable.
if not gguf_filename:
try:
from utils.models.model_config import _iter_hf_cache_snapshots

boundary = re.compile(
r"(?<![a-zA-Z0-9])"
+ re.escape(hf_variant.lower())
+ r"(?![a-zA-Z0-9])"
)
for snap in _iter_hf_cache_snapshots(hf_repo):
matches = sorted(
p.relative_to(snap).as_posix()
for p in snap.rglob("*.gguf")
if "mmproj" not in p.name.lower()
and boundary.search(p.relative_to(snap).as_posix().lower())
)
if not matches:
continue
gguf_filename = matches[0]
m = _SHARD_FULL_RE.match(Path(gguf_filename).name)
if m:
prefix = m.group(1)
total = m.group(3)
sibling_pat = re.compile(
r"^"
+ re.escape(prefix)
+ r"-\d{5}-of-"
+ re.escape(total)
+ r"\.gguf$"
)
gguf_extra_shards = [
f
for f in matches[1:]
if sibling_pat.match(Path(f).name)
]
logger.info(
"Resolved variant %s -> %s from local HF cache",
hf_variant,
gguf_filename,
)
break
except Exception as e:
logger.debug(f"Offline cache lookup for variant failed: {e}")

if not gguf_filename:
repo_name = hf_repo.split("/")[-1].replace("-GGUF", "")
gguf_filename = f"{repo_name}-{hf_variant}.gguf"

# Check disk space and fall back to a smaller variant if needed
all_gguf_files = [gguf_filename] + gguf_extra_shards
try:
import os

from huggingface_hub import get_paths_info, try_to_load_from_cache

path_infos = list(get_paths_info(hf_repo, all_gguf_files, token = hf_token))
Expand Down Expand Up @@ -1946,24 +2033,50 @@ def _download_mmproj(
Prefers mmproj-F16.gguf, falls back to any mmproj*.gguf file.
Returns the local path, or None if no mmproj file exists.
"""
try:
from huggingface_hub import hf_hub_download, list_repo_files

files = list_repo_files(hf_repo, token = hf_token)
def _pick_mmproj(candidates: list[str]) -> Optional[str]:
mmproj_files = sorted(
f for f in files if f.endswith(".gguf") and "mmproj" in f.lower()
f
for f in candidates
if f.lower().endswith(".gguf") and "mmproj" in Path(f).name.lower()
)
if not mmproj_files:
return None

# Prefer F16 variant
target = None
for f in mmproj_files:
if f.lower().endswith("-f16.gguf"):
target = f
break
if target is None:
target = mmproj_files[0]
return f
return mmproj_files[0]

target: Optional[str] = None
try:
from huggingface_hub import list_repo_files

target = _pick_mmproj(list_repo_files(hf_repo, token = hf_token))
except Exception as e:
logger.debug(f"Could not list repo files for mmproj: {e}")

# Offline: resolve mmproj from the local HF cache snapshot, same
# shape as _download_gguf's offline fallback above.
if target is None:
try:
from utils.models.model_config import _iter_hf_cache_snapshots

for snap in _iter_hf_cache_snapshots(hf_repo):
rel_files = [
p.relative_to(snap).as_posix() for p in snap.rglob("*.gguf")
]
target = _pick_mmproj(rel_files)
if target is not None:
logger.info("Resolved mmproj %s from local HF cache", target)
break
except Exception as e:
logger.debug(f"Offline cache lookup for mmproj failed: {e}")

if target is None:
return None

try:
from huggingface_hub import hf_hub_download

logger.info(f"Downloading mmproj: {hf_repo}/{target}")
local_path = hf_hub_download(
Expand Down Expand Up @@ -2052,18 +2165,22 @@ def load_model(
)

# ── Phase 2: download (NO lock held, so cancel can proceed) ──
# Scope HF_HUB_OFFLINE to the download block only when DNS is
# dead; cleanup runs even on exception so a transient hiccup
# at the start of one load cannot quarantine future loads.
if hf_repo:
model_path = self._download_gguf(
hf_repo = hf_repo,
hf_variant = hf_variant,
hf_token = hf_token,
)
# Auto-download mmproj for vision models
if is_vision and not mmproj_path:
mmproj_path = self._download_mmproj(
with _hf_offline_if_dns_dead():
model_path = self._download_gguf(
hf_repo = hf_repo,
hf_variant = hf_variant,
hf_token = hf_token,
)
# Auto-download mmproj for vision models
if is_vision and not mmproj_path:
mmproj_path = self._download_mmproj(
hf_repo = hf_repo,
hf_token = hf_token,
)
elif gguf_path:
if not Path(gguf_path).is_file():
raise FileNotFoundError(f"GGUF file not found: {gguf_path}")
Expand Down
21 changes: 21 additions & 0 deletions studio/backend/core/inference/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,27 @@ def run_inference_process(
os.environ["HF_HUB_DISABLE_XET"] = "1"
logger.info("Xet transport disabled (HF_HUB_DISABLE_XET=1)")

# Offline auto-detect: skip 25s of hf_hub_download retries per file
# if DNS is dead; cached files resolve instantly under HF_HUB_OFFLINE=1.
# Scope is this subprocess only -- orchestrator spawns a fresh worker
# per load (see core/inference/orchestrator.py), so the env cannot
# persist across loads.
if "HF_HUB_OFFLINE" not in os.environ:
import socket as _socket

prev_timeout = _socket.getdefaulttimeout()
_socket.setdefaulttimeout(2.0)
try:
_socket.gethostbyname("huggingface.co")
except Exception:
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ.setdefault("TRANSFORMERS_OFFLINE", "1")
Comment on lines +675 to +676

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Do not persist HF_HUB_OFFLINE on a single DNS miss

When startup DNS resolution fails once, this permanently sets HF_HUB_OFFLINE=1 for the entire worker lifetime. A transient resolver hiccup at process start therefore forces all subsequent model loads into offline mode (including when connectivity is later healthy), preventing normal Hub fetches until the worker is restarted.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

For consistency with the change in llama_cpp.py, it would be beneficial to log a warning when offline mode is enabled. This helps in debugging by making it clear why the application is operating in offline mode. In accordance with repository guidelines, ensure the warning message is dynamically generated to include the specific configuration values it refers to (e.g., the source or value of the offline setting) to ensure accuracy and avoid confusion.

References
  1. User-facing warning messages should be dynamically generated to include the specific configuration values they refer to, rather than using hardcoded examples, to ensure accuracy and avoid confusion.

Comment on lines +675 to +676

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Avoid forcing offline mode after one DNS lookup failure

A transient failure in _socket.gethostbyname("huggingface.co") now unconditionally sets HF_HUB_OFFLINE=1 for the entire worker process, so the same load attempt will refuse all Hub HTTP calls even if connectivity is actually available moments later. This regresses online reliability for partially cached models: instead of the usual retry-and-download behavior, the load fails immediately in offline-only mode until a new subprocess is spawned. Fresh evidence in this revision: run_inference_process sets the environment variable once at startup, while the spawned worker is kept alive through the "loaded" handshake in orchestrator.load_model, so every Hub call in that attempt inherits offline mode.

Useful? React with 👍 / 👎.

logger.warning(
"huggingface.co unreachable; HF_HUB_OFFLINE=1 set for this worker."
)
finally:
_socket.setdefaulttimeout(prev_timeout)

import warnings
from loggers.config import LogConfig

Expand Down
Loading
Loading