diff --git a/studio/backend/core/inference/_vulkan_probe.py b/studio/backend/core/inference/_vulkan_probe.py
new file mode 100644
index 0000000000..f2e225f4c4
--- /dev/null
+++ b/studio/backend/core/inference/_vulkan_probe.py
@@ -0,0 +1,104 @@
+"""Standalone free-VRAM probe for the bundled ggml Vulkan backend.
+
+Run in a short-lived subprocess (``python _vulkan_probe.py <bindir>``) so the
+Vulkan instance never lives in the long-running backend process. Loads the
+bundled ggml Vulkan backend from ``<bindir>`` and prints one
+``<idx>\\t<free_bytes>\\t<is_igpu>`` line per device to stdout. The indices
+are ggml's own Vulkan device ordinals (the space GGML_VK_VISIBLE_DEVICES
+expects), which need not match nvidia-smi order. ``is_igpu`` is ``1`` for an
+integrated GPU (shared system RAM) and ``0`` otherwise, taken from ggml's own
+device type so the reader needn't guess from VRAM-vs-RAM ratios.
+
+Uses only the standard library so it stays runnable as a bare script without
+importing the backend package.
+"""
+
+import ctypes
+import os
+import sys
+
+# ggml_backend_dev_type enum (ggml-backend.h): CPU=0, GPU=1, IGPU=2, ...
+_GGML_BACKEND_DEVICE_TYPE_IGPU = 2
+
+
+def _igpu_flags(base, lib, count: int) -> list[bool]:
+    """Return one integrated-GPU flag per Vulkan device ordinal.
+
+    ggml-vulkan creates its registry devices with ``ctx->device = i``, the
+    same ordering ``ggml_backend_vk_get_device_memory`` uses, so position
+    ``i`` in the Vulkan reg is device ordinal ``i``. A device whose type
+    cannot be read keeps ``False`` so a discrete card is never over-capped.
+    """
+    is_igpu = [False] * count
+    void_p, size_t = ctypes.c_void_p, ctypes.c_size_t
+    try:
+        # Declare the C prototypes up front: (function, restype, argtypes).
+        for fn, restype, argtypes in (
+            (lib.ggml_backend_vk_reg, void_p, []),
+            (base.ggml_backend_reg_dev_count, size_t, [void_p]),
+            (base.ggml_backend_reg_dev_get, void_p, [void_p, size_t]),
+            (base.ggml_backend_dev_type, ctypes.c_int, [void_p]),
+        ):
+            fn.restype = restype
+            fn.argtypes = argtypes
+
+        vk_reg = lib.ggml_backend_vk_reg()
+        if vk_reg:
+            visible = min(count, base.ggml_backend_reg_dev_count(vk_reg))
+            for ordinal in range(visible):
+                handle = base.ggml_backend_reg_dev_get(vk_reg, ordinal)
+                if handle:
+                    kind = base.ggml_backend_dev_type(handle)
+                    is_igpu[ordinal] = kind == _GGML_BACKEND_DEVICE_TYPE_IGPU
+    except Exception:
+        # Best-effort only: a missing symbol or failing registry call leaves
+        # the remaining devices marked discrete instead of aborting the probe.
+        pass
+    return is_igpu
+
+
+def main() -> int:
+    """Print one tab-separated ``idx, free_bytes, is_igpu`` row per device."""
+    if len(sys.argv) < 2:
+        # No bindir is a caller error; exit non-zero rather than "0 devices".
+        print("usage: _vulkan_probe.py <bindir>", file = sys.stderr)
+        return 2
+    bindir = sys.argv[1]
+
+    if sys.platform == "win32":
+        base_name, vk_name = "ggml-base.dll", "ggml-vulkan.dll"
+        try:
+            os.add_dll_directory(bindir)
+        except Exception:
+            pass  # best-effort: the CDLL load below reports a real failure
+    else:
+        base_name, vk_name = "libggml-base.so", "libggml-vulkan.so"
+
+    try:
+        base = ctypes.CDLL(os.path.join(bindir, base_name), mode = ctypes.RTLD_GLOBAL)
+        lib = ctypes.CDLL(os.path.join(bindir, vk_name), mode = ctypes.RTLD_GLOBAL)
+    except OSError as e:
+        print(f"ggml-vulkan load failed: {e}", file = sys.stderr)
+        return 1
+
+    size_ptr = ctypes.POINTER(ctypes.c_size_t)
+    lib.ggml_backend_vk_get_device_count.restype = ctypes.c_int
+    lib.ggml_backend_vk_get_device_count.argtypes = []
+    lib.ggml_backend_vk_get_device_memory.restype = None
+    lib.ggml_backend_vk_get_device_memory.argtypes = [ctypes.c_int, size_ptr, size_ptr]
+
+    count = lib.ggml_backend_vk_get_device_count()
+    igpu = _igpu_flags(base, lib, count)
+    rows = []
+    for i in range(count):
+        free, total = ctypes.c_size_t(0), ctypes.c_size_t(0)
+        # total is a required out-param of the C call but unused: the reader
+        # leaves a flat per-device margin, not a fraction of total.
+        lib.ggml_backend_vk_get_device_memory(i, ctypes.byref(free), ctypes.byref(total))
+        rows.append("%d\t%d\t%d" % (i, free.value, int(igpu[i])))
+    sys.stdout.write("\n".join(rows))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index b83e4e6961..ec9f18f9c9 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -658,6 +658,42 @@ def _backfill_usage_from_timings(usage, timings):
     return out
 
 
+def _vulkan_lib_filename() -> str:
+    return {"win32": "ggml-vulkan.dll"}.get(sys.platform, "libggml-vulkan.so")
+
+
+# Free system RAM to leave on an integrated GPU, mirroring llama.cpp's own
+# auto-fit margin (llama-server --fit-target, default 1024 MiB per device).
+# ggml reports an iGPU's "VRAM" as shared system RAM, so we hold back the same
+# per-device margin --fit would rather than inventing a larger reserve.
+_IGPU_HOST_RESERVE_MIB = 1024
+
+
+def _apply_igpu_host_reserve_mib(free_mib: int, is_igpu: bool) -> int:
+    """Return the usable MiB budget after holding back iGPU host headroom.
+
+    For an integrated GPU, ggml-vulkan's ggml_backend_vk_get_device_memory
+    sums every memory heap, so the reported free "VRAM" is really free
+    system RAM. Sizing context/offload against all of it would crowd the
+    host into swap or the OOM killer, so ``_IGPU_HOST_RESERVE_MIB`` (the
+    per-device margin llama.cpp's --fit uses) is subtracted, clamping at 0.
+    A discrete card (``is_igpu`` False, straight from ggml's device type)
+    gets ``free_mib`` back unchanged.
+    """
+    if is_igpu:
+        return max(free_mib - _IGPU_HOST_RESERVE_MIB, 0)
+    return free_mib
+
+
+def _llama_lib_dir(binary: str) -> Path:
+    """Return the real directory holding ``binary``, with symlinks resolved.
+
+    llama-server is installed as a symlink (~/.unsloth/llama.cpp/llama-server)
+    into build/bin/, where the sibling ggml backend libs actually live.
+    """
+    return Path(binary).resolve().parent
+
+
 class LlamaCppBackend:
     """Manages a llama-server subprocess for GGUF model inference.
 
@@ -1297,6 +1333,20 @@ def _get_gguf_size_bytes(model_path: str) -> int:
 
         return total
 
+    @staticmethod
+    def _is_vulkan_backend(binary: Optional[str] = None) -> bool:
+        """Report whether the installed llama.cpp is a Vulkan build.
+
+        llama.cpp builds carry a single backend, so finding the Vulkan ggml
+        library beside llama-server settles it. Callers rely on this to
+        keep the free-memory probe and the GPU pin in one device-index
+        space: ggml's Vulkan ordinals rather than nvidia-smi order.
+        """
+        server = binary if binary else LlamaCppBackend._find_llama_server_binary()
+        if server:
+            return _llama_lib_dir(server).joinpath(_vulkan_lib_filename()).is_file()
+        return False
+
     @staticmethod
     def _amd_apu_wants_unified_memory() -> bool:
         """True only for AMD unified-memory APUs (gfx1150/gfx1151), where
@@ -1322,7 +1372,24 @@ def _amd_apu_wants_unified_memory() -> bool:
         return False
 
     @staticmethod
-    def _get_gpu_free_memory() -> list[tuple[int, int]]:
+    def _get_gpu_free_memory(binary: Optional[str] = None) -> list[tuple[int, int]]:
+        """Return free memory per GPU for whichever backend is installed.
+
+        A Vulkan build is probed through ggml's own Vulkan backend, so the
+        indices are Vulkan ordinals -- the same space the GPU pin writes to
+        ``GGML_VK_VISIBLE_DEVICES``. Every other build goes through the
+        ``nvidia-smi`` / torch path covering NVIDIA and AMD ROCm.
+
+        The result is a list of (gpu_index, free_mib) sorted by index; it is
+        empty when no supported GPU is reachable.
+        """
+        server = binary or LlamaCppBackend._find_llama_server_binary()
+        if not LlamaCppBackend._is_vulkan_backend(server):
+            return LlamaCppBackend._get_gpu_free_memory_nvidia_torch()
+        return LlamaCppBackend._get_gpu_free_memory_vulkan(server)
+
+    @staticmethod
+    def _get_gpu_free_memory_nvidia_torch() -> list[tuple[int, int]]:
         """Query free memory per GPU.
 
         Order:
@@ -1428,6 +1495,85 @@ def _get_gpu_free_memory() -> list[tuple[int, int]]:
             logger.debug(f"torch GPU probe failed: {e}")
             return []
 
+    @staticmethod
+    def _get_gpu_free_memory_vulkan(binary: Optional[str] = None) -> list[tuple[int, int]]:
+        """Read free memory per Vulkan device from the bundled ggml backend.
+
+        ``_vulkan_probe.py`` runs in a short-lived subprocess that loads
+        ``libggml-vulkan`` and calls ``ggml_backend_vk_get_device_memory``
+        per device, so this process never creates a Vulkan instance.
+
+        Returns (device_index, free_mib) pairs sorted by index; the index is
+        ggml's own Vulkan ordinal, i.e. the space ``GGML_VK_VISIBLE_DEVICES``
+        expects. Integrated GPUs have a host-RAM margin held back (see
+        ``_apply_igpu_host_reserve_mib``). The list is empty when there is no
+        Vulkan build, the probe fails, or no device is reachable.
+        """
+        binary = binary or LlamaCppBackend._find_llama_server_binary()
+        lib_dir = _llama_lib_dir(binary) if binary else None
+        if lib_dir is None or not (lib_dir / _vulkan_lib_filename()).is_file():
+            return []
+
+        # The probe must enumerate ggml's canonical, full device list. An
+        # inherited GGML_VK_VISIBLE_DEVICES would renumber/restrict the
+        # ordinals, while load_model writes its own pin in the full space,
+        # so a pre-existing mask would make the pin double-apply and land
+        # on the wrong device.
+        probe_env = child_env_without_native_path_secret()
+        probe_env.pop("GGML_VK_VISIBLE_DEVICES", None)
+        if sys.platform != "win32":
+            # Put the lib dir first so the loader finds sibling ggml libs.
+            search_path = [str(lib_dir), probe_env.get("LD_LIBRARY_PATH", "")]
+            probe_env["LD_LIBRARY_PATH"] = ":".join(p for p in search_path if p)
+
+        # Any probe failure (exception or non-zero exit) is treated as no GPUs.
+        script = Path(__file__).with_name("_vulkan_probe.py")
+        try:
+            result = subprocess.run(
+                [sys.executable, str(script), str(lib_dir)],
+                capture_output = True,
+                text = True,
+                timeout = 15,
+                env = probe_env,
+                **_windows_hidden_subprocess_kwargs(),
+            )
+            if result.returncode != 0:
+                logger.debug(
+                    f"vulkan GPU probe exited {result.returncode}: {result.stderr.strip()}"
+                )
+                return []
+        except Exception as e:
+            logger.debug(f"vulkan GPU probe failed: {e}")
+            return []
+
+        readings: list[tuple[int, int]] = []
+        for row in result.stdout.strip().splitlines():
+            try:
+                # Anything but exactly three tab-separated fields, or a
+                # non-integer index/byte count, raises ValueError.
+                raw_idx, raw_free, raw_kind = row.split("\t")
+                idx = int(raw_idx)
+                free_mib = int(raw_free) // (1024 * 1024)
+            except ValueError:
+                continue
+            capped = _apply_igpu_host_reserve_mib(free_mib, raw_kind == "1")
+            if free_mib > capped:
+                logger.info(
+                    f"Vulkan device VK{idx} is an integrated GPU sharing system "
+                    f"RAM; reserving {free_mib - capped}MiB host headroom "
+                    f"({free_mib}->{capped}MiB usable)"
+                )
+            readings.append((idx, capped))
+        # Order by device ordinal only; the sort is stable for equal indices.
+        gpus = sorted(readings, key = lambda pair: pair[0])
+        if not gpus:
+            return gpus
+        logger.info(
+            "Vulkan GPU memory detected: "
+            + ", ".join(f"VK{idx}={free}MiB" for idx, free in gpus)
+        )
+        return gpus
+
     # Skip the wait when the last kill is older than this; the driver has
     # already reclaimed the prior process's allocations.
     _VRAM_SETTLE_WINDOW_S: float = 15.0
@@ -2915,6 +3061,7 @@ def load_model(
                     "Run setup.sh to build it, install llama.cpp, "
                     "or set LLAMA_SERVER_PATH environment variable."
                 )
+            is_vulkan_backend = self._is_vulkan_backend(binary)
 
             # ── Phase 2: download (NO lock held, so cancel can proceed) ──
             # mtp_draft_path arrives set for local Gemma loads (detected
@@ -3000,7 +3147,7 @@ def load_model(
                 gpus: list[tuple[int, int]] = []
                 try:
                     model_size = self._get_gguf_size_bytes(model_path)
-                    gpus = self._get_gpu_free_memory()
+                    gpus = self._get_gpu_free_memory(binary)
 
                     # Resolve effective context: 0 means let llama-server use
                     # the model's native length. Only expand to a known native
@@ -3387,7 +3534,7 @@ def load_model(
                 import sys
 
                 env = child_env_without_native_path_secret()
-                binary_dir = str(Path(binary).parent)
+                binary_dir = str(_llama_lib_dir(binary))
 
                 # AMD unified-memory APUs (gfx1150/gfx1151): let llama.cpp use
                 # shared system RAM. setdefault so a user value wins.
@@ -3482,26 +3629,32 @@ def load_model(
                 # set, so set HIP_VISIBLE_DEVICES too.
                 if gpu_indices is not None:
                     pinned = ",".join(str(i) for i in gpu_indices)
-                    env["CUDA_VISIBLE_DEVICES"] = pinned
-                    try:
-                        import torch as _torch
-                        if getattr(_torch.version, "hip", None) is not None:
-                            env["HIP_VISIBLE_DEVICES"] = pinned
-                            # Do NOT also set ROCR_VISIBLE_DEVICES to the same
-                            # value. ROCR_VISIBLE_DEVICES filters at the HSA/ROCr
-                            # layer and HIP_VISIBLE_DEVICES at the HIP layer, so
-                            # setting both with the same physical indices applies
-                            # the mask twice: ROCR reduces the visible set and
-                            # re-indexes it from 0, then HIP indexes into the
-                            # already-reduced set. A single non-zero pin (e.g.
-                            # "1") then points out of range at the HIP layer, HIP
-                            # enumerates 0 devices, and llama.cpp falls back to
-                            # CPU ("ggml_cuda_init: no ROCm-capable device is
-                            # detected"). The HIP mask alone narrows correctly;
-                            # clear any inherited ROCR mask so it can't double up.
-                            env.pop("ROCR_VISIBLE_DEVICES", None)
-                    except Exception as e:
-                        logger.debug("Failed to set ROCm visibility env vars for child: %s", e)
+                    if is_vulkan_backend:
+                        # gpu_indices are ggml Vulkan ordinals (see
+                        # _get_gpu_free_memory); the Vulkan backend ignores
+                        # CUDA_VISIBLE_DEVICES, so pin via its own mask.
+                        env["GGML_VK_VISIBLE_DEVICES"] = pinned
+                    else:
+                        env["CUDA_VISIBLE_DEVICES"] = pinned
+                        try:
+                            import torch as _torch
+                            if getattr(_torch.version, "hip", None) is not None:
+                                env["HIP_VISIBLE_DEVICES"] = pinned
+                                # Do NOT also set ROCR_VISIBLE_DEVICES to the same
+                                # value. ROCR_VISIBLE_DEVICES filters at the HSA/ROCr
+                                # layer and HIP_VISIBLE_DEVICES at the HIP layer, so
+                                # setting both with the same physical indices applies
+                                # the mask twice: ROCR reduces the visible set and
+                                # re-indexes it from 0, then HIP indexes into the
+                                # already-reduced set. A single non-zero pin (e.g.
+                                # "1") then points out of range at the HIP layer, HIP
+                                # enumerates 0 devices, and llama.cpp falls back to
+                                # CPU ("ggml_cuda_init: no ROCm-capable device is
+                                # detected"). The HIP mask alone narrows correctly;
+                                # clear any inherited ROCR mask so it can't double up.
+                                env.pop("ROCR_VISIBLE_DEVICES", None)
+                        except Exception as e:
+                            logger.debug("Failed to set ROCm visibility env vars for child: %s", e)
 
                 # Captured before any text-only fallback strips it from cmd.
                 launched_with_mmproj = "--mmproj" in cmd
diff --git a/studio/backend/tests/test_llama_cpp_vulkan_probe.py b/studio/backend/tests/test_llama_cpp_vulkan_probe.py
new file mode 100644
index 0000000000..c641ad4bd0
--- /dev/null
+++ b/studio/backend/tests/test_llama_cpp_vulkan_probe.py
@@ -0,0 +1,131 @@
+# SPDX-License-Identifier: AGPL-3.0-only
+# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
+
+"""Vulkan free-VRAM reader regression tests on a synthetic probe output.
+
+Covers the post-probe handling in
+``LlamaCppBackend._get_gpu_free_memory_vulkan``:
+
+  * integrated GPUs (probe reports is_igpu=1) leave a flat per-device host
+    margin matching llama.cpp's --fit-target, so context auto-sizing can't
+    over-commit shared RAM,
+  * discrete GPUs (is_igpu=0) are left untouched,
+  * an inherited ``GGML_VK_VISIBLE_DEVICES`` is stripped before probing so
+    enumeration stays in ggml's canonical full-device space.
+
+The ggml Vulkan library is never loaded: subprocess.run is mocked to emit
+the tab-separated lines the real ``_vulkan_probe.py`` would print.
+"""
+
+from __future__ import annotations
+
+import subprocess
+import sys
+import types as _types
+from pathlib import Path
+from unittest import mock
+
+import pytest
+
+_BACKEND_DIR = str(Path(__file__).resolve().parent.parent)
+if _BACKEND_DIR not in sys.path:
+    sys.path.insert(0, _BACKEND_DIR)
+
+import importlib as _importlib  # noqa: E402
+
+
+def _maybe_stub(name: str, builder):  # builder: zero-arg callable; its result is registered
+    try:
+        _importlib.import_module(name)  # the real module wins when it is importable
+    except ImportError:
+        sys.modules[name] = builder()  # otherwise install the stand-in under ``name``
+
+
+def _build_loggers_stub():
+    stub = _types.ModuleType("loggers")
+    setattr(stub, "get_logger", lambda name: __import__("logging").getLogger(name))
+    return stub
+
+
+_maybe_stub("loggers", _build_loggers_stub)
+_maybe_stub("structlog", lambda: _types.ModuleType("structlog"))
+
+from core.inference import llama_cpp as _llama_mod  # noqa: E402
+from core.inference.llama_cpp import LlamaCppBackend, _vulkan_lib_filename  # noqa: E402
+
+MIB = 1024 * 1024
+GIB = 1024 * MIB
+
+
+def _make_vulkan_install(tmp_path: Path) -> str:
+    """Create a fake ``build/bin`` install and return the llama-server path.
+    The Vulkan ggml lib is written beside it so the sibling-file check passes."""
+    exe_suffix = ".exe" if sys.platform == "win32" else ""
+    install_dir = tmp_path.joinpath("build", "bin")
+    install_dir.mkdir(parents = True)
+    for stub_name in (f"llama-server{exe_suffix}", _vulkan_lib_filename()):
+        install_dir.joinpath(stub_name).write_bytes(b"stub")
+    return str(install_dir / f"llama-server{exe_suffix}")
+
+
+def _mock_probe(rows: list[str], captured_env: dict | None = None):
+    """Return a ``subprocess.run`` patch that answers the _vulkan_probe.py
+    call with ``rows`` (tab-formatted) and records its env, if requested."""
+    passthrough = subprocess.run
+
+    def fake_run(cmd, *args, **kwargs):
+        is_probe = isinstance(cmd, list) and any("_vulkan_probe" in str(c) for c in cmd)
+        if not is_probe:
+            # Everything except the probe invocation still runs for real.
+            return passthrough(cmd, *args, **kwargs)
+        if captured_env is not None:
+            captured_env.clear()
+            captured_env.update(kwargs.get("env") or {})
+        return subprocess.CompletedProcess(cmd, 0, stdout = "\n".join(rows), stderr = "")
+
+    return mock.patch("subprocess.run", side_effect = fake_run)
+
+
+def _row(idx: int, free_bytes: int, is_igpu: int) -> str:
+    return "\t".join(map(str, (idx, free_bytes, is_igpu)))
+
+
+def test_integrated_gpu_leaves_host_margin(tmp_path):
+    # 30 GiB free on an iGPU; a flat 1024 MiB (llama.cpp --fit-target) is held back.
+    free_gib, reserve_mib = 30, 1024
+    server = _make_vulkan_install(tmp_path)
+    with _mock_probe([_row(0, free_gib * GIB, is_igpu = 1)]):
+        reported = LlamaCppBackend._get_gpu_free_memory_vulkan(server)
+    assert reported == [(0, free_gib * 1024 - reserve_mib)], reported
+
+
+def test_discrete_gpu_free_is_untouched(tmp_path):
+    """A discrete device (is_igpu=0) reports its full free memory."""
+    server = _make_vulkan_install(tmp_path)
+    with _mock_probe([_row(0, 23 * GIB, is_igpu = 0)]):
+        reported = LlamaCppBackend._get_gpu_free_memory_vulkan(server)
+    assert reported == [(0, 23 * 1024)], reported
+
+
+def test_large_discrete_gpu_is_untouched(tmp_path):
+    """A 48 GiB discrete card keeps its full reading: only the iGPU flag
+    triggers the host margin, never a VRAM/RAM ratio."""
+    server = _make_vulkan_install(tmp_path)
+    probe_rows = [_row(0, 47 * GIB, is_igpu = 0)]
+    with _mock_probe(probe_rows):
+        reported = LlamaCppBackend._get_gpu_free_memory_vulkan(server)
+    assert reported == [(0, 47 * 1024)], reported
+
+
+def test_inherited_visible_devices_mask_is_stripped(tmp_path, monkeypatch):
+    """An inherited GGML_VK_VISIBLE_DEVICES must not reach the probe's env."""
+    monkeypatch.setenv("GGML_VK_VISIBLE_DEVICES", "1")
+    captured: dict = {}
+    with _mock_probe([_row(0, 23 * GIB, is_igpu = 0)], captured_env = captured):
+        gpus = LlamaCppBackend._get_gpu_free_memory_vulkan(_make_vulkan_install(tmp_path))
+    assert gpus == [(0, 23 * 1024)], gpus  # probe really ran, so ``captured`` is meaningful
+    assert "GGML_VK_VISIBLE_DEVICES" not in captured, captured
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main([__file__, "-v"]))
diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py
index bd8c90dc9c..70ff052c09 100644
--- a/studio/install_llama_prebuilt.py
+++ b/studio/install_llama_prebuilt.py
@@ -10,6 +10,7 @@
 import errno
 import fnmatch
 import functools
+import glob
 import hashlib
 import json
 import os
@@ -262,6 +263,7 @@ class HostInfo:
     has_physical_nvidia: bool
     has_usable_nvidia: bool
     has_rocm: bool = False
+    has_intel_gpu: bool = False
     rocm_gfx_target: str | None = None
     # (major, minor) from platform.mac_ver(); None off macOS or if unparseable.
     # Skips a macos prebuilt whose minimum-OS exceeds this host.
@@ -1495,6 +1497,21 @@ def direct_upstream_release_plan(
                         install_kind = "windows-hip",
                     )
                 )
+        # Intel (or other non-NVIDIA/non-AMD) GPU: use the Vulkan prebuilt.
+        elif host.has_intel_gpu:
+            vulkan_asset = f"llama-{release_tag}-bin-win-vulkan-x64.zip"
+            vulkan_url = assets.get(vulkan_asset)
+            if vulkan_url:
+                attempts.append(
+                    AssetChoice(
+                        repo = repo,
+                        tag = release_tag,
+                        name = vulkan_asset,
+                        url = vulkan_url,
+                        source_label = "upstream",
+                        install_kind = "windows-vulkan",
+                    )
+                )
         cpu_asset = f"llama-{release_tag}-bin-win-cpu-x64.zip"
         cpu_url = assets.get(cpu_asset)
         if cpu_url:
@@ -1555,6 +1572,21 @@ def direct_upstream_release_plan(
                 )
             )
     elif host.is_linux and host.is_x86_64 and not host.has_usable_nvidia:
+        # Intel (or other non-NVIDIA/non-AMD) GPU: use the Vulkan prebuilt.
+        if host.has_intel_gpu and not host.has_usable_nvidia and not host.has_rocm:
+            vulkan_asset = f"llama-{release_tag}-bin-ubuntu-vulkan-x64.tar.gz"
+            vulkan_url = assets.get(vulkan_asset)
+            if vulkan_url:
+                attempts.append(
+                    AssetChoice(
+                        repo = repo,
+                        tag = release_tag,
+                        name = vulkan_asset,
+                        url = vulkan_url,
+                        source_label = "upstream",
+                        install_kind = "linux-vulkan",
+                    )
+                )
         if host.has_rocm:
             # Lemonade first, mirroring the Windows ROCm branch above, so a
             # ROCm host routed to ggml-org does not silently get the CPU build.
@@ -3062,6 +3094,41 @@ def _resolve_exe(name: str) -> str | None:
         # Note: amdhip64.dll presence alone is NOT treated as GPU evidence
         # since the HIP SDK can be installed without an AMD GPU.
 
+    # Detect an Intel GPU; gates the Vulkan prebuilt. Linux reads the DRM
+    # sysfs vendor id (0x8086); Windows queries the WMI video controller list.
+    # Only probed when there is no usable NVIDIA and no ROCm GPU, since the
+    # Vulkan selection branches are gated the same way -- this keeps the probe
+    # (notably the Windows powershell call) off the NVIDIA/AMD path.
+    has_intel_gpu = False
+    if not has_usable_nvidia and not has_rocm:
+        if is_linux:
+            for _vendor_file in glob.glob("/sys/class/drm/card*/device/vendor"):
+                try:
+                    with open(_vendor_file) as _vf:
+                        if _vf.read().strip().lower() == "0x8086":
+                            has_intel_gpu = True
+                            break
+                except OSError:
+                    continue
+        elif is_windows:
+            _ps = shutil.which("powershell") or shutil.which("pwsh")
+            if _ps:
+                try:
+                    _result = run_capture(
+                        [
+                            _ps,
+                            "-NoProfile",
+                            "-Command",
+                            "Get-CimInstance Win32_VideoController | "
+                            "Select-Object -ExpandProperty Name",
+                        ],
+                        timeout = 15,
+                    )
+                    if _result.returncode == 0 and "intel" in _result.stdout.lower():
+                        has_intel_gpu = True
+                except Exception:
+                    pass
+
     return HostInfo(
         system = system,
         machine = machine,
@@ -3077,6 +3144,7 @@ def _resolve_exe(name: str) -> str | None:
         has_physical_nvidia = has_physical_nvidia,
         has_usable_nvidia = has_usable_nvidia,
         has_rocm = has_rocm,
+        has_intel_gpu = has_intel_gpu,
         rocm_gfx_target = rocm_gfx_target,
         macos_version = macos_version,
     )
@@ -4140,6 +4208,21 @@ def resolve_upstream_asset_choice(
                 "falling back to source build with HIP support"
             )
 
+        # Intel (or other non-NVIDIA/non-AMD) GPU: use the Vulkan prebuilt.
+        if host.has_intel_gpu and not host.has_usable_nvidia and not host.has_rocm:
+            vulkan_name = f"llama-{llama_tag}-bin-ubuntu-vulkan-x64.tar.gz"
+            if vulkan_name in upstream_assets:
+                log(f"Intel GPU detected -- using upstream Vulkan prebuilt {vulkan_name}")
+                return AssetChoice(
+                    repo = UPSTREAM_REPO,
+                    tag = llama_tag,
+                    name = vulkan_name,
+                    url = upstream_assets[vulkan_name],
+                    source_label = "upstream",
+                    install_kind = "linux-vulkan",
+                )
+            log("Intel GPU detected but no Vulkan prebuilt found -- falling back to CPU")
+
         upstream_name = f"llama-{llama_tag}-bin-ubuntu-x64.tar.gz"
         if upstream_name not in upstream_assets:
             raise PrebuiltFallback("upstream Linux CPU asset was not found")
@@ -4180,6 +4263,23 @@ def resolve_upstream_asset_choice(
                 )
             log("AMD ROCm detected on Windows but no HIP prebuilt found -- falling back to CPU")
 
+        # Intel (or other non-NVIDIA/non-AMD) GPU on Windows: use Vulkan.
+        if host.has_intel_gpu and not host.has_usable_nvidia and not host.has_rocm:
+            vulkan_name = f"llama-{llama_tag}-bin-win-vulkan-x64.zip"
+            if vulkan_name in upstream_assets:
+                log(
+                    f"Intel GPU detected on Windows -- using upstream Vulkan prebuilt {vulkan_name}"
+                )
+                return AssetChoice(
+                    repo = UPSTREAM_REPO,
+                    tag = llama_tag,
+                    name = vulkan_name,
+                    url = upstream_assets[vulkan_name],
+                    source_label = "upstream",
+                    install_kind = "windows-vulkan",
+                )
+            log("Intel GPU detected on Windows but no Vulkan prebuilt found -- falling back to CPU")
+
         upstream_name = f"llama-{llama_tag}-bin-win-cpu-x64.zip"
         if upstream_name not in upstream_assets:
             raise PrebuiltFallback("upstream Windows CPU asset was not found")
@@ -4692,6 +4792,7 @@ def runtime_patterns_for_choice(choice: AssetChoice) -> list[str]:
         "linux-arm64-cuda",
         "linux-rocm",
         "linux-arm64",
+        "linux-vulkan",
     }:
         return ["llama-server", "llama-quantize", "lib*.so*"]
     if choice.install_kind in {"macos-arm64", "macos-x64"}:
@@ -4700,6 +4801,7 @@ def runtime_patterns_for_choice(choice: AssetChoice) -> list[str]:
         "windows-cpu",
         "windows-cuda",
         "windows-hip",
+        "windows-vulkan",
         "windows-rocm",
         "windows-arm64",
     }:
@@ -5743,8 +5845,10 @@ def validate_server(
             "linux-cuda",
             "linux-arm64-cuda",
             "linux-rocm",
+            "linux-vulkan",
             "windows-cuda",
             "windows-hip",
+            "windows-vulkan",
             "windows-rocm",
             "macos-arm64",
         }
@@ -6347,6 +6451,16 @@ def runtime_payload_health_groups(choice: AssetChoice) -> list[list[str]]:
             ["libmtmd.so*"],
             ["libggml-hip.so*"],
         ]
+    if choice.install_kind == "linux-vulkan":
+        return [
+            ["libllama-common.so*"],
+            ["libllama.so*"],
+            ["libggml.so*"],
+            ["libggml-base.so*"],
+            ["libggml-cpu-*.so*"],
+            ["libmtmd.so*"],
+            ["libggml-vulkan.so*"],
+        ]
     if choice.install_kind in {"windows-cpu", "windows-arm64"}:
         return [["llama.dll"]]
     if choice.install_kind == "windows-cuda":
@@ -6366,6 +6480,8 @@ def runtime_payload_health_groups(choice: AssetChoice) -> list[list[str]]:
         return groups
     if choice.install_kind in {"windows-hip", "windows-rocm"}:
         return [["llama.dll"], ["*hip*.dll"]]
+    if choice.install_kind == "windows-vulkan":
+        return [["llama.dll"], ["ggml-vulkan.dll"]]
     return []
 
 
@@ -6637,6 +6753,37 @@ def validate_prebuilt_attempts(
     raise PrebuiltFallback("no prebuilt bundle passed validation")
 
 
+def force_vulkan_requested() -> bool:
+    """Return True when UNSLOTH_FORCE_VULKAN is set to 1, true or yes.
+
+    The flag opts the host into the Vulkan llama.cpp prebuilt in place of
+    its detected CUDA/ROCm backend (e.g. an AMD user running Vulkan for
+    inference). Surrounding whitespace and letter case are ignored. Only the
+    llama.cpp backend is affected: the torch/training stack is installed
+    separately and still sees the real GPU.
+    """
+    raw_flag = os.environ.get("UNSLOTH_FORCE_VULKAN", "")
+    normalized = raw_flag.strip().lower()
+    return normalized in {"1", "true", "yes"}
+
+
+def _vulkan_only_host(host: HostInfo) -> HostInfo:
+    """Return a copy of ``host`` that the asset selectors treat as Vulkan-only.
+
+    The selectors' Vulkan branch requires ``has_intel_gpu`` with no usable
+    NVIDIA and no ROCm, so those flags are cleared and the integrated-GPU
+    flag is set. The synthetic flag stays inside install planning: it steers
+    the llama.cpp prebuilt choice only, never the torch/training stack.
+    """
+    vulkan_overrides = {
+        "has_usable_nvidia": False,
+        "has_physical_nvidia": False,
+        "has_rocm": False,
+        "has_intel_gpu": True,
+    }
+    return dataclasses_replace(host, **vulkan_overrides)
+
+
 def install_prebuilt(
     install_dir: Path,
     llama_tag: str,
@@ -6654,6 +6801,22 @@ def install_prebuilt(
         override_rocm_gfx = override_rocm_gfx,
         force_cpu = force_cpu,
     )
+    # UNSLOTH_FORCE_VULKAN installs the upstream ggml-org Vulkan prebuilt
+    # instead of the detected CUDA/ROCm backend. The unsloth published repo
+    # ships only CUDA/ROCm assets, hence UPSTREAM_REPO.
+    if force_vulkan_requested():
+        if host.is_macos:
+            log(
+                "UNSLOTH_FORCE_VULKAN is set but ignored on macOS "
+                "(Metal is used; there is no Vulkan prebuilt)"
+            )
+        else:
+            log(
+                "UNSLOTH_FORCE_VULKAN is set; installing the upstream Vulkan "
+                "llama.cpp prebuilt instead of the detected GPU backend"
+            )
+            host = _vulkan_only_host(host)
+            published_repo = UPSTREAM_REPO
     choice: AssetChoice | None = None
     try:
         with install_lock(install_lock_path(install_dir)):
@@ -6666,7 +6829,10 @@ def install_prebuilt(
                     f"no existing llama.cpp install detected at {install_dir}; performing fresh prebuilt install"
                 )
             # Single resolver: linux-x64 takes the fast filename path internally,
-            # every other fork host reads the manifest.
+            # every other fork host reads the manifest. A forced-Vulkan host has
+            # already had published_repo pointed at UPSTREAM_REPO above, so the
+            # simple planner routes through direct_upstream_release_plan and
+            # carries the Vulkan asset branch.
             requested_tag, release_plans = resolve_simple_install_release_plans(
                 llama_tag,
                 host,