diff --git a/CMakeLists.txt b/CMakeLists.txt
index f3bbfb58c2..557400862b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -178,6 +178,7 @@ option(GGML_XTHEADVECTOR     "ggml: enable xtheadvector"      OFF)
 option(GGML_VXE              "ggml: enable vxe"               ${GGML_NATIVE})
 
 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
+option(GGML_CPU_STATIC        "ggml: build CPU backend as static library even with GGML_BACKEND_DL"   OFF)
 set(GGML_CPU_ARM_ARCH        "" CACHE STRING "ggml: CPU architecture for ARM")
 set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
 
@@ -186,6 +187,14 @@ set(GGML_SCHED_MAX_COPIES  "4" CACHE STRING "ggml: max input copies for pipeline
 option(GGML_CPU                             "ggml: enable CPU backend"                        ON)
 option(GGML_SCHED_NO_REALLOC                "ggml: disallow reallocations in ggml-alloc (for debugging)" OFF)
 
+# When non-empty, ggml libs land on disk as `lib<prefix>ggml-*.{a,so,dll}` and the
+# runtime backend loader looks for the same prefix. CMake target names and the
+# `find_package(ggml CONFIG)` package name are unchanged. Defaults to
+# `qvac-speech-` on the speech branch to match the loader's compiled-in default
+# in `ggml-backend-reg.cpp`.
+set(GGML_LIB_OUTPUT_PREFIX "qvac-speech-" CACHE STRING
+    "ggml: prefix for built ggml library filenames")
+
 # 3rd party libs / backends
 option(GGML_ACCELERATE                      "ggml: enable Accelerate framework"               ON)
 option(GGML_BLAS                            "ggml: use BLAS"                                  ${GGML_BLAS_DEFAULT})
@@ -294,6 +303,41 @@ include(GNUInstallDirs)
 
 add_subdirectory(src)
 
+if (NOT GGML_LIB_OUTPUT_PREFIX STREQUAL "")
+    function(_ggml_apply_lib_output_prefix target)
+        if (NOT TARGET ${target})
+            return()
+        endif()
+        get_target_property(_lop_type ${target} TYPE)
+        if (_lop_type STREQUAL "INTERFACE_LIBRARY" OR
+            _lop_type STREQUAL "OBJECT_LIBRARY")
+            return()
+        endif()
+        get_target_property(_lop_old ${target} OUTPUT_NAME)
+        if (NOT _lop_old OR _lop_old STREQUAL "_lop_old-NOTFOUND")
+            set(_lop_old ${target})
+        endif()
+        set_target_properties(${target} PROPERTIES
+            OUTPUT_NAME "${GGML_LIB_OUTPUT_PREFIX}${_lop_old}")
+    endfunction()
+
+    foreach (_lop_target ggml ggml-base)
+        _ggml_apply_lib_output_prefix(${_lop_target})
+    endforeach()
+    if (DEFINED GGML_AVAILABLE_BACKENDS)
+        foreach (_lop_target ${GGML_AVAILABLE_BACKENDS})
+            _ggml_apply_lib_output_prefix(${_lop_target})
+        endforeach()
+    endif()
+
+    if (TARGET ggml-base)
+        target_compile_definitions(ggml-base PRIVATE
+            GGML_BACKEND_DL_PROJECT_PREFIX="${GGML_LIB_OUTPUT_PREFIX}")
+    endif()
+
+    message(STATUS "ggml: lib output prefix '${GGML_LIB_OUTPUT_PREFIX}'")
+endif()
+
 #
 # tests and examples
 #
@@ -324,6 +368,7 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-cpp.h
     include/ggml-cuda.h
     include/ggml-opt.h
+    include/ggml-opencl.h
     include/ggml-metal.h
     include/ggml-rpc.h
     include/ggml-virtgpu.h
diff --git a/cmake/ggml-config.cmake.in b/cmake/ggml-config.cmake.in
index 91c9d5cd34..09e2ca9a9c 100644
--- a/cmake/ggml-config.cmake.in
+++ b/cmake/ggml-config.cmake.in
@@ -104,7 +104,7 @@ set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")
 if(NOT TARGET ggml::ggml)
     find_package(Threads REQUIRED)
 
-    find_library(GGML_LIBRARY ggml
+    find_library(GGML_LIBRARY NAMES "${GGML_LIB_OUTPUT_PREFIX}ggml" ggml
         REQUIRED
         HINTS ${GGML_LIB_DIR}
         NO_CMAKE_FIND_ROOT_PATH)
@@ -112,25 +112,39 @@ if(NOT TARGET ggml::ggml)
     add_library(ggml::ggml UNKNOWN IMPORTED)
     set_target_properties(ggml::ggml
         PROPERTIES
-            IMPORTED_LOCATION "${GGML_LIBRARY}")
+            IMPORTED_LOCATION "${GGML_LIBRARY}"
+            INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}")
 
-    find_library(GGML_BASE_LIBRARY ggml-base
+    find_library(GGML_BASE_LIBRARY NAMES "${GGML_LIB_OUTPUT_PREFIX}ggml-base" ggml-base
         REQUIRED
         HINTS ${GGML_LIB_DIR}
         NO_CMAKE_FIND_ROOT_PATH)
 
     add_library(ggml::ggml-base UNKNOWN IMPORTED)
+    set_property(TARGET ggml::ggml-base PROPERTY INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}")
     set_target_properties(ggml::ggml-base
         PROPERTIES
             IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")
 
+    if(GGML_MAX_NAME)
+        set_property(TARGET ggml::ggml-base APPEND PROPERTY
+            INTERFACE_COMPILE_DEFINITIONS "GGML_MAX_NAME=${GGML_MAX_NAME}")
+    endif()
+
     set(_ggml_all_targets "")
-    if (NOT GGML_BACKEND_DL)
+    # In hybrid mode (GGML_BACKEND_DL + GGML_CPU_STATIC), only the CPU backend
+    # is static and must still be exported to downstream consumers.
+    if (NOT GGML_BACKEND_DL OR GGML_CPU_STATIC)
         foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
+            string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}")
+            if (GGML_BACKEND_DL AND GGML_CPU_STATIC AND NOT is_cpu_variant)
+                continue()
+            endif()
             string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
             string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx)
 
-            find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend}
+            find_library(${_ggml_backend_pfx}_LIBRARY
+                NAMES "${GGML_LIB_OUTPUT_PREFIX}${_ggml_backend}" "${_ggml_backend}"
                 REQUIRED
                 HINTS ${GGML_LIB_DIR}
                 NO_CMAKE_FIND_ROOT_PATH)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 78853304d9..d619c21623 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -185,8 +185,10 @@ endif()
 
 # ggml
 
-if (GGML_BACKEND_DL AND NOT BUILD_SHARED_LIBS)
-    message(FATAL_ERROR "GGML_BACKEND_DL requires BUILD_SHARED_LIBS")
+# GGML_BACKEND_DL works with static core when PIC is enabled below.
+
+if (GGML_CPU_STATIC AND GGML_CPU_ALL_VARIANTS)
+    message(FATAL_ERROR "GGML_CPU_STATIC is incompatible with GGML_CPU_ALL_VARIANTS")
 endif()
 
 add_library(ggml-base
@@ -221,6 +223,10 @@ if (GGML_SCHED_NO_REALLOC)
     target_compile_definitions(ggml-base PUBLIC GGML_SCHED_NO_REALLOC)
 endif()
 
+if (DEFINED GGML_MAX_NAME)
+    target_compile_definitions(ggml-base PUBLIC GGML_MAX_NAME=${GGML_MAX_NAME})
+endif()
+
 add_library(ggml
             ggml-backend-dl.cpp
             ggml-backend-reg.cpp)
@@ -245,10 +251,11 @@ if (CMAKE_SYSTEM_NAME MATCHES "Linux")
 endif()
 
 function(ggml_add_backend_library backend)
-    if (GGML_BACKEND_DL)
+    if (GGML_BACKEND_DL AND NOT (GGML_CPU_STATIC AND ${backend} MATCHES "^ggml-cpu"))
         add_library(${backend} MODULE ${ARGN})
         # write the shared library to the output directory
-        set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
+        set_target_properties(${backend} PROPERTIES
+            LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
         target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
         add_dependencies(ggml ${backend})
         if (GGML_BACKEND_DIR)
@@ -265,7 +272,7 @@ function(ggml_add_backend_library backend)
     target_link_libraries(${backend} PRIVATE ggml-base)
     target_include_directories(${backend} PRIVATE ..)
 
-    if (${BUILD_SHARED_LIBS})
+    if (BUILD_SHARED_LIBS OR GGML_BACKEND_DL)
         target_compile_definitions(${backend} PRIVATE GGML_BACKEND_BUILD)
         target_compile_definitions(${backend} PUBLIC  GGML_BACKEND_SHARED)
     endif()
@@ -301,6 +308,9 @@ function(ggml_add_backend backend)
             string(TOUPPER "GGML_USE_${backend}" backend_use)
             target_compile_definitions(ggml PUBLIC ${backend_use})
         endif()
+        if (GGML_CPU_STATIC AND "${backend}" STREQUAL "CPU")
+            target_compile_definitions(ggml PUBLIC GGML_USE_CPU)
+        endif()
     endif()
 endfunction()
 
@@ -484,7 +494,7 @@ if(CMAKE_SYSTEM_NAME MATCHES "visionOS")
     target_compile_definitions(ggml-base PUBLIC _DARWIN_C_SOURCE)
 endif()
 
-if (BUILD_SHARED_LIBS)
+if (BUILD_SHARED_LIBS OR GGML_BACKEND_DL)
     foreach (target ggml-base ggml)
         set_target_properties(${target} PROPERTIES POSITION_INDEPENDENT_CODE ON)
         target_compile_definitions(${target} PRIVATE GGML_BUILD)
diff --git a/src/ggml-backend-reg.cpp b/src/ggml-backend-reg.cpp
index 20b3a5482f..8fa8102436 100644
--- a/src/ggml-backend-reg.cpp
+++ b/src/ggml-backend-reg.cpp
@@ -462,9 +462,9 @@ static fs::path backend_filename_prefix() {
 #endif
 #else
 #ifdef _WIN32
-    return fs::u8path("ggml-");
+    return fs::u8path("qvac-speech-ggml-");
 #else
-    return fs::u8path("libggml-");
+    return fs::u8path("libqvac-speech-ggml-");
 #endif
 #endif
 }
@@ -553,6 +553,13 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
                 }
             }
         }
+        // Android app packaging can flatten native libraries into one directory.
+        // If loading from the requested subdirectory fails, retry by filename only
+        // and leave lookup to dlopen's default search path resolution.
+        fs::path filename = backend_filename_prefix().native() + name_path.native() + backend_filename_extension().native();
+        if (auto reg = get_reg().load_backend(filename, silent)) {
+            return reg;
+        }
         return nullptr;
     }
 
diff --git a/src/ggml-opencl/ggml-opencl.cpp b/src/ggml-opencl/ggml-opencl.cpp
index 7c2e4bc2f9..cb42e27af0 100644
--- a/src/ggml-opencl/ggml-opencl.cpp
+++ b/src/ggml-opencl/ggml-opencl.cpp
@@ -4353,15 +4353,6 @@ static ggml_backend_i ggml_backend_opencl_i = {
 };
 
 ggml_backend_t ggml_backend_opencl_init(void) {
-    // qvac-parakeet patch: bail out cleanly when the OpenCL backend
-    // discovery saw zero usable devices. Upstream calls
-    // ggml_backend_reg_dev_get() unconditionally, which asserts on an
-    // empty device list. Parakeet's host code expects a nullable result
-    // from ggml_backend_opencl_init() (it falls back to CPU when the
-    // returned backend is null); the assertion makes that fallback path
-    // unreachable on hosts where ggml-opencl can't find any GPU it
-    // accepts (Adreno-only environments without an Adreno device,
-    // headless CI runners, etc.).
     ggml_backend_reg_t reg = ggml_backend_opencl_reg();
     if (ggml_backend_reg_dev_count(reg) == 0) {
         return nullptr;
@@ -6272,7 +6263,11 @@ static size_t ggml_backend_opencl_reg_device_count(ggml_backend_reg_t reg) {
 }
 
 static ggml_backend_dev_t ggml_backend_opencl_reg_device_get(ggml_backend_reg_t reg, size_t index) {
-    GGML_ASSERT(index < ggml_backend_opencl_reg_device_count(reg));
+    size_t n = ggml_backend_opencl_reg_device_count(reg);
+    if (n == 0) {
+        return nullptr;
+    }
+    GGML_ASSERT(index < n);
 
     return &g_ggml_backend_opencl_devices[index];
 
diff --git a/src/ggml-vulkan/ggml-vulkan.cpp b/src/ggml-vulkan/ggml-vulkan.cpp
index 19e7fbdaae..9f1b42749e 100644
--- a/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/src/ggml-vulkan/ggml-vulkan.cpp
@@ -23,8 +23,14 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
 
 #include <algorithm>
 #include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <filesystem>
+#include <fstream>
 #include <iomanip>
 #include <iostream>
+#include <system_error>
 #include <tuple>
 #include <vector>
 #include <deque>
@@ -864,6 +870,24 @@ struct vk_device_struct {
     bool allow_sysmem_fallback;
     bool disable_graph_optimize;
 
+    // Optional persistent VkPipelineCache.  Enabled only when the caller
+    // sets GGML_VK_PIPELINE_CACHE_DIR to a non-empty path.  When enabled,
+    // createPipelineCache is seeded from disk at init and getPipelineCacheData
+    // is written back from ggml_vk_cleanup(), so repeated
+    // ggml_backend_vk_init() invocations (and separate processes) skip the
+    // shader-compile wave that Vulkan normally pays on every cold
+    // command-buffer graph-build.  When pipeline_cache is VK_NULL_HANDLE
+    // (default / opt-out / mkdir failure) behaviour is identical to upstream
+    // (createComputePipeline takes VK_NULL_HANDLE, which is legal).
+    vk::PipelineCache pipeline_cache = VK_NULL_HANDLE;
+    std::string       pipeline_cache_path;
+    // QVAC-17872 round-2: bytes already on disk for this cache.  Used by
+    // the eager flush in ggml_vk_load_shaders to skip the disk write on
+    // pure cache-hit paths (warm runs where every pipeline came from the
+    // seed blob): if getPipelineCacheData().size() == this value, the
+    // cache content is unchanged and there is nothing to persist.
+    size_t            pipeline_cache_last_size = 0;
+
     std::unique_ptr<vk_memory_logger> memory_logger;
 
     ~vk_device_struct() {
@@ -888,10 +912,71 @@ struct vk_device_struct {
 
         device.destroyDescriptorSetLayout(dsl);
 
+        // Destroy the VkPipelineCache handle here if it's still alive.  The
+        // on-disk persistence happens earlier, in ggml_vk_cleanup(), because
+        // this destructor is not reliably reached at process exit: pipelines
+        // and helpers hold shared_ptr<vk_device_struct> refs that keep the
+        // refcount above 0 until well after the Vulkan dispatcher is gone.
+        if (pipeline_cache) {
+            device.destroyPipelineCache(pipeline_cache);
+            pipeline_cache = VK_NULL_HANDLE;
+        }
+
         device.destroy();
     }
 };
 
+// Flush the optional persistent pipeline cache to disk.  Called from
+// ggml_vk_cleanup() while the device shared_ptr is still alive and the
+// Vulkan dispatcher is still valid.  Safe to call multiple times per device
+// (the write is atomic via tmp + rename; idempotent).  No-op when persistent
+// caching was not enabled at init time.
+static void ggml_vk_save_pipeline_cache(vk_device & device) {
+    if (!device || !device->pipeline_cache || device->pipeline_cache_path.empty()) {
+        return;
+    }
+    try {
+        const std::vector<uint8_t> blob = device->device.getPipelineCacheData(device->pipeline_cache);
+        if (blob.empty()) {
+            return;
+        }
+        // QVAC-17872 round-2: skip the disk write if the cache content
+        // is byte-equivalent in size to what we already have on disk.
+        // Avoids re-writing 1 MB on every cleanup of a process that
+        // didn't compile any new pipelines (warm runs).  The eager-flush
+        // path in ggml_vk_load_shaders uses the same pipeline_cache_last_size
+        // bookkeeping so they cooperate idempotently.
+        if (blob.size() == device->pipeline_cache_last_size) {
+            return;
+        }
+        const std::filesystem::path final_path(device->pipeline_cache_path);
+        std::filesystem::path tmp_path = final_path;
+        tmp_path += ".tmp";
+        std::ofstream out(tmp_path, std::ios::binary | std::ios::trunc);
+        if (!out) {
+            return;
+        }
+        out.write(reinterpret_cast<const char *>(blob.data()),
+                  static_cast<std::streamsize>(blob.size()));
+        out.close();
+        if (out.good()) {
+            std::error_code ec;
+            std::filesystem::rename(tmp_path, final_path, ec);
+            if (!ec) {
+                device->pipeline_cache_last_size = blob.size();
+            } else {
+                std::error_code ignore;
+                std::filesystem::remove(tmp_path, ignore);
+            }
+        } else {
+            std::error_code ignore;
+            std::filesystem::remove(tmp_path, ignore);
+        }
+    } catch (const std::exception &) {
+        // best-effort; silently drop the write
+    }
+}
+
 void vk_command_pool::init(vk_device& device, vk_queue *q_) {
     cmd_buffers.clear();
     q = q_;
@@ -2206,7 +2291,10 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
 #endif
 
     try {
-        pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
+        // device->pipeline_cache is VK_NULL_HANDLE when persistent caching is
+        // opt-ed-out or its init failed; VK treats that as "no cache" — same
+        // as before this patch.
+        pipeline->pipeline = device->device.createComputePipeline(device->pipeline_cache, compute_pipeline_create_info).value;
     } catch (const vk::SystemError& e) {
         std::cerr << "ggml_vulkan: Compute pipeline creation failed for " << pipeline->name << std::endl;
         std::cerr << "ggml_vulkan: " << e.what() << std::endl;
@@ -4783,6 +4871,53 @@ static void ggml_vk_load_shaders(vk_device& device) {
     for (auto &c : compiles) {
         c.wait();
     }
+
+    // QVAC-17872 round-2: persist the pipeline cache eagerly when this
+    // load_shaders call actually GREW the cache (i.e. compiled at least
+    // one pipeline whose SPIR-V was not already in the seed blob).
+    // Without this, lazy-compile work done by
+    // ggml_pipeline_request_descriptor_sets during a long-running graph
+    // compute is only flushed in ggml_vk_cleanup at backend free time —
+    // a process crash in between throws away the entire cold-compile
+    // wave and the next process pays it again.
+    //
+    // Crucially, on a warm run with a populated seed blob, every
+    // pipeline still goes through createComputePipeline → compiles is
+    // non-empty → but getPipelineCacheData().size() == seed size, so we
+    // skip the disk write.  This keeps warm-run overhead at zero (we
+    // measured a +90 ms WALL regression with an unconditional flush).
+    if (!compiles.empty() && device->pipeline_cache && !device->pipeline_cache_path.empty()) {
+        try {
+            const std::vector<uint8_t> blob = device->device.getPipelineCacheData(device->pipeline_cache);
+            if (!blob.empty() && blob.size() > device->pipeline_cache_last_size) {
+                const std::filesystem::path final_path(device->pipeline_cache_path);
+                std::filesystem::path tmp_path = final_path;
+                tmp_path += ".tmp";
+                std::ofstream out(tmp_path, std::ios::binary | std::ios::trunc);
+                if (out) {
+                    out.write(reinterpret_cast<const char *>(blob.data()),
+                              static_cast<std::streamsize>(blob.size()));
+                    out.close();
+                    if (out.good()) {
+                        std::error_code ec;
+                        std::filesystem::rename(tmp_path, final_path, ec);
+                        if (!ec) {
+                            device->pipeline_cache_last_size = blob.size();
+                        } else {
+                            std::error_code ignore;
+                            std::filesystem::remove(tmp_path, ignore);
+                        }
+                    } else {
+                        std::error_code ignore;
+                        std::filesystem::remove(tmp_path, ignore);
+                    }
+                }
+            }
+        } catch (const std::exception &) {
+            // best-effort; on any failure we silently fall back to the
+            // ggml_vk_cleanup-time flush.
+        }
+    }
 }
 
 static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch);
@@ -5507,6 +5642,87 @@ static vk_device ggml_vk_get_device(size_t idx) {
         descriptor_set_layout_create_info.setPNext(&dslbfci);
         device->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info);
 
+        // -------------------------------------------------------------------
+        // Persistent VkPipelineCache (explicit opt-in only).
+        //
+        // Enabled by setting GGML_VK_PIPELINE_CACHE_DIR to a non-empty
+        // directory path.  When unset or empty the feature is off and
+        // behaviour is byte-identical to upstream ggml-vulkan.
+        //
+        // No auto-discovery of $XDG_CACHE_HOME or $HOME: ggml is a library
+        // distributed through package managers and consumed by applications
+        // that should decide whether and where to persist Vulkan artefacts.
+        // Writing to the user's home directory without being asked is a
+        // side effect library consumers cannot see from the API surface.
+        //
+        // Filename keyed on vendorID/deviceID/driverVersion; Vulkan itself
+        // validates the blob header and silently ignores stale data if the
+        // shader bundle or driver changed.
+        //
+        // The cache is consulted by createComputePipeline in
+        // ggml_vk_create_pipeline_func and flushed back to disk from
+        // ggml_vk_cleanup().  A cold first-process graph dispatch that used
+        // to pay seconds of shader compile drops to tens of ms on drivers
+        // without an aggressive per-app system cache (Mesa/RADV,
+        // Android Adreno/Mali, fresh NVIDIA installs, containers).
+        // See: QVAC-17872 for measured cold->warm deltas.
+        // -------------------------------------------------------------------
+        {
+            const char * env_dir = getenv("GGML_VK_PIPELINE_CACHE_DIR");
+
+            std::string dir;
+            if (env_dir != nullptr && *env_dir != '\0') {
+                dir = env_dir;
+            }
+
+            if (!dir.empty()) {
+                std::error_code mkec;
+                std::filesystem::create_directories(dir, mkec);
+                (void) mkec;  // on failure we still try createPipelineCache with an empty seed
+
+                char fname[64];
+                snprintf(fname, sizeof(fname),
+                         "%04x-%04x-%08x.pcache",
+                         (unsigned) device->properties.vendorID,
+                         (unsigned) device->properties.deviceID,
+                         (unsigned) device->properties.driverVersion);
+                device->pipeline_cache_path = (std::filesystem::path(dir) / fname).string();
+
+                std::vector<uint8_t> seed;
+                {
+                    std::ifstream in(device->pipeline_cache_path, std::ios::binary | std::ios::ate);
+                    if (in) {
+                        const std::streamoff n = in.tellg();
+                        if (n > 0) {
+                            seed.resize(static_cast<size_t>(n));
+                            in.seekg(0, std::ios::beg);
+                            in.read(reinterpret_cast<char *>(seed.data()), static_cast<std::streamsize>(seed.size()));
+                            if (!in) seed.clear();
+                        }
+                    }
+                }
+
+                vk::PipelineCacheCreateInfo pci(
+                    {},
+                    seed.size(),
+                    seed.empty() ? nullptr : seed.data());
+                try {
+                    device->pipeline_cache = device->device.createPipelineCache(pci);
+                    // QVAC-17872 round-2: seed size matches the disk blob;
+                    // if the eager-flush path observes the same size after
+                    // a load_shaders call, it's a pure cache-hit run and
+                    // the disk write is skipped.  The driver may rewrite
+                    // header fields that change blob.size() vs file size
+                    // by a few bytes — that's still a one-time growth and
+                    // we'll write the new size, then steady-state from there.
+                    device->pipeline_cache_last_size = seed.size();
+                } catch (const vk::SystemError &) {
+                    device->pipeline_cache = VK_NULL_HANDLE;
+                    device->pipeline_cache_path.clear();
+                }
+            }
+        }
+
         ggml_vk_load_shaders(device);
 
         // Only use transfer queue on AMD non-GCN, when the graphics queue is not enabled
@@ -13357,6 +13573,13 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
 // Clean up on backend free
 static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
     VK_LOG_DEBUG("ggml_vk_cleanup(" << ctx->name << ")");
+
+    // Persist the optional on-disk pipeline cache while the device shared_ptr
+    // and the Vulkan dispatcher are still valid.  Doing this from
+    // ~vk_device_struct() is unreliable: pipelines and helpers hold
+    // shared_ptr<vk_device_struct> refs that keep the refcount non-zero by
+    // typical process-exit time, so the device destructor often never runs.
+    ggml_vk_save_pipeline_cache(ctx->device);
     // discard any unsubmitted command buffers
     ctx->compute_ctx.reset();
     // wait for any pending command buffers to finish
@@ -15895,7 +16118,8 @@ static size_t ggml_backend_vk_reg_get_device_count(ggml_backend_reg_t reg) {
 }
 
 static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, size_t device) {
-    static std::vector<ggml_backend_dev_t> devices;
+    static std::vector<std::unique_ptr<ggml_backend_device>> devices;
+    static std::vector<std::unique_ptr<ggml_backend_vk_device_context>> contexts;
 
     static bool initialized = false;
 
@@ -15905,7 +16129,7 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
         if (!initialized) {
             const int min_batch_size = getenv("GGML_OP_OFFLOAD_MIN_BATCH") ? atoi(getenv("GGML_OP_OFFLOAD_MIN_BATCH")) : 32;
             for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
-                ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
+                auto ctx = std::make_unique<ggml_backend_vk_device_context>();
                 char desc[256];
                 ggml_backend_vk_get_device_description(i, desc, sizeof(desc));
                 ctx->device = i;
@@ -15914,18 +16138,20 @@ static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg,
                 ctx->is_integrated_gpu = ggml_backend_vk_get_device_type(i) == vk::PhysicalDeviceType::eIntegratedGpu;
                 ctx->pci_bus_id = ggml_backend_vk_get_device_pci_id(i);
                 ctx->op_offload_min_batch_size = min_batch_size;
-                devices.push_back(new ggml_backend_device {
+                auto dev = std::make_unique<ggml_backend_device>(ggml_backend_device {
                     /* .iface   = */ ggml_backend_vk_device_i,
                     /* .reg     = */ reg,
-                    /* .context = */ ctx,
+                    /* .context = */ ctx.get(),
                 });
+                contexts.push_back(std::move(ctx));
+                devices.push_back(std::move(dev));
             }
             initialized = true;
         }
     }
 
     GGML_ASSERT(device < devices.size());
-    return devices[device];
+    return devices[device].get();
 }
 
 static const struct ggml_backend_reg_i ggml_backend_vk_reg_i = {