GustavoA1604 · GustavoA1604 · May 6, 2026 · May 6, 2026 · May 6, 2026 · May 6, 2026
diff --git a/PROGRESS.md b/PROGRESS.md
diff --git a/patches/README.md b/patches/README.md
@@ -8,11 +8,14 @@ standalone patches and are applied after the clone.
 |--------|------------------|
 | `ggml-metal-chatterbox-ops.patch` | Building with **Metal** (Apple Silicon T3 + full pipeline). |
 | `ggml-opencl-chatterbox-ops.patch` | Building with **OpenCL** (e.g. Android / Termux + Adreno: `CONV_TRANSPOSE_1D` for HiFT, `SIN`, backend notes). |
-| (none) | **CPU** / **CUDA** / **Vulkan** only — stock upstream `ggml` is enough. |
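+| `ggml-vulkan-pipeline-cache.patch` | Building with **Vulkan** — persistent on-disk `VkPipelineCache` keyed by `<vendorID>-<deviceID>-<driverVersion>`, enabled by default whenever a cache directory can be resolved (`$GGML_VK_PIPELINE_CACHE_DIR`, `$XDG_CACHE_HOME`, or `$HOME`).  Recovers ~91 % of the cold→warm gap on the first warm run.  Disable it by setting `GGML_VK_PIPELINE_CACHE_DIR=""`. |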
+| `ggml-vulkan-eager-cache-save.patch` | Building with **Vulkan** — write back the pipeline cache after every `ggml_vk_load_shaders` compile batch (crash-safety against SIGKILL/abort losing freshly compiled pipelines).  Stacks on the previous patch. |
+| (none) | **CPU** / **CUDA** only — stock upstream `ggml` is enough. |
 
-`setup-ggml.sh` always applies **both** patches in order (Metal, then
-OpenCL).  Extra OpenCL code is inert when you configure without
-`GGML_OPENCL=ON`.
+`setup-ggml.sh` always applies **all four** patches in order (Metal,
+OpenCL, Vulkan-pipeline-cache, Vulkan-eager-cache-save).  Each is
+inert when you configure without the corresponding backend
+(`GGML_METAL=ON` / `GGML_OPENCL=ON` / `GGML_VULKAN=ON`).
 
 ## Apply
 
@@ -46,6 +49,8 @@ git clone https://github.com/ggml-org/ggml.git ggml
 cd ggml && git reset --hard $GGML_COMMIT && git clean -fdq
 git apply ../patches/ggml-metal-chatterbox-ops.patch
 git apply ../patches/ggml-opencl-chatterbox-ops.patch
+git apply ../patches/ggml-vulkan-pipeline-cache.patch
+git apply ../patches/ggml-vulkan-eager-cache-save.patch
 ```
 
 `GGML_COMMIT` lives at the top of `scripts/setup-ggml.sh` as the

diff --git a/patches/ggml-vulkan-eager-cache-save.patch b/patches/ggml-vulkan-eager-cache-save.patch
@@ -0,0 +1,104 @@
+diff --git a/src/ggml-vulkan/ggml-vulkan.cpp b/src/ggml-vulkan/ggml-vulkan.cpp
+--- a/src/ggml-vulkan/ggml-vulkan.cpp
++++ b/src/ggml-vulkan/ggml-vulkan.cpp
+@@ -881,6 +881,12 @@
+     // VK_NULL_HANDLE, which is legal).
+     vk::PipelineCache pipeline_cache = VK_NULL_HANDLE;
+     std::string       pipeline_cache_path;
++    // QVAC-17872 round-2: bytes already on disk for this cache.  Used by
++    // the eager flush in ggml_vk_load_shaders to skip the disk write on
++    // pure cache-hit paths (warm runs where every pipeline came from the
++    // seed blob): if getPipelineCacheData().size() == this value, the
++    // cache content is unchanged and there is nothing to persist.
++    size_t            pipeline_cache_last_size = 0;
+
+     std::unique_ptr<vk_memory_logger> memory_logger;
+
+@@ -934,6 +940,15 @@
+         if (blob.empty()) {
+             return;
+         }
++        // QVAC-17872 round-2: skip the disk write if the blob is the same
++        // size as what we already have on disk (size is a cheap proxy for
++        // "unchanged", not a content comparison).  Avoids re-writing ~1 MB
++        // on every cleanup of a process that didn't compile any new
++        // pipelines (warm runs).  The eager-flush path in ggml_vk_load_shaders
++        // shares the pipeline_cache_last_size bookkeeping, so the two agree.
++        if (blob.size() == device->pipeline_cache_last_size) {
++            return;
++        }
+         const std::string tmp_path = device->pipeline_cache_path + ".tmp";
+         std::ofstream out(tmp_path, std::ios::binary | std::ios::trunc);
+         if (!out) {
+@@ -942,8 +957,9 @@
+         out.write(reinterpret_cast<const char *>(blob.data()),
+                   static_cast<std::streamsize>(blob.size()));
+         out.close();
+-        if (out.good()) {
+-            (void) std::rename(tmp_path.c_str(), device->pipeline_cache_path.c_str());
++        if (out.good() &&
++            std::rename(tmp_path.c_str(), device->pipeline_cache_path.c_str()) == 0) {
++            device->pipeline_cache_last_size = blob.size();
+         } else {
+             (void) std::remove(tmp_path.c_str());
+         }
+@@ -4846,6 +4862,44 @@
+     for (auto &c : compiles) {
+         c.wait();
+     }
++
++    // QVAC-17872 round-2: persist the pipeline cache eagerly when this
++    // load_shaders call actually GREW the cache (i.e. compiled at least
++    // one pipeline whose SPIR-V was not already in the seed blob).
++    // Without this, lazy-compile work done by
++    // ggml_pipeline_request_descriptor_sets during a long-running graph
++    // compute is only flushed in ggml_vk_cleanup at backend free time —
++    // a process crash in between throws away the entire cold-compile
++    // wave and the next process pays it again.
++    //
++    // Crucially, on a warm run with a populated seed blob, every
++    // pipeline still goes through createComputePipeline → compiles is
++    // non-empty → but getPipelineCacheData().size() == seed size, so we
++    // skip the disk write.  This keeps warm-run overhead at zero (we
++    // measured a +90 ms WALL regression with an unconditional flush).
++    if (!compiles.empty() && device->pipeline_cache && !device->pipeline_cache_path.empty()) {
++        try {
++            const std::vector<uint8_t> blob = device->device.getPipelineCacheData(device->pipeline_cache);
++            if (!blob.empty() && blob.size() > device->pipeline_cache_last_size) {
++                const std::string tmp_path = device->pipeline_cache_path + ".tmp";
++                std::ofstream out(tmp_path, std::ios::binary | std::ios::trunc);
++                if (out) {
++                    out.write(reinterpret_cast<const char *>(blob.data()),
++                              static_cast<std::streamsize>(blob.size()));
++                    out.close();
++                    if (out.good() &&
++                        std::rename(tmp_path.c_str(), device->pipeline_cache_path.c_str()) == 0) {
++                        device->pipeline_cache_last_size = blob.size();
++                    } else {
++                        (void) std::remove(tmp_path.c_str());
++                    }
++                }
++            }
++        } catch (const std::exception &) {
++            // best-effort; on any failure we silently fall back to the
++            // ggml_vk_cleanup-time flush.
++        }
++    }
+ }
+
+ static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch);
+@@ -5638,6 +5692,14 @@
+                     seed.empty() ? nullptr : seed.data());
+                 try {
+                     device->pipeline_cache = device->device.createPipelineCache(pci);
++                    // QVAC-17872 round-2: seed size matches the disk blob;
++                    // if the eager-flush path observes the same size after
++                    // a load_shaders call, it's a pure cache-hit run and
++                    // the disk write is skipped.  The driver may rewrite
++                    // header fields that change blob.size() vs file size
++                    // by a few bytes — that's still a one-time growth and
++                    // we'll write the new size, then steady-state from there.
++                    device->pipeline_cache_last_size = seed.size();
+                 } catch (const vk::SystemError &) {
+                     device->pipeline_cache = VK_NULL_HANDLE;
+                     device->pipeline_cache_path.clear();
diff --git a/patches/ggml-vulkan-pipeline-cache.patch b/patches/ggml-vulkan-pipeline-cache.patch
@@ -0,0 +1,199 @@
+diff --git a/src/ggml-vulkan/ggml-vulkan.cpp b/src/ggml-vulkan/ggml-vulkan.cpp
+index 19e7fbda..7c4d7ffe 100644
+--- a/src/ggml-vulkan/ggml-vulkan.cpp
++++ b/src/ggml-vulkan/ggml-vulkan.cpp
+@@ -23,8 +23,14 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();
+
+ #include <algorithm>
+ #include <cmath>
++#include <cstdio>
++#include <cstdlib>
++#include <cstring>
++#include <filesystem>
++#include <fstream>
+ #include <iomanip>
+ #include <iostream>
++#include <system_error>
+ #include <tuple>
+ #include <vector>
+ #include <deque>
+@@ -864,6 +870,18 @@ struct vk_device_struct {
+     bool allow_sysmem_fallback;
+     bool disable_graph_optimize;
+
++    // Optional persistent VkPipelineCache.  When enabled via
++    // GGML_VK_PIPELINE_CACHE_DIR / $XDG_CACHE_HOME / $HOME, createPipelineCache
++    // is seeded from disk at init and getPipelineCacheData is written back
++    // from ggml_vk_cleanup(), so repeated ggml_backend_vk_init() invocations
++    // (and separate processes) skip the shader-compile wave that Vulkan
++    // normally pays on every cold command-buffer graph-build.  When
++    // pipeline_cache is VK_NULL_HANDLE (opt-out / no cache dir / create failure)
++    // behaviour is identical to upstream (createComputePipeline takes
++    // VK_NULL_HANDLE, which is legal).
++    vk::PipelineCache pipeline_cache = VK_NULL_HANDLE;
++    std::string       pipeline_cache_path;
++
+     std::unique_ptr<vk_memory_logger> memory_logger;
+
+     ~vk_device_struct() {
+@@ -888,10 +906,52 @@ struct vk_device_struct {
+
+         device.destroyDescriptorSetLayout(dsl);
+
++        // Destroy the VkPipelineCache handle here if it's still alive.  The
++        // on-disk persistence happens earlier, in ggml_vk_cleanup(), because
++        // this destructor is not reliably reached at process exit: pipelines
++        // and helpers hold shared_ptr<vk_device_struct> refs that keep the
++        // refcount above 0 until well after the Vulkan dispatcher is gone.
++        if (pipeline_cache) {
++            device.destroyPipelineCache(pipeline_cache);
++            pipeline_cache = VK_NULL_HANDLE;
++        }
++
+         device.destroy();
+     }
+ };
+
++// Flush the optional persistent pipeline cache to disk.  Called from
++// ggml_vk_cleanup() while the device shared_ptr is still alive and the
++// Vulkan dispatcher is still valid.  Safe to call multiple times per device
++// (the write is atomic via tmp + rename; idempotent).  No-op when persistent
++// caching was not enabled at init time.
++static void ggml_vk_save_pipeline_cache(vk_device & device) {
++    if (!device || !device->pipeline_cache || device->pipeline_cache_path.empty()) {
++        return;
++    }
++    try {
++        const std::vector<uint8_t> blob = device->device.getPipelineCacheData(device->pipeline_cache);
++        if (blob.empty()) {
++            return;
++        }
++        const std::string tmp_path = device->pipeline_cache_path + ".tmp";
++        std::ofstream out(tmp_path, std::ios::binary | std::ios::trunc);
++        if (!out) {
++            return;
++        }
++        out.write(reinterpret_cast<const char *>(blob.data()),
++                  static_cast<std::streamsize>(blob.size()));
++        out.close();
++        if (out.good()) {
++            (void) std::rename(tmp_path.c_str(), device->pipeline_cache_path.c_str());
++        } else {
++            (void) std::remove(tmp_path.c_str());
++        }
++    } catch (const std::exception &) {
++        // best-effort; silently drop the write
++    }
++}
++
+ void vk_command_pool::init(vk_device& device, vk_queue *q_) {
+     cmd_buffers.clear();
+     q = q_;
+@@ -2206,7 +2266,10 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
+ #endif
+
+     try {
+-        pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
++        // device->pipeline_cache is VK_NULL_HANDLE when persistent caching is
++        // opt-ed-out or its init failed; VK treats that as "no cache" — same
++        // as before this patch.
++        pipeline->pipeline = device->device.createComputePipeline(device->pipeline_cache, compute_pipeline_create_info).value;
+     } catch (const vk::SystemError& e) {
+         std::cerr << "ggml_vulkan: Compute pipeline creation failed for " << pipeline->name << std::endl;
+         std::cerr << "ggml_vulkan: " << e.what() << std::endl;
+@@ -5507,6 +5570,81 @@ static vk_device ggml_vk_get_device(size_t idx) {
+         descriptor_set_layout_create_info.setPNext(&dslbfci);
+         device->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info);
+
++        // -------------------------------------------------------------------
++        // Persistent VkPipelineCache (default-on when a cache dir resolves; opt-out).
++        //
++        // Disabled by setting GGML_VK_PIPELINE_CACHE_DIR to the empty string.
++        // Path priority:
++        //   1. $GGML_VK_PIPELINE_CACHE_DIR (if non-empty)
++        //   2. $XDG_CACHE_HOME/ggml/vulkan
++        //   3. $HOME/.cache/ggml/vulkan
++        // Filename keyed on vendorID/deviceID/driverVersion; Vulkan itself
++        // validates the blob header and silently ignores stale data if the
++        // shader bundle or driver changed.
++        //
++        // The cache is consulted by createComputePipeline in
++        // ggml_vk_create_pipeline_func and flushed back to disk from
++        // ggml_vk_cleanup().  A cold first-process graph dispatch that
++        // used to pay seconds of shader compile drops to tens of ms on
++        // drivers without an aggressive per-app system cache (Mesa/RADV,
++        // Android Adreno/Mali, fresh NVIDIA installs, containers).
++        // See: QVAC-17872 for measured cold→warm deltas.
++        // -------------------------------------------------------------------
++        {
++            const char * env_dir  = getenv("GGML_VK_PIPELINE_CACHE_DIR");
++            const char * xdg_dir  = getenv("XDG_CACHE_HOME");
++            const char * home_dir = getenv("HOME");
++
++            std::string dir;
++            if (env_dir != nullptr) {
++                // Explicit env var wins: non-empty -> use it; empty -> disabled.
++                if (*env_dir) dir = env_dir;
++            } else if (xdg_dir && *xdg_dir) {
++                dir = std::string(xdg_dir) + "/ggml/vulkan";
++            } else if (home_dir && *home_dir) {
++                dir = std::string(home_dir) + "/.cache/ggml/vulkan";
++            }
++
++            if (!dir.empty()) {
++                std::error_code mkec;
++                std::filesystem::create_directories(dir, mkec);
++                (void) mkec;  // on failure we still try createPipelineCache with an empty seed
++
++                char fname[64];
++                snprintf(fname, sizeof(fname),
++                         "%04x-%04x-%08x.pcache",
++                         (unsigned) device->properties.vendorID,
++                         (unsigned) device->properties.deviceID,
++                         (unsigned) device->properties.driverVersion);
++                device->pipeline_cache_path = dir + "/" + fname;
++
++                std::vector<uint8_t> seed;
++                {
++                    std::ifstream in(device->pipeline_cache_path, std::ios::binary | std::ios::ate);
++                    if (in) {
++                        const std::streamoff n = in.tellg();
++                        if (n > 0) {
++                            seed.resize(static_cast<size_t>(n));
++                            in.seekg(0, std::ios::beg);
++                            in.read(reinterpret_cast<char *>(seed.data()), static_cast<std::streamsize>(seed.size()));
++                            if (!in) seed.clear();
++                        }
++                    }
++                }
++
++                vk::PipelineCacheCreateInfo pci(
++                    {},
++                    seed.size(),
++                    seed.empty() ? nullptr : seed.data());
++                try {
++                    device->pipeline_cache = device->device.createPipelineCache(pci);
++                } catch (const vk::SystemError &) {
++                    device->pipeline_cache = VK_NULL_HANDLE;
++                    device->pipeline_cache_path.clear();
++                }
++            }
++        }
++
+         ggml_vk_load_shaders(device);
+
+         // Only use transfer queue on AMD non-GCN, when the graphics queue is not enabled
+@@ -13357,6 +13495,13 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
+ // Clean up on backend free
+ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
+     VK_LOG_DEBUG("ggml_vk_cleanup(" << ctx->name << ")");
++
++    // Persist the optional on-disk pipeline cache while the device shared_ptr
++    // and the Vulkan dispatcher are still valid.  Doing this from
++    // ~vk_device_struct() is unreliable: pipelines and helpers hold
++    // shared_ptr<vk_device_struct> refs that keep the refcount non-zero by
++    // typical process-exit time, so the device destructor often never runs.
++    ggml_vk_save_pipeline_cache(ctx->device);
+     // discard any unsubmitted command buffers
+     ctx->compute_ctx.reset();
+     // wait for any pending command buffers to finish