Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
520 changes: 520 additions & 0 deletions PROGRESS.md

Large diffs are not rendered by default.

13 changes: 9 additions & 4 deletions patches/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@ standalone patches and are applied after the clone.
|--------|------------------|
| `ggml-metal-chatterbox-ops.patch` | Building with **Metal** (Apple Silicon T3 + full pipeline). |
| `ggml-opencl-chatterbox-ops.patch` | Building with **OpenCL** (e.g. Android / Termux + Adreno: `CONV_TRANSPOSE_1D` for HiFT, `SIN`, backend notes). |
| (none) | **CPU** / **CUDA** / **Vulkan** only — stock upstream `ggml` is enough. |
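| `ggml-vulkan-pipeline-cache.patch` | Building with **Vulkan** — persistent `VkPipelineCache` keyed by `<vendorID>-<deviceID>-<driverVersion>`, enabled by default whenever a cache directory can be resolved (`$GGML_VK_PIPELINE_CACHE_DIR`, else `$XDG_CACHE_HOME/ggml/vulkan`, else `$HOME/.cache/ggml/vulkan`). Recovers ~91 % of the cold→warm gap on the first warm run. Disabled by `GGML_VK_PIPELINE_CACHE_DIR=""`. |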
| `ggml-vulkan-eager-cache-save.patch` | Building with **Vulkan** — write back the pipeline cache after every `ggml_vk_load_shaders` compile batch (crash-safety against SIGKILL/abort losing freshly compiled pipelines). Stacks on the previous patch. |
| (none) | **CPU** / **CUDA** only — stock upstream `ggml` is enough. |

`setup-ggml.sh` always applies **both** patches in order (Metal, then
OpenCL). Extra OpenCL code is inert when you configure without
`GGML_OPENCL=ON`.
`setup-ggml.sh` always applies **all four** patches in order (Metal,
OpenCL, Vulkan-pipeline-cache, Vulkan-eager-cache-save). Each is
inert when you configure without the corresponding backend
(`GGML_METAL=ON` / `GGML_OPENCL=ON` / `GGML_VULKAN=ON`).

## Apply

Expand Down Expand Up @@ -46,6 +49,8 @@ git clone https://github.com/ggml-org/ggml.git ggml
cd ggml && git reset --hard $GGML_COMMIT && git clean -fdq
git apply ../patches/ggml-metal-chatterbox-ops.patch
git apply ../patches/ggml-opencl-chatterbox-ops.patch
git apply ../patches/ggml-vulkan-pipeline-cache.patch
git apply ../patches/ggml-vulkan-eager-cache-save.patch
```

`GGML_COMMIT` lives at the top of `scripts/setup-ggml.sh` as the
Expand Down
104 changes: 104 additions & 0 deletions patches/ggml-vulkan-eager-cache-save.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
diff --git a/src/ggml-vulkan/ggml-vulkan.cpp b/src/ggml-vulkan/ggml-vulkan.cpp
--- a/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/src/ggml-vulkan/ggml-vulkan.cpp
@@ -881,6 +881,12 @@
// VK_NULL_HANDLE, which is legal).
vk::PipelineCache pipeline_cache = VK_NULL_HANDLE;
std::string pipeline_cache_path;
+ // QVAC-17872 round-2: bytes already on disk for this cache. Used by
+ // the eager flush in ggml_vk_load_shaders to skip the disk write on
+ // pure cache-hit paths (warm runs where every pipeline came from the
+ // seed blob): if getPipelineCacheData().size() == this value, the
+ // cache content is assumed unchanged and there is nothing to persist.
+ size_t pipeline_cache_last_size = 0;

std::unique_ptr<vk_memory_logger> memory_logger;

@@ -934,6 +940,15 @@
if (blob.empty()) {
return;
}
+ // QVAC-17872 round-2: skip the disk write if the cache blob is
+ // the same size as what we already have on disk (size-only check).
+ // Avoids re-writing 1 MB on every cleanup of a process that
+ // didn't compile any new pipelines (warm runs). The eager-flush
+ // path in ggml_vk_load_shaders uses the same pipeline_cache_last_size
+ // bookkeeping so they cooperate idempotently.
+ if (blob.size() == device->pipeline_cache_last_size) {
+ return;
+ }
const std::string tmp_path = device->pipeline_cache_path + ".tmp";
std::ofstream out(tmp_path, std::ios::binary | std::ios::trunc);
if (!out) {
@@ -942,8 +957,9 @@
out.write(reinterpret_cast<const char *>(blob.data()),
static_cast<std::streamsize>(blob.size()));
out.close();
- if (out.good()) {
- (void) std::rename(tmp_path.c_str(), device->pipeline_cache_path.c_str());
+ if (out.good() &&
+ std::rename(tmp_path.c_str(), device->pipeline_cache_path.c_str()) == 0) {
+ device->pipeline_cache_last_size = blob.size();
} else {
(void) std::remove(tmp_path.c_str());
}
@@ -4846,6 +4862,44 @@
for (auto &c : compiles) {
c.wait();
}
+
+ // QVAC-17872 round-2: persist the pipeline cache eagerly when this
+ // load_shaders call actually GREW the cache (i.e. compiled at least
+ // one pipeline whose SPIR-V was not already in the seed blob).
+ // Without this, lazy-compile work done by
+ // ggml_pipeline_request_descriptor_sets during a long-running graph
+ // compute is only flushed in ggml_vk_cleanup at backend free time —
+ // a process crash in between throws away the entire cold-compile
+ // wave and the next process pays it again.
+ //
+ // Crucially, on a warm run with a populated seed blob, every
+ // pipeline still goes through createComputePipeline → compiles is
+ // non-empty → but getPipelineCacheData().size() == seed size, so we
+ // skip the disk write. This keeps warm-run overhead at zero (we
+ // measured a +90 ms WALL regression with an unconditional flush).
+ if (!compiles.empty() && device->pipeline_cache && !device->pipeline_cache_path.empty()) {
+ try {
+ const std::vector<uint8_t> blob = device->device.getPipelineCacheData(device->pipeline_cache);
+ if (!blob.empty() && blob.size() > device->pipeline_cache_last_size) {
+ const std::string tmp_path = device->pipeline_cache_path + ".tmp";
+ std::ofstream out(tmp_path, std::ios::binary | std::ios::trunc);
+ if (out) {
+ out.write(reinterpret_cast<const char *>(blob.data()),
+ static_cast<std::streamsize>(blob.size()));
+ out.close();
+ if (out.good() &&
+ std::rename(tmp_path.c_str(), device->pipeline_cache_path.c_str()) == 0) {
+ device->pipeline_cache_last_size = blob.size();
+ } else {
+ (void) std::remove(tmp_path.c_str());
+ }
+ }
+ }
+ } catch (const std::exception &) {
+ // best-effort; on any failure we silently fall back to the
+ // ggml_vk_cleanup-time flush.
+ }
+ }
}

static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch);
@@ -5638,6 +5692,14 @@
seed.empty() ? nullptr : seed.data());
try {
device->pipeline_cache = device->device.createPipelineCache(pci);
+ // QVAC-17872 round-2: seed size matches the disk blob;
+ // if the eager-flush path observes the same size after
+ // a load_shaders call, it's a pure cache-hit run and
+ // the disk write is skipped. The driver may rewrite
+ // header fields that change blob.size() vs file size
+ // by a few bytes — that's still a one-time growth and
+ // we'll write the new size, then steady-state from there.
+ device->pipeline_cache_last_size = seed.size();
} catch (const vk::SystemError &) {
device->pipeline_cache = VK_NULL_HANDLE;
device->pipeline_cache_path.clear();
199 changes: 199 additions & 0 deletions patches/ggml-vulkan-pipeline-cache.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
diff --git a/src/ggml-vulkan/ggml-vulkan.cpp b/src/ggml-vulkan/ggml-vulkan.cpp
index 19e7fbda..7c4d7ffe 100644
--- a/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/src/ggml-vulkan/ggml-vulkan.cpp
@@ -23,8 +23,14 @@ DispatchLoaderDynamic & ggml_vk_default_dispatcher();

#include <algorithm>
#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <filesystem>
+#include <fstream>
#include <iomanip>
#include <iostream>
+#include <system_error>
#include <tuple>
#include <vector>
#include <deque>
@@ -864,6 +870,18 @@ struct vk_device_struct {
bool allow_sysmem_fallback;
bool disable_graph_optimize;

+ // Optional persistent VkPipelineCache. When enabled via
+ // GGML_VK_PIPELINE_CACHE_DIR / $XDG_CACHE_HOME / $HOME, createPipelineCache
+ // is seeded from disk at init and getPipelineCacheData is written back
+ // from ggml_vk_cleanup(), so repeated ggml_backend_vk_init() invocations
+ // (and separate processes) skip the shader-compile wave that Vulkan
+ // normally pays on every cold command-buffer graph-build. When
+ // pipeline_cache is VK_NULL_HANDLE (opt-out / no cache dir / create failure)
+ // behaviour is identical to upstream (createComputePipeline takes
+ // VK_NULL_HANDLE, which is legal).
+ vk::PipelineCache pipeline_cache = VK_NULL_HANDLE;
+ std::string pipeline_cache_path;
+
std::unique_ptr<vk_memory_logger> memory_logger;

~vk_device_struct() {
@@ -888,10 +906,52 @@ struct vk_device_struct {

device.destroyDescriptorSetLayout(dsl);

+ // Destroy the VkPipelineCache handle here if it's still alive. The
+ // on-disk persistence happens earlier, in ggml_vk_cleanup(), because
+ // this destructor is not reliably reached at process exit: pipelines
+ // and helpers hold shared_ptr<vk_device_struct> refs that keep the
+ // refcount above 0 until well after the Vulkan dispatcher is gone.
+ if (pipeline_cache) {
+ device.destroyPipelineCache(pipeline_cache);
+ pipeline_cache = VK_NULL_HANDLE;
+ }
+
device.destroy();
}
};

+// Flush the optional persistent pipeline cache to disk. Called from
+// ggml_vk_cleanup() while the device shared_ptr is still alive and the
+// Vulkan dispatcher is still valid. Safe to call multiple times per device
+// (the write is atomic via tmp + rename; idempotent). No-op when persistent
+// caching was not enabled at init time.
+static void ggml_vk_save_pipeline_cache(vk_device & device) {
+ if (!device || !device->pipeline_cache || device->pipeline_cache_path.empty()) {
+ return;
+ }
+ try {
+ const std::vector<uint8_t> blob = device->device.getPipelineCacheData(device->pipeline_cache);
+ if (blob.empty()) {
+ return;
+ }
+ const std::string tmp_path = device->pipeline_cache_path + ".tmp";
+ std::ofstream out(tmp_path, std::ios::binary | std::ios::trunc);
+ if (!out) {
+ return;
+ }
+ out.write(reinterpret_cast<const char *>(blob.data()),
+ static_cast<std::streamsize>(blob.size()));
+ out.close();
+ if (out.good()) {
+ (void) std::rename(tmp_path.c_str(), device->pipeline_cache_path.c_str());
+ } else {
+ (void) std::remove(tmp_path.c_str());
+ }
+ } catch (const std::exception &) {
+ // best-effort; silently drop the write
+ }
+}
+
void vk_command_pool::init(vk_device& device, vk_queue *q_) {
cmd_buffers.clear();
q = q_;
@@ -2206,7 +2266,10 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
#endif

try {
- pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
+ // device->pipeline_cache is VK_NULL_HANDLE when persistent caching is
+ // opted out or its init failed; Vulkan treats that as "no cache" — same
+ // as before this patch.
+ pipeline->pipeline = device->device.createComputePipeline(device->pipeline_cache, compute_pipeline_create_info).value;
} catch (const vk::SystemError& e) {
std::cerr << "ggml_vulkan: Compute pipeline creation failed for " << pipeline->name << std::endl;
std::cerr << "ggml_vulkan: " << e.what() << std::endl;
@@ -5507,6 +5570,81 @@ static vk_device ggml_vk_get_device(size_t idx) {
descriptor_set_layout_create_info.setPNext(&dslbfci);
device->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info);

+ // -------------------------------------------------------------------
+ // Persistent VkPipelineCache (default-on when a cache dir resolves; opt-out).
+ //
+ // Disabled by setting GGML_VK_PIPELINE_CACHE_DIR to the empty string.
+ // Path priority:
+ // 1. $GGML_VK_PIPELINE_CACHE_DIR (if non-empty)
+ // 2. $XDG_CACHE_HOME/ggml/vulkan
+ // 3. $HOME/.cache/ggml/vulkan
+ // Filename keyed on vendorID/deviceID/driverVersion; Vulkan itself
+ // validates the blob header and silently ignores stale data if the
+ // shader bundle or driver changed.
+ //
+ // The cache is consulted by createComputePipeline in
+ // ggml_vk_create_pipeline_func and flushed back to disk from
+ // ggml_vk_cleanup(). A cold first-process graph dispatch that
+ // used to pay seconds of shader compile drops to tens of ms on
+ // drivers without an aggressive per-app system cache (Mesa/RADV,
+ // Android Adreno/Mali, fresh NVIDIA installs, containers).
+ // See: QVAC-17872 for measured cold→warm deltas.
+ // -------------------------------------------------------------------
+ {
+ const char * env_dir = getenv("GGML_VK_PIPELINE_CACHE_DIR");
+ const char * xdg_dir = getenv("XDG_CACHE_HOME");
+ const char * home_dir = getenv("HOME");
+
+ std::string dir;
+ if (env_dir != nullptr) {
+ // Explicit env var wins: non-empty -> use it; empty -> disabled.
+ if (*env_dir) dir = env_dir;
+ } else if (xdg_dir && *xdg_dir) {
+ dir = std::string(xdg_dir) + "/ggml/vulkan";
+ } else if (home_dir && *home_dir) {
+ dir = std::string(home_dir) + "/.cache/ggml/vulkan";
+ }
+
+ if (!dir.empty()) {
+ std::error_code mkec;
+ std::filesystem::create_directories(dir, mkec);
+ (void) mkec; // on failure we still try createPipelineCache with an empty seed
+
+ char fname[64];
+ snprintf(fname, sizeof(fname),
+ "%04x-%04x-%08x.pcache",
+ (unsigned) device->properties.vendorID,
+ (unsigned) device->properties.deviceID,
+ (unsigned) device->properties.driverVersion);
+ device->pipeline_cache_path = dir + "/" + fname;
+
+ std::vector<uint8_t> seed;
+ {
+ std::ifstream in(device->pipeline_cache_path, std::ios::binary | std::ios::ate);
+ if (in) {
+ const std::streamoff n = in.tellg();
+ if (n > 0) {
+ seed.resize(static_cast<size_t>(n));
+ in.seekg(0, std::ios::beg);
+ in.read(reinterpret_cast<char *>(seed.data()), static_cast<std::streamsize>(seed.size()));
+ if (!in) seed.clear();
+ }
+ }
+ }
+
+ vk::PipelineCacheCreateInfo pci(
+ {},
+ seed.size(),
+ seed.empty() ? nullptr : seed.data());
+ try {
+ device->pipeline_cache = device->device.createPipelineCache(pci);
+ } catch (const vk::SystemError &) {
+ device->pipeline_cache = VK_NULL_HANDLE;
+ device->pipeline_cache_path.clear();
+ }
+ }
+ }
+
ggml_vk_load_shaders(device);

// Only use transfer queue on AMD non-GCN, when the graphics queue is not enabled
@@ -13357,6 +13495,13 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
// Clean up on backend free
static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
VK_LOG_DEBUG("ggml_vk_cleanup(" << ctx->name << ")");
+
+ // Persist the optional on-disk pipeline cache while the device shared_ptr
+ // and the Vulkan dispatcher are still valid. Doing this from
+ // ~vk_device_struct() is unreliable: pipelines and helpers hold
+ // shared_ptr<vk_device_struct> refs that keep the refcount non-zero by
+ // typical process-exit time, so the device destructor often never runs.
+ ggml_vk_save_pipeline_cache(ctx->device);
// discard any unsubmitted command buffers
ctx->compute_ctx.reset();
// wait for any pending command buffers to finish
Loading