tetherto · gianni-cor · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026
diff --git a/src/ggml-adreno.h b/src/ggml-adreno.h
@@ -0,0 +1,87 @@
+#pragma once
+
+// Adreno GPU detection helper, shared between the Android backend-selection
+// logic in ggml-backend-reg.cpp and its unit test (tests/test-adreno-version.cpp).
+//
+// Header-only (inline) and dependency-free so the unit test can exercise the
+// pure parsing logic without linking the ggml runtime or needing a GPU.
+//
+// Mirrors the Adreno-version policy used by qvac-fabric-llm.cpp's ggml fork so
+// the speech stack (whisper / parakeet / tts) selects GPU backends the same
+// way the LLM stack does: on Android, detect the GPU through Vulkan (present
+// on virtually every Android GPU) and only fall back to OpenCL for Adreno,
+// whose Vulkan compute path is unstable.
+
+#include <algorithm>
+#include <cctype>
+#include <regex>
+#include <string>
+
+// Parse the Adreno GPU generation from a ggml device description.
+//   e.g. "Adreno (TM) 830" -> 830, "Adreno 730" -> 730
+// Returns:
+//   > 0 : the parsed Adreno generation number
+//   -1  : the description is not an Adreno GPU
+//   -3  : the description is an Adreno GPU but no positive version was parsed
+//
+// Matching is case-insensitive ("Adreno"/"ADRENO"/"adreno" all match), a small
+// hardening over the raw substring check in qvac-fabric-llm.cpp. Only digits
+// that follow the "dreno" marker are considered, so a number in front of the
+// GPU name (e.g. "Snapdragon 8 Gen 3 Adreno 750") is never taken as the version.
+// Limitation: the first digit run after the marker wins, so "Adreno X1-85"
+// (Snapdragon-X, Windows-on-ARM only) parses as 1; Android 5xx..8xx parse fine.
+inline int ggml_adreno_version_from_description(const std::string & gpu_description) {
+    std::string lowered = gpu_description;
+    std::transform(lowered.begin(), lowered.end(), lowered.begin(), [](unsigned char c) {
+        return static_cast<char>(std::tolower(c));
+    });
+    const std::string::size_type marker = lowered.find("dreno");
+    if (marker == std::string::npos) {
+        return -1;
+    }
+
+    const std::string tail = lowered.substr(marker);  // digits before the marker are ignored
+    static const std::regex digits_regex(R"((\d+))");
+    std::smatch matches;
+    if (!std::regex_search(tail, matches, digits_regex)) {
+        return -3;  // Adreno, but no digit run follows the name
+    }
+    try {
+        const int version = std::stoi(matches[1].str());
+        return version > 0 ? version : -3;  // 0 is not a valid generation
+    } catch (const std::exception &) {
+        return -3;  // digit run does not fit in an int
+    }
+}
+
+// Android OpenCL/Vulkan backend policy decision, factored out of
+// ggml_backend_load_all_from_path() so the version thresholds are unit-testable
+// without a GPU. Input is the smallest Adreno generation among the GPU devices
+// (the value ggml_backend_min_adreno_version() returns: a positive generation,
+// or <= 0 when no Adreno GPU is present).
+//
+//   not Adreno (<= 0) -> no OpenCL; keep Vulkan/CPU
+//   Adreno > 700      -> load OpenCL (kept alongside Vulkan; the consumer picks
+//                        OpenCL over Vulkan -- see transcription-whispercpp)
+//   Adreno 1..700     -> CPU only: unload Vulkan and don't load OpenCL
+//                        (only Adreno above 700 has a stable ggml GPU path)
+//
+// Note: this is stricter than qvac-fabric-llm.cpp, which loads OpenCL on
+// Adreno <= 600. We treat Adreno <= 600 the same as 601..700 (CPU only): older
+// Adreno GPUs are no more capable than the 601..700 tier we already exclude, so
+// there is no reason to expose a GPU backend on them.
+struct ggml_adreno_backend_policy {
+    bool load_opencl;    // load the OpenCL backend
+    bool unload_vulkan;  // unload the already-loaded Vulkan backend (CPU-only tier)
+};
+
+inline ggml_adreno_backend_policy ggml_adreno_resolve_backend_policy(int min_adreno_version) {
+    // Tiers: <= 0 not Adreno, 1..700 CPU only, > 700 OpenCL (Vulkan stays loaded).
+    const bool is_adreno   = min_adreno_version > 0;
+    const bool opencl_tier = min_adreno_version > 700;
+
+    ggml_adreno_backend_policy policy;
+    policy.load_opencl   = opencl_tier;                // only the > 700 tier gets OpenCL
+    policy.unload_vulkan = is_adreno && !opencl_tier;  // Adreno 1..700 drops Vulkan too
+    return policy;
+}
diff --git a/src/ggml-backend-reg.cpp b/src/ggml-backend-reg.cpp
@@ -2,9 +2,11 @@
 #include "ggml-backend.h"
 #include "ggml-backend-dl.h"
 #include "ggml-impl.h"
+#include "ggml-adreno.h"
 #include <algorithm>
 #include <cstring>
 #include <filesystem>
+#include <limits>
 #include <memory>
 #include <string>
 #include <type_traits>
@@ -687,6 +689,35 @@ void ggml_backend_load_all() {
     ggml_backend_load_all_from_path(nullptr);
 }
 
+#ifdef __ANDROID__
+namespace {
+// Lowest Adreno generation among the GPU devices exposed by `reg` (in practice
+// the Vulkan backend, which is present on virtually every Android GPU and so
+// lets the GPU be identified before deciding whether to load OpenCL).
+// Negative sentinels: -2 when `reg` is null; -1 when no device reports a
+// positive Adreno generation (non-Adreno GPU, or an unparseable Adreno version).
+int ggml_backend_min_adreno_version(ggml_backend_reg_t reg) {
+    if (reg == nullptr) {
+        return -2;
+    }
+    int lowest = std::numeric_limits<int>::max();  // doubles as the "nothing found yet" marker
+    for (size_t idx = 0; idx < ggml_backend_reg_dev_count(reg); ++idx) {
+        ggml_backend_dev_t device = ggml_backend_reg_dev_get(reg, idx);
+        if (device == nullptr) {
+            continue;
+        }
+        const char * text = ggml_backend_dev_description(device);
+        GGML_LOG_INFO("%s: found device description: %s\n", __func__, text ? text : "(null)");
+        const int version = ggml_adreno_version_from_description(text ? text : "");
+        if (version > 0 && version < lowest) {  // non-positive results (not Adreno / unparsed) are ignored
+            lowest = version;
+        }
+    }
+    return lowest == std::numeric_limits<int>::max() ? -1 : lowest;
+}
+} // namespace
+#endif // __ANDROID__
+
 void ggml_backend_load_all_from_path(const char * dir_path) {
 #ifdef NDEBUG
     bool silent = true;
@@ -704,7 +735,39 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
     ggml_backend_load_best("sycl", silent, dir_path);
     ggml_backend_load_best("vulkan", silent, dir_path);
     ggml_backend_load_best("virtgpu", silent, dir_path);
-    ggml_backend_load_best("opencl", silent, dir_path);
+
+    // OpenCL is only useful (and stable) for ggml on Adreno GPUs; on every
+    // other GPU the Adreno-tuned OpenCL kernels are either unsupported or buggy.
+    // On Android, use the already-loaded Vulkan backend to detect the GPU and
+    // only keep OpenCL for an Adreno that benefits from it. This mirrors
+    // qvac-fabric-llm.cpp's ggml fork so the speech stack selects backends the
+    // same way the LLM stack does. Off Android behaviour is unchanged (OpenCL is
+    // always loaded); on Android without a Vulkan backend OpenCL is skipped.
+    bool load_opencl = true;
+#ifdef __ANDROID__
+    {
+        ggml_backend_reg_t vulkan_backend = ggml_backend_reg_by_name("vulkan");
+        const int min_adreno_version = ggml_backend_min_adreno_version(vulkan_backend);
+        const ggml_adreno_backend_policy policy = ggml_adreno_resolve_backend_policy(min_adreno_version);
+        load_opencl = policy.load_opencl;
+        if (min_adreno_version <= 0) {
+            GGML_LOG_INFO("%s: no Adreno GPU detected (%d); skipping OpenCL, relying on Vulkan/CPU\n",
+                          __func__, min_adreno_version);
+        } else if (policy.unload_vulkan) {
+            GGML_LOG_INFO("%s: Adreno %d detected; removing Vulkan and relying on CPU only\n",
+                          __func__, min_adreno_version);
+            if (vulkan_backend != nullptr) {
+                ggml_backend_unload(vulkan_backend);
+            }
+        } else if (policy.load_opencl) {
+            GGML_LOG_INFO("%s: Adreno %d detected; keeping OpenCL backend\n", __func__, min_adreno_version);
+        }
+    }
+#endif // __ANDROID__
+    if (load_opencl) {
+        ggml_backend_load_best("opencl", silent, dir_path);
+    }
+
     ggml_backend_load_best("hexagon", silent, dir_path);
     ggml_backend_load_best("musa", silent, dir_path);
     ggml_backend_load_best("openvino", silent, dir_path);

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -168,6 +168,18 @@ target_link_libraries(${TEST_TARGET} PRIVATE ggml Threads::Threads)
 add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
 set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
 
+#
+# test-adreno-version
+# Pure unit test for src/ggml-adreno.h (the Adreno GPU-string parser and the
+# backend policy used by the Android OpenCL/Vulkan backend selection).
+# Header-only: needs the src/ include path, does not link ggml. Always built.
+
+set(TEST_TARGET test-adreno-version)
+add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
+target_include_directories(${TEST_TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)
+add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
+set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
+
 
 if (NOT GGML_BACKEND_DL)
     #

diff --git a/tests/test-adreno-version.cpp b/tests/test-adreno-version.cpp
@@ -0,0 +1,94 @@
+// Unit test for src/ggml-adreno.h: ggml_adreno_version_from_description() (the
+// pure GPU-string parser) and ggml_adreno_resolve_backend_policy() (the version
+// thresholds) behind the Android Adreno backend selection in ggml-backend-reg.cpp.
+//
+// Pure / header-only: does not link the ggml runtime or need a GPU, so it runs
+// on every host/CI. The hardware-dependent half of the selection
+// (ggml_backend_min_adreno_version + the actual backend load/unload) is exercised
+// end-to-end on the device farm via transcription-whispercpp's GPU test.
+
+#include "ggml-adreno.h"
+
+#include <cstdio>
+#include <string>
+
+static int g_failures = 0;  // failed expectations so far; main() exits non-zero if any
+
+static void expect_version(const std::string & description, int expected) {
+    const int actual = ggml_adreno_version_from_description(description);  // parse once, report below
+    if (actual == expected) {
+        std::printf("ok:   \"%s\" -> %d\n", description.c_str(), actual);
+        return;
+    }
+    std::printf("FAIL: \"%s\" -> %d (expected %d)\n", description.c_str(), actual, expected);
+    g_failures++;  // a mismatch is recorded, not fatal: the remaining cases still run
+}
+
+static void expect_policy(int min_adreno_version, bool load_opencl, bool unload_vulkan) {
+    const ggml_adreno_backend_policy actual = ggml_adreno_resolve_backend_policy(min_adreno_version);
+    if (actual.load_opencl == load_opencl && actual.unload_vulkan == unload_vulkan) {
+        std::printf("ok:   policy(%d) -> {load_opencl=%d, unload_vulkan=%d}\n",
+                    min_adreno_version, actual.load_opencl, actual.unload_vulkan);
+        return;
+    }
+    std::printf("FAIL: policy(%d) -> {load_opencl=%d, unload_vulkan=%d} (expected {%d, %d})\n",
+                min_adreno_version, actual.load_opencl, actual.unload_vulkan, load_opencl, unload_vulkan);
+    g_failures++;  // either flag differing counts as one failed case
+}
+
+int main() {
+    struct version_case { const char * description; int expected; };
+    struct policy_case  { int version; bool load_opencl; bool unload_vulkan; };
+    const version_case version_cases[] = {
+        // Real Adreno descriptions (as reported via the Vulkan device name).
+        { "Adreno (TM) 830", 830 },   // Samsung S25 (Snapdragon 8 Elite)
+        { "Adreno (TM) 750", 750 },   // Snapdragon 8 Gen 3
+        { "Adreno (TM) 740", 740 },
+        { "Adreno (TM) 660", 660 },
+        { "Adreno 730", 730 },        // no "(TM)" variant
+        { "Adreno(TM)619", 619 },     // no spaces
+        // Case-insensitive (hardening over the raw substring check in fabric-llm).
+        { "ADRENO 830", 830 },
+        { "adreno 612", 612 },
+        // Non-Adreno GPUs -> -1: these must keep using Vulkan / Metal, never OpenCL.
+        { "Mali-G715", -1 },          // Pixel 9 (proven to work on Vulkan)
+        { "Mali-G78 MP14", -1 },
+        { "NVIDIA GeForce RTX 5090", -1 },
+        { "AMD Radeon (RADV RAPHAEL_MENDOCINO)", -1 },
+        { "Apple M2", -1 },
+        { "llvmpipe (LLVM 20.1.2, 256 bits)", -1 },
+        { "Intel(R) Arc(TM) A770 Graphics", -1 },
+        { "", -1 },
+        // "dreno" present but no parseable number -> -3 (distinct from "not Adreno").
+        { "Adreno (TM)", -3 },
+        { "Adreno", -3 },
+    };
+    for (const version_case & c : version_cases) { expect_version(c.description, c.expected); }
+
+    // Backend policy {load_opencl, unload_vulkan} per Adreno generation.
+    const policy_case policy_cases[] = {
+        // Non-Adreno / no GPU -> no OpenCL, keep Vulkan/CPU.
+        { -2,  false, false },   // null Vulkan backend
+        { -1,  false, false },   // no Adreno GPU (e.g. Mali)
+        // Adreno 7xx / 8xx -> load OpenCL (Vulkan kept; consumer picks OpenCL).
+        { 830, true,  false },
+        { 750, true,  false },
+        { 730, true,  false },
+        { 701, true,  false },
+        // Adreno 1..700 -> CPU only (unload Vulkan, no OpenCL); 700 itself is NOT > 700.
+        { 700, false, true },
+        { 660, false, true },
+        { 601, false, true },
+        { 600, false, true },    // <= 600 is CPU-only too (stricter than qvac-fabric-llm.cpp)
+        { 500, false, true },
+        { 1,   false, true },    // smallest positive Adreno -> CPU only
+    };
+    for (const policy_case & c : policy_cases) { expect_policy(c.version, c.load_opencl, c.unload_vulkan); }
+
+    // Exit status mirrors the failure counter so CTest reports the result.
+    if (g_failures != 0) {
+        std::printf("%d Adreno-version parsing case(s) failed.\n", g_failures);
+        return 1;
+    }
+    std::printf("All Adreno-version parsing cases passed.\n");
+    return 0;
+}