diff --git a/src/ggml-adreno.h b/src/ggml-adreno.h new file mode 100644 index 0000000000..16c9d7407e --- /dev/null +++ b/src/ggml-adreno.h @@ -0,0 +1,87 @@ +#pragma once + +// Adreno GPU detection helper, shared between the Android backend-selection +// logic in ggml-backend-reg.cpp and its unit test (tests/test-adreno-version.cpp). +// +// Header-only (inline) and dependency-free so the unit test can exercise the +// pure parsing logic without linking the ggml runtime or needing a GPU. +// +// Mirrors the Adreno-version policy used by qvac-fabric-llm.cpp's ggml fork so +// the speech stack (whisper / parakeet / tts) selects GPU backends the same +// way the LLM stack does: on Android, detect the GPU through Vulkan (present +// on virtually every Android GPU) and only fall back to OpenCL for Adreno, +// whose Vulkan compute path is unstable. + +#include +#include +#include +#include + +// Parse the Adreno GPU generation from a ggml device description. +// e.g. "Adreno (TM) 830" -> 830, "Adreno 730" -> 730 +// Returns: +// > 0 : the parsed Adreno generation number +// -1 : the description is not an Adreno GPU +// -3 : the description is an Adreno GPU but the version failed to parse +// +// The description is lowercased before matching so "Adreno"/"ADRENO"/"adreno" +// are all recognised (a small hardening over the raw substring check in +// qvac-fabric-llm.cpp; it never produces a false negative for Adreno). +// +// Limitation (inherited from qvac-fabric-llm.cpp): the first digit run wins, so +// a non-numeric model name like "Adreno X1-85" would parse as 1. That naming is +// Snapdragon-X (Windows-on-ARM) only; Android phone Adrenos are 5xx/6xx/7xx/8xx, +// which parse correctly. 
// Extracts the Adreno generation number from a GPU device description.
//
// Returns:
//   > 0 : the parsed Adreno generation (e.g. "Adreno (TM) 830" -> 830)
//    -1 : the description does not contain "dreno" (matched case-insensitively)
//    -3 : the description names an Adreno GPU, but no positive generation
//         number could be parsed after the name
//
// The digit run is searched for starting at the "dreno" match, so a number that
// precedes the GPU name (for example an SoC name placed in front of it) is not
// mistaken for the generation. The first digit run after the name still wins.
inline int ggml_adreno_version_from_description(const std::string & gpu_description) {
    // Lowercase a copy so "Adreno", "ADRENO" and "adreno" all match.
    // The lambda takes unsigned char because std::tolower is undefined for
    // negative values other than EOF.
    std::string lowered = gpu_description;
    std::transform(lowered.begin(), lowered.end(), lowered.begin(), [](unsigned char c) {
        return static_cast<char>(std::tolower(c));
    });

    const std::string::size_type name_pos = lowered.find("dreno");
    if (name_pos == std::string::npos) {
        return -1;
    }

    // A plain scan for the first decimal digit run; no std::regex object needed.
    const char * const decimal_digits = "0123456789";
    const std::string::size_type first_digit = lowered.find_first_of(decimal_digits, name_pos);
    if (first_digit == std::string::npos) {
        return -3;
    }
    const std::string::size_type past_digits = lowered.find_first_not_of(decimal_digits, first_digit);
    const std::string digit_run = lowered.substr(
        first_digit, past_digits == std::string::npos ? std::string::npos : past_digits - first_digit);

    try {
        const int version = std::stoi(digit_run);
        // "> 0" is the documented success range; a parsed 0 is not a usable generation.
        return version > 0 ? version : -3;
    } catch (const std::exception &) {
        // std::stoi throws std::out_of_range for digit runs that do not fit in int.
        return -3;
    }
}

// Android OpenCL/Vulkan backend policy decision, factored out of
// ggml_backend_load_all_from_path() so the version thresholds are unit-testable
// without a GPU. Input is the smallest Adreno generation among the GPU devices
// (the value ggml_backend_min_adreno_version() returns: a positive generation,
// or <= 0 when no Adreno GPU is present).
//
//   not Adreno (<= 0) -> no OpenCL; keep Vulkan/CPU
//   Adreno > 700      -> load OpenCL (kept alongside Vulkan; the consumer picks
//                        OpenCL over Vulkan -- see transcription-whispercpp)
//   Adreno 1..700     -> CPU only: unload Vulkan and don't load OpenCL
//                        (only Adreno 700+ has a stable ggml GPU path)
//
// Note: this is stricter than qvac-fabric-llm.cpp, which loads OpenCL on
// Adreno <= 600. We treat Adreno <= 600 the same as 601..700 (CPU only): older
// Adreno GPUs are no more capable than the 601..700 tier we already exclude, so
// there is no reason to expose a GPU backend on them.
// Result of the Android backend policy decision for the detected GPU.
struct ggml_adreno_backend_policy {
    bool load_opencl;    // true: the OpenCL backend should be loaded
    bool unload_vulkan;  // true: the already-loaded Vulkan backend should be unloaded
};

// Maps the smallest detected Adreno generation to a backend policy.
//
//   min_adreno_version <= 0   -> { load_opencl = false, unload_vulkan = false }
//   min_adreno_version > 700  -> { load_opencl = true,  unload_vulkan = false }
//   min_adreno_version 1..700 -> { load_opencl = false, unload_vulkan = true  }
inline ggml_adreno_backend_policy ggml_adreno_resolve_backend_policy(int min_adreno_version) {
    // Generations strictly above this value get OpenCL; 1..threshold is CPU only.
    constexpr int opencl_generation_threshold = 700;

    ggml_adreno_backend_policy policy{ /*load_opencl=*/ false, /*unload_vulkan=*/ false };
    if (min_adreno_version > opencl_generation_threshold) {
        policy.load_opencl = true;
    } else if (min_adreno_version > 0) {
        // Adreno 1..700 (incl. <= 600): CPU only.
        policy.unload_vulkan = true;
    }
    return policy;
}

// -----------------------------------------------------------------------------
// The code below belongs to src/ggml-backend-reg.cpp. The patch also adds two
// includes there: "ggml-adreno.h" and a second one whose header name is lost in
// this view (presumably <limits>, which std::numeric_limits requires). The
// helper is inserted after ggml_backend_load_all().
// -----------------------------------------------------------------------------
#ifdef __ANDROID__
namespace {
// Smallest Adreno generation among the GPU devices a (Vulkan) backend exposes,
// or a negative sentinel: -2 if `reg` is null, -1 if no device description
// parses to a positive Adreno generation.
// Vulkan is used as the probe because it is present on virtually every Android
// GPU, so the GPU can be identified before deciding whether to load OpenCL.
// Mirrors qvac-fabric-llm.cpp's ggml fork (the LLM stack's backend selection).
int ggml_backend_min_adreno_version(ggml_backend_reg_t reg) {
    if (reg == nullptr) {
        return -2;
    }

    // Sentinel meaning "no Adreno generation seen yet".
    constexpr int none_found = std::numeric_limits<int>::max();
    int min_found = none_found;

    const size_t dev_count = ggml_backend_reg_dev_count(reg);
    for (size_t i = 0; i < dev_count; i++) {
        ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, i);
        if (dev == nullptr) {
            continue;
        }
        const char * description = ggml_backend_dev_description(dev);
        GGML_LOG_INFO("%s: found device description: %s\n", __func__, description ? description : "(null)");
        // A null description is parsed as "", which yields -1 (not Adreno).
        const int dev_adreno_version = ggml_adreno_version_from_description(description ? description : "");
        if (dev_adreno_version > 0) {
            min_found = std::min(min_found, dev_adreno_version);
        }
    }
    return (min_found != none_found) ? min_found : -1;
}
} // namespace
#endif // __ANDROID__

// The next hunk modifies ggml_backend_load_all_from_path(): after the "sycl",
// "vulkan" and "virtgpu" backends are loaded, the unconditional
// ggml_backend_load_best("opencl", silent, dir_path) call is removed and
// replaced by the policy-driven logic that follows this comment.
//
// OpenCL is only useful (and stable) for ggml on Adreno GPUs; on every
// other GPU the Adreno-tuned OpenCL kernels are either unsupported or buggy.
// On Android, use the already-loaded Vulkan backend to detect the GPU and
// only keep OpenCL for an Adreno that benefits from it. This mirrors
// qvac-fabric-llm.cpp's ggml fork so the speech stack selects backends the
// same way the LLM stack does. Off Android behaviour is unchanged: OpenCL is
// loaded unconditionally here. On Android with no Vulkan backend the probe
// returns -2, which the policy treats like "no Adreno", so OpenCL is skipped.
+ bool load_opencl = true; +#ifdef __ANDROID__ + { + ggml_backend_reg_t vulkan_backend = ggml_backend_reg_by_name("vulkan"); + const int min_adreno_version = ggml_backend_min_adreno_version(vulkan_backend); + const ggml_adreno_backend_policy policy = ggml_adreno_resolve_backend_policy(min_adreno_version); + load_opencl = policy.load_opencl; + if (min_adreno_version <= 0) { + GGML_LOG_INFO("%s: no Adreno GPU detected (%d); skipping OpenCL, relying on Vulkan/CPU\n", + __func__, min_adreno_version); + } else if (policy.unload_vulkan) { + GGML_LOG_INFO("%s: Adreno %d detected; removing Vulkan and relying on CPU only\n", + __func__, min_adreno_version); + if (vulkan_backend != nullptr) { + ggml_backend_unload(vulkan_backend); + } + } else if (policy.load_opencl) { + GGML_LOG_INFO("%s: Adreno %d detected; keeping OpenCL backend\n", __func__, min_adreno_version); + } + } +#endif // __ANDROID__ + if (load_opencl) { + ggml_backend_load_best("opencl", silent, dir_path); + } + ggml_backend_load_best("hexagon", silent, dir_path); ggml_backend_load_best("musa", silent, dir_path); ggml_backend_load_best("openvino", silent, dir_path); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 7c28e344c5..cbaf33c57c 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -168,6 +168,18 @@ target_link_libraries(${TEST_TARGET} PRIVATE ggml Threads::Threads) add_test(NAME ${TEST_TARGET} COMMAND $) set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") +# +# test-adreno-version +# Pure unit test for src/ggml-adreno.h (Adreno GPU-string parser used by the +# Android OpenCL/Vulkan backend-selection policy). Header-only: needs the src/ +# include path but does not link the ggml runtime. Always built. 
+ +set(TEST_TARGET test-adreno-version) +add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) +target_include_directories(${TEST_TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src) +add_test(NAME ${TEST_TARGET} COMMAND $) +set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw") + if (NOT GGML_BACKEND_DL) # diff --git a/tests/test-adreno-version.cpp b/tests/test-adreno-version.cpp new file mode 100644 index 0000000000..462b51c25d --- /dev/null +++ b/tests/test-adreno-version.cpp @@ -0,0 +1,94 @@ +// Unit test for ggml_adreno_version_from_description() — the pure GPU-string +// parser behind the Android Adreno backend-selection policy in +// ggml-backend-reg.cpp (see src/ggml-adreno.h). +// +// Pure / header-only: does not link the ggml runtime or need a GPU, so it runs +// on every host/CI. The hardware-dependent half of the policy +// (ggml_backend_min_adreno_version + the OpenCL load decision) is exercised +// end-to-end on the device farm via transcription-whispercpp's GPU test. 
+ +#include "ggml-adreno.h" + +#include +#include + +static int g_failures = 0; + +static void expect_version(const std::string & description, int expected) { + const int got = ggml_adreno_version_from_description(description); + if (got != expected) { + std::printf("FAIL: \"%s\" -> %d (expected %d)\n", description.c_str(), got, expected); + g_failures++; + } else { + std::printf("ok: \"%s\" -> %d\n", description.c_str(), got); + } +} + +static void expect_policy(int min_adreno_version, bool load_opencl, bool unload_vulkan) { + const ggml_adreno_backend_policy got = ggml_adreno_resolve_backend_policy(min_adreno_version); + if (got.load_opencl != load_opencl || got.unload_vulkan != unload_vulkan) { + std::printf("FAIL: policy(%d) -> {load_opencl=%d, unload_vulkan=%d} (expected {%d, %d})\n", + min_adreno_version, got.load_opencl, got.unload_vulkan, load_opencl, unload_vulkan); + g_failures++; + } else { + std::printf("ok: policy(%d) -> {load_opencl=%d, unload_vulkan=%d}\n", + min_adreno_version, got.load_opencl, got.unload_vulkan); + } +} + +int main() { + // Real Adreno descriptions (as reported via the Vulkan device name). + expect_version("Adreno (TM) 830", 830); // Samsung S25 (Snapdragon 8 Elite) + expect_version("Adreno (TM) 750", 750); // Snapdragon 8 Gen 3 + expect_version("Adreno (TM) 740", 740); + expect_version("Adreno (TM) 660", 660); + expect_version("Adreno 730", 730); // no "(TM)" variant + expect_version("Adreno(TM)619", 619); // no spaces + + // Case-insensitive (hardening over the raw substring check in fabric-llm). + expect_version("ADRENO 830", 830); + expect_version("adreno 612", 612); + + // Non-Adreno GPUs -> -1 (not Adreno). These are the devices that must keep + // using Vulkan / Metal, never OpenCL. 
+ expect_version("Mali-G715", -1); // Pixel 9 (proven to work on Vulkan) + expect_version("Mali-G78 MP14", -1); + expect_version("NVIDIA GeForce RTX 5090", -1); + expect_version("AMD Radeon (RADV RAPHAEL_MENDOCINO)", -1); + expect_version("Apple M2", -1); + expect_version("llvmpipe (LLVM 20.1.2, 256 bits)", -1); + expect_version("Intel(R) Arc(TM) A770 Graphics", -1); + expect_version("", -1); + + // "dreno" present but no parseable number -> -3 (treated as "no usable + // Adreno version" by the caller; distinct from "not Adreno"). + expect_version("Adreno (TM)", -3); + expect_version("Adreno", -3); + + // Backend policy {load_opencl, unload_vulkan} per Adreno generation. + // Non-Adreno / no GPU -> no OpenCL, keep Vulkan/CPU. + expect_policy(-2, false, false); // null Vulkan backend + expect_policy(-1, false, false); // no Adreno GPU (e.g. Mali) + // Adreno 7xx / 8xx -> load OpenCL (Vulkan kept; consumer picks OpenCL). + expect_policy(830, true, false); + expect_policy(750, true, false); + expect_policy(730, true, false); + expect_policy(701, true, false); + // Boundary: exactly 700 is NOT > 700 -> CPU-only tier. + expect_policy(700, false, true); + // Adreno 1..700 -> CPU only (unload Vulkan, no OpenCL). This now includes + // Adreno <= 600, which is treated the same as 601..700 (stricter than + // qvac-fabric-llm.cpp, which loaded OpenCL on <= 600). + expect_policy(660, false, true); + expect_policy(601, false, true); + expect_policy(600, false, true); // <= 600 now CPU-only (was OpenCL) + expect_policy(500, false, true); // <= 600 now CPU-only (was OpenCL) + expect_policy(1, false, true); // smallest positive Adreno -> CPU only + + if (g_failures == 0) { + std::printf("All Adreno-version parsing cases passed.\n"); + return 0; + } + std::printf("%d Adreno-version parsing case(s) failed.\n", g_failures); + return 1; +}