Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions src/ggml-adreno.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#pragma once

// Adreno GPU detection helper, shared between the Android backend-selection
// logic in ggml-backend-reg.cpp and its unit test (tests/test-adreno-version.cpp).
//
// Header-only (inline) and dependency-free so the unit test can exercise the
// pure parsing logic without linking the ggml runtime or needing a GPU.
//
// Mirrors the Adreno-version policy used by qvac-fabric-llm.cpp's ggml fork so
// the speech stack (whisper / parakeet / tts) selects GPU backends the same
// way the LLM stack does: on Android, detect the GPU through Vulkan (present
// on virtually every Android GPU) and only fall back to OpenCL for Adreno,
// whose Vulkan compute path is unstable.

#include <algorithm>
#include <cctype>
#include <regex>
#include <string>

// Parse the Adreno GPU generation from a ggml device description.
// e.g. "Adreno (TM) 830" -> 830, "Adreno 730" -> 730
// Returns:
// > 0 : the parsed Adreno generation number
// -1 : the description is not an Adreno GPU
// -3 : the description is an Adreno GPU but the version failed to parse
//      (no digits after the Adreno marker, a value of 0, or a value that
//      does not fit in an int)
//
// The description is lowercased before matching so "Adreno"/"ADRENO"/"adreno"
// are all recognised (a small hardening over the raw substring check in
// qvac-fabric-llm.cpp; it never produces a false negative for Adreno).
//
// Only digits that FOLLOW the Adreno marker are considered, so a number that
// appears earlier in the string (e.g. "Snapdragon 8 Gen 3 Adreno (TM) 750")
// is not mistaken for the GPU generation.
//
// Limitation (inherited from qvac-fabric-llm.cpp): the first digit run after
// the marker wins, so a non-numeric model name like "Adreno X1-85" would parse
// as 1. That naming is Snapdragon-X (Windows-on-ARM) only; Android phone
// Adrenos are 5xx/6xx/7xx/8xx, which parse correctly.
inline int ggml_adreno_version_from_description(const std::string & gpu_description) {
    std::string lowered = gpu_description;
    std::transform(lowered.begin(), lowered.end(), lowered.begin(), [](unsigned char c) {
        return static_cast<char>(std::tolower(c));
    });

    const std::string::size_type marker_pos = lowered.find("dreno");
    if (marker_pos == std::string::npos) {
        return -1;
    }

    // Take unsigned char so std::isdigit is never called with a negative value.
    const auto is_digit = [](unsigned char c) { return std::isdigit(c) != 0; };

    // Locate the first run of decimal digits at or after the marker.
    const auto search_begin = lowered.cbegin() + static_cast<std::string::difference_type>(marker_pos);
    const auto digits_begin = std::find_if(search_begin, lowered.cend(), is_digit);
    if (digits_begin == lowered.cend()) {
        return -3;
    }
    const auto digits_end = std::find_if_not(digits_begin, lowered.cend(), is_digit);

    try {
        const int version = std::stoi(std::string(digits_begin, digits_end));
        // 0 is not a valid generation; keep the "> 0 or negative sentinel" contract.
        return version > 0 ? version : -3;
    } catch (const std::exception &) {
        // std::stoi throws std::out_of_range when the digit run overflows int.
        return -3;
    }
}

// Android OpenCL/Vulkan backend policy decision, factored out of
// ggml_backend_load_all_from_path() so the version thresholds are unit-testable
// without a GPU. Input is the smallest Adreno generation among the GPU devices
// (the value ggml_backend_min_adreno_version() returns: a positive generation,
// or <= 0 when no Adreno GPU is present).
//
// not Adreno (<= 0) -> no OpenCL; keep Vulkan/CPU
// Adreno > 700 -> load OpenCL (kept alongside Vulkan; the consumer picks
// OpenCL over Vulkan -- see transcription-whispercpp)
// Adreno 1..700 -> CPU only: unload Vulkan and don't load OpenCL
// (only Adreno 700+ has a stable ggml GPU path)
//
// Note: this is stricter than qvac-fabric-llm.cpp, which loads OpenCL on
// Adreno <= 600. We treat Adreno <= 600 the same as 601..700 (CPU only): older
// Adreno GPUs are no more capable than the 601..700 tier we already exclude, so
// there is no reason to expose a GPU backend on them.
// Outcome of the Android backend-selection policy for a given Adreno generation.
struct ggml_adreno_backend_policy {
    bool load_opencl;    // true when the OpenCL backend should be loaded
    bool unload_vulkan;  // true when the Vulkan backend should be unloaded
};

// Map the smallest detected Adreno generation to a backend policy.
//   min_adreno_version <= 0   -> { load_opencl = false, unload_vulkan = false }
//   min_adreno_version > 700  -> { load_opencl = true,  unload_vulkan = false }
//   1..700                    -> { load_opencl = false, unload_vulkan = true  }
inline ggml_adreno_backend_policy ggml_adreno_resolve_backend_policy(int min_adreno_version) {
    const bool adreno_present  = min_adreno_version > 0;
    const bool above_threshold = min_adreno_version > 700;

    ggml_adreno_backend_policy decision;
    // Above the threshold: OpenCL is loaded and Vulkan is left alone.
    decision.load_opencl   = adreno_present && above_threshold;
    // Adreno at or below the threshold: neither GPU backend is kept.
    decision.unload_vulkan = adreno_present && !above_threshold;
    return decision;
}
65 changes: 64 additions & 1 deletion src/ggml-backend-reg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
#include "ggml-backend.h"
#include "ggml-backend-dl.h"
#include "ggml-impl.h"
#include "ggml-adreno.h"
#include <algorithm>
#include <cstring>
#include <filesystem>
#include <limits>
#include <memory>
#include <string>
#include <type_traits>
Expand Down Expand Up @@ -687,6 +689,35 @@ void ggml_backend_load_all() {
ggml_backend_load_all_from_path(nullptr);
}

#ifdef __ANDROID__
namespace {
// Smallest Adreno generation among the GPU devices a (Vulkan) backend exposes,
// or a negative sentinel: -2 if `reg` is null, -1 if no Adreno GPU is present.
// Vulkan is used as the probe because it is present on virtually every Android
// GPU, so the GPU can be identified before deciding whether to load OpenCL.
// Mirrors qvac-fabric-llm.cpp's ggml fork (the LLM stack's backend selection).
// Scan every device exposed by `reg` and return the smallest positive Adreno
// generation parsed from the device descriptions.
// Returns -2 when `reg` is null and -1 when no device yields a positive
// Adreno generation.
int ggml_backend_min_adreno_version(ggml_backend_reg_t reg) {
    if (!reg) {
        return -2;
    }

    // Sentinel meaning "no Adreno device seen yet".
    constexpr int no_adreno_seen = std::numeric_limits<int>::max();
    int lowest_version = no_adreno_seen;

    for (size_t idx = 0; idx < ggml_backend_reg_dev_count(reg); ++idx) {
        ggml_backend_dev_t device = ggml_backend_reg_dev_get(reg, idx);
        if (device != nullptr) {
            const char * raw_description = ggml_backend_dev_description(device);
            GGML_LOG_INFO("%s: found device description: %s\n", __func__, raw_description ? raw_description : "(null)");

            // A null description is parsed as an empty string.
            const int version = ggml_adreno_version_from_description(raw_description ? raw_description : "");
            // Negative sentinels and 0 are ignored; only real generations compete.
            if (version > 0 && version < lowest_version) {
                lowest_version = version;
            }
        }
    }

    if (lowest_version == no_adreno_seen) {
        return -1;
    }
    return lowest_version;
}
} // namespace
#endif // __ANDROID__

void ggml_backend_load_all_from_path(const char * dir_path) {
#ifdef NDEBUG
bool silent = true;
Expand All @@ -704,7 +735,39 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
ggml_backend_load_best("sycl", silent, dir_path);
ggml_backend_load_best("vulkan", silent, dir_path);
ggml_backend_load_best("virtgpu", silent, dir_path);
ggml_backend_load_best("opencl", silent, dir_path);

// OpenCL is only useful (and stable) for ggml on Adreno GPUs; on every
// other GPU the Adreno-tuned OpenCL kernels are either unsupported or buggy.
// On Android, use the already-loaded Vulkan backend to detect the GPU and
// only keep OpenCL for an Adreno that benefits from it. This mirrors
// qvac-fabric-llm.cpp's ggml fork so the speech stack selects backends the
// same way the LLM stack does. Off Android behaviour is unchanged: OpenCL is
// loaded unconditionally here. On Android with no Vulkan backend registered the
// GPU cannot be identified, so OpenCL is skipped as well.
bool load_opencl = true;
#ifdef __ANDROID__
{
ggml_backend_reg_t vulkan_backend = ggml_backend_reg_by_name("vulkan");
const int min_adreno_version = ggml_backend_min_adreno_version(vulkan_backend);
const ggml_adreno_backend_policy policy = ggml_adreno_resolve_backend_policy(min_adreno_version);
load_opencl = policy.load_opencl;
if (min_adreno_version <= 0) {
GGML_LOG_INFO("%s: no Adreno GPU detected (%d); skipping OpenCL, relying on Vulkan/CPU\n",
__func__, min_adreno_version);
} else if (policy.unload_vulkan) {
GGML_LOG_INFO("%s: Adreno %d detected; removing Vulkan and relying on CPU only\n",
__func__, min_adreno_version);
if (vulkan_backend != nullptr) {
ggml_backend_unload(vulkan_backend);
}
} else if (policy.load_opencl) {
GGML_LOG_INFO("%s: Adreno %d detected; keeping OpenCL backend\n", __func__, min_adreno_version);
}
}
#endif // __ANDROID__
if (load_opencl) {
ggml_backend_load_best("opencl", silent, dir_path);
}

ggml_backend_load_best("hexagon", silent, dir_path);
ggml_backend_load_best("musa", silent, dir_path);
ggml_backend_load_best("openvino", silent, dir_path);
Expand Down
12 changes: 12 additions & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,18 @@ target_link_libraries(${TEST_TARGET} PRIVATE ggml Threads::Threads)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")

#
# test-adreno-version
# Pure unit test for src/ggml-adreno.h (Adreno GPU-string parser used by the
# Android OpenCL/Vulkan backend-selection policy). Header-only: needs the src/
# include path but does not link the ggml runtime. Always built.

# Target name; the source file is expected to be ${TEST_TARGET}.cpp in this directory.
set(TEST_TARGET test-adreno-version)
add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
# Expose ../src so the test can include "ggml-adreno.h"; no libraries are linked.
target_include_directories(${TEST_TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)
# Register with CTest.
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
# Sets LLVM_PROFILE_FILE in the test environment to a file named after the target.
set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")


if (NOT GGML_BACKEND_DL)
#
Expand Down
94 changes: 94 additions & 0 deletions tests/test-adreno-version.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
// Unit test for ggml_adreno_version_from_description() — the pure GPU-string
// parser behind the Android Adreno backend-selection policy in
// ggml-backend-reg.cpp (see src/ggml-adreno.h).
//
// Pure / header-only: does not link the ggml runtime or need a GPU, so it runs
// on every host/CI. The hardware-dependent half of the policy
// (ggml_backend_min_adreno_version + the OpenCL load decision) is exercised
// end-to-end on the device farm via transcription-whispercpp's GPU test.

#include "ggml-adreno.h"

#include <cstdio>
#include <string>

static int g_failures = 0;

// Run the parser on `description` and compare against `expected`.
// Prints one "ok"/"FAIL" line per case and bumps g_failures on a mismatch.
static void expect_version(const std::string & description, int expected) {
    const int actual = ggml_adreno_version_from_description(description);
    if (actual == expected) {
        std::printf("ok: \"%s\" -> %d\n", description.c_str(), actual);
        return;
    }
    std::printf("FAIL: \"%s\" -> %d (expected %d)\n", description.c_str(), actual, expected);
    g_failures++;
}

static void expect_policy(int min_adreno_version, bool load_opencl, bool unload_vulkan) {
const ggml_adreno_backend_policy got = ggml_adreno_resolve_backend_policy(min_adreno_version);
if (got.load_opencl != load_opencl || got.unload_vulkan != unload_vulkan) {
std::printf("FAIL: policy(%d) -> {load_opencl=%d, unload_vulkan=%d} (expected {%d, %d})\n",
min_adreno_version, got.load_opencl, got.unload_vulkan, load_opencl, unload_vulkan);
g_failures++;
} else {
std::printf("ok: policy(%d) -> {load_opencl=%d, unload_vulkan=%d}\n",
min_adreno_version, got.load_opencl, got.unload_vulkan);
}
}

// Table-driven test entry point: runs every parser case, then every policy
// case, and returns 0 only when no case failed.
int main() {
    struct version_case {
        const char * description;
        int          expected;
    };
    struct policy_case {
        int  min_adreno_version;
        bool load_opencl;
        bool unload_vulkan;
    };

    static const version_case version_cases[] = {
        // Real Adreno descriptions (as reported via the Vulkan device name).
        { "Adreno (TM) 830", 830 },  // Samsung S25 (Snapdragon 8 Elite)
        { "Adreno (TM) 750", 750 },  // Snapdragon 8 Gen 3
        { "Adreno (TM) 740", 740 },
        { "Adreno (TM) 660", 660 },
        { "Adreno 730", 730 },       // no "(TM)" variant
        { "Adreno(TM)619", 619 },    // no spaces

        // Case-insensitive (hardening over the raw substring check in fabric-llm).
        { "ADRENO 830", 830 },
        { "adreno 612", 612 },

        // Non-Adreno GPUs -> -1 (not Adreno). These are the devices that must
        // keep using Vulkan / Metal, never OpenCL.
        { "Mali-G715", -1 },         // Pixel 9 (proven to work on Vulkan)
        { "Mali-G78 MP14", -1 },
        { "NVIDIA GeForce RTX 5090", -1 },
        { "AMD Radeon (RADV RAPHAEL_MENDOCINO)", -1 },
        { "Apple M2", -1 },
        { "llvmpipe (LLVM 20.1.2, 256 bits)", -1 },
        { "Intel(R) Arc(TM) A770 Graphics", -1 },
        { "", -1 },

        // "dreno" present but no parseable number -> -3 (treated as "no usable
        // Adreno version" by the caller; distinct from "not Adreno").
        { "Adreno (TM)", -3 },
        { "Adreno", -3 },
    };

    // Backend policy {load_opencl, unload_vulkan} per Adreno generation.
    static const policy_case policy_cases[] = {
        // Non-Adreno / no GPU -> no OpenCL, keep Vulkan/CPU.
        { -2, false, false },   // null Vulkan backend
        { -1, false, false },   // no Adreno GPU (e.g. Mali)
        // Adreno 7xx / 8xx -> load OpenCL (Vulkan kept; consumer picks OpenCL).
        { 830, true, false },
        { 750, true, false },
        { 730, true, false },
        { 701, true, false },
        // Boundary: exactly 700 is NOT > 700 -> CPU-only tier.
        { 700, false, true },
        // Adreno 1..700 -> CPU only (unload Vulkan, no OpenCL). This now
        // includes Adreno <= 600, which is treated the same as 601..700
        // (stricter than qvac-fabric-llm.cpp, which loaded OpenCL on <= 600).
        { 660, false, true },
        { 601, false, true },
        { 600, false, true },   // <= 600 now CPU-only (was OpenCL)
        { 500, false, true },   // <= 600 now CPU-only (was OpenCL)
        { 1, false, true },     // smallest positive Adreno -> CPU only
    };

    for (const version_case & tc : version_cases) {
        expect_version(tc.description, tc.expected);
    }
    for (const policy_case & tc : policy_cases) {
        expect_policy(tc.min_adreno_version, tc.load_opencl, tc.unload_vulkan);
    }

    if (g_failures != 0) {
        std::printf("%d Adreno-version parsing case(s) failed.\n", g_failures);
        return 1;
    }
    std::printf("All Adreno-version parsing cases passed.\n");
    return 0;
}