Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions packages/qvac-lib-infer-llamacpp-embed/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,17 @@
# Changelog

## [0.15.0] - 2026-04-30

### Added

#### Multi-GPU pipeline and tensor parallelism via `split-mode` config

- New `split-mode` (`'none'` | `'layer'` | `'row'`) and `tensor-split` config options enable distributing an embedding model across multiple GPUs via pipeline or tensor parallelism.
- When `split-mode` is `'layer'` or `'row'` and a GPU backend is available, the `--device` flag is omitted so llama.cpp distributes layers/rows across all available GPUs rather than pinning to a single device.
- When no GPU backend is available the addon falls back to CPU; a requested `split-mode` other than `'none'` is ignored together with `tensor-split` and `main-gpu`, and a warning is logged.
- `main_gpu` underscore variant is now accepted alongside `main-gpu`; providing both simultaneously throws `InvalidArgument`.
- `split_mode` underscore variant is accepted alongside `split-mode`; providing both simultaneously throws `InvalidArgument`.

## [0.14.0] - 2026-04-10

This release migrates the embed addon off `BaseInference` inheritance and the `WeightsProvider` download layer onto the composable `createJobHandler` + `exclusiveRunQueue` utilities from `@qvac/infer-base@^0.4.0`. The constructor signature is replaced with a single object whose `files.model` field is an ordered array of absolute paths, mirroring the parallel LLM and diffusion addon refactors. This is a breaking change — every caller must update.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,8 @@ struct DeviceDescription {
const BackendInterface& bckI)
: gpuDescription(bckI.ggml_backend_dev_description(dev)),
gpuBackend(bckI.ggml_backend_dev_name(dev)) {
std::transform(
gpuDescription.begin(),
gpuDescription.end(),
gpuDescription.begin(),
tolower);
std::transform(
gpuBackend.begin(), gpuBackend.end(), gpuBackend.begin(), tolower);
std::ranges::transform(gpuDescription, gpuDescription.begin(), tolower);
std::ranges::transform(gpuBackend, gpuBackend.begin(), tolower);
{
std::string backendTypeStr;
switch (backendTypeEnum) {
Expand Down Expand Up @@ -168,29 +163,36 @@ backend_selection::parseMainGpu(const std::string& mainGpuStr) {
} catch (const std::exception&) {
// Not an integer, try enum values
std::string lowerStr = mainGpuStr;
std::transform(lowerStr.begin(), lowerStr.end(), lowerStr.begin(), tolower);
std::ranges::transform(lowerStr, lowerStr.begin(), tolower);

if (lowerStr == "integrated") {
return MainGpu(MainGpuType::Integrated);
} else if (lowerStr == "dedicated") {
}
if (lowerStr == "dedicated") {
return MainGpu(MainGpuType::Dedicated);
} else {
throw qvac_errors::StatusError(
qvac_errors::general_error::InvalidArgument,
"main-gpu must be an integer device index, 'integrated', or "
"'dedicated'");
}
throw qvac_errors::StatusError(
qvac_errors::general_error::InvalidArgument,
"main-gpu must be an integer device index, 'integrated', or "
"'dedicated'");
}
}

std::optional<MainGpu> backend_selection::tryMainGpuFromMap(
std::unordered_map<std::string, std::string>& configFilemap) {
std::optional<MainGpu> mainGpu = std::nullopt;
if (auto mainGpuIt = configFilemap.find("main-gpu");
mainGpuIt != configFilemap.end()) {
mainGpu = parseMainGpu(mainGpuIt->second);
configFilemap.erase(mainGpuIt);
auto hIt = configFilemap.find("main-gpu");
auto uIt = configFilemap.find("main_gpu");
if (hIt != configFilemap.end() && uIt != configFilemap.end()) {
throw qvac_errors::StatusError(
qvac_errors::general_error::InvalidArgument,
"both 'main-gpu' and 'main_gpu' are present; use one or the other.");
}
auto foundIt = (hIt != configFilemap.end()) ? hIt : uIt;
if (foundIt == configFilemap.end()) {
return std::nullopt;
}
std::optional<MainGpu> mainGpu = parseMainGpu(foundIt->second);
configFilemap.erase(foundIt);
return mainGpu;
}

Expand Down Expand Up @@ -265,13 +267,30 @@ std::pair<BackendType, std::string> backend_selection::chooseBackend(
const BackendType preferredBackendType, llamaLogCallbackF llamaLogcallback,
const std::optional<MainGpu>& mainGpu) {
BackendInterface bckI{
ggml_backend_dev_count,
ggml_backend_dev_backend_reg,
ggml_backend_dev_get,
ggml_backend_reg_name,
ggml_backend_dev_description,
ggml_backend_dev_name,
ggml_backend_dev_type,
llamaLogcallback};
.ggml_backend_dev_count = ggml_backend_dev_count,
.ggml_backend_dev_backend_reg = ggml_backend_dev_backend_reg,
.ggml_backend_dev_get = ggml_backend_dev_get,
.ggml_backend_reg_name = ggml_backend_reg_name,
.ggml_backend_dev_description = ggml_backend_dev_description,
.ggml_backend_dev_name = ggml_backend_dev_name,
.ggml_backend_dev_type = ggml_backend_dev_type,
.llamaLogCallback = llamaLogcallback};
return backend_selection::chooseBackend(preferredBackendType, bckI, mainGpu);
}

size_t
backend_selection::getEffectiveGpuDeviceCount(const BackendInterface& bckI) {
size_t gpuCount = 0;
size_t igpuCount = 0;
const size_t totalDevices = bckI.ggml_backend_dev_count();
for (size_t i = 0; i < totalDevices; ++i) {
ggml_backend_dev_t dev = bckI.ggml_backend_dev_get(i);
enum ggml_backend_dev_type devType = bckI.ggml_backend_dev_type(dev);
if (devType == GGML_BACKEND_DEVICE_TYPE_GPU) {
++gpuCount;
} else if (devType == GGML_BACKEND_DEVICE_TYPE_IGPU) {
++igpuCount;
}
}
return gpuCount > 0 ? gpuCount : igpuCount;
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ using llamaLogCallbackF =
void (*)(ggml_log_level level, const char* text, void* userData);

struct BackendInterface {
size_t (*ggml_backend_dev_count)(void);
size_t (*ggml_backend_dev_count)();
ggml_backend_reg_t (*ggml_backend_dev_backend_reg)(ggml_backend_dev_t device);
ggml_backend_dev_t (*ggml_backend_dev_get)(size_t index);
const char* (*ggml_backend_reg_name)(ggml_backend_reg_t reg);
Expand All @@ -49,4 +49,10 @@ std::pair<BackendType, std::string> chooseBackend(
std::pair<BackendType, std::string> chooseBackend(
BackendType preferredBackendType, llamaLogCallbackF llamaLogcallback,
const std::optional<MainGpu>& mainGpu = std::nullopt);

/// @brief Count GPU devices available for multi-GPU split mode.
///
/// Returns the number of discrete GPUs when any are present; otherwise
/// falls back to the iGPU count. This mirrors backends like Vulkan which
/// exclude iGPUs by default when discrete GPUs exist.
///
/// @param bckI Backend function table used to enumerate the devices and to
///             query each device's type.
/// @return The discrete GPU count if it is non-zero, else the iGPU count
///         (0 when no GPU-type device is reported).
size_t getEffectiveGpuDeviceCount(const BackendInterface& bckI);
} // namespace backend_selection
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,8 @@ void batchDecode(
// not NONE
embd = llama_get_embeddings_seq(
ctx,
*batch.seq_id
[i]); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
// NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
*batch.seq_id[i]);
// NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
embeddingPos = *batch.seq_id[i];
if (embd == nullptr) {
Expand Down Expand Up @@ -248,6 +248,42 @@ std::size_t BertEmbeddings::size() const { return embeddingCount_; }
std::size_t BertEmbeddings::embeddingSize() const { return embeddingSize_; }

namespace {
/// @brief Reads the split mode from the config map and consumes its key.
///
/// Either the 'split-mode' or the 'split_mode' spelling is accepted. The value
/// is compared case-insensitively against 'none', 'layer' and 'row'.
///
/// @param configFilemap Config map; the matched key is erased once its value
///                      has been parsed successfully.
/// @return The parsed mode, or LLAMA_SPLIT_MODE_NONE when neither key is set.
/// @throws qvac_errors::StatusError (InvalidArgument) when both spellings are
///         present, or when the value is not one of the accepted modes. The
///         map is left unmodified in both cases.
llama_split_mode
parseSplitMode(std::unordered_map<std::string, std::string>& configFilemap) {
  const auto hyphenIt = configFilemap.find("split-mode");
  const auto underscoreIt = configFilemap.find("split_mode");
  const auto endIt = configFilemap.end();

  if (hyphenIt != endIt && underscoreIt != endIt) {
    throw qvac_errors::StatusError(
        qvac_errors::general_error::InvalidArgument,
        string_format(
            "%s: both 'split-mode' and 'split_mode' are present; "
            "use one or the other.\n",
            __func__));
  }

  const auto splitModeIt = (hyphenIt != endIt) ? hyphenIt : underscoreIt;
  if (splitModeIt == endIt) {
    return LLAMA_SPLIT_MODE_NONE;
  }

  std::string val = splitModeIt->second;
  // tolower() has undefined behaviour for negative arguments other than EOF,
  // so every byte is passed through unsigned char first; this keeps
  // non-ASCII bytes in a user-supplied value from triggering UB.
  std::ranges::transform(val, val.begin(), [](unsigned char chr) {
    return static_cast<char>(::tolower(chr));
  });

  llama_split_mode splitMode = LLAMA_SPLIT_MODE_NONE;
  if (val == "layer") {
    splitMode = LLAMA_SPLIT_MODE_LAYER;
  } else if (val == "row") {
    splitMode = LLAMA_SPLIT_MODE_ROW;
  } else if (val != "none") {
    // Report the value exactly as the caller wrote it, not the lowered copy.
    throw qvac_errors::StatusError(
        qvac_errors::general_error::InvalidArgument,
        string_format(
            "%s: invalid split-mode '%s', must be 'none', 'layer', or "
            "'row'.\n",
            __func__,
            splitModeIt->second.c_str()));
  }

  configFilemap.erase(splitModeIt);
  return splitMode;
}

common_params setupParams(
const std::string& modelGgufPath,
std::unordered_map<std::string, std::string> configFilemap,
Expand All @@ -262,6 +298,8 @@ common_params setupParams(
configVector.emplace_back("--model");
configVector.emplace_back(modelGgufPath);

llama_split_mode splitMode = parseSplitMode(configFilemap);

auto deviceIt = configFilemap.find("device");
if (deviceIt == configFilemap.end()) {
std::string errorMsg =
Expand All @@ -281,20 +319,48 @@ common_params setupParams(
const std::pair<BackendType, std::string> chosenBackend =
chooseBackend(preferredBackend, llamaLogCallback, mainGpu);

if (chosenBackend.first != BackendType::GPU &&
chosenBackend.first != BackendType::CPU) {
if (chosenBackend.first == BackendType::GPU) {
resolvedBackendDevice = 1;
params.split_mode = splitMode;

if (splitMode != LLAMA_SPLIT_MODE_NONE && mainGpu.has_value()) {
if (std::holds_alternative<int>(mainGpu.value())) {
configFilemap["main-gpu"] =
std::to_string(std::get<int>(mainGpu.value()));
} else {
qvac_lib_infer_llamacpp_embed::logging::llamaLogCallback(
GGML_LOG_LEVEL_WARN,
"[BertModel] main-gpu 'dedicated'/'integrated' ignored in "
"multi-GPU split-mode; use an integer device index instead\n",
nullptr);
}
}
} else if (chosenBackend.first == BackendType::CPU) {
resolvedBackendDevice = 0;
params.split_mode = LLAMA_SPLIT_MODE_NONE;
params.main_gpu = -1;
if (splitMode != LLAMA_SPLIT_MODE_NONE) {
qvac_lib_infer_llamacpp_embed::logging::llamaLogCallback(
GGML_LOG_LEVEL_WARN,
"[BertModel] split-mode, tensor-split and main-gpu ignored: "
"no GPU backend available, falling back to CPU\n",
nullptr);
splitMode = LLAMA_SPLIT_MODE_NONE;
configFilemap.erase("tensor-split");
}
} else {
throw qvac_errors::StatusError(
qvac_errors::general_error::InternalError,
"preferredDeviceFromString: wrong deduced device, must be 'gpu' or "
"'cpu'.\n");
}
if (chosenBackend.first == BackendType::GPU) {
resolvedBackendDevice = 1;
} else {
resolvedBackendDevice = 0;
// In multi-GPU split mode we intentionally omit --device so llama.cpp
// distributes layers/rows across all available GPUs rather than pinning
// to the single backend that chooseBackend selected.
if (splitMode == LLAMA_SPLIT_MODE_NONE) {
configVector.emplace_back("--device");
configVector.emplace_back(chosenBackend.second);
}
configVector.emplace_back("--device");
configVector.emplace_back(chosenBackend.second);
configFilemap.erase(deviceIt);
}

Expand Down Expand Up @@ -374,9 +440,10 @@ void BertModel::init(
setVerbosityLevel(configCopy);

std::string openclCacheDir;
if (auto it = configCopy.find("openclCacheDir"); it != configCopy.end()) {
openclCacheDir = it->second;
configCopy.erase(it);
if (auto configIt = configCopy.find("openclCacheDir");
configIt != configCopy.end()) {
openclCacheDir = configIt->second;
configCopy.erase(configIt);
}

lazyCommonInit();
Expand Down Expand Up @@ -490,6 +557,8 @@ BertModel::preprocessPrompt(const std::string& prompt) const {
return splitLines(prompt, init_.params.embd_sep);
}

/// Read-only access to the common_params held in this model's init state.
const common_params& BertModel::getCommonParams() const {
  return init_.params;
}

/// Reports whether the model is usable: the loaded flag must be set and both
/// the model and context handles must be non-null.
bool BertModel::isLoaded() const {
  if (!is_loaded_) {
    return false;
  }
  return model_ != nullptr && ctx_ != nullptr;
}
Expand Down Expand Up @@ -623,7 +692,8 @@ BertEmbeddings BertModel::processBatched(
return BertEmbeddings(
std::move(embeddings),
BertEmbeddings::Layout{
numStoredEmbeddings, static_cast<std::size_t>(n_embd)});
.embeddingCount = numStoredEmbeddings,
.embeddingSize = static_cast<std::size_t>(n_embd)});
};

for (std::size_t k = 0; k < nPrompts && !stopCancelled_.load(); k++) {
Expand Down Expand Up @@ -676,7 +746,9 @@ BertEmbeddings BertModel::processBatched(
init_.params.embd_normalize);
return BertEmbeddings(
std::move(embeddings),
BertEmbeddings::Layout{embeddingCount, static_cast<std::size_t>(n_embd)});
BertEmbeddings::Layout{
.embeddingCount = embeddingCount,
.embeddingSize = static_cast<std::size_t>(n_embd)});
}

BertEmbeddings
Expand All @@ -701,7 +773,9 @@ BertEmbeddings BertModel::encodeHostF32Sequences(
if (sequenceArray.empty()) {
return BertEmbeddings(
std::vector<float>{},
BertEmbeddings::Layout{0, static_cast<std::size_t>(n_embd)});
BertEmbeddings::Layout{
.embeddingCount = 0,
.embeddingSize = static_cast<std::size_t>(n_embd)});
}

// Tokenize all sequences once and validate context size
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,8 @@ class BertModel : public qvac_lib_inference_addon_cpp::model::IModel,

bool isLoaded() const;

const common_params& getCommonParams() const;

void setWeightsForFile(
const std::string& filename,
std::unique_ptr<std::basic_streambuf<char>>&& shard) final;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
namespace qvac_lib_infer_llamacpp_embed::logging {

// Global verbosity level - same for all instances
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
extern qvac_lib_inference_addon_cpp::logger::Priority g_verbosityLevel;

// Parse verbosity from config map and set global level
Expand All @@ -23,6 +24,7 @@ void llamaLogCallback(ggml_log_level level, const char* text, void* userData);
//
// Simple logging macro that uses global verbosity level
// Usage: QLOG_IF(Priority::DEBUG, "Debug message");
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define QLOG_IF(priority, message) \
do { \
if (static_cast<int>(priority) <= \
Expand Down
Loading
Loading