tetherto · gianni-cor · May 2, 2026 · Apr 30, 2026 · Apr 30, 2026 · Apr 30, 2026
@@ -1,5 +1,17 @@
 # Changelog
 
+## [0.15.0] - 2026-04-30
+
+### Added
+
+#### Multi-GPU pipeline and tensor parallelism via `split-mode` config
+
+- New `split-mode` (`'none'` | `'layer'` | `'row'`) and `tensor-split` config options enable distributing an embedding model across multiple GPUs via pipeline or tensor parallelism.
+- When `split-mode` is `'layer'` or `'row'` and a GPU backend is available, the `--device` flag is omitted so llama.cpp distributes layers/rows across all available GPUs rather than pinning to a single device.
+- When no GPU backend is available the addon falls back to CPU and ignores `split-mode`, `tensor-split`, and `main-gpu`; a warning is logged if a `split-mode` other than `'none'` was requested.
+- `main_gpu` underscore variant is now accepted alongside `main-gpu`; providing both simultaneously throws `InvalidArgument`.
+- `split_mode` underscore variant is accepted alongside `split-mode`; providing both simultaneously throws `InvalidArgument`.
+
 ## [0.14.0] - 2026-04-10
 
 This release migrates the embed addon off `BaseInference` inheritance and the `WeightsProvider` download layer onto the composable `createJobHandler` + `exclusiveRunQueue` utilities from `@qvac/infer-base@^0.4.0`. The constructor signature is replaced with a single object whose `files.model` field is an ordered array of absolute paths, mirroring the parallel LLM and diffusion addon refactors. This is a breaking change — every caller must update.

@@ -23,13 +23,8 @@ struct DeviceDescription {
       const BackendInterface& bckI)
       : gpuDescription(bckI.ggml_backend_dev_description(dev)),
         gpuBackend(bckI.ggml_backend_dev_name(dev)) {
-    std::transform(
-        gpuDescription.begin(),
-        gpuDescription.end(),
-        gpuDescription.begin(),
-        tolower);
-    std::transform(
-        gpuBackend.begin(), gpuBackend.end(), gpuBackend.begin(), tolower);
+    std::ranges::transform(gpuDescription, gpuDescription.begin(), tolower);
+    std::ranges::transform(gpuBackend, gpuBackend.begin(), tolower);
     {
       std::string backendTypeStr;
       switch (backendTypeEnum) {
@@ -168,29 +163,36 @@ backend_selection::parseMainGpu(const std::string& mainGpuStr) {
   } catch (const std::exception&) {
     // Not an integer, try enum values
     std::string lowerStr = mainGpuStr;
-    std::transform(lowerStr.begin(), lowerStr.end(), lowerStr.begin(), tolower);
+    std::ranges::transform(lowerStr, lowerStr.begin(), tolower);
 
     if (lowerStr == "integrated") {
       return MainGpu(MainGpuType::Integrated);
-    } else if (lowerStr == "dedicated") {
+    }
+    if (lowerStr == "dedicated") {
       return MainGpu(MainGpuType::Dedicated);
-    } else {
-      throw qvac_errors::StatusError(
-          qvac_errors::general_error::InvalidArgument,
-          "main-gpu must be an integer device index, 'integrated', or "
-          "'dedicated'");
     }
+    throw qvac_errors::StatusError(
+        qvac_errors::general_error::InvalidArgument,
+        "main-gpu must be an integer device index, 'integrated', or "
+        "'dedicated'");
   }
 }
 
 std::optional<MainGpu> backend_selection::tryMainGpuFromMap(
     std::unordered_map<std::string, std::string>& configFilemap) {
-  std::optional<MainGpu> mainGpu = std::nullopt;
-  if (auto mainGpuIt = configFilemap.find("main-gpu");
-      mainGpuIt != configFilemap.end()) {
-    mainGpu = parseMainGpu(mainGpuIt->second);
-    configFilemap.erase(mainGpuIt);
+  auto hIt = configFilemap.find("main-gpu");
+  auto uIt = configFilemap.find("main_gpu");
+  if (hIt != configFilemap.end() && uIt != configFilemap.end()) {
+    throw qvac_errors::StatusError(
+        qvac_errors::general_error::InvalidArgument,
+        "both 'main-gpu' and 'main_gpu' are present; use one or the other.");
+  }
+  auto foundIt = (hIt != configFilemap.end()) ? hIt : uIt;
+  if (foundIt == configFilemap.end()) {
+    return std::nullopt;
   }
+  std::optional<MainGpu> mainGpu = parseMainGpu(foundIt->second);
+  configFilemap.erase(foundIt);
   return mainGpu;
 }
 
@@ -265,13 +267,30 @@ std::pair<BackendType, std::string> backend_selection::chooseBackend(
     const BackendType preferredBackendType, llamaLogCallbackF llamaLogcallback,
     const std::optional<MainGpu>& mainGpu) {
   BackendInterface bckI{
-      ggml_backend_dev_count,
-      ggml_backend_dev_backend_reg,
-      ggml_backend_dev_get,
-      ggml_backend_reg_name,
-      ggml_backend_dev_description,
-      ggml_backend_dev_name,
-      ggml_backend_dev_type,
-      llamaLogcallback};
+      .ggml_backend_dev_count = ggml_backend_dev_count,
+      .ggml_backend_dev_backend_reg = ggml_backend_dev_backend_reg,
+      .ggml_backend_dev_get = ggml_backend_dev_get,
+      .ggml_backend_reg_name = ggml_backend_reg_name,
+      .ggml_backend_dev_description = ggml_backend_dev_description,
+      .ggml_backend_dev_name = ggml_backend_dev_name,
+      .ggml_backend_dev_type = ggml_backend_dev_type,
+      .llamaLogCallback = llamaLogcallback};
   return backend_selection::chooseBackend(preferredBackendType, bckI, mainGpu);
 }
+
+// Tallies the devices reported through `bckI` by type and returns the
+// discrete-GPU tally when it is non-zero, otherwise the iGPU tally.
+// Devices of any other type are not counted.
+size_t
+backend_selection::getEffectiveGpuDeviceCount(const BackendInterface& bckI) {
+  size_t discreteGpus = 0;
+  size_t integratedGpus = 0;
+  const size_t deviceTotal = bckI.ggml_backend_dev_count();
+  for (size_t idx = 0; idx < deviceTotal; ++idx) {
+    switch (bckI.ggml_backend_dev_type(bckI.ggml_backend_dev_get(idx))) {
+    case GGML_BACKEND_DEVICE_TYPE_GPU:
+      ++discreteGpus;
+      break;
+    case GGML_BACKEND_DEVICE_TYPE_IGPU:
+      ++integratedGpus;
+      break;
+    default:
+      break;
+    }
+  }
+  if (discreteGpus != 0) {
+    return discreteGpus;
+  }
+  return integratedGpus;
+}
@@ -28,7 +28,7 @@ using llamaLogCallbackF =
     void (*)(ggml_log_level level, const char* text, void* userData);
 
 struct BackendInterface {
-  size_t (*ggml_backend_dev_count)(void);
+  size_t (*ggml_backend_dev_count)();
   ggml_backend_reg_t (*ggml_backend_dev_backend_reg)(ggml_backend_dev_t device);
   ggml_backend_dev_t (*ggml_backend_dev_get)(size_t index);
   const char* (*ggml_backend_reg_name)(ggml_backend_reg_t reg);
@@ -49,4 +49,10 @@ std::pair<BackendType, std::string> chooseBackend(
 std::pair<BackendType, std::string> chooseBackend(
     BackendType preferredBackendType, llamaLogCallbackF llamaLogcallback,
     const std::optional<MainGpu>& mainGpu = std::nullopt);
+
+/// @brief Count GPU devices available for multi-GPU split mode.
+/// Returns the number of discrete GPUs when any are present; otherwise
+/// falls back to the iGPU count. This mirrors backends like Vulkan which
+/// exclude iGPUs by default when discrete GPUs exist.
+size_t getEffectiveGpuDeviceCount(const BackendInterface& bckI);
 } // namespace backend_selection
@@ -85,8 +85,8 @@ void batchDecode(
       // not NONE
       embd = llama_get_embeddings_seq(
           ctx,
-          *batch.seq_id
-               [i]); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
+          // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
+          *batch.seq_id[i]);
       // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
       embeddingPos = *batch.seq_id[i];
       if (embd == nullptr) {
@@ -248,6 +248,42 @@ std::size_t BertEmbeddings::size() const { return embeddingCount_; }
 std::size_t BertEmbeddings::embeddingSize() const { return embeddingSize_; }
 
 namespace {
+/// @brief Extract and parse the split mode from the config map.
+///
+/// Accepts either the `split-mode` or the `split_mode` key. The value is
+/// compared case-insensitively against "none", "layer" and "row". On success
+/// the consumed key is erased from the map.
+///
+/// @param configFilemap Config key/value map; the matched key is removed.
+/// @return The parsed mode, or LLAMA_SPLIT_MODE_NONE when neither key is set.
+/// @throws qvac_errors::StatusError (InvalidArgument) when both key spellings
+///         are present or when the value is not a recognised mode.
+llama_split_mode
+parseSplitMode(std::unordered_map<std::string, std::string>& configFilemap) {
+  auto hIt = configFilemap.find("split-mode");
+  auto uIt = configFilemap.find("split_mode");
+  if (hIt != configFilemap.end() && uIt != configFilemap.end()) {
+    throw qvac_errors::StatusError(
+        qvac_errors::general_error::InvalidArgument,
+        string_format(
+            "%s: both 'split-mode' and 'split_mode' are present; "
+            "use one or the other.\n",
+            __func__));
+  }
+  auto splitModeIt = (hIt != configFilemap.end()) ? hIt : uIt;
+  if (splitModeIt == configFilemap.end()) {
+    return LLAMA_SPLIT_MODE_NONE;
+  }
+  // Lower-case a copy so the original spelling can still be reported in the
+  // error message. The character is routed through unsigned char because
+  // passing a negative char value to tolower is undefined behaviour.
+  std::string val = splitModeIt->second;
+  std::ranges::transform(val, val.begin(), [](unsigned char chr) {
+    return static_cast<char>(::tolower(chr));
+  });
+  llama_split_mode splitMode = LLAMA_SPLIT_MODE_NONE;
+  if (val == "layer") {
+    splitMode = LLAMA_SPLIT_MODE_LAYER;
+  } else if (val == "row") {
+    splitMode = LLAMA_SPLIT_MODE_ROW;
+  } else if (val != "none") {
+    throw qvac_errors::StatusError(
+        qvac_errors::general_error::InvalidArgument,
+        string_format(
+            "%s: invalid split-mode '%s', must be 'none', 'layer', or "
+            "'row'.\n",
+            __func__,
+            splitModeIt->second.c_str()));
+  }
+  configFilemap.erase(splitModeIt);
+  return splitMode;
+}
+
 common_params setupParams(
     const std::string& modelGgufPath,
     std::unordered_map<std::string, std::string> configFilemap,
@@ -262,6 +298,8 @@ common_params setupParams(
   configVector.emplace_back("--model");
   configVector.emplace_back(modelGgufPath);
 
+  llama_split_mode splitMode = parseSplitMode(configFilemap);
+
   auto deviceIt = configFilemap.find("device");
   if (deviceIt == configFilemap.end()) {
     std::string errorMsg =
@@ -281,20 +319,48 @@ common_params setupParams(
     const std::pair<BackendType, std::string> chosenBackend =
         chooseBackend(preferredBackend, llamaLogCallback, mainGpu);
 
-    if (chosenBackend.first != BackendType::GPU &&
-        chosenBackend.first != BackendType::CPU) {
+    if (chosenBackend.first == BackendType::GPU) {
+      resolvedBackendDevice = 1;
+      params.split_mode = splitMode;
+
+      if (splitMode != LLAMA_SPLIT_MODE_NONE && mainGpu.has_value()) {
+        if (std::holds_alternative<int>(mainGpu.value())) {
+          configFilemap["main-gpu"] =
+              std::to_string(std::get<int>(mainGpu.value()));
+        } else {
+          qvac_lib_infer_llamacpp_embed::logging::llamaLogCallback(
+              GGML_LOG_LEVEL_WARN,
+              "[BertModel] main-gpu 'dedicated'/'integrated' ignored in "
+              "multi-GPU split-mode; use an integer device index instead\n",
+              nullptr);
+        }
+      }
+    } else if (chosenBackend.first == BackendType::CPU) {
+      resolvedBackendDevice = 0;
+      params.split_mode = LLAMA_SPLIT_MODE_NONE;
+      params.main_gpu = -1;
+      if (splitMode != LLAMA_SPLIT_MODE_NONE) {
+        qvac_lib_infer_llamacpp_embed::logging::llamaLogCallback(
+            GGML_LOG_LEVEL_WARN,
+            "[BertModel] split-mode, tensor-split and main-gpu ignored: "
+            "no GPU backend available, falling back to CPU\n",
+            nullptr);
+        splitMode = LLAMA_SPLIT_MODE_NONE;
+        configFilemap.erase("tensor-split");
+      }
+    } else {
       throw qvac_errors::StatusError(
           qvac_errors::general_error::InternalError,
           "preferredDeviceFromString: wrong deduced device, must be 'gpu' or "
           "'cpu'.\n");
     }
-    if (chosenBackend.first == BackendType::GPU) {
-      resolvedBackendDevice = 1;
-    } else {
-      resolvedBackendDevice = 0;
+    // In multi-GPU split mode we intentionally omit --device so llama.cpp
+    // distributes layers/rows across all available GPUs rather than pinning
+    // to the single backend that chooseBackend selected.
+    if (splitMode == LLAMA_SPLIT_MODE_NONE) {
+      configVector.emplace_back("--device");
+      configVector.emplace_back(chosenBackend.second);
     }
-    configVector.emplace_back("--device");
-    configVector.emplace_back(chosenBackend.second);
     configFilemap.erase(deviceIt);
   }
 
@@ -374,9 +440,10 @@ void BertModel::init(
   setVerbosityLevel(configCopy);
 
   std::string openclCacheDir;
-  if (auto it = configCopy.find("openclCacheDir"); it != configCopy.end()) {
-    openclCacheDir = it->second;
-    configCopy.erase(it);
+  if (auto configIt = configCopy.find("openclCacheDir");
+      configIt != configCopy.end()) {
+    openclCacheDir = configIt->second;
+    configCopy.erase(configIt);
   }
 
   lazyCommonInit();
@@ -490,6 +557,8 @@ BertModel::preprocessPrompt(const std::string& prompt) const {
   return splitLines(prompt, init_.params.embd_sep);
 }
 
+// Read-only accessor for the common_params stored in init_.
+const common_params& BertModel::getCommonParams() const {
+  return init_.params;
+}
+
 bool BertModel::isLoaded() const {
   return is_loaded_ && model_ != nullptr && ctx_ != nullptr;
 }
@@ -623,7 +692,8 @@ BertEmbeddings BertModel::processBatched(
     return BertEmbeddings(
         std::move(embeddings),
         BertEmbeddings::Layout{
-            numStoredEmbeddings, static_cast<std::size_t>(n_embd)});
+            .embeddingCount = numStoredEmbeddings,
+            .embeddingSize = static_cast<std::size_t>(n_embd)});
   };
 
   for (std::size_t k = 0; k < nPrompts && !stopCancelled_.load(); k++) {
@@ -676,7 +746,9 @@ BertEmbeddings BertModel::processBatched(
       init_.params.embd_normalize);
   return BertEmbeddings(
       std::move(embeddings),
-      BertEmbeddings::Layout{embeddingCount, static_cast<std::size_t>(n_embd)});
+      BertEmbeddings::Layout{
+          .embeddingCount = embeddingCount,
+          .embeddingSize = static_cast<std::size_t>(n_embd)});
 }
 
 BertEmbeddings
@@ -701,7 +773,9 @@ BertEmbeddings BertModel::encodeHostF32Sequences(
   if (sequenceArray.empty()) {
     return BertEmbeddings(
         std::vector<float>{},
-        BertEmbeddings::Layout{0, static_cast<std::size_t>(n_embd)});
+        BertEmbeddings::Layout{
+            .embeddingCount = 0,
+            .embeddingSize = static_cast<std::size_t>(n_embd)});
   }
 
   // Tokenize all sequences once and validate context size

@@ -182,6 +182,8 @@ class BertModel : public qvac_lib_inference_addon_cpp::model::IModel,
 
   bool isLoaded() const;
 
+  const common_params& getCommonParams() const;
+
   void setWeightsForFile(
       const std::string& filename,
       std::unique_ptr<std::basic_streambuf<char>>&& shard) final;

@@ -10,6 +10,7 @@
 namespace qvac_lib_infer_llamacpp_embed::logging {
 
 // Global verbosity level - same for all instances
+// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
 extern qvac_lib_inference_addon_cpp::logger::Priority g_verbosityLevel;
 
 // Parse verbosity from config map and set global level
@@ -23,6 +24,7 @@ void llamaLogCallback(ggml_log_level level, const char* text, void* userData);
 //
 // Simple logging macro that uses global verbosity level
 // Usage: QLOG_IF(Priority::DEBUG, "Debug message");
+// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
 #define QLOG_IF(priority, message)                                             \
   do {                                                                         \
     if (static_cast<int>(priority) <=                                          \