Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
62 commits
Select commit Hold shift + click to select a range
bb2d249
(improvement) llamacpp-llm: Qwen3 dynamic tools template
mialso Mar 4, 2026
b9ed672
(improvement) llamacpp-llm: add llm config tools flag
mialso Mar 4, 2026
48cedbd
(improvement) llamacpp-llm: use template based on tools param
mialso Mar 4, 2026
3bf572d
(improvement) llamacpp-llm: count tools token offset with tokenizer
mialso Mar 4, 2026
d63ad92
(improvement) llamacpp-llm: track n-past, run Qwen3 tests, fix reset
mialso Mar 4, 2026
9a75956
(improvement) llamacpp-llm: save cache with respect to tools flag
mialso Mar 5, 2026
83f2d8b
(fix) llamacpp-llm: add Qwen3ToolsDynamicTemplate.cpp to production C…
DmitryMalishev Mar 9, 2026
9f97519
chore: retrigger CI for CMakeLists fix
DmitryMalishev Mar 9, 2026
0f23162
(fix) llamacpp-llm: fix use-after-free SIGSEGV on process exit (linux)
DmitryMalishev Mar 10, 2026
f3adb55
Revert "(fix) llamacpp-llm: fix use-after-free SIGSEGV on process exi…
DmitryMalishev Mar 10, 2026
c1eab4a
(fix) llamacpp-llm: robust threadpool teardown to prevent SIGSEGV on …
DmitryMalishev Mar 10, 2026
7bd6fc2
Revert "(fix) llamacpp-llm: robust threadpool teardown to prevent SIG…
DmitryMalishev Mar 10, 2026
b5fe234
fix(llm): reset stale state before non-cached run after prefill
DmitryMalishev Mar 11, 2026
3cba2d2
fix(llm): trim stale tool tokens in multi-turn sessions with tools_at…
DmitryMalishev Mar 11, 2026
d305c92
(fix) llamacpp-llm: dynamic tools cache trim, tmp template, debugs
mialso Mar 12, 2026
2d74bf9
fix(llm): pass toolsAtEnd flag to context constructors to fix templat…
olyasir Mar 12, 2026
22e4c89
feat(llm): strip tool_call/think blocks from re-sent assistant responses
olyasir Mar 12, 2026
b6dae3a
(fix) llamacpp-llm: use correct template in tests
mialso Mar 12, 2026
31b2069
(chore) llamacpp-llm: move qwen3 cache tests to own file
mialso Mar 12, 2026
e2b660b
(improvement) llamacpp-llm: simplify nPastBeforeTools reset, multi-tu…
mialso Mar 12, 2026
47292a5
(improvement) llamacpp-llm: simply nPastBeforeTools tracking, no trim…
mialso Mar 13, 2026
aedadda
(chore) llamacpp-llm: remove redundant getters and cleanup
mialso Mar 13, 2026
f13b1aa
(internal) llamacpp-llm: run Qwen3 context tests
mialso Mar 13, 2026
c1e85c2
(chore) cleanup
mialso Mar 13, 2026
f2fe2a5
(chore) fix lint errors in examples
olyasir Mar 13, 2026
63b31e2
(chore) fix remaining lint errors in benchToolsPlacement
olyasir Mar 13, 2026
9384335
(chore) fix indentation in benchToolsPlacement ternary
olyasir Mar 13, 2026
52d6706
Merge remote-tracking branch 'origin/main' into feature/llm-dynamic-t…
mialso Mar 13, 2026
4bab03b
Merge branch 'main' into feature/llm-dynamic-tools
olyasir Mar 13, 2026
04cb86f
(chore) llamacpp-llm: remove unused example files
olyasir Mar 13, 2026
05674f0
(chore) remove scratch planning docs
olyasir Mar 13, 2026
71c3f19
(doc) llamacpp-llm: tools_at_end param description
mialso Mar 13, 2026
00a72f6
Merge branch 'main' into feature/llm-dynamic-tools
olyasir Mar 16, 2026
c52e076
(chore) llamacpp-llm: changelog and version bump
mialso Mar 16, 2026
cc18a15
refactor(llamacpp-llm): address PR #706 review comments
DmitryMalishev Mar 17, 2026
44da74e
refactor(llamacpp-llm): remove toolsAtEnd_ from ReloadableState, sing…
DmitryMalishev Mar 17, 2026
4dbb387
Merge branch 'main' into feature/llm-dynamic-tools
DmitryMalishev Mar 17, 2026
4161c77
fix(llamacpp-llm): use dts.reset() after post-eval trim for full stat…
DmitryMalishev Mar 17, 2026
27e6a5c
(draft) llamacpp-llm: dynamic tools cache tokens test debug
mialso Mar 18, 2026
181b98a
(internal) llamacpp-llm: dynamic tools token count and cache match test
mialso Mar 18, 2026
a03ad49
Revert "(internal) llamacpp-llm: dynamic tools token count and cache …
mialso Mar 18, 2026
a58893b
Revert "(draft) llamacpp-llm: dynamic tools cache tokens test debug"
mialso Mar 18, 2026
047debf
Merge branch 'main' into feature/llm-dynamic-tools
olyasir Mar 19, 2026
661cbb1
Merge branch 'main' into feature/llm-dynamic-tools
DmitryMalishev Mar 19, 2026
afea85e
fix(llamacpp-llm): address PR review comments N3-N8, merge main
DmitryMalishev Mar 19, 2026
a4086e8
style(llamacpp-llm): apply clang-format to all PR-touched C++ files
DmitryMalishev Mar 19, 2026
11b186b
style(llamacpp-llm): fix remaining clang-format-19 brace-init formatting
DmitryMalishev Mar 19, 2026
1bb6556
chore: remove accidentally committed binary file
DmitryMalishev Mar 19, 2026
02a327a
chore(llm): bump version to 0.14.0
DmitryMalishev Mar 19, 2026
7d33988
chore: remove working artifacts from feature branch
DmitryMalishev Mar 19, 2026
2ddac41
chore: remove accidentally committed sdk model history file
DmitryMalishev Mar 19, 2026
79dab19
doc: add dynamic-tools examples to README
DmitryMalishev Mar 19, 2026
22603f9
fix(llm): reset use_jinja from params_ instead of save/restore
DmitryMalishev Mar 19, 2026
b9a54ec
fix(llm): reset use_jinja before second getPrompt call
DmitryMalishev Mar 19, 2026
306f401
Merge branch 'main' into feature/llm-dynamic-tools
DmitryMalishev Mar 19, 2026
11991ae
Merge branch 'main' into feature/llm-dynamic-tools
olyasir Mar 20, 2026
7057252
Merge branch 'main' into feature/llm-dynamic-tools
olyasir Mar 20, 2026
615f7e1
Merge branch 'main' into feature/llm-dynamic-tools
gianni-cor Mar 20, 2026
75bbf05
Merge branch 'main' into feature/llm-dynamic-tools
gianni-cor Mar 20, 2026
e7d6d7b
Merge branch 'main' into feature/llm-dynamic-tools
olyasir Mar 20, 2026
cd24afa
Merge branch 'main' into feature/llm-dynamic-tools
olyasir Mar 20, 2026
804ca06
Merge branch 'main' into feature/llm-dynamic-tools
gianni-cor Mar 21, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions packages/qvac-lib-infer-llamacpp-llm/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,17 @@
# Changelog

## [0.14.0] - 2026-03-19

### Added

#### `tools_at_end` configuration for dynamic tool management in multi-turn conversations

New `tools_at_end` configuration option (`"true"` or `"false"`, default: `"false"`) places tool definitions at the end of the prompt (after the conversation history) instead of in the system prompt. This enables KV cache optimization for multi-turn conversations whose tool set changes between turns. Currently only Qwen3 models are supported; for any other model the option is ignored and a warning is logged.

- **KV cache trimming**: After each turn, tools are automatically removed from the KV cache, preventing stale tool definitions from accumulating
- **Conversation history reuse**: History tokens are preserved in cache, saving recomputation on long conversations
- **Dynamic tool replacement**: Different tool sets can be used per turn without cache bloat from unused tools

## [0.13.0] - 2026-03-18

### Added
Expand Down
2 changes: 2 additions & 0 deletions packages/qvac-lib-infer-llamacpp-llm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ endif()
${PROJECT_SOURCE_DIR}/addon/src/utils/BackendSelection.cpp
${PROJECT_SOURCE_DIR}/addon/src/utils/ChatTemplateUtils.cpp
${PROJECT_SOURCE_DIR}/addon/src/utils/Qwen3ReasoningUtils.cpp
${PROJECT_SOURCE_DIR}/addon/src/utils/Qwen3ToolsDynamicTemplate.cpp
${PROJECT_SOURCE_DIR}/addon/src/utils/QwenTemplate.cpp
)

Expand Down Expand Up @@ -118,6 +119,7 @@ if(BUILD_CLI)
${PROJECT_SOURCE_DIR}/addon/src/utils/BackendSelection.cpp
${PROJECT_SOURCE_DIR}/addon/src/utils/ChatTemplateUtils.cpp
${PROJECT_SOURCE_DIR}/addon/src/utils/Qwen3ReasoningUtils.cpp
${PROJECT_SOURCE_DIR}/addon/src/utils/Qwen3ToolsDynamicTemplate.cpp
${PROJECT_SOURCE_DIR}/addon/src/utils/QwenTemplate.cpp
)

Expand Down
3 changes: 3 additions & 0 deletions packages/qvac-lib-infer-llamacpp-llm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ const config = {
| presence_penalty | float | 0 | Presence penalty for sampling |
| frequency_penalty | float | 0 | Frequency penalty for sampling |
| tools | `"true"` or `"false"` | `"false"` | Enable tool calling with jinja templating |
| tools_at_end | `"true"` or `"false"` | `"false"` | Place tools at end of prompt ([details](./docs/tools-at-end.md)) |
| verbosity | 0 – 3 (0=ERROR, 1=WARNING, 2=INFO, 3=DEBUG) | 0 | Logging verbosity level |
| n_discarded | integer | 0 | Tokens to discard in sliding window context |
| main-gpu | integer, `"integrated"`, or `"dedicated"` | — | GPU selection for multi-GPU systems |
Expand Down Expand Up @@ -287,6 +288,8 @@ npm run quickstart
- [LoRA Finetuning](./examples/finetune/simple-lora-finetune.js) – Basic LoRA finetuning.
- [LoRA Finetuning Pause/Resume](./examples/finetune/simple-lora-finetune-pause-resume.js) – Pause and resume finetuning.
- [LoRA Inference](./examples/simple-lora-inference.js) – Inference with a finetuned LoRA adapter.
- [Bench Tools Placement](./examples/benchToolsPlacement.js) – Benchmarks standard vs `tools_at_end` placement across multi-turn conversations.
- [Test Tool Removal](./examples/testToolRemoval.js) – Demonstrates dynamic tool addition and removal between turns.

## OCR with Vision-Language Models

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,9 @@ void LlamaModel::init(bool acquireLock) {

common_params params;
std::optional<int> adrenoVersion;
commonParamsParse(modelPath, configFilemap, params, adrenoVersion);
bool toolsAtEnd = false;
commonParamsParse(
modelPath, configFilemap, params, adrenoVersion, toolsAtEnd);

const std::string errorWhenFailed = toString(UnableToLoadModel);
auto streamedFiles =
Expand All @@ -334,7 +336,8 @@ void LlamaModel::init(bool acquireLock) {
snap->llmContext_ = createContext(
std::string(constructionArgs_.projectionPath),
params,
std::move(llamaInit));
std::move(llamaInit),
toolsAtEnd);

if (snap->configuredNDiscarded_ > 0 && snap->llmContext_) {
snap->llmContext_->setNDiscarded(snap->configuredNDiscarded_);
Expand All @@ -360,6 +363,14 @@ bool LlamaModel::isLoaded() {
return static_cast<bool>(state_->llmContext_);
}

// Reports the tool boundary recorded by the active context's
// DynamicToolsState. Takes a shared lock on stateMtx_ for the duration of the
// read. Returns -1 when no context is currently loaded.
llama_pos LlamaModel::getNPastBeforeTools() const {
  std::shared_lock guard(stateMtx_);
  if (!state_->llmContext_) {
    return -1;
  }
  return state_->llmContext_->dynamicToolsState().nPastBeforeTools();
}

llama_context* LlamaModel::getContext() {
if (!state_->llmContext_) {
return nullptr;
Expand Down Expand Up @@ -504,6 +515,11 @@ std::string LlamaModel::processPromptImpl(const Prompt& prompt) {
std::string out;
ResolvedPrompt resolved = resolveChatAndTools(prompt.input);

if (resolved.shouldResetAfterInference &&
state_->llmContext_->getNPast() > 0) {
resetState(true);
}

if (resolved.chatMsgs.empty() && resolved.tools.empty()) {
QLOG_IF(
Priority::INFO,
Expand Down Expand Up @@ -552,6 +568,18 @@ std::string LlamaModel::processPromptImpl(const Prompt& prompt) {
if (!prompt.outputCallback) {
out = oss.str();
}
auto& dts = state_->llmContext_->dynamicToolsState();
if (dts.toolsAtEnd() && !resolved.tools.empty() &&
dts.nPastBeforeTools() > 0 &&
state_->llmContext_->getNPast() > dts.nPastBeforeTools()) {
state_->llmContext_->removeLastNTokens(
state_->llmContext_->getNPast() - dts.nPastBeforeTools());
dts.reset();
if (state_->llmContext_->getFirstMsgTokens() >
state_->llmContext_->getNPast()) {
state_->llmContext_->setFirstMsgTokens(state_->llmContext_->getNPast());
}
}
if (resolved.shouldResetAfterInference) {
resetState(false);
}
Expand Down Expand Up @@ -589,7 +617,8 @@ qvac_lib_inference_addon_cpp::RuntimeStats LlamaModel::runtimeStats() const {
void LlamaModel::commonParamsParse(
const std::string& modelPath,
std::unordered_map<std::string, std::string>& configFilemap,
common_params& params, std::optional<int>& outAdrenoVersion) {
common_params& params, std::optional<int>& outAdrenoVersion,
bool& outToolsAtEnd) {

std::vector<std::string> configVector;

Expand Down Expand Up @@ -632,6 +661,26 @@ void LlamaModel::commonParamsParse(
configFilemap.erase(iter);
}

// parse tools_at_end flag from config
if (auto iter = configFilemap.find("tools_at_end");
iter != configFilemap.end()) {
std::string val = iter->second;
std::transform(val.begin(), val.end(), val.begin(), ::tolower);
outToolsAtEnd = (val == "true");
configFilemap.erase(iter);
}
Comment thread
olyasir marked this conversation as resolved.

if (outToolsAtEnd) {
auto arch = metadata_.tryGetString("general.architecture");
if (!arch.has_value() || arch.value() != "qwen3") {
QLOG_IF(
Priority::WARNING,
"[LlamaModel] tools_at_end is only supported for Qwen3 models, "
"ignoring\n");
outToolsAtEnd = false;
}
}

auto deviceIt = configFilemap.find("device");
if (deviceIt == configFilemap.end()) {
std::string errorMsg =
Expand Down Expand Up @@ -968,12 +1017,14 @@ void LlamaModel::resetState(bool resetStats) {

std::unique_ptr<LlmContext> LlamaModel::createContext(
std::string&& projectionPath, common_params& params,
common_init_result&& llamaInit) {
common_init_result&& llamaInit, bool toolsAtEnd) {
if (!projectionPath.empty()) {
params.mmproj.path = std::move(projectionPath);
return std::make_unique<MtmdLlmContext>(params, std::move(llamaInit));
return std::make_unique<MtmdLlmContext>(
params, std::move(llamaInit), toolsAtEnd);
}
return std::make_unique<TextLlmContext>(params, std::move(llamaInit));
return std::make_unique<TextLlmContext>(
params, std::move(llamaInit), toolsAtEnd);
}

bool LlamaModel::loadMedia(const std::vector<uint8_t>& input) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,14 @@ class LlamaModel : public IModel, public IModelAsyncLoad, public IModelCancel {
*/
bool isLoaded();

/**
 * Get the nPast position recorded before tool evaluation.
 * This marks the boundary in the KV cache after the conversation tokens
 * have been evaluated but before the tool tokens.
 * @return the recorded nPast position, or -1 if no boundary has been
 * recorded or no context is loaded.
 */
llama_pos getNPastBeforeTools() const;

void waitForLoadInitialization() final {
std::shared_ptr<ReloadableState> localState;
{
Expand Down Expand Up @@ -233,7 +241,8 @@ class LlamaModel : public IModel, public IModelAsyncLoad, public IModelCancel {
void commonParamsParse(
const std::string& modelPath,
std::unordered_map<std::string, std::string>& configFilemap,
common_params& params, std::optional<int>& outAdrenoVersion);
common_params& params, std::optional<int>& outAdrenoVersion,
bool& outToolsAtEnd);

/**
* The Format prompt method. It formats the prompt json to chat messages.
Expand All @@ -246,7 +255,8 @@ class LlamaModel : public IModel, public IModelAsyncLoad, public IModelCancel {
void resetState(bool resetStats = true);
std::unique_ptr<LlmContext> createContext(
std::string&& projectionPath, common_params& params,
common_init_result&& llamaInit);
common_init_result&& llamaInit, bool toolsAtEnd);

bool loadMedia(const std::vector<uint8_t>& input);

void setInitLoader(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,32 +84,58 @@ class LlamaBatch {
const llama_batch* operator->() const noexcept { return &batch_; }
};

struct ThreadPoolDeleter{
void operator()(ggml_threadpool* ptr) {
if (ptr != nullptr) {
auto* cpuDev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (cpuDev == nullptr) {
throw qvac_errors::StatusError(
ADDON_ID, toString(NoBackendFound), "no CPU backend found");
}
auto* reg = ggml_backend_dev_backend_reg(cpuDev);
void* procAddr =
ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free");
if (procAddr == nullptr) {
throw qvac_errors::StatusError(
ADDON_ID,
toString(UnableToDeleteThreadPool),
"Failed to get ggml_threadpool_free function address");
}
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
auto* ggmlThreadpoolFreeFn =
reinterpret_cast<decltype(ggml_threadpool_free)*>(procAddr);
ggmlThreadpoolFreeFn(ptr);
struct ThreadPoolDeleter {
void operator()(ggml_threadpool* ptr) {
if (ptr != nullptr) {
auto* cpuDev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (cpuDev == nullptr) {
throw qvac_errors::StatusError(
ADDON_ID, toString(NoBackendFound), "no CPU backend found");
}
auto* reg = ggml_backend_dev_backend_reg(cpuDev);
void* procAddr =
ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free");
if (procAddr == nullptr) {
throw qvac_errors::StatusError(
ADDON_ID,
toString(UnableToDeleteThreadPool),
"Failed to get ggml_threadpool_free function address");
}
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
auto* ggmlThreadpoolFreeFn =
reinterpret_cast<decltype(ggml_threadpool_free)*>(procAddr);
ggmlThreadpoolFreeFn(ptr);
}
}
};
using ThreadPoolPtr = std::unique_ptr<ggml_threadpool, ThreadPoolDeleter>;

class DynamicToolsState {
public:
void setToolsAtEnd(bool v) { toolsAtEnd_ = v; }
[[nodiscard]] bool toolsAtEnd() const { return toolsAtEnd_; }
[[nodiscard]] llama_pos nPastBeforeTools() const { return nPastBeforeTools_; }
void setNPastBeforeTools(llama_pos pos) { nPastBeforeTools_ = pos; }
void recordToolBoundary(llama_pos nPast, llama_pos totalTokens) {
if (toolsAtEnd_ && nConversationOnlyTokens_ > 0) {
nPastBeforeTools_ = nPast - (totalTokens - nConversationOnlyTokens_);
}
}
void setConversationOnlyTokens(llama_pos n) { nConversationOnlyTokens_ = n; }
[[nodiscard]] llama_pos conversationOnlyTokens() const {
return nConversationOnlyTokens_;
}
void reset() {
nConversationOnlyTokens_ = 0;
nPastBeforeTools_ = -1;
}

private:
bool toolsAtEnd_ = false;
llama_pos nConversationOnlyTokens_ = 0;
llama_pos nPastBeforeTools_ = -1;
};

class LlmContext { // NOLINT(cppcoreguidelines-special-member-functions)
public:
LlmContext() = default;
Expand Down Expand Up @@ -211,6 +237,11 @@ class LlmContext { // NOLINT(cppcoreguidelines-special-member-functions)
*/
virtual void setNDiscarded(llama_pos nDiscarded) = 0;

DynamicToolsState& dynamicToolsState() { return dynamicToolsState_; }
[[nodiscard]] const DynamicToolsState& dynamicToolsState() const {
return dynamicToolsState_;
}

/**
* Get the number of context slides (discards) that have occurred.
*/
Expand Down Expand Up @@ -276,6 +307,7 @@ class LlmContext { // NOLINT(cppcoreguidelines-special-member-functions)
*
*/
virtual void resetMedia() {};
};


private:
DynamicToolsState dynamicToolsState_;
};
Loading
Loading