Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions packages/qvac-lib-infer-llamacpp-llm/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,17 @@
# Changelog

## [0.12.3] - 2026-03-17

### Added

#### `contextSlides` runtime stat

`runtimeStats()` now includes a `contextSlides` counter that reports how many times the KV cache context window was slid during inference. This replaces the previous approach of parsing log messages to detect sliding context events, providing a reliable, structured stat for downstream consumers.

#### `RuntimeStats` TypeScript interface

Added a `RuntimeStats` type to `index.d.ts` covering all stats keys returned by the C++ addon: `TTFT`, `TPS`, `CacheTokens`, `generatedTokens`, `promptTokens`, and `contextSlides`.

## [0.12.2] - 2026-03-13

This release fixes antiprompt (reverse-prompt) detection for short stop sequences like `\n`, which is critical for translation workloads that rely on newline-based early stopping.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,9 @@ std::string LlamaModel::processPrompt(const Prompt& prompt) {
std::string LlamaModel::processPromptImpl(const Prompt& prompt) {
state_->lastRunWasPrefill_ = prompt.prefill;

// Reset per-inference slide counter so it doesn't leak across runs
state_->llmContext_->resetNSlides();

for (const auto& media : prompt.media) {
loadMedia(media);
}
Expand Down Expand Up @@ -425,12 +428,15 @@ qvac_lib_inference_addon_cpp::RuntimeStats LlamaModel::runtimeStats() const {
int32_t promptTokens = state_->lastRunWasPrefill_ ? 0 : perfData.n_p_eval;
llama_perf_context_reset(state_->llmContext_->getCtx());

int32_t contextSlides = state_->llmContext_->getNSlides();

return {
{"TTFT", timeToFirstToken},
{"TPS", tokensPerSecond},
{"CacheTokens", state_->llmContext_->getNPast()},
{"generatedTokens", generatedTokens},
{"promptTokens", promptTokens}};
{"promptTokens", promptTokens},
{"contextSlides", static_cast<int64_t>(contextSlides)}};
Comment thread
gianni-cor marked this conversation as resolved.
}

// NOLINTNEXTLINE(readability-convert-member-functions-to-static,readability-function-cognitive-complexity)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,16 @@ class LlmContext { // NOLINT(cppcoreguidelines-special-member-functions)
*/
virtual void setNDiscarded(llama_pos nDiscarded) = 0;

/**
* Get the number of context slides (discards) that have occurred since the
* slide counter was last reset (see resetNSlides()).
*
* @return The current slide count.
*/
[[nodiscard]] virtual int32_t getNSlides() const = 0;

/**
* Reset the slide counter to zero. Called at the start of each inference so
* that the value returned by getNSlides() does not carry over from a
* previous run.
*/
virtual void resetNSlides() = 0;

/**
* The load media method. It loads the media from memory buffer.
* Default implementation does nothing (for text-only contexts).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@ bool MtmdLlmContext::evalMessageWithTools(
llama_memory_seq_add(
mem, 0, firstMsgTokens_ + nDiscarded_, nPast_, -nDiscarded_);
nPast_ -= nDiscarded_;
++nSlides_;
QLOG_IF(
Priority::DEBUG,
string_format(
Expand All @@ -252,6 +253,7 @@ bool MtmdLlmContext::evalMessageWithTools(
auto* mem = llama_get_memory(lctx_);
llama_memory_seq_rm(mem, 0, firstMsgTokens_, nPast_);
nPast_ = firstMsgTokens_;
++nSlides_;
QLOG_IF(
Priority::DEBUG,
string_format(
Expand Down Expand Up @@ -337,6 +339,7 @@ void MtmdLlmContext::applyContextDiscard() {
llama_memory_seq_add(
mem, 0, firstMsgTokens_ + nDiscarded_, nPast_, -nDiscarded_);
nPast_ -= nDiscarded_;
++nSlides_;
QLOG_IF(
Priority::DEBUG,
string_format(
Expand Down Expand Up @@ -476,6 +479,9 @@ void MtmdLlmContext::setNDiscarded(llama_pos nDiscarded) {
this->nDiscarded_ = nDiscarded;
}

int32_t MtmdLlmContext::getNSlides() const { return nSlides_; }
void MtmdLlmContext::resetNSlides() { nSlides_ = 0; }

void MtmdLlmContext::loadMedia(const std::vector<uint8_t>& media) {
if (media.empty()) {
resetMedia();
Expand Down Expand Up @@ -549,6 +555,13 @@ void MtmdLlmContext::resetState(bool resetStats) {
// Reset the first msg token length
firstMsgTokens_ = 0;

// On partial reset (resetStats=false), preserve nSlides_ so
// runtimeStats() can read the per-inference value.
// On full reset (resetStats=true), clear it along with perf stats.
if (resetStats) {
nSlides_ = 0;
}

// Clear UTF-8 buffer when resetting state
utf8Buffer_.clear();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,9 @@ class MtmdLlmContext: public LlmContext {
*/
void setNDiscarded(llama_pos nDiscarded) override;

/**
* Get the number of context slides counted since the slide counter was last
* reset.
*/
[[nodiscard]] int32_t getNSlides() const override;
/**
* Reset the slide counter to zero.
*/
void resetNSlides() override;

/**
* The load media method. It loads the media from memory buffer.
*
Expand Down Expand Up @@ -198,6 +201,7 @@ class MtmdLlmContext: public LlmContext {
llama_pos nPast_ = 0;
llama_pos nDiscarded_ = 0;
llama_pos firstMsgTokens_ = 0;
int32_t nSlides_ = 0;

// UTF-8 token buffer for handling incomplete emoji sequences
qvac_lib_inference_addon_llama::UTF8TokenBuffer utf8Buffer_;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,7 @@ bool TextLlmContext::evalMessageWithTools(
llama_memory_seq_add(
mem, 0, firstMsgTokens_ + nDiscarded_, nPast_, -nDiscarded_);
nPast_ -= nDiscarded_;
++nSlides_;
QLOG_IF(
Priority::DEBUG,
string_format(
Expand All @@ -295,6 +296,7 @@ bool TextLlmContext::evalMessageWithTools(
auto* mem = llama_get_memory(lctx_);
llama_memory_seq_rm(mem, 0, firstMsgTokens_, nPast_);
nPast_ = firstMsgTokens_;
++nSlides_;
QLOG_IF(
Priority::DEBUG,
string_format(
Expand Down Expand Up @@ -384,6 +386,7 @@ void TextLlmContext::applyContextDiscard() {
llama_memory_seq_add(
mem, 0, firstMsgTokens_ + nDiscarded_, nPast_, -nDiscarded_);
nPast_ -= nDiscarded_;
++nSlides_;
QLOG_IF(
Priority::DEBUG,
string_format(
Expand Down Expand Up @@ -530,6 +533,13 @@ void TextLlmContext::resetState(bool resetStats) {
// Reset the first msg token length
firstMsgTokens_ = 0;

// On partial reset (resetStats=false), preserve nSlides_ so
// runtimeStats() can read the per-inference value.
// On full reset (resetStats=true), clear it along with perf stats.
if (resetStats) {
nSlides_ = 0;
}

// Clear UTF-8 buffer when resetting state
utf8Buffer_.clear();

Expand Down Expand Up @@ -564,6 +574,9 @@ void TextLlmContext::setNDiscarded(llama_pos nDiscarded) {
this->nDiscarded_ = nDiscarded;
}

int32_t TextLlmContext::getNSlides() const { return nSlides_; }
void TextLlmContext::resetNSlides() { nSlides_ = 0; }

llama_pos TextLlmContext::removeLastNTokens(llama_pos count) {
// Validate input
if (count <= 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,9 @@ class TextLlmContext: public LlmContext {
*/
void setNDiscarded(llama_pos nDiscarded) override;

/**
* Get the number of context slides counted since the slide counter was last
* reset.
*/
[[nodiscard]] int32_t getNSlides() const override;
/**
* Reset the slide counter to zero.
*/
void resetNSlides() override;

/**
* The reset state method. It resets the context.
*
Expand Down Expand Up @@ -168,6 +171,7 @@ class TextLlmContext: public LlmContext {
llama_pos nPast_ = 0;
llama_pos nDiscarded_ = 0;
llama_pos firstMsgTokens_ = 0;
int32_t nSlides_ = 0;
ThreadPoolPtr threadpool_;
ThreadPoolPtr threadpoolBatch_;

Expand Down
9 changes: 9 additions & 0 deletions packages/qvac-lib-infer-llamacpp-llm/index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,15 @@ export interface DownloadWeightsOptions {
closeLoader?: boolean
}

/**
 * Stats keys returned by the C++ addon's `runtimeStats()`.
 */
export interface RuntimeStats {
/** Time to first token. NOTE(review): unit is not visible here — confirm. */
TTFT: number
/** Tokens per second. */
TPS: number
/** The context's past-token count (`getNPast()`) at the time stats are read. */
CacheTokens: number
/** Generated-token count for the run. */
generatedTokens: number
/** Prompt tokens evaluated; reported as 0 when the last run was a prefill. */
promptTokens: number
/** How many times the context window was slid during the inference. */
contextSlides: number
}

export interface DownloadResult {
filePath: string | null
error: boolean
Expand Down
2 changes: 1 addition & 1 deletion packages/qvac-lib-infer-llamacpp-llm/package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@qvac/llm-llamacpp",
"version": "0.12.2",
"version": "0.12.3",
"description": "llama addon for qvac",
"addon": true,
"scripts": {
Expand Down
Loading
Loading