tetherto · gianni-cor · Mar 18, 2026 · Mar 13, 2026 · Mar 13, 2026 · Mar 17, 2026
@@ -1,5 +1,17 @@
 # Changelog
 
+## [0.12.3] - 2026-03-17
+
+### Added
+
+#### `contextSlides` runtime stat
+
+`runtimeStats()` now includes a `contextSlides` counter that reports how many times the KV cache context window slid during the most recent inference; the counter is reset at the start of each inference run. This replaces the previous approach of parsing log messages to detect sliding-context events and gives downstream consumers a reliable, structured stat.
+
+#### `RuntimeStats` TypeScript interface
+
+Added a `RuntimeStats` type to `index.d.ts` covering all stats keys returned by the C++ addon: `TTFT`, `TPS`, `CacheTokens`, `generatedTokens`, `promptTokens`, and `contextSlides`.
+
 ## [0.12.2] - 2026-03-13
 
 This release fixes antiprompt (reverse-prompt) detection for short stop sequences like `\n`, which is critical for translation workloads that rely on newline-based early stopping.

@@ -348,6 +348,9 @@ std::string LlamaModel::processPrompt(const Prompt& prompt) {
 std::string LlamaModel::processPromptImpl(const Prompt& prompt) {
   state_->lastRunWasPrefill_ = prompt.prefill;
 
+  // Reset per-inference slide counter so it doesn't leak across runs
+  state_->llmContext_->resetNSlides();
+
   for (const auto& media : prompt.media) {
     loadMedia(media);
   }
@@ -425,12 +428,15 @@ qvac_lib_inference_addon_cpp::RuntimeStats LlamaModel::runtimeStats() const {
   int32_t promptTokens = state_->lastRunWasPrefill_ ? 0 : perfData.n_p_eval;
   llama_perf_context_reset(state_->llmContext_->getCtx());
 
+  int32_t contextSlides = state_->llmContext_->getNSlides();
+
   return {
       {"TTFT", timeToFirstToken},
       {"TPS", tokensPerSecond},
       {"CacheTokens", state_->llmContext_->getNPast()},
       {"generatedTokens", generatedTokens},
-      {"promptTokens", promptTokens}};
+      {"promptTokens", promptTokens},
+      {"contextSlides", static_cast<int64_t>(contextSlides)}};
 }
 
 // NOLINTNEXTLINE(readability-convert-member-functions-to-static,readability-function-cognitive-complexity)

@@ -200,6 +200,16 @@ class LlmContext { // NOLINT(cppcoreguidelines-special-member-functions)
    */
   virtual void setNDiscarded(llama_pos nDiscarded) = 0;
 
+  /**
+   * Get the number of context slides (discards) since the last reset.
+   */
+  [[nodiscard]] virtual int32_t getNSlides() const = 0;
+
+  /**
+   * Reset the slide counter to zero. Called at the start of each inference.
+   */
+  virtual void resetNSlides() = 0;
+
   /**
    * The load media method. It loads the media from memory buffer.
    * Default implementation does nothing (for text-only contexts).

@@ -240,6 +240,7 @@ bool MtmdLlmContext::evalMessageWithTools(
       llama_memory_seq_add(
           mem, 0, firstMsgTokens_ + nDiscarded_, nPast_, -nDiscarded_);
       nPast_ -= nDiscarded_;
+      ++nSlides_;
       QLOG_IF(
           Priority::DEBUG,
           string_format(
@@ -252,6 +253,7 @@ bool MtmdLlmContext::evalMessageWithTools(
       auto* mem = llama_get_memory(lctx_);
       llama_memory_seq_rm(mem, 0, firstMsgTokens_, nPast_);
       nPast_ = firstMsgTokens_;
+      ++nSlides_;
       QLOG_IF(
           Priority::DEBUG,
           string_format(
@@ -337,6 +339,7 @@ void MtmdLlmContext::applyContextDiscard() {
   llama_memory_seq_add(
       mem, 0, firstMsgTokens_ + nDiscarded_, nPast_, -nDiscarded_);
   nPast_ -= nDiscarded_;
+  ++nSlides_;
   QLOG_IF(
       Priority::DEBUG,
       string_format(
@@ -476,6 +479,9 @@ void MtmdLlmContext::setNDiscarded(llama_pos nDiscarded) {
   this->nDiscarded_ = nDiscarded;
 }
 
+int32_t MtmdLlmContext::getNSlides() const { return nSlides_; } // Slides counted since the last reset.
+void MtmdLlmContext::resetNSlides() { nSlides_ = 0; } // Zero the slide counter.
+
 void MtmdLlmContext::loadMedia(const std::vector<uint8_t>& media) {
   if (media.empty()) {
     resetMedia();
@@ -549,6 +555,13 @@ void MtmdLlmContext::resetState(bool resetStats) {
   // Reset the first msg token length
   firstMsgTokens_ = 0;
 
+  // On partial reset (resetStats=false), preserve nSlides_ so
+  // runtimeStats() can read the per-inference value.
+  // On full reset (resetStats=true), clear it along with perf stats.
+  if (resetStats) {
+    nSlides_ = 0;
+  }
+
   // Clear UTF-8 buffer when resetting state
   utf8Buffer_.clear();
 

@@ -115,6 +115,9 @@ class MtmdLlmContext: public LlmContext {
    */
   void setNDiscarded(llama_pos nDiscarded) override;
 
+  [[nodiscard]] int32_t getNSlides() const override;
+  void resetNSlides() override;
+
   /**
    * The load media method. It loads the media from memory buffer.
    *
@@ -198,6 +201,7 @@ class MtmdLlmContext: public LlmContext {
   llama_pos nPast_ = 0;
   llama_pos nDiscarded_ = 0;
   llama_pos firstMsgTokens_ = 0;
+  int32_t nSlides_ = 0;
 
   // UTF-8 token buffer for handling incomplete emoji sequences
   qvac_lib_inference_addon_llama::UTF8TokenBuffer utf8Buffer_;

@@ -283,6 +283,7 @@ bool TextLlmContext::evalMessageWithTools(
       llama_memory_seq_add(
           mem, 0, firstMsgTokens_ + nDiscarded_, nPast_, -nDiscarded_);
       nPast_ -= nDiscarded_;
+      ++nSlides_;
       QLOG_IF(
           Priority::DEBUG,
           string_format(
@@ -295,6 +296,7 @@ bool TextLlmContext::evalMessageWithTools(
       auto* mem = llama_get_memory(lctx_);
       llama_memory_seq_rm(mem, 0, firstMsgTokens_, nPast_);
       nPast_ = firstMsgTokens_;
+      ++nSlides_;
       QLOG_IF(
           Priority::DEBUG,
           string_format(
@@ -384,6 +386,7 @@ void TextLlmContext::applyContextDiscard() {
   llama_memory_seq_add(
       mem, 0, firstMsgTokens_ + nDiscarded_, nPast_, -nDiscarded_);
   nPast_ -= nDiscarded_;
+  ++nSlides_;
   QLOG_IF(
       Priority::DEBUG,
       string_format(
@@ -530,6 +533,13 @@ void TextLlmContext::resetState(bool resetStats) {
   // Reset the first msg token length
   firstMsgTokens_ = 0;
 
+  // On partial reset (resetStats=false), preserve nSlides_ so
+  // runtimeStats() can read the per-inference value.
+  // On full reset (resetStats=true), clear it along with perf stats.
+  if (resetStats) {
+    nSlides_ = 0;
+  }
+
   // Clear UTF-8 buffer when resetting state
   utf8Buffer_.clear();
 
@@ -564,6 +574,9 @@ void TextLlmContext::setNDiscarded(llama_pos nDiscarded) {
   this->nDiscarded_ = nDiscarded;
 }
 
+int32_t TextLlmContext::getNSlides() const { return nSlides_; } // Slides counted since the last reset.
+void TextLlmContext::resetNSlides() { nSlides_ = 0; } // Zero the slide counter.
+
 llama_pos TextLlmContext::removeLastNTokens(llama_pos count) {
   // Validate input
   if (count <= 0) {

@@ -108,6 +108,9 @@ class TextLlmContext: public LlmContext {
    */
   void setNDiscarded(llama_pos nDiscarded) override;
 
+  [[nodiscard]] int32_t getNSlides() const override;
+  void resetNSlides() override;
+
   /**
    * The reset state method. It resets the context.
    *
@@ -168,6 +171,7 @@ class TextLlmContext: public LlmContext {
   llama_pos nPast_ = 0;
   llama_pos nDiscarded_ = 0;
   llama_pos firstMsgTokens_ = 0;
+  int32_t nSlides_ = 0;
   ThreadPoolPtr threadpool_;
   ThreadPoolPtr threadpoolBatch_;
 

@@ -115,6 +115,15 @@ export interface DownloadWeightsOptions {
   closeLoader?: boolean
 }
 
+export interface RuntimeStats { // keys of the stats object returned by the native addon
+  TTFT: number // time to first token
+  TPS: number // tokens per second
+  CacheTokens: number // the context's nPast value; presumably tokens held in the KV cache (confirm)
+  generatedTokens: number // generated token count for the run
+  promptTokens: number // prompt tokens evaluated; reported as 0 when the last run was a prefill
+  contextSlides: number // times the context window slid; reset at the start of each inference
+}
+
 export interface DownloadResult {
   filePath: string | null
   error: boolean

@@ -1,6 +1,6 @@
 {
   "name": "@qvac/llm-llamacpp",
-  "version": "0.12.2",
+  "version": "0.12.3",
   "description": "llama addon for qvac",
   "addon": true,
   "scripts": {