tetherto · gianni-cor · Apr 30, 2026 · Apr 29, 2026 · Apr 29, 2026 · Apr 30, 2026
@@ -1,5 +1,15 @@
 # Changelog
 
+## [0.19.1] - 2026-04-30
+
+### Fixed
+
+#### GPT-OSS Harmony tool calling: `<|call|>` frame delimiter now surfaces to the SDK
+
+The `<|call|>` token (Harmony frame terminator) is in the model's EOG set. When sampled, it rendered as 0 bytes and silently stopped generation — tool call output was truncated with no visible frame boundary, resulting in the SDK parsing 0 tool calls.
+
+The generation loop now detects Harmony models and intercepts `<|call|>` before the generic EOG break: it renders the token as visible text (`special=true`) so the SDK can identify frame boundaries, then stops generation cleanly. GPT-OSS uses a turn-based tool protocol — one tool call per generation pass — and the SDK is expected to execute the tool, append results, and re-prompt for subsequent calls.
+
 ## [0.19.0] - 2026-04-29
 
 This release adds per-request structured-output support to the LLM addon: callers can now constrain a single completion to either a JSON Schema or a raw GBNF grammar without reloading the model.

@@ -100,6 +100,22 @@ MtmdLlmContext::MtmdLlmContext(
     antipromptTokens_.insert(
         antipromptTokens_.end(), tempTokens.begin(), tempTokens.end());
   }
+
+  isHarmonyModel_ =
+      qvac_lib_inference_addon_llama::utils::isHarmonyModel(model_);
+  if (isHarmonyModel_) {
+    harmonyCallToken_ =
+        qvac_lib_inference_addon_llama::utils::getHarmonyCallToken(lctx_);
+    if (harmonyCallToken_ == LLAMA_TOKEN_NULL) {
+      isHarmonyModel_ = false;
+    }
+  }
+  QLOG_IF(
+      Priority::DEBUG,
+      string_format(
+          "[MtmdLlm] Harmony detection: isHarmony=%d callToken=%d\n",
+          isHarmonyModel_,
+          harmonyCallToken_));
 }
 
 void MtmdLlmContext::initVisionContext() {
@@ -453,7 +469,25 @@ bool MtmdLlmContext::generateResponse(
       }
     }
 
-    if (llama_vocab_is_eog(vocab_, tokenId) || checkAntiprompt()) {
+    bool isEos = llama_vocab_is_eog(vocab_, tokenId);
+
+    if (isEos && isHarmonyModel_ && params_.use_jinja &&
+        tokenId == harmonyCallToken_) {
+      QLOG_IF(
+          Priority::DEBUG,
+          string_format(
+              "[MtmdLlm] Harmony <|call|> stop: tokenId=%d\n", tokenId));
+      if (outputCallback) {
+        std::string callMarker = common_token_to_piece(lctx_, tokenId, true);
+        if (!callMarker.empty()) {
+          outputCallback(callMarker);
+        }
+      }
+      flushPendingUtf8ToCallback(outputCallback);
+      break;
+    }
+
+    if (isEos || checkAntiprompt()) {
       flushPendingUtf8ToCallback(outputCallback);
       break;
     }

@@ -220,5 +220,10 @@ class MtmdLlmContext : public LlmContext {
 
   // UTF-8 token buffer for handling incomplete emoji sequences
   qvac_lib_inference_addon_llama::UTF8TokenBuffer utf8Buffer_;
+
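+  // GPT-OSS Harmony: <|call|> is a frame delimiter that is also an EOG token.
+  // When sampled with use_jinja enabled, it is emitted as visible text and
+  // generation then stops.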
+  bool isHarmonyModel_ = false;
+  llama_token harmonyCallToken_ = LLAMA_TOKEN_NULL;
+
   std::atomic<bool> stopGeneration_ = false;
 };
@@ -53,6 +53,24 @@ TextLlmContext::TextLlmContext(
           lctx_, reasoningState_);
     }
 
+    isHarmonyModel_ =
+        qvac_lib_inference_addon_llama::utils::isHarmonyModel(model_);
+    if (isHarmonyModel_) {
+      harmonyCallToken_ =
+          qvac_lib_inference_addon_llama::utils::getHarmonyCallToken(lctx_);
+      if (harmonyCallToken_ == LLAMA_TOKEN_NULL) {
+        isHarmonyModel_ = false;
+      }
+    }
+    QLOG_IF(
+        Priority::DEBUG,
+        string_format(
+            "[TextLlm] Harmony detection: isHarmony=%d callToken=%d "
+            "useJinja=%d\n",
+            isHarmonyModel_,
+            harmonyCallToken_,
+            params_.use_jinja));
+
     std::string chatTemplate =
         getChatTemplate(model_, params_, tools_.enabled());
     tmpls_ = common_chat_templates_init(model_, chatTemplate);
@@ -510,6 +528,22 @@ bool TextLlmContext::generateResponse(
       }
     }
 
+    if (isEos && isHarmonyModel_ && params_.use_jinja &&
+        tokenId == harmonyCallToken_) {
+      QLOG_IF(
+          Priority::DEBUG,
+          string_format(
+              "[TextLlm] Harmony <|call|> stop: tokenId=%d\n", tokenId));
+      if (outputCallback) {
+        std::string callMarker = common_token_to_piece(lctx_, tokenId, true);
+        if (!callMarker.empty()) {
+          outputCallback(callMarker);
+        }
+      }
+      flushPendingUtf8ToCallback(outputCallback);
+      break;
+    }
+
     if (isEos || checkAntiprompt()) {
       flushPendingUtf8ToCallback(outputCallback);
       break;

@@ -198,5 +198,9 @@ class TextLlmContext : public LlmContext {
   // Cache whether this is a Qwen3 model (checked once at load time)
   bool isQwen3Model_ = false;
 
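+  // GPT-OSS Harmony: <|call|> is a frame delimiter that is also an EOG token.
+  // When sampled with use_jinja enabled, it is emitted as visible text and
+  // generation then stops.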
+  bool isHarmonyModel_ = false;
+  llama_token harmonyCallToken_ = LLAMA_TOKEN_NULL;
+
   std::atomic<bool> stopGeneration_ = false;
 };
@@ -31,6 +31,11 @@ bool isQwen3Architecture(const std::string& architecture) {
   return archStr == "qwen3";
 }
 
+// Reports whether an architecture string, once passed through
+// normalizeArchitecture(), names the GPT-OSS ("gpt-oss") architecture.
+bool isHarmonyArchitecture(const std::string& architecture) {
+  return normalizeArchitecture(architecture) == "gpt-oss";
+}
+
 bool modelNameLooksLikeQwen3(const std::string& modelName) {
   std::string normalizedName = modelName;
   std::transform(
@@ -85,6 +90,23 @@ bool isQwen3Model(const ::llama_model* model) {
       getModelArchitecture(model), getModelName(model));
 }
 
+// Decides whether a loaded model is a Harmony (GPT-OSS) model by inspecting
+// the architecture reported by getModelArchitecture().
+//
+// Returns false for a null model or when no architecture is available.
+bool isHarmonyModel(const ::llama_model* model) {
+  if (!model) {
+    return false;
+  }
+  const std::optional<std::string> architecture = getModelArchitecture(model);
+  if (!architecture) {
+    return false;
+  }
+  return isHarmonyArchitecture(*architecture);
+}
+
+// Looks up the token id of the Harmony "<|call|>" frame terminator.
+//
+// Returns the id when "<|call|>" tokenizes to exactly one token, and
+// LLAMA_TOKEN_NULL otherwise (including when lctx is null), so callers can
+// treat LLAMA_TOKEN_NULL as "no usable call token".
+llama_token getHarmonyCallToken(::llama_context* lctx) {
+  // Mirror isHarmonyModel's null guard instead of handing a null context to
+  // the tokenizer.
+  if (lctx == nullptr) {
+    return LLAMA_TOKEN_NULL;
+  }
+  const std::vector<llama_token> tokens = common_tokenize(
+      lctx, "<|call|>", /*add_special=*/false, /*parse_special=*/true);
+  // An empty or multi-token result is not usable as a single delimiter id.
+  return tokens.size() == 1 ? tokens.front() : LLAMA_TOKEN_NULL;
+}
+
 bool supportsToolsCompactForModelMetadata(
     const std::optional<std::string>& architecture,
     const std::optional<std::string>& modelName) {

@@ -14,6 +14,8 @@ namespace qvac_lib_inference_addon_llama {
 namespace utils {
 
 bool isQwen3Model(const ::llama_model* model);
+// Returns true when the model's architecture, as reported by
+// getModelArchitecture(), normalizes to "gpt-oss"; returns false for a null
+// model or when no architecture is available.
+bool isHarmonyModel(const ::llama_model* model);
+// Tokenizes the literal "<|call|>" and returns its token id when the result
+// is exactly one token; otherwise returns LLAMA_TOKEN_NULL.
+llama_token getHarmonyCallToken(::llama_context* lctx);
 std::optional<std::string> getModelArchitecture(const ::llama_model* model);
 bool supportsToolsCompactForModelMetadata(
     const std::optional<std::string>& architecture,