Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions packages/qvac-lib-infer-llamacpp-llm/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# Changelog

## [0.19.1] - 2026-04-30

### Fixed

#### GPT-OSS Harmony tool calling: `<|call|>` frame delimiter now surfaces to the SDK

The `<|call|>` token (Harmony frame terminator) is in the model's EOG set. When sampled, it rendered as 0 bytes and silently stopped generation — tool call output was truncated with no visible frame boundary, resulting in the SDK parsing 0 tool calls.

The generation loop now detects Harmony models and intercepts `<|call|>` before the generic EOG break: it renders the token as visible text (`special=true`) so the SDK can identify frame boundaries, then stops generation cleanly. GPT-OSS uses a turn-based tool protocol — one tool call per generation pass — and the SDK is expected to execute the tool, append results, and re-prompt for subsequent calls.

## [0.19.0] - 2026-04-29

This release adds per-request structured-output support to the LLM addon: callers can now constrain a single completion to either a JSON Schema or a raw GBNF grammar without reloading the model.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,22 @@ MtmdLlmContext::MtmdLlmContext(
antipromptTokens_.insert(
antipromptTokens_.end(), tempTokens.begin(), tempTokens.end());
}

isHarmonyModel_ =
qvac_lib_inference_addon_llama::utils::isHarmonyModel(model_);
if (isHarmonyModel_) {
harmonyCallToken_ =
qvac_lib_inference_addon_llama::utils::getHarmonyCallToken(lctx_);
if (harmonyCallToken_ == LLAMA_TOKEN_NULL) {
isHarmonyModel_ = false;
}
}
QLOG_IF(
Priority::DEBUG,
string_format(
"[MtmdLlm] Harmony detection: isHarmony=%d callToken=%d\n",
isHarmonyModel_,
harmonyCallToken_));
}

void MtmdLlmContext::initVisionContext() {
Expand Down Expand Up @@ -453,7 +469,25 @@ bool MtmdLlmContext::generateResponse(
}
}

if (llama_vocab_is_eog(vocab_, tokenId) || checkAntiprompt()) {
bool isEos = llama_vocab_is_eog(vocab_, tokenId);

if (isEos && isHarmonyModel_ && params_.use_jinja &&
tokenId == harmonyCallToken_) {
QLOG_IF(
Priority::DEBUG,
string_format(
"[MtmdLlm] Harmony <|call|> stop: tokenId=%d\n", tokenId));
if (outputCallback) {
std::string callMarker = common_token_to_piece(lctx_, tokenId, true);
if (!callMarker.empty()) {
outputCallback(callMarker);
}
}
flushPendingUtf8ToCallback(outputCallback);
break;
}

if (isEos || checkAntiprompt()) {
flushPendingUtf8ToCallback(outputCallback);
break;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -220,5 +220,10 @@ class MtmdLlmContext : public LlmContext {

// UTF-8 token buffer for handling incomplete emoji sequences
qvac_lib_inference_addon_llama::UTF8TokenBuffer utf8Buffer_;

// GPT-OSS Harmony: <|call|> is a frame delimiter; it is emitted as visible text before generation stops
bool isHarmonyModel_ = false;
llama_token harmonyCallToken_ = LLAMA_TOKEN_NULL;

std::atomic<bool> stopGeneration_ = false;
};
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,24 @@ TextLlmContext::TextLlmContext(
lctx_, reasoningState_);
}

isHarmonyModel_ =
qvac_lib_inference_addon_llama::utils::isHarmonyModel(model_);
if (isHarmonyModel_) {
harmonyCallToken_ =
qvac_lib_inference_addon_llama::utils::getHarmonyCallToken(lctx_);
if (harmonyCallToken_ == LLAMA_TOKEN_NULL) {
isHarmonyModel_ = false;
}
}
QLOG_IF(
Priority::DEBUG,
string_format(
"[TextLlm] Harmony detection: isHarmony=%d callToken=%d "
"useJinja=%d\n",
isHarmonyModel_,
harmonyCallToken_,
params_.use_jinja));

std::string chatTemplate =
getChatTemplate(model_, params_, tools_.enabled());
tmpls_ = common_chat_templates_init(model_, chatTemplate);
Expand Down Expand Up @@ -510,6 +528,22 @@ bool TextLlmContext::generateResponse(
}
}

if (isEos && isHarmonyModel_ && params_.use_jinja &&
tokenId == harmonyCallToken_) {
QLOG_IF(
Priority::DEBUG,
string_format(
"[TextLlm] Harmony <|call|> stop: tokenId=%d\n", tokenId));
if (outputCallback) {
std::string callMarker = common_token_to_piece(lctx_, tokenId, true);
if (!callMarker.empty()) {
outputCallback(callMarker);
}
}
flushPendingUtf8ToCallback(outputCallback);
break;
}

if (isEos || checkAntiprompt()) {
flushPendingUtf8ToCallback(outputCallback);
break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -198,5 +198,9 @@ class TextLlmContext : public LlmContext {
// Cache whether this is a Qwen3 model (checked once at load time)
bool isQwen3Model_ = false;

// GPT-OSS Harmony: <|call|> is a frame delimiter; it is emitted as visible text before generation stops
bool isHarmonyModel_ = false;
llama_token harmonyCallToken_ = LLAMA_TOKEN_NULL;

std::atomic<bool> stopGeneration_ = false;
};
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ bool isQwen3Architecture(const std::string& architecture) {
return archStr == "qwen3";
}

// Reports whether `architecture` identifies a GPT-OSS (Harmony) model.
// The value is passed through normalizeArchitecture() first and the result
// must equal "gpt-oss" exactly.
bool isHarmonyArchitecture(const std::string& architecture) {
  const std::string normalized = normalizeArchitecture(architecture);
  const bool matchesGptOss = (normalized.compare("gpt-oss") == 0);
  return matchesGptOss;
}

bool modelNameLooksLikeQwen3(const std::string& modelName) {
std::string normalizedName = modelName;
std::transform(
Expand Down Expand Up @@ -85,6 +90,23 @@ bool isQwen3Model(const ::llama_model* model) {
getModelArchitecture(model), getModelName(model));
}

// Reports whether `model` is a GPT-OSS (Harmony) model, judged by its
// architecture string. Returns false for a null model or when
// getModelArchitecture() yields no value.
bool isHarmonyModel(const ::llama_model* model) {
  if (model != nullptr) {
    const std::optional<std::string> architecture = getModelArchitecture(model);
    if (architecture) {
      return isHarmonyArchitecture(*architecture);
    }
  }
  return false;
}

// Looks up the token id of the Harmony "<|call|>" marker for the vocabulary
// behind `lctx`.
//
// The literal is tokenized without adding special tokens and with
// special-token parsing enabled. The lookup succeeds only when the literal
// maps to exactly one token.
//
// Returns that token id, or LLAMA_TOKEN_NULL when `lctx` is null or the
// literal does not tokenize to exactly one token.
llama_token getHarmonyCallToken(::llama_context* lctx) {
  // Guard the pointer before tokenizing, mirroring the null check that
  // isHarmonyModel() performs on its model argument.
  if (lctx == nullptr) {
    return LLAMA_TOKEN_NULL;
  }

  const std::vector<llama_token> tokens = common_tokenize(
      lctx, "<|call|>", /*add_special=*/false, /*parse_special=*/true);

  // Any other count means the text was split into pieces, so there is no
  // single id to compare sampled tokens against.
  return tokens.size() == 1 ? tokens.front() : LLAMA_TOKEN_NULL;
}

bool supportsToolsCompactForModelMetadata(
const std::optional<std::string>& architecture,
const std::optional<std::string>& modelName) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ namespace qvac_lib_inference_addon_llama {
namespace utils {

bool isQwen3Model(const ::llama_model* model);
bool isHarmonyModel(const ::llama_model* model);
llama_token getHarmonyCallToken(::llama_context* lctx);
std::optional<std::string> getModelArchitecture(const ::llama_model* model);
bool supportsToolsCompactForModelMetadata(
const std::optional<std::string>& architecture,
Expand Down
Loading
Loading