diff --git a/packages/llm-llamacpp/CHANGELOG.md b/packages/llm-llamacpp/CHANGELOG.md index b072182a5c..371d8972fb 100644 --- a/packages/llm-llamacpp/CHANGELOG.md +++ b/packages/llm-llamacpp/CHANGELOG.md @@ -1,5 +1,22 @@ # Changelog +## [0.20.1] - 2026-05-11 + +### Fixed + +#### MedPsy GGUF models now apply their embedded chat template + +MedPsy models report `general.architecture = qwen3` in GGUF metadata, so the llm addon was substituting the hardcoded Qwen3 chat templates in `ChatTemplateUtils` whenever the model was loaded. That replaced the model's own embedded Jinja chat template — which contains a `{%- set persona -%}` block injecting the `"You are MedPsy, ..."` system prompt the model is fine-tuned to expect — and as a result the model lost its identity at runtime and answered as a generic assistant. + +The addon now identifies MedPsy models via the GGUF `general.basename` metadata (case-insensitive match against `MedPsy`) and: + +- `ChatTemplateUtils::getChatTemplateForModel` returns an empty string for MedPsy, so `common_chat_templates_init` falls through to the model's embedded chat template instead of substituting the hardcoded Qwen3 ones. The Qwen3 reasoning state and EOS handling in `TextLlmContext` continue to apply because the architecture is still `qwen3`. +- `LlamaModel::commonParamsParse` auto-enables `params.use_jinja` when it detects the MedPsy basename, so the embedded Jinja template is applied even when the caller did not pass `tools: 'true'`. The auto-enable is gated on `!use_jinja`, so passing `tools: 'true'` continues to work and the auto-enable log is correctly skipped. + +After the fix, MedPsy self-identifies correctly at runtime (e.g. `"I'm MedPsy, a medical and healthcare AI assistant developed by QVAC."`). + +The new `qvac_lib_inference_addon_llama::utils::isMedPsyBasename` and `isMedPsyModel` helpers are unit-tested for null, empty, exact match, mixed case, and near-miss strings such as `MedPsy-7B` and `NotMedPsy`. + ## [0.20.0] - 2026-05-10 ### Changed diff --git a/packages/llm-llamacpp/addon/src/model-interface/LlamaModel.cpp b/packages/llm-llamacpp/addon/src/model-interface/LlamaModel.cpp index 0f565b3c68..34c402d714 100644 --- a/packages/llm-llamacpp/addon/src/model-interface/LlamaModel.cpp +++ b/packages/llm-llamacpp/addon/src/model-interface/LlamaModel.cpp @@ -749,6 +749,22 @@ void LlamaModel::commonParamsParse( configFilemap.erase(jit); } + // MedPsy ships only a Jinja chat template embedded in its GGUF; the non-jinja + // fallback path used by llama.cpp does not execute the {%- set persona -%} + // block that injects the model's persona system prompt, so the model loses + // its identity when jinja is off. Auto-enable jinja whenever we detect the + // MedPsy basename so the embedded template is applied regardless of the + // tools setting. + if (!params.use_jinja && + qvac_lib_inference_addon_llama::utils::isMedPsyBasename( + metadata_.tryGetString("general.basename").value_or(""))) { + params.use_jinja = true; + QLOG_IF( + Priority::INFO, + "[LlamaModel] MedPsy basename detected; auto-enabling jinja so the " + "embedded chat template is applied\n"); + } + // reasoning-budget controls whether the model emits a reasoning // channel. -1 (default) leaves it on; 0 disables. `std::from_chars` is used // instead of `std::stoi` because the latter accepts trailing garbage ("0abc" diff --git a/packages/llm-llamacpp/addon/src/utils/ChatTemplateUtils.cpp b/packages/llm-llamacpp/addon/src/utils/ChatTemplateUtils.cpp index b7b4badf4b..3307822224 100644 --- a/packages/llm-llamacpp/addon/src/utils/ChatTemplateUtils.cpp +++ b/packages/llm-llamacpp/addon/src/utils/ChatTemplateUtils.cpp @@ -2,6 +2,8 @@ #include #include +#include +#include #include @@ -16,24 +18,47 @@ namespace utils { namespace { -std::string normalizeArchitecture(const std::string& architecture) { - std::string normalized = architecture; - std::transform( - normalized.begin(), - normalized.end(), - normalized.begin(), - [](unsigned char c) { return std::tolower(c); }); - return normalized; +// Lowercased literal used for case-insensitive equality against +// `general.basename` GGUF metadata to identify MedPsy models. +inline constexpr std::string_view MEDPSY_BASENAME_LOWER{"medpsy"}; + +std::string toLower(std::string_view value) { + std::string lowered(value.size(), '\0'); + std::ranges::transform(value, lowered.begin(), [](unsigned char ch) { + return std::tolower(ch); + }); + return lowered; +} + +std::string normalizeArchitecture(std::string_view architecture) { + return toLower(architecture); +} + +bool isQwen3Architecture(std::string_view architecture) { + return normalizeArchitecture(architecture) == "qwen3"; } -bool isQwen3Architecture(const std::string& architecture) { - const std::string archStr = normalizeArchitecture(architecture); - return archStr == "qwen3"; +bool isHarmonyArchitecture(std::string_view architecture) { + return normalizeArchitecture(architecture) == "gpt-oss"; } -bool isHarmonyArchitecture(const std::string& architecture) { - const std::string archStr = normalizeArchitecture(architecture); - return archStr == "gpt-oss"; +std::optional +readMetadataString(const ::llama_model* model, const char* key) { + if (model == nullptr || key == nullptr) { + return std::nullopt; + } + + char buffer[256] = {0}; + int32_t len = llama_model_meta_val_str(model, key, buffer, sizeof(buffer)); + if (len > 0 && static_cast(len) < sizeof(buffer)) { + buffer[len] = '\0'; + return std::string(buffer); + } + return std::nullopt; +} + +std::optional getModelBasename(const ::llama_model* model) { + return readMetadataString(model, "general.basename"); } } // namespace @@ -48,9 +73,9 @@ std::optional getModelArchitecture(const ::llama_model* model) { char arch[64] = {0}; int32_t len = llama_model_meta_val_str( model, "general.architecture", arch, sizeof(arch)); - if (len > 0 && len < sizeof(arch)) { + if (len > 0 && static_cast(len) < sizeof(arch)) { arch[len] = '\0'; - return normalizeArchitecture(std::string(arch)); + return normalizeArchitecture(arch); } return std::nullopt; } @@ -63,6 +88,18 @@ bool isQwen3Model(const ::llama_model* model) { return supportsToolsCompactForModelMetadata(getModelArchitecture(model)); } +bool isMedPsyBasename(std::string_view basename) { + return !basename.empty() && toLower(basename) == MEDPSY_BASENAME_LOWER; +} + +bool isMedPsyModel(const ::llama_model* model) { + // No explicit nullptr guard needed: getModelBasename() -> + // readMetadataString() returns std::nullopt for a null model, and + // value_or("") below feeds isMedPsyBasename an empty string view which it + // rejects. + return isMedPsyBasename(getModelBasename(model).value_or("")); +} + bool isHarmonyModel(const ::llama_model* model) { if (model == nullptr) { return false; @@ -100,6 +137,18 @@ std::string getChatTemplateForModel( return manualOverride; } + // MedPsy ships its own chat template embedded in GGUF metadata. Returning an + // empty string makes common_chat_templates_init() defer to that embedded + // template instead of substituting the hardcoded Qwen3 templates below, even + // when the model's architecture is reported as qwen3. + if (isMedPsyModel(model)) { + QLOG_IF( + Priority::INFO, + "[ChatTemplateUtils] MedPsy basename detected; using embedded chat " + "template\n"); + return ""; + } + if (isQwen3Model(model)) { return toolsCompact ? getToolsDynamicQwen3Template() : getFixedQwen3Template(); diff --git a/packages/llm-llamacpp/addon/src/utils/ChatTemplateUtils.hpp b/packages/llm-llamacpp/addon/src/utils/ChatTemplateUtils.hpp index d32b954e9c..b523e51b98 100644 --- a/packages/llm-llamacpp/addon/src/utils/ChatTemplateUtils.hpp +++ b/packages/llm-llamacpp/addon/src/utils/ChatTemplateUtils.hpp @@ -2,6 +2,7 @@ #include #include +#include #include "common/chat.h" #include "common/common.h" @@ -20,15 +21,38 @@ std::optional getModelArchitecture(const ::llama_model* model); bool supportsToolsCompactForModelMetadata( const std::optional& architecture); +/** + * @brief Returns true when the GGUF metadata basename identifies a MedPsy + * model. Exposed for unit testing without requiring a real ::llama_model. + * + * Comparison is case-insensitive against the literal "MedPsy"; an empty + * basename returns false (callers should pass `value_or("")` from the + * upstream `std::optional` metadata accessor). + */ +bool isMedPsyBasename(std::string_view basename); + +/** + * @brief Returns true when the model's `general.basename` metadata identifies + * it as a MedPsy model. MedPsy ships its own chat template embedded in the + * GGUF, so callers should defer to it rather than substituting the hardcoded + * Qwen3 templates. + */ +bool isMedPsyModel(const ::llama_model* model); + std::optional selectToolsCompactMarkerForModelMetadata( const std::optional& architecture); /** * @brief Gets the appropriate chat template for a model * - * For Qwen3 models, returns the fixed template or tools-compact template - * based on the toolsCompact flag. - * For other models, returns the manual override or empty string. + * Resolution order: + * 1. A non-empty `manualOverride` always wins. + * 2. Models whose GGUF `general.basename` is "MedPsy" return an empty + * string so callers fall through to the embedded chat template, even + * when the architecture is reported as qwen3. + * 3. Qwen3 models return either the tools-compact dynamic template or the + * fixed Qwen3 template based on the `toolsCompact` flag. + * 4. All other models return an empty string. */ std::string getChatTemplateForModel( const ::llama_model* model, const std::string& manualOverride, diff --git a/packages/llm-llamacpp/package.json b/packages/llm-llamacpp/package.json index 8a08c68b3e..2c7189a310 100644 --- a/packages/llm-llamacpp/package.json +++ b/packages/llm-llamacpp/package.json @@ -1,6 +1,6 @@ { "name": "@qvac/llm-llamacpp", - "version": "0.20.0", + "version": "0.20.1", "description": "llama addon for qvac", "addon": true, "scripts": { diff --git a/packages/llm-llamacpp/test/integration/gemma4.test.js b/packages/llm-llamacpp/test/integration/gemma4.test.js index a984fe7e6e..8c45816d26 100644 --- a/packages/llm-llamacpp/test/integration/gemma4.test.js +++ b/packages/llm-llamacpp/test/integration/gemma4.test.js @@ -169,30 +169,33 @@ test('Gemma 4 supports multi-turn conversation with KV cache', { const systemMsg = { role: 'system', content: 'You are a helpful assistant. Answer concisely with just the city name.' } const userTurn1 = { role: 'user', content: 'What is the capital of France?' } - const prompt1 = [ - { role: 'session', content: sessionName }, - systemMsg, - userTurn1 - ] - const response1 = await addon.run(prompt1) + // Cache control is a runOption (cacheKey), NOT a `{ role: 'session' }` + // chat message — the latter was removed in v0.15.0 and is silently dropped + // by Jinja chat templates that have no matching elif branch. + const prompt1 = [systemMsg, userTurn1] + const response1 = await addon.run(prompt1, { cacheKey: sessionName }) const output1 = await collectResponse(response1) t.ok(output1.length > 0, `first turn produced output (${output1.length} chars)`) const lowerOutput1 = output1.toLowerCase() t.ok(/paris/.test(lowerOutput1), `first turn mentions Paris: "${output1.slice(0, 100)}"`) + t.ok(response1.stats?.CacheTokens > 0, `first turn populated KV cache (CacheTokens=${response1.stats?.CacheTokens})`) const prompt2 = [ - { role: 'session', content: sessionName }, systemMsg, userTurn1, { role: 'assistant', content: output1 }, { role: 'user', content: 'And what about Germany?' } ] - const response2 = await addon.run(prompt2) + const response2 = await addon.run(prompt2, { cacheKey: sessionName }) const output2 = await collectResponse(response2) t.ok(output2.length > 0, `second turn produced output (${output2.length} chars)`) const lowerOutput2 = output2.toLowerCase() t.ok(/berlin/.test(lowerOutput2), `second turn mentions Berlin: "${output2.slice(0, 100)}"`) t.ok(output2 !== output1, 'second turn produced different output from first') + t.ok( + response2.stats?.CacheTokens > response1.stats?.CacheTokens, + `second turn extended the KV cache from turn 1 (${response1.stats?.CacheTokens} -> ${response2.stats?.CacheTokens})` + ) } finally { await addon.unload().catch(() => {}) } diff --git a/packages/llm-llamacpp/test/integration/qwen3-5.test.js b/packages/llm-llamacpp/test/integration/qwen3-5.test.js index 7e60098907..0062f5f24f 100644 --- a/packages/llm-llamacpp/test/integration/qwen3-5.test.js +++ b/packages/llm-llamacpp/test/integration/qwen3-5.test.js @@ -171,30 +171,33 @@ test('Qwen3.5-0.8B supports multi-turn conversation with KV cache', { const systemMsg = { role: 'system', content: 'You are a helpful assistant. Answer concisely with just the city name.' } const userTurn1 = { role: 'user', content: 'What is the capital of France?' } - const prompt1 = [ - { role: 'session', content: sessionName }, - systemMsg, - userTurn1 - ] - const response1 = await addon.run(prompt1) + // Cache control is a runOption (cacheKey), NOT a `{ role: 'session' }` + // chat message — the latter was removed in v0.15.0 and is silently dropped + // by Jinja chat templates that have no matching elif branch. + const prompt1 = [systemMsg, userTurn1] + const response1 = await addon.run(prompt1, { cacheKey: sessionName }) const output1 = await collectResponse(response1) t.ok(output1.length > 0, `first turn produced output (${output1.length} chars)`) const lowerOutput1 = output1.toLowerCase() t.ok(/paris/.test(lowerOutput1), `first turn mentions Paris: "${output1.slice(0, 100)}"`) + t.ok(response1.stats?.CacheTokens > 0, `first turn populated KV cache (CacheTokens=${response1.stats?.CacheTokens})`) const prompt2 = [ - { role: 'session', content: sessionName }, systemMsg, userTurn1, { role: 'assistant', content: output1 }, { role: 'user', content: 'And what about Germany?' } ] - const response2 = await addon.run(prompt2) + const response2 = await addon.run(prompt2, { cacheKey: sessionName }) const output2 = await collectResponse(response2) t.ok(output2.length > 0, `second turn produced output (${output2.length} chars)`) const lowerOutput2 = output2.toLowerCase() t.ok(/berlin/.test(lowerOutput2), `second turn mentions Berlin: "${output2.slice(0, 100)}"`) t.ok(output2 !== output1, 'second turn produced different output from first') + t.ok( + response2.stats?.CacheTokens > response1.stats?.CacheTokens, + `second turn extended the KV cache from turn 1 (${response1.stats?.CacheTokens} -> ${response2.stats?.CacheTokens})` + ) } finally { await addon.unload().catch(() => {}) } diff --git a/packages/llm-llamacpp/test/unit/test_chat_template_utils.cpp b/packages/llm-llamacpp/test/unit/test_chat_template_utils.cpp index 8b353c600c..c64cc7e7f8 100644 --- a/packages/llm-llamacpp/test/unit/test_chat_template_utils.cpp +++ b/packages/llm-llamacpp/test/unit/test_chat_template_utils.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -39,6 +40,32 @@ TEST_F(ChatTemplateUtilsTest, IsQwen3ModelWithNullptr) { EXPECT_FALSE(isQwen3Model(nullptr)); } +TEST_F(ChatTemplateUtilsTest, IsMedPsyModelWithNullptr) { + EXPECT_FALSE(isMedPsyModel(nullptr)); +} + +TEST_F(ChatTemplateUtilsTest, IsMedPsyBasenameEmpty) { + EXPECT_FALSE(isMedPsyBasename(std::string_view{})); + EXPECT_FALSE(isMedPsyBasename("")); +} + +TEST_F(ChatTemplateUtilsTest, IsMedPsyBasenameExactMatch) { + EXPECT_TRUE(isMedPsyBasename("MedPsy")); +} + +TEST_F(ChatTemplateUtilsTest, IsMedPsyBasenameCaseInsensitive) { + EXPECT_TRUE(isMedPsyBasename("medpsy")); + EXPECT_TRUE(isMedPsyBasename("MEDPSY")); + EXPECT_TRUE(isMedPsyBasename("MedPSY")); +} + +TEST_F(ChatTemplateUtilsTest, IsMedPsyBasenameRejectsOtherNames) { + EXPECT_FALSE(isMedPsyBasename("Qwen3")); + EXPECT_FALSE(isMedPsyBasename("Llama-3.1")); + EXPECT_FALSE(isMedPsyBasename("MedPsy-7B")); + EXPECT_FALSE(isMedPsyBasename("NotMedPsy")); +} + TEST_F( ChatTemplateUtilsTest, SupportsToolsCompactForModelMetadataByArchitecture) { EXPECT_TRUE(supportsToolsCompactForModelMetadata(std::string("qwen3")));