tetherto · gianni-cor · May 12, 2026 · May 11, 2026 · May 11, 2026 · May 11, 2026
@@ -1,5 +1,22 @@
 # Changelog
 
+## [0.20.1] - 2026-05-11
+
+### Fixed
+
+#### MedPsy GGUF models now apply their embedded chat template
+
+MedPsy models report `general.architecture = qwen3` in GGUF metadata, so the LLM addon substituted the hardcoded Qwen3 chat templates from `ChatTemplateUtils` whenever such a model was loaded. That replaced the model's own embedded Jinja chat template, which contains a `{%- set persona -%}` block injecting the `"You are MedPsy, ..."` system prompt the model is fine-tuned to expect. As a result, the model lost its identity at runtime and answered as a generic assistant.
+
+The addon now identifies MedPsy models via the GGUF `general.basename` metadata (case-insensitive, whole-string match against `MedPsy`) and:
+
+- `ChatTemplateUtils::getChatTemplateForModel` returns an empty string for MedPsy, so `common_chat_templates_init` falls through to the model's embedded chat template instead of substituting the hardcoded Qwen3 ones. The Qwen3 reasoning state and EOS handling in `TextLlmContext` continue to apply because the architecture is still `qwen3`.
+- `LlamaModel::commonParamsParse` auto-enables `params.use_jinja` when it detects the MedPsy basename, so the embedded Jinja template is applied even when the caller did not pass `tools: 'true'`. The auto-enable is gated on `!use_jinja`: when jinja is already enabled (for example by passing `tools: 'true'`), the setting is left untouched and the auto-enable log line is not emitted.
+
+After the fix, MedPsy self-identifies correctly at runtime (e.g. `"I'm MedPsy, a medical and healthcare AI assistant developed by QVAC."`).
+
+The new `qvac_lib_inference_addon_llama::utils::isMedPsyBasename` and `isMedPsyModel` helpers are unit-tested for a null model, an empty basename, an exact match, mixed-case matches, and near-miss strings such as `MedPsy-7B` and `NotMedPsy`.
+
 ## [0.20.0] - 2026-05-10
 
 ### Changed

@@ -749,6 +749,22 @@ void LlamaModel::commonParamsParse(
     configFilemap.erase(jit);
   }
 
+  // MedPsy ships only a Jinja chat template embedded in its GGUF; the non-jinja
+  // fallback path used by llama.cpp does not execute the {%- set persona -%}
+  // block that injects the model's persona system prompt, so the model loses
+  // its identity when jinja is off. Auto-enable jinja whenever we detect the
+  // MedPsy basename so the embedded template is applied regardless of the
+  // tools setting.
+  if (!params.use_jinja &&
+      qvac_lib_inference_addon_llama::utils::isMedPsyBasename(
+          metadata_.tryGetString("general.basename").value_or(""))) {
+    params.use_jinja = true;
+    QLOG_IF(
+        Priority::INFO,
+        "[LlamaModel] MedPsy basename detected; auto-enabling jinja so the "
+        "embedded chat template is applied\n");
+  }
+
   // reasoning-budget controls whether the model emits a <think> reasoning
   // channel. -1 (default) leaves it on; 0 disables. `std::from_chars` is used
   // instead of `std::stoi` because the latter accepts trailing garbage ("0abc"

@@ -2,6 +2,8 @@
 
 #include <algorithm>
 #include <cctype>
+#include <ranges>
+#include <string_view>
 
 #include <llama.h>
 
@@ -16,24 +18,47 @@ namespace utils {
 
 namespace {
 
-std::string normalizeArchitecture(const std::string& architecture) {
-  std::string normalized = architecture;
-  std::transform(
-      normalized.begin(),
-      normalized.end(),
-      normalized.begin(),
-      [](unsigned char c) { return std::tolower(c); });
-  return normalized;
+// Lowercased literal used for case-insensitive equality against
+// `general.basename` GGUF metadata to identify MedPsy models.
+inline constexpr std::string_view MEDPSY_BASENAME_LOWER{"medpsy"};
+
+std::string toLower(std::string_view value) {
+  std::string lowered(value.size(), '\0');
+  std::ranges::transform(value, lowered.begin(), [](unsigned char ch) {
+    return std::tolower(ch);
+  });
+  return lowered;
+}
+
+std::string normalizeArchitecture(std::string_view architecture) {
+  return toLower(architecture);
+}
+
+bool isQwen3Architecture(std::string_view architecture) {
+  return normalizeArchitecture(architecture) == "qwen3";
 }
 
-bool isQwen3Architecture(const std::string& architecture) {
-  const std::string archStr = normalizeArchitecture(architecture);
-  return archStr == "qwen3";
+bool isHarmonyArchitecture(std::string_view architecture) {
+  return normalizeArchitecture(architecture) == "gpt-oss";
 }
 
-bool isHarmonyArchitecture(const std::string& architecture) {
-  const std::string archStr = normalizeArchitecture(architecture);
-  return archStr == "gpt-oss";
+std::optional<std::string>
+readMetadataString(const ::llama_model* model, const char* key) {
+  if (model == nullptr || key == nullptr) {
+    return std::nullopt;
+  }
+
+  char buffer[256] = {0};
+  int32_t len = llama_model_meta_val_str(model, key, buffer, sizeof(buffer));
+  if (len > 0 && static_cast<size_t>(len) < sizeof(buffer)) {
+    buffer[len] = '\0';
+    return std::string(buffer);
+  }
+  return std::nullopt;
+}
+
+std::optional<std::string> getModelBasename(const ::llama_model* model) {
+  return readMetadataString(model, "general.basename");
 }
 
 } // namespace
@@ -48,9 +73,9 @@ std::optional<std::string> getModelArchitecture(const ::llama_model* model) {
   char arch[64] = {0};
   int32_t len = llama_model_meta_val_str(
       model, "general.architecture", arch, sizeof(arch));
-  if (len > 0 && len < sizeof(arch)) {
+  if (len > 0 && static_cast<size_t>(len) < sizeof(arch)) {
     arch[len] = '\0';
-    return normalizeArchitecture(std::string(arch));
+    return normalizeArchitecture(arch);
   }
   return std::nullopt;
 }
@@ -63,6 +88,18 @@ bool isQwen3Model(const ::llama_model* model) {
   return supportsToolsCompactForModelMetadata(getModelArchitecture(model));
 }
 
+bool isMedPsyBasename(std::string_view basename) {
+  return !basename.empty() && toLower(basename) == MEDPSY_BASENAME_LOWER;
+}
+
+bool isMedPsyModel(const ::llama_model* model) {
+  // No explicit nullptr guard needed: getModelBasename() ->
+  // readMetadataString() returns std::nullopt for a null model, and
+  // value_or("") below feeds isMedPsyBasename an empty string view which it
+  // rejects.
+  return isMedPsyBasename(getModelBasename(model).value_or(""));
+}
+
 bool isHarmonyModel(const ::llama_model* model) {
   if (model == nullptr) {
     return false;
@@ -100,6 +137,18 @@ std::string getChatTemplateForModel(
     return manualOverride;
   }
 
+  // MedPsy ships its own chat template embedded in GGUF metadata. Returning an
+  // empty string makes common_chat_templates_init() defer to that embedded
+  // template instead of substituting the hardcoded Qwen3 templates below, even
+  // when the model's architecture is reported as qwen3.
+  if (isMedPsyModel(model)) {
+    QLOG_IF(
+        Priority::INFO,
+        "[ChatTemplateUtils] MedPsy basename detected; using embedded chat "
+        "template\n");
+    return "";
+  }
+
   if (isQwen3Model(model)) {
     return toolsCompact ? getToolsDynamicQwen3Template()
                         : getFixedQwen3Template();

@@ -2,6 +2,7 @@
 
 #include <optional>
 #include <string>
+#include <string_view>
 
 #include "common/chat.h"
 #include "common/common.h"
@@ -20,15 +21,38 @@ std::optional<std::string> getModelArchitecture(const ::llama_model* model);
 bool supportsToolsCompactForModelMetadata(
     const std::optional<std::string>& architecture);
 
+/**
+ * @brief Returns true when the GGUF metadata basename identifies a MedPsy
+ * model. Exposed for unit testing without requiring a real ::llama_model.
+ *
+ * Comparison is a case-insensitive whole-string match against "MedPsy" (so
+ * "MedPsy-7B" is rejected); an empty basename returns false. Callers should
+ * pass `value_or("")` from the upstream `std::optional<std::string>` accessor.
+ */
+bool isMedPsyBasename(std::string_view basename);
+
+/**
+ * @brief Returns true when the model's `general.basename` metadata identifies
+ * it as a MedPsy model. MedPsy ships its own chat template embedded in the
+ * GGUF, so callers should defer to it rather than substituting the hardcoded
+ * Qwen3 templates.
+ */
+bool isMedPsyModel(const ::llama_model* model);
+
 std::optional<std::string> selectToolsCompactMarkerForModelMetadata(
     const std::optional<std::string>& architecture);
 
 /**
  * @brief Gets the appropriate chat template for a model
  *
- * For Qwen3 models, returns the fixed template or tools-compact template
- * based on the toolsCompact flag.
- * For other models, returns the manual override or empty string.
+ * Resolution order:
+ *   1. A non-empty `manualOverride` always wins.
+ *   2. Models whose GGUF `general.basename` equals "MedPsy" (compared
+ *      case-insensitively) return an empty string so callers fall through to
+ *      the embedded chat template, even when the architecture is qwen3.
+ *   3. Qwen3 models return either the tools-compact dynamic template or the
+ *      fixed Qwen3 template based on the `toolsCompact` flag.
+ *   4. All other models return an empty string.
  */
 std::string getChatTemplateForModel(
     const ::llama_model* model, const std::string& manualOverride,

@@ -1,6 +1,6 @@
 {
   "name": "@qvac/llm-llamacpp",
-  "version": "0.20.0",
+  "version": "0.20.1",
   "description": "llama addon for qvac",
   "addon": true,
   "scripts": {

@@ -169,30 +169,33 @@ test('Gemma 4 supports multi-turn conversation with KV cache', {
     const systemMsg = { role: 'system', content: 'You are a helpful assistant. Answer concisely with just the city name.' }
     const userTurn1 = { role: 'user', content: 'What is the capital of France?' }
 
-    const prompt1 = [
-      { role: 'session', content: sessionName },
-      systemMsg,
-      userTurn1
-    ]
-    const response1 = await addon.run(prompt1)
+    // Cache control is a runOption (cacheKey), NOT a `{ role: 'session' }`
+    // chat message — the latter was removed in v0.15.0 and is silently dropped
+    // by Jinja chat templates that have no matching elif branch.
+    const prompt1 = [systemMsg, userTurn1]
+    const response1 = await addon.run(prompt1, { cacheKey: sessionName })
     const output1 = await collectResponse(response1)
     t.ok(output1.length > 0, `first turn produced output (${output1.length} chars)`)
     const lowerOutput1 = output1.toLowerCase()
     t.ok(/paris/.test(lowerOutput1), `first turn mentions Paris: "${output1.slice(0, 100)}"`)
+    t.ok(response1.stats?.CacheTokens > 0, `first turn populated KV cache (CacheTokens=${response1.stats?.CacheTokens})`)
 
     const prompt2 = [
-      { role: 'session', content: sessionName },
       systemMsg,
       userTurn1,
       { role: 'assistant', content: output1 },
       { role: 'user', content: 'And what about Germany?' }
     ]
-    const response2 = await addon.run(prompt2)
+    const response2 = await addon.run(prompt2, { cacheKey: sessionName })
     const output2 = await collectResponse(response2)
     t.ok(output2.length > 0, `second turn produced output (${output2.length} chars)`)
     const lowerOutput2 = output2.toLowerCase()
     t.ok(/berlin/.test(lowerOutput2), `second turn mentions Berlin: "${output2.slice(0, 100)}"`)
     t.ok(output2 !== output1, 'second turn produced different output from first')
+    t.ok(
+      response2.stats?.CacheTokens > response1.stats?.CacheTokens,
+      `second turn extended the KV cache from turn 1 (${response1.stats?.CacheTokens} -> ${response2.stats?.CacheTokens})`
+    )
   } finally {
     await addon.unload().catch(() => {})
   }

@@ -171,30 +171,33 @@ test('Qwen3.5-0.8B supports multi-turn conversation with KV cache', {
     const systemMsg = { role: 'system', content: 'You are a helpful assistant. Answer concisely with just the city name.' }
     const userTurn1 = { role: 'user', content: 'What is the capital of France?' }
 
-    const prompt1 = [
-      { role: 'session', content: sessionName },
-      systemMsg,
-      userTurn1
-    ]
-    const response1 = await addon.run(prompt1)
+    // Cache control is a runOption (cacheKey), NOT a `{ role: 'session' }`
+    // chat message — the latter was removed in v0.15.0 and is silently dropped
+    // by Jinja chat templates that have no matching elif branch.
+    const prompt1 = [systemMsg, userTurn1]
+    const response1 = await addon.run(prompt1, { cacheKey: sessionName })
     const output1 = await collectResponse(response1)
     t.ok(output1.length > 0, `first turn produced output (${output1.length} chars)`)
     const lowerOutput1 = output1.toLowerCase()
     t.ok(/paris/.test(lowerOutput1), `first turn mentions Paris: "${output1.slice(0, 100)}"`)
+    t.ok(response1.stats?.CacheTokens > 0, `first turn populated KV cache (CacheTokens=${response1.stats?.CacheTokens})`)
 
     const prompt2 = [
-      { role: 'session', content: sessionName },
       systemMsg,
       userTurn1,
       { role: 'assistant', content: output1 },
       { role: 'user', content: 'And what about Germany?' }
     ]
-    const response2 = await addon.run(prompt2)
+    const response2 = await addon.run(prompt2, { cacheKey: sessionName })
     const output2 = await collectResponse(response2)
     t.ok(output2.length > 0, `second turn produced output (${output2.length} chars)`)
     const lowerOutput2 = output2.toLowerCase()
     t.ok(/berlin/.test(lowerOutput2), `second turn mentions Berlin: "${output2.slice(0, 100)}"`)
     t.ok(output2 !== output1, 'second turn produced different output from first')
+    t.ok(
+      response2.stats?.CacheTokens > response1.stats?.CacheTokens,
+      `second turn extended the KV cache from turn 1 (${response1.stats?.CacheTokens} -> ${response2.stats?.CacheTokens})`
+    )
   } finally {
     await addon.unload().catch(() => {})
   }

@@ -1,5 +1,6 @@
 #include <filesystem>
 #include <string>
+#include <string_view>
 #include <unordered_map>
 
 #include <gtest/gtest.h>
@@ -39,6 +40,32 @@ TEST_F(ChatTemplateUtilsTest, IsQwen3ModelWithNullptr) {
   EXPECT_FALSE(isQwen3Model(nullptr));
 }
 
+TEST_F(ChatTemplateUtilsTest, IsMedPsyModelWithNullptr) {
+  EXPECT_FALSE(isMedPsyModel(nullptr));
+}
+
+TEST_F(ChatTemplateUtilsTest, IsMedPsyBasenameEmpty) {
+  EXPECT_FALSE(isMedPsyBasename(std::string_view{}));
+  EXPECT_FALSE(isMedPsyBasename(""));
+}
+
+TEST_F(ChatTemplateUtilsTest, IsMedPsyBasenameExactMatch) {
+  EXPECT_TRUE(isMedPsyBasename("MedPsy"));
+}
+
+TEST_F(ChatTemplateUtilsTest, IsMedPsyBasenameCaseInsensitive) {
+  EXPECT_TRUE(isMedPsyBasename("medpsy"));
+  EXPECT_TRUE(isMedPsyBasename("MEDPSY"));
+  EXPECT_TRUE(isMedPsyBasename("MedPSY"));
+}
+
+TEST_F(ChatTemplateUtilsTest, IsMedPsyBasenameRejectsOtherNames) {
+  EXPECT_FALSE(isMedPsyBasename("Qwen3"));
+  EXPECT_FALSE(isMedPsyBasename("Llama-3.1"));
+  EXPECT_FALSE(isMedPsyBasename("MedPsy-7B"));
+  EXPECT_FALSE(isMedPsyBasename("NotMedPsy"));
+}
+
 TEST_F(
     ChatTemplateUtilsTest, SupportsToolsCompactForModelMetadataByArchitecture) {
   EXPECT_TRUE(supportsToolsCompactForModelMetadata(std::string("qwen3")));