ggml-org · ngxson · May 20, 2026 · May 19, 2026 · May 19, 2026 · May 20, 2026
diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp
@@ -4,6 +4,7 @@
 
 #include <map>
 #include <sstream>
+#include <string_view>
 #include <algorithm>
 
 #if __cplusplus >= 202000L
@@ -25,6 +26,57 @@ static std::string trim(const std::string & str) {
     return str.substr(start, end - start);
 }
 
+// Moves every media marker found in `content` to the front, in original order and each followed by '\n', then appends the
+// remaining text with its leading/trailing '\n' stripped. Marker-first ordering is required by models like DeepSeek-OCR.
+static std::string media_markers_first(const std::string & content) {
+    // the mtmd marker is "<__media__>"
+    // the server marker pattern is "<__media_<hex>__>"; both share the prefix and suffix below
+    static constexpr std::string_view prefix = "<__media";
+    static constexpr std::string_view suffix = "__>";
+
+    std::string markers; // collected markers, one per line
+    std::string text;    // everything in content that is not part of a marker
+    text.reserve(content.size()); // text only receives characters taken from content
+
+    size_t i = 0;
+    while (i < content.size()) {
+        // check whether a marker starts at position i, then locate its end
+        if (content.compare(i, prefix.size(), prefix) == 0) { // a prefix truncated by end-of-string does not compare equal
+            const size_t e = content.find(suffix, i + prefix.size()); // NOTE(review): the first "__>" anywhere after the prefix closes the marker; the text in between is not validated
+            if (e != std::string::npos) {
+                const size_t end = e + suffix.size(); // one past the last character of the marker
+                markers.append(content, i, end - i);
+                markers.push_back('\n'); // add newline after each marker
+                i = end;
+                if (i < content.size() && content[i] == '\n') {
+                    ++i;  // skip a single newline directly after the marker
+                }
+                continue;
+            }
+        }
+        // no complete marker at i: copy the current character into text and advance
+        text.push_back(content[i++]);
+    }
+
+    // no markers found: return the input untouched, without any newline stripping
+    if (markers.empty()) {
+        return content;
+    }
+    // strip leading newlines from the remaining text
+    const size_t start = text.find_first_not_of('\n');
+    if (start == std::string::npos) {
+        // markers only: the remaining text is empty or consists solely of newlines
+        return markers;
+    }
+    text.erase(0, start);
+    // strip trailing newlines (other trailing whitespace is kept)
+    while (!text.empty() && text.back() == '\n') {
+        text.pop_back();
+    }
+
+    return markers + text;
+}
+
 static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "chatml",            LLM_CHAT_TEMPLATE_CHATML            },
     { "llama2",            LLM_CHAT_TEMPLATE_LLAMA_2           },
@@ -557,9 +609,10 @@ int32_t llm_chat_apply_template(
             ss << LU8("<｜Assistant｜>");
         }
     } else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_OCR) {
+        // DeepSeek-OCR expects "<image>\n<prompt>", but other callers
+        // (e.g., the Server WebUI) may put the image after the text.
         for (auto message : chat) {
-            // no template
-            ss << message->content;
+            ss << media_markers_first(message->content);
         }
     } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
         // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb

diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp
@@ -336,7 +336,7 @@ static common_chat_msg simple_msg(const std::string & role, const std::string &
 int main_automated_tests(void) {
     // jinja::enable_debug(true);
 
-    std::vector<llama_chat_message> conversation {
+    std::vector<llama_chat_message> common_conversation {
         {"system", "You are a helpful assistant"},
         {"user", "Hello"},
         {"assistant", "Hi there"},
@@ -355,6 +355,9 @@ int main_automated_tests(void) {
         std::string eos_token = "";
         bool supported_with_jinja = true;
         std::vector<llama_chat_message> extra_conversation = {};
+        // when set, overrides the default conversation for this test case;
+        // useful for testing tool calls and other features not covered by the default conversation
+        std::vector<llama_chat_message> conversation = common_conversation;
     };
     std::vector<TestCase> test_cases {
         {
@@ -625,6 +628,40 @@ int main_automated_tests(void) {
             /* .eos_token= */ "",
             /* .supported_with_jinja= */ true,
             /* .extra_conversation= */ {{"user", "What is the weather?"}, {"assistant_tool_call", "<tool_call>\n{\"name\": \"get_weather\", \"arguments\": {\"location\": \"NYC\"}}\n</tool_call>"}, {"tool_response", "{\"temperature\": 72}"}},
+        },
+        // DeepSeek-OCR needs markers before the prompt and is newline-sensitive
+        {
+            /* .name= */ "deepseek-ai/DeepSeek-OCR (WebUI Two Images)",
+            /* .template_str= */ "deepseek-ocr",
+            /* .expected_output= */ "<__media_a__>\n<__media_b__>\nFree OCR.",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+            /* .supported_with_jinja= */ false,
+            /* .extra_conversation= */ {},
+            /* .conversation= */ {{"user", "Free OCR.\n<__media_a__>\n<__media_b__>"}},
+        },
+        {
+            /* .name= */ "deepseek-ai/DeepSeek-OCR (mtmd-cli)",
+            /* .template_str= */ "deepseek-ocr",
+            /* .expected_output= */ "<__media__>\nFree OCR. ",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+            /* .supported_with_jinja= */ false,
+            /* .extra_conversation= */ {},
+            /* .conversation= */ {{"user", "<__media__>Free OCR. "}},
+        },
+        {
+            /* .name= */ "deepseek-ai/DeepSeek-OCR (newlines around marker)",
+            /* .template_str= */ "deepseek-ocr",
+            /* .expected_output= */ "<__media_a__>\nFree OCR.",
+            /* .expected_output_jinja= */ "",
+            /* .bos_token= */ "",
+            /* .eos_token= */ "",
+            /* .supported_with_jinja= */ false,
+            /* .extra_conversation= */ {},
+            /* .conversation= */ {{"user", "\n<__media_a__>\n\nFree OCR."}},
         }
     };
     std::vector<char> formatted_chat(1024);
@@ -642,13 +679,13 @@ int main_automated_tests(void) {
     }
 
     // test invalid chat template
-    res = llama_chat_apply_template("INVALID TEMPLATE", conversation.data(), conversation.size(), true, formatted_chat.data(), formatted_chat.size());
+    res = llama_chat_apply_template("INVALID TEMPLATE", common_conversation.data(), common_conversation.size(), true, formatted_chat.data(), formatted_chat.size());
     assert(res < 0);
     const auto add_generation_prompt = true;
 
     for (const auto & test_case : test_cases) {
         std::cout << "\n\n=== " << test_case.name << " ===\n\n";
-        auto conv = conversation;
+        auto conv = test_case.conversation;
         conv.insert(conv.end(), test_case.extra_conversation.begin(), test_case.extra_conversation.end());
         formatted_chat.resize(2048);
         res = llama_chat_apply_template(
@@ -670,18 +707,18 @@ int main_automated_tests(void) {
         }
     }
 
-    std::vector<common_chat_msg> messages;
-    messages.reserve(conversation.size());
-    for (const auto & msg : conversation) {
-        messages.push_back(simple_msg(msg.role, msg.content));
-    }
     for (const auto & test_case : test_cases) {
         if (!test_case.supported_with_jinja) {
             continue;
         }
         std::cout << "\n\n=== " << test_case.name << " (jinja) ===\n\n";
         try {
-            auto msgs = messages;
+            auto conv = test_case.conversation;
+            std::vector<common_chat_msg> msgs;
+            msgs.reserve(conv.size() + test_case.extra_conversation.size());
+            for (const auto & msg : conv) {
+                msgs.push_back(simple_msg(msg.role, msg.content));
+            }
             for (const auto & msg : test_case.extra_conversation) {
                 msgs.push_back(simple_msg(msg.role, msg.content));
             }

@@ -1510,9 +1510,7 @@ struct clip_model_loader {
                         hparams.image_size = 1024;
                         hparams.warmup_image_size = 1024;
                         hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
-                        hparams.image_pad_color[0] = hparams.image_mean[0];
-                        hparams.image_pad_color[1] = hparams.image_mean[1];
-                        hparams.image_pad_color[2] = hparams.image_mean[2];
+                        hparams.image_pad_color = {127, 127, 127};
 
                         get_u32(KEY_SAM_N_BLOCK, hparams.sam_n_layer, true);
                         get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true);