Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -160,3 +160,4 @@ a.out.*

AGENTS.local.md
.pi/SYSTEM.md
llama_cpp_windows.zip
104 changes: 104 additions & 0 deletions README-deepseek-ocr.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# DeepSeek-OCR Server

## Quick start

```bash
./build/bin/llama-server \
-m "/path/to/deepseek-ocr-q4_k_m.gguf" \
--mmproj "/path/to/mmproj-deepseek-ocr-f16.gguf" \
--temp 0 --flash-attn off \
--chat-template deepseek-ocr \
-ngl 0 -c 2048 --host 0.0.0.0 --port 8000
```

Flags:
- `-ngl 0` — CPU only (required on GPUs with <6GB VRAM)
- `--flash-attn off` — avoids CUDA OOM on low-VRAM GPUs
- `--chat-template deepseek-ocr` — enables the correct prompt format
- `--mmproj-gpu false` — optional, not part of the command above: also forces mmproj onto the CPU; add it if you still get OOM with the flags above

## API: `/v1/chat/completions`

```
POST http://localhost:8000/v1/chat/completions
Content-Type: application/json
```

### Request

```json
{
"model": "deepseek-ocr",
"max_tokens": 512,
"temperature": 0,
"messages": [
{
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<base64>"}},
{"type": "text", "text": "<|grounding|>Convert the document to markdown."}
]
}
]
}
```

### Response

```json
{
"choices": [{
"message": {
"content": "text[[76, 149, 945, 288]]\n<ocr text here>\nequation[[104, 299, 691, 351]]\n\\[latex...\\]"
}
}],
"usage": {
"prompt_tokens": 277,
"completion_tokens": 128,
"total_tokens": 405
}
}
```

## Tips

**VRAM issues:** The Q4_K_M model + F16 mmproj needs ~2.7GB + compute buffers. On 4GB GPUs (RTX 3050), use `-ngl 0`. For partial GPU offload try `-ngl 12 --flash-attn on -c 1024`.

**Prompt prefix:** Always include `<|grounding|>` in the text to get document OCR mode. Without it the model may behave like a generic chatbot.

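**Output tags:** In CLI mode the model emits bounding boxes as `<|ref|>type<|/ref|><|det|>[x1,y1,x2,y2]<|/det|>`. Through the server, the `<|ref|>`/`<|det|>` wrappers may be stripped, leaving `type[[x1, y1, x2, y2]]` as in the response example above; whether this happens depends on the chat template prefix.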

## Python example

```python
import base64

import requests

# Read the page image and base64-encode it so it can be sent inline as a data: URL.
with open("document.png", "rb") as f:
    b64 = base64.b64encode(f.read()).decode()

# One user message: the image first, then the OCR instruction text.
r = requests.post("http://localhost:8000/v1/chat/completions", json={
    "model": "deepseek-ocr",
    "max_tokens": 512,
    "temperature": 0,
    "messages": [{
        "role": "user",
        "content": [
            {"type": "image_url",
             "image_url": {"url": f"data:image/png;base64,{b64}"}},
            {"type": "text",
             "text": "<|grounding|>Convert the document to markdown."}
        ]
    }]
})
# Surface HTTP errors (e.g. "500 failed to process image") here instead of
# failing later with a KeyError on "choices".
r.raise_for_status()

print(r.json()["choices"][0]["message"]["content"])
```

## Troubleshooting

| Error | Fix |
|-------|-----|
| `500 failed to process image` | GPU OOM — add `-ngl 0 --mmproj-gpu false` |
| `number of bitmaps (1) does not match number of markers (0)` | Missing `--chat-template deepseek-ocr` flag |
| `GGML_ASSERT(batch.n_tokens > 0)` | Outdated build — rebuild with latest patches |
| Output is `<__media__><\|grounding\|>...` literal text | Missing `--chat-template deepseek-ocr` flag |
21 changes: 18 additions & 3 deletions common/chat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2474,10 +2474,25 @@ static common_chat_params common_chat_templates_apply_legacy(const struct common
}

// Render the chat described by `inputs` using the templates held in `tmpls`.
//
// Dispatch:
//   - inputs.use_jinja == false                     -> legacy template path
//   - inputs.use_jinja == true, but the template
//     source is a bare built-in template name       -> legacy template path
//     (the legacy path resolves built-in templates by name)
//   - otherwise                                     -> Jinja path
//
// `tmpls` must not be null (asserted).
common_chat_params common_chat_templates_apply(const struct common_chat_templates * tmpls,
                                               const struct common_chat_templates_inputs & inputs) {
    GGML_ASSERT(tmpls != nullptr);

    if (!inputs.use_jinja) {
        return common_chat_templates_apply_legacy(tmpls, inputs);
    }

    const auto src = common_chat_templates_source(tmpls);

    // A bare template *name* contains no Jinja delimiters. Only probe the legacy
    // detector for such sources, so that a real Jinja template is never diverted
    // to the legacy path when the caller asked for Jinja.
    // NOTE(review): llama_chat_apply_template is understood to recognise known
    // templates from their content as well as by name - confirm against llama.h.
    const bool has_jinja_syntax = src.find("{{") != std::string::npos ||
                                  src.find("{%") != std::string::npos;

    if (!src.empty() && !has_jinja_syntax) {
        // Probe with an empty chat and no output buffer: a result >= 0 means the
        // legacy engine accepts `src` as one of its templates.
        const int32_t probe_res = llama_chat_apply_template(src.c_str(), nullptr, 0, false, nullptr, 0);
        if (probe_res >= 0) {
            return common_chat_templates_apply_legacy(tmpls, inputs);
        }
    }

    return common_chat_templates_apply_jinja(tmpls, inputs);
}

common_chat_msg common_chat_parse(const std::string & input,
Expand Down
12 changes: 11 additions & 1 deletion tools/mtmd/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2608,6 +2608,13 @@ struct clip_model_loader {
// alloc memory and offload data
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend);
ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft));
if (!ctx_clip.buf && ctx_clip.backend != ctx_clip.backend_cpu) {
LOG_WRN("%s: WARNING: failed to allocate tensors on %s, falling back to CPU\n", __func__, ggml_backend_name(ctx_clip.backend));
ctx_clip.backend = ctx_clip.backend_cpu;
buft = ggml_backend_get_default_buffer_type(ctx_clip.backend);
ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft));
}
GGML_ASSERT(ctx_clip.buf && "failed to allocate tensors on any backend");
ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
for (auto & t : tensors_to_load) {
ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name);
Expand Down Expand Up @@ -3341,7 +3348,10 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// build the inference graph
ggml_backend_sched_reset(ctx->sched.get());
ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
if (!ggml_backend_sched_alloc_graph(ctx->sched.get(), gf)) {
LOG_ERR("%s: failed to allocate compute graph (OOM)\n", __func__);
return false;
}

// set inputs
const auto & model = ctx->model;
Expand Down
3 changes: 2 additions & 1 deletion tools/server/server-common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,8 @@ const char * get_media_marker() {
if (env && env[0] != '\0') {
return std::string(env);
}
return std::string("<__media_") + random_string() + "__>";
// must match mtmd_default_marker() so the tokenizer can split on it
return std::string(mtmd_default_marker());
}();
return marker.c_str();
}
Expand Down
53 changes: 50 additions & 3 deletions tools/server/server-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -823,7 +823,13 @@ struct server_context_impl {
if (!mmproj_path.empty()) {
mtmd_context_params mparams = mtmd_context_params_default();

mparams.use_gpu = params_base.mmproj_use_gpu;
// if user explicitly sets n_gpu_layers to 0, disable mmproj GPU too
bool mmproj_use_gpu = params_base.mmproj_use_gpu;
if (mmproj_use_gpu && params_base.n_gpu_layers == 0) {
LOG_INF("%s: n_gpu_layers=0, disabling mmproj GPU\n", __func__);
mmproj_use_gpu = false;
}
mparams.use_gpu = mmproj_use_gpu;
mparams.print_timings = false;
mparams.n_threads = params_base.cpuparams.n_threads;
mparams.flash_attn_type = params_base.flash_attn_type;
Expand Down Expand Up @@ -2751,17 +2757,19 @@ struct server_context_impl {
n_swa > 0);

bool has_mtmd = false;
bool slot_released = false;

// check if we should process the image
while (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) {
while (!slot_released && slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) {
// process the image
size_t n_tokens_out = 0;
int32_t res = input_tokens.process_chunk(ctx_tgt, mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out);
if (res != 0) {
SLT_ERR(slot, "failed to process image, res = %d\n", res);
send_error(slot, "failed to process image", ERROR_TYPE_SERVER);
slot.release();
continue;
slot_released = true;
break;
}

if (ctx_dft) {
Expand All @@ -2785,6 +2793,11 @@ struct server_context_impl {
has_mtmd = true;
}

if (slot_released) {
// released inside mtmd loop, skip the rest
continue;
}

// add prompt tokens for processing in the current batch
while (slot.prompt.n_tokens() < slot.task->n_tokens() && batch.n_tokens < n_batch) {
// get next token to process
Expand Down Expand Up @@ -2842,6 +2855,40 @@ struct server_context_impl {
if (slot.prompt.n_tokens() == slot.task->n_tokens()) {
slot.state = SLOT_STATE_DONE_PROMPT;

// If the prompt ended with mtmd chunks and no text tokens remain,
// add the last valid text token to the batch to produce generation logits
if (batch.n_tokens == 0) {
GGML_ASSERT(has_mtmd);
// find the last non-NULL token from the processed prompt
bool found = false;
for (int i = (int)slot.prompt.tokens.size() - 1; i >= 0; i--) {
if (slot.prompt.tokens[i] != LLAMA_TOKEN_NULL) {
common_batch_add(batch,
slot.prompt.tokens[i],
slot.prompt.tokens.pos_next(),
{ slot.id },
slot.task->need_embd());
slot.prompt.tokens.push_back(slot.prompt.tokens[i]);
found = true;
break;
}
}
if (!found) {
// image-only prompt (no text tokens at all)
// use BOS to produce logits so sampling can proceed
const auto bos = llama_vocab_bos(vocab);
if (bos != LLAMA_TOKEN_NULL) {
common_batch_add(batch,
bos,
slot.prompt.tokens.pos_next(),
{ slot.id },
slot.task->need_embd());
slot.prompt.tokens.push_back(bos);
}
}
SRV_DBG("slot %12.*s: id %2d | task %d | prompt ended with mtmd, added synthetic token to batch\n", 12, __func__, slot.id, slot.task ? slot.task->id : -1);
}

GGML_ASSERT(batch.n_tokens > 0);

// extract the logits only for the last token
Expand Down