From df7561db8d369c07f33e353c21aafb275112b175 Mon Sep 17 00:00:00 2001
From: olyasir <sirkinolya@gmail.com>
Date: Mon, 9 Mar 2026 12:57:11 +0200
Subject: [PATCH 1/4] test: add LightON OCR-2 integration test

Add integration test for LightON OCR-2 vision-language model OCR.
Uses Q4_K_M LLM + F16 mmproj with newspaper document image.
Validates text extraction with keyword matching.
---
 .../test/integration/ocr-lighton.test.js      | 143 ++++++++++++++++++
 1 file changed, 143 insertions(+)
 create mode 100644 packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js

diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js
new file mode 100644
index 0000000000..3121be0e91
--- /dev/null
+++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js
@@ -0,0 +1,143 @@
+'use strict'
+// test/integration/ocr-lighton.test.js
+const test = require('brittle')
+const fs = require('bare-fs')
+const path = require('bare-path')
+const { ensureModel, getMediaPath } = require('./utils')
+const FilesystemDL = require('@qvac/dl-filesystem')
+const LlmLlamacpp = require('../../index.js')
+const os = require('bare-os')
+
+const platform = os.platform()
+const arch = os.arch()
+const isDarwinX64 = platform === 'darwin' && arch === 'x64'
+const isLinuxArm64 = platform === 'linux' && arch === 'arm64'
+const isMobile = platform === 'ios' || platform === 'android'
+
+const useCpu = isDarwinX64 || isLinuxArm64
+
+const LIGHTON_OCR_CONFIG = {
+  llmModel: {
+    modelName: 'LightOnOCR-2-1B-ocr-soup-Q4_K_M.gguf',
+    downloadUrl: 'https://huggingface.co/noctrex/LightOnOCR-2-1B-ocr-soup-GGUF/resolve/main/LightOnOCR-2-1B-ocr-soup-Q4_K_M.gguf'
+  },
+  projModel: {
+    modelName: 'mmproj-LightOnOCR-2-F16.gguf',
+    downloadUrl: 'https://huggingface.co/noctrex/LightOnOCR-2-1B-ocr-soup-GGUF/resolve/main/mmproj-F16.gguf'
+  },
+  ctx_size: '4096'
+}
+
+const TEST_CONSTANTS = {
+  timeout: 900_000, // 15 minutes — image encoding is slow (~30s+ on mobile)
+  maxTokens: '2048'
+}
+
+const DEVICE_CONFIGS = isMobile
+  ? [{ id: 'cpu', device: 'cpu' }, { id: 'gpu', device: 'gpu' }]
+  : useCpu
+    ? [{ id: 'cpu', device: 'cpu' }]
+    : [{ id: 'gpu', device: 'gpu' }]
+
+function getConfig (device) {
+  return {
+    gpu_layers: '98',
+    temp: '0.1',
+    verbosity: '2',
+    device,
+    ctx_size: LIGHTON_OCR_CONFIG.ctx_size,
+    predict: TEST_CONSTANTS.maxTokens
+  }
+}
+
+async function setupLightOnInference (t, device = 'gpu') {
+  // Ensures both model files exist, builds an inference instance for `device`, registers teardown on `t`, then loads it.
+  const [modelName, dirPath] = await ensureModel(LIGHTON_OCR_CONFIG.llmModel)
+  t.ok(fs.existsSync(path.join(dirPath, modelName)), 'LLM model file should exist')
+  const [projModelName] = await ensureModel(LIGHTON_OCR_CONFIG.projModel)
+  t.ok(fs.existsSync(path.join(dirPath, projModelName)), 'Projection model file should exist')
+
+  const loader = new FilesystemDL({ dirPath })
+  const inference = new LlmLlamacpp({
+    modelName,
+    loader,
+    logger: console,
+    diskPath: dirPath,
+    projectionModel: projModelName
+  }, getConfig(device))
+  // Release in reverse order of setup: unload the model first, then close the loader it was constructed with.
+  t.teardown(async () => {
+    await inference.unload()
+    await loader.close()
+  })
+
+  await inference.load()
+
+  return { inference, loader }
+}
+
+async function runOcr (inference, imageFilePath) {
+  // Runs one OCR request: image bytes + a fixed prompt; returns the joined streamed text and start/end timestamps.
+  const imageBytes = new Uint8Array(fs.readFileSync(imageFilePath))
+  const messages = [
+    { role: 'user', type: 'media', content: imageBytes },
+    { role: 'user', content: 'Extract all text from this image and format it as markdown.' }
+  ]
+
+  const startTime = Date.now()
+  const response = await inference.run(messages)
+  const generatedText = []
+  let error = null
+
+  response.onUpdate(data => {
+    generatedText.push(data)
+  }).onError(err => {
+    error = err
+  })
+
+  await response.await()
+  // Keep the original error as `cause` so its stack and fields survive the rethrow; the message is unchanged.
+  if (error) {
+    throw new Error('Inference error: ' + error, { cause: error })
+  }
+
+  return {
+    generatedText: generatedText.join(''),
+    startTime,
+    endTime: Date.now()
+  }
+}
+
+// Integration test: for each entry in DEVICE_CONFIGS, load the model, OCR news-paper.jpg and check the output
+test('LightON OCR-2 can extract text from document image', { timeout: TEST_CONSTANTS.timeout }, async t => {
+  for (const deviceConfig of DEVICE_CONFIGS) {
+    const label = `[${deviceConfig.id.toUpperCase()}]`
+
+    const { inference } = await setupLightOnInference(t, deviceConfig.device)
+
+    // Input is the newspaper image resolved through getMediaPath; fail early if it is missing
+    const imageFilePath = getMediaPath('news-paper.jpg')
+    t.ok(fs.existsSync(imageFilePath), `${label} news-paper.jpg image file should exist`)
+
+    const { generatedText, startTime, endTime } = await runOcr(inference, imageFilePath)
+    const totalTime = endTime - startTime
+
+    t.comment(`${label} Generated text (${generatedText.length} chars): ${generatedText.substring(0, 500)}...`)
+    t.comment(`${label} Total time: ${(totalTime / 1000).toFixed(2)}s`)
+
+    // Assert output is non-empty
+    t.ok(generatedText.length > 0, `${label} Should generate OCR output`)
+
+    // Pass if at least ONE expected keyword (Titanic headline) appears, case-insensitively; not all are required
+    const lowerText = generatedText.toLowerCase()
+    const expectedKeywords = ['titanic', 'new york', 'iceberg']
+    const foundKeywords = expectedKeywords.filter(kw => lowerText.includes(kw))
+
+    t.ok(
+      foundKeywords.length >= 1,
+      `${label} OCR output should contain at least one expected keyword. ` +
+      `Found: ${foundKeywords.join(', ') || 'none'}. ` +
+      `Expected any of: ${expectedKeywords.join(', ')}`
+    )
+  }
+})

From 1c3a5a1348ad5cb2dc0c08a113880616b2fb1c1d Mon Sep 17 00:00:00 2001
From: olyasir <sirkinolya@gmail.com>
Date: Mon, 9 Mar 2026 15:09:58 +0200
Subject: [PATCH 2/4] fix: increase OCR test timeout and add mobile test entry

- Increase timeout from 15min to 30min on all platforms; needed on darwin-x64 (model download + slow inference)
- Add ocr-lighton.test.js to mobile integration.auto.cjs
---
 .../test/integration/ocr-lighton.test.js                      | 2 +-
 .../test/mobile/integration.auto.cjs                          | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js
index 3121be0e91..e49f17c0b0 100644
--- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js
+++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js
@@ -29,7 +29,7 @@ const LIGHTON_OCR_CONFIG = {
 }
 
 const TEST_CONSTANTS = {
-  timeout: 900_000, // 15 minutes — image encoding is slow (~30s+ on mobile)
+  timeout: 1_800_000, // 30 minutes — model download (~1.2GB) + slow image encoding on Intel Macs
   maxTokens: '2048'
 }
 
diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/mobile/integration.auto.cjs b/packages/qvac-lib-infer-llamacpp-llm/test/mobile/integration.auto.cjs
index 15117862e4..ff58849a4c 100644
--- a/packages/qvac-lib-infer-llamacpp-llm/test/mobile/integration.auto.cjs
+++ b/packages/qvac-lib-infer-llamacpp-llm/test/mobile/integration.auto.cjs
@@ -34,6 +34,10 @@ async function runMultiInstanceTest (options = {}) { // eslint-disable-line no-u
   return runIntegrationModule('../integration/multi-instance.test.js', options)
 }
 
+async function runOcrLightonTest (options = {}) { // eslint-disable-line no-unused-vars
+  return runIntegrationModule('../integration/ocr-lighton.test.js', options)
+}
+
 async function runReasoningTest (options = {}) { // eslint-disable-line no-unused-vars
   return runIntegrationModule('../integration/reasoning.test.js', options)
 }

From 797a52afa5fbc5df0a65f3ae3ef3f160879fee67 Mon Sep 17 00:00:00 2001
From: olyasir <sirkinolya@gmail.com>
Date: Mon, 9 Mar 2026 19:00:08 +0200
Subject: [PATCH 3/4] fix: use cpu-only for LightOn OCR test on mobile

---
 .../test/integration/ocr-lighton.test.js                  | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js
index e49f17c0b0..9adf88567a 100644
--- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js
+++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js
@@ -33,11 +33,9 @@ const TEST_CONSTANTS = {
   maxTokens: '2048'
 }
 
-const DEVICE_CONFIGS = isMobile
-  ? [{ id: 'cpu', device: 'cpu' }, { id: 'gpu', device: 'gpu' }]
-  : useCpu
-    ? [{ id: 'cpu', device: 'cpu' }]
-    : [{ id: 'gpu', device: 'gpu' }]
+const DEVICE_CONFIGS = (isMobile || useCpu)
+  ? [{ id: 'cpu', device: 'cpu' }]
+  : [{ id: 'gpu', device: 'gpu' }]
 
 function getConfig (device) {
   return {

From d00366fdfd58468eb74035446b76fe89fc7c31cc Mon Sep 17 00:00:00 2001
From: olyasir <sirkinolya@gmail.com>
Date: Mon, 9 Mar 2026 20:40:02 +0200
Subject: [PATCH 4/4] doc: add LightOn OCR-2 vision-language usage
 documentation

---
 .../qvac-lib-infer-llamacpp-llm/README.md     | 74 +++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/packages/qvac-lib-infer-llamacpp-llm/README.md b/packages/qvac-lib-infer-llamacpp-llm/README.md
index 3cb0cc3266..dcc668d07d 100644
--- a/packages/qvac-lib-infer-llamacpp-llm/README.md
+++ b/packages/qvac-lib-infer-llamacpp-llm/README.md
@@ -285,6 +285,80 @@ npm run quickstart
 -   [Native Logging](./examples/nativelog.js) – Demonstrates C++ addon logging integration.
 -   [Tool Calling](./examples/toolCalling.js) – Demonstrates tool calling capabilities.
 
+## OCR with Vision-Language Models
+
+In addition to ONNX-based OCR (`@qvac/ocr-onnx`), you can use vision-language models through `@qvac/llm-llamacpp` for OCR tasks. This is useful for structured document understanding (tables, forms, multi-column layouts) where traditional OCR pipelines struggle.
+
+### Supported OCR Models
+
+| Model | Params | Quantization | Description |
+|-------|--------|-------------|-------------|
+| LightON OCR-2 1B | 0.6B (LLM) + ~550M (vision) | Q4_K_M | OCR-specialized, full-page transcription, 11 languages |
+| SmolVLM2-500M | 500M | Q8_0 | General vision-language, can follow targeted extraction prompts |
+
+### LightON OCR-2
+
+[LightON OCR-2](https://huggingface.co/noctrex/LightOnOCR-2-1B-ocr-soup-GGUF) is an OCR-specialized vision-language model (Apache 2.0) that produces detailed markdown/HTML output with tables. It supports 11 languages: English, French, German, Spanish, Italian, Dutch, Portuguese, Polish, Romanian, Czech, and Swedish.
+
+**Characteristics:**
+- Always does full-page transcription regardless of prompt
+- Produces detailed structured output (markdown tables, HTML)
+- Requires `--jinja` flag / jinja chat template in llama.cpp
+- Requires both LLM model and F16 mmproj (vision projector)
+
+**Performance (Pixel 10 Pro, CPU-only, Q4_K_M + F16 mmproj):**
+- Image encode: ~30s (768x1024 image)
+- Prompt eval: 26.6 t/s
+- Generation: 4.14 t/s
+
+**Usage Example:**
+
+```js
+const LlmLlamacpp = require('@qvac/llm-llamacpp')
+const FilesystemDL = require('@qvac/dl-filesystem')
+const fs = require('bare-fs')
+
+const dirPath = './models'
+const loader = new FilesystemDL({ dirPath })
+
+const model = new LlmLlamacpp({
+  modelName: 'LightOnOCR-2-1B-ocr-soup-Q4_K_M.gguf',
+  loader,
+  logger: console,
+  diskPath: dirPath,
+  projectionModel: 'mmproj-F16.gguf'
+}, {
+  device: 'cpu',
+  gpu_layers: '0',
+  ctx_size: '4096',
+  temp: '0.1',
+  predict: '2048'
+})
+
+await model.load()
+
+const imageBytes = new Uint8Array(fs.readFileSync('./document.png'))
+
+const messages = [
+  { role: 'user', type: 'media', content: imageBytes },
+  { role: 'user', content: 'Extract all text from this image and format it as markdown.' }
+]
+
+const response = await model.run(messages)
+const output = []
+
+response.onUpdate(token => {
+  output.push(token)
+})
+
+await response.await()
+
+console.log(output.join(''))
+
+await model.unload()
+await loader.close()
+```
+
 ## Architecture
 
 See [docs/](./docs) for a detailed explanation of the architecture and data flow logic.