From df7561db8d369c07f33e353c21aafb275112b175 Mon Sep 17 00:00:00 2001 From: olyasir Date: Mon, 9 Mar 2026 12:57:11 +0200 Subject: [PATCH 1/4] test: add LightON OCR-2 integration test Add integration test for LightON OCR-2 vision-language model OCR. Uses Q4_K_M LLM + F16 mmproj with newspaper document image. Validates text extraction with keyword matching. --- .../test/integration/ocr-lighton.test.js | 143 ++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100644 packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js new file mode 100644 index 0000000000..3121be0e91 --- /dev/null +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js @@ -0,0 +1,143 @@ +'use strict' +// test/integration/ocr-lighton.test.js +const test = require('brittle') +const fs = require('bare-fs') +const path = require('bare-path') +const { ensureModel, getMediaPath } = require('./utils') +const FilesystemDL = require('@qvac/dl-filesystem') +const LlmLlamacpp = require('../../index.js') +const os = require('bare-os') + +const platform = os.platform() +const arch = os.arch() +const isDarwinX64 = platform === 'darwin' && arch === 'x64' +const isLinuxArm64 = platform === 'linux' && arch === 'arm64' +const isMobile = platform === 'ios' || platform === 'android' + +const useCpu = isDarwinX64 || isLinuxArm64 + +const LIGHTON_OCR_CONFIG = { + llmModel: { + modelName: 'LightOnOCR-2-1B-ocr-soup-Q4_K_M.gguf', + downloadUrl: 'https://huggingface.co/noctrex/LightOnOCR-2-1B-ocr-soup-GGUF/resolve/main/LightOnOCR-2-1B-ocr-soup-Q4_K_M.gguf' + }, + projModel: { + modelName: 'mmproj-LightOnOCR-2-F16.gguf', + downloadUrl: 'https://huggingface.co/noctrex/LightOnOCR-2-1B-ocr-soup-GGUF/resolve/main/mmproj-F16.gguf' + }, + ctx_size: '4096' +} + +const TEST_CONSTANTS = { + timeout: 900_000, // 15 minutes — image encoding is slow (~30s+ on mobile) + maxTokens: '2048' +} + +const DEVICE_CONFIGS = isMobile + ? [{ id: 'cpu', device: 'cpu' }, { id: 'gpu', device: 'gpu' }] + : useCpu + ? [{ id: 'cpu', device: 'cpu' }] + : [{ id: 'gpu', device: 'gpu' }] + +function getConfig (device) { + return { + gpu_layers: '98', + temp: '0.1', + verbosity: '2', + device, + ctx_size: LIGHTON_OCR_CONFIG.ctx_size, + predict: TEST_CONSTANTS.maxTokens + } +} + +async function setupLightOnInference (t, device = 'gpu') { + const [modelName, dirPath] = await ensureModel(LIGHTON_OCR_CONFIG.llmModel) + t.ok(fs.existsSync(path.join(dirPath, modelName)), 'LLM model file should exist') + + const [projModelName] = await ensureModel(LIGHTON_OCR_CONFIG.projModel) + t.ok(fs.existsSync(path.join(dirPath, projModelName)), 'Projection model file should exist') + + const loader = new FilesystemDL({ dirPath }) + const inference = new LlmLlamacpp({ + modelName, + loader, + logger: console, + diskPath: dirPath, + projectionModel: projModelName + }, getConfig(device)) + + t.teardown(async () => { + await loader.close() + await inference.unload() + }) + + await inference.load() + + return { inference, loader } +} + +async function runOcr (inference, imageFilePath) { + const imageBytes = new Uint8Array(fs.readFileSync(imageFilePath)) + + const messages = [ + { role: 'user', type: 'media', content: imageBytes }, + { role: 'user', content: 'Extract all text from this image and format it as markdown.' } + ] + + const startTime = Date.now() + const response = await inference.run(messages) + const generatedText = [] + let error = null + + response.onUpdate(data => { + generatedText.push(data) + }).onError(err => { + error = err + }) + + await response.await() + + if (error) { + throw new Error('Inference error: ' + error) + } + + return { + generatedText: generatedText.join(''), + startTime, + endTime: Date.now() + } +} + +// Test: LightON OCR-2 can extract text from a newspaper document image +test('LightON OCR-2 can extract text from document image', { timeout: TEST_CONSTANTS.timeout }, async t => { + for (const deviceConfig of DEVICE_CONFIGS) { + const label = `[${deviceConfig.id.toUpperCase()}]` + + const { inference } = await setupLightOnInference(t, deviceConfig.device) + + // Use the newspaper image — a small document with clear text + const imageFilePath = getMediaPath('news-paper.jpg') + t.ok(fs.existsSync(imageFilePath), `${label} news-paper.jpg image file should exist`) + + const { generatedText, startTime, endTime } = await runOcr(inference, imageFilePath) + const totalTime = endTime - startTime + + t.comment(`${label} Generated text (${generatedText.length} chars): ${generatedText.substring(0, 500)}...`) + t.comment(`${label} Total time: ${(totalTime / 1000).toFixed(2)}s`) + + // Assert output is non-empty + t.ok(generatedText.length > 0, `${label} Should generate OCR output`) + + // Assert key text from the newspaper is present (Titanic headline) + const lowerText = generatedText.toLowerCase() + const expectedKeywords = ['titanic', 'new york', 'iceberg'] + const foundKeywords = expectedKeywords.filter(kw => lowerText.includes(kw)) + + t.ok( + foundKeywords.length >= 1, + `${label} OCR output should contain at least one expected keyword. ` + + `Found: ${foundKeywords.join(', ') || 'none'}. ` + + `Expected any of: ${expectedKeywords.join(', ')}` + ) + } +}) From 1c3a5a1348ad5cb2dc0c08a113880616b2fb1c1d Mon Sep 17 00:00:00 2001 From: olyasir Date: Mon, 9 Mar 2026 15:09:58 +0200 Subject: [PATCH 2/4] fix: increase OCR test timeout and add mobile test entry - Increase timeout from 15min to 30min for darwin-x64 (model download + slow inference) - Add ocr-lighton.test.js to mobile integration.auto.cjs --- .../test/integration/ocr-lighton.test.js | 2 +- .../test/mobile/integration.auto.cjs | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js index 3121be0e91..e49f17c0b0 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js @@ -29,7 +29,7 @@ const LIGHTON_OCR_CONFIG = { } const TEST_CONSTANTS = { - timeout: 900_000, // 15 minutes — image encoding is slow (~30s+ on mobile) + timeout: 1_800_000, // 30 minutes — model download (~1.2GB) + slow image encoding on Intel Macs maxTokens: '2048' } diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/mobile/integration.auto.cjs b/packages/qvac-lib-infer-llamacpp-llm/test/mobile/integration.auto.cjs index 15117862e4..ff58849a4c 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/mobile/integration.auto.cjs +++ b/packages/qvac-lib-infer-llamacpp-llm/test/mobile/integration.auto.cjs @@ -34,6 +34,10 @@ async function runMultiInstanceTest (options = {}) { // eslint-disable-line no-u return runIntegrationModule('../integration/multi-instance.test.js', options) } +async function runOcrLightonTest (options = {}) { // eslint-disable-line no-unused-vars + return runIntegrationModule('../integration/ocr-lighton.test.js', options) +} + async function runReasoningTest (options = {}) { // eslint-disable-line no-unused-vars return runIntegrationModule('../integration/reasoning.test.js', options) } From 797a52afa5fbc5df0a65f3ae3ef3f160879fee67 Mon Sep 17 00:00:00 2001 From: olyasir Date: Mon, 9 Mar 2026 19:00:08 +0200 Subject: [PATCH 3/4] fix: use cpu-only for LightOn OCR test on mobile --- .../test/integration/ocr-lighton.test.js | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js index e49f17c0b0..9adf88567a 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js @@ -33,11 +33,9 @@ const TEST_CONSTANTS = { maxTokens: '2048' } -const DEVICE_CONFIGS = isMobile - ? [{ id: 'cpu', device: 'cpu' }, { id: 'gpu', device: 'gpu' }] - : useCpu - ? [{ id: 'cpu', device: 'cpu' }] - : [{ id: 'gpu', device: 'gpu' }] +const DEVICE_CONFIGS = (isMobile || useCpu) + ? [{ id: 'cpu', device: 'cpu' }] + : [{ id: 'gpu', device: 'gpu' }] function getConfig (device) { return { From d00366fdfd58468eb74035446b76fe89fc7c31cc Mon Sep 17 00:00:00 2001 From: olyasir Date: Mon, 9 Mar 2026 20:40:02 +0200 Subject: [PATCH 4/4] doc: add LightOn OCR-2 vision-language usage documentation --- .../qvac-lib-infer-llamacpp-llm/README.md | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/packages/qvac-lib-infer-llamacpp-llm/README.md b/packages/qvac-lib-infer-llamacpp-llm/README.md index 3cb0cc3266..dcc668d07d 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/README.md +++ b/packages/qvac-lib-infer-llamacpp-llm/README.md @@ -285,6 +285,80 @@ npm run quickstart - [Native Logging](./examples/nativelog.js) – Demonstrates C++ addon logging integration. - [Tool Calling](./examples/toolCalling.js) – Demonstrates tool calling capabilities. +## OCR with Vision-Language Models + +In addition to ONNX-based OCR (`@qvac/ocr-onnx`), you can use vision-language models through `@qvac/llm-llamacpp` for OCR tasks. This is useful for structured document understanding (tables, forms, multi-column layouts) where traditional OCR pipelines struggle. + +### Supported OCR Models + +| Model | Params | Quantization | Description | +|-------|--------|-------------|-------------| +| LightON OCR-2 1B | 0.6B (LLM) + ~550M (vision) | Q4_K_M | OCR-specialized, full-page transcription, 11 languages | +| SmolVLM2-500M | 500M | Q8_0 | General vision-language, can follow targeted extraction prompts | + +### LightON OCR-2 + +[LightON OCR-2](https://huggingface.co/noctrex/LightOnOCR-2-1B-ocr-soup-GGUF) is an OCR-specialized vision-language model (Apache 2.0) that produces detailed markdown/HTML output with tables. It supports 11 languages: English, French, German, Spanish, Italian, Dutch, Portuguese, Polish, Romanian, Czech, and Swedish. + +**Characteristics:** +- Always does full-page transcription regardless of prompt +- Produces detailed structured output (markdown tables, HTML) +- Requires `--jinja` flag / jinja chat template in llama.cpp +- Requires both LLM model and F16 mmproj (vision projector) + +**Performance (Pixel 10 Pro, CPU-only, Q4_K_M + F16 mmproj):** +- Image encode: ~30s (768x1024 image) +- Prompt eval: 26.6 t/s +- Generation: 4.14 t/s + +**Usage Example:** + +```js +const LlmLlamacpp = require('@qvac/llm-llamacpp') +const FilesystemDL = require('@qvac/dl-filesystem') +const fs = require('bare-fs') + +const dirPath = './models' +const loader = new FilesystemDL({ dirPath }) + +const model = new LlmLlamacpp({ + modelName: 'LightOnOCR-2-1B-ocr-soup-Q4_K_M.gguf', + loader, + logger: console, + diskPath: dirPath, + projectionModel: 'mmproj-F16.gguf' +}, { + device: 'cpu', + gpu_layers: '0', + ctx_size: '4096', + temp: '0.1', + predict: '2048' +}) + +await model.load() + +const imageBytes = new Uint8Array(fs.readFileSync('./document.png')) + +const messages = [ + { role: 'user', type: 'media', content: imageBytes }, + { role: 'user', content: 'Extract all text from this image and format it as markdown.' } +] + +const response = await model.run(messages) +const output = [] + +response.onUpdate(token => { + output.push(token) +}) + +await response.await() + +console.log(output.join('')) + +await model.unload() +await loader.close() +``` + ## Architecture See [docs/](./docs) for a detailed explanation of the architecture and data flow logic.