diff --git a/packages/qvac-lib-infer-llamacpp-llm/README.md b/packages/qvac-lib-infer-llamacpp-llm/README.md index 973f4b2dcf..29cb26f1f1 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/README.md +++ b/packages/qvac-lib-infer-llamacpp-llm/README.md @@ -292,6 +292,80 @@ npm run quickstart - [Native Logging](./examples/nativelog.js) – Demonstrates C++ addon logging integration. - [Tool Calling](./examples/toolCalling.js) – Demonstrates tool calling capabilities. +## OCR with Vision-Language Models + +In addition to ONNX-based OCR (`@qvac/ocr-onnx`), you can use vision-language models through `@qvac/llm-llamacpp` for OCR tasks. This is useful for structured document understanding (tables, forms, multi-column layouts) where traditional OCR pipelines struggle. + +### Supported OCR Models + +| Model | Params | Quantization | Description | +|-------|--------|-------------|-------------| +| LightON OCR-2 1B | 0.6B (LLM) + ~550M (vision) | Q4_K_M | OCR-specialized, full-page transcription, 11 languages | +| SmolVLM2-500M | 500M | Q8_0 | General vision-language, can follow targeted extraction prompts | + +### LightON OCR-2 + +[LightON OCR-2](https://huggingface.co/noctrex/LightOnOCR-2-1B-ocr-soup-GGUF) is an OCR-specialized vision-language model (Apache 2.0) that produces detailed markdown/HTML output with tables. It supports 11 languages: English, French, German, Spanish, Italian, Dutch, Portuguese, Polish, Romanian, Czech, and Swedish. 
+ +**Characteristics:** +- Always performs full-page transcription, regardless of the prompt +- Produces detailed structured output (markdown tables, HTML) +- Requires the Jinja chat template (the `--jinja` flag in llama.cpp) +- Requires both the LLM model and the F16 mmproj (vision projector) file + +**Performance (Pixel 10 Pro, CPU-only, Q4_K_M + F16 mmproj):** +- Image encode: ~30s (768x1024 image) +- Prompt eval: 26.6 tokens/s +- Generation: 4.14 tokens/s + +**Usage Example:** + +```js +const LlmLlamacpp = require('@qvac/llm-llamacpp') +const FilesystemDL = require('@qvac/dl-filesystem') +const fs = require('bare-fs') + +const dirPath = './models' +const loader = new FilesystemDL({ dirPath }) + +const model = new LlmLlamacpp({ + modelName: 'LightOnOCR-2-1B-ocr-soup-Q4_K_M.gguf', + loader, + logger: console, + diskPath: dirPath, + projectionModel: 'mmproj-F16.gguf' +}, { + device: 'cpu', + gpu_layers: '0', + ctx_size: '4096', + temp: '0.1', + predict: '2048' +}) + +await model.load() + +const imageBytes = new Uint8Array(fs.readFileSync('./document.png')) + +const messages = [ + { role: 'user', type: 'media', content: imageBytes }, + { role: 'user', content: 'Extract all text from this image and format it as markdown.' } +] + +const response = await model.run(messages) +const output = [] + +response.onUpdate(token => { + output.push(token) +}) + +await response.await() + +console.log(output.join('')) + +await model.unload() +await loader.close() +``` + ## Architecture See [docs/](./docs) for a detailed explanation of the architecture and data flow logic. 
diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js
new file mode 100644
index 0000000000..9adf88567a
--- /dev/null
+++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js
@@ -0,0 +1,170 @@
+'use strict'
+// test/integration/ocr-lighton.test.js
+const test = require('brittle')
+const fs = require('bare-fs')
+const path = require('bare-path')
+const { ensureModel, getMediaPath } = require('./utils')
+const FilesystemDL = require('@qvac/dl-filesystem')
+const LlmLlamacpp = require('../../index.js')
+const os = require('bare-os')
+
+const platform = os.platform()
+const arch = os.arch()
+const isDarwinX64 = platform === 'darwin' && arch === 'x64'
+const isLinuxArm64 = platform === 'linux' && arch === 'arm64'
+const isMobile = platform === 'ios' || platform === 'android'
+
+const useCpu = isDarwinX64 || isLinuxArm64
+
+const LIGHTON_OCR_CONFIG = {
+  llmModel: {
+    modelName: 'LightOnOCR-2-1B-ocr-soup-Q4_K_M.gguf',
+    downloadUrl: 'https://huggingface.co/noctrex/LightOnOCR-2-1B-ocr-soup-GGUF/resolve/main/LightOnOCR-2-1B-ocr-soup-Q4_K_M.gguf'
+  },
+  projModel: {
+    modelName: 'mmproj-LightOnOCR-2-F16.gguf',
+    downloadUrl: 'https://huggingface.co/noctrex/LightOnOCR-2-1B-ocr-soup-GGUF/resolve/main/mmproj-F16.gguf'
+  },
+  ctx_size: '4096'
+}
+
+const TEST_CONSTANTS = {
+  timeout: 1_800_000, // 30 minutes — model download (~1.2GB) + slow image encoding on Intel Macs
+  maxTokens: '2048'
+}
+
+const DEVICE_CONFIGS = (isMobile || useCpu)
+  ? [{ id: 'cpu', device: 'cpu' }]
+  : [{ id: 'gpu', device: 'gpu' }]
+
+/**
+ * Builds the settings object passed as the second argument to the LlmLlamacpp constructor.
+ *
+ * @param {string} device - Device identifier; this file passes 'cpu' or 'gpu'.
+ * @returns {object} Settings with `device` set as given and the remaining values fixed.
+ */
+function getConfig (device) {
+  return {
+    gpu_layers: '98',
+    temp: '0.1',
+    verbosity: '2',
+    device,
+    ctx_size: LIGHTON_OCR_CONFIG.ctx_size,
+    predict: TEST_CONSTANTS.maxTokens
+  }
+}
+
+/**
+ * Ensures both model files exist on disk, then constructs and loads an LlmLlamacpp instance.
+ * A teardown hook is registered on `t` before load() is called.
+ *
+ * @param {object} t - brittle test handle used for assertions and teardown registration.
+ * @param {string} [device='gpu'] - Device identifier forwarded to getConfig().
+ * @returns {Promise<{inference: object, loader: object}>} The loaded model and its loader.
+ */
+async function setupLightOnInference (t, device = 'gpu') {
+  const [modelName, dirPath] = await ensureModel(LIGHTON_OCR_CONFIG.llmModel)
+  t.ok(fs.existsSync(path.join(dirPath, modelName)), 'LLM model file should exist')
+
+  const [projModelName] = await ensureModel(LIGHTON_OCR_CONFIG.projModel)
+  t.ok(fs.existsSync(path.join(dirPath, projModelName)), 'Projection model file should exist')
+
+  const loader = new FilesystemDL({ dirPath })
+  const inference = new LlmLlamacpp({
+    modelName,
+    loader,
+    logger: console,
+    diskPath: dirPath,
+    projectionModel: projModelName
+  }, getConfig(device))
+
+  t.teardown(async () => {
+    // Unload the model before closing the loader it was constructed with (the order used in
+    // the package README example); `finally` closes the loader even if unload() throws.
+    try {
+      await inference.unload()
+    } finally {
+      await loader.close()
+    }
+  })
+
+  await inference.load()
+
+  return { inference, loader }
+}
+
+/**
+ * Runs one OCR request: sends the image bytes plus a text instruction and collects the output.
+ *
+ * @param {object} inference - Loaded LlmLlamacpp instance.
+ * @param {string} imageFilePath - Path of the image file to read.
+ * @returns {Promise<{generatedText: string, startTime: number, endTime: number}>}
+ *   Joined output and Date.now() timestamps taken before run() and after completion.
+ * @throws {Error} When the response reports an error through onError.
+ */
+async function runOcr (inference, imageFilePath) {
+  const imageBytes = new Uint8Array(fs.readFileSync(imageFilePath))
+
+  const messages = [
+    { role: 'user', type: 'media', content: imageBytes },
+    { role: 'user', content: 'Extract all text from this image and format it as markdown.' }
+  ]
+
+  const startTime = Date.now()
+  const response = await inference.run(messages)
+  const generatedText = []
+  let error = null
+
+  response.onUpdate(data => {
+    generatedText.push(data)
+  }).onError(err => {
+    error = err
+  })
+
+  await response.await()
+
+  if (error) {
+    // Keep the original error reachable via `cause` instead of only its string form.
+    throw new Error(`Inference error: ${error}`, { cause: error })
+  }
+
+  return {
+    generatedText: generatedText.join(''),
+    startTime,
+    endTime: Date.now()
+  }
+}
+
+// Test: LightON OCR-2 can extract text from a newspaper document image
+test('LightON OCR-2 can extract text from document image', { timeout: TEST_CONSTANTS.timeout }, async t => {
+  for (const deviceConfig of DEVICE_CONFIGS) {
+    const label = `[${deviceConfig.id.toUpperCase()}]`
+
+    const { inference } = await setupLightOnInference(t, deviceConfig.device)
+
+    // Use the newspaper image — a small document with clear text
+    const imageFilePath = getMediaPath('news-paper.jpg')
+    t.ok(fs.existsSync(imageFilePath), `${label} news-paper.jpg image file should exist`)
+
+    const { generatedText, startTime, endTime } = await runOcr(inference, imageFilePath)
+    const totalTime = endTime - startTime
+
+    t.comment(`${label} Generated text (${generatedText.length} chars): ${generatedText.substring(0, 500)}...`)
+    t.comment(`${label} Total time: ${(totalTime / 1000).toFixed(2)}s`)
+
+    // Assert output is non-empty
+    t.ok(generatedText.length > 0, `${label} Should generate OCR output`)
+
+    // Assert key text from the newspaper is present (Titanic headline)
+    const lowerText = generatedText.toLowerCase()
+    const expectedKeywords = ['titanic', 'new york', 'iceberg']
+    const foundKeywords = expectedKeywords.filter(kw => lowerText.includes(kw))
+
+    t.ok(
+      foundKeywords.length >= 1,
+      `${label} OCR output should contain at least one expected keyword. ` +
+      `Found: ${foundKeywords.join(', ') || 'none'}. ` +
+      `Expected any of: ${expectedKeywords.join(', ')}`
+    )
+  }
+})
diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/mobile/integration.auto.cjs b/packages/qvac-lib-infer-llamacpp-llm/test/mobile/integration.auto.cjs
index 42d852e96c..71f1efd206 100644
--- a/packages/qvac-lib-infer-llamacpp-llm/test/mobile/integration.auto.cjs
+++ b/packages/qvac-lib-infer-llamacpp-llm/test/mobile/integration.auto.cjs
@@ -38,6 +38,10 @@ async function runMultiInstanceTest (options = {}) { // eslint-disable-line no-u
   return runIntegrationModule('../integration/multi-instance.test.js', options)
 }
 
+async function runOcrLightonTest (options = {}) { // eslint-disable-line no-unused-vars
+  return runIntegrationModule('../integration/ocr-lighton.test.js', options)
+}
+
 async function runReasoningTest (options = {}) { // eslint-disable-line no-unused-vars
   return runIntegrationModule('../integration/reasoning.test.js', options)
 }