tetherto · olyasir · Mar 10, 2026 · Mar 9, 2026 · Mar 9, 2026 · Mar 9, 2026
@@ -292,6 +292,80 @@ npm run quickstart
 -   [Native Logging](./examples/nativelog.js) – Demonstrates C++ addon logging integration.
 -   [Tool Calling](./examples/toolCalling.js) – Demonstrates tool calling capabilities.
 
+## OCR with Vision-Language Models
+
+In addition to ONNX-based OCR (`@qvac/ocr-onnx`), you can use vision-language models through `@qvac/llm-llamacpp` for OCR tasks. This is useful for structured document understanding (tables, forms, multi-column layouts) where traditional OCR pipelines struggle.
+
+### Supported OCR Models
+
+| Model | Params | Quantization | Description |
+|-------|--------|-------------|-------------|
+| LightON OCR-2 1B | 0.6B (LLM) + ~550M (vision) | Q4_K_M | OCR-specialized, full-page transcription, 11 languages |
+| SmolVLM2-500M | 500M | Q8_0 | General vision-language, can follow targeted extraction prompts |
+
+### LightON OCR-2
+
+[LightON OCR-2](https://huggingface.co/noctrex/LightOnOCR-2-1B-ocr-soup-GGUF) is an OCR-specialized vision-language model (Apache 2.0) that produces detailed markdown/HTML output with tables. It supports 11 languages: English, French, German, Spanish, Italian, Dutch, Portuguese, Polish, Romanian, Czech, and Swedish.
+
+**Characteristics:**
+- Always performs full-page transcription, regardless of the prompt
+- Produces detailed structured output (markdown tables, HTML)
+- Requires `--jinja` flag / jinja chat template in llama.cpp
+- Requires both the LLM model file and the F16 mmproj (vision projector) file
+
+**Performance (Pixel 10 Pro, CPU-only, Q4_K_M + F16 mmproj):**
+- Image encode: ~30s (768x1024 image)
+- Prompt eval: 26.6 t/s
+- Generation: 4.14 t/s
+
+**Usage Example:**
+
+```js
+// NOTE: the `await` calls below are at the top level; run this as an ES module
+// or wrap it in an async function if your entry point is CommonJS.
+const LlmLlamacpp = require('@qvac/llm-llamacpp')
+const FilesystemDL = require('@qvac/dl-filesystem')
+const fs = require('bare-fs')
+
+// Directory used both by the loader and as the model's diskPath.
+// Presumably it must contain the model and mmproj files named below — confirm for your setup.
+const dirPath = './models'
+const loader = new FilesystemDL({ dirPath })
+
+// First argument: model files and plumbing; second argument: runtime settings (all passed as strings).
+const model = new LlmLlamacpp({
+  modelName: 'LightOnOCR-2-1B-ocr-soup-Q4_K_M.gguf',
+  loader,
+  logger: console,
+  diskPath: dirPath,
+  projectionModel: 'mmproj-F16.gguf'
+}, {
+  device: 'cpu',
+  gpu_layers: '0',
+  ctx_size: '4096',
+  temp: '0.1',
+  predict: '2048'
+})
+
+await model.load()
+
+// Read the document image into a Uint8Array so it can be sent as message content.
+const imageBytes = new Uint8Array(fs.readFileSync('./document.png'))
+
+// The image goes in its own 'media' message, followed by the text prompt.
+const messages = [
+  { role: 'user', type: 'media', content: imageBytes },
+  { role: 'user', content: 'Extract all text from this image and format it as markdown.' }
+]
+
+const response = await model.run(messages)
+const output = []
+
+// Collect every update delivered to the callback; they are joined into the final text below.
+response.onUpdate(token => {
+  output.push(token)
+})
+
+// Wait for the response to finish before reading the collected output.
+await response.await()
+
+console.log(output.join(''))
+
+// Release the model first, then close the loader.
+await model.unload()
+await loader.close()
+```
+
 ## Architecture
 
 See [docs/](./docs) for a detailed explanation of the architecture and data flow logic.

@@ -0,0 +1,141 @@
+'use strict'
+// test/integration/ocr-lighton.test.js
+const test = require('brittle')
+const fs = require('bare-fs')
+const path = require('bare-path')
+const { ensureModel, getMediaPath } = require('./utils')
+const FilesystemDL = require('@qvac/dl-filesystem')
+const LlmLlamacpp = require('../../index.js')
+const os = require('bare-os')
+
+const platform = os.platform()
+const arch = os.arch()
+const isDarwinX64 = platform === 'darwin' && arch === 'x64'
+const isLinuxArm64 = platform === 'linux' && arch === 'arm64'
+const isMobile = platform === 'ios' || platform === 'android'
+
+const useCpu = isDarwinX64 || isLinuxArm64
+
+const LIGHTON_OCR_CONFIG = {
+  llmModel: {
+    modelName: 'LightOnOCR-2-1B-ocr-soup-Q4_K_M.gguf',
+    downloadUrl: 'https://huggingface.co/noctrex/LightOnOCR-2-1B-ocr-soup-GGUF/resolve/main/LightOnOCR-2-1B-ocr-soup-Q4_K_M.gguf'
+  },
+  projModel: {
+    modelName: 'mmproj-LightOnOCR-2-F16.gguf',
+    downloadUrl: 'https://huggingface.co/noctrex/LightOnOCR-2-1B-ocr-soup-GGUF/resolve/main/mmproj-F16.gguf'
+  },
+  ctx_size: '4096'
+}
+
+const TEST_CONSTANTS = {
+  timeout: 1_800_000, // 30 minutes — model download (~1.2GB) + slow image encoding on Intel Macs
+  maxTokens: '2048'
+}
+
+const DEVICE_CONFIGS = (isMobile || useCpu)
+  ? [{ id: 'cpu', device: 'cpu' }]
+  : [{ id: 'gpu', device: 'gpu' }]
+
+function getConfig (device) {
+  return {
+    gpu_layers: '98',
+    temp: '0.1',
+    verbosity: '2',
+    device,
+    ctx_size: LIGHTON_OCR_CONFIG.ctx_size,
+    predict: TEST_CONSTANTS.maxTokens
+  }
+}
+
+async function setupLightOnInference (t, device = 'gpu') {
+  const [modelName, dirPath] = await ensureModel(LIGHTON_OCR_CONFIG.llmModel)
+  t.ok(fs.existsSync(path.join(dirPath, modelName)), 'LLM model file should exist')
+
+  const [projModelName] = await ensureModel(LIGHTON_OCR_CONFIG.projModel)
+  t.ok(fs.existsSync(path.join(dirPath, projModelName)), 'Projection model file should exist')
+
+  const loader = new FilesystemDL({ dirPath })
+  const inference = new LlmLlamacpp({
+    modelName,
+    loader,
+    logger: console,
+    diskPath: dirPath,
+    projectionModel: projModelName
+  }, getConfig(device))
+
+  t.teardown(async () => {
+    await loader.close()
+    await inference.unload()
+  })
+
+  await inference.load()
+
+  return { inference, loader }
+}
+
+async function runOcr (inference, imageFilePath) {
+  const imageBytes = new Uint8Array(fs.readFileSync(imageFilePath))
+
+  const messages = [
+    { role: 'user', type: 'media', content: imageBytes },
+    { role: 'user', content: 'Extract all text from this image and format it as markdown.' }
+  ]
+
+  const startTime = Date.now()
+  const response = await inference.run(messages)
+  const generatedText = []
+  let error = null
+
+  response.onUpdate(data => {
+    generatedText.push(data)
+  }).onError(err => {
+    error = err
+  })
+
+  await response.await()
+
+  if (error) {
+    throw new Error('Inference error: ' + error)
+  }
+
+  return {
+    generatedText: generatedText.join(''),
+    startTime,
+    endTime: Date.now()
+  }
+}
+
+// Test: LightON OCR-2 can extract text from a newspaper document image
+test('LightON OCR-2 can extract text from document image', { timeout: TEST_CONSTANTS.timeout }, async t => {
+  for (const deviceConfig of DEVICE_CONFIGS) {
+    const label = `[${deviceConfig.id.toUpperCase()}]`
+
+    const { inference } = await setupLightOnInference(t, deviceConfig.device)
+
+    // Use the newspaper image — a small document with clear text
+    const imageFilePath = getMediaPath('news-paper.jpg')
+    t.ok(fs.existsSync(imageFilePath), `${label} news-paper.jpg image file should exist`)
+
+    const { generatedText, startTime, endTime } = await runOcr(inference, imageFilePath)
+    const totalTime = endTime - startTime
+
+    t.comment(`${label} Generated text (${generatedText.length} chars): ${generatedText.substring(0, 500)}...`)
+    t.comment(`${label} Total time: ${(totalTime / 1000).toFixed(2)}s`)
+
+    // Assert output is non-empty
+    t.ok(generatedText.length > 0, `${label} Should generate OCR output`)
+
+    // Assert key text from the newspaper is present (Titanic headline)
+    const lowerText = generatedText.toLowerCase()
+    const expectedKeywords = ['titanic', 'new york', 'iceberg']
+    const foundKeywords = expectedKeywords.filter(kw => lowerText.includes(kw))
+
+    t.ok(
+      foundKeywords.length >= 1,
+      `${label} OCR output should contain at least one expected keyword. ` +
+      `Found: ${foundKeywords.join(', ') || 'none'}. ` +
+      `Expected any of: ${expectedKeywords.join(', ')}`
+    )
+  }
+})
@@ -38,6 +38,10 @@ async function runMultiInstanceTest (options = {}) { // eslint-disable-line no-u
   return runIntegrationModule('../integration/multi-instance.test.js', options)
 }
 
+/**
+ * Run the LightON OCR integration test module.
+ *
+ * @param {object} [options={}] - Options forwarded unchanged to runIntegrationModule.
+ * @returns {Promise<*>} Whatever runIntegrationModule returns for this module.
+ */
+async function runOcrLightonTest (options = {}) { // eslint-disable-line no-unused-vars
+  return runIntegrationModule('../integration/ocr-lighton.test.js', options)
+}
+
 /**
  * Run the reasoning integration test module.
  *
  * @param {object} [options={}] - Options forwarded unchanged to runIntegrationModule.
  * @returns {Promise<*>} Whatever runIntegrationModule returns for this module.
  */
 async function runReasoningTest (options = {}) { // eslint-disable-line no-unused-vars
   return runIntegrationModule('../integration/reasoning.test.js', options)
 }