Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions packages/qvac-lib-infer-llamacpp-llm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,80 @@ npm run quickstart
- [Native Logging](./examples/nativelog.js) – Demonstrates C++ addon logging integration.
- [Tool Calling](./examples/toolCalling.js) – Demonstrates tool calling capabilities.

## OCR with Vision-Language Models

In addition to ONNX-based OCR (`@qvac/ocr-onnx`), you can use vision-language models through `@qvac/llm-llamacpp` for OCR tasks. This is useful for structured document understanding (tables, forms, multi-column layouts) where traditional OCR pipelines struggle.

### Supported OCR Models

| Model | Params | Quantization | Description |
|-------|--------|-------------|-------------|
| LightON OCR-2 1B | 0.6B (LLM) + ~550M (vision) | Q4_K_M | OCR-specialized, full-page transcription, 11 languages |
| SmolVLM2-500M | 500M | Q8_0 | General vision-language, can follow targeted extraction prompts |

### LightON OCR-2

[LightON OCR-2](https://huggingface.co/noctrex/LightOnOCR-2-1B-ocr-soup-GGUF) is an OCR-specialized vision-language model (Apache 2.0) that produces detailed markdown/HTML output with tables. It supports 11 languages: English, French, German, Spanish, Italian, Dutch, Portuguese, Polish, Romanian, Czech, and Swedish.

**Characteristics:**
- Always does full-page transcription regardless of prompt
- Produces detailed structured output (markdown tables, HTML)
- Requires the Jinja chat template to be enabled in llama.cpp (the `--jinja` flag)
- Requires both the LLM model file and the F16 mmproj (vision projector) file

**Performance (Pixel 10 Pro, CPU-only, Q4_K_M + F16 mmproj):**
- Image encode: ~30s (768x1024 image)
- Prompt eval: 26.6 t/s
- Generation: 4.14 t/s

**Usage Example:**

```js
const LlmLlamacpp = require('@qvac/llm-llamacpp')
const FilesystemDL = require('@qvac/dl-filesystem')
const fs = require('bare-fs')

// `await` is not allowed at the top level of a CommonJS module, so the
// example runs inside an async entry point.
async function main () {
  const dirPath = './models'
  const loader = new FilesystemDL({ dirPath })

  const model = new LlmLlamacpp({
    modelName: 'LightOnOCR-2-1B-ocr-soup-Q4_K_M.gguf',
    loader,
    logger: console,
    diskPath: dirPath,
    projectionModel: 'mmproj-F16.gguf'
  }, {
    device: 'cpu',
    gpu_layers: '0',
    ctx_size: '4096',
    temp: '0.1',
    predict: '2048'
  })

  await model.load()

  try {
    const imageBytes = new Uint8Array(fs.readFileSync('./document.png'))

    // The image is sent as its own media message, followed by the text prompt.
    const messages = [
      { role: 'user', type: 'media', content: imageBytes },
      { role: 'user', content: 'Extract all text from this image and format it as markdown.' }
    ]

    const response = await model.run(messages)
    const output = []

    // Tokens are streamed; collect them and join once generation finishes.
    response.onUpdate(token => {
      output.push(token)
    })

    await response.await()

    console.log(output.join(''))
  } finally {
    // Release the model and the loader even if inference fails.
    await model.unload()
    await loader.close()
  }
}

main().catch(err => {
  console.error(err)
})
```

## Architecture

See [docs/](./docs) for a detailed explanation of the architecture and data flow logic.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
'use strict'
// test/integration/ocr-lighton.test.js
const test = require('brittle')
const fs = require('bare-fs')
const path = require('bare-path')
const { ensureModel, getMediaPath } = require('./utils')
const FilesystemDL = require('@qvac/dl-filesystem')
const LlmLlamacpp = require('../../index.js')
const os = require('bare-os')

// Host platform/architecture flags used below to choose the inference device.
const platform = os.platform()
const arch = os.arch()
const isDarwinX64 = arch === 'x64' && platform === 'darwin'
const isLinuxArm64 = arch === 'arm64' && platform === 'linux'
const isMobile = ['ios', 'android'].includes(platform)

// Intel Macs and ARM64 Linux hosts are forced onto the CPU device.
const useCpu = [isDarwinX64, isLinuxArm64].some(Boolean)

// Base URL of the Hugging Face repository that hosts both GGUF files.
const LIGHTON_OCR_REPO_URL = 'https://huggingface.co/noctrex/LightOnOCR-2-1B-ocr-soup-GGUF/resolve/main'

// Model descriptors (as passed to ensureModel) and context size for the
// LightON OCR-2 test. Note the projector's modelName differs from the remote
// file name (mmproj-F16.gguf).
const LIGHTON_OCR_CONFIG = {
  llmModel: {
    modelName: 'LightOnOCR-2-1B-ocr-soup-Q4_K_M.gguf',
    downloadUrl: `${LIGHTON_OCR_REPO_URL}/LightOnOCR-2-1B-ocr-soup-Q4_K_M.gguf`
  },
  projModel: {
    modelName: 'mmproj-LightOnOCR-2-F16.gguf',
    downloadUrl: `${LIGHTON_OCR_REPO_URL}/mmproj-F16.gguf`
  },
  ctx_size: '4096'
}

// Shared limits for the test below.
const TEST_CONSTANTS = {
  // 30 minutes in milliseconds — model download (~1.2GB) + slow image encoding on Intel Macs
  timeout: 30 * 60 * 1000,
  // Passed as the `predict` setting; kept as a string like the other settings.
  maxTokens: '2048'
}

// Single-entry list of devices to exercise: CPU on mobile or CPU-only hosts,
// GPU everywhere else.
const DEVICE_CONFIGS = [
  (isMobile || useCpu)
    ? { id: 'cpu', device: 'cpu' }
    : { id: 'gpu', device: 'gpu' }
]

/**
 * Builds the settings object passed as the second argument to LlmLlamacpp.
 *
 * @param {string} device - device identifier taken from DEVICE_CONFIGS
 * @returns {object} settings for the requested device; values other than
 *   `device` are fixed strings or come from the module-level constants
 */
function getConfig (device) {
  const { ctx_size: ctxSize } = LIGHTON_OCR_CONFIG
  const { maxTokens } = TEST_CONSTANTS

  const settings = {
    gpu_layers: '98',
    temp: '0.1',
    verbosity: '2',
    device,
    ctx_size: ctxSize,
    predict: maxTokens
  }
  return settings
}

/**
 * Ensures both model files are available, then builds and loads an
 * LlmLlamacpp instance for the LightON OCR model.
 *
 * A teardown hook is registered on `t` before loading, so the resources are
 * released even if `load()` fails.
 *
 * @param {object} t - brittle test handle (uses `t.ok` and `t.teardown`)
 * @param {string} [device='gpu'] - device identifier forwarded to getConfig
 * @returns {Promise<{inference: object, loader: object}>} the loaded model and its loader
 */
async function setupLightOnInference (t, device = 'gpu') {
  const [modelName, dirPath] = await ensureModel(LIGHTON_OCR_CONFIG.llmModel)
  t.ok(fs.existsSync(path.join(dirPath, modelName)), 'LLM model file should exist')

  // Only the file name is taken here; the projector is looked up in the same
  // directory as the LLM model.
  const [projModelName] = await ensureModel(LIGHTON_OCR_CONFIG.projModel)
  t.ok(fs.existsSync(path.join(dirPath, projModelName)), 'Projection model file should exist')

  const loader = new FilesystemDL({ dirPath })
  const inference = new LlmLlamacpp({
    modelName,
    loader,
    logger: console,
    diskPath: dirPath,
    projectionModel: projModelName
  }, getConfig(device))

  t.teardown(async () => {
    // Unload the model before closing the loader it was constructed with,
    // and close the loader even if unloading throws.
    try {
      await inference.unload()
    } finally {
      await loader.close()
    }
  })

  await inference.load()

  return { inference, loader }
}

/**
 * Runs one OCR request against a loaded model and collects the streamed output.
 *
 * The image file is read into a Uint8Array and sent as a media message,
 * followed by a fixed text prompt asking for markdown output.
 *
 * @param {object} inference - object exposing `run(messages)`, whose result
 *   provides chainable `onUpdate`/`onError` and an `await()` method
 * @param {string} imageFilePath - path of the image file to transcribe
 * @returns {Promise<{generatedText: string, startTime: number, endTime: number}>}
 *   the joined output and the wall-clock timestamps (ms) around the request
 * @throws {Error} if the response reported an error; the original is kept as `cause`
 */
async function runOcr (inference, imageFilePath) {
  const imageBytes = new Uint8Array(fs.readFileSync(imageFilePath))

  const messages = [
    { role: 'user', type: 'media', content: imageBytes },
    { role: 'user', content: 'Extract all text from this image and format it as markdown.' }
  ]

  const startTime = Date.now()
  const response = await inference.run(messages)
  const generatedText = []
  let error = null

  response.onUpdate(data => {
    generatedText.push(data)
  }).onError(err => {
    error = err
  })

  await response.await()

  if (error) {
    // Keep the original error (and its stack) reachable instead of only its string form.
    throw new Error(`Inference error: ${error}`, { cause: error })
  }

  return {
    generatedText: generatedText.join(''),
    startTime,
    endTime: Date.now()
  }
}

// Integration test: runs LightON OCR-2 on the newspaper image for every
// configured device and checks that recognisable text comes back.
test('LightON OCR-2 can extract text from document image', { timeout: TEST_CONSTANTS.timeout }, async t => {
  // Words expected in the Titanic newspaper headline; matching is case-insensitive.
  const expectedKeywords = ['titanic', 'new york', 'iceberg']

  for (const { id, device } of DEVICE_CONFIGS) {
    const label = `[${id.toUpperCase()}]`

    const { inference } = await setupLightOnInference(t, device)

    // The newspaper image is a small document with clear text.
    const imageFilePath = getMediaPath('news-paper.jpg')
    t.ok(fs.existsSync(imageFilePath), `${label} news-paper.jpg image file should exist`)

    const ocr = await runOcr(inference, imageFilePath)
    const text = ocr.generatedText
    const elapsedSeconds = ((ocr.endTime - ocr.startTime) / 1000).toFixed(2)

    t.comment(`${label} Generated text (${text.length} chars): ${text.substring(0, 500)}...`)
    t.comment(`${label} Total time: ${elapsedSeconds}s`)

    // The model must produce some output at all.
    t.ok(text.length > 0, `${label} Should generate OCR output`)

    // At least one headline keyword must appear in the output.
    const normalizedText = text.toLowerCase()
    const foundKeywords = []
    for (const keyword of expectedKeywords) {
      if (normalizedText.includes(keyword)) foundKeywords.push(keyword)
    }

    const foundSummary = foundKeywords.join(', ') || 'none'
    const expectedSummary = expectedKeywords.join(', ')
    t.ok(
      foundKeywords.length > 0,
      `${label} OCR output should contain at least one expected keyword. Found: ${foundSummary}. Expected any of: ${expectedSummary}`
    )
  }
})
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ async function runMultiInstanceTest (options = {}) { // eslint-disable-line no-u
return runIntegrationModule('../integration/multi-instance.test.js', options)
}

/**
 * Runs the LightON OCR integration test module through runIntegrationModule.
 *
 * @param {object} [options={}] - forwarded unchanged to runIntegrationModule
 * @returns {Promise<*>} whatever runIntegrationModule resolves to
 */
async function runOcrLightonTest (options = {}) { // eslint-disable-line no-unused-vars
  const modulePath = '../integration/ocr-lighton.test.js'
  return runIntegrationModule(modulePath, options)
}

/**
 * Runs the reasoning integration test module through runIntegrationModule.
 *
 * @param {object} [options={}] - forwarded unchanged to runIntegrationModule
 * @returns {Promise<*>} whatever runIntegrationModule resolves to
 */
async function runReasoningTest (options = {}) { // eslint-disable-line no-unused-vars
  const modulePath = '../integration/reasoning.test.js'
  return runIntegrationModule(modulePath, options)
}
Expand Down
Loading