diff --git a/.github/workflows/on-pr-qvac-lib-infer-llamacpp-llm.yml b/.github/workflows/on-pr-qvac-lib-infer-llamacpp-llm.yml index c3820af3f6..161a4b9cec 100644 --- a/.github/workflows/on-pr-qvac-lib-infer-llamacpp-llm.yml +++ b/.github/workflows/on-pr-qvac-lib-infer-llamacpp-llm.yml @@ -115,6 +115,15 @@ jobs: working-directory: packages/qvac-lib-infer-llamacpp-llm run: npm run test:dts + - name: Run lint and unit tests + id: run_lint_and_unit_tests + uses: tetherto/oss-actions/.github/actions/run-lint-and-unit-tests@4c64bed91fc8eba3a201adb1495e61b4c1a2246d + with: + gpr-token: ${{ secrets.GITHUB_TOKEN }} + pat-token: ${{ secrets.GITHUB_TOKEN }} + registry-type: gpr + workdir: packages/qvac-lib-infer-llamacpp-llm + prebuild: needs: [authorize, sanity-checks] if: needs.authorize.outputs.allowed == 'true' diff --git a/packages/qvac-lib-infer-llamacpp-llm/CHANGELOG.md b/packages/qvac-lib-infer-llamacpp-llm/CHANGELOG.md index 9a0801faf5..860bd60dee 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/CHANGELOG.md +++ b/packages/qvac-lib-infer-llamacpp-llm/CHANGELOG.md @@ -1,5 +1,135 @@ # Changelog +## [0.16.0] - 2026-04-14 + +This release migrates the LLM addon off `BaseInference` inheritance and the `WeightsProvider` download layer onto the composable `createJobHandler` + `exclusiveRunQueue` utilities from `@qvac/infer-base@^0.4.0`. The constructor signature is replaced with a single object whose `files.model` field is an ordered array of absolute paths and `files.projectionModel` is an optional absolute path for multimodal models. This is a breaking change — every caller must update. + +## Breaking Changes + +### Constructor signature: single object with `files`, no `Loader` + +`LlmLlamacpp` now takes a single `{ files, config, logger?, opts? }` object. The old `Loader` + `diskPath` + `modelName` + two-arg `(args, config)` shape is gone — callers pre-resolve absolute paths and supply them as `files.model`. 
+ +```js +// BEFORE (≤ 0.15.x) +const FilesystemDL = require('@qvac/dl-filesystem') +const loader = new FilesystemDL({ dirPath: '/models' }) +const model = new LlmLlamacpp({ + loader, + modelName: 'Qwen3-1.7B-Q4_0.gguf', + diskPath: '/models', + logger: console, + opts: { stats: true } +}, { ctx_size: '4096', gpu_layers: '99' }) + +// AFTER (0.16.0) +const model = new LlmLlamacpp({ + files: { + model: ['/models/Qwen3-1.7B-Q4_0.gguf'] + }, + config: { ctx_size: '4096', gpu_layers: '99' }, + logger: console, + opts: { stats: true } +}) +``` + +For sharded models the caller passes the full ordered list — the `.tensors.txt` companion first, followed by every `-NNNNN-of-MMMMM.gguf` shard in ascending order. For multimodal models, `files.projectionModel` carries the absolute path to the mmproj file: + +```js +const model = new LlmLlamacpp({ + files: { + model: [ + '/models/medgemma-4b-it-Q4_1.tensors.txt', + '/models/medgemma-4b-it-Q4_1-00001-of-00005.gguf', + '/models/medgemma-4b-it-Q4_1-00002-of-00005.gguf', + '/models/medgemma-4b-it-Q4_1-00003-of-00005.gguf', + '/models/medgemma-4b-it-Q4_1-00004-of-00005.gguf', + '/models/medgemma-4b-it-Q4_1-00005-of-00005.gguf' + ], + projectionModel: '/models/mmproj-model-f16.gguf' + }, + config: { gpu_layers: '99' } +}) +``` + +### `BaseInference` inheritance and `WeightsProvider` removed + +`LlmLlamacpp` no longer extends `BaseInference` and no longer touches the `WeightsProvider` download layer. The class composes `createJobHandler` and `exclusiveRunQueue` from `@qvac/infer-base@^0.4.0` directly. Public lifecycle methods (`load` / `run` / `finetune` / `pause` / `cancel` / `unload` / `getState`) are unchanged in shape, but `downloadWeights` and the loader-based progress callbacks are gone — the caller is responsible for placing files on disk before constructing the model. + +In-memory streaming from network sources (URLs, Hyperdrive) is no longer supported in the current API. 
The SDK does not currently use it (models are stored to disk first); this can be re-added when/if the SDK plans to support that feature. Before, it was possible through the `Loader` abstraction. + +### Dependency changes + +- `@qvac/infer-base` bumped from `^0.3.0` to `^0.4.0`. +- `bare-fs` is now a runtime dependency (used to stream shards from disk). +- `@qvac/dl-base` and `@qvac/dl-filesystem` are no longer used by this package and have been removed from `devDependencies`. + +### `getState()` returns a narrower shape + +`getState()` previously returned `{ configLoaded, weightsLoaded, destroyed }` (the three-field shape inherited from `BaseInference`). It now returns `{ configLoaded }` only. The `weightsLoaded` and `destroyed` fields are gone — `weightsLoaded` collapsed into `configLoaded` because the refactored `load()` does both in one step, and `destroyed` is no longer tracked since `unload()` resets `configLoaded` and nulls the addon handle instead. Callers reading `state.weightsLoaded` or `state.destroyed` must switch to `state.configLoaded`. + +### Public methods removed from `LlmLlamacpp` + +`LlmLlamacpp` previously exposed these methods via `BaseInference` inheritance, all of which are now gone: + +- `downloadWeights(onDownloadProgress, opts)` — the download layer is removed; the caller places files on disk and passes absolute paths in `files.model` / `files.projectionModel`. +- `unpause()` / `stop()` — BaseInference job-lifecycle helpers. The refactor still exposes `pause()` and `cancel()`; `unpause` is superseded by issuing a new `run()` after `cancel()`. +- `status()` — replaced by `getState()` for the static readiness flag; per-job state is observed via the `QvacResponse` returned by `run()`. +- `destroy()` — folded into `unload()`, which now both releases native resources and nulls `this.addon`. +- `getApiDefinition()` — no longer exposed; consumers should import types from `index.d.ts`. 
+ +### `load()` takes no arguments + +`load()` previously forwarded `...args` through `BaseInference.load` into LLM's `_load(closeLoader, onDownloadProgress)`. Both arguments are gone — `closeLoader` is meaningless without a `Loader`, and `onDownloadProgress` is superseded by the caller owning download-and-placement before construction. Call `await model.load()` with no arguments. + +### Type exports removed from `index.d.ts` + +The following exports are no longer part of the package's public type surface because the loader/download layer they described is gone: `ReportProgressCallback`, `Loader`, `DownloadWeightsOptions`, `DownloadResult`. TypeScript consumers importing any of these must update to the new `LlmLlamacppArgs` / `files` shape. + +## Features + +### Constructor input validation + +The constructor now throws `TypeError('files.model must be a non-empty array of absolute paths')` when `files` or `files.model` is missing or empty. This produces a clear error for callers porting old code instead of a confusing `Cannot read properties of undefined`. + +### `run()`-before-`load()` guard + +Calling `run()` before `load()` now throws `Error('Addon not initialized. Call load() first.')` instead of dereferencing `null` and crashing. `finetune()` has had this guard since the previous release. + +### `load()` is now idempotent when already loaded + +A second `load()` call on an already-loaded instance is now a silent no-op instead of unloading and reloading. This aligns with the ReadyResource pattern used elsewhere in QVAC and prevents accidental double-loads from triggering expensive work. Callers that intentionally want to swap weights must call `unload()` first (which clears `configLoaded`) and then `load()` again. 
+ +### Crash-safe shard streaming + +If `_streamShards()` or `addon.activate()` throws mid-load (for example a corrupted shard file or a native init failure), the partially-initialized addon is now best-effort-unloaded and `this.addon` is reset to `null`. A subsequent `load()` call starts cleanly instead of leaking a zombie native instance. + +### Restored JSDoc on `FinetuneOptions` + +Every `FinetuneOptions` field carries a `/** … */` doc comment again, including the default values (`numberOfEpochs = 1`, `learningRate = 1e-4`, `batchSize = 128`, …) so IDE tooltips show them without needing to read `docs/finetuning.md`. + +## Bug Fixes + +### `unload()` clears the addon reference + +`unload()` now sets `this.addon = null` after `await this.addon.unload()`, so post-unload `cancel()` / `pause()` / `run()` calls hit the explicit guards rather than dereferencing a disposed native handle. `pause()`, `cancel()`, and the job-handler cancel closure all use optional chaining for the same reason. + +### Removed dead `_isSuppressedNoResponseLog` filter + +The `_createFilteredLogger` infrastructure that wrapped the user-supplied logger to swallow `'No response found for job'` warnings was tied to the old `BaseInference` `_jobToResponse` Map. The new architecture cannot emit that message at all, so the filter, the wrapped logger, and the `_originalLogger` indirection are all removed. The user-supplied logger is now used directly. + +### `load()` is serialized through the exclusive run queue + +`load()` is now routed through the same `exclusiveRunQueue` used by `run()`, `finetune()`, and `unload()`. Previously two overlapping `load()` calls on the same instance could both pass the `configLoaded` guard before it flipped to `true`, both stream shards into and activate the native addon, and clobber `this.addon` — leaking one native handle. Concurrent `load()` on a single instance is now safe. 
+ +### Constructor rejects non-absolute path entries + +Each entry in `files.model` is now validated with `path.isAbsolute()` (matching the existing error-message contract), and the same check now applies to the optional `files.projectionModel` — previously it had no validation at all. Relative paths are rejected at construction time instead of bubbling up from `bare-fs` or the native load. + +## Pull Requests + +- [#1494](https://github.com/tetherto/qvac/pull/1494) - chore[bc]: LLM addon interface refactor — remove BaseInference and WeightsProvider + ## [0.15.0] - 2026-04-09 ### Breaking Changes diff --git a/packages/qvac-lib-infer-llamacpp-llm/README.md b/packages/qvac-lib-infer-llamacpp-llm/README.md index e71a948b7d..456f2e81d0 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/README.md +++ b/packages/qvac-lib-infer-llamacpp-llm/README.md @@ -8,13 +8,13 @@ This native C++ addon, built using the `Bare` Runtime, simplifies running Large - [Building from Source](#building-from-source) - [Usage](#usage) - [1. Import the Model Class](#1-import-the-model-class) - - [2. Create a Data Loader](#2-create-a-data-loader) - - [3. Create the `args` obj](#3-create-the-args-obj) - - [4. Create the `config` obj](#4-create-the-config-obj) - - [5. Create Model Instance](#5-create-model-instance) - - [6. Load Model](#6-load-model) - - [7. Run Inference](#7-run-inference) - - [8. Release Resources](#8-release-resources) + - [2. Create the `args` obj](#2-create-the-args-obj) + - [Sharded models](#sharded-models) + - [3. Create the `config` obj](#3-create-the-config-obj) + - [4. Create Model Instance](#4-create-model-instance) + - [5. Load Model](#5-load-model) + - [6. Run Inference](#6-run-inference) + - [7. 
Release Resources](#7-release-resources) - [API behavior by state](#api-behavior-by-state) - [Fine-tuning](#fine-tuning) - [Quickstart Example](#quickstart-example) @@ -72,47 +72,77 @@ See [build.md](./build.md) for detailed instructions on how to build the addon f ```js const LlmLlamacpp = require('@qvac/llm-llamacpp') +const path = require('bare-path') ``` -### 2. Create a Data Loader - -Data Loaders abstract the way model files are accessed. Use a [`FileSystemDataLoader`](../dl-filesystem) to load model files from your local file system. Models can be downloaded directly from HuggingFace. +### 2. Create the `args` obj ```js -const FilesystemDL = require('@qvac/dl-filesystem') - -// Download model from HuggingFace (see examples/utils.js for downloadModel helper) -const [modelName, dirPath] = await downloadModel( - 'https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf', - 'Llama-3.2-1B-Instruct-Q4_0.gguf' -) - -const fsDL = new FilesystemDL({ dirPath }) -``` - -### 3. Create the `args` obj +const dirPath = path.resolve('./models') +const modelName = 'Llama-3.2-1B-Instruct-Q4_0.gguf' -```js const args = { - loader: fsDL, + files: { + model: [path.join(dirPath, modelName)] + // projectionModel: path.join(dirPath, 'mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf') // for multimodal support pass the projection model path + }, + config, opts: { stats: true }, - logger: console, - diskPath: dirPath, - modelName, - // projectionModel: 'mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf' // for multimodal support you need to pass the projection model name + logger: console } ``` The `args` obj contains the following properties: -* `loader`: The Data Loader instance from which the model file will be streamed. -* `logger`: This property is used to create a [`QvacLogger`](../logging) instance, which handles all logging functionality. +* `files.model`: Required. An array of absolute paths to the GGUF model file(s) to load. 
The caller is responsible for passing the complete set of files for the model, including every shard and the `.tensors.txt` companion for multi-shard models (see [Sharded models](#sharded-models) below). +* `files.projectionModel`: Optional. Absolute path to the projection model file. It is only required when loading a multimodal model. +* `config`: The model configuration object (see next section). +* `logger`: This property is used to create a [`QvacLogger`](../logging) instance, which handles all logging functionality. * `opts.stats`: This flag determines whether to calculate inference stats. -* `diskPath`: The local directory where the model file will be downloaded to. -* `modelName`: The name of model file in the Data Loader. -* `projectionModel`: The name of the projection model file in the Data Loader. This is required for multimodal support. -### 4. Create the `config` obj +#### Sharded models + +The addon no longer expands sharded models internally. If you are loading a multi-shard GGUF model, **the caller MUST pass every file** — including the `.tensors.txt` companion file that lives alongside the shards — in `files.model`. Anything missing will cause the addon to fail during weight streaming. + +**Required ordering for multi-shard models:** +1. The `.tensors.txt` companion file **first**. +2. Each `*-NNNNN-of-MMMMM.gguf` shard in **numerical order** (shard `00001` before `00002`, and so on). 
+ +Example — loading a 5-shard model: + +```js +const path = require('bare-path') +const LlmLlamacpp = require('@qvac/llm-llamacpp') + +const dir = path.resolve('./models') +const modelBase = 'my-big-model-Q4_K_M' + +const model = new LlmLlamacpp({ + files: { + model: [ + path.join(dir, `${modelBase}.tensors.txt`), + path.join(dir, `${modelBase}-00001-of-00005.gguf`), + path.join(dir, `${modelBase}-00002-of-00005.gguf`), + path.join(dir, `${modelBase}-00003-of-00005.gguf`), + path.join(dir, `${modelBase}-00004-of-00005.gguf`), + path.join(dir, `${modelBase}-00005-of-00005.gguf`) + ] + }, + config, + logger: console, + opts: { stats: true } +}) + +await model.load() +``` + +For single-file GGUF models, pass a one-element array: + +```js +files: { model: [path.join(dir, 'Llama-3.2-1B-Instruct-Q4_0.gguf')] } +``` + +### 3. Create the `config` obj The `config` obj consists of a set of hyper-parameters which can be used to tweak the behaviour of the model. *All parameters must by strings.* @@ -159,43 +189,21 @@ const config = { | System with both | ✅ Uses dedicated GPU (preferred) | ✅ Uses dedicated GPU | ✅ Uses integrated GPU | -### 5. Create Model Instance +### 4. Create Model Instance ```js -const model = new LlmLlamacpp(args, config) +const model = new LlmLlamacpp(args) ``` -### 6. Load Model +### 5. Load Model ```js await model.load() ``` -_Optionally_ you can pass the following parameters to tweak the loading behaviour. -* `close?`: This boolean value determines whether to close the Data Loader after loading. Defaults to `true` -* `reportProgressCallback?`: A callback function which gets called periodically with progress updates. It can be used to display overall progress percentage. +Loads the model file(s) passed in `files.model` and activates the native addon. If a projection model was provided (`files.projectionModel`), it is loaded as part of the same step. 
-_For example:_ - -```js -await model.load(false, progress => process.stdout.write(`\rOverall Progress: ${progress.overallProgress}%`)) -``` - -**Progress Callback Data** - -The progress callback receives an object with the following properties: - -| Property | Type | Description | -|---------------------|--------|-----------------------------------------| -| `action` | string | Current operation being performed | -| `totalSize` | number | Total bytes to be loaded | -| `totalFiles` | number | Total number of files to process | -| `filesProcessed` | number | Number of files completed so far | -| `currentFile` | string | Name of file currently being processed | -| `currentFileProgress` | string | Percentage progress on current file | -| `overallProgress` | string | Overall loading progress percentage | - -### 7. Run Inference +### 6. Run Inference Pass an array of messages (following the chat completion format) to the `run` method. Process the generated tokens asynchronously: @@ -227,14 +235,13 @@ try { When `opts.stats` is enabled, `response.stats` includes runtime metrics such as `TTFT`, `TPS`, token counters, and `backendDevice` (`"cpu"` or `"gpu"`). `backendDevice` reflects the resolved device used at runtime after backend selection/fallback logic, not only the requested config. -### 8. Release Resources +### 7. 
Release Resources Unload the model when finished: ```javascript try { await model.unload() - await fsDL.close() } catch (error) { console.error('Failed to unload model:', error) } @@ -341,24 +348,24 @@ In addition to ONNX-based OCR (`@qvac/ocr-onnx`), you can use vision-language mo ```js const LlmLlamacpp = require('@qvac/llm-llamacpp') -const FilesystemDL = require('@qvac/dl-filesystem') const fs = require('bare-fs') +const path = require('bare-path') -const dirPath = './models' -const loader = new FilesystemDL({ dirPath }) +const dirPath = path.resolve('./models') const model = new LlmLlamacpp({ - modelName: 'LightOnOCR-2-1B-ocr-soup-Q4_K_M.gguf', - loader, - logger: console, - diskPath: dirPath, - projectionModel: 'mmproj-F16.gguf' -}, { - device: 'cpu', - gpu_layers: '0', - ctx_size: '4096', - temp: '0.1', - predict: '2048' + files: { + model: [path.join(dirPath, 'LightOnOCR-2-1B-ocr-soup-Q4_K_M.gguf')], + projectionModel: path.join(dirPath, 'mmproj-F16.gguf') + }, + config: { + device: 'cpu', + gpu_layers: '0', + ctx_size: '4096', + temp: '0.1', + predict: '2048' + }, + logger: console }) await model.load() @@ -382,7 +389,6 @@ await response.await() console.log(output.join('')) await model.unload() -await loader.close() ``` ## Architecture diff --git a/packages/qvac-lib-infer-llamacpp-llm/addon.js b/packages/qvac-lib-infer-llamacpp-llm/addon.js index 5582f626b6..92f56f9041 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/addon.js +++ b/packages/qvac-lib-infer-llamacpp-llm/addon.js @@ -1,5 +1,69 @@ const path = require('bare-path') +/** + * Normalize a raw native event into `Output` / `Error` / `JobEnded` / + * `FinetuneProgress`, or `null` to drop it. `state.skipNextRuntimeStats` + * is used to swallow the TPS trailer that follows a finetune terminal. 
+ * + * @param {string} rawEvent + * @param {*} rawData + * @param {*} rawError + * @param {{ skipNextRuntimeStats: boolean }} state + * @returns {{ type: string, data: *, error: * } | null} + */ +function mapAddonEvent (rawEvent, rawData, rawError, state) { + // TPS-shaped runtime stats — either a real inference terminal or the stale + // trailer that follows a finetune terminal. + if (rawData && typeof rawData === 'object' && 'TPS' in rawData) { + if (state.skipNextRuntimeStats) { + state.skipNextRuntimeStats = false + return null + } + const stats = { ...rawData } + if (stats.backendDevice === 0) { + stats.backendDevice = 'cpu' + } else if (stats.backendDevice === 1) { + stats.backendDevice = 'gpu' + } + return { type: 'JobEnded', data: stats, error: null } + } + + // Finetune terminal: dispatch JobEnded carrying the finetune payload and arm + // the skip flag so the TPS the C++ addon emits right after is not mistaken + // for an inference result that would clobber `_hasActiveResponse`. + if ( + rawData && + typeof rawData === 'object' && + rawData.op === 'finetune' && + typeof rawData.status === 'string' + ) { + state.skipNextRuntimeStats = true + return { type: 'JobEnded', data: rawData, error: null } + } + + // Per-iteration finetune metrics. + if ( + rawData && + typeof rawData === 'object' && + rawData.type === 'finetune_progress' + ) { + return { type: 'FinetuneProgress', data: rawData, error: null } + } + + // Name-based mapping. LogMsg must be checked before the string-to-Output + // fallback: `JsLogMsgOutputHandler` delivers the log as a plain string, + // so without this branch it would be misrouted into the job output. 
+ let type = rawEvent + if (typeof rawEvent === 'string' && rawEvent.includes('Error')) { + type = 'Error' + } else if (typeof rawEvent === 'string' && rawEvent.includes('LogMsg')) { + type = 'LogMsg' + } else if (typeof rawData === 'string') { + type = 'Output' + } + return { type, data: rawData, error: rawError } +} + /** * An interface between Bare addon in C++ and JS runtime. */ @@ -29,10 +93,9 @@ class LlamaInterface { } /** - * * @param {Object} weightsData * @param {String} weightsData.filename - * @param {Uint8Array} weightsData.contents + * @param {Uint8Array|null} weightsData.chunk * @param {Boolean} weightsData.completed */ async loadWeights (weightsData) { @@ -86,5 +149,6 @@ class LlamaInterface { } module.exports = { - LlamaInterface + LlamaInterface, + mapAddonEvent } diff --git a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/llm-parameter-sweep.js b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/llm-parameter-sweep.js index 6f84cd946a..c65f81c658 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/llm-parameter-sweep.js +++ b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/llm-parameter-sweep.js @@ -3,7 +3,6 @@ const fs = require('bare-fs') const path = require('bare-path') const process = require('bare-process') -const FilesystemDL = require('@qvac/dl-filesystem') const { parseAddonSource, resolveAddonCtor, @@ -249,7 +248,6 @@ async function main () { continue } } - let loader = null let model = null let modelLoaded = false let caseRepeatsAttempted = 0 @@ -269,18 +267,16 @@ async function main () { debugLogger.log(`Running: ${testCase.caseId}`) - loader = new FilesystemDL({ dirPath: modelDef.modelDir }) const config = buildConfigObject(testCase.runtimeConfig) const addonRuntimeLogger = createAddonRuntimeLogger(debugEnabled) // Load model once for this case model = new AddonCtor({ - modelName: testCase.modelName, - loader, + files: { model: [path.join(modelDef.modelDir, testCase.modelName)] 
}, + config, logger: addonRuntimeLogger, - diskPath: modelDef.modelDir, opts: { stats: true } - }, config) + }) const loadStart = process.hrtime() let loadMs = null @@ -326,12 +322,6 @@ async function main () { }) completedCases.add(caseKey) saveProgress() - // Clean up loader before continuing - try { - await loader.close().catch(() => {}) - } catch { - // Ignore cleanup errors - } continue // Skip to next case } throw loadError @@ -529,13 +519,6 @@ async function main () { } } - // Close loader after all prompts - try { - await loader.close().catch(() => {}) - } catch (closeError) { - debugLogger.warn(`Failed to close loader: ${closeError.message || String(closeError)}`) - } - // Add delay after case completion to allow cleanup await new Promise(resolve => setTimeout(resolve, 200)) @@ -606,13 +589,6 @@ async function main () { } catch { // Ignore cleanup errors } - try { - if (loader) { - await loader.close().catch(() => {}) - } - } catch { - // Ignore cleanup errors - } debugLogger.error(`Case ${testCase.caseId} failed completely: ${caseError.message || String(caseError)}`) const remainingRepeats = Math.max(0, (promptsForCase.length * repeats) - caseRepeatsAttempted) for (let i = 0; i < remainingRepeats; i++) { diff --git a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/package.json b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/package.json index d2286498c6..fd334080f4 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/package.json +++ b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/package.json @@ -11,7 +11,6 @@ "run:judge": "node ./prepare-models.js --target addon && bare ./run-judge.js" }, "dependencies": { - "@qvac/dl-filesystem": "latest", "@qvac/llm-llamacpp": "latest", "bare-fs": "latest", "bare-path": "latest", diff --git a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/prepare-prompts.js b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/prepare-prompts.js index 
1be3bc7962..0a70fe49d1 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/prepare-prompts.js +++ b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/prepare-prompts.js @@ -3,7 +3,6 @@ const fs = require('bare-fs') const path = require('bare-path') const process = require('bare-process') -const FilesystemDL = require('@qvac/dl-filesystem') const Llm = require('../../index') const { PROMPT_CTX_SIZES, @@ -200,24 +199,20 @@ function batchTemplateMessages () { } async function main () { - if (!fs.existsSync(path.join(MODEL_DIR, MODEL_NAME))) { - throw new Error(`Missing tokenizer model at ${path.join(MODEL_DIR, MODEL_NAME)}. Run model prep first.`) + const modelPath = path.join(MODEL_DIR, MODEL_NAME) + if (!fs.existsSync(modelPath)) { + throw new Error(`Missing tokenizer model at ${modelPath}. Run model prep first.`) } - const loader = new FilesystemDL({ dirPath: MODEL_DIR }) let model = null try { try { - model = new Llm( - { - modelName: MODEL_NAME, - loader, - diskPath: MODEL_DIR, - opts: { stats: true } - }, - FAST_PROBE_RUNTIME - ) + model = new Llm({ + files: { model: [modelPath] }, + config: FAST_PROBE_RUNTIME, + opts: { stats: true } + }) await model.load() console.log('Prompt calibration runtime: gpu (fast path)') } catch (gpuErr) { @@ -227,15 +222,11 @@ async function main () { } console.warn(`GPU probe init failed; falling back to CPU: ${msg}`) if (model) await model.unload().catch(() => {}) - model = new Llm( - { - modelName: MODEL_NAME, - loader, - diskPath: MODEL_DIR, - opts: { stats: true } - }, - SAFE_FALLBACK_RUNTIME - ) + model = new Llm({ + files: { model: [modelPath] }, + config: SAFE_FALLBACK_RUNTIME, + opts: { stats: true } + }) await model.load() console.log('Prompt calibration runtime: cpu (fallback)') } @@ -282,7 +273,6 @@ async function main () { console.log(`Wrote ${prompts.length} prompts to ${OUTPUT_PATH}`) } finally { if (model) await model.unload().catch(() => {}) - await loader.close().catch(() => {}) } } 
diff --git a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/run-judge.js b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/run-judge.js index 648ad6cb5b..77037a40e7 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/run-judge.js +++ b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/run-judge.js @@ -3,7 +3,6 @@ const fs = require('bare-fs') const path = require('bare-path') const process = require('bare-process') -const FilesystemDL = require('@qvac/dl-filesystem') const { parseAddonSource, resolveAddonCtor, @@ -94,22 +93,20 @@ function pairKey (reference, candidate) { function createJudgeRuntimeManager (opts) { let model = null - let loader = null const cache = new Map() const maxChars = 6000 return { async init () { if (model) return - loader = new FilesystemDL({ dirPath: opts.modelDef.modelDir }) const config = buildConfigObject(opts.runtimeConfig) - model = new opts.AddonCtor({ - modelName: opts.modelName, - loader, - diskPath: opts.modelDef.modelDir, + const AddonCtor = opts.AddonCtor + model = new AddonCtor({ + files: { model: [path.join(opts.modelDef.modelDir, opts.modelName)] }, + config, opts: { stats: true }, logger: createAddonRuntimeLogger(opts.debug) - }, config) + }) await model.load() }, @@ -156,10 +153,6 @@ function createJudgeRuntimeManager (opts) { await model.unload().catch(() => {}) model = null } - if (loader) { - await loader.close().catch(() => {}) - loader = null - } } } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/verify-prompts.js b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/verify-prompts.js index 902822a1e9..d954df48f4 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/verify-prompts.js +++ b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/verify-prompts.js @@ -3,7 +3,6 @@ const fs = require('bare-fs') const path = require('bare-path') const process = require('bare-process') -const FilesystemDL = 
require('@qvac/dl-filesystem') const Llm = require('../../index') const { PROMPT_CTX_SIZES, @@ -53,24 +52,20 @@ async function main () { if (!byId.has('long')) failures.push('Missing base prompt: long') - if (!fs.existsSync(path.join(MODEL_DIR, MODEL_NAME))) { - throw new Error(`Missing tokenizer model at ${path.join(MODEL_DIR, MODEL_NAME)}`) + const modelPath = path.join(MODEL_DIR, MODEL_NAME) + if (!fs.existsSync(modelPath)) { + throw new Error(`Missing tokenizer model at ${modelPath}`) } - const loader = new FilesystemDL({ dirPath: MODEL_DIR }) let model = null try { try { - model = new Llm( - { - modelName: MODEL_NAME, - loader, - diskPath: MODEL_DIR, - opts: { stats: true } - }, - FAST_PROBE_RUNTIME - ) + model = new Llm({ + files: { model: [modelPath] }, + config: FAST_PROBE_RUNTIME, + opts: { stats: true } + }) await model.load() console.log('Prompt verification runtime: gpu (fast path)') } catch (gpuErr) { @@ -80,15 +75,11 @@ async function main () { } console.warn(`GPU probe init failed; falling back to CPU: ${msg}`) if (model) await model.unload().catch(() => {}) - model = new Llm( - { - modelName: MODEL_NAME, - loader, - diskPath: MODEL_DIR, - opts: { stats: true } - }, - SAFE_FALLBACK_RUNTIME - ) + model = new Llm({ + files: { model: [modelPath] }, + config: SAFE_FALLBACK_RUNTIME, + opts: { stats: true } + }) await model.load() console.log('Prompt verification runtime: cpu (fallback)') } @@ -140,7 +131,6 @@ async function main () { } } finally { if (model) await model.unload().catch(() => {}) - await loader.close().catch(() => {}) } if (failures.length) { diff --git a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/server/bare_infer.js b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/server/bare_infer.js index 303caff5e9..514b42f261 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/server/bare_infer.js +++ b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/server/bare_infer.js @@ -24,38 +24,26 @@ async function main () { const prompts = 
JSON.parse(fs.readFileSync(promptsFile, 'utf-8')) console.log(`Loaded ${prompts.length} prompts`) - // Load FilesystemDL directly (same package used by modelManager) - let FsDL - try { - FsDL = require('@qvac/dl-filesystem') - } catch { - // Fallback: resolve from main package node_modules - FsDL = require('../../node_modules/@qvac/dl-filesystem') - } - - const loader = new FsDL({ dirPath: diskPath }) - // Create LlmLlamacpp directly (bypassing modelManager) so we can pass // tools: 'true' which enables jinja template rendering for models with // custom chat templates (like AfriqueGemma) const model = new LlmLlamacpp({ - loader, - logger: console, - diskPath, - modelName - }, { - device: 'cpu', - gpu_layers: '0', - ctx_size: '2048', - temp: '0', - top_p: '1', - top_k: '1', - predict: maxTokens, - repeat_penalty: '1', - seed: '42', - tools: 'true', - 'reverse-prompt': '\n', - verbosity: '1' + files: { model: [path.join(diskPath, modelName)] }, + config: { + device: 'cpu', + gpu_layers: '0', + ctx_size: '2048', + temp: '0', + top_p: '1', + top_k: '1', + predict: maxTokens, + repeat_penalty: '1', + seed: '42', + tools: 'true', + 'reverse-prompt': '\n', + verbosity: '1' + }, + logger: console }) await model.load() @@ -84,7 +72,6 @@ async function main () { console.log(`Outputs written to ${outputsFile}`) await model.unload() - await loader.close() } main().catch(error => { diff --git a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/server/src/services/modelManager.js b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/server/src/services/modelManager.js index efc7b0823d..52c0c56d95 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/server/src/services/modelManager.js +++ b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/server/src/services/modelManager.js @@ -1,7 +1,7 @@ 'use strict' const LlmLlamacpp = require('@qvac/llm-llamacpp') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const logger = require('../utils/logger') 
/** @@ -71,32 +71,26 @@ class ModelManager { * Internal method to load a model */ async _loadModel (modelPath, diskPath, localModelName, config) { - // Create FilesystemDL for local model loading - const loader = new FilesystemDL({ - dirPath: diskPath - }) - const model = new LlmLlamacpp({ - diskPath, - modelName: localModelName, - loader, + files: { model: [path.join(diskPath, localModelName)] }, + config: { + device: config?.device, + gpu_layers: config?.gpu_layers, + ctx_size: config?.ctx_size, + temp: config?.temp, + top_p: config?.top_p, + top_k: config?.top_k, + n_predict: config?.n_predict, + repeat_penalty: config?.repeat_penalty, + seed: config?.seed, + verbosity: '3' + }, logger: { info: logger.info.bind(logger), error: logger.error.bind(logger), warn: logger.warn.bind(logger), debug: logger.debug.bind(logger) } - }, { - device: config?.device, - gpu_layers: config?.gpu_layers, - ctx_size: config?.ctx_size, - temp: config?.temp, - top_p: config?.top_p, - top_k: config?.top_k, - n_predict: config?.n_predict, - repeat_penalty: config?.repeat_penalty, - seed: config?.seed, - verbosity: '3' }) logger.info('Loading model into VRAM...') diff --git a/packages/qvac-lib-infer-llamacpp-llm/docs/afriquegemma-translation.md b/packages/qvac-lib-infer-llamacpp-llm/docs/afriquegemma-translation.md index f11248534f..afada8c60c 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/docs/afriquegemma-translation.md +++ b/packages/qvac-lib-infer-llamacpp-llm/docs/afriquegemma-translation.md @@ -192,24 +192,25 @@ wget -O ~/.qvac/models/AfriqueGemma-4B-Q4_K_M.gguf \ ```javascript const LlmLlamacpp = require('@qvac/llm-llamacpp') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') -const loader = new FilesystemDL({ dirPath: '/path/to/models' }) +const modelDir = '/path/to/models' const model = new LlmLlamacpp({ - loader, - modelName: 'AfriqueGemma-4B-Q4_K_M.gguf', - diskPath: '/path/to/models', + files: { + model: [path.join(modelDir, 
'AfriqueGemma-4B-Q4_K_M.gguf')] + }, + config: { + device: 'cpu', + ctx_size: '2048', + temp: '0', + top_k: '1', + top_p: '1', + n_predict: '64', + seed: '42', + tools: 'true' + }, logger: console -}, { - device: 'cpu', - ctx_size: '2048', - temp: '0', - top_k: '1', - top_p: '1', - n_predict: '64', - seed: '42', - tools: 'true' }) await model.load() @@ -230,7 +231,6 @@ translation = translation.split('\n')[0].trim() console.log(translation) await model.unload() -await loader.close() ``` ### Python Validation (Transformers) @@ -320,7 +320,7 @@ The model generates text beyond the first translation line. Use one of: ### Production Considerations -1. **Model path:** Store GGUF in a persistent volume. Use `FilesystemDL` or `HyperdriveDL` for loading. +1. **Model path:** Store GGUF in a persistent volume. Models are passed to `LlmLlamacpp` as absolute paths via `files.model` (an array of one or more GGUF file paths). 2. **Warm-up:** First inference after load is slower due to KV cache initialization. Run a dummy prompt after `model.load()`. 3. **Concurrency:** `LlmLlamacpp` supports one active inference at a time. Queue requests at the application layer. 4. **Error handling:** Wrap `model.run()` in try/catch. The addon throws on context overflow or busy state. 
diff --git a/packages/qvac-lib-infer-llamacpp-llm/docs/architecture.md b/packages/qvac-lib-infer-llamacpp-llm/docs/architecture.md index 4052cd0096..4815b47755 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/docs/architecture.md +++ b/packages/qvac-lib-infer-llamacpp-llm/docs/architecture.md @@ -1,6 +1,6 @@ # Architecture Documentation -**Package:** `@qvac/llm-llamacpp` v0.9.0 +**Package:** `@qvac/llm-llamacpp` v0.16.0 **Stack:** JavaScript, C++20, llama.cpp, Bare Runtime, CMake, vcpkg **License:** Apache-2.0 @@ -23,7 +23,7 @@ ### Architecture Decisions - [Decision 1: llama.cpp as Inference Backend](#decision-1-llamacpp-as-inference-backend) - [Decision 2: Bare Runtime over Node.js](#decision-2-bare-runtime-over-nodejs) -- [Decision 3: Pluggable Data Loader Architecture](#decision-3-pluggable-data-loader-architecture) +- [Decision 3: Caller-Supplied File Paths](#decision-3-caller-supplied-file-paths) - [Decision 4: Incremental Buffer-Based Weight Loading](#decision-4-incremental-buffer-based-weight-loading) - [Decision 5: Chat Message Format](#decision-5-chat-message-format-json-serialization) - [Decision 6: Exclusive Run Queue](#decision-6-exclusive-run-queue-indexjs) @@ -42,20 +42,19 @@ **Core value:** - High-level JavaScript API for LLM inference -- Peer-to-peer model distribution via Hyperdrive - Streaming token-by-token output - Text and multimodal (vision + text) models -- Pluggable model weight loaders +- Caller-owned model files (any source: filesystem, P2P, HTTP, etc.) ## Key Features - **Cross-platform**: macOS, Linux, Windows, iOS, Android -- **Multiple loaders**: Hyperdrive (P2P), filesystem, custom +- **Caller-owned files**: caller provides absolute file paths; the addon never downloads or discovers files on its own - **Streaming responses**: Async iterators or callbacks - **GPU acceleration**: Metal, Vulkan, OpenCL - **Quantized models**: GGUF format - **Multimodal**: Vision models (i.e. Qwen3-VL, SmolVLM, etc.) 
-- **Sharded loading**: Automatic split GGUF handling +- **Sharded loading**: Caller passes every shard (and the `.tensors.txt` companion); the addon streams them into llama.cpp in order ## Target Platforms @@ -70,6 +69,8 @@ **Dependencies:** - qvac-lib-inference-addon-cpp (≥1.1.2): C++ addon framework (single-job runner, runJob/activate/loadWeights/cancel/destroyInstance) - qvac-fabric-llm.cpp (≥7248.2.3): Inference engine +- @qvac/infer-base: `createJobHandler` and `exclusiveRunQueue` helpers (job/response lifecycle + single-job serialization) +- @qvac/logging: `QvacLogger` wrapper - Bare Runtime (≥1.24.0): JavaScript runtime - Linux requires Clang/LLVM 19 with libc++ @@ -86,35 +87,35 @@ graph TB subgraph "Application Layer" APP[QVAC Applications] end - + subgraph "Inference Addons" LLM[llm-llamacpp
LLMs] EMBED[embed-llamacpp
Embeddings] WHISPER[whispercpp
STT] NMT[nmtcpp
Translation] end - + subgraph "core libs" - BASE["@qvac/infer-base"] - DL["@qvac/dl-hyperdrive"] + BASE["@qvac/infer-base
(job handler + run queue)"] + LOG["@qvac/logging"] end - + subgraph "Native Framework" ADDON[addon-cpp] end - + subgraph "Backend" BARE[Bare Runtime] LLAMA[llama.cpp] end - + APP --> LLM LLM --> BASE - LLM --> DL + LLM --> LOG LLM --> ADDON ADDON --> BARE ADDON --> LLAMA - + style LLM fill:#e1f5ff,stroke:#0066cc,stroke-width:3px ``` @@ -123,23 +124,24 @@ graph TB **Dependency Table:** -| Package | Type | Version | Purpose | -|---------|------|---------|---------| -| @qvac/infer-base | Framework | ^0.2.0 | Base classes, WeightsProvider, QvacResponse | -| @qvac/dl-hyperdrive | Peer | ^0.1.1 | P2P model loading | -| qvac-lib-inference-addon-cpp | Native | ≥1.1.1 | C++ addon framework (single-job runner) | -| qvac-fabric-llm.cpp | Native | ≥7248.2.3 | Inference engine | -| Bare Runtime | Runtime | ≥1.24.0 | JavaScript execution | +| Package | Type | Purpose | +|---------|------|---------| +| @qvac/infer-base | Framework | `createJobHandler`, `exclusiveRunQueue`, `QvacResponse` | +| @qvac/logging | Framework | `QvacLogger` wrapper | +| qvac-lib-inference-addon-cpp | Native | C++ addon framework (single-job runner) | +| qvac-fabric-llm.cpp | Native | Inference engine | +| Bare Runtime | Runtime | JavaScript execution | **Integration Points:** | From | To | Mechanism | Data Format | |------|-----|-----------|-------------| -| JavaScript | LlmLlamacpp | Constructor | args, config objects | -| LlmLlamacpp | BaseInference | Inheritance | Template method pattern | +| JavaScript | LlmLlamacpp | Constructor `{ files, config, logger, opts }` | Object | +| LlmLlamacpp | createJobHandler | Composition | Job handle + callbacks | +| LlmLlamacpp | exclusiveRunQueue | Composition | Promise-based queue | | LlmLlamacpp | LlamaInterface | Composition | Method calls | | LlamaInterface | C++ Addon | require.addon() | Native binding | -| WeightsProvider | Data Loader | Interface | Stream protocol | +| LlmLlamacpp | bare-fs | Direct read stream | Absolute file paths | @@ -149,39 +151,56 @@ 
graph TB ### Main Class: LlmLlamacpp +`LlmLlamacpp` is a standalone class (no inheritance). It composes a job handler (`createJobHandler`), a single-job run queue (`exclusiveRunQueue`), and a `LlamaInterface` native bridge. + ```mermaid classDiagram class LlmLlamacpp { - +constructor(args, config) - +load(closeLoader, onProgress) Promise~void~ - +run(messages) Promise~QvacResponse~ + +constructor(args: LlmLlamacppArgs) + +load() Promise~void~ + +run(messages, runOptions?) Promise~QvacResponse~ + +finetune(finetuningOptions) Promise~FinetuneHandle~ + +pause() Promise~void~ + +cancel() Promise~void~ +unload() Promise~void~ - +downloadWeights(onProgress, opts) Promise~string~ + +getState() object } - - class BaseInference { - <> - +load() Promise~void~ - +run() Promise~QvacResponse~ + + class LlamaInterface { + +activate() Promise~void~ + +loadWeights(chunk) Promise~void~ + +runJob(inputs) Promise~boolean~ + +finetune(params) Promise~boolean~ + +cancel() Promise~void~ +unload() Promise~void~ } - + + class JobHandler { + <> + +start() QvacResponse + +output(token) void + +end(stats?, payload?) void + +fail(error) void + +active QvacResponse + } + + class RunQueue { + <> + +(fn) Promise~any~ + } + class QvacResponse { +iterate() AsyncIterator~string~ - +onUpdate(callback) QvacResponse - +await() Promise~void~ + +onUpdate(cb) QvacResponse + +await() Promise~object~ +cancel() Promise~void~ +stats object } - - class WeightsProvider { - +downloadFiles(files, path, opts) Promise~void~ - +streamFiles(shards, onChunk, onProgress) Promise~void~ - } - - LlmLlamacpp --|> BaseInference - LlmLlamacpp *-- WeightsProvider - LlmLlamacpp ..> QvacResponse : creates + + LlmLlamacpp *-- LlamaInterface + LlmLlamacpp *-- JobHandler + LlmLlamacpp *-- RunQueue + JobHandler ..> QvacResponse : creates ```
@@ -191,18 +210,34 @@ classDiagram | Class | Responsibility | Lifecycle | Dependencies | |-------|----------------|-----------|--------------| -| LlmLlamacpp | Orchestrate model lifecycle, manage loading/inference | Created by user, persistent | WeightsProvider, LlamaInterface | -| BaseInference | Define standard inference API | Abstract base class | None | -| QvacResponse | Stream inference output | Created per run() call, short-lived | None | -| WeightsProvider | Abstract model weight loading | Created by LlmLlamacpp | DataLoader | +| LlmLlamacpp | Orchestrate model lifecycle, stream weights, submit jobs, handle events | Created by user, persistent | LlamaInterface, createJobHandler, exclusiveRunQueue | +| LlamaInterface | JS wrapper around the native addon (handle, callbacks) | Created lazily in `_load()` | binding.js | +| JobHandler (createJobHandler) | Track the current job, create `QvacResponse`, route `output`/`end`/`fail` | One per LlmLlamacpp instance | None | +| exclusiveRunQueue | Serialize `run()` / `finetune()` / `unload()` into single-in-flight FIFO | One per LlmLlamacpp instance | None | +| QvacResponse | Stream inference output, expose `await()`/`iterate()`/`onUpdate()` | Created per job, short-lived | None | **Key Relationships:** | From | To | Type | Purpose | |------|-----|------|---------| -| LlmLlamacpp | BaseInference | Inheritance | Standard QVAC inference API | -| LlmLlamacpp | WeightsProvider | Composition | Model weight acquisition | -| LlmLlamacpp | QvacResponse | Creates | Streaming output per inference | +| LlmLlamacpp | LlamaInterface | Composition | Native addon bridge | +| LlmLlamacpp | JobHandler | Composition | Per-job lifecycle + response | +| LlmLlamacpp | exclusiveRunQueue | Composition | Serialize public API calls | +| LlmLlamacpp | bare-fs | Direct use | Stream shard files in `_streamShards()` | + +**Constructor signature (new):** + +```js +new LlmLlamacpp({ + files: { model: string[], projectionModel?: string }, + config: 
Record<string, string>, + logger?: object, + opts?: { stats?: boolean } +}) +``` + +- `files.model` is an array of absolute file paths. For single-file GGUFs, pass a one-element array. For sharded GGUFs, the caller passes the `.tensors.txt` companion first, followed by every shard in numerical order. +- `load()` takes no arguments. It constructs the native addon with the first shard-matching entry of `files.model` (via `pickPrimaryGgufPath`) as the primary model path (falling back to `files.model[0]` for non-sharded models), streams all entries via `bare-fs` + `loadWeights` when `files.model` has more than one entry, and finally calls `activate()`.
@@ -219,22 +254,23 @@ graph TB subgraph "Layer 1: JavaScript API" APP["Application Code"] LLMCLASS["LlmLlamacpp
(index.js)"] - BASEINF["BaseInference
(@qvac/infer-base)"] - WEIGHTSPR["WeightsProvider
(@qvac/infer-base)"] + JOBH["createJobHandler
(@qvac/infer-base)"] + RUNQ["exclusiveRunQueue
(@qvac/infer-base)"] RESPONSE["QvacResponse
(@qvac/infer-base)"] + BAREFS["bare-fs read stream
(shard streaming)"] end - + subgraph "Layer 2: Bridge" LLAMAIF["LlamaInterface
(addon.js)"] BINDING["require.addon
(binding.js)"] end - + subgraph "Layer 3: C++ Addon" JSINTERFACE["JsInterface
(addon-cpp JsInterface)"] ADDONCPP["AddonCpp / AddonJs
(addon-cpp + addon/AddonJs.hpp)"] WEIGHTSLOAD["WeightsLoader
(addon-cpp)"] end - + subgraph "Layer 4: Model" LLAMAMODEL["LlamaModel
(model-interface/LlamaModel.cpp)"] METADATA["ModelMetaData
(model-interface/ModelMetadata.cpp)"] @@ -242,28 +278,28 @@ graph TB TEXTCTX["TextLlmContext
(model-interface/TextLlmContext.cpp)"] MTMDCTX["MtmdLlmContext
(model-interface/MtmdLlmContext.cpp)"] end - + subgraph "Layer 5: Backend" LLAMACPP["llama.cpp"] GGML["GGML"] GPU["GPU Backends"] end - + APP --> LLMCLASS - LLMCLASS --> BASEINF - LLMCLASS --> WEIGHTSPR + LLMCLASS --> JOBH + LLMCLASS --> RUNQ + LLMCLASS --> BAREFS LLMCLASS --> LLAMAIF - LLMCLASS -.-> RESPONSE - + JOBH -.-> RESPONSE + LLAMAIF --> BINDING BINDING --> JSINTERFACE - WEIGHTSPR --> WEIGHTSLOAD - + BAREFS --> LLAMAIF + JSINTERFACE --> ADDONCPP ADDONCPP --> WEIGHTSLOAD ADDONCPP --> LLAMAMODEL - ADDONCPP --> WEIGHTSLOAD - + LLAMAMODEL --> METADATA LLAMAMODEL --> ASYNCWL ASYNCWL --> METADATA @@ -271,10 +307,10 @@ graph TB LLAMAMODEL --> MTMDCTX TEXTCTX --> LLAMACPP MTMDCTX --> LLAMACPP - + LLAMACPP --> GGML GGML --> GPU - + style LLMCLASS fill:#e1f5ff style ADDONCPP fill:#ffe1e1 style LLAMAMODEL fill:#ffe1e1 @@ -288,7 +324,7 @@ graph TB | Layer | Components | Responsibility | Language | Why This Layer | |-------|------------|----------------|----------|----------------| -| 1. JavaScript API | LlmLlamacpp, BaseInference | High-level API, error handling | JS | Ergonomic API for npm consumers | +| 1. JavaScript API | LlmLlamacpp, createJobHandler, exclusiveRunQueue, bare-fs | High-level API, job/response lifecycle, shard streaming | JS | Ergonomic API for npm consumers | | 2. Bridge | LlamaInterface, binding.js | JS↔C++ communication | JS wrapper | Lifecycle management, handle safety | | 3. C++ Addon | JsInterface, AddonCpp/AddonJs | Single-job runner, threading, callbacks | C++ | Performance, native integration | | 4. 
Model | LlamaModel, ModelMetaData, AsyncWeightsLoader, Contexts | Inference logic, metadata extraction, streaming weight coordination, chat formatting | C++ | Direct llama.cpp integration | @@ -298,13 +334,14 @@ graph TB | Direction | Path | Data Format | Transform | |-----------|------|-------------|-----------| +| Weights → | bare-fs → LlmLlamacpp → LlamaInterface → Addon | Buffer chunks | Streamed via `loadWeights({filename, chunk, completed})` | | Input → | JS → Bridge → Addon | JSON string | Serialize messages | | Input → | Addon → Model | parsed chat_msg | Parse JSON, format template | | Input → | Model → llama.cpp | tokens | Tokenize | | Output ← | llama.cpp → Model | token IDs | Sample | | Output ← | Model → Addon | UTF-8 string | Decode token | | Output ← | Addon → Bridge | string | Queue output | -| Output ← | Bridge → JS | string | Emit via callback | +| Output ← | Bridge → JS | string | Emit via `_addonOutputCallback` → `JobHandler.output()` | @@ -316,7 +353,7 @@ graph TB #### **LlmLlamacpp (index.js)** -**Responsibility:** Main API class, orchestrates model lifecycle, manages data loaders +**Responsibility:** Main API class. Standalone (no inheritance). Orchestrates the lifecycle: creates the native addon, streams shards from absolute file paths via `bare-fs`, serializes public API calls through `exclusiveRunQueue`, tracks the current job via `createJobHandler`, and routes addon events to the active `QvacResponse`. 
**Why JavaScript:** - High-level API ergonomics for npm consumers @@ -324,7 +361,10 @@ graph TB - Event loop integration for streaming - Configuration parsing - +**Composition (no base class):** +- `this._job = createJobHandler({ cancel: () => this.addon.cancel() })` — single active job + response +- `this._run = exclusiveRunQueue()` — serialized `run()` / `finetune()` / `unload()` +- `this.addon = new LlamaInterface(...)` — native bridge, created lazily in `_load()` #### **LlamaInterface (addon.js)** @@ -359,15 +399,14 @@ graph TB **LLM specialization:** createInstance builds LlamaModel with config; runJob parses inputs array (media + text) into LlamaModel::Prompt -#### **WeightsProvider (@qvac/infer-base)** +#### **Shard streaming (`_streamShards` in index.js)** -**Responsibility:** Abstracts model weight acquisition +**Responsibility:** Stream caller-supplied shard files from disk into the native addon. -**Why JavaScript:** -- Integrates with data loaders (Hyperdrive, filesystem) -- Progress tracking and reporting -- Handles sharded GGUF expansion -- Streaming chunk delivery +- Iterates all entries of `files.model` (the primary path selected by `pickPrimaryGgufPath` was already passed to the constructor) +- For each file, opens a `bare-fs.createReadStream`, forwards every chunk via `addon.loadWeights({ filename, chunk, completed: false })` +- Calls `addon.loadWeights({ filename, chunk: null, completed: true })` after each file to finalize that shard +- The caller is responsible for the **complete set of files and their order** (including the `.tensors.txt` companion first for sharded models). No discovery, no expansion, no download logic inside the addon. #### **ModelMetaData (model-interface/ModelMetadata.cpp)** @@ -592,131 +631,50 @@ See [qvac-lib-inference-addon-cpp Decision 4: Why Bare Runtime](https://github.c --- -## Decision 3: Pluggable Data Loader Architecture +## Decision 3: Caller-Supplied File Paths
⚡ TL;DR -**Chose:** Abstract data loading via WeightsProvider interface -**Why:** Support multiple distribution methods (P2P, HTTP, local files, S3) -**Cost:** Additional abstraction layer, must implement loader interface +**Chose:** Caller passes absolute file paths in `files.model`; the addon does **no** download, discovery, or shard expansion. +**Why:** Keeps the addon focused on inference; distribution is the application's responsibility. +**Cost:** Callers must resolve sharded models themselves (including the `.tensors.txt` companion file).
### Context -Need to load multi-GB model files from various sources: -- Local filesystem (for offline/development) -- P2P networks (for privacy/decentralization) -- HTTP/CDN (for enterprise deployments) -- Cloud storage (S3, Azure Blob, etc.) - -Different use cases have different distribution requirements. No single distribution method fits all scenarios. +Earlier iterations of this package shipped a `WeightsProvider` + pluggable data-loader abstraction that tried to own download, caching, and shard expansion. In practice this coupled the inference addon to an I/O layer with very different lifecycle and failure modes, and forced consumers to pick (or adapt) a loader even for the trivial "file is already on disk" case. ### Decision -Create a pluggable data loader abstraction (WeightsProvider interface) that decouples model loading from the inference engine, allowing applications to choose their distribution strategy. +`LlmLlamacpp` accepts **only** absolute file paths via `files.model`. Downloading, caching, P2P, HTTP, and shard discovery all live outside the addon. The addon: -### Rationale +1. Constructs the native instance with the primary path selected by `pickPrimaryGgufPath(files.model)` — the first entry matching the shard regex, or `files.model[0]` for non-sharded models. +2. If `files.model.length > 1`, streams all files (in the provided order) via `bare-fs.createReadStream` into `addon.loadWeights({ filename, chunk, completed })`. +3. Calls `addon.activate()` to finalize load. -**Flexibility:** -- Different users have different distribution needs (privacy vs speed vs simplicity) -- Enterprises may require HTTP/CDN, privacy users may prefer P2P -- Development/testing needs local filesystem access -- No single distribution method fits all use cases +For sharded GGUFs, the caller must pass **every** shard **and** the `.tensors.txt` companion file, in order: `.tensors.txt` first, then each shard in numerical order. 
See the [README sharded models section](../README.md#sharded-models) for the concrete example. -**Separation of Concerns:** -- Inference engine doesn't need to know about distribution details -- Model loading is orthogonal to inference logic -- Easier to test inference separately from data loading +### Rationale -**Extensibility:** -- Applications can implement custom loaders (S3, IPFS, Torrent, etc.) -- Can optimize loaders for specific platforms (mobile vs desktop) -- Future-proof: new distribution methods don't require engine changes +**Single responsibility:** +- The addon is an inference engine. It should not own download/caching/P2P logic. +- Callers already have transport libraries that fit their deployment (Hyperdrive, HTTP, S3, bare file copy, etc.). -### Trade-offs -- ✅ Can mock loaders for unit testing inference logic -- ❌ Additional abstraction complexity vs hardcoding a single method -- ❌ Applications must choose/implement their loader (no batteries-included default) - -### WeightsProvider Interface - -```javascript -// Core abstraction that all loaders must implement -interface WeightsProvider { - // Get readable stream for model file - async getStream(path: string): ReadableStream - - // Wait for loader to be ready - async ready(): Promise - - // Cleanup resources - async close(): Promise -} -``` - -### Example Implementations +**Predictable failure modes:** +- No hidden retries, no hidden temp files, no partial-state recovery inside the addon. +- If a shard is missing, the failure happens at a clear boundary (`loadWeights` or `activate`). -
-📊 LLM-Friendly: Loader Comparison - -**Performance Characteristics:** - -| Loader | Use Case | Initial Download | Subsequent Access | Setup Complexity | -|--------|----------|------------------|-------------------|------------------| -| **FileSystemDataLoader** | Development, offline | Instant | Instant | Low (just file path) | -| **HyperdriveDataLoader** | Privacy, P2P | 10-100 MB/s | Instant (cached) | Medium (P2P keys) | -| **HttpDataLoader** | Enterprise, CDN | 50-500 MB/s | Varies | Low (just URL) | -| **S3DataLoader** | Cloud deployments | 50-200 MB/s | Varies | Medium (AWS credentials) | - -**Example: Local Filesystem Loader** -```javascript -class FileSystemDataLoader { - constructor(basePath) { this.basePath = basePath } - - async getStream(path) { - return fs.createReadStream(`${this.basePath}/${path}`) - } - async ready() { /* no-op */ } - async close() { /* no-op */ } -} -``` +**Simpler API:** +- `load()` takes no arguments. No `closeLoader`, no `onProgress`, no `downloadWeights()`. +- Callers who want progress reporting attach it to their own download step, before calling `load()`. -**Example: HTTP/CDN Loader** -```javascript -class HttpDataLoader { - constructor(baseUrl) { this.baseUrl = baseUrl } - - async getStream(path) { - const response = await fetch(`${this.baseUrl}/${path}`) - return response.body - } - async ready() { /* no-op */ } - async close() { /* no-op */ } -} -``` - -**Example: Hyperdrive (P2P) Loader** -```javascript -class HyperdriveDataLoader { - constructor(key) { - this.drive = new Hyperdrive(key) - } - - async getStream(path) { - return this.drive.createReadStream(path) - } - async ready() { - await this.drive.ready() - } - async close() { - await this.drive.close() - } -} -``` - -
+### Trade-offs +- ✅ Zero coupling between inference and transport +- ✅ Trivial to test with plain local files +- ❌ Callers must implement (or reuse) shard resolution — including listing the `.tensors.txt` companion file alongside the shards +- ❌ No "batteries included" default — intentional --- @@ -737,23 +695,18 @@ ML models can be gigabytes in size. llama.cpp expects either: 1. A file descriptor (simple but requires file on disk) 2. A buffer (via `std::streambuf` interface) -**Problem:** We need to load directly from Hyperdrive (P2P storage) without duplicating storage by saving to disk first. - -Alternative approach would be: download from Hyperdrive → save to temp file → pass file descriptor to llama.cpp. But this doubles storage requirements (Hyperdrive cache + temp file). +Even though the addon now reads shard files from disk via `bare-fs`, we still prefer the buffer path so that: +- The same code path works whether the caller streams from disk, from memory, or from any future transport. +- Multi-shard GGUFs can be fed incrementally instead of materialized to a single temp file. ### Decision -Implement custom `std::streambuf` over JavaScript-owned ArrayBuffers with incremental shard-by-shard loading, as provided by `qvac-lib-inference-addon-cpp` framework. This allows feeding buffer chunks from any source (Hyperdrive, HTTP, local files) directly to llama.cpp without intermediate file storage. - -JavaScript sends model data as buffer chunks, C++ wraps them in a `std::streambuf`, enabling llama.cpp to load sharded models incrementally with zero-copy access to JavaScript memory. See our [llama.cpp fork implementation](https://github.com/tetherto/qvac-ext-lib-llama.cpp/compare/master...tetherto:qvac-ext-lib-llama.cpp:temp-load-from-buffer?diff=unified&w). +Implement a custom `std::streambuf` over JavaScript-owned ArrayBuffers with incremental shard-by-shard loading, as provided by the `qvac-lib-inference-addon-cpp` framework. 
JavaScript forwards buffer chunks via `addon.loadWeights({ filename, chunk, completed })`; C++ wraps them in a `std::streambuf`, enabling llama.cpp to load sharded models incrementally with zero-copy access to JavaScript memory. See our [llama.cpp fork implementation](https://github.com/tetherto/qvac-ext-lib-llama.cpp/compare/master...tetherto:qvac-ext-lib-llama.cpp:temp-load-from-buffer?diff=unified&w). ### Rationale -**Avoid Storage Duplication:** -- Load directly from Hyperdrive streams without saving to disk first -- No temporary files consuming additional storage -- Critical for mobile devices with limited storage -- Hyperdrive data stays in its cache, not duplicated +**Incremental loading:** +- Sharded GGUFs are streamed into llama.cpp as chunks arrive, rather than requiring the full model to sit in RAM or a temp file before load. **Zero-Copy:** - C++ reads directly from JavaScript ArrayBuffer memory @@ -761,19 +714,18 @@ JavaScript sends model data as buffer chunks, C++ wraps them in a `std::streambu - Further reduces memory footprint **Source Flexibility:** -- Works with any data source (Hyperdrive, HTTP, filesystem) -- Data loader provides buffer chunks, streambuf wrapper handles delivery to llama.cpp -- Same incremental loading path for all distribution methods +- Works with any data source (bare-fs read stream today; any other in-process source tomorrow) +- Same incremental loading path regardless of where chunks come from - Supports sharded GGUF files with incremental tensor loading ### Trade-offs -- ✅ Can report loading progress per chunk +- ✅ Works with arbitrary in-process data sources - ❌ Complex streambuf implementation with seeking across blobs - ❌ Must keep JS buffers alive during load, defer cleanup to correct thread - ❌ Seeking overhead O(N) across N blobs (acceptable, rarely needed) **Key Components:** -- `WeightsProvider` (JavaScript): Orchestrates chunk delivery +- `LlmLlamacpp._streamShards()` (JavaScript): opens `bare-fs` read streams for 
each caller-provided shard path and forwards chunks via `addon.loadWeights` - `BlobsStream` (C++): Implements `std::basic_streambuf` over multiple blobs - `FinalizedStream` (C++): RAII wrapper owning JavaScript references - `ThreadQueuedRefDeleter` (C++): Defers reference deletion to JavaScript thread @@ -931,4 +883,4 @@ Provide hand-written TypeScript definitions in `index.d.ts` alongside JavaScript **Related Document:** - [data-flows-detailed.md](data-flows-detailed.md) - Detailed data flow diagrams and sequences -**Last Updated:** 2026-03-02 +**Last Updated:** 2026-04-16 diff --git a/packages/qvac-lib-infer-llamacpp-llm/docs/data-flows-detailed.md b/packages/qvac-lib-infer-llamacpp-llm/docs/data-flows-detailed.md index 988fa48079..c810369446 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/docs/data-flows-detailed.md +++ b/packages/qvac-lib-infer-llamacpp-llm/docs/data-flows-detailed.md @@ -22,12 +22,13 @@ This document contains detailed diagrams showing how data moves through the `@qv - Emits: Output (streaming), JobStarted, JobEnded, Error **Weight Loading:** -- JavaScript sends model weights in chunks (streaming, zero-copy) -- C++ creates std::streambuf over JS ArrayBuffers -- For streamed models, the first shard is lent to `ModelMetaData` for GGUF metadata extraction before proceeding to weight loading +- Caller passes every file (primary model + every shard + `.tensors.txt` companion) as an array of absolute paths in `files.model` +- `LlmLlamacpp._streamShards()` iterates those paths, opening `bare-fs.createReadStream` and forwarding chunks via `addon.loadWeights` +- C++ creates `std::streambuf` over JS ArrayBuffers (zero-copy) +- For streamed sharded models, the first shard is lent to `ModelMetaData` for GGUF metadata extraction before proceeding to weight loading - llama.cpp reads weights via stream interface -- Supports sharded models (GGUF multi-file) - JS references kept alive during load, cleaned up after +- The addon performs **no** download, 
discovery, or shard expansion — the caller owns transport **Session Cache:** - Optional KV cache persistence to disk via `CacheManager` @@ -192,56 +193,67 @@ flowchart TD ### Streaming Weight Loading +The caller is responsible for providing the **complete** list of files in `files.model`: the `.tensors.txt` companion first, followed by every shard in ascending order (for sharded models). The addon picks the first entry matching the shard regex `/-\d+-of-\d+\.gguf$/` as the primary path (falling back to `files.model[0]` for non-sharded models). The addon does no discovery, no expansion, and no download. + ```mermaid sequenceDiagram - participant JS as JavaScript - participant WP as WeightsProvider + participant User as User code + participant LLM as LlmLlamacpp (index.js) + participant FS as bare-fs participant IF as LlamaInterface participant Bind as Native Binding participant WL as WeightsLoader participant Model as LlamaModel participant LC as llama.cpp - - JS->>WP: load() - WP->>WP: expandGGUFIntoShards() - - loop For each shard chunk - WP->>WP: Read 10MB chunk - WP->>WP: Create Uint8Array - WP->>IF: loadWeights({filename, chunk, completed: false}) - IF->>Bind: loadWeights(handle, data) - Bind->>WL: addChunk() - WL->>WL: js_create_reference (pin from GC) - WL->>WL: Store reference - Bind-->>IF: void - IF-->>WP: void + + User->>LLM: new LlmLlamacpp({ files, config, logger, opts }) + User->>LLM: load() + + Note over LLM: _load(): build configurationParams with
path = pickPrimaryGgufPath(files.model), projectionPath, config + LLM->>IF: new LlamaInterface(binding, configurationParams, outputCb) + + alt files.model.length > 1 (sharded) + loop For each filePath in files.model (in order) + LLM->>FS: createReadStream(filePath) + loop For each chunk + FS-->>LLM: chunk (Uint8Array) + LLM->>IF: loadWeights({ filename, chunk, completed: false }) + IF->>Bind: loadWeights(handle, data) + Bind->>WL: addChunk() + WL->>WL: js_create_reference (pin from GC) + WL->>WL: Store reference + Bind-->>IF: void + IF-->>LLM: void + end + LLM->>IF: loadWeights({ filename, chunk: null, completed: true }) + IF->>Bind: loadWeights(handle, final) + Bind->>WL: addChunk() + finalize + WL->>WL: Create FinalizedStream + WL->>WL: Create BlobsStream (std::streambuf) + WL->>Model: set_weights_for_file(filename, stream) + end end - - WP->>IF: loadWeights({filename, chunk, completed: true}) - IF->>Bind: loadWeights(handle, data) - Bind->>WL: addChunk() + finalize - WL->>WL: Create FinalizedStream - WL->>WL: Create BlobsStream (std::streambuf) - WL->>Model: set_weights_for_file(filename, stream) + + LLM->>IF: activate() + IF->>Bind: activate(handle) + Bind->>Model: load via llama.cpp Model->>LC: llama_model_load_from_file(stream) - + LC->>LC: Read via streambuf->sgetn() Note over LC: Zero-copy access to JS buffers LC->>LC: Parse GGUF metadata LC->>LC: Load weights LC-->>Model: Model loaded - + Model->>WL: Mark for deletion WL->>WL: Queue js_delete_reference - - Note over JS: Next API call - JS->>IF: activate() - IF->>Bind: activate(handle) - Bind->>WL: Process deletion queue + Bind->>WL: Process deletion queue (during activate) WL->>WL: js_delete_reference (unpin) - WL-->>Bind: References cleaned - - Note over JS: GC can now collect ArrayBuffers + + IF-->>LLM: activated + LLM-->>User: load() resolves + + Note over User: GC can now collect ArrayBuffers ```
@@ -251,36 +263,36 @@ sequenceDiagram | Stage | JS Buffer State | C++ Reference State | Memory Location | Notes | |-------|-----------------|---------------------|-----------------|-------| -| 1. Create | Allocated by JS | None | JS heap | Uint8Array created | +| 1. Read from disk | Allocated by bare-fs | None | JS heap | Uint8Array chunk yielded by read stream | | 2. loadWeights() | Passed to C++ | js_create_reference() | JS heap | Pinned from GC | | 3. Accumulation | Still in JS | Stored in vector | JS heap | Multiple refs held | | 4. Finalize | Still in JS | Owned by FinalizedStream | JS heap | RAII wrapper | -| 5. Loading | Still in JS | Active | JS heap | Zero-copy access | +| 5. activate() load | Still in JS | Active | JS heap | Zero-copy access from llama.cpp | | 6. Load complete | Still in JS | Marked for deletion | JS heap | Queued cleanup | -| 7. Next API call | Still in JS | js_delete_reference() | JS heap | Unpinned | +| 7. activate() returns | Still in JS | js_delete_reference() | JS heap | Unpinned during activate | | 8. After return | May be GC'd | None | Freed | Memory reclaimed | -**Sharded Model Handling:** +**Sharded Model Handling (caller-owned):** -Input: `"model-00001-of-00004.gguf"` +For a 4-shard model, the caller must pass **five** absolute paths in `files.model`, in this exact order: -Expanded to: -1. `model-00001-of-00004.gguf` -2. `model-00002-of-00004.gguf` -3. `model-00003-of-00004.gguf` -4. `model-00004-of-00004.gguf` +1. `model.tensors.txt` (companion file — **required**) +2. `model-00001-of-00004.gguf` +3. `model-00002-of-00004.gguf` +4. `model-00003-of-00004.gguf` +5. `model-00004-of-00004.gguf` -JavaScript sends each file separately. C++ concatenates into single logical stream. +`_load()` uses `pickPrimaryGgufPath(files.model)` — the first entry matching the shard regex `/-\d+-of-\d+\.gguf$/`, falling back to `files.model[0]` for non-sharded models — as the primary path passed to the native addon constructor. 
`_streamShards()` iterates over **all** entries, streaming each one via `bare-fs`. C++ concatenates them into a single logical stream per filename. **Performance:** | Operation | Duration | Memory Impact | Notes | |-----------|----------|---------------|-------| -| Create 10MB chunk | ~1ms | +10MB JS heap | Async I/O | +| bare-fs chunk | depends on FS | +chunk size in JS heap | Async I/O | | loadWeights() call | <1ms | +small C++ overhead | Non-blocking | | FinalizeStream | ~0.1ms | Transfer ownership | Zero-copy | -| llama_model_load() | Seconds | +model size in RAM | Background thread | -| Reference cleanup | <0.1ms | -10MB JS heap per chunk | Deferred to JS thread | +| llama_model_load() | Seconds | +model size in RAM | During activate() | +| Reference cleanup | <0.1ms | -chunk size per reference | During activate() |
@@ -468,5 +480,5 @@ flowchart TD **Related Documents:** - [architecture.md](architecture.md) - Complete architecture documentation -**Last Updated:** 2026-03-02 +**Last Updated:** 2026-04-16 diff --git a/packages/qvac-lib-infer-llamacpp-llm/docs/finetuning.md b/packages/qvac-lib-infer-llamacpp-llm/docs/finetuning.md index 1de3da563f..34a7e1a2f0 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/docs/finetuning.md +++ b/packages/qvac-lib-infer-llamacpp-llm/docs/finetuning.md @@ -41,7 +41,7 @@ The library supports **LoRA finetuning** of GGUF models. LoRA trains small adapt ### Architecture -Finetune and inference use the same job queue (JobRunner): both submit a job via `runJob()` and a single processing thread runs one job at a time (either inference or finetune). In JS, inference waits on `_lastJobResult`, while finetune uses `_finetuneActive` to block overlapping `run()`/`finetune()` calls; JobRunner enforces serialization on the native side. +Finetune and inference use the same job queue (JobRunner): both submit a job via `runJob()` and a single processing thread runs one job at a time (either inference or finetune). In JS, both `run()` and `finetune()` go through the same `exclusiveRunQueue` (`_run`) and share a `createJobHandler`-managed `_hasActiveResponse` flag; JobRunner enforces serialization on the native side. 1. **Model loading**: Load a base GGUF model (e.g., Qwen3-0.6B-Q8_0.gguf) with `model.load()`. 2. **Dataset preparation**: Training data is read from JSONL (chat format) or plain text files. Validation uses either a fraction of that data (when `validation.type` is `'split'`), a separate eval file (`'dataset'`), or none (`'none'`). @@ -71,11 +71,12 @@ Default (when `loraModules` is empty): attention Q, K, V, O only. ## JavaScript API -### `finetune(finetuningOptions?)` +### `finetune(finetuningOptions)` -Starts or resumes finetuning. If the model is not loaded, it will be loaded first. Finetuning runs exclusively (no concurrent inference). 
Returns a handle immediately (like `run()`); use `handle.await()` to wait for completion. If a pause checkpoint exists in `checkpointSaveDir`, training resumes from it automatically; otherwise a fresh run starts. +Starts or resumes finetuning. The model **must** already be loaded (call `load()` first); `finetune()` does not auto-load. `finetuningOptions` is required on every call — there is no in-process "stored params" state. Finetuning runs exclusively (no concurrent inference). Returns a handle immediately (like `run()`); use `handle.await()` to wait for completion. If a pause checkpoint exists in `checkpointSaveDir`, training resumes from it automatically; otherwise a fresh run starts. ```js +await model.load() const handle = await model.finetune(finetuneOptions) handle.on('stats', stats => { console.log(`epoch=${stats.current_epoch + 1} step=${stats.global_steps} loss=${stats.loss?.toFixed(4)} acc=${(stats.accuracy * 100)?.toFixed(1)}%`) @@ -88,7 +89,7 @@ const resumeResult = await resumeHandle.await() ``` - **Parameters** - - `finetuningOptions` — Object with [finetuning parameters](#finetuning-parameters). Always required. To resume after a pause, pass the same params again; the backend resumes from a pause checkpoint if one exists in `checkpointSaveDir`. **Resume contract:** call `finetune()` only after you have **awaited** `pause()`. There is no status API; await the previous command to know something is done. + - `finetuningOptions` — Object with [finetuning parameters](#finetuning-parameters). **Always required**, including on resume — pass the same params object that was used for the original run. The backend resumes from a pause checkpoint if one exists in `checkpointSaveDir`. **Resume contract:** call `finetune(finetuningOptions)` only after you have **awaited** `pause()`. There is no status API; await the previous command to know something is done. - **Returns** — `Promise`. 
The handle has `await()` — returns `Promise<{ op: 'finetune', status: 'COMPLETED' | 'PAUSED', stats?: object }>` when training completes or pauses. `stats` may include terminal metrics such as `train_loss`, `val_loss`, `learning_rate`, `global_steps`, and `epochs_completed`. Runtime failures reject `await()` (same failure path as inference) instead of resolving with an error status. - **Progress events** — if `opts.stats` is enabled, finetuning emits `stats` events on the handle with per-iteration metrics (`loss`, `accuracy`, `global_steps`, `current_epoch`, `current_batch`, `total_batches`). `global_steps` is the canonical monotonic step counter; `current_batch`/`total_batches` reflect backend ubatch indexing and may have non-sequential jumps depending on batch/microbatch configuration. @@ -105,7 +106,7 @@ Pauses finetuning and keeps pause checkpoints so the next `finetune()` call can await model.pause() ``` -**Returns** — `Promise`. Once resolved, you can call `finetune()` (no args) to resume. +**Returns** — `Promise`. Once resolved, you can call `finetune(finetuningOptions)` again — pass the same params object — and the backend will resume from the pause checkpoint in `checkpointSaveDir`. ### Stop and start fresh: `cancel()` @@ -208,7 +209,7 @@ The finetuning and pause/resume flow uses **wait conditions** and **events** onl | **Completion** | `handle.await()` resolves on finetune terminal payloads (`status: COMPLETED` or `PAUSED`) and rejects on runtime errors (`Error` event path). | | **Training started** | Event `FinetuningStarted` emitted when the first batch is processed. | | **Request pause** | Calling `pause()` during finetuning invokes `requestPause()` (sets `pauseRequested` and `llama_opt_request_stop()`). 
The binding runs `waitUntilFinetuningPauseComplete()` on a background task, blocking on a condition variable until the JobRunner thread (running the finetune job) signals pause done (checkpoint saved or save failed); the Promise resolves when that wait returns. There is a 5-minute timeout if the checkpoint save never completes. | -| **Resume** | When you call `finetune()` (with no args to use stored params), the JS calls `addon.finetune(params)`. The C++ `finetune()` checks for a pause checkpoint in `params.checkpointSaveDir`; if one exists, it calls `clearPauseRequest()` and resumes from that checkpoint. **Contract:** call `finetune()` only after you have **awaited** `pause()`. No status check in the binding. | +| **Resume** | When you call `finetune(params)` again after `pause()`, the JS calls `addon.finetune(params)`. The C++ `finetune()` checks for a pause checkpoint in `params.checkpointSaveDir`; if one exists, it calls `clearPauseRequest()` and resumes from that checkpoint. **Contract:** call `finetune(params)` only after you have **awaited** `pause()`. JS does not retain stored params — you must pass the same params object on resume. No status check in the binding. | **Wait conditions in C++:** `pauseDoneCv` / `pauseWaitDone` signal when pause has completed. `waitUntilFinetuningPauseComplete()` uses a 5-minute timeout so the caller is not blocked indefinitely if the JobRunner thread never signals. The C++ decides “resume from checkpoint” solely by checking the filesystem: at the start of `finetune(params, logCallback)` it calls `pauseCheckpointExists(params.checkpointSaveDir)`. If true, it calls `clearPauseRequest()` and then loads the latest `pause_checkpoint_step_*` directory and metadata to resume; otherwise it starts fresh. Atomic flags in `TrainingCheckpointState`: `pauseRequested`, `shouldExit`, `pauseCheckpointSaved`, `pauseWaitDone`; the pointer `currentCheckpointState_` in `LlamaModel` is also atomic. 
Together with `pauseDoneMutex` and `pauseDoneCv`, these provide thread-safe coordination between the thread waiting in `waitUntilFinetuningPauseComplete()` (from `pause()`) and the JobRunner thread running the finetune job (which checks flags, saves the checkpoint, and signals completion). @@ -216,16 +217,16 @@ The finetuning and pause/resume flow uses **wait conditions** and **events** onl | API | Backend behavior | |-----|------------------| -| **`finetune(opts?)`** | Normalizes opts (required `validation` object → `validationSplit`, `useEvalDatasetForValidation`), then calls `addon.finetune(params)`. Params come from opts or stored. C++ auto-detects resume when a pause checkpoint exists in `checkpointSaveDir`. Returns a handle; `handle.await()` resolves with terminal payload `status: COMPLETED | PAUSED`, and rejects on runtime errors. | +| **`finetune(opts)`** | Normalizes opts (required `validation` object → `validationSplit`, `useEvalDatasetForValidation`), then calls `addon.finetune(params)`. `opts` is required on every call — JS does not retain stored params. C++ auto-detects resume when a pause checkpoint exists in `checkpointSaveDir`. Returns a handle; `handle.await()` resolves with terminal payload `status: COMPLETED | PAUSED`, and rejects on runtime errors. | | **`pause()`** | During finetuning, calls C++ pause flow (`requestPause()` + `waitUntilFinetuningPauseComplete()`), which writes a pause checkpoint and resolves when the pause path completes. | | **`cancel()`** | Calls addon cancel, then removes local `pause_checkpoint_step_*` directories from `checkpointSaveDir` so the next `finetune()` starts fresh. | ### Fresh run vs resume -The choice between a **fresh run** and **resume from pause** is made in C++ inside `LlamaModel::finetune()`. The JS API exposes a single `finetune(opts?)`; resume is determined by the backend from the presence of a pause checkpoint on disk. 
There is no in-process "we were paused" state: if you restart the script and call `finetune(opts)` with the same `checkpointSaveDir`, the backend will resume from any existing pause checkpoint in that directory. +The choice between a **fresh run** and **resume from pause** is made in C++ inside `LlamaModel::finetune()`. The JS API exposes a single `finetune(opts)`; resume is determined by the backend from the presence of a pause checkpoint on disk. There is no in-process "we were paused" or "stored params" state in JS: if you restart the script and call `finetune(opts)` with the same `checkpointSaveDir`, the backend will resume from any existing pause checkpoint in that directory. - **How it’s decided:** After validating params, C++ sets `checkpointDir = params.checkpointSaveDir` (or `"./checkpoints"`) and calls `pauseCheckpointExists(checkpointDir)`. If that returns true, it calls `clearPauseRequest()` and then uses `findLatestPauseCheckpoint()` and `parseCheckpointMetadata()` to set `resumingFromPause` and load resume metadata; the rest of the function branches on `resumingFromPause` (load adapter from checkpoint vs init from params, restore step/epoch, etc.). -- **Params on resume:** The current `params` (from the call—e.g. from the original run when you call `finetune()` with no args) are used for dataset paths, `numberOfEpochs`, learning rate, scheduler, checkpoint dir, and so on. The checkpoint supplies the **position** (epoch, globalStep, currentStep, resumeEpoch, resumeBatch, pausedDuringValidation) and saved LoRA layout (targetModules, loraRank, loraAlpha); `loraInitStd` comes from `params`. +- **Params on resume:** The `params` you pass on the resume call are used for dataset paths, `numberOfEpochs`, learning rate, scheduler, checkpoint dir, and so on — pass the same object you used for the original run. 
The checkpoint supplies the **position** (epoch, globalStep, currentStep, resumeEpoch, resumeBatch, pausedDuringValidation) and saved LoRA layout (targetModules, loraRank, loraAlpha); `loraInitStd` comes from `params`. ### UML: finetune and pause flow (JS → C++) @@ -246,13 +247,13 @@ sequenceDiagram participant Helpers as LlamaFinetuningHelpers participant Queue as outputQueue - User->>LlamaModel: finetune(opts) or finetune() (no args → stored params) - LlamaModel->>LlamaModel: _finetuneActive check, store params, normalize opts (validation object required; dataset requires validation.path; emits validationSplit/useEvalDatasetForValidation/evalDatasetPath) + User->>LlamaModel: finetune(opts) (opts always required, including on resume) + LlamaModel->>LlamaModel: enqueue via exclusiveRunQueue and normalize opts (validation object required, dataset needs validation.path, emits flat validationSplit/useEvalDatasetForValidation/evalDatasetPath) LlamaModel->>Addon: finetune(params) Addon->>Binding: _binding.finetune(handle, params) Binding->>AddonJs: finetune(env, info) - AddonJs->>AddonJs: JsInterface.getInstance, getLlamaModel(instance); tryGetObject for params; build Prompt with finetuningParams and outputCallback + AddonJs->>AddonJs: JsInterface.getInstance, getLlamaModel(instance), tryGetObject for params, build Prompt with finetuningParams and outputCallback AddonJs->>AddonCpp: runJob(any(prompt)) AddonCpp->>JobRunner: runJob(any) JobRunner->>LlamaModelCpp: process(job) → branch on finetuningParams → finetune(params, outputCallback) @@ -262,7 +263,7 @@ sequenceDiagram LlamaModel-->>User: handle { await() } Note over JobRunner,LlamaModelCpp: Finetune runs in JobRunner thread (same as inference) - LlamaModelCpp->>LlamaModelCpp: pauseCheckpointExists(checkpointDir)? clearPauseRequest(); resume or fresh path + LlamaModelCpp->>LlamaModelCpp: pauseCheckpointExists(checkpointDir)? 
clearPauseRequest() — resume or fresh path LlamaModelCpp->>Helpers: prepareTrainingDataset, training loop loop each batch / completion Helpers->>LlamaModelCpp: logCallback(msg) for progress @@ -270,7 +271,7 @@ sequenceDiagram end LlamaModelCpp->>Queue: queueJobEnded({ op:'finetune', status, stats? }) Queue->>LlamaModel: _addonOutputCallback(...) -> _outputCallback(..., 'JobEnded', 'OnlyOneJob', data) - LlamaModel->>LlamaModel: BaseInference routes JobEnded to QvacResponse.ended(data) + LlamaModel->>LlamaModel: _handleAddonOutputEvent routes JobEnded to active QvacResponse via createJobHandler LlamaModel->>User: handle.await() resolves with { op:'finetune', status:'COMPLETED'|'PAUSED', stats? } (errors reject) ``` @@ -292,7 +293,7 @@ sequenceDiagram Addon->>Binding: _binding.cancel(handle) Binding->>AddonJs: qvac_lib_inference_addon_llama::cancel(env, info) - AddonJs->>AddonJs: JsInterface.getInstance, getLlamaModel(instance); isFinetuneRunning()? + AddonJs->>AddonJs: JsInterface.getInstance, getLlamaModel(instance), isFinetuneRunning check AddonJs->>LlamaModelCpp: llamaModel->requestPause() LlamaModelCpp->>LlamaModelCpp: currentCheckpointState_->pauseRequested.store(true) LlamaModelCpp->>LlamaModelCpp: llama_opt_request_stop(ctx) @@ -324,7 +325,7 @@ sequenceDiagram | Layer | Component | Role | |-------|-----------|------| -| JS | `index.js` → `LlmLlamacpp` | Public API: `finetune()`, `pause()`, `cancel()`. `pause()` requests a resumable stop; `cancel()` stops and removes `pause_checkpoint_step_*` directories for a fresh next run. Normalizes opts: requires `validation`, rejects top-level `evalDatasetPath`, maps dataset validation to `evalDatasetPath`, and emits `validationSplit` / `useEvalDatasetForValidation` before calling addon. Uses `_finetuneActive` and `QvacResponse` (`OnlyOneJob`) for lifecycle; `_addonOutputCallback` maps terminal finetune payloads to `JobEnded`. | +| JS | `index.js` → `LlmLlamacpp` | Public API: `finetune()`, `pause()`, `cancel()`. 
`pause()` requests a resumable stop; `cancel()` stops and removes `pause_checkpoint_step_*` directories for a fresh next run. Normalizes opts: requires `validation`, rejects top-level `evalDatasetPath`, maps dataset validation to `evalDatasetPath`, and emits `validationSplit` / `useEvalDatasetForValidation` before calling addon. Serializes public API via `exclusiveRunQueue` (`_run`) and tracks the active job via `createJobHandler` (`_job`) plus `_hasActiveResponse`; `_addonOutputCallback` maps terminal finetune payloads to `JobEnded` on the active `QvacResponse`. | | JS | `addon.js` → `LlamaInterface` | Thin wrapper: `finetune(params)` → `_binding.finetune(handle, params)`, `cancel()` → `_binding.cancel(handle)` (used by both `pause()` and `cancel()` in JS). | | C++ | `binding.cpp` | BARE exports: `finetune`, `cancel` → `qvac_lib_inference_addon_llama::*`. | | C++ | `AddonJs.hpp` | Parses JS args, gets `LlamaModel*` via `getLlamaModel(instance)`; `tryGetObject()` for params; builds `Prompt` with `finetuningParams` and `outputCallback`, calls `addonCpp->runJob(any(prompt))` (same path as inference). C++ auto-detects resume via `pauseCheckpointExists(checkpointSaveDir)`. `cancel()`: if `isFinetuneRunning()` then `requestPause()` + `JsAsyncTask::run(waitUntilFinetuningPauseComplete)`, else `cancelJob()`; always returns Promise via `JsAsyncTask::run`. | @@ -362,7 +363,7 @@ The finetuning backend lives in `addon/src/` and uses the llama.cpp optimizer AP 7. **Pause request path** — `requestPause()`: if `currentCheckpointState_` (atomic, per instance) is non-null, sets `pauseRequested.store(true)` and `llama_opt_request_stop(ctx)`; returns immediately. Returns `false` if no checkpoint state exists (e.g. training not started yet). 8. **Completion** — On normal finish: `saveLoraAdapter()` writes the final LoRA to `outputParametersDir` and finetune ends as `COMPLETED`. On pause: terminal status is `PAUSED`. 
On runtime error: C++ throws; JS receives an `Error` event and `handle.await()` rejects. -**Wait conditions and internal state** — `TrainingCheckpointState` holds atomic flags `pauseRequested`, `shouldExit`, `pauseCheckpointSaved`, `pauseWaitDone` and the wait condition `pauseDoneCv` / `pauseDoneMutex`. When `pause()` is called during finetuning, `requestPause()` sets `pauseRequested` and a background task runs `waitUntilFinetuningPauseComplete()`, which blocks on `pauseDoneCv` until the JobRunner thread (running the finetune job) saves the checkpoint and sets `pauseWaitDone`; this gives thread-safe coordination between the two. The binding does not read status (e.g. `isPaused`); resume is driven by calling `finetune()` after awaiting `pause()`; C++ auto-detects a pause checkpoint in `checkpointSaveDir` and resumes. Multiple model instances work correctly (per-instance state, thread-local callback state). Calling `cancel()` uses the same addon cancel entrypoint, then clears pause checkpoints on the JS side to force a fresh subsequent run. +**Wait conditions and internal state** — `TrainingCheckpointState` holds atomic flags `pauseRequested`, `shouldExit`, `pauseCheckpointSaved`, `pauseWaitDone` and the wait condition `pauseDoneCv` / `pauseDoneMutex`. When `pause()` is called during finetuning, `requestPause()` sets `pauseRequested` and a background task runs `waitUntilFinetuningPauseComplete()`, which blocks on `pauseDoneCv` until the JobRunner thread (running the finetune job) saves the checkpoint and sets `pauseWaitDone`; this gives thread-safe coordination between the two. The binding does not read status (e.g. `isPaused`); resume is driven by calling `finetune(params)` again after awaiting `pause()` — JS does not retain stored params, so the same params object must be passed on resume. C++ auto-detects a pause checkpoint in `checkpointSaveDir` and resumes. Multiple model instances work correctly (per-instance state, thread-local callback state). 
Calling `cancel()` uses the same addon cancel entrypoint, then clears pause checkpoints on the JS side to force a fresh subsequent run. --- @@ -405,28 +406,24 @@ Minimal example: load model, run finetuning, wait for completion. 'use strict' const LlmLlamacpp = require('@qvac/llm-llamacpp') -const FilesystemDL = require('@qvac/dl-filesystem') const path = require('bare-path') async function main() { const modelDir = path.resolve('./models') - const loader = new FilesystemDL({ dirPath: modelDir }) - - const model = new LlmLlamacpp( - { - loader, - opts: { stats: true }, - logger: console, - diskPath: modelDir, - modelName: 'Qwen3-0.6B-Q8_0.gguf' + + const model = new LlmLlamacpp({ + files: { + model: [path.join(modelDir, 'Qwen3-0.6B-Q8_0.gguf')] }, - { + config: { gpu_layers: '999', ctx_size: '512', device: 'gpu', flash_attn: 'off' - } - ) + }, + opts: { stats: true }, + logger: console + }) await model.load() @@ -456,7 +453,7 @@ main().catch(console.error) ### 2. Pause and Resume -Start finetuning, wait for training to begin (e.g. fixed sleep), call `pause()`, then resume and wait for completion. After `pause()` resolves you can call `finetune()` (no args). +Start finetuning, wait for training to begin (e.g. fixed sleep), call `pause()`, then resume by calling `finetune(finetuneOptions)` again with the same params object and wait for completion. **Run:** `bare examples/simple-lora-finetune-pause-resume.js` @@ -491,7 +488,13 @@ const config = { lora: './finetuned-model-direct/trained-lora-adapter.gguf' } -const model = new LlmLlamacpp(args, config) +const model = new LlmLlamacpp({ + files: { + model: [path.join(modelDir, 'Qwen3-0.6B-Q8_0.gguf')] + }, + config, + logger: console +}) await model.load() const messages = [ @@ -555,7 +558,7 @@ Each checkpoint directory typically contains: ### Resume from Pause -Call `finetune()` (no args) to resume. 
The addon finds the latest `pause_checkpoint_step_*` in `checkpointSaveDir` and continues training from there, reusing the stored finetuning parameters. The pause checkpoint metadata includes explicit resume cursor fields (`resume_epoch`, `resume_batch`) which are passed directly to the backend's `llama_opt_epoch_resume()`, so training resumes at the exact saved position without deriving it from step counters. If paused during validation, resume starts at the next epoch. **Checkpoint lifecycle:** After loading a pause checkpoint to resume, the backend removes that checkpoint directory so the same run does not resume from it again. When training completes successfully (COMPLETED), any remaining pause checkpoint in `checkpointSaveDir` is also cleared. Pause checkpoints remain on disk only while training is paused (after `pause()` and before the next `finetune()`), unless `cancel()` is called, which clears them. +Call `finetune(finetuningOptions)` again with the same params object to resume. The addon finds the latest `pause_checkpoint_step_*` in `checkpointSaveDir` and continues training from there. JS does not retain a stored params object, so the caller must pass the same params on every call (including resume). The pause checkpoint metadata includes explicit resume cursor fields (`resume_epoch`, `resume_batch`) which are passed directly to the backend's `llama_opt_epoch_resume()`, so training resumes at the exact saved position without deriving it from step counters. If paused during validation, resume starts at the next epoch. **Checkpoint lifecycle:** After loading a pause checkpoint to resume, the backend removes that checkpoint directory so the same run does not resume from it again. When training completes successfully (COMPLETED), any remaining pause checkpoint in `checkpointSaveDir` is also cleared. 
Pause checkpoints remain on disk only while training is paused (after `pause()` and before the next `finetune()`), unless `cancel()` is called, which clears them. --- diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/benchToolsPlacement.js b/packages/qvac-lib-infer-llamacpp-llm/examples/benchToolsPlacement.js index 7a9096c555..407a90a70f 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/benchToolsPlacement.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/benchToolsPlacement.js @@ -1,7 +1,6 @@ 'use strict' const LlmLlamacpp = require('../index') -const FilesystemDL = require('@qvac/dl-filesystem') const path = require('bare-path') const fs = require('bare-fs') const process = require('bare-process') @@ -213,16 +212,15 @@ function makeBaseConfig (toolsAtEnd) { } async function loadModel (dirPath, modelName, config) { - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const model = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config, logger: console, opts: { stats: true } - }, config) + }) await model.load() - return { model, loader } + return { model } } async function runAndCollect (model, prompt, runOptions) { @@ -255,7 +253,7 @@ async function runScenario (dirPath, modelName, opts) { console.log('='.repeat(70)) const config = makeBaseConfig(toolsAtEnd) - const { model, loader } = await loadModel(dirPath, modelName, config) + const { model } = await loadModel(dirPath, modelName, config) const cachePath = path.join(dirPath, cacheName) cleanCache(cachePath) @@ -340,7 +338,6 @@ async function runScenario (dirPath, modelName, opts) { } } finally { await model.unload() - await loader.close() cleanCache(cachePath) } diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/showcase/smart-home-finetune.js b/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/showcase/smart-home-finetune.js index 3fc6964c86..e71d3b81ad 100644 --- 
a/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/showcase/smart-home-finetune.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/showcase/smart-home-finetune.js @@ -1,7 +1,7 @@ 'use strict' const LlamaClient = require('../../../index') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const process = require('bare-process') const { downloadModel, formatProgress, createFilteredLogger } = require('../../utils') @@ -20,7 +20,6 @@ const OUTPUT_DIR = './smart-home-lora' async function main () { let client - let loader const { logger: filteredLogger, restore: restoreConsole } = createFilteredLogger() @@ -33,15 +32,7 @@ async function main () { const [modelName, modelDir] = await downloadModel(MODEL.url, MODEL.name) - loader = new FilesystemDL({ dirPath: modelDir }) - - const args = { - loader, - opts: { stats: true }, - logger: filteredLogger, - diskPath: modelDir, - modelName - } + const modelPath = path.join(modelDir, modelName) const config = { gpu_layers: '999', @@ -50,7 +41,12 @@ async function main () { flash_attn: 'off' } - client = new LlamaClient(args, config) + client = new LlamaClient({ + files: { model: [modelPath] }, + config, + logger: filteredLogger, + opts: { stats: true } + }) await client.load() console.log('Model loaded.\n') @@ -119,13 +115,6 @@ async function main () { console.error('Failed to unload model during cleanup:', unloadErr) } } - if (loader) { - try { - await loader.close() - } catch (closeErr) { - console.error('Failed to close loader during cleanup:', closeErr) - } - } } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/showcase/smart-home-finetuned-test.js b/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/showcase/smart-home-finetuned-test.js index a94139c3fa..1ab6c77bae 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/showcase/smart-home-finetuned-test.js +++ 
b/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/showcase/smart-home-finetuned-test.js @@ -1,7 +1,6 @@ 'use strict' const LlamaClient = require('../../../index') -const FilesystemDL = require('@qvac/dl-filesystem') const process = require('bare-process') const path = require('bare-path') const fs = require('bare-fs') @@ -143,15 +142,7 @@ async function runScenario (client, messages) { async function main () { const [modelName, modelDir] = await downloadModel(MODEL.url, MODEL.name) - const loader = new FilesystemDL({ dirPath: modelDir }) - - const args = { - loader, - opts: { stats: true }, - logger: console, - diskPath: modelDir, - modelName - } + const modelPath = path.join(modelDir, modelName) const sharedConfig = { device: 'gpu', @@ -184,7 +175,12 @@ async function main () { console.log(' Model: ' + MODEL.name) console.log(separator('=')) - baselineClient = new LlamaClient(args, baselineConfig) + baselineClient = new LlamaClient({ + files: { model: [modelPath] }, + config: baselineConfig, + logger: console, + opts: { stats: true } + }) await baselineClient.load() console.log('Base model loaded (no LoRA).\n') @@ -263,7 +259,12 @@ async function main () { console.log(' Adapter: ' + LORA_ADAPTER) console.log(separator('=')) - client = new LlamaClient(args, config) + client = new LlamaClient({ + files: { model: [modelPath] }, + config, + logger: console, + opts: { stats: true } + }) await client.load() console.log('Model + LoRA adapter loaded.\n') @@ -408,8 +409,6 @@ async function main () { console.error('\nTest failed:', error.message) console.error('Stack:', error.stack) process.exit(1) - } finally { - try { await loader.close() } catch (e) { console.error('Failed to close loader:', e) } } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune-multiple-pause-resume.js b/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune-multiple-pause-resume.js index b67ca22f9b..52ed12d2bd 100644 --- 
a/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune-multiple-pause-resume.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune-multiple-pause-resume.js @@ -1,7 +1,6 @@ 'use strict' const LlamaClient = require('../../index') -const FilesystemDL = require('@qvac/dl-filesystem') const process = require('bare-process') const path = require('bare-path') const fs = require('bare-fs') @@ -52,18 +51,10 @@ async function main () { const trainDatasetPath = './examples/input/small_train_HF.jsonl' const evalDatasetPath = './examples/input/small_eval_HF.jsonl' - const loader = new FilesystemDL({ dirPath: modelDir }) + const modelPath = path.join(modelDir, modelName) const { logger: filteredLogger, restore: restoreConsole } = createFilteredLogger() - const args = { - loader, - opts: { stats: true }, - logger: filteredLogger, - diskPath: modelDir, - modelName - } - const config = { device: 'gpu', gpu_layers: '999', @@ -76,7 +67,12 @@ async function main () { try { console.log('=== Multiple Pause/Resume Finetuning Test ===\n') console.log('Loading model...') - client = new LlamaClient(args, config) + client = new LlamaClient({ + files: { model: [modelPath] }, + config, + logger: filteredLogger, + opts: { stats: true } + }) await client.load() console.log('Model loaded successfully\n') diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune-pause-inference-resume.js b/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune-pause-inference-resume.js index 5448436f43..df784f4385 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune-pause-inference-resume.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune-pause-inference-resume.js @@ -1,7 +1,6 @@ 'use strict' const LlamaClient = require('../../index') -const FilesystemDL = require('@qvac/dl-filesystem') const process = require('bare-process') const path = 
require('bare-path') const fs = require('bare-fs') @@ -76,18 +75,10 @@ async function main () { const trainDatasetPath = './examples/input/small_train_HF.jsonl' const evalDatasetPath = './examples/input/small_eval_HF.jsonl' - const loader = new FilesystemDL({ dirPath: modelDir }) + const modelPath = path.join(modelDir, modelName) const { logger: filteredLogger, restore: restoreConsole } = createFilteredLogger() - const args = { - loader, - opts: { stats: true }, - logger: filteredLogger, - diskPath: modelDir, - modelName - } - const config = { device: 'gpu', gpu_layers: '999', @@ -100,7 +91,12 @@ async function main () { try { console.log('=== Pause Finetuning, Inference, and Resume Test ===\n') console.log('Loading model...') - client = new LlamaClient(args, config) + client = new LlamaClient({ + files: { model: [modelPath] }, + config, + logger: filteredLogger, + opts: { stats: true } + }) await client.load() console.log('Model loaded successfully\n') @@ -219,7 +215,12 @@ async function main () { } console.log('🔮 Preparing inference 1: Loading model with LoRA adapter...') - inferenceClientWithLora = new LlamaClient(args, inferenceConfigWithLora) + inferenceClientWithLora = new LlamaClient({ + files: { model: [modelPath] }, + config: inferenceConfigWithLora, + logger: filteredLogger, + opts: { stats: true } + }) await inferenceClientWithLora.load() console.log('✅ Model with LoRA adapter loaded successfully\n') @@ -246,7 +247,12 @@ async function main () { } console.log('🔮 Preparing inference 2: Loading base model (no LoRA adapters)...') - inferenceClientBase = new LlamaClient(args, inferenceConfigBase) + inferenceClientBase = new LlamaClient({ + files: { model: [modelPath] }, + config: inferenceConfigBase, + logger: filteredLogger, + opts: { stats: true } + }) await inferenceClientBase.load() console.log('✅ Base model loaded successfully\n') diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune-pause-resume.js 
b/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune-pause-resume.js index 4fa2d1eb70..33e817ff13 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune-pause-resume.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune-pause-resume.js @@ -1,7 +1,6 @@ 'use strict' const LlamaClient = require('../../index') -const FilesystemDL = require('@qvac/dl-filesystem') const process = require('bare-process') const path = require('bare-path') const fs = require('bare-fs') @@ -58,18 +57,10 @@ async function main () { const trainDatasetPath = './examples/input/small_train_HF.jsonl' const evalDatasetPath = './examples/input/small_eval_HF.jsonl' - const loader = new FilesystemDL({ dirPath: modelDir }) + const modelPath = path.join(modelDir, modelName) const { logger: filteredLogger, restore: restoreConsole } = createFilteredLogger() - const args = { - loader, - opts: { stats: true }, - logger: filteredLogger, - diskPath: modelDir, - modelName - } - const config = { device: 'gpu', gpu_layers: '999', @@ -82,7 +73,12 @@ async function main () { try { console.log('=== Pause/Resume Finetuning Test ===\n') console.log('Loading model...') - client = new LlamaClient(args, config) + client = new LlamaClient({ + files: { model: [modelPath] }, + config, + logger: filteredLogger, + opts: { stats: true } + }) await client.load() console.log('Model loaded successfully\n') diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune.js b/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune.js index 1116e6a119..c1ff774e7a 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune.js @@ -1,7 +1,7 @@ 'use strict' const LlmLlamacpp = require('../../index') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const 
{ downloadModel, formatProgress, createFilteredLogger } = require('../utils') const MODEL = { @@ -11,22 +11,13 @@ const MODEL = { async function runFinetuningTests () { let model - let loader const { logger: filteredLogger, restore: restoreConsole } = createFilteredLogger() try { const [modelName, modelDir] = await downloadModel(MODEL.url, MODEL.name) - loader = new FilesystemDL({ dirPath: modelDir }) - - const args = { - loader, - opts: { stats: true }, - logger: filteredLogger, - diskPath: modelDir, - modelName - } + const modelPath = path.join(modelDir, modelName) const config = { gpu_layers: '999', @@ -35,7 +26,12 @@ async function runFinetuningTests () { flash_attn: 'off' } - model = new LlmLlamacpp(args, config) + model = new LlmLlamacpp({ + files: { model: [modelPath] }, + config, + logger: filteredLogger, + opts: { stats: true } + }) await model.load() const finetuneOptions = { @@ -57,12 +53,10 @@ async function runFinetuningTests () { }) const finetuneResult = await handle.await() console.log('Finetune completed:', finetuneResult) - if (args.opts?.stats) { - if (finetuneResult && typeof finetuneResult.stats === 'object' && finetuneResult.stats !== null) { - console.log('✅ Finetune terminal stats:', finetuneResult.stats) - } else { - console.warn('⚠️ opts.stats is enabled, but no finetune terminal stats were returned') - } + if (finetuneResult && typeof finetuneResult.stats === 'object' && finetuneResult.stats !== null) { + console.log('✅ Finetune terminal stats:', finetuneResult.stats) + } else { + console.warn('⚠️ opts.stats is enabled, but no finetune terminal stats were returned') } } catch (error) { console.error('Test failed:', error.message) @@ -77,13 +71,6 @@ async function runFinetuningTests () { console.error('Failed to unload model during cleanup:', unloadErr) } } - if (loader) { - try { - await loader.close() - } catch (closeErr) { - console.error('Failed to close loader during cleanup:', closeErr) - } - } } } diff --git 
a/packages/qvac-lib-infer-llamacpp-llm/examples/multiCache.js b/packages/qvac-lib-infer-llamacpp-llm/examples/multiCache.js index 96ff7373f7..2041d82395 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/multiCache.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/multiCache.js @@ -1,7 +1,7 @@ 'use strict' const LlmLlamacpp = require('../index') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const process = require('bare-process') const { downloadModel } = require('./utils') @@ -15,17 +15,8 @@ async function main () { 'Llama-3.2-1B-Instruct-Q4_0.gguf' ) - // 2. Initializing data loader - const fsDL = new FilesystemDL({ dirPath }) - - // 3. Configuring model settings - const args = { - loader: fsDL, - opts: { stats: true }, - logger: console, - diskPath: dirPath, - modelName - } + // 2. Configuring model settings + const modelPath = path.join(dirPath, modelName) const config = { device: 'gpu', @@ -33,12 +24,17 @@ async function main () { ctx_size: '10000' } - // 4. Loading model - const model = new LlmLlamacpp(args, config) + // 3. Loading model + const model = new LlmLlamacpp({ + files: { model: [modelPath] }, + config, + logger: console, + opts: { stats: true } + }) await model.load() try { - // 5. First conversation - no cache will be used. One shot inference + // 4. First conversation - no cache will be used. One shot inference const messages = [ { role: 'system', @@ -73,7 +69,7 @@ async function main () { console.log(`Inference stats: ${JSON.stringify(response1.stats)}`) console.log('\n') - // 6. Switching to a new session with cache1.bin file + // 5. Switching to a new session with cache1.bin file const messages2 = [ { role: 'user', @@ -96,7 +92,7 @@ async function main () { console.log(`Inference stats: ${JSON.stringify(response2.stats)}`) console.log('\n') - // 7. Continuing conversation with cache1.bin + // 6. 
Continuing conversation with cache1.bin const messages3 = [ { role: 'user', @@ -123,9 +119,8 @@ async function main () { console.error('Error occurred:', errorMessage) console.error('Error details:', error) } finally { - // 8. Cleaning up resources + // 7. Cleaning up resources await model.unload() - await fsDL.close() } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/multiModal.js b/packages/qvac-lib-infer-llamacpp-llm/examples/multiModal.js index 2fbfff85b4..d3a666e857 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/multiModal.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/multiModal.js @@ -1,7 +1,7 @@ 'use strict' const LlmLlamacpp = require('../index') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const fs = require('bare-fs') const process = require('bare-process') const { downloadModel } = require('./utils') @@ -16,23 +16,13 @@ async function main () { 'SmolVLM2-500M-Video-Instruct-Q8_0.gguf' ) - const [projectionModel] = await downloadModel( + const [projModelName] = await downloadModel( 'https://huggingface.co/ggml-org/SmolVLM2-500M-Video-Instruct-GGUF/resolve/main/mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf', 'mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf' ) - // 2. Initializing data loader - const fsDL = new FilesystemDL({ dirPath }) - - // 3. Configuring model settings - const args = { - loader: fsDL, - opts: { stats: true }, - logger: console, - diskPath: dirPath, - modelName, - projectionModel - } + // 2. Configuring model settings + const modelPath = path.join(dirPath, modelName) const config = { device: 'gpu', @@ -40,16 +30,21 @@ async function main () { ctx_size: '2048' } - // 4. Loading model - const model = new LlmLlamacpp(args, config) + // 3. Loading model + const model = new LlmLlamacpp({ + files: { model: [modelPath], projectionModel: path.join(dirPath, projModelName) }, + config, + logger: console, + opts: { stats: true } + }) await model.load() - // 5. 
Preparing media. We will use both the path and the buffer in different inferences + // 4. Preparing media. We will use both the path and the buffer in different inferences const imageFilePath = 'media/news-paper.jpg' const imageBuffer = new Uint8Array(fs.readFileSync(imageFilePath)) try { - // 6. First inference with image buffer + // 5. First inference with image buffer (Uint8Array) const messages1 = [ { role: 'system', @@ -81,7 +76,7 @@ async function main () { console.log(`Inference stats: ${JSON.stringify(response1.stats)}`) console.log('\n') - // 7. Second inference with image file path + // 6. Second inference with image file path (string) const messages2 = [ { role: 'system', @@ -94,7 +89,7 @@ async function main () { }, { role: 'user', - content: 'what is in the image?' + content: 'Describe the image in one sentence.' } ] @@ -117,9 +112,8 @@ async function main () { console.error('Error occurred:', errorMessage) console.error('Error details:', error) } finally { - // 8. Cleaning up resources + // 7. Cleaning up resources await model.unload() - await fsDL.close() } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/nativelog.js b/packages/qvac-lib-infer-llamacpp-llm/examples/nativelog.js index f0458cf788..ecdba6aa43 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/nativelog.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/nativelog.js @@ -1,8 +1,8 @@ 'use strict' const LlmLlamacpp = require('../index') -const FilesystemDL = require('@qvac/dl-filesystem') const { setLogger, releaseLogger } = require('../addonLogging') +const path = require('bare-path') const process = require('bare-process') const { downloadModel } = require('./utils') @@ -36,17 +36,8 @@ async function main () { 'Llama-3.2-1B-Instruct-Q4_0.gguf' ) - // 3. Initializing data loader - const fsDL = new FilesystemDL({ dirPath }) - - // 4. 
Configuring model settings - const args = { - loader: fsDL, - opts: { stats: true }, - logger: console, - diskPath: dirPath, - modelName - } + // 3. Configuring model settings + const modelPath = path.join(dirPath, modelName) const config = { device: 'gpu', @@ -55,12 +46,17 @@ async function main () { verbosity: '2' } - // 5. Loading model - const model = new LlmLlamacpp(args, config) + // 4. Loading model + const model = new LlmLlamacpp({ + files: { model: [modelPath] }, + config, + logger: console, + opts: { stats: true } + }) await model.load() try { - // 6. Running inference with conversation prompt + // 5. Running inference with conversation prompt const prompt = [ { role: 'system', @@ -98,9 +94,8 @@ async function main () { console.error('Error occurred:', errorMessage) console.error('Error details:', error) } finally { - // 7. Cleaning up resources + // 6. Cleaning up resources await model.unload() - await fsDL.close() releaseLogger() } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/quickstart.js b/packages/qvac-lib-infer-llamacpp-llm/examples/quickstart.js index 551c867abe..09a664945d 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/quickstart.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/quickstart.js @@ -1,7 +1,7 @@ 'use strict' const LlmLlamacpp = require('../index') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const process = require('bare-process') const { downloadModel } = require('./utils') @@ -15,17 +15,8 @@ async function main () { 'Llama-3.2-1B-Instruct-Q4_0.gguf' ) - // 2. Initializing data loader - const fsDL = new FilesystemDL({ dirPath }) - - // 3. Configuring model settings - const args = { - loader: fsDL, - opts: { stats: true }, - logger: console, - diskPath: dirPath, - modelName - } + // 2. 
Configuring model settings + const modelPath = path.join(dirPath, modelName) const config = { device: 'gpu', @@ -33,12 +24,17 @@ async function main () { ctx_size: '1024' } - // 4. Loading model - const model = new LlmLlamacpp(args, config) + // 3. Loading model + const model = new LlmLlamacpp({ + files: { model: [modelPath] }, + config, + logger: console, + opts: { stats: true } + }) await model.load() try { - // 5. Running inference with conversation prompt + // 4. Running inference with conversation prompt const prompt = [ { role: 'system', @@ -76,9 +72,8 @@ async function main () { console.error('Error occurred:', errorMessage) console.error('Error details:', error) } finally { - // 6. Cleaning up resources + // 5. Cleaning up resources await model.unload() - await fsDL.close() } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/salamandraTA.js b/packages/qvac-lib-infer-llamacpp-llm/examples/salamandraTA.js index 81b88e3599..56136355b7 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/salamandraTA.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/salamandraTA.js @@ -1,7 +1,7 @@ 'use strict' const LlmLlamacpp = require('../index') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const process = require('bare-process') const { downloadModel } = require('./utils') @@ -15,17 +15,8 @@ async function main () { 'salamandrata_2b_inst_q4.gguf' ) - // 2. Initializing data loader - const fsDL = new FilesystemDL({ dirPath }) - - // 3. Configuring model settings - const args = { - loader: fsDL, - opts: { stats: true }, - logger: console, - diskPath: dirPath, - modelName - } + // 2. Configuring model settings + const modelPath = path.join(dirPath, modelName) const config = { device: 'gpu', @@ -33,12 +24,17 @@ async function main () { ctx_size: '1024' } - // 4. Loading model - const model = new LlmLlamacpp(args, config) + // 3. 
Loading model + const model = new LlmLlamacpp({ + files: { model: [modelPath] }, + config, + logger: console, + opts: { stats: true } + }) await model.load() try { - // 5. Running translation inference + // 4. Running translation inference const messages = [ { role: 'system', @@ -64,9 +60,8 @@ async function main () { console.error('Error occurred:', errorMessage) console.error('Error details:', error) } finally { - // 6. Cleaning up resources + // 5. Cleaning up resources await model.unload() - await fsDL.close() } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/simple-lora-inference.js b/packages/qvac-lib-infer-llamacpp-llm/examples/simple-lora-inference.js index 01ceaf97bb..437dfd2927 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/simple-lora-inference.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/simple-lora-inference.js @@ -1,7 +1,6 @@ 'use strict' const LlamaClient = require('../index') -const FilesystemDL = require('@qvac/dl-filesystem') const process = require('bare-process') const path = require('bare-path') const fs = require('bare-fs') @@ -112,15 +111,7 @@ async function main () { const loraAdapterPath = './lora_checkpoints/checkpoint_step_00000006/model.gguf' - const loader = new FilesystemDL({ dirPath: modelDir }) - - const args = { - loader, - opts: { stats: true }, - logger: console, - diskPath: modelDir, - modelName - } + const modelPath = path.join(modelDir, modelName) const config = { device: 'gpu', @@ -133,7 +124,12 @@ async function main () { let client try { - client = new LlamaClient(args, config) + client = new LlamaClient({ + files: { model: [modelPath] }, + config, + logger: console, + opts: { stats: true } + }) await client.load() const messages = [ diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/testToolRemoval.js b/packages/qvac-lib-infer-llamacpp-llm/examples/testToolRemoval.js index 85527f8d5c..650a72784a 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/testToolRemoval.js +++ 
b/packages/qvac-lib-infer-llamacpp-llm/examples/testToolRemoval.js @@ -1,7 +1,6 @@ 'use strict' const LlmLlamacpp = require('../index') -const FilesystemDL = require('@qvac/dl-filesystem') const path = require('bare-path') const fs = require('bare-fs') const process = require('bare-process') @@ -65,16 +64,15 @@ function extractToolCalls (response) { } async function loadModel (dirPath, modelName, config) { - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const model = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config, logger: console, opts: { stats: true } - }, config) + }) await model.load() - return { model, loader } + return { model } } async function runAndCollect (model, prompt, runOptions) { @@ -102,7 +100,7 @@ async function main () { tools_at_end: 'true' } - const { model, loader } = await loadModel(dirPath, modelName, config) + const { model } = await loadModel(dirPath, modelName, config) const cachePath = path.join(dirPath, 'test-tool-removal.bin') try { fs.unlinkSync(cachePath) } catch (_) {} @@ -195,7 +193,6 @@ async function main () { : ' FAILURES DETECTED — removed tools leaked through the cache') } finally { await model.unload() - await loader.close() try { fs.unlinkSync(cachePath) } catch (_) {} } } @@ -221,7 +218,7 @@ async function mainInSystem () { tools_at_end: 'false' } - const { model, loader } = await loadModel(dirPath, modelName, config) + const { model } = await loadModel(dirPath, modelName, config) const cachePath = path.join(dirPath, 'test-tool-removal-insystem.bin') try { fs.unlinkSync(cachePath) } catch (_) {} @@ -320,7 +317,6 @@ async function mainInSystem () { : ' FAILURES DETECTED — removed tools leaked from conversation history') } finally { await model.unload() - await loader.close() try { fs.unlinkSync(cachePath) } catch (_) {} } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/toolCalling.js 
b/packages/qvac-lib-infer-llamacpp-llm/examples/toolCalling.js index bcbe1c604c..578d9bbdb9 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/toolCalling.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/toolCalling.js @@ -1,7 +1,7 @@ 'use strict' const LlmLlamacpp = require('../index') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const process = require('bare-process') const { downloadModel } = require('./utils') @@ -80,17 +80,8 @@ async function main () { 'Qwen3-1.7B-Q4_0.gguf' ) - // 2. Initializing data loader - const fsDL = new FilesystemDL({ dirPath }) - - // 3. Configuring model settings - const args = { - loader: fsDL, - opts: { stats: true }, - logger: console, - diskPath: dirPath, - modelName - } + // 2. Configuring model settings + const modelPath = path.join(dirPath, modelName) const config = { device: 'gpu', @@ -99,12 +90,17 @@ async function main () { tools: 'true' } - // 4. Loading model - const model = new LlmLlamacpp(args, config) + // 3. Loading model + const model = new LlmLlamacpp({ + files: { model: [modelPath] }, + config, + logger: console, + opts: { stats: true } + }) await model.load() try { - // 5. Defining tool queries with function schemas + // 4. Defining tool queries with function schemas const systemMessageAmbiguous = { role: 'system', content: 'You are a helpful assistant with access to various tools. If request is ambiguous,skip tool calls.' @@ -276,7 +272,7 @@ async function main () { } ] - // 6. Running tool calling queries + // 5. Running tool calling queries const queries = [ { name: 'Query 1: Complex tool calling with multiple parameters', prompt: toolQuery1 }, { name: 'Query 2: Math calculation and ambiguous query', prompt: toolQuery2 }, @@ -296,9 +292,8 @@ async function main () { console.error('Error occurred:', errorMessage) console.error('Error details:', error) } finally { - // 7. Cleaning up resources + // 6. 
Cleaning up resources await model.unload() - await fsDL.close() } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/index.d.ts b/packages/qvac-lib-infer-llamacpp-llm/index.d.ts index a9b2b9506f..07e07341c0 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/index.d.ts +++ b/packages/qvac-lib-infer-llamacpp-llm/index.d.ts @@ -1,26 +1,18 @@ -import BaseInference, { - ReportProgressCallback -} from '@qvac/infer-base/WeightsProvider/BaseInference' import type { QvacResponse } from '@qvac/infer-base' import type QvacLogger from '@qvac/logging' export type NumericLike = number | `${number}` -export interface Loader { - ready(): Promise - close(): Promise - getStream(path: string): Promise> - download( - path: string, - opts: { diskPath: string; progressReporter?: unknown } - ): Promise<{ await(): Promise }> - getFileSize?(path: string): Promise -} - export interface AddonMessage { type: 'text' input: string prefill?: boolean + /** + * Per-call sampling overrides forwarded by `LlmLlamacpp.run()` from + * `RunOptions.generationParams`. Carried on the `text` message and consumed + * by the native binding so each `runJob` can use a different temp / top_p / + * seed / etc. without re-loading the model. 
+ */ generationParams?: GenerationParams cacheKey?: string saveCacheToDisk?: boolean @@ -31,7 +23,6 @@ export interface AddonMediaMessage { } export type AddonRunJobMessage = AddonMessage | AddonMediaMessage - export interface Addon { loadWeights(data: { filename: string; chunk: Uint8Array | null; completed: boolean }, logger?: QvacLogger): Promise activate(): Promise @@ -69,14 +60,10 @@ export interface LlamaConfig { } export interface LlmLlamacppArgs { - loader: Loader + files: { model: string[]; projectionModel?: string } + config: LlamaConfig logger?: QvacLogger | Console | null opts?: { stats?: boolean } - diskPath?: string - modelName: string - projectionModel?: string - modelPath?: string - modelConfig?: Record } export interface UserTextMessage { @@ -89,7 +76,12 @@ export interface UserTextMessage { export interface UserMediaMessage { role: 'user' type: 'media' - content: Uint8Array + /** + * Either the raw bytes of an image/audio/video file (`Uint8Array`) or an + * absolute path to a file on disk (`string`). Path-mode is handled by the + * C++ layer via `loadMedia()`; byte-mode takes the `parseMedia` path. 
+ */ + content: Uint8Array | string } export interface ChatFunctionDefinition { @@ -122,10 +114,6 @@ export interface RunOptions { saveCacheToDisk?: boolean } -export interface DownloadWeightsOptions { - closeLoader?: boolean -} - export interface RuntimeStats { TTFT: number TPS: number @@ -136,12 +124,6 @@ export interface RuntimeStats { backendDevice: 'cpu' | 'gpu' } -export interface DownloadResult { - filePath: string | null - error: boolean - completed: boolean -} - export interface FinetuneValidationNone { type: 'none' } @@ -254,43 +236,24 @@ export interface FinetuneResult { stats?: FinetuneStats } -export default class LlmLlamacpp extends BaseInference { - protected addon: Addon - - constructor( - args: LlmLlamacppArgs, - config: LlamaConfig - ) - _load( - closeLoader?: boolean, - onDownloadProgress?: ReportProgressCallback | ((bytes: number) => void) - ): Promise +export default class LlmLlamacpp { + protected addon: Addon | null + opts: { stats?: boolean } + logger: QvacLogger + state: { configLoaded: boolean } - load( - closeLoader?: boolean, - onDownloadProgress?: ReportProgressCallback | ((bytes: number) => void) - ): Promise - - downloadWeights( - onDownloadProgress?: (progress: Record, opts: DownloadWeightsOptions) => any, - opts?: DownloadWeightsOptions - ): Promise> - - _downloadWeights( - onDownloadProgress?: (progress: Record, opts: DownloadWeightsOptions) => any, - opts?: DownloadWeightsOptions - ): Promise> - - _runInternal(prompt: Message[], runOptions?: RunOptions): Promise + constructor(args: LlmLlamacppArgs) + load(): Promise run(prompt: Message[], runOptions?: RunOptions): Promise - finetune(finetuningOptions: FinetuneOptions): Promise - cancel(): Promise - + pause(): Promise unload(): Promise - + getState(): { configLoaded: boolean } } -export { ReportProgressCallback, QvacResponse, FinetuneHandle, FinetuneProgressStats, FinetuneOptions, FinetuneValidation } +export { QvacResponse, FinetuneHandle, FinetuneProgressStats, FinetuneOptions, 
FinetuneValidation } + +/** Returns the first shard (matching `-NNNNN-of-MMMMM.gguf`) or the sole entry for single-file models. */ +export function pickPrimaryGgufPath(files: string[]): string diff --git a/packages/qvac-lib-infer-llamacpp-llm/index.js b/packages/qvac-lib-infer-llamacpp-llm/index.js index 9d4e7c1433..edecebfd38 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/index.js +++ b/packages/qvac-lib-infer-llamacpp-llm/index.js @@ -2,12 +2,9 @@ const fs = require('bare-fs') const path = require('bare-path') - -const BaseInference = require('@qvac/infer-base/WeightsProvider/BaseInference') -const WeightsProvider = require('@qvac/infer-base/WeightsProvider/WeightsProvider') -const { LlamaInterface } = require('./addon') - -const noop = () => { } +const QvacLogger = require('@qvac/logging') +const { createJobHandler, exclusiveRunQueue } = require('@qvac/infer-base') +const { LlamaInterface, mapAddonEvent } = require('./addon') const RUN_BUSY_ERROR_MESSAGE = 'Cannot set new job: a job is already set or being processed' @@ -99,187 +96,272 @@ function normalizeFinetuneParams (opts) { } /** - * GGML client implementation for Llama LLM model + * Returns the first shard (matching `-NNNNN-of-MMMMM.gguf`) or the sole + * entry for single-file models. Matches the C++ shard-expansion contract + * in `GGUFShards::expandGGUFIntoShards`. + * + * @param {string[]} files - ordered array of absolute paths + * @returns {string} */ -class LlmLlamacpp extends BaseInference { - /** - * Creates an instance of LlmLlamacpp. - * @constructor - * @param {Object} args - Setup parameters including loader, logger, disk path, and model name - * @param {Loader} args.loader - External loader instance - * @param {Logger} [args.logger] - Optional structured logger - * @param {Object} [args.opts] - Optional inference options - * @param {string} args.diskPath - Disk directory where model files are stored - * @param {string} args.modelName - Name of the model directory or file. 
The usage of a sharded - * filename (e.g. "llama-00001-of-00004.gguf") will trigger asynchronous loading of the weights for - * all remaining files. - * @param {string} args.projectionModel - Name of the projection model directory or file - * @param {Object} config - Model-specific configuration settings - */ - constructor ( - { opts = {}, loader, logger = null, diskPath = '.', modelName, projectionModel }, - config - ) { - super({ logger, opts }) +function pickPrimaryGgufPath (files) { + const SHARD_REGEX = /-\d+-of-\d+\.gguf$/ + return files.find((p) => SHARD_REGEX.test(p)) || files[0] +} + +/** LLM client wrapping the native LlamaInterface for inference, finetuning, and pause/resume. */ +class LlmLlamacpp { + constructor ({ files, config, logger = null, opts = {} }) { + if (!files || !Array.isArray(files.model) || files.model.length === 0) { + throw new TypeError('files.model must be a non-empty array of absolute paths') + } + for (const [i, entry] of files.model.entries()) { + if (typeof entry !== 'string' || entry.length === 0) { + throw new TypeError(`files.model[${i}] must be an absolute path string`) + } + if (!path.isAbsolute(entry)) { + throw new TypeError(`files.model[${i}] must be an absolute path (got: ${entry})`) + } + } + if (files.projectionModel !== undefined) { + if (typeof files.projectionModel !== 'string' || files.projectionModel.length === 0) { + throw new TypeError('files.projectionModel must be an absolute path string') + } + if (!path.isAbsolute(files.projectionModel)) { + throw new TypeError(`files.projectionModel must be an absolute path (got: ${files.projectionModel})`) + } + } + this._files = files.model + this._projectionModelPath = files.projectionModel || '' this._config = config - this._diskPath = diskPath - this._modelName = modelName - this._projectionModel = projectionModel - this._shards = WeightsProvider.expandGGUFIntoShards(this._modelName) - this.weightsProvider = new WeightsProvider(loader, this.logger) + this.logger = new 
QvacLogger(logger) + this.opts = opts + // Lazy deref + optional chain: safe before `_load()` and after `unload()`. + this._job = createJobHandler({ cancel: () => this.addon?.cancel() }) + this._run = exclusiveRunQueue() + this.addon = null this._checkpointSaveDir = null this._hasActiveResponse = false - this._skipNextRuntimeStats = false - this._originalLogger = this.logger - this._baseOutputCallback = this._outputCallback.bind(this) + // Carried across mapAddonEvent calls to drop the post-finetune TPS trailer. + this._addonEventState = { skipNextRuntimeStats: false } + this.state = { configLoaded: false } } - /** - * Load model weights, initialize the native addon, and activate the model. - * @param {boolean} [closeLoader=true] - Whether to close the loader when complete - * @param {ProgressReportCallback} [onDownloadProgress] - Optional byte-level progress callback - * @returns {Promise} - */ - async _load (closeLoader = true, onDownloadProgress = noop) { - this.logger.info('Starting model load') + async load () { + return this._run(async () => { + if (this.state.configLoaded) return + await this._load() + this.state.configLoaded = true + }) + } - try { - const configForLoad = { ...this._config } + async _load () { + this.logger.info('Starting model load') + const primaryGgufPath = pickPrimaryGgufPath(this._files) + const configurationParams = { + path: primaryGgufPath, + projectionPath: this._projectionModelPath, + config: { ...this._config } + } - const configurationParams = { - path: path.join(this._diskPath, this._modelName), - projectionPath: this._projectionModel ? 
path.join(this._diskPath, this._projectionModel) : '', - config: configForLoad - } + this.logger.info('Creating addon with configuration:', configurationParams) - this.logger.info('Creating addon with configuration:', configurationParams) + try { this.addon = this._createAddon(configurationParams) - - if (this._shards !== null) { - await this._loadWeights(onDownloadProgress) - } else { - await this.downloadWeights(onDownloadProgress, { closeLoader }) + if (this._files.length > 1) { + await this._streamShards() } - this.logger.info('Activating addon') await this.addon.activate() + } catch (loadError) { + this.logger.error('Error during model load:', loadError) + // Best-effort cleanup of the partially-initialized addon so a subsequent + // load() does not leak a zombie native instance. + try { await this.addon?.unload?.() } catch (_) {} + this.addon = null + throw loadError + } + this.logger.info('Model load completed successfully') + } - this.logger.info('Model load completed successfully') - } catch (error) { - this.logger.error('Error during model load:', error) - throw error + async _streamShards () { + for (const filePath of this._files) { + const filename = path.basename(filePath) + const stream = fs.createReadStream(filePath) + for await (const chunk of stream) { + await this.addon.loadWeights({ filename, chunk, completed: false }) + } + await this.addon.loadWeights({ filename, chunk: null, completed: true }) + this.logger.info(`Streamed weights for ${filename}`) } } /** - * Download the model weight files and return the local path to the primary file. - * @param {ProgressReportCallback} [onDownloadProgress] - Callback invoked with bytes downloaded - * @returns {Promise<{filePath: string, completed: boolean, error: boolean}[]>} Local file path for the model weights + * Public API entrypoint for inference. 
+ * @param {Message[]} prompt - Input prompt array of messages + * @param {RunOptions} [runOptions] - Optional run settings (prefill, generationParams, cacheKey, saveCacheToDisk) + * @returns {Promise} */ - async _downloadWeights (onDownloadProgress, opts) { - return await this.weightsProvider.downloadFiles( - this._projectionModel ? [this._modelName, this._projectionModel] : [this._modelName], - this._diskPath, - { - closeLoader: opts.closeLoader, - onDownloadProgress - } - ) + async run (prompt, runOptions = {}) { + return this._run(() => this._runInternal(prompt, runOptions)) } - async _loadWeights (reportProgressCallback) { - const onChunk = async (chunkedWeightsData) => { - this.addon.loadWeights(chunkedWeightsData, this.logger) + async _runInternal (prompt, runOptions = {}) { + if (!this.addon) { + throw new Error('Addon not initialized. Call load() first.') + } + if (this._hasActiveResponse) { + throw new Error(RUN_BUSY_ERROR_MESSAGE) } - await this.weightsProvider.streamFiles(this._shards, onChunk, reportProgressCallback) - } - _isSuppressedNoResponseLog (args) { - const message = args.map(arg => { - if (typeof arg === 'string') return arg - if (arg && typeof arg === 'object') { - if (arg.message && typeof arg.message === 'string') return arg.message - return JSON.stringify(arg) - } - return String(arg) - }).join(' ') - return message && message.includes('No response found for job') - } + if (!Array.isArray(prompt)) { + throw new TypeError('Prompt input must be Message[]') + } + const { prefill, generationParams, cacheKey, saveCacheToDisk } = normalizeRunOptions(runOptions) - _createFilteredLogger (sourceLogger) { - const filteredLogger = sourceLogger ? Object.create(Object.getPrototypeOf(sourceLogger)) : {} - Object.assign(filteredLogger, sourceLogger) + this.logger.info('Starting inference with prompt:', prompt) - const originalInfo = sourceLogger && typeof sourceLogger.info === 'function' - ? 
sourceLogger.info.bind(sourceLogger) - : null - const originalWarn = sourceLogger && typeof sourceLogger.warn === 'function' - ? sourceLogger.warn.bind(sourceLogger) - : null + // Separate media messages from text messages + const textMessages = [] + const mediaItems = [] - filteredLogger.info = (...args) => { - if (this._isSuppressedNoResponseLog(args)) return - if (originalInfo) return originalInfo.apply(sourceLogger, args) + for (const message of prompt) { + if (message.role === 'user' && + message.type === 'media' && + message.content instanceof Uint8Array) { + mediaItems.push(message.content) + textMessages.push({ ...message, content: '' }) + } else { + textMessages.push(message) + } + } + + const promptMessages = [] + + // Send media first (in order), then the stringified text messages. + for (const mediaData of mediaItems) { + promptMessages.push({ type: 'media', content: mediaData }) } - filteredLogger.warn = (...args) => { - if (this._isSuppressedNoResponseLog(args)) return - if (originalWarn) return originalWarn.apply(sourceLogger, args) + promptMessages.push({ + type: 'text', + input: JSON.stringify(textMessages), + prefill, + generationParams, + cacheKey, + saveCacheToDisk + }) + + const response = this._job.start() + + let accepted + try { + accepted = await this.addon.runJob(promptMessages) + } catch (error) { + this._job.fail(error) + throw error + } + if (!accepted) { + this._job.fail(new Error(RUN_BUSY_ERROR_MESSAGE)) + throw new Error(RUN_BUSY_ERROR_MESSAGE) } - return filteredLogger + this._hasActiveResponse = true + const finalized = response.await().finally(() => { this._hasActiveResponse = false }) + finalized.catch((err) => { + this.logger?.warn?.('Inference response rejected:', err?.message || err) + }) + response.await = () => finalized + + this.logger.info('Inference job started successfully') + return response } - _handleAddonOutputEvent (originalOutputCb, originalLoggerRef, instance, eventType, jobId, data, extra) { - if (eventType === 
'JobEnded' || eventType === 'Error') { - this._hasActiveResponse = false + async finetune (finetuningOptions = undefined) { + if (!finetuningOptions) { + throw new Error('Finetuning parameters are required.') } + const paramsToSend = normalizeFinetuneParams(finetuningOptions) + this.logger.info('finetune() called') + this.logger.info('Finetuning parameters:', finetuningOptions) + return this._run(async () => { + if (!this.addon) { + throw new Error('Addon not initialized. Call load() first.') + } + if (this._hasActiveResponse) { + throw new Error(RUN_BUSY_ERROR_MESSAGE) + } + if (finetuningOptions.checkpointSaveDir) { + this._checkpointSaveDir = finetuningOptions.checkpointSaveDir + } + + const response = this._job.start() + let accepted + try { + accepted = await this.addon.finetune(paramsToSend) + } catch (err) { + this._job.fail(err) + throw err + } + + if (!accepted) { + this._job.fail(new Error(RUN_BUSY_ERROR_MESSAGE)) + throw new Error(RUN_BUSY_ERROR_MESSAGE) + } + + this._hasActiveResponse = true + const finalized = response.await().finally(() => { this._hasActiveResponse = false }) + finalized.catch((err) => { + this.logger?.warn?.('Finetune response rejected:', err?.message || err) + }) + response.await = () => finalized + return response + }) + } + + _handleAddonOutputEvent (eventType, data, error) { if (eventType === 'LogMsg') { const logMsg = typeof data === 'string' ? 
data : (data?.message || JSON.stringify(data)) - originalLoggerRef?.info?.(logMsg) + this.logger?.info?.(logMsg) return } - if (originalOutputCb) { - return originalOutputCb(instance, eventType, jobId, data, extra) + if (eventType === 'Error') { + this.logger.error('Job failed with error:', error) + this._job.fail(error) + } else if (eventType === 'Output') { + this._job.output(data) + } else if (eventType === 'FinetuneProgress') { + if (this.opts.stats && data && data.stats) { + this._job.active?.updateStats(data.stats) + } + } else if (eventType === 'JobEnded') { + this.logger.info('Job completed') + const isFinetuneTerminal = data && typeof data === 'object' && data.op === 'finetune' && typeof data.status === 'string' + if (isFinetuneTerminal) { + this._job.end(null, data) + } else { + this._job.end(this.opts.stats ? data : null) + } } } - /** - * Public API entrypoint for inference. - * @param {Message[]} prompt - Input prompt array of messages - * @param {{prefill?: boolean}} [runOptions] - Optional run settings - * @returns {Promise} - */ - async run (prompt, runOptions = {}) { - return await this._runInternal(prompt, runOptions) + _addonOutputCallback (addon, event, data, error) { + const mapped = mapAddonEvent(event, data, error, this._addonEventState) + if (mapped === null) return + this._handleAddonOutputEvent(mapped.type, mapped.data, mapped.error) } /** * Instantiate the native addon with the given parameters. 
* @param {Object} configurationParams - Configuration parameters for the addon - * @param {string} configurationParams.path - Local file or directory path - * @param {Object} configurationParams.settings - LLM-specific settings + * @param {string} configurationParams.path - Absolute path to the primary model file (first shard for sharded models) + * @param {string} configurationParams.projectionPath - Absolute path to the multimodal projection model, or '' when not provided + * @param {Object} configurationParams.config - LLM-specific settings * @returns {Addon} The instantiated addon interface */ _createAddon (configurationParams) { const binding = require('./binding') - - this.logger = this._createFilteredLogger(this._originalLogger) - - this._outputCallback = (instance, eventType, jobId, data, extra) => { - return this._handleAddonOutputEvent( - this._baseOutputCallback, - this._originalLogger, - instance, - eventType, - jobId, - data, - extra - ) - } - return new LlamaInterface( binding, configurationParams, @@ -287,50 +369,9 @@ class LlmLlamacpp extends BaseInference { ) } - _addonOutputCallback (addon, event, data, error) { - if (typeof data === 'object' && data !== null && 'TPS' in data) { - if (this._skipNextRuntimeStats) { - this._skipNextRuntimeStats = false - return - } - const runtimeStats = { ...data } - if (runtimeStats.backendDevice === 0) { - runtimeStats.backendDevice = 'cpu' - } else if (runtimeStats.backendDevice === 1) { - runtimeStats.backendDevice = 'gpu' - } - return this._outputCallback(addon, 'JobEnded', 'OnlyOneJob', runtimeStats, null) - } - if ( - typeof data === 'object' && - data !== null && - data.op === 'finetune' && - typeof data.status === 'string' - ) { - this._skipNextRuntimeStats = true - return this._outputCallback(addon, 'JobEnded', 'OnlyOneJob', data, null) - } - if ( - typeof data === 'object' && - data !== null && - data.type === 'finetune_progress' - ) { - return this._outputCallback(addon, 'FinetuneProgress', 
'OnlyOneJob', data, null) - } - - let mappedEvent = event - if (event.includes('Error')) { - mappedEvent = 'Error' - } else if (typeof data === 'string') { - mappedEvent = 'Output' - } - - return this._outputCallback(addon, mappedEvent, 'OnlyOneJob', data, error) - } - /** * Pause finetuning, saving a checkpoint so training can resume later. - * cancel inference job if it is running + * Also cancels any inference job in flight. */ async pause () { if (this.addon?.cancel) { @@ -340,8 +381,8 @@ class LlmLlamacpp extends BaseInference { /** * Cancel finetuning and remove the pause checkpoint so the next - * finetune() call starts fresh instead of resuming. - * cancel inference job if it is running + * `finetune()` call starts fresh instead of resuming. Also cancels + * any inference job in flight. */ async cancel () { if (this.addon?.cancel) { @@ -366,160 +407,32 @@ class LlmLlamacpp extends BaseInference { } /** - * Unload model safely by cancelling and clearing pending jobs. + * Unload the model safely by cancelling the in-flight job and releasing + * native resources. Subsequent calls to `run()` / `finetune()` / `cancel()` + * are safe; they hit the `!this.addon` guard and throw or no-op. * @returns {Promise} */ async unload () { - return await this._withExclusiveRun(async () => { + return this._run(async () => { try { await this.pause() } catch (_) {} - const currentJobResponse = this._jobToResponse.get('OnlyOneJob') - if (currentJobResponse) { - currentJobResponse.failed(new Error('Model was unloaded')) - this._deleteJobMapping('OnlyOneJob') + if (this._job.active) { + this._job.fail(new Error('Model was unloaded')) } this._hasActiveResponse = false - await super.unload() - }) - } - - /** - * Internal method to start inference with a text prompt. 
- * @param {Message[]} prompt - Input prompt array of messages - * @param {{prefill?: boolean}} [runOptions] - Optional run settings - * @returns {Promise} A QvacResponse representing the inference job - */ - async _runInternal (prompt, runOptions = {}) { - return this._withExclusiveRun(async () => { - if (this._hasActiveResponse) { - throw new Error(RUN_BUSY_ERROR_MESSAGE) - } - - if (!Array.isArray(prompt)) { - throw new TypeError('Prompt input must be Message[]') + if (this.addon) { + await this.addon.unload() + // Null the addon reference so post-unload `cancel()` / `run()` calls hit the + // `if (!this.addon)` guard instead of dereferencing a disposed native handle. + this.addon = null } - const { prefill, generationParams, cacheKey, saveCacheToDisk } = normalizeRunOptions(runOptions) - - this.logger.info('Starting inference with prompt:', prompt) - - // Separate media messages from text messages - const textMessages = [] - const mediaItems = [] - - for (const message of prompt) { - if (message.role === 'user' && - message.type === 'media' && - message.content instanceof Uint8Array) { - mediaItems.push(message.content) - // Keep the message as a placeholder marker (with empty content) for tokenization - textMessages.push({ ...message, content: '' }) - } else { - textMessages.push(message) - } - } - - const promptMessages = [] - - // Send media first (in order) if present - for (const mediaData of mediaItems) { - promptMessages.push({ type: 'media', content: mediaData }) - } - - // Send text messages - promptMessages.push({ - type: 'text', - input: JSON.stringify(textMessages), - prefill, - generationParams, - cacheKey, - saveCacheToDisk - }) - - const response = this._createResponse('OnlyOneJob') - - // addon-cpp C++ guarantees no events will be generated - // until job is fully accepted. This means even if trying - // to queue a job fails right now as not accepted, - // it will not generate events. - // - // If any unexpected exception is thrown (e.g. 
in the C++ code) - // it will unwind here and the job will not be accepted. - let accepted - try { - accepted = await this.addon.runJob(promptMessages) - } catch (error) { - this._deleteJobMapping('OnlyOneJob') - response.failed(error) - throw error - } - if (!accepted) { - this._deleteJobMapping('OnlyOneJob') - const msg = RUN_BUSY_ERROR_MESSAGE - response.failed(new Error(msg)) - throw new Error(msg) - } - - this._hasActiveResponse = true - const finalized = response.await().finally(() => { this._hasActiveResponse = false }) - finalized.catch(() => {}) - response.await = () => finalized - - this.logger.info('Inference job started successfully') - - return response + this.state.configLoaded = false }) } - async finetune (finetuningOptions = undefined) { - if (!this.addon) { - throw new Error( - 'Addon not initialized. Call load() first.' - ) - } - - if (!finetuningOptions) { - throw new Error( - 'Finetuning parameters are required.' - ) - } - if (finetuningOptions.checkpointSaveDir) { - this._checkpointSaveDir = finetuningOptions.checkpointSaveDir - } - const paramsToSend = normalizeFinetuneParams(finetuningOptions) - this.logger?.info?.('finetune() called') - this.logger?.info?.('Finetuning parameters:', finetuningOptions) - - return this._withExclusiveRun(async () => { - if (this._hasActiveResponse) { - throw new Error(RUN_BUSY_ERROR_MESSAGE) - } - - const response = this._createResponse('OnlyOneJob') - let accepted - try { - accepted = await this.addon.finetune(paramsToSend) - } catch (err) { - this._deleteJobMapping('OnlyOneJob') - response.failed(err) - throw err - } - - if (!accepted) { - this._deleteJobMapping('OnlyOneJob') - const msg = RUN_BUSY_ERROR_MESSAGE - response.failed(new Error(msg)) - throw new Error(msg) - } - - this._hasActiveResponse = true - const finalized = response.await().finally(() => { this._hasActiveResponse = false }) - finalized.catch(() => {}) - response.await = () => finalized - - return response - }) - } + getState () { return 
this.state } } module.exports = LlmLlamacpp +module.exports.pickPrimaryGgufPath = pickPrimaryGgufPath diff --git a/packages/qvac-lib-infer-llamacpp-llm/package.json b/packages/qvac-lib-infer-llamacpp-llm/package.json index d7deef159f..7ca55195f2 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/package.json +++ b/packages/qvac-lib-infer-llamacpp-llm/package.json @@ -1,6 +1,6 @@ { "name": "@qvac/llm-llamacpp", - "version": "0.15.0", + "version": "0.16.0", "description": "llama addon for qvac", "addon": true, "scripts": { @@ -12,12 +12,13 @@ "lint-cpp": "clang-tidy -p build $(find addon -name '*.cpp')", "quickstart": "node -e \"const fs=require('fs');const path=require('path');const {execSync}=require('child_process');const prebuilds=path.join(process.cwd(),'prebuilds');const source=path.join(process.cwd(),'node_modules','@qvac','llm-llamacpp','prebuilds');if(!fs.existsSync(prebuilds)){if(!fs.existsSync(source)){execSync('npm install @qvac/llm-llamacpp@latest',{stdio:'inherit'});}if(!fs.existsSync(source)){throw new Error('Prebuilds not found after install.');}fs.cpSync(source,prebuilds,{recursive:true});}execSync('bare examples/quickstart.js',{stdio:'inherit'});\"", "update:quickstart-section": "node ./scripts/quickstart-testing/update_readme.js", - "test": "npm run test:integration", "test:integration": "npm run test:integration:generate && bare test/integration/all.js --exit", "test:mobile:generate": "bare ./scripts/generate-mobile-integration-tests.js", "test:mobile:validate": "node scripts/validate-mobile-tests.js", "test:dts": "tsc -p tsconfig.dts.json", "test:integration:generate": "brittle -r test/integration/all.js test/integration/*.test.js && npm run test:mobile:generate", + "test:unit:generate": "brittle -r test/unit/all.js test/unit/*.test.js", + "test:unit": "npm run test:unit:generate && bare test/unit/all.js --exit", "test:cpp:build": "bare-make generate -D BUILD_TESTING=ON && bare-make build --target addon-test", "test:cpp:run": "cd build/test/unit/ 
&& ./addon-test --gtest_output=xml:cpp-test-results.xml", "test:cpp": "npm run test:cpp:build && npm run test:cpp:run", @@ -26,7 +27,7 @@ "coverage:cpp:summary": "cd build/test/unit && llvm-cov-19 report ./addon-test --instr-profile=coverage.profdata -ignore-filename-regex='(tests|build|node_modules|gtest|gmock|\\.vcpkg|/usr)/' > coverage-summary.txt", "coverage:cpp:report": "cd build/test/unit/ && ls -lha && llvm-profdata-19 merge -sparse default.profraw -o coverage.profdata && llvm-cov-19 show ./addon-test -instr-profile=coverage.profdata -format=html -output-dir=coverage-html -ignore-filename-regex='(tests|build|node_modules|gtest|gmock|\\.vcpkg|/usr)/' && llvm-cov-19 export ./addon-test -instr-profile=coverage.profdata -format=lcov -ignore-filename-regex='(tests|build|node_modules|gtest|gmock|\\.vcpkg|/usr)/' > lcov.info && npm run coverage:cpp:summary", "coverage:cpp": "npm run coverage:cpp:build && npm run coverage:cpp:run && npm run coverage:cpp:report", - "test:all": "npm run test && npm run test:cpp", + "test:all": "npm run test:unit && npm run test:integration && npm run test:cpp", "benchmarks": "./benchmarks/run-benchmarks.sh", "benchmarks:windows": "powershell -ExecutionPolicy Bypass -File ./benchmarks/run-benchmarks.ps1" }, @@ -57,9 +58,6 @@ "bugs": "https://github.com/tetherto/qvac/issues", "homepage": "https://github.com/tetherto/qvac/tree/main/packages/qvac-lib-infer-llamacpp-llm#readme", "devDependencies": { - "@qvac/dl-base": "^0.1.0", - "@qvac/dl-filesystem": "^0.1.2", - "@qvac/logging": "^0.1.0", "@types/node": "^24.2.1", "bare-url": "^2.1.6", "brittle": "^3.16.5", @@ -73,7 +71,8 @@ "util": "npm:bare-utils@^1.5.1" }, "dependencies": { - "@qvac/infer-base": "^0.3.0", + "@qvac/infer-base": "^0.4.0", + "@qvac/logging": "^0.1.0", "bare-fs": "^4.5.1", "bare-path": "^3.0.0", "bare-process": "^4.2.2" diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/afriquegemma-edge-cases.test.js 
b/packages/qvac-lib-infer-llamacpp-llm/test/integration/afriquegemma-edge-cases.test.js index e447a0126d..b5c47caff7 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/afriquegemma-edge-cases.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/afriquegemma-edge-cases.test.js @@ -1,7 +1,6 @@ 'use strict' const test = require('brittle') -const FilesystemDL = require('@qvac/dl-filesystem') const LlmLlamacpp = require('../../index.js') const os = require('bare-os') const fs = require('bare-fs') @@ -71,14 +70,12 @@ const TIMEOUT = 1_800_000 // --------------------------------------------------------------------------- test('AfriqueGemma: empty and whitespace input must not crash', { timeout: TIMEOUT }, async t => { const [modelName, dirPath] = await resolveModel() - const loader = new FilesystemDL({ dirPath }) const addon = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [path.join(dirPath, modelName)] }, + config: AFRIQUEGEMMA_CONFIG, logger: console, opts: { stats: true } - }, AFRIQUEGEMMA_CONFIG) + }) try { await addon.load() const emptyPrompt = 'Translate English to Swahili.\nEnglish: \nSwahili:' @@ -91,7 +88,6 @@ test('AfriqueGemma: empty and whitespace input must not crash', { timeout: TIMEO t.pass('whitespace-style prompt did not crash') } finally { await addon.unload().catch(() => {}) - await loader.close().catch(() => {}) } }) @@ -103,34 +99,28 @@ test('AfriqueGemma: empty and whitespace input must not crash', { timeout: TIMEO // --------------------------------------------------------------------------- test('AfriqueGemma: lifecycle load-unload-fresh-load-use', { timeout: TIMEOUT }, async t => { const [modelName, dirPath] = await resolveModel() - const loader1 = new FilesystemDL({ dirPath }) const addon1 = new LlmLlamacpp({ - loader: loader1, - modelName, - diskPath: dirPath, + files: { model: [path.join(dirPath, modelName)] }, + config: AFRIQUEGEMMA_CONFIG, logger: console, opts: { stats: true } 
- }, AFRIQUEGEMMA_CONFIG) + }) try { await addon1.load() const r1 = await addon1.run([{ role: 'user', content: EN_SW_PROMPT }]) const out1 = await collectTranslation(r1) t.ok(out1.length > 0, 'first run produced output') await addon1.unload() - await loader1.close() } catch (err) { await addon1.unload().catch(() => {}) - await loader1.close().catch(() => {}) throw err } - const loader2 = new FilesystemDL({ dirPath }) const addon2 = new LlmLlamacpp({ - loader: loader2, - modelName, - diskPath: dirPath, + files: { model: [path.join(dirPath, modelName)] }, + config: AFRIQUEGEMMA_CONFIG, logger: console, opts: { stats: true } - }, AFRIQUEGEMMA_CONFIG) + }) try { await addon2.load() const r2 = await addon2.run([{ role: 'user', content: EN_SW_PROMPT }]) @@ -138,7 +128,6 @@ test('AfriqueGemma: lifecycle load-unload-fresh-load-use', { timeout: TIMEOUT }, t.ok(out2.length > 0, 'second run after fresh load produced output') } finally { await addon2.unload().catch(() => {}) - await loader2.close().catch(() => {}) } }) @@ -149,14 +138,12 @@ test('AfriqueGemma: lifecycle load-unload-fresh-load-use', { timeout: TIMEOUT }, // --------------------------------------------------------------------------- test('AfriqueGemma: cancel mid-translation, model reusable after', { timeout: TIMEOUT }, async t => { const [modelName, dirPath] = await resolveModel() - const loader = new FilesystemDL({ dirPath }) const addon = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [path.join(dirPath, modelName)] }, + config: { ...AFRIQUEGEMMA_CONFIG, n_predict: '512' }, logger: console, opts: { stats: true } - }, { ...AFRIQUEGEMMA_CONFIG, n_predict: '512' }) + }) try { await addon.load() const longPrompt = 'Translate English to Swahili.\nEnglish: The children are playing in the park. Their mother watches from the bench. The sun is shining brightly today. 
Many families enjoy this beautiful place.\nSwahili:' @@ -183,7 +170,6 @@ test('AfriqueGemma: cancel mid-translation, model reusable after', { timeout: TI t.ok(out2.length > 0, 'model produced output after cancel') } finally { await addon.unload().catch(() => {}) - await loader.close().catch(() => {}) } }) @@ -195,18 +181,16 @@ test('AfriqueGemma: cancel mid-translation, model reusable after', { timeout: TI // --------------------------------------------------------------------------- test('AfriqueGemma: tools true required for load', { timeout: TIMEOUT }, async t => { const [modelName, dirPath] = await resolveModel() - const loader = new FilesystemDL({ dirPath }) const configWithoutTools = { ...AFRIQUEGEMMA_CONFIG, tools: undefined } delete configWithoutTools.tools const addon = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [path.join(dirPath, modelName)] }, + config: configWithoutTools, logger: console - }, configWithoutTools) + }) try { await addon.load() t.pass('load without tools succeeded (addon defaults to jinja)') @@ -217,8 +201,6 @@ test('AfriqueGemma: tools true required for load', { timeout: TIMEOUT }, async t /template|jinja|tools|not supported|custom/.test(msg), 'load without tools fails with clear message about template/jinja' ) - } finally { - await loader.close().catch(() => {}) } }) @@ -232,14 +214,12 @@ test('AfriqueGemma: tools true required for load', { timeout: TIMEOUT }, async t // --------------------------------------------------------------------------- test('AfriqueGemma: run after unload rejects cleanly', { timeout: TIMEOUT }, async t => { const [modelName, dirPath] = await resolveModel() - const loader = new FilesystemDL({ dirPath }) const addon = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [path.join(dirPath, modelName)] }, + config: AFRIQUEGEMMA_CONFIG, logger: console, opts: { stats: true } - }, AFRIQUEGEMMA_CONFIG) + }) await addon.load() const r1 = await 
addon.run([{ role: 'user', content: EN_SW_PROMPT }]) @@ -275,7 +255,6 @@ test('AfriqueGemma: run after unload rejects cleanly', { timeout: TIMEOUT }, asy t.comment('Expected: synchronous throw or a response that resolves to an error') } t.ok(rejected || hadUnhandled, 'run() after unload() does not silently succeed') - await loader.close().catch(() => {}) }) // --------------------------------------------------------------------------- @@ -288,14 +267,12 @@ test('AfriqueGemma: run after unload rejects cleanly', { timeout: TIMEOUT }, asy // --------------------------------------------------------------------------- test('AfriqueGemma: small n_predict produces truncated but valid output', { timeout: TIMEOUT }, async t => { const [modelName, dirPath] = await resolveModel() - const loader = new FilesystemDL({ dirPath }) const addon = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [path.join(dirPath, modelName)] }, + config: { ...AFRIQUEGEMMA_CONFIG, n_predict: '8' }, logger: console, opts: { stats: true } - }, { ...AFRIQUEGEMMA_CONFIG, n_predict: '8' }) + }) try { await addon.load() @@ -313,7 +290,6 @@ test('AfriqueGemma: small n_predict produces truncated but valid output', { time } } finally { await addon.unload().catch(() => {}) - await loader.close().catch(() => {}) } }) @@ -328,14 +304,12 @@ test('AfriqueGemma: small n_predict produces truncated but valid output', { time // --------------------------------------------------------------------------- test('AfriqueGemma: long input approaching ctx_size boundary', { timeout: TIMEOUT }, async t => { const [modelName, dirPath] = await resolveModel() - const loader = new FilesystemDL({ dirPath }) const addon = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [path.join(dirPath, modelName)] }, + config: { ...AFRIQUEGEMMA_CONFIG, ctx_size: '512', n_predict: '32' }, logger: console, opts: { stats: true } - }, { ...AFRIQUEGEMMA_CONFIG, ctx_size: '512', n_predict: 
'32' }) + }) try { await addon.load() @@ -359,6 +333,5 @@ test('AfriqueGemma: long input approaching ctx_size boundary', { timeout: TIMEOU t.ok(gotOutput || gotError, 'long input either produced output or a clear error — no crash or hang') } finally { await addon.unload().catch(() => {}) - await loader.close().catch(() => {}) } }) diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/afriquegemma-translation.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/afriquegemma-translation.test.js index cd42207918..d225db60ff 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/afriquegemma-translation.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/afriquegemma-translation.test.js @@ -1,7 +1,6 @@ 'use strict' const test = require('brittle') -const FilesystemDL = require('@qvac/dl-filesystem') const LlmLlamacpp = require('../../index.js') const { ensureModel } = require('./utils') const os = require('bare-os') @@ -133,16 +132,14 @@ async function resolveModel () { } async function createAddon (dirPath, modelName, configOverrides = {}) { - const loader = new FilesystemDL({ dirPath }) const config = { ...AFRIQUEGEMMA_CONFIG, ...configOverrides } const addon = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [path.join(dirPath, modelName)] }, + config, logger: console, opts: { stats: true } - }, config) - return { addon, loader } + }) + return { addon } } const TIMEOUT = 1_800_000 @@ -158,7 +155,7 @@ const TIMEOUT = 1_800_000 // --------------------------------------------------------------------------- test('AfriqueGemma: core EN↔African language pairs', { timeout: TIMEOUT }, async t => { const [modelName, dirPath] = await resolveModel() - const { addon, loader } = await createAddon(dirPath, modelName) + const { addon } = await createAddon(dirPath, modelName) try { await addon.load() t.pass('model loaded (Gemma 3 4B base, Q4_K_M via llama.cpp)') @@ -188,7 +185,6 @@ test('AfriqueGemma: 
core EN↔African language pairs', { timeout: TIMEOUT }, asy t.is(out1, out2, `deterministic: "${out1}"`) } finally { await addon.unload().catch(() => {}) - await loader.close().catch(() => {}) } }) @@ -203,7 +199,7 @@ test('AfriqueGemma: core EN↔African language pairs', { timeout: TIMEOUT }, asy // --------------------------------------------------------------------------- test('AfriqueGemma: African-to-African via English pivot', { timeout: TIMEOUT }, async t => { const [modelName, dirPath] = await resolveModel() - const { addon, loader } = await createAddon(dirPath, modelName) + const { addon } = await createAddon(dirPath, modelName) try { await addon.load() @@ -220,7 +216,6 @@ test('AfriqueGemma: African-to-African via English pivot', { timeout: TIMEOUT }, t.ok(!yorubaOutput.includes('English:'), 'final output is not English echo') } finally { await addon.unload().catch(() => {}) - await loader.close().catch(() => {}) } }) @@ -236,7 +231,7 @@ test('AfriqueGemma: African-to-African via English pivot', { timeout: TIMEOUT }, // --------------------------------------------------------------------------- test('AfriqueGemma: bridge languages (French, Portuguese, Arabic)', { timeout: TIMEOUT }, async t => { const [modelName, dirPath] = await resolveModel() - const { addon, loader } = await createAddon(dirPath, modelName) + const { addon } = await createAddon(dirPath, modelName) try { await addon.load() @@ -253,7 +248,6 @@ test('AfriqueGemma: bridge languages (French, Portuguese, Arabic)', { timeout: T } } finally { await addon.unload().catch(() => {}) - await loader.close().catch(() => {}) } }) @@ -265,7 +259,7 @@ test('AfriqueGemma: bridge languages (French, Portuguese, Arabic)', { timeout: T // --------------------------------------------------------------------------- test('AfriqueGemma: longer content, mixed content, sequential calls', { timeout: TIMEOUT }, async t => { const [modelName, dirPath] = await resolveModel() - const { addon, loader } = await 
createAddon(dirPath, modelName) + const { addon } = await createAddon(dirPath, modelName) try { await addon.load() @@ -287,7 +281,6 @@ test('AfriqueGemma: longer content, mixed content, sequential calls', { timeout: } } finally { await addon.unload().catch(() => {}) - await loader.close().catch(() => {}) } }) @@ -302,7 +295,7 @@ test('AfriqueGemma: longer content, mixed content, sequential calls', { timeout: // --------------------------------------------------------------------------- test('AfriqueGemma: African-language Unicode input (African → English)', { timeout: TIMEOUT }, async t => { const [modelName, dirPath] = await resolveModel() - const { addon, loader } = await createAddon(dirPath, modelName) + const { addon } = await createAddon(dirPath, modelName) try { await addon.load() @@ -327,7 +320,6 @@ test('AfriqueGemma: African-language Unicode input (African → English)', { tim t.comment('All non-Latin/diacritic inputs produced valid English output') } finally { await addon.unload().catch(() => {}) - await loader.close().catch(() => {}) } }) @@ -341,7 +333,7 @@ test('AfriqueGemma: African-language Unicode input (African → English)', { tim // --------------------------------------------------------------------------- test('AfriqueGemma: streaming tokens arrive incrementally with stats', { timeout: TIMEOUT }, async t => { const [modelName, dirPath] = await resolveModel() - const { addon, loader } = await createAddon(dirPath, modelName) + const { addon } = await createAddon(dirPath, modelName) try { await addon.load() @@ -375,6 +367,5 @@ test('AfriqueGemma: streaming tokens arrive incrementally with stats', { timeout } } finally { await addon.unload().catch(() => {}) - await loader.close().catch(() => {}) } }) diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/api-behavior.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/api-behavior.test.js index 594e184ea5..b82c9f20ca 100644 --- 
a/packages/qvac-lib-infer-llamacpp-llm/test/integration/api-behavior.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/api-behavior.test.js @@ -3,7 +3,7 @@ // Tests must match the behavior described in README section "API behavior by state". const test = require('brittle') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const LlmLlamacpp = require('../../index.js') const { ensureModel } = require('./utils') const { attachSpecLogger } = require('./spec-logger') @@ -35,7 +35,7 @@ async function setupModel (t, configOverrides = {}) { downloadUrl: MODEL.url }) - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const config = { device: useCpu ? 'cpu' : 'gpu', gpu_layers: '999', @@ -47,18 +47,16 @@ async function setupModel (t, configOverrides = {}) { const specLogger = attachSpecLogger({ forwardToConsole: true }) const model = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config, logger: console, opts: { stats: true } - }, config) + }) await model.load() t.teardown(async () => { await model.unload().catch(() => {}) - await loader.close().catch(() => {}) specLogger.release() }) diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/bitnet.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/bitnet.test.js index 3ccac870bb..067b0f78b8 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/bitnet.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/bitnet.test.js @@ -1,7 +1,7 @@ 'use strict' const test = require('brittle') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const LlmLlamacpp = require('../../index.js') const { ensureModel } = require('./utils') const { attachSpecLogger } = require('./spec-logger') @@ -32,7 +32,7 @@ test('bitnet model can run simple inference', { timeout: 600_000, skip: !isAndro downloadUrl: 
BITNET_MODEL.url }) - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const specLogger = attachSpecLogger({ forwardToConsole: true }) const config = { @@ -44,12 +44,11 @@ test('bitnet model can run simple inference', { timeout: 600_000, skip: !isAndro } const addon = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config, logger: console, opts: { stats: true } - }, config) + }) try { await addon.load() @@ -60,7 +59,6 @@ test('bitnet model can run simple inference', { timeout: 600_000, skip: !isAndro t.comment(`BitNet output: "${output}"`) } finally { await addon.unload().catch(() => { }) - await loader.close().catch(() => { }) specLogger.release() } }) diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/cache-state-machine.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/cache-state-machine.test.js index c4e153895b..6ea7dbd845 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/cache-state-machine.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/cache-state-machine.test.js @@ -3,7 +3,6 @@ const test = require('brittle') const path = require('bare-path') const fs = require('bare-fs') -const FilesystemDL = require('@qvac/dl-filesystem') const LlmLlamacpp = require('../../index.js') const { ensureModel } = require('./utils') const { attachSpecLogger } = require('./spec-logger') @@ -92,7 +91,7 @@ async function setupModel (t, overrides = {}) { downloadUrl: DEFAULT_MODEL.url }) - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const config = { ...BASE_CONFIG, ...overrides } const specLogger = attachSpecLogger({ forwardToConsole: true }) let loggerReleased = false @@ -103,24 +102,21 @@ async function setupModel (t, overrides = {}) { } const model = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config, logger: console, 
opts: { stats: true } - }, config) + }) try { await model.load() } catch (err) { releaseLogger() - await loader.close().catch(() => { }) throw err } t.teardown(async () => { await model.unload().catch(() => { }) - await loader.close().catch(() => { }) releaseLogger() }) diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/config-parameters.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/config-parameters.test.js index 90579d7c67..84db708190 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/config-parameters.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/config-parameters.test.js @@ -1,7 +1,7 @@ 'use strict' const test = require('brittle') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const LlmLlamacpp = require('../../index.js') const { ensureModel } = require('./utils') const { attachSpecLogger } = require('./spec-logger') @@ -343,7 +343,7 @@ async function executeScenario (t, scenario) { downloadUrl: 'https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf' }) - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const baseConfig = { device: useCpu ? 
'cpu' : 'gpu', @@ -362,12 +362,11 @@ async function executeScenario (t, scenario) { const logs = specLogger.logs const addon = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config: { ...baseConfig, ...scenario.overrides }, logger: createTestLogger(), opts: { stats: true } - }, { ...baseConfig, ...scenario.overrides }) + }) let loadSucceeded = false @@ -410,7 +409,6 @@ async function executeScenario (t, scenario) { if (loadSucceeded) { await addon.unload().catch(() => {}) } - await loader.close().catch(() => {}) specLogger.release() } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/dynamic-tools.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/dynamic-tools.test.js index 7baf88c9e2..85857006c9 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/dynamic-tools.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/dynamic-tools.test.js @@ -2,7 +2,6 @@ const test = require('brittle') const path = require('bare-path') -const FilesystemDL = require('@qvac/dl-filesystem') const LlmLlamacpp = require('../../index.js') const { ensureModel } = require('./utils') const { attachSpecLogger } = require('./spec-logger') @@ -83,7 +82,7 @@ async function setupModel (t, overrides = {}) { downloadUrl: QWEN3_MODEL.url }) - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const config = { ...BASE_CONFIG, ...overrides } const specLogger = attachSpecLogger({ forwardToConsole: true }) let loggerReleased = false @@ -94,24 +93,21 @@ async function setupModel (t, overrides = {}) { } const model = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config, logger: console, opts: { stats: true } - }, config) + }) try { await model.load() } catch (err) { releaseLogger() - await loader.close().catch(() => {}) throw err } t.teardown(async () => { await model.unload().catch(() => {}) - await 
loader.close().catch(() => {}) releaseLogger() }) diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/finetuning-pause-resume.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/finetuning-pause-resume.test.js index 4f4a6582dc..505cd6ed9f 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/finetuning-pause-resume.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/finetuning-pause-resume.test.js @@ -2,7 +2,6 @@ const test = require('brittle') const path = require('bare-path') -const FilesystemDL = require('@qvac/dl-filesystem') const LlmLlamacpp = require('../../index.js') const { ensureModel, @@ -89,7 +88,7 @@ function assertLossAndAccuracyAreFinite (t, result, modelId) { async function runLoraInference (t, modelVariant, modelName, modelDir, loraAdapterPath) { t.comment(`[${modelVariant.id}] Running inference with LoRA adapter: ${loraAdapterPath}`) - const inferLoader = new FilesystemDL({ dirPath: modelDir }) + const inferModelPath = path.join(modelDir, modelName) const inferConfig = { gpu_layers: '999', ctx_size: '512', @@ -98,16 +97,12 @@ async function runLoraInference (t, modelVariant, modelName, modelDir, loraAdapt lora: loraAdapterPath } - const inferModel = new LlmLlamacpp( - { - loader: inferLoader, - modelName, - diskPath: modelDir, - logger: console, - opts: { stats: true } - }, - inferConfig - ) + const inferModel = new LlmLlamacpp({ + files: { model: [inferModelPath] }, + config: inferConfig, + logger: console, + opts: { stats: true } + }) try { await inferModel.load() @@ -122,7 +117,6 @@ async function runLoraInference (t, modelVariant, modelName, modelDir, loraAdapt t.comment(`[${modelVariant.id}] LoRA inference stats: ${JSON.stringify(response.stats)}`) } finally { await inferModel.unload().catch(() => {}) - await inferLoader.close().catch(() => {}) } } @@ -143,7 +137,7 @@ test('finetuning pause and resume', { timeout: PAUSE_RESUME_TIMEOUT_MS, skip: sk }) const checkpointDir = 
finetuneConfig.checkpointSaveDir - const loader = new FilesystemDL({ dirPath: modelDir }) + const finetuneModelPath = path.join(modelDir, modelName) const loggerHandle = attachSpecLogger({ forwardToConsole: true }) const config = { @@ -153,16 +147,12 @@ test('finetuning pause and resume', { timeout: PAUSE_RESUME_TIMEOUT_MS, skip: sk verbosity: '2' } - const model = new LlmLlamacpp( - { - loader, - modelName, - diskPath: modelDir, - logger: console, - opts: { stats: true } - }, - config - ) + const model = new LlmLlamacpp({ + files: { model: [finetuneModelPath] }, + config, + logger: console, + opts: { stats: true } + }) try { await model.load() @@ -211,7 +201,6 @@ test('finetuning pause and resume', { timeout: PAUSE_RESUME_TIMEOUT_MS, skip: sk ) await model.unload().catch(() => {}) - await loader.close().catch(() => {}) const loraAdapterPath = path.join(finetuneConfig.outputParametersDir, 'trained-lora-adapter.gguf') await runLoraInference(t, modelVariant, modelName, modelDir, loraAdapterPath) @@ -275,7 +264,6 @@ test('finetuning pause and resume', { timeout: PAUSE_RESUME_TIMEOUT_MS, skip: sk t.pass(`[${modelVariant.id}] finetuning pause and resume completed`) await model.unload().catch(() => {}) - await loader.close().catch(() => {}) const loraAdapterPath = path.join(finetuneConfig.outputParametersDir, 'trained-lora-adapter.gguf') await runLoraInference(t, modelVariant, modelName, modelDir, loraAdapterPath) @@ -283,7 +271,6 @@ test('finetuning pause and resume', { timeout: PAUSE_RESUME_TIMEOUT_MS, skip: sk } finally { loggerHandle.release() await model.unload().catch(() => {}) - await loader.close().catch(() => {}) cleanupCheckpoints(checkpointDir) } } @@ -299,24 +286,20 @@ test('cancel() stops finetuning and removes pause checkpoint', { timeout: PAUSE_ const finetuneConfig = setupParams(modelDir, { checkpointSaveSteps: 5, datasetSize: isMobile ? 
8 : 16, testId: 'cancel-test' }) const checkpointDir = finetuneConfig.checkpointSaveDir - const loader = new FilesystemDL({ dirPath: modelDir }) + const cancelModelPath = path.join(modelDir, modelName) const loggerHandle = attachSpecLogger({ forwardToConsole: true }) - const model = new LlmLlamacpp( - { - loader, - modelName, - diskPath: modelDir, - logger: console, - opts: { stats: true } - }, - { + const model = new LlmLlamacpp({ + files: { model: [cancelModelPath] }, + config: { gpu_layers: '999', ctx_size: '512', device: forceCpuDevice ? 'cpu' : 'gpu', verbosity: '2' - } - ) + }, + logger: console, + opts: { stats: true } + }) const fs = require('bare-fs') @@ -352,7 +335,6 @@ test('cancel() stops finetuning and removes pause checkpoint', { timeout: PAUSE_ } finally { loggerHandle.release() await model.unload().catch(() => {}) - await loader.close().catch(() => {}) cleanupCheckpoints(checkpointDir) } }) @@ -368,7 +350,7 @@ test('inference with session cache works after finetuning', { timeout: PAUSE_RES const checkpointDir = finetuneConfig.checkpointSaveDir const sessionFile = path.join(modelDir, 'test-session-finetune.bin') - const loader = new FilesystemDL({ dirPath: modelDir }) + const sessionModelPath = path.join(modelDir, modelName) const loggerHandle = attachSpecLogger({ forwardToConsole: true }) const config = { @@ -380,16 +362,12 @@ test('inference with session cache works after finetuning', { timeout: PAUSE_RES seed: '42' } - const model = new LlmLlamacpp( - { - loader, - modelName, - diskPath: modelDir, - logger: console, - opts: { stats: true } - }, - config - ) + const model = new LlmLlamacpp({ + files: { model: [sessionModelPath] }, + config, + logger: console, + opts: { stats: true } + }) const fs = require('bare-fs') @@ -430,7 +408,6 @@ test('inference with session cache works after finetuning', { timeout: PAUSE_RES } finally { loggerHandle.release() await model.unload().catch(() => {}) - await loader.close().catch(() => {}) 
cleanupCheckpoints(checkpointDir) try { fs.unlinkSync(sessionFile) } catch (_) {} } @@ -445,11 +422,13 @@ test('microBatchSize override changes backend batch geometry', { timeout: PAUSE_ async function getTotalBatches (batchSize, microBatchSize, testId) { const config = setupParams(modelDir, { batchSize, microBatchSize, checkpointSaveSteps: 0, testId }) - const loader = new FilesystemDL({ dirPath: modelDir }) - const model = new LlmLlamacpp( - { loader, modelName, diskPath: modelDir, logger: console, opts: { stats: true } }, - { gpu_layers: '999', ctx_size: '512', device: forceCpuDevice ? 'cpu' : 'gpu', verbosity: '0' } - ) + const batchModelPath = path.join(modelDir, modelName) + const model = new LlmLlamacpp({ + files: { model: [batchModelPath] }, + config: { gpu_layers: '999', ctx_size: '512', device: forceCpuDevice ? 'cpu' : 'gpu', verbosity: '0' }, + logger: console, + opts: { stats: true } + }) try { await model.load() const handle = await model.finetune(config) @@ -459,7 +438,6 @@ test('microBatchSize override changes backend batch geometry', { timeout: PAUSE_ return totalBatches } finally { await model.unload().catch(() => {}) - await loader.close().catch(() => {}) cleanupCheckpoints(config.checkpointSaveDir) } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/generation-params.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/generation-params.test.js index 613956cbc7..93ae6dec40 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/generation-params.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/generation-params.test.js @@ -1,7 +1,7 @@ 'use strict' const test = require('brittle') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const LlmLlamacpp = require('../../index.js') const { ensureModel } = require('./utils') const { attachSpecLogger } = require('./spec-logger') @@ -27,7 +27,7 @@ async function setupModel (t, configOverrides = {}) { 
downloadUrl: MODEL.url }) - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const config = { device: useCpu ? 'cpu' : 'gpu', gpu_layers: '999', @@ -40,18 +40,16 @@ async function setupModel (t, configOverrides = {}) { const specLogger = attachSpecLogger({ forwardToConsole: true }) const model = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config, logger: console, opts: { stats: true } - }, config) + }) await model.load() t.teardown(async () => { await model.unload().catch(() => {}) - await loader.close().catch(() => {}) specLogger.release() }) diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/http-loader.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/http-loader.js index aa832a9da1..9d44d80271 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/http-loader.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/http-loader.js @@ -1,16 +1,26 @@ 'use strict' const https = require('bare-https') -const BaseDL = require('@qvac/dl-base') /** - * A minimal HTTP/HTTPS loader that implements the BaseDL interface. - * Fetches model files from a remote base URL, following redirects. + * Minimal HTTP/HTTPS streamer used by the sharded-model integration test + * to download a small public sharded GGUF before constructing the addon. + * + * Standalone — does not extend any base loader class. The package no + * longer depends on `@qvac/dl-base` after the loader-removal refactor; + * this helper exists solely so the sharded model-loading test can fetch + * shard files without pulling a heavyweight loader implementation back + * into devDependencies. + * + * Only the surface used by `model-loading.test.js` is implemented: + * - `new HttpDL({ baseUrl })` + * - `getStream(filename)` — returns a Bare-https response stream that + * can be piped into `fs.createWriteStream`. 
+ * - `close()` — destroys any in-flight streams the caller did not + * consume to completion. */ -class HttpDL extends BaseDL { +class HttpDL { constructor (opts) { - super(opts) - if (!opts || !opts.baseUrl) { throw new Error('HttpDL requires a baseUrl option') } @@ -19,15 +29,6 @@ class HttpDL extends BaseDL { this._activeStreams = new Set() } - /** - * Return the Content-Length of a remote file via an HTTP HEAD request. - * @param {string} filename - * @returns {Promise} byte size - */ - async getFileSize (filename) { - return this._request('HEAD', this.baseUrl + filename) - } - /** * Fetch a file by name and return it as a readable stream. * The stream is tracked so that close() can destroy it if needed. @@ -44,7 +45,10 @@ class HttpDL extends BaseDL { return response } - async _close () { + /** + * Destroy any tracked streams that have not finished on their own. + */ + async close () { for (const stream of this._activeStreams) { stream.destroy() } @@ -73,22 +77,13 @@ class HttpDL extends BaseDL { return } - if (method === 'HEAD') { - response.resume() - resolve(parseInt(response.headers['content-length'] || '0', 10)) - } else { - resolve(response) - } + resolve(response) }) req.on('error', reject) req.end() }) } - - async list () { - throw new Error('HttpDL does not support list()') - } } module.exports = HttpDL diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/image.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/image.test.js index c3fb7c3442..0989b91cc8 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/image.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/image.test.js @@ -4,7 +4,6 @@ const test = require('brittle') const fs = require('bare-fs') const path = require('bare-path') const { ensureModel, getMediaPath } = require('./utils') -const FilesystemDL = require('@qvac/dl-filesystem') const LlmLlamacpp = require('../../index.js') const os = require('bare-os') const process = 
require('bare-process') @@ -119,7 +118,7 @@ function getConfig (device, modelConfig) { * Sets up a multimodal LlmLlamacpp instance with LLM and projection models * @param {Object} t - Test instance * @param {string} device - Device to use ('cpu' or 'gpu') - * @returns {Promise<{inference: LlmLlamacpp, loader: FilesystemDL}>} + * @returns {Promise<{inference: LlmLlamacpp}>} */ async function setupMultimodalInference (t, device = 'gpu', modelConfig = MULTIMODAL_MODEL_CONFIG) { const [modelName, dirPath] = await ensureModel(modelConfig.llmModel) @@ -128,23 +127,20 @@ async function setupMultimodalInference (t, device = 'gpu', modelConfig = MULTIM const [projModelName] = await ensureModel(modelConfig.projModel) t.ok(fs.existsSync(path.join(dirPath, projModelName)), 'Projection model file should exist') - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const inference = new LlmLlamacpp({ - modelName, - loader, - logger: console, - diskPath: dirPath, - projectionModel: projModelName - }, getConfig(device, modelConfig)) + files: { model: [modelPath], projectionModel: path.join(dirPath, projModelName) }, + config: getConfig(device, modelConfig), + logger: console + }) t.teardown(async () => { - await loader.close() await inference.unload() }) await inference.load() - return { inference, loader } + return { inference } } /** @@ -300,6 +296,32 @@ const imageTestCases = [ } ] +async function describeImageByPath (inference, imageFilePath, prompt = TEST_CONSTANTS.defaultPrompt) { + const messages = [ + { role: 'system', content: 'You are a helpful assistant.' 
}, + { role: 'user', type: 'media', content: imageFilePath }, + { role: 'user', content: prompt } + ] + + const response = await inference.run(messages) + const generatedText = [] + let error = null + + response.onUpdate(data => { + generatedText.push(data) + }).onError(err => { + error = err + }) + + await response.await() + + if (error) { + throw new Error('Inference error: ' + error) + } + + return generatedText.join('') +} + for (const testCase of imageTestCases) { test(`llama addon can recognize ${testCase.name} in an image`, { timeout: TEST_CONSTANTS.timeout }, async t => { for (const deviceConfig of DEVICE_CONFIGS) { @@ -331,6 +353,26 @@ for (const testCase of imageTestCases) { }) } +test('llama addon accepts a file path string as media content', { timeout: TEST_CONSTANTS.timeout }, async t => { + const deviceConfig = DEVICE_CONFIGS[0] + const label = `[${deviceConfig.id.toUpperCase()}]` + + const { inference } = await setupMultimodalInference(t, deviceConfig.device) + + const imageFilePath = getMediaPath('elephant.jpg') + t.ok(fs.existsSync(imageFilePath), `${label} elephant.jpg image file should exist`) + + const generatedText = await describeImageByPath(inference, imageFilePath) + t.comment(`${label} Generated text: ${generatedText}`) + + t.ok(generatedText.length > 0, `${label} Should generate text output when media content is a file path`) + const { hasMatch, foundKeywords } = checkKeywordsInText(generatedText, ['elephant', 'elephants']) + t.ok(hasMatch, + `${label} Output should describe the elephant when image is passed as a path string. ` + + `Found keywords: ${foundKeywords.join(', ') || 'none'}. ` + + `Full output: "${generatedText}"`) +}) + // TODO: Fix multi-image for smaller models? Seems like an image per separate message works // TODO: on smaller models, rather than all images on same message. 
// TODO: Discussion at: https://github.com/tetherto/qvac/pull/172#discussion_r2807275659 diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/model-loading.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/model-loading.test.js index 4bcc122af1..111f3acc58 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/model-loading.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/model-loading.test.js @@ -1,7 +1,6 @@ 'use strict' const test = require('brittle') -const FilesystemDL = require('@qvac/dl-filesystem') const LlmLlamacpp = require('../../index.js') const { ensureModel } = require('./utils') @@ -48,7 +47,7 @@ test('filesystem loader can run inference end-to-end', { timeout: 600_000, skip: downloadUrl: DEFAULT_MODEL.url }) - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const config = { gpu_layers: '999', ctx_size: '1024', @@ -58,12 +57,11 @@ test('filesystem loader can run inference end-to-end', { timeout: 600_000, skip: } const addon = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config, logger: console, opts: { stats: true } - }, config) + }) try { await addon.load() @@ -76,7 +74,6 @@ test('filesystem loader can run inference end-to-end', { timeout: 600_000, skip: t.fail('filesystem-loaded model should generate output', error) } finally { await addon.unload().catch(() => {}) - await loader.close().catch(() => {}) } }) @@ -86,7 +83,7 @@ test('model unload is clean and idempotent', { timeout: 600_000 }, async t => { downloadUrl: DEFAULT_MODEL.url }) - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const config = { gpu_layers: '512', ctx_size: '1024', @@ -96,34 +93,29 @@ test('model unload is clean and idempotent', { timeout: 600_000 }, async t => { } const addon = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + 
config, logger: console, opts: { stats: true } - }, config) + }) - try { - await addon.load() - const firstResponse = await addon.run(BASE_PROMPT) - await collectResponse(firstResponse) + await addon.load() + const firstResponse = await addon.run(BASE_PROMPT) + await collectResponse(firstResponse) - await addon.unload() - t.pass('first unload succeeded') + await addon.unload() + t.pass('first unload succeeded') - await addon.load() - const secondResponse = await addon.run(BASE_PROMPT) - await collectResponse(secondResponse) + await addon.load() + const secondResponse = await addon.run(BASE_PROMPT) + await collectResponse(secondResponse) - await addon.unload() - t.pass('second unload succeeded') + await addon.unload() + t.pass('second unload succeeded') - await addon.unload().catch(err => { - if (err) t.fail('unload should be idempotent', err) - }) - } finally { - await loader.close().catch(() => {}) - } + await addon.unload().catch(err => { + if (err) t.fail('unload should be idempotent', err) + }) }) const SHARDED_MODEL = { @@ -134,10 +126,34 @@ const SHARDED_MODEL = { // This test can take longer to download and execute. To avoid blowing up testing time on all // platforms, just use Linux for now. C++ tests already have faster coverage for each type // of load. 
-test('network loader can run inference end-to-end with sharded model', { timeout: 4 * 60 * 1000, skip: !isLinuxX64 }, async t => { +test('sharded model can run inference end-to-end', { timeout: 4 * 60 * 1000, skip: !isLinuxX64 }, async t => { + const fs = require('bare-fs') const modelDir = path.resolve(__dirname, '../model') + fs.mkdirSync(modelDir, { recursive: true }) + + const shardFiles = [ + 'Qwen3-0.6B-UD-IQ1_S.tensors.txt', + 'Qwen3-0.6B-UD-IQ1_S-00001-of-00003.gguf', + 'Qwen3-0.6B-UD-IQ1_S-00002-of-00003.gguf', + 'Qwen3-0.6B-UD-IQ1_S-00003-of-00003.gguf' + ] const loader = new HttpDL({ baseUrl: SHARDED_MODEL.baseUrl }) + for (const filename of shardFiles) { + const dest = path.join(modelDir, filename) + if (fs.existsSync(dest)) continue + console.log(` Downloading shard: ${filename}`) + const stream = await loader.getStream(filename) + const ws = fs.createWriteStream(dest) + for await (const chunk of stream) { + ws.write(chunk) + } + ws.end() + await new Promise(resolve => ws.on('close', resolve)) + } + await loader.close().catch(() => {}) + + const shardPaths = shardFiles.map(f => path.join(modelDir, f)) const config = { gpu_layers: '999', ctx_size: '1024', @@ -147,39 +163,19 @@ test('network loader can run inference end-to-end with sharded model', { timeout } const addon = new LlmLlamacpp({ - loader, - modelName: SHARDED_MODEL.name, - diskPath: modelDir, + files: { model: shardPaths }, + config, logger: console, opts: { stats: true } - }, config) - - let progressMade = 0 - let lastLogTime = 0 - const LOG_INTERVAL_MS = 3000 - const onProgress = (data) => { - if (typeof data !== 'object' || data === null) return - const now = Date.now() - const shard = data.currentFile.replace(/^.*\//, '') - progressMade = Math.max(progressMade, data.overallProgress) - if (data.action === 'loadingFile' && now - lastLogTime >= LOG_INTERVAL_MS) { - console.log(`\r Loading ${shard}: ${data.currentFileProgress}% (overall ${data.overallProgress}%) `) - lastLogTime = now - } 
else if (data.action === 'completeFile') { - console.log(`\r Loaded ${shard}: 100.00% (overall ${data.overallProgress}%) [${data.filesProcessed}/${data.totalFiles}]\n`) - lastLogTime = now - } - } + }) try { - await addon.load(true, onProgress) + await addon.load() const response = await addon.run(BASE_PROMPT) const output = await collectResponse(response) - t.ok(output.length > 0, 'network-loaded sharded model should generate output') - t.ok(progressMade > 0, 'network-loaded sharded model should make progress') + t.ok(output.length > 0, 'sharded model should generate output') } finally { await addon.unload().catch(() => {}) - await loader.close().catch(() => {}) } }) diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/moe.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/moe.test.js index 15da081db3..5f8606aa2d 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/moe.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/moe.test.js @@ -2,7 +2,7 @@ const test = require('brittle') const os = require('bare-os') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const LlmLlamacpp = require('../../index.js') const { ensureModel } = require('./utils') const { attachSpecLogger } = require('./spec-logger') @@ -40,15 +40,14 @@ test('llm addon can run MoE models [dolphin-mixtral-2x7b]', { }, async t => { const [modelName, dirPath] = await ensureModel({ modelName: MODEL.name, downloadUrl: MODEL.url }) - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const specLogger = attachSpecLogger({ forwardToConsole: true }) const inference = new LlmLlamacpp({ - modelName, - loader, + files: { model: [modelPath] }, + config: CONFIG, logger: console, - diskPath: dirPath, opts: { stats: true } - }, CONFIG) + }) try { await inference.load() @@ -60,6 +59,5 @@ test('llm addon can run MoE models [dolphin-mixtral-2x7b]', { } finally { specLogger.release() 
await inference.unload().catch(() => {}) - await loader.close().catch(() => {}) } }) diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/multi-instance.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/multi-instance.test.js index a7accc1753..01c88e3f39 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/multi-instance.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/multi-instance.test.js @@ -1,7 +1,7 @@ 'use strict' const test = require('brittle') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const LlmLlamacpp = require('../../index.js') const { ensureModel } = require('./utils') const os = require('bare-os') @@ -38,7 +38,7 @@ function createLogger () { } async function createInstance (modelName, dirPath, overrides = {}) { - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const config = { device: useCpu ? 'cpu' : 'gpu', gpu_layers: '999', @@ -50,12 +50,11 @@ async function createInstance (modelName, dirPath, overrides = {}) { } const addon = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config, logger: createLogger(), opts: { stats: true } - }, config) + }) const origLoad = addon.load.bind(addon) addon.load = async function () { @@ -64,7 +63,7 @@ async function createInstance (modelName, dirPath, overrides = {}) { console.log(` model.load() took ${Date.now() - t0} ms`) } - return { addon, loader } + return { addon } } async function collectResponse (response) { @@ -82,14 +81,12 @@ test('Two instances can run inference simultaneously', { downloadUrl: DEFAULT_MODEL.url }) - const { addon: addon1, loader: loader1 } = await createInstance(modelName, dirPath) - const { addon: addon2, loader: loader2 } = await createInstance(modelName, dirPath) + const { addon: addon1 } = await createInstance(modelName, dirPath) + const { addon: addon2 } = await 
createInstance(modelName, dirPath) t.teardown(async () => { await addon1.unload().catch(() => {}) await addon2.unload().catch(() => {}) - await loader1.close().catch(() => {}) - await loader2.close().catch(() => {}) }) await addon1.load() @@ -119,7 +116,7 @@ test('Repeated load/unload cycles should remain stable', { const NUM_CYCLES = 6 for (let i = 0; i < NUM_CYCLES; i++) { - const { addon, loader } = await createInstance(modelName, dirPath) + const { addon } = await createInstance(modelName, dirPath) await addon.load() const response = await addon.run(BASE_PROMPT) @@ -128,7 +125,6 @@ test('Repeated load/unload cycles should remain stable', { t.ok(output.length > 0, `cycle ${i + 1}: produced output`) await addon.unload() - await loader.close() t.pass(`cycle ${i + 1}: load/unload completed`) } @@ -145,16 +141,14 @@ test('Unloading one instance does not affect another generating instance', { downloadUrl: DEFAULT_MODEL.url }) - const { addon: addon1, loader: loader1 } = await createInstance(modelName, dirPath, { + const { addon: addon1 } = await createInstance(modelName, dirPath, { n_predict: '256' }) - const { addon: addon2, loader: loader2 } = await createInstance(modelName, dirPath) + const { addon: addon2 } = await createInstance(modelName, dirPath) t.teardown(async () => { await addon1.unload().catch(() => {}) await addon2.unload().catch(() => {}) - await loader1.close().catch(() => {}) - await loader2.close().catch(() => {}) }) await addon1.load() @@ -189,7 +183,6 @@ test('Unloading one instance does not affect another generating instance', { if (!unloadedInstance2) { unloadedInstance2 = true await addon2.unload() - await loader2.close() t.pass('unloaded instance 2 while instance 1 is generating') } @@ -209,13 +202,12 @@ test('Multiple load/unload cycles on one instance while another generates', { downloadUrl: DEFAULT_MODEL.url }) - const { addon: addon1, loader: loader1 } = await createInstance(modelName, dirPath, { + const { addon: addon1 } = await 
createInstance(modelName, dirPath, { n_predict: '512' }) t.teardown(async () => { await addon1.unload().catch(() => {}) - await loader1.close().catch(() => {}) }) await addon1.load() @@ -262,10 +254,9 @@ test('Multiple load/unload cycles on one instance while another generates', { } cyclesCompleted++ const cycleNum = cyclesCompleted - const { addon: addon2, loader: loader2 } = await createInstance(modelName, dirPath) + const { addon: addon2 } = await createInstance(modelName, dirPath) await addon2.load() await addon2.unload() - await loader2.close() t.pass(`load/unload cycle ${cycleNum} completed while instance 1 generates`) } diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js index 9adf88567a..546abe8f91 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js @@ -4,7 +4,6 @@ const test = require('brittle') const fs = require('bare-fs') const path = require('bare-path') const { ensureModel, getMediaPath } = require('./utils') -const FilesystemDL = require('@qvac/dl-filesystem') const LlmLlamacpp = require('../../index.js') const os = require('bare-os') @@ -55,23 +54,20 @@ async function setupLightOnInference (t, device = 'gpu') { const [projModelName] = await ensureModel(LIGHTON_OCR_CONFIG.projModel) t.ok(fs.existsSync(path.join(dirPath, projModelName)), 'Projection model file should exist') - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const inference = new LlmLlamacpp({ - modelName, - loader, - logger: console, - diskPath: dirPath, - projectionModel: projModelName - }, getConfig(device)) + files: { model: [modelPath], projectionModel: path.join(dirPath, projModelName) }, + config: getConfig(device), + logger: console + }) t.teardown(async () => { - await loader.close() await inference.unload() 
}) await inference.load() - return { inference, loader } + return { inference } } async function runOcr (inference, imageFilePath) { diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/reasoning.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/reasoning.test.js index e29d5cf0c0..e27e1ee7e3 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/reasoning.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/reasoning.test.js @@ -1,10 +1,10 @@ 'use strict' const test = require('brittle') +const path = require('bare-path') const { ensureModel } = require('./utils') const { attachSpecLogger } = require('./spec-logger') const os = require('bare-os') -const FilesystemDL = require('@qvac/dl-filesystem') const LlmLlamacpp = require('../../index.js') const isDarwinX64 = os.platform() === 'darwin' && os.arch() === 'x64' @@ -23,7 +23,7 @@ async function setupReasoningModel (t, toolsEnabled) { downloadUrl: MODEL.url }) - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const specLogger = attachSpecLogger({ forwardToConsole: true }) const config = { @@ -38,27 +38,24 @@ async function setupReasoningModel (t, toolsEnabled) { } const inference = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config, logger: console, - projectionPath: '', opts: { stats: true } - }, config) + }) await inference.load() t.teardown(async () => { try { specLogger.release() - if (loader) await loader.close() if (inference) await inference.unload() } catch (err) { // Ignore cleanup errors } }) - return { inference, loader } + return { inference } } // Shared helper: Run a completion and collect response diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/sliding-context.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/sliding-context.test.js index 384306b5e5..c29c4d3977 100644 --- 
a/packages/qvac-lib-infer-llamacpp-llm/test/integration/sliding-context.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/sliding-context.test.js @@ -2,7 +2,6 @@ const test = require('brittle') const path = require('bare-path') -const FilesystemDL = require('@qvac/dl-filesystem') const LlmLlamacpp = require('../../index.js') const { ensureModel } = require('./utils') const os = require('bare-os') @@ -55,7 +54,7 @@ async function setupModel (t, overrides = {}) { downloadUrl: DEFAULT_MODEL.url }) - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const baseConfig = { device: useCpu ? 'cpu' : 'gpu', @@ -69,19 +68,13 @@ async function setupModel (t, overrides = {}) { } const model = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config: { ...baseConfig, ...overrides }, logger: createTestLogger(), opts: { stats: true } - }, { ...baseConfig, ...overrides }) + }) - try { - await model.load() - } catch (err) { - await loader.close().catch(() => {}) - throw err - } + await model.load() t.teardown(async () => { // Guard against model.unload() hanging after context overflow (seen on darwin-arm64 CI). 
@@ -89,7 +82,6 @@ async function setupModel (t, overrides = {}) { const unloadDone = model.unload().catch(() => {}) const unloadTimeout = new Promise(resolve => setTimeout(resolve, 30_000)) await Promise.race([unloadDone, unloadTimeout]) - await loader.close().catch(() => {}) }) return { model, dirPath } diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/tool-calling.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/tool-calling.test.js index 0a6272acfd..6e2f2b0d80 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/tool-calling.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/tool-calling.test.js @@ -1,7 +1,7 @@ 'use strict' const test = require('brittle') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const LlmLlamacpp = require('../../index.js') const { ensureModel } = require('./utils') const { attachSpecLogger } = require('./spec-logger') @@ -141,7 +141,7 @@ async function createToolModel (modelVariant) { downloadUrl: modelVariant.downloadUrl }) - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const specLogger = attachSpecLogger({ forwardToConsole: true }) let loggerReleased = false const releaseLogger = () => { @@ -151,18 +151,16 @@ async function createToolModel (modelVariant) { } const model = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config: BASE_CONFIG, logger: console, opts: { stats: true } - }, BASE_CONFIG) + }) try { await model.load() } catch (err) { releaseLogger() - await loader.close().catch(() => {}) throw err } @@ -170,7 +168,6 @@ async function createToolModel (modelVariant) { model, async release () { await model.unload().catch(() => {}) - await loader.close().catch(() => {}) releaseLogger() } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/utf8-output.test.js 
b/packages/qvac-lib-infer-llamacpp-llm/test/integration/utf8-output.test.js index 18c0fb0670..1450600666 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/utf8-output.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/utf8-output.test.js @@ -1,7 +1,7 @@ 'use strict' const test = require('brittle') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const LlmLlamacpp = require('../../index.js') const { ensureModel } = require('./utils') const { attachSpecLogger } = require('./spec-logger') @@ -39,7 +39,7 @@ test('model returns UTF-8 emoji without truncation', { timeout: 600_000 }, async downloadUrl: MODEL.url }) - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const specLogger = attachSpecLogger({ forwardToConsole: true }) let loggerReleased = false const releaseLogger = () => { @@ -61,12 +61,11 @@ test('model returns UTF-8 emoji without truncation', { timeout: 600_000 }, async } const model = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config, logger: console, opts: { stats: true } - }, config) + }) let output = '' try { @@ -86,7 +85,6 @@ test('model returns UTF-8 emoji without truncation', { timeout: 600_000 }, async t.ok(response.stats.generatedTokens > 0, 'token stats recorded') } finally { await model.unload().catch(() => {}) - await loader.close().catch(() => {}) releaseLogger() } }) diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/mobile/README.md b/packages/qvac-lib-infer-llamacpp-llm/test/mobile/README.md index 2a82fbd4cc..cb8be631a3 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/mobile/README.md +++ b/packages/qvac-lib-infer-llamacpp-llm/test/mobile/README.md @@ -1,46 +1,43 @@ # Mobile Testing for LLM Llamacpp -This directory contains the mobile test configuration for the `@qvac/llm-llamacpp` addon. 
+This directory contains the mobile test entrypoint for the `@qvac/llm-llamacpp` addon. > ⚠️ **Note**: This test directory is included in the published npm package to support the mobile testing framework. These test files are NOT part of the public API and should only be used by the internal mobile testing infrastructure. ## Test Structure -- `test.cjs` - Main test file with `startTest()` function that runs automatically on mobile -- `testAssets/` - Directory for model files and test data +- `integration-runtime.cjs` — Bare-runtime helper that exposes a global `runIntegrationModule()` so each generated test entry can dynamically import a single file under `../integration/`. +- `integration.auto.cjs` — **Auto-generated** by `npm run test:mobile:generate`. Each function in this file mirrors one `.test.js` under `test/integration/` and invokes it through the runtime helper. Do not edit by hand; regenerate after adding or renaming integration tests. +- `testAssets/` — Directory for model files and test data referenced by the integration tests. -## Setup +## What the Mobile Tests Do -### Download Test Model +The mobile tests run the **same integration suite** that lives under `test/integration/`. They exercise the public `LlmLlamacpp` API end-to-end: -The test requires a small GGUF model file. Download it to the `testAssets` directory: +1. **Construct the addon** with the new constructor shape — `new LlmLlamacpp({ files: { model: [absolutePath] }, config, logger?, opts? })`. For sharded GGUF models the caller pre-resolves the shard list (`tensors.txt` + every `*-NNNNN-of-MMMMM.gguf` file). +2. **Load** the model into memory via `model.load()`. +3. **Run** inference, finetuning, generation-parameter, KV-cache, and other scenarios depending on which test entry is invoked. +4. **Unload** the model via `model.unload()` (or via `t.teardown()` in brittle tests). 
-```bash -cd test/mobile/testAssets +There is **no separate `test.cjs` file** and the addon no longer takes a `Loader` instance — file paths are passed directly to the constructor by the test (or by the test helper in `test/integration/utils.js`). Mobile testing reuses these helpers unchanged. -# Download a small test model (~500KB) -curl -L -o small-test-model.gguf \ - https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K.gguf -``` +## Setup -### Verify Setup +### Test Assets -```bash -ls -lh testAssets/ -# Should show: small-test-model.gguf (~500KB) -``` +Each integration test downloads or expects its own model under `test/integration/...` (or under `testAssets/`). See the individual test files for the exact model required. Most tests rely on `setupModel()` / `setupTinyModel()` helpers in `test/integration/utils.js`, which resolve the absolute file paths and pass them through `files.model`. + +## Regenerating `integration.auto.cjs` -## What the Test Does +After adding a new file under `test/integration/`, regenerate the mobile entries: -The mobile test performs a complete LLM inference workflow: +```bash +npm run test:mobile:generate +``` -1. **Initialize Filesystem Loader** - Sets up file access for the model -2. **Configure Model** - Uses GPU-accelerated settings (99 GPU layers) for faster inference -3. **Load Model** - Loads the GGUF model weights into memory and offloads to GPU -4. **Run Inference** - Generates text from the prompt "Say hello in one word" -5. **Cleanup** - Properly destroys the model instance and closes the loader +This walks `test/integration/`, derives a function name per test file, and rewrites `integration.auto.cjs`. The generator script also runs from CI to ensure mobile and desktop test inventories stay in sync. 
-## Running the Test +## Running the Tests From the mobile tester app root: @@ -55,42 +52,16 @@ npm run android npm run ios ``` -The app will: -- Automatically initialize after 3 seconds -- Start the test after 5 seconds -- Display progress and results on screen - -## Expected Output - -Success message will show: -``` -TEST COMPLETE ✓ - -Model loaded and generated X characters in response to: "Say hello in one word." - -Generated: Hello -``` +The app drives the auto-generated entrypoints to execute the desired test scenarios on-device. ## Troubleshooting ### Model file not found -- Ensure `small-test-model.gguf` is in the `testAssets/` directory -- Check that the file downloaded completely (~500KB) +- Ensure the test asset referenced by the failing integration test is present under `test/integration/` (or `testAssets/`). +- For sharded models, every shard plus the `*.tensors.txt` file must be present — the caller is responsible for the full file set since the addon no longer downloads weights. ### Out of memory -- The test uses a very small model (~500KB) -- If issues persist, try closing other apps +- Mobile devices have limited RAM. Prefer the smaller test models (e.g. tinyllama / Qwen-0.6B) for on-device runs and skip large-model tests where possible. ### Timeout errors -- The test waits up to 60 seconds for generation -- On slower devices, this may need to be increased in `test.cjs` - -## Model Details - -**Model**: TinyLlamas Stories 260K -- Size: ~500KB -- Format: GGUF -- Purpose: Fast mobile testing -- Source: https://huggingface.co/ggml-org/models - -This is an extremely small model designed for quick testing, not production use. +- Generation timeouts can be tuned per test file in `test/integration/...` via the brittle `{ timeout }` option. 
diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/mobile/integration.auto.cjs b/packages/qvac-lib-infer-llamacpp-llm/test/mobile/integration.auto.cjs index 253ba67b03..cdf52bc043 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/mobile/integration.auto.cjs +++ b/packages/qvac-lib-infer-llamacpp-llm/test/mobile/integration.auto.cjs @@ -6,6 +6,7 @@ require('./integration-runtime.cjs') // Functions are invoked dynamically by the mobile test runner framework. /* global runIntegrationModule */ + /* global __shouldRunTest */ const __FILTERED = { modulePath: 'filtered', summary: { total: 0, passed: 0, failed: 0 } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/unit/finetuning.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/unit/finetuning.test.js index 9bb24441fe..d0d3b727f1 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/unit/finetuning.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/unit/finetuning.test.js @@ -60,17 +60,12 @@ async function assertInferenceSucceeds (t, model, token) { } const createModelWithMockAddon = (opts = {}) => { - const loader = { close: () => Promise.resolve() } - const model = new LlmLlamacpp( - { - loader, - opts, - logger: { info: () => {}, warn: () => {}, error: () => {}, debug: () => {} }, - diskPath: '.', - modelName: 'test.gguf' - }, - { device: 'cpu', ctx_size: '256' } - ) + const model = new LlmLlamacpp({ + files: { model: ['/tmp/test.gguf'] }, + config: { device: 'cpu', ctx_size: '256' }, + opts, + logger: { info: () => {}, warn: () => {}, error: () => {}, debug: () => {} } + }) model.addon = createMockAddon() return model } @@ -183,13 +178,14 @@ test('finetune() runs inside exclusive queue wrapper', async (t) => { model.addon.finetune.callsFake(completeFinetuneWith(model)) let wrapperCalled = false - model._withExclusiveRun = async (fn) => { + const originalRun = model._run + model._run = async (fn) => { wrapperCalled = true - return await fn() + return await originalRun(fn) } const handle = await 
model.finetune(opts) - t.ok(wrapperCalled, 'finetune should execute inside _withExclusiveRun') + t.ok(wrapperCalled, 'finetune should execute inside exclusiveRunQueue') const result = await handle.await() t.alike(result, { op: 'finetune', status: 'COMPLETED' }) }) @@ -405,16 +401,16 @@ test('_skipNextRuntimeStats swallows TPS stats that follow a finetune terminal r model.addon.finetune.callsFake(() => true) const handle = await model.finetune(opts) - t.is(model._skipNextRuntimeStats, false, 'flag starts false before finetune terminal arrives') + t.is(model._addonEventState.skipNextRuntimeStats, false, 'flag starts false before finetune terminal arrives') model._addonOutputCallback(null, 'Output', { op: 'finetune', status: 'COMPLETED' }, null) - t.is(model._skipNextRuntimeStats, true, 'flag must be set after finetune terminal result') + t.is(model._addonEventState.skipNextRuntimeStats, true, 'flag must be set after finetune terminal result') const result = await handle.await() t.alike(result, { op: 'finetune', status: 'COMPLETED' }) model._addonOutputCallback(null, 'Output', { TPS: 0, tokens: 0, time_ms: 100 }, null) - t.is(model._skipNextRuntimeStats, false, 'flag must reset after TPS stats are consumed') + t.is(model._addonEventState.skipNextRuntimeStats, false, 'flag must reset after TPS stats are consumed') }) test('TPS stats without prior finetune are forwarded as normal JobEnded', async (t) => { @@ -422,7 +418,7 @@ test('TPS stats without prior finetune are forwarded as normal JobEnded', async model.addon.runJob.callsFake(() => true) const response = await model._runInternal([{ role: 'user', content: 'Hello' }]) - t.is(model._skipNextRuntimeStats, false, 'flag should be false without finetune') + t.is(model._addonEventState.skipNextRuntimeStats, false, 'flag should be false without finetune') model._addonOutputCallback(null, 'Output', 'world', null) model._addonOutputCallback(null, 'Output', { TPS: 42.5, tokens: 10, time_ms: 235 }, null) @@ -430,7 +426,7 @@ 
test('TPS stats without prior finetune are forwarded as normal JobEnded', async const output = await response.await() t.ok(Array.isArray(output), 'inference response should resolve with output array') t.ok(output.includes('world'), 'output should contain the emitted token') - t.is(model._skipNextRuntimeStats, false, 'flag should remain false') + t.is(model._addonEventState.skipNextRuntimeStats, false, 'flag should remain false') t.is(model._hasActiveResponse, false, 'busy state should be cleared') }) @@ -442,13 +438,13 @@ test('_skipNextRuntimeStats prevents finetune TPS from ending a subsequent infer const finetuneHandle = await model.finetune(opts) model._addonOutputCallback(null, 'Output', { op: 'finetune', status: 'COMPLETED' }, null) await finetuneHandle.await() - t.is(model._skipNextRuntimeStats, true, 'skip flag should be armed after finetune') + t.is(model._addonEventState.skipNextRuntimeStats, true, 'skip flag should be armed after finetune') model.addon.runJob.callsFake(() => true) const inferResponse = await model._runInternal([{ role: 'user', content: 'Hello' }]) model._addonOutputCallback(null, 'Output', { TPS: 0, tokens: 0 }, null) - t.is(model._skipNextRuntimeStats, false, 'flag should reset after consuming stale TPS') + t.is(model._addonEventState.skipNextRuntimeStats, false, 'flag should reset after consuming stale TPS') t.is(inferResponse.getStatus(), 'running', 'inference must still be running after stale TPS was swallowed') model._addonOutputCallback(null, 'Output', 'answer', null) diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/unit/map-addon-event.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/unit/map-addon-event.test.js new file mode 100644 index 0000000000..5aea78a622 --- /dev/null +++ b/packages/qvac-lib-infer-llamacpp-llm/test/unit/map-addon-event.test.js @@ -0,0 +1,83 @@ +'use strict' + +const test = require('brittle') +const { mapAddonEvent } = require('../../addon.js') + +function makeState (overrides = {}) { + return { 
skipNextRuntimeStats: false, ...overrides } +} + +test('TPS-shaped data maps to JobEnded with mapped backendDevice (cpu)', function (t) { + const state = makeState() + const result = mapAddonEvent('anything', { TPS: 42, tokens: 10, backendDevice: 0 }, null, state) + t.is(result.type, 'JobEnded') + t.is(result.data.TPS, 42) + t.is(result.data.backendDevice, 'cpu') + t.is(result.error, null) + t.is(state.skipNextRuntimeStats, false, 'flag remains false after normal inference terminal') +}) + +test('TPS-shaped data maps backendDevice 1 to "gpu"', function (t) { + const state = makeState() + const result = mapAddonEvent('anything', { TPS: 50, backendDevice: 1 }, null, state) + t.is(result.data.backendDevice, 'gpu') +}) + +test('TPS-shaped data preserves unknown backendDevice values as-is', function (t) { + const state = makeState() + const result = mapAddonEvent('anything', { TPS: 1, backendDevice: 2 }, null, state) + t.is(result.data.backendDevice, 2) +}) + +test('TPS-shaped data is dropped when skipNextRuntimeStats is set', function (t) { + const state = makeState({ skipNextRuntimeStats: true }) + const result = mapAddonEvent('anything', { TPS: 99 }, null, state) + t.is(result, null, 'returns null to drop stale post-finetune TPS') + t.is(state.skipNextRuntimeStats, false, 'flag resets after consuming') +}) + +test('finetune terminal payload maps to JobEnded and arms skip flag', function (t) { + const state = makeState() + const payload = { op: 'finetune', status: 'COMPLETED', stats: { loss: 0.1 } } + const result = mapAddonEvent('anything', payload, null, state) + t.is(result.type, 'JobEnded') + t.is(result.data, payload) + t.is(state.skipNextRuntimeStats, true, 'skip flag armed to swallow the TPS trailer') +}) + +test('finetune_progress payload maps to FinetuneProgress', function (t) { + const state = makeState() + const payload = { type: 'finetune_progress', stats: { loss: 0.2 } } + const result = mapAddonEvent('anything', payload, null, state) + t.is(result.type, 
'FinetuneProgress') + t.is(result.data, payload) +}) + +test('event name containing "Error" maps to Error with rawError', function (t) { + const state = makeState() + const err = new Error('boom') + const result = mapAddonEvent('SomeError', null, err, state) + t.is(result.type, 'Error') + t.is(result.error, err) +}) + +test('string data maps to Output (token streaming)', function (t) { + const state = makeState() + const result = mapAddonEvent('OutputString', 'hello', null, state) + t.is(result.type, 'Output') + t.is(result.data, 'hello') +}) + +test('event name containing "LogMsg" maps to LogMsg (string payload not remapped to Output)', function (t) { + const state = makeState() + const result = mapAddonEvent('SomeLogMsg', 'native log line', null, state) + t.is(result.type, 'LogMsg', 'LogMsg event name wins over string-to-Output fallback') + t.is(result.data, 'native log line') +}) + +test('unknown event with non-TPS object falls through to default mapping', function (t) { + const state = makeState() + const result = mapAddonEvent('Unknown', { foo: 'bar' }, null, state) + t.is(result.type, 'Unknown', 'falls through preserving original event name') + t.alike(result.data, { foo: 'bar' }) +}) diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/unit/pick-primary-gguf-path.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/unit/pick-primary-gguf-path.test.js new file mode 100644 index 0000000000..89a722c67d --- /dev/null +++ b/packages/qvac-lib-infer-llamacpp-llm/test/unit/pick-primary-gguf-path.test.js @@ -0,0 +1,35 @@ +'use strict' + +const test = require('brittle') +const { pickPrimaryGgufPath } = require('../../index.js') + +test('single non-sharded file returns that file', function (t) { + const files = ['/models/Qwen3-1.7B-Q4_0.gguf'] + t.is(pickPrimaryGgufPath(files), '/models/Qwen3-1.7B-Q4_0.gguf') +}) + +test('sharded model with tensors.txt first returns first shard, not tensors.txt', function (t) { + const files = [ + 
'/models/medgemma-4b-it-Q4_1.tensors.txt', + '/models/medgemma-4b-it-Q4_1-00001-of-00005.gguf', + '/models/medgemma-4b-it-Q4_1-00002-of-00005.gguf', + '/models/medgemma-4b-it-Q4_1-00003-of-00005.gguf', + '/models/medgemma-4b-it-Q4_1-00004-of-00005.gguf', + '/models/medgemma-4b-it-Q4_1-00005-of-00005.gguf' + ] + t.is(pickPrimaryGgufPath(files), '/models/medgemma-4b-it-Q4_1-00001-of-00005.gguf') +}) + +test('sharded model without tensors.txt returns first shard', function (t) { + const files = [ + '/models/Qwen3-0.6B-UD-IQ1_S-00001-of-00003.gguf', + '/models/Qwen3-0.6B-UD-IQ1_S-00002-of-00003.gguf', + '/models/Qwen3-0.6B-UD-IQ1_S-00003-of-00003.gguf' + ] + t.is(pickPrimaryGgufPath(files), '/models/Qwen3-0.6B-UD-IQ1_S-00001-of-00003.gguf') +}) + +test('non-gguf file falls back to first entry', function (t) { + const files = ['/models/some-model.bin'] + t.is(pickPrimaryGgufPath(files), '/models/some-model.bin') +})