diff --git a/packages/lib-infer-diffusion/CHANGELOG.md b/packages/lib-infer-diffusion/CHANGELOG.md index 691900133e..1e44c6ecb8 100644 --- a/packages/lib-infer-diffusion/CHANGELOG.md +++ b/packages/lib-infer-diffusion/CHANGELOG.md @@ -1,5 +1,82 @@ # Changelog +## [0.3.0] - 2026-04-15 + +This release migrates the diffusion addon off `BaseInference` inheritance and onto the composable `createJobHandler` + `exclusiveRunQueue` utilities from `@qvac/infer-base@^0.4.0`. The constructor signature is replaced with a single object whose `files` field carries absolute paths for every model component, mirroring the parallel embed and LLM addon refactors. This is a breaking change — every caller must update. + +## Breaking Changes + +### Constructor signature: single object with `files` instead of `(args, config)` + +`ImgStableDiffusion` now takes a single `{ files, config, logger?, opts? }` object. The old `diskPath` + `modelName` + per-component filename pattern is gone — callers pass absolute paths directly via `files`. Companion model fields are renamed (`clipLModel` → `clipL`, `clipGModel` → `clipG`, `t5XxlModel` → `t5Xxl`, `llmModel` → `llm`, `vaeModel` → `vae`). + +```js +// BEFORE (≤ 0.2.x) +const model = new ImgStableDiffusion({ + diskPath: '/models', + modelName: 'flux-2-klein-4b-Q8_0.gguf', + llmModel: 'Qwen3-4B-Q4_K_M.gguf', + vaeModel: 'flux2-vae.safetensors', + logger: console +}, { threads: 8 }) + +// AFTER (0.3.0) +const model = new ImgStableDiffusion({ + files: { + model: '/models/flux-2-klein-4b-Q8_0.gguf', + llm: '/models/Qwen3-4B-Q4_K_M.gguf', + vae: '/models/flux2-vae.safetensors' + }, + config: { threads: 8 }, + logger: console, + opts: { stats: true } +}) +``` + +### `BaseInference` inheritance removed + +`ImgStableDiffusion` no longer extends `BaseInference`. The class composes `createJobHandler` and `exclusiveRunQueue` from `@qvac/infer-base@^0.4.0` directly. 
The public lifecycle (`load` / `run` / `cancel` / `unload` / `getState`) keeps the same method names and call signatures; construction differs, `getState()` returns a narrower object, and a repeated `load()` is now a no-op (both described below). Internal helpers like `_withExclusiveRun` and `_outputCallback` are removed. + +### Caller owns absolute paths — addon no longer joins `diskPath` + filename + +Callers that previously relied on the addon to resolve `path.join(diskPath, filename)` must now do that resolution themselves before constructing the model. + +### `getState()` returns a narrower shape + +`getState()` previously returned `{ configLoaded, weightsLoaded, destroyed }` (the three-field shape from `BaseInference`). It now returns `{ configLoaded }` only. The `weightsLoaded` and `destroyed` fields are gone — `weightsLoaded` collapsed into `configLoaded` because the refactored `load()` does both in one step, and `destroyed` is no longer tracked since `unload()` resets `configLoaded` and nulls the addon handle instead. Callers reading `state.weightsLoaded` or `state.destroyed` must switch to `state.configLoaded` (note the inverted sense for `destroyed`: an unloaded instance now reports `configLoaded: false`). + +## Features + +### Constructor input validation + +The constructor now throws `TypeError('files.model must be an absolute path string')` when `files.model` is missing or not a string, or `TypeError('files.model must be an absolute path (got: <path>)')` when supplied as a relative path. This produces a clear error for callers porting old code instead of a confusing `Cannot read properties of undefined`. The same validation applies to optional companion fields (`clipL`, `clipG`, `t5Xxl`, `llm`, `vae`) when supplied. + +### `run()`-before-`load()` guard + +Calling `run()` before `load()` now throws `Error('Addon not initialized. Call load() first.')` instead of crashing in native code. Covered by a new regression test in `test/integration/api-behavior.test.js`. + +### `load()` is now idempotent when already loaded + +A second `load()` call on an already-loaded instance is now a silent no-op instead of unloading and reloading. 
This aligns with the ReadyResource pattern used elsewhere in QVAC and prevents accidental double-loads from triggering expensive work. Callers that intentionally want to swap weights must call `unload()` first (which clears `configLoaded`) and then `load()` again. + +### Broader split-layout detection + +`isSplitLayout` now also triggers when only `clipL` or `clipG` is supplied. This closes a footgun where a FLUX.1 caller passing `{ model, clipL, clipG, vae }` (without `t5Xxl`) would silently mis-route the diffusion model into the all-in-one `path` parameter and fail to load. + +## Bug Fixes + +### `unload()` clears the addon reference + +`unload()` now sets `this.addon = null` after `await this.addon.unload()`, so post-unload `cancel()` / `run()` calls hit the explicit `if (!this.addon)` guard rather than dereferencing a disposed native handle. + +### Unknown addon events no longer pollute the output stream + +`_addonOutputCallback` previously had a fallthrough that pushed any non-error / non-image / non-stats event into `response.output` (including `null` and `undefined`). It now logs unknown events at debug level and does not feed them into the active response. 
+ +## Pull Requests + +- [#1496](https://github.com/tetherto/qvac/pull/1496) - chore[bc]: diffusion addon interface refactor — remove BaseInference + ## [0.2.0] - 2026-04-15 ### Added diff --git a/packages/lib-infer-diffusion/README.md b/packages/lib-infer-diffusion/README.md index badcd253a1..177edab2a0 100644 --- a/packages/lib-infer-diffusion/README.md +++ b/packages/lib-infer-diffusion/README.md @@ -176,28 +176,35 @@ const path = require('bare-path') const MODELS_DIR = path.resolve(__dirname, './models') const args = { logger: console, - diskPath: MODELS_DIR, - modelName: 'flux-2-klein-4b-Q8_0.gguf', - llmModel: 'Qwen3-4B-Q4_K_M.gguf', // Qwen3 text encoder for FLUX.2 [klein] - vaeModel: 'flux2-vae.safetensors' + files: { + model: path.join(MODELS_DIR, 'flux-2-klein-4b-Q8_0.gguf'), + llm: path.join(MODELS_DIR, 'Qwen3-4B-Q4_K_M.gguf'), // Qwen3 text encoder for FLUX.2 [klein] + vae: path.join(MODELS_DIR, 'flux2-vae.safetensors') + }, + config: { threads: 8 }, + opts: { stats: true } } ``` | Property | Required | Description | |----------|----------|-------------| -| `diskPath` | ✅ | Local directory where model files are already stored | -| `modelName` | ✅ | Diffusion model file name (all-in-one for SD1.x/2.x; diffusion-only GGUF for FLUX.2) | +| `files` | ✅ | Object of absolute paths to model files (see below) | +| `files.model` | ✅ | Absolute path to diffusion model file (all-in-one for SD1.x/2.x; diffusion-only GGUF for FLUX.2) | +| `files.clipL` | — | Absolute path to separate CLIP-L text encoder (SD3) | +| `files.clipG` | — | Absolute path to separate CLIP-G text encoder (SDXL / SD3) | +| `files.t5Xxl` | — | Absolute path to separate T5-XXL text encoder (SD3) | +| `files.llm` | — | Absolute path to Qwen3 LLM text encoder (FLUX.2 [klein]) | +| `files.vae` | — | Absolute path to separate VAE file | +| `config` | — | Native backend configuration object (see next section) | | `logger` | — | Logger instance (e.g. 
`console`) | -| `clipLModel` | — | Separate CLIP-L text encoder (SD3) | -| `clipGModel` | — | Separate CLIP-G text encoder (SDXL / SD3) | -| `t5XxlModel` | — | Separate T5-XXL text encoder (SD3) | -| `llmModel` | — | Qwen3 LLM text encoder (FLUX.2 [klein]) | -| `vaeModel` | — | Separate VAE file | +| `opts` | — | Additional options (e.g. `{ stats: true }`) | -### 3. Create the `config` object +### 3. Configure the native backend (`args.config`) + +`config` is a field on the `args` object built in step 2 — there is no separate constructor argument. The native backend reads it during `load()`. ```js -const config = { +args.config = { threads: 8 // CPU threads for tensor operations (Metal handles GPU automatically) } ``` @@ -216,10 +223,10 @@ Config values are coerced to strings internally. Generation parameters (prompt, ### 4. Create a Model Instance ```js -const model = new ImgStableDiffusion(args, config) +const model = new ImgStableDiffusion(args) ``` -The constructor stores configuration only — no memory is allocated yet. +The constructor takes a single object containing `files`, `config`, `logger`, and `opts`. It stores configuration only — no memory is allocated yet. ### 5. Load the Model @@ -227,7 +234,7 @@ The constructor stores configuration only — no memory is allocated yet. await model.load() ``` -This creates the native `sd_ctx_t` and loads all weights into memory. It can take 10–30 seconds depending on disk speed and model size. All model files must already be present on disk at `diskPath`. +This creates the native `sd_ctx_t` and loads all weights into memory. It can take 10–30 seconds depending on disk speed and model size. All model files must be passed as absolute paths via the `files` object. ### 6. Run Inference @@ -360,7 +367,7 @@ await model.unload() ### Stable Diffusion 1.x / 2.x -Pass an all-in-one checkpoint directly as `modelName`. No separate encoders needed. +Pass an all-in-one checkpoint absolute path as `files.model`. 
No separate encoders needed. --- diff --git a/packages/lib-infer-diffusion/addon.js b/packages/lib-infer-diffusion/addon.js index 9d8bc1f620..5e39d021bd 100644 --- a/packages/lib-infer-diffusion/addon.js +++ b/packages/lib-infer-diffusion/addon.js @@ -2,6 +2,41 @@ const path = require('bare-path') +/** + * Map a raw native event from the C++ stable-diffusion addon to a logical + * event consumed by `ImgStableDiffusion`. + * + * The native binding emits events with C++-mangled names and varied + * payload shapes. This wrapper normalizes them into one of: + * - `'Output'` — image bytes (`Uint8Array`) or progress JSON tick (`string`) + * - `'Error'` — failure + * - `'JobEnded'` — terminal RuntimeStats payload (object) + * + * Returns `{ type, data, error }` or `null` for unknown event/data shapes + * (caller logs at debug level). + * + * + * @param {string} rawEvent + * @param {*} rawData + * @param {*} rawError + * @returns {{ type: string, data: *, error: * } | null} + */ +function mapAddonEvent (rawEvent, rawData, rawError) { + if (typeof rawEvent === 'string' && rawEvent.includes('Error')) { + return { type: 'Error', data: rawData, error: rawError } + } + + if (rawData instanceof Uint8Array || typeof rawData === 'string') { + return { type: 'Output', data: rawData, error: null } + } + + if (rawData && typeof rawData === 'object') { + return { type: 'JobEnded', data: rawData, error: null } + } + + return null +} + /** * Extract pixel dimensions from a PNG or JPEG buffer without a full decode. 
* @@ -151,4 +186,4 @@ class SdInterface { } } -module.exports = { SdInterface, readImageDimensions } +module.exports = { SdInterface, mapAddonEvent, readImageDimensions } diff --git a/packages/lib-infer-diffusion/docs/architecture.md b/packages/lib-infer-diffusion/docs/architecture.md index a42ffabad9..23c4a16cf9 100644 --- a/packages/lib-infer-diffusion/docs/architecture.md +++ b/packages/lib-infer-diffusion/docs/architecture.md @@ -1,7 +1,7 @@ # Architecture Documentation -**Package:** `@qvac/diffusion-cpp` v0.1.0 -**Stack:** JavaScript, C++20, stable-diffusion.cpp, Bare Runtime, CMake, vcpkg +**Package:** `@qvac/diffusion-cpp` v0.3.0 +**Stack:** JavaScript, C++20, stable-diffusion.cpp, Bare Runtime, CMake, vcpkg **License:** Apache-2.0 --- @@ -49,7 +49,7 @@ ## Key Features - **Cross-platform**: macOS, Linux, Windows, iOS, Android -- **Disk-local models**: Files must be present on disk at `diskPath` +- **Disk-local models**: Files must be present on disk; the caller passes absolute file paths via `files.{model,clipL,clipG,t5Xxl,llm,vae}` to the constructor - **Progress tracking**: Step-by-step generation progress callbacks - **GPU acceleration**: Metal, Vulkan, OpenCL - **Quantized models**: GGUF, safetensors, checkpoint formats @@ -122,8 +122,8 @@ graph TB | Package | Type | Version | Purpose | |---------|------|---------|---------| -| @qvac/infer-base | Framework | ^0.2.0 | Base classes (BaseInference, QvacResponse) | -| qvac-lib-inference-addon-cpp | Native | ≥1.1.1 | C++ addon framework (single-job runner) | +| @qvac/infer-base | Framework | ^0.4.0 | Composition utilities (`createJobHandler`, `exclusiveRunQueue`, `QvacResponse`) | +| qvac-lib-inference-addon-cpp | Native | ≥1.1.2 | C++ addon framework (single-job runner) | | stable-diffusion.cpp | Native | latest | Diffusion inference engine | | Bare Runtime | Runtime | ≥1.24.0 | JavaScript execution | @@ -131,8 +131,8 @@ graph TB | From | To | Mechanism | Data Format | 
|------|-----|-----------|-------------| -| JavaScript | ImgStableDiffusion | Constructor | args, config objects | -| ImgStableDiffusion | BaseInference | Inheritance | Template method pattern | +| JavaScript | ImgStableDiffusion | Constructor | Single `{ files, config, logger?, opts? }` object | +| ImgStableDiffusion | createJobHandler / exclusiveRunQueue | Composition | Job lifecycle + run-queue helpers from `@qvac/infer-base` | | ImgStableDiffusion | SdInterface | Composition | Method calls | | SdInterface | C++ Addon | require.addon() | Native binding | @@ -147,20 +147,26 @@ graph TB ```mermaid classDiagram class ImgStableDiffusion { - +constructor(args, config) + +constructor(args: {files, config, logger?, opts?}) +load() Promise~void~ +run(params: GenerationParams) Promise~QvacResponse~ +cancel() Promise~void~ +unload() Promise~void~ + +getState() {configLoaded} } - class BaseInference { - <> - +load() Promise~void~ - +run() Promise~QvacResponse~ - +unload() Promise~void~ - #_runInternal() Promise~QvacResponse~ - #_withExclusiveRun(fn) Promise~any~ + class JobHandler { + <> + +start() QvacResponse + +output(data) + +end(stats?, payload?) + +fail(error) + +active QvacResponse + } + + class ExclusiveRunQueue { + <> + +(fn) Promise~T~ } class QvacResponse { @@ -169,8 +175,9 @@ classDiagram +cancel() Promise~void~ } - ImgStableDiffusion --|> BaseInference - ImgStableDiffusion ..> QvacResponse : creates + ImgStableDiffusion ..> JobHandler : composes via createJobHandler() + ImgStableDiffusion ..> ExclusiveRunQueue : composes via exclusiveRunQueue() + JobHandler ..> QvacResponse : creates per start() ```
@@ -180,16 +187,20 @@ classDiagram | Class | Responsibility | Lifecycle | Dependencies | |-------|----------------|-----------|--------------| -| ImgStableDiffusion | Orchestrate model lifecycle, manage loading/inference | Created by user, persistent | SdInterface | -| BaseInference | Define standard inference API (template method pattern) | Abstract base class | None | -| QvacResponse | Handle generation progress and result | Created per `run()` call | None | +| ImgStableDiffusion | Orchestrate model lifecycle, manage loading/inference | Created by user, persistent | SdInterface, JobHandler, ExclusiveRunQueue | +| JobHandler (`createJobHandler`) | Start/end/fail a single in-flight job and emit a `QvacResponse` | Per-instance, lives as long as the model | None | +| ExclusiveRunQueue (`exclusiveRunQueue`) | Serialize public API calls so only one job is in flight at a time | Per-instance | None | +| QvacResponse | Handle generation progress and result | Created per `run()` call by the JobHandler | None | **Key Relationships:** | From | To | Type | Purpose | |------|-----|------|---------| -| ImgStableDiffusion | BaseInference | Inheritance | Standard QVAC inference API | -| ImgStableDiffusion | QvacResponse | Creates | Progress/result per generation | +| ImgStableDiffusion | JobHandler | Composition | Lifecycle of the active job (replaces inheriting from `BaseInference`) | +| ImgStableDiffusion | ExclusiveRunQueue | Composition | Serializes `run()` and `unload()` (cancel is intentionally outside the queue so it can interrupt an in-flight run) | +| JobHandler | QvacResponse | Creates | Progress/result per generation | + +> **Note:** `ImgStableDiffusion` no longer extends `BaseInference`. It composes the helpers exposed by `@qvac/infer-base` (`createJobHandler`, `exclusiveRunQueue`) directly.
@@ -206,7 +217,7 @@ graph TB subgraph "Layer 1: JavaScript API" APP["Application Code"] IMGCLASS["ImgStableDiffusion<br/>(index.js)"] - BASEINF["BaseInference<br/>(@qvac/infer-base)"] + BASE["createJobHandler / exclusiveRunQueue<br/>(@qvac/infer-base)"] RESPONSE["QvacResponse"] end @@ -234,7 +245,7 @@ graph TB end APP --> IMGCLASS - IMGCLASS --> BASEINF + IMGCLASS --> BASE IMGCLASS --> SDIF IMGCLASS -.-> RESPONSE @@ -267,7 +278,7 @@ graph TB | Layer | Components | Responsibility | Language | Why This Layer | |-------|------------|----------------|----------|----------------| -| 1. JavaScript API | ImgStableDiffusion, BaseInference, QvacResponse | High-level API, error handling | JS | Ergonomic API for npm consumers | +| 1. JavaScript API | ImgStableDiffusion, `createJobHandler` / `exclusiveRunQueue` (from `@qvac/infer-base`), QvacResponse | High-level API, error handling | JS | Ergonomic API for npm consumers | | 2. Bridge | SdInterface, binding.js | JS↔C++ communication | JS wrapper | Lifecycle management, handle safety | | 3. C++ Addon | JsInterface, AddonCpp/AddonJs | Single-job runner, threading, callbacks | C++ | Performance, native integration | | 4. Model | SdModel, Contexts | Diffusion logic, sampling | C++ | Direct stable-diffusion.cpp integration | @@ -533,14 +544,14 @@ See [qvac-lib-inference-addon-cpp Decision 4: Why Bare Runtime](https://github.c --- -## Decision 3: Disk-Local Model Files +## Decision 3: Disk-Local Model Files (caller-supplied absolute paths)
⚡ TL;DR -**Chose:** Require model files to already exist on disk at `diskPath` -**Why:** Simplicity — the addon loads files directly from disk, no streaming/download layer needed -**Cost:** Caller must ensure files are present before calling `load()` +**Chose:** Require model files to already exist on disk; the caller passes absolute paths via `files.{model,clipL,clipG,t5Xxl,llm,vae}` +**Why:** Simplicity — the addon loads files directly from disk, no streaming/download layer needed and no loader abstraction +**Cost:** Caller must ensure files are present and supply absolute paths before calling `load()`
@@ -548,28 +559,28 @@ See [qvac-lib-inference-addon-cpp Decision 4: Why Bare Runtime](https://github.c Diffusion models consist of multiple large files (diffusion model, text encoders, VAE). The addon needs these files to create the native `sd_ctx_t` context. -Unlike the LLM addon which historically used WeightsProvider for streaming weights, diffusion loads files directly from disk paths — no loader abstraction is involved. +Unlike the LLM addon which historically used WeightsProvider for streaming weights, diffusion has always loaded files directly from disk. After the addon-loader-abstraction refactor, there is also no `Loader` interface and no `diskPath` / `modelName` joining inside the addon — the caller passes absolute paths through the new `files` argument. ### Decision -Require all model files to be present on disk at `diskPath` before `load()` is called. The addon constructs file paths by joining `diskPath` with each model filename and passes them directly to stable-diffusion.cpp. +Require all model files to be present on disk before `load()` is called. The constructor accepts a single `files` object whose entries are absolute paths (`files.model` is required; `files.clipL`, `files.clipG`, `files.t5Xxl`, `files.llm`, `files.vae` are optional companions). `_load()` reads `this._files` and forwards the paths directly to stable-diffusion.cpp. 
### Rationale **Simplicity:** - No download/streaming abstraction layer needed -- No WeightsProvider, no progress tracking for downloads +- No WeightsProvider, no Loader, no progress tracking for downloads - Direct file paths to stable-diffusion.cpp **Split-model support:** - Diffusion models may have multiple components (diffusion GGUF, CLIP-L, CLIP-G, T5-XXL, LLM encoder, VAE) -- All resolved as `path.join(diskPath, filename)` in `_load()` -- Split vs all-in-one layout detected via heuristic (`isSplitLayout = !!llmModel || !!t5XxlModel`) +- The caller supplies each component as an absolute path on `files` +- Split vs all-in-one layout is detected via heuristic in `_load()` (`isSplitLayout = !!this._files.llm || !!this._files.t5Xxl || !!this._files.clipL || !!this._files.clipG`). Any caller-supplied separate encoder implies the primary file is the standalone diffusion model rather than an all-in-one checkpoint, so FLUX.1 (`{ model, clipL, clipG, vae }` without `t5Xxl`) is also routed correctly. ### Trade-offs - ✅ Simple, no abstraction overhead - ✅ No streaming/buffering complexity -- ❌ Caller responsible for ensuring files exist on disk +- ❌ Caller responsible for ensuring files exist on disk and for resolving absolute paths --- @@ -578,7 +589,7 @@ Require all model files to be present on disk at `diskPath` before `load()` is c
⚡ TL;DR -**Chose:** Pass file paths directly to stable-diffusion.cpp via `sd_ctx_params_t` +**Chose:** Pass absolute file paths directly to stable-diffusion.cpp via `sd_ctx_params_t` **Why:** stable-diffusion.cpp natively loads from file paths; no need for buffer intermediary **Cost:** Files must exist on disk (no streaming from P2P sources) @@ -586,11 +597,11 @@ Require all model files to be present on disk at `diskPath` before `load()` is c ### Context -stable-diffusion.cpp accepts model files via file paths in its context parameters (`model_path`, `diffusion_model_path`, `clip_l_path`, `vae_path`, etc.). The addon constructs these paths from `diskPath` + filenames. +stable-diffusion.cpp accepts model files via file paths in its context parameters (`model_path`, `diffusion_model_path`, `clip_l_path`, `vae_path`, etc.). The caller supplies these as absolute paths on the constructor's `files` object; the addon never joins a base directory with a filename. ### Decision -Pass absolute file paths directly to stable-diffusion.cpp rather than using buffer-based loading. The `_load()` method constructs a `configurationParams` object with resolved paths and passes it to the native addon. +Pass absolute file paths directly to stable-diffusion.cpp rather than using buffer-based loading. `_load()` builds a `configurationParams` object from `this._files` and passes it to the native addon as-is. ### Rationale @@ -679,8 +690,8 @@ interface GenerationParams {
⚡ TL;DR -**Chose:** Promise-based exclusive run queue using `_withExclusiveRun()` wrapper -**Why:** Ensure generation jobs complete without interruption (long-running operations) +**Chose:** Compose `exclusiveRunQueue()` from `@qvac/infer-base` to serialize public API entrypoints +**Why:** Ensure generation jobs complete without interruption (long-running operations) **Cost:** One generation at a time per model instance
@@ -691,7 +702,7 @@ Diffusion generation takes significant time (seconds to minutes). Without coordi ### Decision -Implement JavaScript-level promise queue ensuring only one generation job runs at a time per model instance. +Use the `exclusiveRunQueue()` helper from `@qvac/infer-base`. The constructor stores the queue as `this._run`, and `run()` and `unload()` wrap their bodies with `this._run(() => …)`. `cancel()` is intentionally **not** queued — it must be able to interrupt an in-flight `run()` to terminate it, so it bypasses the queue and delegates straight to `addon.cancel()` (which is itself a no-op when there is no active job). This replaces the previous `BaseInference._withExclusiveRun()` template-method approach with a small composable utility. ### Rationale @@ -760,4 +771,4 @@ Provide hand-written TypeScript definitions in `index.d.ts`. --- -**Last Updated:** 2026-03-11 +**Last Updated:** 2026-04-10 diff --git a/packages/lib-infer-diffusion/examples/generate-image-sd2.js b/packages/lib-infer-diffusion/examples/generate-image-sd2.js index c0ddb790bd..ea0eb80ae5 100644 --- a/packages/lib-infer-diffusion/examples/generate-image-sd2.js +++ b/packages/lib-infer-diffusion/examples/generate-image-sd2.js @@ -49,21 +49,20 @@ async function main () { console.log('Seed :', SEED) console.log() - const model = new ImgStableDiffusion( - { - logger: console, - diskPath: MODELS_DIR, - modelName: MODEL_NAME - // No llmModel — SD2.1 uses the CLIP text encoder baked into the checkpoint. - // No vaeModel — the VAE is baked into the checkpoint. + const model = new ImgStableDiffusion({ + files: { + model: path.join(MODELS_DIR, MODEL_NAME) + // No llm — SD2.1 uses the CLIP text encoder baked into the checkpoint. + // No vae — the VAE is baked into the checkpoint. }, - { + config: { threads: 8, // SD2.1 uses v-prediction. This safetensors file has no GGUF metadata so // auto-detection cannot determine the prediction type; set it explicitly. 
prediction: 'v' - } - ) + }, + logger: console + }) try { // ── 1. Load weights ─────────────────────────────────────────────────────── diff --git a/packages/lib-infer-diffusion/examples/generate-image-sd3.js b/packages/lib-infer-diffusion/examples/generate-image-sd3.js index fcd6a8788f..a71c5cc927 100644 --- a/packages/lib-infer-diffusion/examples/generate-image-sd3.js +++ b/packages/lib-infer-diffusion/examples/generate-image-sd3.js @@ -54,24 +54,23 @@ async function main () { console.log('Seed :', SEED) console.log() - const model = new ImgStableDiffusion( - { - logger: console, - diskPath: MODELS_DIR, - modelName: MODEL_NAME - // All-in-one safetensors: no clipLModel, clipGModel, t5XxlModel, or vaeModel. + const model = new ImgStableDiffusion({ + files: { + model: path.join(MODELS_DIR, MODEL_NAME) + // All-in-one safetensors: no clipL, clipG, t5Xxl, or vae. // // To add T5-XXL (better text following) without redownloading the main file: - // t5XxlModel: 't5xxl_fp8_e4m3fn.safetensors' // download via download-model-sd3.sh + // t5Xxl: path.join(MODELS_DIR, 't5xxl_fp8_e4m3fn.safetensors') // download via download-model-sd3.sh }, - { + config: { threads: 4, // SD3 uses flow-matching. The safetensors metadata allows auto-detection, // but we set these explicitly as safety overrides. prediction: 'flow', // FLOW_PRED — SD3 flow-matching flow_shift: '3.0' // SD3 Medium default; overrides INFINITY sentinel - } - ) + }, + logger: console + }) try { // ── 1. 
Load weights ─────────────────────────────────────────────────────── diff --git a/packages/lib-infer-diffusion/examples/generate-image-sdxl.js b/packages/lib-infer-diffusion/examples/generate-image-sdxl.js index ea730dd00f..bea89f5b6a 100644 --- a/packages/lib-infer-diffusion/examples/generate-image-sdxl.js +++ b/packages/lib-infer-diffusion/examples/generate-image-sdxl.js @@ -51,20 +51,19 @@ async function main () { console.log('Seed :', SEED) console.log() - const model = new ImgStableDiffusion( - { - logger: console, - diskPath: MODELS_DIR, - modelName: MODEL_NAME - // No llmModel — SDXL uses CLIP-L + CLIP-G baked into the checkpoint. - // No vaeModel — the VAE is baked into the checkpoint. + const model = new ImgStableDiffusion({ + files: { + model: path.join(MODELS_DIR, MODEL_NAME) + // No llm — SDXL uses CLIP-L + CLIP-G baked into the checkpoint. + // No vae — the VAE is baked into the checkpoint. }, - { + config: { threads: 4 // No prediction override — SDXL uses eps-prediction and the GGUF // has the correct metadata for auto-detection. - } - ) + }, + logger: console + }) try { // ── 1. Load weights ─────────────────────────────────────────────────────── diff --git a/packages/lib-infer-diffusion/examples/generate-image.js b/packages/lib-infer-diffusion/examples/generate-image.js index 05ac42c39b..a6b8c0feb2 100644 --- a/packages/lib-infer-diffusion/examples/generate-image.js +++ b/packages/lib-infer-diffusion/examples/generate-image.js @@ -41,18 +41,17 @@ async function main () { console.log('Seed :', SEED) console.log() - const model = new ImgStableDiffusion( - { - logger: console, - diskPath: MODELS_DIR, - modelName: MODEL_NAME, - llmModel: LLM_MODEL, - vaeModel: VAE_MODEL + const model = new ImgStableDiffusion({ + files: { + model: path.join(MODELS_DIR, MODEL_NAME), + llm: path.join(MODELS_DIR, LLM_MODEL), + vae: path.join(MODELS_DIR, VAE_MODEL) }, - { + config: { threads: 4 - } - ) + }, + logger: console + }) try { // ── 1. 
Load weights ─────────────────────────────────────────────────────── diff --git a/packages/lib-infer-diffusion/examples/img2img-flux2-f16.js b/packages/lib-infer-diffusion/examples/img2img-flux2-f16.js index ab20a485f4..34f23d0387 100644 --- a/packages/lib-infer-diffusion/examples/img2img-flux2-f16.js +++ b/packages/lib-infer-diffusion/examples/img2img-flux2-f16.js @@ -29,20 +29,19 @@ async function main () { console.log('Loading FLUX2-klein F16 model (full precision)...') - const model = new ImgStableDiffusion( - { - logger: console, - diskPath: modelDir, - modelName: 'flux-2-klein-4b-F16.gguf', // F16 full precision - llmModel: 'Qwen3-4B-Q8_0.gguf', // Q8 text encoder - vaeModel: 'flux2-vae.safetensors' + const model = new ImgStableDiffusion({ + files: { + model: path.join(modelDir, 'flux-2-klein-4b-F16.gguf'), // F16 full precision + llm: path.join(modelDir, 'Qwen3-4B-Q8_0.gguf'), // Q8 text encoder + vae: path.join(modelDir, 'flux2-vae.safetensors') }, - { + config: { threads: 4, device: 'gpu', prediction: 'flux2_flow' - } - ) + }, + logger: console + }) try { // Load model weights diff --git a/packages/lib-infer-diffusion/examples/img2img-flux2.js b/packages/lib-infer-diffusion/examples/img2img-flux2.js index 7ba167ae1d..3ed7879e1c 100644 --- a/packages/lib-infer-diffusion/examples/img2img-flux2.js +++ b/packages/lib-infer-diffusion/examples/img2img-flux2.js @@ -30,20 +30,19 @@ async function main () { console.log('Loading FLUX2-klein model...') - const model = new ImgStableDiffusion( - { - logger: console, - diskPath: modelDir, - modelName: 'flux-2-klein-4b-Q8_0.gguf', - llmModel: 'Qwen3-4B-Q4_K_M.gguf', - vaeModel: 'flux2-vae.safetensors' + const model = new ImgStableDiffusion({ + files: { + model: path.join(modelDir, 'flux-2-klein-4b-Q8_0.gguf'), + llm: path.join(modelDir, 'Qwen3-4B-Q4_K_M.gguf'), + vae: path.join(modelDir, 'flux2-vae.safetensors') }, - { + config: { threads: 4, device: 'gpu', // or 'cpu' for MacBook Air prediction: 'flux2_flow' - } - ) + 
}, + logger: console + }) try { // Load model weights diff --git a/packages/lib-infer-diffusion/examples/img2img-sd3.js b/packages/lib-infer-diffusion/examples/img2img-sd3.js index 5511650822..e43c293ee7 100644 --- a/packages/lib-infer-diffusion/examples/img2img-sd3.js +++ b/packages/lib-infer-diffusion/examples/img2img-sd3.js @@ -70,22 +70,21 @@ async function main () { console.log(' Seed : ' + SEED) console.log(' Note : VAE encode runs first (no progress tick) — please wait...\n') - const model = new ImgStableDiffusion( - { - logger: console, - diskPath: modelDir, - modelName: MODEL_NAME - // All-in-one safetensors: no clipLModel, clipGModel, t5XxlModel, or vaeModel. + const model = new ImgStableDiffusion({ + files: { + model: path.join(modelDir, MODEL_NAME) + // All-in-one safetensors: no clipL, clipG, t5Xxl, or vae. // To improve text-following, add T5-XXL (download via download-model-sd3.sh): - // t5XxlModel: 't5xxl_fp8_e4m3fn.safetensors' + // t5Xxl: path.join(modelDir, 't5xxl_fp8_e4m3fn.safetensors') }, - { + config: { threads: 4, device: 'gpu', prediction: 'flow', // SD3 rectified flow-matching (not flux2_flow) flow_shift: '3.0' // SD3 Medium default; controls noise schedule shift - } - ) + }, + logger: console + }) try { console.log('Loading SD3 Medium model...') diff --git a/packages/lib-infer-diffusion/examples/load-model.js b/packages/lib-infer-diffusion/examples/load-model.js index cb15e90e06..c3a264bc83 100644 --- a/packages/lib-infer-diffusion/examples/load-model.js +++ b/packages/lib-infer-diffusion/examples/load-model.js @@ -24,18 +24,17 @@ async function main () { console.log() // ── 1. 
Construct — stores config, allocates nothing ──────────────────────── - const model = new ImgStableDiffusion( - { - logger: console, - diskPath: MODELS_DIR, - modelName: MODEL_NAME, - llmModel: LLM_MODEL, - vaeModel: VAE_MODEL + const model = new ImgStableDiffusion({ + files: { + model: path.join(MODELS_DIR, MODEL_NAME), + llm: path.join(MODELS_DIR, LLM_MODEL), + vae: path.join(MODELS_DIR, VAE_MODEL) }, - { + config: { threads: 8 // Metal handles GPU; threads are for CPU fallback ops - } - ) + }, + logger: console + }) try { // ── 2. Load — reads weights into memory via activate() → new_sd_ctx() ─── diff --git a/packages/lib-infer-diffusion/examples/quickstart.js b/packages/lib-infer-diffusion/examples/quickstart.js index b1e9fa910b..d163f25a6c 100644 --- a/packages/lib-infer-diffusion/examples/quickstart.js +++ b/packages/lib-infer-diffusion/examples/quickstart.js @@ -46,19 +46,18 @@ async function main () { console.log(`Model : ${MODEL_NAME}`) console.log(`Prompt: ${PROMPT}\n`) - const model = new ImgStableDiffusion( - { - logger: console, - diskPath: MODELS_DIR, - modelName: MODEL_NAME, - opts: { stats: true } + const model = new ImgStableDiffusion({ + files: { + model: path.join(MODELS_DIR, MODEL_NAME) }, - { + config: { threads: 4, prediction: 'v', verbosity: 2 - } - ) + }, + logger: console, + opts: { stats: true } + }) try { // 3. 
Load model weights diff --git a/packages/lib-infer-diffusion/examples/runtime-stats-sd2.js b/packages/lib-infer-diffusion/examples/runtime-stats-sd2.js index 8547fac071..dc48706eae 100644 --- a/packages/lib-infer-diffusion/examples/runtime-stats-sd2.js +++ b/packages/lib-infer-diffusion/examples/runtime-stats-sd2.js @@ -59,18 +59,17 @@ async function main () { console.log('Seed :', SEED) console.log() - const model = new ImgStableDiffusion( - { - logger: console, - diskPath: MODELS_DIR, - modelName: MODEL_NAME, - opts: { stats: true } + const model = new ImgStableDiffusion({ + files: { + model: path.join(MODELS_DIR, MODEL_NAME) }, - { + config: { threads: 4, prediction: 'v' - } - ) + }, + logger: console, + opts: { stats: true } + }) try { // ── 1. Load weights ───────────────────────────────────────────────────── diff --git a/packages/lib-infer-diffusion/index.d.ts b/packages/lib-infer-diffusion/index.d.ts index b0395003ab..88f3fc75f2 100644 --- a/packages/lib-infer-diffusion/index.d.ts +++ b/packages/lib-infer-diffusion/index.d.ts @@ -1,4 +1,3 @@ -import BaseInference from '@qvac/infer-base/WeightsProvider/BaseInference' import type { QvacResponse } from '@qvac/infer-base' import type QvacLogger from '@qvac/logging' @@ -122,6 +121,33 @@ export interface SdConfig { [key: string]: string | number | boolean | undefined } +export interface DiffusionFiles { + /** Absolute path to main model weights */ + model: string + /** SD3: absolute path to CLIP-L text encoder */ + clipL?: string + /** SDXL / SD3: absolute path to CLIP-G text encoder */ + clipG?: string + /** SD3: absolute path to T5-XXL text encoder */ + t5Xxl?: string + /** FLUX.2 [klein]: absolute path to Qwen3 4B text encoder (llm_path) */ + llm?: string + /** Absolute path to VAE file */ + vae?: string +} + +export interface ImgStableDiffusionArgs { + files: DiffusionFiles + /** + * Native backend configuration. 
Optional — when omitted, the addon + * forwards an empty config object and the C++ layer falls back to + * stable-diffusion.cpp defaults for every parameter. + */ + config?: SdConfig + logger?: QvacLogger | Console | null + opts?: { stats?: boolean } +} + export interface GenerationParams { prompt: string negative_prompt?: string @@ -203,28 +229,13 @@ export interface RuntimeStats { seed: number } -export interface ImgStableDiffusionArgs { - logger?: QvacLogger | Console | null - opts?: { stats?: boolean } - diskPath?: string - modelName: string - /** CLIP-L text encoder */ - clipLModel?: string - /** CLIP-G text encoder */ - clipGModel?: string - /** T5-XXL text encoder */ - t5XxlModel?: string - /** FLUX.2 [klein]: Qwen3 4B text encoder (llm_path) */ - llmModel?: string - vaeModel?: string -} - -export default class ImgStableDiffusion extends BaseInference { - protected addon: Addon +export default class ImgStableDiffusion { + protected addon: Addon | null + opts: { stats?: boolean } + logger: QvacLogger + state: { configLoaded: boolean } - constructor(args: ImgStableDiffusionArgs, config: SdConfig) - - _load(): Promise + constructor(args: ImgStableDiffusionArgs) load(): Promise @@ -233,6 +244,8 @@ export default class ImgStableDiffusion extends BaseInference { unload(): Promise cancel(): Promise + + getState(): { configLoaded: boolean } } export { QvacResponse, RuntimeStats } diff --git a/packages/lib-infer-diffusion/index.js b/packages/lib-infer-diffusion/index.js index c90187556c..1cd34eaf21 100644 --- a/packages/lib-infer-diffusion/index.js +++ b/packages/lib-infer-diffusion/index.js @@ -1,9 +1,20 @@ 'use strict' const path = require('bare-path') +const QvacLogger = require('@qvac/logging') +const { createJobHandler, exclusiveRunQueue } = require('@qvac/infer-base') +const { SdInterface, mapAddonEvent } = require('./addon') -const BaseInference = require('@qvac/infer-base/WeightsProvider/BaseInference') -const { SdInterface } = require('./addon') +const 
COMPANION_FILE_KEYS = ['clipL', 'clipG', 't5Xxl', 'llm', 'vae'] + +function assertAbsolute (key, value) { + if (typeof value !== 'string' || value.length === 0) { + throw new TypeError(`files.${key} must be an absolute path string`) + } + if (!path.isAbsolute(value)) { + throw new TypeError(`files.${key} must be an absolute path (got: ${value})`) + } +} const LOG_METHODS = ['error', 'warn', 'info', 'debug'] @@ -13,94 +24,86 @@ const RUN_BUSY_ERROR_MESSAGE = 'Cannot set new job: a job is already set or bein * Text-to-image and image-to-image generation using stable-diffusion.cpp. * Supports SD1.x, SD2.x, SDXL, SD3, and FLUX.2 [klein]. */ -class ImgStableDiffusion extends BaseInference { +class ImgStableDiffusion { /** * @param {object} args + * @param {object} args.files - Absolute file paths for model components + * @param {string} args.files.model - Main model weights (absolute path) + * @param {string} [args.files.clipL] - CLIP-L text encoder (SD3, absolute path) + * @param {string} [args.files.clipG] - CLIP-G text encoder (SDXL / SD3, absolute path) + * @param {string} [args.files.t5Xxl] - T5-XXL text encoder (SD3, absolute path) + * @param {string} [args.files.llm] - LLM text encoder (FLUX.2 klein, absolute path) + * @param {string} [args.files.vae] - VAE file (absolute path) + * @param {object} [args.config] - SD context configuration (threads, device, type, etc.). + * Optional — when omitted, the addon forwards an empty config and the C++ layer falls + * back to stable-diffusion.cpp defaults for every parameter. * @param {object} [args.logger] - Structured logger * @param {object} [args.opts] - Optional inference options - * @param {string} [args.diskPath='.'] - Local directory containing model weight files - * @param {string} args.modelName - Model file name (e.g. 
'flux-2-klein-4b-Q8_0.gguf') - * @param {string} [args.clipLModel] - Optional CLIP-L text encoder file name (SD3) - * @param {string} [args.clipGModel] - Optional CLIP-G text encoder file name (SDXL / SD3) - * @param {string} [args.t5XxlModel] - Optional T5-XXL text encoder file name (SD3) - * @param {string} [args.llmModel] - Optional LLM text encoder file name (FLUX.2 klein → Qwen3 4B) - * @param {string} [args.vaeModel] - Optional VAE file name - * @param {object} config - SD context configuration (threads, device, type, etc.) */ - constructor ( - { - opts = {}, - logger = null, - diskPath = '.', - modelName, - clipLModel, - clipGModel, - t5XxlModel, - llmModel, - vaeModel - }, - config - ) { - super({ logger, opts }) - this._config = config - this._diskPath = diskPath - this._modelName = modelName - this._clipLModel = clipLModel || null - this._clipGModel = clipGModel || null - this._t5XxlModel = t5XxlModel || null - this._llmModel = llmModel || null - this._vaeModel = vaeModel || null + constructor ({ files, config, logger = null, opts = {} }) { + if (!files || typeof files !== 'object') { + throw new TypeError('files must be an object containing at least { model }') + } + assertAbsolute('model', files.model) + for (const key of COMPANION_FILE_KEYS) { + if (files[key] !== undefined) { + assertAbsolute(key, files[key]) + } + } + this._files = files + this._config = config || {} + this.logger = new QvacLogger(logger) + this.opts = opts + // The cancel closure dereferences `this.addon` lazily, so it is safe even though + // `this.addon` is `null` at construction time — it is only invoked from + // `response.cancel()` after `_load()` has assigned the addon. 
+ this._job = createJobHandler({ cancel: () => this.addon?.cancel() }) + this._run = exclusiveRunQueue() + this.addon = null this._hasActiveResponse = false + this._binding = null + this._nativeLoggerActive = false + this.state = { configLoaded: false } + } + + async load () { + if (this.state.configLoaded) return + await this._load() + this.state.configLoaded = true } async _load () { this.logger.info('Starting stable-diffusion model load') - try { - // Route the primary model file to the correct stable-diffusion.cpp param: - // - // model_path — all-in-one checkpoints that embed their own text - // encoders and version metadata (SD1.x, SD2.x, SDXL, - // SD3 all-in-one GGUF). - // - // diffusion_model_path — standalone diffusion-only weights that have no - // embedded SD metadata and require separate encoders: - // FLUX.2 [klein] → llmModel (Qwen3) - // SD3 pure GGUF → t5XxlModel (T5-XXL) + clipLModel + clipGModel - // - // Heuristic: if any separate encoder is provided (LLM for FLUX.2, T5-XXL - // for SD3 split) the caller is using a pure diffusion GGUF that must be - // loaded via diffusion_model_path. - const isSplitLayout = !!this._llmModel || !!this._t5XxlModel - const resolve = (name) => name ? (path.isAbsolute(name) ? name : path.join(this._diskPath, name)) : '' - const configurationParams = { - path: isSplitLayout ? '' : resolve(this._modelName), - diffusionModelPath: isSplitLayout ? 
resolve(this._modelName) : '', - clipLPath: resolve(this._clipLModel), - clipGPath: resolve(this._clipGModel), - t5XxlPath: resolve(this._t5XxlModel), - llmPath: resolve(this._llmModel), - vaePath: resolve(this._vaeModel), - config: this._config - } + // Route the primary model file to the correct stable-diffusion.cpp param: + // path — all-in-one checkpoints (SD1.x, SD2.x, SDXL, SD3 all-in-one GGUF) + // diffusionModelPath — standalone diffusion weights requiring separate encoders + // (FLUX.2 klein → llm, SD3 pure GGUF → t5Xxl + clipL + clipG, + // FLUX.1 → t5Xxl + clipL, etc.) + // Any caller-supplied separate encoder implies the primary file is the standalone + // diffusion model, not an all-in-one checkpoint. + const isSplitLayout = !!this._files.llm || !!this._files.t5Xxl || + !!this._files.clipL || !!this._files.clipG + const configurationParams = { + path: isSplitLayout ? '' : this._files.model, + diffusionModelPath: isSplitLayout ? this._files.model : '', + clipLPath: this._files.clipL || '', + clipGPath: this._files.clipG || '', + t5XxlPath: this._files.t5Xxl || '', + llmPath: this._files.llm || '', + vaePath: this._files.vae || '', + config: this._config + } - this.logger.info('Creating stable-diffusion addon with configuration:', configurationParams) - this.addon = this._createAddon(configurationParams) + this.logger.info('Creating stable-diffusion addon with configuration:', configurationParams) + this.addon = this._createAddon(configurationParams) - this.logger.info('Activating stable-diffusion addon') - await this.addon.activate() + this.logger.info('Activating stable-diffusion addon') + await this.addon.activate() - this.logger.info('Stable-diffusion model load completed successfully') - } catch (error) { - this.logger.error('Error during stable-diffusion model load:', error) - throw error - } + this.logger.info('Stable-diffusion model load completed successfully') } - /** - * @param {object} configurationParams - * @returns {SdInterface} - */ 
_createAddon (configurationParams) { this._binding = require('./binding') this._connectNativeLogger() @@ -135,51 +138,32 @@ class ImgStableDiffusion extends BaseInference { } _addonOutputCallback (addon, event, data, error) { - if (event.includes('Error')) { - return this._outputCallback(addon, 'Error', 'OnlyOneJob', data, error) + const mapped = mapAddonEvent(event, data, error) + if (mapped === null) { + // Unknown event/data combination — log it instead of feeding null/undefined + // into the active response output stream. The native layer is expected to + // emit only the shapes handled above; reaching this branch indicates a + // native-layer bug worth surfacing. + this.logger.debug(`Unhandled addon event: ${event} (data type: ${typeof data})`) + return } - if (data instanceof Uint8Array || typeof data === 'string') { - return this._outputCallback(addon, 'Output', 'OnlyOneJob', data, error) + if (mapped.type === 'Error') { + this.logger.error('Job failed with error:', mapped.error) + this._job.fail(mapped.error) + return } - // RuntimeStats is the only plain-object payload the C++ addon emits. - // Matching structurally avoids coupling to specific stats key names. - if (typeof data === 'object' && data !== null) { - return this._outputCallback(addon, 'JobEnded', 'OnlyOneJob', data, null) + if (mapped.type === 'JobEnded') { + this._job.end(this.opts.stats ? mapped.data : null) + return } - return this._outputCallback(addon, event, 'OnlyOneJob', data, error) - } - - /** - * Cancel the current generation job. - */ - async cancel () { - if (this.addon?.cancel) { - await this.addon.cancel() + if (mapped.type === 'Output') { + this._job.output(mapped.data) } } - /** - * Unload the model and release all resources. 
- */ - async unload () { - return await this._withExclusiveRun(async () => { - await this.cancel() - const currentJobResponse = this._jobToResponse.get('OnlyOneJob') - if (currentJobResponse) { - currentJobResponse.failed(new Error('Model was unloaded')) - this._deleteJobMapping('OnlyOneJob') - } - this._hasActiveResponse = false - if (this.addon) { - await super.unload() - } - this._releaseNativeLogger() - }) - } - /** * Generate an image from a text prompt, or transform an input image with a prompt. * @@ -223,8 +207,13 @@ class ImgStableDiffusion extends BaseInference { * Others: SDEdit (init_image + strength). * @returns {Promise} */ + async run (params) { + return this._run(() => this._runInternal(params)) + } + async _runInternal (params) { - // Validate init_image is Uint8Array if provided + // Validate inputs first so callers get precise errors before any + // readiness/busy checks. if (params.init_image != null && !(params.init_image instanceof Uint8Array)) { throw new Error( 'init_image must be a Uint8Array (e.g. fs.readFileSync("image.png")). ' + @@ -237,7 +226,7 @@ class ImgStableDiffusion extends BaseInference { // SdModel::process() only enters the FLUX ref_images path when // config_.prediction is FLUX_FLOW_PRED or FLUX2_FLOW_PRED. Without // an explicit value the addon silently falls back to SDEdit. - if (params.init_image && this._llmModel) { + if (params.init_image && this._files.llm) { const pred = this._config?.prediction if (pred !== 'flux2_flow' && pred !== 'flux_flow') { throw new Error( @@ -249,42 +238,68 @@ class ImgStableDiffusion extends BaseInference { } } + if (!this.addon) { + throw new Error('Addon not initialized. Call load() first.') + } + const mode = params.init_image ? 
'img2img' : 'txt2img' this.logger.info('Starting generation with mode:', mode) - return await this._withExclusiveRun(async () => { - if (this._hasActiveResponse) { - throw new Error(RUN_BUSY_ERROR_MESSAGE) - } + if (this._hasActiveResponse) { + throw new Error(RUN_BUSY_ERROR_MESSAGE) + } - const response = this._createResponse('OnlyOneJob') + const response = this._job.start() - let accepted - try { - accepted = await this.addon.runJob({ ...params, mode }) - } catch (error) { - this._deleteJobMapping('OnlyOneJob') - response.failed(error) - throw error - } + let accepted + try { + accepted = await this.addon.runJob({ ...params, mode }) + } catch (error) { + this._job.fail(error) + throw error + } - if (!accepted) { - this._deleteJobMapping('OnlyOneJob') - const msg = RUN_BUSY_ERROR_MESSAGE - response.failed(new Error(msg)) - throw new Error(msg) - } + if (!accepted) { + this._job.fail(new Error(RUN_BUSY_ERROR_MESSAGE)) + throw new Error(RUN_BUSY_ERROR_MESSAGE) + } - this._hasActiveResponse = true - const finalized = response.await().finally(() => { this._hasActiveResponse = false }) - finalized.catch(() => {}) - response.await = () => finalized + this._hasActiveResponse = true + const finalized = response.await().finally(() => { this._hasActiveResponse = false }) + finalized.catch((err) => { + this.logger?.warn?.('Generation response rejected:', err?.message || err) + }) + response.await = () => finalized + + this.logger.info('Generation job started successfully') + return response + } - this.logger.info('Generation job started successfully') + async cancel () { + if (this.addon?.cancel) { + await this.addon.cancel() + } + } - return response + async unload () { + return this._run(async () => { + await this.cancel() + if (this._job.active) { + this._job.fail(new Error('Model was unloaded')) + } + this._hasActiveResponse = false + if (this.addon) { + await this.addon.unload() + // Null the addon reference so post-unload `cancel()` / `run()` calls hit the + // `if 
(!this.addon)` guard instead of dereferencing a disposed native handle. + this.addon = null + } + this._releaseNativeLogger() + this.state.configLoaded = false }) } + + getState () { return this.state } } module.exports = ImgStableDiffusion diff --git a/packages/lib-infer-diffusion/package.json b/packages/lib-infer-diffusion/package.json index 0795fe1c1e..14e8f86b4a 100644 --- a/packages/lib-infer-diffusion/package.json +++ b/packages/lib-infer-diffusion/package.json @@ -1,6 +1,6 @@ { "name": "@qvac/diffusion-cpp", - "version": "0.2.0", + "version": "0.3.0", "description": "stable-diffusion.cpp addon for qvac image/video generation", "addon": true, "scripts": { @@ -72,7 +72,8 @@ "typescript": "^5.9.2" }, "dependencies": { - "@qvac/infer-base": "^0.2.2", + "@qvac/infer-base": "^0.4.0", + "@qvac/logging": "^0.1.0", "bare-fs": "^4.5.1", "bare-path": "^3.0.0", "bare-process": "^4.2.2" diff --git a/packages/lib-infer-diffusion/test/integration/api-behavior.test.js b/packages/lib-infer-diffusion/test/integration/api-behavior.test.js index ffd442f26c..8fa0ea67de 100644 --- a/packages/lib-infer-diffusion/test/integration/api-behavior.test.js +++ b/packages/lib-infer-diffusion/test/integration/api-behavior.test.js @@ -1,6 +1,7 @@ 'use strict' const test = require('brittle') +const path = require('bare-path') const os = require('bare-os') const proc = require('bare-process') const binding = require('../../binding') @@ -55,26 +56,24 @@ async function setupModel (t) { downloadUrl: MODEL.url }) - const model = new ImgStableDiffusion( - { - logger: console, - diskPath: modelDir, - modelName + const model = new ImgStableDiffusion({ + files: { + model: path.join(modelDir, modelName) }, - { + config: { device: useCpu ? 
'cpu' : 'gpu', vae_on_cpu: isAndroid, threads: 4, prediction: 'v', verbosity: '2' - } - ) + }, + logger: console + }) await model.load() t.teardown(async () => { await model.unload().catch(() => {}) - try { binding.releaseLogger() } catch (_) {} }) return { model, modelDir } @@ -199,6 +198,30 @@ test('cancel | run: can run again after cancel', { timeout: testTimeout }, async saveGeneratedImages(modelDir, 'cancel-run-second-response', images) }) +test('run() before load() throws clear initialization error', { timeout: 60000 }, async t => { + const [, modelDir] = await ensureModel({ + modelName: MODEL.name, + downloadUrl: MODEL.url + }) + + const model = new ImgStableDiffusion({ + files: { model: path.join(modelDir, MODEL.name) }, + config: { device: useCpu ? 'cpu' : 'gpu', threads: 4 }, + logger: console, + opts: { stats: true } + }) + + let caught = null + try { + await model.run(SHORT_PARAMS) + } catch (err) { + caught = err + } + + t.ok(caught, 'run() before load() throws') + t.ok(/load\(\) first/i.test(caught?.message || ''), 'error message instructs to call load() first') +}) + // Keep event loop alive briefly to let pending async operations complete. // Prevents C++ destructors from running while async cleanup is still happening. 
setImmediate(() => { diff --git a/packages/lib-infer-diffusion/test/integration/generate-image-flux2-i2i.test.js b/packages/lib-infer-diffusion/test/integration/generate-image-flux2-i2i.test.js index 4e2f42e1dd..0cf3c57e7e 100644 --- a/packages/lib-infer-diffusion/test/integration/generate-image-flux2-i2i.test.js +++ b/packages/lib-infer-diffusion/test/integration/generate-image-flux2-i2i.test.js @@ -72,20 +72,19 @@ test('FLUX2-klein img2img — transforms an input image', { timeout: 1800000, sk const modelPath = path.join(modelDir, downloadedModelName) t.ok(fs.existsSync(modelPath), 'Model file exists on disk') - const model = new ImgStableDiffusion( - { - logger: console, - diskPath: modelDir, - modelName: downloadedModelName, - llmModel: qwenName, - vaeModel: vaeName + const model = new ImgStableDiffusion({ + files: { + model: path.join(modelDir, downloadedModelName), + llm: path.join(modelDir, qwenName), + vae: path.join(modelDir, vaeName) }, - { + config: { threads: 4, device: useCpu ? 
'cpu' : 'gpu', prediction: 'flux2_flow' - } - ) + }, + logger: console + }) const images = [] const progressTicks = [] diff --git a/packages/lib-infer-diffusion/test/integration/generate-image-flux2.test.js b/packages/lib-infer-diffusion/test/integration/generate-image-flux2.test.js index 9c98c5ee45..2540b297bd 100644 --- a/packages/lib-infer-diffusion/test/integration/generate-image-flux2.test.js +++ b/packages/lib-infer-diffusion/test/integration/generate-image-flux2.test.js @@ -68,19 +68,18 @@ test('FLUX.2 klein txt2img — generates a valid PNG image', { timeout: 1800000, const modelPath = path.join(modelDir, downloadedModelName) t.ok(fs.existsSync(modelPath), 'Model file exists on disk') - const model = new ImgStableDiffusion( - { - logger: console, - diskPath: modelDir, - modelName: downloadedModelName, - llmModel: LLM_MODEL.name, - vaeModel: VAE_MODEL.name + const model = new ImgStableDiffusion({ + files: { + model: path.join(modelDir, downloadedModelName), + llm: path.join(modelDir, LLM_MODEL.name), + vae: path.join(modelDir, VAE_MODEL.name) }, - { + config: { threads: 4, device: useCpu ? 
'cpu' : 'gpu' - } - ) + }, + logger: console + }) const images = [] const progressTicks = [] diff --git a/packages/lib-infer-diffusion/test/integration/generate-image-sd3-i2i.test.js b/packages/lib-infer-diffusion/test/integration/generate-image-sd3-i2i.test.js index 7ca196a71a..fffaaa8fb9 100644 --- a/packages/lib-infer-diffusion/test/integration/generate-image-sd3-i2i.test.js +++ b/packages/lib-infer-diffusion/test/integration/generate-image-sd3-i2i.test.js @@ -51,20 +51,19 @@ test('SD3 Medium img2img — transforms an input image', { timeout: 1800000, ski const modelPath = path.join(modelDir, downloadedModelName) t.ok(fs.existsSync(modelPath), 'Model file exists on disk') - const model = new ImgStableDiffusion( - { - logger: console, - diskPath: modelDir, - modelName: downloadedModelName + const model = new ImgStableDiffusion({ + files: { + model: path.join(modelDir, downloadedModelName) }, - { + config: { threads: 4, device: useCpu ? 'cpu' : 'gpu', vae_on_cpu: true, prediction: 'flow', flow_shift: '3.0' - } - ) + }, + logger: console + }) const images = [] const progressTicks = [] diff --git a/packages/lib-infer-diffusion/test/integration/generate-image-sd3.test.js b/packages/lib-infer-diffusion/test/integration/generate-image-sd3.test.js index 92ec2475eb..5e33a47d09 100644 --- a/packages/lib-infer-diffusion/test/integration/generate-image-sd3.test.js +++ b/packages/lib-infer-diffusion/test/integration/generate-image-sd3.test.js @@ -46,19 +46,18 @@ test('SD3 Medium txt2img — generates a valid PNG image', { timeout: 900000, sk const modelPath = path.join(modelDir, downloadedModelName) t.ok(fs.existsSync(modelPath), 'Model file exists on disk') - const model = new ImgStableDiffusion( - { - logger: console, - diskPath: modelDir, - modelName: downloadedModelName + const model = new ImgStableDiffusion({ + files: { + model: path.join(modelDir, downloadedModelName) }, - { + config: { threads: 4, device: useCpu ? 
'cpu' : 'gpu', prediction: 'flow', flow_shift: '3.0' - } - ) + }, + logger: console + }) const images = [] const progressTicks = [] diff --git a/packages/lib-infer-diffusion/test/integration/generate-image-sdxl.test.js b/packages/lib-infer-diffusion/test/integration/generate-image-sdxl.test.js index 3985c26633..528abd9db1 100644 --- a/packages/lib-infer-diffusion/test/integration/generate-image-sdxl.test.js +++ b/packages/lib-infer-diffusion/test/integration/generate-image-sdxl.test.js @@ -46,18 +46,16 @@ test('SDXL txt2img — generates a valid PNG image', { timeout: 900000, skip }, const modelPath = path.join(modelDir, downloadedModelName) t.ok(fs.existsSync(modelPath), 'Model file exists on disk') - const model = new ImgStableDiffusion( - { - logger: console, - diskPath: modelDir, - modelName: downloadedModelName + const model = new ImgStableDiffusion({ + files: { + model: path.join(modelDir, downloadedModelName) }, - { + config: { threads: 4, device: useCpu ? 'cpu' : 'gpu' - - } - ) + }, + logger: console + }) const images = [] const progressTicks = [] diff --git a/packages/lib-infer-diffusion/test/integration/generate-image.test.js b/packages/lib-infer-diffusion/test/integration/generate-image.test.js index 3cc175a188..2afb9596c2 100644 --- a/packages/lib-infer-diffusion/test/integration/generate-image.test.js +++ b/packages/lib-infer-diffusion/test/integration/generate-image.test.js @@ -45,18 +45,17 @@ test('SD2.1 txt2img — generates a valid PNG image', { timeout: 600000, skip }, const modelPath = path.join(modelDir, downloadedModelName) t.ok(fs.existsSync(modelPath), 'Model file exists on disk') - const model = new ImgStableDiffusion( - { - logger: console, - diskPath: modelDir, - modelName: downloadedModelName + const model = new ImgStableDiffusion({ + files: { + model: path.join(modelDir, downloadedModelName) }, - { + config: { threads: 4, device: useCpu ? 
'cpu' : 'gpu', prediction: 'v' // SD2.1 uses v-prediction - } - ) + }, + logger: console + }) const images = [] const progressTicks = [] diff --git a/packages/lib-infer-diffusion/test/integration/input-validation.test.js b/packages/lib-infer-diffusion/test/integration/input-validation.test.js index ced543c97b..6f3bae3d57 100644 --- a/packages/lib-infer-diffusion/test/integration/input-validation.test.js +++ b/packages/lib-infer-diffusion/test/integration/input-validation.test.js @@ -87,15 +87,14 @@ test('readImageDimensions | unrecognised format returns null', async (t) => { // ---------- FLUX img2img prediction guard ---------- test('FLUX img2img | throws when prediction is omitted', async (t) => { - const model = new ImgStableDiffusion( - { - logger: console, - diskPath: '.', - modelName: 'flux-2-klein-4b-Q8_0.gguf', - llmModel: 'Qwen3-4B-Q4_K_M.gguf' + const model = new ImgStableDiffusion({ + files: { + model: '/tmp/flux-2-klein-4b-Q8_0.gguf', + llm: '/tmp/Qwen3-4B-Q4_K_M.gguf' }, - { threads: 1 } - ) + config: { threads: 1 }, + logger: console + }) const fakeImage = VALID_PNG_HEADER @@ -115,15 +114,14 @@ test('FLUX img2img | throws when prediction is omitted', async (t) => { }) test('FLUX img2img | throws when prediction is "auto"', async (t) => { - const model = new ImgStableDiffusion( - { - logger: console, - diskPath: '.', - modelName: 'flux-2-klein-4b-Q8_0.gguf', - llmModel: 'Qwen3-4B-Q4_K_M.gguf' + const model = new ImgStableDiffusion({ + files: { + model: '/tmp/flux-2-klein-4b-Q8_0.gguf', + llm: '/tmp/Qwen3-4B-Q4_K_M.gguf' }, - { threads: 1, prediction: 'auto' } - ) + config: { threads: 1, prediction: 'auto' }, + logger: console + }) const fakeImage = VALID_PNG_HEADER @@ -139,15 +137,14 @@ test('FLUX img2img | throws when prediction is "auto"', async (t) => { }) test('FLUX img2img | does NOT throw for txt2img even without prediction', async (t) => { - const model = new ImgStableDiffusion( - { - logger: console, - diskPath: '.', - modelName: 
'flux-2-klein-4b-Q8_0.gguf', - llmModel: 'Qwen3-4B-Q4_K_M.gguf' + const model = new ImgStableDiffusion({ + files: { + model: '/tmp/flux-2-klein-4b-Q8_0.gguf', + llm: '/tmp/Qwen3-4B-Q4_K_M.gguf' }, - { threads: 1 } - ) + config: { threads: 1 }, + logger: console + }) // txt2img (no init_image) should pass the guard even without prediction. // It will fail later because no model is loaded, but that's expected — @@ -164,16 +161,15 @@ test('FLUX img2img | does NOT throw for txt2img even without prediction', async }) test('non-FLUX model | does NOT throw for img2img without prediction', async (t) => { - const model = new ImgStableDiffusion( - { - logger: console, - diskPath: '.', - modelName: 'stable-diffusion-v2-1-Q4_0.gguf' + const model = new ImgStableDiffusion({ + files: { + model: '/tmp/stable-diffusion-v2-1-Q4_0.gguf' }, - { threads: 1 } - ) + config: { threads: 1 }, + logger: console + }) - // SD model (no llmModel) should not trigger the FLUX guard. + // SD model (no files.llm) should not trigger the FLUX guard. 
try { await model.run({ prompt: 'test', init_image: VALID_PNG_HEADER }) t.fail('should have thrown (no model loaded)') diff --git a/packages/lib-infer-diffusion/test/integration/model-loading.test.js b/packages/lib-infer-diffusion/test/integration/model-loading.test.js index 2caf344f0a..e6c18d2adf 100644 --- a/packages/lib-infer-diffusion/test/integration/model-loading.test.js +++ b/packages/lib-infer-diffusion/test/integration/model-loading.test.js @@ -1,6 +1,7 @@ 'use strict' const test = require('brittle') +const path = require('bare-path') const os = require('bare-os') const proc = require('bare-process') @@ -37,10 +38,12 @@ test('model loading - load and unload', { timeout: testTimeout }, async t => { } const addon = new ImgStableDiffusion({ - modelName: downloadedModelName, - diskPath: modelDir, + files: { + model: path.join(modelDir, downloadedModelName) + }, + config, logger: console - }, config) + }) await addon.load() t.pass('model loaded successfully') diff --git a/packages/lib-infer-diffusion/test/unit/map-addon-event.test.js b/packages/lib-infer-diffusion/test/unit/map-addon-event.test.js new file mode 100644 index 0000000000..bfe6cdc3c4 --- /dev/null +++ b/packages/lib-infer-diffusion/test/unit/map-addon-event.test.js @@ -0,0 +1,51 @@ +'use strict' + +const test = require('brittle') +const { mapAddonEvent } = require('../../addon.js') + +test('event name containing "Error" maps to Error type carrying rawError', function (t) { + const err = new Error('generation failed') + const result = mapAddonEvent('GenerationError', null, err) + t.is(result.type, 'Error') + t.is(result.error, err) +}) + +test('Uint8Array data maps to Output (image bytes)', function (t) { + const bytes = new Uint8Array([137, 80, 78, 71]) + const result = mapAddonEvent('ImageOutput', bytes, null) + t.is(result.type, 'Output') + t.is(result.data, bytes) + t.is(result.error, null) +}) + +test('string data maps to Output (progress JSON tick)', function (t) { + const tick = 
'{"step":3,"total":20,"elapsed_ms":1234}' + const result = mapAddonEvent('Progress', tick, null) + t.is(result.type, 'Output') + t.is(result.data, tick) +}) + +test('plain object data maps to JobEnded (RuntimeStats)', function (t) { + const stats = { total_time_ms: 5000, steps: 20 } + const result = mapAddonEvent('Stats', stats, null) + t.is(result.type, 'JobEnded') + t.is(result.data, stats) + t.is(result.error, null) +}) + +test('Error event takes precedence over data shape', function (t) { + const err = new Error('boom') + const bytes = new Uint8Array([1, 2, 3]) + const result = mapAddonEvent('FatalError', bytes, err) + t.is(result.type, 'Error', 'Error event name beats Uint8Array output shape') + t.is(result.error, err) +}) + +test('null data with unknown event returns null', function (t) { + t.is(mapAddonEvent('Unknown', null, null), null) +}) + +test('number/boolean data with unknown event returns null', function (t) { + t.is(mapAddonEvent('Unknown', 42, null), null) + t.is(mapAddonEvent('Unknown', true, null), null) +}) diff --git a/packages/qvac-lib-infer-llamacpp-embed/CHANGELOG.md b/packages/qvac-lib-infer-llamacpp-embed/CHANGELOG.md index fcd3b3f39a..7a6450a10c 100644 --- a/packages/qvac-lib-infer-llamacpp-embed/CHANGELOG.md +++ b/packages/qvac-lib-infer-llamacpp-embed/CHANGELOG.md @@ -1,5 +1,98 @@ # Changelog +## [0.14.0] - 2026-04-10 + +This release migrates the embed addon off `BaseInference` inheritance and the `WeightsProvider` download layer onto the composable `createJobHandler` + `exclusiveRunQueue` utilities from `@qvac/infer-base@^0.4.0`. The constructor signature is replaced with a single object whose `files.model` field is an ordered array of absolute paths, mirroring the parallel LLM and diffusion addon refactors. This is a breaking change — every caller must update. + +## Breaking Changes + +### Constructor signature: single object with `files`, no `Loader` + +`GGMLBert` now takes a single `{ files, config?, logger?, opts? }` object. 
The old `Loader` + `diskPath` + `modelName` + two-arg `(args, config)` shape is gone — callers pre-resolve absolute paths and supply them as `files.model`. + +```js +// BEFORE (≤ 0.13.x) +const FilesystemDL = require('@qvac/dl-filesystem') +const loader = new FilesystemDL({ dirPath: '/models' }) +const model = new GGMLBert({ + loader, + modelName: 'bge-small-en-v1.5-q4_0.gguf', + diskPath: '/models', + logger: console, + opts: { stats: true } +}, { device: 'gpu', batch_size: '512' }) + +// AFTER (0.14.0) +const model = new GGMLBert({ + files: { + model: ['/models/bge-small-en-v1.5-q4_0.gguf'] + }, + config: { device: 'gpu', batch_size: '512' }, + logger: console, + opts: { stats: true } +}) +``` + +For sharded models the caller passes the full ordered list — the `.tensors.txt` companion first, followed by every `-NNNNN-of-MMMMM.gguf` shard in ascending order: + +```js +const model = new GGMLBert({ + files: { + model: [ + '/models/big-embed-model.tensors.txt', + '/models/big-embed-model-00001-of-00003.gguf', + '/models/big-embed-model-00002-of-00003.gguf', + '/models/big-embed-model-00003-of-00003.gguf' + ] + }, + config: { device: 'gpu' } +}) +``` + +### `BaseInference` inheritance and `WeightsProvider` removed + +`GGMLBert` no longer extends `BaseInference` and no longer touches the `WeightsProvider` download layer. The class composes `createJobHandler` and `exclusiveRunQueue` from `@qvac/infer-base@^0.4.0` directly. Public lifecycle methods (`load` / `run` / `cancel` / `unload` / `getState`) are unchanged in shape, but `downloadWeights` and the loader-based progress callbacks are gone — the caller is responsible for placing files on disk before constructing the model. + +In-memory streaming from network sources (URLs, Hyperdrive) is no longer supported in the current API. The SDK does not currently use it (models are stored to disk first); this can be re-added when/if the SDK plans to support that feature. Before, it was possible through the `Loader` abstraction. 
+ +### Dependency changes + +- `@qvac/infer-base` bumped from `^0.2.2` to `^0.4.0`. +- `bare-fs` is now a runtime dependency (used to stream shards from disk). +- `@qvac/dl-filesystem` and `@qvac/dl-hyperdrive` are no longer used by this package and have been removed from `devDependencies` / `peerDependencies`. + +## Features + +### Constructor input validation + +The constructor now throws `TypeError('files.model must be a non-empty array of absolute paths')` when `files` or `files.model` is missing or empty. This produces a clear error for callers porting old code instead of a confusing `Cannot read properties of undefined`. + +### `run()`-before-`load()` guard + +Calling `run()` before `load()` now throws `Error('Addon not initialized. Call load() first.')` instead of dereferencing `null` and crashing. + +### `load()` is now idempotent when already loaded + +A second `load()` call on an already-loaded instance is now a silent no-op instead of unloading and reloading. This aligns with the ReadyResource pattern used elsewhere in QVAC and prevents accidental double-loads from triggering expensive work. Callers that intentionally want to swap weights must call `unload()` first (which clears `configLoaded`) and then `load()` again. + +### Crash-safe shard streaming + +If `_streamShards()` or `addon.activate()` throws mid-load (for example a corrupted shard file or a native init failure), the partially-initialized addon is now best-effort-unloaded and `this.addon` is reset to `null`. A subsequent `load()` call starts cleanly instead of leaking a zombie native instance. + +## Bug Fixes + +### `unload()` clears the addon reference + +`unload()` now sets `this.addon = null` after `await this.addon.unload()`, so post-unload `cancel()` / `run()` calls hit the explicit guards rather than dereferencing a disposed native handle. `cancel()` and the job-handler cancel closure both use optional chaining for the same reason. 
+ +### Unknown addon events no longer pollute the output stream + +`_addonOutputCallback` previously fed any non-stats / non-error event payload into `response.output`, including unknown events. It now logs unknown events at warn level (these indicate a native-layer change worth surfacing) and only forwards `Embeddings` payloads to the active response. + +## Pull Requests + +- [#1493](https://github.com/tetherto/qvac/pull/1493) - chore[bc]: embed addon interface refactor — remove BaseInference and WeightsProvider + ## [0.13.4] - 2026-04-03 ### Changed diff --git a/packages/qvac-lib-infer-llamacpp-embed/README.md b/packages/qvac-lib-infer-llamacpp-embed/README.md index 24843b91de..a844e86a15 100644 --- a/packages/qvac-lib-infer-llamacpp-embed/README.md +++ b/packages/qvac-lib-infer-llamacpp-embed/README.md @@ -9,13 +9,12 @@ This native C++ addon, built using the `Bare` Runtime, simplifies running text e - [Building from Source](#building-from-source) - [Usage](#usage) - [1. Import the Model Class](#1-import-the-model-class) - - [2. Create a Data Loader](#2-create-a-data-loader) - - [3. Create the `args` obj](#3-create-the-args-obj) - - [4. Create `config`](#4-create-config) - - [5. Instanstiate the model](#5-instanstiate-the-model) - - [6. Load the model](#6-load-the-model) - - [7. Generate embeddings for input sequence](#7-generate-embeddings-for-input-sequence) - - [8. Unload the model](#8-unload-the-model) + - [2. Create the `args` obj](#2-create-the-args-obj) + - [3. Create `config`](#3-create-config) + - [4. Instantiate the model](#4-instantiate-the-model) + - [5. Load the model](#5-load-the-model) + - [6. Generate embeddings for input sequence](#6-generate-embeddings-for-input-sequence) + - [7. 
Release Resources](#7-release-resources) - [API behavior by state](#api-behavior-by-state) - [Quickstart Example](#quickstart-example) - [Other Examples](#other-examples) @@ -69,67 +68,78 @@ See [build.md](./build.md) for detailed instructions on how to build the addon f const GGMLBert = require('@qvac/embed-llamacpp') ``` -### 2. Create a Data Loader - -Data Loaders abstract the way model files are accessed. Use a [`FileSystemDataLoader`](../dl-filesystem) to load model files from your local file system. Models can be downloaded directly from HuggingFace. +### 2. Create the `args` obj ```js -const FilesystemDL = require('@qvac/dl-filesystem') - -// Download model from HuggingFace (see examples/utils.js for downloadModel helper) -const [modelName, dirPath] = await downloadModel( - 'https://huggingface.co/ChristianAzinn/gte-large-gguf/resolve/main/gte-large_fp16.gguf', - 'gte-large_fp16.gguf' -) - -const fsDL = new FilesystemDL({ dirPath }) -``` - -### 3. Create the `args` obj +const path = require('bare-path') -```js const args = { - loader: fsDL, + files: { model: [path.join(dirPath, modelName)] }, + config: { + device: 'gpu', + gpu_layers: '99', + batch_size: '1024', + ctx_size: '512' + }, logger: console, - opts: { stats: true }, - diskPath: dirPath, - modelName + opts: { stats: true } } ``` The `args` obj contains the following properties: -* `loader`: The Data Loader instance from which the model file will be streamed. -* `logger`: This property is used to create a [`QvacLogger`](../logging) instance, which handles all logging functionality. +* `files.model`: An array of absolute paths to the model file(s) on disk. For sharded models, provide all shard paths. +* `config`: A dictionary of hyper-parameters used to tweak the behaviour of the model (see [Create `config`](#3-create-config) below). +* `logger`: This property is used to create a [`QvacLogger`](../logging) instance, which handles all logging functionality. 
* `opts.stats`: This flag determines whether to calculate inference stats. -* `diskPath`: The local directory where the model file will be downloaded to. -* `modelName`: The name of model file in the Data Loader. -### 4. Create `config` +#### Sharded model usage -The `config` is a dictionary (object) consisting of hyper-parameters which can be used to tweak the behaviour of the model. -All parameter values should be strings. +The addon does not discover companion files on disk — the caller MUST pass every file the model needs, in order, via `files.model`. For sharded GGUF models this includes the `.tensors.txt` companion file followed by each `.gguf` shard in numerical order. ```js -const config = { - device: 'gpu', - gpu_layers: '99', - batch_size: '1024', - ctx_size: '512' -} +const path = require('bare-path') + +const dir = '/path/to/models' +const model = new GGMLBert({ + files: { + model: [ + path.join(dir, 'gte-large.Q2_K.tensors.txt'), + path.join(dir, 'gte-large.Q2_K-00001-of-00005.gguf'), + path.join(dir, 'gte-large.Q2_K-00002-of-00005.gguf'), + path.join(dir, 'gte-large.Q2_K-00003-of-00005.gguf'), + path.join(dir, 'gte-large.Q2_K-00004-of-00005.gguf'), + path.join(dir, 'gte-large.Q2_K-00005-of-00005.gguf') + ] + }, + config: { device: 'gpu', gpu_layers: '99' }, + logger: console, + opts: { stats: true } +}) ``` -| Parameter | Range / Type | Default | Description | -|-------------------|---------------------------------------------|------------------------------|-------------------------------------------------------| -| -dev | `"gpu"` or `"cpu"` | `"gpu"` | Device to run inference on | -| -ngl | integer | 0 | Number of model layers to offload to GPU | -|--batch-size | integer | 2048 | Tokens for processing multiple prompts together | -| --pooling | `{none,mean,cls,last,rank}` | model default | Pooling type for embeddings | -| --attention | `{causal,non-causal}` | model default | Attention type for embeddings | -| --embd-normalize | integer | 2 | Embedding 
normalization (-1=none, 0=max abs int16, 1=taxicab, 2=euclidean, >2=p-norm) | -| -fa | `"on"`, `"off"`, or `"auto"` | `"auto"` | Enable/disable flash attention | -| --main-gpu | integer, `"integrated"`, or `"dedicated"` | — | GPU selection for multi-GPU systems | -| verbosity | 0 – 3 (0=ERROR, 1=WARNING, 2=INFO, 3=DEBUG) | 0 | Logging verbosity level | +Rules for the `files.model` array: + +* **Order matters.** The `.tensors.txt` file must come first, then shards in ascending numerical order (`00001-of-00005`, `00002-of-00005`, ...). +* **All shards are required.** Missing any shard or the `.tensors.txt` companion will fail loading. +* **Non-sharded models** pass a single absolute path: `files: { model: [modelPath] }`. +* **Absolute paths only.** The addon reads each file directly via `bare-fs` during `load()`. + +### 3. Create `config` + +The `config` is a plain JS object whose keys are forwarded directly to the native backend. All values must be strings (the native layer parses them with `getSubmap`). 
+ +| Key | Range / Type | Default | Description | +|------------------|-----------------------------------------------|---------------|------------------------------------------------------------------------------------------| +| `device` | `"gpu"` \| `"cpu"` | `"gpu"` | Device to run inference on | +| `gpu_layers` | string of integer | `"0"` | Number of model layers to offload to GPU | +| `batch_size` | string of integer | `"2048"` | Tokens processed per batch (input throughput) | +| `pooling` | `"none"` \| `"mean"` \| `"cls"` \| `"last"` \| `"rank"` | model default | Pooling strategy used to collapse token embeddings into a single sequence vector | +| `attention` | `"causal"` \| `"non-causal"` | model default | Attention type | +| `embd_normalize` | string of integer | `"2"` | Embedding normalization (`-1` = none, `0` = max abs int16, `1` = taxicab, `2` = euclidean, `>2` = p-norm) | +| `flash_attn` | `"on"` \| `"off"` \| `"auto"` | `"auto"` | Enable / disable flash attention | +| `main-gpu` | string of integer \| `"integrated"` \| `"dedicated"` | — | GPU selection for multi-GPU systems | +| `verbosity` | string of `"0"`–`"3"` (0=ERROR, 1=WARN, 2=INFO, 3=DEBUG) | `"0"` | Logging verbosity level | #### IGPU/GPU selection logic: @@ -141,43 +151,21 @@ const config = { | System with both | ✅ Uses dedicated GPU (preferred) | ✅ Uses dedicated GPU | ✅ Uses integrated GPU | -### 5. Instantiate the model +### 4. Instantiate the model ```js -const model = new GGMLBert(args, config) +const model = new GGMLBert(args) ``` -### 6. Load the model +### 5. Load the model ```js await model.load() ``` -_Optionally_ you can pass the following parameters to tweak the loading behaviour. -* `close?`: This boolean value determines whether to close the Data Loader after loading. Defaults to `true` -* `reportProgressCallback?`: A callback function which gets called periodically with progress updates. It can be used to display overall progress percentage. 
- -_For example:_ - -```js -await model.load(false, progress => process.stdout.write(`\rOverall Progress: ${progress.overallProgress}%`)) -``` - -**Progress Callback Data** - -The progress callback receives an object with the following properties: +`load()` takes no arguments. The addon streams each file listed in `files.model` directly from disk via `bare-fs` and then activates the model. There is no data loader, no progress callback, and no download step — the caller is responsible for ensuring the files already exist at the paths passed to the constructor. -| Property | Type | Description | -|---------------------|--------|-----------------------------------------| -| `action` | string | Current operation being performed | -| `totalSize` | number | Total bytes to be loaded | -| `totalFiles` | number | Total number of files to process | -| `filesProcessed` | number | Number of files completed so far | -| `currentFile` | string | Name of file currently being processed | -| `currentFileProgress` | string | Percentage progress on current file | -| `overallProgress` | string | Overall loading progress percentage | - -### 7. Generate embeddings for input sequence +### 6. Generate embeddings for input sequence The model outputs a vector for the input sequence. @@ -189,14 +177,13 @@ const embeddings = await response.await() When `opts.stats` is enabled, `response.stats` includes runtime metrics such as `total_tokens`, `total_time_ms`, `tokens_per_second`, and `backendDevice` (`"cpu"` or `"gpu"`). `backendDevice` reflects the resolved device used at runtime after backend selection/fallback logic, not only the requested config. -### 8. Release Resources +### 7. 
Release Resources Unload the model when finished: ```javascript try { await model.unload() - await fsDL.close() } catch (error) { console.error('Failed to unload model:', error) } @@ -210,11 +197,10 @@ The following table describes the expected behavior of `run` and `cancel` depend |---------------|----------------|----------------------------------------------------------------| | idle | run | **Allowed** — starts inference, returns `QvacResponse` | | idle | cancel | **Allowed** — no-op (no job to cancel); Promise resolves | -| run | run | **Throw** — second `run()` throws "a job is already set or being processed" (can wait very briefly for previous job completion) | +| run | run | **Throw** — second `run()` throws `"Cannot set new job: a job is already set or being processed"` once it reaches the head of the queue; previous response must settle first. | | run | cancel | **Allowed** — cancels current job; Promise resolves when job has stopped | -When `run()` is called while another job is active, the implementation first waits briefly for the previous job to settle. This preserves single-job behavior while still failing fast when the instance is busy. If the second run cannot be accepted (timeout or addon busy rejection), it throws: -- `"Cannot set new job: a job is already set or being processed"` +A second `run()` while a job is active is serialized by `exclusiveRunQueue` — it waits in the queue until the previous `_runInternal` returns, then enters the busy guard. Because the busy flag (`_hasActiveResponse`) is only cleared when the previous `response.await()` settles, the second call rejects with `"Cannot set new job: a job is already set or being processed"`. The queue eliminates race conditions but does not retry or buffer results; callers must wait for the previous `response.await()` to settle (or call `model.cancel()`) before issuing the next request. **Cancellation API:** Prefer cancelling from the model: `await model.cancel()`. 
This cancels the current job and the Promise resolves when the job has actually stopped (future-based in C++). You can also call `await response.cancel()` on the value returned by `run()`; it is equivalent and targets the same job. Both are no-op when idle. @@ -261,11 +247,11 @@ Results are continuously updated with new releases to ensure up-to-date performa ## Tests -Integration tests are located in [`test/integration/`](./test/integration/) and cover core functionality including model loading, inference, tool calling, multimodal capabilities, and configuration parameters. -These tests help prevent regressions and ensure the library remains stable as contributions are made to the project. +Integration tests are located in [`test/integration/`](./test/integration/) and cover core embed functionality: single-file model load → embed → unload, multi-instance concurrency (two embed instances running simultaneously, repeated load/unload cycles, unloading one instance while another processes), and the public `run()` / `cancel()` lifecycle. These tests help prevent regressions and ensure the library remains stable as contributions are made to the project. + +C++ unit tests live under [`addon/test/`](./addon/test/) and exercise the native components at a lower level, including backend selection, single-step inference, end-to-end embedding generation, and pooling. These tests validate the native implementation and help catch issues early in development. -Unit tests are located in [`test/unit/`](./test/unit/) and test the C++ addon components at a lower level, including backend selection, cache management, chat templates, context handling, and UTF8 token processing. -These tests validate the native implementation and help catch issues early in development. +> **Note:** This package is *embeddings only*. There is no tool-calling, multimodal, KV-cache, or chat-template support — those features belong to the LLM addon ([`@qvac/llm-llamacpp`](../qvac-lib-infer-llamacpp-llm/)). 
## Glossary diff --git a/packages/qvac-lib-infer-llamacpp-embed/addon.js b/packages/qvac-lib-infer-llamacpp-embed/addon.js index f58ecef5f1..997e5699f0 100644 --- a/packages/qvac-lib-infer-llamacpp-embed/addon.js +++ b/packages/qvac-lib-infer-llamacpp-embed/addon.js @@ -1,5 +1,62 @@ const path = require('bare-path') +/** + * Map a raw native event from the C++ embed addon to a logical event. + * + * The native binding emits events with C++-mangled names and varied + * payload shapes. This wrapper normalizes them into one of: + * - `'Output'` — embeddings payload (`Embeddings` family event) + * - `'Error'` — failure + * - `'JobEnded'` — terminal RuntimeStats payload (with `backendDevice` + * mapped from `0/1` to `'cpu'/'gpu'`) + * + * Returns `{ type, data, error }`, or `null` for an unrecognized event: + * one whose payload is not RuntimeStats-shaped and whose name contains + * neither `Error` nor `Embeddings`. + * + * + * @param {string} rawEvent + * @param {*} rawData + * @param {*} rawError + * @returns {{ type: string, data: *, error: * } | null} + */ +function mapAddonEvent (rawEvent, rawData, rawError) { + // RuntimeStats: structurally detected so we don't couple to C++ key + // ordering. The embed addon emits these as the terminal event for a + // job (`tokens_per_second` is the marker; `total_tokens` / + // `total_time_ms` / `batch_size` / `context_size` are the other + // canonical fields). 
+ const isStatsData = + rawData && + typeof rawData === 'object' && + ( + 'tokens_per_second' in rawData || + 'total_tokens' in rawData || + 'total_time_ms' in rawData || + 'batch_size' in rawData || + 'context_size' in rawData + ) + if (isStatsData) { + const stats = { ...rawData } + if (stats.backendDevice === 0) { + stats.backendDevice = 'cpu' + } else if (stats.backendDevice === 1) { + stats.backendDevice = 'gpu' + } + return { type: 'JobEnded', data: stats, error: null } + } + + if (typeof rawEvent === 'string' && rawEvent.includes('Error')) { + return { type: 'Error', data: rawData, error: rawError } + } + + if (typeof rawEvent === 'string' && rawEvent.includes('Embeddings')) { + return { type: 'Output', data: rawData, error: null } + } + + return null +} + /// An interface between Bare addon in C++ and JS runtime. class BertInterface { /** @@ -37,11 +94,17 @@ class BertInterface { } /** - * Loads model weights + * Loads model weights. The native side reads the JS property names + * `chunk` and `completed` directly, so this object's field names are + * load-bearing — see `JsBlobsStream.hpp::appendBlob` in + * `qvac-lib-inference-addon-cpp` for the parser. * @param {Object} data - * @param {String} data.filename - * @param {Buffer} data.contents - * @param {Promise} data.completed + * @param {String} data.filename - Logical filename used to group chunks + * into one shard. The native side keys `shards_in_progress` on this. + * @param {Uint8Array|null} data.chunk - Next chunk of bytes for the + * current shard, or `null` on the final call when `completed` is true. + * @param {Boolean} data.completed - `false` while more chunks remain; + * `true` on the last call to finalize the shard. 
*/ async loadWeights (data) { return this._binding.loadWeights(this._handle, data) @@ -65,5 +128,6 @@ class BertInterface { } module.exports = { - BertInterface + BertInterface, + mapAddonEvent } diff --git a/packages/qvac-lib-infer-llamacpp-embed/benchmarks/performance/case-runner.js b/packages/qvac-lib-infer-llamacpp-embed/benchmarks/performance/case-runner.js index ec032125fc..530db5c597 100644 --- a/packages/qvac-lib-infer-llamacpp-embed/benchmarks/performance/case-runner.js +++ b/packages/qvac-lib-infer-llamacpp-embed/benchmarks/performance/case-runner.js @@ -3,7 +3,6 @@ const fs = require('bare-fs') const path = require('bare-path') const process = require('bare-process') -const FilesystemDL = require('@qvac/dl-filesystem') const { elapsedMs, round, @@ -39,19 +38,15 @@ function normalizeEmbeddings (rawEmbeddings) { return rawEmbeddings[0].map((vector) => Array.from(vector)) } -function buildConfigString (runtimeConfig, options = {}) { +function buildAddonConfig (runtimeConfig, options = {}) { const debugEnabled = !!options.debugEnabled - const parts = ['verbosity\t0'] - if (runtimeConfig.device != null) parts.push(`-dev\t${runtimeConfig.device}`) - if (runtimeConfig.batchSize != null) parts.push(`--batch-size\t${runtimeConfig.batchSize}`) - if (runtimeConfig.flashAttn != null) parts.push(`-fa\t${runtimeConfig.flashAttn}`) - if (runtimeConfig.ngl != null) parts.push(`-ngl\t${runtimeConfig.ngl}`) - if (runtimeConfig.noMmap) parts.push('--no-mmap') - if (!debugEnabled) { - // Suppress native llama.cpp startup logs in benchmark mode. - parts.push('--log-disable') - } - return parts.join('\n') + const config = { verbosity: debugEnabled ? 
'2' : '0' } + if (runtimeConfig.device != null) config.device = String(runtimeConfig.device) + if (runtimeConfig.batchSize != null) config.batch_size = String(runtimeConfig.batchSize) + if (runtimeConfig.flashAttn != null) config.flash_attn = String(runtimeConfig.flashAttn) + if (runtimeConfig.ngl != null) config.gpu_layers = String(runtimeConfig.ngl) + if (runtimeConfig.noMmap) config['no-mmap'] = '' + return config } function resolveModelName (modelDef, quantization) { @@ -145,8 +140,7 @@ function aggregateRunMetrics (runMetrics) { } async function runCaseWithRepeats ({ AddonCtor, modelDir, modelName, runtimeConfig, inputs, repeats, onRepeatComplete, debugEnabled }) { - const loader = new FilesystemDL({ dirPath: modelDir }) - const configString = buildConfigString(runtimeConfig, { debugEnabled }) + const addonConfig = buildAddonConfig(runtimeConfig, { debugEnabled }) const addonRuntimeLogger = createAddonRuntimeLogger(debugEnabled) let model = null @@ -160,12 +154,11 @@ async function runCaseWithRepeats ({ AddonCtor, modelDir, modelName, runtimeConf try { model = new AddonCtor({ - modelName, - loader, + files: { model: [path.join(modelDir, modelName)] }, + config: addonConfig, logger: addonRuntimeLogger, - diskPath: modelDir, opts: { stats: true } - }, configString) + }) const loadStart = process.hrtime() await model.load() @@ -209,11 +202,6 @@ async function runCaseWithRepeats ({ AddonCtor, modelDir, modelName, runtimeConf } catch (unloadError) { cleanupErrors.push(`unload_error=${unloadError && unloadError.message ? unloadError.message : String(unloadError)}`) } - try { - await loader.close() - } catch (closeError) { - cleanupErrors.push(`loader_close_error=${closeError && closeError.message ? 
closeError.message : String(closeError)}`) - } } if (primaryError) { @@ -306,7 +294,7 @@ function buildCaseResult ({ } async function runModelCases ({ - addonCtor, + AddonCtor, repeats, debugEnabled, debugLogger, @@ -350,7 +338,7 @@ async function runModelCases ({ } const inputs = testCase.inputMode === 'single' ? inputsRaw[0] : inputsRaw executionResult = await runCaseWithRepeats({ - addonCtor, + AddonCtor, modelDir: modelDef.modelDir, modelName: testCase.modelName, runtimeConfig: testCase.runtimeConfig, diff --git a/packages/qvac-lib-infer-llamacpp-embed/benchmarks/performance/embed-parameter-sweep.js b/packages/qvac-lib-infer-llamacpp-embed/benchmarks/performance/embed-parameter-sweep.js index 16dd09902c..061a0d0068 100644 --- a/packages/qvac-lib-infer-llamacpp-embed/benchmarks/performance/embed-parameter-sweep.js +++ b/packages/qvac-lib-infer-llamacpp-embed/benchmarks/performance/embed-parameter-sweep.js @@ -84,7 +84,7 @@ async function main () { const debugEnabled = Boolean(args.debug) const debugLogger = createDebugLogger(debugEnabled) const addonSource = parseAddonSource(args['addon-source']) - const addonCtor = resolveAddonCtor(addonSource) + const AddonCtor = resolveAddonCtor(addonSource) const repeats = parseRepeats(args.repeats) const resultsDir = DEFAULT_RESULTS_DIR const inputsFilePath = DEFAULT_INPUTS_FILE @@ -120,7 +120,7 @@ async function main () { for (const plan of plannedRunsByModel) { const modelResult = await runModelCases({ - addonCtor, + AddonCtor, repeats, debugEnabled, debugLogger, diff --git a/packages/qvac-lib-infer-llamacpp-embed/benchmarks/server/src/services/modelManager.js b/packages/qvac-lib-infer-llamacpp-embed/benchmarks/server/src/services/modelManager.js index 18c4bd1193..35486eae27 100644 --- a/packages/qvac-lib-infer-llamacpp-embed/benchmarks/server/src/services/modelManager.js +++ b/packages/qvac-lib-infer-llamacpp-embed/benchmarks/server/src/services/modelManager.js @@ -1,7 +1,7 @@ 'use strict' const EmbedLlamacpp = 
require('@qvac/embed-llamacpp') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const logger = require('../utils/logger') /** @@ -78,11 +78,6 @@ class ModelManager { * Internal method to load a model */ async _loadModel (modelPath, diskPath, localModelName, config) { - // Create FilesystemDL for local model loading - const loader = new FilesystemDL({ - dirPath: diskPath - }) - // Build addon config map from parameters // Config is a map with string values: { gpu_layers: '25', ctx_size: '512', batch_size: '512' } const addonConfig = {} @@ -109,16 +104,15 @@ class ModelManager { logger.info(`Loading model with config: ${JSON.stringify(addonConfig)}`) const model = new EmbedLlamacpp({ - diskPath, - modelName: localModelName, - loader, + files: { model: [path.join(diskPath, localModelName)] }, + config: addonConfig, logger: { info: logger.info.bind(logger), error: logger.error.bind(logger), warn: logger.warn.bind(logger), debug: logger.debug.bind(logger) } - }, addonConfig) + }) logger.info('Loading model into VRAM...') await model.load() diff --git a/packages/qvac-lib-infer-llamacpp-embed/docs/architecture.md b/packages/qvac-lib-infer-llamacpp-embed/docs/architecture.md index 06dda4e650..fb42c46f73 100644 --- a/packages/qvac-lib-infer-llamacpp-embed/docs/architecture.md +++ b/packages/qvac-lib-infer-llamacpp-embed/docs/architecture.md @@ -24,7 +24,7 @@ ### Architecture Decisions - [Decision 1: llama.cpp as Inference Backend](#decision-1-llamacpp-as-inference-backend) - [Decision 2: Bare Runtime over Node.js](#decision-2-bare-runtime-over-nodejs) -- [Decision 3: Pluggable Data Loader Architecture](#decision-3-pluggable-data-loader-architecture) +- [Decision 3: Caller-Provided File Paths](#decision-3-caller-provided-file-paths) - [Decision 4: Incremental Buffer-Based Weight Loading](#decision-4-incremental-buffer-based-weight-loading) - [Decision 5: Batch Processing as Primary Use 
Case](#decision-5-batch-processing-as-primary-use-case) - [Decision 6: Exclusive Run Queue](#decision-6-exclusive-run-queue) @@ -43,19 +43,18 @@ **Core value:** - High-level JavaScript API for embedding generation -- Peer-to-peer model distribution via Hyperdrive +- Direct file streaming from disk via `bare-fs` (no download/loader layer) - Batch processing for high-throughput use cases -- Pluggable model weight loaders - Vector embeddings for semantic search and similarity ## Key Features - **Cross-platform**: macOS, Linux, Windows, iOS, Android -- **Multiple loaders**: Hyperdrive (P2P), filesystem, custom +- **Caller-supplied paths**: Application passes absolute file paths; addon streams them from disk - **Batch processing**: Process multiple texts in a single forward pass - **GPU acceleration**: Metal, Vulkan, OpenCL - **Quantized models**: GGUF format (Q2-Q8, 1-bit variants) -- **Sharded loading**: Automatic split GGUF handling +- **Sharded loading**: Caller passes every shard + `.tensors.txt` companion; addon streams them in order - **Encoder-only models**: Optimized for embedding generation ## Target Platforms @@ -72,8 +71,9 @@ Tier 1: Platform targets for which prebuilds are provided as defined by the .git **Dependencies:** - qvac-lib-inference-addon-cpp (≥1.1.2): C++ addon framework +- @qvac/infer-base: Provides `createJobHandler` and `exclusiveRunQueue` helpers (composition, no base class) - qvac-fabric-llm.cpp (≥7248.2.3): Inference engine -- Bare Runtime (≥1.24.0): JavaScript runtime +- Bare Runtime (≥1.24.0): JavaScript runtime (provides `bare-fs` for direct file streaming) --- @@ -95,28 +95,27 @@ graph TB WHISPER[whispercpp
STT] NMT[nmtcpp
Translation] end - + subgraph "core libs" - BASE["@qvac/infer-base"] - DL["@qvac/dl-hyperdrive"] + BASE["@qvac/infer-base
(createJobHandler,
exclusiveRunQueue)"] end - + subgraph "Native Framework" ADDON[addon-cpp] end - + subgraph "Backend" - BARE[Bare Runtime] + BARE["Bare Runtime
(bare-fs)"] LLAMA[llama.cpp] end - + APP --> EMBED EMBED --> BASE - EMBED --> DL EMBED --> ADDON + EMBED --> BARE ADDON --> BARE ADDON --> LLAMA - + style EMBED fill:#e1f5ff,stroke:#0066cc,stroke-width:3px ``` @@ -127,21 +126,20 @@ graph TB | Package | Type | Version | Purpose | |---------|------|---------|---------| -| @qvac/infer-base | Framework | ^0.2.2 | Base classes, WeightsProvider, QvacResponse | -| @qvac/dl-hyperdrive | Peer | ^0.1.0 | P2P model loading | -| qvac-lib-inference-addon-cpp | Native | ≥1.1.1 | C++ addon framework | -| llama.cpp | Native | ≥7248.1.0 | Inference engine | -| Bare Runtime | Runtime | ≥1.24.0 | JavaScript execution | +| @qvac/infer-base | Framework | ^0.4.0 | `createJobHandler`, `exclusiveRunQueue`, `QvacResponse` helpers (composition, no base class) | +| qvac-lib-inference-addon-cpp | Native | ≥1.1.2 | C++ addon framework | +| llama.cpp | Native | ≥7248.2.1 | Inference engine | +| Bare Runtime | Runtime | ≥1.24.0 | JavaScript execution, `bare-fs`, `bare-path` | **Integration Points:** | From | To | Mechanism | Data Format | |------|-----|-----------|-------------| -| JavaScript | GGMLBert | Constructor | args, config objects | -| GGMLBert | BaseInference | Inheritance | Template method pattern | +| JavaScript | GGMLBert | Constructor | `{ files, config, logger, opts }` (single object) | +| GGMLBert | createJobHandler / exclusiveRunQueue | Composition | Function calls | | GGMLBert | BertInterface | Composition | Method calls | +| GGMLBert | bare-fs | `fs.createReadStream(path)` | Raw chunks per shard | | BertInterface | C++ Addon | require.addon() | Native binding | -| WeightsProvider | Data Loader | Interface | Stream protocol |
@@ -151,37 +149,50 @@ graph TB ### Main Class: GGMLBert +`GGMLBert` is a standalone class (no base class). It composes helpers from `@qvac/infer-base` and talks to the native addon via `BertInterface`. The caller passes all file paths explicitly; there is no downloader, loader, or weights provider. + ```mermaid classDiagram class GGMLBert { - +constructor(args, config) - +load(closeLoader, onProgress) Promise~void~ + +constructor({ files, config, logger, opts }) + +load() Promise~void~ +run(text) Promise~QvacResponse~ + +cancel() Promise~void~ +unload() Promise~void~ - +downloadWeights(onProgress, opts) Promise~string~ + +getState() object } - - class BaseInference { - <<abstract>> - +load() Promise~void~ - +run() Promise~QvacResponse~ - +unload() Promise~void~ + + class createJobHandler { + <<factory>> + +start() QvacResponse + +output(data) + +end(stats) + +fail(error) } - + + class exclusiveRunQueue { + <<factory>> + +run(fn) Promise + } + class QvacResponse { +await() Promise~Array~ +cancel() Promise~void~ +stats object } - - class WeightsProvider { - +downloadFiles(files, path, opts) Promise~void~ - +streamFiles(shards, onChunk, onProgress) Promise~void~ + + class BertInterface { + +runJob(input) Promise~boolean~ + +loadWeights({filename, chunk, completed}) + +activate() Promise + +cancel() Promise + +unload() Promise } - - GGMLBert --|> BaseInference - GGMLBert *-- WeightsProvider - GGMLBert ..> QvacResponse : creates + + GGMLBert *-- createJobHandler : composes + GGMLBert *-- exclusiveRunQueue : composes + GGMLBert *-- BertInterface : composes + createJobHandler ..> QvacResponse : creates ```
@@ -189,20 +200,22 @@ classDiagram **Component Roles:** -| Class | Responsibility | Lifecycle | Dependencies | -|-------|----------------|-----------|--------------| -| GGMLBert | Orchestrate model lifecycle, manage loading/inference | Created by user, persistent | WeightsProvider, BertInterface | -| BaseInference | Define standard inference API | Abstract base class | None | -| QvacResponse | Return embedding results | Created per run() call, short-lived | None | -| WeightsProvider | Abstract model weight loading | Created by GGMLBert | DataLoader | +| Class / Helper | Responsibility | Lifecycle | Dependencies | +|----------------|----------------|-----------|--------------| +| GGMLBert | Standalone class. Orchestrate model lifecycle, stream files from disk, dispatch inference | Created by user, persistent | `createJobHandler`, `exclusiveRunQueue`, `BertInterface`, `bare-fs` | +| createJobHandler | Factory producing a job handler with `start/output/end/fail` semantics; returns a `QvacResponse` | Created once per GGMLBert | None | +| exclusiveRunQueue | Factory producing a promise queue that serialises `run()` / `unload()` calls | Created once per GGMLBert | None | +| QvacResponse | Return embedding results and expose `await()`/`cancel()`/`stats` | Created per `run()` call, short-lived | None | +| BertInterface | JS wrapper around the native addon (`require.addon`) | Created in `_load()`, lives for model lifetime | Native addon | **Key Relationships:** | From | To | Type | Purpose | |------|-----|------|---------| -| GGMLBert | BaseInference | Inheritance | Standard QVAC inference API | -| GGMLBert | WeightsProvider | Composition | Model weight acquisition | -| GGMLBert | QvacResponse | Creates | Embedding output per inference | +| GGMLBert | createJobHandler | Composition | Job state machine + response emission | +| GGMLBert | exclusiveRunQueue | Composition | Serialise concurrent callers | +| GGMLBert | BertInterface | Composition | Native addon calls | +| 
GGMLBert | bare-fs | Direct use | Stream each caller-supplied path into the addon |
@@ -219,53 +232,55 @@ graph TB subgraph "Layer 1: JavaScript API" APP["Application Code"] BERTCLASS["GGMLBert
(index.js)"] - BASEINF["BaseInference
(@qvac/infer-base)"] - WEIGHTSPR["WeightsProvider
(@qvac/infer-base)"] + JOB["createJobHandler
(@qvac/infer-base)"] + QUEUE["exclusiveRunQueue
(@qvac/infer-base)"] RESPONSE["QvacResponse
(@qvac/infer-base)"] + BAREFS["bare-fs
createReadStream"] end - + subgraph "Layer 2: Bridge" BERTIF["BertInterface
(addon.js)"] BINDING["require.addon
(binding.js)"] end - + subgraph "Layer 3: C++ Addon" JSINTERFACE["JsInterface
(js-interface/binding.cpp)"] ADDON["Addon
(addon/Addon.cpp)"] WEIGHTSLOAD["WeightsLoader
(addon-cpp)"] end - + subgraph "Layer 4: Model" BERTMODEL["BertModel
(model-interface/BertModel.cpp)"] ENCODE["encode_host_f32
encode_host_f32_sequences"] end - + subgraph "Layer 5: Backend" LLAMACPP["llama.cpp"] GGML["GGML"] GPU["GPU Backends"] end - + APP --> BERTCLASS - BERTCLASS --> BASEINF - BERTCLASS --> WEIGHTSPR + BERTCLASS --> JOB + BERTCLASS --> QUEUE BERTCLASS --> BERTIF - BERTCLASS -.-> RESPONSE - + BERTCLASS --> BAREFS + JOB -.-> RESPONSE + BERTIF --> BINDING BINDING --> JSINTERFACE - WEIGHTSPR --> WEIGHTSLOAD - + BAREFS -. chunks .-> BERTIF + JSINTERFACE --> ADDON ADDON --> WEIGHTSLOAD ADDON --> BERTMODEL - + BERTMODEL --> ENCODE ENCODE --> LLAMACPP - + LLAMACPP --> GGML GGML --> GPU - + style BERTCLASS fill:#e1f5ff style ADDON fill:#ffe1e1 style BERTMODEL fill:#ffe1e1 @@ -279,7 +294,7 @@ graph TB | Layer | Components | Responsibility | Language | Why This Layer | |-------|------------|----------------|----------|----------------| -| 1. JavaScript API | GGMLBert, BaseInference | High-level API, error handling | JS | Ergonomic API for npm consumers | +| 1. JavaScript API | GGMLBert (standalone class), `createJobHandler`, `exclusiveRunQueue`, `bare-fs` | High-level API, file streaming, job/queue composition | JS | Ergonomic API for npm consumers | | 2. Bridge | BertInterface, binding.js | JS↔C++ communication | JS wrapper | Lifecycle management, handle safety | | 3. C++ Addon | JsInterface, Addon | Job queue, threading, callbacks | C++ | Performance, native integration | | 4. Model | BertModel, encode methods | Inference logic, batch processing | C++ | Direct llama.cpp integration | @@ -307,7 +322,15 @@ graph TB #### **GGMLBert (index.js)** -**Responsibility:** Main API class, orchestrates model lifecycle, manages data loaders +**Responsibility:** Main API class. Standalone (no base class) — uses composition with `createJobHandler` and `exclusiveRunQueue` from `@qvac/infer-base`. Orchestrates the model lifecycle, streams each caller-supplied file path directly from disk via `bare-fs`, and dispatches inference jobs. 
+ +**Constructor:** `new GGMLBert({ files, config, logger, opts })` — a single options object. +- `files.model` — ordered array of absolute paths. Caller is responsible for passing every file the model needs (for sharded GGUF: the `.tensors.txt` companion first, then all `.gguf` shards in numerical order). +- `config` — llama.cpp hyper-parameters (all string values). +- `logger` — wrapped in a `QvacLogger`. +- `opts.stats` — emit runtime stats on the response. + +**`load()`** takes no arguments. For single-file models it just activates the addon using the provided path. For sharded models it opens `bare-fs.createReadStream(path)` on each file in order, pipes chunks into `addon.loadWeights({ filename, chunk, completed: false })`, and sends a final `completed: true` marker for each file before calling `addon.activate()`. **Why JavaScript:** - High-level API ergonomics for npm consumers @@ -355,15 +378,15 @@ graph TB **Specialization:** Handles variant input (string or vector), merges batch inputs -#### **WeightsProvider (@qvac/infer-base)** +#### **@qvac/infer-base helpers** -**Responsibility:** Abstracts model weight acquisition +**Responsibility:** Provide composition primitives used by `GGMLBert`. -**Why JavaScript:** -- Integrates with data loaders (Hyperdrive, filesystem) -- Progress tracking and reporting -- Handles sharded GGUF expansion -- Streaming chunk delivery +- `createJobHandler({ cancel })` — returns a job handler exposing `start()` (creates a `QvacResponse`), `output(data)`, `end(stats)`, and `fail(error)`. `GGMLBert` wires the addon's output callback into it. +- `exclusiveRunQueue()` — returns a function that serialises async callers so at most one `run()` / `unload()` body is in flight at a time. +- `QvacResponse` — the response object returned by `start()`; exposes `await()`, `cancel()`, and `stats`. + +There is no `BaseInference` class, no `WeightsProvider`, and no `downloadWeights` method. 
`GGMLBert` composes these helpers directly; it does not inherit from anything. #### **BackendSelection (model-interface/BackendSelection.cpp)** @@ -516,53 +539,58 @@ See [qvac-lib-inference-addon-cpp Decision 4: Why Bare Runtime](https://github.c --- -## Decision 3: Pluggable Data Loader Architecture +## Decision 3: Caller-Provided File Paths
⚡ TL;DR -**Chose:** Abstract data loading via WeightsProvider interface -**Why:** Support multiple distribution methods (P2P, HTTP, local files, S3) -**Cost:** Additional abstraction layer, must implement loader interface +**Chose:** Caller passes absolute paths to every required file; addon streams them from disk via `bare-fs` +**Why:** Remove the download/loader abstraction, push distribution concerns up to the application +**Cost:** Callers must discover and order shard files themselves; no built-in P2P or HTTP loader
### Context -Need to load multi-GB model files from various sources: -- Local filesystem (for offline/development) -- P2P networks (for privacy/decentralization) -- HTTP/CDN (for enterprise deployments) -- Cloud storage (S3, Azure Blob, etc.) +Earlier versions of this package bundled a `WeightsProvider` abstraction and a `downloadWeights()` entry point that delegated to pluggable data loaders (Hyperdrive, filesystem, etc.). In practice this added an extra abstraction layer, coupled the addon to a specific loader ecosystem, and made sharded model handling implicit. -Different use cases have different distribution requirements. No single distribution method fits all scenarios. +The SDK and higher-level applications already have their own download, cache, and catalog pipelines. Embedding the loader into the addon was redundant and obscured the real contract between the addon and the model files on disk. ### Decision -Create a pluggable data loader abstraction (WeightsProvider interface) that decouples model loading from the inference engine, allowing applications to choose their distribution strategy. +Remove the data loader / `WeightsProvider` / `downloadWeights` surface entirely. The addon now takes an explicit list of absolute file paths at construction time and streams them from disk via `bare-fs.createReadStream` when `load()` is called. Distribution (P2P, HTTP, bundled asset, cache, etc.) is entirely the caller's responsibility. + +```js +const model = new GGMLBert({ + files: { model: [tensorsTxtPath, shard1, shard2, ...] 
}, + config, + logger, + opts: { stats: true } +}) +await model.load() +``` ### Rationale -**Flexibility:** -- Different users have different distribution needs (privacy vs speed vs simplicity) -- Enterprises may require HTTP/CDN, privacy users may prefer P2P -- Development/testing needs local filesystem access -- No single distribution method fits all use cases +**Simplicity:** +- One less abstraction layer inside the addon +- No runtime dependency on any specific data loader +- The contract is explicit: "give me the exact file list, in order" **Separation of Concerns:** -- Inference engine doesn't need to know about distribution details -- Model loading is orthogonal to inference logic -- Easier to test inference separately from data loading +- Inference engine only cares about bytes on disk +- Distribution strategy lives where it belongs (SDK / application) +- Easier to test and reason about model loading -**Extensibility:** -- Applications can implement custom loaders (S3, IPFS, Torrent, etc.) -- Can optimize loaders for specific platforms (mobile vs desktop) -- Future-proof: new distribution methods don't require engine changes +**Predictability:** +- Sharded models require the caller to pass every shard and the `.tensors.txt` companion file in numerical order — no hidden discovery +- No partial loading states caused by implicit shard expansion ### Trade-offs -- ✅ Can mock loaders for unit testing inference logic -- ❌ Additional abstraction complexity vs hardcoding a single method -- ❌ Applications must choose/implement their loader (no batteries-included default) +- ✅ Smaller, more focused addon surface +- ✅ No loader coupling — works with any distribution mechanism +- ❌ Caller must discover and order sharded files themselves +- ❌ No built-in progress reporting during the disk-stream phase --- @@ -571,8 +599,8 @@ Create a pluggable data loader abstraction (WeightsProvider interface) that deco
⚡ TL;DR -**Chose:** Buffer-based weight loader using custom std::streambuf over JavaScript ArrayBuffers -**Why:** Avoid storage duplication, zero-copy, supports incremental shard-by-shard loading +**Chose:** Buffer-based weight loader using custom std::streambuf over JavaScript ArrayBuffers +**Why:** Zero-copy bridge between `bare-fs` chunks and llama.cpp, supports incremental shard-by-shard loading **Cost:** Complex streambuf implementation, JavaScript reference lifecycle management
@@ -583,37 +611,30 @@ ML models can be gigabytes in size. llama.cpp expects either: 1. A file descriptor (simple but requires file on disk) 2. A buffer (via `std::streambuf` interface) -**Problem:** We need to load directly from Hyperdrive (P2P storage) without duplicating storage by saving to disk first. - -Alternative approach would be: download from Hyperdrive → save to temp file → pass file descriptor to llama.cpp. But this doubles storage requirements (Hyperdrive cache + temp file). +**Problem:** JavaScript reads files chunk-by-chunk via `bare-fs.createReadStream`. We want to feed those chunks into llama.cpp without first materialising the entire model in a single contiguous buffer (which would double RAM usage for multi-GB models). ### Decision -Implement custom `std::streambuf` over JavaScript-owned ArrayBuffers with incremental shard-by-shard loading, as provided by `qvac-lib-inference-addon-cpp` framework. This allows feeding buffer chunks from any source (Hyperdrive, HTTP, local files) directly to llama.cpp without intermediate file storage. - -JavaScript sends model data as buffer chunks, C++ wraps them in a `std::streambuf`, enabling llama.cpp to load sharded models incrementally with zero-copy access to JavaScript memory. +Use the custom `std::streambuf` over JavaScript-owned ArrayBuffers provided by `qvac-lib-inference-addon-cpp`. `GGMLBert._streamShards()` opens `bare-fs.createReadStream(path)` on each caller-supplied file, forwards each chunk into `addon.loadWeights({ filename, chunk, completed: false })`, and sends a final `{ completed: true }` marker per file. The addon appends each chunk as a blob in the streambuf; llama.cpp then reads across blobs without allocating a contiguous buffer. 
### Rationale -**Avoid Storage Duplication:** -- Load directly from Hyperdrive streams without saving to disk first -- No temporary files consuming additional storage -- Critical for mobile devices with limited storage -- Hyperdrive data stays in its cache, not duplicated +**Memory Efficiency:** +- No need to buffer the entire model in JS before handing it to llama.cpp +- C++ reads directly from JavaScript ArrayBuffer memory — no memcpy of multi-GB model files +- Works naturally with the chunked output of `bare-fs.createReadStream` -**Zero-Copy:** -- C++ reads directly from JavaScript ArrayBuffer memory -- No memcpy of multi-GB model files -- Further reduces memory footprint +**Sharded Loading:** +- Each file is delivered as a sequence of `{ filename, chunk, completed }` messages +- llama.cpp parses the GGUF index and loads tensors lazily across all shards -**Source Flexibility:** -- Works with any data source (Hyperdrive, HTTP, filesystem) -- Data loader provides buffer chunks, streambuf wrapper handles delivery to llama.cpp -- Same incremental loading path for all distribution methods -- Supports sharded GGUF files with incremental tensor loading +**Uniform Path:** +- The same streambuf path is used whether the caller has one file or many +- The addon does not care where the bytes came from — it only sees chunks and filenames ### Trade-offs -- ✅ Can report loading progress per chunk +- ✅ No contiguous multi-GB buffers in JS or C++ +- ✅ Same path for single-file and sharded models - ❌ Complex streambuf implementation with seeking across blobs - ❌ Must keep JS buffers alive during load, defer cleanup to correct thread - ❌ Seeking overhead O(N) across N blobs (acceptable, rarely needed) @@ -687,8 +708,8 @@ Support batch processing natively by accepting both single strings and arrays of
⚡ TL;DR -**Chose:** Promise-based exclusive run queue using `_withExclusiveRun()` wrapper -**Why:** Ensure atomic multi-step operations complete without interruption +**Chose:** Compose `exclusiveRunQueue()` from `@qvac/infer-base` to serialise `run()` and `unload()` calls +**Why:** Ensure atomic multi-step operations complete without interruption **Cost:** One inference request at a time per model instance
@@ -699,9 +720,9 @@ With addon-cpp ≥1.1.2, a single inference request is one `runJob({ type, input ### Decision -Implement JavaScript-level promise queue using `_withExclusiveRun()` helper so that only one `run()` (and thus one `runJob()`) is in progress at a time. This avoids races and ensures the addon’s single-job contract is respected. +Use the `exclusiveRunQueue()` helper from `@qvac/infer-base@^0.4.0`. The constructor stores the queue as `this._run`, and `run()` and `unload()` wrap their bodies with `this._run(() => …)`. This replaces the previous `BaseInference._withExclusiveRun()` template-method approach with a small composable utility, in line with the loader-removal refactor that dropped `BaseInference` inheritance. -**Note:** C++ level thread safety (mutex-protected job queue) and single-job semantics (runJob, cancel waits until stopped) are handled by the addon-cpp (≥1.1.1) framework. +**Note:** C++ level thread safety (mutex-protected job queue) and single-job semantics (runJob, cancel waits until stopped) are handled by the addon-cpp (≥1.1.2) framework. 
### Rationale @@ -779,4 +800,4 @@ Provide hand-written TypeScript definitions in `index.d.ts` alongside JavaScript **Related Document:** - [data-flows-detailed.md](data-flows-detailed.md) - Detailed data flow diagrams and sequences -**Last Updated:** 2026-02-17 +**Last Updated:** 2026-04-07 diff --git a/packages/qvac-lib-infer-llamacpp-embed/docs/data-flows-detailed.md b/packages/qvac-lib-infer-llamacpp-embed/docs/data-flows-detailed.md index 0f884081c3..d572abbce2 100644 --- a/packages/qvac-lib-infer-llamacpp-embed/docs/data-flows-detailed.md +++ b/packages/qvac-lib-infer-llamacpp-embed/docs/data-flows-detailed.md @@ -10,7 +10,7 @@ - [Model Loading Flow](#model-loading-flow) - [Batch Embedding Generation Flow](#batch-embedding-generation-flow) -- [Weight Loading Flow](#weight-loading-flow) +- [Weight Streaming Flow](#weight-streaming-flow) - [Single Text Embedding Flow](#single-text-embedding-flow) --- @@ -23,45 +23,40 @@ sequenceDiagram participant App as Application participant GGMLBert as GGMLBert - participant WP as WeightsProvider - participant DL as DataLoader + participant FS as bare-fs participant BI as BertInterface participant Addon as Addon participant BM as BertModel participant LLAMA as llama.cpp - - App->>GGMLBert: new GGMLBert(args, config) - GGMLBert->>GGMLBert: Store config, modelName, diskPath - GGMLBert->>WP: new WeightsProvider(loader) - - App->>GGMLBert: load(closeLoader, onProgress) - GGMLBert->>BI: new BertInterface(binding, params, callbacks) + + App->>GGMLBert: new GGMLBert({ files, config, logger, opts }) + GGMLBert->>GGMLBert: Store files array, config, opts + GGMLBert->>GGMLBert: createJobHandler + exclusiveRunQueue + + App->>GGMLBert: load() + GGMLBert->>GGMLBert: pick primaryGgufPath = first entry matching /-\d+-of-\d+\.gguf$/
(falls back to files.model[0] for non-sharded models) + GGMLBert->>BI: new BertInterface(binding, { path: primaryGgufPath, config }, outputCb) BI->>Addon: createInstance(params) Addon->>BM: BertModel(path, config, backendsDir) BM->>BM: Delayed init (InitLoader) - - alt Sharded Model - GGMLBert->>GGMLBert: _loadWeights(onProgress) - GGMLBert->>WP: streamFiles(shards, onChunk, onProgress) - loop For each shard - WP->>DL: getStream(shard) - DL-->>WP: Stream chunks - WP->>GGMLBert: onChunk(chunkedWeightsData) - GGMLBert->>BI: loadWeights({filename, chunk, completed}) - BI->>Addon: loadWeights(handle, data) - Addon->>BM: set_weights_for_file(filename, streambuf) - BM->>LLAMA: llama_model_load_fulfill_split_future() - end - else Single File Model - GGMLBert->>GGMLBert: downloadWeights(onProgress) - GGMLBert->>WP: downloadFiles([modelName], diskPath, opts) - loop Download progress - WP->>DL: download(modelName, diskPath) - DL-->>WP: Progress updates - WP->>GGMLBert: onDownloadProgress(bytes) + + alt Sharded Model (files.model.length > 1) + GGMLBert->>GGMLBert: _streamShards() + loop For each absolute path in files.model (in order) + GGMLBert->>FS: fs.createReadStream(absolutePath) + loop For each chunk + FS-->>GGMLBert: chunk + GGMLBert->>BI: loadWeights({filename, chunk, completed: false}) + BI->>Addon: loadWeights(handle, data) + Addon->>BM: Append blob to streambuf for filename + end + GGMLBert->>BI: loadWeights({filename, chunk: null, completed: true}) + BI->>Addon: mark file complete end + else Single File Model (files.model.length == 1) + Note over GGMLBert: Skip streaming — addon will read path directly on activate() end - + GGMLBert->>BI: activate() BI->>Addon: activate(handle) Addon->>BM: load() @@ -70,7 +65,7 @@ sequenceDiagram BM->>BM: initializeBackend(backendsDir) BM->>BM: setupParams(modelPath, config) BM->>LLAMA: initFromConfig(params, path, streams, shards) - LLAMA->>LLAMA: Load model weights + LLAMA->>LLAMA: Load model weights (from streambufs or path) LLAMA-->>BM:
model, context BM->>BM: Initialize batch, vocab, pooling BM-->>Addon: Model loaded @@ -79,31 +74,42 @@ sequenceDiagram GGMLBert-->>App: Model loaded ``` +### Caller Contract for `files.model` + +- **Absolute paths only.** `GGMLBert` does not resolve relative paths or discover companion files. +- **Order matters.** For sharded GGUFs, callers pass the `.tensors.txt` companion first, then shards `00001-of-N`, `00002-of-N`, …, `N-of-N` in numeric order. The addon scans the array for the first entry matching the shard regex `^(.+)-(\d+)-of-(\d+)\.gguf$` and uses that as the primary path handed to llama.cpp's `params.model.path`. For non-sharded single-file models, the only entry is used. The `.tensors.txt` file is consumed by the streaming layer (along with the shards) but is never the primary path. +- **All files required.** Every shard and the `.tensors.txt` file must be present in the array; missing any file will fail at load time. +- **No download step.** The addon reads bytes from disk via `bare-fs`. Distribution, caching, and integrity are the caller's responsibility. 
+ ### Sharded Model Loading Detail ```mermaid sequenceDiagram - participant JS as JavaScript + participant App as Application + participant GGMLBert as GGMLBert + participant FS as bare-fs participant Cpp as C++ Addon participant Stream as BlobsStream participant LLAMA as llama.cpp - - Note over JS: WeightsProvider streams shards - JS->>Cpp: loadWeights({filename: "model-00001-of-00005.gguf", chunk: ArrayBuffer, completed: false}) - Cpp->>Stream: Append blob to streambuf - Stream->>Stream: Store ArrayBuffer reference - - JS->>Cpp: loadWeights({filename: "model-00002-of-00005.gguf", chunk: ArrayBuffer, completed: false}) - Cpp->>Stream: Append blob to streambuf - - Note over JS: Last shard - JS->>Cpp: loadWeights({filename: "model-00005-of-00005.gguf", chunk: ArrayBuffer, completed: true}) - Cpp->>Stream: Append final blob, mark complete - + + Note over App: Caller passes every file explicitly + App->>GGMLBert: new GGMLBert({ files: { model: [tensorsTxt, shard1..shardN] }, ... }) + App->>GGMLBert: load() + + loop For each absolute path (in order) + GGMLBert->>FS: createReadStream(path) + loop For each chunk + FS-->>GGMLBert: Buffer chunk + GGMLBert->>Cpp: loadWeights({filename, chunk, completed: false}) + Cpp->>Stream: Append blob to streambuf (zero-copy) + end + GGMLBert->>Cpp: loadWeights({filename, chunk: null, completed: true}) + Cpp->>Stream: Mark file complete + end + Note over Cpp: activate() called - Cpp->>LLAMA: llama_model_load() with streambuf - LLAMA->>Stream: seekg(), read() operations - Stream->>Stream: Navigate across blobs + Cpp->>LLAMA: llama_model_load() with streambufs + LLAMA->>Stream: seekg(), read() operations across blobs Stream-->>LLAMA: Model weight data LLAMA->>LLAMA: Parse GGUF, load tensors LLAMA-->>Cpp: Model loaded @@ -195,47 +201,38 @@ flowchart TD --- -## Weight Loading Flow +## Weight Streaming Flow + +### Direct File Streaming Sequence -### Streaming Weight Loading Sequence +`GGMLBert` has no `WeightsProvider` and no data loader. 
It streams each caller-supplied absolute path straight from disk using `bare-fs.createReadStream` and forwards chunks to the native addon. Distribution (downloading, P2P, cache, integrity) happens entirely outside this package. ```mermaid sequenceDiagram - participant JS as JavaScript - participant WP as WeightsProvider - participant DL as DataLoader + participant JS as GGMLBert + participant FS as bare-fs participant BI as BertInterface participant Addon as Addon participant Stream as BlobsStream participant LLAMA as llama.cpp - - JS->>WP: streamFiles(shards, onChunk, onProgress) - WP->>WP: Expand shards from modelName - - loop For each shard file - WP->>DL: getStream(shard) - DL-->>WP: AsyncIterable - + + Note over JS: Caller passed files.model = [file1, file2, ...] + loop For each absolute path (in order) + JS->>FS: fs.createReadStream(path) loop For each chunk - DL-->>WP: Uint8Array chunk - WP->>WP: Track progress - WP->>JS: onProgress({currentFile, currentFileProgress, overallProgress}) - WP->>JS: onChunk({filename, chunk, completed: false}) + FS-->>JS: Buffer chunk + JS->>BI: loadWeights({filename, chunk, completed: false}) + BI->>Addon: loadWeights(handle, data) + Addon->>Stream: Append blob (zero-copy ArrayBuffer ref) end - - DL-->>WP: Stream complete - WP->>JS: onChunk({filename, chunk: null, completed: true}) + FS-->>JS: stream end + JS->>BI: loadWeights({filename, chunk: null, completed: true}) + BI->>Addon: loadWeights(handle, data) + Addon->>Stream: Mark file complete end - - Note over JS: All shards streamed - JS->>BI: loadWeights({filename, chunk, completed}) - BI->>Addon: loadWeights(handle, data) - Addon->>Addon: Convert Uint8Array to std::streambuf - Addon->>Stream: Append blob - Stream->>Stream: Store ArrayBuffer reference (zero-copy) - + Note over Addon: activate() called later - Addon->>LLAMA: llama_model_load() with streambuf + Addon->>LLAMA: llama_model_load() with streambufs LLAMA->>Stream: seekg(offset) Stream->>Stream: Find blob containing 
offset Stream->>Stream: Calculate position within blob @@ -372,4 +369,4 @@ flowchart TD --- -**Last Updated:** 2026-02-17 +**Last Updated:** 2026-04-07 diff --git a/packages/qvac-lib-infer-llamacpp-embed/examples/batchInference.js b/packages/qvac-lib-infer-llamacpp-embed/examples/batchInference.js index 563aae0fe6..8a6af3c6b8 100644 --- a/packages/qvac-lib-infer-llamacpp-embed/examples/batchInference.js +++ b/packages/qvac-lib-infer-llamacpp-embed/examples/batchInference.js @@ -1,6 +1,6 @@ 'use strict' -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const GGMLBert = require('../index') const { downloadModel } = require('./utils') @@ -14,25 +14,19 @@ async function main () { 'gte-large_fp16.gguf' ) - // 2. Initializing data loader - const fsDL = new FilesystemDL({ dirPath }) - - // 3. Configuring model settings - const args = { - loader: fsDL, + // 2. Configuring model settings + const model = new GGMLBert({ + files: { model: [path.join(dirPath, modelName)] }, + config: { device: 'gpu', gpu_layers: '25', batch_size: '128' }, logger: console, - opts: { stats: true }, - diskPath: dirPath, - modelName - } - const config = { device: 'gpu', gpu_layers: '25', batch_size: '128' } // large enough batch size to run all test prompts in one pass + opts: { stats: true } + }) - // 4. Loading model - const model = new GGMLBert(args, config) + // 3. Loading model await model.load() try { - // 5. Generating embeddings (all prompts in one batch) + // 4. Generating embeddings (all prompts in one batch) const prompts = [ 'Hello, can you suggest a game I can play with my 1 year old daughter?', 'What is the capital of Great Britain?', @@ -53,9 +47,8 @@ async function main () { console.error('Error occurred:', errorMessage) console.error('Error details:', error) } finally { - // 6. Cleaning up resources + // 5. 
Cleaning up resources await model.unload() - await fsDL.close() } } diff --git a/packages/qvac-lib-infer-llamacpp-embed/examples/nativelog.js b/packages/qvac-lib-infer-llamacpp-embed/examples/nativelog.js index 29dd9c7b2a..e0161c903b 100644 --- a/packages/qvac-lib-infer-llamacpp-embed/examples/nativelog.js +++ b/packages/qvac-lib-infer-llamacpp-embed/examples/nativelog.js @@ -1,6 +1,6 @@ 'use strict' -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const GGMLBert = require('../index.js') const { setLogger, releaseLogger } = require('../addonLogging.js') const { downloadModel } = require('./utils') @@ -36,25 +36,19 @@ async function main () { 'gte-large_fp16.gguf' ) - // 3. Initializing data loader - const fsDL = new FilesystemDL({ dirPath }) - - // 4. Configuring model settings - const args = { - loader: fsDL, + // 3. Configuring model settings + const model = new GGMLBert({ + files: { model: [path.join(dirPath, modelName)] }, + config: { device: 'gpu', gpu_layers: '25', verbosity: '2' }, logger: console, - opts: { stats: true }, - diskPath: dirPath, - modelName - } - const config = { device: 'gpu', gpu_layers: '25', verbosity: '2' } + opts: { stats: true } + }) - // 5. Loading model - const model = new GGMLBert(args, config) + // 4. Loading model await model.load() try { - // 6. Generating embeddings + // 5. Generating embeddings const query = 'Hello, can you suggest a game I can play with my 1 year old daughter?' const response = await model.run(query) const embeddings = await response.await() @@ -67,9 +61,8 @@ async function main () { console.error('Error occurred:', errorMessage) console.error('Error details:', error) } finally { - // 7. Cleaning up resources + // 6. 
Cleaning up resources await model.unload() - await fsDL.close() releaseLogger() } } diff --git a/packages/qvac-lib-infer-llamacpp-embed/examples/quickstart.js b/packages/qvac-lib-infer-llamacpp-embed/examples/quickstart.js index 4043187229..9218bf2112 100644 --- a/packages/qvac-lib-infer-llamacpp-embed/examples/quickstart.js +++ b/packages/qvac-lib-infer-llamacpp-embed/examples/quickstart.js @@ -1,6 +1,6 @@ 'use strict' -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const GGMLBert = require('../index') const { downloadModel } = require('./utils') @@ -14,25 +14,19 @@ async function main () { 'gte-large_fp16.gguf' ) - // 2. Initializing data loader - const fsDL = new FilesystemDL({ dirPath }) - - // 3. Configuring model settings - const args = { - loader: fsDL, + // 2. Configuring model settings + const model = new GGMLBert({ + files: { model: [path.join(dirPath, modelName)] }, + config: { device: 'gpu', gpu_layers: '25' }, logger: console, - opts: { stats: true }, - diskPath: dirPath, - modelName - } - const config = { device: 'gpu', gpu_layers: '25' } + opts: { stats: true } + }) - // 4. Loading model - const model = new GGMLBert(args, config) + // 3. Loading model await model.load() try { - // 5. Generating embeddings + // 4. Generating embeddings const query = 'Hello, can you suggest a game I can play with my 1 year old daughter?' const response = await model.run(query) const embeddings = await response.await() @@ -45,9 +39,8 @@ async function main () { console.error('Error occurred:', errorMessage) console.error('Error details:', error) } finally { - // 6. Cleaning up resources + // 5. 
Cleaning up resources await model.unload() - await fsDL.close() } } diff --git a/packages/qvac-lib-infer-llamacpp-embed/index.d.ts b/packages/qvac-lib-infer-llamacpp-embed/index.d.ts index 8909c32abb..02b3579fa5 100644 --- a/packages/qvac-lib-infer-llamacpp-embed/index.d.ts +++ b/packages/qvac-lib-infer-llamacpp-embed/index.d.ts @@ -1,22 +1,7 @@ -import BaseInference, { - ReportProgressCallback -} from '@qvac/infer-base/WeightsProvider/BaseInference' import type { QvacResponse } from '@qvac/infer-base' - -export { ReportProgressCallback, QvacResponse } -import type WeightsProvider from '@qvac/infer-base/WeightsProvider/WeightsProvider' import type QvacLogger from '@qvac/logging' -export interface Loader { - ready(): Promise - close(): Promise - getStream(path: string): Promise> - download( - path: string, - opts: { diskPath: string; progressReporter?: unknown } - ): Promise<{ await(): Promise }> - getFileSize?(path: string): Promise -} +export { QvacResponse } export interface Addon { loadWeights(data: { filename: string; chunk: Uint8Array | null; completed: boolean }): Promise @@ -26,26 +11,6 @@ export interface Addon { unload(): Promise } -export interface GGMLArgs { - loader: Loader - logger?: QvacLogger | Console | null - opts?: { stats?: boolean } - diskPath?: string - modelName: string - modelPath?: string - exclusiveRun?: boolean -} - -export interface DownloadWeightsOptions { - closeLoader?: boolean -} - -export interface DownloadResult { - filePath: string | null - error: boolean - completed: boolean -} - export type NumericLike = `${number}` export interface GGMLConfig { @@ -63,6 +28,13 @@ export interface GGMLConfig { [key: string]: string | number | boolean | string[] | undefined } +export interface GGMLBertArgs { + files: { model: string[] } + config?: GGMLConfig + logger?: QvacLogger | Console | null + opts?: { stats?: boolean } +} + export interface AddonConfigurationParams { path: string config: GGMLConfig @@ -78,47 +50,23 @@ export interface 
RuntimeStats { backendDevice: 'cpu' | 'gpu' } -export default class GGMLBert extends BaseInference { - protected addon: Addon - - weightsProvider: WeightsProvider - - constructor(args: GGMLArgs, config: GGMLConfig) - - _load( - closeLoader?: boolean, - reportProgressCallback?: ReportProgressCallback | ((bytes: number) => void) - ): Promise - - load( - closeLoader?: boolean, - reportProgressCallback?: ReportProgressCallback | ((bytes: number) => void) - ): Promise - - downloadWeights( - onDownloadProgress?: (progress: Record, opts: DownloadWeightsOptions) => any, - opts?: DownloadWeightsOptions - ): Promise> - - _downloadWeights( - onDownloadProgress?: (progress: Record, opts: DownloadWeightsOptions) => any, - opts?: DownloadWeightsOptions - ): Promise> - - protected _loadWeights( - reportProgressCallback?: ReportProgressCallback | ((bytes: number) => void) - ): Promise +export default class GGMLBert { + protected addon: Addon | null + opts: { stats?: boolean } + logger: QvacLogger + state: { configLoaded: boolean } - protected _createAddon(configurationParams: AddonConfigurationParams): Addon - - _runInternal(text: string | string[]): Promise + constructor(args: GGMLBertArgs) + load(): Promise run(text: string | string[]): Promise - + unload(): Promise cancel(): Promise + getState(): { configLoaded: boolean } } export { GGMLBert } + export interface AddonLogging { setLogger(callback: (priority: number, message: string) => void): void releaseLogger(): void @@ -129,12 +77,12 @@ export class BertInterface implements Addon { constructor( binding: unknown, configurationParams: AddonConfigurationParams, - outputCb: (addon: unknown, event: string, jobId: number, data: unknown, error?: Error) => void + outputCb: (addon: unknown, event: string, data: unknown, error?: Error) => void ) - + loadWeights(data: { filename: string; chunk: Uint8Array | null; completed: boolean }): Promise activate(): Promise - runJob(input: { type: 'text' | 'sequences'; input?: string | string[] 
}): Promise + runJob(input: { type: 'text' | 'sequences'; input?: string | string[] }): Promise cancel(): Promise unload(): Promise } diff --git a/packages/qvac-lib-infer-llamacpp-embed/index.js b/packages/qvac-lib-infer-llamacpp-embed/index.js index 1173c2ba31..cb59870fa7 100644 --- a/packages/qvac-lib-infer-llamacpp-embed/index.js +++ b/packages/qvac-lib-infer-llamacpp-embed/index.js @@ -1,167 +1,164 @@ 'use strict' +const fs = require('bare-fs') const path = require('bare-path') -const BaseInference = require('@qvac/infer-base/WeightsProvider/BaseInference') -const WeightsProvider = require('@qvac/infer-base/WeightsProvider/WeightsProvider') -const { BertInterface } = require('./addon') +const QvacLogger = require('@qvac/logging') +const { createJobHandler, exclusiveRunQueue } = require('@qvac/infer-base') +const { BertInterface, mapAddonEvent } = require('./addon') const RUN_BUSY_ERROR_MESSAGE = 'Cannot set new job: a job is already set or being processed' /** - * GGML client implementation for BERT GTE model + * Picks the primary GGUF path from an ordered file list. + * + * For sharded models the caller passes + * `[tensors.txt, shard-00001-of-N.gguf, ..., shard-N-of-N.gguf]`. + * The first entry matching the shard regex is returned so the value matches + * the C++ `GGUFShards::expandGGUFIntoShards` regex contract. + * For non-sharded single-file models the only entry is returned. + * + * @param {string[]} files - ordered array of absolute paths + * @returns {string} the primary GGUF path */ -class GGMLBert extends BaseInference { - /** - * Creates an instance of GGMLBert. 
- * @constructor - * @param {Object} params - arguments for model setup - * @param {Object} args arguments for inference setup - * @param {Object} config - environment specific inference setup configuration - */ - constructor ( - { opts = {}, loader, logger = null, diskPath = '.', modelName }, - config = {} - ) { - super({ logger, opts }) +function pickPrimaryGgufPath (files) { + const SHARD_REGEX = /-\d+-of-\d+\.gguf$/ + return files.find((p) => SHARD_REGEX.test(p)) || files[0] +} + +class GGMLBert { + constructor ({ files, config = {}, logger = null, opts = {} }) { + if (!files || !Array.isArray(files.model) || files.model.length === 0) { + throw new TypeError('files.model must be a non-empty array of absolute paths') + } + this._files = files.model this._config = config - this._diskPath = diskPath - this._modelName = modelName - // _shards will be null if the modelName is not a sharded file. - this._shards = WeightsProvider.expandGGUFIntoShards(this._modelName) - this.weightsProvider = new WeightsProvider(loader, this.logger) + this.logger = new QvacLogger(logger) + this.opts = opts + // The cancel closure dereferences `this.addon` lazily, so it is safe even though + // `this.addon` is `null` at construction time — it is only invoked from + // `response.cancel()` after `_load()` has assigned the addon. The optional chain + // also makes a stale `response.cancel()` after `unload()` a no-op. 
+ this._job = createJobHandler({ cancel: () => this.addon?.cancel() }) + this._run = exclusiveRunQueue() + this.addon = null this._hasActiveResponse = false + this.state = { configLoaded: false } } - async _load (closeLoader = false, reportProgressCallback) { - this.logger.info('Starting model load') + async load () { + if (this.state.configLoaded) return + await this._load() + this.state.configLoaded = true + } + async _load () { + this.logger.info('Starting model load') + const primaryGgufPath = pickPrimaryGgufPath(this._files) const configurationParams = { - path: path.join(this._diskPath, this._modelName), + path: primaryGgufPath, config: this._config } - - this.logger.info('Creating addon with configuration:', configurationParams) this.addon = this._createAddon(configurationParams) - if (this._shards !== null) { - await this._loadWeights(reportProgressCallback) - } else { - await this.downloadWeights(reportProgressCallback, { closeLoader }) + try { + if (this._files.length > 1) { + await this._streamShards() + } + await this.addon.activate() + } catch (loadError) { + // Best-effort cleanup of the partially-initialized addon so a subsequent + // load() does not leak a zombie native instance. + try { await this.addon?.unload?.() } catch (_) {} + this.addon = null + throw loadError } - - this.logger.info('Activating addon') - await this.addon.activate() - this.logger.info('Model load completed successfully') } - /** - * Download the model weight files and return the local path to the primary file. 
- * @param {ProgressReportCallback} [onDownloadProgress] - Callback invoked with bytes downloaded - * @param {Object} opts - Options for the download - * @param {boolean} opts.closeLoader - Whether to close the loader when done - * @returns {Promise<{filePath: string, completed: boolean, error: boolean}[]>} Local file path for the model weights - */ - async _downloadWeights (onDownloadProgress, opts) { - return await this.weightsProvider.downloadFiles( - [this._modelName], - this._diskPath, - { - closeLoader: opts.closeLoader, - onDownloadProgress + async _streamShards () { + for (const filePath of this._files) { + const filename = path.basename(filePath) + const stream = fs.createReadStream(filePath) + for await (const chunk of stream) { + await this.addon.loadWeights({ filename, chunk, completed: false }) } - ) + await this.addon.loadWeights({ filename, chunk: null, completed: true }) + this.logger.info(`Streamed weights for ${filename}`) + } } - async _loadWeights (reportProgressCallback) { - const onChunk = async (chunkedWeightsData) => { - this.addon.loadWeights(chunkedWeightsData, this.logger) - } - await this.weightsProvider.streamFiles(this._shards, onChunk, reportProgressCallback) + async run (input) { + return this._run(() => this._runInternal(input)) } - /** - * Cancel the current task. - */ - async cancel () { - if (this.addon?.cancel) { - await this.addon.cancel() + async _runInternal (text) { + if (!this.addon) { + throw new Error('Addon not initialized. Call load() first.') + } + if (this._hasActiveResponse) { + throw new Error(RUN_BUSY_ERROR_MESSAGE) } - } - /** - * Unload the model and clear resources. Ensures any in-flight job is resolved as failed. 
- * @returns {Promise} - */ - async unload () { - return await this._withExclusiveRun(async () => { - await this.cancel() - const currentJobResponse = this._jobToResponse.get('OnlyOneJob') - if (currentJobResponse) { - // Make sure not to leak jobs to avoid "job already exists" errors after - // loading the model again. - currentJobResponse.failed(new Error('Model was unloaded')) - this._deleteJobMapping('OnlyOneJob') - } - this._hasActiveResponse = false - await super.unload() + this.logger.info('Starting inference embeddings for text:', text) + const inputData = Array.isArray(text) + ? { type: 'sequences', input: text } + : { type: 'text', input: text } + + const response = this._job.start() + + let accepted + try { + accepted = await this.addon.runJob(inputData) + } catch (error) { + this._job.fail(error) + throw error + } + if (!accepted) { + this._job.fail(new Error(RUN_BUSY_ERROR_MESSAGE)) + throw new Error(RUN_BUSY_ERROR_MESSAGE) + } + + this._hasActiveResponse = true + const finalized = response.await().finally(() => { this._hasActiveResponse = false }) + finalized.catch((err) => { + this.logger?.warn?.('Inference response rejected:', err?.message || err) }) + response.await = () => finalized + return response } - async _runInternal (text) { - return this._withExclusiveRun(async () => { - if (this._hasActiveResponse) { - throw new Error(RUN_BUSY_ERROR_MESSAGE) - } + _addonOutputCallback (addon, event, data, error) { + // Event-name normalization lives in `addon.js` (`mapAddonEvent`) so the + // native binding wrapper owns the C++ event vocabulary. This shim only + // dispatches the resulting logical event onto the active job. + const mapped = mapAddonEvent(event, data, error) + if (mapped === null) { + // Unknown event type — log it instead of feeding the payload into the + // active response output stream as if it were embedding data. 
The + // native layer is expected to emit only `Embeddings`, `Error`, or + // stats; reaching this branch indicates a native-layer change worth + // surfacing. + this.logger.warn(`Unhandled addon event: ${event} (data type: ${typeof data})`) + return + } - this.logger.info('Starting inference embeddings for text:', text) - - // Detect arrays and set type: 'sequences' for direct vector passing - // Otherwise use type: 'text' for string input - const inputData = Array.isArray(text) - ? { type: 'sequences', input: text } - : { type: 'text', input: text } - - const response = this._createResponse('OnlyOneJob') - - // addon-cpp C++ guarantees no events will be generated until job is - // fully accepted. If runJob throws or returns false, no events will be - // generated for this job. - let accepted - try { - accepted = await this.addon.runJob(inputData) - } catch (error) { - this._deleteJobMapping('OnlyOneJob') - response.failed(error) - throw error - } - if (!accepted) { - this._deleteJobMapping('OnlyOneJob') - response.failed(new Error(RUN_BUSY_ERROR_MESSAGE)) - throw new Error(RUN_BUSY_ERROR_MESSAGE) - } + if (mapped.type === 'Error') { + this.logger.error('Job failed with error:', mapped.error) + this._job.fail(mapped.error) + return + } - this._hasActiveResponse = true - const finalized = response.await().finally(() => { this._hasActiveResponse = false }) - finalized.catch(() => {}) - response.await = () => finalized + if (mapped.type === 'JobEnded') { + this._job.end(this.opts.stats ? mapped.data : null) + return + } - return response - }) + if (mapped.type === 'Output') { + this._job.output(mapped.data) + } } - /** - * Instantiate the native addon with the given parameters. 
- * @param {Object} configurationParams - Configuration parameters for the addon - * @param {string} configurationParams.path - Local file or directory path - * @param {Object} configurationParams.settings - Bert-specific settings - * @returns {Addon} The instantiated addon interface - */ _createAddon (configurationParams) { - this.logger.info( - 'Creating Bert interface with configuration:', - configurationParams - ) const binding = require('./binding') return new BertInterface( binding, @@ -170,31 +167,31 @@ class GGMLBert extends BaseInference { ) } - _addonOutputCallback (addon, event, data, error) { - // Map C++ mangled type names to expected event names - // Stats / job-ended: LLM uses tokens_per_second; embed uses total_tokens, total_time_ms, etc. (RuntimeStats) - const isStatsData = typeof data === 'object' && data !== null && ( - 'tokens_per_second' in data || - ('total_tokens' in data || 'total_time_ms' in data || 'batch_size' in data || 'context_size' in data) - ) - if (isStatsData) { - const runtimeStats = { ...data } - if (runtimeStats.backendDevice === 0) { - runtimeStats.backendDevice = 'cpu' - } else if (runtimeStats.backendDevice === 1) { - runtimeStats.backendDevice = 'gpu' + async unload () { + return this._run(async () => { + await this.cancel() + if (this._job.active) { + this._job.fail(new Error('Model was unloaded')) } - return this._outputCallback(addon, 'JobEnded', 'OnlyOneJob', runtimeStats, null) - } + this._hasActiveResponse = false + if (this.addon) { + await this.addon.unload() + // Null the addon reference so post-unload `cancel()` / `run()` calls hit the + // `if (!this.addon)` guard instead of dereferencing a disposed native handle. 
+ this.addon = null + } + this.state.configLoaded = false + }) + } - let mappedEvent = event - if (event.includes('Error')) { - mappedEvent = 'Error' - } else if (event.includes('Embeddings')) { - mappedEvent = 'Output' + async cancel () { + if (this.addon?.cancel) { + await this.addon.cancel() } - return this._outputCallback(addon, mappedEvent, 'OnlyOneJob', data, error) } + + getState () { return this.state } } module.exports = GGMLBert +module.exports.pickPrimaryGgufPath = pickPrimaryGgufPath diff --git a/packages/qvac-lib-infer-llamacpp-embed/package.json b/packages/qvac-lib-infer-llamacpp-embed/package.json index de217aa945..6efd365ed0 100644 --- a/packages/qvac-lib-infer-llamacpp-embed/package.json +++ b/packages/qvac-lib-infer-llamacpp-embed/package.json @@ -1,6 +1,6 @@ { "name": "@qvac/embed-llamacpp", - "version": "0.13.4", + "version": "0.14.0", "description": "bert addon for qvac", "addon": true, "engines": { @@ -59,9 +59,7 @@ "bugs": "https://github.com/tetherto/qvac/issues", "homepage": "https://github.com/tetherto/qvac/tree/main/packages/qvac-lib-infer-llamacpp-embed#readme", "devDependencies": { - "@qvac/dl-filesystem": "^0.1.2", "@types/node": "^24.2.1", - "bare-fs": "^4.5.1", "brittle": "^3.4.0", "cmake-bare": "^1.7.1", "cmake-vcpkg": "^1.1.0", @@ -69,13 +67,11 @@ "typescript": "^5.9.2" }, "dependencies": { - "@qvac/infer-base": "^0.2.2", + "@qvac/infer-base": "^0.4.0", "@qvac/logging": "^0.1.0", + "bare-fs": "^4.5.1", "bare-path": "^3.0.0" }, - "peerDependencies": { - "@qvac/dl-hyperdrive": "^0.1.0" - }, "exports": { "./package": "./package.json", ".": { diff --git a/packages/qvac-lib-infer-llamacpp-embed/test/integration/addon.test.js b/packages/qvac-lib-infer-llamacpp-embed/test/integration/addon.test.js index c8f31983fd..73d48f89c4 100644 --- a/packages/qvac-lib-infer-llamacpp-embed/test/integration/addon.test.js +++ b/packages/qvac-lib-infer-llamacpp-embed/test/integration/addon.test.js @@ -77,7 +77,7 @@ createDeviceModelTest('Model inference 
works correctly', async (t, modelName, mo const embeddingDimension = modelConfig.embeddingDimension console.log(`Creating new GGMLBert instance for ${modelName} on ${device.toUpperCase()}`) - const { inference, loader } = await createEmbeddingsTestInstance(t, modelName, device, null, DEFAULT_BATCH_SIZE) + const { inference } = await createEmbeddingsTestInstance(t, modelName, device, null, DEFAULT_BATCH_SIZE) const sentence = 'That is a happy person' const response = await inference.run(sentence) @@ -87,7 +87,7 @@ createDeviceModelTest('Model inference works correctly', async (t, modelName, mo console.log('Generated embeddings:', embeddings[0][0]) t.teardown(async () => { - await cleanupResources(loader, inference) + await cleanupResources(inference) }) }) @@ -95,7 +95,7 @@ createDeviceModelTest('Model inference works correctly with array input', async const embeddingDimension = modelConfig.embeddingDimension console.log(`Creating new GGMLBert instance for array input test [${modelName}] on ${device.toUpperCase()}`) - const { inference, loader } = await createEmbeddingsTestInstance(t, modelName, device, null, DEFAULT_BATCH_SIZE) + const { inference } = await createEmbeddingsTestInstance(t, modelName, device, null, DEFAULT_BATCH_SIZE) const sentences = ['That is a happy person', 'This is a sad person', 'I am feeling neutral'] const response = await inference.run(sentences) @@ -121,14 +121,14 @@ createDeviceModelTest('Model inference works correctly with array input', async } t.teardown(async () => { - await cleanupResources(loader, inference) + await cleanupResources(inference) }) }) createDeviceModelTest('Model inference works correctly with long string exceeding context size', async (t, modelName, modelConfig, device) => { const maxContextSize = modelConfig.maxContextSize - const { inference, loader } = await createEmbeddingsTestInstance(t, modelName, device, null, DEFAULT_BATCH_SIZE) + const { inference } = await createEmbeddingsTestInstance(t, modelName, device, 
null, DEFAULT_BATCH_SIZE) // Create a string that exceeds maxContextSize tokens // "Hello world " is approximately 2-3 tokens, repeating enough times to exceed context size @@ -183,14 +183,14 @@ createDeviceModelTest('Model inference works correctly with long string exceedin await inference.cancel() t.teardown(async () => { - await cleanupResources(loader, inference) + await cleanupResources(inference) }) }) createDeviceModelTest('Model inference works correctly with array input where one sequence exceeds context size', async (t, modelName, modelConfig, device) => { const maxContextSize = modelConfig.maxContextSize - const { inference, loader } = await createEmbeddingsTestInstance(t, modelName, device, null, DEFAULT_BATCH_SIZE) + const { inference } = await createEmbeddingsTestInstance(t, modelName, device, null, DEFAULT_BATCH_SIZE) // Create an array with 3 sequences where the second sequence exceeds context size // "Hello world " is approximately 2-3 tokens, repeating enough times to exceed context size @@ -251,7 +251,7 @@ createDeviceModelTest('Model inference works correctly with array input where on await inference.cancel() t.teardown(async () => { - await cleanupResources(loader, inference) + await cleanupResources(inference) }) }) @@ -260,7 +260,7 @@ createDeviceModelTest('Model inference works correctly with batching - 5 sequenc const maxContextSize = modelConfig.maxContextSize console.log(`Creating new GGMLBert instance for batching test [${modelName}] on ${device.toUpperCase()}`) - const { inference, loader } = await createEmbeddingsTestInstance(t, modelName, device, null, DEFAULT_BATCH_SIZE) + const { inference } = await createEmbeddingsTestInstance(t, modelName, device, null, DEFAULT_BATCH_SIZE) // Create 5 sequences of roughly similar length. 
// The goal is to have enough total tokens so that: @@ -305,12 +305,12 @@ createDeviceModelTest('Model inference works correctly with batching - 5 sequenc } t.teardown(async () => { - await cleanupResources(loader, inference) + await cleanupResources(inference) }) }) createDeviceModelTest('Embeddings: empty string input', async (t, modelName, modelConfig, device) => { - const { inference, loader } = await createEmbeddingsTestInstance(t, modelName, device, null, DEFAULT_BATCH_SIZE) + const { inference } = await createEmbeddingsTestInstance(t, modelName, device, null, DEFAULT_BATCH_SIZE) const sentence = '' const response = await inference.run(sentence) @@ -326,14 +326,14 @@ createDeviceModelTest('Embeddings: empty string input', async (t, modelName, mod } t.teardown(async () => { - await cleanupResources(loader, inference) + await cleanupResources(inference) }) }) createDeviceModelTest('Embeddings: whitespace-only string input', async (t, modelName, modelConfig, device) => { const embeddingDimension = modelConfig.embeddingDimension - const { inference, loader } = await createEmbeddingsTestInstance(t, modelName, device, null, DEFAULT_BATCH_SIZE) + const { inference } = await createEmbeddingsTestInstance(t, modelName, device, null, DEFAULT_BATCH_SIZE) const sentence = ' \t \n ' const response = await inference.run(sentence) @@ -345,14 +345,14 @@ createDeviceModelTest('Embeddings: whitespace-only string input', async (t, mode ) t.teardown(async () => { - await cleanupResources(loader, inference) + await cleanupResources(inference) }) }) createDeviceModelTest('Embeddings: unicode / multilingual input with emojis', async (t, modelName, modelConfig, device) => { const embeddingDimension = modelConfig.embeddingDimension - const { inference, loader } = await createEmbeddingsTestInstance(t, modelName, device, null, DEFAULT_BATCH_SIZE) + const { inference } = await createEmbeddingsTestInstance(t, modelName, device, null, DEFAULT_BATCH_SIZE) const sentences = ['Привет, как 
дела? 😊', '你好,世界 🌏', 'Hola, ¿cómo estás? ❤️', 'Hello, world! 🚀'] const response = await inference.run(sentences) @@ -365,14 +365,14 @@ createDeviceModelTest('Embeddings: unicode / multilingual input with emojis', as } t.teardown(async () => { - await cleanupResources(loader, inference) + await cleanupResources(inference) }) }) createDeviceModelTest('Embeddings: deterministic output for same input', async (t, modelName, modelConfig, device) => { const embeddingDimension = modelConfig.embeddingDimension - const { inference, loader } = await createEmbeddingsTestInstance(t, modelName, device, null, DEFAULT_BATCH_SIZE) + const { inference } = await createEmbeddingsTestInstance(t, modelName, device, null, DEFAULT_BATCH_SIZE) const sentence = 'This sentence should always map to the same embedding.' @@ -422,7 +422,7 @@ createDeviceModelTest('Embeddings: deterministic output for same input', async ( t.ok(similarity > 0.999, `Same input should produce identical embeddings (cosine similarity: ${similarity.toFixed(6)})`) t.teardown(async () => { - await cleanupResources(loader, inference) + await cleanupResources(inference) }) }) @@ -434,7 +434,7 @@ createDeviceModelTest(`Stress: inference with large batch size ${STRESS_BATCH_SI `(batch_size=${STRESS_BATCH_SIZE})` ) - const { inference, loader } = await createEmbeddingsTestInstance( + const { inference } = await createEmbeddingsTestInstance( t, modelName, device, @@ -464,7 +464,7 @@ createDeviceModelTest(`Stress: inference with large batch size ${STRESS_BATCH_SI } t.teardown(async () => { - await cleanupResources(loader, inference) + await cleanupResources(inference) }) }) @@ -476,7 +476,7 @@ createDeviceModelTest(`Stress: inference with many sequences (~${STRESS_NUM_SEQU `(num_sequences=${STRESS_NUM_SEQUENCES})` ) - const { inference, loader } = await createEmbeddingsTestInstance( + const { inference } = await createEmbeddingsTestInstance( t, modelName, device, @@ -515,14 +515,14 @@ createDeviceModelTest(`Stress: inference 
with many sequences (~${STRESS_NUM_SEQU } t.teardown(async () => { - await cleanupResources(loader, inference) + await cleanupResources(inference) }) }) createDeviceModelTest('Cancel: immediate cancel returns fewer embeddings than full run', async (t, modelName, modelConfig, device) => { console.log(`Creating GGMLBert instance for cancel comparison test [${modelName}] on ${device.toUpperCase()}`) - const { inference, loader } = await createEmbeddingsTestInstance( + const { inference } = await createEmbeddingsTestInstance( t, modelName, device, @@ -570,7 +570,7 @@ createDeviceModelTest('Cancel: immediate cancel returns fewer embeddings than fu ) t.teardown(async () => { - await cleanupResources(loader, inference) + await cleanupResources(inference) }) }) @@ -580,7 +580,7 @@ const MODEL_NAME_API = getModelConfigs()[0]?.modelName ?? 'embeddinggemma-300M-Q async function setupModelApiBehavior (t) { await ensureModel(MODEL_NAME_API) - const { inference, loader } = await createEmbeddingsTestInstance( + const { inference } = await createEmbeddingsTestInstance( t, MODEL_NAME_API, DEVICE_API, @@ -588,7 +588,7 @@ async function setupModelApiBehavior (t) { '1024' ) t.teardown(async () => { - await cleanupResources(loader, inference) + await cleanupResources(inference) }) return { inference } } diff --git a/packages/qvac-lib-infer-llamacpp-embed/test/integration/multi-instance.test.js b/packages/qvac-lib-infer-llamacpp-embed/test/integration/multi-instance.test.js index 2276503ff1..438a5739ca 100644 --- a/packages/qvac-lib-infer-llamacpp-embed/test/integration/multi-instance.test.js +++ b/packages/qvac-lib-infer-llamacpp-embed/test/integration/multi-instance.test.js @@ -18,14 +18,14 @@ test('Two embed instances can run inference simultaneously', { timeout: 900_000 }, async t => { const modelName = TEST_MODEL.modelName - const { inference: inference1, loader: loader1 } = await createEmbeddingsTestInstance( + const { inference: inference1 } = await createEmbeddingsTestInstance( 
t, modelName, DEFAULT_DEVICE, null, DEFAULT_BATCH_SIZE ) - const { inference: inference2, loader: loader2 } = await createEmbeddingsTestInstance( + const { inference: inference2 } = await createEmbeddingsTestInstance( t, modelName, DEFAULT_DEVICE, @@ -36,8 +36,6 @@ test('Two embed instances can run inference simultaneously', { t.teardown(async () => { await inference1.unload().catch(() => {}) await inference2.unload().catch(() => {}) - await loader1.close().catch(() => {}) - await loader2.close().catch(() => {}) }) const sentences1 = ['Hello world', 'This is a test'] @@ -65,7 +63,7 @@ test('Repeated embed load/unload cycles should remain stable', { const testSentence = 'This is a stability test sentence.' for (let i = 0; i < NUM_CYCLES; i++) { - const { inference, loader } = await createEmbeddingsTestInstance( + const { inference } = await createEmbeddingsTestInstance( t, modelName, DEFAULT_DEVICE, @@ -79,7 +77,6 @@ test('Repeated embed load/unload cycles should remain stable', { t.ok(embeddings[0][0].length > 0, `cycle ${i + 1}: produced embeddings`) await inference.unload() - await loader.close() t.pass(`cycle ${i + 1}: load/unload completed`) } @@ -91,14 +88,14 @@ test('Unloading one embed instance does not affect another running instance', { timeout: 900_000 }, async t => { const modelName = TEST_MODEL.modelName - const { inference: inference1, loader: loader1 } = await createEmbeddingsTestInstance( + const { inference: inference1 } = await createEmbeddingsTestInstance( t, modelName, DEFAULT_DEVICE, null, DEFAULT_BATCH_SIZE ) - const { inference: inference2, loader: loader2 } = await createEmbeddingsTestInstance( + const { inference: inference2 } = await createEmbeddingsTestInstance( t, modelName, DEFAULT_DEVICE, @@ -109,8 +106,6 @@ test('Unloading one embed instance does not affect another running instance', { t.teardown(async () => { await inference1.unload().catch(() => {}) await inference2.unload().catch(() => {}) - await loader1.close().catch(() => {}) - 
await loader2.close().catch(() => {}) }) const largeBatch = Array(50).fill(null).map((_, i) => `Test sentence number ${i} for batch processing`) @@ -119,7 +114,6 @@ test('Unloading one embed instance does not affect another running instance', { await new Promise(resolve => setTimeout(resolve, 100)) await inference2.unload() - await loader2.close() t.pass('unloaded instance 2 while instance 1 is processing') const response1 = await response1Promise @@ -132,7 +126,7 @@ test('Multiple embed load/unload cycles on one instance while another processes' timeout: 900_000 }, async t => { const modelName = TEST_MODEL.modelName - const { inference: inference1, loader: loader1 } = await createEmbeddingsTestInstance( + const { inference: inference1 } = await createEmbeddingsTestInstance( t, modelName, DEFAULT_DEVICE, @@ -142,7 +136,6 @@ test('Multiple embed load/unload cycles on one instance while another processes' t.teardown(async () => { await inference1.unload().catch(() => {}) - await loader1.close().catch(() => {}) }) const NUM_CYCLES = 3 @@ -154,7 +147,7 @@ test('Multiple embed load/unload cycles on one instance while another processes' const response1Promise = inference1.run(sentences) if (cyclesCompleted < NUM_CYCLES) { - const { inference: inference2, loader: loader2 } = await createEmbeddingsTestInstance( + const { inference: inference2 } = await createEmbeddingsTestInstance( t, modelName, DEFAULT_DEVICE, @@ -162,7 +155,6 @@ test('Multiple embed load/unload cycles on one instance while another processes' DEFAULT_BATCH_SIZE ) await inference2.unload() - await loader2.close() cyclesCompleted++ t.pass(`load/unload cycle ${cyclesCompleted} completed while instance 1 processes batch ${batch + 1}`) } diff --git a/packages/qvac-lib-infer-llamacpp-embed/test/integration/utils.js b/packages/qvac-lib-infer-llamacpp-embed/test/integration/utils.js index c407668d2a..71ba33ebdd 100644 --- a/packages/qvac-lib-infer-llamacpp-embed/test/integration/utils.js +++ 
b/packages/qvac-lib-infer-llamacpp-embed/test/integration/utils.js @@ -5,7 +5,6 @@ const path = require('bare-path') const https = require('bare-https') const os = require('bare-os') const GGMLBert = require('../../index.js') -const FilesystemDL = require('@qvac/dl-filesystem') /** * Downloads a file from a URL to a destination path @@ -162,15 +161,14 @@ class TestLogger { * @param {string} device - Device to use: 'cpu' or 'gpu' (default: 'gpu') * @param {string} gpuLayers - Number of GPU layers (default: '999' for GPU, '0' for CPU) * @param {string} batchSize - Batch size (default: '1024') - * @returns {Promise<{inference: GGMLBert, loader: FilesystemDL}>} + * @returns {Promise<{inference: GGMLBert}>} */ async function createEmbeddingsTestInstance (t, modelName, device = 'gpu', gpuLayers = null, batchSize = '1024') { const [, modelDir] = await ensureModel(modelName) - const diskPath = modelDir + const modelPath = path.join(modelDir, modelName) - t.ok(fs.existsSync(path.join(diskPath, modelName)), 'Model file should exist') + t.ok(fs.existsSync(modelPath), 'Model file should exist') - const loader = new FilesystemDL({ dirPath: diskPath }) const logger = new TestLogger() // Force CPU on darwin-x64 @@ -180,35 +178,36 @@ async function createEmbeddingsTestInstance (t, modelName, device = 'gpu', gpuLa console.log('Platform detected: darwin-x64, forcing device to CPU') } - // Determine gpu_layers based on device if not explicitly provided const actualGpuLayers = gpuLayers !== null ? gpuLayers : (device === 'cpu' ? 
'0' : '999') - // Build config object with device and gpu_layers parameters const config = { gpu_layers: actualGpuLayers, batch_size: batchSize } - // Add device preference if specified if (device === 'cpu' || device === 'gpu') { config.device = device } - // Disable flash attention on Android if (os.platform() === 'android') { config.flash_attn = 'off' console.log('Platform detected: Android, setting flash_attn to off') } - config.openclCacheDir = diskPath + config.openclCacheDir = modelDir - const inference = new GGMLBert({ modelName, loader, logger, diskPath, opts: { stats: true } }, config) + const inference = new GGMLBert({ + files: { model: [modelPath] }, + config, + logger, + opts: { stats: true } + }) const t0 = Date.now() await inference.load() console.log(` model.load() took ${Date.now() - t0} ms`) - return { inference, loader } + return { inference } } /** @@ -260,12 +259,10 @@ function removeErrorHandlers (response) { /** * Cleans up test resources - * @param {Object} loader - The loader instance * @param {Object} inference - The inference instance * @returns {Promise} */ -async function cleanupResources (loader, inference) { - await loader.close() +async function cleanupResources (inference) { await inference.unload() } diff --git a/packages/qvac-lib-infer-llamacpp-embed/test/unit/map-addon-event.test.js b/packages/qvac-lib-infer-llamacpp-embed/test/unit/map-addon-event.test.js new file mode 100644 index 0000000000..c0c38905a5 --- /dev/null +++ b/packages/qvac-lib-infer-llamacpp-embed/test/unit/map-addon-event.test.js @@ -0,0 +1,61 @@ +'use strict' + +const test = require('brittle') +const { mapAddonEvent } = require('../../addon.js') + +test('stats payload with tokens_per_second maps to JobEnded', function (t) { + const result = mapAddonEvent('Stats', { tokens_per_second: 123, total_tokens: 10 }, null) + t.is(result.type, 'JobEnded') + t.is(result.data.tokens_per_second, 123) + t.is(result.data.total_tokens, 10) + t.is(result.error, null) +}) + 
+test('stats payload maps backendDevice 0 to "cpu"', function (t) { + const result = mapAddonEvent('Stats', { total_time_ms: 5, backendDevice: 0 }, null) + t.is(result.type, 'JobEnded') + t.is(result.data.backendDevice, 'cpu') +}) + +test('stats payload maps backendDevice 1 to "gpu"', function (t) { + const result = mapAddonEvent('Stats', { batch_size: 32, backendDevice: 1 }, null) + t.is(result.type, 'JobEnded') + t.is(result.data.backendDevice, 'gpu') +}) + +test('stats payload leaves unknown backendDevice values as-is', function (t) { + const result = mapAddonEvent('Stats', { context_size: 512, backendDevice: 2 }, null) + t.is(result.type, 'JobEnded') + t.is(result.data.backendDevice, 2) +}) + +test('Error event name maps to Error type carrying rawError', function (t) { + const err = new Error('boom') + const result = mapAddonEvent('SomeError', null, err) + t.is(result.type, 'Error') + t.is(result.error, err) +}) + +test('Embeddings event name maps to Output type', function (t) { + const data = [[0.1, 0.2, 0.3]] + const result = mapAddonEvent('Embeddings', data, null) + t.is(result.type, 'Output') + t.is(result.data, data) + t.is(result.error, null) +}) + +test('stats detection takes precedence over event name', function (t) { + const result = mapAddonEvent('Embeddings', { tokens_per_second: 99 }, null) + t.is(result.type, 'JobEnded', 'stats-shaped data overrides Embeddings event') +}) + +test('unknown event with non-stats object returns null', function (t) { + const result = mapAddonEvent('Unknown', { foo: 'bar' }, null) + t.is(result, null) +}) + +test('unknown event with primitive data returns null', function (t) { + t.is(mapAddonEvent('Unknown', 'string', null), null) + t.is(mapAddonEvent('Unknown', 42, null), null) + t.is(mapAddonEvent('Unknown', null, null), null) +}) diff --git a/packages/qvac-lib-infer-llamacpp-embed/test/unit/pick-primary-gguf-path.test.js b/packages/qvac-lib-infer-llamacpp-embed/test/unit/pick-primary-gguf-path.test.js new file mode 
100644 index 0000000000..fb3bad0645 --- /dev/null +++ b/packages/qvac-lib-infer-llamacpp-embed/test/unit/pick-primary-gguf-path.test.js @@ -0,0 +1,32 @@ +'use strict' + +const test = require('brittle') +const { pickPrimaryGgufPath } = require('../../index.js') + +test('single non-sharded file returns that file', function (t) { + const files = ['/models/bge-small-en-v1.5-q4_0.gguf'] + t.is(pickPrimaryGgufPath(files), '/models/bge-small-en-v1.5-q4_0.gguf') +}) + +test('sharded model with tensors.txt first returns first shard, not tensors.txt', function (t) { + const files = [ + '/models/big-embed.tensors.txt', + '/models/big-embed-00001-of-00003.gguf', + '/models/big-embed-00002-of-00003.gguf', + '/models/big-embed-00003-of-00003.gguf' + ] + t.is(pickPrimaryGgufPath(files), '/models/big-embed-00001-of-00003.gguf') +}) + +test('sharded model without tensors.txt returns first shard', function (t) { + const files = [ + '/models/gte-large-00001-of-00002.gguf', + '/models/gte-large-00002-of-00002.gguf' + ] + t.is(pickPrimaryGgufPath(files), '/models/gte-large-00001-of-00002.gguf') +}) + +test('non-gguf file falls back to first entry', function (t) { + const files = ['/models/some-model.bin'] + t.is(pickPrimaryGgufPath(files), '/models/some-model.bin') +}) diff --git a/packages/qvac-lib-infer-llamacpp-llm/CHANGELOG.md b/packages/qvac-lib-infer-llamacpp-llm/CHANGELOG.md index 9a0801faf5..7967ca1a38 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/CHANGELOG.md +++ b/packages/qvac-lib-infer-llamacpp-llm/CHANGELOG.md @@ -1,5 +1,109 @@ # Changelog +## [0.16.0] - 2026-04-14 + +This release migrates the LLM addon off `BaseInference` inheritance and the `WeightsProvider` download layer onto the composable `createJobHandler` + `exclusiveRunQueue` utilities from `@qvac/infer-base@^0.4.0`. 
The constructor signature is replaced with a single object whose `files.model` field is an ordered array of absolute paths and `files.projectionModel` is an optional absolute path for multimodal models. This is a breaking change — every caller must update. + +## Breaking Changes + +### Constructor signature: single object with `files`, no `Loader` + +`LlmLlamacpp` now takes a single `{ files, config, logger?, opts? }` object. The old `Loader` + `diskPath` + `modelName` + two-arg `(args, config)` shape is gone — callers pre-resolve absolute paths and supply them as `files.model`. + +```js +// BEFORE (≤ 0.14.x) +const FilesystemDL = require('@qvac/dl-filesystem') +const loader = new FilesystemDL({ dirPath: '/models' }) +const model = new LlmLlamacpp({ + loader, + modelName: 'Qwen3-1.7B-Q4_0.gguf', + diskPath: '/models', + logger: console, + opts: { stats: true } +}, { ctx_size: '4096', gpu_layers: '99' }) + +// AFTER (0.15.0) +const model = new LlmLlamacpp({ + files: { + model: ['/models/Qwen3-1.7B-Q4_0.gguf'] + }, + config: { ctx_size: '4096', gpu_layers: '99' }, + logger: console, + opts: { stats: true } +}) +``` + +For sharded models the caller passes the full ordered list — the `.tensors.txt` companion first, followed by every `-NNNNN-of-MMMMM.gguf` shard in ascending order. 
For multimodal models, `files.projectionModel` carries the absolute path to the mmproj file: + +```js +const model = new LlmLlamacpp({ + files: { + model: [ + '/models/medgemma-4b-it-Q4_1.tensors.txt', + '/models/medgemma-4b-it-Q4_1-00001-of-00005.gguf', + '/models/medgemma-4b-it-Q4_1-00002-of-00005.gguf', + '/models/medgemma-4b-it-Q4_1-00003-of-00005.gguf', + '/models/medgemma-4b-it-Q4_1-00004-of-00005.gguf', + '/models/medgemma-4b-it-Q4_1-00005-of-00005.gguf' + ], + projectionModel: '/models/mmproj-model-f16.gguf' + }, + config: { gpu_layers: '99' } +}) +``` + +### `BaseInference` inheritance and `WeightsProvider` removed + +`LlmLlamacpp` no longer extends `BaseInference` and no longer touches the `WeightsProvider` download layer. The class composes `createJobHandler` and `exclusiveRunQueue` from `@qvac/infer-base@^0.4.0` directly. Public lifecycle methods (`load` / `run` / `finetune` / `pause` / `cancel` / `unload` / `getState`) are unchanged in shape, but `downloadWeights` and the loader-based progress callbacks are gone — the caller is responsible for placing files on disk before constructing the model. + +In-memory streaming from network sources (URLs, Hyperdrive) is no longer supported in the current API. The SDK does not currently use it (models are stored to disk first); this can be re-added when/if the SDK plans to support that feature. Before, it was possible through the `Loader` abstraction. + +### `destroy()` removed + +The inherited `destroy()` from `BaseInference` is no longer part of the public surface. Callers should use `unload()` instead, which now also nulls the addon reference. + +### Dependency changes + +- `@qvac/infer-base` bumped from `^0.3.0` to `^0.4.0`. +- `bare-fs` is now a runtime dependency (used to stream shards from disk). +- `@qvac/dl-base` and `@qvac/dl-filesystem` are no longer used by this package and have been removed from `devDependencies`. 
+ +## Features + +### Constructor input validation + +The constructor now throws `TypeError('files.model must be a non-empty array of absolute paths')` when `files` or `files.model` is missing or empty. This produces a clear error for callers porting old code instead of a confusing `Cannot read properties of undefined`. + +### `run()`-before-`load()` guard + +Calling `run()` before `load()` now throws `Error('Addon not initialized. Call load() first.')` instead of dereferencing `null` and crashing. `finetune()` already had this guard since the previous release. + +### `load()` is now idempotent when already loaded + +A second `load()` call on an already-loaded instance is now a silent no-op instead of unloading and reloading. This aligns with the ReadyResource pattern used elsewhere in QVAC and prevents accidental double-loads from triggering expensive work. Callers that intentionally want to swap weights must call `unload()` first (which clears `configLoaded`) and then `load()` again. + +### Crash-safe shard streaming + +If `_streamShards()` or `addon.activate()` throws mid-load (for example a corrupted shard file or a native init failure), the partially-initialized addon is now best-effort-unloaded and `this.addon` is reset to `null`. A subsequent `load()` call starts cleanly instead of leaking a zombie native instance. + +### Restored JSDoc on `FinetuneOptions` + +Every `FinetuneOptions` field carries a `/** … */` doc comment again, including the default values (`numberOfEpochs = 1`, `learningRate = 1e-4`, `batchSize = 128`, …) so IDE tooltips show them without needing to read `docs/finetuning.md`. + +## Bug Fixes + +### `unload()` clears the addon reference + +`unload()` now sets `this.addon = null` after `await this.addon.unload()`, so post-unload `cancel()` / `pause()` / `run()` calls hit the explicit guards rather than dereferencing a disposed native handle. 
`pause()`, `cancel()`, and the job-handler cancel closure all use optional chaining for the same reason. + +### Removed dead `_isSuppressedNoResponseLog` filter + +The `_createFilteredLogger` infrastructure that wrapped the user-supplied logger to swallow `'No response found for job'` warnings was tied to the old `BaseInference` `_jobToResponse` Map. The new architecture cannot emit that message at all, so the filter, the wrapped logger, and the `_originalLogger` indirection are all removed. The user-supplied logger is now used directly. + +## Pull Requests + +- [#1494](https://github.com/tetherto/qvac/pull/1494) - chore[bc]: LLM addon interface refactor — remove BaseInference and WeightsProvider + ## [0.15.0] - 2026-04-09 ### Breaking Changes diff --git a/packages/qvac-lib-infer-llamacpp-llm/README.md b/packages/qvac-lib-infer-llamacpp-llm/README.md index e71a948b7d..ba6029dcc7 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/README.md +++ b/packages/qvac-lib-infer-llamacpp-llm/README.md @@ -8,13 +8,13 @@ This native C++ addon, built using the `Bare` Runtime, simplifies running Large - [Building from Source](#building-from-source) - [Usage](#usage) - [1. Import the Model Class](#1-import-the-model-class) - - [2. Create a Data Loader](#2-create-a-data-loader) - - [3. Create the `args` obj](#3-create-the-args-obj) - - [4. Create the `config` obj](#4-create-the-config-obj) - - [5. Create Model Instance](#5-create-model-instance) - - [6. Load Model](#6-load-model) - - [7. Run Inference](#7-run-inference) - - [8. Release Resources](#8-release-resources) + - [2. Create the `args` obj](#2-create-the-args-obj) + - [Sharded models](#sharded-models) + - [3. Create the `config` obj](#3-create-the-config-obj) + - [4. Create Model Instance](#4-create-model-instance) + - [5. Load Model](#5-load-model) + - [6. Run Inference](#6-run-inference) + - [7. 
Release Resources](#7-release-resources) - [API behavior by state](#api-behavior-by-state) - [Fine-tuning](#fine-tuning) - [Quickstart Example](#quickstart-example) @@ -72,47 +72,77 @@ See [build.md](./build.md) for detailed instructions on how to build the addon f ```js const LlmLlamacpp = require('@qvac/llm-llamacpp') +const path = require('bare-path') ``` -### 2. Create a Data Loader - -Data Loaders abstract the way model files are accessed. Use a [`FileSystemDataLoader`](../dl-filesystem) to load model files from your local file system. Models can be downloaded directly from HuggingFace. +### 2. Create the `args` obj ```js -const FilesystemDL = require('@qvac/dl-filesystem') - -// Download model from HuggingFace (see examples/utils.js for downloadModel helper) -const [modelName, dirPath] = await downloadModel( - 'https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf', - 'Llama-3.2-1B-Instruct-Q4_0.gguf' -) - -const fsDL = new FilesystemDL({ dirPath }) -``` - -### 3. Create the `args` obj +const dirPath = './models' +const modelName = 'Llama-3.2-1B-Instruct-Q4_0.gguf' -```js const args = { - loader: fsDL, + files: { + model: [path.join(dirPath, modelName)] + // projectionModel: path.join(dirPath, 'mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf') // for multimodal support pass the projection model path + }, + config, opts: { stats: true }, - logger: console, - diskPath: dirPath, - modelName, - // projectionModel: 'mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf' // for multimodal support you need to pass the projection model name + logger: console } ``` The `args` obj contains the following properties: -* `loader`: The Data Loader instance from which the model file will be streamed. -* `logger`: This property is used to create a [`QvacLogger`](../logging) instance, which handles all logging functionality. +* `files.model`: Required. An array of absolute paths to the GGUF model file(s) to load. 
The caller is responsible for passing the complete set of files for the model, including every shard and the `.tensors.txt` companion for multi-shard models (see [Sharded models](#sharded-models) below). +* `files.projectionModel`: Optional. Absolute path to the projection model file. This is required for multimodal support. +* `config`: The model configuration object (see next section). +* `logger`: This property is used to create a [`QvacLogger`](../logging) instance, which handles all logging functionality. * `opts.stats`: This flag determines whether to calculate inference stats. -* `diskPath`: The local directory where the model file will be downloaded to. -* `modelName`: The name of model file in the Data Loader. -* `projectionModel`: The name of the projection model file in the Data Loader. This is required for multimodal support. -### 4. Create the `config` obj +#### Sharded models + +The addon no longer expands sharded models internally. If you are loading a multi-shard GGUF model, **the caller MUST pass every file** — including the `.tensors.txt` companion file that lives alongside the shards — in `files.model`. Anything missing will cause the addon to fail during weight streaming. + +**Required ordering for multi-shard models:** +1. The `.tensors.txt` companion file **first**. +2. Each `*-NNNNN-of-MMMMM.gguf` shard in **numerical order** (shard `00001` before `00002`, and so on). 
+ +Example — loading a 5-shard model: + +```js +const path = require('bare-path') +const LlmLlamacpp = require('@qvac/llm-llamacpp') + +const dir = './models' +const modelBase = 'my-big-model-Q4_K_M' + +const model = new LlmLlamacpp({ + files: { + model: [ + path.join(dir, `${modelBase}.tensors.txt`), + path.join(dir, `${modelBase}-00001-of-00005.gguf`), + path.join(dir, `${modelBase}-00002-of-00005.gguf`), + path.join(dir, `${modelBase}-00003-of-00005.gguf`), + path.join(dir, `${modelBase}-00004-of-00005.gguf`), + path.join(dir, `${modelBase}-00005-of-00005.gguf`) + ] + }, + config, + logger: console, + opts: { stats: true } +}) + +await model.load() +``` + +For single-file GGUF models, pass a one-element array: + +```js +files: { model: [path.join(dir, 'Llama-3.2-1B-Instruct-Q4_0.gguf')] } +``` + +### 3. Create the `config` obj The `config` obj consists of a set of hyper-parameters which can be used to tweak the behaviour of the model. *All parameters must by strings.* @@ -159,43 +189,21 @@ const config = { | System with both | ✅ Uses dedicated GPU (preferred) | ✅ Uses dedicated GPU | ✅ Uses integrated GPU | -### 5. Create Model Instance +### 4. Create Model Instance ```js -const model = new LlmLlamacpp(args, config) +const model = new LlmLlamacpp(args) ``` -### 6. Load Model +### 5. Load Model ```js await model.load() ``` -_Optionally_ you can pass the following parameters to tweak the loading behaviour. -* `close?`: This boolean value determines whether to close the Data Loader after loading. Defaults to `true` -* `reportProgressCallback?`: A callback function which gets called periodically with progress updates. It can be used to display overall progress percentage. +Loads the model file(s) passed in `files.model` and activates the native addon. If a projection model was provided (`files.projectionModel`), it is loaded as part of the same step. 
-_For example:_ - -```js -await model.load(false, progress => process.stdout.write(`\rOverall Progress: ${progress.overallProgress}%`)) -``` - -**Progress Callback Data** - -The progress callback receives an object with the following properties: - -| Property | Type | Description | -|---------------------|--------|-----------------------------------------| -| `action` | string | Current operation being performed | -| `totalSize` | number | Total bytes to be loaded | -| `totalFiles` | number | Total number of files to process | -| `filesProcessed` | number | Number of files completed so far | -| `currentFile` | string | Name of file currently being processed | -| `currentFileProgress` | string | Percentage progress on current file | -| `overallProgress` | string | Overall loading progress percentage | - -### 7. Run Inference +### 6. Run Inference Pass an array of messages (following the chat completion format) to the `run` method. Process the generated tokens asynchronously: @@ -227,14 +235,13 @@ try { When `opts.stats` is enabled, `response.stats` includes runtime metrics such as `TTFT`, `TPS`, token counters, and `backendDevice` (`"cpu"` or `"gpu"`). `backendDevice` reflects the resolved device used at runtime after backend selection/fallback logic, not only the requested config. -### 8. Release Resources +### 7. 
Release Resources Unload the model when finished: ```javascript try { await model.unload() - await fsDL.close() } catch (error) { console.error('Failed to unload model:', error) } @@ -341,24 +348,24 @@ In addition to ONNX-based OCR (`@qvac/ocr-onnx`), you can use vision-language mo ```js const LlmLlamacpp = require('@qvac/llm-llamacpp') -const FilesystemDL = require('@qvac/dl-filesystem') const fs = require('bare-fs') +const path = require('bare-path') const dirPath = './models' -const loader = new FilesystemDL({ dirPath }) const model = new LlmLlamacpp({ - modelName: 'LightOnOCR-2-1B-ocr-soup-Q4_K_M.gguf', - loader, - logger: console, - diskPath: dirPath, - projectionModel: 'mmproj-F16.gguf' -}, { - device: 'cpu', - gpu_layers: '0', - ctx_size: '4096', - temp: '0.1', - predict: '2048' + files: { + model: [path.join(dirPath, 'LightOnOCR-2-1B-ocr-soup-Q4_K_M.gguf')], + projectionModel: path.join(dirPath, 'mmproj-F16.gguf') + }, + config: { + device: 'cpu', + gpu_layers: '0', + ctx_size: '4096', + temp: '0.1', + predict: '2048' + }, + logger: console }) await model.load() @@ -382,7 +389,6 @@ await response.await() console.log(output.join('')) await model.unload() -await loader.close() ``` ## Architecture diff --git a/packages/qvac-lib-infer-llamacpp-llm/addon.js b/packages/qvac-lib-infer-llamacpp-llm/addon.js index 5582f626b6..af9fcaf742 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/addon.js +++ b/packages/qvac-lib-infer-llamacpp-llm/addon.js @@ -1,5 +1,82 @@ const path = require('bare-path') +/** + * Map a raw native event from the C++ addon to a logical inference event. + * + * The native binding emits events with C++-mangled names and varied payload + * shapes. 
This wrapper normalizes them into one of: + * - `'Output'` — token / partial output + * - `'Error'` — failure + * - `'JobEnded'` — terminal payload (TPS-shaped runtime stats OR + * a finetune `{op:'finetune', status, stats?}` payload) + * - `'FinetuneProgress'` — incremental finetune metrics + * + * Returns `{ type, data, error }` or `null` if the event should be dropped + * (currently only used to swallow the TPS payload that the C++ addon emits + * immediately after a finetune terminal — see `state.skipNextRuntimeStats`). + * + * Stateful because the C++ event ordering is stateful: the same TPS shape + * means "inference finished" most of the time but means "stale finetune + * trailer, ignore me" when it follows a finetune terminal. The caller owns + * the state object so the addon wrapper does not need to be a singleton. + * + * @param {string} rawEvent - native event name + * @param {*} rawData - native event payload + * @param {*} rawError - native error payload (only set for Error events) + * @param {{ skipNextRuntimeStats: boolean }} state - mutable state shared + * across consecutive callbacks for one model instance + * @returns {{ type: string, data: *, error: * } | null} + */ +function mapAddonEvent (rawEvent, rawData, rawError, state) { + // TPS-shaped runtime stats — either a real inference terminal or the stale + // trailer that follows a finetune terminal. 
+ if (rawData && typeof rawData === 'object' && 'TPS' in rawData) { + if (state.skipNextRuntimeStats) { + state.skipNextRuntimeStats = false + return null + } + const stats = { ...rawData } + if (stats.backendDevice === 0) { + stats.backendDevice = 'cpu' + } else if (stats.backendDevice === 1) { + stats.backendDevice = 'gpu' + } + return { type: 'JobEnded', data: stats, error: null } + } + + // Finetune terminal: dispatch JobEnded carrying the finetune payload and arm + // the skip flag so the TPS the C++ addon emits right after is not mistaken + // for an inference result that would clobber `_hasActiveResponse`. + if ( + rawData && + typeof rawData === 'object' && + rawData.op === 'finetune' && + typeof rawData.status === 'string' + ) { + state.skipNextRuntimeStats = true + return { type: 'JobEnded', data: rawData, error: null } + } + + // Per-iteration finetune metrics. + if ( + rawData && + typeof rawData === 'object' && + rawData.type === 'finetune_progress' + ) { + return { type: 'FinetuneProgress', data: rawData, error: null } + } + + // Default: name-based mapping. C++ event names mangle to things like + // `Error.something` and string payloads carry tokens. + let type = rawEvent + if (typeof rawEvent === 'string' && rawEvent.includes('Error')) { + type = 'Error' + } else if (typeof rawData === 'string') { + type = 'Output' + } + return { type, data: rawData, error: rawError } +} + /** * An interface between Bare addon in C++ and JS runtime. */ @@ -29,11 +106,19 @@ class LlamaInterface { } /** + * Loads model weights. The native side reads the JS property names + * `chunk` and `completed` directly, so this object's field names are + * load-bearing — see `JsBlobsStream.hpp::appendBlob` in + * `qvac-lib-inference-addon-cpp` for the parser. 
* * @param {Object} weightsData - * @param {String} weightsData.filename - * @param {Uint8Array} weightsData.contents - * @param {Boolean} weightsData.completed + * @param {String} weightsData.filename - Logical filename used to group + * chunks into one shard. The native side keys `shards_in_progress` on + * this. + * @param {Uint8Array|null} weightsData.chunk - Next chunk of bytes for the + * current shard, or `null` on the final call when `completed` is true. + * @param {Boolean} weightsData.completed - `false` while more chunks + * remain; `true` on the last call to finalize the shard. */ async loadWeights (weightsData) { this._binding.loadWeights(this._handle, weightsData) @@ -86,5 +171,6 @@ class LlamaInterface { } module.exports = { - LlamaInterface + LlamaInterface, + mapAddonEvent } diff --git a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/llm-parameter-sweep.js b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/llm-parameter-sweep.js index 6f84cd946a..c65f81c658 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/llm-parameter-sweep.js +++ b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/llm-parameter-sweep.js @@ -3,7 +3,6 @@ const fs = require('bare-fs') const path = require('bare-path') const process = require('bare-process') -const FilesystemDL = require('@qvac/dl-filesystem') const { parseAddonSource, resolveAddonCtor, @@ -249,7 +248,6 @@ async function main () { continue } } - let loader = null let model = null let modelLoaded = false let caseRepeatsAttempted = 0 @@ -269,18 +267,16 @@ async function main () { debugLogger.log(`Running: ${testCase.caseId}`) - loader = new FilesystemDL({ dirPath: modelDef.modelDir }) const config = buildConfigObject(testCase.runtimeConfig) const addonRuntimeLogger = createAddonRuntimeLogger(debugEnabled) // Load model once for this case model = new AddonCtor({ - modelName: testCase.modelName, - loader, + files: { model: [path.join(modelDef.modelDir, 
testCase.modelName)] }, + config, logger: addonRuntimeLogger, - diskPath: modelDef.modelDir, opts: { stats: true } - }, config) + }) const loadStart = process.hrtime() let loadMs = null @@ -326,12 +322,6 @@ async function main () { }) completedCases.add(caseKey) saveProgress() - // Clean up loader before continuing - try { - await loader.close().catch(() => {}) - } catch { - // Ignore cleanup errors - } continue // Skip to next case } throw loadError @@ -529,13 +519,6 @@ async function main () { } } - // Close loader after all prompts - try { - await loader.close().catch(() => {}) - } catch (closeError) { - debugLogger.warn(`Failed to close loader: ${closeError.message || String(closeError)}`) - } - // Add delay after case completion to allow cleanup await new Promise(resolve => setTimeout(resolve, 200)) @@ -606,13 +589,6 @@ async function main () { } catch { // Ignore cleanup errors } - try { - if (loader) { - await loader.close().catch(() => {}) - } - } catch { - // Ignore cleanup errors - } debugLogger.error(`Case ${testCase.caseId} failed completely: ${caseError.message || String(caseError)}`) const remainingRepeats = Math.max(0, (promptsForCase.length * repeats) - caseRepeatsAttempted) for (let i = 0; i < remainingRepeats; i++) { diff --git a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/package.json b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/package.json index d2286498c6..fd334080f4 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/package.json +++ b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/package.json @@ -11,7 +11,6 @@ "run:judge": "node ./prepare-models.js --target addon && bare ./run-judge.js" }, "dependencies": { - "@qvac/dl-filesystem": "latest", "@qvac/llm-llamacpp": "latest", "bare-fs": "latest", "bare-path": "latest", diff --git a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/prepare-prompts.js 
b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/prepare-prompts.js index 1be3bc7962..0a70fe49d1 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/prepare-prompts.js +++ b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/prepare-prompts.js @@ -3,7 +3,6 @@ const fs = require('bare-fs') const path = require('bare-path') const process = require('bare-process') -const FilesystemDL = require('@qvac/dl-filesystem') const Llm = require('../../index') const { PROMPT_CTX_SIZES, @@ -200,24 +199,20 @@ function batchTemplateMessages () { } async function main () { - if (!fs.existsSync(path.join(MODEL_DIR, MODEL_NAME))) { - throw new Error(`Missing tokenizer model at ${path.join(MODEL_DIR, MODEL_NAME)}. Run model prep first.`) + const modelPath = path.join(MODEL_DIR, MODEL_NAME) + if (!fs.existsSync(modelPath)) { + throw new Error(`Missing tokenizer model at ${modelPath}. Run model prep first.`) } - const loader = new FilesystemDL({ dirPath: MODEL_DIR }) let model = null try { try { - model = new Llm( - { - modelName: MODEL_NAME, - loader, - diskPath: MODEL_DIR, - opts: { stats: true } - }, - FAST_PROBE_RUNTIME - ) + model = new Llm({ + files: { model: [modelPath] }, + config: FAST_PROBE_RUNTIME, + opts: { stats: true } + }) await model.load() console.log('Prompt calibration runtime: gpu (fast path)') } catch (gpuErr) { @@ -227,15 +222,11 @@ async function main () { } console.warn(`GPU probe init failed; falling back to CPU: ${msg}`) if (model) await model.unload().catch(() => {}) - model = new Llm( - { - modelName: MODEL_NAME, - loader, - diskPath: MODEL_DIR, - opts: { stats: true } - }, - SAFE_FALLBACK_RUNTIME - ) + model = new Llm({ + files: { model: [modelPath] }, + config: SAFE_FALLBACK_RUNTIME, + opts: { stats: true } + }) await model.load() console.log('Prompt calibration runtime: cpu (fallback)') } @@ -282,7 +273,6 @@ async function main () { console.log(`Wrote ${prompts.length} prompts to ${OUTPUT_PATH}`) } finally { if 
(model) await model.unload().catch(() => {}) - await loader.close().catch(() => {}) } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/run-judge.js b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/run-judge.js index 648ad6cb5b..77037a40e7 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/run-judge.js +++ b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/run-judge.js @@ -3,7 +3,6 @@ const fs = require('bare-fs') const path = require('bare-path') const process = require('bare-process') -const FilesystemDL = require('@qvac/dl-filesystem') const { parseAddonSource, resolveAddonCtor, @@ -94,22 +93,20 @@ function pairKey (reference, candidate) { function createJudgeRuntimeManager (opts) { let model = null - let loader = null const cache = new Map() const maxChars = 6000 return { async init () { if (model) return - loader = new FilesystemDL({ dirPath: opts.modelDef.modelDir }) const config = buildConfigObject(opts.runtimeConfig) - model = new opts.AddonCtor({ - modelName: opts.modelName, - loader, - diskPath: opts.modelDef.modelDir, + const AddonCtor = opts.AddonCtor + model = new AddonCtor({ + files: { model: [path.join(opts.modelDef.modelDir, opts.modelName)] }, + config, opts: { stats: true }, logger: createAddonRuntimeLogger(opts.debug) - }, config) + }) await model.load() }, @@ -156,10 +153,6 @@ function createJudgeRuntimeManager (opts) { await model.unload().catch(() => {}) model = null } - if (loader) { - await loader.close().catch(() => {}) - loader = null - } } } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/verify-prompts.js b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/verify-prompts.js index 902822a1e9..d954df48f4 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/verify-prompts.js +++ b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/performance/verify-prompts.js @@ -3,7 +3,6 @@ const fs = require('bare-fs') const path = 
require('bare-path') const process = require('bare-process') -const FilesystemDL = require('@qvac/dl-filesystem') const Llm = require('../../index') const { PROMPT_CTX_SIZES, @@ -53,24 +52,20 @@ async function main () { if (!byId.has('long')) failures.push('Missing base prompt: long') - if (!fs.existsSync(path.join(MODEL_DIR, MODEL_NAME))) { - throw new Error(`Missing tokenizer model at ${path.join(MODEL_DIR, MODEL_NAME)}`) + const modelPath = path.join(MODEL_DIR, MODEL_NAME) + if (!fs.existsSync(modelPath)) { + throw new Error(`Missing tokenizer model at ${modelPath}`) } - const loader = new FilesystemDL({ dirPath: MODEL_DIR }) let model = null try { try { - model = new Llm( - { - modelName: MODEL_NAME, - loader, - diskPath: MODEL_DIR, - opts: { stats: true } - }, - FAST_PROBE_RUNTIME - ) + model = new Llm({ + files: { model: [modelPath] }, + config: FAST_PROBE_RUNTIME, + opts: { stats: true } + }) await model.load() console.log('Prompt verification runtime: gpu (fast path)') } catch (gpuErr) { @@ -80,15 +75,11 @@ async function main () { } console.warn(`GPU probe init failed; falling back to CPU: ${msg}`) if (model) await model.unload().catch(() => {}) - model = new Llm( - { - modelName: MODEL_NAME, - loader, - diskPath: MODEL_DIR, - opts: { stats: true } - }, - SAFE_FALLBACK_RUNTIME - ) + model = new Llm({ + files: { model: [modelPath] }, + config: SAFE_FALLBACK_RUNTIME, + opts: { stats: true } + }) await model.load() console.log('Prompt verification runtime: cpu (fallback)') } @@ -140,7 +131,6 @@ async function main () { } } finally { if (model) await model.unload().catch(() => {}) - await loader.close().catch(() => {}) } if (failures.length) { diff --git a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/server/bare_infer.js b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/server/bare_infer.js index 303caff5e9..514b42f261 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/server/bare_infer.js +++ 
b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/server/bare_infer.js @@ -24,38 +24,26 @@ async function main () { const prompts = JSON.parse(fs.readFileSync(promptsFile, 'utf-8')) console.log(`Loaded ${prompts.length} prompts`) - // Load FilesystemDL directly (same package used by modelManager) - let FsDL - try { - FsDL = require('@qvac/dl-filesystem') - } catch { - // Fallback: resolve from main package node_modules - FsDL = require('../../node_modules/@qvac/dl-filesystem') - } - - const loader = new FsDL({ dirPath: diskPath }) - // Create LlmLlamacpp directly (bypassing modelManager) so we can pass // tools: 'true' which enables jinja template rendering for models with // custom chat templates (like AfriqueGemma) const model = new LlmLlamacpp({ - loader, - logger: console, - diskPath, - modelName - }, { - device: 'cpu', - gpu_layers: '0', - ctx_size: '2048', - temp: '0', - top_p: '1', - top_k: '1', - predict: maxTokens, - repeat_penalty: '1', - seed: '42', - tools: 'true', - 'reverse-prompt': '\n', - verbosity: '1' + files: { model: [path.join(diskPath, modelName)] }, + config: { + device: 'cpu', + gpu_layers: '0', + ctx_size: '2048', + temp: '0', + top_p: '1', + top_k: '1', + predict: maxTokens, + repeat_penalty: '1', + seed: '42', + tools: 'true', + 'reverse-prompt': '\n', + verbosity: '1' + }, + logger: console }) await model.load() @@ -84,7 +72,6 @@ async function main () { console.log(`Outputs written to ${outputsFile}`) await model.unload() - await loader.close() } main().catch(error => { diff --git a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/server/src/services/modelManager.js b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/server/src/services/modelManager.js index efc7b0823d..52c0c56d95 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/benchmarks/server/src/services/modelManager.js +++ b/packages/qvac-lib-infer-llamacpp-llm/benchmarks/server/src/services/modelManager.js @@ -1,7 +1,7 @@ 'use strict' const LlmLlamacpp = 
require('@qvac/llm-llamacpp') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const logger = require('../utils/logger') /** @@ -71,32 +71,26 @@ class ModelManager { * Internal method to load a model */ async _loadModel (modelPath, diskPath, localModelName, config) { - // Create FilesystemDL for local model loading - const loader = new FilesystemDL({ - dirPath: diskPath - }) - const model = new LlmLlamacpp({ - diskPath, - modelName: localModelName, - loader, + files: { model: [path.join(diskPath, localModelName)] }, + config: { + device: config?.device, + gpu_layers: config?.gpu_layers, + ctx_size: config?.ctx_size, + temp: config?.temp, + top_p: config?.top_p, + top_k: config?.top_k, + n_predict: config?.n_predict, + repeat_penalty: config?.repeat_penalty, + seed: config?.seed, + verbosity: '3' + }, logger: { info: logger.info.bind(logger), error: logger.error.bind(logger), warn: logger.warn.bind(logger), debug: logger.debug.bind(logger) } - }, { - device: config?.device, - gpu_layers: config?.gpu_layers, - ctx_size: config?.ctx_size, - temp: config?.temp, - top_p: config?.top_p, - top_k: config?.top_k, - n_predict: config?.n_predict, - repeat_penalty: config?.repeat_penalty, - seed: config?.seed, - verbosity: '3' }) logger.info('Loading model into VRAM...') diff --git a/packages/qvac-lib-infer-llamacpp-llm/docs/afriquegemma-translation.md b/packages/qvac-lib-infer-llamacpp-llm/docs/afriquegemma-translation.md index f11248534f..afada8c60c 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/docs/afriquegemma-translation.md +++ b/packages/qvac-lib-infer-llamacpp-llm/docs/afriquegemma-translation.md @@ -192,24 +192,25 @@ wget -O ~/.qvac/models/AfriqueGemma-4B-Q4_K_M.gguf \ ```javascript const LlmLlamacpp = require('@qvac/llm-llamacpp') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') -const loader = new FilesystemDL({ dirPath: '/path/to/models' }) +const modelDir = '/path/to/models' const 
model = new LlmLlamacpp({ - loader, - modelName: 'AfriqueGemma-4B-Q4_K_M.gguf', - diskPath: '/path/to/models', + files: { + model: [path.join(modelDir, 'AfriqueGemma-4B-Q4_K_M.gguf')] + }, + config: { + device: 'cpu', + ctx_size: '2048', + temp: '0', + top_k: '1', + top_p: '1', + n_predict: '64', + seed: '42', + tools: 'true' + }, logger: console -}, { - device: 'cpu', - ctx_size: '2048', - temp: '0', - top_k: '1', - top_p: '1', - n_predict: '64', - seed: '42', - tools: 'true' }) await model.load() @@ -230,7 +231,6 @@ translation = translation.split('\n')[0].trim() console.log(translation) await model.unload() -await loader.close() ``` ### Python Validation (Transformers) @@ -320,7 +320,7 @@ The model generates text beyond the first translation line. Use one of: ### Production Considerations -1. **Model path:** Store GGUF in a persistent volume. Use `FilesystemDL` or `HyperdriveDL` for loading. +1. **Model path:** Store GGUF in a persistent volume. Models are passed to `LlmLlamacpp` as absolute paths via `files.model` (an array of one or more GGUF file paths). 2. **Warm-up:** First inference after load is slower due to KV cache initialization. Run a dummy prompt after `model.load()`. 3. **Concurrency:** `LlmLlamacpp` supports one active inference at a time. Queue requests at the application layer. 4. **Error handling:** Wrap `model.run()` in try/catch. The addon throws on context overflow or busy state. 
diff --git a/packages/qvac-lib-infer-llamacpp-llm/docs/architecture.md b/packages/qvac-lib-infer-llamacpp-llm/docs/architecture.md index 4052cd0096..abdcafe5cc 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/docs/architecture.md +++ b/packages/qvac-lib-infer-llamacpp-llm/docs/architecture.md @@ -1,6 +1,6 @@ # Architecture Documentation -**Package:** `@qvac/llm-llamacpp` v0.9.0 +**Package:** `@qvac/llm-llamacpp` v0.16.0 **Stack:** JavaScript, C++20, llama.cpp, Bare Runtime, CMake, vcpkg **License:** Apache-2.0 @@ -23,7 +23,7 @@ ### Architecture Decisions - [Decision 1: llama.cpp as Inference Backend](#decision-1-llamacpp-as-inference-backend) - [Decision 2: Bare Runtime over Node.js](#decision-2-bare-runtime-over-nodejs) -- [Decision 3: Pluggable Data Loader Architecture](#decision-3-pluggable-data-loader-architecture) +- [Decision 3: Caller-Supplied File Paths](#decision-3-caller-supplied-file-paths) - [Decision 4: Incremental Buffer-Based Weight Loading](#decision-4-incremental-buffer-based-weight-loading) - [Decision 5: Chat Message Format](#decision-5-chat-message-format-json-serialization) - [Decision 6: Exclusive Run Queue](#decision-6-exclusive-run-queue-indexjs) @@ -42,20 +42,19 @@ **Core value:** - High-level JavaScript API for LLM inference -- Peer-to-peer model distribution via Hyperdrive - Streaming token-by-token output - Text and multimodal (vision + text) models -- Pluggable model weight loaders +- Caller-owned model files (any source: filesystem, P2P, HTTP, etc.) ## Key Features - **Cross-platform**: macOS, Linux, Windows, iOS, Android -- **Multiple loaders**: Hyperdrive (P2P), filesystem, custom +- **Caller-owned files**: caller provides absolute file paths; the addon never downloads or discovers files on its own - **Streaming responses**: Async iterators or callbacks - **GPU acceleration**: Metal, Vulkan, OpenCL - **Quantized models**: GGUF format - **Multimodal**: Vision models (i.e. Qwen3-VL, SmolVLM, etc.) 
-- **Sharded loading**: Automatic split GGUF handling +- **Sharded loading**: Caller passes every shard (and the `.tensors.txt` companion); the addon streams them into llama.cpp in order ## Target Platforms @@ -70,6 +69,8 @@ **Dependencies:** - qvac-lib-inference-addon-cpp (≥1.1.2): C++ addon framework (single-job runner, runJob/activate/loadWeights/cancel/destroyInstance) - qvac-fabric-llm.cpp (≥7248.2.3): Inference engine +- @qvac/infer-base: `createJobHandler` and `exclusiveRunQueue` helpers (job/response lifecycle + single-job serialization) +- @qvac/logging: `QvacLogger` wrapper - Bare Runtime (≥1.24.0): JavaScript runtime - Linux requires Clang/LLVM 19 with libc++ @@ -86,35 +87,35 @@ graph TB subgraph "Application Layer" APP[QVAC Applications] end - + subgraph "Inference Addons" LLM[llm-llamacpp
LLMs] EMBED[embed-llamacpp
Embeddings] WHISPER[whispercpp
STT] NMT[nmtcpp
Translation] end - + subgraph "core libs" - BASE["@qvac/infer-base"] - DL["@qvac/dl-hyperdrive"] + BASE["@qvac/infer-base
(job handler + run queue)"] + LOG["@qvac/logging"] end - + subgraph "Native Framework" ADDON[addon-cpp] end - + subgraph "Backend" BARE[Bare Runtime] LLAMA[llama.cpp] end - + APP --> LLM LLM --> BASE - LLM --> DL + LLM --> LOG LLM --> ADDON ADDON --> BARE ADDON --> LLAMA - + style LLM fill:#e1f5ff,stroke:#0066cc,stroke-width:3px ``` @@ -123,23 +124,24 @@ graph TB **Dependency Table:** -| Package | Type | Version | Purpose | -|---------|------|---------|---------| -| @qvac/infer-base | Framework | ^0.2.0 | Base classes, WeightsProvider, QvacResponse | -| @qvac/dl-hyperdrive | Peer | ^0.1.1 | P2P model loading | -| qvac-lib-inference-addon-cpp | Native | ≥1.1.1 | C++ addon framework (single-job runner) | -| qvac-fabric-llm.cpp | Native | ≥7248.2.3 | Inference engine | -| Bare Runtime | Runtime | ≥1.24.0 | JavaScript execution | +| Package | Type | Purpose | +|---------|------|---------| +| @qvac/infer-base | Framework | `createJobHandler`, `exclusiveRunQueue`, `QvacResponse` | +| @qvac/logging | Framework | `QvacLogger` wrapper | +| qvac-lib-inference-addon-cpp | Native | C++ addon framework (single-job runner) | +| qvac-fabric-llm.cpp | Native | Inference engine | +| Bare Runtime | Runtime | JavaScript execution | **Integration Points:** | From | To | Mechanism | Data Format | |------|-----|-----------|-------------| -| JavaScript | LlmLlamacpp | Constructor | args, config objects | -| LlmLlamacpp | BaseInference | Inheritance | Template method pattern | +| JavaScript | LlmLlamacpp | Constructor `{ files, config, logger, opts }` | Object | +| LlmLlamacpp | createJobHandler | Composition | Job handle + callbacks | +| LlmLlamacpp | exclusiveRunQueue | Composition | Promise-based queue | | LlmLlamacpp | LlamaInterface | Composition | Method calls | | LlamaInterface | C++ Addon | require.addon() | Native binding | -| WeightsProvider | Data Loader | Interface | Stream protocol | +| LlmLlamacpp | bare-fs | Direct read stream | Absolute file paths | @@ -149,39 +151,56 @@ 
graph TB ### Main Class: LlmLlamacpp +`LlmLlamacpp` is a standalone class (no inheritance). It composes a job handler (`createJobHandler`), a single-job run queue (`exclusiveRunQueue`), and a `LlamaInterface` native bridge. + ```mermaid classDiagram class LlmLlamacpp { - +constructor(args, config) - +load(closeLoader, onProgress) Promise~void~ - +run(messages) Promise~QvacResponse~ + +constructor({ files, config, logger, opts }) + +load() Promise~void~ + +run(messages, runOptions?) Promise~QvacResponse~ + +finetune(finetuningOptions) Promise~FinetuneHandle~ + +pause() Promise~void~ + +cancel() Promise~void~ +unload() Promise~void~ - +downloadWeights(onProgress, opts) Promise~string~ + +getState() object } - - class BaseInference { - <> - +load() Promise~void~ - +run() Promise~QvacResponse~ + + class LlamaInterface { + +activate() Promise~void~ + +loadWeights(chunk) Promise~void~ + +runJob(inputs) Promise~boolean~ + +finetune(params) Promise~boolean~ + +cancel() Promise~void~ +unload() Promise~void~ } - + + class JobHandler { + <> + +start() QvacResponse + +output(token) void + +end(stats?, payload?) void + +fail(error) void + +active QvacResponse + } + + class RunQueue { + <> + +(fn) Promise~any~ + } + class QvacResponse { +iterate() AsyncIterator~string~ - +onUpdate(callback) QvacResponse - +await() Promise~void~ + +onUpdate(cb) QvacResponse + +await() Promise~object~ +cancel() Promise~void~ +stats object } - - class WeightsProvider { - +downloadFiles(files, path, opts) Promise~void~ - +streamFiles(shards, onChunk, onProgress) Promise~void~ - } - - LlmLlamacpp --|> BaseInference - LlmLlamacpp *-- WeightsProvider - LlmLlamacpp ..> QvacResponse : creates + + LlmLlamacpp *-- LlamaInterface + LlmLlamacpp *-- JobHandler + LlmLlamacpp *-- RunQueue + JobHandler ..> QvacResponse : creates ```
@@ -191,18 +210,34 @@ classDiagram | Class | Responsibility | Lifecycle | Dependencies | |-------|----------------|-----------|--------------| -| LlmLlamacpp | Orchestrate model lifecycle, manage loading/inference | Created by user, persistent | WeightsProvider, LlamaInterface | -| BaseInference | Define standard inference API | Abstract base class | None | -| QvacResponse | Stream inference output | Created per run() call, short-lived | None | -| WeightsProvider | Abstract model weight loading | Created by LlmLlamacpp | DataLoader | +| LlmLlamacpp | Orchestrate model lifecycle, stream weights, submit jobs, handle events | Created by user, persistent | LlamaInterface, createJobHandler, exclusiveRunQueue | +| LlamaInterface | JS wrapper around the native addon (handle, callbacks) | Created lazily in `_load()` | binding.js | +| JobHandler (createJobHandler) | Track the current job, create `QvacResponse`, route `output`/`end`/`fail` | One per LlmLlamacpp instance | None | +| exclusiveRunQueue | Serialize `run()` / `finetune()` / `unload()` into single-in-flight FIFO | One per LlmLlamacpp instance | None | +| QvacResponse | Stream inference output, expose `await()`/`iterate()`/`onUpdate()` | Created per job, short-lived | None | **Key Relationships:** | From | To | Type | Purpose | |------|-----|------|---------| -| LlmLlamacpp | BaseInference | Inheritance | Standard QVAC inference API | -| LlmLlamacpp | WeightsProvider | Composition | Model weight acquisition | -| LlmLlamacpp | QvacResponse | Creates | Streaming output per inference | +| LlmLlamacpp | LlamaInterface | Composition | Native addon bridge | +| LlmLlamacpp | JobHandler | Composition | Per-job lifecycle + response | +| LlmLlamacpp | exclusiveRunQueue | Composition | Serialize public API calls | +| LlmLlamacpp | bare-fs | Direct use | Stream shard files in `_streamShards()` | + +**Constructor signature (new):** + +```js +new LlmLlamacpp({ + files: { model: string[], projectionModel?: string }, + config: 
Record<string, string>, + logger?: object, + opts?: { stats?: boolean } +}) +``` + +- `files.model` is an array of absolute file paths. For single-file GGUFs, pass a one-element array. For sharded GGUFs the caller passes the `.tensors.txt` companion first, followed by every shard in numerical order. +- `load()` takes no arguments. It constructs the native addon with the primary model path chosen by `pickPrimaryGgufPath` (the first shard-matching entry of `files.model`, or `files.model[0]` for non-sharded models), streams every entry in order via `bare-fs` + `loadWeights` when more than one file is supplied, and finally calls `activate()`.
@@ -219,22 +254,23 @@ graph TB subgraph "Layer 1: JavaScript API" APP["Application Code"] LLMCLASS["LlmLlamacpp
(index.js)"] - BASEINF["BaseInference
(@qvac/infer-base)"] - WEIGHTSPR["WeightsProvider
(@qvac/infer-base)"] + JOBH["createJobHandler
(@qvac/infer-base)"] + RUNQ["exclusiveRunQueue
(@qvac/infer-base)"] RESPONSE["QvacResponse
(@qvac/infer-base)"] + BAREFS["bare-fs read stream
(shard streaming)"] end - + subgraph "Layer 2: Bridge" LLAMAIF["LlamaInterface
(addon.js)"] BINDING["require.addon
(binding.js)"] end - + subgraph "Layer 3: C++ Addon" JSINTERFACE["JsInterface
(addon-cpp JsInterface)"] ADDONCPP["AddonCpp / AddonJs
(addon-cpp + addon/AddonJs.hpp)"] WEIGHTSLOAD["WeightsLoader
(addon-cpp)"] end - + subgraph "Layer 4: Model" LLAMAMODEL["LlamaModel
(model-interface/LlamaModel.cpp)"] METADATA["ModelMetaData
(model-interface/ModelMetadata.cpp)"] @@ -242,28 +278,28 @@ graph TB TEXTCTX["TextLlmContext
(model-interface/TextLlmContext.cpp)"] MTMDCTX["MtmdLlmContext
(model-interface/MtmdLlmContext.cpp)"] end - + subgraph "Layer 5: Backend" LLAMACPP["llama.cpp"] GGML["GGML"] GPU["GPU Backends"] end - + APP --> LLMCLASS - LLMCLASS --> BASEINF - LLMCLASS --> WEIGHTSPR + LLMCLASS --> JOBH + LLMCLASS --> RUNQ + LLMCLASS --> BAREFS LLMCLASS --> LLAMAIF - LLMCLASS -.-> RESPONSE - + JOBH -.-> RESPONSE + LLAMAIF --> BINDING BINDING --> JSINTERFACE - WEIGHTSPR --> WEIGHTSLOAD - + BAREFS --> LLAMAIF + JSINTERFACE --> ADDONCPP ADDONCPP --> WEIGHTSLOAD ADDONCPP --> LLAMAMODEL - ADDONCPP --> WEIGHTSLOAD - + LLAMAMODEL --> METADATA LLAMAMODEL --> ASYNCWL ASYNCWL --> METADATA @@ -271,10 +307,10 @@ graph TB LLAMAMODEL --> MTMDCTX TEXTCTX --> LLAMACPP MTMDCTX --> LLAMACPP - + LLAMACPP --> GGML GGML --> GPU - + style LLMCLASS fill:#e1f5ff style ADDONCPP fill:#ffe1e1 style LLAMAMODEL fill:#ffe1e1 @@ -288,7 +324,7 @@ graph TB | Layer | Components | Responsibility | Language | Why This Layer | |-------|------------|----------------|----------|----------------| -| 1. JavaScript API | LlmLlamacpp, BaseInference | High-level API, error handling | JS | Ergonomic API for npm consumers | +| 1. JavaScript API | LlmLlamacpp, createJobHandler, exclusiveRunQueue, bare-fs | High-level API, job/response lifecycle, shard streaming | JS | Ergonomic API for npm consumers | | 2. Bridge | LlamaInterface, binding.js | JS↔C++ communication | JS wrapper | Lifecycle management, handle safety | | 3. C++ Addon | JsInterface, AddonCpp/AddonJs | Single-job runner, threading, callbacks | C++ | Performance, native integration | | 4. 
Model | LlamaModel, ModelMetaData, AsyncWeightsLoader, Contexts | Inference logic, metadata extraction, streaming weight coordination, chat formatting | C++ | Direct llama.cpp integration | @@ -298,13 +334,14 @@ graph TB | Direction | Path | Data Format | Transform | |-----------|------|-------------|-----------| +| Weights → | bare-fs → LlmLlamacpp → LlamaInterface → Addon | Buffer chunks | Streamed via `loadWeights({filename, chunk, completed})` | | Input → | JS → Bridge → Addon | JSON string | Serialize messages | | Input → | Addon → Model | parsed chat_msg | Parse JSON, format template | | Input → | Model → llama.cpp | tokens | Tokenize | | Output ← | llama.cpp → Model | token IDs | Sample | | Output ← | Model → Addon | UTF-8 string | Decode token | | Output ← | Addon → Bridge | string | Queue output | -| Output ← | Bridge → JS | string | Emit via callback | +| Output ← | Bridge → JS | string | Emit via `_addonOutputCallback` → `JobHandler.output()` | @@ -316,7 +353,7 @@ graph TB #### **LlmLlamacpp (index.js)** -**Responsibility:** Main API class, orchestrates model lifecycle, manages data loaders +**Responsibility:** Main API class. Standalone (no inheritance). Orchestrates the lifecycle: creates the native addon, streams shards from absolute file paths via `bare-fs`, serializes public API calls through `exclusiveRunQueue`, tracks the current job via `createJobHandler`, and routes addon events to the active `QvacResponse`. 
**Why JavaScript:** - High-level API ergonomics for npm consumers @@ -324,7 +361,10 @@ graph TB - Event loop integration for streaming - Configuration parsing - +**Composition (no base class):** +- `this._job = createJobHandler({ cancel: () => this.addon.cancel() })` — single active job + response +- `this._run = exclusiveRunQueue()` — serialized `run()` / `finetune()` / `unload()` +- `this.addon = new LlamaInterface(...)` — native bridge, created lazily in `_load()` #### **LlamaInterface (addon.js)** @@ -359,15 +399,14 @@ graph TB **LLM specialization:** createInstance builds LlamaModel with config; runJob parses inputs array (media + text) into LlamaModel::Prompt -#### **WeightsProvider (@qvac/infer-base)** +#### **Shard streaming (`_streamShards` in index.js)** -**Responsibility:** Abstracts model weight acquisition +**Responsibility:** Stream caller-supplied shard files from disk into the native addon. -**Why JavaScript:** -- Integrates with data loaders (Hyperdrive, filesystem) -- Progress tracking and reporting -- Handles sharded GGUF expansion -- Streaming chunk delivery +- Iterates all entries of `files.model` (the primary path selected by `pickPrimaryGgufPath` was already passed to the constructor) +- For each file, opens a `bare-fs.createReadStream`, forwards every chunk via `addon.loadWeights({ filename, chunk, completed: false })` +- Calls `addon.loadWeights({ filename, chunk: null, completed: true })` after each file to finalize that shard +- The caller is responsible for the **complete set of files and their order** (including the `.tensors.txt` companion first for sharded models). No discovery, no expansion, no download logic inside the addon. #### **ModelMetaData (model-interface/ModelMetadata.cpp)** @@ -592,131 +631,50 @@ See [qvac-lib-inference-addon-cpp Decision 4: Why Bare Runtime](https://github.c --- -## Decision 3: Pluggable Data Loader Architecture +## Decision 3: Caller-Supplied File Paths
⚡ TL;DR -**Chose:** Abstract data loading via WeightsProvider interface -**Why:** Support multiple distribution methods (P2P, HTTP, local files, S3) -**Cost:** Additional abstraction layer, must implement loader interface +**Chose:** Caller passes absolute file paths in `files.model`; the addon does **no** download, discovery, or shard expansion. +**Why:** Keeps the addon focused on inference; distribution is the application's responsibility. +**Cost:** Callers must resolve sharded models themselves (including the `.tensors.txt` companion file).
### Context -Need to load multi-GB model files from various sources: -- Local filesystem (for offline/development) -- P2P networks (for privacy/decentralization) -- HTTP/CDN (for enterprise deployments) -- Cloud storage (S3, Azure Blob, etc.) - -Different use cases have different distribution requirements. No single distribution method fits all scenarios. +Earlier iterations of this package shipped a `WeightsProvider` + pluggable data-loader abstraction that tried to own download, caching, and shard expansion. In practice this coupled the inference addon to an I/O layer with very different lifecycle and failure modes, and forced consumers to pick (or adapt) a loader even for the trivial "file is already on disk" case. ### Decision -Create a pluggable data loader abstraction (WeightsProvider interface) that decouples model loading from the inference engine, allowing applications to choose their distribution strategy. +`LlmLlamacpp` accepts **only** absolute file paths via `files.model`. Downloading, caching, P2P, HTTP, and shard discovery all live outside the addon. The addon: -### Rationale +1. Constructs the native instance with the primary path selected by `pickPrimaryGgufPath(files.model)` — the first entry matching the shard regex, or `files.model[0]` for non-sharded models. +2. If `files.model.length > 1`, streams all files (in the provided order) via `bare-fs.createReadStream` into `addon.loadWeights({ filename, chunk, completed })`. +3. Calls `addon.activate()` to finalize load. -**Flexibility:** -- Different users have different distribution needs (privacy vs speed vs simplicity) -- Enterprises may require HTTP/CDN, privacy users may prefer P2P -- Development/testing needs local filesystem access -- No single distribution method fits all use cases +For sharded GGUFs, the caller must pass **every** shard **and** the `.tensors.txt` companion file, in order: `.tensors.txt` first, then each shard in numerical order. 
See the [README sharded models section](../README.md#sharded-models) for the concrete example. -**Separation of Concerns:** -- Inference engine doesn't need to know about distribution details -- Model loading is orthogonal to inference logic -- Easier to test inference separately from data loading +### Rationale -**Extensibility:** -- Applications can implement custom loaders (S3, IPFS, Torrent, etc.) -- Can optimize loaders for specific platforms (mobile vs desktop) -- Future-proof: new distribution methods don't require engine changes +**Single responsibility:** +- The addon is an inference engine. It should not own download/caching/P2P logic. +- Callers already have transport libraries that fit their deployment (Hyperdrive, HTTP, S3, bare file copy, etc.). -### Trade-offs -- ✅ Can mock loaders for unit testing inference logic -- ❌ Additional abstraction complexity vs hardcoding a single method -- ❌ Applications must choose/implement their loader (no batteries-included default) - -### WeightsProvider Interface - -```javascript -// Core abstraction that all loaders must implement -interface WeightsProvider { - // Get readable stream for model file - async getStream(path: string): ReadableStream - - // Wait for loader to be ready - async ready(): Promise - - // Cleanup resources - async close(): Promise -} -``` - -### Example Implementations +**Predictable failure modes:** +- No hidden retries, no hidden temp files, no partial-state recovery inside the addon. +- If a shard is missing, the failure happens at a clear boundary (`loadWeights` or `activate`). -
-📊 LLM-Friendly: Loader Comparison - -**Performance Characteristics:** - -| Loader | Use Case | Initial Download | Subsequent Access | Setup Complexity | -|--------|----------|------------------|-------------------|------------------| -| **FileSystemDataLoader** | Development, offline | Instant | Instant | Low (just file path) | -| **HyperdriveDataLoader** | Privacy, P2P | 10-100 MB/s | Instant (cached) | Medium (P2P keys) | -| **HttpDataLoader** | Enterprise, CDN | 50-500 MB/s | Varies | Low (just URL) | -| **S3DataLoader** | Cloud deployments | 50-200 MB/s | Varies | Medium (AWS credentials) | - -**Example: Local Filesystem Loader** -```javascript -class FileSystemDataLoader { - constructor(basePath) { this.basePath = basePath } - - async getStream(path) { - return fs.createReadStream(`${this.basePath}/${path}`) - } - async ready() { /* no-op */ } - async close() { /* no-op */ } -} -``` +**Simpler API:** +- `load()` takes no arguments. No `closeLoader`, no `onProgress`, no `downloadWeights()`. +- Callers who want progress reporting attach it to their own download step, before calling `load()`. -**Example: HTTP/CDN Loader** -```javascript -class HttpDataLoader { - constructor(baseUrl) { this.baseUrl = baseUrl } - - async getStream(path) { - const response = await fetch(`${this.baseUrl}/${path}`) - return response.body - } - async ready() { /* no-op */ } - async close() { /* no-op */ } -} -``` - -**Example: Hyperdrive (P2P) Loader** -```javascript -class HyperdriveDataLoader { - constructor(key) { - this.drive = new Hyperdrive(key) - } - - async getStream(path) { - return this.drive.createReadStream(path) - } - async ready() { - await this.drive.ready() - } - async close() { - await this.drive.close() - } -} -``` - -
+### Trade-offs +- ✅ Zero coupling between inference and transport +- ✅ Trivial to test with plain local files +- ❌ Callers must implement (or reuse) shard resolution — including listing the `.tensors.txt` companion file alongside the shards +- ❌ No "batteries included" default — intentional --- @@ -737,23 +695,18 @@ ML models can be gigabytes in size. llama.cpp expects either: 1. A file descriptor (simple but requires file on disk) 2. A buffer (via `std::streambuf` interface) -**Problem:** We need to load directly from Hyperdrive (P2P storage) without duplicating storage by saving to disk first. - -Alternative approach would be: download from Hyperdrive → save to temp file → pass file descriptor to llama.cpp. But this doubles storage requirements (Hyperdrive cache + temp file). +Even though the addon now reads shard files from disk via `bare-fs`, we still prefer the buffer path so that: +- The same native loading path works no matter where the JavaScript layer sources its chunks (a `bare-fs` read stream today; memory or any other in-process source in the future). +- Multi-shard GGUFs can be fed incrementally instead of materialized to a single temp file. ### Decision -Implement custom `std::streambuf` over JavaScript-owned ArrayBuffers with incremental shard-by-shard loading, as provided by `qvac-lib-inference-addon-cpp` framework. This allows feeding buffer chunks from any source (Hyperdrive, HTTP, local files) directly to llama.cpp without intermediate file storage. - -JavaScript sends model data as buffer chunks, C++ wraps them in a `std::streambuf`, enabling llama.cpp to load sharded models incrementally with zero-copy access to JavaScript memory. See our [llama.cpp fork implementation](https://github.com/tetherto/qvac-ext-lib-llama.cpp/compare/master...tetherto:qvac-ext-lib-llama.cpp:temp-load-from-buffer?diff=unified&w). +Implement a custom `std::streambuf` over JavaScript-owned ArrayBuffers with incremental shard-by-shard loading, as provided by the `qvac-lib-inference-addon-cpp` framework. 
JavaScript forwards buffer chunks via `addon.loadWeights({ filename, chunk, completed })`; C++ wraps them in a `std::streambuf`, enabling llama.cpp to load sharded models incrementally with zero-copy access to JavaScript memory. See our [llama.cpp fork implementation](https://github.com/tetherto/qvac-ext-lib-llama.cpp/compare/master...tetherto:qvac-ext-lib-llama.cpp:temp-load-from-buffer?diff=unified&w). ### Rationale -**Avoid Storage Duplication:** -- Load directly from Hyperdrive streams without saving to disk first -- No temporary files consuming additional storage -- Critical for mobile devices with limited storage -- Hyperdrive data stays in its cache, not duplicated +**Incremental loading:** +- Sharded GGUFs are streamed into llama.cpp as chunks arrive, rather than requiring the full model to sit in RAM or a temp file before load. **Zero-Copy:** - C++ reads directly from JavaScript ArrayBuffer memory @@ -761,19 +714,18 @@ JavaScript sends model data as buffer chunks, C++ wraps them in a `std::streambu - Further reduces memory footprint **Source Flexibility:** -- Works with any data source (Hyperdrive, HTTP, filesystem) -- Data loader provides buffer chunks, streambuf wrapper handles delivery to llama.cpp -- Same incremental loading path for all distribution methods +- Works with any data source (bare-fs read stream today; any other in-process source tomorrow) +- Same incremental loading path regardless of where chunks come from - Supports sharded GGUF files with incremental tensor loading ### Trade-offs -- ✅ Can report loading progress per chunk +- ✅ Works with arbitrary in-process data sources - ❌ Complex streambuf implementation with seeking across blobs - ❌ Must keep JS buffers alive during load, defer cleanup to correct thread - ❌ Seeking overhead O(N) across N blobs (acceptable, rarely needed) **Key Components:** -- `WeightsProvider` (JavaScript): Orchestrates chunk delivery +- `LlmLlamacpp._streamShards()` (JavaScript): opens `bare-fs` read streams for 
each caller-provided shard path and forwards chunks via `addon.loadWeights` - `BlobsStream` (C++): Implements `std::basic_streambuf` over multiple blobs - `FinalizedStream` (C++): RAII wrapper owning JavaScript references - `ThreadQueuedRefDeleter` (C++): Defers reference deletion to JavaScript thread @@ -931,4 +883,4 @@ Provide hand-written TypeScript definitions in `index.d.ts` alongside JavaScript **Related Document:** - [data-flows-detailed.md](data-flows-detailed.md) - Detailed data flow diagrams and sequences -**Last Updated:** 2026-03-02 +**Last Updated:** 2026-04-07 diff --git a/packages/qvac-lib-infer-llamacpp-llm/docs/data-flows-detailed.md b/packages/qvac-lib-infer-llamacpp-llm/docs/data-flows-detailed.md index 988fa48079..1aac0368d1 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/docs/data-flows-detailed.md +++ b/packages/qvac-lib-infer-llamacpp-llm/docs/data-flows-detailed.md @@ -22,12 +22,13 @@ This document contains detailed diagrams showing how data moves through the `@qv - Emits: Output (streaming), JobStarted, JobEnded, Error **Weight Loading:** -- JavaScript sends model weights in chunks (streaming, zero-copy) -- C++ creates std::streambuf over JS ArrayBuffers -- For streamed models, the first shard is lent to `ModelMetaData` for GGUF metadata extraction before proceeding to weight loading +- Caller passes every file (primary model + every shard + `.tensors.txt` companion) as an array of absolute paths in `files.model` +- `LlmLlamacpp._streamShards()` iterates those paths, opening `bare-fs.createReadStream` and forwarding chunks via `addon.loadWeights` +- C++ creates `std::streambuf` over JS ArrayBuffers (zero-copy) +- For streamed sharded models, the first shard is lent to `ModelMetaData` for GGUF metadata extraction before proceeding to weight loading - llama.cpp reads weights via stream interface -- Supports sharded models (GGUF multi-file) - JS references kept alive during load, cleaned up after +- The addon performs **no** download, 
discovery, or shard expansion — the caller owns transport **Session Cache:** - Optional KV cache persistence to disk via `CacheManager` @@ -192,56 +193,67 @@ flowchart TD ### Streaming Weight Loading +The caller is responsible for providing the **complete** list of files in `files.model`: the `.tensors.txt` companion first, followed by every shard in ascending order (for sharded models). The addon picks the first entry matching the shard regex `/-\d+-of-\d+\.gguf$/` as the primary path (falling back to `files.model[0]` for non-sharded models). The addon does no discovery, no expansion, and no download. + ```mermaid sequenceDiagram - participant JS as JavaScript - participant WP as WeightsProvider + participant User as User code + participant LLM as LlmLlamacpp (index.js) + participant FS as bare-fs participant IF as LlamaInterface participant Bind as Native Binding participant WL as WeightsLoader participant Model as LlamaModel participant LC as llama.cpp - - JS->>WP: load() - WP->>WP: expandGGUFIntoShards() - - loop For each shard chunk - WP->>WP: Read 10MB chunk - WP->>WP: Create Uint8Array - WP->>IF: loadWeights({filename, chunk, completed: false}) - IF->>Bind: loadWeights(handle, data) - Bind->>WL: addChunk() - WL->>WL: js_create_reference (pin from GC) - WL->>WL: Store reference - Bind-->>IF: void - IF-->>WP: void + + User->>LLM: new LlmLlamacpp({ files, config, logger, opts }) + User->>LLM: load() + + Note over LLM: _load(): build configurationParams with
path = pickPrimaryGgufPath(files.model), projectionPath, config + LLM->>IF: new LlamaInterface(binding, configurationParams, outputCb) + + alt files.model.length > 1 (sharded) + loop For each filePath in files.model (in order) + LLM->>FS: createReadStream(filePath) + loop For each chunk + FS-->>LLM: chunk (Uint8Array) + LLM->>IF: loadWeights({ filename, chunk, completed: false }) + IF->>Bind: loadWeights(handle, data) + Bind->>WL: addChunk() + WL->>WL: js_create_reference (pin from GC) + WL->>WL: Store reference + Bind-->>IF: void + IF-->>LLM: void + end + LLM->>IF: loadWeights({ filename, chunk: null, completed: true }) + IF->>Bind: loadWeights(handle, final) + Bind->>WL: addChunk() + finalize + WL->>WL: Create FinalizedStream + WL->>WL: Create BlobsStream (std::streambuf) + WL->>Model: set_weights_for_file(filename, stream) + end end - - WP->>IF: loadWeights({filename, chunk, completed: true}) - IF->>Bind: loadWeights(handle, data) - Bind->>WL: addChunk() + finalize - WL->>WL: Create FinalizedStream - WL->>WL: Create BlobsStream (std::streambuf) - WL->>Model: set_weights_for_file(filename, stream) + + LLM->>IF: activate() + IF->>Bind: activate(handle) + Bind->>Model: load via llama.cpp Model->>LC: llama_model_load_from_file(stream) - + LC->>LC: Read via streambuf->sgetn() Note over LC: Zero-copy access to JS buffers LC->>LC: Parse GGUF metadata LC->>LC: Load weights LC-->>Model: Model loaded - + Model->>WL: Mark for deletion WL->>WL: Queue js_delete_reference - - Note over JS: Next API call - JS->>IF: activate() - IF->>Bind: activate(handle) - Bind->>WL: Process deletion queue + Bind->>WL: Process deletion queue (during activate) WL->>WL: js_delete_reference (unpin) - WL-->>Bind: References cleaned - - Note over JS: GC can now collect ArrayBuffers + + IF-->>LLM: activated + LLM-->>User: load() resolves + + Note over User: GC can now collect ArrayBuffers ```
@@ -251,36 +263,36 @@ sequenceDiagram | Stage | JS Buffer State | C++ Reference State | Memory Location | Notes | |-------|-----------------|---------------------|-----------------|-------| -| 1. Create | Allocated by JS | None | JS heap | Uint8Array created | +| 1. Read from disk | Allocated by bare-fs | None | JS heap | Uint8Array chunk yielded by read stream | | 2. loadWeights() | Passed to C++ | js_create_reference() | JS heap | Pinned from GC | | 3. Accumulation | Still in JS | Stored in vector | JS heap | Multiple refs held | | 4. Finalize | Still in JS | Owned by FinalizedStream | JS heap | RAII wrapper | -| 5. Loading | Still in JS | Active | JS heap | Zero-copy access | +| 5. activate() load | Still in JS | Active | JS heap | Zero-copy access from llama.cpp | | 6. Load complete | Still in JS | Marked for deletion | JS heap | Queued cleanup | -| 7. Next API call | Still in JS | js_delete_reference() | JS heap | Unpinned | +| 7. activate() returns | Still in JS | js_delete_reference() | JS heap | Unpinned during activate | | 8. After return | May be GC'd | None | Freed | Memory reclaimed | -**Sharded Model Handling:** +**Sharded Model Handling (caller-owned):** -Input: `"model-00001-of-00004.gguf"` +For a 4-shard model, the caller must pass **five** absolute paths in `files.model`, in this exact order: -Expanded to: -1. `model-00001-of-00004.gguf` -2. `model-00002-of-00004.gguf` -3. `model-00003-of-00004.gguf` -4. `model-00004-of-00004.gguf` +1. `model.tensors.txt` (companion file — **required**) +2. `model-00001-of-00004.gguf` +3. `model-00002-of-00004.gguf` +4. `model-00003-of-00004.gguf` +5. `model-00004-of-00004.gguf` -JavaScript sends each file separately. C++ concatenates into single logical stream. +`_load()` uses `pickPrimaryGgufPath(files.model)` — the first entry matching the shard regex `/-\d+-of-\d+\.gguf$/`, falling back to `files.model[0]` for non-sharded models — as the primary path passed to the native addon constructor. 
`_streamShards()` iterates **all** entries, streaming each via `bare-fs`. C++ concatenates them into a single logical stream per filename. **Performance:** | Operation | Duration | Memory Impact | Notes | |-----------|----------|---------------|-------| -| Create 10MB chunk | ~1ms | +10MB JS heap | Async I/O | +| bare-fs chunk | depends on FS | +chunk size in JS heap | Async I/O | | loadWeights() call | <1ms | +small C++ overhead | Non-blocking | | FinalizeStream | ~0.1ms | Transfer ownership | Zero-copy | -| llama_model_load() | Seconds | +model size in RAM | Background thread | -| Reference cleanup | <0.1ms | -10MB JS heap per chunk | Deferred to JS thread | +| llama_model_load() | Seconds | +model size in RAM | During activate() | +| Reference cleanup | <0.1ms | -chunk size per reference | During activate() |
@@ -468,5 +480,5 @@ flowchart TD **Related Documents:** - [architecture.md](architecture.md) - Complete architecture documentation -**Last Updated:** 2026-03-02 +**Last Updated:** 2026-04-07 diff --git a/packages/qvac-lib-infer-llamacpp-llm/docs/finetuning.md b/packages/qvac-lib-infer-llamacpp-llm/docs/finetuning.md index 1de3da563f..49f9dc6008 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/docs/finetuning.md +++ b/packages/qvac-lib-infer-llamacpp-llm/docs/finetuning.md @@ -41,7 +41,7 @@ The library supports **LoRA finetuning** of GGUF models. LoRA trains small adapt ### Architecture -Finetune and inference use the same job queue (JobRunner): both submit a job via `runJob()` and a single processing thread runs one job at a time (either inference or finetune). In JS, inference waits on `_lastJobResult`, while finetune uses `_finetuneActive` to block overlapping `run()`/`finetune()` calls; JobRunner enforces serialization on the native side. +Finetune and inference use the same job queue (JobRunner): both submit a job via `runJob()` and a single processing thread runs one job at a time (either inference or finetune). In JS, both `run()` and `finetune()` go through the same `exclusiveRunQueue` (`_run`) and share a `createJobHandler`-managed `_hasActiveResponse` flag; JobRunner enforces serialization on the native side. 1. **Model loading**: Load a base GGUF model (e.g., Qwen3-0.6B-Q8_0.gguf) with `model.load()`. 2. **Dataset preparation**: Training data is read from JSONL (chat format) or plain text files. Validation uses either a fraction of that data (when `validation.type` is `'split'`), a separate eval file (`'dataset'`), or none (`'none'`). @@ -71,11 +71,12 @@ Default (when `loraModules` is empty): attention Q, K, V, O only. ## JavaScript API -### `finetune(finetuningOptions?)` +### `finetune(finetuningOptions)` -Starts or resumes finetuning. If the model is not loaded, it will be loaded first. Finetuning runs exclusively (no concurrent inference). 
Returns a handle immediately (like `run()`); use `handle.await()` to wait for completion. If a pause checkpoint exists in `checkpointSaveDir`, training resumes from it automatically; otherwise a fresh run starts. +Starts or resumes finetuning. The model **must** already be loaded (call `load()` first); `finetune()` does not auto-load. `finetuningOptions` is required on every call — there is no in-process "stored params" state. Finetuning runs exclusively (no concurrent inference). Returns a handle immediately (like `run()`); use `handle.await()` to wait for completion. If a pause checkpoint exists in `checkpointSaveDir`, training resumes from it automatically; otherwise a fresh run starts. ```js +await model.load() const handle = await model.finetune(finetuneOptions) handle.on('stats', stats => { console.log(`epoch=${stats.current_epoch + 1} step=${stats.global_steps} loss=${stats.loss?.toFixed(4)} acc=${(stats.accuracy * 100)?.toFixed(1)}%`) @@ -88,7 +89,7 @@ const resumeResult = await resumeHandle.await() ``` - **Parameters** - - `finetuningOptions` — Object with [finetuning parameters](#finetuning-parameters). Always required. To resume after a pause, pass the same params again; the backend resumes from a pause checkpoint if one exists in `checkpointSaveDir`. **Resume contract:** call `finetune()` only after you have **awaited** `pause()`. There is no status API; await the previous command to know something is done. + - `finetuningOptions` — Object with [finetuning parameters](#finetuning-parameters). **Always required**, including on resume — pass the same params object that was used for the original run. The backend resumes from a pause checkpoint if one exists in `checkpointSaveDir`. **Resume contract:** call `finetune(finetuningOptions)` only after you have **awaited** `pause()`. There is no status API; await the previous command to know something is done. - **Returns** — `Promise`. 
The handle has `await()` — returns `Promise<{ op: 'finetune', status: 'COMPLETED' | 'PAUSED', stats?: object }>` when training completes or pauses. `stats` may include terminal metrics such as `train_loss`, `val_loss`, `learning_rate`, `global_steps`, and `epochs_completed`. Runtime failures reject `await()` (same failure path as inference) instead of resolving with an error status. - **Progress events** — if `opts.stats` is enabled, finetuning emits `stats` events on the handle with per-iteration metrics (`loss`, `accuracy`, `global_steps`, `current_epoch`, `current_batch`, `total_batches`). `global_steps` is the canonical monotonic step counter; `current_batch`/`total_batches` reflect backend ubatch indexing and may have non-sequential jumps depending on batch/microbatch configuration. @@ -105,7 +106,7 @@ Pauses finetuning and keeps pause checkpoints so the next `finetune()` call can await model.pause() ``` -**Returns** — `Promise`. Once resolved, you can call `finetune()` (no args) to resume. +**Returns** — `Promise`. Once resolved, you can call `finetune(finetuningOptions)` again — pass the same params object — and the backend will resume from the pause checkpoint in `checkpointSaveDir`. ### Stop and start fresh: `cancel()` @@ -208,7 +209,7 @@ The finetuning and pause/resume flow uses **wait conditions** and **events** onl | **Completion** | `handle.await()` resolves on finetune terminal payloads (`status: COMPLETED` or `PAUSED`) and rejects on runtime errors (`Error` event path). | | **Training started** | Event `FinetuningStarted` emitted when the first batch is processed. | | **Request pause** | Calling `pause()` during finetuning invokes `requestPause()` (sets `pauseRequested` and `llama_opt_request_stop()`). 
The binding runs `waitUntilFinetuningPauseComplete()` on a background task, blocking on a condition variable until the JobRunner thread (running the finetune job) signals pause done (checkpoint saved or save failed); the Promise resolves when that wait returns. There is a 5-minute timeout if the checkpoint save never completes. | -| **Resume** | When you call `finetune()` (with no args to use stored params), the JS calls `addon.finetune(params)`. The C++ `finetune()` checks for a pause checkpoint in `params.checkpointSaveDir`; if one exists, it calls `clearPauseRequest()` and resumes from that checkpoint. **Contract:** call `finetune()` only after you have **awaited** `pause()`. No status check in the binding. | +| **Resume** | When you call `finetune(params)` again after `pause()`, the JS calls `addon.finetune(params)`. The C++ `finetune()` checks for a pause checkpoint in `params.checkpointSaveDir`; if one exists, it calls `clearPauseRequest()` and resumes from that checkpoint. **Contract:** call `finetune(params)` only after you have **awaited** `pause()`. JS does not retain stored params — you must pass the same params object on resume. No status check in the binding. | **Wait conditions in C++:** `pauseDoneCv` / `pauseWaitDone` signal when pause has completed. `waitUntilFinetuningPauseComplete()` uses a 5-minute timeout so the caller is not blocked indefinitely if the JobRunner thread never signals. The C++ decides “resume from checkpoint” solely by checking the filesystem: at the start of `finetune(params, logCallback)` it calls `pauseCheckpointExists(params.checkpointSaveDir)`. If true, it calls `clearPauseRequest()` and then loads the latest `pause_checkpoint_step_*` directory and metadata to resume; otherwise it starts fresh. Atomic flags in `TrainingCheckpointState`: `pauseRequested`, `shouldExit`, `pauseCheckpointSaved`, `pauseWaitDone`; the pointer `currentCheckpointState_` in `LlamaModel` is also atomic. 
Together with `pauseDoneMutex` and `pauseDoneCv`, these provide thread-safe coordination between the thread waiting in `waitUntilFinetuningPauseComplete()` (from `pause()`) and the JobRunner thread running the finetune job (which checks flags, saves the checkpoint, and signals completion). @@ -216,16 +217,16 @@ The finetuning and pause/resume flow uses **wait conditions** and **events** onl | API | Backend behavior | |-----|------------------| -| **`finetune(opts?)`** | Normalizes opts (required `validation` object → `validationSplit`, `useEvalDatasetForValidation`), then calls `addon.finetune(params)`. Params come from opts or stored. C++ auto-detects resume when a pause checkpoint exists in `checkpointSaveDir`. Returns a handle; `handle.await()` resolves with terminal payload `status: COMPLETED | PAUSED`, and rejects on runtime errors. | +| **`finetune(opts)`** | Normalizes opts (required `validation` object → `validationSplit`, `useEvalDatasetForValidation`), then calls `addon.finetune(params)`. `opts` is required on every call — JS does not retain stored params. C++ auto-detects resume when a pause checkpoint exists in `checkpointSaveDir`. Returns a handle; `handle.await()` resolves with terminal payload `status: COMPLETED` or `PAUSED`, and rejects on runtime errors. | | **`pause()`** | During finetuning, calls C++ pause flow (`requestPause()` + `waitUntilFinetuningPauseComplete()`), which writes a pause checkpoint and resolves when the pause path completes. | | **`cancel()`** | Calls addon cancel, then removes local `pause_checkpoint_step_*` directories from `checkpointSaveDir` so the next `finetune()` starts fresh. | ### Fresh run vs resume -The choice between a **fresh run** and **resume from pause** is made in C++ inside `LlamaModel::finetune()`. The JS API exposes a single `finetune(opts?)`; resume is determined by the backend from the presence of a pause checkpoint on disk. 
There is no in-process "we were paused" state: if you restart the script and call `finetune(opts)` with the same `checkpointSaveDir`, the backend will resume from any existing pause checkpoint in that directory. +The choice between a **fresh run** and **resume from pause** is made in C++ inside `LlamaModel::finetune()`. The JS API exposes a single `finetune(opts)`; resume is determined by the backend from the presence of a pause checkpoint on disk. There is no in-process "we were paused" or "stored params" state in JS: if you restart the script and call `finetune(opts)` with the same `checkpointSaveDir`, the backend will resume from any existing pause checkpoint in that directory. - **How it’s decided:** After validating params, C++ sets `checkpointDir = params.checkpointSaveDir` (or `"./checkpoints"`) and calls `pauseCheckpointExists(checkpointDir)`. If that returns true, it calls `clearPauseRequest()` and then uses `findLatestPauseCheckpoint()` and `parseCheckpointMetadata()` to set `resumingFromPause` and load resume metadata; the rest of the function branches on `resumingFromPause` (load adapter from checkpoint vs init from params, restore step/epoch, etc.). -- **Params on resume:** The current `params` (from the call—e.g. from the original run when you call `finetune()` with no args) are used for dataset paths, `numberOfEpochs`, learning rate, scheduler, checkpoint dir, and so on. The checkpoint supplies the **position** (epoch, globalStep, currentStep, resumeEpoch, resumeBatch, pausedDuringValidation) and saved LoRA layout (targetModules, loraRank, loraAlpha); `loraInitStd` comes from `params`. +- **Params on resume:** The `params` you pass on the resume call are used for dataset paths, `numberOfEpochs`, learning rate, scheduler, checkpoint dir, and so on — pass the same object you used for the original run. 
The checkpoint supplies the **position** (epoch, globalStep, currentStep, resumeEpoch, resumeBatch, pausedDuringValidation) and saved LoRA layout (targetModules, loraRank, loraAlpha); `loraInitStd` comes from `params`. ### UML: finetune and pause flow (JS → C++) @@ -246,8 +247,8 @@ sequenceDiagram participant Helpers as LlamaFinetuningHelpers participant Queue as outputQueue - User->>LlamaModel: finetune(opts) or finetune() (no args → stored params) - LlamaModel->>LlamaModel: _finetuneActive check, store params, normalize opts (validation object required; dataset requires validation.path; emits validationSplit/useEvalDatasetForValidation/evalDatasetPath) + User->>LlamaModel: finetune(opts) (opts always required, including on resume) + LlamaModel->>LlamaModel: enqueue on exclusiveRunQueue (_run), check _hasActiveResponse, normalize opts (validation object required; dataset requires validation.path; emits validationSplit/useEvalDatasetForValidation/evalDatasetPath) LlamaModel->>Addon: finetune(params) Addon->>Binding: _binding.finetune(handle, params) @@ -270,7 +271,7 @@ sequenceDiagram end LlamaModelCpp->>Queue: queueJobEnded({ op:'finetune', status, stats? }) Queue->>LlamaModel: _addonOutputCallback(...) -> _outputCallback(..., 'JobEnded', 'OnlyOneJob', data) - LlamaModel->>LlamaModel: BaseInference routes JobEnded to QvacResponse.ended(data) + LlamaModel->>LlamaModel: _handleAddonOutputEvent routes JobEnded to active QvacResponse via createJobHandler LlamaModel->>User: handle.await() resolves with { op:'finetune', status:'COMPLETED'|'PAUSED', stats? } (errors reject) ``` @@ -324,7 +325,7 @@ sequenceDiagram | Layer | Component | Role | |-------|-----------|------| -| JS | `index.js` → `LlmLlamacpp` | Public API: `finetune()`, `pause()`, `cancel()`. `pause()` requests a resumable stop; `cancel()` stops and removes `pause_checkpoint_step_*` directories for a fresh next run. 
Normalizes opts: requires `validation`, rejects top-level `evalDatasetPath`, maps dataset validation to `evalDatasetPath`, and emits `validationSplit` / `useEvalDatasetForValidation` before calling addon. Uses `_finetuneActive` and `QvacResponse` (`OnlyOneJob`) for lifecycle; `_addonOutputCallback` maps terminal finetune payloads to `JobEnded`. | +| JS | `index.js` → `LlmLlamacpp` | Public API: `finetune()`, `pause()`, `cancel()`. `pause()` requests a resumable stop; `cancel()` stops and removes `pause_checkpoint_step_*` directories for a fresh next run. Normalizes opts: requires `validation`, rejects top-level `evalDatasetPath`, maps dataset validation to `evalDatasetPath`, and emits `validationSplit` / `useEvalDatasetForValidation` before calling addon. Serializes public API via `exclusiveRunQueue` (`_run`) and tracks the active job via `createJobHandler` (`_job`) plus `_hasActiveResponse`; `_addonOutputCallback` maps terminal finetune payloads to `JobEnded` on the active `QvacResponse`. | | JS | `addon.js` → `LlamaInterface` | Thin wrapper: `finetune(params)` → `_binding.finetune(handle, params)`, `cancel()` → `_binding.cancel(handle)` (used by both `pause()` and `cancel()` in JS). | | C++ | `binding.cpp` | BARE exports: `finetune`, `cancel` → `qvac_lib_inference_addon_llama::*`. | | C++ | `AddonJs.hpp` | Parses JS args, gets `LlamaModel*` via `getLlamaModel(instance)`; `tryGetObject()` for params; builds `Prompt` with `finetuningParams` and `outputCallback`, calls `addonCpp->runJob(any(prompt))` (same path as inference). C++ auto-detects resume via `pauseCheckpointExists(checkpointSaveDir)`. `cancel()`: if `isFinetuneRunning()` then `requestPause()` + `JsAsyncTask::run(waitUntilFinetuningPauseComplete)`, else `cancelJob()`; always returns Promise via `JsAsyncTask::run`. | @@ -362,7 +363,7 @@ The finetuning backend lives in `addon/src/` and uses the llama.cpp optimizer AP 7. 
**Pause request path** — `requestPause()`: if `currentCheckpointState_` (atomic, per instance) is non-null, sets `pauseRequested.store(true)` and `llama_opt_request_stop(ctx)`; returns immediately. Returns `false` if no checkpoint state exists (e.g. training not started yet). 8. **Completion** — On normal finish: `saveLoraAdapter()` writes the final LoRA to `outputParametersDir` and finetune ends as `COMPLETED`. On pause: terminal status is `PAUSED`. On runtime error: C++ throws; JS receives an `Error` event and `handle.await()` rejects. -**Wait conditions and internal state** — `TrainingCheckpointState` holds atomic flags `pauseRequested`, `shouldExit`, `pauseCheckpointSaved`, `pauseWaitDone` and the wait condition `pauseDoneCv` / `pauseDoneMutex`. When `pause()` is called during finetuning, `requestPause()` sets `pauseRequested` and a background task runs `waitUntilFinetuningPauseComplete()`, which blocks on `pauseDoneCv` until the JobRunner thread (running the finetune job) saves the checkpoint and sets `pauseWaitDone`; this gives thread-safe coordination between the two. The binding does not read status (e.g. `isPaused`); resume is driven by calling `finetune()` after awaiting `pause()`; C++ auto-detects a pause checkpoint in `checkpointSaveDir` and resumes. Multiple model instances work correctly (per-instance state, thread-local callback state). Calling `cancel()` uses the same addon cancel entrypoint, then clears pause checkpoints on the JS side to force a fresh subsequent run. +**Wait conditions and internal state** — `TrainingCheckpointState` holds atomic flags `pauseRequested`, `shouldExit`, `pauseCheckpointSaved`, `pauseWaitDone` and the wait condition `pauseDoneCv` / `pauseDoneMutex`. 
When `pause()` is called during finetuning, `requestPause()` sets `pauseRequested` and a background task runs `waitUntilFinetuningPauseComplete()`, which blocks on `pauseDoneCv` until the JobRunner thread (running the finetune job) saves the checkpoint and sets `pauseWaitDone`; this gives thread-safe coordination between the two. The binding does not read status (e.g. `isPaused`); resume is driven by calling `finetune(params)` again after awaiting `pause()` — JS does not retain stored params, so the same params object must be passed on resume. C++ auto-detects a pause checkpoint in `checkpointSaveDir` and resumes. Multiple model instances work correctly (per-instance state, thread-local callback state). Calling `cancel()` uses the same addon cancel entrypoint, then clears pause checkpoints on the JS side to force a fresh subsequent run. --- @@ -405,28 +406,24 @@ Minimal example: load model, run finetuning, wait for completion. 'use strict' const LlmLlamacpp = require('@qvac/llm-llamacpp') -const FilesystemDL = require('@qvac/dl-filesystem') const path = require('bare-path') async function main() { const modelDir = path.resolve('./models') - const loader = new FilesystemDL({ dirPath: modelDir }) - - const model = new LlmLlamacpp( - { - loader, - opts: { stats: true }, - logger: console, - diskPath: modelDir, - modelName: 'Qwen3-0.6B-Q8_0.gguf' + + const model = new LlmLlamacpp({ + files: { + model: [path.join(modelDir, 'Qwen3-0.6B-Q8_0.gguf')] }, - { + config: { gpu_layers: '999', ctx_size: '512', device: 'gpu', flash_attn: 'off' - } - ) + }, + opts: { stats: true }, + logger: console + }) await model.load() @@ -456,7 +453,7 @@ main().catch(console.error) ### 2. Pause and Resume -Start finetuning, wait for training to begin (e.g. fixed sleep), call `pause()`, then resume and wait for completion. After `pause()` resolves you can call `finetune()` (no args). +Start finetuning, wait for training to begin (e.g. 
fixed sleep), call `pause()`, then resume by calling `finetune(finetuneOptions)` again with the same params object and wait for completion. **Run:** `bare examples/simple-lora-finetune-pause-resume.js` @@ -491,7 +488,13 @@ const config = { lora: './finetuned-model-direct/trained-lora-adapter.gguf' } -const model = new LlmLlamacpp(args, config) +const model = new LlmLlamacpp({ + files: { + model: [path.join(modelDir, 'Qwen3-0.6B-Q8_0.gguf')] + }, + config, + logger: console +}) await model.load() const messages = [ @@ -555,7 +558,7 @@ Each checkpoint directory typically contains: ### Resume from Pause -Call `finetune()` (no args) to resume. The addon finds the latest `pause_checkpoint_step_*` in `checkpointSaveDir` and continues training from there, reusing the stored finetuning parameters. The pause checkpoint metadata includes explicit resume cursor fields (`resume_epoch`, `resume_batch`) which are passed directly to the backend's `llama_opt_epoch_resume()`, so training resumes at the exact saved position without deriving it from step counters. If paused during validation, resume starts at the next epoch. **Checkpoint lifecycle:** After loading a pause checkpoint to resume, the backend removes that checkpoint directory so the same run does not resume from it again. When training completes successfully (COMPLETED), any remaining pause checkpoint in `checkpointSaveDir` is also cleared. Pause checkpoints remain on disk only while training is paused (after `pause()` and before the next `finetune()`), unless `cancel()` is called, which clears them. +Call `finetune(finetuningOptions)` again with the same params object to resume. The addon finds the latest `pause_checkpoint_step_*` in `checkpointSaveDir` and continues training from there. JS does not retain a stored params object, so the caller must pass the same params on every call (including resume). 
The pause checkpoint metadata includes explicit resume cursor fields (`resume_epoch`, `resume_batch`) which are passed directly to the backend's `llama_opt_epoch_resume()`, so training resumes at the exact saved position without deriving it from step counters. If paused during validation, resume starts at the next epoch. **Checkpoint lifecycle:** After loading a pause checkpoint to resume, the backend removes that checkpoint directory so the same run does not resume from it again. When training completes successfully (COMPLETED), any remaining pause checkpoint in `checkpointSaveDir` is also cleared. Pause checkpoints remain on disk only while training is paused (after `pause()` and before the next `finetune()`), unless `cancel()` is called, which clears them. --- diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/benchToolsPlacement.js b/packages/qvac-lib-infer-llamacpp-llm/examples/benchToolsPlacement.js index 7a9096c555..407a90a70f 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/benchToolsPlacement.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/benchToolsPlacement.js @@ -1,7 +1,6 @@ 'use strict' const LlmLlamacpp = require('../index') -const FilesystemDL = require('@qvac/dl-filesystem') const path = require('bare-path') const fs = require('bare-fs') const process = require('bare-process') @@ -213,16 +212,15 @@ function makeBaseConfig (toolsAtEnd) { } async function loadModel (dirPath, modelName, config) { - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const model = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config, logger: console, opts: { stats: true } - }, config) + }) await model.load() - return { model, loader } + return { model } } async function runAndCollect (model, prompt, runOptions) { @@ -255,7 +253,7 @@ async function runScenario (dirPath, modelName, opts) { console.log('='.repeat(70)) const config = 
makeBaseConfig(toolsAtEnd) - const { model, loader } = await loadModel(dirPath, modelName, config) + const { model } = await loadModel(dirPath, modelName, config) const cachePath = path.join(dirPath, cacheName) cleanCache(cachePath) @@ -340,7 +338,6 @@ async function runScenario (dirPath, modelName, opts) { } } finally { await model.unload() - await loader.close() cleanCache(cachePath) } diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/showcase/smart-home-finetune.js b/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/showcase/smart-home-finetune.js index 3fc6964c86..e71d3b81ad 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/showcase/smart-home-finetune.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/showcase/smart-home-finetune.js @@ -1,7 +1,7 @@ 'use strict' const LlamaClient = require('../../../index') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const process = require('bare-process') const { downloadModel, formatProgress, createFilteredLogger } = require('../../utils') @@ -20,7 +20,6 @@ const OUTPUT_DIR = './smart-home-lora' async function main () { let client - let loader const { logger: filteredLogger, restore: restoreConsole } = createFilteredLogger() @@ -33,15 +32,7 @@ async function main () { const [modelName, modelDir] = await downloadModel(MODEL.url, MODEL.name) - loader = new FilesystemDL({ dirPath: modelDir }) - - const args = { - loader, - opts: { stats: true }, - logger: filteredLogger, - diskPath: modelDir, - modelName - } + const modelPath = path.join(modelDir, modelName) const config = { gpu_layers: '999', @@ -50,7 +41,12 @@ async function main () { flash_attn: 'off' } - client = new LlamaClient(args, config) + client = new LlamaClient({ + files: { model: [modelPath] }, + config, + logger: filteredLogger, + opts: { stats: true } + }) await client.load() console.log('Model loaded.\n') @@ -119,13 +115,6 @@ async function main () { 
console.error('Failed to unload model during cleanup:', unloadErr) } } - if (loader) { - try { - await loader.close() - } catch (closeErr) { - console.error('Failed to close loader during cleanup:', closeErr) - } - } } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/showcase/smart-home-finetuned-test.js b/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/showcase/smart-home-finetuned-test.js index a94139c3fa..1ab6c77bae 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/showcase/smart-home-finetuned-test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/showcase/smart-home-finetuned-test.js @@ -1,7 +1,6 @@ 'use strict' const LlamaClient = require('../../../index') -const FilesystemDL = require('@qvac/dl-filesystem') const process = require('bare-process') const path = require('bare-path') const fs = require('bare-fs') @@ -143,15 +142,7 @@ async function runScenario (client, messages) { async function main () { const [modelName, modelDir] = await downloadModel(MODEL.url, MODEL.name) - const loader = new FilesystemDL({ dirPath: modelDir }) - - const args = { - loader, - opts: { stats: true }, - logger: console, - diskPath: modelDir, - modelName - } + const modelPath = path.join(modelDir, modelName) const sharedConfig = { device: 'gpu', @@ -184,7 +175,12 @@ async function main () { console.log(' Model: ' + MODEL.name) console.log(separator('=')) - baselineClient = new LlamaClient(args, baselineConfig) + baselineClient = new LlamaClient({ + files: { model: [modelPath] }, + config: baselineConfig, + logger: console, + opts: { stats: true } + }) await baselineClient.load() console.log('Base model loaded (no LoRA).\n') @@ -263,7 +259,12 @@ async function main () { console.log(' Adapter: ' + LORA_ADAPTER) console.log(separator('=')) - client = new LlamaClient(args, config) + client = new LlamaClient({ + files: { model: [modelPath] }, + config, + logger: console, + opts: { stats: true } + }) await client.load() 
console.log('Model + LoRA adapter loaded.\n') @@ -408,8 +409,6 @@ async function main () { console.error('\nTest failed:', error.message) console.error('Stack:', error.stack) process.exit(1) - } finally { - try { await loader.close() } catch (e) { console.error('Failed to close loader:', e) } } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune-multiple-pause-resume.js b/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune-multiple-pause-resume.js index b67ca22f9b..52ed12d2bd 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune-multiple-pause-resume.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune-multiple-pause-resume.js @@ -1,7 +1,6 @@ 'use strict' const LlamaClient = require('../../index') -const FilesystemDL = require('@qvac/dl-filesystem') const process = require('bare-process') const path = require('bare-path') const fs = require('bare-fs') @@ -52,18 +51,10 @@ async function main () { const trainDatasetPath = './examples/input/small_train_HF.jsonl' const evalDatasetPath = './examples/input/small_eval_HF.jsonl' - const loader = new FilesystemDL({ dirPath: modelDir }) + const modelPath = path.join(modelDir, modelName) const { logger: filteredLogger, restore: restoreConsole } = createFilteredLogger() - const args = { - loader, - opts: { stats: true }, - logger: filteredLogger, - diskPath: modelDir, - modelName - } - const config = { device: 'gpu', gpu_layers: '999', @@ -76,7 +67,12 @@ async function main () { try { console.log('=== Multiple Pause/Resume Finetuning Test ===\n') console.log('Loading model...') - client = new LlamaClient(args, config) + client = new LlamaClient({ + files: { model: [modelPath] }, + config, + logger: filteredLogger, + opts: { stats: true } + }) await client.load() console.log('Model loaded successfully\n') diff --git 
a/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune-pause-inference-resume.js b/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune-pause-inference-resume.js index 5448436f43..df784f4385 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune-pause-inference-resume.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune-pause-inference-resume.js @@ -1,7 +1,6 @@ 'use strict' const LlamaClient = require('../../index') -const FilesystemDL = require('@qvac/dl-filesystem') const process = require('bare-process') const path = require('bare-path') const fs = require('bare-fs') @@ -76,18 +75,10 @@ async function main () { const trainDatasetPath = './examples/input/small_train_HF.jsonl' const evalDatasetPath = './examples/input/small_eval_HF.jsonl' - const loader = new FilesystemDL({ dirPath: modelDir }) + const modelPath = path.join(modelDir, modelName) const { logger: filteredLogger, restore: restoreConsole } = createFilteredLogger() - const args = { - loader, - opts: { stats: true }, - logger: filteredLogger, - diskPath: modelDir, - modelName - } - const config = { device: 'gpu', gpu_layers: '999', @@ -100,7 +91,12 @@ async function main () { try { console.log('=== Pause Finetuning, Inference, and Resume Test ===\n') console.log('Loading model...') - client = new LlamaClient(args, config) + client = new LlamaClient({ + files: { model: [modelPath] }, + config, + logger: filteredLogger, + opts: { stats: true } + }) await client.load() console.log('Model loaded successfully\n') @@ -219,7 +215,12 @@ async function main () { } console.log('🔮 Preparing inference 1: Loading model with LoRA adapter...') - inferenceClientWithLora = new LlamaClient(args, inferenceConfigWithLora) + inferenceClientWithLora = new LlamaClient({ + files: { model: [modelPath] }, + config: inferenceConfigWithLora, + logger: filteredLogger, + opts: { stats: true } + }) await 
inferenceClientWithLora.load() console.log('✅ Model with LoRA adapter loaded successfully\n') @@ -246,7 +247,12 @@ async function main () { } console.log('🔮 Preparing inference 2: Loading base model (no LoRA adapters)...') - inferenceClientBase = new LlamaClient(args, inferenceConfigBase) + inferenceClientBase = new LlamaClient({ + files: { model: [modelPath] }, + config: inferenceConfigBase, + logger: filteredLogger, + opts: { stats: true } + }) await inferenceClientBase.load() console.log('✅ Base model loaded successfully\n') diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune-pause-resume.js b/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune-pause-resume.js index 4fa2d1eb70..33e817ff13 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune-pause-resume.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune-pause-resume.js @@ -1,7 +1,6 @@ 'use strict' const LlamaClient = require('../../index') -const FilesystemDL = require('@qvac/dl-filesystem') const process = require('bare-process') const path = require('bare-path') const fs = require('bare-fs') @@ -58,18 +57,10 @@ async function main () { const trainDatasetPath = './examples/input/small_train_HF.jsonl' const evalDatasetPath = './examples/input/small_eval_HF.jsonl' - const loader = new FilesystemDL({ dirPath: modelDir }) + const modelPath = path.join(modelDir, modelName) const { logger: filteredLogger, restore: restoreConsole } = createFilteredLogger() - const args = { - loader, - opts: { stats: true }, - logger: filteredLogger, - diskPath: modelDir, - modelName - } - const config = { device: 'gpu', gpu_layers: '999', @@ -82,7 +73,12 @@ async function main () { try { console.log('=== Pause/Resume Finetuning Test ===\n') console.log('Loading model...') - client = new LlamaClient(args, config) + client = new LlamaClient({ + files: { model: [modelPath] }, + config, + logger: 
filteredLogger, + opts: { stats: true } + }) await client.load() console.log('Model loaded successfully\n') diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune.js b/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune.js index 1116e6a119..c1ff774e7a 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/finetune/simple-lora-finetune.js @@ -1,7 +1,7 @@ 'use strict' const LlmLlamacpp = require('../../index') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const { downloadModel, formatProgress, createFilteredLogger } = require('../utils') const MODEL = { @@ -11,22 +11,13 @@ const MODEL = { async function runFinetuningTests () { let model - let loader const { logger: filteredLogger, restore: restoreConsole } = createFilteredLogger() try { const [modelName, modelDir] = await downloadModel(MODEL.url, MODEL.name) - loader = new FilesystemDL({ dirPath: modelDir }) - - const args = { - loader, - opts: { stats: true }, - logger: filteredLogger, - diskPath: modelDir, - modelName - } + const modelPath = path.join(modelDir, modelName) const config = { gpu_layers: '999', @@ -35,7 +26,12 @@ async function runFinetuningTests () { flash_attn: 'off' } - model = new LlmLlamacpp(args, config) + model = new LlmLlamacpp({ + files: { model: [modelPath] }, + config, + logger: filteredLogger, + opts: { stats: true } + }) await model.load() const finetuneOptions = { @@ -57,12 +53,10 @@ async function runFinetuningTests () { }) const finetuneResult = await handle.await() console.log('Finetune completed:', finetuneResult) - if (args.opts?.stats) { - if (finetuneResult && typeof finetuneResult.stats === 'object' && finetuneResult.stats !== null) { - console.log('✅ Finetune terminal stats:', finetuneResult.stats) - } else { - console.warn('⚠️ opts.stats is enabled, but no finetune terminal stats were 
returned') - } + if (finetuneResult && typeof finetuneResult.stats === 'object' && finetuneResult.stats !== null) { + console.log('✅ Finetune terminal stats:', finetuneResult.stats) + } else { + console.warn('⚠️ opts.stats is enabled, but no finetune terminal stats were returned') } } catch (error) { console.error('Test failed:', error.message) @@ -77,13 +71,6 @@ async function runFinetuningTests () { console.error('Failed to unload model during cleanup:', unloadErr) } } - if (loader) { - try { - await loader.close() - } catch (closeErr) { - console.error('Failed to close loader during cleanup:', closeErr) - } - } } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/multiCache.js b/packages/qvac-lib-infer-llamacpp-llm/examples/multiCache.js index 96ff7373f7..2041d82395 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/multiCache.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/multiCache.js @@ -1,7 +1,7 @@ 'use strict' const LlmLlamacpp = require('../index') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const process = require('bare-process') const { downloadModel } = require('./utils') @@ -15,17 +15,8 @@ async function main () { 'Llama-3.2-1B-Instruct-Q4_0.gguf' ) - // 2. Initializing data loader - const fsDL = new FilesystemDL({ dirPath }) - - // 3. Configuring model settings - const args = { - loader: fsDL, - opts: { stats: true }, - logger: console, - diskPath: dirPath, - modelName - } + // 2. Configuring model settings + const modelPath = path.join(dirPath, modelName) const config = { device: 'gpu', @@ -33,12 +24,17 @@ async function main () { ctx_size: '10000' } - // 4. Loading model - const model = new LlmLlamacpp(args, config) + // 3. Loading model + const model = new LlmLlamacpp({ + files: { model: [modelPath] }, + config, + logger: console, + opts: { stats: true } + }) await model.load() try { - // 5. First conversation - no cache will be used. One shot inference + // 4. 
First conversation - no cache will be used. One shot inference const messages = [ { role: 'system', @@ -73,7 +69,7 @@ async function main () { console.log(`Inference stats: ${JSON.stringify(response1.stats)}`) console.log('\n') - // 6. Switching to a new session with cache1.bin file + // 5. Switching to a new session with cache1.bin file const messages2 = [ { role: 'user', @@ -96,7 +92,7 @@ async function main () { console.log(`Inference stats: ${JSON.stringify(response2.stats)}`) console.log('\n') - // 7. Continuing conversation with cache1.bin + // 6. Continuing conversation with cache1.bin const messages3 = [ { role: 'user', @@ -123,9 +119,8 @@ async function main () { console.error('Error occurred:', errorMessage) console.error('Error details:', error) } finally { - // 8. Cleaning up resources + // 7. Cleaning up resources await model.unload() - await fsDL.close() } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/multiModal.js b/packages/qvac-lib-infer-llamacpp-llm/examples/multiModal.js index 2fbfff85b4..d3a666e857 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/multiModal.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/multiModal.js @@ -1,7 +1,7 @@ 'use strict' const LlmLlamacpp = require('../index') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const fs = require('bare-fs') const process = require('bare-process') const { downloadModel } = require('./utils') @@ -16,23 +16,13 @@ async function main () { 'SmolVLM2-500M-Video-Instruct-Q8_0.gguf' ) - const [projectionModel] = await downloadModel( + const [projModelName] = await downloadModel( 'https://huggingface.co/ggml-org/SmolVLM2-500M-Video-Instruct-GGUF/resolve/main/mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf', 'mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf' ) - // 2. Initializing data loader - const fsDL = new FilesystemDL({ dirPath }) - - // 3. 
Configuring model settings - const args = { - loader: fsDL, - opts: { stats: true }, - logger: console, - diskPath: dirPath, - modelName, - projectionModel - } + // 2. Configuring model settings + const modelPath = path.join(dirPath, modelName) const config = { device: 'gpu', @@ -40,16 +30,21 @@ async function main () { ctx_size: '2048' } - // 4. Loading model - const model = new LlmLlamacpp(args, config) + // 3. Loading model + const model = new LlmLlamacpp({ + files: { model: [modelPath], projectionModel: path.join(dirPath, projModelName) }, + config, + logger: console, + opts: { stats: true } + }) await model.load() - // 5. Preparing media. We will use both the path and the buffer in different inferences + // 4. Preparing media. We will use both the path and the buffer in different inferences const imageFilePath = 'media/news-paper.jpg' const imageBuffer = new Uint8Array(fs.readFileSync(imageFilePath)) try { - // 6. First inference with image buffer + // 5. First inference with image buffer (Uint8Array) const messages1 = [ { role: 'system', @@ -81,7 +76,7 @@ async function main () { console.log(`Inference stats: ${JSON.stringify(response1.stats)}`) console.log('\n') - // 7. Second inference with image file path + // 6. Second inference with image file path (string) const messages2 = [ { role: 'system', @@ -94,7 +89,7 @@ async function main () { }, { role: 'user', - content: 'what is in the image?' + content: 'Describe the image in one sentence.' } ] @@ -117,9 +112,8 @@ async function main () { console.error('Error occurred:', errorMessage) console.error('Error details:', error) } finally { - // 8. Cleaning up resources + // 7. 
Cleaning up resources await model.unload() - await fsDL.close() } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/nativelog.js b/packages/qvac-lib-infer-llamacpp-llm/examples/nativelog.js index f0458cf788..ecdba6aa43 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/nativelog.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/nativelog.js @@ -1,8 +1,8 @@ 'use strict' const LlmLlamacpp = require('../index') -const FilesystemDL = require('@qvac/dl-filesystem') const { setLogger, releaseLogger } = require('../addonLogging') +const path = require('bare-path') const process = require('bare-process') const { downloadModel } = require('./utils') @@ -36,17 +36,8 @@ async function main () { 'Llama-3.2-1B-Instruct-Q4_0.gguf' ) - // 3. Initializing data loader - const fsDL = new FilesystemDL({ dirPath }) - - // 4. Configuring model settings - const args = { - loader: fsDL, - opts: { stats: true }, - logger: console, - diskPath: dirPath, - modelName - } + // 3. Configuring model settings + const modelPath = path.join(dirPath, modelName) const config = { device: 'gpu', @@ -55,12 +46,17 @@ async function main () { verbosity: '2' } - // 5. Loading model - const model = new LlmLlamacpp(args, config) + // 4. Loading model + const model = new LlmLlamacpp({ + files: { model: [modelPath] }, + config, + logger: console, + opts: { stats: true } + }) await model.load() try { - // 6. Running inference with conversation prompt + // 5. Running inference with conversation prompt const prompt = [ { role: 'system', @@ -98,9 +94,8 @@ async function main () { console.error('Error occurred:', errorMessage) console.error('Error details:', error) } finally { - // 7. Cleaning up resources + // 6. 
Cleaning up resources await model.unload() - await fsDL.close() releaseLogger() } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/quickstart.js b/packages/qvac-lib-infer-llamacpp-llm/examples/quickstart.js index 551c867abe..09a664945d 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/quickstart.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/quickstart.js @@ -1,7 +1,7 @@ 'use strict' const LlmLlamacpp = require('../index') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const process = require('bare-process') const { downloadModel } = require('./utils') @@ -15,17 +15,8 @@ async function main () { 'Llama-3.2-1B-Instruct-Q4_0.gguf' ) - // 2. Initializing data loader - const fsDL = new FilesystemDL({ dirPath }) - - // 3. Configuring model settings - const args = { - loader: fsDL, - opts: { stats: true }, - logger: console, - diskPath: dirPath, - modelName - } + // 2. Configuring model settings + const modelPath = path.join(dirPath, modelName) const config = { device: 'gpu', @@ -33,12 +24,17 @@ async function main () { ctx_size: '1024' } - // 4. Loading model - const model = new LlmLlamacpp(args, config) + // 3. Loading model + const model = new LlmLlamacpp({ + files: { model: [modelPath] }, + config, + logger: console, + opts: { stats: true } + }) await model.load() try { - // 5. Running inference with conversation prompt + // 4. Running inference with conversation prompt const prompt = [ { role: 'system', @@ -76,9 +72,8 @@ async function main () { console.error('Error occurred:', errorMessage) console.error('Error details:', error) } finally { - // 6. Cleaning up resources + // 5. 
Cleaning up resources await model.unload() - await fsDL.close() } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/salamandraTA.js b/packages/qvac-lib-infer-llamacpp-llm/examples/salamandraTA.js index 81b88e3599..56136355b7 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/salamandraTA.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/salamandraTA.js @@ -1,7 +1,7 @@ 'use strict' const LlmLlamacpp = require('../index') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const process = require('bare-process') const { downloadModel } = require('./utils') @@ -15,17 +15,8 @@ async function main () { 'salamandrata_2b_inst_q4.gguf' ) - // 2. Initializing data loader - const fsDL = new FilesystemDL({ dirPath }) - - // 3. Configuring model settings - const args = { - loader: fsDL, - opts: { stats: true }, - logger: console, - diskPath: dirPath, - modelName - } + // 2. Configuring model settings + const modelPath = path.join(dirPath, modelName) const config = { device: 'gpu', @@ -33,12 +24,17 @@ async function main () { ctx_size: '1024' } - // 4. Loading model - const model = new LlmLlamacpp(args, config) + // 3. Loading model + const model = new LlmLlamacpp({ + files: { model: [modelPath] }, + config, + logger: console, + opts: { stats: true } + }) await model.load() try { - // 5. Running translation inference + // 4. Running translation inference const messages = [ { role: 'system', @@ -64,9 +60,8 @@ async function main () { console.error('Error occurred:', errorMessage) console.error('Error details:', error) } finally { - // 6. Cleaning up resources + // 5. 
Cleaning up resources await model.unload() - await fsDL.close() } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/simple-lora-inference.js b/packages/qvac-lib-infer-llamacpp-llm/examples/simple-lora-inference.js index 01ceaf97bb..437dfd2927 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/simple-lora-inference.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/simple-lora-inference.js @@ -1,7 +1,6 @@ 'use strict' const LlamaClient = require('../index') -const FilesystemDL = require('@qvac/dl-filesystem') const process = require('bare-process') const path = require('bare-path') const fs = require('bare-fs') @@ -112,15 +111,7 @@ async function main () { const loraAdapterPath = './lora_checkpoints/checkpoint_step_00000006/model.gguf' - const loader = new FilesystemDL({ dirPath: modelDir }) - - const args = { - loader, - opts: { stats: true }, - logger: console, - diskPath: modelDir, - modelName - } + const modelPath = path.join(modelDir, modelName) const config = { device: 'gpu', @@ -133,7 +124,12 @@ async function main () { let client try { - client = new LlamaClient(args, config) + client = new LlamaClient({ + files: { model: [modelPath] }, + config, + logger: console, + opts: { stats: true } + }) await client.load() const messages = [ diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/testToolRemoval.js b/packages/qvac-lib-infer-llamacpp-llm/examples/testToolRemoval.js index 85527f8d5c..650a72784a 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/testToolRemoval.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/testToolRemoval.js @@ -1,7 +1,6 @@ 'use strict' const LlmLlamacpp = require('../index') -const FilesystemDL = require('@qvac/dl-filesystem') const path = require('bare-path') const fs = require('bare-fs') const process = require('bare-process') @@ -65,16 +64,15 @@ function extractToolCalls (response) { } async function loadModel (dirPath, modelName, config) { - const loader = new FilesystemDL({ dirPath }) + 
const modelPath = path.join(dirPath, modelName) const model = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config, logger: console, opts: { stats: true } - }, config) + }) await model.load() - return { model, loader } + return { model } } async function runAndCollect (model, prompt, runOptions) { @@ -102,7 +100,7 @@ async function main () { tools_at_end: 'true' } - const { model, loader } = await loadModel(dirPath, modelName, config) + const { model } = await loadModel(dirPath, modelName, config) const cachePath = path.join(dirPath, 'test-tool-removal.bin') try { fs.unlinkSync(cachePath) } catch (_) {} @@ -195,7 +193,6 @@ async function main () { : ' FAILURES DETECTED — removed tools leaked through the cache') } finally { await model.unload() - await loader.close() try { fs.unlinkSync(cachePath) } catch (_) {} } } @@ -221,7 +218,7 @@ async function mainInSystem () { tools_at_end: 'false' } - const { model, loader } = await loadModel(dirPath, modelName, config) + const { model } = await loadModel(dirPath, modelName, config) const cachePath = path.join(dirPath, 'test-tool-removal-insystem.bin') try { fs.unlinkSync(cachePath) } catch (_) {} @@ -320,7 +317,6 @@ async function mainInSystem () { : ' FAILURES DETECTED — removed tools leaked from conversation history') } finally { await model.unload() - await loader.close() try { fs.unlinkSync(cachePath) } catch (_) {} } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/examples/toolCalling.js b/packages/qvac-lib-infer-llamacpp-llm/examples/toolCalling.js index bcbe1c604c..578d9bbdb9 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/examples/toolCalling.js +++ b/packages/qvac-lib-infer-llamacpp-llm/examples/toolCalling.js @@ -1,7 +1,7 @@ 'use strict' const LlmLlamacpp = require('../index') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const process = require('bare-process') const { downloadModel } = require('./utils') @@ -80,17 
+80,8 @@ async function main () { 'Qwen3-1.7B-Q4_0.gguf' ) - // 2. Initializing data loader - const fsDL = new FilesystemDL({ dirPath }) - - // 3. Configuring model settings - const args = { - loader: fsDL, - opts: { stats: true }, - logger: console, - diskPath: dirPath, - modelName - } + // 2. Configuring model settings + const modelPath = path.join(dirPath, modelName) const config = { device: 'gpu', @@ -99,12 +90,17 @@ async function main () { tools: 'true' } - // 4. Loading model - const model = new LlmLlamacpp(args, config) + // 3. Loading model + const model = new LlmLlamacpp({ + files: { model: [modelPath] }, + config, + logger: console, + opts: { stats: true } + }) await model.load() try { - // 5. Defining tool queries with function schemas + // 4. Defining tool queries with function schemas const systemMessageAmbiguous = { role: 'system', content: 'You are a helpful assistant with access to various tools. If request is ambiguous,skip tool calls.' @@ -276,7 +272,7 @@ async function main () { } ] - // 6. Running tool calling queries + // 5. Running tool calling queries const queries = [ { name: 'Query 1: Complex tool calling with multiple parameters', prompt: toolQuery1 }, { name: 'Query 2: Math calculation and ambiguous query', prompt: toolQuery2 }, @@ -296,9 +292,8 @@ async function main () { console.error('Error occurred:', errorMessage) console.error('Error details:', error) } finally { - // 7. Cleaning up resources + // 6. 
Cleaning up resources await model.unload() - await fsDL.close() } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/index.d.ts b/packages/qvac-lib-infer-llamacpp-llm/index.d.ts index a9b2b9506f..2fa44dc9be 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/index.d.ts +++ b/packages/qvac-lib-infer-llamacpp-llm/index.d.ts @@ -1,26 +1,18 @@ -import BaseInference, { - ReportProgressCallback -} from '@qvac/infer-base/WeightsProvider/BaseInference' import type { QvacResponse } from '@qvac/infer-base' import type QvacLogger from '@qvac/logging' export type NumericLike = number | `${number}` -export interface Loader { - ready(): Promise - close(): Promise - getStream(path: string): Promise> - download( - path: string, - opts: { diskPath: string; progressReporter?: unknown } - ): Promise<{ await(): Promise }> - getFileSize?(path: string): Promise -} - export interface AddonMessage { type: 'text' input: string prefill?: boolean + /** + * Per-call sampling overrides forwarded by `LlmLlamacpp.run()` from + * `RunOptions.generationParams`. Carried on the `text` message and consumed + * by the native binding so each `runJob` can use a different temp / top_p / + * seed / etc. without re-loading the model. 
+ */ generationParams?: GenerationParams cacheKey?: string saveCacheToDisk?: boolean @@ -31,7 +23,6 @@ export interface AddonMediaMessage { } export type AddonRunJobMessage = AddonMessage | AddonMediaMessage - export interface Addon { loadWeights(data: { filename: string; chunk: Uint8Array | null; completed: boolean }, logger?: QvacLogger): Promise activate(): Promise @@ -69,14 +60,10 @@ export interface LlamaConfig { } export interface LlmLlamacppArgs { - loader: Loader + files: { model: string[]; projectionModel?: string } + config: LlamaConfig logger?: QvacLogger | Console | null opts?: { stats?: boolean } - diskPath?: string - modelName: string - projectionModel?: string - modelPath?: string - modelConfig?: Record } export interface UserTextMessage { @@ -122,10 +109,6 @@ export interface RunOptions { saveCacheToDisk?: boolean } -export interface DownloadWeightsOptions { - closeLoader?: boolean -} - export interface RuntimeStats { TTFT: number TPS: number @@ -136,12 +119,6 @@ export interface RuntimeStats { backendDevice: 'cpu' | 'gpu' } -export interface DownloadResult { - filePath: string | null - error: boolean - completed: boolean -} - export interface FinetuneValidationNone { type: 'none' } @@ -164,53 +141,67 @@ export type FinetuneValidation = | FinetuneValidationDataset export interface FinetuneOptions { - /** Path to training dataset file (.jsonl for SFT, .txt for causal). */ + /** Path to the training dataset file (e.g. `.jsonl` for SFT, `.txt` for causal). */ trainDatasetDir: string - /** How to run validation. */ + /** + * How to run validation. Required — there is no default. + * `{ type: 'none' }` disables validation. `{ type: 'split', fraction? }` reserves + * a fraction of the training data (default 0.05). `{ type: 'dataset', path }` + * uses a separate eval dataset file. + */ validation: FinetuneValidation - /** Directory (or file path ending in .gguf) for the final LoRA adapter. 
*/ + /** Directory (or file path) where the final LoRA adapter will be written. */ outputParametersDir: string /** Number of training epochs. Default 1. */ numberOfEpochs?: number /** Initial learning rate. Default 1e-4. */ learningRate?: number - /** Training sequence length. Default 128. */ + /** Training sequence length (tokens). Default 128. */ contextLength?: number - /** Backend n_batch (tokens per batch). Must be >= microBatchSize and divisible by it. Default 128. */ + /** + * Backend `n_batch` (number of tokens processed per batch). Must be `>= microBatchSize` + * and divisible by it when both are set. Default 128. + */ batchSize?: number - /** Backend n_ubatch (micro-batch size). Must be <= batchSize. Default 128. */ + /** + * Backend `n_ubatch` (micro-batch size). Adjusted to gcd(datasetSampleCount, requested) + * if needed. Must be `<= batchSize` when both are set. Default 128. + */ microBatchSize?: number - /** Use SFT (chat) mode when true; causal (next-token) when false. Default false. */ + /** Use SFT (chat) mode if `true`, causal mode otherwise. Default `false`. */ assistantLossOnly?: boolean - /** Comma-separated LoRA target modules (e.g. 'attn_q,attn_k,attn_v,attn_o'). Default: attention Q/K/V/O. */ + /** + * Comma-separated target modules (e.g. `attn_q,attn_k,attn_v,attn_o,ffn_gate,ffn_up,ffn_down,output`, + * or `all`). Default attention Q, K, V, O only. + */ loraModules?: string /** LoRA rank. Default 8. */ loraRank?: number - /** LoRA alpha (scaling factor). Default 16.0. */ + /** LoRA alpha (scaling). Default 16.0. */ loraAlpha?: number /** LoRA init standard deviation. Default 0.02. */ loraInitStd?: number /** Seed for LoRA weight initialization (0 = non-deterministic). Default 42. */ loraSeed?: number - /** Directory for checkpoints. Default './checkpoints'. */ + /** Directory where checkpoints (and pause checkpoints) are saved. Default `./checkpoints`. 
*/ checkpointSaveDir?: string - /** Save a checkpoint every N optimizer steps (0 = only on pause). Default 0. */ + /** Save a checkpoint every N steps (0 = pause checkpoints only). Default 0. */ checkpointSaveSteps?: number - /** Path to a custom chat template file (for SFT). */ + /** Path to a chat template file (used in SFT mode). Default `""`. */ chatTemplatePath?: string - /** Learning rate scheduler: 'constant', 'cosine', or 'linear'. Default 'cosine'. */ + /** Learning-rate schedule. Default `"cosine"`. */ lrScheduler?: 'constant' | 'cosine' | 'linear' - /** Minimum learning rate (for cosine/linear schedulers). Default 0. */ + /** Minimum learning rate (used by cosine/linear schedulers). Default 0. */ lrMin?: number - /** Warmup ratio (0–1). Requires warmupRatioSet: true. Default 0.1. */ + /** Warmup ratio (0–1). Requires `warmupRatioSet: true` to take effect. Default 0.1. */ warmupRatio?: number - /** When true, compute warmup steps from warmupRatio. */ + /** When `true`, warmup steps = `warmupRatio × totalSteps`. Default `false`. */ warmupRatioSet?: boolean - /** Explicit warmup steps (used when warmupStepsSet is true). Default 0. */ + /** Explicit warmup steps (used when `warmupStepsSet: true`). Default 0. */ warmupSteps?: number - /** When true, use warmupSteps directly instead of ratio. */ + /** When `true`, use `warmupSteps` directly instead of `warmupRatio`. Default `false`. */ warmupStepsSet?: boolean - /** Weight decay. Default 0.01. */ + /** Optimizer weight decay. Default 0.01. 
*/ weightDecay?: number } @@ -254,43 +245,21 @@ export interface FinetuneResult { stats?: FinetuneStats } -export default class LlmLlamacpp extends BaseInference { - protected addon: Addon - - constructor( - args: LlmLlamacppArgs, - config: LlamaConfig - ) - _load( - closeLoader?: boolean, - onDownloadProgress?: ReportProgressCallback | ((bytes: number) => void) - ): Promise +export default class LlmLlamacpp { + protected addon: Addon | null + opts: { stats?: boolean } + logger: QvacLogger + state: { configLoaded: boolean } - load( - closeLoader?: boolean, - onDownloadProgress?: ReportProgressCallback | ((bytes: number) => void) - ): Promise - - downloadWeights( - onDownloadProgress?: (progress: Record, opts: DownloadWeightsOptions) => any, - opts?: DownloadWeightsOptions - ): Promise> - - _downloadWeights( - onDownloadProgress?: (progress: Record, opts: DownloadWeightsOptions) => any, - opts?: DownloadWeightsOptions - ): Promise> - - _runInternal(prompt: Message[], runOptions?: RunOptions): Promise + constructor(args: LlmLlamacppArgs) + load(): Promise run(prompt: Message[], runOptions?: RunOptions): Promise - finetune(finetuningOptions: FinetuneOptions): Promise - cancel(): Promise - + pause(): Promise unload(): Promise - + getState(): { configLoaded: boolean } } -export { ReportProgressCallback, QvacResponse, FinetuneHandle, FinetuneProgressStats, FinetuneOptions, FinetuneValidation } +export { QvacResponse, FinetuneHandle, FinetuneProgressStats, FinetuneOptions, FinetuneValidation } diff --git a/packages/qvac-lib-infer-llamacpp-llm/index.js b/packages/qvac-lib-infer-llamacpp-llm/index.js index 9d4e7c1433..91d7836f16 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/index.js +++ b/packages/qvac-lib-infer-llamacpp-llm/index.js @@ -2,12 +2,9 @@ const fs = require('bare-fs') const path = require('bare-path') - -const BaseInference = require('@qvac/infer-base/WeightsProvider/BaseInference') -const WeightsProvider = 
require('@qvac/infer-base/WeightsProvider/WeightsProvider') -const { LlamaInterface } = require('./addon') - -const noop = () => { } +const QvacLogger = require('@qvac/logging') +const { createJobHandler, exclusiveRunQueue } = require('@qvac/infer-base') +const { LlamaInterface, mapAddonEvent } = require('./addon') const RUN_BUSY_ERROR_MESSAGE = 'Cannot set new job: a job is already set or being processed' @@ -99,187 +96,251 @@ function normalizeFinetuneParams (opts) { } /** - * GGML client implementation for Llama LLM model + * Picks the primary GGUF path from an ordered file list. + * + * For sharded models the caller passes + * `[tensors.txt, shard-00001-of-N.gguf, ..., shard-N-of-N.gguf]`. + * The first entry matching the shard regex is returned so the value matches + * the C++ `GGUFShards::expandGGUFIntoShards` regex contract. + * For non-sharded single-file models the only entry is returned. + * + * @param {string[]} files - ordered array of absolute paths + * @returns {string} the primary GGUF path */ -class LlmLlamacpp extends BaseInference { - /** - * Creates an instance of LlmLlamacpp. - * @constructor - * @param {Object} args - Setup parameters including loader, logger, disk path, and model name - * @param {Loader} args.loader - External loader instance - * @param {Logger} [args.logger] - Optional structured logger - * @param {Object} [args.opts] - Optional inference options - * @param {string} args.diskPath - Disk directory where model files are stored - * @param {string} args.modelName - Name of the model directory or file. The usage of a sharded - * filename (e.g. "llama-00001-of-00004.gguf") will trigger asynchronous loading of the weights for - * all remaining files. 
- * @param {string} args.projectionModel - Name of the projection model directory or file - * @param {Object} config - Model-specific configuration settings - */ - constructor ( - { opts = {}, loader, logger = null, diskPath = '.', modelName, projectionModel }, - config - ) { - super({ logger, opts }) +function pickPrimaryGgufPath (files) { + const SHARD_REGEX = /-\d+-of-\d+\.gguf$/ + return files.find((p) => SHARD_REGEX.test(p)) || files[0] +} + +class LlmLlamacpp { + constructor ({ files, config, logger = null, opts = {} }) { + if (!files || !Array.isArray(files.model) || files.model.length === 0) { + throw new TypeError('files.model must be a non-empty array of absolute paths') + } + this._files = files.model + this._projectionModelPath = files.projectionModel || '' this._config = config - this._diskPath = diskPath - this._modelName = modelName - this._projectionModel = projectionModel - this._shards = WeightsProvider.expandGGUFIntoShards(this._modelName) - this.weightsProvider = new WeightsProvider(loader, this.logger) + this.logger = new QvacLogger(logger) + this.opts = opts + // The cancel closure dereferences `this.addon` lazily, so it is safe even though + // `this.addon` is `null` at construction time — it is only invoked from + // `response.cancel()` after `_load()` has assigned the addon. The optional chain + // also makes a stale `response.cancel()` after `unload()` a no-op. + this._job = createJobHandler({ cancel: () => this.addon?.cancel() }) + this._run = exclusiveRunQueue() + this.addon = null this._checkpointSaveDir = null this._hasActiveResponse = false - this._skipNextRuntimeStats = false - this._originalLogger = this.logger - this._baseOutputCallback = this._outputCallback.bind(this) + // Stateful flag carried across `mapAddonEvent` calls so the post-finetune + // TPS trailer the C++ addon emits is not mistaken for a fresh inference + // result. Lives on the model so unit tests can poke at it. 
+ this._addonEventState = { skipNextRuntimeStats: false } + this.state = { configLoaded: false } } - /** - * Load model weights, initialize the native addon, and activate the model. - * @param {boolean} [closeLoader=true] - Whether to close the loader when complete - * @param {ProgressReportCallback} [onDownloadProgress] - Optional byte-level progress callback - * @returns {Promise} - */ - async _load (closeLoader = true, onDownloadProgress = noop) { + async load () { + if (this.state.configLoaded) return + await this._load() + this.state.configLoaded = true + } + + async _load () { this.logger.info('Starting model load') + const primaryGgufPath = pickPrimaryGgufPath(this._files) + const configurationParams = { + path: primaryGgufPath, + projectionPath: this._projectionModelPath, + config: { ...this._config } + } + this.addon = this._createAddon(configurationParams) try { - const configForLoad = { ...this._config } - - const configurationParams = { - path: path.join(this._diskPath, this._modelName), - projectionPath: this._projectionModel ? path.join(this._diskPath, this._projectionModel) : '', - config: configForLoad - } - - this.logger.info('Creating addon with configuration:', configurationParams) - this.addon = this._createAddon(configurationParams) - - if (this._shards !== null) { - await this._loadWeights(onDownloadProgress) - } else { - await this.downloadWeights(onDownloadProgress, { closeLoader }) + if (this._files.length > 1) { + await this._streamShards() } - - this.logger.info('Activating addon') await this.addon.activate() + } catch (loadError) { + // Best-effort cleanup of the partially-initialized addon so a subsequent + // load() does not leak a zombie native instance. 
+ try { await this.addon?.unload?.() } catch (_) {} + this.addon = null + throw loadError + } + this.logger.info('Model load completed successfully') + } - this.logger.info('Model load completed successfully') - } catch (error) { - this.logger.error('Error during model load:', error) - throw error + async _streamShards () { + for (const filePath of this._files) { + const filename = path.basename(filePath) + const stream = fs.createReadStream(filePath) + for await (const chunk of stream) { + await this.addon.loadWeights({ filename, chunk, completed: false }) + } + await this.addon.loadWeights({ filename, chunk: null, completed: true }) + this.logger.info(`Streamed weights for ${filename}`) } } /** - * Download the model weight files and return the local path to the primary file. - * @param {ProgressReportCallback} [onDownloadProgress] - Callback invoked with bytes downloaded - * @returns {Promise<{filePath: string, completed: boolean, error: boolean}[]>} Local file path for the model weights + * Public API entrypoint for inference. + * @param {Message[]} prompt - Input prompt array of messages + * @param {RunOptions} [runOptions] - Optional run settings (prefill, generationParams, cacheKey, saveCacheToDisk) + * @returns {Promise} */ - async _downloadWeights (onDownloadProgress, opts) { - return await this.weightsProvider.downloadFiles( - this._projectionModel ? [this._modelName, this._projectionModel] : [this._modelName], - this._diskPath, - { - closeLoader: opts.closeLoader, - onDownloadProgress - } - ) + async run (prompt, runOptions = {}) { + return this._run(() => this._runInternal(prompt, runOptions)) } - async _loadWeights (reportProgressCallback) { - const onChunk = async (chunkedWeightsData) => { - this.addon.loadWeights(chunkedWeightsData, this.logger) + async _runInternal (prompt, runOptions = {}) { + if (!this.addon) { + throw new Error('Addon not initialized. 
Call load() first.') + } + if (this._hasActiveResponse) { + throw new Error(RUN_BUSY_ERROR_MESSAGE) } - await this.weightsProvider.streamFiles(this._shards, onChunk, reportProgressCallback) - } - _isSuppressedNoResponseLog (args) { - const message = args.map(arg => { - if (typeof arg === 'string') return arg - if (arg && typeof arg === 'object') { - if (arg.message && typeof arg.message === 'string') return arg.message - return JSON.stringify(arg) - } - return String(arg) - }).join(' ') - return message && message.includes('No response found for job') - } + if (!Array.isArray(prompt)) { + throw new TypeError('Prompt input must be Message[]') + } + const { prefill, generationParams, cacheKey, saveCacheToDisk } = normalizeRunOptions(runOptions) + + this.logger.info('Starting inference with prompt:', prompt) + + const textMessages = [] + const mediaItems = [] - _createFilteredLogger (sourceLogger) { - const filteredLogger = sourceLogger ? Object.create(Object.getPrototypeOf(sourceLogger)) : {} - Object.assign(filteredLogger, sourceLogger) + for (const message of prompt) { + if (message.role === 'user' && + message.type === 'media' && + message.content instanceof Uint8Array) { + mediaItems.push(message.content) + textMessages.push({ ...message, content: '' }) + } else { + textMessages.push(message) + } + } - const originalInfo = sourceLogger && typeof sourceLogger.info === 'function' - ? sourceLogger.info.bind(sourceLogger) - : null - const originalWarn = sourceLogger && typeof sourceLogger.warn === 'function' - ? 
sourceLogger.warn.bind(sourceLogger) - : null + const promptMessages = [] - filteredLogger.info = (...args) => { - if (this._isSuppressedNoResponseLog(args)) return - if (originalInfo) return originalInfo.apply(sourceLogger, args) + for (const mediaData of mediaItems) { + promptMessages.push({ type: 'media', content: mediaData }) } - filteredLogger.warn = (...args) => { - if (this._isSuppressedNoResponseLog(args)) return - if (originalWarn) return originalWarn.apply(sourceLogger, args) + promptMessages.push({ + type: 'text', + input: JSON.stringify(textMessages), + prefill, + generationParams, + cacheKey, + saveCacheToDisk + }) + + const response = this._job.start() + + let accepted + try { + accepted = await this.addon.runJob(promptMessages) + } catch (error) { + this._job.fail(error) + throw error + } + if (!accepted) { + this._job.fail(new Error(RUN_BUSY_ERROR_MESSAGE)) + throw new Error(RUN_BUSY_ERROR_MESSAGE) } - return filteredLogger + this._hasActiveResponse = true + const finalized = response.await().finally(() => { this._hasActiveResponse = false }) + finalized.catch((err) => { + this.logger?.warn?.('Inference response rejected:', err?.message || err) + }) + response.await = () => finalized + + this.logger.info('Inference job started successfully') + return response } - _handleAddonOutputEvent (originalOutputCb, originalLoggerRef, instance, eventType, jobId, data, extra) { - if (eventType === 'JobEnded' || eventType === 'Error') { - this._hasActiveResponse = false + async finetune (finetuningOptions = undefined) { + if (!this.addon) { + throw new Error('Addon not initialized. 
Call load() first.') + } + if (!finetuningOptions) { + throw new Error('Finetuning parameters are required.') + } + if (finetuningOptions.checkpointSaveDir) { + this._checkpointSaveDir = finetuningOptions.checkpointSaveDir } + const paramsToSend = normalizeFinetuneParams(finetuningOptions) + this.logger.info('finetune() called') + this.logger.info('Finetuning parameters:', finetuningOptions) + + return this._run(async () => { + if (this._hasActiveResponse) { + throw new Error(RUN_BUSY_ERROR_MESSAGE) + } + + const response = this._job.start() + let accepted + try { + accepted = await this.addon.finetune(paramsToSend) + } catch (err) { + this._job.fail(err) + throw err + } + + if (!accepted) { + this._job.fail(new Error(RUN_BUSY_ERROR_MESSAGE)) + throw new Error(RUN_BUSY_ERROR_MESSAGE) + } + this._hasActiveResponse = true + const finalized = response.await().finally(() => { this._hasActiveResponse = false }) + finalized.catch((err) => { + this.logger?.warn?.('Finetune response rejected:', err?.message || err) + }) + response.await = () => finalized + return response + }) + } + + _handleAddonOutputEvent (eventType, data, error) { if (eventType === 'LogMsg') { const logMsg = typeof data === 'string' ? 
data : (data?.message || JSON.stringify(data)) - originalLoggerRef?.info?.(logMsg) + this.logger?.info?.(logMsg) return } - if (originalOutputCb) { - return originalOutputCb(instance, eventType, jobId, data, extra) + if (eventType === 'Error') { + this.logger.error('Job failed with error:', error) + this._job.fail(error) + } else if (eventType === 'Output') { + this._job.output(data) + } else if (eventType === 'FinetuneProgress') { + if (this.opts.stats && data && data.stats) { + this._job.active?.updateStats(data.stats) + } + } else if (eventType === 'JobEnded') { + this.logger.info('Job completed') + const isFinetuneTerminal = data && typeof data === 'object' && data.op === 'finetune' && typeof data.status === 'string' + if (isFinetuneTerminal) { + this._job.end(null, data) + } else { + this._job.end(this.opts.stats ? data : null) + } } } - /** - * Public API entrypoint for inference. - * @param {Message[]} prompt - Input prompt array of messages - * @param {{prefill?: boolean}} [runOptions] - Optional run settings - * @returns {Promise} - */ - async run (prompt, runOptions = {}) { - return await this._runInternal(prompt, runOptions) + _addonOutputCallback (addon, event, data, error) { + // Event-name normalization lives in `addon.js` (`mapAddonEvent`) so the + // native binding wrapper owns the C++ event vocabulary. This shim only + // forwards the resulting logical event into `_handleAddonOutputEvent`. + const mapped = mapAddonEvent(event, data, error, this._addonEventState) + if (mapped === null) return + this._handleAddonOutputEvent(mapped.type, mapped.data, mapped.error) } - /** - * Instantiate the native addon with the given parameters. 
- * @param {Object} configurationParams - Configuration parameters for the addon - * @param {string} configurationParams.path - Local file or directory path - * @param {Object} configurationParams.settings - LLM-specific settings - * @returns {Addon} The instantiated addon interface - */ _createAddon (configurationParams) { const binding = require('./binding') - - this.logger = this._createFilteredLogger(this._originalLogger) - - this._outputCallback = (instance, eventType, jobId, data, extra) => { - return this._handleAddonOutputEvent( - this._baseOutputCallback, - this._originalLogger, - instance, - eventType, - jobId, - data, - extra - ) - } - return new LlamaInterface( binding, configurationParams, @@ -287,62 +348,12 @@ class LlmLlamacpp extends BaseInference { ) } - _addonOutputCallback (addon, event, data, error) { - if (typeof data === 'object' && data !== null && 'TPS' in data) { - if (this._skipNextRuntimeStats) { - this._skipNextRuntimeStats = false - return - } - const runtimeStats = { ...data } - if (runtimeStats.backendDevice === 0) { - runtimeStats.backendDevice = 'cpu' - } else if (runtimeStats.backendDevice === 1) { - runtimeStats.backendDevice = 'gpu' - } - return this._outputCallback(addon, 'JobEnded', 'OnlyOneJob', runtimeStats, null) - } - if ( - typeof data === 'object' && - data !== null && - data.op === 'finetune' && - typeof data.status === 'string' - ) { - this._skipNextRuntimeStats = true - return this._outputCallback(addon, 'JobEnded', 'OnlyOneJob', data, null) - } - if ( - typeof data === 'object' && - data !== null && - data.type === 'finetune_progress' - ) { - return this._outputCallback(addon, 'FinetuneProgress', 'OnlyOneJob', data, null) - } - - let mappedEvent = event - if (event.includes('Error')) { - mappedEvent = 'Error' - } else if (typeof data === 'string') { - mappedEvent = 'Output' - } - - return this._outputCallback(addon, mappedEvent, 'OnlyOneJob', data, error) - } - - /** - * Pause finetuning, saving a checkpoint so 
training can resume later. - * cancel inference job if it is running - */ async pause () { if (this.addon?.cancel) { await this.addon.cancel() } } - /** - * Cancel finetuning and remove the pause checkpoint so the next - * finetune() call starts fresh instead of resuming. - * cancel inference job if it is running - */ async cancel () { if (this.addon?.cancel) { await this.addon.cancel() @@ -365,161 +376,27 @@ class LlmLlamacpp extends BaseInference { } } - /** - * Unload model safely by cancelling and clearing pending jobs. - * @returns {Promise} - */ async unload () { - return await this._withExclusiveRun(async () => { + return this._run(async () => { try { await this.pause() } catch (_) {} - const currentJobResponse = this._jobToResponse.get('OnlyOneJob') - if (currentJobResponse) { - currentJobResponse.failed(new Error('Model was unloaded')) - this._deleteJobMapping('OnlyOneJob') + if (this._job.active) { + this._job.fail(new Error('Model was unloaded')) } this._hasActiveResponse = false - await super.unload() - }) - } - - /** - * Internal method to start inference with a text prompt. 
- * @param {Message[]} prompt - Input prompt array of messages - * @param {{prefill?: boolean}} [runOptions] - Optional run settings - * @returns {Promise} A QvacResponse representing the inference job - */ - async _runInternal (prompt, runOptions = {}) { - return this._withExclusiveRun(async () => { - if (this._hasActiveResponse) { - throw new Error(RUN_BUSY_ERROR_MESSAGE) - } - - if (!Array.isArray(prompt)) { - throw new TypeError('Prompt input must be Message[]') - } - const { prefill, generationParams, cacheKey, saveCacheToDisk } = normalizeRunOptions(runOptions) - - this.logger.info('Starting inference with prompt:', prompt) - - // Separate media messages from text messages - const textMessages = [] - const mediaItems = [] - - for (const message of prompt) { - if (message.role === 'user' && - message.type === 'media' && - message.content instanceof Uint8Array) { - mediaItems.push(message.content) - // Keep the message as a placeholder marker (with empty content) for tokenization - textMessages.push({ ...message, content: '' }) - } else { - textMessages.push(message) - } - } - - const promptMessages = [] - - // Send media first (in order) if present - for (const mediaData of mediaItems) { - promptMessages.push({ type: 'media', content: mediaData }) - } - - // Send text messages - promptMessages.push({ - type: 'text', - input: JSON.stringify(textMessages), - prefill, - generationParams, - cacheKey, - saveCacheToDisk - }) - - const response = this._createResponse('OnlyOneJob') - - // addon-cpp C++ guarantees no events will be generated - // until job is fully accepted. This means even if trying - // to queue a job fails right now as not accepted, - // it will not generate events. - // - // If any unexpected exception is thrown (e.g. in the C++ code) - // it will unwind here and the job will not be accepted. 
- let accepted - try { - accepted = await this.addon.runJob(promptMessages) - } catch (error) { - this._deleteJobMapping('OnlyOneJob') - response.failed(error) - throw error + if (this.addon) { + await this.addon.unload() + // Null the addon reference so post-unload `cancel()` / `run()` calls hit the + // `if (!this.addon)` guard instead of dereferencing a disposed native handle. + this.addon = null } - if (!accepted) { - this._deleteJobMapping('OnlyOneJob') - const msg = RUN_BUSY_ERROR_MESSAGE - response.failed(new Error(msg)) - throw new Error(msg) - } - - this._hasActiveResponse = true - const finalized = response.await().finally(() => { this._hasActiveResponse = false }) - finalized.catch(() => {}) - response.await = () => finalized - - this.logger.info('Inference job started successfully') - - return response + this.state.configLoaded = false }) } - async finetune (finetuningOptions = undefined) { - if (!this.addon) { - throw new Error( - 'Addon not initialized. Call load() first.' - ) - } - - if (!finetuningOptions) { - throw new Error( - 'Finetuning parameters are required.' 
- ) - } - if (finetuningOptions.checkpointSaveDir) { - this._checkpointSaveDir = finetuningOptions.checkpointSaveDir - } - const paramsToSend = normalizeFinetuneParams(finetuningOptions) - this.logger?.info?.('finetune() called') - this.logger?.info?.('Finetuning parameters:', finetuningOptions) - - return this._withExclusiveRun(async () => { - if (this._hasActiveResponse) { - throw new Error(RUN_BUSY_ERROR_MESSAGE) - } - - const response = this._createResponse('OnlyOneJob') - let accepted - try { - accepted = await this.addon.finetune(paramsToSend) - } catch (err) { - this._deleteJobMapping('OnlyOneJob') - response.failed(err) - throw err - } - - if (!accepted) { - this._deleteJobMapping('OnlyOneJob') - const msg = RUN_BUSY_ERROR_MESSAGE - response.failed(new Error(msg)) - throw new Error(msg) - } - - this._hasActiveResponse = true - const finalized = response.await().finally(() => { this._hasActiveResponse = false }) - finalized.catch(() => {}) - response.await = () => finalized - - return response - }) - } + getState () { return this.state } } module.exports = LlmLlamacpp +module.exports.pickPrimaryGgufPath = pickPrimaryGgufPath diff --git a/packages/qvac-lib-infer-llamacpp-llm/package.json b/packages/qvac-lib-infer-llamacpp-llm/package.json index d7deef159f..aa9181fb12 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/package.json +++ b/packages/qvac-lib-infer-llamacpp-llm/package.json @@ -1,6 +1,6 @@ { "name": "@qvac/llm-llamacpp", - "version": "0.15.0", + "version": "0.16.0", "description": "llama addon for qvac", "addon": true, "scripts": { @@ -57,8 +57,6 @@ "bugs": "https://github.com/tetherto/qvac/issues", "homepage": "https://github.com/tetherto/qvac/tree/main/packages/qvac-lib-infer-llamacpp-llm#readme", "devDependencies": { - "@qvac/dl-base": "^0.1.0", - "@qvac/dl-filesystem": "^0.1.2", "@qvac/logging": "^0.1.0", "@types/node": "^24.2.1", "bare-url": "^2.1.6", @@ -73,7 +71,7 @@ "util": "npm:bare-utils@^1.5.1" }, "dependencies": { - "@qvac/infer-base": 
"^0.3.0", + "@qvac/infer-base": "^0.4.0", "bare-fs": "^4.5.1", "bare-path": "^3.0.0", "bare-process": "^4.2.2" diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/afriquegemma-edge-cases.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/afriquegemma-edge-cases.test.js index e447a0126d..3291fb86dc 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/afriquegemma-edge-cases.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/afriquegemma-edge-cases.test.js @@ -1,7 +1,6 @@ 'use strict' const test = require('brittle') -const FilesystemDL = require('@qvac/dl-filesystem') const LlmLlamacpp = require('../../index.js') const os = require('bare-os') const fs = require('bare-fs') @@ -11,6 +10,7 @@ const platform = os.platform() const arch = os.arch() const isDarwinX64 = platform === 'darwin' && arch === 'x64' const isLinuxArm64 = platform === 'linux' && arch === 'arm64' +const isMobile = platform === 'ios' || platform === 'android' const useCpu = isDarwinX64 || isLinuxArm64 const AFRIQUEGEMMA_MODEL = { @@ -69,16 +69,14 @@ const TIMEOUT = 1_800_000 // // WHY: Users pass empty strings through UIs and pipelines; must not segfault. 
// --------------------------------------------------------------------------- -test('AfriqueGemma: empty and whitespace input must not crash', { timeout: TIMEOUT }, async t => { +test('AfriqueGemma: empty and whitespace input must not crash', { timeout: TIMEOUT, skip: isMobile }, async t => { const [modelName, dirPath] = await resolveModel() - const loader = new FilesystemDL({ dirPath }) const addon = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [path.join(dirPath, modelName)] }, + config: AFRIQUEGEMMA_CONFIG, logger: console, opts: { stats: true } - }, AFRIQUEGEMMA_CONFIG) + }) try { await addon.load() const emptyPrompt = 'Translate English to Swahili.\nEnglish: \nSwahili:' @@ -91,7 +89,6 @@ test('AfriqueGemma: empty and whitespace input must not crash', { timeout: TIMEO t.pass('whitespace-style prompt did not crash') } finally { await addon.unload().catch(() => {}) - await loader.close().catch(() => {}) } }) @@ -101,36 +98,30 @@ test('AfriqueGemma: empty and whitespace input must not crash', { timeout: TIMEO // WHY: Apps that swap models or recover from errors need lifecycle to work; // creating a new instance after unload is the supported reload pattern. 
// --------------------------------------------------------------------------- -test('AfriqueGemma: lifecycle load-unload-fresh-load-use', { timeout: TIMEOUT }, async t => { +test('AfriqueGemma: lifecycle load-unload-fresh-load-use', { timeout: TIMEOUT, skip: isMobile }, async t => { const [modelName, dirPath] = await resolveModel() - const loader1 = new FilesystemDL({ dirPath }) const addon1 = new LlmLlamacpp({ - loader: loader1, - modelName, - diskPath: dirPath, + files: { model: [path.join(dirPath, modelName)] }, + config: AFRIQUEGEMMA_CONFIG, logger: console, opts: { stats: true } - }, AFRIQUEGEMMA_CONFIG) + }) try { await addon1.load() const r1 = await addon1.run([{ role: 'user', content: EN_SW_PROMPT }]) const out1 = await collectTranslation(r1) t.ok(out1.length > 0, 'first run produced output') await addon1.unload() - await loader1.close() } catch (err) { await addon1.unload().catch(() => {}) - await loader1.close().catch(() => {}) throw err } - const loader2 = new FilesystemDL({ dirPath }) const addon2 = new LlmLlamacpp({ - loader: loader2, - modelName, - diskPath: dirPath, + files: { model: [path.join(dirPath, modelName)] }, + config: AFRIQUEGEMMA_CONFIG, logger: console, opts: { stats: true } - }, AFRIQUEGEMMA_CONFIG) + }) try { await addon2.load() const r2 = await addon2.run([{ role: 'user', content: EN_SW_PROMPT }]) @@ -138,7 +129,6 @@ test('AfriqueGemma: lifecycle load-unload-fresh-load-use', { timeout: TIMEOUT }, t.ok(out2.length > 0, 'second run after fresh load produced output') } finally { await addon2.unload().catch(() => {}) - await loader2.close().catch(() => {}) } }) @@ -147,16 +137,14 @@ test('AfriqueGemma: lifecycle load-unload-fresh-load-use', { timeout: TIMEOUT }, // // WHY: Cancelling mid-operation must not corrupt state; model should be reusable. 
// --------------------------------------------------------------------------- -test('AfriqueGemma: cancel mid-translation, model reusable after', { timeout: TIMEOUT }, async t => { +test('AfriqueGemma: cancel mid-translation, model reusable after', { timeout: TIMEOUT, skip: isMobile }, async t => { const [modelName, dirPath] = await resolveModel() - const loader = new FilesystemDL({ dirPath }) const addon = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [path.join(dirPath, modelName)] }, + config: { ...AFRIQUEGEMMA_CONFIG, n_predict: '512' }, logger: console, opts: { stats: true } - }, { ...AFRIQUEGEMMA_CONFIG, n_predict: '512' }) + }) try { await addon.load() const longPrompt = 'Translate English to Swahili.\nEnglish: The children are playing in the park. Their mother watches from the bench. The sun is shining brightly today. Many families enjoy this beautiful place.\nSwahili:' @@ -183,7 +171,6 @@ test('AfriqueGemma: cancel mid-translation, model reusable after', { timeout: TI t.ok(out2.length > 0, 'model produced output after cancel') } finally { await addon.unload().catch(() => {}) - await loader.close().catch(() => {}) } }) @@ -193,20 +180,18 @@ test('AfriqueGemma: cancel mid-translation, model reusable after', { timeout: TI // WHY: tools enables Jinja chat template; easy to miss, produces confusing error. // Verifies the addon either rejects with a clear message or defaults to jinja. 
// --------------------------------------------------------------------------- -test('AfriqueGemma: tools true required for load', { timeout: TIMEOUT }, async t => { +test('AfriqueGemma: tools true required for load', { timeout: TIMEOUT, skip: isMobile }, async t => { const [modelName, dirPath] = await resolveModel() - const loader = new FilesystemDL({ dirPath }) const configWithoutTools = { ...AFRIQUEGEMMA_CONFIG, tools: undefined } delete configWithoutTools.tools const addon = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [path.join(dirPath, modelName)] }, + config: configWithoutTools, logger: console - }, configWithoutTools) + }) try { await addon.load() t.pass('load without tools succeeded (addon defaults to jinja)') @@ -217,8 +202,6 @@ test('AfriqueGemma: tools true required for load', { timeout: TIMEOUT }, async t /template|jinja|tools|not supported|custom/.test(msg), 'load without tools fails with clear message about template/jinja' ) - } finally { - await loader.close().catch(() => {}) } }) @@ -230,16 +213,14 @@ test('AfriqueGemma: tools true required for load', { timeout: TIMEOUT }, async t // BUG: Currently the addon returns an invalid response object that causes an // unhandled rejection. This test documents the expected behaviour. 
// --------------------------------------------------------------------------- -test('AfriqueGemma: run after unload rejects cleanly', { timeout: TIMEOUT }, async t => { +test('AfriqueGemma: run after unload rejects cleanly', { timeout: TIMEOUT, skip: isMobile }, async t => { const [modelName, dirPath] = await resolveModel() - const loader = new FilesystemDL({ dirPath }) const addon = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [path.join(dirPath, modelName)] }, + config: AFRIQUEGEMMA_CONFIG, logger: console, opts: { stats: true } - }, AFRIQUEGEMMA_CONFIG) + }) await addon.load() const r1 = await addon.run([{ role: 'user', content: EN_SW_PROMPT }]) @@ -275,7 +256,6 @@ test('AfriqueGemma: run after unload rejects cleanly', { timeout: TIMEOUT }, asy t.comment('Expected: synchronous throw or a response that resolves to an error') } t.ok(rejected || hadUnhandled, 'run() after unload() does not silently succeed') - await loader.close().catch(() => {}) }) // --------------------------------------------------------------------------- @@ -286,16 +266,14 @@ test('AfriqueGemma: run after unload rejects cleanly', { timeout: TIMEOUT }, asy // crashing or producing garbled output. Catches buffer handling bugs // in the token emission pipeline. 
// --------------------------------------------------------------------------- -test('AfriqueGemma: small n_predict produces truncated but valid output', { timeout: TIMEOUT }, async t => { +test('AfriqueGemma: small n_predict produces truncated but valid output', { timeout: TIMEOUT, skip: isMobile }, async t => { const [modelName, dirPath] = await resolveModel() - const loader = new FilesystemDL({ dirPath }) const addon = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [path.join(dirPath, modelName)] }, + config: { ...AFRIQUEGEMMA_CONFIG, n_predict: '8' }, logger: console, opts: { stats: true } - }, { ...AFRIQUEGEMMA_CONFIG, n_predict: '8' }) + }) try { await addon.load() @@ -313,7 +291,6 @@ test('AfriqueGemma: small n_predict produces truncated but valid output', { time } } finally { await addon.unload().catch(() => {}) - await loader.close().catch(() => {}) } }) @@ -326,16 +303,14 @@ test('AfriqueGemma: small n_predict produces truncated but valid output', { time // A crash or silent hang is unacceptable on mobile (see: SIGABRT on // context overflow in sliding-context tests). 
// --------------------------------------------------------------------------- -test('AfriqueGemma: long input approaching ctx_size boundary', { timeout: TIMEOUT }, async t => { +test('AfriqueGemma: long input approaching ctx_size boundary', { timeout: TIMEOUT, skip: isMobile }, async t => { const [modelName, dirPath] = await resolveModel() - const loader = new FilesystemDL({ dirPath }) const addon = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [path.join(dirPath, modelName)] }, + config: { ...AFRIQUEGEMMA_CONFIG, ctx_size: '512', n_predict: '32' }, logger: console, opts: { stats: true } - }, { ...AFRIQUEGEMMA_CONFIG, ctx_size: '512', n_predict: '32' }) + }) try { await addon.load() @@ -359,6 +334,5 @@ test('AfriqueGemma: long input approaching ctx_size boundary', { timeout: TIMEOU t.ok(gotOutput || gotError, 'long input either produced output or a clear error — no crash or hang') } finally { await addon.unload().catch(() => {}) - await loader.close().catch(() => {}) } }) diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/afriquegemma-translation.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/afriquegemma-translation.test.js index cd42207918..c69c486400 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/afriquegemma-translation.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/afriquegemma-translation.test.js @@ -1,7 +1,6 @@ 'use strict' const test = require('brittle') -const FilesystemDL = require('@qvac/dl-filesystem') const LlmLlamacpp = require('../../index.js') const { ensureModel } = require('./utils') const os = require('bare-os') @@ -12,6 +11,7 @@ const platform = os.platform() const arch = os.arch() const isDarwinX64 = platform === 'darwin' && arch === 'x64' const isLinuxArm64 = platform === 'linux' && arch === 'arm64' +const isMobile = platform === 'ios' || platform === 'android' const useCpu = isDarwinX64 || isLinuxArm64 const AFRIQUEGEMMA_MODEL = { @@ 
-133,16 +133,14 @@ async function resolveModel () { } async function createAddon (dirPath, modelName, configOverrides = {}) { - const loader = new FilesystemDL({ dirPath }) const config = { ...AFRIQUEGEMMA_CONFIG, ...configOverrides } const addon = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [path.join(dirPath, modelName)] }, + config, logger: console, opts: { stats: true } - }, config) - return { addon, loader } + }) + return { addon } } const TIMEOUT = 1_800_000 @@ -156,9 +154,9 @@ const TIMEOUT = 1_800_000 // WHY: Proves model produces valid translations across the primary language // pairs the pitch promises. Includes low-resource pairs and reverse. // --------------------------------------------------------------------------- -test('AfriqueGemma: core EN↔African language pairs', { timeout: TIMEOUT }, async t => { +test('AfriqueGemma: core EN↔African language pairs', { timeout: TIMEOUT, skip: isMobile }, async t => { const [modelName, dirPath] = await resolveModel() - const { addon, loader } = await createAddon(dirPath, modelName) + const { addon } = await createAddon(dirPath, modelName) try { await addon.load() t.pass('model loaded (Gemma 3 4B base, Q4_K_M via llama.cpp)') @@ -188,7 +186,6 @@ test('AfriqueGemma: core EN↔African language pairs', { timeout: TIMEOUT }, asy t.is(out1, out2, `deterministic: "${out1}"`) } finally { await addon.unload().catch(() => {}) - await loader.close().catch(() => {}) } }) @@ -201,9 +198,9 @@ test('AfriqueGemma: core EN↔African language pairs', { timeout: TIMEOUT }, asy // WHY: Mobile users need cross-language African communication; the pitch says // this routes via bridge language. Two inference calls, same model. 
// --------------------------------------------------------------------------- -test('AfriqueGemma: African-to-African via English pivot', { timeout: TIMEOUT }, async t => { +test('AfriqueGemma: African-to-African via English pivot', { timeout: TIMEOUT, skip: isMobile }, async t => { const [modelName, dirPath] = await resolveModel() - const { addon, loader } = await createAddon(dirPath, modelName) + const { addon } = await createAddon(dirPath, modelName) try { await addon.load() @@ -220,7 +217,6 @@ test('AfriqueGemma: African-to-African via English pivot', { timeout: TIMEOUT }, t.ok(!yorubaOutput.includes('English:'), 'final output is not English echo') } finally { await addon.unload().catch(() => {}) - await loader.close().catch(() => {}) } }) @@ -234,9 +230,9 @@ test('AfriqueGemma: African-to-African via English pivot', { timeout: TIMEOUT }, // French, Portuguese, and Arabic must also produce valid output. // Includes pt→sw to verify bridge→African direct translation. // --------------------------------------------------------------------------- -test('AfriqueGemma: bridge languages (French, Portuguese, Arabic)', { timeout: TIMEOUT }, async t => { +test('AfriqueGemma: bridge languages (French, Portuguese, Arabic)', { timeout: TIMEOUT, skip: isMobile }, async t => { const [modelName, dirPath] = await resolveModel() - const { addon, loader } = await createAddon(dirPath, modelName) + const { addon } = await createAddon(dirPath, modelName) try { await addon.load() @@ -253,7 +249,6 @@ test('AfriqueGemma: bridge languages (French, Portuguese, Arabic)', { timeout: T } } finally { await addon.unload().catch(() => {}) - await loader.close().catch(() => {}) } }) @@ -263,9 +258,9 @@ test('AfriqueGemma: bridge languages (French, Portuguese, Arabic)', { timeout: T // WHY: Real mobile text has multi-sentence paragraphs and numbers/dates. // Sequential calls on same instance must not leak state. 
// --------------------------------------------------------------------------- -test('AfriqueGemma: longer content, mixed content, sequential calls', { timeout: TIMEOUT }, async t => { +test('AfriqueGemma: longer content, mixed content, sequential calls', { timeout: TIMEOUT, skip: isMobile }, async t => { const [modelName, dirPath] = await resolveModel() - const { addon, loader } = await createAddon(dirPath, modelName) + const { addon } = await createAddon(dirPath, modelName) try { await addon.load() @@ -287,7 +282,6 @@ test('AfriqueGemma: longer content, mixed content, sequential calls', { timeout: } } finally { await addon.unload().catch(() => {}) - await loader.close().catch(() => {}) } }) @@ -300,9 +294,9 @@ test('AfriqueGemma: longer content, mixed content, sequential calls', { timeout: // non-Latin scripts (Amharic Ge'ez), and hooks (Hausa ɗ/ɓ). If the // tokenizer corrupts Unicode input, these translations fail silently. // --------------------------------------------------------------------------- -test('AfriqueGemma: African-language Unicode input (African → English)', { timeout: TIMEOUT }, async t => { +test('AfriqueGemma: African-language Unicode input (African → English)', { timeout: TIMEOUT, skip: isMobile }, async t => { const [modelName, dirPath] = await resolveModel() - const { addon, loader } = await createAddon(dirPath, modelName) + const { addon } = await createAddon(dirPath, modelName) try { await addon.load() @@ -327,7 +321,6 @@ test('AfriqueGemma: African-language Unicode input (African → English)', { tim t.comment('All non-Latin/diacritic inputs produced valid English output') } finally { await addon.unload().catch(() => {}) - await loader.close().catch(() => {}) } }) @@ -339,9 +332,9 @@ test('AfriqueGemma: African-language Unicode input (African → English)', { tim // stats, the UX breaks. Verifies onUpdate fires multiple times and // performance stats are populated for visibility. 
// --------------------------------------------------------------------------- -test('AfriqueGemma: streaming tokens arrive incrementally with stats', { timeout: TIMEOUT }, async t => { +test('AfriqueGemma: streaming tokens arrive incrementally with stats', { timeout: TIMEOUT, skip: isMobile }, async t => { const [modelName, dirPath] = await resolveModel() - const { addon, loader } = await createAddon(dirPath, modelName) + const { addon } = await createAddon(dirPath, modelName) try { await addon.load() @@ -375,6 +368,5 @@ test('AfriqueGemma: streaming tokens arrive incrementally with stats', { timeout } } finally { await addon.unload().catch(() => {}) - await loader.close().catch(() => {}) } }) diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/api-behavior.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/api-behavior.test.js index 594e184ea5..b82c9f20ca 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/api-behavior.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/api-behavior.test.js @@ -3,7 +3,7 @@ // Tests must match the behavior described in README section "API behavior by state". const test = require('brittle') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const LlmLlamacpp = require('../../index.js') const { ensureModel } = require('./utils') const { attachSpecLogger } = require('./spec-logger') @@ -35,7 +35,7 @@ async function setupModel (t, configOverrides = {}) { downloadUrl: MODEL.url }) - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const config = { device: useCpu ? 
'cpu' : 'gpu', gpu_layers: '999', @@ -47,18 +47,16 @@ async function setupModel (t, configOverrides = {}) { const specLogger = attachSpecLogger({ forwardToConsole: true }) const model = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config, logger: console, opts: { stats: true } - }, config) + }) await model.load() t.teardown(async () => { await model.unload().catch(() => {}) - await loader.close().catch(() => {}) specLogger.release() }) diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/bitnet.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/bitnet.test.js index 3ccac870bb..067b0f78b8 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/bitnet.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/bitnet.test.js @@ -1,7 +1,7 @@ 'use strict' const test = require('brittle') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const LlmLlamacpp = require('../../index.js') const { ensureModel } = require('./utils') const { attachSpecLogger } = require('./spec-logger') @@ -32,7 +32,7 @@ test('bitnet model can run simple inference', { timeout: 600_000, skip: !isAndro downloadUrl: BITNET_MODEL.url }) - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const specLogger = attachSpecLogger({ forwardToConsole: true }) const config = { @@ -44,12 +44,11 @@ test('bitnet model can run simple inference', { timeout: 600_000, skip: !isAndro } const addon = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config, logger: console, opts: { stats: true } - }, config) + }) try { await addon.load() @@ -60,7 +59,6 @@ test('bitnet model can run simple inference', { timeout: 600_000, skip: !isAndro t.comment(`BitNet output: "${output}"`) } finally { await addon.unload().catch(() => { }) - await loader.close().catch(() => { }) specLogger.release() } }) diff 
--git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/cache-state-machine.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/cache-state-machine.test.js index c4e153895b..6ea7dbd845 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/cache-state-machine.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/cache-state-machine.test.js @@ -3,7 +3,6 @@ const test = require('brittle') const path = require('bare-path') const fs = require('bare-fs') -const FilesystemDL = require('@qvac/dl-filesystem') const LlmLlamacpp = require('../../index.js') const { ensureModel } = require('./utils') const { attachSpecLogger } = require('./spec-logger') @@ -92,7 +91,7 @@ async function setupModel (t, overrides = {}) { downloadUrl: DEFAULT_MODEL.url }) - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const config = { ...BASE_CONFIG, ...overrides } const specLogger = attachSpecLogger({ forwardToConsole: true }) let loggerReleased = false @@ -103,24 +102,21 @@ async function setupModel (t, overrides = {}) { } const model = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config, logger: console, opts: { stats: true } - }, config) + }) try { await model.load() } catch (err) { releaseLogger() - await loader.close().catch(() => { }) throw err } t.teardown(async () => { await model.unload().catch(() => { }) - await loader.close().catch(() => { }) releaseLogger() }) diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/config-parameters.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/config-parameters.test.js index 90579d7c67..84db708190 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/config-parameters.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/config-parameters.test.js @@ -1,7 +1,7 @@ 'use strict' const test = require('brittle') -const FilesystemDL = 
require('@qvac/dl-filesystem') +const path = require('bare-path') const LlmLlamacpp = require('../../index.js') const { ensureModel } = require('./utils') const { attachSpecLogger } = require('./spec-logger') @@ -343,7 +343,7 @@ async function executeScenario (t, scenario) { downloadUrl: 'https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf' }) - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const baseConfig = { device: useCpu ? 'cpu' : 'gpu', @@ -362,12 +362,11 @@ async function executeScenario (t, scenario) { const logs = specLogger.logs const addon = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config: { ...baseConfig, ...scenario.overrides }, logger: createTestLogger(), opts: { stats: true } - }, { ...baseConfig, ...scenario.overrides }) + }) let loadSucceeded = false @@ -410,7 +409,6 @@ async function executeScenario (t, scenario) { if (loadSucceeded) { await addon.unload().catch(() => {}) } - await loader.close().catch(() => {}) specLogger.release() } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/dynamic-tools.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/dynamic-tools.test.js index 7baf88c9e2..85857006c9 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/dynamic-tools.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/dynamic-tools.test.js @@ -2,7 +2,6 @@ const test = require('brittle') const path = require('bare-path') -const FilesystemDL = require('@qvac/dl-filesystem') const LlmLlamacpp = require('../../index.js') const { ensureModel } = require('./utils') const { attachSpecLogger } = require('./spec-logger') @@ -83,7 +82,7 @@ async function setupModel (t, overrides = {}) { downloadUrl: QWEN3_MODEL.url }) - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const config = { ...BASE_CONFIG, 
...overrides } const specLogger = attachSpecLogger({ forwardToConsole: true }) let loggerReleased = false @@ -94,24 +93,21 @@ async function setupModel (t, overrides = {}) { } const model = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config, logger: console, opts: { stats: true } - }, config) + }) try { await model.load() } catch (err) { releaseLogger() - await loader.close().catch(() => {}) throw err } t.teardown(async () => { await model.unload().catch(() => {}) - await loader.close().catch(() => {}) releaseLogger() }) diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/finetuning-pause-resume.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/finetuning-pause-resume.test.js index 4f4a6582dc..505cd6ed9f 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/finetuning-pause-resume.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/finetuning-pause-resume.test.js @@ -2,7 +2,6 @@ const test = require('brittle') const path = require('bare-path') -const FilesystemDL = require('@qvac/dl-filesystem') const LlmLlamacpp = require('../../index.js') const { ensureModel, @@ -89,7 +88,7 @@ function assertLossAndAccuracyAreFinite (t, result, modelId) { async function runLoraInference (t, modelVariant, modelName, modelDir, loraAdapterPath) { t.comment(`[${modelVariant.id}] Running inference with LoRA adapter: ${loraAdapterPath}`) - const inferLoader = new FilesystemDL({ dirPath: modelDir }) + const inferModelPath = path.join(modelDir, modelName) const inferConfig = { gpu_layers: '999', ctx_size: '512', @@ -98,16 +97,12 @@ async function runLoraInference (t, modelVariant, modelName, modelDir, loraAdapt lora: loraAdapterPath } - const inferModel = new LlmLlamacpp( - { - loader: inferLoader, - modelName, - diskPath: modelDir, - logger: console, - opts: { stats: true } - }, - inferConfig - ) + const inferModel = new LlmLlamacpp({ + files: { model: [inferModelPath] }, + 
config: inferConfig, + logger: console, + opts: { stats: true } + }) try { await inferModel.load() @@ -122,7 +117,6 @@ async function runLoraInference (t, modelVariant, modelName, modelDir, loraAdapt t.comment(`[${modelVariant.id}] LoRA inference stats: ${JSON.stringify(response.stats)}`) } finally { await inferModel.unload().catch(() => {}) - await inferLoader.close().catch(() => {}) } } @@ -143,7 +137,7 @@ test('finetuning pause and resume', { timeout: PAUSE_RESUME_TIMEOUT_MS, skip: sk }) const checkpointDir = finetuneConfig.checkpointSaveDir - const loader = new FilesystemDL({ dirPath: modelDir }) + const finetuneModelPath = path.join(modelDir, modelName) const loggerHandle = attachSpecLogger({ forwardToConsole: true }) const config = { @@ -153,16 +147,12 @@ test('finetuning pause and resume', { timeout: PAUSE_RESUME_TIMEOUT_MS, skip: sk verbosity: '2' } - const model = new LlmLlamacpp( - { - loader, - modelName, - diskPath: modelDir, - logger: console, - opts: { stats: true } - }, - config - ) + const model = new LlmLlamacpp({ + files: { model: [finetuneModelPath] }, + config, + logger: console, + opts: { stats: true } + }) try { await model.load() @@ -211,7 +201,6 @@ test('finetuning pause and resume', { timeout: PAUSE_RESUME_TIMEOUT_MS, skip: sk ) await model.unload().catch(() => {}) - await loader.close().catch(() => {}) const loraAdapterPath = path.join(finetuneConfig.outputParametersDir, 'trained-lora-adapter.gguf') await runLoraInference(t, modelVariant, modelName, modelDir, loraAdapterPath) @@ -275,7 +264,6 @@ test('finetuning pause and resume', { timeout: PAUSE_RESUME_TIMEOUT_MS, skip: sk t.pass(`[${modelVariant.id}] finetuning pause and resume completed`) await model.unload().catch(() => {}) - await loader.close().catch(() => {}) const loraAdapterPath = path.join(finetuneConfig.outputParametersDir, 'trained-lora-adapter.gguf') await runLoraInference(t, modelVariant, modelName, modelDir, loraAdapterPath) @@ -283,7 +271,6 @@ test('finetuning pause and 
resume', { timeout: PAUSE_RESUME_TIMEOUT_MS, skip: sk } finally { loggerHandle.release() await model.unload().catch(() => {}) - await loader.close().catch(() => {}) cleanupCheckpoints(checkpointDir) } } @@ -299,24 +286,20 @@ test('cancel() stops finetuning and removes pause checkpoint', { timeout: PAUSE_ const finetuneConfig = setupParams(modelDir, { checkpointSaveSteps: 5, datasetSize: isMobile ? 8 : 16, testId: 'cancel-test' }) const checkpointDir = finetuneConfig.checkpointSaveDir - const loader = new FilesystemDL({ dirPath: modelDir }) + const cancelModelPath = path.join(modelDir, modelName) const loggerHandle = attachSpecLogger({ forwardToConsole: true }) - const model = new LlmLlamacpp( - { - loader, - modelName, - diskPath: modelDir, - logger: console, - opts: { stats: true } - }, - { + const model = new LlmLlamacpp({ + files: { model: [cancelModelPath] }, + config: { gpu_layers: '999', ctx_size: '512', device: forceCpuDevice ? 'cpu' : 'gpu', verbosity: '2' - } - ) + }, + logger: console, + opts: { stats: true } + }) const fs = require('bare-fs') @@ -352,7 +335,6 @@ test('cancel() stops finetuning and removes pause checkpoint', { timeout: PAUSE_ } finally { loggerHandle.release() await model.unload().catch(() => {}) - await loader.close().catch(() => {}) cleanupCheckpoints(checkpointDir) } }) @@ -368,7 +350,7 @@ test('inference with session cache works after finetuning', { timeout: PAUSE_RES const checkpointDir = finetuneConfig.checkpointSaveDir const sessionFile = path.join(modelDir, 'test-session-finetune.bin') - const loader = new FilesystemDL({ dirPath: modelDir }) + const sessionModelPath = path.join(modelDir, modelName) const loggerHandle = attachSpecLogger({ forwardToConsole: true }) const config = { @@ -380,16 +362,12 @@ test('inference with session cache works after finetuning', { timeout: PAUSE_RES seed: '42' } - const model = new LlmLlamacpp( - { - loader, - modelName, - diskPath: modelDir, - logger: console, - opts: { stats: true } - }, - config 
- ) + const model = new LlmLlamacpp({ + files: { model: [sessionModelPath] }, + config, + logger: console, + opts: { stats: true } + }) const fs = require('bare-fs') @@ -430,7 +408,6 @@ test('inference with session cache works after finetuning', { timeout: PAUSE_RES } finally { loggerHandle.release() await model.unload().catch(() => {}) - await loader.close().catch(() => {}) cleanupCheckpoints(checkpointDir) try { fs.unlinkSync(sessionFile) } catch (_) {} } @@ -445,11 +422,13 @@ test('microBatchSize override changes backend batch geometry', { timeout: PAUSE_ async function getTotalBatches (batchSize, microBatchSize, testId) { const config = setupParams(modelDir, { batchSize, microBatchSize, checkpointSaveSteps: 0, testId }) - const loader = new FilesystemDL({ dirPath: modelDir }) - const model = new LlmLlamacpp( - { loader, modelName, diskPath: modelDir, logger: console, opts: { stats: true } }, - { gpu_layers: '999', ctx_size: '512', device: forceCpuDevice ? 'cpu' : 'gpu', verbosity: '0' } - ) + const batchModelPath = path.join(modelDir, modelName) + const model = new LlmLlamacpp({ + files: { model: [batchModelPath] }, + config: { gpu_layers: '999', ctx_size: '512', device: forceCpuDevice ? 
'cpu' : 'gpu', verbosity: '0' }, + logger: console, + opts: { stats: true } + }) try { await model.load() const handle = await model.finetune(config) @@ -459,7 +438,6 @@ test('microBatchSize override changes backend batch geometry', { timeout: PAUSE_ return totalBatches } finally { await model.unload().catch(() => {}) - await loader.close().catch(() => {}) cleanupCheckpoints(config.checkpointSaveDir) } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/generation-params.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/generation-params.test.js index 613956cbc7..93ae6dec40 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/generation-params.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/generation-params.test.js @@ -1,7 +1,7 @@ 'use strict' const test = require('brittle') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const LlmLlamacpp = require('../../index.js') const { ensureModel } = require('./utils') const { attachSpecLogger } = require('./spec-logger') @@ -27,7 +27,7 @@ async function setupModel (t, configOverrides = {}) { downloadUrl: MODEL.url }) - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const config = { device: useCpu ? 
'cpu' : 'gpu', gpu_layers: '999', @@ -40,18 +40,16 @@ async function setupModel (t, configOverrides = {}) { const specLogger = attachSpecLogger({ forwardToConsole: true }) const model = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config, logger: console, opts: { stats: true } - }, config) + }) await model.load() t.teardown(async () => { await model.unload().catch(() => {}) - await loader.close().catch(() => {}) specLogger.release() }) diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/http-loader.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/http-loader.js index aa832a9da1..9d44d80271 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/http-loader.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/http-loader.js @@ -1,16 +1,26 @@ 'use strict' const https = require('bare-https') -const BaseDL = require('@qvac/dl-base') /** - * A minimal HTTP/HTTPS loader that implements the BaseDL interface. - * Fetches model files from a remote base URL, following redirects. + * Minimal HTTP/HTTPS streamer used by the sharded-model integration test + * to download a small public sharded GGUF before constructing the addon. + * + * Standalone — does not extend any base loader class. The package no + * longer depends on `@qvac/dl-base` after the loader-removal refactor; + * this helper exists solely so the sharded model-loading test can fetch + * shard files without pulling a heavyweight loader implementation back + * into devDependencies. + * + * Only the surface used by `model-loading.test.js` is implemented: + * - `new HttpDL({ baseUrl })` + * - `getStream(filename)` — returns a Bare-https response stream that + * can be piped into `fs.createWriteStream`. + * - `close()` — destroys any in-flight streams the caller did not + * consume to completion. 
*/ -class HttpDL extends BaseDL { +class HttpDL { constructor (opts) { - super(opts) - if (!opts || !opts.baseUrl) { throw new Error('HttpDL requires a baseUrl option') } @@ -19,15 +29,6 @@ class HttpDL extends BaseDL { this._activeStreams = new Set() } - /** - * Return the Content-Length of a remote file via an HTTP HEAD request. - * @param {string} filename - * @returns {Promise} byte size - */ - async getFileSize (filename) { - return this._request('HEAD', this.baseUrl + filename) - } - /** * Fetch a file by name and return it as a readable stream. * The stream is tracked so that close() can destroy it if needed. @@ -44,7 +45,10 @@ class HttpDL extends BaseDL { return response } - async _close () { + /** + * Destroy any tracked streams that have not finished on their own. + */ + async close () { for (const stream of this._activeStreams) { stream.destroy() } @@ -73,22 +77,13 @@ class HttpDL extends BaseDL { return } - if (method === 'HEAD') { - response.resume() - resolve(parseInt(response.headers['content-length'] || '0', 10)) - } else { - resolve(response) - } + resolve(response) }) req.on('error', reject) req.end() }) } - - async list () { - throw new Error('HttpDL does not support list()') - } } module.exports = HttpDL diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/image.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/image.test.js index 4a07c72b52..b2aff007ae 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/image.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/image.test.js @@ -4,7 +4,6 @@ const test = require('brittle') const fs = require('bare-fs') const path = require('bare-path') const { ensureModel, getMediaPath } = require('./utils') -const FilesystemDL = require('@qvac/dl-filesystem') const LlmLlamacpp = require('../../index.js') const os = require('bare-os') @@ -84,7 +83,7 @@ function getConfig (device, modelConfig) { * Sets up a multimodal LlmLlamacpp instance with LLM and 
projection models * @param {Object} t - Test instance * @param {string} device - Device to use ('cpu' or 'gpu') - * @returns {Promise<{inference: LlmLlamacpp, loader: FilesystemDL}>} + * @returns {Promise<{inference: LlmLlamacpp}>} */ async function setupMultimodalInference (t, device = 'gpu', modelConfig = MULTIMODAL_MODEL_CONFIG) { const [modelName, dirPath] = await ensureModel(modelConfig.llmModel) @@ -93,23 +92,20 @@ async function setupMultimodalInference (t, device = 'gpu', modelConfig = MULTIM const [projModelName] = await ensureModel(modelConfig.projModel) t.ok(fs.existsSync(path.join(dirPath, projModelName)), 'Projection model file should exist') - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const inference = new LlmLlamacpp({ - modelName, - loader, - logger: console, - diskPath: dirPath, - projectionModel: projModelName - }, getConfig(device, modelConfig)) + files: { model: [modelPath], projectionModel: path.join(dirPath, projModelName) }, + config: getConfig(device, modelConfig), + logger: console + }) t.teardown(async () => { - await loader.close() await inference.unload() }) await inference.load() - return { inference, loader } + return { inference } } /** diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/model-loading.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/model-loading.test.js index 4bcc122af1..111f3acc58 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/model-loading.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/model-loading.test.js @@ -1,7 +1,6 @@ 'use strict' const test = require('brittle') -const FilesystemDL = require('@qvac/dl-filesystem') const LlmLlamacpp = require('../../index.js') const { ensureModel } = require('./utils') @@ -48,7 +47,7 @@ test('filesystem loader can run inference end-to-end', { timeout: 600_000, skip: downloadUrl: DEFAULT_MODEL.url }) - const loader = new FilesystemDL({ dirPath }) + const 
modelPath = path.join(dirPath, modelName) const config = { gpu_layers: '999', ctx_size: '1024', @@ -58,12 +57,11 @@ test('filesystem loader can run inference end-to-end', { timeout: 600_000, skip: } const addon = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config, logger: console, opts: { stats: true } - }, config) + }) try { await addon.load() @@ -76,7 +74,6 @@ test('filesystem loader can run inference end-to-end', { timeout: 600_000, skip: t.fail('filesystem-loaded model should generate output', error) } finally { await addon.unload().catch(() => {}) - await loader.close().catch(() => {}) } }) @@ -86,7 +83,7 @@ test('model unload is clean and idempotent', { timeout: 600_000 }, async t => { downloadUrl: DEFAULT_MODEL.url }) - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const config = { gpu_layers: '512', ctx_size: '1024', @@ -96,34 +93,29 @@ test('model unload is clean and idempotent', { timeout: 600_000 }, async t => { } const addon = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config, logger: console, opts: { stats: true } - }, config) + }) - try { - await addon.load() - const firstResponse = await addon.run(BASE_PROMPT) - await collectResponse(firstResponse) + await addon.load() + const firstResponse = await addon.run(BASE_PROMPT) + await collectResponse(firstResponse) - await addon.unload() - t.pass('first unload succeeded') + await addon.unload() + t.pass('first unload succeeded') - await addon.load() - const secondResponse = await addon.run(BASE_PROMPT) - await collectResponse(secondResponse) + await addon.load() + const secondResponse = await addon.run(BASE_PROMPT) + await collectResponse(secondResponse) - await addon.unload() - t.pass('second unload succeeded') + await addon.unload() + t.pass('second unload succeeded') - await addon.unload().catch(err => { - if (err) t.fail('unload should be 
idempotent', err) - }) - } finally { - await loader.close().catch(() => {}) - } + await addon.unload().catch(err => { + if (err) t.fail('unload should be idempotent', err) + }) }) const SHARDED_MODEL = { @@ -134,10 +126,34 @@ const SHARDED_MODEL = { // This test can take longer to download and execute. To avoid blowing up testing time on all // platforms, just use Linux for now. C++ tests already have faster coverage for each type // of load. -test('network loader can run inference end-to-end with sharded model', { timeout: 4 * 60 * 1000, skip: !isLinuxX64 }, async t => { +test('sharded model can run inference end-to-end', { timeout: 4 * 60 * 1000, skip: !isLinuxX64 }, async t => { + const fs = require('bare-fs') const modelDir = path.resolve(__dirname, '../model') + fs.mkdirSync(modelDir, { recursive: true }) + + const shardFiles = [ + 'Qwen3-0.6B-UD-IQ1_S.tensors.txt', + 'Qwen3-0.6B-UD-IQ1_S-00001-of-00003.gguf', + 'Qwen3-0.6B-UD-IQ1_S-00002-of-00003.gguf', + 'Qwen3-0.6B-UD-IQ1_S-00003-of-00003.gguf' + ] const loader = new HttpDL({ baseUrl: SHARDED_MODEL.baseUrl }) + for (const filename of shardFiles) { + const dest = path.join(modelDir, filename) + if (fs.existsSync(dest)) continue + console.log(` Downloading shard: ${filename}`) + const stream = await loader.getStream(filename) + const ws = fs.createWriteStream(dest) + for await (const chunk of stream) { + ws.write(chunk) + } + ws.end() + await new Promise(resolve => ws.on('close', resolve)) + } + await loader.close().catch(() => {}) + + const shardPaths = shardFiles.map(f => path.join(modelDir, f)) const config = { gpu_layers: '999', ctx_size: '1024', @@ -147,39 +163,19 @@ test('network loader can run inference end-to-end with sharded model', { timeout } const addon = new LlmLlamacpp({ - loader, - modelName: SHARDED_MODEL.name, - diskPath: modelDir, + files: { model: shardPaths }, + config, logger: console, opts: { stats: true } - }, config) - - let progressMade = 0 - let lastLogTime = 0 - const 
LOG_INTERVAL_MS = 3000 - const onProgress = (data) => { - if (typeof data !== 'object' || data === null) return - const now = Date.now() - const shard = data.currentFile.replace(/^.*\//, '') - progressMade = Math.max(progressMade, data.overallProgress) - if (data.action === 'loadingFile' && now - lastLogTime >= LOG_INTERVAL_MS) { - console.log(`\r Loading ${shard}: ${data.currentFileProgress}% (overall ${data.overallProgress}%) `) - lastLogTime = now - } else if (data.action === 'completeFile') { - console.log(`\r Loaded ${shard}: 100.00% (overall ${data.overallProgress}%) [${data.filesProcessed}/${data.totalFiles}]\n`) - lastLogTime = now - } - } + }) try { - await addon.load(true, onProgress) + await addon.load() const response = await addon.run(BASE_PROMPT) const output = await collectResponse(response) - t.ok(output.length > 0, 'network-loaded sharded model should generate output') - t.ok(progressMade > 0, 'network-loaded sharded model should make progress') + t.ok(output.length > 0, 'sharded model should generate output') } finally { await addon.unload().catch(() => {}) - await loader.close().catch(() => {}) } }) diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/moe.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/moe.test.js index 15da081db3..5f8606aa2d 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/moe.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/moe.test.js @@ -2,7 +2,7 @@ const test = require('brittle') const os = require('bare-os') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const LlmLlamacpp = require('../../index.js') const { ensureModel } = require('./utils') const { attachSpecLogger } = require('./spec-logger') @@ -40,15 +40,14 @@ test('llm addon can run MoE models [dolphin-mixtral-2x7b]', { }, async t => { const [modelName, dirPath] = await ensureModel({ modelName: MODEL.name, downloadUrl: MODEL.url }) - const loader = new 
FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const specLogger = attachSpecLogger({ forwardToConsole: true }) const inference = new LlmLlamacpp({ - modelName, - loader, + files: { model: [modelPath] }, + config: CONFIG, logger: console, - diskPath: dirPath, opts: { stats: true } - }, CONFIG) + }) try { await inference.load() @@ -60,6 +59,5 @@ test('llm addon can run MoE models [dolphin-mixtral-2x7b]', { } finally { specLogger.release() await inference.unload().catch(() => {}) - await loader.close().catch(() => {}) } }) diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/multi-instance.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/multi-instance.test.js index a7accc1753..01c88e3f39 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/multi-instance.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/multi-instance.test.js @@ -1,7 +1,7 @@ 'use strict' const test = require('brittle') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const LlmLlamacpp = require('../../index.js') const { ensureModel } = require('./utils') const os = require('bare-os') @@ -38,7 +38,7 @@ function createLogger () { } async function createInstance (modelName, dirPath, overrides = {}) { - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const config = { device: useCpu ? 
'cpu' : 'gpu', gpu_layers: '999', @@ -50,12 +50,11 @@ async function createInstance (modelName, dirPath, overrides = {}) { } const addon = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config, logger: createLogger(), opts: { stats: true } - }, config) + }) const origLoad = addon.load.bind(addon) addon.load = async function () { @@ -64,7 +63,7 @@ async function createInstance (modelName, dirPath, overrides = {}) { console.log(` model.load() took ${Date.now() - t0} ms`) } - return { addon, loader } + return { addon } } async function collectResponse (response) { @@ -82,14 +81,12 @@ test('Two instances can run inference simultaneously', { downloadUrl: DEFAULT_MODEL.url }) - const { addon: addon1, loader: loader1 } = await createInstance(modelName, dirPath) - const { addon: addon2, loader: loader2 } = await createInstance(modelName, dirPath) + const { addon: addon1 } = await createInstance(modelName, dirPath) + const { addon: addon2 } = await createInstance(modelName, dirPath) t.teardown(async () => { await addon1.unload().catch(() => {}) await addon2.unload().catch(() => {}) - await loader1.close().catch(() => {}) - await loader2.close().catch(() => {}) }) await addon1.load() @@ -119,7 +116,7 @@ test('Repeated load/unload cycles should remain stable', { const NUM_CYCLES = 6 for (let i = 0; i < NUM_CYCLES; i++) { - const { addon, loader } = await createInstance(modelName, dirPath) + const { addon } = await createInstance(modelName, dirPath) await addon.load() const response = await addon.run(BASE_PROMPT) @@ -128,7 +125,6 @@ test('Repeated load/unload cycles should remain stable', { t.ok(output.length > 0, `cycle ${i + 1}: produced output`) await addon.unload() - await loader.close() t.pass(`cycle ${i + 1}: load/unload completed`) } @@ -145,16 +141,14 @@ test('Unloading one instance does not affect another generating instance', { downloadUrl: DEFAULT_MODEL.url }) - const { addon: addon1, loader: loader1 } = await 
createInstance(modelName, dirPath, { + const { addon: addon1 } = await createInstance(modelName, dirPath, { n_predict: '256' }) - const { addon: addon2, loader: loader2 } = await createInstance(modelName, dirPath) + const { addon: addon2 } = await createInstance(modelName, dirPath) t.teardown(async () => { await addon1.unload().catch(() => {}) await addon2.unload().catch(() => {}) - await loader1.close().catch(() => {}) - await loader2.close().catch(() => {}) }) await addon1.load() @@ -189,7 +183,6 @@ test('Unloading one instance does not affect another generating instance', { if (!unloadedInstance2) { unloadedInstance2 = true await addon2.unload() - await loader2.close() t.pass('unloaded instance 2 while instance 1 is generating') } @@ -209,13 +202,12 @@ test('Multiple load/unload cycles on one instance while another generates', { downloadUrl: DEFAULT_MODEL.url }) - const { addon: addon1, loader: loader1 } = await createInstance(modelName, dirPath, { + const { addon: addon1 } = await createInstance(modelName, dirPath, { n_predict: '512' }) t.teardown(async () => { await addon1.unload().catch(() => {}) - await loader1.close().catch(() => {}) }) await addon1.load() @@ -262,10 +254,9 @@ test('Multiple load/unload cycles on one instance while another generates', { } cyclesCompleted++ const cycleNum = cyclesCompleted - const { addon: addon2, loader: loader2 } = await createInstance(modelName, dirPath) + const { addon: addon2 } = await createInstance(modelName, dirPath) await addon2.load() await addon2.unload() - await loader2.close() t.pass(`load/unload cycle ${cycleNum} completed while instance 1 generates`) } diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js index 9adf88567a..546abe8f91 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/ocr-lighton.test.js @@ 
-4,7 +4,6 @@ const test = require('brittle') const fs = require('bare-fs') const path = require('bare-path') const { ensureModel, getMediaPath } = require('./utils') -const FilesystemDL = require('@qvac/dl-filesystem') const LlmLlamacpp = require('../../index.js') const os = require('bare-os') @@ -55,23 +54,20 @@ async function setupLightOnInference (t, device = 'gpu') { const [projModelName] = await ensureModel(LIGHTON_OCR_CONFIG.projModel) t.ok(fs.existsSync(path.join(dirPath, projModelName)), 'Projection model file should exist') - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const inference = new LlmLlamacpp({ - modelName, - loader, - logger: console, - diskPath: dirPath, - projectionModel: projModelName - }, getConfig(device)) + files: { model: [modelPath], projectionModel: path.join(dirPath, projModelName) }, + config: getConfig(device), + logger: console + }) t.teardown(async () => { - await loader.close() await inference.unload() }) await inference.load() - return { inference, loader } + return { inference } } async function runOcr (inference, imageFilePath) { diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/reasoning.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/reasoning.test.js index e29d5cf0c0..e27e1ee7e3 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/reasoning.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/reasoning.test.js @@ -1,10 +1,10 @@ 'use strict' const test = require('brittle') +const path = require('bare-path') const { ensureModel } = require('./utils') const { attachSpecLogger } = require('./spec-logger') const os = require('bare-os') -const FilesystemDL = require('@qvac/dl-filesystem') const LlmLlamacpp = require('../../index.js') const isDarwinX64 = os.platform() === 'darwin' && os.arch() === 'x64' @@ -23,7 +23,7 @@ async function setupReasoningModel (t, toolsEnabled) { downloadUrl: MODEL.url }) - const loader = new 
FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const specLogger = attachSpecLogger({ forwardToConsole: true }) const config = { @@ -38,27 +38,24 @@ async function setupReasoningModel (t, toolsEnabled) { } const inference = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config, logger: console, - projectionPath: '', opts: { stats: true } - }, config) + }) await inference.load() t.teardown(async () => { try { specLogger.release() - if (loader) await loader.close() if (inference) await inference.unload() } catch (err) { // Ignore cleanup errors } }) - return { inference, loader } + return { inference } } // Shared helper: Run a completion and collect response diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/sliding-context.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/sliding-context.test.js index 384306b5e5..c29c4d3977 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/sliding-context.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/sliding-context.test.js @@ -2,7 +2,6 @@ const test = require('brittle') const path = require('bare-path') -const FilesystemDL = require('@qvac/dl-filesystem') const LlmLlamacpp = require('../../index.js') const { ensureModel } = require('./utils') const os = require('bare-os') @@ -55,7 +54,7 @@ async function setupModel (t, overrides = {}) { downloadUrl: DEFAULT_MODEL.url }) - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const baseConfig = { device: useCpu ? 
'cpu' : 'gpu', @@ -69,19 +68,13 @@ async function setupModel (t, overrides = {}) { } const model = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config: { ...baseConfig, ...overrides }, logger: createTestLogger(), opts: { stats: true } - }, { ...baseConfig, ...overrides }) + }) - try { - await model.load() - } catch (err) { - await loader.close().catch(() => {}) - throw err - } + await model.load() t.teardown(async () => { // Guard against model.unload() hanging after context overflow (seen on darwin-arm64 CI). @@ -89,7 +82,6 @@ async function setupModel (t, overrides = {}) { const unloadDone = model.unload().catch(() => {}) const unloadTimeout = new Promise(resolve => setTimeout(resolve, 30_000)) await Promise.race([unloadDone, unloadTimeout]) - await loader.close().catch(() => {}) }) return { model, dirPath } diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/tool-calling.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/tool-calling.test.js index 0a6272acfd..6e2f2b0d80 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/tool-calling.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/tool-calling.test.js @@ -1,7 +1,7 @@ 'use strict' const test = require('brittle') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const LlmLlamacpp = require('../../index.js') const { ensureModel } = require('./utils') const { attachSpecLogger } = require('./spec-logger') @@ -141,7 +141,7 @@ async function createToolModel (modelVariant) { downloadUrl: modelVariant.downloadUrl }) - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const specLogger = attachSpecLogger({ forwardToConsole: true }) let loggerReleased = false const releaseLogger = () => { @@ -151,18 +151,16 @@ async function createToolModel (modelVariant) { } const model = new LlmLlamacpp({ - loader, - modelName, - diskPath: 
dirPath, + files: { model: [modelPath] }, + config: BASE_CONFIG, logger: console, opts: { stats: true } - }, BASE_CONFIG) + }) try { await model.load() } catch (err) { releaseLogger() - await loader.close().catch(() => {}) throw err } @@ -170,7 +168,6 @@ async function createToolModel (modelVariant) { model, async release () { await model.unload().catch(() => {}) - await loader.close().catch(() => {}) releaseLogger() } } diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/integration/utf8-output.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/integration/utf8-output.test.js index 18c0fb0670..1450600666 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/integration/utf8-output.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/integration/utf8-output.test.js @@ -1,7 +1,7 @@ 'use strict' const test = require('brittle') -const FilesystemDL = require('@qvac/dl-filesystem') +const path = require('bare-path') const LlmLlamacpp = require('../../index.js') const { ensureModel } = require('./utils') const { attachSpecLogger } = require('./spec-logger') @@ -39,7 +39,7 @@ test('model returns UTF-8 emoji without truncation', { timeout: 600_000 }, async downloadUrl: MODEL.url }) - const loader = new FilesystemDL({ dirPath }) + const modelPath = path.join(dirPath, modelName) const specLogger = attachSpecLogger({ forwardToConsole: true }) let loggerReleased = false const releaseLogger = () => { @@ -61,12 +61,11 @@ test('model returns UTF-8 emoji without truncation', { timeout: 600_000 }, async } const model = new LlmLlamacpp({ - loader, - modelName, - diskPath: dirPath, + files: { model: [modelPath] }, + config, logger: console, opts: { stats: true } - }, config) + }) let output = '' try { @@ -86,7 +85,6 @@ test('model returns UTF-8 emoji without truncation', { timeout: 600_000 }, async t.ok(response.stats.generatedTokens > 0, 'token stats recorded') } finally { await model.unload().catch(() => {}) - await loader.close().catch(() => {}) releaseLogger() } }) diff 
--git a/packages/qvac-lib-infer-llamacpp-llm/test/mobile/README.md b/packages/qvac-lib-infer-llamacpp-llm/test/mobile/README.md index 2a82fbd4cc..cb8be631a3 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/mobile/README.md +++ b/packages/qvac-lib-infer-llamacpp-llm/test/mobile/README.md @@ -1,46 +1,43 @@ # Mobile Testing for LLM Llamacpp -This directory contains the mobile test configuration for the `@qvac/llm-llamacpp` addon. +This directory contains the mobile test entrypoint for the `@qvac/llm-llamacpp` addon. > ⚠️ **Note**: This test directory is included in the published npm package to support the mobile testing framework. These test files are NOT part of the public API and should only be used by the internal mobile testing infrastructure. ## Test Structure -- `test.cjs` - Main test file with `startTest()` function that runs automatically on mobile -- `testAssets/` - Directory for model files and test data +- `integration-runtime.cjs` — Bare-runtime helper that exposes a global `runIntegrationModule()` so each generated test entry can dynamically import a single file under `../integration/`. +- `integration.auto.cjs` — **Auto-generated** by `npm run test:mobile:generate`. Each function in this file mirrors one `.test.js` under `test/integration/` and invokes it through the runtime helper. Do not edit by hand; regenerate after adding or renaming integration tests. +- `testAssets/` — Directory for model files and test data referenced by the integration tests. -## Setup +## What the Mobile Tests Do -### Download Test Model +The mobile tests run the **same integration suite** that lives under `test/integration/`. They exercise the public `LlmLlamacpp` API end-to-end: -The test requires a small GGUF model file. Download it to the `testAssets` directory: +1. **Construct the addon** with the new constructor shape — `new LlmLlamacpp({ files: { model: [absolutePath] }, config, logger?, opts? })`. 
For sharded GGUF models the caller pre-resolves the shard list (`tensors.txt` + every `*-NNNNN-of-MMMMM.gguf` file). +2. **Load** the model into memory via `model.load()`. +3. **Run** inference, finetuning, generation-parameter, KV-cache, and other scenarios depending on which test entry is invoked. +4. **Unload** the model via `model.unload()` (or via `t.teardown()` in brittle tests). -```bash -cd test/mobile/testAssets +There is **no separate `test.cjs` file** and the addon no longer takes a `Loader` instance — file paths are passed directly to the constructor by the test (or by the test helper in `test/integration/utils.js`). Mobile testing reuses these helpers unchanged. -# Download a small test model (~500KB) -curl -L -o small-test-model.gguf \ - https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K.gguf -``` +## Setup -### Verify Setup +### Test Assets -```bash -ls -lh testAssets/ -# Should show: small-test-model.gguf (~500KB) -``` +Each integration test downloads or expects its own model under `test/integration/...` (or under `testAssets/`). See the individual test files for the exact model required. Most tests rely on `setupModel()` / `setupTinyModel()` helpers in `test/integration/utils.js`, which resolve the absolute file paths and pass them through `files.model`. + +## Regenerating `integration.auto.cjs` -## What the Test Does +After adding a new file under `test/integration/`, regenerate the mobile entries: -The mobile test performs a complete LLM inference workflow: +```bash +npm run test:mobile:generate +``` -1. **Initialize Filesystem Loader** - Sets up file access for the model -2. **Configure Model** - Uses GPU-accelerated settings (99 GPU layers) for faster inference -3. **Load Model** - Loads the GGUF model weights into memory and offloads to GPU -4. **Run Inference** - Generates text from the prompt "Say hello in one word" -5. 
**Cleanup** - Properly destroys the model instance and closes the loader +This walks `test/integration/`, derives a function name per test file, and rewrites `integration.auto.cjs`. The generator script also runs from CI to ensure mobile and desktop test inventories stay in sync. -## Running the Test +## Running the Tests From the mobile tester app root: @@ -55,42 +52,16 @@ npm run android npm run ios ``` -The app will: -- Automatically initialize after 3 seconds -- Start the test after 5 seconds -- Display progress and results on screen - -## Expected Output - -Success message will show: -``` -TEST COMPLETE ✓ - -Model loaded and generated X characters in response to: "Say hello in one word." - -Generated: Hello -``` +The app drives the auto-generated entrypoints to execute the desired test scenarios on-device. ## Troubleshooting ### Model file not found -- Ensure `small-test-model.gguf` is in the `testAssets/` directory -- Check that the file downloaded completely (~500KB) +- Ensure the test asset referenced by the failing integration test is present under `test/integration/` (or `testAssets/`). +- For sharded models, every shard plus the `*.tensors.txt` file must be present — the caller is responsible for the full file set since the addon no longer downloads weights. ### Out of memory -- The test uses a very small model (~500KB) -- If issues persist, try closing other apps +- Mobile devices have limited RAM. Prefer the smaller test models (e.g. tinyllama / Qwen-0.6B) for on-device runs and skip large-model tests where possible. ### Timeout errors -- The test waits up to 60 seconds for generation -- On slower devices, this may need to be increased in `test.cjs` - -## Model Details - -**Model**: TinyLlamas Stories 260K -- Size: ~500KB -- Format: GGUF -- Purpose: Fast mobile testing -- Source: https://huggingface.co/ggml-org/models - -This is an extremely small model designed for quick testing, not production use. 
+- Generation timeouts can be tuned per test file in `test/integration/...` via the brittle `{ timeout }` option. diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/unit/finetuning.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/unit/finetuning.test.js index 9bb24441fe..d0d3b727f1 100644 --- a/packages/qvac-lib-infer-llamacpp-llm/test/unit/finetuning.test.js +++ b/packages/qvac-lib-infer-llamacpp-llm/test/unit/finetuning.test.js @@ -60,17 +60,12 @@ async function assertInferenceSucceeds (t, model, token) { } const createModelWithMockAddon = (opts = {}) => { - const loader = { close: () => Promise.resolve() } - const model = new LlmLlamacpp( - { - loader, - opts, - logger: { info: () => {}, warn: () => {}, error: () => {}, debug: () => {} }, - diskPath: '.', - modelName: 'test.gguf' - }, - { device: 'cpu', ctx_size: '256' } - ) + const model = new LlmLlamacpp({ + files: { model: ['/tmp/test.gguf'] }, + config: { device: 'cpu', ctx_size: '256' }, + opts, + logger: { info: () => {}, warn: () => {}, error: () => {}, debug: () => {} } + }) model.addon = createMockAddon() return model } @@ -183,13 +178,14 @@ test('finetune() runs inside exclusive queue wrapper', async (t) => { model.addon.finetune.callsFake(completeFinetuneWith(model)) let wrapperCalled = false - model._withExclusiveRun = async (fn) => { + const originalRun = model._run + model._run = async (fn) => { wrapperCalled = true - return await fn() + return await originalRun(fn) } const handle = await model.finetune(opts) - t.ok(wrapperCalled, 'finetune should execute inside _withExclusiveRun') + t.ok(wrapperCalled, 'finetune should execute inside exclusiveRunQueue') const result = await handle.await() t.alike(result, { op: 'finetune', status: 'COMPLETED' }) }) @@ -405,16 +401,16 @@ test('_skipNextRuntimeStats swallows TPS stats that follow a finetune terminal r model.addon.finetune.callsFake(() => true) const handle = await model.finetune(opts) - t.is(model._skipNextRuntimeStats, false, 'flag starts 
false before finetune terminal arrives') + t.is(model._addonEventState.skipNextRuntimeStats, false, 'flag starts false before finetune terminal arrives') model._addonOutputCallback(null, 'Output', { op: 'finetune', status: 'COMPLETED' }, null) - t.is(model._skipNextRuntimeStats, true, 'flag must be set after finetune terminal result') + t.is(model._addonEventState.skipNextRuntimeStats, true, 'flag must be set after finetune terminal result') const result = await handle.await() t.alike(result, { op: 'finetune', status: 'COMPLETED' }) model._addonOutputCallback(null, 'Output', { TPS: 0, tokens: 0, time_ms: 100 }, null) - t.is(model._skipNextRuntimeStats, false, 'flag must reset after TPS stats are consumed') + t.is(model._addonEventState.skipNextRuntimeStats, false, 'flag must reset after TPS stats are consumed') }) test('TPS stats without prior finetune are forwarded as normal JobEnded', async (t) => { @@ -422,7 +418,7 @@ test('TPS stats without prior finetune are forwarded as normal JobEnded', async model.addon.runJob.callsFake(() => true) const response = await model._runInternal([{ role: 'user', content: 'Hello' }]) - t.is(model._skipNextRuntimeStats, false, 'flag should be false without finetune') + t.is(model._addonEventState.skipNextRuntimeStats, false, 'flag should be false without finetune') model._addonOutputCallback(null, 'Output', 'world', null) model._addonOutputCallback(null, 'Output', { TPS: 42.5, tokens: 10, time_ms: 235 }, null) @@ -430,7 +426,7 @@ test('TPS stats without prior finetune are forwarded as normal JobEnded', async const output = await response.await() t.ok(Array.isArray(output), 'inference response should resolve with output array') t.ok(output.includes('world'), 'output should contain the emitted token') - t.is(model._skipNextRuntimeStats, false, 'flag should remain false') + t.is(model._addonEventState.skipNextRuntimeStats, false, 'flag should remain false') t.is(model._hasActiveResponse, false, 'busy state should be cleared') }) @@ 
-442,13 +438,13 @@ test('_skipNextRuntimeStats prevents finetune TPS from ending a subsequent infer const finetuneHandle = await model.finetune(opts) model._addonOutputCallback(null, 'Output', { op: 'finetune', status: 'COMPLETED' }, null) await finetuneHandle.await() - t.is(model._skipNextRuntimeStats, true, 'skip flag should be armed after finetune') + t.is(model._addonEventState.skipNextRuntimeStats, true, 'skip flag should be armed after finetune') model.addon.runJob.callsFake(() => true) const inferResponse = await model._runInternal([{ role: 'user', content: 'Hello' }]) model._addonOutputCallback(null, 'Output', { TPS: 0, tokens: 0 }, null) - t.is(model._skipNextRuntimeStats, false, 'flag should reset after consuming stale TPS') + t.is(model._addonEventState.skipNextRuntimeStats, false, 'flag should reset after consuming stale TPS') t.is(inferResponse.getStatus(), 'running', 'inference must still be running after stale TPS was swallowed') model._addonOutputCallback(null, 'Output', 'answer', null) diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/unit/map-addon-event.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/unit/map-addon-event.test.js new file mode 100644 index 0000000000..82cdae61c0 --- /dev/null +++ b/packages/qvac-lib-infer-llamacpp-llm/test/unit/map-addon-event.test.js @@ -0,0 +1,76 @@ +'use strict' + +const test = require('brittle') +const { mapAddonEvent } = require('../../addon.js') + +function makeState (overrides = {}) { + return { skipNextRuntimeStats: false, ...overrides } +} + +test('TPS-shaped data maps to JobEnded with mapped backendDevice (cpu)', function (t) { + const state = makeState() + const result = mapAddonEvent('anything', { TPS: 42, tokens: 10, backendDevice: 0 }, null, state) + t.is(result.type, 'JobEnded') + t.is(result.data.TPS, 42) + t.is(result.data.backendDevice, 'cpu') + t.is(result.error, null) + t.is(state.skipNextRuntimeStats, false, 'flag remains false after normal inference terminal') +}) + +test('TPS-shaped 
data maps backendDevice 1 to "gpu"', function (t) { + const state = makeState() + const result = mapAddonEvent('anything', { TPS: 50, backendDevice: 1 }, null, state) + t.is(result.data.backendDevice, 'gpu') +}) + +test('TPS-shaped data preserves unknown backendDevice values as-is', function (t) { + const state = makeState() + const result = mapAddonEvent('anything', { TPS: 1, backendDevice: 2 }, null, state) + t.is(result.data.backendDevice, 2) +}) + +test('TPS-shaped data is dropped when skipNextRuntimeStats is set', function (t) { + const state = makeState({ skipNextRuntimeStats: true }) + const result = mapAddonEvent('anything', { TPS: 99 }, null, state) + t.is(result, null, 'returns null to drop stale post-finetune TPS') + t.is(state.skipNextRuntimeStats, false, 'flag resets after consuming') +}) + +test('finetune terminal payload maps to JobEnded and arms skip flag', function (t) { + const state = makeState() + const payload = { op: 'finetune', status: 'COMPLETED', stats: { loss: 0.1 } } + const result = mapAddonEvent('anything', payload, null, state) + t.is(result.type, 'JobEnded') + t.is(result.data, payload) + t.is(state.skipNextRuntimeStats, true, 'skip flag armed to swallow the TPS trailer') +}) + +test('finetune_progress payload maps to FinetuneProgress', function (t) { + const state = makeState() + const payload = { type: 'finetune_progress', stats: { loss: 0.2 } } + const result = mapAddonEvent('anything', payload, null, state) + t.is(result.type, 'FinetuneProgress') + t.is(result.data, payload) +}) + +test('event name containing "Error" maps to Error with rawError', function (t) { + const state = makeState() + const err = new Error('boom') + const result = mapAddonEvent('SomeError', null, err, state) + t.is(result.type, 'Error') + t.is(result.error, err) +}) + +test('string data maps to Output (token streaming)', function (t) { + const state = makeState() + const result = mapAddonEvent('OutputString', 'hello', null, state) + t.is(result.type, 
'Output') + t.is(result.data, 'hello') +}) + +test('unknown event with non-TPS object falls through to default mapping', function (t) { + const state = makeState() + const result = mapAddonEvent('Unknown', { foo: 'bar' }, null, state) + t.is(result.type, 'Unknown', 'falls through preserving original event name') + t.alike(result.data, { foo: 'bar' }) +}) diff --git a/packages/qvac-lib-infer-llamacpp-llm/test/unit/pick-primary-gguf-path.test.js b/packages/qvac-lib-infer-llamacpp-llm/test/unit/pick-primary-gguf-path.test.js new file mode 100644 index 0000000000..89a722c67d --- /dev/null +++ b/packages/qvac-lib-infer-llamacpp-llm/test/unit/pick-primary-gguf-path.test.js @@ -0,0 +1,35 @@ +'use strict' + +const test = require('brittle') +const { pickPrimaryGgufPath } = require('../../index.js') + +test('single non-sharded file returns that file', function (t) { + const files = ['/models/Qwen3-1.7B-Q4_0.gguf'] + t.is(pickPrimaryGgufPath(files), '/models/Qwen3-1.7B-Q4_0.gguf') +}) + +test('sharded model with tensors.txt first returns first shard, not tensors.txt', function (t) { + const files = [ + '/models/medgemma-4b-it-Q4_1.tensors.txt', + '/models/medgemma-4b-it-Q4_1-00001-of-00005.gguf', + '/models/medgemma-4b-it-Q4_1-00002-of-00005.gguf', + '/models/medgemma-4b-it-Q4_1-00003-of-00005.gguf', + '/models/medgemma-4b-it-Q4_1-00004-of-00005.gguf', + '/models/medgemma-4b-it-Q4_1-00005-of-00005.gguf' + ] + t.is(pickPrimaryGgufPath(files), '/models/medgemma-4b-it-Q4_1-00001-of-00005.gguf') +}) + +test('sharded model without tensors.txt returns first shard', function (t) { + const files = [ + '/models/Qwen3-0.6B-UD-IQ1_S-00001-of-00003.gguf', + '/models/Qwen3-0.6B-UD-IQ1_S-00002-of-00003.gguf', + '/models/Qwen3-0.6B-UD-IQ1_S-00003-of-00003.gguf' + ] + t.is(pickPrimaryGgufPath(files), '/models/Qwen3-0.6B-UD-IQ1_S-00001-of-00003.gguf') +}) + +test('non-gguf file falls back to first entry', function (t) { + const files = ['/models/some-model.bin'] + 
t.is(pickPrimaryGgufPath(files), '/models/some-model.bin') +}) diff --git a/packages/sdk/bun.lock b/packages/sdk/bun.lock index 1c4482e9c2..6a8fe3c38e 100644 --- a/packages/sdk/bun.lock +++ b/packages/sdk/bun.lock @@ -6,12 +6,12 @@ "name": "@qvac/sdk", "dependencies": { "@qvac/decoder-audio": "^0.3.7", - "@qvac/diffusion-cpp": "0.1.3", + "@qvac/diffusion-cpp": "^0.3.0", "@qvac/dl-filesystem": "^0.2.1", - "@qvac/embed-llamacpp": "^0.13.4", + "@qvac/embed-llamacpp": "^0.14.0", "@qvac/error": "^0.1.1", "@qvac/langdetect-text": "^0.1.2", - "@qvac/llm-llamacpp": "^0.14.4", + "@qvac/llm-llamacpp": "^0.16.0", "@qvac/logging": "^0.1.0", "@qvac/ocr-onnx": "^0.4.2", "@qvac/rag": "^0.4.4", @@ -509,7 +509,7 @@ "@qvac/diagnostics": ["@qvac/diagnostics@0.1.1", "", {}, "sha512-KUWpnNjtsNM2h2yIJXyQ6E5S53GDdRf2LXxp0E5dH7qLD5hToBrP4wjvdmahwmBobL7nJU9rum6uT3JgLVKm3w=="], - "@qvac/diffusion-cpp": ["@qvac/diffusion-cpp@0.1.3", "", { "dependencies": { "@qvac/infer-base": "^0.2.2", "bare-path": "^3.0.0", "bare-process": "^4.2.2" } }, "sha512-bGg/lz0sjLehz0EbQEs2yCnwNfX03YtfyzeLqM6wrltRVmAY+MSY16NngKS7yzi+yFN/Zu91xsKp3D2rWFuyUw=="], + "@qvac/diffusion-cpp": ["@qvac/diffusion-cpp@0.3.0", "", { "dependencies": { "@qvac/infer-base": "^0.4.0", "@qvac/logging": "^0.1.0", "bare-fs": "^4.5.1", "bare-path": "^3.0.0", "bare-process": "^4.2.2" } }, "sha512-2qT+VVc44BerV9yqD0U9GSk/3uE7FBvF6ad8ZOFF2e99wLpo1141ZjYDwKuctlb0rtRPMdRXC/O369CYgQjhig=="], "@qvac/dl-base": ["@qvac/dl-base@0.2.1", "", { "dependencies": { "@qvac/logging": "^0.1.0", "ready-resource": "^1.1.1" } }, "sha512-Wd1/oOFsGb4O0kTKWw8OuQXhdr5EGBGn99zEdDYj4h1c+jUtAkpb39W5rf7D0A/XWwyVgsxL7vi7YHv6Xe7n/Q=="], @@ -517,7 +517,7 @@ "@qvac/dl-hyperdrive": ["@qvac/dl-hyperdrive@0.2.1", "", { "dependencies": { "@qvac/dl-base": "^0.2.0", "@qvac/error": "^0.1.0", "@qvac/infer-base": "^0.1.0", "corestore": "^7.1.0", "hypercore-id-encoding": "1.3.0", "hyperdrive": "^13.0.1", "hyperswarm": "^4.10.1", "test-tmp": "^1.3.0", "z32": "1.1.0" } }, 
"sha512-UfDnAYx/CPVxcUbIkEHreHYbxV2UZa8krBDo8doCfhwuYLNfH1Tu5x9s7j9xWs4/AEFgoJzW5sppkDtWAkvyxQ=="], - "@qvac/embed-llamacpp": ["@qvac/embed-llamacpp@0.13.4", "", { "dependencies": { "@qvac/infer-base": "^0.2.2", "@qvac/logging": "^0.1.0", "bare-path": "^3.0.0" }, "peerDependencies": { "@qvac/dl-hyperdrive": "^0.1.0" } }, "sha512-XN+0WP+1gOM9PLM8AaJL/F//MYEArtQVN4Mzy5MOIvbuBfiqRtFw/z0IxWXB1fEjVusjxH2qc4k/3b3ywSDHXg=="], + "@qvac/embed-llamacpp": ["@qvac/embed-llamacpp@0.14.0", "", { "dependencies": { "@qvac/infer-base": "^0.4.0", "@qvac/logging": "^0.1.0", "bare-fs": "^4.5.1", "bare-path": "^3.0.0" } }, "sha512-laXF58dOhLcltvrKp2tMx6ChTYWuW8GjEgqP3v5ZUw8xHaWuIoyVKR8sTNgXNDqyTr4dLZRp/zkl535K/WOPvw=="], "@qvac/error": ["@qvac/error@0.1.1", "", {}, "sha512-Xv7p1wnC/JmsKimGrkvXlcq+AHsG1r33f+uayANuEYe5ThFi+FR3txnN2UPjulwBdDorEnger9/+9ftShAFOAw=="], @@ -525,7 +525,7 @@ "@qvac/langdetect-text": ["@qvac/langdetect-text@0.1.2", "", { "dependencies": { "tinyld": "1.3.4" } }, "sha512-V6ntqPNBmz+49eIaY8jYdpgyx8MzSk9/bNp9ibSn+Xwx1D/8Mca8RNfn7/gHWsuACMvkvvJmNzZGGLu1eOW3og=="], - "@qvac/llm-llamacpp": ["@qvac/llm-llamacpp@0.14.4", "", { "dependencies": { "@qvac/infer-base": "^0.3.0", "bare-fs": "^4.5.1", "bare-path": "^3.0.0", "bare-process": "^4.2.2" } }, "sha512-0jAXNcSaDaCgOPiueg/lGbxd3ofuLDtPn5rg0u9bTA3lwOX8qFF1You0h0sc6uOJH9ivt4SI+gf3VGo20+vSGg=="], + "@qvac/llm-llamacpp": ["@qvac/llm-llamacpp@0.16.0", "", { "dependencies": { "@qvac/infer-base": "^0.4.0", "@qvac/logging": "^0.1.0", "bare-fs": "^4.5.1", "bare-path": "^3.0.0", "bare-process": "^4.2.2" } }, "sha512-BJTEVnzsx1uhUpyPp3nTOLDBk3LQZQN1D2xfY4Hi0o6WTGbIoS/YGfxNG57rAVt9njzk5RI1QH8EDUhbdke3AA=="], "@qvac/logging": ["@qvac/logging@0.1.0", "", {}, "sha512-B9JayZKJGzSsM/9JmMdO7wiOOZ2mY5aWTbXl2aIKzy+l2Uqzkoby0IxMjKSVtYo6uMDs2zcrZqLtjI2dDSaYog=="], @@ -2551,11 +2551,11 @@ "@npmcli/promise-spawn/which": ["which@4.0.0", "", { "dependencies": { "isexe": "^3.1.1" }, "bin": { "node-which": "bin/which.js" } }, 
"sha512-GlaYyEb07DPxYCKhKzplCWBJtvxZcZMrL+4UkrTSJHHPyZU4mYYTv3qaOe77H7EODLSSopAUFAc6W8U4yqvscg=="], - "@qvac/diffusion-cpp/@qvac/infer-base": ["@qvac/infer-base@0.2.2", "", { "dependencies": { "@qvac/error": "^0.1.0", "@qvac/logging": "^0.1.0", "bare-events": "2.4.2", "bare-os": "^3.2.0", "bare-path": "^3.0.0" }, "peerDependencies": { "@qvac/dl-hyperdrive": "^0.1.0" } }, "sha512-8D/5PRIy/A+Uhg1ZSoJMu5FSPDHdrMKZoPnAzTZMceikTj+BWwTV//j8pXbRABsjrFbqBegvr/LujirC9I2cRQ=="], + "@qvac/diffusion-cpp/@qvac/infer-base": ["@qvac/infer-base@0.4.0", "", { "dependencies": { "@qvac/error": "^0.1.0", "@qvac/logging": "^0.1.0", "bare-events": "2.4.2", "bare-os": "^3.2.0", "bare-path": "^3.0.0" }, "optionalDependencies": { "@qvac/diagnostics": "^0.1.0" }, "peerDependencies": { "@qvac/dl-hyperdrive": "^0.1.0" } }, "sha512-KoD4PrNzcScFjuLdGSTwtN/i10tMuwpRUW9g5lIiIaoR4s36NHwkfcxjyQDlHFlXkYjf3p3IpWXJgKOSo3jAqg=="], - "@qvac/embed-llamacpp/@qvac/infer-base": ["@qvac/infer-base@0.2.2", "", { "dependencies": { "@qvac/error": "^0.1.0", "@qvac/logging": "^0.1.0", "bare-events": "2.4.2", "bare-os": "^3.2.0", "bare-path": "^3.0.0" }, "peerDependencies": { "@qvac/dl-hyperdrive": "^0.1.0" } }, "sha512-8D/5PRIy/A+Uhg1ZSoJMu5FSPDHdrMKZoPnAzTZMceikTj+BWwTV//j8pXbRABsjrFbqBegvr/LujirC9I2cRQ=="], + "@qvac/embed-llamacpp/@qvac/infer-base": ["@qvac/infer-base@0.4.0", "", { "dependencies": { "@qvac/error": "^0.1.0", "@qvac/logging": "^0.1.0", "bare-events": "2.4.2", "bare-os": "^3.2.0", "bare-path": "^3.0.0" }, "optionalDependencies": { "@qvac/diagnostics": "^0.1.0" }, "peerDependencies": { "@qvac/dl-hyperdrive": "^0.1.0" } }, "sha512-KoD4PrNzcScFjuLdGSTwtN/i10tMuwpRUW9g5lIiIaoR4s36NHwkfcxjyQDlHFlXkYjf3p3IpWXJgKOSo3jAqg=="], - "@qvac/llm-llamacpp/@qvac/infer-base": ["@qvac/infer-base@0.3.1", "", { "dependencies": { "@qvac/error": "^0.1.0", "@qvac/logging": "^0.1.0", "bare-events": "2.4.2", "bare-os": "^3.2.0", "bare-path": "^3.0.0" }, "optionalDependencies": { "@qvac/diagnostics": "^0.1.0" }, 
"peerDependencies": { "@qvac/dl-hyperdrive": "^0.1.0" } }, "sha512-Os5dHW7x96V2+63k0uO1Z9l9rbz3eHvURC0iqNH+346EeG7EGzqvW7EfCh6a4rjY8PXLSqGTTuo9yv5ZzbSdhg=="], + "@qvac/llm-llamacpp/@qvac/infer-base": ["@qvac/infer-base@0.4.0", "", { "dependencies": { "@qvac/error": "^0.1.0", "@qvac/logging": "^0.1.0", "bare-events": "2.4.2", "bare-os": "^3.2.0", "bare-path": "^3.0.0" }, "optionalDependencies": { "@qvac/diagnostics": "^0.1.0" }, "peerDependencies": { "@qvac/dl-hyperdrive": "^0.1.0" } }, "sha512-KoD4PrNzcScFjuLdGSTwtN/i10tMuwpRUW9g5lIiIaoR4s36NHwkfcxjyQDlHFlXkYjf3p3IpWXJgKOSo3jAqg=="], "@qvac/ocr-onnx/@qvac/infer-base": ["@qvac/infer-base@0.4.0", "", { "dependencies": { "@qvac/error": "^0.1.0", "@qvac/logging": "^0.1.0", "bare-events": "2.4.2", "bare-os": "^3.2.0", "bare-path": "^3.0.0" }, "optionalDependencies": { "@qvac/diagnostics": "^0.1.0" }, "peerDependencies": { "@qvac/dl-hyperdrive": "^0.1.0" } }, "sha512-KoD4PrNzcScFjuLdGSTwtN/i10tMuwpRUW9g5lIiIaoR4s36NHwkfcxjyQDlHFlXkYjf3p3IpWXJgKOSo3jAqg=="], diff --git a/packages/sdk/package.json b/packages/sdk/package.json index 2cca82ef2d..7efd7a2b3a 100644 --- a/packages/sdk/package.json +++ b/packages/sdk/package.json @@ -173,12 +173,12 @@ }, "dependencies": { "@qvac/decoder-audio": "^0.3.7", - "@qvac/diffusion-cpp": "^0.1.3", + "@qvac/diffusion-cpp": "^0.3.0", "@qvac/dl-filesystem": "^0.2.1", - "@qvac/embed-llamacpp": "^0.13.4", + "@qvac/embed-llamacpp": "^0.14.0", "@qvac/error": "^0.1.1", "@qvac/langdetect-text": "^0.1.2", - "@qvac/llm-llamacpp": "^0.14.4", + "@qvac/llm-llamacpp": "^0.16.0", "@qvac/logging": "^0.1.0", "@qvac/ocr-onnx": "^0.4.2", "@qvac/rag": "^0.4.4", diff --git a/packages/sdk/server/bare/plugins/llamacpp-completion/plugin.ts b/packages/sdk/server/bare/plugins/llamacpp-completion/plugin.ts index a7765b594b..93837acc2e 100644 --- a/packages/sdk/server/bare/plugins/llamacpp-completion/plugin.ts +++ b/packages/sdk/server/bare/plugins/llamacpp-completion/plugin.ts @@ -1,4 +1,4 @@ -import LlmLlamacpp, { 
type Loader as LlmLoader } from "@qvac/llm-llamacpp"; +import LlmLlamacpp from "@qvac/llm-llamacpp"; import llmAddonLogging from "@qvac/llm-llamacpp/addonLogging"; import { definePlugin, @@ -19,9 +19,7 @@ import { type LlmConfigInput, } from "@/schemas"; import { createStreamLogger, registerAddonLogger } from "@/logging"; -import { parseModelPath } from "@/server/utils"; -import FilesystemDL from "@qvac/dl-filesystem"; -import { asLoader } from "@/server/bare/utils/loader-adapter"; +import { expandGGUFIntoShards } from "@/server/utils/expand-gguf-shards"; import { completion } from "@/server/bare/plugins/llamacpp-completion/ops/completion-stream"; import { finetune } from "@/server/bare/plugins/llamacpp-completion/ops/finetune"; import { translate } from "@/server/bare/ops/translate"; @@ -65,28 +63,22 @@ function createLlmModel( llmConfig: LlmConfig, projectionModelPath?: string, ) { - const { dirPath, basePath } = parseModelPath(modelPath); - const loader = new FilesystemDL({ dirPath }); const logger = createStreamLogger(modelId, ModelType.llamacppCompletion); registerAddonLogger(modelId, ModelType.llamacppCompletion, logger); const llmConfigStrings = transformLlmConfig(llmConfig); - - const args = { - loader: asLoader(loader), - opts: { stats: true }, + const modelFiles = expandGGUFIntoShards(modelPath); + + const model = new LlmLlamacpp({ + files: { + model: modelFiles, + ...(projectionModelPath && { projectionModel: projectionModelPath }), + }, + config: llmConfigStrings, logger, - diskPath: dirPath, - modelName: basePath, - projectionModel: projectionModelPath - ? 
parseModelPath(projectionModelPath).basePath - : "", - modelPath, - modelConfig: llmConfigStrings, - }; - - const model = new LlmLlamacpp(args, llmConfigStrings); - - return { model, loader }; + opts: { stats: true }, + }); + + return { model }; } export const llmPlugin = definePlugin({ @@ -112,14 +104,14 @@ export const llmPlugin = definePlugin({ createModel(params: CreateModelParams): PluginModelResult { const llmConfig = (params.modelConfig ?? {}) as LlmConfig; - const { model, loader } = createLlmModel( + const { model } = createLlmModel( params.modelId, params.modelPath, llmConfig, params.artifacts?.["projectionModelPath"], ); - return { model, loader }; + return { model, loader: undefined }; }, handlers: { diff --git a/packages/sdk/server/bare/plugins/llamacpp-embedding/plugin.ts b/packages/sdk/server/bare/plugins/llamacpp-embedding/plugin.ts index e5f3b544bf..3a8f99d105 100644 --- a/packages/sdk/server/bare/plugins/llamacpp-embedding/plugin.ts +++ b/packages/sdk/server/bare/plugins/llamacpp-embedding/plugin.ts @@ -1,7 +1,4 @@ -import EmbedLlamacpp, { - type GGMLConfig, - type Loader as EmbedLoader, -} from "@qvac/embed-llamacpp"; +import EmbedLlamacpp, { type GGMLConfig } from "@qvac/embed-llamacpp"; import embedAddonLogging from "@qvac/embed-llamacpp/addonLogging"; import { definePlugin, @@ -16,9 +13,7 @@ import { type EmbedConfig, } from "@/schemas"; import { createStreamLogger, registerAddonLogger } from "@/logging"; -import { parseModelPath } from "@/server/utils"; -import FilesystemDL from "@qvac/dl-filesystem"; -import { asLoader } from "@/server/bare/utils/loader-adapter"; +import { expandGGUFIntoShards } from "@/server/utils/expand-gguf-shards"; import { embed } from "@/server/bare/ops/embed"; import { forwardModelExecution } from "@/profiling/model-execution"; @@ -68,25 +63,20 @@ function createEmbeddingsModel( modelPath: string, embedConfig: EmbedConfig, ) { - const { dirPath, basePath } = parseModelPath(modelPath); - const loader = new 
FilesystemDL({ dirPath }); const logger = createStreamLogger(modelId, ModelType.llamacppEmbedding); registerAddonLogger(modelId, ModelType.llamacppEmbedding, logger); const config = transformEmbedConfig(embedConfig); + const modelFiles = expandGGUFIntoShards(modelPath); - const args = { - loader: asLoader(loader), - opts: { stats: true }, + const model = new EmbedLlamacpp({ + files: { model: modelFiles }, + config, logger, - diskPath: dirPath, - modelName: basePath, - modelPath, - }; - - const model = new EmbedLlamacpp(args, config); + opts: { stats: true }, + }); - return { model, loader }; + return { model }; } export const embeddingsPlugin = definePlugin({ @@ -98,13 +88,13 @@ export const embeddingsPlugin = definePlugin({ createModel(params: CreateModelParams): PluginModelResult { const embedConfig = (params.modelConfig ?? {}) as EmbedConfig; - const { model, loader } = createEmbeddingsModel( + const { model } = createEmbeddingsModel( params.modelId, params.modelPath, embedConfig, ); - return { model, loader }; + return { model, loader: undefined }; }, handlers: { diff --git a/packages/sdk/server/bare/plugins/sdcpp-generation/plugin.ts b/packages/sdk/server/bare/plugins/sdcpp-generation/plugin.ts index 7756ffb100..8c6814f504 100644 --- a/packages/sdk/server/bare/plugins/sdcpp-generation/plugin.ts +++ b/packages/sdk/server/bare/plugins/sdcpp-generation/plugin.ts @@ -1,4 +1,7 @@ -import ImgStableDiffusion, { type ImgStableDiffusionArgs, type SdConfig } from "@qvac/diffusion-cpp"; +import ImgStableDiffusion, { + type DiffusionFiles, + type SdConfig, +} from "@qvac/diffusion-cpp"; import addonLogging from "@qvac/diffusion-cpp/addonLogging"; import { definePlugin, @@ -15,7 +18,6 @@ import { type SdcppConfig, } from "@/schemas"; import { createStreamLogger, registerAddonLogger } from "@/logging"; -import { parseModelPath } from "@/server/utils"; import { diffusion } from "./ops/diffusion"; type DiffusionArtifactKey = @@ -72,23 +74,24 @@ export const diffusionPlugin = 
definePlugin({ createModel(params: CreateModelParams): PluginModelResult { const { modelId, modelPath, modelConfig, artifacts } = params; const config = (modelConfig ?? {}) as SdcppConfig; - const { dirPath, basePath } = parseModelPath(modelPath); const logger = createStreamLogger(modelId, ModelType.sdcppGeneration); registerAddonLogger(modelId, ModelType.sdcppGeneration, logger); - const addonArgs: ImgStableDiffusionArgs = { - diskPath: dirPath, - modelName: basePath, - logger, - opts: { stats: true }, - ...(artifacts?.["clipLModelPath"] && { clipLModel: artifacts["clipLModelPath"] }), - ...(artifacts?.["clipGModelPath"] && { clipGModel: artifacts["clipGModelPath"] }), - ...(artifacts?.["t5XxlModelPath"] && { t5XxlModel: artifacts["t5XxlModelPath"] }), - ...(artifacts?.["llmModelPath"] && { llmModel: artifacts["llmModelPath"] }), - ...(artifacts?.["vaeModelPath"] && { vaeModel: artifacts["vaeModelPath"] }), + const files: DiffusionFiles = { + model: modelPath, + ...(artifacts?.["clipLModelPath"] && { clipL: artifacts["clipLModelPath"] }), + ...(artifacts?.["clipGModelPath"] && { clipG: artifacts["clipGModelPath"] }), + ...(artifacts?.["t5XxlModelPath"] && { t5Xxl: artifacts["t5XxlModelPath"] }), + ...(artifacts?.["llmModelPath"] && { llm: artifacts["llmModelPath"] }), + ...(artifacts?.["vaeModelPath"] && { vae: artifacts["vaeModelPath"] }), }; - const model = new ImgStableDiffusion(addonArgs, config as SdConfig); + const model = new ImgStableDiffusion({ + files, + config: config as SdConfig, + logger, + opts: { stats: true }, + }); return { model, loader: undefined }; }, diff --git a/packages/sdk/server/utils/expand-gguf-shards.ts b/packages/sdk/server/utils/expand-gguf-shards.ts new file mode 100644 index 0000000000..cea226683c --- /dev/null +++ b/packages/sdk/server/utils/expand-gguf-shards.ts @@ -0,0 +1,52 @@ +const SHARD_PATTERN = /^(.+)-(\d{5})-of-(\d{5})\.gguf$/; + +/** + * Expand a GGUF model path into its constituent shard paths. 
+ * + * Sharded GGUF models follow the convention `<basename>-NNNNN-of-MMMMM.gguf` + * with a sibling `<basename>.tensors.txt` metadata file. This helper mirrors + * the C++ `GGUFShards::expandGGUFIntoShards` logic so the SDK can pass the + * full ordered list of files to the addon's `files.model` argument, which is + * the contract introduced by the addon-loader-abstraction refactor. + * + * Order matters: the tensors.txt file is yielded first, followed by each + * shard from `00001-of-NNNNN` through `NNNNN-of-NNNNN`. The addon streams + * weights in this order and picks the first shard-matching entry (the + * `<basename>-00001-of-NNNNN.gguf` file) as the resolved model path for native + * loading; the `.tensors.txt` companion is consumed by the weight-streaming + * layer but is not used as the primary path. + * + * Non-sharded models (or paths whose filename does not match the pattern) + * are returned as a single-element array containing the input path + * unchanged. + * + * Pure string manipulation — no filesystem or runtime-specific path module. + * Handles both POSIX (`/`) and Windows (`\`) separators based on whichever + * appears in the input. + */ +export function expandGGUFIntoShards(modelPath: string): string[] { + const lastSep = Math.max( + modelPath.lastIndexOf("/"), + modelPath.lastIndexOf("\\"), + ); + const dir = lastSep >= 0 ? modelPath.slice(0, lastSep) : ""; + const sep = lastSep >= 0 ? modelPath.charAt(lastSep) : "/"; + const filename = lastSep >= 0 ? modelPath.slice(lastSep + 1) : modelPath; + + const match = filename.match(SHARD_PATTERN); + if (!match || !match[1] || !match[3]) return [modelPath]; + + const basename = match[1]; + const totalShards = Number.parseInt(match[3], 10); + if (!Number.isFinite(totalShards) || totalShards <= 0) return [modelPath]; + + const join = (name: string) => (dir ? 
`${dir}${sep}${name}` : name); + const shards: string[] = [join(`${basename}.tensors.txt`)]; + const totalDigits = String(totalShards).padStart(5, "0"); + for (let i = 1; i <= totalShards; i++) { + shards.push( + join(`${basename}-${String(i).padStart(5, "0")}-of-${totalDigits}.gguf`), + ); + } + return shards; +} diff --git a/packages/sdk/test/unit/expand-gguf-shards.test.ts b/packages/sdk/test/unit/expand-gguf-shards.test.ts new file mode 100644 index 0000000000..9e472ba9ed --- /dev/null +++ b/packages/sdk/test/unit/expand-gguf-shards.test.ts @@ -0,0 +1,72 @@ +// @ts-expect-error brittle has no type declarations +import test from "brittle"; +import { expandGGUFIntoShards } from "@/server/utils/expand-gguf-shards"; + +test("expandGGUFIntoShards: returns single path for non-sharded model", (t) => { + const result = expandGGUFIntoShards("/models/llama-7b.gguf"); + t.alike(result, ["/models/llama-7b.gguf"]); +}); + +test("expandGGUFIntoShards: returns single path for non-gguf file", (t) => { + const result = expandGGUFIntoShards("/models/something.bin"); + t.alike(result, ["/models/something.bin"]); +}); + +test("expandGGUFIntoShards: expands sharded model when given first shard", (t) => { + const result = expandGGUFIntoShards( + "/models/medgemma-4b-it-Q4_1-00001-of-00005.gguf", + ); + t.is(result.length, 6, "tensors.txt + 5 shards"); + t.is(result[0], "/models/medgemma-4b-it-Q4_1.tensors.txt"); + t.is(result[1], "/models/medgemma-4b-it-Q4_1-00001-of-00005.gguf"); + t.is(result[5], "/models/medgemma-4b-it-Q4_1-00005-of-00005.gguf"); +}); + +test("expandGGUFIntoShards: expands sharded model when given a non-first shard", (t) => { + const result = expandGGUFIntoShards( + "/models/medgemma-4b-it-Q4_1-00003-of-00005.gguf", + ); + t.is(result.length, 6); + t.is(result[0], "/models/medgemma-4b-it-Q4_1.tensors.txt"); + for (let i = 1; i <= 5; i++) { + t.is( + result[i], + `/models/medgemma-4b-it-Q4_1-${String(i).padStart(5, "0")}-of-00005.gguf`, + ); + } +}); + 
+test("expandGGUFIntoShards: preserves nested directory in returned paths", (t) => { + const result = expandGGUFIntoShards( + "/some/nested/dir/Qwen3-1.7B-Q4_0-00001-of-00002.gguf", + ); + t.is(result.length, 3); + t.is(result[0], "/some/nested/dir/Qwen3-1.7B-Q4_0.tensors.txt"); + t.is(result[1], "/some/nested/dir/Qwen3-1.7B-Q4_0-00001-of-00002.gguf"); + t.is(result[2], "/some/nested/dir/Qwen3-1.7B-Q4_0-00002-of-00002.gguf"); +}); + +test("expandGGUFIntoShards: handles single-shard sharded model (1-of-1)", (t) => { + const result = expandGGUFIntoShards("/models/tiny-00001-of-00001.gguf"); + t.is(result.length, 2); + t.is(result[0], "/models/tiny.tensors.txt"); + t.is(result[1], "/models/tiny-00001-of-00001.gguf"); +}); + +test("expandGGUFIntoShards: handles relative path without directory", (t) => { + const result = expandGGUFIntoShards("model-00001-of-00002.gguf"); + t.is(result.length, 3); + t.is(result[0], "model.tensors.txt"); + t.is(result[1], "model-00001-of-00002.gguf"); + t.is(result[2], "model-00002-of-00002.gguf"); +}); + +test("expandGGUFIntoShards: handles Windows-style backslash separators", (t) => { + const result = expandGGUFIntoShards( + "C:\\models\\llama-00001-of-00003.gguf", + ); + t.is(result.length, 4); + t.is(result[0], "C:\\models\\llama.tensors.txt"); + t.is(result[1], "C:\\models\\llama-00001-of-00003.gguf"); + t.is(result[3], "C:\\models\\llama-00003-of-00003.gguf"); +});