Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
node_modules
.DS_Store
.vscode
.claude
/cache
/build
/build-compat
Expand Down
7 changes: 6 additions & 1 deletion .npmignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@
/llama.cpp
/models
/build
/build-*
/cache
/compat
/.github
/.vscode
/.vscode
/.claude
a.out.js
a.out.wasm
5 changes: 3 additions & 2 deletions compat/package.json
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
{
"name": "@wllama/wllama-compat",
"version": "3.4.0",
"version": "3.4.1",
"description": "Optional package providing compatibility with older browsers for @wllama/wllama",
"main": "index.js",
"type": "module",
"scripts": {
"upload": "npm publish --access public"
"upload": "npm publish --access public",
"prepublishOnly": "node ../scripts/check_package_size.js"
},
"repository": {
"type": "git",
Expand Down
2 changes: 1 addition & 1 deletion llama.cpp
5 changes: 3 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@wllama/wllama",
"version": "3.4.0",
"version": "3.4.1",
"description": "WebAssembly binding for llama.cpp - Enabling on-browser LLM inference",
"main": "index.js",
"type": "module",
Expand All @@ -25,7 +25,8 @@
"test": "vitest",
"test:firefox": "BROWSER=firefox vitest",
"test:safari": "BROWSER=safari vitest",
"test:wgpu": "WEBGPU=1 vitest"
"test:wgpu": "WEBGPU=1 vitest",
"prepublishOnly": "node scripts/check_package_size.js"
},
"repository": {
"type": "git",
Expand Down
22 changes: 22 additions & 0 deletions scripts/check_package_size.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/usr/bin/env node

import { execSync } from 'child_process';

// Publish guard, wired up as the `prepublishOnly` npm script: asks npm what it
// would pack and fails (exit code 1) when the tarball content is too large or
// contains too many files.

const MAX_SIZE = 20 * 1024 * 1024; // 20 MB
const MAX_FILES = 90;

// Convert a byte count to mebibytes for display.
const toMB = (bytes) => bytes / 1024 / 1024;

// `--dry-run --json` prints the pack report without writing a tarball.
// stderr is dropped through the `stdio` option instead of a `2>/dev/null`
// shell redirect, so the command does not depend on a POSIX shell.
const packOutput = execSync('npm pack --dry-run --json', {
  encoding: 'utf8',
  stdio: ['ignore', 'pipe', 'ignore'],
});
const result = JSON.parse(packOutput);

// Guard against an empty report so the failure is explicit rather than a
// destructuring TypeError on `undefined`.
if (!Array.isArray(result) || result.length === 0) {
  console.error('ERROR: "npm pack --dry-run --json" returned no package entry');
  process.exit(1);
}

const { unpackedSize, entryCount } = result[0];

console.log(`Unpacked size: ${toMB(unpackedSize).toFixed(2)} MB`);
console.log(`Total files: ${entryCount}`);

// Check every limit before exiting so one run reports all violations.
let hasViolation = false;

if (unpackedSize > MAX_SIZE) {
  // The limit shown is derived from MAX_SIZE so the message cannot go stale.
  console.error(`ERROR: Unpacked size exceeds ${toMB(MAX_SIZE)} MB limit`);
  hasViolation = true;
}

if (entryCount > MAX_FILES) {
  console.error(`ERROR: Total files (${entryCount}) exceeds limit of ${MAX_FILES}`);
  hasViolation = true;
}

if (hasViolation) {
  process.exit(1);
}
6 changes: 3 additions & 3 deletions src/wasm-from-cdn.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
// Do not edit this file directly

/**
 * jsDelivr URL of the `wllama.wasm` binary published in the `@wllama/wllama`
 * npm package.
 *
 * Only the current release (3.4.1) is listed: the earlier 3.4.0 entry used the
 * same `default` key and was therefore always overridden by this one.
 * NOTE(review): this file is marked as generated ("Do not edit this file
 * directly"), so the version presumably tracks package.json — confirm the
 * generator emits a single entry.
 */
const WasmFromCDN = {
  default: 'https://cdn.jsdelivr.net/npm/@wllama/wllama@3.4.1/src/wasm/wllama.wasm',
};

/**
 * jsDelivr URLs of the files published in the `@wllama/wllama-compat` npm
 * package:
 *  - `worker`: the `wasm/wllama.js` script
 *  - `wasm`:   the `wasm/wllama.wasm` binary
 *
 * Only the current release (3.4.1) is listed: the earlier 3.4.0 entries used
 * the same keys and were therefore always overridden by these.
 */
export const WasmCompatFromCDN = {
  worker: 'https://cdn.jsdelivr.net/npm/@wllama/wllama-compat@3.4.1/wasm/wllama.js',
  wasm: 'https://cdn.jsdelivr.net/npm/@wllama/wllama-compat@3.4.1/wasm/wllama.wasm',
};

export default WasmFromCDN;
Binary file modified src/wasm/wllama.wasm
Binary file not shown.
2 changes: 1 addition & 1 deletion src/workers-code/generated.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// This file is auto-generated
// To re-generate it, run: npm run build:worker

// Version tag of the bundled llama.cpp build. Declared exactly once: a second
// `export const` with the same name is a redeclaration error.
export const LIBLLAMA_VERSION = 'b9437-aa46bda';

export const LLAMA_CPP_WORKER_CODE = "// Start the main llama.cpp\nlet wllamaMalloc;\nlet wllamaStart;\nlet wllamaAction;\nlet wllamaExit;\nlet wllamaDebug;\n\nlet Module = null;\nlet isCompat = false;\nlet lastStack = '';\nlet isAborted = false;\nlet hasMultithread = false;\n\n//////////////////////////////////////////////////////////////\n// UTILS\n//////////////////////////////////////////////////////////////\n\n// send message back to main thread\nconst msg = (data, transfer) => postMessage(data, transfer);\n\n// Convert CPP log into JS log\nconst cppLogToJSLog = (line) => {\n const matched = line.match(/@@(DEBUG|INFO|WARN|ERROR)@@(.*)/);\n return !!matched\n ? {\n level: (matched[1] === 'INFO' ? 'debug' : matched[1]).toLowerCase(),\n text: matched[2],\n }\n : { level: 'log', text: line };\n};\n\nconst getHeapU8 = () => {\n const buffer = Module.wasmMemory.buffer;\n return new Uint8Array(buffer);\n};\n\nconst toSizeT = (num) => {\n return isCompat ? Number(num) : BigInt(num);\n};\n\n// Get module config that forwards stdout/err to main thread\nconst getWModuleConfig = (_argMainScriptBlob) => {\n var pathConfig = RUN_OPTIONS.pathConfig;\n var pthreadPoolSize = RUN_OPTIONS.nbThread;\n var argMainScriptBlob = _argMainScriptBlob;\n\n isCompat = RUN_OPTIONS.compat;\n hasMultithread = pthreadPoolSize > 1;\n\n msg({\n verb: 'console.debug',\n args: [\n `Multithread enabled: ${hasMultithread}, pthreadPoolSize: ${pthreadPoolSize}`,\n ],\n });\n\n if (!pathConfig['wllama.wasm']) {\n throw new Error('\"wllama.wasm\" is missing in pathConfig');\n }\n return {\n noInitialRun: true,\n print: function (text) {\n if (arguments.length > 1)\n text = Array.prototype.slice.call(arguments).join(' ');\n msg({ verb: 'console.log', args: [text] });\n },\n printErr: function (text) {\n if (arguments.length > 1)\n text = Array.prototype.slice.call(arguments).join(' ');\n if (text.startsWith('@@STACK@@')) {\n lastStack = text.slice('@@STACK@@'.length);\n return;\n }\n const logLine = 
cppLogToJSLog(text);\n msg({ verb: 'console.' + logLine.level, args: [logLine.text] });\n },\n locateFile: function (filename, basePath) {\n const p = pathConfig[filename];\n const truncate = (str) =>\n str.length > 128 ? `${str.substr(0, 128)}...` : str;\n if (filename.match(/wllama\\.worker\\.js/)) {\n msg({\n verb: 'console.error',\n args: [\n '\"wllama.worker.js\" is removed from v2.2.1. Hint: make sure to clear browser\\'s cache.',\n ],\n });\n } else {\n msg({\n verb: 'console.debug',\n args: [`Loading \"${filename}\" from \"${truncate(p)}\"`],\n });\n return p;\n }\n },\n mainScriptUrlOrBlob: hasMultithread\n ? argMainScriptBlob\n : 'throw new Error(\"Multithreading is not enabled\")',\n pthreadPoolSize: hasMultithread ? pthreadPoolSize : 0,\n wasmMemory: hasMultithread ? getWasmMemory() : null,\n onAbort: function (message) {\n isAborted = true;\n msg({ verb: 'signal.abort', args: ['abort', message, lastStack, null] });\n },\n onExit: function (code) {\n isAborted = true;\n const callstack = new Error().stack.toString();\n msg({\n verb: 'signal.abort',\n args: ['abort', 'exit(' + code + ')', callstack, null],\n });\n },\n };\n};\n\n// Get the memory to be used by wasm. (Only used in multi-thread mode)\n// Because we have a weird OOM issue on iOS, we need to try some values\n// See: https://github.com/emscripten-core/emscripten/issues/19144\n// https://github.com/godotengine/godot/issues/70621\nconst getWasmMemory = () => {\n let minBytes = 128 * 1024 * 1024;\n let maxBytes = 4096 * 1024 * 1024;\n let stepBytes = 128 * 1024 * 1024;\n while (maxBytes > minBytes) {\n try {\n const wasmMemory = new WebAssembly.Memory({\n initial: toSizeT(minBytes / 65536),\n maximum: toSizeT(maxBytes / 65536),\n shared: true,\n address: isCompat ? 
undefined : 'i64',\n });\n return wasmMemory;\n } catch (e) {\n maxBytes -= stepBytes;\n continue; // retry\n }\n }\n throw new Error('Cannot allocate WebAssembly.Memory');\n};\n\n//////////////////////////////////////////////////////////////\n// HEAPFS PATCH\n//////////////////////////////////////////////////////////////\n\n/**\n * By default, emscripten uses memfs. The way it works is by\n * allocating new Uint8Array in javascript heap. This is not good\n * because it requires files to be copied to wasm heap each time\n * a file is read.\n *\n * HeapFS is an alternative, which resolves this problem by\n * allocating space for file directly inside wasm heap. This\n * allows us to mmap without doing any copy.\n *\n * For llama.cpp, this is great because we use MAP_SHARED\n *\n * Ref: https://github.com/ngxson/wllama/pull/39\n * Ref: https://github.com/emscripten-core/emscripten/blob/main/src/library_memfs.js\n *\n * Note 29/05/2024 @ngxson\n * Due to ftell() being limited to MAX_LONG, we cannot load files bigger than 2^31 bytes (or 2GB)\n * Ref: https://github.com/emscripten-core/emscripten/blob/main/system/lib/libc/musl/src/stdio/ftell.c\n */\n\nconst fsNameToFile = {}; // map Name => File\nconst fsIdToFile = {}; // map ID => File\nlet currFileId = 0;\n\n// Patch and redirect memfs calls to wllama\nconst patchHeapFS = () => {\n const m = Module;\n // save functions\n m.MEMFS.stream_ops._read = m.MEMFS.stream_ops.read;\n m.MEMFS.stream_ops._write = m.MEMFS.stream_ops.write;\n m.MEMFS.stream_ops._llseek = m.MEMFS.stream_ops.llseek;\n m.MEMFS.stream_ops._allocate = m.MEMFS.stream_ops.allocate;\n m.MEMFS.stream_ops._mmap = m.MEMFS.stream_ops.mmap;\n m.MEMFS.stream_ops._msync = m.MEMFS.stream_ops.msync;\n\n const patchStream = (stream) => {\n const name = stream.node.name;\n if (fsNameToFile[name]) {\n const f = fsNameToFile[name];\n const ptr = Number(f.ptr);\n stream.node.contents = getHeapU8().subarray(ptr, ptr + f.size);\n stream.node.usedBytes = f.size;\n }\n 
};\n\n // replace \"read\" functions\n m.MEMFS.stream_ops.read = function (\n stream,\n buffer,\n offset,\n length,\n position\n ) {\n patchStream(stream);\n return m.MEMFS.stream_ops._read(stream, buffer, offset, length, position);\n };\n m.MEMFS.ops_table.file.stream.read = m.MEMFS.stream_ops.read;\n\n // replace \"llseek\" functions\n m.MEMFS.stream_ops.llseek = function (stream, offset, whence) {\n patchStream(stream);\n return m.MEMFS.stream_ops._llseek(stream, offset, whence);\n };\n m.MEMFS.ops_table.file.stream.llseek = m.MEMFS.stream_ops.llseek;\n\n // replace \"mmap\" functions\n m.MEMFS.stream_ops.mmap = function (stream, length, position, prot, flags) {\n patchStream(stream);\n const name = stream.node.name;\n if (fsNameToFile[name]) {\n const f = fsNameToFile[name];\n const mmapPtr = f.ptr + toSizeT(position);\n return {\n ptr: mmapPtr,\n allocated: false,\n };\n } else {\n return m.MEMFS.stream_ops._mmap(stream, length, position, prot, flags);\n }\n };\n m.MEMFS.ops_table.file.stream.mmap = m.MEMFS.stream_ops.mmap;\n\n // mount FS\n m.FS.mkdir('/models');\n m.FS.mount(m.MEMFS, { root: '.' }, '/models');\n};\n\n// Allocate a new file in wllama heapfs, returns file ID\nconst heapfsAlloc = (name, size, allocBuffer) => {\n if (size < 1) {\n throw new Error('File size must be bigger than 0');\n }\n const m = Module;\n const ptr = toSizeT(allocBuffer ? 
m.mmapAlloc(size) : 0);\n const file = {\n ptr: ptr,\n size: size,\n id: currFileId++,\n };\n fsIdToFile[file.id] = file;\n fsNameToFile[name] = file;\n return file.id;\n};\n\n// Add new file to wllama heapfs, return number of written bytes\nconst heapfsWrite = (id, buffer, offset) => {\n if (fsIdToFile[id]) {\n const { ptr, size } = fsIdToFile[id];\n const afterWriteByte = offset + buffer.byteLength;\n if (afterWriteByte > size) {\n throw new Error(\n `File ID ${id} write out of bound, afterWriteByte = ${afterWriteByte} while size = ${size}`\n );\n }\n getHeapU8().set(buffer, Number(ptr) + offset);\n return buffer.byteLength;\n } else {\n throw new Error(`File ID ${id} not found in heapfs`);\n }\n};\n\n//////////////////////////////////////////////////////////////\n// ASYNC FILE READ\n//////////////////////////////////////////////////////////////\n\nlet isAwaitReading = false;\nlet pendingReadPromise = null;\nlet pendingReadResolve = null;\nlet pendingReadReject = null;\n\nconst _stripModelsPrefix = (path) => path.replace(/^\\/?models\\//, '');\n\n// Called from EM_ASYNC_JS stub in wllama-fs.h (path is already a JS string)\nconst _wllama_js_file_read = async (path, offset, req_size, out_ptr) => {\n const name = _stripModelsPrefix(path);\n\n pendingReadPromise = new Promise((res, rej) => {\n pendingReadResolve = res;\n pendingReadReject = rej;\n });\n isAwaitReading = true;\n\n postMessage({ verb: 'fs.read_req', args: [name, offset, req_size] });\n\n let data;\n try {\n data = await pendingReadPromise;\n } finally {\n isAwaitReading = false;\n pendingReadResolve = null;\n pendingReadReject = null;\n }\n\n const bytes = new Uint8Array(data);\n getHeapU8().set(bytes, out_ptr);\n return toSizeT(bytes.length);\n};\n\n//////////////////////////////////////////////////////////////\n// MAIN CODE\n//////////////////////////////////////////////////////////////\n\nconst callWrapper = (name, ret, args, isAsync) => {\n const fn = Module.cwrap(\n name,\n ret,\n args,\n isAsync 
? { async: true } : undefined\n );\n return async (action, req) => {\n // console.log(`Calling ${name} with action:`, action, 'and req:', req);\n let result;\n try {\n if (args.length === 2) {\n result = isAsync ? await fn(action, req) : fn(action, req);\n } else {\n result = fn();\n }\n } catch (ex) {\n console.error(ex);\n throw ex;\n }\n return result;\n };\n};\n\nfunction handleError(err) {\n // If WASM already aborted, onAbort already sent signal.abort; skip to avoid\n // re-reporting the resulting WebAssembly.RuntimeError as a JS exception.\n if (isAborted) return;\n\n const message = err ? err.message || String(err) : 'Unknown error';\n const stack = err ? err.stack || String(err) : '';\n msg({\n verb: 'signal.abort',\n args: ['exception', message, stack, err],\n });\n}\n\nonmessage = async (e) => {\n if (!e.data) return;\n const { verb, args, callbackId } = e.data;\n\n // fs.read_res arrives while wasm is JSPI-suspended; resolve the pending promise.\n if (verb === 'fs.read_res') {\n if (pendingReadResolve) {\n pendingReadResolve(args[0]);\n }\n return;\n }\n\n // Guard: while awaiting a file read, reject any other incoming task.\n if (isAwaitReading) {\n if (callbackId) {\n msg({\n callbackId,\n err: 'Worker is suspended waiting for file data (JSPI)',\n });\n }\n return;\n }\n\n if (!callbackId) {\n msg({ verb: 'console.error', args: ['callbackId is required', e.data] });\n return;\n }\n\n if (verb === 'module.init') {\n const argMainScriptBlob = args[0];\n const argUseAsyncFile = args[1];\n try {\n Module = getWModuleConfig(argMainScriptBlob);\n Module.preRun = () => {\n if (argUseAsyncFile) {\n Module.ENV['USE_ASYNC_FILE'] = '1';\n }\n };\n Module.onRuntimeInitialized = () => {\n // async call once module is ready\n // init FS\n patchHeapFS();\n // init cwrap\n const pointer = isCompat ? 
'number' : 'bigint';\n // TODO: note sure why emscripten cannot bind if there is only 1 argument\n wllamaMalloc = callWrapper('wllama_malloc', pointer, [\n 'number',\n pointer,\n ]);\n wllamaStart = callWrapper('wllama_start', 'string', [], true);\n wllamaAction = callWrapper(\n 'wllama_action',\n pointer,\n ['string', pointer],\n true\n );\n wllamaExit = callWrapper('wllama_exit', 'string', []);\n wllamaDebug = callWrapper('wllama_debug', 'string', []);\n msg({ callbackId, result: null });\n };\n wModuleInit();\n } catch (err) {\n handleError(err);\n }\n return;\n }\n\n if (verb === 'fs.alloc') {\n const argFilename = args[0];\n const argSize = args[1];\n const argAllocBuffer = args[2];\n try {\n // create blank file\n const emptyBuffer = new ArrayBuffer(0);\n Module['FS_createDataFile'](\n '/models',\n argFilename,\n emptyBuffer,\n true,\n true,\n true\n );\n // alloc data on heap\n const fileId = heapfsAlloc(argFilename, argSize, argAllocBuffer);\n msg({ callbackId, result: { fileId } });\n } catch (err) {\n handleError(err);\n }\n return;\n }\n\n if (verb === 'fs.write') {\n const argFileId = args[0];\n const argBuffer = args[1];\n const argOffset = args[2];\n try {\n const writtenBytes = heapfsWrite(argFileId, argBuffer, argOffset);\n msg({ callbackId, result: { writtenBytes } });\n } catch (err) {\n handleError(err);\n }\n return;\n }\n\n if (verb === 'wllama.start') {\n try {\n const result = await wllamaStart();\n msg({ callbackId, result });\n } catch (err) {\n handleError(err);\n }\n return;\n }\n\n if (verb === 'wllama.action') {\n const argAction = args[0];\n const argEncodedMsg = args[1];\n try {\n const inputPtr = await wllamaMalloc(toSizeT(argEncodedMsg.byteLength), 0);\n // copy data to wasm heap\n const inputBuffer = new Uint8Array(\n getHeapU8().buffer,\n Number(inputPtr),\n argEncodedMsg.byteLength\n );\n inputBuffer.set(argEncodedMsg, 0);\n const outputPtr = await wllamaAction(argAction, inputPtr);\n // length of output buffer is written at the 
first 4 bytes of input buffer\n const outputLen = new Uint32Array(\n getHeapU8().buffer,\n Number(inputPtr),\n 1\n )[0];\n // copy the output buffer to JS heap\n const outputBuffer = new Uint8Array(outputLen);\n const outputSrcView = new Uint8Array(\n getHeapU8().buffer,\n Number(outputPtr),\n outputLen\n );\n outputBuffer.set(outputSrcView, 0); // copy it\n msg({ callbackId, result: outputBuffer }, [outputBuffer.buffer]);\n } catch (err) {\n handleError(err);\n }\n return;\n }\n\n if (verb === 'wllama.exit') {\n try {\n const result = await wllamaExit();\n msg({ callbackId, result });\n } catch (err) {\n handleError(err);\n }\n return;\n }\n\n if (verb === 'wllama.debug') {\n try {\n const result = await wllamaDebug();\n msg({ callbackId, result });\n } catch (err) {\n handleError(err);\n }\n return;\n }\n};\n";

Expand Down
Loading