ngxson · ngxson · May 17, 2026 · May 16, 2026 · May 16, 2026 · May 16, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -35,6 +35,11 @@ add_link_options(
     -pthread
     -sJSPI
     -sJSPI_EXPORTS=['wllama_start','wllama_action']
+    -Wl,--wrap,fopen
+    -Wl,--wrap,fclose
+    -Wl,--wrap,fread
+    -Wl,--wrap,fseek
+    -Wl,--wrap,ftell
 )
 
 add_subdirectory(llama.cpp)

diff --git a/README-dev.md b/README-dev.md
@@ -51,10 +51,6 @@ The thread pool size is passed to emscripten via `-sPTHREAD_POOL_SIZE=Module["pt
 
 This logic lives in `wllama.ts` (`isSupportMultiThread()` from `utils.ts` performs the feature detection).
 
-### HeapFS
-
-HeapFS is a lightweight wrapper around emscripten's default FS driver. The main goal is to allow `mmap()` operation to map to existing data, instead of copying it (the default behavior of emscripten). See `workers-code/llama-cpp.js` for more details.
-
 ## Startup process
 
 Upon startup, these steps are performed:
@@ -65,6 +61,43 @@ Upon startup, these steps are performed:
     - Setting up HeapFS
     - Setting up communication callbacks
 
+## File access
+
+Wllama employs some tricks to avoid making copies while reading GGUF files. The runtime uses one of these 2 mechanisms. See `workers-code/llama-cpp.js` for the implementation.
+
+Please note that wllama only accepts `Blob` as input data.
+
+### Async file read
+
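+This implementation wraps `fopen`, `fclose`, `fseek`, `ftell` and `fread` at link time. `fopen`/`fclose` keep track of which path each stream refers to, `fseek`/`ftell` operate on the regular stream so that its position gives the read offset, and `fread` calls are forwarded to the main thread (via message port), where we eventually call `Blob.slice()` to read the data. Because of the asynchronous execution via `onmessage` and `postMessage`, JSPI is required.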
+
+Upon running, the `fs.alloc` action is fired to indicate that the file can be read through a JSPI call. No buffer is allocated for the file contents; only the metadata is.
+
+When wasm calls `fread()`:
+- `fread()` calls `await fileRead()` in the JS context
+- `fileRead()` posts a message of type `fs.read_req` to the main thread
+- Main thread uses `Blob.slice()` to read the data, then sends it back via a `fs.read_res` message
+- Worker's `onmessage` receives the message and resumes the awaiting coroutine
+
+Note:
+- While awaiting the read data, the worker should not have any other activities (a global variable is used as a guard and will raise an exception on any incoming messages)
+- The minimum read size is 1MB. If less than this amount is requested, the full 1MB block is cached for subsequent reads. This is because reading GGUF metadata frequently involves reads of less than 1KB at a time, which can become a bottleneck without caching.
+- Env var `USE_ASYNC_FILE` is used to signal from JS to wasm that we are using async file read (upon starting the module). If `USE_ASYNC_FILE` is not set, we fall back to the HeapFS/mmap case (see the next section)
+
+### HeapFS
+
+HeapFS is a lightweight wrapper around emscripten's default FS driver. The main goal is to allow `mmap()` to map to existing data instead of copying it (the default emscripten behavior).
+
+These steps are performed:
+
+- Action `fs.alloc` is fired to create the file handle and file buffer in the wasm context
+- The main thread then creates and holds a `ReadableStream` for the `Blob`
+- The main thread reads the file chunk by chunk, streaming it to the worker via `fs.write` messages
+- Once streaming is finished, the `ReadableStream` is closed
+- The model load is then triggered with `mmap = true`, and `mmap()` is wrapped to return a pointer to the correct data in the buffer allocated in step 1
+
+The main downside of this approach is that on WebGPU, even though some tensors can be offloaded to the GPU, we still need to allocate the full model in main memory. For example, a 4GB model will still occupy 4GB of main memory, even if half of the layers (~2GB) are offloaded to the GPU.
+
 ## Build process
 
 The build process uses emscripten in docker to compile the project.

diff --git a/cpp/wllama-fs.h b/cpp/wllama-fs.h
@@ -0,0 +1,148 @@
+#pragma once
+
+#ifdef __EMSCRIPTEN__
+#include <emscripten/emscripten.h>
+#endif
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+
+// Path of every stream opened through the wrapped fopen(), keyed by its FILE*.
+// Entries are added in __wrap_fopen, removed in __wrap_fclose, and looked up
+// in __wrap_fread to find out which file a read refers to.
+static std::map<FILE *, std::string> s_file_path_map;
+
+// State and helpers shared by the wrapped stdio functions.
+namespace wllama_fs
+{
+  // Set once make_sure_ready() has run.
+  bool ready = false;
+  // True when the USE_ASYNC_FILE environment variable is present.
+  bool use_async = false;
+
+  static const size_t CACHE_SIZE = 1024 * 1024; // 1 MB read-ahead
+
+  // Read-ahead block: bytes of `cache_file` starting at file offset
+  // `cache_start`. An empty `cache_data` means there is nothing cached.
+  std::vector<uint8_t> cache_data;
+  size_t cache_start = 0;
+  FILE *cache_file = nullptr;
+
+  // Reads USE_ASYNC_FILE from the environment on the first call; every later
+  // call is a no-op.
+  void make_sure_ready()
+  {
+    if (!ready)
+    {
+      use_async = getenv("USE_ASYNC_FILE") != nullptr;
+      ready = true;
+    }
+  }
+
+  // Serves a read of `req_bytes` bytes at file offset `fpos` from the
+  // read-ahead block. Succeeds only when the block belongs to `f` and contains
+  // the whole requested range; partial hits count as misses.
+  // Returns `req_bytes` after copying into `ptr` on a hit, 0 otherwise.
+  size_t try_cache(FILE *f, char *ptr, size_t req_bytes, size_t fpos)
+  {
+    const bool has_block = (f == cache_file) && !cache_data.empty();
+    if (!has_block)
+      return 0;
+
+    const bool starts_inside = fpos >= cache_start;
+    const bool ends_inside = fpos + req_bytes <= cache_start + cache_data.size();
+    if (!(starts_inside && ends_inside))
+      return 0;
+
+    const size_t skip = fpos - cache_start;
+    memcpy(ptr, cache_data.data() + skip, req_bytes);
+    return req_bytes;
+  }
+}
+
+// Thin stub — real implementation lives in llama-cpp.js to avoid
+// C++ formatter mangling the JS syntax inside EM_ASYNC_JS macros.
+//
+// js_file_read(path_ptr, offset, req_size, out_ptr) awaits the JS function
+// _wllama_js_file_read. The path is decoded with UTF8ToString and every
+// numeric argument is converted with Number() before being passed on.
+// NOTE(review): callers treat the result as the number of bytes written to
+// out_ptr — confirm against the JS implementation.
+
+EM_ASYNC_JS(size_t, js_file_read, (const char *path_ptr, size_t offset, size_t req_size, void *out_ptr), {
+  return await _wllama_js_file_read(UTF8ToString(Number(path_ptr)), Number(offset), Number(req_size), Number(out_ptr));
+});
+
+extern "C"
+{
+  // Unwrapped libc entry points. The build links with -Wl,--wrap,<sym> for
+  // each of these, so __real_<sym> refers to the original <sym> while calls
+  // to <sym> are routed to the matching __wrap_<sym>.
+  FILE *__real_fopen(const char *path, const char *mode);
+  int __real_fclose(FILE *f);
+  size_t __real_fread(void *ptr, size_t size, size_t nmemb, FILE *f);
+  int __real_fseek(FILE *f, long offset, int whence);
+  long __real_ftell(FILE *f);
+
+  // Replacement for fopen(). Opens the file with the real fopen() and, on
+  // success, records the path under the returned handle so that later reads
+  // can identify the file by path. Returns whatever the real fopen() returned.
+  FILE *__wrap_fopen(const char *path, const char *mode)
+  {
+    wllama_fs::make_sure_ready();
+
+    FILE *const handle = __real_fopen(path, mode);
+    if (handle == nullptr)
+      return nullptr;
+
+    s_file_path_map[handle] = path;
+    return handle;
+  }
+
+  // Replacement for fclose(). Discards the read-ahead block when it belongs to
+  // this stream, forgets the path recorded for it, then closes the stream with
+  // the real fclose() and returns its result.
+  int __wrap_fclose(FILE *f)
+  {
+    const bool owns_cache = (f == wllama_fs::cache_file);
+    if (owns_cache)
+    {
+      wllama_fs::cache_data.clear();
+      wllama_fs::cache_file = nullptr;
+    }
+
+    s_file_path_map.erase(f);
+    return __real_fclose(f);
+  }
+
+  // Replacement for fseek(). Pure pass-through: the seek is applied to the
+  // real stream and the real result is returned unchanged.
+  int __wrap_fseek(FILE *f, long offset, int whence)
+  {
+    const int rc = __real_fseek(f, offset, whence);
+    return rc;
+  }
+
+  // Replacement for ftell(). Pure pass-through: reports the position of the
+  // real stream as returned by the real ftell().
+  long __wrap_ftell(FILE *f)
+  {
+    const long pos = __real_ftell(f);
+    return pos;
+  }
+
+  // Replacement for fread().
+  //
+  // When async file read is disabled, or `f` was not opened through the
+  // wrapped fopen(), this defers to the real fread(). Otherwise the bytes are
+  // fetched through js_file_read() using the path recorded at fopen() time and
+  // the stream's current position; the real stream position is then advanced
+  // by the number of bytes delivered.
+  //
+  // Requests of at least CACHE_SIZE bytes are written straight into `ptr`.
+  // Smaller requests are served from a single read-ahead block of up to
+  // CACHE_SIZE bytes that starts at the offset of the last cache miss.
+  //
+  // Returns the number of complete items read, like fread(). Returns 0 when
+  // nothing was read, when the current position cannot be determined, or when
+  // size * nmemb does not fit in size_t.
+  size_t __wrap_fread(void *ptr, size_t size, size_t nmemb, FILE *f)
+  {
+    wllama_fs::make_sure_ready();
+    if (!wllama_fs::use_async)
+      return __real_fread(ptr, size, nmemb, f);
+
+    auto nit = s_file_path_map.find(f);
+    if (nit == s_file_path_map.end())
+      return __real_fread(ptr, size, nmemb, f);
+
+    if (size == 0 || nmemb == 0)
+      return 0;
+    // Reject requests whose byte count would wrap around size_t.
+    if (nmemb > SIZE_MAX / size)
+      return 0;
+    const size_t req_bytes = size * nmemb;
+
+    // ftell() reports failure as -1; casting that to size_t would turn it into
+    // a huge bogus read offset, so give up instead.
+    const long pos = __real_ftell(f);
+    if (pos < 0)
+      return 0;
+    const size_t fpos = (size_t)pos;
+
+    // Large reads (>= 1 MB): write directly into ptr, skip cache entirely.
+    if (req_bytes >= wllama_fs::CACHE_SIZE)
+    {
+      const size_t actual = (size_t)js_file_read(
+          nit->second.c_str(), fpos, req_bytes, ptr);
+      if (actual == 0)
+        return 0;
+      const size_t copy_bytes = std::min(req_bytes, actual);
+      __real_fseek(f, (long)(fpos + copy_bytes), SEEK_SET);
+      return copy_bytes / size;
+    }
+
+    // Small reads: try cache first.
+    const size_t cached = wllama_fs::try_cache(f, (char *)ptr, req_bytes, fpos);
+    if (cached == req_bytes)
+    {
+      __real_fseek(f, (long)(fpos + req_bytes), SEEK_SET);
+      return nmemb;
+    }
+
+    // Cache miss: fetch a full CACHE_SIZE block from main thread.
+    wllama_fs::cache_data.resize(wllama_fs::CACHE_SIZE);
+    size_t actual = (size_t)js_file_read(
+        nit->second.c_str(), fpos, wllama_fs::CACHE_SIZE,
+        wllama_fs::cache_data.data());
+    // Never trust a length larger than the buffer that was handed out;
+    // otherwise the resize below would expose bytes that were never filled.
+    if (actual > wllama_fs::CACHE_SIZE)
+      actual = wllama_fs::CACHE_SIZE;
+
+    // Shrink to what was actually delivered (the block is shorter near EOF).
+    wllama_fs::cache_data.resize(actual);
+    wllama_fs::cache_file = f;
+    wllama_fs::cache_start = fpos;
+
+    if (actual == 0)
+      return 0;
+
+    const size_t copy_bytes = std::min(req_bytes, actual);
+    memcpy(ptr, wllama_fs::cache_data.data(), copy_bytes);
+    __real_fseek(f, (long)(fpos + copy_bytes), SEEK_SET);
+
+    return copy_bytes / size;
+  }
+}
diff --git a/cpp/wllama.cpp b/cpp/wllama.cpp
@@ -16,15 +16,16 @@
 
 #include "llama.h"
 #include "wllama-context.h"
+#include "wllama-fs.h"
 #include "wllama.h"
 
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
-#define WLLAMA_ACTION(name)                 \
-  else if (action == #name)                 \
-  {                                         \
-    auto res = app.action_##name(req_raw);  \
-    res.handler.serialize(output_buffer);   \
+#define WLLAMA_ACTION(name)                \
+  else if (action == #name)                \
+  {                                        \
+    auto res = app.action_##name(req_raw); \
+    res.handler.serialize(output_buffer);  \
   }
 
 static void llama_log_callback_logTee(ggml_log_level level, const char *text, void *user_data)
@@ -80,6 +81,13 @@ extern "C" const char *wllama_start()
     // std::cerr << llama_print_system_info() << "\n";
     llama_log_set(llama_log_callback_logTee, nullptr);
     wllama_malloc(1024, 0);
+
+    wllama_fs::make_sure_ready();
+    if (wllama_fs::use_async)
+    {
+      printStr(GGML_LOG_LEVEL_INFO, "Using async file read");
+    }
+
     return "{\"success\":true}";
   }
   catch (std::exception &e)

diff --git a/examples/main/src/components/GuideScreen.tsx b/examples/main/src/components/GuideScreen.tsx
@@ -48,8 +48,7 @@ export default function GuideScreen() {
             </li>
             <li>Running on smartphone maybe buggy.</li>
             <li>
-              <b>WebGPU is not supported</b>. We're still working hard to add
-              support for WebGPU.
+              <b>Safari is not supported</b>, due to lack of Memory64 support.
             </li>
           </ul>
         </div>

diff --git a/examples/main/src/utils/custom-models.tsx b/examples/main/src/utils/custom-models.tsx
@@ -1,4 +1,3 @@
-import { MAX_GGUF_SIZE } from '../config';
 import { DisplayedModel } from './displayed-model';
 import { WllamaStorage } from './utils';
 
@@ -58,11 +57,11 @@ const getModelSize = async (url: string): Promise<number> => {
     })
   );
 
-  if (sizes.some((s) => s >= MAX_GGUF_SIZE)) {
-    throw new Error(
-      'GGUF file is too big (max. 2GB per file). Please split the file into smaller shards (learn more in "Guide")'
-    );
-  }
+  // if (sizes.some((s) => s >= MAX_GGUF_SIZE)) {
+  //   throw new Error(
+  //     'GGUF file is too big (max. 2GB per file). Please split the file into smaller shards (learn more in "Guide")'
+  //   );
+  // }
 
   return sumArr(sizes);
 };

diff --git a/llama.cpp b/llama.cpp
diff --git a/src/utils.ts b/src/utils.ts
@@ -367,3 +367,9 @@ export const cbToAsyncIter =
       }
     })();
   };
+
+/**
+ * Tells whether async file read is available, i.e. whether the wasm env can
+ * asynchronously read a Blob. The answer is exactly the result of the JSPI
+ * support check.
+ * Please refer to README-dev.md for more details.
+ */
+export const canUseAsyncFileRead = () => {
+  return isSupportJSPI();
+};
diff --git a/src/wasm/wllama.js b/src/wasm/wllama.js
diff --git a/src/wasm/wllama.wasm b/src/wasm/wllama.wasm
diff --git a/src/wllama.ts b/src/wllama.ts
@@ -1,6 +1,7 @@
 import { ProxyToWorker } from './worker';
 import {
   absoluteUrl,
+  canUseAsyncFileRead,
   cbToAsyncIter,
   checkEnvironmentCompatible,
   isFirefox,
@@ -469,8 +470,8 @@ export class Wllama {
     const loadResult: GlueMsgLoadRes = await this.proxy.wllamaAction('load', {
       _name: 'load_req',
       log_level: logLevel,
-      use_mmap: true,
-      use_mlock: true,
+      use_mmap: !canUseAsyncFileRead(), // if async read is not supported, use mmap; refer to README-dev.md for more details
+      use_mlock: false,
       n_gpu_layers: params.n_gpu_layers ?? 99999,
       n_ctx: params.n_ctx ?? 1024,
       n_threads: this.useMultiThread ? nbThreads : 1,