ngxson · ngxson · May 11, 2026 · May 10, 2026 · May 11, 2026 · May 11, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -51,6 +51,8 @@ jobs:
       - name: Test (Firefox)
         run: npm run test:firefox
 
+  # TODO: add a WebGPU test on GitHub-hosted runners; they currently lack ShaderF16 support
+
   lint:
     runs-on: ubuntu-latest
     steps:

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -9,13 +9,15 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
 
 add_compile_options(
     -O3 -msimd128 -DNDEBUG
-    -flto=full -frtti -fwasm-exceptions
+    -flto=full -frtti
+    -fwasm-exceptions
     -pthread
     -sMEMORY64=1
 )
 add_link_options(
     -sMEMORY64=1
-    -flto=full -fwasm-exceptions
+    -flto=full
+    -fwasm-exceptions
     --no-entry
     -sEXPORT_ALL=1
     -sEXPORT_ES6=0
@@ -31,6 +33,8 @@ add_link_options(
     -sPTHREAD_POOL_SIZE=Module[\"pthreadPoolSize\"]
     -sUSE_PTHREADS=1
     -pthread
+    -sJSPI
+    -sJSPI_EXPORTS=['wllama_start','wllama_action']
 )
 
 add_subdirectory(llama.cpp)

diff --git a/README.md b/README.md
@@ -63,6 +63,10 @@ For complete code example, see [examples/main/src/utils/wllama.context.tsx](./ex
 
 NOTE: this example only covers completions usage. For embeddings, please see [examples/embeddings/index.html](./examples/embeddings/index.html)
 
+### WebGPU support
+
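+WebGPU inference is enabled by default when it is available. Use the `n_gpu_layers` option of `LoadModelParams` to control how many layers are offloaded to the GPU; see the [V3.1 release notes](./guides/intro-v3.1.md) for details.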
+
 ### Prepare your model
 
 - It is recommended to split the model into **chunks of maximum 512MB**. This will result in slightly faster download speed (because multiple splits can be downloaded in parallel), and also prevent some out-of-memory issues.  
@@ -200,5 +204,4 @@ npm run build
 ## TODO
 
 - Add support for LoRA adapter
-- Support GPU inference via WebGPU
 - Support multi-sequences: knowing the resource limitation when using WASM, I don't think having multi-sequences is a good idea
diff --git a/cpp/wllama-context.h b/cpp/wllama-context.h
@@ -302,6 +302,8 @@ struct wllama_context
       params.cache_type_k = kv_cache_type_from_str(req.cache_type_k.value);
     if (req.cache_type_v.not_null())
       params.cache_type_v = kv_cache_type_from_str(req.cache_type_v.value);
+    if (req.flash_attn.not_null())
+      params.flash_attn_type = req.flash_attn.value ? LLAMA_FLASH_ATTN_TYPE_AUTO : LLAMA_FLASH_ATTN_TYPE_DISABLED;
     if (req.swa_full.not_null())
       params.swa_full = req.swa_full.value;
     if (req.n_ctx_checkpoints.not_null())
@@ -314,22 +316,29 @@ struct wllama_context
       params.chat_template = req.chat_template.value;
     if (req.jinja.not_null())
       params.use_jinja = req.jinja.value;
-    if (req.reasoning.not_null()) {
-      if (req.reasoning.value) {
+    if (req.reasoning.not_null())
+    {
+      if (req.reasoning.value)
+      {
         params.enable_reasoning = 1;
         params.default_template_kwargs["enable_thinking"] = "true";
-      } else {
+      }
+      else
+      {
         params.enable_reasoning = 0;
         params.default_template_kwargs["enable_thinking"] = "false";
       }
     }
-    if (req.default_template_kwargs_keys.not_null() && req.default_template_kwargs_vals.not_null()) {
+    if (req.default_template_kwargs_keys.not_null() && req.default_template_kwargs_vals.not_null())
+    {
       auto &keys = req.default_template_kwargs_keys.arr;
       auto &vals = req.default_template_kwargs_vals.arr;
-      if (keys.size() != vals.size()) {
+      if (keys.size() != vals.size())
+      {
         throw app_exception("default_template_kwargs_keys and default_template_kwargs_vals must have the same length");
       }
-      for (size_t i = 0; i < keys.size(); i++) {
+      for (size_t i = 0; i < keys.size(); i++)
+      {
         params.default_template_kwargs[keys[i]] = vals[i];
       }
     }
@@ -422,28 +431,37 @@ struct wllama_context
     json body = json::parse(req_raw);
 
     json prompt;
-    if (body.count("input") != 0) {
+    if (body.count("input") != 0)
+    {
       prompt = body.at("input");
-    } else if (body.contains("content")) {
+    }
+    else if (body.contains("content"))
+    {
       prompt = body.at("content");
-    } else {
+    }
+    else
+    {
       throw app_exception("\"input\" or \"content\" must be provided");
     }
 
     int embd_normalize = 2;
-    if (body.count("embd_normalize") != 0) {
+    if (body.count("embd_normalize") != 0)
+    {
       embd_normalize = body.at("embd_normalize");
     }
 
     auto tokenized_prompts = tokenize_input_prompts(vocab, nullptr, prompt, true, true);
-    for (const auto &tokens : tokenized_prompts) {
-      if (tokens.empty()) {
+    for (const auto &tokens : tokenized_prompts)
+    {
+      if (tokens.empty())
+      {
         throw app_exception("Input content cannot be empty");
       }
     }
 
     std::vector<server_task> tasks;
-    for (size_t i = 0; i < tokenized_prompts.size(); i++) {
+    for (size_t i = 0; i < tokenized_prompts.size(); i++)
+    {
       server_task task = server_task(SERVER_TASK_TYPE_EMBEDDING);
       task.id = rd->get_new_id();
       task.tokens = std::move(tokenized_prompts[i]);

diff --git a/examples/multimodal/index.html b/examples/multimodal/index.html
@@ -11,6 +11,7 @@
         color: rgb(222, 222, 222);
         font-family: 'Courier New', Courier, monospace;
         padding: 1em;
+        padding-bottom: 4em;
       }
 
       #output_cmpl {
@@ -92,6 +93,13 @@ <h2>Multimodal (Vision) Completion</h2>
 
     Output:<br />
     <div id="output_cmpl"></div>
+    <div
+      id="output_timings"
+      style="margin-top: 0.5em; color: #aaa; font-size: 0.85em; display: none"
+    >
+      Prompt: <span id="timing_prompt">-</span> t/s &nbsp;|&nbsp; Generation:
+      <span id="timing_gen">-</span> t/s
+    </div>
 
     <script type="module">
       import { Wllama } from '../../esm/index.js';
@@ -112,6 +120,12 @@ <h2>Multimodal (Vision) Completion</h2>
       async function main() {
         setRunDisabled(true);
 
+        // Pre-load the example image
+        const response = await fetch('./bliss.png');
+        imageData = await response.arrayBuffer();
+        elemPreviewImage.src = './bliss.png';
+        elemPreviewImage.style.display = 'block';
+
         elemBtnLoadRemote.onclick = async () => {
           elemBtnLoadRemote.disabled = true;
           elemBtnPickFiles.disabled = true;
@@ -149,6 +163,7 @@ <h2>Multimodal (Vision) Completion</h2>
           if (!wllama) return;
           setRunDisabled(true);
           elemOutputCmpl.textContent = '';
+          elemOutputTimings.style.display = 'none';
           try {
             await runCompletion();
           } catch (err) {
@@ -226,8 +241,15 @@ <h2>Multimodal (Vision) Completion</h2>
           temperature: 0.2,
           stream: true,
           onData: (chunk) => {
+            console.log('Received chunk:', chunk);
             const delta = chunk.choices[0]?.delta?.content;
             if (delta) elemOutputCmpl.textContent += delta;
+            if (chunk.timings) {
+              const t = chunk.timings;
+              elemTimingPrompt.textContent = t.prompt_per_second.toFixed(1);
+              elemTimingGen.textContent = t.predicted_per_second.toFixed(1);
+              elemOutputTimings.style.display = 'block';
+            }
           },
         });
       }
@@ -266,6 +288,9 @@ <h2>Multimodal (Vision) Completion</h2>
       const elemPreviewImage = document.getElementById('preview_image');
       const elemBtnRunCmpl = document.getElementById('btn_run_cmpl');
       const elemOutputCmpl = document.getElementById('output_cmpl');
+      const elemOutputTimings = document.getElementById('output_timings');
+      const elemTimingPrompt = document.getElementById('timing_prompt');
+      const elemTimingGen = document.getElementById('timing_gen');
 
       main();
     </script>

diff --git a/guides/intro-v3.1.md b/guides/intro-v3.1.md
@@ -0,0 +1,40 @@
+# Release note Wllama V3.1
+
+## What's new
+
+Following the [V3.0 release](./intro-v3.md), V3.1 brings more interesting features to wllama. This release introduces 2 major changes:
+1. WebGPU support
+2. Single WASM build (no more single/multi-threaded build)
+
+### WebGPU support
+
+WebGPU support is introduced via [PR #215](https://github.com/ngxson/wllama/pull/215). Currently, only Chrome is supported out of the box (on Firefox, a flag must be enabled manually).
+
+Upon updating to V3.1, WebGPU will be enabled by default. By default, all layers will be offloaded to GPU. If the model is too big to fit into VRAM, you can manually adjust the number of layers via the `n_gpu_layers` parameter of `LoadModelParams`. Example:
+
+```js
+await wllama.loadModel(files, {
+  n_gpu_layers: 4, // meaning 4 layers are offloaded to GPU; set to 0 to disable GPU inference
+});
+```
+
+### Single WASM build
+
+From [PR #214](https://github.com/ngxson/wllama/pull/214), the separation between the single-threaded build and the multi-threaded build has been removed. Wllama now uses a single build that supports single-threaded, multi-threaded, and WebGPU execution; each feature can be toggled at runtime.
+
+This reduces the space needed to host the pre-built binary and speeds up the build process.
+
+To migrate from an older version:
+
+```js
+// Old config
+const CONFIG_PATHS = {
+  'single-thread/wllama.wasm': './path_to_source/single-thread/wllama.wasm',
+  'multi-thread/wllama.wasm' : './path_to_source/multi-thread/wllama.wasm',
+};
+
+// New config
+const CONFIG_PATHS = {
+  'wllama.wasm': './path_to_source/wasm/wllama.wasm',
+};
+```
diff --git a/llama.cpp b/llama.cpp
diff --git a/package.json b/package.json
@@ -24,7 +24,8 @@
     "format": "prettier --write .",
     "test": "vitest",
     "test:firefox": "BROWSER=firefox vitest",
-    "test:safari": "BROWSER=safari vitest"
+    "test:safari": "BROWSER=safari vitest",
+    "test:wgpu": "WEBGPU=1 vitest"
   },
   "repository": {
     "type": "git",

diff --git a/scripts/docker-compose.yml b/scripts/docker-compose.yml
@@ -21,7 +21,17 @@ services:
 
         mkdir -p build
         cd build
-        emcmake cmake ..
+        mkdir -p emdawn
+
+        DAWN_TAG=v20260317.182325
+        EMDAWN_PKG="emdawnwebgpu_pkg-$${DAWN_TAG}.zip"
+        EMDAWNWEBGPU_DIR="/source/build/emdawn/emdawnwebgpu_pkg"
+        echo "Downloading $${EMDAWN_PKG}"
+        curl -L -o emdawn.zip \
+          "https://github.com/google/dawn/releases/download/$${DAWN_TAG}/$${EMDAWN_PKG}"
+        python3 -c "import zipfile; zf=zipfile.ZipFile('emdawn.zip','r'); zf.extractall('/source/build/emdawn'); zf.close()"
+
+        emcmake cmake .. -DGGML_WEBGPU=ON -DGGML_WEBGPU_JSPI=ON -DEMDAWNWEBGPU_DIR="$${EMDAWNWEBGPU_DIR}"
         emmake make wllama -j
 
         # go back to root

diff --git a/src/types/oai-compat.ts b/src/types/oai-compat.ts
@@ -196,6 +196,18 @@ export interface ChatCompletionChunkChoice {
   logprobs: ChatCompletionChoiceLogprobs | null;
 }
 
+export interface ResultTimings {
+  cache_n: number;
+  prompt_n: number;
+  prompt_ms: number;
+  prompt_per_token_ms: number;
+  prompt_per_second: number;
+  predicted_n: number;
+  predicted_ms: number;
+  predicted_per_token_ms: number;
+  predicted_per_second: number;
+}
+
 /** Response when stream=true — one chunk per SSE event */
 export interface ChatCompletionChunk {
   id: string;
@@ -204,6 +216,7 @@ export interface ChatCompletionChunk {
   model: string;
   choices: ChatCompletionChunkChoice[];
   usage?: ChatCompletionUsage | null;
+  timings?: ResultTimings;
 }
 
 // Raw (text) completion
@@ -249,6 +262,7 @@ export interface RawCompletionResponse {
   choices: RawCompletionChoice[];
   usage: ChatCompletionUsage;
   system_fingerprint?: string;
+  timings?: ResultTimings;
 }
 
 /** One chunk when stream=true */
@@ -264,6 +278,7 @@ export interface RawCompletionChunk {
     logprobs: null;
   }>;
   usage?: ChatCompletionUsage | null;
+  timings?: ResultTimings;
 }
 
 // Embeddings

diff --git a/src/types/types.ts b/src/types/types.ts
@@ -4,6 +4,8 @@ export interface LoadModelParams {
   seed?: number;
   n_ctx?: number;
   n_batch?: number;
+  // by default, all layers are offloaded if WebGPU is available
+  n_gpu_layers?: number;
   // by default, on multi-thread build, we take half number of available threads (hardwareConcurrency / 2)
   n_threads?: number;
   embeddings?: boolean;
@@ -26,7 +28,6 @@ export interface LoadModelParams {
   yarn_beta_fast?: number;
   yarn_beta_slow?: number;
   yarn_orig_ctx?: number;
-  // TODO: add group attention
   // optimizations
   cache_type_k?: 'f32' | 'f16' | 'q8_0' | 'q5_1' | 'q5_0' | 'q4_1' | 'q4_0';
   cache_type_v?: 'f32' | 'f16' | 'q8_0' | 'q5_1' | 'q5_0' | 'q4_1' | 'q4_0';

diff --git a/src/utils.ts b/src/utils.ts
@@ -254,6 +254,20 @@ const isSupportSIMD = async () =>
     ])
   );
 
+/**
+ * @returns true if the browser supports JSPI
+ */
+export const isSupportJSPI = () => {
+  return !!(WebAssembly as any).Suspending;
+};
+
+/**
+ * @returns true if the browser supports WebGPU and JSPI (required by the emscripten build)
+ */
+export const isSupportWebGPU = () => {
+  return !!(navigator as any).gpu && isSupportJSPI();
+};
+
 /**
  * Throws an error if the environment is not compatible
  */
@@ -277,6 +291,13 @@ export const isSafari = (): boolean => {
   ); // safari
 };
 
+/**
+ * Check if browser is Firefox
+ */
+export const isFirefox = (): boolean => {
+  return !!navigator.userAgent.match(/Firefox\/([0-9\.]+)(?:\s|$)/);
+};
+
 /**
  * Regular expression to validate GGUF file paths/URLs
  * Matches paths ending with .gguf and optional query parameters

diff --git a/src/wasm/wllama.js b/src/wasm/wllama.js
diff --git a/src/wasm/wllama.wasm b/src/wasm/wllama.wasm