reeselevine · reeselevine · Apr 3, 2026 · Jan 20, 2026 · Jan 22, 2026 · Mar 6, 2026
diff --git a/examples/main/src/components/GuideScreen.tsx b/examples/main/src/components/GuideScreen.tsx
@@ -20,14 +20,14 @@ export default function GuideScreen() {
             llama.cpp
           </a>
           . It enables running LLM inference directly on browser by leveraging
-          the power of <b>WebAssembly</b>. It accepts GGUF as model format.
+          the power of <b>WebAssembly</b> and <b>WebGPU</b>. It accepts GGUF as model format.
         </div>
 
         <div className="mb-3">
           Please note that:
           <ul>
             <li>
-              Due to WebAssembly overhead, performance will not be as good as
+              Due to WebAssembly and WebGPU overhead, performance will not be as good as
               running llama.cpp in native. Performance degradation can range
               from 25% to 50%.
             </li>
@@ -48,8 +48,7 @@ export default function GuideScreen() {
             </li>
             <li>Running on smartphone maybe buggy.</li>
             <li>
-              <b>WebGPU is not supported</b>. We're still working hard to add
-              support for WebGPU.
+              <b>WebGPU support is experimental</b>. We're still working hard to stabilize and optimize the WebGPU backend, and it may not work well on some devices.
             </li>
           </ul>
         </div>

diff --git a/examples/main/src/components/ModelScreen.tsx b/examples/main/src/components/ModelScreen.tsx
@@ -8,14 +8,21 @@ import {
   faCheck,
 } from '@fortawesome/free-solid-svg-icons';
 import { DEFAULT_INFERENCE_PARAMS, MAX_GGUF_SIZE } from '../config';
-import { toHumanReadableSize, useDebounce } from '../utils/utils';
+import {
+  getWebGPUMemoryBudget,
+  toHumanReadableSize,
+  useDebounce,
+} from '../utils/utils';
 import { useEffect, useState } from 'react';
 import ScreenWrapper from './ScreenWrapper';
 import { DisplayedModel } from '../utils/displayed-model';
 import { isValidGgufFile } from '@wllama/wllama';
 
 export default function ModelScreen() {
   const [showAddCustom, setShowAddCustom] = useState(false);
+  const [webgpuMemoryBudget, setWebgpuMemoryBudget] = useState<
+    number | undefined
+  >();
   const {
     models,
     removeCachedModel,
@@ -27,8 +34,33 @@ export default function ModelScreen() {
   } = useWllama();
 
   const blockModelBtn = !!(loadedModel || isDownloading || isLoadingModel);
+  const effectiveWebGPUMemoryBudget = webgpuMemoryBudget
+    ? Math.floor(webgpuMemoryBudget * 0.8)
+    : undefined;
+
+  useEffect(() => {
+    let cancelled = false;
+
+    getWebGPUMemoryBudget()
+      .then((budget) => {
+        if (!cancelled) {
+          setWebgpuMemoryBudget(budget);
+        }
+      })
+      .catch(() => {
+        if (!cancelled) {
+          setWebgpuMemoryBudget(undefined);
+        }
+      });
 
-  const onChange = (key: keyof typeof currParams) => (e: any) => {
+    return () => {
+      cancelled = true;
+    };
+  }, []);
+
+  const onChange = (
+    key: 'nThreads' | 'nContext' | 'nPredict' | 'temperature'
+  ) => (e: any) => {
     setParams({ ...currParams, [key]: parseFloat(e.target.value || -1) });
   };
 
@@ -88,6 +120,25 @@ export default function ModelScreen() {
           />
         </label>
 
+        <label className="label cursor-pointer justify-start gap-3 mb-2">
+          <input
+            type="checkbox"
+            className="toggle toggle-primary"
+            checked={currParams.preferWebGPU}
+            onChange={(e) =>
+              setParams({ ...currParams, preferWebGPU: e.target.checked })
+            }
+            disabled={blockModelBtn}
+          />
+          <span className="label-text">Prefer WebGPU</span>
+        </label>
+
+        {currParams.preferWebGPU && effectiveWebGPUMemoryBudget && (
+          <div className="text-sm opacity-80 mb-2">
+            Usable WebGPU Budget: {toHumanReadableSize(effectiveWebGPUMemoryBudget)}
+          </div>
+        )}
+
         <button
           className="btn btn-sm mr-2"
           onClick={() => setParams(DEFAULT_INFERENCE_PARAMS)}
@@ -127,7 +178,13 @@ export default function ModelScreen() {
         {models
           .filter((m) => m.isUserAdded)
           .map((m) => (
-            <ModelCard key={m.url} model={m} blockModelBtn={blockModelBtn} />
+            <ModelCard
+              key={m.url}
+              model={m}
+              blockModelBtn={blockModelBtn}
+              preferWebGPU={currParams.preferWebGPU}
+              webgpuMemoryBudget={effectiveWebGPUMemoryBudget}
+            />
           ))}
       </div>
 
@@ -137,7 +194,13 @@ export default function ModelScreen() {
         {models
           .filter((m) => !m.isUserAdded)
           .map((m) => (
-            <ModelCard key={m.url} model={m} blockModelBtn={blockModelBtn} />
+            <ModelCard
+              key={m.url}
+              model={m}
+              blockModelBtn={blockModelBtn}
+              preferWebGPU={currParams.preferWebGPU}
+              webgpuMemoryBudget={effectiveWebGPUMemoryBudget}
+            />
           ))}
       </div>
 
@@ -277,9 +340,13 @@ function AddCustomModelDialog({ onClose }: { onClose(): void }) {
 function ModelCard({
   model,
   blockModelBtn,
+  preferWebGPU,
+  webgpuMemoryBudget,
 }: {
   model: DisplayedModel;
   blockModelBtn: boolean;
+  preferWebGPU: boolean;
+  webgpuMemoryBudget?: number;
 }) {
   const {
     downloadModel,
@@ -293,9 +360,19 @@ function ModelCard({
 
   const m = model;
   const percent = parseInt(Math.round(m.downloadPercent * 100).toString());
+  const blockedByWebGPU = !!(
+    preferWebGPU &&
+    webgpuMemoryBudget &&
+    m.size > webgpuMemoryBudget
+  );
+  const blockedActionLabel = blockedByWebGPU
+    ? `Too large for current WebGPU budget (${toHumanReadableSize(webgpuMemoryBudget!)})`
+    : undefined;
   return (
     <div
-      className={`card bg-base-100 w-full mb-2 ${m.state === ModelState.LOADED ? 'border-2 border-primary' : ''}`}
+      className={`card bg-base-100 w-full mb-2 ${
+        m.state === ModelState.LOADED ? 'border-2 border-primary' : ''
+      } ${blockedByWebGPU ? 'opacity-50 saturate-0' : ''}`}
       key={m.url}
     >
       <div className="card-body p-4 flex flex-row">
@@ -321,6 +398,13 @@ function ModelCard({
               : ''}
           </small>
 
+          {blockedByWebGPU && (
+            <div className="text-sm text-warning mt-1">
+              <FontAwesomeIcon icon={faWarning} className="mr-2" />
+              Model size exceeds the current WebGPU budget.
+            </div>
+          )}
+
           {m.state === ModelState.LOADED && currRuntimeInfo && (
             <>
               <br />
@@ -362,7 +446,8 @@ function ModelCard({
             <button
               className="btn btn-primary btn-sm mr-2"
               onClick={() => downloadModel(m)}
-              disabled={blockModelBtn}
+              disabled={blockModelBtn || blockedByWebGPU}
+              title={blockedActionLabel}
             >
               Download
             </button>
@@ -372,7 +457,8 @@ function ModelCard({
               <button
                 className="btn btn-primary btn-sm mr-2"
                 onClick={() => loadModel(m)}
-                disabled={blockModelBtn}
+                disabled={blockModelBtn || blockedByWebGPU}
+                title={blockedActionLabel}
               >
                 Load model
               </button>

diff --git a/examples/main/src/config.ts b/examples/main/src/config.ts
@@ -1,86 +1,87 @@
 // See: https://vitejs.dev/guide/assets#explicit-url-imports
-import wllamaSingle from '@wllama/wllama/src/single-thread/wllama.wasm?url';
-import wllamaMulti from '@wllama/wllama/src/multi-thread/wllama.wasm?url';
+import wllamaJspiSingle from '@wllama/wllama/src/jspi-single-thread/wllama.wasm?url';
+import wllamaJspiMulti from '@wllama/wllama/src/jspi-multi-thread/wllama.wasm?url';
+import wllamaAsyncifySingle from '@wllama/wllama/src/asyncify-single-thread/wllama.wasm?url';
+import wllamaAsyncifyMulti from '@wllama/wllama/src/asyncify-multi-thread/wllama.wasm?url';
 import wllamaPackageJson from '@wllama/wllama/package.json';
 import { InferenceParams } from './utils/types';
 
 export const WLLAMA_VERSION = wllamaPackageJson.version;
 
+/**
+ * Maps each wllama WASM asset path to the URL produced by the `?url` imports
+ * at the top of this file. There is one entry per build variant: JSPI and
+ * Asyncify, each in a single-thread and a multi-thread flavor.
+ */
 export const WLLAMA_CONFIG_PATHS = {
-  'single-thread/wllama.wasm': wllamaSingle,
-  'multi-thread/wllama.wasm': wllamaMulti,
+  'jspi/single-thread/wllama.wasm': wllamaJspiSingle,
+  'jspi/multi-thread/wllama.wasm': wllamaJspiMulti,
+  'asyncify/single-thread/wllama.wasm': wllamaAsyncifySingle,
+  'asyncify/multi-thread/wllama.wasm': wllamaAsyncifyMulti,
 };
 
 export const MAX_GGUF_SIZE = 2 * 1024 * 1024 * 1024; // 2GB
 
 export const LIST_MODELS = [
   {
-    url: 'https://huggingface.co/LiquidAI/LFM2-350M-GGUF/resolve/main/LFM2-350M-Q4_K_M.gguf',
-    size: 229309376,
-  },
-  {
-    url: 'https://huggingface.co/LiquidAI/LFM2-700M-GGUF/resolve/main/LFM2-700M-Q4_K_M.gguf',
-    size: 468624320,
+    url: 'https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf',
+    size: 773025824,
   },
   {
-    url: 'https://huggingface.co/LiquidAI/LFM2-1.2B-GGUF/resolve/main/LFM2-1.2B-Q4_K_M.gguf',
-    size: 730893248,
+    url: 'https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q8_0.gguf',
+    size: 1321082528,
   },
   {
-    url: 'https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q8_0.gguf',
-    size: 639447744,
+    url: 'https://huggingface.co/unsloth/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-Q4_0.gguf',
+    size: 1921909184,
   },
   {
-    url: 'https://huggingface.co/unsloth/Qwen3-1.7B-GGUF/resolve/main/Qwen3-1.7B-Q4_K_M.gguf',
-    size: 1107409472,
+    url: 'https://huggingface.co/unsloth/gemma-3-270m-it-GGUF/resolve/main/gemma-3-270m-it-Q4_0.gguf',
+    size: 241574944,
   },
   {
-    url: 'https://huggingface.co/ggml-org/SmolLM3-3B-GGUF/resolve/main/SmolLM3-Q4_K_M.gguf',
-    size: 1915305312,
+    url: 'https://huggingface.co/unsloth/gemma-3-270m-it-GGUF/resolve/main/gemma-3-270m-it-F16.gguf',
+    size: 542835488,
   },
   {
-    url: 'https://huggingface.co/ngxson/SmolLM2-360M-Instruct-Q8_0-GGUF/resolve/main/smollm2-360m-instruct-q8_0.gguf',
-    size: 386404992,
+    url: 'https://huggingface.co/unsloth/gemma-3-1b-it-GGUF/resolve/main/gemma-3-1b-it-Q4_0.gguf',
+    size: 721918496,
   },
   {
-    url: 'https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q8_0.gguf',
-    size: 675710816,
+    url: 'https://huggingface.co/unsloth/gemma-3-1b-it-GGUF/resolve/main/gemma-3-1b-it-Q8_0.gguf',
+    size: 1069306400,
   },
   {
-    url: 'https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/resolve/main/llama-3.2-1b-instruct-q4_k_m.gguf',
-    size: 807690656,
-  },
-  {
-    url: 'https://huggingface.co/bartowski/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf',
-    size: 924456032,
+    url: 'https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q8_0.gguf',
+    size: 639447744,
   },
   {
-    url: 'https://huggingface.co/ngxson/wllama-split-models/resolve/main/qwen2-1_5b-instruct-q4_k_m-00001-of-00004.gguf',
-    size: 986046272,
+    url: 'https://huggingface.co/unsloth/Qwen3-1.7B-GGUF/resolve/main/Qwen3-1.7B-Q4_0.gguf',
+    size: 1056782912,
   },
   {
-    url: 'https://huggingface.co/ngxson/SmolLM2-1.7B-Instruct-Q4_K_M-GGUF/resolve/main/smollm2-1.7b-instruct-q4_k_m.gguf',
-    size: 1055609536,
+    url: 'https://huggingface.co/unsloth/Qwen3.5-2B-GGUF/resolve/main/Qwen3.5-2B-Q4_0.gguf',
+    size: 1214873856,
   },
   {
-    url: 'https://huggingface.co/ngxson/wllama-split-models/resolve/main/gemma-2-2b-it-abliterated-Q4_K_M-00001-of-00004.gguf',
-    size: 1708583264,
+    url: 'https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct-GGUF/resolve/main/LFM2.5-1.2B-Instruct-Q4_0.gguf',
+    size: 695751488,
   },
   {
-    url: 'https://huggingface.co/ngxson/wllama-split-models/resolve/main/neuralreyna-mini-1.8b-v0.3.q4_k_m-00001-of-00005.gguf',
-    size: 1217753472,
+    url: 'https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct-GGUF/resolve/main/LFM2.5-1.2B-Instruct-Q8_0.gguf',
+    size: 1246253888,
   },
   {
-    url: 'https://huggingface.co/ngxson/wllama-split-models/resolve/main/Phi-3.1-mini-128k-instruct-Q3_K_M-00001-of-00008.gguf',
-    size: 1955478176,
+    url: 'https://huggingface.co/unsloth/SmolLM3-3B-128K-GGUF/resolve/main/SmolLM3-3B-128K-Q4_0.gguf',
+    size: 1811456608,
   },
   {
-    url: 'https://huggingface.co/ngxson/wllama-split-models/resolve/main/meta-llama-3.1-8b-instruct-abliterated.Q2_K-00001-of-00014.gguf',
-    size: 3179133600,
+    url: 'https://huggingface.co/unsloth/Ministral-3-3B-Instruct-2512-GGUF/resolve/main/Ministral-3-3B-Instruct-2512-Q3_K_M.gguf',
+    size: 1795552544,
   },
   {
-    url: 'https://huggingface.co/ngxson/wllama-split-models/resolve/main/Meta-Llama-3.1-8B-Instruct-Q2_K-00001-of-00014.gguf',
-    size: 3179138048,
+    url: 'https://huggingface.co/unsloth/Phi-4-mini-instruct-GGUF/resolve/main/Phi-4-mini-instruct-Q2_K.gguf',
+    size: 1682635744,
   },
 ];
 
@@ -90,6 +87,7 @@ export const DEFAULT_INFERENCE_PARAMS: InferenceParams = {
   nPredict: 4096,
   nBatch: 128,
   temperature: 0.2,
+  preferWebGPU: true,
 };
 
 export const DEFAULT_CHAT_TEMPLATE =

diff --git a/examples/main/src/utils/types.ts b/examples/main/src/utils/types.ts
@@ -25,6 +25,7 @@ export interface InferenceParams {
   nBatch: number;
   temperature: number;
   nPredict: number;
+  preferWebGPU: boolean;
 }
 
 export interface Message {

diff --git a/examples/main/src/utils/utils.ts b/examples/main/src/utils/utils.ts
@@ -85,6 +85,26 @@ export const toHumanReadableSize = (bytes: number): string => {
   return `${size.toFixed(1)} ${units[unitIndex]}`;
 };
 
+/**
+ * Reports the WebGPU adapter's `maxBufferSize` limit, intended to serve as an
+ * approximate memory budget for models running on the WebGPU backend.
+ *
+ * NOTE: `maxBufferSize` is the largest single buffer the adapter can create,
+ * not the total GPU memory available, so treat the value as a rough proxy.
+ *
+ * @returns The limit in bytes, or `undefined` when WebGPU is unavailable
+ *   (`navigator.gpu` is missing) or no adapter could be obtained.
+ */
+export const getWebGPUMemoryBudget = async (): Promise<number | undefined> => {
+  // `navigator.gpu` is absent in browsers without WebGPU and in insecure
+  // contexts; bail out instead of throwing a TypeError on the property access.
+  if (typeof navigator === 'undefined' || !navigator.gpu) {
+    return undefined;
+  }
+  const adapter = await navigator.gpu.requestAdapter();
+  return adapter?.limits.maxBufferSize;
+};
+
 export const DebugLogger = {
   content: [] as string[],
   debug(...args: any) {