Commit 16d5e00
[Cache] Support IndexedDB, add useIndexedDBCache in AppConfig (#352)
Add `AppConfig.useIndexedDBCache` to optionally use IndexedDBCache rather than the default Cache API. Also add `examples/cache-usage` to demonstrate the usage of the two caches and cache utils such as deleting a model from cache.

Co-authored-by: Charlie Ruan <[email protected]>
1 parent 489d882 commit 16d5e00
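At a glance, the feature works like this. The sketch below is assembled from the `examples/cache-usage` code added in this commit (model id and option names are taken from that example, not a general recommendation):

```typescript
import * as webllm from "@mlc-ai/web-llm";

async function demo() {
  // Start from the prebuilt config and opt into IndexedDB instead of the Cache API.
  const appConfig = webllm.prebuiltAppConfig;
  appConfig.useIndexedDBCache = true; // false (or unset) keeps the default Cache API

  // Model weights, wasm, and config are downloaded into the chosen storage on first load.
  const engine = await webllm.CreateEngine("Phi2-q4f16_1", { appConfig });

  // The cache utils respect the same flag, so they look in the matching backend.
  console.log(await webllm.hasModelInCache("Phi2-q4f16_1", appConfig));
}

demo();
```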

File tree (8 files changed, +210 -23 lines changed):

- examples/README.md
- examples/cache-usage/README.md
- examples/cache-usage/package.json
- examples/cache-usage/src/cache_usage.html
- examples/cache-usage/src/cache_usage.ts
- src/cache_util.ts
- src/config.ts
- src/engine.ts

examples/README.md

Lines changed: 3 additions & 0 deletions

```diff
@@ -27,6 +27,9 @@ These examples demonstrate various capabilities via WebLLM's OpenAI-like API.
 
 #### Others
 - [logit-processor](logit-processor): while `logit_bias` is supported, we additionally support stateful logit processing where users can specify their own rules. We also expose low-level API `forwardTokensAndSample()`.
+- [cache-usage](cache-usage): demonstrates how WebLLM supports both the [Cache API](https://developer.mozilla.org/en-US/docs/Web/API/Cache) and [IndexedDB cache](https://developer.mozilla.org/en-US/docs/Web/API/IndexedDB_API), and
+  users can pick with `appConfig.useIndexedDBCache`. Also demonstrates various cache utils such as checking
+  whether a model is cached, deleting a model's weights from cache, deleting a model library wasm from cache, etc.
 
 ## Demo Spaces
 
```
examples/cache-usage/README.md

Lines changed: 23 additions & 0 deletions

````diff
@@ -0,0 +1,23 @@
+# WebLLM Cache Usage
+
+WebLLM supports both the Cache API and IndexedDB, which you can specify via `AppConfig.useIndexedDBCache`.
+This folder provides an example of how the Cache API and IndexedDB cache are used in WebLLM. We also
+demonstrate the utility cache functions such as deleting models, checking if models are in cache, etc.
+
+For more information about the two caches, see: https://developer.mozilla.org/en-US/docs/Web/API/Storage_API/Storage_quotas_and_eviction_criteria#what_technologies_store_data_in_the_browser.
+
+To inspect the downloaded artifacts in your browser, open the developer console, go to the Application tab,
+and you will find the artifacts under either `IndexedDB` or `Cache storage`.
+
+
+To run the example, run the following under this folder:
+
+```bash
+npm install
+npm start
+```
+
+Note: if you would like to hack the WebLLM core package,
+you can change the web-llm dependency to `"file:../.."` and follow the build-from-source
+instructions in the project to build WebLLM locally. This option is only recommended
+if you would like to hack the WebLLM core package.
````

examples/cache-usage/package.json

Lines changed: 20 additions & 0 deletions

```diff
@@ -0,0 +1,20 @@
+{
+  "name": "cache-usage",
+  "version": "0.1.0",
+  "private": true,
+  "scripts": {
+    "start": "parcel src/cache_usage.html --port 8888",
+    "build": "parcel build src/cache_usage.html --dist-dir lib"
+  },
+  "devDependencies": {
+    "buffer": "^5.7.1",
+    "parcel": "^2.8.3",
+    "process": "^0.11.10",
+    "tslib": "^2.3.1",
+    "typescript": "^4.9.5",
+    "url": "^0.11.3"
+  },
+  "dependencies": {
+    "@mlc-ai/web-llm": "^0.2.30"
+  }
+}
```
examples/cache-usage/src/cache_usage.html

Lines changed: 24 additions & 0 deletions

```diff
@@ -0,0 +1,24 @@
+<!DOCTYPE html>
+<html>
+<script>
+  webLLMGlobal = {}
+</script>
+
+<body>
+  <h2>WebLLM Test Page</h2>
+  Open console to see output
+  </br>
+  </br>
+  <label id="init-label"> </label>
+
+  <h3>Prompt</h3>
+  <label id="prompt-label"> </label>
+
+  <h3>Response</h3>
+  <label id="generate-label"> </label>
+  </br>
+  <label id="stats-label"> </label>
+
+  <script type="module" src="./cache_usage.ts"></script>
+
+</html>
```
examples/cache-usage/src/cache_usage.ts

Lines changed: 74 additions & 0 deletions

```diff
@@ -0,0 +1,74 @@
+import * as webllm from "@mlc-ai/web-llm";
+
+function setLabel(id: string, text: string) {
+  const label = document.getElementById(id);
+  if (label == null) {
+    throw Error("Cannot find label " + id);
+  }
+  label.innerText = text;
+}
+
+const initProgressCallback = (report: webllm.InitProgressReport) => {
+  setLabel("init-label", report.text);
+};
+
+async function main() {
+  const appConfig = webllm.prebuiltAppConfig;
+  // CHANGE THIS TO SEE EFFECTS OF BOTH, CODE BELOW DO NOT NEED TO CHANGE
+  appConfig.useIndexedDBCache = true;
+
+  if (appConfig.useIndexedDBCache) {
+    console.log("Using IndexedDB Cache");
+  } else {
+    console.log("Using Cache API");
+  }
+
+  // 1. This triggers downloading and caching the model with either Cache or IndexedDB Cache
+  const selectedModel = "Phi2-q4f16_1"
+  const engine: webllm.EngineInterface = await webllm.CreateEngine(
+    "Phi2-q4f16_1",
+    { initProgressCallback: initProgressCallback, appConfig: appConfig }
+  );
+
+  const request: webllm.ChatCompletionRequest = {
+    stream: false,
+    messages: [
+      { "role": "user", "content": "Write an analogy between mathematics and a lighthouse." },
+    ],
+    n: 1,
+  };
+  let reply = await engine.chat.completions.create(request);
+  console.log(reply);
+
+  // 2. Check whether model weights are cached
+  let modelCached = await webllm.hasModelInCache(selectedModel, appConfig);
+  console.log("hasModelInCache: ", modelCached);
+  if (!modelCached) {
+    throw Error("Expect hasModelInCache() to be true, but got: " + modelCached);
+  }
+
+  // 3. We reload, and we should see this time it is much faster because the weights are cached.
+  console.log("Reload model start");
+  await engine.reload(selectedModel, undefined, appConfig);
+  console.log("Reload model end");
+  reply = await engine.chat.completions.create(request);
+  console.log(reply);
+
+  // 4. Delete everything about this model from cache
+  // You can also delete only the model library wasm, only the model weights, or only the config file
+  await webllm.deleteModelAllInfoInCache(selectedModel, appConfig);
+  modelCached = await webllm.hasModelInCache(selectedModel, appConfig);
+  console.log("After deletion, hasModelInCache: ", modelCached);
+  if (modelCached) {
+    throw Error("Expect hasModelInCache() to be false, but got: " + modelCached);
+  }
+
+  // 5. If we reload, we should expect the model to start downloading again
+  console.log("Reload model start");
+  await engine.reload(selectedModel, undefined, appConfig);
+  console.log("Reload model end");
+  reply = await engine.chat.completions.create(request);
+  console.log(reply);
+}
+
+main();
```
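As the comment in step 4 notes, the cache utils changed in `src/cache_util.ts` below also allow deleting individual artifacts. A minimal sketch of that finer-grained cleanup, assuming those utils and the `AppConfig` type are re-exported by the package in the same way `hasModelInCache` and `deleteModelAllInfoInCache` are used above:

```typescript
import * as webllm from "@mlc-ai/web-llm";

// Sketch only: assumes the finer-grained utils from src/cache_util.ts are re-exported
// by the package, as hasModelInCache and deleteModelAllInfoInCache are in the example above.
async function cleanUp(modelId: string, appConfig: webllm.AppConfig) {
  await webllm.deleteModelWasmInCache(modelId, appConfig);  // only the model library wasm
  await webllm.deleteChatConfigInCache(modelId, appConfig); // only mlc-chat-config.json
  await webllm.deleteModelInCache(modelId, appConfig);      // only the weights and tokenizer files

  // Or remove everything for the model at once, as step 4 above does.
  await webllm.deleteModelAllInfoInCache(modelId, appConfig);
}
```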

src/cache_util.ts

Lines changed: 28 additions & 11 deletions

```diff
@@ -1,10 +1,11 @@
 import * as tvmjs from "tvmjs";
 import {
   AppConfig,
+  ModelRecord,
   prebuiltAppConfig,
 } from "./config";
 
-function findModelRecord(modelId: string, appConfig?: AppConfig) {
+function findModelRecord(modelId: string, appConfig?: AppConfig): ModelRecord {
   const matchedItem = appConfig?.model_list.find(
     item => item.model_id == modelId
   );
@@ -18,9 +19,10 @@ export async function hasModelInCache(modelId: string, appConfig?: AppConfig): P
   if (appConfig === undefined) {
     appConfig = prebuiltAppConfig;
   }
-  const modelRecord = await findModelRecord(modelId, appConfig);
+  const modelRecord = findModelRecord(modelId, appConfig);
   const modelUrl = modelRecord.model_url;
-  return tvmjs.hasNDArrayInCache(modelUrl, "webllm/model");
+  const cacheType = appConfig.useIndexedDBCache ? "indexeddb" : "cache";
+  return tvmjs.hasNDArrayInCache(modelUrl, "webllm/model", cacheType);
 }
 
 export async function deleteModelAllInfoInCache(modelId: string, appConfig?: AppConfig) {
@@ -42,9 +44,15 @@ export async function deleteModelInCache(modelId: string, appConfig?: AppConfig)
   if (appConfig === undefined) {
     appConfig = prebuiltAppConfig;
   }
-  const modelRecord = await findModelRecord(modelId, appConfig);
-  tvmjs.deleteNDArrayCache(modelRecord.model_url, "webllm/model");
-  const modelCache = new tvmjs.ArtifactCache("webllm/model");
+  const modelRecord = findModelRecord(modelId, appConfig);
+  let modelCache: tvmjs.ArtifactCacheTemplate;
+  if (appConfig.useIndexedDBCache) {
+    tvmjs.deleteNDArrayCache(modelRecord.model_url, "webllm/model", "indexeddb");
+    modelCache = new tvmjs.ArtifactIndexedDBCache("webllm/model");
+  } else {
+    tvmjs.deleteNDArrayCache(modelRecord.model_url, "webllm/model", "cache");
+    modelCache = new tvmjs.ArtifactCache("webllm/model");
+  }
   await modelCache.deleteInCache(new URL("tokenizer.model", modelRecord.model_url).href);
   await modelCache.deleteInCache(new URL("tokenizer.json", modelRecord.model_url).href);
 }
@@ -54,19 +62,28 @@ export async function deleteChatConfigInCache(modelId: string, appConfig?: AppCo
   if (appConfig === undefined) {
     appConfig = prebuiltAppConfig;
   }
-  const modelRecord = await findModelRecord(modelId, appConfig);
-  const configCache = new tvmjs.ArtifactCache("webllm/config");
+  const modelRecord = findModelRecord(modelId, appConfig);
+  let configCache: tvmjs.ArtifactCacheTemplate;
+  if (appConfig.useIndexedDBCache) {
+    configCache = new tvmjs.ArtifactIndexedDBCache("webllm/config");
+  } else {
+    configCache = new tvmjs.ArtifactCache("webllm/config");
+  }
   const configUrl = new URL("mlc-chat-config.json", modelRecord.model_url).href;
   await configCache.deleteInCache(configUrl);
 }
 
-
 export async function deleteModelWasmInCache(modelId: string, appConfig?: AppConfig) {
   // delete the wasm in Cache
   if (appConfig === undefined) {
     appConfig = prebuiltAppConfig;
   }
-  const modelRecord = await findModelRecord(modelId, appConfig);
-  const wasmCache = new tvmjs.ArtifactCache("webllm/wasm");
+  const modelRecord = findModelRecord(modelId, appConfig);
+  let wasmCache: tvmjs.ArtifactCacheTemplate;
+  if (appConfig.useIndexedDBCache) {
+    wasmCache = new tvmjs.ArtifactIndexedDBCache("webllm/wasm");
+  } else {
+    wasmCache = new tvmjs.ArtifactCache("webllm/wasm");
+  }
   await wasmCache.deleteInCache(modelRecord.model_lib_url);
 }
```
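The pattern repeated in each util above: both backends implement tvmjs's `ArtifactCacheTemplate`, so the only decision is which concrete cache to construct based on `appConfig.useIndexedDBCache`. A sketch of that selection step in isolation (the `chooseCache` helper is hypothetical and not part of this commit; the scope strings are the ones used above):

```typescript
import * as tvmjs from "tvmjs";
import { AppConfig } from "./config";

// Hypothetical helper (not part of this commit) showing the shared selection pattern:
// both caches implement ArtifactCacheTemplate, so callers only branch on the flag.
function chooseCache(scope: string, appConfig: AppConfig): tvmjs.ArtifactCacheTemplate {
  return appConfig.useIndexedDBCache
    ? new tvmjs.ArtifactIndexedDBCache(scope) // stores artifacts in IndexedDB
    : new tvmjs.ArtifactCache(scope);         // stores artifacts via the Cache API (default)
}

// e.g. chooseCache("webllm/config", appConfig) in place of new tvmjs.ArtifactCache("webllm/config")
```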

src/config.ts

Lines changed: 7 additions & 0 deletions

```diff
@@ -220,9 +220,15 @@ export interface ModelRecord {
  * passed to the load.
  *
  * @param model_list: models to be used.
+ * @param useIndexedDBCache: if true, will use IndexedDBCache to cache models and other artifacts.
+ * If false or unspecified, will use the Cache API. For more information of the two, see:
+ * https://developer.mozilla.org/en-US/docs/Web/API/Storage_API/Storage_quotas_and_eviction_criteria#what_technologies_store_data_in_the_browser
+ *
+ * @note Note that the Cache API is more well-tested in WebLLM as of now.
  */
 export interface AppConfig {
   model_list: Array<ModelRecord>;
+  useIndexedDBCache?: boolean;
 }
 
 /**
@@ -243,6 +249,7 @@ export const modelLibURLPrefix =
  * current WebLLM npm version.
  */
 export const prebuiltAppConfig: AppConfig = {
+  useIndexedDBCache: false,
   model_list: [
     // Llama-2
     {
```
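With the new field, a caller can either define a custom `AppConfig` or toggle the flag on `prebuiltAppConfig`. A hedged sketch (the model record below is hypothetical, uses placeholder URLs, and shows only the `ModelRecord` fields that appear in this diff; a real record may need additional fields):

```typescript
import { AppConfig, prebuiltAppConfig } from "./config";

// Hypothetical custom config illustrating the new flag. Only ModelRecord fields
// visible in this diff (model_url, model_id, model_lib_url) are shown, and the
// URLs are placeholders, not real artifacts.
const myAppConfig: AppConfig = {
  useIndexedDBCache: true, // omit or set to false to keep the default Cache API
  model_list: [
    {
      model_url: "https://example.com/my-model/resolve/main/",
      model_id: "MyModel-q4f16_1",
      model_lib_url: "https://example.com/libs/my-model-webgpu.wasm",
    },
  ],
};

// Alternatively, flip the flag on the prebuilt config, as examples/cache-usage does:
prebuiltAppConfig.useIndexedDBCache = true;
```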

src/engine.ts

Lines changed: 31 additions & 12 deletions

```diff
@@ -109,17 +109,29 @@ export class Engine implements EngineInterface {
     if (!modelUrl.startsWith("http")) {
       modelUrl = new URL(modelUrl, baseUrl).href;
     }
-    const configCache = new tvmjs.ArtifactCache("webllm/config");
+
+    let configCache: tvmjs.ArtifactCacheTemplate;
+    if (appConfig.useIndexedDBCache) {
+      configCache = new tvmjs.ArtifactIndexedDBCache("webllm/config");
+    } else {
+      configCache = new tvmjs.ArtifactCache("webllm/config");
+    }
 
     // load config
     const configUrl = new URL("mlc-chat-config.json", modelUrl).href;
     this.config = {
-      ...(await (await configCache.fetchWithCache(configUrl)).json()),
+      ...(await configCache.fetchWithCache(configUrl, "json")),
       ...chatOpts
     } as ChatConfig;
 
     // load tvm wasm
-    const wasmCache = new tvmjs.ArtifactCache("webllm/wasm");
+    let wasmCache: tvmjs.ArtifactCacheTemplate;
+    if (appConfig.useIndexedDBCache) {
+      wasmCache = new tvmjs.ArtifactIndexedDBCache("webllm/wasm");
+    } else {
+      wasmCache = new tvmjs.ArtifactCache("webllm/wasm");
+    }
+
     const wasmUrl = modelRecord.model_lib_url;
     if (wasmUrl === undefined) {
       throw Error("You need to specify `model_lib_url` for each model in `model_list` " +
@@ -135,10 +147,10 @@ export class Engine implements EngineInterface {
         return await fetch(new URL(wasmUrl, baseUrl).href);
       } else {
         // use cache
-        return await wasmCache.fetchWithCache(wasmUrl);
+        return await wasmCache.fetchWithCache(wasmUrl, "arraybuffer");
       }
     };
-    const wasmSource = await (await fetchWasmSource()).arrayBuffer();
+    const wasmSource = await fetchWasmSource();
 
     const tvm = await tvmjs.instantiate(
       new Uint8Array(wasmSource),
@@ -188,9 +200,9 @@ export class Engine implements EngineInterface {
       }
     });
     this.deviceLostIsError = true;
-    const tokenizer = await this.asyncLoadTokenizer(modelUrl, this.config);
-    await tvm.fetchNDArrayCache(modelUrl, tvm.webgpu(), "webllm/model");
-
+    const tokenizer = await this.asyncLoadTokenizer(modelUrl, this.config, appConfig);
+    const cacheType = appConfig.useIndexedDBCache ? "indexeddb" : "cache";
+    await tvm.fetchNDArrayCache(modelUrl, tvm.webgpu(), "webllm/model", cacheType);
     this.pipeline = new LLMChatPipeline(tvm, tokenizer, this.config, this.logitProcessor);
     await this.pipeline?.asyncLoadWebGPUPipelines();
     const tend = performance.now();
@@ -692,12 +704,19 @@ export class Engine implements EngineInterface {
 
   private async asyncLoadTokenizer(
     baseUrl: string,
-    config: ChatConfig
+    config: ChatConfig,
+    appConfig: AppConfig,
   ): Promise<Tokenizer> {
-    const modelCache = new tvmjs.ArtifactCache("webllm/model");
+    let modelCache: tvmjs.ArtifactCacheTemplate;
+    if (appConfig.useIndexedDBCache) {
+      modelCache = new tvmjs.ArtifactIndexedDBCache("webllm/model");
+    } else {
+      modelCache = new tvmjs.ArtifactCache("webllm/model");
+    }
+
     if (config.tokenizer_files.includes("tokenizer.json")) {
       const url = new URL("tokenizer.json", baseUrl).href;
-      const model = await (await modelCache.fetchWithCache(url)).arrayBuffer();
+      const model = await modelCache.fetchWithCache(url, "arraybuffer");
       return Tokenizer.fromJSON(model);
     }
     else if (config.tokenizer_files.includes("tokenizer.model")) {
@@ -707,7 +726,7 @@ export class Engine implements EngineInterface {
         "Consider converting `tokenizer.model` to `tokenizer.json` by compiling the model " +
         "with MLC again, or see if MLC's huggingface provides this file.");
       const url = new URL("tokenizer.model", baseUrl).href;
-      const model = await (await modelCache.fetchWithCache(url)).arrayBuffer();
+      const model = await modelCache.fetchWithCache(url, "arraybuffer");
       return Tokenizer.fromSentencePiece(model);
     }
     throw Error("Cannot handle tokenizer files " + config.tokenizer_files)
```
