Commit 0d003f8
[KVCache] Add implicit KVCache reuse, disable stateful option for chatCompletion (#359)
We introduced the field `stateful` in `chatCompletion()` earlier to allow easier multi-round chatting in #330. However, this is not ideal, since we prefer APIs that are functional in behavior, which gives us various benefits (e.g. better fault tolerance for future use cases). Therefore, in this PR:

- We disable `chatCompletionRequest.stateful` and ask users to maintain the chat history explicitly.
- Instead, we introduce implicit KVCache reuse for multi-round chatting.
- When we detect that a user is doing multi-round chatting, we do not reset the KV cache, so only the new message is prefilled.
- To detect multi-round chatting, we instantiate a `Conversation` instance for each request and compare it with the current internal `Conversation`. If they match, we can safely keep the internal state and prefill only the new input.

To see the behavior, check out `mainMultiroundChat()` in `examples/openai-api/src/openai_api.ts`.

Implementation details:

- We instantiate the `Conversation` object in `ChatModule.prefill()`, since this is where the various workflows meet (streaming, non-streaming, n > 1, etc.).
- The object's state is determined by the system prompt, the message history, and function calling usage.
- Inside `prefill()`, we then compare the two objects with `compareConversationObject()` and reset all internal state if they do not match (see the sketch below).
- Another detail: instead of overriding `conversation.config.system_message`, we add a field `conversation.override_system_message`, keeping `conversation.config` protected.
- We further remove all methods in `ChatModule` that override `this.getPipeline().conversation` by changing `updateConversationWithChatCompletionMessages()` to `getConversationFromChatCompletionRequest()`, keeping things more functional internally.
1 parent 0ca0c58 commit 0d003f8
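To make the mechanism concrete, here is a minimal TypeScript sketch of the detection flow described above. This is NOT the actual WebLLM source: only the names `Conversation`, `compareConversationObject()`, `getConversationFromChatCompletionRequest()`, `override_system_message`, and `prefill()` come from the commit message; every shape, field, and signature below is an illustrative assumption.

type Message = { role: "system" | "user" | "assistant"; content: string };

interface Conversation {
  // Added by this commit: overrides the system prompt without mutating
  // conversation.config, which stays protected.
  override_system_message?: string;
  messages: Message[];          // assumed representation of the history
  useFunctionCalling: boolean;  // assumed field name
}

interface ChatCompletionRequest {
  messages: Message[];
}

// Assumed construction: pull the system prompt out of the request and keep the
// rest as history, so conversation.config is never touched.
function getConversationFromChatCompletionRequest(
  request: ChatCompletionRequest
): Conversation {
  const system = request.messages.find((m) => m.role === "system");
  return {
    override_system_message: system?.content,
    messages: request.messages.filter((m) => m.role !== "system"),
    useFunctionCalling: false, // would be derived from the request's tools
  };
}

// Assumed matching rule: same system prompt, same function-calling usage, and
// identical message history.
function compareConversationObject(a: Conversation, b: Conversation): boolean {
  return (
    a.override_system_message === b.override_system_message &&
    a.useFunctionCalling === b.useFunctionCalling &&
    JSON.stringify(a.messages) === JSON.stringify(b.messages)
  );
}

// Conceptual decision made inside ChatModule.prefill(): compare the internal
// conversation with the one implied by the request, minus the newest user
// input (which is exactly the part that still needs prefilling).
function shouldReuseKVCache(
  internal: Conversation,
  request: ChatCompletionRequest
): boolean {
  const fromRequest = getConversationFromChatCompletionRequest(request);
  fromRequest.messages = fromRequest.messages.slice(0, -1);
  // true  -> keep the KV cache, prefill only the new user message
  // false -> reset all internal state, prefill the whole request
  return compareConversationObject(internal, fromRequest);
}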

File tree

11 files changed: +436 −216 lines


examples/openai-api/src/openai_api.ts

Lines changed: 43 additions & 21 deletions
@@ -94,45 +94,66 @@ async function mainStreaming() {
 }
 
 /**
- * We demonstrate stateful chat completion, where chat history is preserved across requests.
+ * We demonstrate multi-round chatting. Though users are required to maintain the chat history,
+ * internally we compare the provided `messages` with the internal chat history. If they match, we
+ * reuse KVs and hence save computation -- essentially an implicit internal optimization.
  */
-async function mainStateful() {
+async function mainMultiroundChat() {
   const chat: webllm.ChatInterface = new webllm.ChatModule();
-
   chat.setInitProgressCallback((report: webllm.InitProgressReport) => {
     setLabel("init-label", report.text);
   });
 
   await chat.reload("Llama-2-7b-chat-hf-q4f32_1");
 
+  // Round 0
+  const messages: webllm.ChatCompletionMessageParam[] = [
+    {
+      "role": "system",
+      "content": "[INST] <<SYS>>\n\nYou are a helpful, respectful and honest assistant. " +
+        "Be as happy as you can when speaking please.\n<</SYS>>\n\n "
+    },
+    { "role": "user", "content": "Provide me three US states." },
+  ];
+
   const request0: webllm.ChatCompletionRequest = {
-    stateful: true,
-    // stream: true, // works with and without streaming
-    messages: [
-      {
-        "role": "system",
-        "content": "[INST] <<SYS>>\n\nYou are a helpful, respectful and honest assistant. " +
-          "Be as happy as you can when speaking please.\n<</SYS>>\n\n "
-      },
-      { "role": "user", "content": "Provide me three US states." },
-    ],
+    stream: false, // can be streaming, same behavior
+    messages: messages,
   };
 
   const reply0 = await chat.chatCompletion(request0);
+  const replyMessage0 = await chat.getMessage();
   console.log(reply0);
-  console.log(await chat.getMessage());
+  console.log(replyMessage0);
+
+  // Round 1
+  // Append generated response to messages
+  messages.push({ "role": "assistant", "content": replyMessage0 });
+  // Append new user input
+  messages.push({ "role": "user", "content": "Two more please!" });
+  // The line below would cause an internal reset (clear KV cache, etc.) since the history
+  // would no longer match the new request
+  // messages[0].content = "Another system prompt";
 
   const request1: webllm.ChatCompletionRequest = {
-    stateful: true,
-    // stream: true, // works with and without streaming
-    messages: [
-      { "role": "user", "content": "Two more please!" },
-    ],
+    stream: false, // can be streaming, same behavior
+    messages: messages
   };
 
   const reply1 = await chat.chatCompletion(request1);
+  const replyMessage1 = await chat.getMessage();
   console.log(reply1);
-  console.log(await chat.getMessage());
+  console.log(replyMessage1);
+
+  // If multi-round chat was used, request1 should only prefill a small number of tokens
+  const prefillTokens0 = reply0.usage?.prompt_tokens;
+  const prefillTokens1 = reply1.usage?.prompt_tokens;
+  console.log("Request 0 prompt tokens: ", prefillTokens0);
+  console.log("Request 1 prompt tokens: ", prefillTokens1);
+  if (prefillTokens0 === undefined || prefillTokens1 === undefined ||
+      prefillTokens1 > prefillTokens0) {
+    throw Error("Multi-round chat is not triggered as expected.");
+  }
 
   console.log(await chat.runtimeStatsText());
 }

@@ -195,4 +216,5 @@ async function mainFunctionCalling() {
 // Run one of the functions
 // mainNonStreaming();
 // mainStreaming();
-mainFunctionCalling();
+// mainFunctionCalling();
+mainMultiroundChat();
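The same pattern extends to further rounds: keep appending to the same `messages` array and resend it, and prefill stays incremental as long as the history is only appended to. Below is a continuation sketch under the same setup as the diff above (it reuses the `chat`, `messages`, and `replyMessage1` variables from the example; the new user prompt is made up for illustration):

// Round 2: append the previous reply and a new user turn, then resend.
messages.push({ "role": "assistant", "content": replyMessage1 });
messages.push({ "role": "user", "content": "And two more after that!" });

const request2: webllm.ChatCompletionRequest = {
  stream: false,
  messages: messages,
};
const reply2 = await chat.chatCompletion(request2);
console.log(await chat.getMessage());
// As in round 1, reply2.usage?.prompt_tokens should stay small, since only
// the two newly appended messages need to be prefilled.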

examples/web-worker/src/main.ts

Lines changed: 0 additions & 2 deletions
@@ -62,7 +62,6 @@ async function mainOpenAIAPINonStreaming() {
   await chat.reload("Llama-2-7b-chat-hf-q4f32_1");
 
   const request: webllm.ChatCompletionRequest = {
-    // stateful: true, // set this optionally to preserve chat history
     messages: [
       {
         "role": "system",

@@ -102,7 +101,6 @@ async function mainOpenAIAPIStreaming() {
   await chat.reload("Llama-2-7b-chat-hf-q4f32_1");
 
   const request: webllm.ChatCompletionRequest = {
-    // stateful: true, // set this optionally to preserve chat history
     stream: true,
     messages: [
       {