implement remove_prompt_cache (#5)
* implement remove_prompt_cache

* Add remove_prompt_cache description to README
icppWorld authored Jan 22, 2025
1 parent 17356ea commit 9993bb6
Showing 9 changed files with 211 additions and 62 deletions.
10 changes: 10 additions & 0 deletions README-contributors-guide.md
@@ -4,6 +4,16 @@

Follow steps of [llama_cpp_canister/README/Getting Started](https://github.com/onicai/llama_cpp_canister/blob/main/README.md#getting-started)

# VS Code debugger

## lldb-mi hangs

On the Mac, there is an issue with lldb-mi: https://github.com/microsoft/vscode-cpptools/issues/7240

Upon stopping at a breakpoint in a new module, lldb-mi will try to load all local variables, and it goes into an endless loop.

The solution is to hide the VARIABLES section in the debug window, and rely on the WATCH section instead.

# How to run & debug original llama.cpp

- Clone ggerganov/llama.cpp (Do NOT initialize submodules...)
10 changes: 7 additions & 3 deletions README.md
@@ -162,17 +162,17 @@ WARNING: Currently, the canister can only be build on a `Mac` !
```bash
# Start a new chat - this resets the prompt-cache for this conversation
dfx canister call llama_cpp new_chat '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"} })'
dfx canister call llama_cpp new_chat '(record { args = vec {"--prompt-cache"; "prompt.cache"} })'
# Repeat this call until `prompt_remaining` in the response is empty.
# This ingests the prompt into the prompt-cache, using multiple update calls
# Important: KEEP SENDING THE FULL PROMPT
dfx canister call llama_cpp run_update '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\ngive me a short introduction to LLMs.<|im_end|>\n<|im_start|>assistant\n"; "-n"; "512" } })'
dfx canister call llama_cpp run_update '(record { args = vec {"--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "-sp"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\ngive me a short introduction to LLMs.<|im_end|>\n<|im_start|>assistant\n"; "-n"; "512" } })'
...
# Once `prompt_remaining` in the response is empty, repeat this call, with an empty prompt, until `generated_eog=true`
# Now the LLM is generating new tokens!
dfx canister call llama_cpp run_update '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-p"; ""; "-n"; "512" } })'
dfx canister call llama_cpp run_update '(record { args = vec {"--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "-sp"; "-p"; ""; "-n"; "512" } })'
...
@@ -201,6 +201,9 @@ WARNING: Currently, the canister can only be build on a `Mac` !
# ;"--print-token-count"; "1" #
########################################
# Remove the prompt cache when done - this keeps stable memory usage at a minimum
dfx canister call llama_cpp remove_prompt_cache '(record { args = vec {"--prompt-cache"; "prompt.cache"} })'
```
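
For reference, the full flow above can be driven from a shell script. The sketch below is illustrative and not part of this commit: it assumes the Candid text that `dfx` prints can be pattern-matched with `grep` for the `prompt_remaining` and `generated_eog` fields, and the exact rendering of those fields may differ.

```bash
#!/usr/bin/env bash
# Sketch: new_chat -> ingest prompt -> generate tokens -> remove_prompt_cache.
# Assumption: the grep patterns below match how dfx renders the response record.

PROMPT='<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\ngive me a short introduction to LLMs.<|im_end|>\n<|im_start|>assistant\n'

# Start a new chat - this resets the prompt-cache for this conversation
dfx canister call llama_cpp new_chat '(record { args = vec {"--prompt-cache"; "prompt.cache"} })'

# Ingest the full prompt until prompt_remaining is empty
while true; do
  out=$(dfx canister call llama_cpp run_update "(record { args = vec {\"--prompt-cache\"; \"prompt.cache\"; \"--prompt-cache-all\"; \"-sp\"; \"-p\"; \"$PROMPT\"; \"-n\"; \"512\" } })")
  echo "$out"
  if echo "$out" | grep -q 'prompt_remaining = ""'; then break; fi
done

# Generate new tokens with an empty prompt until generated_eog is true
while true; do
  out=$(dfx canister call llama_cpp run_update '(record { args = vec {"--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "-sp"; "-p"; ""; "-n"; "512" } })')
  echo "$out"
  if echo "$out" | grep -q 'generated_eog = true'; then break; fi
done

# Remove the prompt cache when done - keeps stable memory usage at a minimum
dfx canister call llama_cpp remove_prompt_cache '(record { args = vec {"--prompt-cache"; "prompt.cache"} })'
```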
Note: The sequence of update calls to the canister is required because the Internet Computer has a limitation
@@ -223,6 +226,7 @@ WARNING: Currently, the canister can only be build on a `Mac` !
```
- You can download the `main.log` file from the canister with:
```
python -m scripts.download --network local --canister llama_cpp --local-filename main.log main.log
35 changes: 23 additions & 12 deletions native/test_qwen2.cpp
@@ -87,59 +87,59 @@ void test_qwen2(MockIC &mockIC) {
for (int i = 0; i < 2; ++i) {
// -----------------------------------------------------------------------------
// Start a new chat, which will remove the prompt-cache file if it exists
// '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"} })' ->
// '(variant { Ok = record { status_code = 200 : nat16; output = "Ready to start a new chat for cache file .canister_cache/expmt-gtxsw-inftj-ttabj-qhp5s-nozup-n3bbo-k7zvn-dg4he-knac3-lae/sessions/my_cache/prompt.cache"; input = ""; error=""; prompt_remaining=""; generated_eog=false : bool } })'
// '(record { args = vec {"--prompt-cache"; "prompt.cache"} })' ->
// '(variant { Ok = record { status_code = 200 : nat16; output = "Ready to start a new chat for cache file .canister_cache/expmt-gtxsw-inftj-ttabj-qhp5s-nozup-n3bbo-k7zvn-dg4he-knac3-lae/sessions/prompt.cache"; input = ""; error=""; prompt_remaining=""; generated_eog=false : bool } })'
mockIC.run_test(
std::string(__func__) + ": " + "new_chat " + std::to_string(i) + " - " +
model,
new_chat,
"4449444c026c01dd9ad28304016d710100020e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865",
"4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a01000101009701526561647920746f2073746172742061206e6577206368617420666f722063616368652066696c65202e63616e69737465725f63616368652f6578706d742d67747873772d696e66746a2d747461626a2d71687035732d6e6f7a75702d6e3362626f2d6b377a766e2d64673468652d6b6e6163332d6c61652f73657373696f6e732f6d795f63616368652f70726f6d70742e63616368650000c8000000",
"4449444c026c01dd9ad28304016d710100020e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865",
"4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a01000101008e01526561647920746f2073746172742061206e6577206368617420666f722063616368652066696c65202e63616e69737465725f63616368652f6578706d742d67747873772d696e66746a2d747461626a2d71687035732d6e6f7a75702d6e3362626f2d6b377a766e2d64673468652d6b6e6163332d6c61652f73657373696f6e732f70726f6d70742e63616368650000c8000000",
silent_on_trap, my_principal);

// -----------------------------------------------------------------------------
// -sp : special token output enabled
// '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "512"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant\n"} })' ->
// '(record { args = vec {"--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "512"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant\n"} })' ->
// -> '(variant { Ok = record { status_code = 200 : nat16; error = ""; output = ""; input = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>" ; prompt_remaining = "user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant\n"; generated_eog=false : bool} })'
mockIC.run_test(
std::string(__func__) + ": " + "run_update prompt step 1 for chat " +
std::to_string(i) + " - " + model,
run_update,
"4449444c026c01dd9ad28304016d710100080e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c032d7370022d6e03353132022d708a013c7c696d5f73746172747c3e73797374656d0a596f752061726520612068656c7066756c20617373697374616e742e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e757365720a4578706c61696e204c61726765204c616e6775616765204d6f64656c732e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e617373697374616e740a",
"4449444c026c01dd9ad28304016d710100080e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c032d7370022d6e03353132022d708a013c7c696d5f73746172747c3e73797374656d0a596f752061726520612068656c7066756c20617373697374616e742e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e757365720a4578706c61696e204c61726765204c616e6775616765204d6f64656c732e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e617373697374616e740a",
"4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a010001010000463c7c696d5f73746172747c3e73797374656d0a596f752061726520612068656c7066756c20617373697374616e742e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e00c80044757365720a4578706c61696e204c61726765204c616e6775616765204d6f64656c732e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e617373697374616e740a00",
silent_on_trap, my_principal);

// -----------------------------------------------------------------------------
// -sp : special token output enabled
// '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "512"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant\n"} })' ->
// '(record { args = vec {"--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "512"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant\n"} })' ->
// -> '(variant { Ok = record { status_code = 200 : nat16; error = ""; output = ""; input = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant"; prompt_remaining = "\n";} generated_eog=false : bool})'
mockIC.run_test(
std::string(__func__) + ": " + "run_update prompt step 2 for chat " +
std::to_string(i) + " - " + model,
run_update,
"4449444c026c01dd9ad28304016d710100080e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c032d7370022d6e03353132022d708a013c7c696d5f73746172747c3e73797374656d0a596f752061726520612068656c7066756c20617373697374616e742e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e757365720a4578706c61696e204c61726765204c616e6775616765204d6f64656c732e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e617373697374616e740a",
"4449444c026c01dd9ad28304016d710100080e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c032d7370022d6e03353132022d708a013c7c696d5f73746172747c3e73797374656d0a596f752061726520612068656c7066756c20617373697374616e742e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e757365720a4578706c61696e204c61726765204c616e6775616765204d6f64656c732e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e617373697374616e740a",
"4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a01000101000089013c7c696d5f73746172747c3e73797374656d0a596f752061726520612068656c7066756c20617373697374616e742e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e757365720a4578706c61696e204c61726765204c616e6775616765204d6f64656c732e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e617373697374616e7400c800010a00",
silent_on_trap, my_principal);

// -----------------------------------------------------------------------------
// '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "512"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant\n"} })' ->
// '(record { args = vec {"--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "512"; "-p"; "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nExplain Large Language Models.<|im_end|>\n<|im_start|>assistant\n"} })' ->
// -> can no longer check it, because the LLM generated tokens
mockIC.run_test(
std::string(__func__) + ": " + "run_update prompt step 3 for chat " +
std::to_string(i) + " - " + model,
run_update,
"4449444c026c01dd9ad28304016d710100080e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c032d7370022d6e03353132022d708a013c7c696d5f73746172747c3e73797374656d0a596f752061726520612068656c7066756c20617373697374616e742e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e757365720a4578706c61696e204c61726765204c616e6775616765204d6f64656c732e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e617373697374616e740a",
"4449444c026c01dd9ad28304016d710100080e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c032d7370022d6e03353132022d708a013c7c696d5f73746172747c3e73797374656d0a596f752061726520612068656c7066756c20617373697374616e742e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e757365720a4578706c61696e204c61726765204c616e6775616765204d6f64656c732e3c7c696d5f656e647c3e0a3c7c696d5f73746172747c3e617373697374616e740a",
"", silent_on_trap, my_principal);

// -----------------------------------------------------------------------------
// Once there is no prompt_remaining, it is totally ok to send an empty prompt, and just let it generate new tokens
// '(record { args = vec {"--prompt-cache"; "my_cache/prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "512"; "-p"; ""} })' ->
// '(record { args = vec {"--prompt-cache"; "prompt.cache"; "--prompt-cache-all"; "-sp"; "-n"; "512"; "-p"; ""} })' ->
// -> can no longer check it, because the LLM generated tokens
mockIC.run_test(
std::string(__func__) + ": " + "run_update prompt step 4 for chat " +
std::to_string(i) + " - " + model,
run_update,
"4449444c026c01dd9ad28304016d710100080e2d2d70726f6d70742d6361636865156d795f63616368652f70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c032d7370022d6e03353132022d7000",
"4449444c026c01dd9ad28304016d710100080e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865122d2d70726f6d70742d63616368652d616c6c032d7370022d6e03353132022d7000",
"", silent_on_trap, my_principal);
}

@@ -150,4 +150,15 @@ void test_qwen2(MockIC &mockIC) {
// Note that the pytest is verifying it in more detail
mockIC.run_test(std::string(__func__) + ": " + "get_chats - " + model,
get_chats, "4449444c0000", "", silent_on_trap, my_principal);

// -----------------------------------------------------------------------------
// Remove the prompt-cache file if it exists
// '(record { args = vec {"--prompt-cache"; "prompt.cache"} })' ->
// '(variant { Ok = record { status_code = 200 : nat16; output = "Cache file .canister_cache/expmt-gtxsw-inftj-ttabj-qhp5s-nozup-n3bbo-k7zvn-dg4he-knac3-lae/sessions/prompt.cache deleted successfully"; input = ""; error=""; prompt_remaining=""; generated_eog=false : bool } })'
mockIC.run_test(
std::string(__func__) + ": " + "remove_prompt_cache " + model,
remove_prompt_cache,
"4449444c026c01dd9ad28304016d710100020e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865",
"4449444c026c06819e846471838fe5800671c897a79907719aa1b2f90c7adb92a2c90d71cdd9e6b30e7e6b01bc8a0100010100850143616368652066696c65202e63616e69737465725f63616368652f6578706d742d67747873772d696e66746a2d747461626a2d71687035732d6e6f7a75702d6e3362626f2d6b377a766e2d64673468652d6b6e6163332d6c61652f73657373696f6e732f70726f6d70742e63616368652064656c65746564207375636365737366756c6c790000c8000000",
silent_on_trap, my_principal);
}
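
The long hex strings passed to `mockIC.run_test` above are Candid-encoded request and response blobs. As an aside (not part of this commit), equivalent hex can be produced and inspected with the `didc` CLI from https://github.com/dfinity/candid; the exact usage shown here is an assumption and the encoded output can differ between didc versions:

```bash
# Encode the remove_prompt_cache request record to a hex blob (assumed didc usage)
didc encode '(record { args = vec {"--prompt-cache"; "prompt.cache"} })'

# Decode the request hex used in the test above back to Candid text for inspection
didc decode 4449444c026c01dd9ad28304016d710100020e2d2d70726f6d70742d63616368650c70726f6d70742e6361636865
```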