From 622a19e9ba4fb33381174f233b608eee8a65cbf9 Mon Sep 17 00:00:00 2001
From: igardev <igardev@gmail.com>
Date: Mon, 5 Jan 2026 15:36:13 +0200
Subject: [PATCH] Generate multiple completions in parallel

---
 package-lock.json    |   4 +-
 package.json         |  15 ++++
 resources/help.md    | 203 +++++++++++++++++++------------------------
 src/architect.ts     |  30 +++++++
 src/completion.ts    | 138 ++++++++++++++++++++---------
 src/configuration.ts |   2 +
 src/extension.ts     |   2 +
 src/llama-server.ts  |   5 +-
 src/lru-cache.ts     |   6 +-
 9 files changed, 245 insertions(+), 160 deletions(-)

diff --git a/package-lock.json b/package-lock.json
index 3fae2ff..1334ac2 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "llama-vscode",
-  "version": "0.0.38",
+  "version": "0.0.39",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "llama-vscode",
-      "version": "0.0.38",
+      "version": "0.0.39",
       "hasInstallScript": true,
       "dependencies": {
         "axios": "^1.1.2",
diff --git a/package.json b/package.json
index d9cc913..a523fb0 100644
--- a/package.json
+++ b/package.json
@@ -151,6 +151,16 @@
         "key": "ctrl+x",
         "when": "editorTextFocus"
       },
+      {
+        "command": "extension.selectNextSuggestion",
+        "key": "alt+]",
+        "when": "editorTextFocus && inlineSuggestionVisible"
+      },
+      {
+        "command": "extension.selectPreviousSuggestion",
+        "key": "alt+[",
+        "when": "editorTextFocus && inlineSuggestionVisible"
+      },
       {
         "command": "extension.acceptFirstLine",
         "key": "shift+tab",
@@ -271,6 +281,11 @@
           "default": "",
           "description": "The URL to be used by the extension for creating embeddings."
         },
+        "llama-vscode.max_parallel_completions": {
+          "type": "number",
+          "default": 3,
+          "description": "The max number of parallel completions. Switching between completions could be done with Alt+] (next) or Alt+[ (previous)."
+        },
         "llama-vscode.new_completion_model_port": {
           "type": "number",
           "default": 8012,
diff --git a/resources/help.md b/resources/help.md
index f43c4f3..eaab3f1 100644
--- a/resources/help.md
+++ b/resources/help.md
@@ -13,9 +13,7 @@ Example:
 1. Select several lines of source code
 2. Press Ctrl+Shift+A (or right click and select "llama-vscode: Show Llama Agent") - this will attach the selected lines to the prompt
 3. Inside the agent prompt press "/" and select "explain"
-The agent will explain the selected code. 
- 
-## Chat with AI about llama-vscode  
+The agent will explain the selected code.## Chat with AI about llama-vscode  
 
 ### Requred servers
 - Tools server
@@ -25,12 +23,6 @@ This is a conversation with the llama-vscode help agent AI about llama-vscode, s
 - From llama-vscode menu select "Chat with AI about llama-vscode" -> the agent will be opened
 - Enter your question about llama-vscode
 The first time it could take longer to answer. The following questions will be answered faster as the help information will be cached.
- 
- 
-## Chat with AI with project context 
-This is removed. Chat with AI with project context is equal to using agent with the tool search_source. The agent has many other tools and is therefore a better choice.
- 
- 
 ## Chat with AI  
 
 ### Requred servers
@@ -41,8 +33,8 @@ This is a conversation with the local AI. Mainly for asking questions for refere
 - Press Ctrl+; inside an editor (or select from llama.vscode menu Chat with AI) - A chat window will open inside VS Code
 - Enter your message and start the chat
 
-![Chat with AI](https://github.com/user-attachments/assets/e068f5cc-fce3-4366-9b8f-1c89e952b411) 
- 
+![Chat with AI](https://github.com/user-attachments/assets/e068f5cc-fce3-4366-9b8f-1c89e952b411)## Chat with AI with project context 
+This is removed. Chat with AI with project context is equal to using agent with the tool search_source. The agent has many other tools and is therefore a better choice.
 ## Code completion
 
 ### Requred servers
@@ -61,9 +53,7 @@ https://github.com/user-attachments/assets/97bb1418-dcea-4a49-8332-13b2ab4da661
 
 
 
-![Code completion](https://private-user-images.githubusercontent.com/1991296/405712196-b19499d9-f50d-49d4-9dff-ff3e8ba23757.gif?jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSIsImtleSI6ImtleTUiLCJleHAiOjE3NDY5NDc1NDEsIm5iZiI6MTc0Njk0NzI0MSwicGF0aCI6Ii8xOTkxMjk2LzQwNTcxMjE5Ni1iMTk0OTlkOS1mNTBkLTQ5ZDQtOWRmZi1mZjNlOGJhMjM3NTcuZ2lmP1gtQW16LUFsZ29yaXRobT1BV1M0LUhNQUMtU0hBMjU2JlgtQW16LUNyZWRlbnRpYWw9QUtJQVZDT0RZTFNBNTNQUUs0WkElMkYyMDI1MDUxMSUyRnVzLWVhc3QtMSUyRnMzJTJGYXdzNF9yZXF1ZXN0JlgtQW16LURhdGU9MjAyNTA1MTFUMDcwNzIxWiZYLUFtei1FeHBpcmVzPTMwMCZYLUFtei1TaWduYXR1cmU9NmZiMmI0NGYzNTkyZGZkMTM5Njk3M2NjZDFhMjFiNTFkMjVkMmY4MGQ5ZDQ2ZDQ0MDgzOWI2YjM5NTY0NzM2OSZYLUFtei1TaWduZWRIZWFkZXJzPWhvc3QifQ.P150YJh87_y1pin20aWIuKoPzivmDjZF0iAemQlk_ok) 
- 
-## Custom eval tool
+![Code completion](https://private-user-images.githubusercontent.com/1991296/405712196-b19499d9-f50d-49d4-9dff-ff3e8ba23757.gif?jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSIsImtleSI6ImtleTUiLCJleHAiOjE3NDY5NDc1NDEsIm5iZiI6MTc0Njk0NzI0MSwicGF0aCI6Ii8xOTkxMjk2LzQwNTcxMjE5Ni1iMTk0OTlkOS1mNTBkLTQ5ZDQtOWRmZi1mZjNlOGJhMjM3NTcuZ2lmP1gtQW16LUFsZ29yaXRobT1BV1M0LUhNQUMtU0hBMjU2JlgtQW16LUNyZWRlbnRpYWw9QUtJQVZDT0RZTFNBNTNQUUs0WkElMkYyMDI1MDUxMSUyRnVzLWVhc3QtMSUyRnMzJTJGYXdzNF9yZXF1ZXN0JlgtQW16LURhdGU9MjAyNTA1MTFUMDcwNzIxWiZYLUFtei1FeHBpcmVzPTMwMCZYLUFtei1TaWduYXR1cmU9NmZiMmI0NGYzNTkyZGZkMTM5Njk3M2NjZDFhMjFiNTFkMjVkMmY4MGQ5ZDQ2ZDQ0MDgzOWI2YjM5NTY0NzM2OSZYLUFtei1TaWduZWRIZWFkZXJzPWhvc3QifQ.P150YJh87_y1pin20aWIuKoPzivmDjZF0iAemQlk_ok)## Custom eval tool
 
 ### Overview
 llama-vscode provides to the users the posibility to partially create their own tool. Custom eval tool is a simple one - has one parameters and and uses the provided by the user javascript function to calculate the result.
@@ -85,8 +75,6 @@ https://github.com/user-attachments/assets/fb12d56f-61e8-409b-b888-0a524167e116
 https://github.com/user-attachments/assets/7e928fc3-da14-4834-a414-0f8e23593155
 
 
- 
- 
 ## Custom tool
 
 ### Overview
@@ -105,8 +93,6 @@ https://github.com/user-attachments/assets/46602f8c-bd45-4794-9f5c-6ebe262c396a
 
 https://github.com/user-attachments/assets/50baa8c3-f426-4901-a443-8882da644800
 
- 
- 
 ## Delete models  
 
 ### Overview
@@ -122,14 +108,20 @@ You could delete the GGUF files from this folder. If they are missing, but are n
 
 
 
- 
- 
 ## Edit Agent 
 
 ### Overview
 Edit agent view is used for adding and editing agents. From there it is also possible to delete and copy an existing agent as a new one. The identifier of an agent is it's name. For now there is no tools model as part of the agent (the currently selected tools model will be used)
 
+<img width="582" height="977" alt="image" src="https://github.com/user-attachments/assets/9a406e7a-09ea-4f04-9054-f709bcdb038a" />
+
+
 ### How to use it 
+Edit agent view could be shown in one of the following ways:  
+- In the left sidebar click the llama-vscode button and after that, in the upper part, click the button Show Edit Agent View (pencil image)
+- From llama-vscode menu (Ctrl+Shift+M) select Agents...-> Add agent (or Edit agent or Copy agent)
+- From environment view, when an agent is selected, click button Edit - this will show the selected agent in the Edit Agent View
+
 Edit existing agent:  
 1. Click Select button and load an agent to be edited.
 2. Change the Description and System Instructions fields (if needed)
@@ -150,12 +142,10 @@ Copy existing agent as a new one:
 
 Delete agent: 
 1. Click Delete button
-3. Select an agent to be deleted from the list
-4. Confirm the deletion of the agent
+2. Select an agent to be deleted from the list
+3. Confirm the deletion of the agent
 
 
- 
- 
 ## Edit with AI  
 
 ### Requred servers
@@ -171,9 +161,7 @@ Delete agent:
 https://github.com/user-attachments/assets/887d0b88-717b-4765-b565-d4c54673bde8
 
 
-![Edit with AI](https://github.com/user-attachments/assets/d7aef6a8-8c29-4278-b91f-9b3031c8cbd5) 
- 
-## Env
+![Edit with AI](https://github.com/user-attachments/assets/d7aef6a8-8c29-4278-b91f-9b3031c8cbd5)## Env
 
 ### What is env
 Env (short for environment) is a group of models, agent and settings. Env makes it easier for the users to prepare the environment for their needs. Selecting an env with a given intent will make sure all needed servers are available. One env could contain up to 4 different models - for completions, chat, embeddings, tools. Env could also contain an agent and settings for enabling/disabling completions, rag and starting last selected env on startup. If the user wants to use only code completions functionality, he/she could select an env with only one model for completions. If the user wants to use all the functionality from llama-vscode, he/she could select an env with full package of models.
@@ -186,8 +174,6 @@ There is a page in llama-vscode UI with the current environment details. From th
 
 <img width="540" height="996" alt="image" src="https://github.com/user-attachments/assets/b1a78d7a-8602-451a-b304-fc967fb66696" />
 
- 
- 
 ## Generate a commit message  
 
 ### Requred servers
@@ -197,14 +183,13 @@ There is a page in llama-vscode UI with the current environment details. From th
 In the source control panel just click on the star button (near the commit button).  
 This generate a commit message, based on the current changes.   
 
-![Generate a commit message](https://github.com/user-attachments/assets/25f5d1ae-3673-4416-ba52-7615969c1bb3) 
- 
-## Version 0.0.32 is released (05.10.2025)
+![Generate a commit message](https://github.com/user-attachments/assets/25f5d1ae-3673-4416-ba52-7615969c1bb3)## Version 0.0.39 is released (31.12.2025)
 ## What is new
-- predefined model DeepSeek V3.1 free 163,800 context (OpenRouter) added
-- predefined model Z.AI: GLM 4.5 Air (free): GLM 4.5 Air - 128.000 context (OpenRouter) added
-- Added agent "Ask" is for review, analysis and suggestions for the code without changing the files
-- Some bugs are fixed
+
+- Skills (https://agentskills.io/home) could be now used with llama-vscode
+- skills_folder setting determines where are skills descriptions. If empty the <project_folder>/skills folder is used by default
+- Anthropic models support skills best. I guess, the open source models will catch up.
+
 
 ## Setup instructions for llama.cpp server
 
@@ -252,8 +237,6 @@ This generate a commit message, based on the current changes.
 
 ### [Model selection](https://github.com/ggml-org/llama.vscode/wiki/Model-selection)
 
- 
- 
 ## How to use llama-vscode  
 
 ### Overview
@@ -279,8 +262,6 @@ If you are an existing user - you could continue using llama-vscode as before.
 
 For more details - select 'View Documentation' from llama-vscode menu
 
- 
- 
 ## Llama Agent 
 
 ### Requred servers
@@ -321,8 +302,6 @@ https://github.com/user-attachments/assets/dd9da21a-6f57-477d-a55c-e4ff60b1ecb8
 
 
 
- 
- 
 ## Use as local AI runner (as LM Studio, Ollama, etc.) 
 
 ### Overview
@@ -342,8 +321,6 @@ Enjoy talking with local AI.
 
 https://github.com/user-attachments/assets/e75e96de-878b-43db-a45b-47cc0c554697
 
- 
- 
 ## Manage envs 
 
 ### Requred servers
@@ -385,8 +362,6 @@ An agent could be exported as a .json files. This file could be shared with othe
 
 - Import  
 An agent could be imported from a .json file - select a file to import it.
- 
- 
 ## Manage chat models 
 
 ### Requred servers
@@ -435,9 +410,7 @@ Add chat model from OpenAI compatible provider - OpenRouter or custom (for examp
 A model could be exported as a .json files. This file could be shared with other users, modified if needed and imported again. Select a model to export it.
 
 - Import  
-A model could be imported from a .json file - select a file to import it. 
- 
-## Manage envs 
+A model could be imported from a .json file - select a file to import it.## Manage envs 
 
 ### Requred servers
 - No servers required
@@ -462,8 +435,6 @@ A chat could be exported as a .json file. This file could be shared with other u
 
 - Import  
 A chat could be imported from a .json file - select a file to import it.
- 
- 
 ## Manage completion models 
 
 ### Requred servers
@@ -512,9 +483,7 @@ Add completion model from OpenAI compatible provider - OpenRouter or custom (for
 A model could be exported as a .json files. This file could be shared with other users, modified if needed and imported again. Select a model to export it.
 
 - Import  
-A model could be imported from a .json file - select a file to import it. 
- 
-## Manage embeddings 
+A model could be imported from a .json file - select a file to import it.## Manage embeddings 
 
 ### Requred servers
 - No servers required
@@ -562,9 +531,7 @@ Add embeddings model from OpenAI compatible provider - OpenRouter or custom (for
 A model could be exported as a .json files. This file could be shared with others used, modified if needed and imported again. Select a model to export it.
 
 - Import  
-A model could be imported from a .json file - select a file to import it. 
- 
-## Manage envs 
+A model could be imported from a .json file - select a file to import it.## Manage envs 
 
 ### Requred servers
 - No servers required
@@ -612,8 +579,6 @@ https://github.com/user-attachments/assets/3fb864ad-a010-4d19-97d8-fd7c9ce60494
 https://github.com/user-attachments/assets/3b8dffcc-bcdc-4981-b181-ffc52fe43075
 
 
- 
- 
 ## Manage tools models 
 
 ### Requred servers
@@ -662,9 +627,7 @@ Add tools model from OpenAI compatible provider - OpenRouter or custom (for exam
 A model could be exported as a .json files. This file could be shared with other users, modified if needed and imported again. Select a model to export it.
 
 - Import  
-A model could be imported from a .json file - select a file to import it. 
- 
-## MCP Support  
+A model could be imported from a .json file - select a file to import it.## MCP Support  
 
 ### Requred servers
 - Tools server
@@ -684,8 +647,6 @@ llama-vscode could use the the tools from the MCP servers, which are installed i
 4. Click "Select Tools" from Llama Agent panel and select the tools, which you want to use from your MCP Server
 
 
- 
- 
 ## Menu  
 
 ### Requred servers
@@ -701,8 +662,6 @@ OR
 
 https://github.com/user-attachments/assets/9895924d-1948-4f3c-b52e-2cce453645c8
 
- 
- 
 ## Model selection
 
 ### What is model selection
@@ -716,8 +675,22 @@ There are different ways to select a model
 - In Llama Agent click the button for selecting a model (completion, chat, embeddings, tools)
 - In llama-vscode menu select "Completion models..." (or chat, embeddings, tools)
 - Select an env. This will select the models, which are part of the env
- 
- 
+## Parallel Completions
+
+### Overview
+Llama-vscode generates parallel code completions (default 3) if a version of llama.cpp after December 6, 2025 (commit c42712b) is used. The next completion is shown by pressing Alt+], the previous completion by pressing Alt+[.  
+The setting max_parallel_completions determines how many completions are generated.
+
+### How to use it
+1. Run the completion model and start coding
+2. When a code completion is shown, press Alt+] to show the next completion, Alt+[ to show the previous completion
+
+
+Settings:
+- max_parallel_completions: The max number of parallel completions to generate. Default is 3.
+
+[Screencast from 2026-01-05 15-05-00.webm](https://github.com/user-attachments/assets/41fa92f8-88db-4079-9574-486fb4286c79)
+
 ## Rules
 
 ### What are rules
@@ -730,46 +703,6 @@ The rules are optional. You could use rules file to add instructions to the syst
 There are two ways to configure rules:
 - Create a new rules file under name llama-vscode-rules.md in the root of the project.
 - In llama-vscode setting Agent_rules enter a path to a rules file. It could be relative to the project root or absolute path. If this is specified, the file llama-vscode-rules.md will be ignored.
- 
- 
-## Statusbar  
-
-### Requred servers
-- No servers requred
-
-### How to use it 
-- View vscode-state
-- View statistics
-- Click on "llama-vscode" status bar to show llama-vscode menu
-
-
-
-https://github.com/user-attachments/assets/8f0b4575-104f-471c-be3f-f3d5b58aeee1
-
- 
- 
-## Use cases  
-
-### Overview
-The use cases below describe how to prepare and use llama-vscode in some specific cases. There are already some configurations for models and env, which could be selected and used directly
-
-### Only completion used, local server started by llama-vscode
-- Use the default configuration if it works for you by selecting Env for your case
-- If you want to use a different one, here is how to prepare it:
-1. Create completion model - select llama-vscode menu -> "Completion models..." -> "Add completion model from Huggingface", find the model in Huggingface and add it.
-2. From llama-vscode menu select "Deselect/stop env and models"
-3. Create an env, which includes only this model - from llama-vscode menu -> "Env..." -> "Add Env...". A panel will be show with buttons for selecting completion, chat, embeddings and tools models. Click "Compl" button and select the newly added model (the name is hf: model_name_from_huggingface). Test if code completion works well. Click button "Add Env" to save the environment.
-
-### Only completion used, external server
-Extarnal server could be also a local one, but is not started by llama-vscode on selecting the model. The completion server should support /infill endpoint, which is currently available only by llama.cpp.
-1. Create a new model - select llama-vscode menu -> "Completion models..." -> "Add completion model...". Enter only name and endpoint.
-2. From llama-vscode menu select "Deselect/stop env and models"
-3. Create an env, which includes only this model - from llama-vscode menu -> "Env..." -> "Add Env...". A panel will be show with buttons for selecting completion, chat, embeddings and tools models. Click "Compl" button and select the newly added model. Test if code completion works well. Click button "Add Env" to save the environment.
-
-
-
- 
- 
 ## Setup llama.cpp server for Linux 
 
 1. Download the release files for your OS from [llama.cpp releases.](https://github.com/ggerganov/llama.cpp/releases) (or build from source).  
@@ -853,8 +786,6 @@ Same like code completion server, but use embeddings model and a little bit diff
 ```bash  
 `llama-server -hf ggml-org/Nomic-Embed-Text-V2-GGUF --port 8010 -ub 2048 -b 2048 --ctx-size 2048 --embeddings`  
 ```
- 
- 
 ### Setup llama.cpp servers for Mac  
 
 Show llama-vscode menu (Ctrl+Shift+M) and select "Install/upgrade llama.cpp" (if not yet done). After that add/select the models you want to use.   
@@ -933,8 +864,6 @@ Same like code completion server, but use embeddings model and a little bit diff
 `llama-server -hf ggml-org/Nomic-Embed-Text-V2-GGUF --port 8010 -ub 2048 -b 2048 --ctx-size 2048 --embeddings`  
 ```
 
- 
- 
 ### Setup llama.cpp servers for Windows  
 
 Show llama-vscode menu (Ctrl+Shift+M) and select "Install/upgrade llama.cpp" (if not yet done). After that add/select the models you want to use.   
@@ -1018,8 +947,34 @@ Same like code completion server, but use embeddings model and a little bit diff
 ```bash
 `llama-server.exe -hf nomic-embed-text-v2-moe-q8_0.gguf --port 8010 -ub 2048 -b 2048 --ctx-size 2048 --embeddings`  
 ```
- 
- 
+## Skills
+
+### Overview
+Llama-vscode supports skills (https://agentskills.io/home), which extend the capabilities of the LLM (similar to tools).
+
+### How to use it
+1. Set the skills folder in setting skills_folder (if not set, the <project_root>/skills is used)
+2. Ask the agent to do something that requires a skill (or ask for details about the skills)
+
+On sending a user request to the agent, the folder is scanned and the available skills are provided to the LLM. If the LLM decides to use a particular skill, the skill details are loaded by the LLM.  
+
+
+Settings:
+- skills_folder: The folder where the skills are stored
+## Statusbar  
+
+### Requred servers
+- No servers required
+
+### How to use it 
+- View vscode-state
+- View statistics
+- Click on "llama-vscode" status bar to show llama-vscode menu
+
+
+
+https://github.com/user-attachments/assets/8f0b4575-104f-471c-be3f-f3d5b58aeee1
+
 ## Update todos tool
 
 ### Overview
@@ -1037,3 +992,23 @@ Settings:
 
 <img width="750" height="922" alt="image" src="https://github.com/user-attachments/assets/a4049df0-17da-4c6d-868f-a6bcbfa5f65c" />
 
+## Use cases  
+
+### Overview
+The use cases below describe how to prepare and use llama-vscode in some specific cases. There are already some configurations for models and env, which could be selected and used directly
+
+### Only completion used, local server started by llama-vscode
+- Use the default configuration if it works for you by selecting Env for your case
+- If you want to use a different one, here is how to prepare it:
+1. Create completion model - select llama-vscode menu -> "Completion models..." -> "Add completion model from Huggingface", find the model in Huggingface and add it.
+2. From llama-vscode menu select "Deselect/stop env and models"
+3. Create an env, which includes only this model - from llama-vscode menu -> "Env..." -> "Add Env...". A panel will be shown with buttons for selecting completion, chat, embeddings and tools models. Click "Compl" button and select the newly added model (the name is hf: model_name_from_huggingface). Test if code completion works well. Click button "Add Env" to save the environment.
+
+### Only completion used, external server
+External server could also be a local one, but it is not started by llama-vscode when the model is selected. The completion server should support the /infill endpoint, which is currently provided only by llama.cpp.
+1. Create a new model - select llama-vscode menu -> "Completion models..." -> "Add completion model...". Enter only name and endpoint.
+2. From llama-vscode menu select "Deselect/stop env and models"
+3. Create an env, which includes only this model - from llama-vscode menu -> "Env..." -> "Add Env...". A panel will be shown with buttons for selecting completion, chat, embeddings and tools models. Click "Compl" button and select the newly added model. Test if code completion works well. Click button "Add Env" to save the environment.
+
+
+
diff --git a/src/architect.ts b/src/architect.ts
index 80876ac..82e82a0 100644
--- a/src/architect.ts
+++ b/src/architect.ts
@@ -99,6 +99,36 @@ export class Architect {
         context.subscriptions.push(changeActiveTextEditorDisp)
     }
 
+    registerCommandSelectNextSuggestion = (context: vscode.ExtensionContext) => {
+        const selectNextSuggestionCommand = vscode.commands.registerCommand(
+            'extension.selectNextSuggestion',
+            async () => {
+                const editor = vscode.window.activeTextEditor;
+                if (!editor) {
+                    return;
+                }
+                await vscode.commands.executeCommand('editor.action.inlineSuggest.showNext');
+                await this.app.completion.increaseSuggestionIndex();
+            }
+        );
+        context.subscriptions.push(selectNextSuggestionCommand);
+    }
+
+    registerCommandSelectPreviousSuggestion = (context: vscode.ExtensionContext) => {
+        const selectPreviousSuggestionCommand = vscode.commands.registerCommand(
+            'extension.selectPreviousSuggestion',
+            async () => {
+                const editor = vscode.window.activeTextEditor;
+                if (!editor) {
+                    return;
+                }
+                await vscode.commands.executeCommand('editor.action.inlineSuggest.showPrevious');
+                await this.app.completion.decreaseSuggestionIndex();
+            }
+        );
+        context.subscriptions.push(selectPreviousSuggestionCommand);
+    }
+
     registerCommandAcceptFirstLine = (context: vscode.ExtensionContext) => {
         const acceptFirstLineCommand = vscode.commands.registerCommand(
             'extension.acceptFirstLine',
diff --git a/src/completion.ts b/src/completion.ts
index 749c42a..7b34e50 100644
--- a/src/completion.ts
+++ b/src/completion.ts
@@ -4,18 +4,19 @@ import vscode from "vscode";
 import {Utils} from "./utils";
 
 interface CompletionDetails {
-    completion: string;
+    completions: string[];
     position: vscode.Position;
     inputPrefix: string;
     inputSuffix: string;
     prompt: string;
+    complIndex: number;
 }
 
 export class Completion {
     private app: Application
     private isRequestInProgress = false
     isForcedNewRequest = false
-    lastCompletion: CompletionDetails = {completion: "", position: new vscode.Position(0, 0), inputPrefix: "", inputSuffix: "", prompt: ""};
+    lastCompletion: CompletionDetails = {completions: [], complIndex: 0, position: new vscode.Position(0, 0), inputPrefix: "", inputSuffix: "", prompt: ""};
 
     constructor(application: Application) {
         this.app = application;
@@ -66,8 +67,8 @@ export class Completion {
         try {
             let data: LlamaResponse | undefined
             let hashKey = this.app.lruResultCache.getHash(inputPrefix + "|" + inputSuffix + "|" + prompt)
-            let completion = this.getCachedCompletion(hashKey, inputPrefix, inputSuffix, prompt)
-            let isCachedResponse = !this.isForcedNewRequest && completion != undefined
+            let completions = this.getCachedCompletion(hashKey, inputPrefix, inputSuffix, prompt)
+            let isCachedResponse = !this.isForcedNewRequest && completions != undefined
             if (!isCachedResponse) {
                 this.isForcedNewRequest = false
                 if (token.isCancellationRequested){
@@ -78,46 +79,55 @@ export class Completion {
                 this.app.statusbar.showThinkingInfo();
 
                 data = await this.app.llamaServer.getFIMCompletion(inputPrefix, inputSuffix, prompt, this.app.extraContext.chunks, nindent)
-                if (data != undefined) completion = data.content;
-                else completion = undefined
+                if (data != undefined) completions = this.getComplFromContent(data);
+                else completions = undefined
             }
-            if (completion == undefined || completion.trim() == ""){
+            if (completions == undefined || completions.length == 0){
                 this.app.statusbar.showInfo(undefined);
                 this.isRequestInProgress = false
                 this.app.logger.addEventLog(group, "NO_SUGGESTION_RETURN", "")
                 return [];
             }
 
-            let suggestionLines = completion.split(/\r?\n/)
-            Utils.removeTrailingNewLines(suggestionLines);
-
-            if (this.shouldDiscardSuggestion(suggestionLines, document, position, linePrefix, lineSuffix)) {
+            let newCompletions: string[] = []
+            let firstComplLines: string[] = []
+            for (let compl of completions){
+                let suggestionLines = compl.split(/\r?\n/)
+                Utils.removeTrailingNewLines(suggestionLines);
+
+                if (this.shouldDiscardSuggestion(suggestionLines, document, position, linePrefix, lineSuffix)) {
+                    continue
+                } else {
+                    compl = this.updateSuggestion(suggestionLines, lineSuffix);
+                    newCompletions.push(compl);
+                    if (firstComplLines.length == 0) firstComplLines = suggestionLines;
+                }     
+            }
+            if (newCompletions.length == 0){
                 this.app.statusbar.showInfo(undefined);
-                this.isRequestInProgress = false
-                this.app.logger.addEventLog(group, "DISCARD_SUGGESTION_RETURN", "")
-                return [];
+                    this.isRequestInProgress = false
+                    this.app.logger.addEventLog(group, "DISCARD_SUGGESTION_RETURN", "")
+                    return [];
             }
 
-            completion = this.updateSuggestion(suggestionLines, lineSuffix);
-
-            if (!isCachedResponse) this.app.lruResultCache.put(hashKey, completion)
-            this.lastCompletion = this.getCompletionDetails(completion, position, inputPrefix, inputSuffix, prompt);
+            if (!isCachedResponse && newCompletions) this.app.lruResultCache.put(hashKey, newCompletions)
+            this.lastCompletion = this.getCompletionDetails(newCompletions, position, inputPrefix, inputSuffix, prompt);
 
             // Run async as not needed for the suggestion
             setTimeout(async () => {
                 if (isCachedResponse) this.app.statusbar.showCachedInfo()
                 else this.app.statusbar.showInfo(data);
                 if (!token.isCancellationRequested && lineSuffix.trim() === ""){
-                    await this.cacheFutureSuggestion(inputPrefix, inputSuffix, prompt, suggestionLines);
-                    await this.cacheFutureAcceptLineSuggestion(inputPrefix, inputSuffix, prompt, suggestionLines);
+                    await this.cacheFutureSuggestion(inputPrefix, inputSuffix, prompt, firstComplLines);
+                    await this.cacheFutureAcceptLineSuggestion(inputPrefix, inputSuffix, prompt, firstComplLines);
                 }
                 if (!token.isCancellationRequested){
                     this.app.extraContext.addFimContextChunks(position, context, document);
                 }
             }, 0);
             this.isRequestInProgress = false
-            this.app.logger.addEventLog(group, "NORMAL_RETURN", suggestionLines[0])
-            return [this.getCompletion(this.removeLeadingSpaces(completion, spacesToRemove), position)];
+            this.app.logger.addEventLog(group, "NORMAL_RETURN", firstComplLines[0])
+            return this.getCompletion(newCompletions||[], position, spacesToRemove);
         } catch (err) {
             console.error("Error fetching llama completion:", err);
             vscode.window.showInformationMessage(this.app.configuration.getUiText(`Error getting response. Please check if llama.cpp server is running.`)??"");
@@ -155,21 +165,36 @@ export class Completion {
             let promptCut = prompt.slice(i)
             let hash = this.app.lruResultCache.getHash(inputPrefix + "|" + inputSuffix + "|" + newPrompt)
             let result = this.app.lruResultCache.get(hash)
-            if (result != undefined && promptCut == result.slice(0,promptCut.length)) return result.slice(prompt.length - newPrompt.length)
+            if (result == undefined) continue
+            let completions: string[] = []
+            for (const compl of result){
+                if (compl && promptCut == compl.slice(0,promptCut.length)) {
+                    completions.push(compl.slice(prompt.length - newPrompt.length))
+                }
+            }
+            if (completions.length > 0) return completions;
         }
 
         return undefined
     }
 
-    getCompletion = (completion: string, position: vscode.Position) => {
-        return new vscode.InlineCompletionItem(
-            completion,
-            new vscode.Range(position, position)
-        );
+    getCompletion = (completions: string[], 
+        position: vscode.Position,
+        spacesToRemove: number): vscode.InlineCompletionItem[] => {
+        let completionItems: vscode.InlineCompletionItem[] = []
+        for (const completion of completions){
+            const compl: vscode.InlineCompletionItem = new vscode.InlineCompletionItem(
+                this.removeLeadingSpaces(completion, spacesToRemove),
+                new vscode.Range(position, position)
+            )
+            completionItems.push(compl);
+        }
+    
+        return completionItems;
     }
 
-    private getCompletionDetails = (completion: string, position: vscode.Position, inputPrefix: string, inputSuffix: string, prompt: string) => {
-        return { completion: completion, position: position, inputPrefix: inputPrefix, inputSuffix: inputSuffix, prompt: prompt };
+    private getCompletionDetails = (completions: string[], position: vscode.Position, inputPrefix: string, inputSuffix: string, prompt: string) => {
+        return { completions: completions,complIndex: 0, position: position, inputPrefix: inputPrefix, inputSuffix: inputSuffix, prompt: prompt };
     }
 
     // logic for discarding predictions that repeat existing text
@@ -241,14 +266,17 @@ export class Completion {
         let cached_completion = this.app.lruResultCache.get(futureHashKey)
         if (cached_completion != undefined) return;
         let futureData = await this.app.llamaServer.getFIMCompletion(futureInputPrefix, futureInputSuffix, futurePrompt, this.app.extraContext.chunks, prompt.length - prompt.trimStart().length);
-        let futureSuggestion = "";
+        let futureSuggestions = [];
         if (futureData != undefined && futureData.content != undefined && futureData.content.trim() != "") {
-            futureSuggestion = futureData.content;
-            let suggestionLines = futureSuggestion.split(/\r?\n/)
-            Utils.removeTrailingNewLines(suggestionLines);
-            futureSuggestion = suggestionLines.join('\n')
+            let suggestions = this.getComplFromContent(futureData);
+            for (let futureSuggestion of suggestions||[]){
+                let suggestionLines = futureSuggestion.split(/\r?\n/)
+                Utils.removeTrailingNewLines(suggestionLines);
+                futureSuggestion = suggestionLines.join('\n')
+                futureSuggestions.push(futureSuggestion)
+            }
             let futureHashKey = this.app.lruResultCache.getHash(futureInputPrefix + "|" + futureInputSuffix + "|" + futurePrompt);
-            this.app.lruResultCache.put(futureHashKey, futureSuggestion);
+            this.app.lruResultCache.put(futureHashKey, futureSuggestions);
         }
     }
 
@@ -262,13 +290,13 @@ export class Completion {
             let futureSuggestion = suggestionLines.slice(1).join('\n')
             let cached_completion = this.app.lruResultCache.get(futureHashKey)
             if (cached_completion != undefined) return;
-            else this.app.lruResultCache.put(futureHashKey, futureSuggestion)
+            else this.app.lruResultCache.put(futureHashKey, [futureSuggestion])
         }
     }
 
     insertNextWord = async (editor: vscode.TextEditor) => {
         // Retrieve the last inline completion item
-        const lastSuggestion = this.lastCompletion.completion;
+        const lastSuggestion = this.lastCompletion.completions[this.lastCompletion.complIndex];
         if (!lastSuggestion) {
             return;
         }
@@ -294,7 +322,7 @@ export class Completion {
 
     insertFirstLine = async (editor: vscode.TextEditor) => {
         // Retrieve the last inline completion item
-        const lastItem = this.lastCompletion.completion;
+        const lastItem = this.lastCompletion.completions[this.lastCompletion.complIndex];
         if (!lastItem) {
             return;
         }
@@ -311,4 +339,36 @@ export class Completion {
             editBuilder.insert(position, insertLine);
         });
     }
+
+    // Moves the active suggestion index one step forward, wrapping to 0 after the last stored completion.
+    increaseSuggestionIndex = async () => {
+        const details = this.lastCompletion
+        const count = details.completions.length
+        if (count > 0) details.complIndex = (details.complIndex + 1) % count
+    }
+
+    // Moves the active suggestion index one step back, wrapping from 0 to the last stored completion.
+    decreaseSuggestionIndex = async () => {
+        const details = this.lastCompletion
+        const count = details.completions.length
+        if (count <= 0) return
+        details.complIndex = details.complIndex > 0 ? details.complIndex - 1 : count - 1
+    }
+
+    // Normalizes a server reply (one object with `content`, or an array of such objects) into a list of completion texts.
+    private getComplFromContent(codeCompletions: any): string[] | undefined {
+        // Wrap a single reply in an array: a bare string would be iterated character by character by for...of callers.
+        if ("content" in codeCompletions) return [codeCompletions.content??""]
+
+        if (codeCompletions.length > 0){
+            // The Set drops duplicate suggestions while keeping first-seen order.
+            let completions: Set<string> = new Set()
+            for (const compl of codeCompletions) completions.add(compl.content??"")
+            return Array.from(completions);
+        }
+        else return [];
+    }
+    
+    // Function for printing the first n Fibonacci numbers
+    
 }
diff --git a/src/configuration.ts b/src/configuration.ts
index 77c9d79..3863a9d 100644
--- a/src/configuration.ts
+++ b/src/configuration.ts
@@ -93,6 +93,7 @@ export class Configuration {
     chats_max_tokens = 64000;
     chats_summarize_old_msgs = false;
     chats_msgs_keep = 50
+    max_parallel_completions = 3
     completion_models_list = new Array();
     embeddings_models_list = new Array();
     tools_models_list = new Array();
@@ -239,6 +240,7 @@ export class Configuration {
         this.tools_log_calls = Boolean(config.get<boolean>("tools_log_calls"));
         this.chats_max_history = Number(config.get<number>("chats_max_history"));
         this.chats_max_tokens = Number(config.get<number>("chats_max_tokens"));
+        this.max_parallel_completions = Number(config.get<number>("max_parallel_completions"));
         this.chats_summarize_old_msgs = Boolean(config.get<boolean>("chats_summarize_old_msgs"));
         this.chats_msgs_keep = Number(config.get<number>("chats_msgs_keep"));
         this.skills_folder = String(config.get<string>("skills_folder"));
diff --git a/src/extension.ts b/src/extension.ts
index 66e4608..dbbcc3e 100644
--- a/src/extension.ts
+++ b/src/extension.ts
@@ -29,6 +29,8 @@ export function activate(context: vscode.ExtensionContext) {
     app.architect.registerGenarateCommitMsg(context)
     app.architect.registerCommandKillAgent(context)
     app.architect.registerWebviewProvider(context)
+    app.architect.registerCommandSelectNextSuggestion(context)
+    app.architect.registerCommandSelectPreviousSuggestion(context)
     app.architect.init()
 }
 
diff --git a/src/llama-server.ts b/src/llama-server.ts
index 9f51262..8e3e539 100644
--- a/src/llama-server.ts
+++ b/src/llama-server.ts
@@ -113,7 +113,7 @@ export class LlamaServer {
                 input_suffix: inputSuffix,
                 input_extra: chunks,
                 prompt,
-                n_predict: 0,
+                n_predict: 0,                
                 samplers: [],
                 cache_prompt: true,
                 t_max_prompt_ms: this.app.configuration.t_max_prompt_ms,
@@ -125,9 +125,10 @@ export class LlamaServer {
         return {
             input_prefix: inputPrefix,
             input_suffix: inputSuffix,
-            input_extra: chunks,
+            input_extra: chunks, 
             prompt,
             n_predict: this.app.configuration.n_predict,
+            n_cmpl: this.app.configuration.max_parallel_completions,
             ...this.defaultRequestParams,
             ...(nindent && { n_indent: nindent }),
             t_max_prompt_ms: this.app.configuration.t_max_prompt_ms,
diff --git a/src/lru-cache.ts b/src/lru-cache.ts
index 4508e12..346e4a6 100644
--- a/src/lru-cache.ts
+++ b/src/lru-cache.ts
@@ -2,7 +2,7 @@ import * as crypto from 'crypto';
 
 export class LRUCache {
     private capacity: number;
-    private map: Map<string, string>;
+    private map: Map<string, string[]>;
 
     constructor(capacity: number) {
         if (capacity <= 0) {
@@ -18,7 +18,7 @@ export class LRUCache {
      * @param key The key to retrieve.
      * @returns The value associated with the key, or undefined if the key is not found.
      */
-    get = (key: string): string | undefined => {
+    get = (key: string): string[] | undefined => {
         if (!this.map.has(key)) {
             return undefined;
         }
@@ -37,7 +37,7 @@ export class LRUCache {
      * @param key The key to insert or update.
      * @param value The value to associate with the key.
      */
-    put = (key: string, value: string): void => {
+    put = (key: string, value: string[]): void => {
         if (this.map.has(key)) {
             // If the key exists, delete it to refresh its position
             this.map.delete(key);