From 30a5f99241baa96e790cb3dca88a5ec4ff9a96d7 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 19 Jan 2026 16:06:40 +0200
Subject: [PATCH 1/8] inst : initial

---
 autoload/llama.vim | 376 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 340 insertions(+), 36 deletions(-)
diff --git a/autoload/llama.vim b/autoload/llama.vim
index 08f9151..c1d7316 100644
--- a/autoload/llama.vim
+++ b/autoload/llama.vim
@@ -1,13 +1,25 @@
 " vim: ts=4 sts=4 expandtab
 " colors (adjust to your liking)
-highlight default llama_hl_hint guifg=#ff772f ctermfg=202
-highlight default llama_hl_info guifg=#77ff2f ctermfg=119
+
+" fim colors
+highlight default llama_hl_fim_hint guifg=#ff772f ctermfg=202
+highlight default llama_hl_fim_info guifg=#77ff2f ctermfg=119
+
+ " instruct colors for selected block
+ highlight default llama_hl_inst_src guibg=#333333 ctermbg=236
+
+ " virtual text colors for instructions
+ highlight default llama_hl_inst_virt_proc  guifg=#77ff2f ctermfg=119
+ highlight default llama_hl_inst_virt_gen   guifg=#77ff2f ctermfg=119
+ highlight default llama_hl_inst_virt_ready guifg=#ff772f ctermfg=202
 
 " general parameters:
 "
-"   endpoint:         llama.cpp server endpoint
+"   endpoint_fim:     llama.cpp server endpoint for FIM completion
+"   endpoint_inst:    llama.cpp server endpoint for instruction completion
+"   model_fim:        model name in case when multiple models are loaded (optional)
+"   model_inst:       instruction model name (optional)
 "   api_key:          llama.cpp server api key (optional)
-"   model:            model name in case when multiple models are loaded (optional)
 "   n_prefix:         number of lines before the cursor location to include in the local prefix
 "   n_suffix:         number of lines after  the cursor location to include in the local suffix
 "   n_predict:        max number of tokens to predict
@@ -44,11 +56,14 @@ highlight default llama_hl_info guifg=#77ff2f ctermfg=119
 "   keymap_fim_accept_line: keymap to accept line suggestion, default: <S-Tab>
 "   keymap_fim_accept_word: keymap to accept word suggestion, default: <C-B>
 "   keymap_debug_toggle:    keymap to toggle the debug pane,  default: null
+"   keymap_inst_trigger:    keymap to trigger the instruction command, default: null
 "
 let s:default_config = {
-    \ 'endpoint':               'http://127.0.0.1:8012/infill',
+    \ 'endpoint_fim':           'http://127.0.0.1:8012/infill',
+    \ 'endpoint_inst':          'http://127.0.0.1:8012/v1/chat/completions',
+    \ 'model_fim':              '',
+    \ 'model_inst':             '',
     \ 'api_key':                '',
-    \ 'model':                  '',
     \ 'n_prefix':               256,
     \ 'n_suffix':               64,
     \ 'n_predict':              128,
@@ -68,13 +83,17 @@ let s:default_config = {
     \ 'keymap_fim_accept_line': "<S-Tab>",
     \ 'keymap_fim_accept_word': "<C-B>",
     \ 'keymap_debug_toggle':    v:null,
+    \ 'keymap_inst_trigger':    v:null,
     \ 'enable_at_startup':      v:true,
+    \ 'timeout_inst':           30000,
     \ }
 
 let llama_config = get(g:, 'llama_config', s:default_config)
 
 " rename deprecated keys in `llama_config`.
 let s:renames = {
+      \ 'endpoint'           : 'endpoint_fim',
+      \ 'model'              : 'model_fim',
       \ 'keymap_trigger'     : 'keymap_fim_trigger',
       \ 'keymap_accept_full' : 'keymap_fim_accept_full',
       \ 'keymap_accept_line' : 'keymap_fim_accept_line',
@@ -170,6 +189,9 @@ function! llama#disable()
     exe "silent! iunmap <buffer> " .. g:llama_config.keymap_fim_accept_word
 
     exe "silent!  unmap          " .. g:llama_config.keymap_debug_toggle
+    exe "silent! vunmap          " .. g:llama_config.keymap_inst_trigger
+    exe "silent!  unmap          <Tab>"
+    exe "silent!  unmap          <Esc>"
 
     let s:llama_enabled = v:false
 
@@ -200,6 +222,8 @@ function! llama#setup()
     command! LlamaToggle         call llama#toggle()
     command! LlamaToggleAutoFim  call llama#toggle_auto_fim()
 
+    command! -range=% LlamaInstruct call llama#inst(<line1>, <line2>)
+
     call llama#debug_setup()
 endfunction
 
@@ -221,14 +245,17 @@ function! llama#init()
     let s:ring_queued = [] " chunks that are queued to be sent for processing
     let s:ring_n_evict = 0
 
-    let s:hint_shown = v:false
+    let s:fim_hint_shown = v:false
     let s:pos_y_pick = -9999 " last y where we picked a chunk
     let s:indent_last = -1   " last indentation level that was accepted (TODO: this might be buggy)
 
     let s:timer_fim = -1
     let s:t_last_move = reltime() " last time the cursor moved
 
-    let s:current_job = v:null
+    let s:current_job_fim  = v:null
+
+    let s:inst_requests = []
+    let s:inst_request_id = 0
 
     let s:ghost_text_nvim = exists('*nvim_buf_get_mark')
     let s:ghost_text_vim = has('textprop')
@@ -237,8 +264,8 @@ function! llama#init()
         if version < 901
             echom 'Warning: llama.vim requires version 901 or greater. Current version: ' . version
         endif
-        let s:hlgroup_hint = 'llama_hl_hint'
-        let s:hlgroup_info = 'llama_hl_info'
+        let s:hlgroup_hint = 'llama_hl_fim_hint'
+        let s:hlgroup_info = 'llama_hl_fim_info'
 
         if empty(prop_type_get(s:hlgroup_hint))
             call prop_type_add(s:hlgroup_hint, {'highlight': s:hlgroup_hint})
@@ -288,6 +315,10 @@ function! llama#enable()
     " setup keymaps
     exe "autocmd InsertEnter * inoremap <buffer> <expr> <silent> " . g:llama_config.keymap_fim_trigger . " llama#fim_inline(v:false, v:false)"
     exe "nnoremap <silent> " .. g:llama_config.keymap_debug_toggle .. " :call llama#debug_toggle()<CR>"
+    exe "vnoremap <silent> " .. g:llama_config.keymap_inst_trigger .. " :LlamaInstruct<CR>"
+
+    exe "nnoremap <silent> <Tab> :call llama#inst_accept()<CR>"
+    exe "nnoremap <silent> <Esc> :call llama#inst_cancel()<CR>"
 
     call llama#setup_autocmds()
 
@@ -466,13 +497,13 @@ function! s:ring_update()
         \ "--silent",
         \ "--no-buffer",
         \ "--request", "POST",
-        \ "--url", g:llama_config.endpoint,
+        \ "--url", g:llama_config.endpoint_fim,
         \ "--header", "Content-Type: application/json",
         \ "--data", "@-",
         \ ]
 
-    if exists ("g:llama_config.model") && len("g:llama_config.model") > 0
-        let l:request['model'] = g:llama_config.model
+    if exists ("g:llama_config.model_fim") && len("g:llama_config.model_fim") > 0
+        let l:request['model'] = g:llama_config.model_fim
     end
 
     if exists ("g:llama_config.api_key") && len("g:llama_config.api_key") > 0
@@ -493,6 +524,10 @@ function! s:ring_update()
     endif
 endfunction
 
+" =====================================
+" Fill-in-Middle (FIM) completion
+" =====================================
+
 " get the local context at a specified position
 " a:prev can optionally contain a previous completion for this position
 "   in such cases, create the local context as if the completion was already inserted
@@ -578,7 +613,7 @@ function! llama#fim_inline(is_auto, use_cache) abort
     endif
 
     " we already have a suggestion displayed - hide it
-    if s:hint_shown && !a:is_auto
+    if s:fim_hint_shown && !a:is_auto
         call llama#fim_hide()
         return ''
     endif
@@ -603,7 +638,7 @@ function! llama#fim(pos_x, pos_y, is_auto, prev, use_cache) abort
     endif
 
     " avoid sending repeated requests too fast
-    if s:current_job != v:null
+    if s:current_job_fim != v:null
         if s:timer_fim != -1
             call timer_stop(s:timer_fim)
             let s:timer_fim = -1
@@ -613,7 +648,7 @@ function! llama#fim(pos_x, pos_y, is_auto, prev, use_cache) abort
         return
     endif
 
-    "if s:hint_shown && empty(a:prev)
+    "if s:fim_hint_shown && empty(a:prev)
     "    return
     "endif
 
@@ -729,44 +764,44 @@ function! llama#fim(pos_x, pos_y, is_auto, prev, use_cache) abort
         \ "--silent",
         \ "--no-buffer",
         \ "--request", "POST",
-        \ "--url", g:llama_config.endpoint,
+        \ "--url", g:llama_config.endpoint_fim,
         \ "--header", "Content-Type: application/json",
         \ "--data", "@-",
         \ ]
 
-    if exists ("g:llama_config.model") && len("g:llama_config.model") > 0
-        let l:request['model'] = g:llama_config.model
+    if exists ("g:llama_config.model_fim") && len("g:llama_config.model_fim") > 0
+        let l:request['model'] = g:llama_config.model_fim
     end
 
     if exists ("g:llama_config.api_key") && len("g:llama_config.api_key") > 0
         call extend(l:curl_command, ['--header', 'Authorization: Bearer ' .. g:llama_config.api_key])
     endif
 
-    if s:current_job != v:null
+    if s:current_job_fim != v:null
         if s:ghost_text_nvim
-            call jobstop(s:current_job)
+            call jobstop(s:current_job_fim)
         elseif s:ghost_text_vim
-            call job_stop(s:current_job)
+            call job_stop(s:current_job_fim)
         endif
     endif
 
     " send the request asynchronously
     let l:request_json = json_encode(l:request)
     if s:ghost_text_nvim
-        let s:current_job = jobstart(l:curl_command, {
+        let s:current_job_fim = jobstart(l:curl_command, {
             \ 'on_stdout': function('s:fim_on_response', [l:hashes]),
             \ 'on_exit':   function('s:fim_on_exit'),
             \ 'stdout_buffered': v:true
             \ })
-        call chansend(s:current_job, l:request_json)
-        call chanclose(s:current_job, 'stdin')
+        call chansend(s:current_job_fim, l:request_json)
+        call chanclose(s:current_job_fim, 'stdin')
     elseif s:ghost_text_vim
-        let s:current_job = job_start(l:curl_command, {
+        let s:current_job_fim = job_start(l:curl_command, {
             \ 'out_cb':    function('s:fim_on_response', [l:hashes]),
             \ 'exit_cb':   function('s:fim_on_exit')
             \ })
 
-        let channel = job_getchannel(s:current_job)
+        let channel = job_getchannel(s:current_job_fim)
         call ch_sendraw(channel, l:request_json)
         call ch_close_in(channel)
     endif
@@ -819,7 +854,7 @@ function! s:fim_on_response(hashes, job_id, data, event = v:null)
     endfor
 
     " if nothing is currently displayed - show the hint directly
-    if !s:hint_shown || !s:fim_data['can_accept']
+    if !s:fim_hint_shown || !s:fim_data['can_accept']
         " log only non-speculative fims for now
         call llama#debug_log('fim_on_response', get(json_decode(l:raw), 'content', ''))
 
@@ -835,7 +870,7 @@ function! s:fim_on_exit(job_id, exit_code, event = v:null)
         echom "Job failed with exit code: " . a:exit_code
     endif
 
-    let s:current_job = v:null
+    let s:current_job_fim = v:null
 endfunction
 
 function! s:on_move()
@@ -910,7 +945,7 @@ function! s:fim_try_hint(pos_x, pos_y)
         call s:fim_render(l:pos_x, l:pos_y, l:raw)
 
         " run async speculative FIM in the background for this position
-        if s:hint_shown
+        if s:fim_hint_shown
             call llama#fim(l:pos_x, l:pos_y, v:true, s:fim_data['content'], v:true)
         endif
     endif
@@ -1090,12 +1125,12 @@ function! s:fim_render(pos_x, pos_y, data)
     " display the suggestion and append the info to the end of the first line
     if s:ghost_text_nvim
         call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, l:pos_y - 1, l:pos_x, {
-            \ 'virt_text': [[l:content[0], 'llama_hl_hint'], [l:info, 'llama_hl_info']],
+            \ 'virt_text': [[l:content[0], 'llama_hl_fim_hint'], [l:info, 'llama_hl_fim_info']],
             \ 'virt_text_pos': l:content == [""] ? 'eol' : 'overlay'
             \ })
 
         call nvim_buf_set_extmark(l:bufnr, l:id_vt_fim, l:pos_y - 1, 0, {
-            \ 'virt_lines': map(l:content[1:], {idx, val -> [[val, 'llama_hl_hint']]})
+            \ 'virt_lines': map(l:content[1:], {idx, val -> [[val, 'llama_hl_fim_hint']]})
             \ })
     elseif s:ghost_text_vim
         let l:full_suffix = l:content[0]
@@ -1128,7 +1163,7 @@ function! s:fim_render(pos_x, pos_y, data)
     exe 'inoremap <buffer> ' . g:llama_config.keymap_fim_accept_line . ' <C-O>:call llama#fim_accept(''line'')<CR>'
     exe 'inoremap <buffer> ' . g:llama_config.keymap_fim_accept_word . ' <C-O>:call llama#fim_accept(''word'')<CR>'
 
-    let s:hint_shown = v:true
+    let s:fim_hint_shown = v:true
 
     let s:fim_data['pos_x']  = l:pos_x
     let s:fim_data['pos_y']  = l:pos_y
@@ -1189,7 +1224,7 @@ function! llama#fim_accept(accept_type)
 endfunction
 
 function! llama#fim_hide()
-    let s:hint_shown = v:false
+    let s:fim_hint_shown = v:false
 
     " clear the virtual text
     let l:bufnr = bufnr('%')
@@ -1208,10 +1243,279 @@ function! llama#fim_hide()
     exe 'silent! iunmap <buffer> ' . g:llama_config.keymap_fim_accept_word
 endfunction
 
-function! llama#is_hint_shown()
-    return s:hint_shown
+" =====================================
+" Instruct-based editing
+" =====================================
+
+function! llama#inst(start, end)
+    let l:lines = getline(a:start, a:end)
+    let l:inst = input('Instruction: ')
+    if empty(l:inst)
+        return
+    endif
+    call llama#inst_send(a:start, a:end, l:lines, l:inst, function('s:inst_callback', [a:start, a:end]))
 endfunction
 
+function! llama#inst_send(start, end, lines, inst, callback)
+    " register an instruct request for lines [a:start, a:end], mark it in the buffer and POST it with curl
+    let l:request_id = s:inst_request_id
+    let s:inst_request_id += 1
+
+    let l:end = min([a:end, line('$')])
+
+    let l:req = {
+        \ 'id': l:request_id,
+        \ 'range': [a:start, l:end],
+        \ 'status': 'proc',
+        \ 'result': '',
+        \ 'instruction': a:inst,
+        \ 'job': v:null,
+        \ 'extmark': -1,
+        \ 'extmark_virt': -1,
+        \ }
+
+    call add(s:inst_requests, l:req)
+
+     " highlight the selected lines (only implemented for Neovim so far)
+     let l:bufnr = bufnr('%')
+     if s:ghost_text_nvim
+         let l:ns = nvim_create_namespace('vt_inst')
+         let l:req.extmark = nvim_buf_set_extmark(l:bufnr, l:ns, a:start - 1, 0, {
+             \ 'end_row': l:end - 1,
+             \ 'end_col': len(getline(l:end)),
+             \ 'hl_group': 'llama_hl_inst_src'
+             \ })
+     elseif s:ghost_text_vim
+         " TODO: implement classic Vim support
+     endif
+
+     " Initialize virtual text with processing status
+     call s:inst_update(l:request_id, 'proc')
+
+    " Build the payload
+    let l:system_message = {'role': 'system', 'content': 'You are a code-editing assistant. Return ONLY the result after applying the instruction to the selection.'}
+
+    " extra context
+    " TODO: deduplicate
+    let l:extra_context = []
+    for l:chunk in s:ring_chunks
+        call add(l:extra_context, {
+            \ 'text':     l:chunk.str,
+            \ 'time':     l:chunk.time,
+            \ 'filename': l:chunk.filename
+            \ })
+    endfor
+
+    let l:user_content  = ""
+    let l:user_content .= "--- context ----------------------------------------------------\n"
+    let l:user_content .= join(l:extra_context, "\n") . "\n"
+    let l:user_content .= "--- instruction ------------------------------------------------\n"
+    let l:user_content .= a:inst . "\n"
+    let l:user_content .= "--- selection --------------------------------------------------\n"
+    let l:user_content .= join(a:lines, "\n") . "\n"
+    let l:user_content .= "--- result -----------------------------------------------------\n"
+
+    let l:user_message = {'role': 'user', 'content': l:user_content}
+
+    let l:messages = [l:system_message, l:user_message]
+
+    let l:request = {
+        \ 'model': g:llama_config.model_inst,
+        \ 'messages': l:messages,
+        \ 'temperature': 0.0,
+        \ 'stream': v:false,
+        \ }
+
+    call llama#debug_log('inst_send | ' . a:inst, l:user_content)
+
+    let l:curl_command = [
+        \ "curl",
+        \ "--silent",
+        \ "--no-buffer",
+        \ "--request", "POST",
+        \ "--url", g:llama_config.endpoint_inst,
+        \ "--header", "Content-Type: application/json",
+        \ "--data", "@-",
+        \ ]
+    " add the auth header only when an api key is set (test the value, not the quoted name)
+    if exists("g:llama_config.api_key") && len(g:llama_config.api_key) > 0
+        call extend(l:curl_command, ['--header', 'Authorization: Bearer ' .. g:llama_config.api_key])
+    endif
+
+    let l:request_json = json_encode(l:request)
+
+    if s:ghost_text_nvim
+        let l:req.job = jobstart(l:curl_command, {
+            \ 'on_stdout': function('s:inst_on_response', [l:request_id, a:callback]),
+            \ 'on_exit':   function('s:inst_on_exit',     [l:request_id, a:callback]),
+            \ 'stdout_buffered': v:true
+            \ })
+        call chansend(l:req.job, l:request_json)
+        call chanclose(l:req.job, 'stdin')
+    elseif s:ghost_text_vim
+        let l:req.job = job_start(l:curl_command, {
+            \ 'out_cb':  function('s:inst_on_response', [l:request_id, a:callback]),
+            \ 'exit_cb': function('s:inst_on_exit',     [l:request_id, a:callback])
+            \ })
+
+        let channel = job_getchannel(l:req.job)
+        call ch_sendraw(channel, l:request_json)
+        call ch_close_in(channel)
+    endif
+endfunction
+
+function! s:inst_update(id, status)
+    for l:req in s:inst_requests
+        if l:req.id == a:id
+            let l:req.status = a:status
+             if s:ghost_text_nvim
+                 let l:ns = nvim_create_namespace('vt_inst')
+                 " Clear existing virtual text extmark if it exists
+                 if l:req.extmark_virt != -1
+                     call nvim_buf_del_extmark(bufnr('%'), l:ns, l:req.extmark_virt)
+                     let l:req.extmark_virt = -1
+                 endif
+                 " Create virtual text extmark
+                 let l:virt_lines = []
+                 let l:separator = '====================================='
+                 if a:status == 'ready'
+                     let l:result_lines = split(l:req.result, "\n")
+                     let l:virt_lines = [[[l:separator, 'llama_hl_inst_virt_ready']]] + map(l:result_lines, {idx, val -> [[val, 'llama_hl_inst_virt_ready']]})
+                 elseif a:status == 'proc'
+                     let l:instruction_truncated = l:req.instruction
+                     if len(l:instruction_truncated) > 64
+                         let l:instruction_truncated = l:instruction_truncated[:63] . '...'
+                     endif
+                     let l:virt_lines = [
+                         \ [[l:separator, 'llama_hl_inst_virt_proc']],
+                         \ [[printf('(%s) Processing ...', g:llama_config.endpoint_inst), 'llama_hl_inst_virt_proc']],
+                         \ [['Instruction: ' . l:instruction_truncated, 'llama_hl_inst_virt_proc']]
+                         \ ]
+                 elseif a:status == 'gen'
+                     let l:virt_lines = [
+                         \ [[l:separator, 'llama_hl_inst_virt_gen']],
+                         \ [[printf('(%s) Generating ...', g:llama_config.endpoint_inst), 'llama_hl_inst_virt_gen']]
+                         \ ]
+                 endif
+
+                 if !empty(l:virt_lines)
+                     let l:req.extmark_virt = nvim_buf_set_extmark(bufnr('%'), l:ns, l:req.range[1] - 1, 0, {
+                         \ 'virt_lines': l:virt_lines
+                         \ })
+                 endif
+            elseif s:ghost_text_vim
+                " TODO: implement classic Vim support
+            endif
+            break
+        endif
+    endfor
+endfunction
+
+function! s:inst_on_response(id, callback, job_id, data, event = v:null)
+    call s:inst_update(a:id, 'gen')
+    " stdout handler: extract the reply text for request a:id and store it in its 'result' field
+    if s:ghost_text_nvim
+        let l:raw = join(a:data, "\n")
+    elseif s:ghost_text_vim
+        let l:raw = a:data
+    endif
+
+    if len(l:raw) == 0
+        return
+    endif
+    " expected reply shape: choices[0].message.content
+    let l:content = ''
+    try
+        let l:response = json_decode(l:raw)
+        let l:content = get(l:response, 'choices', [{}])[0].message.content
+    catch
+        " decoding failed or the shape did not match - fall back to the raw text
+        let l:content = l:raw
+    endtry
+
+    " store the text on the request with a matching id (nothing happens if none matches)
+    for l:req in s:inst_requests
+        if l:req.id == a:id
+            let l:req.result = l:content
+            break
+        endif
+    endfor
+endfunction
+
+function! s:inst_on_exit(id, callback, job_id, exit_code, event = v:null)
+    if a:exit_code != 0
+        echohl ErrorMsg
+        echo "Instruct job failed with exit code: " . a:exit_code
+        echohl None
+        call s:inst_remove(a:id)
+        return
+    endif
+    " exit code 0: switch the request to 'ready' so s:inst_update renders the stored result
+    call s:inst_update(a:id, 'ready')
+endfunction
+
+function! s:inst_remove(id)
+    for i in range(len(s:inst_requests))
+        if s:inst_requests[i].id == a:id
+            " Clear extmarks
+            if s:ghost_text_nvim
+                let l:ns = nvim_create_namespace('vt_inst')
+                call nvim_buf_del_extmark(bufnr('%'), l:ns, s:inst_requests[i].extmark)
+                if s:inst_requests[i].extmark_virt != -1
+                    call nvim_buf_del_extmark(bufnr('%'), l:ns, s:inst_requests[i].extmark_virt)
+                endif
+            endif
+            call remove(s:inst_requests, i)
+            break
+        endif
+    endfor
+endfunction
+
+function! s:inst_callback(start, end, result)
+    let l:result_lines = split(a:result, "\n", 1)
+    " Remove trailing empty lines
+    while len(l:result_lines) > 0 && l:result_lines[-1] == ""
+        call remove(l:result_lines, -1)
+    endwhile
+
+    let l:num_result = len(l:result_lines)
+    let l:num_original = a:end - a:start + 1
+
+    " Delete the original range
+    call deletebufline(bufnr('%'), a:start, a:end)
+
+    " Insert the new lines
+    call append(a:start - 1, l:result_lines)
+endfunction
+
+function! llama#inst_accept()
+    let l:line = line('.')
+    for l:req in s:inst_requests
+        if l:req.status == 'ready' && l:line >= l:req.range[0] && l:line <= l:req.range[1]
+            call s:inst_callback(l:req.range[0], l:req.range[1], l:req.result)
+            call s:inst_remove(l:req.id)
+            return
+        endif
+    endfor
+    " If not in range, do normal Tab
+    call feedkeys("\<Tab>", 'n')
+endfunction
+
+function! llama#inst_cancel()
+    let l:line = line('.')
+    for l:req in s:inst_requests
+        if l:line >= l:req.range[0] && l:line <= l:req.range[1]
+            call s:inst_remove(l:req.id)
+            return
+        endif
+    endfor
+    " If not in range, do normal Esc (nothing)
+endfunction
+
+" =====================================
+" Debug helpers
+" =====================================
+
 function! llama#debug_log(msg, ...) abort
     return call('llama_debug#log', [a:msg] + a:000)
 endfunction

From 3d7413205e9e925487dddd097b0b1e0d4598a676 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 19 Jan 2026 18:27:53 +0200
Subject: [PATCH 2/8] cont : fix extmark position updates

---
 autoload/llama.vim | 145 ++++++++++++++++++++++++++++-----------------
 1 file changed, 89 insertions(+), 56 deletions(-)

diff --git a/autoload/llama.vim b/autoload/llama.vim
index c1d7316..4e0f473 100644
--- a/autoload/llama.vim
+++ b/autoload/llama.vim
@@ -82,10 +82,12 @@ let s:default_config = {
     \ 'keymap_fim_accept_full': "<Tab>",
     \ 'keymap_fim_accept_line': "<S-Tab>",
     \ 'keymap_fim_accept_word': "<C-B>",
-    \ 'keymap_debug_toggle':    v:null,
     \ 'keymap_inst_trigger':    v:null,
+    \ 'keymap_inst_accept':     "<Tab>",
+    \ 'keymap_inst_cancel':     "<Esc>",
+    \ 'keymap_debug_toggle':    v:null,
     \ 'enable_at_startup':      v:true,
-    \ 'timeout_inst':           30000,
+    \ 'timeout_inst':           60000,
     \ }
 
 let llama_config = get(g:, 'llama_config', s:default_config)
@@ -190,8 +192,8 @@ function! llama#disable()
 
     exe "silent!  unmap          " .. g:llama_config.keymap_debug_toggle
     exe "silent! vunmap          " .. g:llama_config.keymap_inst_trigger
-    exe "silent!  unmap          <Tab>"
-    exe "silent!  unmap          <Esc>"
+    exe "silent!  unmap          " .. g:llama_config.keymap_inst_accept
+    exe "silent!  unmap          " .. g:llama_config.keymap_inst_cancel
 
     let s:llama_enabled = v:false
 
@@ -315,10 +317,10 @@ function! llama#enable()
     " setup keymaps
     exe "autocmd InsertEnter * inoremap <buffer> <expr> <silent> " . g:llama_config.keymap_fim_trigger . " llama#fim_inline(v:false, v:false)"
     exe "nnoremap <silent> " .. g:llama_config.keymap_debug_toggle .. " :call llama#debug_toggle()<CR>"
-    exe "vnoremap <silent> " .. g:llama_config.keymap_inst_trigger .. " :LlamaInstruct<CR>"
 
-    exe "nnoremap <silent> <Tab> :call llama#inst_accept()<CR>"
-    exe "nnoremap <silent> <Esc> :call llama#inst_cancel()<CR>"
+    exe "vnoremap <silent> " .. g:llama_config.keymap_inst_trigger .. " :LlamaInstruct<CR>"
+    exe "nnoremap <silent> " .. g:llama_config.keymap_inst_accept  .. " :call llama#inst_accept()<CR>"
+    exe "nnoremap <silent> " .. g:llama_config.keymap_inst_cancel  .. " :call llama#inst_cancel()<CR>"
 
     call llama#setup_autocmds()
 
@@ -484,8 +486,8 @@ function! s:ring_update()
         \ 'prompt':           "",
         \ 'n_predict':        0,
         \ 'temperature':      0.0,
-        \ 'stream':           v:false,
         \ 'samplers':         [],
+        \ 'stream':           v:false,
         \ 'cache_prompt':     v:true,
         \ 't_max_prompt_ms':  1,
         \ 't_max_predict_ms': 1,
@@ -739,8 +741,8 @@ function! llama#fim(pos_x, pos_y, is_auto, prev, use_cache) abort
         \ 'n_indent':         l:indent,
         \ 'top_k':            40,
         \ 'top_p':            0.90,
-        \ 'stream':           v:false,
         \ 'samplers':         ["top_k", "top_p", "infill"],
+        \ 'stream':           v:false,
         \ 'cache_prompt':     v:true,
         \ 't_max_prompt_ms':  g:llama_config.t_max_prompt_ms,
         \ 't_max_predict_ms': l:t_max_predict_ms,
@@ -1253,6 +1255,7 @@ function! llama#inst(start, end)
     if empty(l:inst)
         return
     endif
+
     call llama#inst_send(a:start, a:end, l:lines, l:inst, function('s:inst_callback', [a:start, a:end]))
 endfunction
 
@@ -1268,7 +1271,7 @@ function! llama#inst_send(start, end, lines, inst, callback)
         \ 'range': [a:start, l:end],
         \ 'status': 'proc',
         \ 'result': '',
-        \ 'instruction': a:inst,
+        \ 'inst': a:inst,
         \ 'job': v:null,
         \ 'extmark': -1,
         \ 'extmark_virt': -1,
@@ -1293,7 +1296,10 @@ function! llama#inst_send(start, end, lines, inst, callback)
      call s:inst_update(l:request_id, 'proc')
 
     " Build the payload
-    let l:system_message = {'role': 'system', 'content': 'You are a code-editing assistant. Return ONLY the result after applying the instruction to the selection.'}
+    let l:system_message = {
+        \ 'role': 'system',
+        \ 'content': 'You are a code-editing assistant. Return ONLY the result after applying the instruction to the selection.'
+        \ }
 
     " extra context
     " TODO: deduplicate
@@ -1320,10 +1326,12 @@ function! llama#inst_send(start, end, lines, inst, callback)
     let l:messages = [l:system_message, l:user_message]
 
     let l:request = {
-        \ 'model': g:llama_config.model_inst,
-        \ 'messages': l:messages,
-        \ 'temperature': 0.0,
-        \ 'stream': v:false,
+        \ 'model':        g:llama_config.model_inst,
+        \ 'messages':     l:messages,
+        \ 'min_p':        0.1,
+        \ 'samplers':     ["min_p"],
+        \ 'stream':       v:false,
+        \ 'cache_prompt': v:true,
         \ }
 
     call llama#debug_log('inst_send | ' . a:inst, l:user_content)
@@ -1364,45 +1372,63 @@ function! llama#inst_send(start, end, lines, inst, callback)
     endif
 endfunction
 
+function! llama#inst_update_pos(req)
+    " re-sync a:req.range with the current line of the request's source extmark (Neovim only)
+    if !s:ghost_text_nvim
+        return
+    endif
+    let l:ns  = nvim_create_namespace('vt_inst')
+    let l:pos = nvim_buf_get_extmark_by_id(bufnr('%'), l:ns, a:req.extmark, {})
+    if empty(l:pos)
+        return
+    endif
+    let a:req.range[1] = l:pos[0] + 1 + a:req.range[1] - a:req.range[0]
+    let a:req.range[0] = l:pos[0] + 1
+endfunction
+
 function! s:inst_update(id, status)
     for l:req in s:inst_requests
         if l:req.id == a:id
             let l:req.status = a:status
-             if s:ghost_text_nvim
-                 let l:ns = nvim_create_namespace('vt_inst')
-                 " Clear existing virtual text extmark if it exists
-                 if l:req.extmark_virt != -1
-                     call nvim_buf_del_extmark(bufnr('%'), l:ns, l:req.extmark_virt)
-                     let l:req.extmark_virt = -1
-                 endif
-                 " Create virtual text extmark
-                 let l:virt_lines = []
-                 let l:separator = '====================================='
-                 if a:status == 'ready'
-                     let l:result_lines = split(l:req.result, "\n")
-                     let l:virt_lines = [[[l:separator, 'llama_hl_inst_virt_ready']]] + map(l:result_lines, {idx, val -> [[val, 'llama_hl_inst_virt_ready']]})
-                 elseif a:status == 'proc'
-                     let l:instruction_truncated = l:req.instruction
-                     if len(l:instruction_truncated) > 64
-                         let l:instruction_truncated = l:instruction_truncated[:63] . '...'
-                     endif
-                     let l:virt_lines = [
-                         \ [[l:separator, 'llama_hl_inst_virt_proc']],
-                         \ [[printf('(%s) Processing ...', g:llama_config.endpoint_inst), 'llama_hl_inst_virt_proc']],
-                         \ [['Instruction: ' . l:instruction_truncated, 'llama_hl_inst_virt_proc']]
-                         \ ]
-                 elseif a:status == 'gen'
-                     let l:virt_lines = [
-                         \ [[l:separator, 'llama_hl_inst_virt_gen']],
-                         \ [[printf('(%s) Generating ...', g:llama_config.endpoint_inst), 'llama_hl_inst_virt_gen']]
-                         \ ]
-                 endif
-
-                 if !empty(l:virt_lines)
-                     let l:req.extmark_virt = nvim_buf_set_extmark(bufnr('%'), l:ns, l:req.range[1] - 1, 0, {
-                         \ 'virt_lines': l:virt_lines
-                         \ })
-                 endif
+            call llama#inst_update_pos(l:req)
+
+            if s:ghost_text_nvim
+                let l:ns = nvim_create_namespace('vt_inst')
+
+                if l:req.extmark_virt != -1
+                    call nvim_buf_del_extmark(bufnr('%'), l:ns, l:req.extmark_virt)
+                    let l:req.extmark_virt = -1
+                endif
+
+                let l:inst_trunc = l:req.inst
+                if len(l:inst_trunc) > 64
+                    let l:inst_trunc = l:inst_trunc[:63] . '...'
+                endif
+
+                let l:virt_lines = []
+                let l:separator = '====================================='
+                if a:status == 'ready'
+                    let l:result_lines = split(l:req.result, "\n")
+                    let l:virt_lines = [[[l:separator, 'llama_hl_inst_virt_ready']]] + map(l:result_lines, {idx, val -> [[val, 'llama_hl_inst_virt_ready']]})
+                elseif a:status == 'proc'
+                    let l:virt_lines = [
+                        \ [[l:separator, 'llama_hl_inst_virt_proc']],
+                        \ [[printf('(%s) Processing ...', g:llama_config.endpoint_inst), 'llama_hl_inst_virt_proc']],
+                        \ [['Instruction: ' . l:inst_trunc, 'llama_hl_inst_virt_proc']]
+                        \ ]
+                elseif a:status == 'gen'
+                    let l:virt_lines = [
+                        \ [[l:separator, 'llama_hl_inst_virt_gen']],
+                        \ [[printf('(%s) Generating ...', g:llama_config.endpoint_inst), 'llama_hl_inst_virt_gen']],
+                        \ [['Instruction: ' . l:inst_trunc, 'llama_hl_inst_virt_gen']]
+                        \ ]
+                endif
+
+                if !empty(l:virt_lines)
+                    let l:req.extmark_virt = nvim_buf_set_extmark(bufnr('%'), l:ns, l:req.range[1] - 1, 0, {
+                        \ 'virt_lines': l:virt_lines
+                        \ })
+                endif
             elseif s:ghost_text_vim
                 " TODO: implement classic Vim support
             endif
@@ -1457,7 +1483,6 @@ endfunction
 function! s:inst_remove(id)
     for i in range(len(s:inst_requests))
         if s:inst_requests[i].id == a:id
-            " Clear extmarks
             if s:ghost_text_nvim
                 let l:ns = nvim_create_namespace('vt_inst')
                 call nvim_buf_del_extmark(bufnr('%'), l:ns, s:inst_requests[i].extmark)
@@ -1473,6 +1498,7 @@ endfunction
 
 function! s:inst_callback(start, end, result)
     let l:result_lines = split(a:result, "\n", 1)
+
     " Remove trailing empty lines
     while len(l:result_lines) > 0 && l:result_lines[-1] == ""
         call remove(l:result_lines, -1)
@@ -1490,20 +1516,27 @@ endfunction
 
 function! llama#inst_accept()
     let l:line = line('.')
+
     for l:req in s:inst_requests
-        if l:req.status == 'ready' && l:line >= l:req.range[0] && l:line <= l:req.range[1]
-            call s:inst_callback(l:req.range[0], l:req.range[1], l:req.result)
-            call s:inst_remove(l:req.id)
-            return
+        if l:req.status ==# 'ready'
+            call llama#inst_update_pos(l:req)
+
+            if l:line >= l:req.range[0] && l:line <= l:req.range[1]
+                call s:inst_callback(l:req.range[0], l:req.range[1], l:req.result)
+                call s:inst_remove(l:req.id)
+                return
+            endif
         endif
     endfor
-    " If not in range, do normal Tab
+
     call feedkeys("\<Tab>", 'n')
 endfunction
 
 function! llama#inst_cancel()
     let l:line = line('.')
     for l:req in s:inst_requests
+        call llama#inst_update_pos(l:req)
+
         if l:line >= l:req.range[0] && l:line <= l:req.range[1]
             call s:inst_remove(l:req.id)
             return

From 2c887282398f55cdfa8c5e12a783aa75bfa4faab Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 19 Jan 2026 19:02:13 +0200
Subject: [PATCH 3/8] cont : update docs

---
 README.md          |  7 ++++++-
 autoload/llama.vim |  3 +--
 doc/llama.txt      | 19 ++++++++++++++-----
 3 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 4e6ded8..ae24db7 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,7 @@ Local LLM-assisted text completion.
 - Toggle the suggestion manually by pressing `Ctrl+F`
 - Accept a suggestion with `Tab`
 - Accept the first line of a suggestion with `Shift+Tab`
+- Instruction-based editing (instruct mode) with `Ctrl+I`
 - Control max text generation time
 - Configure scope of context around the cursor
 - Ring context with chunks from open and edited files and yanked text
@@ -83,10 +84,16 @@ Examples:
 
 4. Changing accept line keymap
 
     ```vim
     let g:llama_config.keymap_accept_full = "<C-S>"
     ```
 
+5. Configure instruction-based editing trigger keymap
+
+    ```vim
+    let g:llama_config.keymap_inst_trigger = "<C-I>"
+    ```
+
 Please refer to `:help llama_config` or the [source](./autoload/llama.vim)
 for the full list of options.
 
diff --git a/autoload/llama.vim b/autoload/llama.vim
index 4e0f473..30d4f4e 100644
--- a/autoload/llama.vim
+++ b/autoload/llama.vim
@@ -82,12 +82,11 @@ let s:default_config = {
     \ 'keymap_fim_accept_full': "<Tab>",
     \ 'keymap_fim_accept_line': "<S-Tab>",
     \ 'keymap_fim_accept_word': "<C-B>",
-    \ 'keymap_inst_trigger':    v:null,
+    \ 'keymap_inst_trigger':    "<C-I>",
     \ 'keymap_inst_accept':     "<Tab>",
     \ 'keymap_inst_cancel':     "<Esc>",
     \ 'keymap_debug_toggle':    v:null,
     \ 'enable_at_startup':      v:true,
-    \ 'timeout_inst':           60000,
     \ }
 
 let llama_config = get(g:, 'llama_config', s:default_config)
diff --git a/doc/llama.txt b/doc/llama.txt
index 09a0c30..a13abb1 100644
--- a/doc/llama.txt
+++ b/doc/llama.txt
@@ -20,6 +20,7 @@ Default Shortcut
 - Shift+Tab - accept just the first line of the suggestion
 - Ctrl+B    - accept just the first word of the suggestion
 - Ctrl+F    - trigger FIM completion manually
+- Ctrl+I    - trigger instruction-based editing (instruct mode)
 
 ================================================================================
 Commands
@@ -96,9 +97,11 @@ variable.
 Currently the default config is:
 >vim
 		let s:default_config = {
-		    \ 'endpoint':               'http://127.0.0.1:8012/infill',
+		    \ 'endpoint_fim':           'http://127.0.0.1:8012/infill',
+		    \ 'endpoint_inst':          'http://127.0.0.1:8012/v1/chat/completions',
 		    \ 'api_key':                '',
-		    \ 'model':                  '',
+		    \ 'model_fim':              '',
+		    \ 'model_inst':             '',
 		    \ 'n_prefix':               256,
 		    \ 'n_suffix':               64,
 		    \ 'n_predict':              128,
@@ -117,17 +120,21 @@ Currently the default config is:
 		    \ 'keymap_fim_accept_full': "<Tab>",
 		    \ 'keymap_fim_accept_line': "<S-Tab>",
 		    \ 'keymap_fim_accept_word': "<C-B>",
+		    \ 'keymap_inst_trigger':    "<C-I>",
 		    \ 'keymap_debug_toggle':    v:null,
 		    \ 'enable_at_startup':      v:true,
 		    \ }
 <
 
-- {endpoint}			llama.cpp server endpoint
+- {endpoint_fim}		llama.cpp server endpoint for FIM completion
+
+- {endpoint_inst}		llama.cpp server endpoint for instruction completion
 
 - {api_key}				llama.cpp server api key (optional)
 
-- {model}				model name in case if multiple models are
-						loaded (optional)
+- {model_fim}			model name for FIM completion (optional)
+
+- {model_inst}			model name for instruction completion (optional)
 
 - {n_prefix}			number of lines before the cursor location to include
 						in the local prefix
@@ -184,6 +191,8 @@ keymaps parameters:
 
 - {keymap_fim_accept_word}	keymap to accept word suggestion, default: <C-B>
 
+- {keymap_inst_trigger}		keymap to trigger instruction-based editing, default: <C-I>
+
 - {keymap_debug_toggle}		keymap to toggle the debug pane, default: null
 
 Example:

From 3574074d162ced4d76c0831f52cb35e2be594a17 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 19 Jan 2026 19:10:17 +0200
Subject: [PATCH 4/8] cont : cleanup

---
 autoload/llama.vim | 44 ++++++++++++++++++++++++--------------------
 doc/llama.txt      |  3 +++
 2 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/autoload/llama.vim b/autoload/llama.vim
index 30d4f4e..31867ce 100644
--- a/autoload/llama.vim
+++ b/autoload/llama.vim
@@ -1278,21 +1278,21 @@ function! llama#inst_send(start, end, lines, inst, callback)
 
     call add(s:inst_requests, l:req)
 
-     " highlights the selected text
-     let l:bufnr = bufnr('%')
-     if s:ghost_text_nvim
-         let l:ns = nvim_create_namespace('vt_inst')
-         let l:req.extmark = nvim_buf_set_extmark(l:bufnr, l:ns, a:start - 1, 0, {
-             \ 'end_row': l:end - 1,
-             \ 'end_col': len(getline(l:end)),
-             \ 'hl_group': 'llama_hl_inst_src'
-             \ })
-     elseif s:ghost_text_vim
-         " TODO: implement classic Vim support
-     endif
-
-     " Initialize virtual text with processing status
-     call s:inst_update(l:request_id, 'proc')
+    " highlights the selected text
+    let l:bufnr = bufnr('%')
+    if s:ghost_text_nvim
+        let l:ns = nvim_create_namespace('vt_inst')
+        let l:req.extmark = nvim_buf_set_extmark(l:bufnr, l:ns, a:start - 1, 0, {
+            \ 'end_row': l:end - 1,
+            \ 'end_col': len(getline(l:end)),
+            \ 'hl_group': 'llama_hl_inst_src'
+            \ })
+    elseif s:ghost_text_vim
+        " TODO: implement classic Vim support
+    endif
+
+    " Initialize virtual text with processing status
+    call s:inst_update(l:request_id, 'proc')
 
     " Build the payload
     let l:system_message = {
@@ -1314,18 +1314,20 @@ function! llama#inst_send(start, end, lines, inst, callback)
     let l:user_content  = ""
     let l:user_content .= "--- context ----------------------------------------------------\n"
     let l:user_content .= join(l:extra_context, "\n") . "\n"
-    let l:user_content .= "--- instruction ------------------------------------------------\n"
-    let l:user_content .= a:inst . "\n"
     let l:user_content .= "--- selection --------------------------------------------------\n"
     let l:user_content .= join(a:lines, "\n") . "\n"
+    let l:user_content .= "--- instruction ------------------------------------------------\n"
+    let l:user_content .= a:inst . "\n"
     let l:user_content .= "--- result -----------------------------------------------------\n"
 
+    call llama#debug_log('inst_send | ' . a:inst, l:user_content)
+
     let l:user_message = {'role': 'user', 'content': l:user_content}
 
     let l:messages = [l:system_message, l:user_message]
 
     let l:request = {
-        \ 'model':        g:llama_config.model_inst,
+        \ 'id_slot':      0,
         \ 'messages':     l:messages,
         \ 'min_p':        0.1,
         \ 'samplers':     ["min_p"],
@@ -1333,8 +1335,6 @@ function! llama#inst_send(start, end, lines, inst, callback)
         \ 'cache_prompt': v:true,
         \ }
 
-    call llama#debug_log('inst_send | ' . a:inst, l:user_content)
-
     let l:curl_command = [
         \ "curl",
         \ "--silent",
@@ -1345,6 +1345,10 @@ function! llama#inst_send(start, end, lines, inst, callback)
         \ "--data", "@-",
         \ ]
 
+    if exists("g:llama_config.model_inst") && len(g:llama_config.model_inst) > 0
+        let l:request.model = g:llama_config.model_inst
+    endif
+
     if exists("g:llama_config.api_key") && len("g:llama_config.api_key") > 0
         call extend(l:curl_command, ['--header', 'Authorization: Bearer ' .. g:llama_config.api_key])
     endif
diff --git a/doc/llama.txt b/doc/llama.txt
index a13abb1..2054421 100644
--- a/doc/llama.txt
+++ b/doc/llama.txt
@@ -45,6 +45,9 @@ Commands
 		Toggle autofim for this vim/nvim session
 		Equivalent to vimscript function: `llama#toggle_auto_fim()`
 
+*:LlamaInstruct*
+		Trigger instruction-based editing (instruct mode) on selected text
+
 *:LlamaDebugClear*
 		Clear the debug pane logs.
 		Equivalent to vimscript function: `debug#clear()`

From 2563150c03f0eab805b14d0684803fed9820ae3a Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 19 Jan 2026 19:27:30 +0200
Subject: [PATCH 5/8] cont : fix bufnr

---
 autoload/llama.vim | 75 +++++++++++++++++++++-------------------------
 1 file changed, 34 insertions(+), 41 deletions(-)

diff --git a/autoload/llama.vim b/autoload/llama.vim
index 31867ce..e799c50 100644
--- a/autoload/llama.vim
+++ b/autoload/llama.vim
@@ -443,6 +443,20 @@ function! s:pick_chunk(text, no_mod, do_evict)
     "let &statusline = 'extra context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued)
 endfunction
 
+function! s:ring_get_extra()
+    " extra context
+    let l:extra = []
+    for l:chunk in s:ring_chunks
+        call add(l:extra, {
+            \ 'text':     l:chunk.str,
+            \ 'time':     l:chunk.time,
+            \ 'filename': l:chunk.filename
+            \ })
+    endfor
+
+    return l:extra
+endfunction
+
 " picks a queued chunk, sends it for processing and adds it to s:ring_chunks
 " called every g:llama_config.ring_update_ms
 function! s:ring_update()
@@ -467,21 +481,14 @@ function! s:ring_update()
     "let &statusline = 'updated context: ' . len(s:ring_chunks) . ' / ' . len(s:ring_queued)
 
     " send asynchronous job with the new extra context so that it is ready for the next FIM
-    let l:extra_context = []
-    for l:chunk in s:ring_chunks
-        call add(l:extra_context, {
-            \ 'text':     l:chunk.str,
-            \ 'time':     l:chunk.time,
-            \ 'filename': l:chunk.filename
-            \ })
-    endfor
+    let l:extra = s:ring_get_extra()
 
     " no samplers needed here
     let l:request = {
         \ 'id_slot':          0,
         \ 'input_prefix':     "",
         \ 'input_suffix':     "",
-        \ 'input_extra':      l:extra_context,
+        \ 'input_extra':      l:extra,
         \ 'prompt':           "",
         \ 'n_predict':        0,
         \ 'temperature':      0.0,
@@ -719,21 +726,13 @@ function! llama#fim(pos_x, pos_y, is_auto, prev, use_cache) abort
     "    endif
     "endfor
 
-    " prepare the extra context data
-    let l:extra_ctx = []
-    for l:chunk in s:ring_chunks
-        call add(l:extra_ctx, {
-            \ 'text':     l:chunk.str,
-            \ 'time':     l:chunk.time,
-            \ 'filename': l:chunk.filename
-            \ })
-    endfor
+    let l:extra = s:ring_get_extra()
 
     let l:request = {
         \ 'id_slot':          0,
         \ 'input_prefix':     l:prefix,
         \ 'input_suffix':     l:suffix,
-        \ 'input_extra':      l:extra_ctx,
+        \ 'input_extra':      l:extra,
         \ 'prompt':           l:middle,
         \ 'n_predict':        g:llama_config.n_predict,
         \ 'stop':             g:llama_config.stop_strings,
@@ -1265,8 +1264,11 @@ function! llama#inst_send(start, end, lines, inst, callback)
 
     let l:end = min([a:end, line('$')])
 
+    let l:bufnr = bufnr('%')
+
     let l:req = {
         \ 'id': l:request_id,
+        \ 'bufnr': l:bufnr,
         \ 'range': [a:start, l:end],
         \ 'status': 'proc',
         \ 'result': '',
@@ -1279,7 +1281,6 @@ function! llama#inst_send(start, end, lines, inst, callback)
     call add(s:inst_requests, l:req)
 
     " highlights the selected text
-    let l:bufnr = bufnr('%')
     if s:ghost_text_nvim
         let l:ns = nvim_create_namespace('vt_inst')
         let l:req.extmark = nvim_buf_set_extmark(l:bufnr, l:ns, a:start - 1, 0, {
@@ -1300,20 +1301,11 @@ function! llama#inst_send(start, end, lines, inst, callback)
         \ 'content': 'You are a code-editing assistant. Return ONLY the result after applying the instruction to the selection.'
         \ }
 
-    " extra context
-    " TODO: deduplicate
-    let l:extra_context = []
-    for l:chunk in s:ring_chunks
-        call add(l:extra_context, {
-            \ 'text':     l:chunk.str,
-            \ 'time':     l:chunk.time,
-            \ 'filename': l:chunk.filename
-            \ })
-    endfor
+    let l:extra = s:ring_get_extra()
 
     let l:user_content  = ""
     let l:user_content .= "--- context ----------------------------------------------------\n"
-    let l:user_content .= join(l:extra_context, "\n") . "\n"
+    let l:user_content .= join(l:extra, "\n") . "\n"
     let l:user_content .= "--- selection --------------------------------------------------\n"
     let l:user_content .= join(a:lines, "\n") . "\n"
     let l:user_content .= "--- instruction ------------------------------------------------\n"
@@ -1376,7 +1368,7 @@ function! llama#inst_send(start, end, lines, inst, callback)
 endfunction
 
 function! llama#inst_update_pos(req)
-    let l:bufnr = bufnr('%')
+    let l:bufnr = a:req.bufnr
     let l:ns    = nvim_create_namespace('vt_inst')
 
     let l:extmark_pos = nvim_buf_get_extmark_by_id(l:bufnr, l:ns, a:req.extmark, {})
@@ -1399,7 +1391,7 @@ function! s:inst_update(id, status)
                 let l:ns = nvim_create_namespace('vt_inst')
 
                 if l:req.extmark_virt != -1
-                    call nvim_buf_del_extmark(bufnr('%'), l:ns, l:req.extmark_virt)
+                    call nvim_buf_del_extmark(l:req.bufnr, l:ns, l:req.extmark_virt)
                     let l:req.extmark_virt = -1
                 endif
 
@@ -1428,7 +1420,7 @@ function! s:inst_update(id, status)
                 endif
 
                 if !empty(l:virt_lines)
-                    let l:req.extmark_virt = nvim_buf_set_extmark(bufnr('%'), l:ns, l:req.range[1] - 1, 0, {
+                    let l:req.extmark_virt = nvim_buf_set_extmark(l:req.bufnr, l:ns, l:req.range[1] - 1, 0, {
                         \ 'virt_lines': l:virt_lines
                         \ })
                 endif
@@ -1485,12 +1477,13 @@ endfunction
 
 function! s:inst_remove(id)
     for i in range(len(s:inst_requests))
-        if s:inst_requests[i].id == a:id
+        let l:req = s:inst_requests[i]
+        if l:req.id == a:id
             if s:ghost_text_nvim
                 let l:ns = nvim_create_namespace('vt_inst')
-                call nvim_buf_del_extmark(bufnr('%'), l:ns, s:inst_requests[i].extmark)
-                if s:inst_requests[i].extmark_virt != -1
-                    call nvim_buf_del_extmark(bufnr('%'), l:ns, s:inst_requests[i].extmark_virt)
+                call nvim_buf_del_extmark(l:req.bufnr, l:ns, l:req.extmark)
+                if l:req.extmark_virt != -1
+                    call nvim_buf_del_extmark(l:req.bufnr, l:ns, l:req.extmark_virt)
                 endif
             endif
             call remove(s:inst_requests, i)
@@ -1499,7 +1492,7 @@ function! s:inst_remove(id)
     endfor
 endfunction
 
-function! s:inst_callback(start, end, result)
+function! s:inst_callback(bufnr, start, end, result)
     let l:result_lines = split(a:result, "\n", 1)
 
     " Remove trailing empty lines
@@ -1511,7 +1504,7 @@ function! s:inst_callback(start, end, result)
     let l:num_original = a:end - a:start + 1
 
     " Delete the original range
-    call deletebufline(bufnr('%'), a:start, a:end)
+    call deletebufline(a:bufnr, a:start, a:end)
 
     " Insert the new lines
     call append(a:start - 1, l:result_lines)
@@ -1525,7 +1518,7 @@ function! llama#inst_accept()
             call llama#inst_update_pos(l:req)
 
             if l:line >= l:req.range[0] && l:line <= l:req.range[1]
-                call s:inst_callback(l:req.range[0], l:req.range[1], l:req.result)
+                call s:inst_callback(l:req.bufnr, l:req.range[0], l:req.range[1], l:req.result)
                 call s:inst_remove(l:req.id)
                 return
             endif

From 9df00957ea26b66f697fae8b59213c366ca95b41 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 19 Jan 2026 19:35:24 +0200
Subject: [PATCH 6/8] cont : send warm-up request

---
 autoload/llama.vim | 96 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 75 insertions(+), 21 deletions(-)

diff --git a/autoload/llama.vim b/autoload/llama.vim
index e799c50..3ab27e5 100644
--- a/autoload/llama.vim
+++ b/autoload/llama.vim
@@ -1247,8 +1247,81 @@ endfunction
 " Instruct-based editing
 " =====================================
 
+function! llama#inst_build(lines, inst)
+    let l:system_message = {
+        \ 'role': 'system',
+        \ 'content': 'You are a code-editing assistant. Return ONLY the result after applying the instruction to the selection.'
+        \ }
+
+    let l:extra = s:ring_get_extra()
+
+    let l:user_content  = ""
+    let l:user_content .= "--- context ----------------------------------------------------\n"
+    let l:user_content .= join(l:extra, "\n") . "\n"
+    let l:user_content .= "--- selection --------------------------------------------------\n"
+    let l:user_content .= join(a:lines, "\n") . "\n"
+    let l:user_content .= "--- instruction ------------------------------------------------\n"
+
+    if !empty(a:inst)
+        let l:user_content .= a:inst . "\n"
+        let l:user_content .= "--- result -----------------------------------------------------\n"
+    endif
+
+    let l:user_message = {'role': 'user', 'content': l:user_content}
+
+    let l:messages = [l:system_message, l:user_message]
+
+    return l:messages
+endfunction
+
 function! llama#inst(start, end)
     let l:lines = getline(a:start, a:end)
+
+    " while the user is providing an instruction, send a warm-up request
+    let l:messages = llama#inst_build(l:lines, '')
+
+    let l:request = {
+        \ 'id_slot':      0,
+        \ 'messages':     l:messages,
+        \ 'samplers':     [],
+        \ 'n_predict':    0,
+        \ 'stream':       v:false,
+        \ 'cache_prompt': v:true,
+        \ 'response_fields':  [""],
+        \ }
+
+    let l:curl_command = [
+        \ "curl",
+        \ "--silent",
+        \ "--no-buffer",
+        \ "--request", "POST",
+        \ "--url", g:llama_config.endpoint_inst,
+        \ "--header", "Content-Type: application/json",
+        \ "--data", "@-",
+        \ ]
+
+    if exists("g:llama_config.model_inst") && len(g:llama_config.model_inst) > 0
+        let l:request.model = g:llama_config.model_inst
+    endif
+
+    if exists("g:llama_config.api_key") && len(g:llama_config.api_key) > 0
+        call extend(l:curl_command, ['--header', 'Authorization: Bearer ' .. g:llama_config.api_key])
+    endif
+
+    let l:request_json = json_encode(l:request)
+
+    " no callbacks because we don't need to process the response
+    if s:ghost_text_nvim
+        let jobid = jobstart(l:curl_command, {})
+        call chansend(jobid, l:request_json)
+        call chanclose(jobid, 'stdin')
+    elseif s:ghost_text_vim
+        let jobid = job_start(l:curl_command, {})
+        let channel = job_getchannel(jobid)
+        call ch_sendraw(channel, l:request_json)
+        call ch_close_in(channel)
+    endif
+
     let l:inst = input('Instruction: ')
     if empty(l:inst)
         return
@@ -1295,28 +1368,9 @@ function! llama#inst_send(start, end, lines, inst, callback)
     " Initialize virtual text with processing status
     call s:inst_update(l:request_id, 'proc')
 
-    " Build the payload
-    let l:system_message = {
-        \ 'role': 'system',
-        \ 'content': 'You are a code-editing assistant. Return ONLY the result after applying the instruction to the selection.'
-        \ }
-
-    let l:extra = s:ring_get_extra()
-
-    let l:user_content  = ""
-    let l:user_content .= "--- context ----------------------------------------------------\n"
-    let l:user_content .= join(l:extra, "\n") . "\n"
-    let l:user_content .= "--- selection --------------------------------------------------\n"
-    let l:user_content .= join(a:lines, "\n") . "\n"
-    let l:user_content .= "--- instruction ------------------------------------------------\n"
-    let l:user_content .= a:inst . "\n"
-    let l:user_content .= "--- result -----------------------------------------------------\n"
-
-    call llama#debug_log('inst_send | ' . a:inst, l:user_content)
+    let l:messages = llama#inst_build(a:lines, a:inst)
 
-    let l:user_message = {'role': 'user', 'content': l:user_content}
-
-    let l:messages = [l:system_message, l:user_message]
+    call llama#debug_log('inst_send | ' . a:inst, join(l:messages, "\n"))
 
     let l:request = {
         \ 'id_slot':      0,

From 9c35719feef49f8dc921e49af039e2d22cf7b2fb Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 19 Jan 2026 19:48:10 +0200
Subject: [PATCH 7/8] readme : update

---
 README.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ae24db7..e19f6c4 100644
--- a/README.md
+++ b/README.md
@@ -4,10 +4,16 @@ Local LLM-assisted text completion.
 
 <img width="485" alt="image" src="https://github.com/user-attachments/assets/a950e38c-3b3f-4c46-94fe-0d6e0f790fc6">
 
----
+#### Fill-in-Middle (FIM) completions
 
 ![llama vim-spec-1](https://github.com/user-attachments/assets/404ebc2a-e4b8-4119-999b-e5365ec3208d)
 
+#### Instruction-based editing
+
+https://github.com/user-attachments/assets/641a6e72-f1a2-4fe5-b0fd-c2597c6f4cdc
+
+---
+
 ## Features
 
 - Auto-suggest on cursor movement in `Insert` mode

From b48d5a5bdef4187daa798a3c2958ffb27f57b380 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 19 Jan 2026 19:49:54 +0200
Subject: [PATCH 8/8] readme : update about endpoints

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index e19f6c4..7630896 100644
--- a/README.md
+++ b/README.md
@@ -103,7 +103,7 @@ for the full list of options.
 
 ### llama.cpp setup
 
-The plugin requires a [llama.cpp](https://github.com/ggml-org/llama.cpp) server instance to be running at [`g:llama_config.endpoint`](https://github.com/ggml-org/llama.vim/blob/master/autoload/llama.vim#L37).
+The plugin requires a [llama.cpp](https://github.com/ggml-org/llama.cpp) server instance to be running at [`g:llama_config.endpoint_fim`](https://github.com/ggml-org/llama.vim/blob/master/autoload/llama.vim#L18) and/or [`g:llama_config.endpoint_inst`](https://github.com/ggml-org/llama.vim/blob/master/autoload/llama.vim#L19).
 
 #### Mac OS