From 949c928fbc55a763adf8678275f4aa5709aef922 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 8 Oct 2024 14:24:16 +0300
Subject: [PATCH] llama.vim : wip [no ci]

---
 examples/llama.vim | 194 ++++++++++++++++-----------------------------
 1 file changed, 70 insertions(+), 124 deletions(-)

diff --git a/examples/llama.vim b/examples/llama.vim
index 1b5ad6ba0f32f0..39297970390029 100644
--- a/examples/llama.vim
+++ b/examples/llama.vim
@@ -1,135 +1,81 @@
-" Requires an already running llama.cpp server
-" To install either copy or symlink to ~/.vim/autoload/llama.vim
-" Then start with either :call llama#doLlamaGen(),
-" or add a keybind to your vimrc such as
-" nnoremap Z :call llama#doLlamaGen()<CR>
-" Similarly, you could add an insert mode keybind with
-" inoremap <c-B> <Cmd>:call llama#doLlamaGen()<CR>
+" sample config:
 "
-" g:llama_api_url, g:llama_api_key and g:llama_overrides can be configured in your .vimrc
-" let g:llama_api_url = "192.168.1.10:8080"
-" llama_overrides can also be set through buffer/window scopes. For instance
-" autocmd filetype python let b:llama_overrides = {"temp": 0.2}
-" Could be added to your .vimrc to automatically set a lower temperature when
-" editing a python script
-" Additionally, an override dict can be stored at the top of a file
-" !*{"stop": ["User:"]}
-" Could be added to the start of your chatlog.txt to set the stopping token
-" These parameter dicts are merged together from lowest to highest priority:
-" server default -> g:llama_overrides -> w:llama_overrides ->
-" b:llama_overrides -> in file (!*) overrides
+" - Ctrl+F - trigger FIM completion
+"
+" copy paste this in your .vimrc:
+"
+"augroup llama_cpp
+"    autocmd!
+"    autocmd InsertEnter * inoremap <C-F> <C-O>:call llama#fim()<CR>
+"augroup END
 "
-" Sublists (like logit_bias and stop) are overridden, not merged
-" Example override:
-" !*{"logit_bias": [[13, -5], [2, false]], "temperature": 1, "top_k": 5, "top_p": 0.5, "n_predict": 256, "repeat_last_n": 256, "repeat_penalty": 1.17647}
-if !exists("g:llama_api_url")
-  let g:llama_api_url= "127.0.0.1:8080"
-endif
-if !exists("g:llama_overrides")
-  let g:llama_overrides = {}
-endif
-const s:querydata = {"n_predict": 256, "stop": [ "\n" ], "stream": v:true }
-const s:curlcommand = ['curl','--data-raw', "{\"prompt\":\"### System:\"}", '--silent', '--no-buffer', '--request', 'POST', '--url', g:llama_api_url .. '/completion', '--header', "Content-Type: application/json"]
-let s:linedict = {}
-func s:callbackHandler(bufn, channel, msg)
-  if len(a:msg) < 3
-    return
-  elseif a:msg[0] == "d"
-    let l:msg = a:msg[6:-1]
-  else
-    let l:msg = a:msg
-  endif
-  let l:decoded_msg = json_decode(l:msg)
-  let l:newtext = split(l:decoded_msg['content'], "\n", 1)
-  if len(l:newtext) > 0
-    call setbufline(a:bufn, s:linedict[a:bufn], getbufline(a:bufn, s:linedict[a:bufn])[0] .. newtext[0])
-  else
-    echo "nothing genned"
-  endif
-  if len(newtext) > 1
-    let l:failed = appendbufline(a:bufn, s:linedict[a:bufn], newtext[1:-1])
-    let s:linedict[a:bufn] = s:linedict[a:bufn] + len(newtext)-1
-  endif
-  if has_key(l:decoded_msg, "stop") && l:decoded_msg.stop
-    echo "Finished generation"
-  endif
-endfunction
+let s:default_config = {
+    \ 'prefix_lines': 32,
+    \ 'suffix_lines': 32,
+    \ 'endpoint': 'http://127.0.0.1:8012/infill',
+    \ 'stop': ["\n"],
+    \ 'n_predict': 64,
+    \ 'n_probs': 3,
+    \ 'temperature': 0.1
+    \}
 
-func llama#doLlamaGen()
-  if exists("b:job")
-    if job_status(b:job) == "run"
-      call job_stop(b:job)
-      return
-    endif
-  endif
+let g:llama_config = get(g:, 'llama_config', s:default_config)
 
-  let l:cbuffer = bufnr("%")
-  let s:linedict[l:cbuffer] = line('$')
-  let l:buflines = getbufline(l:cbuffer, 1, 1000)
-  let l:querydata = copy(s:querydata)
-  call extend(l:querydata, g:llama_overrides)
-  if exists("w:llama_overrides")
-    call extend(l:querydata, w:llama_overrides)
-  endif
-  if exists("b:llama_overrides")
-    call extend(l:querydata, b:llama_overrides)
-  endif
-  if l:buflines[0][0:1] == '!*'
-    let l:userdata = json_decode(l:buflines[0][2:-1])
-    call extend(l:querydata, l:userdata)
-    let l:buflines = l:buflines[1:-1]
-  endif
-  let l:querydata.prompt = join(l:buflines, "\n")
-  let l:curlcommand = copy(s:curlcommand)
-  if exists("g:llama_api_key")
-    call extend(l:curlcommand, ['--header', 'Authorization: Bearer ' .. g:llama_api_key])
-  endif
-  let l:curlcommand[2] = json_encode(l:querydata)
-  let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
-endfunction
+function! llama#fim() abort
+    let l:lines_prefix = getline(max([1, line('.') - g:llama_config.prefix_lines]), line('.') - 1)
+    let l:lines_suffix = getline(line('.') + 1, min([line('$'), line('.') + g:llama_config.suffix_lines]))
 
-" Echos the tokkenization of the provided string , or cursor to end of word
-" Onus is placed on the user to include the preceding space
-func llama#tokenizeWord(...)
-  if (a:0 > 0)
-    let l:input = a:1
-  else
-    exe "normal \"*ye"
-    let l:input = @*
-  endif
-  let l:querydata = {"content": l:input}
-  let l:curlcommand = copy(s:curlcommand)
-  let l:curlcommand[2] = json_encode(l:querydata)
-  let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
-  let s:token_job = job_start(l:curlcommand, {"callback": function("s:tokenizeWordCallback", [l:input])})
-endfunction
+    let l:cursor_col = col('.')   " 1-based byte index of the cursor
 
-func s:tokenizeWordCallback(plaintext, channel, msg)
-  echo '"' .. a:plaintext ..'" - ' .. string(json_decode(a:msg).tokens)
-endfunction
+    let l:line_cur = getline('.')
+    let l:line_cur_prefix = strpart(l:line_cur, 0, l:cursor_col - 1)   " text before the cursor
+    let l:line_cur_suffix = strpart(l:line_cur, l:cursor_col - 1)      " text from the cursor on
+
+    let l:prefix = ""
+        \ . join(l:lines_prefix, "\n")
+        \ . "\n"
+        \ . l:line_cur_prefix
 
-" Echos the token count of the entire buffer (or provided string)
-" Example usage :echo llama#tokenCount()
-func llama#tokenCount(...)
-  if (a:0 > 0)
-    let l:buflines = a:1
-  else
-    let l:buflines = getline(1,1000)
-    if l:buflines[0][0:1] == '!*'
-      let l:buflines = l:buflines[1:-1]
-    endif
-    let l:buflines = join(l:buflines, "\n")
-  endif
-  let l:querydata = {"content": l:buflines}
-  let l:curlcommand = copy(s:curlcommand)
-  let l:curlcommand[2] = json_encode(l:querydata)
-  let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
-  let s:token_job = job_start(l:curlcommand, {"callback": "s:tokenCountCallback"})
-endfunction
+    let l:suffix = ""
+        \ . l:line_cur_suffix . "\n"
+        \ . join(l:lines_suffix, "\n")
+
+    let l:request = json_encode({
+        \ 'prompt': "",
+        \ 'input_prefix': l:prefix,
+        \ 'input_suffix': l:suffix,
+       "\ 'stop': g:llama_config.stop,
+        \ 'n_predict': g:llama_config.n_predict,
+       "\ 'n_probs': g:llama_config.n_probs,
+        \ 'penalty_last_n': 0,
+        \ 'temperature': g:llama_config.temperature,
+        \ 'top_k': 10,
+        \ 'stream': v:false,
+        \ 'samplers': ["top_k"]
+        \ })
+
+    " request completion from the server
+    let l:curl_command = printf(
+        \ "curl --silent --no-buffer --request POST --url %s --header \"Content-Type: application/json\" --data %s",
+        \ g:llama_config.endpoint, shellescape(l:request)
+        \ )
+
+    let l:response = json_decode(system(l:curl_command))
+
+    echom string(l:response)
+
+    let l:content = []
+    for l:part in split(get(l:response, 'content', ''), "\n", 1)
+        call add(l:content, l:part)
+    endfor
+
+    echom string(l:content)
+
+    " insert the 'content' at the current cursor location
+    let l:content[0] = l:line_cur_prefix . l:content[0]
+    let l:content[-1] .= l:line_cur_suffix
 
-func s:tokenCountCallback(channel, msg)
-  let resp = json_decode(a:msg)
-  echo len(resp.tokens)
+    call setline('.', l:content[0])
+    call append(line('.'), l:content[1:-1])
 endfunction
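
A quick walk-through of the insertion step at the end of llama#fim(), with
made-up values (written as Vim comments; not part of the patch):

" illustration of the insertion step (values are made up):
"
"   cursor line:      print(|)     -> col('.') == 7, with | marking the cursor
"   line_cur_prefix:  'print('        line_cur_suffix: ')'
"   server content:   "1 +\n    2" -> l:content == ['1 +', '    2']
"
" after the two fix-ups:
"   l:content[0]  == 'print(1 +'     (prefix of the cursor line prepended)
"   l:content[-1] == '    2)'        (suffix of the cursor line appended)
"
" setline() then rewrites the cursor line with l:content[0] and append()
" inserts the remaining lines below it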
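
Note that g:llama_config is read with get() rather than merged key-by-key, so
a user-supplied dict replaces the defaults wholesale and has to spell out
every key. A minimal .vimrc override along those lines (the values here are
illustrative, not tuned):

" illustrative .vimrc override -- all keys are required, because the plugin
" does not merge this dict with s:default_config
let g:llama_config = {
    \ 'prefix_lines': 64,
    \ 'suffix_lines': 64,
    \ 'endpoint': 'http://127.0.0.1:8012/infill',
    \ 'stop': ["\n"],
    \ 'n_predict': 128,
    \ 'n_probs': 3,
    \ 'temperature': 0.1
    \}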
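
The request path can also be sanity-checked without touching a buffer. A
minimal sketch, assuming a llama.cpp server is already listening on
g:llama_config.endpoint; the helper name and the prefix/suffix literals are
hypothetical, and only request fields that already appear in the patch are
used. As in llama#fim() itself, json_decode() aborts with E474 if the server
is unreachable and system() returns an empty string:

" hypothetical helper: send one /infill request and echo the completion
function! s:llama_probe() abort
    let l:request = json_encode({
        \ 'prompt': "",
        \ 'input_prefix': "def add(a, b):\n    return ",
        \ 'input_suffix': "\n",
        \ 'n_predict': 16,
        \ 'stream': v:false
        \})

    let l:curl_command = printf(
        \ "curl --silent --no-buffer --request POST --url %s --header \"Content-Type: application/json\" --data %s",
        \ g:llama_config.endpoint, shellescape(l:request))

    " the response is a JSON object whose 'content' field holds the completion
    echo get(json_decode(system(l:curl_command)), 'content', '')
endfunction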