Commit

Merge remote-tracking branch 'upstream/concedo'
YellowRoseCx committed Dec 23, 2023
2 parents 031c60b + 71a5afa commit 76eb691
Showing 55 changed files with 3,113 additions and 1,726 deletions.
3 changes: 3 additions & 0 deletions .editorconfig
@@ -23,3 +23,6 @@ insert_final_newline = unset

[examples/server/public/*]
indent_size = 2

[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
indent_style = tab
File renamed without changes.
36 changes: 36 additions & 0 deletions .github/workflows/kcpp-build-release-win-cuda.yaml
@@ -0,0 +1,36 @@
name: Koboldcpp Builder Windows CUDA

on: workflow_dispatch
env:
  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}

jobs:
  windows:
    runs-on: windows-latest
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v3
        with:
          ref: concedo_experimental

      - uses: Jimver/cuda-toolkit@v0.2.11
        id: cuda-toolkit
        with:
          cuda: '11.7.1'
          method: 'network'
          sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'

      - name: Build
        id: cmake_build
        run: |
          mkdir build
          cd build
          cmake .. -DLLAMA_CUBLAS=ON
          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
      - name: Save artifact
        uses: actions/upload-artifact@v3
        with:
          name: kcpp_windows_cuda_binary
          path: build/bin/Release/
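This workflow only runs on manual dispatch. As a rough usage sketch, assuming the GitHub CLI (gh) is installed and authenticated against the repository and the workflow file is present on the default branch, it could be started with:

    gh workflow run kcpp-build-release-win-cuda.yaml

The Windows CUDA binaries are then downloadable from the kcpp_windows_cuda_binary artifact of that run.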
1 change: 1 addition & 0 deletions .gitignore
@@ -37,6 +37,7 @@ models-mnt
/llama-bench
/llava-cli
/lookahead
/lookup
/main
/metal
/perplexity
7 changes: 6 additions & 1 deletion CMakeLists.txt
@@ -96,7 +96,12 @@ if (LLAMA_CUBLAS)
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})

if (LLAMA_STATIC)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
if (WIN32)
# As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
else ()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
endif()
else()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
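The new Windows branch above works around the CUDA Toolkit shipping no static cuBLAS: cudart is still linked statically, while cublas/cublasLt fall back to the dynamic libraries. A minimal configure sketch that exercises this path (LLAMA_CUBLAS and LLAMA_STATIC are the options used in this file; the rest of the command line is illustrative):

    cmake .. -DLLAMA_CUBLAS=ON -DLLAMA_STATIC=ON
    cmake --build . --config Release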
71 changes: 45 additions & 26 deletions class.py
@@ -9,7 +9,7 @@
import requests
import numpy as np
from typing import List, Optional, Union
import os
import os, time
from . import koboldcpp

import utils
@@ -20,11 +20,9 @@
InferenceModel,
)

model_backend_name = "koboldcpp" #specific instead of ggml
model_backend_name = "KoboldCPP" #specific instead of ggml
model_backend_type = "ggml" #This should be a generic name in case multiple model backends are compatible (think Hugging Face Custom and Basic Hugging Face)

kcpp_backend_loaded = False

class KoboldCppException(Exception):
"""To be used for errors on cpp side of KoboldCpp."""

@@ -35,6 +33,7 @@ def __init__(self, **kwargs):
class model_backend(InferenceModel):
def __init__(self) -> None:
super().__init__()
self.kcpp_backend_loaded = False

def is_valid(self, model_name, model_path, menu_path):

@@ -257,26 +256,31 @@ def set_input_parameters(self, parameters):

def unload(self):
print("Attemping to unload library")
koboldcpp.unload_libs()
global kcpp_backend_loaded
kcpp_backend_loaded = False
pass
self.process.terminate()


def _load(self, save_model: bool, initial_load: bool) -> None:
global kcpp_backend_loaded
self.tokenizer = self._get_tokenizer("gpt2")
if not kcpp_backend_loaded:
kcppargs = KcppArgsObject(model=self.kcpp_filename, model_param=self.kcpp_filename,
port=5001, port_param=5001, host='', launch=False, lora=None, threads=self.kcpp_threads, blasthreads=self.kcpp_threads,
highpriority=False, contextsize=self.kcpp_ctxsize, blasbatchsize=self.kcpp_blasbatchsize, ropeconfig=[self.kcpp_ropescale, self.kcpp_ropebase],
smartcontext=self.kcpp_smartcontext, bantokens=None, forceversion=0, nommap=self.kcpp_nommap,
usemlock=False, noavx2=self.kcpp_noavx2, debugmode=self.kcpp_debugmode, skiplauncher=True, hordeconfig=None, noblas=self.kcpp_noblas,
useclblast=self.kcpp_useclblast, usecublas=self.kcpp_usecublas, gpulayers=self.kcpp_gpulayers, tensor_split=self.kcpp_tensor_split, config=None,
onready='', multiuser=False, foreground=False)
kcppargs = KcppArgsObject(model=self.kcpp_filename, model_param=self.kcpp_filename,
port=5001, port_param=5001, host='', launch=False, lora=None, threads=self.kcpp_threads, blasthreads=self.kcpp_threads,
psutil_set_threads=False, highpriority=False, contextsize=self.kcpp_ctxsize,
blasbatchsize=self.kcpp_blasbatchsize, ropeconfig=[self.kcpp_ropescale, self.kcpp_ropebase], stream=False, smartcontext=self.kcpp_smartcontext,
unbantokens=False, bantokens=None, usemirostat=None, forceversion=0, nommap=self.kcpp_nommap,
usemlock=False, noavx2=self.kcpp_noavx2, debugmode=self.kcpp_debugmode, skiplauncher=True, hordeconfig=None, noblas=self.kcpp_noblas,
useclblast=self.kcpp_useclblast, usecublas=self.kcpp_usecublas, gpulayers=self.kcpp_gpulayers, tensor_split=self.kcpp_tensor_split, config=None,
onready='', multiuser=False, foreground=False, preloadstory=None, noshift=False, remotetunnel=False)


koboldcpp.main(kcppargs,False) #initialize library without enabling Lite http server
kcpp_backend_loaded = True
pass
#koboldcpp.main(kcppargs,False) #initialize library without enabling Lite http server
(self.output_queue, self.input_queue, self.process) = koboldcpp.start_in_seperate_process(kcppargs)
while True:
data = self.output_queue.get()
if data['command'] == 'load status':
utils.koboldai_vars.total_layers = data['data']['total']
utils.koboldai_vars.loaded_layers = data['data']['loaded']
elif data['command'] == 'complete':
break
time.sleep(0.02)

def _save_settings(self):
pass
@@ -297,16 +301,31 @@ def _raw_generate(
# Store context in memory to use it for comparison with generated content
utils.koboldai_vars.lastctx = decoded_prompt

genresult = koboldcpp.generate(decoded_prompt,max_new,utils.koboldai_vars.max_length,
gen_settings.temp,int(gen_settings.top_k),gen_settings.top_a,gen_settings.top_p,
gen_settings.typical,gen_settings.tfs,gen_settings.rep_pen,gen_settings.rep_pen_range,
sampler_order=gen_settings.sampler_order,use_default_badwordsids=utils.koboldai_vars.use_default_badwordsids)
self.input_queue.put({'command': 'generate', 'data': [(decoded_prompt,max_new,utils.koboldai_vars.max_length,
gen_settings.temp,int(gen_settings.top_k),gen_settings.top_a,gen_settings.top_p,
gen_settings.typical,gen_settings.tfs,gen_settings.rep_pen,gen_settings.rep_pen_range),
{"sampler_order": gen_settings.sampler_order, "use_default_badwordsids": utils.koboldai_vars.use_default_badwordsids}
]})

#genresult = koboldcpp.generate(decoded_prompt,max_new,utils.koboldai_vars.max_length,
#gen_settings.temp,int(gen_settings.top_k),gen_settings.top_a,gen_settings.top_p,
#gen_settings.typical,gen_settings.tfs,gen_settings.rep_pen,gen_settings.rep_pen_range,
#sampler_order=gen_settings.sampler_order,use_default_badwordsids=utils.koboldai_vars.use_default_badwordsids)

genresult = []
while True:
data = self.output_queue.get()
print(data)
if data['command'] == 'generated text':
genresult.append(data['data'])
if self.output_queue.empty():
break
time.sleep(0.02)

outputs = [genresult]
return GenerationResult(
model=self,
out_batches=np.array(
[self.tokenizer.encode(x) for x in outputs]
[self.tokenizer.encode(x) for x in genresult]
),
prompt=prompt_tokens,
is_whole_generation=True,
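The class.py changes above move the backend from in-process koboldcpp calls to a worker process and a pair of queues: start_in_seperate_process returns (output_queue, input_queue, process), requests are sent as {'command': ..., 'data': [(positional args), {keyword args}]}, and results come back as 'load status', 'complete', and 'generated text' messages. A minimal sketch of driving that protocol, inferred from the hunks above (generate_once is a hypothetical helper and the positional argument tuple is abbreviated; the real call passes the full sampler settings):

    import time

    def generate_once(input_queue, output_queue, prompt, max_new_tokens):
        # Send a generate request in the [(args...), {kwargs...}] shape used by _raw_generate above.
        input_queue.put({'command': 'generate',
                         'data': [(prompt, max_new_tokens),
                                  {'use_default_badwordsids': True}]})
        chunks = []
        while True:
            msg = output_queue.get()            # blocks until the worker process replies
            if msg['command'] == 'generated text':
                chunks.append(msg['data'])
            if output_queue.empty():            # same stop condition as _raw_generate above
                break
            time.sleep(0.02)
        return ''.join(chunks)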
2 changes: 1 addition & 1 deletion common/common.cpp
@@ -921,7 +921,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -m FNAME, --model FNAME\n");
printf(" model path (default: %s)\n", params.model.c_str());
printf(" -md FNAME, --model-draft FNAME\n");
printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str());
printf(" draft model for speculative decoding\n");
printf(" -ld LOGDIR, --logdir LOGDIR\n");
printf(" path under which to save YAML logs (no logging if unset)\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
3 changes: 2 additions & 1 deletion common/common.h
@@ -45,7 +45,7 @@ struct gpt_params {
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
int32_t n_draft = 8; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode
@@ -248,3 +248,4 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);

// Dump the KV cache view showing individual sequences in each cell (long output).
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
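The n_draft default above drops from 16 to 8 draft tokens; it only takes effect when a separate draft model is supplied for speculative decoding. A hedged invocation of the speculative example (model paths are placeholders and the exact flag spelling should be checked against the built binary's --help; -md comes from the common.cpp hunk above):

    ./speculative -m models/target-model.gguf -md models/draft-model.gguf --draft 8 -p "Once upon a time" -n 128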

22 changes: 22 additions & 0 deletions convert-hf-to-gguf.py
@@ -182,6 +182,8 @@ def from_model_architecture(model_architecture):
return QwenModel
if model_architecture == "MixtralForCausalLM":
return MixtralModel
if model_architecture == "PhiForCausalLM":
return Phi2Model
return Model

def _is_model_safetensors(self) -> bool:
@@ -221,6 +223,8 @@ def _get_model_architecture(self) -> gguf.MODEL_ARCH:
return gguf.MODEL_ARCH.QWEN
if arch == "MixtralForCausalLM":
return gguf.MODEL_ARCH.LLAMA
if arch == "PhiForCausalLM":
return gguf.MODEL_ARCH.PHI2

raise NotImplementedError(f'Architecture "{arch}" not supported!')

@@ -980,6 +984,24 @@ def write_tensors(self):
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)


class Phi2Model(Model):
def set_gguf_parameters(self):
block_count = self.hparams["n_layer"]

self.gguf_writer.add_name("Phi2")
self.gguf_writer.add_context_length(self.hparams["n_positions"])
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_head_count(self.hparams["n_head"])
self.gguf_writer.add_head_count_kv(self.hparams["n_head"])
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
self.gguf_writer.add_rope_dimension_count(self.hparams["rotary_dim"])
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_add_bos_token(False)


###### CONVERSION LOGIC ######


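With PhiForCausalLM mapped to the new Phi2Model class above, a Phi-2 checkpoint can be converted like any other supported architecture. A hedged usage sketch (the positional model-directory argument and the --outfile/--outtype options are assumed from the script's usual interface; the path is a placeholder):

    python convert-hf-to-gguf.py /path/to/phi-2 --outtype f16 --outfile phi-2-f16.gguf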
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -33,6 +33,7 @@ else()
add_subdirectory(simple)
add_subdirectory(speculative)
add_subdirectory(lookahead)
add_subdirectory(lookup)
add_subdirectory(train-text-from-scratch)
if (LLAMA_METAL)
add_subdirectory(metal)
15 changes: 3 additions & 12 deletions examples/baby-llama/baby-llama.cpp
@@ -575,10 +575,7 @@ static struct ggml_tensor * forward(

// KQ_scaled = KQ / sqrt(n_embd/n_head)
// KQ_scaled shape [n_past + N, N, n_head, 1]
struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));

// KQ_masked = mask_past(KQ_scaled)
// KQ_masked shape [n_past + N, N, n_head, 1]
@@ -844,10 +841,7 @@ static struct ggml_tensor * forward_batch(

// KQ_scaled = KQ / sqrt(n_embd/n_head)
// KQ_scaled shape [n_past + N, N, n_head, n_batch]
struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch);

// KQ_masked = mask_past(KQ_scaled)
@@ -1131,10 +1125,7 @@ static struct ggml_tensor * forward_lora(

// KQ_scaled = KQ / sqrt(n_embd/n_head)
// KQ_scaled shape [n_past + N, N, n_head, 1]
struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));

// KQ_masked = mask_past(KQ_scaled)
// KQ_masked shape [n_past + N, N, n_head, 1]
2 changes: 1 addition & 1 deletion examples/export-lora/export-lora.cpp
@@ -309,7 +309,7 @@ static struct ggml_cgraph * build_graph_lora(
) {
struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b);
if (scaling != 1.0f) {
ab = ggml_scale(ctx, ab, ggml_new_f32(ctx, scaling));
ab = ggml_scale(ctx, ab, scaling);
}
struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab);

(Diffs for the remaining changed files are not shown.)
