sync : ggml #2737

Merged Jan 14, 2025 · 23 commits
Commits
19c147b
ggml : allow loading backend with env variable (ggml/1059)
rgerganov Jan 5, 2025
95cd1d3
fix: Vulkan shader gen binary path (llama/11037)
giladgd Jan 4, 2025
aababf1
Support for models with non-512-aligned tensors over RPC. (llama/11047)
matt23654 Jan 4, 2025
fbe4db4
Vulkan: Add device-specific blacklist for coopmat for the AMD proprie…
0cc4m Jan 4, 2025
9325f4a
CUDA: add BF16 support (llama/11093)
JohannesGaessler Jan 6, 2025
c52b2f6
SYCL: Use get_multi_ptr instead of deprecated get_pointer in wkv6 (ll…
qnixsynapse Jan 7, 2025
f078351
rpc : code cleanup (llama/11107)
rgerganov Jan 7, 2025
c324f37
ggml-backend : only offload from host buffers (llama/11120)
slaren Jan 7, 2025
6679500
ggml-backend : only offload from host buffers (fix) (llama/11124)
slaren Jan 7, 2025
a48be79
GGUF: C++ refactor, backend support, misc fixes (llama/11030)
JohannesGaessler Jan 7, 2025
f2031c5
fix: Vulkan shader gen binary path when Cross-compiling (llama/11096)
ag2s20150909 Jan 8, 2025
e322e91
Disable GL_KHR_cooperative_matrix Vulkan extension if not available. …
mbaudier Jan 8, 2025
3272320
llamafile : ppc64le MMA INT8 implementation (llama/10912)
amritahs-ibm Jan 8, 2025
40aa3fa
fix: add missing msg in static_assert (llama/11143)
hydai Jan 8, 2025
fe7bb88
SYCL: Refactor ggml_sycl_compute_forward (llama/11121)
qnixsynapse Jan 10, 2025
2d6f599
llama: add support for QRWKV6 model architecture (llama/11001)
MollySophia Jan 10, 2025
618d94a
Vulkan: Fix float16 use on devices without float16 support + fix subg…
0cc4m Jan 10, 2025
1f14567
ggml : do not define GGML_USE_CUDA when building with GGML_BACKEND_DL…
rgerganov Jan 13, 2025
6718f14
cuda : CUDA Graph Compute Function Refactor (precursor for performanc…
aendk Jan 13, 2025
cd6391c
ggml : add opencl backend (skip) (llama/10693)
lhez Jan 14, 2025
6a0e8b5
GGUF: C++ refactor, backend support, misc fixes (skip) (llama/11030)
JohannesGaessler Jan 14, 2025
41f3a68
sync : ggml
ggerganov Jan 14, 2025
ef62a22
talk-llama : sync llama.cpp
ggerganov Jan 14, 2025
101 changes: 57 additions & 44 deletions examples/talk-llama/llama-adapter.cpp
@@ -1,5 +1,7 @@
#include "llama-adapter.h"

#include "llama-impl.h"
#include "llama-mmap.h"
#include "llama-model.h"

#include <algorithm>
@@ -9,15 +11,15 @@

// vec

struct ggml_tensor * llama_control_vector::tensor_for(int il) const {
struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
return nullptr;
}

return tensors[il];
}

struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const {
ggml_tensor * layer_dir = tensor_for(il);
if (layer_dir != nullptr) {
cur = ggml_add(ctx, cur, layer_dir);
@@ -26,12 +28,12 @@ struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, s
return cur;
}

static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
bool llama_adapter_cvec::init(const llama_model & model) {
const auto & hparams = model.hparams;

GGML_ASSERT(cvec.tensors.empty());
GGML_ASSERT(cvec.ctxs.empty());
GGML_ASSERT(cvec.bufs.empty());
GGML_ASSERT(tensors.empty());
GGML_ASSERT(ctxs.empty());
GGML_ASSERT(bufs.empty());

// create a context for each buffer type
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -50,7 +52,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
}

ctx_map[buft] = ctx;
cvec.ctxs.emplace_back(ctx);
ctxs.emplace_back(ctx);

return ctx;
}
@@ -59,21 +61,21 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
};

// make tensors
cvec.tensors.reserve(hparams.n_layer);
cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
tensors.reserve(hparams.n_layer);
tensors.push_back(nullptr); // there's never a tensor for layer 0
for (size_t il = 1; il < hparams.n_layer; il++) {
ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
ggml_backend_buffer_type_t buft = model.select_buft(il);
ggml_context * ctx = ctx_for_buft(buft);
if (!ctx) {
LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
return false;
}
ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
cvec.tensors.push_back(tensor);
tensors.push_back(tensor);
}

// allocate tensors / buffers and zero
cvec.bufs.reserve(ctx_map.size());
bufs.reserve(ctx_map.size());
for (auto it : ctx_map) {
ggml_backend_buffer_type_t buft = it.first;
ggml_context * ctx = it.second;
@@ -83,14 +85,13 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
return false;
}
ggml_backend_buffer_clear(buf, 0);
cvec.bufs.emplace_back(buf);
bufs.emplace_back(buf);
}

return true;
}

int32_t llama_control_vector_apply(
struct llama_control_vector & cvec,
int32_t llama_adapter_cvec::apply(
const llama_model & model,
const float * data,
size_t len,
@@ -101,8 +102,8 @@ int32_t llama_control_vector_apply(

if (data == nullptr) {
// disable the current control vector (but leave allocated for later)
cvec.layer_start = -1;
cvec.layer_end = -1;
layer_start = -1;
layer_end = -1;
return 0;
}

@@ -111,21 +112,21 @@ int32_t llama_control_vector_apply(
return 1;
}

if (cvec.tensors.empty()) {
if (!llama_control_vector_init(cvec, model)) {
if (tensors.empty()) {
if (!init(model)) {
return 1;
}
}

cvec.layer_start = il_start;
cvec.layer_end = il_end;
layer_start = il_start;
layer_end = il_end;

for (size_t il = 1; il < hparams.n_layer; il++) {
assert(cvec.tensors[il] != nullptr);
assert(tensors[il] != nullptr);

const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
if (off + n_embd <= len) {
ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * ggml_element_size(tensors[il]));
}
}

@@ -134,7 +135,7 @@

// lora

llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) {
const std::string name(w->name);

const auto pos = ab_map.find(name);
@@ -145,11 +146,7 @@ llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
return nullptr;
}

void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
delete adapter;
}

static void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

ggml_context * ctx_init;
@@ -221,7 +218,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
};

// bundle lora_a and lora_b into pairs
std::map<std::string, llama_lora_weight> ab_map;
std::map<std::string, llama_adapter_lora_weight> ab_map;
auto str_endswith = [](const std::string & str, const std::string & suffix) {
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
};
@@ -231,17 +228,21 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
if (str_endswith(name, ".lora_a")) {
replace_all(name, ".lora_a", "");
if (ab_map.find(name) == ab_map.end()) {
ab_map[name] = llama_lora_weight(cur, nullptr);
ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
} else {
ab_map[name].a = cur;
}
} else if (str_endswith(name, ".lora_b")) {
replace_all(name, ".lora_b", "");
if (ab_map.find(name) == ab_map.end()) {
ab_map[name] = llama_lora_weight(nullptr, cur);
ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
} else {
ab_map[name].b = cur;
}
} else if (str_endswith(name, "_norm.weight")) {
// TODO: add support for norm vector
// for now, we don't really care because most adapters still work fine without it
continue;
} else {
throw std::runtime_error("LoRA tensor '" + name + "' has unexpected suffix");
}
@@ -250,33 +251,41 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
// add tensors
for (auto & it : ab_map) {
const std::string & name = it.first;
llama_lora_weight & w = it.second;
llama_adapter_lora_weight & w = it.second;
bool is_token_embd = str_endswith(name, "token_embd.weight");

if (!w.a || !w.b) {
throw std::runtime_error("LoRA tensor pair for '" + name + "' is missing one component");
}

// device buft and device ctx
auto * model_tensor = llama_model_get_tensor(model, name.c_str());
const auto * model_tensor = model.get_tensor(name.c_str());
if (!model_tensor) {
throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model");
throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
}

struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
// validate tensor shape
if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
throw std::runtime_error("tensor '" + name + "' has incorrect shape");
}
if (w.a->ne[1] != w.b->ne[0]) {
throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
if (is_token_embd) {
// expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
if (model_tensor->ne[0] != w.b->ne[1] || model_tensor->ne[1] != w.a->ne[1]) {
throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
}
} else {
if (model_tensor->ne[0] != w.a->ne[0] || model_tensor->ne[1] != w.b->ne[1]) {
throw std::runtime_error("tensor '" + name + "' has incorrect shape (hint: maybe wrong base model?)");
}
if (w.a->ne[1] != w.b->ne[0]) {
throw std::runtime_error("lora_a tensor is not transposed (hint: adapter from \"finetune\" example is no longer supported)");
}
}

// save tensor to adapter
struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
ggml_set_name(tensor_a, w.a->name);
ggml_set_name(tensor_b, w.b->name);
adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
}

// allocate tensors / buffers and zero
@@ -318,11 +327,11 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
}

struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
struct llama_lora_adapter * adapter = new llama_lora_adapter();
struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
struct llama_adapter_lora * adapter = new llama_adapter_lora();

try {
llama_lora_adapter_init_impl(*model, path_lora, *adapter);
llama_adapter_lora_init_impl(*model, path_lora, *adapter);
return adapter;
} catch (const std::exception & err) {
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
@@ -332,3 +341,7 @@ struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model,

return nullptr;
}

void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
delete adapter;
}
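
For context, this file renames the public LoRA entry points: llama_lora_adapter_init/llama_lora_adapter_free become llama_adapter_lora_init/llama_adapter_lora_free, and llama_lora_adapter becomes llama_adapter_lora. Below is a minimal caller-side sketch against the renamed API, using only the signatures visible in the diff above; how the llama_model is loaded, and the header that declares these functions, are assumptions and not part of this PR.

    // illustrative only -- not part of this PR; assumes the renamed API is declared in llama.h
    #include "llama.h"

    // Attach a LoRA adapter to an already-loaded model.
    // llama_adapter_lora_init() logs the error and returns nullptr on failure.
    static llama_adapter_lora * attach_adapter(llama_model * model, const char * path_lora) {
        llama_adapter_lora * adapter = llama_adapter_lora_init(model, path_lora);
        return adapter; // may be nullptr; caller should check before use
    }

    // ... when the adapter is no longer needed:
    //     llama_adapter_lora_free(adapter);
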
64 changes: 36 additions & 28 deletions examples/talk-llama/llama-adapter.h
@@ -1,66 +1,74 @@
#pragma once

#include "llama-impl.h"
#include "llama-hparams.h"
#include "llama.h"

#include "ggml-cpp.h"

#include <string>
#include <unordered_map>
#include <vector>

// TODO: pimpl

//
// llama_adapter_cvec
//

// TODO: rename to llama_adapter_cvec
struct llama_control_vector {
std::vector<ggml_context_ptr> ctxs;
std::vector<ggml_backend_buffer_ptr> bufs;
struct llama_adapter_cvec {
struct ggml_tensor * tensor_for(int il) const;

std::vector<struct ggml_tensor *> tensors; // per layer
struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const;

int32_t apply(
const llama_model & model,
const float * data,
size_t len,
int32_t n_embd,
int32_t il_start,
int32_t il_end);

private:
bool init(const llama_model & model);

int32_t layer_start = -1;
int32_t layer_end = -1;

struct ggml_tensor * tensor_for(int il) const;
std::vector<ggml_context_ptr> ctxs;
std::vector<ggml_backend_buffer_ptr> bufs;

struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const;
std::vector<struct ggml_tensor *> tensors; // per layer
};

int32_t llama_control_vector_apply(
struct llama_control_vector & cvec,
const llama_model & model,
const float * data,
size_t len,
int32_t n_embd,
int32_t il_start,
int32_t il_end);

//
// llama_adapter_lora
//

// TODO: rename to llama_adapter_lora_weight
struct llama_lora_weight {
struct llama_adapter_lora_weight {
struct ggml_tensor * a = nullptr;
struct ggml_tensor * b = nullptr;

llama_lora_weight() = default;
llama_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
// get actual scale based on rank and alpha
float get_scale(float alpha, float adapter_scale) const {
const float rank = (float) b->ne[0];
const float scale = alpha ? adapter_scale * alpha / rank : adapter_scale;
return scale;
}

llama_adapter_lora_weight() = default;
llama_adapter_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
};

// TODO: rename to llama_adapter_lora
struct llama_lora_adapter {
struct llama_adapter_lora {
// map tensor name to lora_a_b
std::unordered_map<std::string, struct llama_lora_weight> ab_map;
std::unordered_map<std::string, struct llama_adapter_lora_weight> ab_map;

std::vector<ggml_context_ptr> ctxs;
std::vector<ggml_backend_buffer_ptr> bufs;

float alpha;

llama_lora_adapter() = default;
~llama_lora_adapter() = default;
llama_adapter_lora() = default;
~llama_adapter_lora() = default;

llama_lora_weight * get_weight(struct ggml_tensor * w);
llama_adapter_lora_weight * get_weight(struct ggml_tensor * w);
};
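
For reference, the new llama_adapter_lora_weight::get_scale() helper above derives the effective LoRA scale from the adapter alpha and the rank (read from b->ne[0]); an alpha of 0 falls back to the user-supplied adapter_scale. A standalone sketch of the same arithmetic, with values chosen purely for illustration:

    #include <cstdio>

    // mirrors get_scale(): scale = adapter_scale * alpha / rank when alpha != 0
    static float lora_scale(float alpha, float adapter_scale, float rank) {
        return alpha ? adapter_scale * alpha / rank : adapter_scale;
    }

    int main() {
        std::printf("%.2f\n", lora_scale(16.0f, 1.0f, 8.0f)); // alpha=16, rank=8, adapter_scale=1.0 -> 2.00
        std::printf("%.2f\n", lora_scale( 0.0f, 0.5f, 8.0f)); // alpha=0 -> falls back to 0.50
        return 0;
    }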