llama : refactor model loader with backend registry #10026

Merged (2 commits) on Oct 30, 2024
Changes from 1 commit
132 changes: 22 additions & 110 deletions examples/llama-bench/llama-bench.cpp
@@ -21,12 +21,6 @@
 #include "ggml.h"
 #include "llama.h"
 #include "common.h"
-#include "ggml-cuda.h"
-#include "ggml-sycl.h"
-
-#ifdef GGML_USE_CANN
-#include "ggml-cann.h"
-#endif
 
 #ifdef _WIN32
 #define WIN32_LEAN_AND_MEAN
@@ -82,95 +76,27 @@ static T stdev(const std::vector<T> & v) {
 }
 
 static std::string get_cpu_info() {
-    std::string id;
-#ifdef __linux__
-    FILE * f = fopen("/proc/cpuinfo", "r");
-    if (f) {
-        char buf[1024];
-        while (fgets(buf, sizeof(buf), f)) {
-            if (strncmp(buf, "model name", 10) == 0) {
-                char * p = strchr(buf, ':');
-                if (p) {
-                    p++;
-                    while (std::isspace(*p)) {
-                        p++;
-                    }
-                    while (std::isspace(p[strlen(p) - 1])) {
-                        p[strlen(p) - 1] = '\0';
-                    }
-                    id = p;
-                    break;
-                }
-            }
-        }
-        fclose(f);
-    }
-#elif defined(_WIN32)
-    HKEY hKey;
-    if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
-                     TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
-                     0,
-                     KEY_READ,
-                     &hKey) != ERROR_SUCCESS) {
-        // fail to open registry key
-        return "";
-    }
-    char cpu_brand[256];
-    DWORD cpu_brand_size = sizeof(cpu_brand);
-    if (RegQueryValueExA(hKey,
-                         TEXT("ProcessorNameString"),
-                         NULL,
-                         NULL,
-                         (LPBYTE)cpu_brand,
-                         &cpu_brand_size) == ERROR_SUCCESS) {
-        id.assign(cpu_brand, cpu_brand_size);
-        if (id.find('\0') != std::string::npos) {
-            id.resize(id.find('\0'));
-        }
-    }
-    RegCloseKey(hKey);
-#endif
-    // TODO: other platforms
-    return id;
+    std::vector<std::string> gpu_list;
+    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+        auto * dev = ggml_backend_dev_get(i);
+        auto dev_type = ggml_backend_dev_type(dev);
+        if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
+            gpu_list.push_back(ggml_backend_dev_description(dev));
+        }
+    }
+    return join(gpu_list, ", ");
 }
 
 static std::string get_gpu_info() {
-    std::string id;
-#ifdef GGML_USE_CUDA
-    int count = ggml_backend_cuda_get_device_count();
-    for (int i = 0; i < count; i++) {
-        char buf[128];
-        ggml_backend_cuda_get_device_description(i, buf, sizeof(buf));
-        id += buf;
-        if (i < count - 1) {
-            id += "/";
-        }
-    }
-#endif
-#ifdef GGML_USE_SYCL
-    int count = ggml_backend_sycl_get_device_count();
-    for (int i = 0; i < count; i++) {
-        char buf[128];
-        ggml_backend_sycl_get_device_description(i, buf, sizeof(buf));
-        id += buf;
-        if (i < count - 1) {
-            id += "/";
-        }
-    }
-#endif
-#ifdef GGML_USE_CANN
-    uint32_t count = ggml_backend_cann_get_device_count();
-    for (uint32_t i = 0; i < count; i++) {
-        char buf[128];
-        ggml_backend_cann_get_device_description(i, buf, sizeof(buf));
-        id += buf;
-        if (i < count - 1) {
-            id += "/";
-        }
-    }
-#endif
-    // TODO: other backends
-    return id;
+    std::vector<std::string> gpu_list;
+    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+        auto * dev = ggml_backend_dev_get(i);
+        auto dev_type = ggml_backend_dev_type(dev);
+        if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) {
+            gpu_list.push_back(ggml_backend_dev_description(dev));
+        }
+    }
+    return join(gpu_list, ", ");
 }

// command line params
@@ -938,29 +864,15 @@ struct test {
     }
 
     static std::string get_backend() {
-        if (cuda) {
-            return GGML_CUDA_NAME;
-        }
-        if (vulkan) {
-            return "Vulkan";
-        }
-        if (kompute) {
-            return "Kompute";
-        }
-        if (metal) {
-            return "Metal";
-        }
-        if (sycl) {
-            return GGML_SYCL_NAME;
-        }
-        if (gpu_blas) {
-            return "GPU BLAS";
-        }
-        if (blas) {
-            return "BLAS";
-        }
-
-        return "CPU";
+        std::vector<std::string> backends;
+        for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
+            auto * reg = ggml_backend_reg_get(i);
+            std::string name = ggml_backend_reg_name(reg);
+            if (name != "CPU") {
+                backends.push_back(ggml_backend_reg_name(reg));
+            }
+        }
+        return backends.empty() ? "CPU" : join(backends, ",");
     }

static const std::vector<std::string> & get_fields() {
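For reviewers: the device-enumeration pattern that replaces the per-backend #ifdef blocks above can be sketched as a standalone snippet. It uses only the ggml_backend_dev_* calls that appear in this diff; the printing is illustrative and not part of the change.

```cpp
// Illustrative sketch (not part of the PR): list all registered devices,
// grouping them the same way the new get_cpu_info()/get_gpu_info() do.
#include <cstdio>
#include "ggml-backend.h"

int main() {
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        auto * dev      = ggml_backend_dev_get(i);
        auto   dev_type = ggml_backend_dev_type(dev);
        const char * kind =
            dev_type == GGML_BACKEND_DEVICE_TYPE_GPU   ? "GPU"   :  // reported by get_gpu_info()
            dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL ? "ACCEL" :  // reported by get_cpu_info()
                                                         "CPU";     // reported by get_cpu_info()
        printf("device %zu [%s]: %s\n", i, kind, ggml_backend_dev_description(dev));
    }
    return 0;
}
```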
16 changes: 9 additions & 7 deletions ggml/include/ggml-backend.h
@@ -114,11 +114,12 @@ extern "C" {
//

enum ggml_backend_dev_type {
+    // CPU device using system memory
     GGML_BACKEND_DEVICE_TYPE_CPU,
+    // GPU device using dedicated memory
     GGML_BACKEND_DEVICE_TYPE_GPU,
-    // devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication)
-    GGML_BACKEND_DEVICE_TYPE_CPU_FULL,
-    GGML_BACKEND_DEVICE_TYPE_GPU_FULL
+    // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
+    GGML_BACKEND_DEVICE_TYPE_ACCEL
};

// functionality supported by the device
@@ -168,9 +169,10 @@ extern "C" {
GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);


-// Functions that may be obtained using ggml_backend_reg_get_proc_address
-typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);
-typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t, int);
+// Common functions that may be obtained using ggml_backend_reg_get_proc_address
+typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
+typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
+typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);

//
// Backend registry
@@ -192,7 +194,7 @@ extern "C" {
GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
-// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL)
+// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
GGML_API ggml_backend_t ggml_backend_init_best(void);

//
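A minimal sketch of how a consumer is expected to combine ggml_backend_init_best() with the proc-address typedefs above, for example to set the thread count without linking against a specific backend. The accessors ggml_backend_get_device() and ggml_backend_dev_backend_reg() and the lookup string "ggml_backend_set_n_threads" do not appear in this excerpt and are assumed here for illustration.

```cpp
// Sketch only: initialize the best available backend and, if its registry
// exposes it, resolve the optional set_n_threads entry point by name.
#include "ggml-backend.h"

ggml_backend_t init_backend(int n_threads) {
    // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
    ggml_backend_t backend = ggml_backend_init_best();
    if (backend == NULL) {
        return NULL;
    }

    // assumed accessors: backend -> device -> registry (declared elsewhere in ggml-backend.h)
    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend));

    // assumed proc-address name; the lookup returns NULL if the backend has no thread setting
    auto * set_n_threads_fn = (ggml_backend_set_n_threads_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
    if (set_n_threads_fn != NULL) {
        set_n_threads_fn(backend, n_threads);
    }
    return backend;
}
```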
2 changes: 1 addition & 1 deletion ggml/include/ggml-cuda.h
@@ -28,7 +28,7 @@ GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);

// split tensor buffer that splits matrices by rows across multiple devices
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
+GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);

// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
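The new signature mirrors the ggml_backend_split_buffer_type_t typedef above, so a loader can obtain this function through the registry instead of including ggml-cuda.h. A sketch under the assumption that the backend exports it under the proc-address name "ggml_backend_split_buffer_type" (that string is not shown in this excerpt):

```cpp
// Sketch only: resolve a backend's split-buffer-type function through its
// registry instead of calling ggml_backend_cuda_split_buffer_type() directly.
// The "ggml_backend_split_buffer_type" lookup string is an assumption here.
#include "ggml-backend.h"

static ggml_backend_buffer_type_t split_buft_for(ggml_backend_reg_t reg,
                                                 int main_device,
                                                 const float * tensor_split) {
    auto * split_buft_fn = (ggml_backend_split_buffer_type_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
    if (split_buft_fn == NULL) {
        return NULL; // this backend does not provide row-split buffers
    }
    return split_buft_fn(main_device, tensor_split);
}
```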
33 changes: 8 additions & 25 deletions ggml/src/ggml-amx.cpp
@@ -16,12 +16,6 @@
#if defined(__AMX_INT8__)

// AMX buffer interface
-static const char * ggml_backend_amx_buffer_get_name(ggml_backend_buffer_t buffer) {
-    return "AMX";
-
-    GGML_UNUSED(buffer);
-}

static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free(buffer->context);
}
@@ -72,7 +66,6 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
}

static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
-    /* .get_name = */ ggml_backend_amx_buffer_get_name,
/* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
/* .get_base = */ ggml_backend_amx_buffer_get_base,
/* .init_tensor = */ NULL, // no initialization required
@@ -121,14 +114,14 @@ static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
    static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
        /* .iface = */ {
            /* .get_name       = */ ggml_backend_amx_buffer_type_get_name,
            /* .alloc_buffer   = */ ggml_backend_amx_buffer_type_alloc_buffer,
            /* .get_alignment  = */ ggml_backend_amx_buffer_type_get_alignment,
            /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
            /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
            /* .is_host        = */ ggml_backend_amx_buffer_type_is_host,
        },
-       /* .device  = */ NULL,
+       /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
        /* .context = */ NULL,
    };

@@ -149,12 +142,6 @@ static void ggml_backend_amx_free(ggml_backend_t backend) {
delete backend;
}

-static ggml_backend_buffer_type_t ggml_backend_amx_get_default_buffer_type(ggml_backend_t backend) {
-    return ggml_backend_amx_buffer_type();
-
-    GGML_UNUSED(backend);
-}

static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;

@@ -187,7 +174,6 @@ static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, s
static struct ggml_backend_i ggml_backend_amx_i = {
/* .get_name = */ ggml_backend_amx_name,
/* .free = */ ggml_backend_amx_free,
-    /* .get_default_buffer_type = */ ggml_backend_amx_get_default_buffer_type,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
@@ -197,9 +183,6 @@ static struct ggml_backend_i ggml_backend_amx_i = {
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_amx_graph_compute,
-    /* .supports_op = */ NULL,
-    /* .supports_buft = */ NULL,
-    /* .offload_op = */ NULL,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
};
@@ -279,7 +262,7 @@ static void ggml_backend_amx_device_get_memory(ggml_backend_dev_t dev, size_t *
}

static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) {
-    return GGML_BACKEND_DEVICE_TYPE_CPU;
+    return GGML_BACKEND_DEVICE_TYPE_ACCEL;

GGML_UNUSED(dev);
}
19 changes: 4 additions & 15 deletions ggml/src/ggml-backend-impl.h
@@ -22,7 +22,7 @@ extern "C" {
size_t (*get_max_size) (ggml_backend_buffer_type_t buft);
// (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
-// (optional) check if tensor data is in host memory (defaults to false)
+// (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
bool (*is_host) (ggml_backend_buffer_type_t buft);
};

@@ -37,7 +37,6 @@ extern "C" {
//

struct ggml_backend_buffer_i {
-    const char * (*get_name) (ggml_backend_buffer_t buffer);
// (optional) free the buffer
void (*free_buffer) (ggml_backend_buffer_t buffer);
// base address of the buffer
@@ -88,19 +87,16 @@ extern "C" {

void (*free)(ggml_backend_t backend);

-    // Will be moved to the device interface
-    // buffer allocation
-    ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
-
// (optional) asynchronous tensor data access
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);

-// (optional) complete all pending operations
+// (optional) complete all pending operations (required if the backend supports async operations)
void (*synchronize)(ggml_backend_t backend);

-// (optional) compute graph with a plan (not used currently)
+// (optional) graph plans
+// compute graph with a plan (not used currently)
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
@@ -111,13 +107,6 @@ extern "C" {
// compute graph (always async if supported by the backend)
enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);

-    // IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
-    // new backends should implement the device interface instead
-    // These functions are being moved to the device interface
-    bool (*supports_op) (ggml_backend_t backend, const struct ggml_tensor * op);
-    bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
-    bool (*offload_op) (ggml_backend_t backend, const struct ggml_tensor * op);

// (optional) event synchronization
// record an event on this stream
void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);