ggml : adapt Metal to new ggml_backend interface #2258

Closed
wants to merge 9 commits
8 changes: 6 additions & 2 deletions ggml-backend.c
@@ -145,6 +145,9 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst
         return;
     }
 
+    //printf("src->data = %p, src->extra = %p\n", src->data, src->extra);
+    //printf("dst->data = %p, dst->extra = %p\n", dst->data, dst->extra);
+
     if (dst->backend->interface.cpy_tensor_from != NULL) {
         dst->backend->interface.cpy_tensor_from(dst->backend->context, src, dst);
     } else if (src->backend->interface.cpy_tensor_to != NULL) {
@@ -311,8 +314,9 @@ struct ggml_backend * ggml_backend_cpu_init(void) {
     struct ggml_backend * cpu_backend = malloc(sizeof(struct ggml_backend));
 
     *cpu_backend = (struct ggml_backend) {
-        /* .interface = */ cpu_backend_interface,
-        /* .context = */ ctx
+        /* .interface = */ cpu_backend_interface,
+        /* .context = */ ctx,
+        /* .is_ram_shared = */ true,
     };
     return cpu_backend;
 }
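
Below is a minimal sketch (not part of this diff) of constructing the CPU backend and inspecting the new flag; it assumes ggml_backend_cpu_init() is declared in ggml-backend.h, everything else is taken from the hunks above.

#include <stdio.h>

#include "ggml-backend.h"

int main(void) {
    struct ggml_backend * backend = ggml_backend_cpu_init();

    // per the hunk above, the CPU backend sets is_ram_shared to true,
    // i.e. its tensor data lives in ordinary host RAM
    printf("cpu backend: is_ram_shared = %s\n", backend->is_ram_shared ? "true" : "false");

    // teardown of the malloc'd backend is not shown in this hunk and is omitted here
    return 0;
}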
2 changes: 2 additions & 0 deletions ggml-backend.h
@@ -96,6 +96,8 @@ extern "C" {
     struct ggml_backend {
         struct ggml_backend_interface interface;
         ggml_backend_context_t context;
+
+        bool is_ram_shared;
     };
 
     // backend helper functions
5 changes: 3 additions & 2 deletions ggml-cuda.cu
@@ -1810,8 +1810,9 @@ ggml_backend * ggml_backend_cuda_init(void) {
 
     ggml_backend * cuda_backend = new ggml_backend;
     *cuda_backend = (ggml_backend){
-        /* .interface = */ cuda_backend_interface,
-        /* .context = */ ctx
+        /* .interface = */ cuda_backend_interface,
+        /* .context = */ ctx,
+        /* .is_ram_shared = */ false,
     };
     return cuda_backend;
 }
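
A rough sketch (not from this PR) of how loading code could consume the flag once both constructors exist; the load_weights_* helpers and load_model() are hypothetical placeholders, and the init declarations are assumed to be visible from the backend headers.

#include <stdbool.h>

#include "ggml-backend.h"

// hypothetical helpers, not part of the PR
void load_weights_in_place  (struct ggml_backend * backend);
void load_weights_with_upload(struct ggml_backend * backend);

void load_model(bool use_gpu) {
    struct ggml_backend * backend = use_gpu ? ggml_backend_cuda_init()
                                            : ggml_backend_cpu_init();

    if (backend->is_ram_shared) {
        // CPU (and Metal on Apple silicon): device and host share one address space,
        // so mapped host buffers can be used directly without a staging copy
        load_weights_in_place(backend);
    } else {
        // CUDA: device memory is separate, so tensor data has to be uploaded,
        // e.g. via the backend's cpy_tensor_from / cpy_tensor_to hooks
        load_weights_with_upload(backend);
    }
}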
69 changes: 35 additions & 34 deletions ggml-metal.h
@@ -22,48 +22,49 @@
#include <stddef.h>
#include <stdbool.h>

// max memory buffers that can be mapped to the device
#define GGML_METAL_MAX_BUFFERS 16

struct ggml_tensor;
struct ggml_cgraph;
//struct ggml_tensor;
//struct ggml_cgraph;

#ifdef __cplusplus
extern "C" {
#endif

struct ggml_metal_context;

// number of command buffers to use
struct ggml_metal_context * ggml_metal_init(int n_cb);
void ggml_metal_free(struct ggml_metal_context * ctx);
struct ggml_backend;

// set the number of command buffers to use
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
struct ggml_backend * ggml_backend_metal_init(void);

// creates a mapping between a host memory buffer and a device memory buffer
// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
// - the mapping is used during computation to determine the arguments of the compute kernels
// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
// - max_size specifies the maximum size of a tensor and is used to create shared views such
// that it is guaranteed that the tensor will fit in at least one of the views
//struct ggml_metal_context;
//
bool ggml_metal_add_buffer(
struct ggml_metal_context * ctx,
const char * name,
void * data,
size_t size,
size_t max_size);

// set data from host memory into the device
void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);

// get data from the device into host memory
void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);

// same as ggml_graph_compute but uses Metal
// creates gf->n_threads command buffers in parallel
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
//// number of command buffers to use
//struct ggml_metal_context * ggml_metal_init(int n_cb);
//void ggml_metal_free(struct ggml_metal_context * ctx);
//
//// set the number of command buffers to use
//void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
//
//// creates a mapping between a host memory buffer and a device memory buffer
//// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
//// - the mapping is used during computation to determine the arguments of the compute kernels
//// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
//// - max_size specifies the maximum size of a tensor and is used to create shared views such
//// that it is guaranteed that the tensor will fit in at least one of the views
////
//bool ggml_metal_add_buffer(
// struct ggml_metal_context * ctx,
// const char * name,
// void * data,
// size_t size,
// size_t max_size);
//
//// set data from host memory into the device
//void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
//
//// get data from the device into host memory
//void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
//
//// same as ggml_graph_compute but uses Metal
//// creates gf->n_threads command buffers in parallel
//void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);

#ifdef __cplusplus
}
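
For orientation, a small sketch (not part of this PR) of the intended migration in ggml-metal.h; only ggml_backend_metal_init() and the is_ram_shared field come from the diff, and the old calls quoted in the comment are the ones commented out above.

#include "ggml-backend.h"
#include "ggml-metal.h"

int main(void) {
    // before this change (old ggml-metal.h API, now commented out above):
    //   struct ggml_metal_context * ctx = ggml_metal_init(1);
    //   ggml_metal_add_buffer(ctx, "data", data, size, max_size);
    //   ggml_metal_graph_compute(ctx, &gf);
    //   ggml_metal_free(ctx);
    //
    // after this change: a single constructor returns a generic backend object,
    // and graph evaluation / tensor transfers are meant to go through the
    // shared ggml_backend_* helpers instead of backend-specific entry points
    struct ggml_backend * backend = ggml_backend_metal_init();

    // Metal buffers on Apple silicon are shared with host RAM, so this backend
    // is expected to advertise that through the new flag
    return backend->is_ram_shared ? 0 : 1;
}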