diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9a509334020..fbf56e5e8b8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -94,6 +94,17 @@ if (APPLE AND NOT WHISPER_NO_ACCELERATE)
     else()
         message(WARNING "Accelerate framework not found")
     endif()
+
+    find_library(FOUNDATION_LIBRARY         Foundation              REQUIRED)
+    find_library(METAL_FRAMEWORK            Metal                   REQUIRED)
+    find_library(METALKIT_FRAMEWORK         MetalKit                REQUIRED)
+    find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
+
+    set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS}
+        ${FOUNDATION_LIBRARY}
+        ${METAL_FRAMEWORK}
+        ${METALKIT_FRAMEWORK}
+        ${METALPERFORMANCE_FRAMEWORK})
 endif()
 
 if (WHISPER_SUPPORT_OPENBLAS)
@@ -168,6 +179,7 @@ set(TARGET whisper)
 
 add_library(${TARGET}
     ggml.c
+    ggml-mtl.m
     whisper.cpp
     )
 
diff --git a/ggml-mtl.h b/ggml-mtl.h
new file mode 100644
index 00000000000..223eb94e212
--- /dev/null
+++ b/ggml-mtl.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+
+// TODO: this will hold dynamic context data in the future
+//       currently unused
+struct ggml_mtl_context {
+    void * dummy;
+};
+
+struct ggml_mtl_object {
+    int32_t id;
+    void *  data;
+};
+
+struct ggml_mtl_context * ggml_mtl_init(void);
+
+struct ggml_mtl_object ggml_mtl_alloc(size_t size);
+
+// multiply matrix by vector
+void ggml_mtl_mul_mat_vec_f16(
+        struct ggml_mtl_context * ctx,
+        struct ggml_mtl_object    src0, // matrix f16
+        const __fp16            * src1, // vector f16
+        float                   * dst,  // vector f32
+        int nrows,
+        int ncols);
+
+// multiply matrix by matrix
+void ggml_mtl_mul_mat_f16(
+        struct ggml_mtl_context * ctx,
+        struct ggml_mtl_object    src0, // matrix f16
+        const __fp16            * src1, // matrix f16
+        float                   * dst,  // matrix f32
+        int nrows0,
+        int nrows1,
+        int ncols);
diff --git a/ggml-mtl.m b/ggml-mtl.m
new file mode 100644
index 00000000000..90ae2cd6c76
--- /dev/null
+++ b/ggml-mtl.m
@@ -0,0 +1,162 @@
+#import "ggml-mtl.h"
+
+#import <Foundation/Foundation.h>
+#import <Metal/Metal.h>
+#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
+
+#define GGML_MTL_MAX_BUFFERS 256
+
+// global static storage for Metal buffers
+// TODO: move this into a dynamic context
+static id<MTLBuffer> g_buffers[GGML_MTL_MAX_BUFFERS];
+
+// global MTL context
+// TODO: move this into a dynamic context
+static id<MTLDevice>       g_device;
+static id<MTLCommandQueue> g_command_queue;
+
+struct ggml_mtl_context * ggml_mtl_init() {
+    // TODO: implement properly
+    //       for now, init the global MTL context and MTL buffers
+    g_device = MTLCreateSystemDefaultDevice();
+
+    g_command_queue = [g_device newCommandQueue];
+    if (g_command_queue == nil)
+    {
+        NSLog(@"Failed to find the command queue.");
+        return nil;
+    }
+
+    return nil;
+}
+
+// search for unallocated buffer slot and use it
+struct ggml_mtl_object ggml_mtl_alloc(size_t size) {
+    // TODO: temporarily making sure that the buffers are nil at the start
+    static bool first = true;
+    if (first) {
+        for (int i = 0; i < GGML_MTL_MAX_BUFFERS; ++i) {
+            assert(g_buffers[i] == nil);
+        }
+        first = false;
+    }
+
+    struct ggml_mtl_object obj = { -1, nil };
+
+    for (int i = 0; i < GGML_MTL_MAX_BUFFERS; i++) {
+        if (g_buffers[i] == nil) {
+            g_buffers[i] = [g_device newBufferWithLength:size options:MTLResourceStorageModeManaged];
+
+            // link the MTL buffer to the ggml object
+            obj.id   = i;
+            obj.data = [g_buffers[i] contents];
+
+            break;
+        }
+    }
+
+    return obj;
+}
+
+struct params_mul_mat_vec {
+    int N; // rows
+    int M; // cols
+};
+
+// multiply matrix with a vector using MPSMatrixVectorMultiplication
+void ggml_mtl_mul_mat_vec_f16(
+        struct ggml_mtl_context * ctx,
+        struct ggml_mtl_object    src0,
+        const __fp16            * src1,
+        float                   * dst,
+        int nrows,
+        int ncols) {
+    (void) ctx; // unused
+
+    // Create a command buffer to hold commands.
+    id<MTLCommandBuffer> commandBuffer = [g_command_queue commandBuffer];
+    assert(commandBuffer != nil);
+
+    // make managed device buffer to store src1
+    id<MTLBuffer> src1_buffer = [g_device newBufferWithBytes:src1 length:ncols*sizeof(__fp16) options:MTLResourceStorageModeManaged];
+    id<MTLBuffer> dst_buffer  = [g_device newBufferWithLength:nrows*sizeof(float) options:MTLResourceStorageModeManaged];
+
+    // MPSMatrixDescriptor
+    MPSMatrixDescriptor *src0_desc = [MPSMatrixDescriptor matrixDescriptorWithRows:nrows columns:ncols rowBytes:ncols*sizeof(__fp16) dataType:MPSDataTypeFloat16];
+    MPSVectorDescriptor *src1_desc = [MPSVectorDescriptor vectorDescriptorWithLength:ncols dataType:MPSDataTypeFloat16];
+    MPSVectorDescriptor *dst_desc  = [MPSVectorDescriptor vectorDescriptorWithLength:nrows dataType:MPSDataTypeFloat32];
+
+    // MPSMatrix
+    MPSMatrix *src0_mat = [[MPSMatrix alloc] initWithBuffer:g_buffers[src0.id] descriptor:src0_desc];
+    MPSVector *src1_vec = [[MPSVector alloc] initWithBuffer:src1_buffer descriptor:src1_desc];
+    MPSVector *dst_vec  = [[MPSVector alloc] initWithBuffer:dst_buffer  descriptor:dst_desc];
+
+    // MPSMatrixVectorMultiplication
+    MPSMatrixVectorMultiplication *mul_mat_vec = [[MPSMatrixVectorMultiplication alloc] initWithDevice:g_device transpose:NO rows:nrows columns:ncols alpha:1.0 beta:0.0];
+
+    // encode
+    [mul_mat_vec encodeToCommandBuffer:commandBuffer
+                           inputMatrix:src0_mat
+                           inputVector:src1_vec
+                          resultVector:dst_vec];
+
+    [commandBuffer commit];
+    [commandBuffer waitUntilCompleted];
+
+    // copy GPU result to CPU
+    memcpy(dst, [dst_buffer contents], nrows*sizeof(float));
+}
+
+// multiply matrix with a matrix using MPSMatrixMultiplication
+void ggml_mtl_mul_mat_f16(
+        struct ggml_mtl_context * ctx,
+        struct ggml_mtl_object    src0,
+        const __fp16            * src1,
+        float                   * dst,
+        int nrows0,
+        int nrows1,
+        int ncols) {
+    (void) ctx; // unused
+
+    // Create a command buffer to hold commands.
+    id<MTLCommandBuffer> commandBuffer = [g_command_queue commandBuffer];
+    assert(commandBuffer != nil);
+
+    // make managed device buffer to store src1
+    id<MTLBuffer> src1_buffer = [g_device newBufferWithBytes:src1 length:ncols*nrows1*sizeof(__fp16) options:MTLResourceStorageModeManaged];
+    id<MTLBuffer> dst_buffer  = [g_device newBufferWithLength:nrows0*nrows1*sizeof(float) options:MTLResourceStorageModeManaged];
+
+    // MPSMatrixDescriptor
+    MPSMatrixDescriptor *src0_desc = [MPSMatrixDescriptor matrixDescriptorWithRows:nrows0 columns:ncols rowBytes:ncols*sizeof(__fp16) dataType:MPSDataTypeFloat16];
+    MPSMatrixDescriptor *src1_desc = [MPSMatrixDescriptor matrixDescriptorWithRows:nrows1 columns:ncols rowBytes:ncols*sizeof(__fp16) dataType:MPSDataTypeFloat16];
+    MPSMatrixDescriptor *dst_desc  = [MPSMatrixDescriptor matrixDescriptorWithRows:nrows1 columns:nrows0 rowBytes:nrows0*sizeof(float) dataType:MPSDataTypeFloat32];
+
+    // MPSMatrix
+    MPSMatrix *src0_mat = [[MPSMatrix alloc] initWithBuffer:g_buffers[src0.id] descriptor:src0_desc];
+    MPSMatrix *src1_mat = [[MPSMatrix alloc] initWithBuffer:src1_buffer descriptor:src1_desc];
+    MPSMatrix *dst_mat  = [[MPSMatrix alloc] initWithBuffer:dst_buffer  descriptor:dst_desc];
+
+    //// MPSMatrixMultiplication z = x * yT
+    //MPSMatrixMultiplication *mul_mat = [[MPSMatrixMultiplication alloc] initWithDevice:g_device transposeLeft:NO transposeRight:YES resultRows:nrows resultColumns:nrows interiorColumns:ncols alpha:1.0 beta:0.0];
+
+    //// encode
+    //[mul_mat encodeToCommandBuffer:commandBuffer
+    //                    leftMatrix:src0_mat
+    //                   rightMatrix:src1_mat
+    //                  resultMatrix:dst_mat];
+
+    // MPSMatrixMultiplication zT = xT * y
+    MPSMatrixMultiplication *mul_mat = [[MPSMatrixMultiplication alloc] initWithDevice:g_device transposeLeft:NO transposeRight:YES resultRows:nrows1 resultColumns:nrows0 interiorColumns:ncols alpha:1.0 beta:0.0];
+
+    // encode
+    [mul_mat encodeToCommandBuffer:commandBuffer
+                        leftMatrix:src1_mat
+                       rightMatrix:src0_mat
+                      resultMatrix:dst_mat];
+
+    [commandBuffer commit];
+    [commandBuffer waitUntilCompleted];
+
+    // copy GPU result to CPU
+    memcpy(dst, [dst_buffer contents], nrows0*nrows1*sizeof(float));
+}
diff --git a/ggml.c b/ggml.c
index 79b910bbac6..314759dac8a 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1,5 +1,7 @@
 #include "ggml.h"
 
+#include "ggml-mtl.h"
+
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
 #elif !defined(__FreeBSD__)
@@ -1307,6 +1309,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     static bool first_time = true;
 
     if (first_time) {
+        ggml_mtl_init(); // TODO: fix this
+
         for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
             g_state.contexts[i].used = false;
         }
@@ -1462,6 +1466,104 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.perf_cycles  =*/ 0,
         /*.perf_time_us =*/ 0,
         /*.data         =*/ data == NULL ? (void *)(result + 1) : data,
+        /*.id           =*/ -1,
+        /*.pad          =*/ { 0 },
+    };
+
+    ggml_assert_aligned(result->data);
+
+    for (int i = 0; i < n_dims; i++) {
+        result->ne[i] = ne[i];
+    }
+
+    result->nb[0] = GGML_TYPE_SIZE[type];
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
+    }
+
+    ctx->n_objects++;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_new_tensor_mtl_impl(
+        struct ggml_context * ctx,
+        enum   ggml_type      type,
+        int                   n_dims,
+        const int           * ne,
+        void                * data) {
+    // always insert objects at the end of the context's memory pool
+    struct ggml_object * obj_cur = ctx->objects_end;
+
+    const size_t cur_offset = obj_cur == NULL ? 0 : obj_cur->offset;
+    const size_t cur_size   = obj_cur == NULL ? 0 : obj_cur->size;
+    const size_t cur_end    = cur_offset + cur_size;
+
+    struct ggml_mtl_object obj_mtl;
+    {
+        assert(data == NULL); // TODO: in-place metal buffer, need page aligned memory
+
+        size_t size_needed_mtl = 0;
+        if (data == NULL) {
+            size_needed_mtl += GGML_TYPE_SIZE[type];
+            for (int i = 0; i < n_dims; i++) {
+                size_needed_mtl *= ne[i];
+            }
+        }
+
+        obj_mtl = ggml_mtl_alloc(size_needed_mtl);
+    }
+
+    size_t size_needed = 0;
+    size_needed += sizeof(struct ggml_tensor);
+
+    if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
+        GGML_PRINT("%s: not enough space in the context's memory pool\n", __func__);
+        assert(false);
+        return NULL;
+    }
+
+    char * const mem_buffer = ctx->mem_buffer;
+
+    struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
+
+    *obj_new = (struct ggml_object) {
+        .offset = cur_end + GGML_OBJECT_SIZE,
+        .size   = size_needed,
+        .next   = NULL,
+    };
+
+    if (obj_cur != NULL) {
+        obj_cur->next = obj_new;
+    } else {
+        // this is the first object in this context
+        ctx->objects_begin = obj_new;
+    }
+
+    ctx->objects_end = obj_new;
+
+    //GGML_PRINT_DEBUG("%s: inserted new object at %zu\n", __func__, cur_end);
+
+    struct ggml_tensor * const result = (struct ggml_tensor *)(mem_buffer + obj_new->offset);
+
+    ggml_assert_aligned(result);
+
+    *result = (struct ggml_tensor) {
+        /*.type         =*/ type,
+        /*.n_dims       =*/ n_dims,
+        /*.ne           =*/ { 1, 1, 1, 1 },
+        /*.nb           =*/ { 0, 0, 0, 0 },
+        /*.op           =*/ GGML_OP_NONE,
+        /*.is_param     =*/ false,
+        /*.grad         =*/ NULL,
+        /*.src0         =*/ NULL,
+        /*.src1         =*/ NULL,
+        /*.opt          =*/ { NULL },
+        /*.n_tasks      =*/ 0,
+        /*.perf_runs    =*/ 0,
+        /*.perf_cycles  =*/ 0,
+        /*.perf_time_us =*/ 0,
+        /*.data         =*/ obj_mtl.data,
+        /*.id           =*/ obj_mtl.id,
         /*.pad          =*/ { 0 },
     };
 
@@ -1489,6 +1591,14 @@ struct ggml_tensor * ggml_new_tensor(
     return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
 }
 
+struct ggml_tensor * ggml_new_tensor_mtl(
+        struct ggml_context * ctx,
+        enum   ggml_type      type,
+        int                   n_dims,
+        const int           * ne) {
+    return ggml_new_tensor_mtl_impl(ctx, type, n_dims, ne, NULL);
+}
+
 struct ggml_tensor * ggml_new_tensor_1d(
         struct ggml_context * ctx,
         enum   ggml_type      type,
@@ -1505,6 +1615,15 @@ struct ggml_tensor * ggml_new_tensor_2d(
     return ggml_new_tensor(ctx, type, 2, ne);
 }
 
+struct ggml_tensor * ggml_new_tensor_2d_mtl(
+        struct ggml_context * ctx,
+        enum   ggml_type      type,
+        int ne0,
+        int ne1) {
+    const int ne[2] = { ne0, ne1 };
+    return ggml_new_tensor_mtl(ctx, type, 2, ne);
+}
+
 struct ggml_tensor * ggml_new_tensor_3d(
         struct ggml_context * ctx,
         enum   ggml_type      type,
@@ -4343,8 +4462,11 @@ void ggml_compute_forward_mul_mat_f16_f32(
     // nb00 <  nb01 - src0 is transposed
     //   compute by src0 columns
 
+    // are we using Metal?
+    const bool is_mtl = src0->id >= 0;
+
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
+    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst) && !is_mtl) {
         GGML_ASSERT(nb10 == sizeof(float));
 
         if (params->ith != 0) return;
@@ -4472,6 +4594,20 @@ void ggml_compute_forward_mul_mat_f16_f32(
 
     // parallelize by src0 rows using ggml_vec_dot_f32
 
+    if (is_mtl) {
+        assert(ne02 == 1);
+        assert(ne03 == 1);
+
+        if (params->ith == 0) {
+            printf("XXXXXXXXXXX src0->ne[0] = %d, src0->ne[1] = %d\n", src0->ne[0], src0->ne[1]);
+            printf("XXXXXXXXXXX src1->ne[0] = %d, src1->ne[1] = %d\n", src1->ne[0], src1->ne[1]);
+            struct ggml_mtl_object src0_mtl = { src0->id, src0->data };
+            ggml_fp16_t * src1_fp16 = params->wdata;
+            ggml_mtl_mul_mat_f16(NULL, src0_mtl, src1_fp16, dst->data, ne01, ne11, ne00);
+        }
+
+        return;
+    }
+
     // total rows in src0
     const int nr = ne01*ne02*ne03;
 
diff --git a/ggml.h b/ggml.h
index f92ae73c3ad..113fedd340e 100644
--- a/ggml.h
+++ b/ggml.h
@@ -108,7 +108,8 @@ struct ggml_tensor {
     int64_t perf_time_us;
 
     void * data;
-    char padding[8];
+    int32_t id; // TODO: mtl buffer id
+    char pad[4];
 };
 
 // computation graph
@@ -173,6 +174,12 @@ struct ggml_tensor * ggml_new_tensor_2d(
         int ne0,
         int ne1);
 
+struct ggml_tensor * ggml_new_tensor_2d_mtl(
+        struct ggml_context * ctx,
+        enum   ggml_type      type,
+        int ne0,
+        int ne1);
+
 struct ggml_tensor * ggml_new_tensor_3d(
         struct ggml_context * ctx,
         enum   ggml_type      type,
diff --git a/whisper.cpp b/whisper.cpp
index 7078863aa3e..e21214b5764 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -788,10 +788,10 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
             layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
             layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
 
-            layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, 4*n_audio_state);
+            layer.mlp_0_w = ggml_new_tensor_2d_mtl(ctx, wtype, n_audio_state, 4*n_audio_state); // offload to GPU
             layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_audio_state);
 
-            layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_audio_state, n_audio_state);
+            layer.mlp_1_w = ggml_new_tensor_2d_mtl(ctx, wtype, 4*n_audio_state, n_audio_state); // offload to GPU
             layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
 
             layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
@@ -1342,7 +1342,7 @@ static bool whisper_encode(
         ggml_build_forward_expand(&gf, inpO);
         ggml_graph_compute       (ctxL, &gf);
 
-        //ggml_graph_print(&gf);
+        ggml_graph_print(&gf);
     }
 
     // TODO: this is a hack to have per-layer computation graphs - need to come up with something better
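
Note on the matrix product in ggml_mtl_mul_mat_f16: ggml stores both operands
row-major with contiguous rows -- src0 is nrows0 x ncols and src1 is
nrows1 x ncols -- and wants dst[i1][i0] = dot(src0 row i0, src1 row i1).
Instead of repacking either operand, the kernel computes the transposed
product zT = xT * y: src1 is passed as the left matrix and src0 as the right
matrix with transposeRight:YES, so the nrows1 x nrows0 result lands in dst
already in the layout ggml expects. A minimal CPU reference of the same
contraction (plain C, f32 instead of __fp16 for portability; mul_mat_ref is a
hypothetical name, not part of the patch):

    // dst = src1 * src0^T, matching the Metal path element for element
    static void mul_mat_ref(
            const float * src0, // nrows0 x ncols, row-major
            const float * src1, // nrows1 x ncols, row-major
            float       * dst,  // nrows1 x nrows0, row-major
            int nrows0,
            int nrows1,
            int ncols) {
        for (int i1 = 0; i1 < nrows1; i1++) {
            for (int i0 = 0; i0 < nrows0; i0++) {
                float sum = 0.0f;
                for (int k = 0; k < ncols; k++) {
                    // dot product of one src0 row with one src1 row
                    sum += src1[i1*ncols + k]*src0[i0*ncols + k];
                }
                dst[i1*nrows0 + i0] = sum;
            }
        }
    }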
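On the ggml side, the only plumbing is the new id field: tensors created
through ggml_new_tensor_2d_mtl() come back with id >= 0 pointing at their
MTLBuffer slot, and ggml_compute_forward_mul_mat_f16_f32() checks that field
to route the multiply through Metal instead of BLAS. A sketch of the intended
usage, mirroring the whisper.cpp change (ctx, x, n_in and n_out are
placeholders, not part of the patch):

    // weight matrix backed by a Metal buffer; w->data points into the
    // MTLBuffer contents, so the model loader fills it as usual
    struct ggml_tensor * w = ggml_new_tensor_2d_mtl(ctx, GGML_TYPE_F16, n_in, n_out);

    // any mat-mul against w now takes the ggml_mtl_mul_mat_f16() path
    struct ggml_tensor * y = ggml_mul_mat(ctx, w, x);

One caveat: the weight buffers are created with MTLResourceStorageModeManaged
and then filled from the CPU through the pointer returned by contents, but
nothing calls -[MTLBuffer didModifyRange:] before the GPU reads them. That
happens to work on unified-memory Apple silicon; on machines with discrete
GPUs the managed copy may be stale, so shared storage (or an explicit
didModifyRange: after loading) would be the robust choice.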