diff --git a/coreml/whisper-encoder.mm b/coreml/whisper-encoder.mm
index 69c1484c31e..499edaed434 100644
--- a/coreml/whisper-encoder.mm
+++ b/coreml/whisper-encoder.mm
@@ -24,8 +24,8 @@
 
     // select which device to run the Core ML model on
     MLModelConfiguration *config = [[MLModelConfiguration alloc] init];
-    //config.computeUnits = MLComputeUnitsCPUAndGPU;
-    config.computeUnits = MLComputeUnitsCPUAndNeuralEngine;
+    config.computeUnits = MLComputeUnitsCPUAndGPU;
+    //config.computeUnits = MLComputeUnitsCPUAndNeuralEngine;
     //config.computeUnits = MLComputeUnitsAll;
 
     const void * data = CFBridgingRetain([[whisper_encoder_impl alloc] initWithContentsOfURL:url_model configuration:config error:nil]);
diff --git a/whisper.cpp b/whisper.cpp
index e607c32865e..5e0714b7b25 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -136,6 +136,19 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
     ggml_graph_compute(graph, &plan);
 }
 
+// faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
+// the idea is to represent the original matrix multiplication:
+//
+//   Z = X @ Y
+//
+// with the sum of two matrix multiplications:
+//
+//   Z = (X_0 @ Y_0) + (X_1 @ Y_1)
+//
+// here X_0 and Y_0 are views of X and Y that have dimension 0 divisible by "pad"
+// and X_1 and Y_1 are the remaining views. X_1 and Y_1 end up being small matrices that can be processed with more
+// general-purpose kernels
+//
 static struct ggml_tensor * ggml_mul_mat_pad(struct ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * y, int pad = 32) {
 //#if !defined(GGML_USE_METAL)
 //    return ggml_mul_mat(ctx, x, y);
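
The hunk above is truncated at the start of ggml_mul_mat_pad's body. As a reference point, here is a minimal sketch of how the decomposition described in the comment could be expressed with standard ggml ops (ggml_view_3d, ggml_mul_mat, ggml_add). The early-out condition and the view arithmetic are illustrative assumptions, not necessarily the body this PR ships:

// Sketch only: assumes ggml_mul_mat contracts over dimension 0 and that
// x and y are 3D tensors whose rows are contiguous (nb[0] == element size).
static struct ggml_tensor * ggml_mul_mat_pad(struct ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * y, int pad = 32) {
    // dimension 0 already a multiple of "pad" -> nothing to split
    if (x->ne[0] % pad == 0) {
        return ggml_mul_mat(ctx, x, y);
    }

    const int64_t n0 = (x->ne[0]/pad)*pad; // largest multiple of "pad" that fits
    const int64_t n1 =  x->ne[0] - n0;     // small remainder, left to generic kernels

    // X_0, Y_0: views over the first n0 elements of each row (dim 0 divisible by "pad")
    struct ggml_tensor * x_0 = ggml_view_3d(ctx, x, n0, x->ne[1], x->ne[2], x->nb[1], x->nb[2], 0);
    struct ggml_tensor * y_0 = ggml_view_3d(ctx, y, n0, y->ne[1], y->ne[2], y->nb[1], y->nb[2], 0);

    // X_1, Y_1: views over the trailing n1 elements (byte offset n0*nb[0])
    struct ggml_tensor * x_1 = ggml_view_3d(ctx, x, n1, x->ne[1], x->ne[2], x->nb[1], x->nb[2], n0*x->nb[0]);
    struct ggml_tensor * y_1 = ggml_view_3d(ctx, y, n1, y->ne[1], y->ne[2], y->nb[1], y->nb[2], n0*y->nb[0]);

    // Z = (X_0 @ Y_0) + (X_1 @ Y_1)
    return ggml_add(ctx,
            ggml_mul_mat(ctx, x_0, y_0),
            ggml_mul_mat(ctx, x_1, y_1));
}

Splitting along dimension 0 is valid because ggml_mul_mat contracts over dimension 0, so the partial products over [0, n0) and [n0, ne[0]) sum elementwise to the full product.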