whisper : Metal and ggml-alloc support #1270

Merged Sep 15, 2023 · 44 commits

Changes shown from 1 commit: 0d5e4cd (whisper : clean-up ggml_mul_mat_pad)

Commits (all by ggerganov, Sep 10 to Sep 15, 2023):
fbc3f80  metal : init  (Sep 10)
949ab63  whisper : factor out graph builds  (Sep 10)
bed5ad6  whisper : allocate encoder and decoder using ggml-alloc  (Sep 10)
af6f67b  whisper : ggml-alloc is now supported  (Sep 10)
fa672b4  whisper : CoreML support ggml-alloc  (Sep 10)
794e8fe  build : fix ggml-alloc  (Sep 10)
9a78b72  ios : update submodule  (Sep 10)
06d1d28  extra : update sync-ggml.sh script to also sync ggml-alloc  (Sep 10)
4d9acc6  ci : see if this is causing the crash  (Sep 10)
2770d46  whisper : refactor ggml-alloc init  (Sep 11)
4845b9e  whisper.android : try to fix build  (Sep 11)
d3b2dd4  whisper : initial Metal version  (Sep 11)
de94c78  Merge branch 'master' into metal-and-alloc  (Sep 12)
3b9979a  ci : try to debug vmem issue  (Sep 12)
fbc9ddc  metal : decoder works on GPU!  (Sep 12)
79a8805  metal : add multi-decoder support  (Sep 12)
9fdd415  ggml : fix ggml_nbytes (probably temp solution)  (Sep 12)
cd47637  metal : run "cross" step on the GPU  (Sep 12)
ec9a7db  whisper : remove ggml_repeat in the encoder  (Sep 12)
3074a7f  whisper : offload the Encoder to Metal  (Sep 12)
905c944  ggml : use simpler ggml_bytes() implementation  (Sep 13)
b19888c  ggml-alloc : try to make CI happy by reducing vram to 128GB  (Sep 13)
254b687  whisper : add whisper_allocr to wrap ggml_allocr  (Sep 13)
b6f0966  whisper : factor out alloc init in a function  (Sep 13)
77f4bf4  cmake : update to support Metal build  (Sep 13)
796f84c  whisper : add <functional> header  (Sep 13)
181bb8c  objc : fix build (no Metal yet)  (Sep 13)
257d794  ios : add Metal support  (Sep 13)
16db4da  swiftui : fix build  (Sep 13)
8e8daa8  metal : speed-up KQ multiplication  (Sep 13)
ecb23fb  metal : sync latest llama.cpp kernels  (Sep 13)
23277d2  readme : add Metal info  (Sep 13)
d37f56e  ios : update submodule  (Sep 13)
d863f72  coreml : add code to toggle Core ML config (CPU, ANE, GPU)  (Sep 13)
f408c64  bench : fix timings by running a pre-heat  (Sep 13)
e81c67a  bench : start benching the decoder  (Sep 14)
af947cb  whisper : add ggml_mul_mat_pad  (Sep 14)
c46167f  bench : fix uninitialized vars  (Sep 14)
f365543  whisper : add comment for disabling mul-mat padding  (Sep 14)
2b4160a  whisper : add description of ggml_mul_mat_pad  (Sep 14)
0d5e4cd  whisper : clean-up ggml_mul_mat_pad  (Sep 14)
bfcb2a2  metal : remove the "concurrent" flag  (Sep 14)
a166457  bench : variable n_past  (Sep 14)
3ac0558  ios : update SPM package  (Sep 15)
whisper : clean-up ggml_mul_mat_pad
ggerganov committed Sep 14, 2023

commit 0d5e4cdc36d8e80fcb9858300eac034fdddc7b60
2 changes: 1 addition & 1 deletion ggml-metal.h

@@ -77,7 +77,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
 
 // same as ggml_graph_compute but uses Metal
 // creates gf->n_threads command buffers in parallel
-void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool concurrent);
 
 #ifdef __cplusplus
 }
7 changes: 4 additions & 3 deletions ggml-metal.m

@@ -610,14 +610,15 @@ void ggml_metal_graph_find_concurrency(
 
 void ggml_metal_graph_compute(
         struct ggml_metal_context * ctx,
-             struct ggml_cgraph * gf) {
+             struct ggml_cgraph * gf,
+                           bool concurrent) {
     @autoreleasepool {
 
     // if there is ctx->concur_list, dispatch concurrently
     // else fallback to serial dispatch
     MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
 
-    const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_CONCUR;
+    const bool has_concur = concurrent && ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_CONCUR;
 
    const int n_nodes  = has_concur ? ctx->concur_list_len : gf->n_nodes;
    edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial;
@@ -927,7 +928,7 @@ void ggml_metal_graph_compute(
                    [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
                //} else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
                } else if (false) {
-                   // TODO: with the ggml_cont(ctx0, Q), this kernel is no longer useful
+                   // TODO: with ggml_mul_mat_pad this kernel no longer seems to be needed
                    [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4];
                    nrows = ne11;
                } else {
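
For orientation, a minimal sketch of a call site under the new signature, mirroring the whisper.cpp hunks below (the names ctx_metal, gf and n_threads are assumed for illustration, not part of this diff):

    // dispatch the graph on Metal; pass false to force serial dispatch, or
    // true to allow concurrent dispatch, which takes effect only when a
    // concurrency list was built via ggml_metal_graph_find_concurrency()
    // and its length fits within GGML_MAX_CONCUR
    ggml_metal_set_n_cb     (ctx_metal, n_threads);
    ggml_metal_graph_compute(ctx_metal, gf, /*concurrent =*/ false);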
69 changes: 37 additions & 32 deletions whisper.cpp

@@ -141,20 +141,20 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 //
 // Z = X @ Y
 //
-// with two matrix multiplications:
+// with the sum of two matrix multiplications:
 //
-// Z = [X_0; X_1] @ [Y_0; Y_1]
+// Z = (X_0 @ Y_0) + (X_1 @ Y_1)
 //
 // here X_0 and Y_0 are views of X and Y that have dimension 0 divisible by "pad"
 // and X_1 and Y_1 are the remaining views. X_1 and Y_1 end up being small matrices that can be processed with more
 // general-purpose kernels
 //
 static struct ggml_tensor * ggml_mul_mat_pad(struct ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * y, int pad = 32) {
-//#if !defined(GGML_USE_METAL)
-//    return ggml_mul_mat(ctx, x, y);
-//#endif
+    // use padding only if dimension 0 is at least 8 times larger than the padding
+    // else we won't get much benefit from the optimization
+    const int n_pad_req = 8;
 
-    if (x->ne[0] % pad == 0 || x->ne[0] / pad < 2) {
+    if (x->ne[0] % pad == 0 || x->ne[0] / pad < n_pad_req) {
         return ggml_mul_mat(ctx, x, y);
     }
 
@@ -169,6 +169,11 @@ static struct ggml_tensor * ggml_mul_mat_pad(struct ggml_context * ctx, struct g
             ggml_mul_mat(ctx, x_1, y_1));
 }
 
+// TODO: check if other platforms can benefit from this optimization
+#if defined(GGML_USE_METAL)
+#define ggml_mul_mat ggml_mul_mat_pad
+#endif
+
 // available whisper models
 enum e_model {
     MODEL_UNKNOWN,
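
The body of ggml_mul_mat_pad between the guard and the final sum is not shown in this commit's hunks. For readers following along, here is a minimal sketch of the split the comments describe, assuming the views are taken along dimension 0 (the shared inner dimension of ggml_mul_mat); the function name and exact view arithmetic are illustrative, not the committed code:

    // illustrative sketch only; mirrors the documented idea Z = (X_0 @ Y_0) + (X_1 @ Y_1)
    static struct ggml_tensor * mul_mat_pad_sketch(struct ggml_context * ctx,
            struct ggml_tensor * x, struct ggml_tensor * y, int pad = 32) {
        const int n_pad_req = 8;

        // splitting brings no benefit if dim 0 is already pad-aligned or too small
        if (x->ne[0] % pad == 0 || x->ne[0] / pad < n_pad_req) {
            return ggml_mul_mat(ctx, x, y);
        }

        // e.g. pad = 32, ne[0] = 1500 -> ne0a = 1472 (pad-aligned part), ne0b = 28 (remainder)
        const int64_t ne0a = x->ne[0] - x->ne[0] % pad;
        const int64_t ne0b = x->ne[0] % pad;

        // X_0/Y_0: pad-aligned views of dim 0; X_1/Y_1: the small remainder views
        struct ggml_tensor * x_0 = ggml_view_2d(ctx, x, ne0a, x->ne[1], x->nb[1], 0);
        struct ggml_tensor * y_0 = ggml_view_2d(ctx, y, ne0a, y->ne[1], y->nb[1], 0);
        struct ggml_tensor * x_1 = ggml_view_2d(ctx, x, ne0b, x->ne[1], x->nb[1], ne0a*ggml_element_size(x));
        struct ggml_tensor * y_1 = ggml_view_2d(ctx, y, ne0b, y->ne[1], y->nb[1], ne0a*ggml_element_size(y));

        // Z = (X_0 @ Y_0) + (X_1 @ Y_1)
        return ggml_add(ctx,
                ggml_mul_mat(ctx, x_0, y_0),
                ggml_mul_mat(ctx, x_1, y_1));
    }

Note the #define added above: with GGML_USE_METAL defined, every later ggml_mul_mat call in whisper.cpp expands to ggml_mul_mat_pad. That is why the remaining hunks in this commit revert the explicit ggml_mul_mat_pad call sites back to plain ggml_mul_mat.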
@@ -1659,7 +1664,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
 
         // self-attention
         {
-            struct ggml_tensor * Qcur = ggml_mul_mat_pad(ctx0,
+            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
                     layer.attn_q_w,
                     cur);
 
@@ -1668,13 +1673,13 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
             //Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
             // note: no bias for Key
-            struct ggml_tensor * Kcur = ggml_mul_mat_pad(ctx0,
+            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
                     layer.attn_k_w,
                     cur);
 
             //Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
-            struct ggml_tensor * Vcur = ggml_mul_mat_pad(ctx0,
+            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
                     layer.attn_v_w,
                     cur);
 
@@ -1723,7 +1728,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
                         0, 2, 1, 3);
 
             // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat_pad(ctx0, K, Q);
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
 
             struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQscale);
 
@@ -1739,7 +1744,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
                         ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head)
                     );
 
-            struct ggml_tensor * KQV = ggml_mul_mat_pad(ctx0, V, KQ_soft_max);
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
 #endif
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
 
@@ -1750,7 +1755,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
 
         // projection
         {
-            cur = ggml_mul_mat_pad(ctx0,
+            cur = ggml_mul_mat(ctx0,
                     layer.attn_ln_1_w,
                     cur);
 
@@ -1780,7 +1785,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
                 layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
 #else
             // fully connected
-            cur = ggml_mul_mat_pad(ctx0,
+            cur = ggml_mul_mat(ctx0,
                     layer.mlp_0_w,
                     cur);
 
@@ -1790,7 +1795,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
             cur = ggml_gelu(ctx0, cur);
 
             // projection
-            cur = ggml_mul_mat_pad(ctx0,
+            cur = ggml_mul_mat(ctx0,
                     layer.mlp_1_w,
                     cur);
 
@@ -1868,13 +1873,13 @@ static struct ggml_cgraph * whisper_build_graph_cross(
     for (int il = 0; il < model.hparams.n_text_layer; ++il) {
         auto & layer = model.layers_decoder[il];
 
-        struct ggml_tensor* Kcross = ggml_mul_mat_pad(ctx0,
+        struct ggml_tensor* Kcross = ggml_mul_mat(ctx0,
                 layer.cross_attn_k_w,
                 cur);
 
         Kcross = ggml_scale(ctx0, Kcross, Kscale);
 
-        struct ggml_tensor* Vcross = ggml_mul_mat_pad(ctx0,
+        struct ggml_tensor* Vcross = ggml_mul_mat(ctx0,
                 layer.cross_attn_v_w,
                 cur);
 
@@ -1948,7 +1953,7 @@ static bool whisper_encode_internal(
 #ifdef GGML_USE_METAL
     if (wstate.ctx_metal) {
         ggml_metal_set_n_cb     (wstate.ctx_metal, n_threads);
-        ggml_metal_graph_compute(wstate.ctx_metal, gf);
+        ggml_metal_graph_compute(wstate.ctx_metal, gf, false);
     } else {
         ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads);
     }
@@ -1970,7 +1975,7 @@ static bool whisper_encode_internal(
 #ifdef GGML_USE_METAL
     if (wstate.ctx_metal) {
         ggml_metal_set_n_cb     (wstate.ctx_metal, n_threads);
-        ggml_metal_graph_compute(wstate.ctx_metal, gf);
+        ggml_metal_graph_compute(wstate.ctx_metal, gf, false);
     } else {
         ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads);
     }
@@ -2071,7 +2076,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
 
         // self-attention
        {
-            struct ggml_tensor * Qcur = ggml_mul_mat_pad(ctx0,
+            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
                     layer.attn_q_w,
                     cur);
 
@@ -2082,15 +2087,15 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
             Qcur = ggml_scale(ctx0, Qcur, KQscale);
 
             // note: no bias for Key
-            struct ggml_tensor * Kcur = ggml_mul_mat_pad(ctx0,
+            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
                     layer.attn_k_w,
                     cur);
 
             Kcur = ggml_scale(ctx0, Kcur, KQscale);
 
             // store key and value to memory
             {
-                struct ggml_tensor * Vcur = ggml_mul_mat_pad(ctx0,
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
                         layer.attn_v_w,
                         cur);
 
@@ -2124,7 +2129,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
                         ggml_element_size(kv_self.k)*n_state*n_ctx*il);
 
             // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat_pad(ctx0, K, Q);
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
 
             //struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
 
@@ -2139,7 +2144,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
                         n_ctx*ggml_element_size(kv_self.v)*n_state/n_head,
                         il*n_ctx*ggml_element_size(kv_self.v)*n_state);
 
-            struct ggml_tensor * KQV = ggml_mul_mat_pad(ctx0, V, KQ_soft_max);
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
 
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
 
@@ -2150,7 +2155,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
 
         // projection
         {
-            cur = ggml_mul_mat_pad(ctx0,
+            cur = ggml_mul_mat(ctx0,
                     layer.attn_ln_1_w,
                     cur);
 
@@ -2176,7 +2181,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
 
         // cross-attention
         {
-            struct ggml_tensor * Qcur = ggml_mul_mat_pad(ctx0,
+            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
                     layer.cross_attn_q_w,
                     cur);
 
@@ -2219,7 +2224,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
                         0, 2, 1, 3);
 
             // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat_pad(ctx0, Kcross, Q);
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, Kcross, Q);
 
             //struct ggml_tensor * KQ_scaled =
             //    ggml_scale(ctx0,
@@ -2232,7 +2237,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
 
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);
 
-            struct ggml_tensor * KQV = ggml_mul_mat_pad(ctx0, V, KQ_soft_max);
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
 
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
 
@@ -2244,7 +2249,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
 
         // projection
         {
-            cur = ggml_mul_mat_pad(ctx0,
+            cur = ggml_mul_mat(ctx0,
                     layer.cross_attn_ln_1_w,
                     cur);
 
@@ -2273,7 +2278,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
         }
 
         // fully connected
-        cur = ggml_mul_mat_pad(ctx0,
+        cur = ggml_mul_mat(ctx0,
                 layer.mlp_0_w,
                 cur);
 
@@ -2285,7 +2290,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
         cur = ggml_gelu(ctx0, cur);
 
         // projection
-        cur = ggml_mul_mat_pad(ctx0,
+        cur = ggml_mul_mat(ctx0,
                 layer.mlp_1_w,
                 cur);
 
@@ -2315,7 +2320,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
     // might be useful in the future
     cur = ggml_view_2d(ctx0, cur, cur->ne[0], 1, cur->nb[1], (cur->ne[1] - 1)*cur->nb[1]);
 
-    struct ggml_tensor * logits = ggml_mul_mat_pad(ctx0, model.d_te, cur);
+    struct ggml_tensor * logits = ggml_mul_mat(ctx0, model.d_te, cur);
 
     ggml_build_forward_expand(gf, logits);
 
@@ -2368,7 +2373,7 @@ static bool whisper_decode_internal(
 #ifdef GGML_USE_METAL
     if (wstate.ctx_metal) {
         ggml_metal_set_n_cb     (wstate.ctx_metal, n_threads);
-        ggml_metal_graph_compute(wstate.ctx_metal, gf);
+        ggml_metal_graph_compute(wstate.ctx_metal, gf, false);
     } else {
         ggml_graph_compute_helper(wstate.work_buffer, gf, n_threads);
     }