From 38b16dfca6e5032e6cfb90c1653bf1ba4cf647b4 Mon Sep 17 00:00:00 2001 From: Shouzheng Liu Date: Thu, 24 Aug 2023 12:27:25 -0400 Subject: [PATCH] metal : bug-fix when enable ggml-alloc (#2757) * metal: better memory alloc w/ concurrency dispatch The ggml-alloc should only free tensors at memory barriers. * ggml-alloc: avoid return silently In certain cases, the allocate_node() function may silently return without performing any memory allocation. --- ggml-alloc.c | 135 ++++++++++++++++++++++++++++----------------------- llama.cpp | 5 -- 2 files changed, 74 insertions(+), 66 deletions(-) diff --git a/ggml-alloc.c b/ggml-alloc.c index 547ec0399fdb5..af4affa4e018c 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -68,7 +68,7 @@ struct ggml_allocr { size_t max_size; bool measure; int parse_seq[GGML_MAX_NODES]; - bool has_parse_seq; + int parse_seq_len; #ifdef GGML_ALLOCATOR_DEBUG struct ggml_tensor * allocated_tensors[1024]; @@ -239,14 +239,10 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t } void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) { - int pos = 0; for (int i = 0; i < n; i++) { - if (list[i] != -1) { - alloc->parse_seq[pos] = list[i]; - pos++; - } + alloc->parse_seq[i] = list[i]; } - alloc->has_parse_seq = true; + alloc->parse_seq_len = n; } void ggml_allocr_reset(struct ggml_allocr * alloc) { @@ -269,7 +265,7 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) /*.max_size = */ 0, /*.measure = */ false, /*.parse_seq = */ {0}, - /*.has_parse_seq = */ false, + /*.parse_seq_len = */ 0, #ifdef GGML_ALLOCATOR_DEBUG /*.allocated_tensors = */ = {0}, #endif @@ -298,7 +294,7 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) { /*.max_size = */ 0, /*.measure = */ true, /*.parse_seq = */ {0}, - /*.has_parse_seq = */ false, + /*.parse_seq_len = */ 0, #ifdef GGML_ALLOCATOR_DEBUG /*.allocated_tensors = */ = {0}, #endif @@ -445,8 +441,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) else { AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); node->data = parent->data; + return; } - return; } } } @@ -497,69 +493,86 @@ static size_t ggml_allocator_alloc_graph_tensors_n( allocate_node(alloc, input); } } - for (int ind = 0; ind < gf->n_nodes; ind++) { - int i; - if (alloc->has_parse_seq) { - i = alloc->parse_seq[ind]; - } else { - i = ind; - } - struct ggml_tensor * node = gf->nodes[i]; - - // allocate parents (leafs) - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - break; + // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers + int last_barrier_pos = 0; + int n_nodes = alloc->parse_seq_len ? alloc->parse_seq_len : gf->n_nodes; + + for (int ind = 0; ind < n_nodes; ind++) { + // allocate a node if there is no parse_seq or this is not a barrier + if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] != -1) { + int i = alloc->parse_seq_len ? alloc->parse_seq[ind] : ind; + struct ggml_tensor * node = gf->nodes[i]; + + // allocate parents (leafs) + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + allocate_node(alloc, parent); } - allocate_node(alloc, parent); - } - // allocate node - allocate_node(alloc, node); + // allocate node + allocate_node(alloc, node); - AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name); - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - break; - } - AT_PRINTF("%s", parent->name); - if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) { - AT_PRINTF(", "); + AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name); + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + AT_PRINTF("%s", parent->name); + if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) { + AT_PRINTF(", "); + } } + AT_PRINTF("\n"); } - AT_PRINTF("\n"); + // update parents - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - break; - } - struct hash_node * p_hn = hash_get(ht, parent); - p_hn->n_children -= 1; - - //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views); - - if (p_hn->n_children == 0 && p_hn->n_views == 0) { - if (ggml_is_view(parent)) { - struct ggml_tensor * view_src = get_view_source(parent); - struct hash_node * view_src_hn = hash_get(ht, view_src); - view_src_hn->n_views -= 1; - AT_PRINTF("view_src %s\n", view_src->name); - if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) { - ggml_allocator_free_tensor(alloc, view_src); + // update immediately if there is no parse_seq + // update only at barriers if there is parse_seq + if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) { + int update_start = alloc->parse_seq_len ? last_barrier_pos : ind; + int update_end = alloc->parse_seq_len ? ind : ind + 1; + for (int i = update_start; i < update_end; i++) { + int node_i = alloc->parse_seq_len ? alloc->parse_seq[i] : i; + struct ggml_tensor * node = gf->nodes[node_i]; + + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; } - } - else { - if (parent->data != node->data) { - ggml_allocator_free_tensor(alloc, parent); + struct hash_node * p_hn = hash_get(ht, parent); + p_hn->n_children -= 1; + + //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views); + + if (p_hn->n_children == 0 && p_hn->n_views == 0) { + if (ggml_is_view(parent)) { + struct ggml_tensor * view_src = get_view_source(parent); + struct hash_node * view_src_hn = hash_get(ht, view_src); + view_src_hn->n_views -= 1; + AT_PRINTF("view_src %s\n", view_src->name); + if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) { + ggml_allocator_free_tensor(alloc, view_src); + } + } + else { + if (parent->data != node->data) { + ggml_allocator_free_tensor(alloc, parent); + } + } } } } + AT_PRINTF("\n"); + if (alloc->parse_seq_len) { + last_barrier_pos = ind + 1; + } } - AT_PRINTF("\n"); } // free graph outputs here that wouldn't be freed otherwise because they have no children if (outputs != NULL && outputs[g] != NULL) { diff --git a/llama.cpp b/llama.cpp index 7ee6bcdae6aca..b5266c1e19a60 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2707,11 +2707,6 @@ static struct ggml_cgraph * llm_build_falcon( struct ggml_tensor * inpFF = attn_norm; cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF); - - // TODO: this is temporary needed to introduce artificial dependency between FF and ATTN - // adding this, because there seems to be a bug in the Metal concurrency optimization - // without this line, the results are non-deterministic and wrong - cur->src[2] = attn_out; offload_func(cur); cur = ggml_gelu(ctx0, cur);