Skip to content

Commit

Permalink
llama : increase inference graph size up to 4096 nodes
Browse files (browse the repository at this point in the history)
  • Loading branch information
ggerganov committed Nov 3, 2023
1 parent b1592ea commit e50ab5a
Showing 1 changed file with 11 additions and 9 deletions.
20 changes: 11 additions & 9 deletions llama.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -91,6 +91,8 @@
#define LLAMA_ATTRIBUTE_FORMAT(...)
#endif

// Node capacity requested for every inference graph: the llm_build_context
// build_* functions pass it to ggml_new_graph_custom(), and
// llama_new_context_with_model() multiplies it by ggml_tensor_overhead()
// when sizing the compute buffer.
#define LLAMA_MAX_NODES 4096

//
// logging
//
Expand Down Expand Up @@ -3580,7 +3582,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_llama() {
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

GGML_ASSERT(n_embd_head == hparams.n_rot);

Expand Down Expand Up @@ -3692,7 +3694,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_baichuan() {
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

struct ggml_tensor * cur;
struct ggml_tensor * inpL;
Expand Down Expand Up @@ -3812,7 +3814,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_falcon() {
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

struct ggml_tensor * cur;
struct ggml_tensor * inpL;
Expand Down Expand Up @@ -3934,7 +3936,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_starcoder() {
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

struct ggml_tensor * cur;
struct ggml_tensor * pos;
Expand Down Expand Up @@ -4033,7 +4035,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_persimmon() {
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

const int64_t n_rot = n_embd_head / 2;

Expand Down Expand Up @@ -4243,7 +4245,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_refact() {
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

struct ggml_tensor * cur;
struct ggml_tensor * inpL;
Expand Down Expand Up @@ -4334,7 +4336,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_bloom() {
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

struct ggml_tensor * cur;
struct ggml_tensor * inpL;
Expand Down Expand Up @@ -4428,7 +4430,7 @@ struct llm_build_context {
}

struct ggml_cgraph * build_mpt() {
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

struct ggml_tensor * cur;
struct ggml_tensor * inpL;
Expand Down Expand Up @@ -8169,7 +8171,7 @@ struct llama_context * llama_new_context_with_model(
{
static const size_t tensor_alignment = 32;
// the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead());
ctx->buf_compute.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());

// create measure allocator
ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
Expand Down

0 comments on commit e50ab5a

Please sign in to comment.