Speedup of up to 300% on large context (ggerganov#58)
The KV cache is now cyclically split into a permuted V variant.
The ggml_tensor_print function has been completely reworked to properly output 1-4 dimensional tensors along with their data.
Example:
```
+======================+======================+======================+======================+
| :0
| V                                [f32 type]
+----------------------+----------------------+----------------------+----------------------+
| Dimensions           | Strides              | Layer id             | Backend              |
| 3                    | 4x16x1024            | 0                    | CPU                  |
+----------------------+----------------------+----------------------+----------------------+
| Elements             | Src0                 | Src1                 | Operation            |
| 4 x 64 x 2           | 4 x 64 x 2           | N/A                  | CONT                 |
+----------------------+----------------------+----------------------+----------------------+
| Transposed:      No  | Permuted:        No  | Contiguous:      Yes | Size:        0.00 MB |
| Src0 name:           | cache_v (view) (permuted)                                          |
+----------------------+----------------------+----------------------+----------------------+

+-------------------------------------------------------------------------------------------+
| Content of src0 "cache_v (view) (permuted)" (3 dim)

+-------------------------------------------------------------------------------------------+
| Content of src0 "cache_v (view) (permuted)" (3 dim)
| Total Elements : [ Row:4   Col:64  Layer:2   ]
+-------------------------------------------------------------------------------------------+
| Row 1: [0.302  , 0.010  ] [-0.238 , 0.680  ] [0.305  , 0.206  ] [-0.013 , 0.436  ] [-0.074 , -0.698 ] [-0.153 , -0.067 ]
| Row 2: [0.091  , 0.199  ] [0.253  , 0.151  ] [-0.557 , 0.089  ] [0.298  , -0.272 ] [-0.149 , 0.232  ] [-0.217 , 0.193  ]
| Row 3: [-0.085 , -0.014 ] [0.225  , 0.089  ] [-0.338 , 0.072  ] [0.416  , -0.186 ] [-0.071 , 0.110  ] [0.467  , 0.497  ]
| Row 4: [-0.336 , 0.471  ] [-0.144 , 0.070  ] [-0.062 , 0.520  ] [0.093  , 0.217  ] [-0.332 , -0.205 ] [0.012  , 0.335  ]
+-------------------------------------------------------------------------------------------+
+-------------------------------------------------------------------------------------------+
| Content of dst "V" (3 dim)

+-------------------------------------------------------------------------------------------+
| Content of dst "V" (3 dim)
| Total Elements : [ Row:4   Col:64  Layer:2   ]
+-------------------------------------------------------------------------------------------+
| Row 1: [0.302  , 0.010  ] [-0.238 , 0.680  ] [0.305  , 0.206  ] [-0.013 , 0.436  ] [-0.074 , -0.698 ] [-0.153 , -0.067 ]
| Row 2: [0.091  , 0.199  ] [0.253  , 0.151  ] [-0.557 , 0.089  ] [0.298  , -0.272 ] [-0.149 , 0.232  ] [-0.217 , 0.193  ]
| Row 3: [-0.085 , -0.014 ] [0.225  , 0.089  ] [-0.338 , 0.072  ] [0.416  , -0.186 ] [-0.071 , 0.110  ] [0.467  , 0.497  ]
| Row 4: [-0.336 , 0.471  ] [-0.144 , 0.070  ] [-0.062 , 0.520  ] [0.093  , 0.217  ] [-0.332 , -0.205 ] [0.012  , 0.335  ]
+-------------------------------------------------------------------------------------------+
+======================+======================+======================+======================+
```
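For reference, a minimal sketch of driving the reworked sampler directly. The tensor setup below is illustrative and not part of this commit; it assumes ggml_printTensorSample is visible to the caller, for example through a local prototype:
```
#include <stdio.h>
#include "ggml.h"

// prototype for the reworked sampler in ggml.c (assumed visible to the caller)
void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tensor);

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // small 2-dim f32 tensor; the values are dummies for display only
    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4);
    ggml_set_name(t, "demo");
    float * data = (float *) t->data;
    for (int i = 0; i < 8*4; i++) data[i] = i / 10.0f;

    ggml_printTensorSample("demo", t);

    ggml_free(ctx);
    return 0;
}
```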
cmp-nct authored Jul 13, 2023
1 parent 09f2184 commit 2cae279
Showing 4 changed files with 166 additions and 64 deletions.
2 changes: 1 addition & 1 deletion examples/falcon/falcon_main.cpp
@@ -659,7 +659,7 @@ fprintf(stderr, "+------------+-------+-------+-------+-------+---------------+-
             {
                 n_regen = 4;
             }
-            if (n_regen > all_generation.size()-embd.size()) n_regen = (int)all_generation.size()-embd.size();
+            if (n_regen > all_generation.size()-embd.size()) n_regen = (int)all_generation.size()-(int)embd.size();
 
             // add right sided part of all_generation storage if we still have room remaining
             if (n_regen)
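The added (int) cast matters because all_generation.size() and embd.size() are unsigned size_t values: subtracting them in unsigned arithmetic wraps around to a huge positive number whenever embd.size() is the larger of the two. A standalone illustration of the failure mode (not code from the repository):
```
#include <stdio.h>
#include <stddef.h>

int main(void) {
    size_t a = 3, b = 5;
    printf("%zu\n", a - b);          // unsigned wraparound: a huge positive value
    printf("%d\n", (int)a - (int)b); // -2, the intended signed result
    return 0;
}
```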
114 changes: 68 additions & 46 deletions ggml.c
@@ -19540,60 +19540,80 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
 
 ////////////////////////////////////////////////////////////////////////////////
 
-void ggml_printTensorSample(char *prefix,const struct ggml_tensor * tensor) {
+void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tensor) {
     const char *sep = "+-------------------------------------------------------------------------------------------+\n";
-    printf("%s", sep);
-    printf("| Content of %s \"%s\" (%d dim)",prefix,tensor->name,tensor->n_dims);
-    printf("\n");
-    const int max_elements = 40000;
+    printf("%s| Content of %s \"%s\" (%d dim)\n", sep, prefix, tensor->name, tensor->n_dims);
 
-    if (tensor->n_dims == 1) {
-        printf("| ");
-        for(int i = 0; i < tensor->ne[0] && i < max_elements; i++){
-            printf("%-20f ", (double) *(float *)((char *) tensor->data + i*tensor->nb[0]));
-        }
-        printf("|");
-        printf("\n");
-        printf("%s", sep);
-    }
+    const int MAX_ELEMENTS_ROW = 10;
+    const int MAX_ELEMENTS_COL = 6;
+    const int MAX_ELEMENTS_LAYER = 3; // layered
+    const int MAX_ELEMENTS_BATCH = 2; // repeated display
+    const char *dimensionLabels[] = {"Row", "Col", "Layer", "Batch"};
+
+    printf("\n%s| Content of %s \"%s\" (%d dim)\n", sep, prefix, tensor->name, tensor->n_dims);
+    printf("| Total Elements : [ ");
+    for (int i = 0; i < tensor->n_dims; i++)
+        printf("%s:%-3" PRId64 " ", dimensionLabels[i], tensor->ne[i]);
+    printf("]\n%s", sep);
+
+    if (tensor->n_dims == 1) {
+        printf("| 1: ");
+        for(int i = 0; i < tensor->ne[0] && i < MAX_ELEMENTS_ROW; i++){
+            printf("%-7.3f, ", *(float *)((char *) tensor->data + i*tensor->nb[0]));
+        }
+        if(MAX_ELEMENTS_ROW < tensor->ne[0]) printf(", ..");
+        printf("\n%s", sep);
+    }
     else if (tensor->n_dims == 2) {
-        for(int i = 0; i < tensor->ne[0] && i < max_elements; i++){
-            printf("| ");
-            for(int j = 0; j < tensor->ne[1] && j < max_elements; j++){
-                printf("%-20f ", (double) *(float *)((char *) tensor->data + i*tensor->nb[0] + j*tensor->nb[1]));
+        for(int i = 0; i < tensor->ne[0] && i < MAX_ELEMENTS_ROW; i++){
+            printf("| %d: ", i+1);
+            for(int j = 0; j < tensor->ne[1] && j < MAX_ELEMENTS_COL; j++){
+                printf("%-7.3f ", *(float *)((char *) tensor->data + i*tensor->nb[0] + j*tensor->nb[1]));
+                if(j == MAX_ELEMENTS_COL - 1 && tensor->ne[1] > MAX_ELEMENTS_COL) printf(", ..");
             }
-            printf("|");
             printf("\n");
         }
-        printf("%s", sep);
-    }
-    else if(tensor->n_dims == 3) {
-        for(int i = 0; i < tensor->ne[0] && i < 3; i++){
-            printf("Layer %d\n", i);
-            for(int j = 0; j < tensor->ne[1] && j < max_elements; j++){
-                printf("| ");
-                for(int k = 0; k < tensor->ne[2] && k < max_elements; k++){
-                    printf("%-20f ", (double) *(float *)((char *) tensor->data + i*tensor->nb[0] + j*tensor->nb[1] + k*tensor->nb[2]));
-                }
-                printf("|\n");
-            }
-            printf("%s\n", sep);
-        }
-    }
-    else if(tensor->n_dims == 4){
-        for(int i = 0; i < tensor->ne[0] && i < 3; i++){
-            printf("Batch %d\n", i);
-            for(int j = 0; j < tensor->ne[1] && j < 3; j++){
-                printf("Layer %d\n", j);
-                for(int k = 0; k < tensor->ne[2] && k < max_elements; k++){
-                    printf("| ");
-                    for(int l = 0; l < tensor->ne[3] && l < 3; l++){
-                        printf("%-20f ", (double) *(float *)((char *) tensor->data + i*tensor->nb[0] + j*tensor->nb[1] + k*tensor->nb[2] + l*tensor->nb[3]));
-                    }
-                    printf("|\n");
-                }
-                printf("%s\n", sep);
-            }
-            printf("\n");
-        }
-    }
+        if(MAX_ELEMENTS_ROW < tensor->ne[0]) printf(" .. additional rows\n");
+        printf("%s", sep);
+    }else if(tensor->n_dims == 3) {
+        for(int i = 0; i < tensor->ne[0] && i < MAX_ELEMENTS_ROW; i++){
+            printf("| Row %d: ", i+1);
+            for(int j = 0; j < tensor->ne[1] && j < MAX_ELEMENTS_COL; j++){
+                printf("[");
+                for(int k = 0; k < tensor->ne[2] && k < MAX_ELEMENTS_LAYER; k++){
+                    printf("%-7.3f", *(float *)((char *) tensor->data + i*tensor->nb[0] + j*tensor->nb[1] + k*tensor->nb[2]));
+                    if(k < tensor->ne[2] - 1 && k < MAX_ELEMENTS_LAYER - 1)
+                        printf(", ");
+                }
+                if(MAX_ELEMENTS_LAYER < tensor->ne[2]) printf(", ..");
+                printf("] ");
+            }
+            printf("\n");
+        }
+        if(MAX_ELEMENTS_ROW < tensor->ne[0]) printf(" ... additional layers\n");
+        printf("%s", sep);
+    }
+
+    // For 4D tensor
+    else if(tensor->n_dims == 4) {
+        for(int batch = 0; batch < tensor->ne[0] && batch < MAX_ELEMENTS_BATCH; batch++){
+            printf("Batch %d\n", batch+1);
+            for(int i = 0; i < tensor->ne[1] && i < MAX_ELEMENTS_ROW; i++){
+                printf("| Row %d: ", i+1);
+                for(int j = 0; j < tensor->ne[2] && j < MAX_ELEMENTS_COL; j++){
+                    printf("[");
+                    for(int k = 0; k < tensor->ne[3] && k < MAX_ELEMENTS_LAYER; k++){
+                        printf("%-7.3f", *(float *)((char *) tensor->data + batch*tensor->nb[0] + i*tensor->nb[1] + j*tensor->nb[2] + k*tensor->nb[3]));
+                        if(k < tensor->ne[3] - 1 && k < MAX_ELEMENTS_LAYER - 1)
+                            printf(", ");
+                    }
+                    if(MAX_ELEMENTS_LAYER < tensor->ne[3]) printf(", ..");
+                    printf("] ");
+                }
+                printf("\n");
+            }
+            if(MAX_ELEMENTS_BATCH < tensor->ne[0]) printf(" ... additional batches\n");
+            printf("%s", sep);
+        }
+    }
 }
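Note that the sampler dereferences every element through the stride table nb[] rather than assuming a contiguous layout, which is how it can print a permuted view such as cache_v directly. The addressing pattern, pulled out into a standalone helper for clarity (a sketch for illustration, not a function this commit adds):
```
#include "ggml.h"

// byte offset of element (i, j, k) is i*nb[0] + j*nb[1] + k*nb[2];
// this works for permuted/transposed views because nb[] encodes the layout
static float tensor_get_f32_3d(const struct ggml_tensor * t, int i, int j, int k) {
    return *(float *)((char *) t->data + i*t->nb[0] + j*t->nb[1] + k*t->nb[2]);
}
```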
@@ -19614,11 +19634,13 @@ void ggml_tensor_printf(const struct ggml_tensor *tensor, char *prefix, int line
     // nb[i] = nb[i-1] * ne[i-1]
     */
     {
-        pos = 0;
-        for (int i = 0; i <= tensor->n_dims; i++) {
-            pos += snprintf(strides + pos, sizeof(strides) - pos, "%" PRId64, tensor->nb[i]);
+        strides[0] = '\0';
+        for (int i = 0; i < tensor->n_dims; i++) {
+            char dim_str[20];
+            snprintf(dim_str, sizeof(dim_str), "%" PRId64, tensor->nb[i]);
+            strncat(strides, dim_str, sizeof(strides) - strlen(strides) - 1);
             if (i != tensor->n_dims - 1) {
-                pos += snprintf(strides + pos, sizeof(strides) - pos, "x");
+                strncat(strides, "x", sizeof(strides) - strlen(strides) - 1);
             }
         }
     }
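Two things change here: the loop bound drops from i <= tensor->n_dims to i < tensor->n_dims, so no stride is printed for a dimension the tensor does not have, and the pos-based snprintf bookkeeping is replaced with strncat calls bounded by the space left in the buffer. A self-contained check of the new construction (the buffer size and stride values are illustrative):
```
#include <stdio.h>
#include <string.h>
#include <inttypes.h>

int main(void) {
    int64_t nb[3] = {4, 16, 1024}; // strides as shown in the example table
    int n_dims = 3;
    char strides[64];

    strides[0] = '\0';
    for (int i = 0; i < n_dims; i++) {
        char dim_str[20];
        snprintf(dim_str, sizeof(dim_str), "%" PRId64, nb[i]);
        strncat(strides, dim_str, sizeof(strides) - strlen(strides) - 1);
        if (i != n_dims - 1) {
            strncat(strides, "x", sizeof(strides) - strlen(strides) - 1);
        }
    }
    printf("%s\n", strides); // prints 4x16x1024
    return 0;
}
```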
6 changes: 5 additions & 1 deletion ggml.h
@@ -397,7 +397,9 @@ extern "C" {
         float f_custom[4];
         int i_custom[4];
 
-        // uint8_t padding;
+        uint8_t debug_flag;
+
+        char padding[15];
     } tensor_meta;
     static const tensor_meta GGML_DEFAULT_TENSOR_META = {
         /*.layer_id =*/ -1,
@@ -410,6 +412,8 @@ extern "C" {
         /*.f_custom =*/ {0.0f, 0.0f, 0.0f, 0.0f},
        /*.i_custom =*/ {0, 0, 0, 0},
 
+        /*.debug_flag =*/ 0,
+
 
         // /*.padding =*/ 0,
     };
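The struct gains a debug_flag byte, with the explicit char padding[15] presumably keeping the tail of tensor_meta at a round 16 bytes. Nothing in this diff shows the flag being consumed; a hypothetical gating pattern (the meta field name and the call site are assumptions, not code from this commit):
```
// hypothetical: sample-print a tensor only when its per-tensor debug flag is set
if (tensor->meta.debug_flag) {
    ggml_printTensorSample("dst", tensor);
}
```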
