Speedup of up to 300% on large context (ggerganov#58)
The KV cache is now cyclically split into a permuted V variant.
The ggml_tensor_print function has been completely reworked to properly output 1-4 dimensional tensors along with their data.
Example:
```
+======================+======================+======================+======================+
| :0
| V                                [f32 type]
+----------------------+----------------------+----------------------+----------------------+
| Dimensions           | Strides              | Layer id             | Backend              |
| 3                    | 4x16x1024            | 0                    | CPU                  |
+----------------------+----------------------+----------------------+----------------------+
| Elements             | Src0                 | Src1                 | Operation            |
| 4 x 64 x 2           | 4 x 64 x 2           | N/A                  | CONT                 |
+----------------------+----------------------+----------------------+----------------------+
| Transposed:      No  | Permuted:        No  | Contiguous:      Yes | Size:        0.00 MB |
| Src0 name:           | cache_v (view) (permuted)                                          |
+----------------------+----------------------+----------------------+----------------------+

+-------------------------------------------------------------------------------------------+
| Content of src0 "cache_v (view) (permuted)" (3 dim)

+-------------------------------------------------------------------------------------------+
| Content of src0 "cache_v (view) (permuted)" (3 dim)
| Total Elements : [ Row:4   Col:64  Layer:2   ]
+-------------------------------------------------------------------------------------------+
| Row 1: [0.302  , 0.010  ] [-0.238 , 0.680  ] [0.305  , 0.206  ] [-0.013 , 0.436  ] [-0.074 , -0.698 ] [-0.153 , -0.067 ]
| Row 2: [0.091  , 0.199  ] [0.253  , 0.151  ] [-0.557 , 0.089  ] [0.298  , -0.272 ] [-0.149 , 0.232  ] [-0.217 , 0.193  ]
| Row 3: [-0.085 , -0.014 ] [0.225  , 0.089  ] [-0.338 , 0.072  ] [0.416  , -0.186 ] [-0.071 , 0.110  ] [0.467  , 0.497  ]
| Row 4: [-0.336 , 0.471  ] [-0.144 , 0.070  ] [-0.062 , 0.520  ] [0.093  , 0.217  ] [-0.332 , -0.205 ] [0.012  , 0.335  ]
+-------------------------------------------------------------------------------------------+
+-------------------------------------------------------------------------------------------+
| Content of dst "V" (3 dim)

+-------------------------------------------------------------------------------------------+
| Content of dst "V" (3 dim)
| Total Elements : [ Row:4   Col:64  Layer:2   ]
+-------------------------------------------------------------------------------------------+
| Row 1: [0.302  , 0.010  ] [-0.238 , 0.680  ] [0.305  , 0.206  ] [-0.013 , 0.436  ] [-0.074 , -0.698 ] [-0.153 , -0.067 ]
| Row 2: [0.091  , 0.199  ] [0.253  , 0.151  ] [-0.557 , 0.089  ] [0.298  , -0.272 ] [-0.149 , 0.232  ] [-0.217 , 0.193  ]
| Row 3: [-0.085 , -0.014 ] [0.225  , 0.089  ] [-0.338 , 0.072  ] [0.416  , -0.186 ] [-0.071 , 0.110  ] [0.467  , 0.497  ]
| Row 4: [-0.336 , 0.471  ] [-0.144 , 0.070  ] [-0.062 , 0.520  ] [0.093  , 0.217  ] [-0.332 , -0.205 ] [0.012  , 0.335  ]
+-------------------------------------------------------------------------------------------+
+======================+======================+======================+======================+
```
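For reference, a minimal sketch of driving the reworked sampler directly. The tensor setup below is illustrative and not part of this commit; it assumes ggml_printTensorSample is visible to the caller, for example through a local prototype:
```
#include <stdio.h>
#include "ggml.h"

// prototype for the reworked sampler in ggml.c (assumed visible to the caller)
void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tensor);

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // small 2-dim f32 tensor; the values are dummies for display only
    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4);
    ggml_set_name(t, "demo");
    float * data = (float *) t->data;
    for (int i = 0; i < 8*4; i++) data[i] = i / 10.0f;

    ggml_printTensorSample("demo", t);

    ggml_free(ctx);
    return 0;
}
```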
cmp-nct authored Jul 13, 2023
1 parent 09f2184 commit 2cae279
Showing 4 changed files with 166 additions and 64 deletions.
2 changes: 1 addition & 1 deletion examples/falcon/falcon_main.cpp
@@ -659,7 +659,7 @@ fprintf(stderr, "+------------+-------+-------+-------+-------+---------------+-
             {
                 n_regen = 4;
             }
-            if (n_regen > all_generation.size()-embd.size()) n_regen = (int)all_generation.size()-embd.size();
+            if (n_regen > all_generation.size()-embd.size()) n_regen = (int)all_generation.size()-(int)embd.size();
 
             // add right sided part of all_generation storage if we still have room remaining
             if (n_regen)
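The added (int) cast matters because all_generation.size() and embd.size() are unsigned size_t values: subtracting them in unsigned arithmetic wraps around to a huge positive number whenever embd.size() is the larger of the two. A standalone illustration of the failure mode (not code from the repository):
```
#include <stdio.h>
#include <stddef.h>

int main(void) {
    size_t a = 3, b = 5;
    printf("%zu\n", a - b);          // unsigned wraparound: a huge positive value
    printf("%d\n", (int)a - (int)b); // -2, the intended signed result
    return 0;
}
```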
114 changes: 68 additions & 46 deletions ggml.c
@@ -19540,60 +19540,80 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
 
 ////////////////////////////////////////////////////////////////////////////////
 
-void ggml_printTensorSample(char *prefix,const struct ggml_tensor * tensor) {
+void ggml_printTensorSample(const char *prefix, const struct ggml_tensor * tensor) {
     const char *sep = "+-------------------------------------------------------------------------------------------+\n";
-    printf("%s", sep);
-    printf("| Content of %s \"%s\" (%d dim)",prefix,tensor->name,tensor->n_dims);
-    printf("\n");
-    const int max_elements = 40000;
+    printf("%s| Content of %s \"%s\" (%d dim)\n", sep, prefix, tensor->name, tensor->n_dims);
 
-    if (tensor->n_dims == 1) {
-        printf("| ");
-        for(int i = 0; i < tensor->ne[0] && i < max_elements; i++){
-            printf("%-20f ", (double) *(float *)((char *) tensor->data + i*tensor->nb[0]));
-        }
-        printf("|");
-        printf("\n");
-        printf("%s", sep);
-    }
+    const int MAX_ELEMENTS_ROW = 10;
+    const int MAX_ELEMENTS_COL = 6;
+    const int MAX_ELEMENTS_LAYER = 3; // layered
+    const int MAX_ELEMENTS_BATCH = 2; // repeated display
+    const char *dimensionLabels[] = {"Row", "Col", "Layer", "Batch"};
+
+    printf("\n%s| Content of %s \"%s\" (%d dim)\n", sep, prefix, tensor->name, tensor->n_dims);
+    printf("| Total Elements : [ ");
+    for (int i = 0; i < tensor->n_dims; i++)
+        printf("%s:%-3" PRId64 " ", dimensionLabels[i], tensor->ne[i]);
+    printf("]\n%s", sep);
+
+    if (tensor->n_dims == 1) {
+        printf("| 1: ");
+        for(int i = 0; i < tensor->ne[0] && i < MAX_ELEMENTS_ROW; i++){
+            printf("%-7.3f, ", *(float *)((char *) tensor->data + i*tensor->nb[0]));
+        }
+        if(MAX_ELEMENTS_ROW < tensor->ne[0]) printf(", ..");
+        printf("\n%s", sep);
+    }
     else if (tensor->n_dims == 2) {
-        for(int i = 0; i < tensor->ne[0] && i < max_elements; i++){
-            printf("| ");
-            for(int j = 0; j < tensor->ne[1] && j < max_elements; j++){
-                printf("%-20f ", (double) *(float *)((char *) tensor->data + i*tensor->nb[0] + j*tensor->nb[1]));
+        for(int i = 0; i < tensor->ne[0] && i < MAX_ELEMENTS_ROW; i++){
+            printf("| %d: ", i+1);
+            for(int j = 0; j < tensor->ne[1] && j < MAX_ELEMENTS_COL; j++){
+                printf("%-7.3f ", *(float *)((char *) tensor->data + i*tensor->nb[0] + j*tensor->nb[1]));
+                if(j == MAX_ELEMENTS_COL - 1 && tensor->ne[1] > MAX_ELEMENTS_COL) printf(", ..");
             }
-            printf("|");
             printf("\n");
         }
-        printf("%s", sep);
-    }
-    else if(tensor->n_dims == 3) {
-        for(int i = 0; i < tensor->ne[0] && i < 3; i++){
-            printf("Layer %d\n", i);
-            for(int j = 0; j < tensor->ne[1] && j < max_elements; j++){
-                printf("| ");
-                for(int k = 0; k < tensor->ne[2] && k < max_elements; k++){
-                    printf("%-20f ", (double) *(float *)((char *) tensor->data + i*tensor->nb[0] + j*tensor->nb[1] + k*tensor->nb[2]));
-                }
-                printf("|\n");
-            }
-            printf("%s\n", sep);
-        }
-    }
-    else if(tensor->n_dims == 4){
-        for(int i = 0; i < tensor->ne[0] && i < 3; i++){
-            printf("Batch %d\n", i);
-            for(int j = 0; j < tensor->ne[1] && j < 3; j++){
-                printf("Layer %d\n", j);
-                for(int k = 0; k < tensor->ne[2] && k < max_elements; k++){
-                    printf("| ");
-                    for(int l = 0; l < tensor->ne[3] && l < 3; l++){
-                        printf("%-20f ", (double) *(float *)((char *) tensor->data + i*tensor->nb[0] + j*tensor->nb[1] + k*tensor->nb[2] + l*tensor->nb[3]));
-                    }
-                    printf("|\n");
-                }
-                printf("%s\n", sep);
-            }
-            printf("\n");
-        }
-    }
+        if(MAX_ELEMENTS_ROW < tensor->ne[0]) printf(" .. additional rows\n");
+        printf("%s", sep);
+    }else if(tensor->n_dims == 3) {
+        for(int i = 0; i < tensor->ne[0] && i < MAX_ELEMENTS_ROW; i++){
+            printf("| Row %d: ", i+1);
+            for(int j = 0; j < tensor->ne[1] && j < MAX_ELEMENTS_COL; j++){
+                printf("[");
+                for(int k = 0; k < tensor->ne[2] && k < MAX_ELEMENTS_LAYER; k++){
+                    printf("%-7.3f", *(float *)((char *) tensor->data + i*tensor->nb[0] + j*tensor->nb[1] + k*tensor->nb[2]));
+                    if(k < tensor->ne[2] - 1 && k < MAX_ELEMENTS_LAYER - 1)
+                        printf(", ");
+                }
+                if(MAX_ELEMENTS_LAYER < tensor->ne[2]) printf(", ..");
+                printf("] ");
+            }
+            printf("\n");
+        }
+        if(MAX_ELEMENTS_ROW < tensor->ne[0]) printf(" ... additional layers\n");
+        printf("%s", sep);
+    }
+
+    // For 4D tensor
+    else if(tensor->n_dims == 4) {
+        for(int batch = 0; batch < tensor->ne[0] && batch < MAX_ELEMENTS_BATCH; batch++){
+            printf("Batch %d\n", batch+1);
+            for(int i = 0; i < tensor->ne[1] && i < MAX_ELEMENTS_ROW; i++){
+                printf("| Row %d: ", i+1);
+                for(int j = 0; j < tensor->ne[2] && j < MAX_ELEMENTS_COL; j++){
+                    printf("[");
+                    for(int k = 0; k < tensor->ne[3] && k < MAX_ELEMENTS_LAYER; k++){
+                        printf("%-7.3f", *(float *)((char *) tensor->data + batch*tensor->nb[0] + i*tensor->nb[1] + j*tensor->nb[2] + k*tensor->nb[3]));
+                        if(k < tensor->ne[3] - 1 && k < MAX_ELEMENTS_LAYER - 1)
+                            printf(", ");
+                    }
+                    if(MAX_ELEMENTS_LAYER < tensor->ne[3]) printf(", ..");
+                    printf("] ");
+                }
+                printf("\n");
+            }
+            if(MAX_ELEMENTS_BATCH < tensor->ne[0]) printf(" ... additional batches\n");
+            printf("%s", sep);
+        }
+    }
 }
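Note that the sampler dereferences every element through the stride table nb[] rather than assuming a contiguous layout, which is how it can print a permuted view such as cache_v directly. The addressing pattern, pulled out into a standalone helper for clarity (a sketch for illustration, not a function this commit adds):
```
#include "ggml.h"

// byte offset of element (i, j, k) is i*nb[0] + j*nb[1] + k*nb[2];
// this works for permuted/transposed views because nb[] encodes the layout
static float tensor_get_f32_3d(const struct ggml_tensor * t, int i, int j, int k) {
    return *(float *)((char *) t->data + i*t->nb[0] + j*t->nb[1] + k*t->nb[2]);
}
```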
@@ -19614,11 +19634,13 @@ void ggml_tensor_printf(const struct ggml_tensor *tensor, char *prefix, int line
     // nb[i] = nb[i-1] * ne[i-1]
     */
     {
-        pos = 0;
-        for (int i = 0; i <= tensor->n_dims; i++) {
-            pos += snprintf(strides + pos, sizeof(strides) - pos, "%" PRId64, tensor->nb[i]);
+        strides[0] = '\0';
+        for (int i = 0; i < tensor->n_dims; i++) {
+            char dim_str[20];
+            snprintf(dim_str, sizeof(dim_str), "%" PRId64, tensor->nb[i]);
+            strncat(strides, dim_str, sizeof(strides) - strlen(strides) - 1);
             if (i != tensor->n_dims - 1) {
-                pos += snprintf(strides + pos, sizeof(strides) - pos, "x");
+                strncat(strides, "x", sizeof(strides) - strlen(strides) - 1);
             }
         }
     }
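Two things change here: the loop bound drops from i <= tensor->n_dims to i < tensor->n_dims, so no stride is printed for a dimension the tensor does not have, and the pos-based snprintf bookkeeping is replaced with strncat calls bounded by the space left in the buffer. A self-contained check of the new construction (the buffer size and stride values are illustrative):
```
#include <stdio.h>
#include <string.h>
#include <inttypes.h>

int main(void) {
    int64_t nb[3] = {4, 16, 1024}; // strides as shown in the example table
    int n_dims = 3;
    char strides[64];

    strides[0] = '\0';
    for (int i = 0; i < n_dims; i++) {
        char dim_str[20];
        snprintf(dim_str, sizeof(dim_str), "%" PRId64, nb[i]);
        strncat(strides, dim_str, sizeof(strides) - strlen(strides) - 1);
        if (i != n_dims - 1) {
            strncat(strides, "x", sizeof(strides) - strlen(strides) - 1);
        }
    }
    printf("%s\n", strides); // prints 4x16x1024
    return 0;
}
```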
6 changes: 5 additions & 1 deletion ggml.h
@@ -397,7 +397,9 @@ extern "C" {
         float f_custom[4];
         int i_custom[4];
 
-        // uint8_t padding;
+        uint8_t debug_flag;
+
+        char padding[15];
     } tensor_meta;
     static const tensor_meta GGML_DEFAULT_TENSOR_META = {
         /*.layer_id =*/ -1,
@@ -410,6 +412,8 @@ extern "C" {
         /*.f_custom =*/ {0.0f, 0.0f, 0.0f, 0.0f},
        /*.i_custom =*/ {0, 0, 0, 0},
 
+        /*.debug_flag =*/ 0,
+
 
         // /*.padding =*/ 0,
     };
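The struct gains a debug_flag byte, with the explicit char padding[15] presumably keeping the tail of tensor_meta at a round 16 bytes. Nothing in this diff shows the flag being consumed; a hypothetical gating pattern (the meta field name and the call site are assumptions, not code from this commit):
```
// hypothetical: sample-print a tensor only when its per-tensor debug flag is set
if (tensor->meta.debug_flag) {
    ggml_printTensorSample("dst", tensor);
}
```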
