Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Performance improvements on Arm for legacy and k-quants #453

Merged
merged 2 commits into from
May 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion llama.cpp/ggml-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,18 @@ typedef struct {
} block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");

// [kawrakow] Need these two for performance on Arm
// Interleaved group of four q8_1 blocks, filled by quantize_row_q8_1 on Arm NEON.
// Layout: d[0..3] are the four per-block deltas; d[4..7] are the four per-block
// quant sums (the "s" field of block_q8_1). qs holds the quants of the four
// blocks back to back, QK8_1 bytes per block.
typedef struct {
ggml_half d[8];
int8_t qs[4*QK8_1];
} block_q8_1_x4;
static_assert(sizeof(block_q8_1_x4) == 4*sizeof(block_q8_1), "wrong q8_1_x4 block size/padding");
// [kawrakow] Interleaved group of four q8_0 blocks, filled by quantize_row_q8_0 on
// Arm NEON for better memory-access patterns. d[0..3] are the four per-block
// deltas; qs holds the quants of the four blocks back to back, QK8_0 bytes each.
typedef struct {
ggml_half d[4];
int8_t qs[4*QK8_0];
} block_q8_0_x4;
static_assert(sizeof(block_q8_0_x4) == 4*sizeof(block_q8_0), "wrong q8_0_x4 block size/padding");

//
// Super-block quantization structures
//
Expand Down Expand Up @@ -313,10 +325,11 @@ typedef struct {
static_assert(sizeof(block_q6_K) == sizeof(ggml_half) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");

// This is only used for intermediate quantization and dot products
// [kawrakow] Note: I have switched the order of bsums and qs. This results in some performance gain on Arm
// (The diff view had left the pre-change `qs` member line in place, declaring `qs`
// twice; only the post-change order — d, bsums, qs — is kept here.)
typedef struct {
float d; // delta (scale used to dequantize qs)
int16_t bsums[QK_K/16]; // sum of quants in groups of 16
int8_t qs[QK_K]; // quants
} block_q8_K;
static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");

Expand Down
61 changes: 50 additions & 11 deletions llama.cpp/ggml-quants.inc
Original file line number Diff line number Diff line change
Expand Up @@ -873,7 +873,11 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
block_q8_0 * restrict y = vy;

#if defined(__ARM_NEON)
// [kawrakow] When running on Arm, we change how the data is layed out for performance reasons
block_q8_0_x4 * y4 = (block_q8_0_x4 *)vy;
int nb4 = 4*(nb/4);
for (int i = 0; i < nb; i++) {
int i4 = i/4, ir = i%4;
float32x4_t srcv [8];
float32x4_t asrcv[8];
float32x4_t amaxv[8];
Expand All @@ -890,16 +894,29 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;

y[i].d = GGML_FP32_TO_FP16(d);
// [kawrakow] When running on Arm, we change how the data is layed out for performance reasons
if (i < nb4) {
y4[i4].d[ir] = GGML_FP32_TO_FP16(d);
} else {
y[i].d = GGML_FP32_TO_FP16(d);
}

for (int j = 0; j < 8; j++) {
const float32x4_t v = vmulq_n_f32(srcv[j], id);
const int32x4_t vi = vcvtnq_s32_f32(v);

y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
// [kawrakow] When running on Arm, we change how the data is layed out for performance reasons
if (i < nb4) {
y4[i4].qs[32*ir + 4*j + 0] = vgetq_lane_s32(vi, 0);
y4[i4].qs[32*ir + 4*j + 1] = vgetq_lane_s32(vi, 1);
y4[i4].qs[32*ir + 4*j + 2] = vgetq_lane_s32(vi, 2);
y4[i4].qs[32*ir + 4*j + 3] = vgetq_lane_s32(vi, 3);
} else {
y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
}
}
}
#elif defined(__wasm_simd128__)
Expand Down Expand Up @@ -1192,7 +1209,11 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
block_q8_1 * restrict y = vy;

#if defined(__ARM_NEON)
// [kawrakow] When running on Arm, we change how the data is layed out for performance reasons
block_q8_1_x4 * restrict y4 = vy;
int nb4 = 4*(nb/4);
for (int i = 0; i < nb; i++) {
int i4 = i/4, ir = i%4;
float32x4_t srcv [8];
float32x4_t asrcv[8];
float32x4_t amaxv[8];
Expand All @@ -1209,23 +1230,41 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;

y[i].d = GGML_FP32_TO_FP16(d);
// [kawrakow] When running on Arm, we change how the data is layed out for performance reasons
if (i < nb4) {
y4[i4].d[ir] = GGML_FP32_TO_FP16(d);
} else {
y[i].d = GGML_FP32_TO_FP16(d);
}

int32x4_t accv = vdupq_n_s32(0);

for (int j = 0; j < 8; j++) {
const float32x4_t v = vmulq_n_f32(srcv[j], id);
const int32x4_t vi = vcvtnq_s32_f32(v);

y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
// [kawrakow] When running on Arm, we change how the data is layed out for performance reasons
if (i < nb4) {
y4[i4].qs[QK8_1*ir + 4*j + 0] = vgetq_lane_s32(vi, 0);
y4[i4].qs[QK8_1*ir + 4*j + 1] = vgetq_lane_s32(vi, 1);
y4[i4].qs[QK8_1*ir + 4*j + 2] = vgetq_lane_s32(vi, 2);
y4[i4].qs[QK8_1*ir + 4*j + 3] = vgetq_lane_s32(vi, 3);
} else {
y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
}

accv = vaddq_s32(accv, vi);
}

y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
// [kawrakow] When running on Arm, we change how the data is layed out for performance reasons
if (i < nb4) {
y4[i4].d[ir+4] = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
} else {
y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
}
}
#elif defined(__wasm_simd128__)
for (int i = 0; i < nb; i++) {
Expand Down
7 changes: 6 additions & 1 deletion llama.cpp/quantize/quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,12 @@ static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix
static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count";

static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
std::string ftype_str;
std::string ftype_str; ftype_str.reserve(ftype_str_in.size());

bool is_number = true;
for (auto ch : ftype_str_in) {
ftype_str.push_back(std::toupper(ch));
if (!std::isdigit(ftype_str.back())) is_number = false;
}
for (auto & it : QUANT_OPTIONS) {
if (it.name == ftype_str) {
Expand All @@ -77,6 +79,9 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
return true;
}
}
// On my system (OS Ventura 13.2.1) calling std::stoi with invalid input leads to a crash (Segmentation fault 11)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can fix that after this change goes in.

// Hence the check above and the early return
if (!is_number) return false;
try {
int ftype_int = std::stoi(ftype_str);
for (auto & it : QUANT_OPTIONS) {
Expand Down
1 change: 1 addition & 0 deletions llamafile/BUILD.mk
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ o/$(MODE)/llamafile: \
o/$(MODE)/llamafile/sgemm.o: private CXXFLAGS += -Os
o/$(MODE)/llamafile/iqk_mul_mat_amd_avx2.o: private TARGET_ARCH += -Xx86_64-mtune=skylake -Xx86_64-mavx2 -Xx86_64-mfma -Xx86_64-mf16c
o/$(MODE)/llamafile/iqk_mul_mat_amd_zen4.o: private TARGET_ARCH += -Xx86_64-mtune=skylake -Xx86_64-mavx2 -Xx86_64-mfma -Xx86_64-mf16c -Xx86_64-mavx512f -Xx86_64-mavx512vl -Xx86_64-mavx512vnni -Xx86_64-mavx512bw -Xx86_64-mavx512dq
o/$(MODE)/llamafile/iqk_mul_mat_arm82.o: private TARGET_ARCH += -Xaarch64-march=armv8.2-a+dotprod+fp16
o/$(MODE)/llamafile/tinyblas_cpu_sgemm_amd_avx.o: private TARGET_ARCH += -Xx86_64-mtune=sandybridge -Xx86_64-mf16c
o/$(MODE)/llamafile/tinyblas_cpu_mixmul_amd_avx.o: private TARGET_ARCH += -Xx86_64-mtune=sandybridge -Xx86_64-mf16c
o/$(MODE)/llamafile/tinyblas_cpu_sgemm_amd_fma.o: private TARGET_ARCH += -Xx86_64-mtune=bdver2 -Xx86_64-mf16c -Xx86_64-mfma
Expand Down
Loading