From 40b0a477b855eb1e3c26a7be76f8a975fb9232ec Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Mar 2024 10:27:49 +0200 Subject: [PATCH] cuda : reuse ggml-common (cont) ggml-ci --- ggml-common.h | 182 ++++++++++++++++++++++++++++++-------------------- ggml-cuda.cu | 10 +-- ggml-quants.c | 83 +++++++++++------------ 3 files changed, 158 insertions(+), 117 deletions(-) diff --git a/ggml-common.h b/ggml-common.h index 7e574dd5f687c7..04beea7f8e2369 100644 --- a/ggml-common.h +++ b/ggml-common.h @@ -3,19 +3,29 @@ #if defined(GGML_COMMON_DECL_C) #include -typedef uint16_t ggml_fp16_t; +typedef uint16_t ggml_half; +typedef uint32_t ggml_half2; + +#define GGML_COMMON_AGGR #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_METAL) #include -typedef half ggml_fp16_t; +typedef half ggml_half; +typedef half2 ggml_half2; + +#define GGML_COMMON_AGGR #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_CUDA) +#include #include -typedef half ggml_fp16_t; +typedef half ggml_half; +typedef half2 ggml_half2; + +#define GGML_COMMON_AGGR data #define GGML_COMMON_DECL #endif @@ -40,60 +50,75 @@ typedef half ggml_fp16_t; #define QI4_0 (QK4_0 / (4 * QR4_0)) #define QR4_0 2 typedef struct { - ggml_fp16_t d; // delta - uint8_t qs[QK4_0 / 2]; // nibbles / quants + ggml_half d; // delta + uint8_t qs[QK4_0 / 2]; // nibbles / quants } block_q4_0; -static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding"); +static_assert(sizeof(block_q4_0) == sizeof(ggml_half) + QK4_0 / 2, "wrong q4_0 block size/padding"); #define QK4_1 32 #define QI4_1 (QK4_1 / (4 * QR4_1)) #define QR4_1 2 typedef struct { - ggml_fp16_t d; // delta - ggml_fp16_t m; // min - uint8_t qs[QK4_1 / 2]; // nibbles / quants + union { + struct { + ggml_half d; // delta + ggml_half m; // min + } GGML_COMMON_AGGR; + ggml_half2 dm; + }; + uint8_t qs[QK4_1 / 2]; // nibbles / quants } block_q4_1; -static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding"); +static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding"); #define QK5_0 32 #define QI5_0 (QK5_0 / (4 * QR5_0)) #define QR5_0 2 typedef struct { - ggml_fp16_t d; // delta + ggml_half d; // delta uint8_t qh[4]; // 5-th bit of quants uint8_t qs[QK5_0 / 2]; // nibbles / quants } block_q5_0; -static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding"); +static_assert(sizeof(block_q5_0) == sizeof(ggml_half) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding"); #define QK5_1 32 #define QI5_1 (QK5_1 / (4 * QR5_1)) #define QR5_1 2 typedef struct { - ggml_fp16_t d; // delta - ggml_fp16_t m; // min + union { + struct { + ggml_half d; // delta + ggml_half m; // min + } GGML_COMMON_AGGR; + ggml_half2 dm; + }; uint8_t qh[4]; // 5-th bit of quants uint8_t qs[QK5_1 / 2]; // nibbles / quants } block_q5_1; -static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); +static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_half) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); #define QK8_0 32 #define QI8_0 (QK8_0 / (4 * QR8_0)) #define QR8_0 1 typedef struct { - ggml_fp16_t d; // delta - int8_t qs[QK8_0]; // quants + ggml_half d; // delta + int8_t qs[QK8_0]; // quants } block_q8_0; -static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding"); +static_assert(sizeof(block_q8_0) == sizeof(ggml_half) + QK8_0, "wrong q8_0 block size/padding"); #define QK8_1 32 #define QI8_1 (QK8_1 / (4 * QR8_1)) #define QR8_1 1 typedef struct { - float d; // delta - float s; // d * sum(qs[i]) - int8_t qs[QK8_1]; // quants + union { + struct { + ggml_half xxxd; // delta + ggml_half xxxs; // d * sum(qs[i]) + } GGML_COMMON_AGGR; + ggml_half2 ds; + }; + int8_t qs[QK8_1]; // quants } block_q8_1; -static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding"); +static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding"); // // Super-block quantization structures @@ -117,10 +142,15 @@ static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block s typedef struct { uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits uint8_t qs[QK_K/4]; // quants - ggml_fp16_t d; // super-block scale for quantized scales - ggml_fp16_t dmin; // super-block scale for quantized mins + union { + struct { + ggml_half d; // super-block scale for quantized scales + ggml_half dmin; // super-block scale for quantized mins + } GGML_COMMON_AGGR; + ggml_half2 dm; + }; } block_q2_K; -static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); +static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); // 3-bit quantization // weight is represented as x = a * q @@ -130,20 +160,20 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w #define QR3_K 4 #ifdef GGML_QKK_64 typedef struct { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits uint8_t scales[2]; - ggml_fp16_t d; // super-block scale + ggml_half d; // super-block scale } block_q3_K; -static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding"); +static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding"); #else typedef struct { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits - uint8_t scales[12]; // scales, quantized with 6 bits - ggml_fp16_t d; // super-block scale + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits + uint8_t scales[12]; // scales, quantized with 6 bits + ggml_half d; // super-block scale } block_q3_K; -static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding"); +static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding"); #endif // 4-bit quantization @@ -154,19 +184,24 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + #define QR4_K 2 #ifdef GGML_QKK_64 typedef struct { - ggml_fp16_t d[2]; // super-block scales/mins - uint8_t scales[2]; // 4-bit block scales/mins - uint8_t qs[QK_K/2]; // 4--bit quants + ggml_half d[2]; // super-block scales/mins + uint8_t scales[2]; // 4-bit block scales/mins + uint8_t qs[QK_K/2]; // 4--bit quants } block_q4_K; -static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding"); +static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding"); #else typedef struct { - ggml_fp16_t d; // super-block scale for quantized scales - ggml_fp16_t dmin; // super-block scale for quantized mins + union { + struct { + ggml_half d; // super-block scale for quantized scales + ggml_half dmin; // super-block scale for quantized mins + } GGML_COMMON_AGGR; + ggml_half2 dm; + }; uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants + uint8_t qs[QK_K/2]; // 4--bit quants } block_q4_K; -static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding"); +static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding"); #endif // 5-bit quantization @@ -177,21 +212,26 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/ #define QR5_K 2 #ifdef GGML_QKK_64 typedef struct { - ggml_fp16_t d; // super-block scale - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits + ggml_half d; // super-block scale + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits } block_q5_K; -static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding"); +static_assert(sizeof(block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding"); #else typedef struct { - ggml_fp16_t d; // super-block scale for quantized scales - ggml_fp16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits + union { + struct { + ggml_half d; // super-block scale for quantized scales + ggml_half dmin; // super-block scale for quantized mins + } GGML_COMMON_AGGR; + ggml_half2 dm; + }; + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits } block_q5_K; -static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); +static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding"); #endif // 6-bit quantization @@ -204,9 +244,9 @@ typedef struct { uint8_t ql[QK_K/2]; // quants, lower 4 bits uint8_t qh[QK_K/4]; // quants, upper 2 bits int8_t scales[QK_K/16]; // scales, quantized with 8 bits - ggml_fp16_t d; // super-block scale + ggml_half d; // super-block scale } block_q6_K; -static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding"); +static_assert(sizeof(block_q6_K) == sizeof(ggml_half) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding"); // This is only used for intermediate quantization and dot products typedef struct { @@ -222,31 +262,31 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_ #define QI2_XXS (QK_K / (4*QR2_XXS)) #define QR2_XXS 8 typedef struct { - ggml_fp16_t d; + ggml_half d; uint16_t qs[QK_K/8]; } block_iq2_xxs; -static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding"); +static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_half) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding"); // 2.3125 bpw quants #define QI2_XS (QK_K / (4*QR2_XS)) #define QR2_XS 8 typedef struct { - ggml_fp16_t d; + ggml_half d; uint16_t qs[QK_K/8]; uint8_t scales[QK_K/32]; } block_iq2_xs; -static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding"); +static_assert(sizeof(block_iq2_xs) == sizeof(ggml_half) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding"); // 2.5625 bpw quants #define QI2_S (QK_K / (4*QR2_S)) #define QR2_S 8 typedef struct { - ggml_fp16_t d; + ggml_half d; uint8_t qs[QK_K/4]; uint8_t qh[QK_K/32]; uint8_t scales[QK_K/32]; } block_iq2_s; -static_assert(sizeof(block_iq2_s) == sizeof(ggml_fp16_t) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding"); +static_assert(sizeof(block_iq2_s) == sizeof(ggml_half) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding"); // (Almost) "true" 3-bit quantization. // Due to the need to use blocks as per ggml design, it ends up using @@ -254,10 +294,10 @@ static_assert(sizeof(block_iq2_s) == sizeof(ggml_fp16_t) + QK_K/4 + QK_K/16, "wr #define QI3_XXS (QK_K / (4*QR3_XXS)) #define QR3_XXS 8 typedef struct { - ggml_fp16_t d; + ggml_half d; uint8_t qs[3*QK_K/8]; } block_iq3_xxs; -static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding"); +static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding"); // 3.4375 bpw #if QK_K == 64 @@ -268,32 +308,32 @@ static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong #define QI3_XS (QK_K / (4*QR3_XS)) #define QR3_XS 8 typedef struct { - ggml_fp16_t d; + ggml_half d; uint8_t qs[QK_K/4]; uint8_t qh[QK_K/32]; uint8_t signs[QK_K/8]; uint8_t scales[IQ3S_N_SCALE]; } block_iq3_s; -static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding"); +static_assert(sizeof(block_iq3_s) == sizeof(ggml_half) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding"); #define QI1_S (QK_K / (4*QR1_S)) #define QR1_S 8 typedef struct { - ggml_fp16_t d; + ggml_half d; uint8_t qs[QK_K/8]; uint8_t scales[QK_K/16]; } block_iq1_s; -static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding"); +static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding"); // Non-linear quants #define QK4_NL 32 #define QI4_NL (QK4_NL / (4*QR4_NL)) #define QR4_NL 2 typedef struct { - ggml_fp16_t d; + ggml_half d; uint8_t qs[QK4_NL/2]; } block_iq4_nl; -static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding"); +static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding"); #if QK_K == 64 #define block_iq4_xs block_iq4_nl @@ -304,12 +344,12 @@ static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4 #define QI4_XS (QK_K / (4*QR4_XS)) #define QR4_XS 8 typedef struct { - ggml_fp16_t d; + ggml_half d; uint16_t scales_h; uint8_t scales_l[QK_K/64]; uint8_t qs[QK_K/2]; } block_iq4_xs; -static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding"); +static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding"); #endif #endif // GGML_COMMON_DECL diff --git a/ggml-cuda.cu b/ggml-cuda.cu index d6c1f3db3a7fb1..f075c0539fc633 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -3339,7 +3339,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1( #pragma unroll for (int i = 0; i < QR2_K; ++ i) { u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); - d8[i] = __low2half(bq8_1[bq8_offset + i].ds); + d8[i] = __low2float(bq8_1[bq8_offset + i].ds); } return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8); @@ -3461,7 +3461,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1( #pragma unroll for (int i = 0; i < QR3_K; ++i) { u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); - d8[i] = __low2half(bq8_1[bq8_offset + i].ds); + d8[i] = __low2float(bq8_1[bq8_offset + i].ds); } return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8); @@ -3630,7 +3630,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( for (int i = 0; i < QR4_K; ++i) { const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; - d8[i] = __low2half(bq8i->ds); + d8[i] = __low2float(bq8i->ds); const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); u[2*i+0] = q8[0]; @@ -3995,7 +3995,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1( #pragma unroll for (int i = 0; i < QR6_K; ++i) { u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1); - d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds); + d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds); } return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8); @@ -4540,7 +4540,7 @@ static __device__ __forceinline__ void mul_mat_q( *dsi_dst = *dsi_src; } else { float * dfi_dst = (float *) dsi_dst; - *dfi_dst = __low2half(*dsi_src); + *dfi_dst = __low2float(*dsi_src); } } diff --git a/ggml-quants.c b/ggml-quants.c index 813a22b8c91c72..1b141cc44c5fb8 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -951,7 +951,7 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict const float d = amax / ((1 << 7) - 1); const float id = d ? 1.0f/d : 0.0f; - y[i].d = d; + y[i].xxxd = GGML_FP32_TO_FP16(d); int sum = 0; @@ -966,7 +966,7 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict sum += y[i].qs[QK8_1/2 + j]; } - y[i].s = sum*d; + y[i].xxxs = GGML_FP32_TO_FP16(sum*d); } } @@ -994,7 +994,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { const float d = amax / ((1 << 7) - 1); const float id = d ? 1.0f/d : 0.0f; - y[i].d = d; + y[i].xxxd = GGML_FP32_TO_FP16(d); int32x4_t accv = vdupq_n_s32(0); @@ -1010,7 +1010,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { accv = vaddq_s32(accv, vi); } - y[i].s = d * vaddvq_s32(accv); + y[i].xxxs = GGML_FP32_TO_FP16(d * vaddvq_s32(accv)); } #elif defined(__wasm_simd128__) for (int i = 0; i < nb; i++) { @@ -1033,7 +1033,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { const float d = amax / ((1 << 7) - 1); const float id = d ? 1.0f/d : 0.0f; - y[i].d = d; + y[i].xxxd = GGML_FP32_TO_FP16(d); v128_t accv = wasm_i32x4_splat(0); @@ -1049,10 +1049,11 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { accv = wasm_i32x4_add(accv, vi); } - y[i].s = d * (wasm_i32x4_extract_lane(accv, 0) + - wasm_i32x4_extract_lane(accv, 1) + - wasm_i32x4_extract_lane(accv, 2) + - wasm_i32x4_extract_lane(accv, 3)); + y[i].xxxs = GGML_FP32_TO_FP16( + d * (wasm_i32x4_extract_lane(accv, 0) + + wasm_i32x4_extract_lane(accv, 1) + + wasm_i32x4_extract_lane(accv, 2) + + wasm_i32x4_extract_lane(accv, 3))); } #elif defined(__AVX2__) || defined(__AVX__) for (int i = 0; i < nb; i++) { @@ -1077,7 +1078,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { // Quantize these floats const float d = maxScalar / 127.f; - y[i].d = d; + y[i].xxxd = GGML_FP32_TO_FP16(d); const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; const __m256 mul = _mm256_set1_ps( id ); @@ -1101,7 +1102,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { #if defined(__AVX2__) // Compute the sum of the quants and set y[i].s - y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))); + y[i].xxxs = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)))); // Convert int32 to int16 i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 @@ -1131,7 +1132,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { // Compute the sum of the quants and set y[i].s const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3)); const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7)); - y[i].s = d * hsum_i32_4(_mm_add_epi32(s0, s1)); + y[i].xxxs = GGML_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1))); // Convert int32 to int16 ni0 = _mm_packs_epi32( ni0, ni1 ); @@ -1162,7 +1163,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { const float d = amax / ((1 << 7) - 1); const float id = d ? 1.0f/d : 0.0f; - y[i].d = d; + y[i].xxxd = GGML_FP32_TO_FP16(d); vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl); @@ -1179,7 +1180,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { // set y[i].s int sum = __riscv_vmv_x_s_i16m1_i16(vwrs); - y[i].s = sum*d; + y[i].xxxs = GGML_FP32_TO_FP16(sum*d); } #else GGML_UNUSED(nb); @@ -4082,10 +4083,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r const block_q8_1 * restrict b_y0 = &vy0[i]; const block_q8_1 * restrict b_y1 = &vy1[i]; - float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * b_y0->s, - GGML_FP16_TO_FP32(b_x1->m) * b_y0->s, - GGML_FP16_TO_FP32(b_x0->m) * b_y1->s, - GGML_FP16_TO_FP32(b_x1->m) * b_y1->s}; + float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->xxxs), + GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->xxxs), + GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->xxxs), + GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->xxxs)}; summs0 += summs_t; const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -4106,10 +4107,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); // mmla into int32x4_t - float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)}; + float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->xxxd), + GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->xxxd), + GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->xxxd), + GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->xxxd)}; int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); @@ -4150,7 +4151,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r const block_q8_1 * restrict y0 = &y[i + 0]; const block_q8_1 * restrict y1 = &y[i + 1]; - summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s; + summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->xxxs) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->xxxs); const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -4173,8 +4174,8 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d); + sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->xxxd)); + sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->xxxd)); } *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs; @@ -4187,9 +4188,9 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r // Main loop for (int i = 0; i < nb; ++i) { const float d0 = GGML_FP16_TO_FP32(x[i].d); - const float d1 = y[i].d; + const float d1 = GGML_FP16_TO_FP32(y[i].xxxd); - summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; + summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].xxxs); const __m256 d0v = _mm256_set1_ps( d0 ); const __m256 d1v = _mm256_set1_ps( d1 ); @@ -4241,7 +4242,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); - sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].xxxd))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].xxxs); } *s = sumf; @@ -4259,7 +4260,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); } - sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML+GGML_FP16_TO_FP32(y[i].xxxd))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].xxxs); } *s = sumf; @@ -4595,8 +4596,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r const uint8x16_t m4b = vdupq_n_u8(0x0F); - summs0 += GGML_FP16_TO_FP32(x0->m) * y0->s; - summs1 += GGML_FP16_TO_FP32(x1->m) * y1->s; + summs0 += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->xxxs); + summs1 += GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->xxxs); // extract the 5th bit via lookup table ((b) << 4) memcpy(&qh0, x0->qh, sizeof(qh0)); @@ -4640,10 +4641,10 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l), - ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d); + ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->xxxd)); sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l), - ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d); + ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->xxxd)); } *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1; @@ -4660,7 +4661,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r const block_q5_1 * restrict x0 = &x[i]; const block_q8_1 * restrict y0 = &y[i]; - summs += GGML_FP16_TO_FP32(x0->m) * y0->s; + summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->xxxs); const v128_t m4b = wasm_i8x16_splat(0x0F); @@ -4707,7 +4708,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r wasm_i32x4_dot_i16x8(v0lfh, v1lh)), wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), - wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d))); + wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->xxxd)))); } *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + @@ -4722,14 +4723,14 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; i++) { const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)); - summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; + summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].xxxs); __m256i qx = bytes_from_nibbles_32(x[i].qs); __m256i bxhi = bytes_from_bits_32(x[i].qh); bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10)); qx = _mm256_or_si256(qx, bxhi); - const __m256 dy = _mm256_set1_ps(y[i].d); + const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[i].xxxd)); const __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs); const __m256 q = mul_sum_us8_pairs_float(qx, qy); @@ -4749,7 +4750,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r for (int i = 0; i < nb; i++) { const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)); - summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s; + summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].xxxs); __m256i bx_0 = bytes_from_nibbles_32(x[i].qs); const __m256i bxhi = bytes_from_bits_32(x[i].qh); @@ -4763,7 +4764,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r bxh = _mm_or_si128(bxh, bxhih); bx_0 = MM256_SET_M128I(bxh, bxl); - const __m256 dy = _mm256_set1_ps(y[i].d); + const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[i].xxxd)); const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs); const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0); @@ -4830,7 +4831,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); - sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].xxxd))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].xxxs); } *s = sumf; @@ -4854,7 +4855,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); } - sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].xxxd))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].xxxs); } *s = sumf;