Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Performance improvements on Arm for legacy and k-quants #453

Merged
merged 2 commits into from
May 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion llama.cpp/ggml-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,18 @@ typedef struct {
} block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");

// [kawrakow] Need these two for performance on Arm
// Interleaved group of four q8_1 blocks, filled by quantize_row_q8_1 on Arm NEON.
// Layout: d[0..3] are the four per-block deltas; d[4..7] are the four per-block
// quant sums (the "s" field of block_q8_1). qs holds the quants of the four
// blocks back to back, QK8_1 bytes per block.
typedef struct {
ggml_half d[8];
int8_t qs[4*QK8_1];
} block_q8_1_x4;
static_assert(sizeof(block_q8_1_x4) == 4*sizeof(block_q8_1), "wrong q8_1_x4 block size/padding");
// [kawrakow] Interleaved group of four q8_0 blocks, filled by quantize_row_q8_0 on
// Arm NEON for better memory-access patterns. d[0..3] are the four per-block
// deltas; qs holds the quants of the four blocks back to back, QK8_0 bytes each.
typedef struct {
ggml_half d[4];
int8_t qs[4*QK8_0];
} block_q8_0_x4;
static_assert(sizeof(block_q8_0_x4) == 4*sizeof(block_q8_0), "wrong q8_0_x4 block size/padding");

//
// Super-block quantization structures
//
Expand Down Expand Up @@ -313,10 +325,11 @@ typedef struct {
static_assert(sizeof(block_q6_K) == sizeof(ggml_half) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");

// This is only used for intermediate quantization and dot products
// [kawrakow] Note: I have switched the order of bsums and qs. This results in some performance gain on Arm
// (The diff view had left the pre-change `qs` member line in place, declaring `qs`
// twice; only the post-change order — d, bsums, qs — is kept here.)
typedef struct {
float d; // delta (scale used to dequantize qs)
int16_t bsums[QK_K/16]; // sum of quants in groups of 16
int8_t qs[QK_K]; // quants
} block_q8_K;
static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");

Expand Down
61 changes: 50 additions & 11 deletions llama.cpp/ggml-quants.inc
Original file line number Diff line number Diff line change
Expand Up @@ -873,7 +873,11 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
block_q8_0 * restrict y = vy;

#if defined(__ARM_NEON)
// [kawrakow] When running on Arm, we change how the data is layed out for performance reasons
block_q8_0_x4 * y4 = (block_q8_0_x4 *)vy;
int nb4 = 4*(nb/4);
for (int i = 0; i < nb; i++) {
int i4 = i/4, ir = i%4;
float32x4_t srcv [8];
float32x4_t asrcv[8];
float32x4_t amaxv[8];
Expand All @@ -890,16 +894,29 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;

y[i].d = GGML_FP32_TO_FP16(d);
// [kawrakow] When running on Arm, we change how the data is layed out for performance reasons
if (i < nb4) {
y4[i4].d[ir] = GGML_FP32_TO_FP16(d);
} else {
y[i].d = GGML_FP32_TO_FP16(d);
}

for (int j = 0; j < 8; j++) {
const float32x4_t v = vmulq_n_f32(srcv[j], id);
const int32x4_t vi = vcvtnq_s32_f32(v);

y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
// [kawrakow] When running on Arm, we change how the data is layed out for performance reasons
if (i < nb4) {
y4[i4].qs[32*ir + 4*j + 0] = vgetq_lane_s32(vi, 0);
y4[i4].qs[32*ir + 4*j + 1] = vgetq_lane_s32(vi, 1);
y4[i4].qs[32*ir + 4*j + 2] = vgetq_lane_s32(vi, 2);
y4[i4].qs[32*ir + 4*j + 3] = vgetq_lane_s32(vi, 3);
} else {
y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
}
}
}
#elif defined(__wasm_simd128__)
Expand Down Expand Up @@ -1192,7 +1209,11 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
block_q8_1 * restrict y = vy;

#if defined(__ARM_NEON)
// [kawrakow] When running on Arm, we change how the data is layed out for performance reasons
block_q8_1_x4 * restrict y4 = vy;
int nb4 = 4*(nb/4);
for (int i = 0; i < nb; i++) {
int i4 = i/4, ir = i%4;
float32x4_t srcv [8];
float32x4_t asrcv[8];
float32x4_t amaxv[8];
Expand All @@ -1209,23 +1230,41 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;

y[i].d = GGML_FP32_TO_FP16(d);
// [kawrakow] When running on Arm, we change how the data is layed out for performance reasons
if (i < nb4) {
y4[i4].d[ir] = GGML_FP32_TO_FP16(d);
} else {
y[i].d = GGML_FP32_TO_FP16(d);
}

int32x4_t accv = vdupq_n_s32(0);

for (int j = 0; j < 8; j++) {
const float32x4_t v = vmulq_n_f32(srcv[j], id);
const int32x4_t vi = vcvtnq_s32_f32(v);

y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
// [kawrakow] When running on Arm, we change how the data is layed out for performance reasons
if (i < nb4) {
y4[i4].qs[QK8_1*ir + 4*j + 0] = vgetq_lane_s32(vi, 0);
y4[i4].qs[QK8_1*ir + 4*j + 1] = vgetq_lane_s32(vi, 1);
y4[i4].qs[QK8_1*ir + 4*j + 2] = vgetq_lane_s32(vi, 2);
y4[i4].qs[QK8_1*ir + 4*j + 3] = vgetq_lane_s32(vi, 3);
} else {
y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
}

accv = vaddq_s32(accv, vi);
}

y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
// [kawrakow] When running on Arm, we change how the data is layed out for performance reasons
if (i < nb4) {
y4[i4].d[ir+4] = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
} else {
y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
}
}
#elif defined(__wasm_simd128__)
for (int i = 0; i < nb; i++) {
Expand Down
7 changes: 6 additions & 1 deletion llama.cpp/quantize/quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,12 @@ static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix
static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count";

static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
std::string ftype_str;
std::string ftype_str; ftype_str.reserve(ftype_str_in.size());

bool is_number = true;
for (auto ch : ftype_str_in) {
ftype_str.push_back(std::toupper(ch));
if (!std::isdigit(ftype_str.back())) is_number = false;
}
for (auto & it : QUANT_OPTIONS) {
if (it.name == ftype_str) {
Expand All @@ -77,6 +79,9 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
return true;
}
}
// On my system (OS Ventura 13.2.1) calling std::stoi with invalid input leads to a crash (Segmentation fault 11)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can fix that after this change goes in.

// Hence the check above and the early return
if (!is_number) return false;
try {
int ftype_int = std::stoi(ftype_str);
for (auto & it : QUANT_OPTIONS) {
Expand Down
1 change: 1 addition & 0 deletions llamafile/BUILD.mk
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ o/$(MODE)/llamafile: \
o/$(MODE)/llamafile/sgemm.o: private CXXFLAGS += -Os
o/$(MODE)/llamafile/iqk_mul_mat_amd_avx2.o: private TARGET_ARCH += -Xx86_64-mtune=skylake -Xx86_64-mavx2 -Xx86_64-mfma -Xx86_64-mf16c
o/$(MODE)/llamafile/iqk_mul_mat_amd_zen4.o: private TARGET_ARCH += -Xx86_64-mtune=skylake -Xx86_64-mavx2 -Xx86_64-mfma -Xx86_64-mf16c -Xx86_64-mavx512f -Xx86_64-mavx512vl -Xx86_64-mavx512vnni -Xx86_64-mavx512bw -Xx86_64-mavx512dq
o/$(MODE)/llamafile/iqk_mul_mat_arm82.o: private TARGET_ARCH += -Xaarch64-march=armv8.2-a+dotprod+fp16
o/$(MODE)/llamafile/tinyblas_cpu_sgemm_amd_avx.o: private TARGET_ARCH += -Xx86_64-mtune=sandybridge -Xx86_64-mf16c
o/$(MODE)/llamafile/tinyblas_cpu_mixmul_amd_avx.o: private TARGET_ARCH += -Xx86_64-mtune=sandybridge -Xx86_64-mf16c
o/$(MODE)/llamafile/tinyblas_cpu_sgemm_amd_fma.o: private TARGET_ARCH += -Xx86_64-mtune=bdver2 -Xx86_64-mf16c -Xx86_64-mfma
Expand Down
Loading