fixes
slaren committed Nov 7, 2024
1 parent 02f8cdf commit a0a4646
Showing 9 changed files with 113 additions and 101 deletions.
5 changes: 2 additions & 3 deletions Makefile
@@ -874,9 +874,8 @@ ggml/src/ggml-cuda/%.o: \
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
endif # GGML_HIPBLAS

ifdef GGML_CPU_AARCH64
ifndef GGML_NO_CPU_AARCH64
MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
MK_CFLAGS += -DGGML_USE_CPU_AARCH64
endif

ifdef GGML_METAL
@@ -888,7 +887,7 @@ ifdef GGML_METAL_NDEBUG
endif
ifdef GGML_METAL_EMBED_LIBRARY
MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
OBJ_GGML += ggml/src/ggml-metal-embed.o
OBJ_GGML += ggml/src/ggml-metal-embed.o
endif
endif # GGML_METAL

2 changes: 1 addition & 1 deletion ggml/CMakeLists.txt
@@ -92,7 +92,7 @@ else()
endif()

option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
option(GGML_CPU_AARCH64 "ggml: use runtime weight quantization to enable optimized GEMM/GEMV kernels for AARCH64 cpu" OFF)
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)

option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
5 changes: 3 additions & 2 deletions ggml/include/ggml-cpu.h
@@ -145,9 +145,10 @@ extern "C" {
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif

#ifdef GGML_USE_CPU_AARCH64
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
#endif
GGML_API bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft);



#ifdef __cplusplus
}
2 changes: 1 addition & 1 deletion ggml/src/CMakeLists.txt
@@ -881,7 +881,7 @@ if (GGML_CPU_HBM)
endif()

if (GGML_CPU_AARCH64)
message(STATUS "Using runtime weight quantization to enable optimized GEMM/GEMV kernels for AARCH64 cpu")
message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels")

add_compile_definitions(GGML_USE_CPU_AARCH64)
endif()
70 changes: 32 additions & 38 deletions ggml/src/ggml-aarch64.c
@@ -3477,14 +3477,13 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
}
}

#ifdef GGML_USE_CPU_AARCH64
static void repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * data, size_t data_size) {
static void repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) {
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
GGML_ASSERT(t->ne[0] % 8 == 0);
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);

block_q4_0x4 *dst = (block_q4_0x4 *)t->data;
const block_q4_0 *src = (const block_q4_0 *)data;
block_q4_0x4 * dst = (block_q4_0x4 *)t->data;
const block_q4_0 * src = (const block_q4_0 *)data;
block_q4_0 dst_tmp[4];
int nrow = t->ne[1]; // Number of rows
int nrows_interleaved = 4;
@@ -3493,8 +3492,7 @@ static void repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_bloc
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));

for (int b = 0; b < nrow; b += nrows_interleaved) {
for (int64_t x = 0; x < nblocks; x++)
{
for (int64_t x = 0; x < nblocks; x++) {
for (int i = 0; i < nrows_interleaved; i++) {
dst_tmp[i] = src[x + i * nblocks];
}
@@ -3506,13 +3504,13 @@ static void repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_bloc
GGML_UNUSED(data_size);
}

static void repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, const void * data, size_t data_size) {
static void repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) {
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
GGML_ASSERT(t->ne[0] % 8 == 0);
GGML_ASSERT(interleave_block == 8);

block_q4_0x8 *dst = (block_q4_0x8*)t->data;
const block_q4_0 *src = (const block_q4_0*) data;
block_q4_0x8 * dst = (block_q4_0x8*)t->data;
const block_q4_0 * src = (const block_q4_0*) data;
block_q4_0 dst_tmp[8];
int nrow = t->ne[1]; // Number of rows
int nrows_interleaved = 8;
@@ -3534,46 +3532,42 @@ static void repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block
}

// Prepare for optimized kernels if applicable
int ggml_prepare_optimal_kernel(struct ggml_tensor * cur, const void * data, size_t data_size) {
GGML_ASSERT(cur->type == GGML_TYPE_Q4_0);
int ret = -1;
#if defined(__ARM_ARCH)
if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
ret = 0;
}
else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
ret = 0;
}
else if (ggml_cpu_has_neon()) {
repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
ret = 0;
void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * restrict data, size_t data_size) {
if (cur->type == repack_type) {
memcpy(cur->data, data, data_size);
return;
}
#endif
return ret;

GGML_UNUSED(cur);
GGML_UNUSED(data);
GGML_UNUSED(data_size);
GGML_ASSERT(cur->type == GGML_TYPE_Q4_0);

switch (repack_type) {
case GGML_TYPE_Q4_0_8_8:
repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
break;
case GGML_TYPE_Q4_0_4_8:
repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
break;
case GGML_TYPE_Q4_0_4_4:
repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
break;
default:
GGML_ABORT("Unsupported type");
}
}

enum ggml_type ggml_get_optimal_type(const struct ggml_tensor * cur) {
#if defined(__ARM_ARCH)
enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) {
if (cur->type == GGML_TYPE_Q4_0) {
if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
// TODO: enable for AVX2 - currently disabled due to bad gemv performance
if (/* ggml_cpu_has_avx2() || */ (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
return GGML_TYPE_Q4_0_8_8;
}
else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
return GGML_TYPE_Q4_0_4_8;
}
else if (ggml_cpu_has_neon()) {
if (ggml_cpu_has_neon()) {
return GGML_TYPE_Q4_0_4_4;
}
}
#endif
return cur->type;

GGML_UNUSED(cur);
return cur->type;
}
#endif
6 changes: 2 additions & 4 deletions ggml/src/ggml-aarch64.h
@@ -33,10 +33,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);

#ifdef GGML_USE_CPU_AARCH64
int ggml_prepare_optimal_kernel(struct ggml_tensor * cur, const void * data, size_t data_size);
enum ggml_type ggml_get_optimal_type(const struct ggml_tensor * cur);
#endif
void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * data, size_t data_size);
enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur);

#ifdef __cplusplus
}
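The two declarations above are meant to be used together: ggml_aarch64_get_optimal_repack_type() reports which interleaved Q4_0_x_x layout the running CPU can exploit (or the tensor's own type when none applies), and ggml_aarch64_repack_tensor() converts the incoming Q4_0 data into that layout, falling back to a plain memcpy when the types already match. A minimal caller-side sketch, not part of this commit; the helper name and the q4_0_data/data_size parameters are illustrative:

// Illustrative helper: load raw Q4_0 block data into a tensor, repacked into
// whatever layout the current CPU's GEMM/GEMV kernels prefer.
static void load_q4_0_repacked(struct ggml_tensor * t, const void * q4_0_data, size_t data_size) {
    // Q4_0_8_8 with 256-bit SVE + int8 matmul, Q4_0_4_8 with NEON + int8 matmul,
    // Q4_0_4_4 with plain NEON, otherwise the tensor's own type (no repacking)
    enum ggml_type repack_type = ggml_aarch64_get_optimal_repack_type(t);

    // interleaves the block_q4_0 data into the block_q4_0x4/x8 form in t->data,
    // or copies it unchanged when repack_type == t->type
    ggml_aarch64_repack_tensor(t, repack_type, q4_0_data, data_size);
}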
97 changes: 59 additions & 38 deletions ggml/src/ggml-backend.cpp
@@ -2239,36 +2239,44 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
}
#endif

#ifdef GGML_USE_CPU_AARCH64

// buffer type AARCH64

#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpedantic"
#endif

#include "ggml-aarch64.h"

#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif

static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
tensor->extra = (void *)ggml_aarch64_get_optimal_repack_type(tensor); // NOLINT

GGML_UNUSED(buffer);
}

static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
bool quantize = tensor->type == GGML_TYPE_Q4_0 &&
tensor->op == GGML_OP_NONE &&
strcmp(tensor->name, "token_embd.weight") != 0;
GGML_ASSERT(offset == 0);
GGML_ASSERT(size == ggml_nbytes(tensor));

if (quantize) {
GGML_ASSERT(offset == 0);
if (ggml_prepare_optimal_kernel(tensor, data, size) == 0) {
return;
}
}
memcpy((char *)tensor->data + offset, data, size);
enum ggml_type repack_type = (enum ggml_type)(intptr_t)tensor->extra;

ggml_aarch64_repack_tensor(tensor, repack_type, data, size);

GGML_UNUSED(buffer);
}

static const struct ggml_backend_buffer_i ggml_backend_cpu_aarch64_buffer_i = {
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
/* .init_tensor = */ NULL, // no initialization required
/* .init_tensor = */ ggml_backend_cpu_aarch64_buffer_init_tensor,
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_cpu_aarch64_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
/* .get_tensor = */ NULL,
/* .cpy_tensor = */ NULL,
/* .clear = */ ggml_backend_cpu_buffer_clear,
/* .reset = */ NULL,
};
@@ -2298,33 +2306,37 @@ ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) {
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
/* .is_host = */ NULL,
},
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
/* .context = */ NULL,
/* .context = */ NULL,
};

return &ggml_backend_cpu_buffer_type_aarch64;
}
#endif

bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft) {
return buft == ggml_backend_cpu_aarch64_buffer_type();
}

static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
static ggml_backend_buffer_type_t bufts[3];
int index = 0;
static std::vector<ggml_backend_buffer_type_t> bufts = []() {
std::vector<ggml_backend_buffer_type_t> bufts;

#ifdef GGML_USE_CPU_HBM
bufts[index++] = ggml_backend_cpu_hbm_buffer_type();
bufts.push_back(ggml_backend_cpu_hbm_buffer_type());
#endif

#ifdef GGML_USE_CPU_AARCH64
if (ggml_cpu_has_neon() || ggml_cpu_has_matmul_int8() || ggml_cpu_has_sve()) {
bufts[index++] = ggml_backend_cpu_aarch64_buffer_type();
}
bufts.push_back(ggml_backend_cpu_aarch64_buffer_type());
#endif

bufts[index] = NULL; // Terminate the list
bufts.push_back(NULL);

return bufts;
}();

return bufts;
return bufts.data();

GGML_UNUSED(device);
}
@@ -2635,15 +2647,21 @@ static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_b
}

static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
#ifdef GGML_USE_CPU_AARCH64
const struct ggml_tensor *tensor = op->src[0];
if (tensor && tensor->buffer && (strcmp(tensor->buffer->buft->iface.get_name(tensor->buffer->buft),"CPU_AARCH64") == 0)) {
if (op->op == GGML_OP_MUL_MAT && tensor->type == GGML_TYPE_Q4_0) {
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_get_type_traits_cpu(tensor->type)->vec_dot_type;
const struct ggml_tensor * src0 = op->src[0];
const struct ggml_tensor * src1 = op->src[1];

if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
if (op->op != GGML_OP_MUL_MAT || src0->type != GGML_TYPE_Q4_0 || ggml_aarch64_get_optimal_repack_type(src0) == GGML_TYPE_Q4_0) {
return false;
}
return false;
}
#endif

for (int i = 1; i < GGML_MAX_SRC; i++) {
if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) {
return false;
}
}

switch (op->op) {
case GGML_OP_CPY:
return
@@ -2652,13 +2670,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
op->type != GGML_TYPE_IQ1_S &&
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
case GGML_OP_MUL_MAT:
return op->src[1]->type == GGML_TYPE_F32;// FIXME || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
return src1->type == GGML_TYPE_F32 || src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
case GGML_OP_ROPE_BACK:
return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
case GGML_OP_IM2COL_BACK:
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
case GGML_OP_OUT_PROD:
return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32;
return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32;
default:
return true;
}
@@ -2667,7 +2685,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
}

static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
return ggml_backend_buft_is_host(buft);
return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft);

GGML_UNUSED(dev);
}
@@ -2721,7 +2739,7 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch
if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
return (void *)ggml_backend_cpu_set_n_threads;
}
if (strcmp(name, "ggml_backend_cpu_get_extra_bufts") == 0) {
if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
return (void *)ggml_backend_cpu_get_extra_bufts;
}

Expand All @@ -2738,6 +2756,9 @@ static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
};

ggml_backend_reg_t ggml_backend_cpu_reg(void) {
// init CPU feature detection
ggml_cpu_init();

static struct ggml_backend_reg ggml_backend_cpu_reg = {
/* .iface = */ ggml_backend_cpu_reg_i,
/* .context = */ NULL,
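Because the proc-address string exposed by the CPU registry changed from "ggml_backend_cpu_get_extra_bufts" to "ggml_backend_dev_get_extra_bufts", a caller now has to look the function up under the new name. A hedged sketch of such a lookup follows; the helper name and the printing loop are illustrative, and it assumes ggml_backend_reg_get_proc_address(), ggml_backend_reg_dev_get(), ggml_backend_buft_name() and ggml_backend_cpu_reg() are available from the public ggml-backend.h / ggml-cpu.h headers:

#include <stdio.h>
#include "ggml-backend.h"
#include "ggml-cpu.h"

// Assumed signature of the function returned for "ggml_backend_dev_get_extra_bufts":
// a NULL-terminated list of extra buffer types for the given device.
typedef ggml_backend_buffer_type_t * (*ggml_backend_get_extra_bufts_t)(ggml_backend_dev_t);

static void print_extra_cpu_bufts(void) {
    ggml_backend_reg_t reg = ggml_backend_cpu_reg();
    ggml_backend_dev_t dev = ggml_backend_reg_dev_get(reg, 0);

    ggml_backend_get_extra_bufts_t get_extra_bufts = (ggml_backend_get_extra_bufts_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");
    if (get_extra_bufts == NULL) {
        return; // the registry does not expose extra buffer types
    }

    for (ggml_backend_buffer_type_t * buft = get_extra_bufts(dev); *buft != NULL; buft++) {
        printf("extra CPU buffer type: %s\n", ggml_backend_buft_name(*buft));
    }
}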
(Diffs for the remaining two changed files were not loaded in this view.)