Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ggml : fix BLAS with unsupported types #9775

Merged
merged 2 commits into from
Oct 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions examples/export-lora/export-lora.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -314,9 +314,9 @@ struct lora_merge_ctx {
// optionally dequantize it
printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
auto nels = ggml_nelements(inp_base);
ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
const auto * qtype = ggml_get_type_traits(base->type);
std::vector<uint8_t> dequant_buf(nels * sizeof(float));
qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
} else {
ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
Expand Down
10 changes: 5 additions & 5 deletions examples/quantize-stats/quantize-stats.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
}

static void test_roundtrip_on_chunk(
const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference,
const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, bool use_reference,
float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
) {
if (layer->type == GGML_TYPE_F16) {
Expand All @@ -166,7 +166,7 @@ static void test_roundtrip_on_chunk(

// Run quantization function for a single layer and update error stats
static void test_roundtrip_on_layer(
std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference,
std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, bool use_reference,
const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
) {
Expand Down Expand Up @@ -371,8 +371,8 @@ int main(int argc, char ** argv) {
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
continue;
}
ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
if (qfns.from_float && qfns.to_float) {
const auto * qfns = ggml_get_type_traits(type);
if (qfns->from_float && qfns->to_float) {
if (params.verbose) {
printf("testing %s ...\n", ggml_type_name(type));
}
Expand All @@ -393,7 +393,7 @@ int main(int argc, char ** argv) {
test_roundtrip_on_layer(
layer_name,
params.per_layer_stats,
qfns,
*qfns,
params.reference,
kv_tensor.second,
input_scratch,
Expand Down
6 changes: 3 additions & 3 deletions ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -2535,7 +2535,7 @@ extern "C" {
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);

typedef struct {
struct ggml_type_traits {
const char * type_name;
int64_t blck_size;
int64_t blck_size_interleave; // interleave elements in blocks
Expand All @@ -2551,9 +2551,9 @@ extern "C" {
int64_t ncols; // number of columns to process simultaneously
ggml_gemv_t gemv;
ggml_gemm_t gemm;
} ggml_type_traits_t;
};

GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);

#ifdef __cplusplus
}
Expand Down
2 changes: 1 addition & 1 deletion ggml/src/ggml-backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1177,7 +1177,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
op->type != GGML_TYPE_IQ1_S &&
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
case GGML_OP_MUL_MAT:
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
case GGML_OP_ROPE_BACK:
return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
case GGML_OP_IM2COL_BACK:
Expand Down
26 changes: 14 additions & 12 deletions ggml/src/ggml-blas.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg

// convert src0 to float
if (type != GGML_TYPE_F32) {
ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type);
ggml_to_float_t const to_float = type_traits.to_float;
const auto * type_traits = ggml_get_type_traits(type);
ggml_to_float_t const to_float = type_traits->to_float;

for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
Expand Down Expand Up @@ -420,19 +420,21 @@ static bool ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const s
// TODO: find the optimal value
const int64_t min_batch = 32;

return (ggml_is_contiguous(src0) &&
ggml_is_contiguous(src1) &&
src1->type == GGML_TYPE_F32 &&
(ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch));
return ggml_is_contiguous(src0) &&
ggml_is_contiguous(src1) &&
src1->type == GGML_TYPE_F32 &&
(ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch) &&
(src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL);
}

case GGML_OP_OUT_PROD:
return (op->src[0]->type == GGML_TYPE_F32 &&
op->src[1]->type == GGML_TYPE_F32 &&
ggml_is_matrix(src0) &&
ggml_is_matrix(src1) &&
ggml_is_contiguous(src0) &&
(ggml_is_contiguous(src1) || ggml_is_transposed(src1)));
return op->src[0]->type == GGML_TYPE_F32 &&
op->src[1]->type == GGML_TYPE_F32 &&
ggml_is_matrix(src0) &&
ggml_is_matrix(src1) &&
ggml_is_contiguous(src0) &&
(ggml_is_contiguous(src1) || ggml_is_transposed(src1)) &&
(src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL);

default:
return false;
Expand Down
4 changes: 2 additions & 2 deletions ggml/src/ggml-vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5287,9 +5287,9 @@ static void ggml_vk_dequantize_data(const void * from, float * to, size_t ne, gg
return;
}

ggml_type_traits_t tt = ggml_internal_get_type_traits(quant);
const auto * tt = ggml_get_type_traits(quant);

ggml_to_float_t dequant_fn = tt.to_float;
ggml_to_float_t dequant_fn = tt->to_float;

dequant_fn(from, to, ne);
}
Expand Down
6 changes: 3 additions & 3 deletions ggml/src/ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -729,7 +729,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float *
static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);

static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
[GGML_TYPE_I8] = {
.type_name = "i8",
.blck_size = 1,
Expand Down Expand Up @@ -1151,9 +1151,9 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
};

// For internal test use
ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
GGML_ASSERT(type < GGML_TYPE_COUNT);
return type_traits[type];
return &type_traits[type];
}

//
Expand Down
6 changes: 3 additions & 3 deletions pocs/vdot/q8dot.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ int main(int argc, char** argv) {

auto ggml_type = type == 0 ? GGML_TYPE_Q4_0 : GGML_TYPE_Q4_1;

auto funcs = ggml_internal_get_type_traits(ggml_type);
const auto * funcs = ggml_get_type_traits(ggml_type);

Stat simple, ggml;

Expand All @@ -156,8 +156,8 @@ int main(int argc, char** argv) {

t1 = std::chrono::high_resolution_clock::now();
float fs;
if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
else funcs.vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
if (type == 0) funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x40.data(), 0, y.data(), 0, 1);
else funcs->vec_dot(kVecSize * QK4_1, &fs, 0, x41.data(), 0, y.data(), 0, 1);
t2 = std::chrono::high_resolution_clock::now();
t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
if (iloop > 3) ggml.addResult(fs, t);
Expand Down
14 changes: 7 additions & 7 deletions pocs/vdot/vdot.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ int main(int argc, char** argv) {
int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);

auto funcs = useQ4_1 ? ggml_internal_get_type_traits(GGML_TYPE_Q4_1) : ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
const auto * funcs = useQ4_1 ? ggml_get_type_traits(GGML_TYPE_Q4_1) : ggml_get_type_traits(GGML_TYPE_Q4_0);

std::vector<block_q4_0> q40;
std::vector<block_q4_1> q41;
Expand All @@ -261,9 +261,9 @@ int main(int argc, char** argv) {
// Note, we do not include this in the timing as in practical application
// we already have the quantized model weights.
if (useQ4_1) {
funcs.from_float(x1.data(), q41.data(), kVecSize);
funcs->from_float(x1.data(), q41.data(), kVecSize);
} else {
funcs.from_float(x1.data(), q40.data(), kVecSize);
funcs->from_float(x1.data(), q40.data(), kVecSize);
}

// Now measure time the dot product needs using the "scalar" version above
Expand All @@ -282,10 +282,10 @@ int main(int argc, char** argv) {
dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
}
else {
auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type);
vdot.from_float(y1.data(), q8.data(), kVecSize);
if (useQ4_1) funcs.vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
else funcs.vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
const auto * vdot = ggml_get_type_traits(funcs->vec_dot_type);
vdot->from_float(y1.data(), q8.data(), kVecSize);
if (useQ4_1) funcs->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
else funcs->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
}
sumq += result;
t2 = std::chrono::high_resolution_clock::now();
Expand Down
9 changes: 4 additions & 5 deletions src/llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17872,10 +17872,9 @@ static void llama_tensor_dequantize_internal(
}
float * f32_output = (float *) output.data();

ggml_type_traits_t qtype;
const ggml_type_traits * qtype = ggml_get_type_traits(tensor->type);
if (ggml_is_quantized(tensor->type)) {
qtype = ggml_internal_get_type_traits(tensor->type);
if (qtype.to_float == NULL) {
if (qtype->to_float == NULL) {
throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
}
} else if (tensor->type != GGML_TYPE_F16 &&
Expand All @@ -17889,7 +17888,7 @@ static void llama_tensor_dequantize_internal(
} else if (tensor->type == GGML_TYPE_BF16) {
ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
} else if (ggml_is_quantized(tensor->type)) {
qtype.to_float(tensor->data, f32_output, nelements);
qtype->to_float(tensor->data, f32_output, nelements);
} else {
GGML_ABORT("fatal error"); // unreachable
}
Expand Down Expand Up @@ -17925,7 +17924,7 @@ static void llama_tensor_dequantize_internal(
} else if (typ == GGML_TYPE_BF16) {
ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
} else {
qtype.to_float(inbuf, outbuf, nels);
qtype->to_float(inbuf, outbuf, nels);
}
};
workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
Expand Down
4 changes: 2 additions & 2 deletions tests/test-backend-ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
std::vector<uint8_t> buf(ggml_nbytes(t));
ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));

ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
const auto * tt = ggml_get_type_traits(t->type);
size_t bs = ggml_blck_size(t->type);
std::vector<float> vq(ggml_blck_size(t->type));
bool quantized = ggml_is_quantized(t->type);
Expand All @@ -159,7 +159,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
} else if (t->type == GGML_TYPE_I8) {
tv.push_back((float)*(int8_t *) &buf[i]);
} else if (quantized) {
tt.to_float(&buf[i], vq.data(), bs);
tt->to_float(&buf[i], vq.data(), bs);
tv.insert(tv.end(), vq.begin(), vq.end());
} else {
GGML_ABORT("fatal error");
Expand Down
32 changes: 16 additions & 16 deletions tests/test-quantize-fns.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,26 +44,26 @@ static float array_rmse(const float * a1, const float * a2, size_t n) {
}

// Total quantization error on test data
static float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
static float total_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) {
std::vector<uint8_t> tmp_q(2*test_size);
std::vector<float> tmp_out(test_size);

qfns.from_float(test_data, tmp_q.data(), test_size);
qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
qfns->from_float(test_data, tmp_q.data(), test_size);
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
return array_rmse(test_data, tmp_out.data(), test_size);
}

// Total quantization error on test data
static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
static float reference_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) {
std::vector<uint8_t> tmp_q(2*test_size);
std::vector<float> tmp_out(test_size);
std::vector<float> tmp_out_ref(test_size);

qfns.from_float(test_data, tmp_q.data(), test_size);
qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
qfns->from_float(test_data, tmp_q.data(), test_size);
qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);

qfns.from_float_ref(test_data, tmp_q.data(), test_size);
qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
qfns->from_float_ref(test_data, tmp_q.data(), test_size);
qfns->to_float(tmp_q.data(), tmp_out_ref.data(), test_size);

return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
}
Expand All @@ -78,18 +78,18 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) {

// Total dot product error
static float dot_product_error(
ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2
const ggml_type_traits * qfns, size_t test_size, const float * test_data1, const float *test_data2
) {
std::vector<uint8_t> tmp_q1(2*test_size);
std::vector<uint8_t> tmp_q2(2*test_size);

auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
const auto * vdot = ggml_get_type_traits(qfns->vec_dot_type);

qfns.from_float(test_data1, tmp_q1.data(), test_size);
vdot.from_float(test_data2, tmp_q2.data(), test_size);
qfns->from_float(test_data1, tmp_q1.data(), test_size);
vdot->from_float(test_data2, tmp_q2.data(), test_size);

float result = INFINITY;
qfns.vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);
qfns->vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1);

const float dot_ref = dot_product(test_data1, test_data2, test_size);

Expand Down Expand Up @@ -131,10 +131,10 @@ int main(int argc, char * argv[]) {

for (int i = 0; i < GGML_TYPE_COUNT; i++) {
ggml_type type = (ggml_type) i;
ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
const auto * qfns = ggml_get_type_traits(type);

// deprecated - skip
if (qfns.blck_size == 0) {
if (qfns->blck_size == 0) {
continue;
}

Expand All @@ -143,7 +143,7 @@ int main(int argc, char * argv[]) {
printf("Testing %s\n", ggml_type_name((ggml_type) i));
ggml_quantize_init(ei);

if (qfns.from_float && qfns.to_float) {
if (qfns->from_float && qfns->to_float) {
const float total_error = total_quantization_error(qfns, test_size, test_data.data());
const float max_quantization_error =
type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
Expand Down
Loading
Loading