Fix quantized k-cache without FA #105

Merged: 2 commits, Oct 24, 2024
14 changes: 9 additions & 5 deletions ggml/src/ggml-cuda.cu
@@ -1170,8 +1170,8 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {

GGML_ASSERT(ggml_backend_buffer_is_cuda(src->buffer));
-char * src_ptr = (char *) src->data;
-char * dst_ptr = (char *) dst;
+const char * src_ptr = (const char *) src->data;
+char * dst_ptr = (char *) dst;

const int64_t ne0 = src->ne[0];
const int64_t nb0 = src->nb[0];
@@ -1182,7 +1182,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
const int64_t ts = ggml_type_size(type);
const int64_t rs = ggml_row_size(type, ne0);
const int64_t bs = ggml_blck_size(type);
-int64_t i1_diff = i1_high - i1_low;
+const int64_t i1_diff = i1_high - i1_low;

const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
if (nb0 == ts && nb1 == rs) {
@@ -1532,10 +1532,14 @@ static void ggml_cuda_op_mul_mat(
if (src0_is_contiguous) {
dev[id].src0_dd = split ? (char *) src0_extra->data_device[id] : (char *) src0->data;
} else {
-dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), ggml_nbytes(src0));
+// If src0 is not contiguous it will be copied to a temporary buffer, it may then be necessary to clear padding.
+const size_t nbytes_data = ggml_nbytes(src0);
+const size_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
+dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), nbytes_data + nbytes_padding);
+CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd, 0, nbytes_data + nbytes_padding, stream));
}

-// If src0 is on a temporary compute buffers (partial offloading) there may be some padding that needs to be cleared:
+// If src0 is on a temporary compute buffer (partial offloading) there may be some padding that needs to be cleared:
if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
const int64_t nbytes_data = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
const int64_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
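
Note on the allocation change above: a non-contiguous src0 is copied into a temporary buffer, and the quantized kernels read that buffer in whole chunks padded out to MATRIX_ROW_PADDING elements, so the tail past the real data has to exist and be zeroed. Below is a minimal standalone sketch of the padded-size arithmetic only; row_size() merely mirrors what ggml_row_size computes, and the concrete ne00 and block sizes are hypothetical.

```cpp
// Minimal sketch (not ggml code): how many extra bytes the padded allocation
// above reserves and zeroes. row_size() mirrors ggml_row_size(type, n),
// i.e. bytes = (n / block size) * bytes per block.
#include <cstdint>
#include <cstdio>

static int64_t row_size(int64_t type_size, int64_t blck_size, int64_t n) {
    return type_size * (n / blck_size);
}

int main() {
    const int64_t MATRIX_ROW_PADDING = 512;  // padding granularity assumed for the CUDA backend
    const int64_t ne00      = 4160;          // hypothetical row length, not a multiple of 512
    const int64_t type_size = 18;            // e.g. a Q4_0-style block: 2-byte scale + 16 quant bytes
    const int64_t blck_size = 32;            // values per block

    // Same expression as in the patch: pad the row out to the next multiple of
    // MATRIX_ROW_PADDING, then memset the whole (data + padding) region to 0.
    const int64_t nbytes_padding =
        row_size(type_size, blck_size, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
    printf("padding bytes to allocate and clear: %lld\n", (long long) nbytes_padding); // 252
    return 0;
}
```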
4 changes: 1 addition & 3 deletions ggml/src/ggml-cuda/mmq.cu
@@ -8,16 +8,14 @@ void ggml_cuda_op_mul_mat_q(

const int64_t ne00 = src0->ne[0];

-const int64_t nb01 = src0->nb[1];
-
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
GGML_ASSERT(ne10 % QK8_1 == 0);

const int64_t ne0 = dst->ne[0];

const int64_t row_diff = row_high - row_low;
-const int64_t stride00 = nb01 / ggml_type_size(src0->type);
+const int64_t stride00 = ne00 / ggml_blck_size(src0->type);

int id = ggml_cuda_get_device();
const int compute_capability = ggml_cuda_info().devices[id].cc;
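
Note on the stride00 change above: together with the ggml-cuda.cu change, a non-contiguous src0 now reaches this kernel as a contiguous temporary copy, so the row stride should be derived from ne00 (blocks per contiguous row) rather than from nb01, which describes the byte stride of the original view. A minimal sketch of the two computations with hypothetical sizes (not ggml code):

```cpp
// Minimal sketch: old vs. new stride00. All concrete sizes are hypothetical;
// the point is that nb01 describes the original (possibly strided) source
// view, while ne00 describes the contiguous copy the kernel actually reads.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne00      = 4096;  // elements per row
    const int64_t blck_size = 32;    // values per quantized block (e.g. Q8_0)
    const int64_t type_size = 34;    // bytes per Q8_0 block: 2-byte scale + 32 quant bytes

    // Old: derived from the byte stride of the source view. If that view is
    // padded or strided, this no longer matches the contiguous copy.
    const int64_t nb01       = 4624;              // hypothetical padded byte stride
    const int64_t stride_old = nb01 / type_size;  // 136 blocks

    // New: blocks per row of the contiguous copy, independent of the source layout.
    const int64_t stride_new = ne00 / blck_size;  // 128 blocks

    printf("stride00 old = %lld, new = %lld\n",
           (long long) stride_old, (long long) stride_new);
    return 0;
}
```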
7 changes: 2 additions & 5 deletions ggml/src/ggml-cuda/quantize.cu
@@ -84,7 +84,8 @@ static __global__ void quantize_mmq_q8_1(
}
}

-const float d_inv = 127.0f / amax;
+const float d = amax/127.f;
+const float d_inv = d > 0 ? 1/d : 0.f;
char4 q;
q.x = roundf(xi.x*d_inv);
q.y = roundf(xi.y*d_inv);
@@ -106,8 +107,6 @@ static __global__ void quantize_mmq_q8_1(
return;
}

-const float d = 1.0f / d_inv;
-
y[ib].d2s6[iqs/64] = d;

return;
@@ -117,8 +116,6 @@ static __global__ void quantize_mmq_q8_1(
return;
}

-const float d = 1.0f / d_inv;
-
if (ds_layout == MMQ_Q8_1_DS_LAYOUT_DS4) {
y[ib].ds4[iqs/32] = make_half2(d, sum);
} else {
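
Note on the quantize_mmq_q8_1 change above: the old code computed d_inv = 127.0f / amax up front, which is +inf for an all-zero block (exactly what the zero-cleared padding produces), and 0 * inf is NaN. Computing d first and using 1/d only when d > 0 makes such a block quantize to zeros with a scale of 0. A standalone sketch of the two behaviours (not the kernel itself):

```cpp
// Minimal sketch: why the scale computation in quantize_mmq_q8_1 is now
// guarded. amax is the maximum |x| over a block; a block that comes from the
// zero-initialized padding has amax == 0.
#include <cmath>
#include <cstdio>

int main() {
    const float amax = 0.0f;   // all-zero block
    const float x    = 0.0f;   // any value in that block

    // Old: 127 / 0 == +inf, and 0 * inf == NaN, so the stored quant is NaN.
    const float d_inv_old = 127.0f / amax;
    printf("old: q = %f\n", roundf(x * d_inv_old));              // nan

    // New: d == 0 for an all-zero block, d_inv falls back to 0, quant is 0.
    const float d     = amax / 127.0f;
    const float d_inv = d > 0.0f ? 1.0f / d : 0.0f;
    printf("new: q = %f, scale d = %f\n", roundf(x * d_inv), d); // 0, 0
    return 0;
}
```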