Skip to content

Commit

Permalink
cuBLAS: improve ggml_compute_forward_mul_mat_f16_f32 with pinned memory
Browse files Browse the repository at this point in the history
  • Loading branch information
slaren committed Apr 27, 2023
1 parent 0418b27 commit 7455432
Showing 1 changed file with 7 additions and 6 deletions.
13 changes: 7 additions & 6 deletions ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -8401,15 +8401,19 @@ static void ggml_compute_forward_mul_mat_f16_f32(
const int d_ne = ne11 * ne01;

size_t x_size, y_size, d_size;
float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
ggml_fp16_t * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
ggml_fp16_t * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
#else
float * const wdata = params->wdata;
#endif
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
#if defined(GGML_USE_CUBLAS)
// copy src0 while converting src1
const ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + i02*nb02 + i03*nb03);
CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(ggml_fp16_t) * x_ne, cudaMemcpyHostToDevice, g_cudaStream));

// with cuBlAS, instead of converting src0 to fp32, we convert src1 to fp16
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + (ne11 * ne10) * (i03 * ne02 + i02);
{
Expand All @@ -8432,13 +8436,10 @@ static void ggml_compute_forward_mul_mat_f16_f32(
#endif

#if defined(GGML_USE_CUBLAS)
const ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + i02*nb02 + i03*nb03);
const ggml_fp16_t * y = (ggml_fp16_t *) wdata;

float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);

// copy data to device
CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(ggml_fp16_t) * x_ne, cudaMemcpyHostToDevice, g_cudaStream));
CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));

// compute
Expand Down

0 comments on commit 7455432

Please sign in to comment.