Add some minimal optimizations for CDNA
IMbackK committed Nov 25, 2024
1 parent a9a678a commit 462b9bf
Showing 6 changed files with 19 additions and 4 deletions.
4 changes: 4 additions & 0 deletions ggml/src/ggml-cuda/common.cuh
@@ -47,6 +47,10 @@
 #define CC_TURING 750
 #define CC_AMPERE 800
 #define CC_OFFSET_AMD 1000000
+#define CC_GCN4 (CC_OFFSET_AMD + 803)
+#define CC_VEGA (CC_OFFSET_AMD + 900)
+#define CC_VEGA20 (CC_OFFSET_AMD + 906)
+#define CC_CDNA (CC_OFFSET_AMD + 908)
 #define CC_RDNA1 (CC_OFFSET_AMD + 1010)
 #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
 #define CC_RDNA3 (CC_OFFSET_AMD + 1100)
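The four new constants extend the existing scheme in which an AMD device is reported as CC_OFFSET_AMD plus its GFX architecture number, so one integer comparison can classify both NVIDIA and AMD GPUs. A minimal sketch of that idea follows; the helper names are illustrative, not part of the commit:

    // Sketch only: classify a device from its ggml compute-capability value.
    // Any cc at or above CC_OFFSET_AMD is an AMD GPU; subtracting the offset
    // recovers the GFX architecture number (e.g. 908 for CDNA / gfx908).
    static bool cc_is_amd (int cc) { return cc >= CC_OFFSET_AMD; }
    static bool cc_is_cdna(int cc) { return cc == CC_CDNA;       }  // gfx908 (MI100 class)
    static bool cc_is_rdna(int cc) { return cc >= CC_RDNA1;      }  // gfx101x and newer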
9 changes: 8 additions & 1 deletion ggml/src/ggml-cuda/ggml-cuda.cu
@@ -1107,14 +1107,18 @@ static void ggml_cuda_op_mul_mat_cublas(
         const half alpha_f16 = 1.0f;
         const half beta_f16 = 0.0f;
 
+        cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
+        if(ggml_cuda_info().devices[ctx.device].cc == CC_CDNA)
+            cu_compute_type = CUBLAS_COMPUTE_32F;
+
         CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
         CUBLAS_CHECK(
             cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
                     row_diff, src1_ncols, ne10,
                     &alpha_f16, src0_ptr,      CUDA_R_16F, ne00,
                                 src1_ptr,      CUDA_R_16F, ne10,
                     &beta_f16,  dst_f16.get(), CUDA_R_16F, ldc,
-                    CUBLAS_COMPUTE_16F,
+                    cu_compute_type,
                     CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
         const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
@@ -1607,6 +1611,9 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
     cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
     cudaDataType_t cu_data_type = CUDA_R_16F;
 
+    if(ggml_cuda_info().devices[ctx.device].cc == CC_CDNA)
+        cu_compute_type = CUBLAS_COMPUTE_32F;
+
     // dst strides
     size_t nbd2 = dst->nb[2];
     size_t nbd3 = dst->nb[3];
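Both hunks above make the same change: the GEMM operands stay in FP16 storage, but on CDNA the BLAS library is asked to accumulate in FP32. A likely motivation, not stated in the commit, is that CDNA's MFMA matrix instructions accumulate FP16 products into FP32 registers anyway, so FP32 compute maps onto the tuned rocBLAS path without extra cost. A minimal restatement of the selection, assuming cc already holds the device's compute capability:

    // Sketch: pick the cuBLAS/hipBLAS compute (accumulator) type per device.
    // Inputs and outputs stay CUDA_R_16F in both cases.
    cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F; // default: FP16 accumulate
    if (cc == CC_CDNA) {
        cu_compute_type = CUBLAS_COMPUTE_32F;                 // CDNA: FP32 accumulate
    }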
2 changes: 1 addition & 1 deletion ggml/src/ggml-cuda/mmq.cu
@@ -148,5 +148,5 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
     }
 
-    return cc < CC_RDNA3 || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
+    return (cc < CC_RDNA3 && cc != CC_CDNA && cc != CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
 }
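Spelled out with De Morgan's law, the updated predicate keeps the quantized MMQ kernels unless the device is RDNA3, CDNA, or Vega20 and the batch is large, in which case the dequantize-and-BLAS path is presumably faster on those architectures. An equivalent, illustrative rewrite:

    // Equivalent form of the new return expression (illustrative only).
    const bool arch_prefers_blas = cc >= CC_RDNA3 || cc == CC_CDNA || cc == CC_VEGA20;
    return !arch_prefers_blas || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;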
2 changes: 1 addition & 1 deletion ggml/src/ggml-cuda/mmq.cuh
@@ -2570,7 +2570,7 @@ static __device__ void mul_mat_q_process_tile(

 template <ggml_type type, int mmq_x, int nwarps, bool need_check>
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
+#if defined(RDNA3) || defined(RDNA2) || defined(CDNA)
     __launch_bounds__(WARP_SIZE*nwarps, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
 #else
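__launch_bounds__(WARP_SIZE*nwarps, 2) caps the kernel's threads per block and asks the compiler to limit register use so at least two blocks can stay resident per compute unit; with this commit the hint is applied on CDNA as well as RDNA2/RDNA3. A self-contained illustration of the attribute (the kernel itself is hypothetical, not from ggml):

    // __launch_bounds__(max_threads_per_block, min_blocks_per_multiprocessor):
    // promise at most 256 threads per block and request register limits that
    // allow at least 2 resident blocks per SM/CU, trading spills for occupancy.
    __global__ void __launch_bounds__(256, 2) scale_kernel(float * dst, const float * src, int n) {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            dst[i] = 2.0f * src[i];
        }
    }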
2 changes: 1 addition & 1 deletion ggml/src/ggml-cuda/mmvq.cu
@@ -142,7 +142,7 @@ static void mul_mat_vec_q_cuda(
     int64_t nwarps = 1;
     int64_t rows_per_cuda_block = 1;
 
-    if (ggml_cuda_info().devices[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2
+    if (ggml_cuda_info().devices[id].cc < CC_CDNA || ggml_cuda_info().devices[id].cc == CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA
         switch(ncols_y) {
             case 1:
                 nwarps = 4;
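The changed guard means the per-ncols_y warp tuning now covers NVIDIA GPUs, AMD GPUs older than CDNA (GCN4/Vega/Vega20), and RDNA1, while CDNA and RDNA2+ fall through to the other branch. An illustrative restatement of the condition only:

    // Illustrative restatement of the new guard (not from the diff).
    const int cc = ggml_cuda_info().devices[id].cc;
    const bool use_ncols_tuning = cc < CC_CDNA     // NVIDIA and pre-CDNA AMD
                               || cc == CC_RDNA1;  // RDNA1 keeps the old behaviour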
4 changes: 4 additions & 0 deletions ggml/src/ggml-cuda/vendors/hip.h
@@ -109,6 +109,10 @@
 #define RDNA1
 #endif
 
+#if defined(__gfx908__) || defined(__gfx90a__)
+#define CDNA
+#endif
+
 #ifndef __has_builtin
 #define __has_builtin(x) 0
 #endif
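HIP defines a __gfx9xx__ macro for the target each device pass is compiled for, so the new block collapses gfx908 (CDNA 1, MI100 class) and gfx90a (CDNA 2, MI200 class) into a single CDNA macro that device code can test at compile time. A hypothetical usage sketch; the constant values are illustrative, not taken from ggml:

    // Hypothetical example of branching on the new macro at compile time.
    static __device__ __forceinline__ constexpr int preferred_unroll() {
    #if defined(CDNA)
        return 8;   // wider unrolling on CDNA
    #else
        return 4;   // conservative default elsewhere
    #endif
    }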
