From 44acb8b880c00c97f97cd4e40be67b69e873082e Mon Sep 17 00:00:00 2001 From: JohannesGaessler Date: Tue, 5 Sep 2023 12:00:14 +0200 Subject: [PATCH] refacotr --- ggml-cuda.cu | 170 +++++++++++++++++++++------------------------------ 1 file changed, 70 insertions(+), 100 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 11b4db1761bb02..4ae202185f711b 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -3418,23 +3418,20 @@ template static __global__ void mul_mat_q4_0( const int mmq_x = MMQ_X_Q4_0_AMPERE; const int mmq_y = MMQ_Y_Q4_0_AMPERE; const int nwarps = NWARPS_Q4_0_AMPERE; - - mul_mat_q, - load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); - #elif __CUDA_ARCH__ >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q4_0_PASCAL; const int mmq_y = MMQ_Y_Q4_0_PASCAL; const int nwarps = NWARPS_Q4_0_PASCAL; - - mul_mat_q, - load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else - (void) vec_dot_q4_0_q8_1_mul_mat; + const int mmq_x = -1; + const int mmq_y = -1; + const int nwarps = -1; assert(false); #endif // __CUDA_ARCH__ >= CC_TURING + + mul_mat_q, + load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); } #define MMQ_X_Q4_1_AMPERE 64 @@ -3457,23 +3454,20 @@ template static __global__ void const int mmq_x = MMQ_X_Q4_1_AMPERE; const int mmq_y = MMQ_Y_Q4_1_AMPERE; const int nwarps = NWARPS_Q4_1_AMPERE; - - mul_mat_q, - load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); - #elif __CUDA_ARCH__ >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q4_1_PASCAL; const int mmq_y = MMQ_Y_Q4_1_PASCAL; const int nwarps = NWARPS_Q4_1_PASCAL; - - mul_mat_q, - load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else - (void) vec_dot_q4_1_q8_1_mul_mat; + const int mmq_x = -1; + const int mmq_y = -1; + const int nwarps = -1; assert(false); #endif // __CUDA_ARCH__ >= CC_TURING + + mul_mat_q, + load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); } #define MMQ_X_Q5_0_AMPERE 128 @@ -3492,23 +3486,20 @@ template static __global__ void mul_mat_q5_0( const int mmq_x = MMQ_X_Q5_0_AMPERE; const int mmq_y = MMQ_Y_Q5_0_AMPERE; const int nwarps = NWARPS_Q5_0_AMPERE; - - mul_mat_q, - load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); - #elif __CUDA_ARCH__ >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q5_0_PASCAL; const int mmq_y = MMQ_Y_Q5_0_PASCAL; const int nwarps = NWARPS_Q5_0_PASCAL; - - mul_mat_q, - load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else - (void) vec_dot_q5_0_q8_1_mul_mat; + const int mmq_x = -1; + const int mmq_y = -1; + const int nwarps = -1; assert(false); #endif // __CUDA_ARCH__ >= CC_TURING + + mul_mat_q, + load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); } #define MMQ_X_Q5_1_AMPERE 128 @@ -3527,23 +3518,20 @@ template static __global__ void mul_mat_q5_1( const int mmq_x = MMQ_X_Q5_1_AMPERE; const int mmq_y = MMQ_Y_Q5_1_AMPERE; const int nwarps = NWARPS_Q5_1_AMPERE; - - mul_mat_q, - load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); - #elif __CUDA_ARCH__ >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q5_1_PASCAL; const int mmq_y = MMQ_Y_Q5_1_PASCAL; const int nwarps = NWARPS_Q5_1_PASCAL; - - mul_mat_q, - load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else - (void) vec_dot_q5_1_q8_1_mul_mat; + const int mmq_x = -1; + const int mmq_y = -1; + const int nwarps = -1; assert(false); #endif // __CUDA_ARCH__ >= CC_TURING + + mul_mat_q, + load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); } #define MMQ_X_Q8_0_AMPERE 128 @@ -3562,23 +3550,20 @@ template static __global__ void mul_mat_q8_0( const int mmq_x = MMQ_X_Q8_0_AMPERE; const int mmq_y = MMQ_Y_Q8_0_AMPERE; const int nwarps = NWARPS_Q8_0_AMPERE; - - mul_mat_q, - load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); - #elif __CUDA_ARCH__ >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q8_0_PASCAL; const int mmq_y = MMQ_Y_Q8_0_PASCAL; const int nwarps = NWARPS_Q8_0_PASCAL; - - mul_mat_q, - load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else - (void) vec_dot_q8_0_q8_1_mul_mat; + const int mmq_x = -1; + const int mmq_y = -1; + const int nwarps = -1; assert(false); #endif // __CUDA_ARCH__ >= CC_TURING + + mul_mat_q, + load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); } #define MMQ_X_Q2_K_AMPERE 64 @@ -3597,23 +3582,20 @@ template static __global__ void mul_mat_q2_K( const int mmq_x = MMQ_X_Q2_K_AMPERE; const int mmq_y = MMQ_Y_Q2_K_AMPERE; const int nwarps = NWARPS_Q2_K_AMPERE; - - mul_mat_q, - load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); - #elif __CUDA_ARCH__ >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q2_K_PASCAL; const int mmq_y = MMQ_Y_Q2_K_PASCAL; const int nwarps = NWARPS_Q2_K_PASCAL; - - mul_mat_q, - load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else - (void) vec_dot_q2_K_q8_1_mul_mat; + const int mmq_x = -1; + const int mmq_y = -1; + const int nwarps = -1; assert(false); #endif // __CUDA_ARCH__ >= CC_TURING + + mul_mat_q, + load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); } #define MMQ_X_Q3_K_AMPERE 128 @@ -3636,23 +3618,20 @@ template static __global__ void const int mmq_x = MMQ_X_Q3_K_AMPERE; const int mmq_y = MMQ_Y_Q3_K_AMPERE; const int nwarps = NWARPS_Q3_K_AMPERE; - - mul_mat_q, - load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); - #elif __CUDA_ARCH__ >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q3_K_PASCAL; const int mmq_y = MMQ_Y_Q3_K_PASCAL; const int nwarps = NWARPS_Q3_K_PASCAL; - - mul_mat_q, - load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else - (void) vec_dot_q3_K_q8_1_mul_mat; + const int mmq_x = -1; + const int mmq_y = -1; + const int nwarps = -1; assert(false); #endif // __CUDA_ARCH__ >= CC_TURING + + mul_mat_q, + load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); } #define MMQ_X_Q4_K_AMPERE 64 @@ -3675,23 +3654,20 @@ template static __global__ void const int mmq_x = MMQ_X_Q4_K_AMPERE; const int mmq_y = MMQ_Y_Q4_K_AMPERE; const int nwarps = NWARPS_Q4_K_AMPERE; - - mul_mat_q, - load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); - #elif __CUDA_ARCH__ >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q4_K_PASCAL; const int mmq_y = MMQ_Y_Q4_K_PASCAL; const int nwarps = NWARPS_Q4_K_PASCAL; - - mul_mat_q, - load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else - (void) vec_dot_q4_K_q8_1_mul_mat; + const int mmq_x = -1; + const int mmq_y = -1; + const int nwarps = -1; assert(false); #endif // __CUDA_ARCH__ >= CC_TURING + + mul_mat_q, + load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); } #define MMQ_X_Q5_K_AMPERE 64 @@ -3710,23 +3686,20 @@ template static __global__ void mul_mat_q5_K( const int mmq_x = MMQ_X_Q5_K_AMPERE; const int mmq_y = MMQ_Y_Q5_K_AMPERE; const int nwarps = NWARPS_Q5_K_AMPERE; - - mul_mat_q, - load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); - #elif __CUDA_ARCH__ >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q5_K_PASCAL; const int mmq_y = MMQ_Y_Q5_K_PASCAL; const int nwarps = NWARPS_Q5_K_PASCAL; - - mul_mat_q, - load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else - (void) vec_dot_q5_K_q8_1_mul_mat; + const int mmq_x = -1; + const int mmq_y = -1; + const int nwarps = -1; assert(false); #endif // __CUDA_ARCH__ >= CC_TURING + + mul_mat_q, + load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); } #define MMQ_X_Q6_K_AMPERE 64 @@ -3749,23 +3722,20 @@ template static __global__ void const int mmq_x = MMQ_X_Q6_K_AMPERE; const int mmq_y = MMQ_Y_Q6_K_AMPERE; const int nwarps = NWARPS_Q6_K_AMPERE; - - mul_mat_q, - load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); - #elif __CUDA_ARCH__ >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q6_K_PASCAL; const int mmq_y = MMQ_Y_Q6_K_PASCAL; const int nwarps = NWARPS_Q6_K_PASCAL; - - mul_mat_q, - load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else - (void) vec_dot_q6_K_q8_1_mul_mat; + const int mmq_x = -1; + const int mmq_y = -1; + const int nwarps = -1; assert(false); #endif // __CUDA_ARCH__ >= CC_TURING + + mul_mat_q, + load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); } template