From 12fb1c58ec679f45c8fb44548cdcf16639ff8525 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel
Date: Sun, 26 Nov 2023 22:20:18 -0500
Subject: [PATCH 1/3] cuda : tweak mm stride to double perf on P40 + GTX 970

---
 ggml-cuda.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 5b80e4ae31329..fc82f80d93749 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -467,7 +467,7 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
 #define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
 #endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
 
-#define MUL_MAT_SRC1_COL_STRIDE 128
+#define MUL_MAT_SRC1_COL_STRIDE 4096
 
 #define MAX_STREAMS 8
 static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } };

From dd71a35cc85192a2c9b5153b06cc99e10652032d Mon Sep 17 00:00:00 2001
From: Jared Van Bortel
Date: Mon, 27 Nov 2023 13:05:55 -0500
Subject: [PATCH 2/3] make MUL_MAT_SRC1_COL_STRIDE conditional on runtime mmq

---
 ggml-cuda.cu | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index fc82f80d93749..1bd1cbda3be9b 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -467,7 +467,8 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
 #define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
 #endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
 
-#define MUL_MAT_SRC1_COL_STRIDE 4096
+#define MUL_MAT_SRC1_COL_STRIDE_MMQ 128
+#define MUL_MAT_SRC1_COL_STRIDE 4096
 
 #define MAX_STREAMS 8
 static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } };
@@ -7158,7 +7159,10 @@ static void ggml_cuda_op_mul_mat(
         CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
     }
 
-    const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
+    const int64_t src1_col_stride = !split || used_devices == 1 ? ne11 :
+                                    convert_src1_to_q8_1        ? MUL_MAT_SRC1_COL_STRIDE_MMQ :
+                                                                  MUL_MAT_SRC1_COL_STRIDE;
+
     for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
         const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
         const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
@@ -7296,7 +7300,7 @@ static void ggml_cuda_op_mul_mat(
 
     // main device waits for all other devices to be finished
     if (split && g_device_count > 1) {
-        int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
+        int64_t is_max = (ne11 + src1_col_stride - 1) / src1_col_stride;
         is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
 
         CUDA_CHECK(ggml_cuda_set_device(g_main_device));

From 6272b6764ad4b39eb05a411489577d3a6c914b89 Mon Sep 17 00:00:00 2001
From: Jared Van Bortel
Date: Mon, 27 Nov 2023 13:09:14 -0500
Subject: [PATCH 3/3] use stride=128 if built for tensor cores

---
 ggml-cuda.cu | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 1bd1cbda3be9b..51a79b19f09ea 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -468,7 +468,12 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
 #endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
 
 #define MUL_MAT_SRC1_COL_STRIDE_MMQ 128
-#define MUL_MAT_SRC1_COL_STRIDE 4096
+
+#ifdef CUDA_USE_TENSOR_CORES
+#define MUL_MAT_SRC1_COL_STRIDE 128
+#else
+#define MUL_MAT_SRC1_COL_STRIDE 4096
+#endif
 
 #define MAX_STREAMS 8
 static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } };