Skip the multi-GPU event record/wait in ggml_cuda_op_mul_mat when only one
device is present, and make the main device wait only on the streams that can
have been used for this src1 instead of all MAX_STREAMS.

Review note: the number of streams to wait on is rounded UP. With truncating
division, ne11 < MUL_MAT_SRC1_COL_STRIDE yields 0 (the main device would wait on
nothing), and a trailing partial chunk of columns would be skipped. This assumes
src1 is processed in chunks of MUL_MAT_SRC1_COL_STRIDE columns with one stream
per chunk, as the is_max expression implies; confirm against the chunk loop,
which lies outside these hunks. The post-image blob id on the index line
predates this correction.

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index edc10afd3131fc..f75d1a28e317ae 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -5975,7 +5975,7 @@ static void ggml_cuda_op_mul_mat(
 
     // if multiple devices are used they need to wait for the main device
     // here an event is recorded that signals that the main device has finished calculating the input data
-    if (split) {
+    if (split && g_device_count > 1) {
         CUDA_CHECK(cudaSetDevice(g_main_device));
         CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
     }
@@ -6114,10 +6114,14 @@ static void ggml_cuda_op_mul_mat(
     }
 
     // main device waits for all other devices to be finished
-    if (split) {
+    if (split && g_device_count > 1) {
+        // round up so that a trailing partial chunk of src1 columns still has its stream waited on
+        int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
+        is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
+
         CUDA_CHECK(cudaSetDevice(g_main_device));
         for (int64_t id = 0; id < g_device_count; ++id) {
-            for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+            for (int64_t is = 0; is < is_max; ++is) {
                 CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is]));
             }
         }