Skip to content

Commit

Permalink
retorder loops
Browse files Browse the repository at this point in the history
  • Loading branch information
JohannesGaessler committed Sep 11, 2023
1 parent 54da1a2 commit 844f4fd
Showing 1 changed file with 16 additions and 16 deletions.
32 changes: 16 additions & 16 deletions ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5985,26 +5985,26 @@ static void ggml_cuda_op_mul_mat(
const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;

for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
const int64_t i03 = i0 / ne12;
const int64_t i02 = i0 % ne12;
for (int64_t id = 0; id < g_device_count; ++id) {
if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
continue;
}

for (int64_t id = 0; id < g_device_count; ++id) {
if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
continue;
}
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
const int64_t row_diff = row_high[id] - row_low[id];

const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
const int64_t row_diff = row_high[id] - row_low[id];
cudaSetDevice(id);
const cudaStream_t stream = g_cudaStreams[id][is];

cudaSetDevice(id);
const cudaStream_t stream = g_cudaStreams[id][is];
// wait for main GPU data if necessary
if (split && (id != g_main_device || is != 0)) {
CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0]));
}

// wait for main GPU data if necessary
if (split && (id != g_main_device || is != 0)) {
CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0]));
}
for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
const int64_t i03 = i0 / ne12;
const int64_t i02 = i0 % ne12;

const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;

Expand Down

0 comments on commit 844f4fd

Please sign in to comment.