diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu index 1f6e5a2ab16..0f01216da52 100644 --- a/cuda/matrix/dense_kernels.cu +++ b/cuda/matrix/dense_kernels.cu @@ -117,133 +117,6 @@ void apply(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); -template -void compute_dot(std::shared_ptr exec, - const matrix::Dense *x, - const matrix::Dense *y, - matrix::Dense *result) -{ - if (cublas::is_supported::value) { - // TODO: write a custom kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - cublas::dot(exec->get_cublas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); - } - } else { - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr auto work_per_thread = 32; - constexpr auto block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_dot<<>>( - x->get_size()[0], as_cuda_type(x->get_const_values() + col), - x->get_stride(), as_cuda_type(y->get_const_values() + col), - y->get_stride(), as_cuda_type(work.get_data())); - kernel::finalize_sum_reduce_computation - <<<1, block_dim>>>(grid_dim.x, - as_cuda_type(work.get_const_data()), - as_cuda_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); - - -template -void compute_conj_dot(std::shared_ptr exec, - const matrix::Dense *x, - const matrix::Dense *y, - matrix::Dense *result) -{ - if (cublas::is_supported::value) { - // TODO: write a custom kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - cublas::conj_dot(exec->get_cublas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); - } - } else { - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr auto work_per_thread = 32; - constexpr auto block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_conj_dot - <<>>( - x->get_size()[0], as_cuda_type(x->get_const_values() + col), - x->get_stride(), as_cuda_type(y->get_const_values() + col), - y->get_stride(), as_cuda_type(work.get_data())); - kernel::finalize_sum_reduce_computation - <<<1, block_dim>>>(grid_dim.x, - as_cuda_type(work.get_const_data()), - as_cuda_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); - - -template -void compute_norm2(std::shared_ptr exec, - const matrix::Dense *x, - matrix::Dense> *result) -{ - if (cublas::is_supported::value) { - for (size_type col = 0; col < x->get_size()[1]; ++col) { - cublas::norm2(exec->get_cublas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - result->get_values() + col); - } - } else { - using norm_type = remove_complex; - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr auto work_per_thread = 32; - constexpr auto block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_norm2<<>>( - x->get_size()[0], as_cuda_type(x->get_const_values() + col), - x->get_stride(), as_cuda_type(work.get_data())); - kernel::finalize_sqrt_reduce_computation - <<<1, block_dim>>>(grid_dim.x, - as_cuda_type(work.get_const_data()), - as_cuda_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); - - template void convert_to_coo(std::shared_ptr exec, const matrix::Dense *source,