move CUDA reduction kernels to common entirely
upsj committed Jul 27, 2021
1 parent 085f40f commit 28b3ca7
Showing 1 changed file with 0 additions and 127 deletions.
127 changes: 0 additions & 127 deletions cuda/matrix/dense_kernels.cu
@@ -117,133 +117,6 @@ void apply(std::shared_ptr<const CudaExecutor> exec,
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);


template <typename ValueType>
void compute_dot(std::shared_ptr<const CudaExecutor> exec,
                 const matrix::Dense<ValueType> *x,
                 const matrix::Dense<ValueType> *y,
                 matrix::Dense<ValueType> *result)
{
    if (cublas::is_supported<ValueType>::value) {
        // TODO: write a custom kernel which does this more efficiently
        for (size_type col = 0; col < x->get_size()[1]; ++col) {
            cublas::dot(exec->get_cublas_handle(), x->get_size()[0],
                        x->get_const_values() + col, x->get_stride(),
                        y->get_const_values() + col, y->get_stride(),
                        result->get_values() + col);
        }
    } else {
        // TODO: these are tuning parameters obtained experimentally, once
        // we decide how to handle this uniformly, they should be modified
        // appropriately
        constexpr auto work_per_thread = 32;
        constexpr auto block_size = 1024;

        constexpr auto work_per_block = work_per_thread * block_size;
        const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block);
        const dim3 block_dim{config::warp_size, 1,
                             block_size / config::warp_size};
        Array<ValueType> work(exec, grid_dim.x);
        // TODO: write a kernel which does this more efficiently
        for (size_type col = 0; col < x->get_size()[1]; ++col) {
            kernel::compute_partial_dot<block_size><<<grid_dim, block_dim>>>(
                x->get_size()[0], as_cuda_type(x->get_const_values() + col),
                x->get_stride(), as_cuda_type(y->get_const_values() + col),
                y->get_stride(), as_cuda_type(work.get_data()));
            kernel::finalize_sum_reduce_computation<block_size>
                <<<1, block_dim>>>(grid_dim.x,
                                   as_cuda_type(work.get_const_data()),
                                   as_cuda_type(result->get_values() + col));
        }
    }
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
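
Note: the fallback branch above is an instance of the classic two-pass GPU reduction: a first kernel reduces each block's grid-stride slice of the column to one partial result in a work array of grid_dim.x entries, and a second, single-block kernel reduces those partials to the final value. Below is a minimal, self-contained sketch of that pattern. The names (partial_dot, finalize_sum), the flat 1D block of 256 threads, and the plain shared-memory tree reduction are illustrative assumptions; the moved Ginkgo kernels use a 3D warp_size-based block shape and the library's reduction helpers instead.

#include <cstdio>
#include <cuda_runtime.h>

constexpr int block_size = 256;

// Pass 1: each block reduces its grid-stride slice of x[i] * y[i]
// to a single partial sum in work[blockIdx.x].
__global__ void partial_dot(int n, const double *x, const double *y,
                            double *work)
{
    __shared__ double tmp[block_size];
    double sum = 0.0;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += gridDim.x * blockDim.x) {
        sum += x[i] * y[i];
    }
    tmp[threadIdx.x] = sum;
    __syncthreads();
    // In-block tree reduction (block_size is a power of two).
    for (int s = block_size / 2; s > 0; s /= 2) {
        if (threadIdx.x < s) {
            tmp[threadIdx.x] += tmp[threadIdx.x + s];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0) {
        work[blockIdx.x] = tmp[0];
    }
}

// Pass 2: a single block reduces the per-block partial sums into *result.
__global__ void finalize_sum(int num_parts, const double *work, double *result)
{
    __shared__ double tmp[block_size];
    double sum = 0.0;
    for (int i = threadIdx.x; i < num_parts; i += block_size) {
        sum += work[i];
    }
    tmp[threadIdx.x] = sum;
    __syncthreads();
    for (int s = block_size / 2; s > 0; s /= 2) {
        if (threadIdx.x < s) {
            tmp[threadIdx.x] += tmp[threadIdx.x + s];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0) {
        *result = tmp[0];
    }
}

int main()
{
    const int n = 1 << 20;
    const int num_blocks = 64;
    double *x, *y, *work, *result;
    cudaMallocManaged(&x, n * sizeof(double));
    cudaMallocManaged(&y, n * sizeof(double));
    cudaMallocManaged(&work, num_blocks * sizeof(double));
    cudaMallocManaged(&result, sizeof(double));
    for (int i = 0; i < n; ++i) {
        x[i] = 1.0;
        y[i] = 2.0;
    }
    // Same launch structure as the Ginkgo code above: a grid-wide partial
    // reduction followed by one single-block finalize launch.
    partial_dot<<<num_blocks, block_size>>>(n, x, y, work);
    finalize_sum<<<1, block_size>>>(num_blocks, work, result);
    cudaDeviceSynchronize();
    printf("dot = %f (expected %f)\n", *result, 2.0 * n);
    return 0;
}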


template <typename ValueType>
void compute_conj_dot(std::shared_ptr<const CudaExecutor> exec,
                      const matrix::Dense<ValueType> *x,
                      const matrix::Dense<ValueType> *y,
                      matrix::Dense<ValueType> *result)
{
    if (cublas::is_supported<ValueType>::value) {
        // TODO: write a custom kernel which does this more efficiently
        for (size_type col = 0; col < x->get_size()[1]; ++col) {
            cublas::conj_dot(exec->get_cublas_handle(), x->get_size()[0],
                             x->get_const_values() + col, x->get_stride(),
                             y->get_const_values() + col, y->get_stride(),
                             result->get_values() + col);
        }
    } else {
        // TODO: these are tuning parameters obtained experimentally, once
        // we decide how to handle this uniformly, they should be modified
        // appropriately
        constexpr auto work_per_thread = 32;
        constexpr auto block_size = 1024;

        constexpr auto work_per_block = work_per_thread * block_size;
        const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block);
        const dim3 block_dim{config::warp_size, 1,
                             block_size / config::warp_size};
        Array<ValueType> work(exec, grid_dim.x);
        // TODO: write a kernel which does this more efficiently
        for (size_type col = 0; col < x->get_size()[1]; ++col) {
            kernel::compute_partial_conj_dot<block_size>
                <<<grid_dim, block_dim>>>(
                    x->get_size()[0], as_cuda_type(x->get_const_values() + col),
                    x->get_stride(), as_cuda_type(y->get_const_values() + col),
                    y->get_stride(), as_cuda_type(work.get_data()));
            kernel::finalize_sum_reduce_computation<block_size>
                <<<1, block_dim>>>(grid_dim.x,
                                   as_cuda_type(work.get_const_data()),
                                   as_cuda_type(result->get_values() + col));
        }
    }
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
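
Note: compute_conj_dot differs from compute_dot only in the first pass, where the left factor is conjugated before multiplying. A hypothetical sketch of that pass using CUDA's cuComplex helpers follows; the real Ginkgo kernels are templated over the library's value types, and this kernel would pair with a complex-valued version of the finalize_sum sketch above.

#include <cuComplex.h>

constexpr int block_size = 256;

// First pass of the conjugated dot: each block accumulates
// conj(x[i]) * y[i] over its grid-stride slice, structurally
// identical to the partial_dot sketch above.
__global__ void partial_conj_dot(int n, const cuDoubleComplex *x,
                                 const cuDoubleComplex *y,
                                 cuDoubleComplex *work)
{
    __shared__ cuDoubleComplex tmp[block_size];
    cuDoubleComplex sum = make_cuDoubleComplex(0.0, 0.0);
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += gridDim.x * blockDim.x) {
        // The only difference from the plain dot: conjugate the left factor.
        sum = cuCadd(sum, cuCmul(cuConj(x[i]), y[i]));
    }
    tmp[threadIdx.x] = sum;
    __syncthreads();
    for (int s = block_size / 2; s > 0; s /= 2) {
        if (threadIdx.x < s) {
            tmp[threadIdx.x] = cuCadd(tmp[threadIdx.x], tmp[threadIdx.x + s]);
        }
        __syncthreads();
    }
    if (threadIdx.x == 0) {
        work[blockIdx.x] = tmp[0];
    }
}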


template <typename ValueType>
void compute_norm2(std::shared_ptr<const CudaExecutor> exec,
                   const matrix::Dense<ValueType> *x,
                   matrix::Dense<remove_complex<ValueType>> *result)
{
    if (cublas::is_supported<ValueType>::value) {
        for (size_type col = 0; col < x->get_size()[1]; ++col) {
            cublas::norm2(exec->get_cublas_handle(), x->get_size()[0],
                          x->get_const_values() + col, x->get_stride(),
                          result->get_values() + col);
        }
    } else {
        using norm_type = remove_complex<ValueType>;
        // TODO: these are tuning parameters obtained experimentally, once
        // we decide how to handle this uniformly, they should be modified
        // appropriately
        constexpr auto work_per_thread = 32;
        constexpr auto block_size = 1024;

        constexpr auto work_per_block = work_per_thread * block_size;
        const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block);
        const dim3 block_dim{config::warp_size, 1,
                             block_size / config::warp_size};
        Array<norm_type> work(exec, grid_dim.x);
        // TODO: write a kernel which does this more efficiently
        for (size_type col = 0; col < x->get_size()[1]; ++col) {
            kernel::compute_partial_norm2<block_size><<<grid_dim, block_dim>>>(
                x->get_size()[0], as_cuda_type(x->get_const_values() + col),
                x->get_stride(), as_cuda_type(work.get_data()));
            kernel::finalize_sqrt_reduce_computation<block_size>
                <<<1, block_dim>>>(grid_dim.x,
                                   as_cuda_type(work.get_const_data()),
                                   as_cuda_type(result->get_values() + col));
        }
    }
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
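
Note: compute_norm2 reuses the same two-pass structure, except that the first pass accumulates squared magnitudes and, as the name finalize_sqrt_reduce_computation suggests, the second pass applies a square root after summing. A hypothetical sketch of that finalize step, matching the finalize_sum sketch above (kernel name and block size are assumptions):

#include <cmath>

constexpr int block_size = 256;

// Single-block finalize for the 2-norm: sum the per-block partial sums of
// squared magnitudes, then let thread 0 take the square root at the end.
__global__ void finalize_sqrt_sum(int num_parts, const double *work,
                                  double *result)
{
    __shared__ double tmp[block_size];
    double sum = 0.0;
    // The grid is a single block, so threads stride over all partials.
    for (int i = threadIdx.x; i < num_parts; i += block_size) {
        sum += work[i];
    }
    tmp[threadIdx.x] = sum;
    __syncthreads();
    for (int s = block_size / 2; s > 0; s /= 2) {
        if (threadIdx.x < s) {
            tmp[threadIdx.x] += tmp[threadIdx.x + s];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0) {
        *result = sqrt(tmp[0]);
    }
}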


template <typename ValueType, typename IndexType>
void convert_to_coo(std::shared_ptr<const CudaExecutor> exec,
                    const matrix::Dense<ValueType> *source,
