Simple kernel reduction #833

Merged · 25 commits · Oct 13, 2021
4 changes: 2 additions & 2 deletions .github/bot-pr-format-base.sh
@@ -3,8 +3,8 @@
 source .github/bot-pr-base.sh
 
 EXTENSION_REGEX='\.(cuh?|hpp|hpp\.inc?|cpp)$'
-FORMAT_HEADER_REGEX='^(benchmark|core|cuda|hip|include/ginkgo/core|omp|reference|dpcpp)/'
-FORMAT_REGEX='^(common|examples|test)/'
+FORMAT_HEADER_REGEX='^(benchmark|core|cuda|hip|include/ginkgo/core|omp|reference|dpcpp|common/unified)/'
+FORMAT_REGEX='^(common/cuda_hip|examples|test)/'
 
 echo "Retrieving PR file list"
 PR_FILES=$(bot_get_all_changed_files ${PR_URL})
7 changes: 4 additions & 3 deletions common/cuda_hip/components/reduction.hpp.inc
@@ -208,14 +208,15 @@ __device__ void reduce_array(size_type size,
  *
  * Computes a reduction using the add operation (+) on an array
  * `source` of any size. Has to be called a second time on `result` to reduce
- * an array larger than `default_block_size`.
+ * an array larger than `default_reduce_block_size`.
  */
 template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void reduce_add_array(
+__global__ __launch_bounds__(default_reduce_block_size) void reduce_add_array(
     size_type size, const ValueType* __restrict__ source,
     ValueType* __restrict__ result)
 {
-    __shared__ UninitializedArray<ValueType, default_block_size> block_sum;
+    __shared__ UninitializedArray<ValueType, default_reduce_block_size>
+        block_sum;
     reduce_array(size, source, static_cast<ValueType*>(block_sum),
                  [](const ValueType& x, const ValueType& y) { return x + y; });
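The doc comment above describes the intended two-pass usage: the first launch leaves one partial sum per block in its output, and a second launch over those partials produces the final value. A minimal host-side sketch of that calling pattern, assuming the first grid is capped at `default_reduce_block_size` blocks so the second pass fits in a single block; the driver name and the cap are illustrative, not Ginkgo's exact API:

```cpp
// Illustrative two-pass driver; everything except reduce_add_array and
// default_reduce_block_size is an assumption. Relies on gko::size_type and
// gko::ceildiv from the surrounding codebase, plus std::min from <algorithm>.
template <typename ValueType>
void reduce_add_array_twice(size_type size, const ValueType* source,
                            ValueType* tmp, ValueType* result)
{
    // Cap the grid so that the partial sums fit into one block in pass 2.
    auto num_blocks = std::min<size_type>(
        ceildiv(size, default_reduce_block_size), default_reduce_block_size);
    // Pass 1: each block writes one partial sum into tmp[block_id].
    reduce_add_array<<<num_blocks, default_reduce_block_size>>>(size, source,
                                                                tmp);
    // Pass 2: a single block reduces the num_blocks partials into result[0].
    reduce_add_array<<<1, default_reduce_block_size>>>(num_blocks, tmp, result);
}
```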
15 changes: 7 additions & 8 deletions common/unified/base/kernel_launch.hpp
@@ -170,14 +170,13 @@ namespace GKO_DEVICE_NAMESPACE {
 template <typename ValueType>
 struct matrix_accessor {
     ValueType* data;
-    size_type stride;
+    int64 stride;
 
     /**
      * @internal
      * Returns a reference to the element at position (row, col).
      */
-    GKO_INLINE GKO_ATTRIBUTES ValueType& operator()(size_type row,
-                                                    size_type col)
+    GKO_INLINE GKO_ATTRIBUTES ValueType& operator()(int64 row, int64 col)
     {
         return data[row * stride + col];
     }
@@ -187,7 +186,7 @@ struct matrix_accessor {
      * Returns a reference to the element at position idx in the underlying
      * storage.
      */
-    GKO_INLINE GKO_ATTRIBUTES ValueType& operator[](size_type idx)
+    GKO_INLINE GKO_ATTRIBUTES ValueType& operator[](int64 idx)
     {
         return data[idx];
     }
@@ -223,7 +222,8 @@ struct to_device_type_impl<matrix::Dense<ValueType>*&> {
     using type = matrix_accessor<device_type<ValueType>>;
     static type map_to_device(matrix::Dense<ValueType>* mtx)
     {
-        return {as_device_type(mtx->get_values()), mtx->get_stride()};
+        return {as_device_type(mtx->get_values()),
+                static_cast<int64>(mtx->get_stride())};
     }
 };

@@ -232,7 +232,8 @@ struct to_device_type_impl<const matrix::Dense<ValueType>*&> {
     using type = matrix_accessor<const device_type<ValueType>>;
     static type map_to_device(const matrix::Dense<ValueType>* mtx)
     {
-        return {as_device_type(mtx->get_const_values()), mtx->get_stride()};
+        return {as_device_type(mtx->get_const_values()),
+                static_cast<int64>(mtx->get_stride())};
     }
 };

@@ -267,8 +268,6 @@ typename to_device_type_impl<T>::type map_to_device(T&& param)
 }  // namespace gko
 
 
-// these files include this file again to make inclusion work from both sides,
-// this does not lead to issues due to the header guards.
 #if defined(GKO_COMPILING_CUDA)
 #include "cuda/base/kernel_launch.cuh"
 #elif defined(GKO_COMPILING_HIP)
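Both `to_device_type_impl` specializations above now cast `Dense::get_stride()`, which still returns the unsigned `size_type`, down to the signed `int64` that `matrix_accessor` stores. A standalone sketch of what the accessor computes, with simplified stand-in types (illustrative only, not the exact Ginkgo declarations):

```cpp
#include <cstdint>

// Simplified stand-in for the matrix_accessor above, for illustration only.
template <typename ValueType>
struct accessor_sketch {
    ValueType* data;
    std::int64_t stride;  // distance in elements between consecutive rows

    // Element (row, col) of a row-major matrix whose rows are padded.
    ValueType& operator()(std::int64_t row, std::int64_t col)
    {
        return data[row * stride + col];
    }
};

int main()
{
    // A 2x2 matrix stored with stride 3: rows begin at offsets 0 and 3.
    double buf[] = {1.0, 2.0, -1.0, 3.0, 4.0, -1.0};
    accessor_sketch<double> acc{buf, 3};
    return acc(1, 1) == 4.0 ? 0 : 1;  // reads buf[1 * 3 + 1]
}
```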
51 changes: 51 additions & 0 deletions common/unified/base/kernel_launch_reduction.hpp
@@ -0,0 +1,51 @@
/*******************************<GINKGO LICENSE>******************************
Copyright (c) 2017-2021, the Ginkgo authors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
******************************<GINKGO LICENSE>*******************************/

#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_
#define GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_


#include "common/unified/base/kernel_launch.hpp"


#if defined(GKO_COMPILING_CUDA)
#include "cuda/base/kernel_launch_reduction.cuh"
#elif defined(GKO_COMPILING_HIP)
#include "hip/base/kernel_launch_reduction.hip.hpp"
#elif defined(GKO_COMPILING_DPCPP)
#include "dpcpp/base/kernel_launch_reduction.dp.hpp"
#elif defined(GKO_COMPILING_OMP)
#include "omp/base/kernel_launch_reduction.hpp"
#endif


#endif // GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_
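This header is a compile-time dispatch point in the same style as `kernel_launch.hpp`: a unified kernel file includes it once and picks up the implementation matching whichever `GKO_COMPILING_*` macro is defined. A hedged sketch of a consumer, assuming a 1D `run_kernel_reduction` overload analogous to the `run_kernel_col_reduction` calls in `dense_kernels.cpp` below; the file and kernel names are made up for illustration:

```cpp
// Hypothetical consumer, e.g. a common/unified/.../sum_kernels.cpp.
#include "common/unified/base/kernel_launch_reduction.hpp"


namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {


// Sums an array into result[0] on whichever backend this file is built for.
template <typename ValueType>
void sum_array(std::shared_ptr<const DefaultExecutor> exec,
               const ValueType* data, size_type size, ValueType* result)
{
    run_kernel_reduction(
        exec, [] GKO_KERNEL(auto i, auto data) { return data[i]; },
        [] GKO_KERNEL(auto a, auto b) { return a + b; },
        [] GKO_KERNEL(auto a) { return a; }, ValueType{}, result, size, data);
}


}  // namespace GKO_DEVICE_NAMESPACE
}  // namespace kernels
}  // namespace gko
```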
6 changes: 3 additions & 3 deletions common/unified/base/kernel_launch_solver.hpp
@@ -63,7 +63,7 @@ struct default_stride_dense_wrapper {
 template <typename T>
 struct device_unpack_solver_impl {
     using type = T;
-    static GKO_INLINE GKO_ATTRIBUTES type unpack(T param, size_type)
+    static GKO_INLINE GKO_ATTRIBUTES type unpack(T param, int64)
     {
         return param;
     }
@@ -72,8 +72,8 @@ struct device_unpack_solver_impl {
 template <typename ValueType>
 struct device_unpack_solver_impl<default_stride_dense_wrapper<ValueType>> {
     using type = matrix_accessor<ValueType>;
-    static GKO_INLINE GKO_ATTRIBUTES type unpack(
-        default_stride_dense_wrapper<ValueType> param, size_type default_stride)
+    static GKO_INLINE GKO_ATTRIBUTES type
+    unpack(default_stride_dense_wrapper<ValueType> param, int64 default_stride)
     {
         return {param.data, default_stride};
     }
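`default_stride_dense_wrapper` carries a Dense operand whose stride equals the solver-wide default, so the stride is dropped on the host and re-attached, now as an `int64`, when the kernel unpacks its arguments. A rough illustration of that round trip with simplified stand-in types (not the exact Ginkgo declarations):

```cpp
#include <cstdint>

using int64 = std::int64_t;

// Host side: the stride is omitted because it equals the common default.
template <typename ValueType>
struct wrapper_sketch {
    ValueType* data;
};

// Device side: the accessor needs the stride back to index (row, col).
template <typename ValueType>
struct accessor_sketch {
    ValueType* data;
    int64 stride;
};

// unpack re-injects the shared default stride when building the accessor.
template <typename ValueType>
accessor_sketch<ValueType> unpack_sketch(wrapper_sketch<ValueType> param,
                                         int64 default_stride)
{
    return {param.data, default_stride};
}
```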
55 changes: 55 additions & 0 deletions common/unified/matrix/dense_kernels.cpp
@@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "common/unified/base/kernel_launch.hpp"
+#include "common/unified/base/kernel_launch_reduction.hpp"
 
 
 namespace gko {
@@ -220,6 +221,60 @@ void sub_scaled_diag(std::shared_ptr<const DefaultExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
 
 
+template <typename ValueType>
+void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
+                 const matrix::Dense<ValueType>* x,
+                 const matrix::Dense<ValueType>* y,
+                 matrix::Dense<ValueType>* result)
+{
+    run_kernel_col_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto x, auto y) {
+            return x(i, j) * y(i, j);
+        },
+        [] GKO_KERNEL(auto a, auto b) { return a + b; },
+        [] GKO_KERNEL(auto a) { return a; }, ValueType{}, result->get_values(),
+        x->get_size(), x, y);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
+
+
+template <typename ValueType>
+void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
+                      const matrix::Dense<ValueType>* x,
+                      const matrix::Dense<ValueType>* y,
+                      matrix::Dense<ValueType>* result)
+{
+    run_kernel_col_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto x, auto y) {
+            return conj(x(i, j)) * y(i, j);
+        },
+        [] GKO_KERNEL(auto a, auto b) { return a + b; },
+        [] GKO_KERNEL(auto a) { return a; }, ValueType{}, result->get_values(),
+        x->get_size(), x, y);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
+
+
+template <typename ValueType>
+void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
+                   const matrix::Dense<ValueType>* x,
+                   matrix::Dense<remove_complex<ValueType>>* result)
+{
+    run_kernel_col_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto x) { return squared_norm(x(i, j)); },
+        [] GKO_KERNEL(auto a, auto b) { return a + b; },
+        [] GKO_KERNEL(auto a) { return sqrt(a); }, remove_complex<ValueType>{},
+        result->get_values(), x->get_size(), x);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
+
+
 template <typename ValueType, typename IndexType>
 void symm_permute(std::shared_ptr<const DefaultExecutor> exec,
                   const Array<IndexType>* permutation_indices,
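All three kernels share one shape: an entry-wise operation, an associative combine, and a finalize step applied once per column, so `result` receives one value per right-hand side. A sequential reference sketch of the semantics that `run_kernel_col_reduction` provides (illustrative only; the real backends reduce in parallel):

```cpp
#include <cstdint>

// result[col] = finalize(reduce over all rows of op(row, col)).
template <typename Op, typename Reduce, typename Finalize, typename ValueType>
void col_reduction_reference(Op op, Reduce reduce, Finalize finalize,
                             ValueType identity, ValueType* result,
                             std::int64_t num_rows, std::int64_t num_cols)
{
    for (std::int64_t col = 0; col < num_cols; ++col) {
        auto partial = identity;  // e.g. ValueType{} for the sums above
        for (std::int64_t row = 0; row < num_rows; ++row) {
            partial = reduce(partial, op(row, col));
        }
        result[col] = finalize(partial);  // e.g. sqrt for compute_norm2
    }
}
```

For `compute_norm2` this evaluates to sqrt(sum_i |x(i, j)|^2) for each column j, i.e. one Euclidean norm per column.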
15 changes: 8 additions & 7 deletions cuda/base/kernel_launch.cuh
@@ -51,9 +51,9 @@ constexpr int default_block_size = 512;
 
 template <typename KernelFunction, typename... KernelArgs>
 __global__ __launch_bounds__(default_block_size) void generic_kernel_1d(
-    size_type size, KernelFunction fn, KernelArgs... args)
+    int64 size, KernelFunction fn, KernelArgs... args)
 {
-    auto tidx = thread::get_thread_id_flat();
+    auto tidx = thread::get_thread_id_flat<int64>();
     if (tidx >= size) {
         return;
     }
@@ -63,9 +63,9 @@ __global__ __launch_bounds__(default_block_size) void generic_kernel_1d(
 
 template <typename KernelFunction, typename... KernelArgs>
 __global__ __launch_bounds__(default_block_size) void generic_kernel_2d(
-    size_type rows, size_type cols, KernelFunction fn, KernelArgs... args)
+    int64 rows, int64 cols, KernelFunction fn, KernelArgs... args)
 {
-    auto tidx = thread::get_thread_id_flat();
+    auto tidx = thread::get_thread_id_flat<int64>();
     auto col = tidx % cols;
     auto row = tidx / cols;
     if (row >= rows) {
@@ -82,7 +82,7 @@ void run_kernel(std::shared_ptr<const CudaExecutor> exec, KernelFunction fn,
     gko::cuda::device_guard guard{exec->get_device_id()};
     constexpr auto block_size = default_block_size;
     auto num_blocks = ceildiv(size, block_size);
-    generic_kernel_1d<<<num_blocks, block_size>>>(size, fn,
+    generic_kernel_1d<<<num_blocks, block_size>>>(static_cast<int64>(size), fn,
                                                   map_to_device(args)...);
 }

@@ -93,8 +93,9 @@ void run_kernel(std::shared_ptr<const CudaExecutor> exec, KernelFunction fn,
     gko::cuda::device_guard guard{exec->get_device_id()};
     constexpr auto block_size = default_block_size;
     auto num_blocks = ceildiv(size[0] * size[1], block_size);
-    generic_kernel_2d<<<num_blocks, block_size>>>(size[0], size[1], fn,
-                                                  map_to_device(args)...);
+    generic_kernel_2d<<<num_blocks, block_size>>>(static_cast<int64>(size[0]),
+                                                  static_cast<int64>(size[1]),
+                                                  fn, map_to_device(args)...);
 }


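Both overloads round the grid size up with `ceildiv`, so the last block generally contains surplus threads; the `tidx >= size` and `row >= rows` guards in the kernels above make those threads return immediately, while the `static_cast<int64>` calls keep the index arithmetic signed end to end. A small standalone sketch of the launch math (illustrative):

```cpp
#include <cstdint>

// Same semantics as gko::ceildiv: the smallest q with q * den >= num.
std::int64_t ceildiv_sketch(std::int64_t num, std::int64_t den)
{
    return (num + den - 1) / den;
}

int main()
{
    const std::int64_t size = 1000;       // total work items, e.g. rows * cols
    const std::int64_t block_size = 512;  // default_block_size in this file
    auto num_blocks = ceildiv_sketch(size, block_size);  // 2 blocks = 1024 threads
    // Threads 1000..1023 fail the tidx < size guard and exit immediately.
    return num_blocks == 2 ? 0 : 1;
}
```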