diff --git a/.github/bot-pr-format-base.sh b/.github/bot-pr-format-base.sh index e72539a9d61..10be68353b9 100644 --- a/.github/bot-pr-format-base.sh +++ b/.github/bot-pr-format-base.sh @@ -3,8 +3,8 @@ source .github/bot-pr-base.sh EXTENSION_REGEX='\.(cuh?|hpp|hpp\.inc?|cpp)$' -FORMAT_HEADER_REGEX='^(benchmark|core|cuda|hip|include/ginkgo/core|omp|reference|dpcpp)/' -FORMAT_REGEX='^(common|examples|test)/' +FORMAT_HEADER_REGEX='^(benchmark|core|cuda|hip|include/ginkgo/core|omp|reference|dpcpp|common/unified)/' +FORMAT_REGEX='^(common/cuda_hip|examples|test)/' echo "Retrieving PR file list" PR_FILES=$(bot_get_all_changed_files ${PR_URL}) diff --git a/common/cuda_hip/components/reduction.hpp.inc b/common/cuda_hip/components/reduction.hpp.inc index 9b4ed4cc8c7..3853fca6d44 100644 --- a/common/cuda_hip/components/reduction.hpp.inc +++ b/common/cuda_hip/components/reduction.hpp.inc @@ -208,14 +208,15 @@ __device__ void reduce_array(size_type size, * * Computes a reduction using the add operation (+) on an array * `source` of any size. Has to be called a second time on `result` to reduce - * an array larger than `default_block_size`. + * an array larger than `default_reduce_block_size`. */ template -__global__ __launch_bounds__(default_block_size) void reduce_add_array( +__global__ __launch_bounds__(default_reduce_block_size) void reduce_add_array( size_type size, const ValueType* __restrict__ source, ValueType* __restrict__ result) { - __shared__ UninitializedArray block_sum; + __shared__ UninitializedArray + block_sum; reduce_array(size, source, static_cast(block_sum), [](const ValueType& x, const ValueType& y) { return x + y; }); diff --git a/common/unified/base/kernel_launch.hpp b/common/unified/base/kernel_launch.hpp index 6b3a698768c..0e25671c58a 100644 --- a/common/unified/base/kernel_launch.hpp +++ b/common/unified/base/kernel_launch.hpp @@ -170,14 +170,13 @@ namespace GKO_DEVICE_NAMESPACE { template struct matrix_accessor { ValueType* data; - size_type stride; + int64 stride; /** * @internal * Returns a reference to the element at position (row, col). */ - GKO_INLINE GKO_ATTRIBUTES ValueType& operator()(size_type row, - size_type col) + GKO_INLINE GKO_ATTRIBUTES ValueType& operator()(int64 row, int64 col) { return data[row * stride + col]; } @@ -187,7 +186,7 @@ struct matrix_accessor { * Returns a reference to the element at position idx in the underlying * storage. */ - GKO_INLINE GKO_ATTRIBUTES ValueType& operator[](size_type idx) + GKO_INLINE GKO_ATTRIBUTES ValueType& operator[](int64 idx) { return data[idx]; } @@ -223,7 +222,8 @@ struct to_device_type_impl*&> { using type = matrix_accessor>; static type map_to_device(matrix::Dense* mtx) { - return {as_device_type(mtx->get_values()), mtx->get_stride()}; + return {as_device_type(mtx->get_values()), + static_cast(mtx->get_stride())}; } }; @@ -232,7 +232,8 @@ struct to_device_type_impl*&> { using type = matrix_accessor>; static type map_to_device(const matrix::Dense* mtx) { - return {as_device_type(mtx->get_const_values()), mtx->get_stride()}; + return {as_device_type(mtx->get_const_values()), + static_cast(mtx->get_stride())}; } }; @@ -267,8 +268,6 @@ typename to_device_type_impl::type map_to_device(T&& param) } // namespace gko -// these files include this file again to make inclusion work from both sides, -// this does not lead to issues due to the header guards. 
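// Illustrative usage sketch (hypothetical caller, not part of this change):
// run_kernel passes every argument through map_to_device, so a Dense matrix
// arrives in the device lambda as a matrix_accessor with an int64 stride and
// is indexed as x(row, col) with int64 row/column indices, matching the
// backend-agnostic kernels in common/unified (e.g. dense_kernels.cpp below).
//
//     template <typename ValueType>
//     void scale_columns(std::shared_ptr<const DefaultExecutor> exec,
//                        const matrix::Dense<ValueType>* alpha,
//                        matrix::Dense<ValueType>* x)
//     {
//         run_kernel(
//             exec,
//             [] GKO_KERNEL(auto row, auto col, auto alpha, auto x) {
//                 // row and col are int64, x is a matrix_accessor
//                 x(row, col) *= alpha[col];
//             },
//             x->get_size(), alpha->get_const_values(), x);
//     }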
#if defined(GKO_COMPILING_CUDA) #include "cuda/base/kernel_launch.cuh" #elif defined(GKO_COMPILING_HIP) diff --git a/common/unified/base/kernel_launch_reduction.hpp b/common/unified/base/kernel_launch_reduction.hpp new file mode 100644 index 00000000000..9eb65216416 --- /dev/null +++ b/common/unified/base/kernel_launch_reduction.hpp @@ -0,0 +1,51 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ +#define GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ + + +#include "common/unified/base/kernel_launch.hpp" + + +#if defined(GKO_COMPILING_CUDA) +#include "cuda/base/kernel_launch_reduction.cuh" +#elif defined(GKO_COMPILING_HIP) +#include "hip/base/kernel_launch_reduction.hip.hpp" +#elif defined(GKO_COMPILING_DPCPP) +#include "dpcpp/base/kernel_launch_reduction.dp.hpp" +#elif defined(GKO_COMPILING_OMP) +#include "omp/base/kernel_launch_reduction.hpp" +#endif + + +#endif // GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ diff --git a/common/unified/base/kernel_launch_solver.hpp b/common/unified/base/kernel_launch_solver.hpp index 6c8a1296b83..716bd94a093 100644 --- a/common/unified/base/kernel_launch_solver.hpp +++ b/common/unified/base/kernel_launch_solver.hpp @@ -63,7 +63,7 @@ struct default_stride_dense_wrapper { template struct device_unpack_solver_impl { using type = T; - static GKO_INLINE GKO_ATTRIBUTES type unpack(T param, size_type) + static GKO_INLINE GKO_ATTRIBUTES type unpack(T param, int64) { return param; } @@ -72,8 +72,8 @@ struct device_unpack_solver_impl { template struct device_unpack_solver_impl> { using type = matrix_accessor; - static GKO_INLINE GKO_ATTRIBUTES type unpack( - default_stride_dense_wrapper param, size_type default_stride) + static GKO_INLINE GKO_ATTRIBUTES type + unpack(default_stride_dense_wrapper param, int64 default_stride) { return {param.data, default_stride}; } diff --git a/common/unified/matrix/dense_kernels.cpp b/common/unified/matrix/dense_kernels.cpp index a06d8e1eef2..a3e90576ced 100644 --- a/common/unified/matrix/dense_kernels.cpp +++ b/common/unified/matrix/dense_kernels.cpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" namespace gko { @@ -220,6 +221,60 @@ void sub_scaled_diag(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL); +template +void compute_dot(std::shared_ptr exec, + const matrix::Dense* x, + const matrix::Dense* y, + matrix::Dense* result) +{ + run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto x, auto y) { + return x(i, j) * y(i, j); + }, + [] GKO_KERNEL(auto a, auto b) { return a + b; }, + [] GKO_KERNEL(auto a) { return a; }, ValueType{}, result->get_values(), + x->get_size(), x, y); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); + + +template +void compute_conj_dot(std::shared_ptr exec, + const matrix::Dense* x, + const matrix::Dense* y, + matrix::Dense* result) +{ + run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto x, auto y) { + return conj(x(i, j)) * y(i, j); + }, + [] GKO_KERNEL(auto a, auto b) { return a + b; }, + [] GKO_KERNEL(auto a) { return a; }, ValueType{}, result->get_values(), + x->get_size(), x, y); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); + + +template +void compute_norm2(std::shared_ptr exec, + const matrix::Dense* x, + matrix::Dense>* result) +{ + run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto x) { return squared_norm(x(i, j)); }, + [] GKO_KERNEL(auto a, auto b) { return a + b; }, + [] GKO_KERNEL(auto a) { return sqrt(a); }, remove_complex{}, + result->get_values(), x->get_size(), x); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); + + template void symm_permute(std::shared_ptr exec, const Array* permutation_indices, diff --git a/cuda/base/kernel_launch.cuh b/cuda/base/kernel_launch.cuh index d55faed5053..5179a5cc27d 100644 --- a/cuda/base/kernel_launch.cuh +++ b/cuda/base/kernel_launch.cuh @@ -51,9 +51,9 @@ constexpr int default_block_size = 512; template __global__ __launch_bounds__(default_block_size) void generic_kernel_1d( - size_type size, KernelFunction fn, KernelArgs... args) + int64 size, KernelFunction fn, KernelArgs... args) { - auto tidx = thread::get_thread_id_flat(); + auto tidx = thread::get_thread_id_flat(); if (tidx >= size) { return; } @@ -63,9 +63,9 @@ __global__ __launch_bounds__(default_block_size) void generic_kernel_1d( template __global__ __launch_bounds__(default_block_size) void generic_kernel_2d( - size_type rows, size_type cols, KernelFunction fn, KernelArgs... args) + int64 rows, int64 cols, KernelFunction fn, KernelArgs... 
args) { - auto tidx = thread::get_thread_id_flat(); + auto tidx = thread::get_thread_id_flat(); auto col = tidx % cols; auto row = tidx / cols; if (row >= rows) { @@ -82,7 +82,7 @@ void run_kernel(std::shared_ptr exec, KernelFunction fn, gko::cuda::device_guard guard{exec->get_device_id()}; constexpr auto block_size = default_block_size; auto num_blocks = ceildiv(size, block_size); - generic_kernel_1d<<>>(size, fn, + generic_kernel_1d<<>>(static_cast(size), fn, map_to_device(args)...); } @@ -93,8 +93,9 @@ void run_kernel(std::shared_ptr exec, KernelFunction fn, gko::cuda::device_guard guard{exec->get_device_id()}; constexpr auto block_size = default_block_size; auto num_blocks = ceildiv(size[0] * size[1], block_size); - generic_kernel_2d<<>>(size[0], size[1], fn, - map_to_device(args)...); + generic_kernel_2d<<>>(static_cast(size[0]), + static_cast(size[1]), + fn, map_to_device(args)...); } diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh new file mode 100644 index 00000000000..30f7fa1ba96 --- /dev/null +++ b/cuda/base/kernel_launch_reduction.cuh @@ -0,0 +1,529 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ +#error \ + "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp" +#endif + + +#include "core/synthesizer/implementation_selection.hpp" +#include "cuda/base/device_guard.hpp" +#include "cuda/base/types.hpp" +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/reduction.cuh" +#include "cuda/components/thread_ids.cuh" + + +namespace gko { +namespace kernels { +namespace cuda { + + +template +__global__ __launch_bounds__( + default_block_size) void generic_kernel_reduction_1d(int64 size, + KernelFunction fn, + ReductionOp op, + FinalizeOp finalize, + ValueType identity, + ValueType* storage, + KernelArgs... 
args) +{ + __shared__ + UninitializedArray + warp_partial; + static_assert(default_block_size / config::warp_size <= config::warp_size, + "needs third reduction level"); + auto tidx = thread::get_thread_id_flat(); + auto grid_size = thread::get_thread_num_flat(); + auto warp = + group::tiled_partition(group::this_thread_block()); + auto partial = identity; + for (int64 i = tidx; i < size; i += grid_size) { + partial = op(partial, fn(i, args...)); + } + partial = reduce(warp, partial, op); + if (warp.thread_rank() == 0) { + warp_partial[threadIdx.x / config::warp_size] = partial; + } + __syncthreads(); + if (threadIdx.x < config::warp_size) { + partial = reduce(warp, + threadIdx.x < default_block_size / config::warp_size + ? warp_partial[threadIdx.x] + : identity, + op); + if (threadIdx.x == 0) { + storage[blockIdx.x] = finalize(partial); + } + } +} + + +template +__global__ __launch_bounds__( + default_block_size) void generic_kernel_reduction_2d(int64 rows, int64 cols, + KernelFunction fn, + ReductionOp op, + FinalizeOp finalize, + ValueType identity, + ValueType* storage, + KernelArgs... args) +{ + __shared__ + UninitializedArray + warp_partial; + static_assert(default_block_size / config::warp_size <= config::warp_size, + "needs third reduction level"); + auto tidx = thread::get_thread_id_flat(); + auto grid_size = thread::get_thread_num_flat(); + auto warp = + group::tiled_partition(group::this_thread_block()); + auto partial = identity; + for (int64 i = tidx; i < rows * cols; i += grid_size) { + const auto row = i / cols; + const auto col = i % cols; + partial = op(partial, fn(row, col, args...)); + } + partial = reduce(warp, partial, op); + if (warp.thread_rank() == 0) { + warp_partial[threadIdx.x / config::warp_size] = partial; + } + __syncthreads(); + if (threadIdx.x < config::warp_size) { + partial = reduce(warp, + threadIdx.x < default_block_size / config::warp_size + ? warp_partial[threadIdx.x] + : identity, + op); + if (threadIdx.x == 0) { + storage[blockIdx.x] = finalize(partial); + } + } +} + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type size, + KernelArgs&&... args) +{ + constexpr int oversubscription = 16; + gko::cuda::device_guard guard{exec->get_device_id()}; + constexpr auto block_size = default_block_size; + const auto num_blocks = std::min( + ceildiv(size, block_size), exec->get_num_warps() * oversubscription); + if (num_blocks > 1) { + Array partial{exec, static_cast(num_blocks)}; + generic_kernel_reduction_1d<<>>( + static_cast(size), fn, op, + [] __device__(auto v) { return v; }, as_cuda_type(identity), + as_cuda_type(partial.get_data()), map_to_device(args)...); + generic_kernel_reduction_1d<<<1, block_size>>>( + static_cast(num_blocks), + [] __device__(auto i, auto v) { return v[i]; }, op, finalize, + as_cuda_type(identity), as_cuda_type(result), + as_cuda_type(partial.get_const_data())); + } else { + generic_kernel_reduction_1d<<<1, block_size>>>( + static_cast(size), fn, op, finalize, as_cuda_type(identity), + as_cuda_type(result), map_to_device(args)...); + } +} + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, KernelArgs&&... 
args) +{ + constexpr int oversubscription = 16; + gko::cuda::device_guard guard{exec->get_device_id()}; + constexpr auto block_size = default_block_size; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_blocks = + std::min(ceildiv(rows * cols, block_size), + exec->get_num_warps() * oversubscription); + if (num_blocks > 1) { + Array partial{exec, static_cast(num_blocks)}; + generic_kernel_reduction_2d<<>>( + rows, cols, fn, op, [] __device__(auto v) { return v; }, + as_cuda_type(identity), as_cuda_type(partial.get_data()), + map_to_device(args)...); + generic_kernel_reduction_1d<<<1, block_size>>>( + static_cast(num_blocks), + [] __device__(auto i, auto v) { return v[i]; }, op, finalize, + as_cuda_type(identity), as_cuda_type(result), + as_cuda_type(partial.get_const_data())); + } else { + generic_kernel_reduction_2d<<<1, block_size>>>( + rows, cols, fn, op, finalize, as_cuda_type(identity), + as_cuda_type(result), map_to_device(args)...); + } +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( + int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, ValueType identity, + ValueType* result, int64 result_stride, KernelArgs... args) +{ + const auto idx = thread::get_subwarp_id_flat(); + const auto row = idx % rows; + const auto col_block = idx / rows; + if (col_block >= col_blocks) { + return; + } + const auto cols_per_part = + ceildiv(ceildiv(cols, subwarp_size), col_blocks) * subwarp_size; + const auto begin = cols_per_part * col_block; + const auto end = min(begin + cols_per_part, cols); + auto subwarp = + group::tiled_partition(group::this_thread_block()); + auto partial = identity; + for (auto col = begin + subwarp.thread_rank(); col < end; + col += subwarp_size) { + partial = op(partial, fn(row, col, args...)); + } + partial = reduce(subwarp, partial, op); + if (subwarp.thread_rank() == 0) { + result[(row + col_block * rows) * result_stride] = finalize(partial); + } +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, ValueType* result, + KernelArgs... 
args) +{ + constexpr auto warp_size = config::warp_size; + constexpr auto warps_per_block = default_block_size / warp_size; + // stores the subwarp_size partial sums from each warp, grouped by warp + constexpr auto shared_storage = warps_per_block * subwarp_size; + __shared__ UninitializedArray block_partial; + const auto subwarp_id = thread::get_subwarp_id_flat(); + const auto local_warp_id = threadIdx.x / warp_size; + const auto local_subwarp_id = threadIdx.x % warp_size / subwarp_size; + const auto subwarp_num = + thread::get_subwarp_num_flat(); + const auto block = group::this_thread_block(); + const auto warp = group::tiled_partition(block); + const auto warp_rank = warp.thread_rank(); + const auto subwarp_rank = warp_rank % subwarp_size; + const auto col = static_cast(subwarp_rank); + auto partial = identity; + // accumulate within a thread + if (col < cols) { + for (auto row = subwarp_id; row < rows; row += subwarp_num) { + partial = op(partial, fn(row, col, args...)); + } + } + // accumulate between all subwarps in the warp +#pragma unroll + for (unsigned i = subwarp_size; i < warp_size; i *= 2) { + partial = op(partial, warp.shfl_xor(partial, i)); + } // store the result to shared memory + if (local_subwarp_id == 0) { + block_partial[local_warp_id * subwarp_size + subwarp_rank] = partial; + } + block.sync(); + // in a single thread: accumulate the results + if (local_warp_id == 0) { + partial = identity; + // accumulate the partial results within a thread + if (shared_storage >= warp_size) { +#pragma unroll + for (int i = 0; i < shared_storage; i += warp_size) { + partial = op(partial, block_partial[i + warp_rank]); + } + } else if (warp_rank < shared_storage) { + partial = op(partial, block_partial[warp_rank]); + } + // accumulate between all subwarps in the warp +#pragma unroll + for (unsigned i = subwarp_size; i < warp_size; i *= 2) { + partial = op(partial, warp.shfl_xor(partial, i)); + } + if (warp_rank < cols) { + result[warp_rank + blockIdx.x * cols] = finalize(partial); + } + } +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, ValueType* result, + KernelArgs... 
args) +{ + constexpr auto warp_size = config::warp_size; + __shared__ UninitializedArray block_partial; + const auto warp_id = thread::get_subwarp_id_flat(); + const auto warp_num = thread::get_subwarp_num_flat(); + const auto block = group::this_thread_block(); + const auto warp = group::tiled_partition(block); + const auto warp_rank = warp.thread_rank(); + const auto col = warp_rank + static_cast(blockIdx.y) * warp_size; + auto partial = identity; + // accumulate within a thread + if (col < cols) { + for (auto row = warp_id; row < rows; row += warp_num) { + partial = op(partial, fn(row, col, args...)); + } + } + block_partial[threadIdx.x] = partial; + block.sync(); + // in a single warp: accumulate the results + if (threadIdx.x < warp_size) { + partial = identity; + // accumulate the partial results within a thread +#pragma unroll + for (int i = 0; i < default_block_size; i += warp_size) { + partial = op(partial, block_partial[i + warp_rank]); + } + if (col < cols) { + result[col + blockIdx.x * cols] = finalize(partial); + } + } +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( + int64 num_results, int64 num_blocks, ReductionOp op, + FinalizeOp finalize, ValueType identity, const ValueType* input, + int64 result_stride, ValueType* result) +{ + const auto idx = thread::get_thread_id_flat(); + if (idx >= num_results) { + return; + } + auto partial = identity; + for (int64 block = 0; block < num_blocks; block++) { + partial = op(partial, input[idx + block * num_results]); + } + result[idx * result_stride] = finalize(partial); +} + + +namespace { + + +template +void run_generic_kernel_row_reduction(syn::value_list, + int64 rows, int64 cols, int64 col_blocks, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, int64 result_stride, + KernelArgs... args) +{ + const auto num_blocks = + ceildiv(rows * col_blocks * subwarp_size, default_block_size); + generic_kernel_row_reduction_2d + <<>>( + rows, cols, col_blocks, fn, op, finalize, as_cuda_type(identity), + as_cuda_type(result), result_stride, args...); +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, + run_generic_kernel_row_reduction) + + +template +void run_generic_col_reduction_small(syn::value_list, + int64 max_blocks, + std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + MappedKernelArgs... 
args) +{ + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_blocks = std::min( + ceildiv(rows * subwarp_size, default_block_size), max_blocks); + if (num_blocks <= 1) { + generic_kernel_col_reduction_2d_small + <<<1, default_block_size>>>(rows, cols, fn, op, finalize, + as_cuda_type(identity), + as_cuda_type(result), args...); + } else { + Array tmp_storage{exec, + static_cast(num_blocks * cols)}; + generic_kernel_col_reduction_2d_small + <<>>( + rows, cols, fn, op, [] __device__(auto v) { return v; }, + as_cuda_type(identity), as_cuda_type(tmp_storage.get_data()), + args...); + generic_kernel_reduction_finalize_2d<<< + ceildiv(cols, default_block_size), default_block_size>>>( + cols, num_blocks, op, finalize, as_cuda_type(identity), + as_cuda_type(tmp_storage.get_const_data()), 1, + as_cuda_type(result)); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_col_reduction_small, + run_generic_col_reduction_small); + + +} // namespace + + +template +void run_kernel_row_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type result_stride, + dim<2> size, KernelArgs&&... args) +{ + using subwarp_sizes = + syn::value_list; + constexpr int oversubscription = 16; + gko::cuda::device_guard guard{exec->get_device_id()}; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto resources = + exec->get_num_warps() * config::warp_size * oversubscription; + if (rows * cols > resources && rows < cols) { + const auto col_blocks = ceildiv(rows * cols, resources); + Array partial{exec, + static_cast(col_blocks * rows)}; + const auto num_blocks = + ceildiv(rows * col_blocks * config::warp_size, default_block_size); + generic_kernel_row_reduction_2d + <<>>( + rows, cols, col_blocks, fn, op, + [] __device__(auto v) { return v; }, as_cuda_type(identity), + as_cuda_type(partial.get_data()), 1, map_to_device(args)...); + const auto num_finalize_blocks = ceildiv(rows, default_block_size); + generic_kernel_reduction_finalize_2d<<>>( + rows, col_blocks, op, finalize, as_cuda_type(identity), + as_cuda_type(partial.get_const_data()), + static_cast(result_stride), as_cuda_type(result)); + } else { + select_run_generic_kernel_row_reduction( + subwarp_sizes(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= cols || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), rows, cols, 1, fn, op, + finalize, identity, result, static_cast(result_stride), + map_to_device(args)...); + } +} + + +template +void run_kernel_col_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + KernelArgs&&... 
args) +{ + using subwarp_sizes = + syn::value_list; + constexpr int oversubscription = 16; + gko::cuda::device_guard guard{exec->get_device_id()}; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto max_blocks = exec->get_num_warps() * config::warp_size * + oversubscription / default_block_size; + if (cols <= config::warp_size) { + select_generic_col_reduction_small( + subwarp_sizes(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= cols || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), max_blocks, exec, fn, + op, finalize, identity, result, size, map_to_device(args)...); + } else { + const auto col_blocks = ceildiv(cols, config::warp_size); + const auto row_blocks = + ceildiv(std::min( + ceildiv(rows * config::warp_size, default_block_size), + max_blocks), + col_blocks); + if (row_blocks <= 1) { + generic_kernel_col_reduction_2d_blocked<<>>( + rows, cols, fn, op, finalize, as_cuda_type(identity), + as_cuda_type(result), map_to_device(args)...); + } else { + Array tmp_storage{ + exec, static_cast(row_blocks * cols)}; + generic_kernel_col_reduction_2d_blocked<<< + dim3(row_blocks, col_blocks), default_block_size>>>( + rows, cols, fn, op, [] __device__(auto v) { return v; }, + as_cuda_type(identity), as_cuda_type(tmp_storage.get_data()), + map_to_device(args)...); + generic_kernel_reduction_finalize_2d<<< + ceildiv(cols, default_block_size), default_block_size>>>( + cols, row_blocks, op, finalize, as_cuda_type(identity), + as_cuda_type(tmp_storage.get_const_data()), 1, + as_cuda_type(result)); + } + } +} + + +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/base/kernel_launch_solver.cuh b/cuda/base/kernel_launch_solver.cuh index bf2f6e1a995..f4da60ddede 100644 --- a/cuda/base/kernel_launch_solver.cuh +++ b/cuda/base/kernel_launch_solver.cuh @@ -43,10 +43,10 @@ namespace cuda { template __global__ __launch_bounds__(default_block_size) void generic_kernel_2d_solver( - size_type rows, size_type cols, size_type default_stride, KernelFunction fn, + int64 rows, int64 cols, int64 default_stride, KernelFunction fn, KernelArgs... args) { - auto tidx = thread::get_thread_id_flat(); + auto tidx = thread::get_thread_id_flat(); auto col = tidx % cols; auto row = tidx / cols; if (row >= rows) { @@ -66,7 +66,8 @@ void run_kernel_solver(std::shared_ptr exec, constexpr auto block_size = default_block_size; auto num_blocks = ceildiv(size[0] * size[1], block_size); generic_kernel_2d_solver<<>>( - size[0], size[1], default_stride, fn, map_to_device(args)...); + static_cast(size[0]), static_cast(size[1]), + static_cast(default_stride), fn, map_to_device(args)...); } diff --git a/cuda/components/reduction.cuh b/cuda/components/reduction.cuh index 95ac3d8a417..8e0a962145e 100644 --- a/cuda/components/reduction.cuh +++ b/cuda/components/reduction.cuh @@ -53,7 +53,7 @@ namespace kernels { namespace cuda { -constexpr int default_block_size = 512; +constexpr int default_reduce_block_size = 512; #include "common/cuda_hip/components/reduction.hpp.inc" @@ -75,13 +75,14 @@ __host__ ValueType reduce_add_array(std::shared_ptr exec, auto block_results_val = source; size_type grid_dim = size; auto block_results = Array(exec); - if (size > default_block_size) { - const auto n = ceildiv(size, default_block_size); - grid_dim = (n <= default_block_size) ? 
n : default_block_size; + if (size > default_reduce_block_size) { + const auto n = ceildiv(size, default_reduce_block_size); + grid_dim = + (n <= default_reduce_block_size) ? n : default_reduce_block_size; block_results.resize_and_reset(grid_dim); - reduce_add_array<<>>( + reduce_add_array<<>>( size, as_cuda_type(source), as_cuda_type(block_results.get_data())); block_results_val = block_results.get_const_data(); @@ -89,7 +90,7 @@ __host__ ValueType reduce_add_array(std::shared_ptr exec, auto d_result = Array(exec, 1); - reduce_add_array<<<1, default_block_size>>>( + reduce_add_array<<<1, default_reduce_block_size>>>( grid_dim, as_cuda_type(block_results_val), as_cuda_type(d_result.get_data())); auto answer = exec->copy_val_to_host(d_result.get_const_data()); diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu index 7a18bb06e39..2b8fb8157c6 100644 --- a/cuda/matrix/dense_kernels.cu +++ b/cuda/matrix/dense_kernels.cu @@ -117,133 +117,6 @@ void apply(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); -template -void compute_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (cublas::is_supported::value) { - // TODO: write a custom kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - cublas::dot(exec->get_cublas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); - } - } else { - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - constexpr int block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_dot<<>>( - x->get_size()[0], as_cuda_type(x->get_const_values() + col), - x->get_stride(), as_cuda_type(y->get_const_values() + col), - y->get_stride(), as_cuda_type(work.get_data())); - kernel::finalize_sum_reduce_computation - <<<1, block_dim>>>(grid_dim.x, - as_cuda_type(work.get_const_data()), - as_cuda_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); - - -template -void compute_conj_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (cublas::is_supported::value) { - // TODO: write a custom kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - cublas::conj_dot(exec->get_cublas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); - } - } else { - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - constexpr int block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / 
config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_conj_dot - <<>>( - x->get_size()[0], as_cuda_type(x->get_const_values() + col), - x->get_stride(), as_cuda_type(y->get_const_values() + col), - y->get_stride(), as_cuda_type(work.get_data())); - kernel::finalize_sum_reduce_computation - <<<1, block_dim>>>(grid_dim.x, - as_cuda_type(work.get_const_data()), - as_cuda_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); - - -template -void compute_norm2(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - if (cublas::is_supported::value) { - for (size_type col = 0; col < x->get_size()[1]; ++col) { - cublas::norm2(exec->get_cublas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - result->get_values() + col); - } - } else { - using norm_type = remove_complex; - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - constexpr int block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_norm2<<>>( - x->get_size()[0], as_cuda_type(x->get_const_values() + col), - x->get_stride(), as_cuda_type(work.get_data())); - kernel::finalize_sqrt_reduce_computation - <<<1, block_dim>>>(grid_dim.x, - as_cuda_type(work.get_const_data()), - as_cuda_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); - - template void convert_to_coo(std::shared_ptr exec, const matrix::Dense* source, diff --git a/cuda/test/base/kernel_launch.cu b/cuda/test/base/kernel_launch.cu index abd4775290c..e2f6583f930 100644 --- a/cuda/test/base/kernel_launch.cu +++ b/cuda/test/base/kernel_launch.cu @@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "common/unified/base/kernel_launch_reduction.hpp" #include "common/unified/base/kernel_launch_solver.hpp" #include "core/test/utils.hpp" @@ -54,6 +55,7 @@ namespace { using gko::dim; +using gko::int64; using gko::size_type; using std::is_same; @@ -104,7 +106,7 @@ void run1d(std::shared_ptr exec, size_type dim, int* data) gko::kernels::cuda::run_kernel( exec, [] GKO_KERNEL(auto i, auto d) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); d[i] = i; }, @@ -124,7 +126,7 @@ void run1d(std::shared_ptr exec, gko::Array& data) gko::kernels::cuda::run_kernel( exec, [] GKO_KERNEL(auto i, auto d, auto d_ptr) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); if (d == d_ptr) { @@ -149,7 +151,7 @@ void run1d(std::shared_ptr exec, gko::matrix::Dense<>* m) gko::kernels::cuda::run_kernel( exec, [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); @@ -185,8 +187,8 @@ void run2d(std::shared_ptr exec, int* data) gko::kernels::cuda::run_kernel( exec, [] GKO_KERNEL(auto i, auto j, auto d) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); d[i + 4 * j] = 4 * i + j; }, @@ -206,8 +208,8 @@ void run2d(std::shared_ptr exec, gko::Array& data) gko::kernels::cuda::run_kernel( exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); if (d == d_ptr) { @@ -234,7 +236,7 @@ void run2d(std::shared_ptr exec, gko::matrix::Dense<>* m1, exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3, auto d4, auto d2_ptr, auto d3_ptr) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); @@ -275,4 +277,195 @@ TEST_F(KernelLaunch, Runs2DDense) } +void run1d_reduction(std::shared_ptr exec) +{ + gko::Array output{exec, 1}; + + gko::kernels::cuda::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), + size_type{100000}, output); + + // 2 * sum i=0...99999 (i+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000LL); + + gko::kernels::cuda::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 2; + }, + int64{}, output.get_data(), size_type{100}, output); + + // 2 * sum i=0...99 (i+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100LL); +} + +TEST_F(KernelLaunch, 
Reduction1D) { run1d_reduction(exec); } + + +void run2d_reduction(std::shared_ptr exec) +{ + gko::Array output{exec, 1}; + + gko::kernels::cuda::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 4; + }, + int64{}, output.get_data(), gko::dim<2>{1000, 100}, output); + + // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000LL); + + gko::kernels::cuda::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 4; + }, + int64{}, output.get_data(), gko::dim<2>{10, 10}, output); + + // 4 * sum i=0...9 sum j=0...9 of (i+1)*(j+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100LL); +} + +TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } + + +void run2d_row_reduction(std::shared_ptr exec) +{ + for (auto num_rows : {0, 100, 1000, 10000}) { + for (auto num_cols : {0, 10, 100, 1000, 10000}) { + SCOPED_TRACE(std::to_string(num_rows) + " rows, " + + std::to_string(num_cols) + " cols"); + gko::Array host_ref{exec->get_master(), + static_cast(2 * num_rows)}; + std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); + gko::Array output{exec, host_ref}; + for (int64 i = 0; i < num_rows; i++) { + // we are computing 2 * sum {j=0, j(num_cols) * (num_cols + 1) * (i + 1); + } + + gko::kernels::cuda::run_kernel_row_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 2; + }, + int64{}, output.get_data(), 2, + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}, + output); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); + } + } +} + +TEST_F(KernelLaunch, ReductionRow2D) { run2d_row_reduction(exec); } + + +void run2d_col_reduction(std::shared_ptr exec) +{ + // empty, most threads idle, most threads busy, multiple blocks + for (auto num_rows : {0, 10, 100, 1000, 10000}) { + // check different edge cases: subwarp sizes, blocked mode + for (auto num_cols : + {0, 1, 2, 3, 4, 5, 7, 8, 9, 16, 31, 32, 63, 127, 128, 129}) { + SCOPED_TRACE(std::to_string(num_rows) + " rows, " + + std::to_string(num_cols) + " cols"); + gko::Array host_ref{exec->get_master(), + static_cast(num_cols)}; + gko::Array output{exec, static_cast(num_cols)}; + for (int64 i = 0; i < num_cols; i++) { + // we are computing 2 * sum {j=0, j(num_rows) * (num_rows + 1) * (i + 1); + } + + gko::kernels::cuda::run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + 
static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 2; + }, + int64{}, output.get_data(), + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}, + output); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); + } + } +} + +TEST_F(KernelLaunch, ReductionCol2D) { run2d_col_reduction(exec); } + + } // namespace diff --git a/dev_tools/scripts/config b/dev_tools/scripts/config index 696d886d489..a8b0cb3841a 100644 --- a/dev_tools/scripts/config +++ b/dev_tools/scripts/config @@ -30,6 +30,9 @@ - FixInclude: "common/unified/base/kernel_launch.hpp" - "(cuda|hip|dpcpp|omp)/base/kernel_launch_solver\." - FixInclude: "common/unified/base/kernel_launch_solver.hpp" +- "common/unified/.*.cpp" + - PathIgnore: "2" + - PathPrefix: "core" - "core/test/base/(extended_float|iterator_factory)" - RemoveTest: "true" - "core/test/base/allocator" diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp index 90ec1cc05fe..f215864e01d 100644 --- a/dpcpp/base/helper.hpp +++ b/dpcpp/base/helper.hpp @@ -166,7 +166,7 @@ bool validate(sycl::queue* queue, unsigned workgroup_size, * @return the first valid config */ template -std::uint32_t get_first_cfg(IterArr& arr, Validate verify) +std::uint32_t get_first_cfg(const IterArr& arr, Validate verify) { for (auto& cfg : arr) { if (verify(cfg)) { diff --git a/dpcpp/base/kernel_launch.dp.hpp b/dpcpp/base/kernel_launch.dp.hpp index 4fe161ff320..2b99e98ac36 100644 --- a/dpcpp/base/kernel_launch.dp.hpp +++ b/dpcpp/base/kernel_launch.dp.hpp @@ -45,25 +45,27 @@ namespace dpcpp { template -void generic_kernel_1d(sycl::handler& cgh, size_type size, KernelFunction fn, +void generic_kernel_1d(sycl::handler& cgh, int64 size, KernelFunction fn, KernelArgs... args) { - cgh.parallel_for(sycl::range<1>{size}, [=](sycl::id<1> idx_id) { - auto idx = static_cast(idx_id[0]); - fn(idx, args...); - }); + cgh.parallel_for(sycl::range<1>{static_cast(size)}, + [=](sycl::id<1> idx_id) { + auto idx = static_cast(idx_id[0]); + fn(idx, args...); + }); } template -void generic_kernel_2d(sycl::handler& cgh, size_type rows, size_type cols, +void generic_kernel_2d(sycl::handler& cgh, int64 rows, int64 cols, KernelFunction fn, KernelArgs... args) { - cgh.parallel_for(sycl::range<2>{rows, cols}, [=](sycl::id<2> idx) { - auto row = static_cast(idx[0]); - auto col = static_cast(idx[1]); - fn(row, col, args...); - }); + cgh.parallel_for(sycl::range<1>{static_cast(rows * cols)}, + [=](sycl::id<1> idx) { + auto row = static_cast(idx[0]) / cols; + auto col = static_cast(idx[0]) % cols; + fn(row, col, args...); + }); } @@ -72,7 +74,8 @@ void run_kernel(std::shared_ptr exec, KernelFunction fn, size_type size, KernelArgs&&... args) { exec->get_queue()->submit([&](sycl::handler& cgh) { - generic_kernel_1d(cgh, size, fn, map_to_device(args)...); + generic_kernel_1d(cgh, static_cast(size), fn, + map_to_device(args)...); }); } @@ -81,7 +84,9 @@ void run_kernel(std::shared_ptr exec, KernelFunction fn, dim<2> size, KernelArgs&&... 
args) { exec->get_queue()->submit([&](sycl::handler& cgh) { - generic_kernel_2d(cgh, size[0], size[1], fn, map_to_device(args)...); + generic_kernel_2d(cgh, static_cast(size[0]), + static_cast(size[1]), fn, + map_to_device(args)...); }); } diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp new file mode 100644 index 00000000000..5ebf06b0f71 --- /dev/null +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -0,0 +1,705 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ +#error \ + "This file can only be used from inside common/base/kernel_launch_reduction.hpp" +#endif + + +#include + + +#include "core/synthesizer/implementation_selection.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/components/cooperative_groups.dp.hpp" +#include "dpcpp/components/reduction.dp.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/components/uninitialized_array.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { + + +using KCFG_1D = ConfigSet<11, 7>; +constexpr auto kcfg_1d_list_simple_reduction = + syn::value_list(); + + +template +void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size, + int64 num_workgroups, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, + ValueType identity, ValueType* storage, + MappedKernelArgs... 
args) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + constexpr auto num_partials = wg_size / sg_size; + sycl::accessor, 0, + sycl::access_mode::read_write, sycl::access::target::local> + subgroup_partial_acc(cgh); + const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size)); + const auto global_size = num_workgroups * wg_size; + + cgh.parallel_for( + range, [= + ](sycl::nd_item<3> idx) [[intel::reqd_sub_group_size(sg_size)]] { + auto subgroup_partial = &(*subgroup_partial_acc.get_pointer())[0]; + const auto tidx = thread::get_thread_id_flat(idx); + const auto local_tidx = static_cast(tidx % wg_size); + auto subgroup = + group::tiled_partition(group::this_thread_block(idx)); + auto partial = identity; + for (int64 i = tidx; i < size; i += global_size) { + partial = op(partial, fn(i, args...)); + } + partial = ::gko::kernels::dpcpp::reduce(subgroup, partial, op); + if (subgroup.thread_rank() == 0) { + subgroup_partial[local_tidx / sg_size] = partial; + } + idx.barrier(sycl::access::fence_space::local_space); + if (local_tidx < sg_size) { + partial = identity; + for (int64 i = local_tidx; i < num_partials; i += sg_size) { + partial = op(partial, subgroup_partial[i]); + } + partial = ::gko::kernels::dpcpp::reduce(subgroup, partial, op); + if (subgroup.thread_rank() == 0) { + storage[tidx / wg_size] = finalize(partial); + } + } + }); +} + + +template +void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols, + int64 num_workgroups, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, + ValueType identity, ValueType* storage, + MappedKernelArgs... args) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + constexpr auto num_partials = wg_size / sg_size; + sycl::accessor, 0, + sycl::access_mode::read_write, sycl::access::target::local> + subgroup_partial_acc(cgh); + const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size)); + const auto global_size = num_workgroups * wg_size; + + cgh.parallel_for( + range, [= + ](sycl::nd_item<3> idx) [[intel::reqd_sub_group_size(sg_size)]] { + auto subgroup_partial = &(*subgroup_partial_acc.get_pointer())[0]; + const auto tidx = thread::get_thread_id_flat(idx); + const auto local_tidx = static_cast(tidx % wg_size); + auto subgroup = + group::tiled_partition(group::this_thread_block(idx)); + auto partial = identity; + for (int64 i = tidx; i < rows * cols; i += global_size) { + const auto row = i / cols; + const auto col = i % cols; + partial = op(partial, fn(row, col, args...)); + } + partial = ::gko::kernels::dpcpp::reduce(subgroup, partial, op); + if (subgroup.thread_rank() == 0) { + subgroup_partial[local_tidx / sg_size] = partial; + } + idx.barrier(sycl::access::fence_space::local_space); + if (local_tidx < sg_size) { + partial = identity; + for (int64 i = local_tidx; i < num_partials; i += sg_size) { + partial = op(partial, subgroup_partial[i]); + } + partial = ::gko::kernels::dpcpp::reduce(subgroup, partial, op); + if (subgroup.thread_rank() == 0) { + storage[tidx / wg_size] = finalize(partial); + } + } + }); +} + + +template +void run_kernel_reduction_impl(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type size, + MappedKernelArgs... 
args) +{ + constexpr int oversubscription = 4; + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + const auto num_workgroups = + std::min(ceildiv(size, wg_size), + exec->get_num_computing_units() * oversubscription); + auto queue = exec->get_queue(); + if (num_workgroups > 1) { + Array partial{exec, static_cast(num_workgroups)}; + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_1d( + cgh, static_cast(size), num_workgroups, fn, op, + [](auto v) { return v; }, identity, partial.get_data(), + args...); + }); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_1d( + cgh, static_cast(num_workgroups), 1, + [](auto i, auto v) { return v[i]; }, op, finalize, identity, + result, partial.get_const_data()); + }); + } else { + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_1d(cgh, static_cast(size), + num_workgroups, fn, op, finalize, + identity, result, args...); + }); + } +} + + +template +void run_kernel_reduction_impl(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + MappedKernelArgs... args) +{ + constexpr int oversubscription = 4; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto flat_size = rows * cols; + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + const auto num_workgroups = + std::min(ceildiv(flat_size, wg_size), + exec->get_num_computing_units() * oversubscription); + auto queue = exec->get_queue(); + if (num_workgroups > 1) { + Array partial{exec, static_cast(num_workgroups)}; + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_2d( + cgh, rows, cols, num_workgroups, fn, op, + [](auto v) { return v; }, identity, partial.get_data(), + args...); + }); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_1d( + cgh, static_cast(num_workgroups), 1, + [](auto i, auto v) { return v[i]; }, op, finalize, identity, + result, partial.get_const_data()); + }); + } else { + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_2d(cgh, rows, cols, num_workgroups, + fn, op, finalize, identity, result, + args...); + }); + } +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(select_run_kernel_reduction, + run_kernel_reduction_impl) + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, KernelArgs&&... args) +{ + const auto desired_cfg = get_first_cfg( + as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + select_run_kernel_reduction( + kcfg_1d_list_simple_reduction, + [&](std::uint32_t cfg) { return cfg == desired_cfg; }, + syn::value_list(), syn::value_list(), + syn::value_list(), syn::type_list<>(), exec, fn, op, + finalize, identity, result, size, map_to_device(args)...); +} + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type size, + KernelArgs&&... 
args) +{ + const auto desired_cfg = get_first_cfg( + as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + select_run_kernel_reduction( + kcfg_1d_list_simple_reduction, + [&](std::uint32_t cfg) { return cfg == desired_cfg; }, + syn::value_list(), syn::value_list(), + syn::value_list(), syn::type_list<>(), exec, fn, op, + finalize, identity, result, size, map_to_device(args)...); +} + + +namespace { + + +template +void generic_kernel_row_reduction_2d(syn::value_list, + std::shared_ptr exec, + int64 rows, int64 cols, int64 col_blocks, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, int64 result_stride, + MappedKernelArgs... args) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + static_assert(ssg_size <= sg_size, "ssg must be smaller than sg"); + const auto num_workgroups = ceildiv(rows * col_blocks * ssg_size, wg_size); + const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size)); + exec->get_queue()->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + range, [= + ](sycl::nd_item<3> id) [[intel::reqd_sub_group_size(sg_size)]] { + const auto idx = + thread::get_subwarp_id_flat(id); + const auto row = idx % rows; + const auto col_block = idx / rows; + auto partial = identity; + auto subgroup = group::tiled_partition( + group::this_thread_block(id)); + auto ssg_rank = + static_cast(subgroup.thread_rank() % ssg_size); + if (col_block < col_blocks) { + const auto cols_per_part = + ceildiv(ceildiv(cols, ssg_size), col_blocks) * ssg_size; + const auto begin = cols_per_part * col_block; + const auto end = min(begin + cols_per_part, cols); + for (auto col = begin + ssg_rank; col < end; + col += ssg_size) { + partial = op(partial, fn(row, col, args...)); + } + } +// since we do a sub-subgroup reduction, we can't use reduce +#pragma unroll + for (int i = 1; i < ssg_size; i *= 2) { + partial = op(partial, subgroup.shfl_xor(partial, i)); + } + if (col_block < col_blocks && ssg_rank == 0) { + result[(row + col_block * rows) * result_stride] = + finalize(partial); + } + }); + }); +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_kernel_row_reduction_2d, + generic_kernel_row_reduction_2d); + + +template +void generic_kernel_col_reduction_2d_small( + sycl::handler& cgh, int64 rows, int64 cols, int64 row_blocks, + KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, + ValueType* result, MappedKernelArgs... 
args) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + static_assert(ssg_size <= sg_size, "ssg must be smaller than sg"); + constexpr auto subgroups_per_workgroup = wg_size / sg_size; + // stores the subwarp_size partial sums from each warp, grouped by warp + constexpr auto shared_storage = subgroups_per_workgroup * ssg_size; + sycl::accessor, 0, + sycl::access_mode::read_write, sycl::access::target::local> + block_partial_acc(cgh); + const auto range = sycl_nd_range(dim3(row_blocks), dim3(wg_size)); + cgh.parallel_for( + range, [= + ](sycl::nd_item<3> id) [[intel::reqd_sub_group_size(sg_size)]] { + auto block_partial = &(*block_partial_acc.get_pointer())[0]; + const auto ssg_id = + thread::get_subwarp_id_flat(id); + const auto local_sg_id = id.get_local_id(2) / sg_size; + const auto local_ssg_id = id.get_local_id(2) % sg_size / ssg_size; + const auto ssg_num = + thread::get_subwarp_num_flat(id); + const auto workgroup = group::this_thread_block(id); + const auto subgroup = group::tiled_partition(workgroup); + const auto sg_rank = subgroup.thread_rank(); + const auto ssg_rank = sg_rank % ssg_size; + const auto col = static_cast(ssg_rank); + auto partial = identity; + // accumulate within a thread + if (col < cols) { + for (auto row = ssg_id; row < rows; row += ssg_num) { + partial = op(partial, fn(row, col, args...)); + } + } + // accumulate between all subsubgroups in the subgroup +#pragma unroll + for (unsigned i = ssg_size; i < sg_size; i *= 2) { + partial = op(partial, subgroup.shfl_xor(partial, i)); + } + // store the result to shared memory + if (local_ssg_id == 0) { + block_partial[local_sg_id * ssg_size + ssg_rank] = partial; + } + workgroup.sync(); + // in a single thread: accumulate the results + if (local_sg_id == 0) { + partial = identity; + // accumulate the partial results within a thread + if (shared_storage >= sg_size) { +#pragma unroll + for (int i = 0; i < shared_storage; i += sg_size) { + partial = op(partial, block_partial[i + sg_rank]); + } + } else if (sg_rank < shared_storage) { + partial = op(partial, block_partial[sg_rank]); + } + // accumulate between all subsubgroups in the subgroup +#pragma unroll + for (unsigned i = ssg_size; i < sg_size; i *= 2) { + partial = op(partial, subgroup.shfl_xor(partial, i)); + } + if (sg_rank < cols) { + result[sg_rank + id.get_group(2) * cols] = + finalize(partial); + } + } + }); +} + + +template +void generic_kernel_col_reduction_2d_blocked( + sycl::handler& cgh, int64 rows, int64 cols, int64 row_blocks, + int64 col_blocks, KernelFunction fn, ReductionOp op, FinalizeOp finalize, + ValueType identity, ValueType* result, MappedKernelArgs... 
args) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + const auto range = + sycl_nd_range(dim3(row_blocks, col_blocks), dim3(wg_size)); + sycl::accessor, 0, + sycl::access_mode::read_write, sycl::access::target::local> + block_partial_acc(cgh); + cgh.parallel_for( + range, [= + ](sycl::nd_item<3> id) [[intel::reqd_sub_group_size(sg_size)]] { + const auto sg_id = thread::get_subwarp_id_flat(id); + const auto sg_num = + thread::get_subwarp_num_flat(id); + const auto workgroup = group::this_thread_block(id); + const auto subgroup = group::tiled_partition(workgroup); + const auto sg_rank = subgroup.thread_rank(); + const auto col = + sg_rank + static_cast(id.get_group(1)) * sg_size; + auto block_partial = &(*block_partial_acc.get_pointer())[0]; + auto partial = identity; + // accumulate within a thread + if (col < cols) { + for (auto row = sg_id; row < rows; row += sg_num) { + partial = op(partial, fn(row, col, args...)); + } + } + block_partial[id.get_local_id(2)] = partial; + workgroup.sync(); + // in a single warp: accumulate the results + if (id.get_local_id(2) < sg_size) { + partial = identity; + // accumulate the partial results within a thread +#pragma unroll + for (int i = 0; i < wg_size; i += sg_size) { + partial = op(partial, block_partial[i + sg_rank]); + } + if (col < cols) { + result[col + id.get_group(2) * cols] = finalize(partial); + } + } + }); +} + + +template +void generic_kernel_reduction_finalize_2d( + sycl::handler& cgh, int64 num_results, int64 num_blocks, ReductionOp op, + FinalizeOp finalize, ValueType identity, const ValueType* input, + int64 result_stride, ValueType* result) +{ + cgh.parallel_for(sycl::range<1>{static_cast(num_results)}, + [=](sycl::id<1> id) { + auto partial = identity; + for (int64 block = 0; block < num_blocks; block++) { + partial = op(partial, + input[id[0] + block * num_results]); + } + result[id[0] * result_stride] = finalize(partial); + }); +} + + +template +void run_generic_col_reduction_small(syn::value_list, + std::shared_ptr exec, + int64 max_workgroups, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, + ValueType identity, ValueType* result, + dim<2> size, MappedKernelArgs... 
args) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + static_assert(ssg_size <= sg_size, "ssg must be smaller than sg"); + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto row_blocks = + std::min(ceildiv(rows * ssg_size, wg_size), max_workgroups); + auto queue = exec->get_queue(); + if (row_blocks <= 1) { + queue->submit([&](sycl::handler& cgh) { + generic_kernel_col_reduction_2d_small( + cgh, rows, cols, 1, fn, op, finalize, identity, result, + args...); + }); + } else { + Array tmp_storage{exec, + static_cast(row_blocks * cols)}; + queue->submit([&](sycl::handler& cgh) { + generic_kernel_col_reduction_2d_small( + cgh, rows, cols, row_blocks, fn, op, [](auto v) { return v; }, + identity, tmp_storage.get_data(), args...); + }); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_finalize_2d( + cgh, cols, row_blocks, op, finalize, identity, + tmp_storage.get_const_data(), 1, result); + }); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_col_reduction_small, + run_generic_col_reduction_small); + + +template +void run_kernel_row_reduction_stage1(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type result_stride, + dim<2> size, MappedKernelArgs... args) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + using subsubgroup_sizes = + syn::value_list(16, sg_size), + std::min(32, sg_size), sg_size>; + constexpr int oversubscription = 16; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto resources = + exec->get_num_computing_units() * sg_size * oversubscription; + auto queue = exec->get_queue(); + if (rows * cols > resources && rows < cols) { + const auto col_blocks = ceildiv(rows * cols, resources); + Array partial{exec, + static_cast(col_blocks * rows)}; + generic_kernel_row_reduction_2d( + syn::value_list{}, exec, rows, cols, col_blocks, fn, + op, [](auto v) { return v; }, identity, partial.get_data(), 1, + args...); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_finalize_2d( + cgh, rows, col_blocks, op, finalize, identity, + partial.get_const_data(), static_cast(result_stride), + result); + }); + } else { + select_generic_kernel_row_reduction_2d( + subsubgroup_sizes(), + [&](int compiled_ssg_size) { + return compiled_ssg_size >= cols || + compiled_ssg_size == sg_size; + }, + syn::value_list(), syn::type_list<>(), exec, rows, cols, + 1, fn, op, finalize, identity, result, + static_cast(result_stride), args...); + } +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(select_kernel_row_reduction_stage1, + run_kernel_row_reduction_stage1); + + +template +void run_kernel_col_reduction_stage1(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + MappedKernelArgs... 
args) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + using subsubgroup_sizes = + syn::value_list(16, sg_size), + std::min(32, sg_size), sg_size>; + constexpr int oversubscription = 16; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto max_blocks = + exec->get_num_computing_units() * sg_size * oversubscription / wg_size; + if (cols <= sg_size) { + select_generic_col_reduction_small( + subsubgroup_sizes(), + [&](int compiled_ssg_size) { + return compiled_ssg_size >= cols || + compiled_ssg_size == sg_size; + }, + syn::value_list(), syn::type_list<>(), exec, max_blocks, + fn, op, finalize, identity, result, size, args...); + } else { + const auto col_blocks = ceildiv(cols, sg_size); + const auto row_blocks = ceildiv( + std::min(ceildiv(rows * sg_size, wg_size), max_blocks), + col_blocks); + auto queue = exec->get_queue(); + if (row_blocks <= 1) { + queue->submit([&](sycl::handler& cgh) { + generic_kernel_col_reduction_2d_blocked( + cgh, rows, cols, 1, col_blocks, fn, op, finalize, identity, + result, args...); + }); + } else { + Array tmp_storage{ + exec, static_cast(row_blocks * cols)}; + queue->submit([&](sycl::handler& cgh) { + generic_kernel_col_reduction_2d_blocked( + cgh, rows, cols, row_blocks, col_blocks, fn, op, + [](auto v) { return v; }, identity, tmp_storage.get_data(), + args...); + }); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_finalize_2d( + cgh, cols, row_blocks, op, finalize, identity, + tmp_storage.get_const_data(), 1, result); + }); + } + } +} + +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(select_kernel_col_reduction_stage1, + run_kernel_col_reduction_stage1); + + +} // namespace + + +template +void run_kernel_row_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type result_stride, + dim<2> size, KernelArgs&&... args) +{ + const auto desired_cfg = get_first_cfg( + as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + select_kernel_row_reduction_stage1( + kcfg_1d_list_simple_reduction, + [&](std::uint32_t cfg) { return cfg == desired_cfg; }, + syn::value_list(), syn::value_list(), + syn::value_list(), syn::type_list<>(), exec, fn, op, + finalize, identity, result, result_stride, size, + map_to_device(args)...); +} + + +template +void run_kernel_col_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + KernelArgs&&... 
args) +{ + const auto desired_cfg = get_first_cfg( + as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); + select_kernel_col_reduction_stage1( + kcfg_1d_list_simple_reduction, + [&](std::uint32_t cfg) { return cfg == desired_cfg; }, + syn::value_list(), syn::value_list(), + syn::value_list(), syn::type_list<>(), exec, fn, op, + finalize, identity, result, size, map_to_device(args)...); +} + + +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/base/kernel_launch_solver.dp.hpp b/dpcpp/base/kernel_launch_solver.dp.hpp index 5cec5b55d79..68ef10ac1fe 100644 --- a/dpcpp/base/kernel_launch_solver.dp.hpp +++ b/dpcpp/base/kernel_launch_solver.dp.hpp @@ -42,17 +42,18 @@ namespace dpcpp { template -void generic_kernel_2d_solver(sycl::handler& cgh, size_type rows, - size_type cols, size_type default_stride, - KernelFunction fn, KernelArgs... args) +void generic_kernel_2d_solver(sycl::handler& cgh, int64 rows, int64 cols, + int64 default_stride, KernelFunction fn, + KernelArgs... args) { - cgh.parallel_for(sycl::range<2>{rows, cols}, [=](sycl::id<2> idx) { - auto row = static_cast(idx[0]); - auto col = static_cast(idx[1]); - fn(row, col, - device_unpack_solver_impl::unpack(args, - default_stride)...); - }); + cgh.parallel_for(sycl::range<1>{static_cast(rows * cols)}, + [=](sycl::id<1> idx) { + auto row = static_cast(idx[0] / cols); + auto col = static_cast(idx[0] % cols); + fn(row, col, + device_unpack_solver_impl::unpack( + args, default_stride)...); + }); } @@ -63,7 +64,8 @@ void run_kernel_solver(std::shared_ptr exec, { exec->get_queue()->submit([&](sycl::handler& cgh) { kernels::dpcpp::generic_kernel_2d_solver( - cgh, size[0], size[1], default_stride, fn, + cgh, static_cast(size[0]), static_cast(size[1]), + static_cast(default_stride), fn, kernels::dpcpp::map_to_device(args)...); }); } diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 9a86ab9cd15..7873a687e4b 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -84,284 +84,6 @@ constexpr int default_block_size = 256; namespace kernel { -template -void compute_partial_reduce( - size_type num_rows, OutType* __restrict__ work, CallableGetValue get_value, - CallableReduce reduce_op, sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& tmp_work) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - constexpr auto sg_size = KCFG_1D::decode<1>(cfg); - - constexpr auto warps_per_block = wg_size / sg_size; - - const auto num_blocks = item_ct1.get_group_range(2); - const auto local_id = thread::get_local_thread_id(item_ct1); - const auto global_id = - thread::get_thread_id(item_ct1); - - OutType* tmp_work_array = tmp_work; - auto tmp = zero(); - for (auto i = global_id; i < num_rows; i += wg_size * num_blocks) { - tmp = reduce_op(tmp, get_value(i)); - } - - tmp_work_array[local_id] = tmp; - - ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), - tmp_work_array, reduce_op); - - if (local_id == 0) { - work[thread::get_block_id(item_ct1)] = tmp_work_array[0]; - } -} - - -template -void finalize_reduce_computation( - size_type size, const ValueType* work, ValueType* result, - CallableReduce reduce_op, CallableFinalize finalize_op, - sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& tmp_work) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - constexpr auto sg_size = KCFG_1D::decode<1>(cfg); - - const auto local_id = 
thread::get_local_thread_id(item_ct1); - - ValueType tmp = zero(); - for (auto i = local_id; i < size; i += wg_size) { - tmp = reduce_op(tmp, work[i]); - } - ValueType* tmp_work_array = tmp_work; - tmp_work_array[local_id] = tmp; - - ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), - tmp_work_array, reduce_op); - - if (local_id == 0) { - *result = finalize_op(tmp_work_array[0]); - } -} - - -template -void compute_partial_dot( - size_type num_rows, const ValueType* __restrict__ x, size_type stride_x, - const ValueType* __restrict__ y, size_type stride_y, - ValueType* __restrict__ work, sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& tmp_work) -{ - compute_partial_reduce( - num_rows, work, - [x, stride_x, y, stride_y](size_type i) { - return x[i * stride_x] * y[i * stride_y]; - }, - [](const ValueType& x, const ValueType& y) { return x + y; }, item_ct1, - tmp_work); -} - -template -void compute_partial_dot(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_rows, - const ValueType* x, size_type stride_x, - const ValueType* y, size_type stride_y, - ValueType* work) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access::mode::read_write, - sycl::access::target::local> - tmp_work_acc_ct1(cgh); - - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - compute_partial_dot(num_rows, x, stride_x, y, stride_y, - work, item_ct1, - *tmp_work_acc_ct1.get_pointer()); - }); - }); -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_dot, - compute_partial_dot) -GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_dot_call, compute_partial_dot, - kcfg_1d_list) - - -template -void compute_partial_conj_dot( - size_type num_rows, const ValueType* __restrict__ x, size_type stride_x, - const ValueType* __restrict__ y, size_type stride_y, - ValueType* __restrict__ work, sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& tmp_work) -{ - compute_partial_reduce( - num_rows, work, - [x, stride_x, y, stride_y](size_type i) { - return conj(x[i * stride_x]) * y[i * stride_y]; - }, - [](const ValueType& x, const ValueType& y) { return x + y; }, item_ct1, - tmp_work); -} - -template -void compute_partial_conj_dot(dim3 grid, dim3 block, - size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_rows, - const ValueType* x, size_type stride_x, - const ValueType* y, size_type stride_y, - ValueType* work) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access::mode::read_write, - sycl::access::target::local> - tmp_work_acc_ct1(cgh); - - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - compute_partial_conj_dot(num_rows, x, stride_x, y, - stride_y, work, item_ct1, - *tmp_work_acc_ct1.get_pointer()); - }); - }); -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_conj_dot, - compute_partial_conj_dot) -GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_conj_dot_call, - compute_partial_conj_dot, kcfg_1d_list) - - -template -void finalize_sum_reduce_computation( - size_type size, const ValueType* work, ValueType* result, - sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& tmp_work) -{ - finalize_reduce_computation( - size, work, result, - [](const ValueType& x, const ValueType& y) { return x + y; }, - [](const ValueType& x) { return x; }, item_ct1, tmp_work); -} - -template -void finalize_sum_reduce_computation(dim3 
grid, dim3 block, - size_type dynamic_shared_memory, - sycl::queue* queue, size_type size, - const ValueType* work, ValueType* result) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access::mode::read_write, - sycl::access::target::local> - tmp_work_acc_ct1(cgh); - - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - finalize_sum_reduce_computation( - size, work, result, item_ct1, - *tmp_work_acc_ct1.get_pointer()); - }); - }); -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_sum_reduce_computation, - finalize_sum_reduce_computation) -GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_sum_reduce_computation_call, - finalize_sum_reduce_computation, kcfg_1d_list) - - -template -void compute_partial_norm2( - size_type num_rows, const ValueType* __restrict__ x, size_type stride_x, - remove_complex* __restrict__ work, sycl::nd_item<3> item_ct1, - UninitializedArray, KCFG_1D::decode<0>(cfg)>& - tmp_work) -{ - using norm_type = remove_complex; - compute_partial_reduce( - num_rows, work, - [x, stride_x](size_type i) { return squared_norm(x[i * stride_x]); }, - [](const norm_type& x, const norm_type& y) { return x + y; }, item_ct1, - tmp_work); -} - -template -void compute_partial_norm2(dim3 grid, dim3 block, - size_type dynamic_shared_memory, sycl::queue* queue, - size_type num_rows, const ValueType* x, - size_type stride_x, remove_complex* work) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, wg_size>, - 0, sycl::access::mode::read_write, - sycl::access::target::local> - tmp_work_acc_ct1(cgh); - - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - compute_partial_norm2(num_rows, x, stride_x, work, - item_ct1, - *tmp_work_acc_ct1.get_pointer()); - }); - }); -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_norm2, - compute_partial_norm2) -GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_norm2_call, - compute_partial_norm2, kcfg_1d_list) - - -template -void finalize_sqrt_reduce_computation( - size_type size, const ValueType* work, ValueType* result, - sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& tmp_work) -{ - finalize_reduce_computation( - size, work, result, - [](const ValueType& x, const ValueType& y) { return x + y; }, - [](const ValueType& x) { return std::sqrt(x); }, item_ct1, tmp_work); -} - -template -void finalize_sqrt_reduce_computation(dim3 grid, dim3 block, - size_type dynamic_shared_memory, - sycl::queue* queue, size_type size, - const ValueType* work, ValueType* result) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access::mode::read_write, - sycl::access::target::local> - tmp_work_acc_ct1(cgh); - - - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - finalize_sqrt_reduce_computation( - size, work, result, item_ct1, - *tmp_work_acc_ct1.get_pointer()); - }); - }); -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_sqrt_reduce_computation, - finalize_sqrt_reduce_computation) -GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_sqrt_reduce_computation_call, - finalize_sqrt_reduce_computation, kcfg_1d_list) - - template void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride, const size_type* __restrict__ row_ptrs, @@ -812,144 +534,6 @@ void apply(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); 
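[Editor's note] The dot, conj-dot and norm2 kernels removed below are dropped in favor of the generic reduction launchers introduced above; their replacements are presumably expressed through the run_kernel_reduction interface elsewhere in this change. A minimal host-side sketch of that fn/op/finalize/identity contract and of the two-pass strategy (per-block partials, then a finalize pass), written as plain standard C++ with hypothetical helper names (reduce_1d, reduce_1d_two_stage) purely for illustration, not the actual Ginkgo/SYCL/HIP code:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical sequential stand-in for the generic reduction launchers:
// fn maps an index to a value, op combines partial results, finalize
// post-processes the fully reduced value, identity is the neutral element.
template <typename Fn, typename Op, typename Finalize, typename ValueType>
ValueType reduce_1d(Fn fn, Op op, Finalize finalize, ValueType identity,
                    std::int64_t size)
{
    auto partial = identity;
    for (std::int64_t i = 0; i < size; ++i) {
        partial = op(partial, fn(i));
    }
    return finalize(partial);
}

// Two-stage variant mirroring the num_workgroups > 1 branch above: every
// "block" reduces its chunk to one partial value (finalize deferred), then a
// second pass reduces the per-block partials and applies finalize once.
template <typename Fn, typename Op, typename Finalize, typename ValueType>
ValueType reduce_1d_two_stage(Fn fn, Op op, Finalize finalize,
                              ValueType identity, std::int64_t size,
                              std::int64_t num_blocks)
{
    const auto block_size = (size + num_blocks - 1) / num_blocks;
    std::vector<ValueType> block_partial(num_blocks, identity);
    for (std::int64_t b = 0; b < num_blocks; ++b) {
        const auto begin = b * block_size;
        const auto end = std::min(size, begin + block_size);
        for (std::int64_t i = begin; i < end; ++i) {
            block_partial[b] = op(block_partial[b], fn(i));
        }
    }
    return reduce_1d([&](std::int64_t b) { return block_partial[b]; }, op,
                     finalize, identity, num_blocks);
}

int main()
{
    std::vector<double> x{1.0, 2.0, 3.0};
    std::vector<double> y{4.0, 5.0, 6.0};
    const auto n = static_cast<std::int64_t>(x.size());
    auto add = [](double a, double b) { return a + b; };
    // dot(x, y): multiply entries, sum, no finalize step
    auto dot = reduce_1d([&](std::int64_t i) { return x[i] * y[i]; }, add,
                         [](double a) { return a; }, 0.0, n);
    // norm2(x): square entries, sum, take the square root only at the end
    auto nrm = reduce_1d_two_stage([&](std::int64_t i) { return x[i] * x[i]; },
                                   add, [](double a) { return std::sqrt(a); },
                                   0.0, n, 2);
    std::cout << dot << ' ' << nrm << '\n';  // prints 32 and sqrt(14)
}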
-template -void compute_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (x->get_size()[1] == 1) { - // TODO: write a custom kernel which does this more efficiently - onemkl::dot(*exec->get_queue(), x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), y->get_stride(), - result->get_values()); - } else { - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const std::uint32_t cfg = - get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); - const auto wg_size = KCFG_1D::decode<0>(cfg); - const auto sg_size = KCFG_1D::decode<1>(cfg); - const auto work_per_block = work_per_thread * wg_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{sg_size, 1, wg_size / sg_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_dot_call( - cfg, grid_dim, block_dim, 0, exec->get_queue(), - x->get_size()[0], x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), work.get_data()); - kernel::finalize_sum_reduce_computation_call( - cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, - work.get_const_data(), result->get_values() + col); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); - - -template -void compute_conj_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (x->get_size()[1] == 1) { - // TODO: write a custom kernel which does this more efficiently - onemkl::conj_dot(*exec->get_queue(), x->get_size()[0], - x->get_const_values(), x->get_stride(), - y->get_const_values(), y->get_stride(), - result->get_values()); - - } else { - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const std::uint32_t cfg = - get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); - const auto wg_size = KCFG_1D::decode<0>(cfg); - const auto sg_size = KCFG_1D::decode<1>(cfg); - - const auto work_per_block = work_per_thread * wg_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{sg_size, 1, wg_size / sg_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_conj_dot_call( - cfg, grid_dim, block_dim, 0, exec->get_queue(), - x->get_size()[0], x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), work.get_data()); - kernel::finalize_sum_reduce_computation_call( - cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, - work.get_const_data(), result->get_values() + col); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); - - -template -void 
compute_norm2(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - if (x->get_size()[1] == 1) { - oneapi::mkl::blas::row_major::nrm2( - *exec->get_queue(), x->get_size()[0], x->get_const_values(), - x->get_stride(), result->get_values()); - } else { - using norm_type = remove_complex; - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const std::uint32_t cfg = - get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); - const auto wg_size = KCFG_1D::decode<0>(cfg); - const auto sg_size = KCFG_1D::decode<1>(cfg); - - const auto work_per_block = work_per_thread * wg_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{sg_size, 1, wg_size / sg_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_norm2_call( - cfg, grid_dim, block_dim, 0, exec->get_queue(), - x->get_size()[0], x->get_const_values() + col, x->get_stride(), - work.get_data()); - kernel::finalize_sqrt_reduce_computation_call( - cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, - work.get_const_data(), result->get_values() + col); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); - - template void convert_to_coo(std::shared_ptr exec, const matrix::Dense* source, diff --git a/dpcpp/test/base/kernel_launch.dp.cpp b/dpcpp/test/base/kernel_launch.dp.cpp index 27d3f1abd12..2c7a08cdb36 100644 --- a/dpcpp/test/base/kernel_launch.dp.cpp +++ b/dpcpp/test/base/kernel_launch.dp.cpp @@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
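[Editor's note] The test changes below pin the device index type to gko::int64 instead of size_type, matching the signed 64-bit indices the kernels now receive. One plausible motivation for the signed type (not stated in the patch itself) is that out-of-range index arithmetic stays representable and testable instead of silently wrapping; a tiny standalone illustration:

#include <cstdint>
#include <iostream>

int main()
{
    // With an unsigned index, stepping one position "before" the start wraps
    // around to a huge value, so a guard such as `idx >= 0` can never fire.
    std::size_t unsigned_idx = 0;
    std::int64_t signed_idx = 0;
    unsigned_idx -= 1;  // wraps to 2^64 - 1
    signed_idx -= 1;    // stays -1 and can still be range-checked
    std::cout << unsigned_idx << ' ' << signed_idx << '\n';
}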
#include +#include "common/unified/base/kernel_launch_reduction.hpp" #include "common/unified/base/kernel_launch_solver.hpp" #include "core/test/utils.hpp" @@ -54,6 +55,7 @@ namespace { using gko::dim; +using gko::int64; using gko::size_type; using std::is_same; @@ -110,7 +112,7 @@ TEST_F(KernelLaunch, Runs1D) gko::kernels::dpcpp::run_kernel( exec, [] GKO_KERNEL(auto i, auto d) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); d[i] = i; }, @@ -125,7 +127,7 @@ TEST_F(KernelLaunch, Runs1DArray) gko::kernels::dpcpp::run_kernel( exec, [] GKO_KERNEL(auto i, auto d, auto d_ptr) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); if (d == d_ptr) { @@ -145,7 +147,7 @@ TEST_F(KernelLaunch, Runs1DDense) gko::kernels::dpcpp::run_kernel( exec, [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, @@ -177,8 +179,8 @@ TEST_F(KernelLaunch, Runs2D) gko::kernels::dpcpp::run_kernel( exec, [] GKO_KERNEL(auto i, auto j, auto d) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); d[i + 4 * j] = 4 * i + j; }, @@ -193,8 +195,8 @@ TEST_F(KernelLaunch, Runs2DArray) gko::kernels::dpcpp::run_kernel( exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); if (d == d_ptr) { @@ -215,11 +217,9 @@ TEST_F(KernelLaunch, Runs2DDense) exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3, auto d4, auto d2_ptr, auto d3_ptr) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); - static_assert(is_same::value, - "type"); static_assert(is_same::value, "type"); static_assert(is_same::value, @@ -257,4 +257,174 @@ TEST_F(KernelLaunch, Runs2DDense) } +TEST_F(KernelLaunch, Reduction1D) +{ + gko::Array output{exec, 1}; + + gko::kernels::dpcpp::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "i"); + static_assert(is_same::value, "j"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "j"); + return j * 2; + }, + int64{}, output.get_data(), size_type{100000}, output); + + // 2 * sum i=0...99999 (i+1) + EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000LL); + + gko::kernels::dpcpp::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "i"); + static_assert(is_same::value, "j"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "j"); + return j * 2; + }, + int64{}, output.get_data(), size_type{100}, output); + + // 2 * sum i=0...99 (i+1) + 
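+    // closed form: 2 * (100 * 101 / 2) = 10100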
EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100LL); +} + + +TEST_F(KernelLaunch, Reduction2D) +{ + gko::Array output{exec, 1}; + + gko::kernels::dpcpp::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "i"); + static_assert(is_same::value, "j"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "j"); + return j * 4; + }, + int64{}, output.get_data(), gko::dim<2>{1000, 100}, output); + + // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) + EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000LL); + + gko::kernels::dpcpp::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "i"); + static_assert(is_same::value, "j"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "j"); + return j * 4; + }, + int64{}, output.get_data(), gko::dim<2>{10, 10}, output); + + // 4 * sum i=0...9 sum j=0...9 of (i+1)*(j+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100LL); +} + + +TEST_F(KernelLaunch, ReductionRow2D) +{ + for (auto num_rows : {0, 1, 10, 100, 1000, 10000}) { + for (auto num_cols : {0, 1, 10, 100, 1000, 10000}) { + SCOPED_TRACE(std::to_string(num_rows) + " rows, " + + std::to_string(num_cols) + " cols"); + gko::Array host_ref{exec->get_master(), + static_cast(2 * num_rows)}; + std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); + gko::Array output{exec, host_ref}; + for (int i = 0; i < num_rows; i++) { + // we are computing 2 * sum {j=0, j(num_cols) * (num_cols + 1) * (i + 1); + } + + gko::kernels::dpcpp::run_kernel_row_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return 2 * j; }, int64{}, + output.get_data(), 2, + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}, + output); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); + } + } +} + + +TEST_F(KernelLaunch, ReductionCol2D) +{ + for (int num_rows : {0, 1, 10, 100, 1000, 10000}) { + for (int num_cols : + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 40, 100, 1000}) { + SCOPED_TRACE(std::to_string(num_rows) + " rows, " + + std::to_string(num_cols) + " cols"); + gko::Array host_ref{exec->get_master(), + static_cast(num_cols)}; + gko::Array output{exec, static_cast(num_cols)}; + for (int i = 0; i < num_cols; i++) { + // we are computing 2 * sum {j=0, j(num_rows) * (num_rows + 1) * (i + 1); + } + + gko::kernels::dpcpp::run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, + output.get_data(), + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}, + output); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); + } + } +} + + } // namespace diff --git a/hip/base/kernel_launch.hip.hpp b/hip/base/kernel_launch.hip.hpp index 8967ee5597d..6c627838fea 100644 --- a/hip/base/kernel_launch.hip.hpp +++ 
b/hip/base/kernel_launch.hip.hpp @@ -54,9 +54,9 @@ constexpr int default_block_size = 512; template __global__ __launch_bounds__(default_block_size) void generic_kernel_1d( - size_type size, KernelFunction fn, KernelArgs... args) + int64 size, KernelFunction fn, KernelArgs... args) { - auto tidx = thread::get_thread_id_flat(); + auto tidx = thread::get_thread_id_flat(); if (tidx >= size) { return; } @@ -66,9 +66,9 @@ __global__ __launch_bounds__(default_block_size) void generic_kernel_1d( template __global__ __launch_bounds__(default_block_size) void generic_kernel_2d( - size_type rows, size_type cols, KernelFunction fn, KernelArgs... args) + int64 rows, int64 cols, KernelFunction fn, KernelArgs... args) { - auto tidx = thread::get_thread_id_flat(); + auto tidx = thread::get_thread_id_flat(); auto col = tidx % cols; auto row = tidx / cols; if (row >= rows) { @@ -85,8 +85,8 @@ void run_kernel(std::shared_ptr exec, KernelFunction fn, gko::hip::device_guard guard{exec->get_device_id()}; constexpr auto block_size = default_block_size; auto num_blocks = ceildiv(size, block_size); - hipLaunchKernelGGL(generic_kernel_1d, num_blocks, block_size, 0, 0, size, - fn, map_to_device(args)...); + hipLaunchKernelGGL(generic_kernel_1d, num_blocks, block_size, 0, 0, + static_cast(size), fn, map_to_device(args)...); } template @@ -96,8 +96,9 @@ void run_kernel(std::shared_ptr exec, KernelFunction fn, gko::hip::device_guard guard{exec->get_device_id()}; constexpr auto block_size = default_block_size; auto num_blocks = ceildiv(size[0] * size[1], block_size); - hipLaunchKernelGGL(generic_kernel_2d, num_blocks, block_size, 0, 0, size[0], - size[1], fn, map_to_device(args)...); + hipLaunchKernelGGL(generic_kernel_2d, num_blocks, block_size, 0, 0, + static_cast(size[0]), static_cast(size[1]), + fn, map_to_device(args)...); } diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp new file mode 100644 index 00000000000..40e4268dccb --- /dev/null +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -0,0 +1,540 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ +#error \ + "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp" +#endif + + +#include "core/synthesizer/implementation_selection.hpp" +#include "hip/base/device_guard.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { + + +template +__global__ __launch_bounds__( + default_block_size) void generic_kernel_reduction_1d(int64 size, + KernelFunction fn, + ReductionOp op, + FinalizeOp finalize, + ValueType identity, + ValueType* storage, + KernelArgs... args) +{ + __shared__ + UninitializedArray + warp_partial; + static_assert(default_block_size / config::warp_size <= config::warp_size, + "needs third reduction level"); + auto tidx = thread::get_thread_id_flat(); + auto grid_size = thread::get_thread_num_flat(); + auto warp = + group::tiled_partition(group::this_thread_block()); + auto partial = identity; + for (int64 i = tidx; i < size; i += grid_size) { + partial = op(partial, fn(i, args...)); + } + partial = reduce(warp, partial, op); + if (warp.thread_rank() == 0) { + warp_partial[threadIdx.x / config::warp_size] = partial; + } + __syncthreads(); + if (threadIdx.x < config::warp_size) { + partial = reduce(warp, + threadIdx.x < default_block_size / config::warp_size + ? warp_partial[threadIdx.x] + : identity, + op); + if (threadIdx.x == 0) { + storage[blockIdx.x] = finalize(partial); + } + } +} + + +template +__global__ __launch_bounds__( + default_block_size) void generic_kernel_reduction_2d(int64 rows, int64 cols, + KernelFunction fn, + ReductionOp op, + FinalizeOp finalize, + ValueType identity, + ValueType* storage, + KernelArgs... args) +{ + __shared__ + UninitializedArray + warp_partial; + static_assert(default_block_size / config::warp_size <= config::warp_size, + "needs third reduction level"); + auto tidx = thread::get_thread_id_flat(); + auto grid_size = thread::get_thread_num_flat(); + auto warp = + group::tiled_partition(group::this_thread_block()); + auto partial = identity; + for (int64 i = tidx; i < rows * cols; i += grid_size) { + const auto row = i / cols; + const auto col = i % cols; + partial = op(partial, fn(row, col, args...)); + } + partial = reduce(warp, partial, op); + if (warp.thread_rank() == 0) { + warp_partial[threadIdx.x / config::warp_size] = partial; + } + __syncthreads(); + if (threadIdx.x < config::warp_size) { + partial = reduce(warp, + threadIdx.x < default_block_size / config::warp_size + ? 
warp_partial[threadIdx.x] + : identity, + op); + if (threadIdx.x == 0) { + storage[blockIdx.x] = finalize(partial); + } + } +} + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type size, + KernelArgs&&... args) +{ + constexpr int oversubscription = 16; + gko::hip::device_guard guard{exec->get_device_id()}; + constexpr auto block_size = default_block_size; + const auto num_blocks = std::min( + ceildiv(size, block_size), exec->get_num_warps() * oversubscription); + if (num_blocks > 1) { + Array partial{exec, static_cast(num_blocks)}; + hipLaunchKernelGGL( + generic_kernel_reduction_1d, num_blocks, block_size, 0, 0, + static_cast(size), fn, op, + [] __device__(auto v) { return v; }, as_hip_type(identity), + as_hip_type(partial.get_data()), map_to_device(args)...); + hipLaunchKernelGGL( + generic_kernel_reduction_1d, 1, block_size, 0, 0, + static_cast(num_blocks), + [] __device__(auto i, auto v) { return v[i]; }, op, finalize, + as_hip_type(identity), as_hip_type(result), + as_hip_type(partial.get_const_data())); + } else { + hipLaunchKernelGGL(generic_kernel_reduction_1d, 1, block_size, 0, 0, + static_cast(size), fn, op, finalize, + as_hip_type(identity), as_hip_type(result), + map_to_device(args)...); + } +} + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, KernelArgs&&... args) +{ + constexpr int oversubscription = 16; + gko::hip::device_guard guard{exec->get_device_id()}; + constexpr auto block_size = default_block_size; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_blocks = + std::min(ceildiv(rows * cols, block_size), + exec->get_num_warps() * oversubscription); + if (num_blocks > 1) { + Array partial{exec, static_cast(num_blocks)}; + hipLaunchKernelGGL( + generic_kernel_reduction_2d, num_blocks, block_size, 0, 0, rows, + cols, fn, op, [] __device__(auto v) { return v; }, + as_hip_type(identity), as_hip_type(partial.get_data()), + map_to_device(args)...); + hipLaunchKernelGGL( + generic_kernel_reduction_1d, 1, block_size, 0, 0, + static_cast(num_blocks), + [] __device__(auto i, auto v) { return v[i]; }, op, finalize, + as_hip_type(identity), as_hip_type(result), + as_hip_type(partial.get_const_data())); + } else { + hipLaunchKernelGGL(generic_kernel_reduction_2d, 1, block_size, 0, 0, + rows, cols, fn, op, finalize, as_hip_type(identity), + as_hip_type(result), map_to_device(args)...); + } +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( + int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, ValueType identity, + ValueType* result, int64 result_stride, KernelArgs... 
args) +{ + const auto idx = thread::get_subwarp_id_flat(); + const auto row = idx % rows; + const auto col_block = idx / rows; + if (col_block >= col_blocks) { + return; + } + const auto cols_per_part = + ceildiv(ceildiv(cols, subwarp_size), col_blocks) * subwarp_size; + const auto begin = cols_per_part * col_block; + const auto end = min(begin + cols_per_part, cols); + auto subwarp = + group::tiled_partition(group::this_thread_block()); + auto partial = identity; + for (auto col = begin + subwarp.thread_rank(); col < end; + col += subwarp_size) { + partial = op(partial, fn(row, col, args...)); + } + partial = reduce(subwarp, partial, op); + if (subwarp.thread_rank() == 0) { + result[(row + col_block * rows) * result_stride] = finalize(partial); + } +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, ValueType* result, + KernelArgs... args) +{ + constexpr auto warp_size = config::warp_size; + constexpr auto warps_per_block = default_block_size / warp_size; + // stores the subwarp_size partial sums from each warp, grouped by warp + constexpr auto shared_storage = warps_per_block * subwarp_size; + __shared__ UninitializedArray block_partial; + const auto subwarp_id = thread::get_subwarp_id_flat(); + const auto local_warp_id = threadIdx.x / warp_size; + const auto local_subwarp_id = threadIdx.x % warp_size / subwarp_size; + const auto subwarp_num = + thread::get_subwarp_num_flat(); + const auto block = group::this_thread_block(); + const auto warp = group::tiled_partition(block); + const auto warp_rank = warp.thread_rank(); + const auto subwarp_rank = warp_rank % subwarp_size; + const auto col = static_cast(subwarp_rank); + auto partial = identity; + // accumulate within a thread + if (col < cols) { + for (auto row = subwarp_id; row < rows; row += subwarp_num) { + partial = op(partial, fn(row, col, args...)); + } + } + // accumulate between all subwarps in the warp +#pragma unroll + for (unsigned i = subwarp_size; i < warp_size; i *= 2) { + partial = op(partial, warp.shfl_xor(partial, i)); + } // store the result to shared memory + if (local_subwarp_id == 0) { + block_partial[local_warp_id * subwarp_size + subwarp_rank] = partial; + } + block.sync(); + // in a single thread: accumulate the results + if (local_warp_id == 0) { + partial = identity; + // accumulate the partial results within a thread + if (shared_storage >= warp_size) { +#pragma unroll + for (int i = 0; i < shared_storage; i += warp_size) { + partial = op(partial, block_partial[i + warp_rank]); + } + } else if (warp_rank < shared_storage) { + partial = op(partial, block_partial[warp_rank]); + } + // accumulate between all subwarps in the warp +#pragma unroll + for (unsigned i = subwarp_size; i < warp_size; i *= 2) { + partial = op(partial, warp.shfl_xor(partial, i)); + } + if (warp_rank < cols) { + result[warp_rank + blockIdx.x * cols] = finalize(partial); + } + } +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, ValueType* result, + KernelArgs... 
args) +{ + constexpr auto warp_size = config::warp_size; + __shared__ UninitializedArray block_partial; + const auto warp_id = thread::get_subwarp_id_flat(); + const auto warp_num = thread::get_subwarp_num_flat(); + const auto block = group::this_thread_block(); + const auto warp = group::tiled_partition(block); + const auto warp_rank = warp.thread_rank(); + const auto col = warp_rank + static_cast(blockIdx.y) * warp_size; + auto partial = identity; + // accumulate within a thread + if (col < cols) { + for (auto row = warp_id; row < rows; row += warp_num) { + partial = op(partial, fn(row, col, args...)); + } + } + block_partial[threadIdx.x] = partial; + block.sync(); + // in a single warp: accumulate the results + if (threadIdx.x < warp_size) { + partial = identity; + // accumulate the partial results within a thread +#pragma unroll + for (int i = 0; i < default_block_size; i += warp_size) { + partial = op(partial, block_partial[i + warp_rank]); + } + if (col < cols) { + result[col + blockIdx.x * cols] = finalize(partial); + } + } +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( + int64 num_results, int64 num_blocks, ReductionOp op, + FinalizeOp finalize, ValueType identity, const ValueType* input, + int64 result_stride, ValueType* result) +{ + const auto idx = thread::get_thread_id_flat(); + if (idx >= num_results) { + return; + } + auto partial = identity; + for (int64 block = 0; block < num_blocks; block++) { + partial = op(partial, input[idx + block * num_results]); + } + result[idx * result_stride] = finalize(partial); +} + + +namespace { + + +template +void run_generic_kernel_row_reduction(syn::value_list, + int64 rows, int64 cols, int64 col_blocks, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, int64 result_stride, + KernelArgs... args) +{ + const auto num_blocks = + ceildiv(rows * col_blocks * subwarp_size, default_block_size); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(generic_kernel_row_reduction_2d), + num_blocks, default_block_size, 0, 0, rows, cols, col_blocks, fn, op, + finalize, as_hip_type(identity), as_hip_type(result), result_stride, + args...); +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, + run_generic_kernel_row_reduction) + + +template +void run_generic_col_reduction_small(syn::value_list, + int64 max_blocks, + std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + MappedKernelArgs... 
args) +{ + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_blocks = std::min( + ceildiv(rows * subwarp_size, default_block_size), max_blocks); + if (num_blocks <= 1) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + generic_kernel_col_reduction_2d_small), + 1, default_block_size, 0, 0, rows, cols, fn, op, finalize, + as_hip_type(identity), as_hip_type(result), args...); + } else { + Array tmp_storage{exec, + static_cast(num_blocks * cols)}; + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + generic_kernel_col_reduction_2d_small), + num_blocks, default_block_size, 0, 0, rows, cols, fn, op, + [] __device__(auto v) { return v; }, as_hip_type(identity), + as_hip_type(tmp_storage.get_data()), args...); + hipLaunchKernelGGL( + generic_kernel_reduction_finalize_2d, + ceildiv(cols, default_block_size), default_block_size, 0, 0, cols, + num_blocks, op, finalize, as_hip_type(identity), + as_hip_type(tmp_storage.get_const_data()), 1, as_hip_type(result)); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_col_reduction_small, + run_generic_col_reduction_small); + + +} // namespace + + +template +void run_kernel_row_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type result_stride, + dim<2> size, KernelArgs&&... args) +{ + using subwarp_sizes = + syn::value_list; + constexpr int oversubscription = 16; + gko::hip::device_guard guard{exec->get_device_id()}; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto resources = + exec->get_num_warps() * config::warp_size * oversubscription; + if (rows * cols > resources && rows < cols) { + const auto col_blocks = ceildiv(rows * cols, resources); + Array partial{exec, + static_cast(col_blocks * rows)}; + const auto num_blocks = + ceildiv(rows * col_blocks * config::warp_size, default_block_size); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(generic_kernel_row_reduction_2d), + num_blocks, default_block_size, 0, 0, rows, cols, col_blocks, fn, + op, [] __device__(auto v) { return v; }, as_hip_type(identity), + as_hip_type(partial.get_data()), 1, map_to_device(args)...); + const auto num_finalize_blocks = ceildiv(rows, default_block_size); + hipLaunchKernelGGL( + generic_kernel_reduction_finalize_2d, num_finalize_blocks, + default_block_size, 0, 0, rows, col_blocks, op, finalize, + as_hip_type(identity), as_hip_type(partial.get_const_data()), + static_cast(result_stride), as_hip_type(result)); + } else { + select_run_generic_kernel_row_reduction( + subwarp_sizes(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= cols || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), rows, cols, 1, fn, op, + finalize, identity, result, static_cast(result_stride), + map_to_device(args)...); + } +} + + +template +void run_kernel_col_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + KernelArgs&&... 
args) +{ + using subwarp_sizes = + syn::value_list; + constexpr int oversubscription = 16; + gko::hip::device_guard guard{exec->get_device_id()}; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto max_blocks = exec->get_num_warps() * config::warp_size * + oversubscription / default_block_size; + if (cols <= config::warp_size) { + select_generic_col_reduction_small( + subwarp_sizes(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= cols || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), max_blocks, exec, fn, + op, finalize, identity, result, size, map_to_device(args)...); + } else { + const auto col_blocks = ceildiv(cols, config::warp_size); + const auto row_blocks = + ceildiv(std::min( + ceildiv(rows * config::warp_size, default_block_size), + max_blocks), + col_blocks); + if (row_blocks <= 1) { + hipLaunchKernelGGL(generic_kernel_col_reduction_2d_blocked, + dim3(1, col_blocks), default_block_size, 0, 0, + rows, cols, fn, op, finalize, + as_hip_type(identity), as_hip_type(result), + map_to_device(args)...); + } else { + Array tmp_storage{ + exec, static_cast(row_blocks * cols)}; + hipLaunchKernelGGL( + generic_kernel_col_reduction_2d_blocked, + dim3(row_blocks, col_blocks), default_block_size, 0, 0, rows, + cols, fn, op, [] __device__(auto v) { return v; }, + as_hip_type(identity), as_hip_type(tmp_storage.get_data()), + map_to_device(args)...); + hipLaunchKernelGGL(generic_kernel_reduction_finalize_2d, + ceildiv(cols, default_block_size), + default_block_size, 0, 0, cols, row_blocks, op, + finalize, as_hip_type(identity), + as_hip_type(tmp_storage.get_const_data()), 1, + as_hip_type(result)); + } + } +} + + +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/base/kernel_launch_solver.hip.hpp b/hip/base/kernel_launch_solver.hip.hpp index a8335851a0e..9798f6c4fbc 100644 --- a/hip/base/kernel_launch_solver.hip.hpp +++ b/hip/base/kernel_launch_solver.hip.hpp @@ -46,10 +46,10 @@ namespace hip { template __global__ __launch_bounds__(default_block_size) void generic_kernel_2d_solver( - size_type rows, size_type cols, size_type default_stride, KernelFunction fn, + int64 rows, int64 cols, int64 default_stride, KernelFunction fn, KernelArgs... 
args) { - auto tidx = thread::get_thread_id_flat(); + auto tidx = thread::get_thread_id_flat(); auto col = tidx % cols; auto row = tidx / cols; if (row >= rows) { @@ -69,7 +69,9 @@ void run_kernel_solver(std::shared_ptr exec, constexpr auto block_size = kernels::hip::default_block_size; auto num_blocks = ceildiv(size[0] * size[1], block_size); hipLaunchKernelGGL(kernels::hip::generic_kernel_2d_solver, num_blocks, - block_size, 0, 0, size[0], size[1], default_stride, fn, + block_size, 0, 0, static_cast(size[0]), + static_cast(size[1]), + static_cast(default_stride), fn, kernels::hip::map_to_device(args)...); } diff --git a/hip/components/reduction.hip.hpp b/hip/components/reduction.hip.hpp index 87d6b518123..7850b9f65a1 100644 --- a/hip/components/reduction.hip.hpp +++ b/hip/components/reduction.hip.hpp @@ -55,7 +55,7 @@ namespace kernels { namespace hip { -constexpr int default_block_size = 512; +constexpr int default_reduce_block_size = 512; #include "common/cuda_hip/components/reduction.hpp.inc" @@ -77,23 +77,26 @@ __host__ ValueType reduce_add_array(std::shared_ptr exec, auto block_results_val = source; size_type grid_dim = size; auto block_results = Array(exec); - if (size > default_block_size) { - const auto n = ceildiv(size, default_block_size); - grid_dim = (n <= default_block_size) ? n : default_block_size; + if (size > default_reduce_block_size) { + const auto n = ceildiv(size, default_reduce_block_size); + grid_dim = + (n <= default_reduce_block_size) ? n : default_reduce_block_size; block_results.resize_and_reset(grid_dim); - hipLaunchKernelGGL( - reduce_add_array, dim3(grid_dim), dim3(default_block_size), 0, 0, - size, as_hip_type(source), as_hip_type(block_results.get_data())); + hipLaunchKernelGGL(reduce_add_array, dim3(grid_dim), + dim3(default_reduce_block_size), 0, 0, size, + as_hip_type(source), + as_hip_type(block_results.get_data())); block_results_val = block_results.get_const_data(); } auto d_result = Array(exec, 1); - hipLaunchKernelGGL(reduce_add_array, dim3(1), dim3(default_block_size), 0, - 0, grid_dim, as_hip_type(block_results_val), + hipLaunchKernelGGL(reduce_add_array, dim3(1), + dim3(default_reduce_block_size), 0, 0, grid_dim, + as_hip_type(block_results_val), as_hip_type(d_result.get_data())); auto answer = exec->copy_val_to_host(d_result.get_const_data()); return answer; diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp index 56ed5c327b9..d4c815c9539 100644 --- a/hip/matrix/dense_kernels.hip.cpp +++ b/hip/matrix/dense_kernels.hip.cpp @@ -120,144 +120,6 @@ void apply(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); -template -void compute_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (hipblas::is_supported::value) { - // TODO: write a custom kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - hipblas::dot(exec->get_hipblas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); - } - } else { - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - constexpr int block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 
block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::compute_partial_dot), - dim3(grid_dim), dim3(block_dim), 0, 0, x->get_size()[0], - as_hip_type(x->get_const_values() + col), x->get_stride(), - as_hip_type(y->get_const_values() + col), y->get_stride(), - as_hip_type(work.get_data())); - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel::finalize_sum_reduce_computation), - dim3(1), dim3(block_dim), 0, 0, grid_dim.x, - as_hip_type(work.get_const_data()), - as_hip_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); - - -template -void compute_conj_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (hipblas::is_supported::value) { - // TODO: write a custom kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - hipblas::conj_dot(exec->get_hipblas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); - } - } else { - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - constexpr int block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::compute_partial_conj_dot), - dim3(grid_dim), dim3(block_dim), 0, 0, x->get_size()[0], - as_hip_type(x->get_const_values() + col), x->get_stride(), - as_hip_type(y->get_const_values() + col), y->get_stride(), - as_hip_type(work.get_data())); - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel::finalize_sum_reduce_computation), - dim3(1), dim3(block_dim), 0, 0, grid_dim.x, - as_hip_type(work.get_const_data()), - as_hip_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); - - -template -void compute_norm2(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - if (hipblas::is_supported::value) { - for (size_type col = 0; col < x->get_size()[1]; ++col) { - hipblas::norm2(exec->get_hipblas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - result->get_values() + col); - } - } else { - using norm_type = remove_complex; - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - constexpr int block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - hipLaunchKernelGGL( - 
HIP_KERNEL_NAME(kernel::compute_partial_norm2), - dim3(grid_dim), dim3(block_dim), 0, 0, x->get_size()[0], - as_hip_type(x->get_const_values() + col), x->get_stride(), - as_hip_type(work.get_data())); - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel::finalize_sqrt_reduce_computation), - dim3(1), dim3(block_dim), 0, 0, grid_dim.x, - as_hip_type(work.get_const_data()), - as_hip_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); - - template void convert_to_coo(std::shared_ptr exec, const matrix::Dense* source, diff --git a/hip/test/base/kernel_launch.hip.cpp b/hip/test/base/kernel_launch.hip.cpp index 55ddb3fd01e..c7add9ddca8 100644 --- a/hip/test/base/kernel_launch.hip.cpp +++ b/hip/test/base/kernel_launch.hip.cpp @@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "common/unified/base/kernel_launch_reduction.hpp" #include "common/unified/base/kernel_launch_solver.hpp" #include "core/test/utils.hpp" @@ -54,6 +55,7 @@ namespace { using gko::dim; +using gko::int64; using gko::size_type; using std::is_same; @@ -103,7 +105,7 @@ void run1d(std::shared_ptr exec, size_type dim, int* data) gko::kernels::hip::run_kernel( exec, [] GKO_KERNEL(auto i, auto d) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); d[i] = i; }, @@ -123,7 +125,7 @@ void run1d(std::shared_ptr exec, gko::Array& data) gko::kernels::hip::run_kernel( exec, [] GKO_KERNEL(auto i, auto d, auto d_ptr) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); if (d == d_ptr) { @@ -148,7 +150,7 @@ void run1d(std::shared_ptr exec, gko::matrix::Dense<>* m) gko::kernels::hip::run_kernel( exec, [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); @@ -184,8 +186,8 @@ void run2d(std::shared_ptr exec, int* data) gko::kernels::hip::run_kernel( exec, [] GKO_KERNEL(auto i, auto j, auto d) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); d[i + 4 * j] = 4 * i + j; }, @@ -205,8 +207,8 @@ void run2d(std::shared_ptr exec, gko::Array& data) gko::kernels::hip::run_kernel( exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); if (d == d_ptr) { @@ -233,7 +235,7 @@ void run2d(std::shared_ptr exec, gko::matrix::Dense<>* m1, exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3, auto d4, auto d2_ptr, auto d3_ptr) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); @@ -274,4 +276,141 @@ TEST_F(KernelLaunch, Runs2DDense) } +void run1d_reduction(std::shared_ptr exec) +{ + gko::Array output{exec, 1}; + + gko::kernels::hip::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); 
+            return i + 1;
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(),
+        size_type{100000}, output);
+
+    // 2 * sum i=0...99999 (i+1)
+    ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000LL);
+
+    gko::kernels::hip::run_kernel_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto a) {
+            static_assert(is_same::value, "index");
+            static_assert(is_same::value, "value");
+            return i + 1;
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(),
+        size_type{100}, output);
+
+    // 2 * sum i=0...99 (i+1)
+    ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100LL);
+}
+
+TEST_F(KernelLaunch, Reduction1D) { run1d_reduction(exec); }
+
+
+void run2d_reduction(std::shared_ptr exec)
+{
+    gko::Array output{exec, 1};
+
+    gko::kernels::hip::run_kernel_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto a) {
+            static_assert(is_same::value, "index");
+            static_assert(is_same::value, "value");
+            return (i + 1) * (j + 1);
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(),
+        gko::dim<2>{1000, 100}, output);
+
+    // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1)
+    ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000LL);
+
+    gko::kernels::hip::run_kernel_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto a) {
+            static_assert(is_same::value, "index");
+            static_assert(is_same::value, "value");
+            return (i + 1) * (j + 1);
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(),
+        gko::dim<2>{10, 10}, output);
+
+    // 4 * sum i=0...9 sum j=0...9 of (i+1)*(j+1)
+    ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100LL);
+}
+
+TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); }
+
+
+void run2d_row_reduction(std::shared_ptr exec)
+{
+    int num_rows = 1000;
+    int num_cols = 100;
+    gko::Array host_ref{exec->get_master(),
+                               static_cast(2 * num_rows)};
+    std::fill_n(host_ref.get_data(), 2 * num_rows, 1234);
+    gko::Array output{exec, host_ref};
+    for (int i = 0; i < num_rows; i++) {
+        // we are computing 2 * sum {j=0, j<num_cols} of (i+1)*(j+1)
+        host_ref.get_data()[2 * i] = num_cols * (num_cols + 1) * (i + 1);
+    }
+
+    gko::kernels::hip::run_kernel_row_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto a) {
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            static_assert(is_same::value, "value");
+            return (i + 1) * (j + 1);
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2,
+        gko::dim<2>{static_cast(num_rows),
+                    static_cast(num_cols)},
+        output);
+
+    GKO_ASSERT_ARRAY_EQ(host_ref, output);
+}
+
+TEST_F(KernelLaunch, ReductionRow2D) { run2d_row_reduction(exec); }
+
+
+void run2d_col_reduction(std::shared_ptr exec)
+{
+    int num_rows = 1000;
+    int num_cols = 100;
+    gko::Array host_ref{exec->get_master(),
+                               static_cast(num_cols)};
+    gko::Array output{exec, static_cast(num_cols)};
+    for (int i = 0; i < num_cols; i++) {
+        // we are computing 2 * sum {j=0, j<num_rows} of (i+1)*(j+1)
+        host_ref.get_data()[i] = num_rows * (num_rows + 1) * (i + 1);
+    }
+
+    gko::kernels::hip::run_kernel_col_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto a) {
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            static_assert(is_same::value, "value");
+            return (i + 1) * (j + 1);
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(),
+        gko::dim<2>{static_cast(num_rows),
+                    static_cast(num_cols)},
+        output);
+
+    GKO_ASSERT_ARRAY_EQ(host_ref, output);
+}
+
+TEST_F(KernelLaunch, ReductionCol2D) { run2d_col_reduction(exec); }
+
+
 } // namespace
diff --git a/hip/test/base/kernel_launch.hip.cpp b/omp/base/kernel_launch.hpp
index 7df6ff4c313..b6ef373fdee 100644
--- a/omp/base/kernel_launch.hpp
+++ 
b/omp/base/kernel_launch.hpp @@ -36,108 +36,101 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#include "core/synthesizer/implementation_selection.hpp" + + namespace gko { namespace kernels { namespace omp { -template -void run_kernel(std::shared_ptr exec, KernelFunction fn, - size_type size, KernelArgs&&... args) -{ -#pragma omp parallel for - for (size_type i = 0; i < size; i++) { - [&]() { fn(i, map_to_device(args)...); }(); - } -} +namespace { -template -void run_kernel_fixed_cols_impl(std::shared_ptr exec, - KernelFunction fn, dim<2> size, - MappedKernelArgs... args) + +template +void run_kernel_impl(std::shared_ptr exec, KernelFunction fn, + size_type size, MappedKernelArgs... args) { - const auto rows = size[0]; #pragma omp parallel for - for (size_type row = 0; row < rows; row++) { -#pragma unroll - for (size_type col = 0; col < cols; col++) { - [&]() { fn(row, col, args...); }(); - } + for (int64 i = 0; i < static_cast(size); i++) { + [&]() { fn(i, args...); }(); } } -template -void run_kernel_blocked_cols_impl(std::shared_ptr exec, - KernelFunction fn, dim<2> size, - MappedKernelArgs... args) + +template +void run_kernel_sized_impl(syn::value_list, + std::shared_ptr exec, + KernelFunction fn, dim<2> size, + MappedKernelArgs... args) { + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); static_assert(remainder_cols < block_size, "remainder too large"); - const auto rows = size[0]; - const auto cols = size[1]; const auto rounded_cols = cols / block_size * block_size; GKO_ASSERT(rounded_cols + remainder_cols == cols); + if (rounded_cols == 0 || cols == block_size) { + // we group all sizes <= block_size here and unroll explicitly + constexpr auto local_cols = + remainder_cols == 0 ? block_size : remainder_cols; #pragma omp parallel for - for (size_type row = 0; row < rows; row++) { - for (size_type base_col = 0; base_col < rounded_cols; - base_col += block_size) { + for (int64 row = 0; row < rows; row++) { #pragma unroll - for (size_type i = 0; i < block_size; i++) { - [&]() { fn(row, base_col + i, args...); }(); + for (int64 col = 0; col < local_cols; col++) { + [&]() { fn(row, col, args...); }(); } } + } else { + // we operate in block_size blocks plus an explicitly unrolled remainder +#pragma omp parallel for + for (int64 row = 0; row < rows; row++) { + for (int64 base_col = 0; base_col < rounded_cols; + base_col += block_size) { +#pragma unroll + for (int64 i = 0; i < block_size; i++) { + [&]() { fn(row, base_col + i, args...); }(); + } + } #pragma unroll - for (size_type i = 0; i < remainder_cols; i++) { - [&]() { fn(row, rounded_cols + i, args...); }(); + for (int64 i = 0; i < remainder_cols; i++) { + [&]() { fn(row, rounded_cols + i, args...); }(); + } } } } +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_sized, + run_kernel_sized_impl) + + template void run_kernel_impl(std::shared_ptr exec, KernelFunction fn, dim<2> size, MappedKernelArgs... 
args) { - const auto rows = size[0]; - const auto cols = size[1]; - constexpr size_type block_size = 4; + const auto cols = static_cast(size[1]); + constexpr int block_size = 8; + using remainders = syn::as_list>; + if (cols <= 0) { return; } - if (cols == 1) { - run_kernel_fixed_cols_impl<1>(exec, fn, size, args...); - return; - } - if (cols == 2) { - run_kernel_fixed_cols_impl<2>(exec, fn, size, args...); - return; - } - if (cols == 3) { - run_kernel_fixed_cols_impl<3>(exec, fn, size, args...); - return; - } - if (cols == 4) { - run_kernel_fixed_cols_impl<4>(exec, fn, size, args...); - return; - } - const auto rem_cols = cols % block_size; - if (rem_cols == 0) { - run_kernel_blocked_cols_impl<0, block_size>(exec, fn, size, args...); - return; - } - if (rem_cols == 1) { - run_kernel_blocked_cols_impl<1, block_size>(exec, fn, size, args...); - return; - } - if (rem_cols == 2) { - run_kernel_blocked_cols_impl<2, block_size>(exec, fn, size, args...); - return; - } - if (rem_cols == 3) { - run_kernel_blocked_cols_impl<3, block_size>(exec, fn, size, args...); - return; - } - // should be unreachable - GKO_ASSERT(false); + select_run_kernel_sized( + remainders(), + [&](int remainder) { return remainder == cols % block_size; }, + syn::value_list(), syn::type_list<>(), exec, fn, size, + args...); +} + + +} // namespace + + +template +void run_kernel(std::shared_ptr exec, KernelFunction fn, + size_type size, KernelArgs&&... args) +{ + run_kernel_impl(exec, fn, size, map_to_device(args)...); } diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp new file mode 100644 index 00000000000..030da93c245 --- /dev/null +++ b/omp/base/kernel_launch_reduction.hpp @@ -0,0 +1,405 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ +#error \ + "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp" +#endif + + +#include + + +#include + + +namespace gko { +namespace kernels { +namespace omp { + + +// how many more reduction tasks we launch relative to the number of threads +constexpr int reduction_kernel_oversubscription = 4; + + +namespace { + + +template +void run_kernel_reduction_impl(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type size, + MappedKernelArgs... args) +{ + const auto num_threads = static_cast(omp_get_max_threads()); + const auto ssize = static_cast(size); + const auto work_per_thread = ceildiv(ssize, num_threads); + Array partial{exec, static_cast(num_threads)}; +#pragma omp parallel num_threads(num_threads) + { + const auto thread_id = omp_get_thread_num(); + const auto begin = thread_id * work_per_thread; + const auto end = std::min(ssize, begin + work_per_thread); + + auto local_partial = identity; + for (auto i = begin; i < end; i++) { + local_partial = op(local_partial, fn(i, map_to_device(args)...)); + } + partial.get_data()[thread_id] = local_partial; + } + *result = finalize(std::accumulate(partial.get_const_data(), + partial.get_const_data() + num_threads, + identity, op)); +} + + +template +void run_kernel_reduction_sized_impl(syn::value_list, + std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + MappedKernelArgs... args) +{ + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_threads = static_cast(omp_get_max_threads()); + const auto work_per_thread = ceildiv(rows, num_threads); + Array partial{exec, static_cast(num_threads)}; + static_assert(remainder_cols < block_size, "remainder too large"); + const auto rounded_cols = cols / block_size * block_size; + GKO_ASSERT(rounded_cols + remainder_cols == cols); +#pragma omp parallel + { + const auto thread_id = omp_get_thread_num(); + const auto begin = thread_id * work_per_thread; + const auto end = std::min(rows, begin + work_per_thread); + + auto local_partial = identity; + if (rounded_cols == 0 || cols == block_size) { + // we group all sizes <= block_size here and unroll explicitly + constexpr auto local_cols = + remainder_cols == 0 ? 
block_size : remainder_cols; + for (auto row = begin; row < end; row++) { +#pragma unroll + for (int64 col = 0; col < local_cols; col++) { + local_partial = op(local_partial, fn(row, col, args...)); + } + } + } else { + // we operate in block_size blocks plus an explicitly unrolled + // remainder + for (auto row = begin; row < end; row++) { + for (int64 base_col = 0; base_col < rounded_cols; + base_col += block_size) { +#pragma unroll + for (int64 i = 0; i < block_size; i++) { + local_partial = + op(local_partial, fn(row, base_col + i, args...)); + } + } +#pragma unroll + for (int64 i = 0; i < remainder_cols; i++) { + local_partial = + op(local_partial, fn(row, rounded_cols + i, args...)); + } + } + } + partial.get_data()[thread_id] = local_partial; + } + *result = finalize(std::accumulate(partial.get_const_data(), + partial.get_const_data() + num_threads, + identity, op)); +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_sized, + run_kernel_reduction_sized_impl) + + +} // namespace + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type size, + KernelArgs&&... args) +{ + run_kernel_reduction_impl(exec, fn, op, finalize, identity, result, size, + map_to_device(args)...); +} + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, KernelArgs&&... args) +{ + const auto cols = static_cast(size[1]); + constexpr int block_size = 8; + using remainders = syn::as_list>; + + if (cols <= 0) { + *result = identity; + return; + } + select_run_kernel_reduction_sized( + remainders(), + [&](int remainder) { return remainder == cols % block_size; }, + syn::value_list(), syn::type_list<>(), exec, fn, op, + finalize, identity, result, size, map_to_device(args)...); +} + + +namespace { + + +template +void run_kernel_row_reduction_impl(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type result_stride, + dim<2> size, MappedKernelArgs... 
args) +{ + constexpr int block_size = 8; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_threads = static_cast(omp_get_max_threads()); + if (rows <= 0) { + return; + } + // enough work to keep all threads busy or only very small reduction sizes + if (rows >= reduction_kernel_oversubscription * num_threads || + cols < rows) { +#pragma omp parallel for + for (int64 row = 0; row < rows; row++) { + [&]() { + auto partial = identity; + for (int64 col = 0; col < cols; col++) { + partial = op(partial, fn(row, col, args...)); + } + result[result_stride * row] = finalize(partial); + }(); + } + } else { + // small number of rows and large reduction sizes: do partial sum first + const auto work_per_thread = ceildiv(cols, num_threads); + Array partial{exec, + static_cast(rows * num_threads)}; +#pragma omp parallel num_threads(num_threads) + { + const auto thread_id = static_cast(omp_get_thread_num()); + const auto begin = thread_id * work_per_thread; + const auto end = std::min(begin + work_per_thread, cols); + for (int64 row = 0; row < rows; row++) { + auto local_partial = identity; + for (int64 col = begin; col < end; col++) { + local_partial = op(local_partial, [&]() { + return fn(row, col, args...); + }()); + } + partial.get_data()[row * num_threads + thread_id] = + local_partial; + } + } + // then accumulate the partial sums and write to result +#pragma omp parallel for + for (int64 row = 0; row < rows; row++) { + [&] { + auto local_partial = identity; + for (int64 thread_id = 0; thread_id < num_threads; + thread_id++) { + local_partial = op( + local_partial, + partial + .get_const_data()[row * num_threads + thread_id]); + } + result[row * result_stride] = finalize(local_partial); + }(); + } + } +} + + +template +void run_kernel_col_reduction_sized_block_impl( + KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, + ValueType* result, int64 row_begin, int64 row_end, int64 base_col, + MappedKernelArgs... args) +{ + std::array partial; + partial.fill(identity); + for (auto row = row_begin; row < row_end; row++) { +#pragma unroll + for (int64 rel_col = 0; rel_col < local_cols; rel_col++) { + partial[rel_col] = + op(partial[rel_col], fn(row, base_col + rel_col, args...)); + } + } +#pragma unroll + for (int64 rel_col = 0; rel_col < local_cols; rel_col++) { + result[base_col + rel_col] = finalize(partial[rel_col]); + } +} + + +template +void run_kernel_col_reduction_sized_impl( + syn::value_list, + std::shared_ptr exec, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, + MappedKernelArgs... 
args) +{ + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_threads = static_cast(omp_get_max_threads()); + static_assert(remainder_cols < block_size, "remainder too large"); + GKO_ASSERT(cols % block_size == remainder_cols); + const auto num_col_blocks = ceildiv(cols, block_size); + // enough work to keep all threads busy or only very small reduction sizes + if (cols >= reduction_kernel_oversubscription * num_threads || + rows < cols) { +#pragma omp parallel for + for (int64 col_block = 0; col_block < num_col_blocks; col_block++) { + const auto base_col = col_block * block_size; + if (base_col + block_size <= cols) { + run_kernel_col_reduction_sized_block_impl( + fn, op, finalize, identity, result, 0, rows, base_col, + args...); + } else { + run_kernel_col_reduction_sized_block_impl( + fn, op, finalize, identity, result, 0, rows, base_col, + args...); + } + } + } else { + // number of blocks that need to be reduced afterwards + const auto reduction_size = + ceildiv(reduction_kernel_oversubscription * num_threads, cols); + const auto rows_per_thread = ceildiv(rows, reduction_size); + Array partial{exec, + static_cast(reduction_size * cols)}; +#pragma omp parallel for + for (int64 i = 0; i < reduction_size * num_col_blocks; i++) { + const auto col_block = i % num_col_blocks; + const auto row_block = i / num_col_blocks; + const auto begin = row_block * rows_per_thread; + const auto end = std::min(begin + rows_per_thread, rows); + const auto base_col = col_block * block_size; + const auto identity_fn = [](auto i) { return i; }; + if (base_col + block_size <= cols) { + run_kernel_col_reduction_sized_block_impl( + fn, op, identity_fn, identity, + partial.get_data() + cols * row_block, begin, end, base_col, + args...); + } else { + run_kernel_col_reduction_sized_block_impl( + fn, op, identity_fn, identity, + partial.get_data() + cols * row_block, begin, end, base_col, + args...); + } + } +#pragma omp parallel for + for (int64 col = 0; col < cols; col++) { + [&] { + auto total = identity; + for (int64 row_block = 0; row_block < reduction_size; + row_block++) { + total = + op(total, + partial.get_const_data()[col + cols * row_block]); + } + result[col] = finalize(total); + }(); + } + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_col_reduction_sized, + run_kernel_col_reduction_sized_impl) + + +} // namespace + + +template +void run_kernel_row_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type result_stride, + dim<2> size, MappedKernelArgs... args) +{ + run_kernel_row_reduction_impl(exec, fn, op, finalize, identity, result, + result_stride, size, map_to_device(args)...); +} + + +template +void run_kernel_col_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + KernelArgs&&... 
args) +{ + constexpr auto block_size = 8; + using remainders = syn::as_list>; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + if (cols <= 0) { + return; + } + select_run_kernel_col_reduction_sized( + remainders(), + [&](int remainder) { return remainder == cols % block_size; }, + syn::value_list(), syn::type_list<>(), exec, fn, op, + finalize, identity, result, size, map_to_device(args)...); +} + + +} // namespace omp +} // namespace kernels +} // namespace gko diff --git a/omp/base/kernel_launch_solver.hpp b/omp/base/kernel_launch_solver.hpp index dd85ba21915..b5c936c847b 100644 --- a/omp/base/kernel_launch_solver.hpp +++ b/omp/base/kernel_launch_solver.hpp @@ -43,7 +43,7 @@ namespace omp { template typename device_unpack_solver_impl::type>::type -map_to_device_solver(T&& param, size_type default_stride) +map_to_device_solver(T&& param, int64 default_stride) { return device_unpack_solver_impl::type>:: unpack(to_device_type_impl::map_to_device(param), default_stride); @@ -55,8 +55,9 @@ void run_kernel_solver(std::shared_ptr exec, KernelFunction fn, dim<2> size, size_type default_stride, KernelArgs&&... args) { - run_kernel_impl(exec, fn, size, - map_to_device_solver(args, default_stride)...); + run_kernel_impl( + exec, fn, size, + map_to_device_solver(args, static_cast(default_stride))...); } diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp index c0e4ca75ae3..b9df5fddf24 100644 --- a/omp/matrix/dense_kernels.cpp +++ b/omp/matrix/dense_kernels.cpp @@ -127,73 +127,6 @@ void apply(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); -template -void compute_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ -#pragma omp parallel for - for (size_type j = 0; j < x->get_size()[1]; ++j) { - result->at(0, j) = zero(); - } -#pragma omp parallel for - for (size_type j = 0; j < x->get_size()[1]; ++j) { - for (size_type i = 0; i < x->get_size()[0]; ++i) { - result->at(0, j) += x->at(i, j) * y->at(i, j); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); - - -template -void compute_conj_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ -#pragma omp parallel for - for (size_type j = 0; j < x->get_size()[1]; ++j) { - result->at(0, j) = zero(); - } -#pragma omp parallel for - for (size_type j = 0; j < x->get_size()[1]; ++j) { - for (size_type i = 0; i < x->get_size()[0]; ++i) { - result->at(0, j) += conj(x->at(i, j)) * y->at(i, j); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); - - -template -void compute_norm2(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - using norm_type = remove_complex; -#pragma omp parallel for - for (size_type j = 0; j < x->get_size()[1]; ++j) { - result->at(0, j) = zero(); - } -#pragma omp parallel for - for (size_type j = 0; j < x->get_size()[1]; ++j) { - for (size_type i = 0; i < x->get_size()[0]; ++i) { - result->at(0, j) += squared_norm(x->at(i, j)); - } - } -#pragma omp parallel for - for (size_type j = 0; j < x->get_size()[1]; ++j) { - result->at(0, j) = sqrt(result->at(0, j)); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); - - template void convert_to_coo(std::shared_ptr exec, const matrix::Dense* source, diff --git a/omp/test/base/kernel_launch.cpp b/omp/test/base/kernel_launch.cpp index 2c4712cfa52..f2c6a7e1465 
100644 --- a/omp/test/base/kernel_launch.cpp +++ b/omp/test/base/kernel_launch.cpp @@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "common/unified/base/kernel_launch_reduction.hpp" #include "common/unified/base/kernel_launch_solver.hpp" #include "core/test/utils.hpp" @@ -54,6 +55,7 @@ namespace { using gko::dim; +using gko::int64; using gko::size_type; using std::is_same; @@ -96,7 +98,7 @@ TEST_F(KernelLaunch, Runs1D) gko::kernels::omp::run_kernel( exec, [] GKO_KERNEL(auto i, auto d) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); d[i] = i; }, @@ -111,7 +113,7 @@ TEST_F(KernelLaunch, Runs1DArray) gko::kernels::omp::run_kernel( exec, [] GKO_KERNEL(auto i, auto d, auto d_ptr) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); if (d == d_ptr) { @@ -131,7 +133,7 @@ TEST_F(KernelLaunch, Runs1DDense) gko::kernels::omp::run_kernel( exec, [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); @@ -163,8 +165,8 @@ TEST_F(KernelLaunch, Runs2D) gko::kernels::omp::run_kernel( exec, [] GKO_KERNEL(auto i, auto j, auto d) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); d[i + 4 * j] = 4 * i + j; }, @@ -179,8 +181,8 @@ TEST_F(KernelLaunch, Runs2DArray) gko::kernels::omp::run_kernel( exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); if (d == d_ptr) { @@ -201,7 +203,7 @@ TEST_F(KernelLaunch, Runs2DDense) exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3, auto d4, auto d2_ptr, auto d3_ptr) { - static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); @@ -238,5 +240,204 @@ TEST_F(KernelLaunch, Runs2DDense) GKO_ASSERT_MTX_NEAR(zero_dense2, iota_dense, 0.0); } +TEST_F(KernelLaunch, Reduction1D) +{ + gko::Array output{exec, 1}; + + gko::kernels::omp::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), + size_type{100000}, output); + + // 2 * sum i=0...99999 (i+1) + ASSERT_EQ(*output.get_const_data(), 10000100000LL); + + gko::kernels::omp::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), + size_type{10}, output); + + // 2 * sum i=0...9 (i+1) + ASSERT_EQ(*output.get_const_data(), 110LL); +} + + +TEST_F(KernelLaunch, Reduction2DSmallRows) +{ + gko::Array output{exec, 1}; + + for (int cols = 0; cols < 17; cols++) { + 
gko::kernels::omp::run_kernel_reduction(
+            exec,
+            [] GKO_KERNEL(auto i, auto j, auto a) {
+                static_assert(is_same::value, "index");
+                static_assert(is_same::value, "value");
+                return (i + 1) * (j + 1);
+            },
+            [] GKO_KERNEL(auto i, auto j) { return i + j; },
+            [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(),
+            gko::dim<2>{10, cols}, output);
+
+        // 4 * sum i=0...9 sum j=0...cols-1 of (i+1)*(j+1)
+        ASSERT_EQ(*output.get_const_data(), 110LL * cols * (cols + 1));
+    }
+}
+
+
+TEST_F(KernelLaunch, Reduction2DLargeRows)
+{
+    gko::Array output{exec, 1};
+
+    for (int cols = 0; cols < 17; cols++) {
+        gko::kernels::omp::run_kernel_reduction(
+            exec,
+            [] GKO_KERNEL(auto i, auto j, auto a) {
+                static_assert(is_same::value, "index");
+                static_assert(is_same::value, "value");
+                return (i + 1) * (j + 1);
+            },
+            [] GKO_KERNEL(auto i, auto j) { return i + j; },
+            [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(),
+            gko::dim<2>{1000, cols}, output);
+
+        // 4 * sum i=0...999 sum j=0...cols-1 of (i+1)*(j+1)
+        ASSERT_EQ(*output.get_const_data(), 1001000LL * cols * (cols + 1));
+    }
+}
+
+
+TEST_F(KernelLaunch, Reduction2D)
+{
+    gko::Array output{exec, 1};
+
+    gko::kernels::omp::run_kernel_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto a) {
+            static_assert(is_same::value, "index");
+            static_assert(is_same::value, "value");
+            return (i + 1) * (j + 1);
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(),
+        gko::dim<2>{1000, 100}, output);
+
+
+    // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1)
+    ASSERT_EQ(*output.get_const_data(), 10110100000LL);
+}
+
+
+TEST_F(KernelLaunch, ReductionRow2DSmall)
+{
+    // 4 rows, with oversubscription this means we use multiple threads per row
+    // if OMP_NUM_THREADS >= 2
+    int num_rows = 4;
+    int num_cols = 100;
+    gko::Array host_ref{exec->get_master(),
+                               static_cast(2 * num_rows)};
+    std::fill_n(host_ref.get_data(), 2 * num_rows, 1234);
+    gko::Array output{exec, host_ref};
+    for (int i = 0; i < num_rows; i++) {
+        // we are computing 2 * sum {j=0, j<num_cols} of (i+1)*(j+1)
+        host_ref.get_data()[2 * i] =
+            static_cast<int64>(num_cols) * (num_cols + 1) * (i + 1);
+    }
+
+    gko::kernels::omp::run_kernel_row_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto a) {
+            static_assert(is_same::value, "index");
+            static_assert(is_same::value, "value");
+            return (i + 1) * (j + 1);
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2,
+        gko::dim<2>{static_cast(num_rows),
+                    static_cast(num_cols)},
+        output);
+
+    GKO_ASSERT_ARRAY_EQ(host_ref, output);
+}
+
+
+TEST_F(KernelLaunch, ReductionRow2D)
+{
+    int num_rows = 1000;
+    int num_cols = 100;
+    gko::Array host_ref{exec->get_master(),
+                               static_cast(2 * num_rows)};
+    std::fill_n(host_ref.get_data(), 2 * num_rows, 1234);
+    gko::Array output{exec, host_ref};
+    for (int i = 0; i < num_rows; i++) {
+        // we are computing 2 * sum {j=0, j<num_cols} of (i+1)*(j+1)
+        host_ref.get_data()[2 * i] =
+            static_cast<int64>(num_cols) * (num_cols + 1) * (i + 1);
+    }
+
+    gko::kernels::omp::run_kernel_row_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto a) {
+            static_assert(is_same::value, "index");
+            static_assert(is_same::value, "value");
+            return (i + 1) * (j + 1);
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2,
+        gko::dim<2>{static_cast(num_rows),
+                    static_cast(num_cols)},
+        output);
+
+    GKO_ASSERT_ARRAY_EQ(host_ref, output);
+}
+
+
+TEST_F(KernelLaunch, ReductionCol2D)
+{
+    for (int num_rows : {0, 1, 10, 1000, 1000}) {
+        for (int num_cols :
+             {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 40, 100, 1000}) {
+            gko::Array host_ref{exec->get_master(),
+                                       static_cast(num_cols)};
+            gko::Array output{exec, static_cast(num_cols)};
+            for (int i = 0; i < num_cols; i++) {
+                // we are computing 2 * sum {j=0, j<num_rows} of (i+1)*(j+1)
+                host_ref.get_data()[i] =
+                    static_cast<int64>(num_rows) * (num_rows + 1) * (i + 1);
+            }
+
+            gko::kernels::omp::run_kernel_col_reduction(
+                exec,
+                [] GKO_KERNEL(auto i, auto j, auto a) {
+                    static_assert(is_same::value, "index");
+                    static_assert(is_same::value, "value");
+                    return (i + 1) * (j + 1);
+                },
+                [] GKO_KERNEL(auto i, auto j) { return i + j; },
+                [] GKO_KERNEL(auto j) { return j * 2; }, int64{},
+                output.get_data(),
+                gko::dim<2>{static_cast(num_rows),
+                            static_cast(num_cols)},
+                output);
+
+            GKO_ASSERT_ARRAY_EQ(host_ref, output);
+        }
+    }
+}
+
+
 } // namespace
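
Note: the hand-written compute_dot, compute_conj_dot, and compute_norm2 kernels deleted from hip/matrix/dense_kernels.hip.cpp and omp/matrix/dense_kernels.cpp are not re-added in any hunk shown here; presumably they are re-expressed on top of the new run_kernel_*_reduction interface in a part of the change that is not included above. As a rough, hypothetical illustration of how such a kernel maps onto the interface exercised by the tests (the accessor-style x(i, j) indexing for Dense arguments and the DefaultExecutor alias are assumptions, not something this diff shows), a column-wise dot product could be sketched as:

    // Hypothetical sketch only, not the PR's actual unified kernel.
    template <typename ValueType>
    void compute_dot_sketch(std::shared_ptr<const DefaultExecutor> exec,
                            const matrix::Dense<ValueType>* x,
                            const matrix::Dense<ValueType>* y,
                            matrix::Dense<ValueType>* result)
    {
        run_kernel_col_reduction(
            exec,
            // fn: the value contributed by entry (i, j)
            [] GKO_KERNEL(auto i, auto j, auto x, auto y) {
                return x(i, j) * y(i, j);
            },
            // op: combines two partial results
            [] GKO_KERNEL(auto a, auto b) { return a + b; },
            // finalize: applied once to each column's result
            [] GKO_KERNEL(auto a) { return a; },
            // identity, output (one value per column), domain, kernel args
            ValueType{}, result->get_values(), x->get_size(), x, y);
    }

The argument order (fn, op, finalize, identity, result, size, args...) follows the run_kernel_col_reduction signature introduced for OpenMP above and the calls in the HIP and OpenMP tests.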
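
The OpenMP row reduction added in omp/base/kernel_launch_reduction.hpp picks between two strategies: when there are at least reduction_kernel_oversubscription * num_threads rows, or each row is shorter than the number of rows, every thread reduces whole rows on its own; otherwise each row is split across all threads and the per-thread partial results are combined in a second pass. A minimal standalone sketch of that second strategy, using plain OpenMP, std::int64_t instead of gko::int64, and addition as the reduction operator, might look like this:

    #include <omp.h>

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Each thread reduces a contiguous slice of every row; a second pass
    // combines the per-(row, thread) partials, mirroring the "few rows,
    // long rows" branch of run_kernel_row_reduction_impl.
    int main()
    {
        const std::int64_t rows = 4;
        const std::int64_t cols = 1 << 20;
        const auto num_threads =
            static_cast<std::int64_t>(omp_get_max_threads());
        const auto work_per_thread = (cols + num_threads - 1) / num_threads;
        std::vector<std::int64_t> partial(rows * num_threads, 0);
        std::vector<std::int64_t> result(rows, 0);

    #pragma omp parallel
        {
            const std::int64_t tid = omp_get_thread_num();
            const auto begin = tid * work_per_thread;
            const auto end = std::min(cols, begin + work_per_thread);
            for (std::int64_t row = 0; row < rows; row++) {
                std::int64_t local = 0;  // identity of the reduction
                for (auto col = begin; col < end; col++) {
                    local += row + col;  // stand-in for fn(row, col, args...)
                }
                partial[row * num_threads + tid] = local;
            }
        }
        // second pass: accumulate the per-thread partials for each row
        for (std::int64_t row = 0; row < rows; row++) {
            for (std::int64_t tid = 0; tid < num_threads; tid++) {
                result[row] += partial[row * num_threads + tid];
            }
        }
        std::printf("row 0 sum: %lld\n", static_cast<long long>(result[0]));
        return 0;
    }

Splitting the columns first and reducing the per-thread partials afterwards keeps all threads busy even when the number of rows is far below the core count, at the cost of a rows * num_threads temporary, which is the same trade-off the Ginkgo implementation makes with its partial array.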