From 74e2b3111406867ea60bf7c71195303d5147747d Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Thu, 31 Mar 2022 12:59:46 +0200
Subject: [PATCH 1/5] unify HIP/CUDA common kernels

---
 common/cuda_hip/base/kernel_launch.hpp.inc    |  82 +++
 .../base/kernel_launch_reduction.hpp.inc      | 507 ++++++++++++++++++
 .../base/kernel_launch_solver.hpp.inc         |  61 +++
 cuda/base/kernel_launch.cuh                   |  51 +-
 cuda/base/kernel_launch_reduction.cuh         | 477 +---------------
 cuda/base/kernel_launch_solver.cuh            |  30 +-
 hip/base/kernel_launch.hip.hpp                |  53 +-
 hip/base/kernel_launch_reduction.hip.hpp      | 488 +----------------
 hip/base/kernel_launch_solver.hip.hpp         |  32 +-
 9 files changed, 656 insertions(+), 1125 deletions(-)
 create mode 100644 common/cuda_hip/base/kernel_launch.hpp.inc
 create mode 100644 common/cuda_hip/base/kernel_launch_reduction.hpp.inc
 create mode 100644 common/cuda_hip/base/kernel_launch_solver.hpp.inc

diff --git a/common/cuda_hip/base/kernel_launch.hpp.inc b/common/cuda_hip/base/kernel_launch.hpp.inc
new file mode 100644
index 00000000000..e4c74769fcf
--- /dev/null
+++ b/common/cuda_hip/base/kernel_launch.hpp.inc
@@ -0,0 +1,82 @@
/*************************************************************
Copyright (c) 2017-2022, the Ginkgo authors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************/

template <typename KernelFunction, typename... KernelArgs>
__global__ __launch_bounds__(default_block_size) void generic_kernel_1d(
    int64 size, KernelFunction fn, KernelArgs... args)
{
    auto tidx = thread::get_thread_id_flat<int64>();
    if (tidx >= size) {
        return;
    }
    fn(tidx, args...);
}


template <typename KernelFunction, typename... KernelArgs>
__global__ __launch_bounds__(default_block_size) void generic_kernel_2d(
    int64 rows, int64 cols, KernelFunction fn, KernelArgs... args)
{
    auto tidx = thread::get_thread_id_flat<int64>();
    auto col = tidx % cols;
    auto row = tidx / cols;
    if (row >= rows) {
        return;
    }
    fn(row, col, args...);
}


template <typename KernelFunction, typename... KernelArgs>
void run_kernel(std::shared_ptr<const DefaultExecutor> exec, KernelFunction fn,
                size_type size, KernelArgs&&... args)
{
    if (size > 0) {
        constexpr auto block_size = default_block_size;
        auto num_blocks = ceildiv(size, block_size);
        generic_kernel_1d<<<num_blocks, block_size>>>(
            static_cast<int64>(size), fn, map_to_device(args)...);
    }
}
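
// A minimal usage sketch for the run_kernel helper above (illustrative only,
// not part of the patched sources): an AXPY-like update y += alpha * x over
// n elements. The GKO_KERNEL annotation for the device lambda is assumed to
// come from Ginkgo's unified kernel headers; alpha, x and y stand for raw
// device pointers, which map_to_device forwards unchanged.
//
//     run_kernel(
//         exec,
//         [] GKO_KERNEL(auto i, auto alpha, auto x, auto y) {
//             y[i] += alpha[0] * x[i];
//         },
//         n, alpha, x, y);
//
// Each thread handles one flat index i; generic_kernel_1d returns early for
// out-of-range threads, so the grid size can be rounded up to whole blocks.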

template <typename KernelFunction, typename... KernelArgs>
void run_kernel(std::shared_ptr<const DefaultExecutor> exec, KernelFunction fn,
                dim<2> size, KernelArgs&&... args)
{
    if (size[0] > 0 && size[1] > 0) {
        constexpr auto block_size = default_block_size;
        auto num_blocks = ceildiv(size[0] * size[1], block_size);
        generic_kernel_2d<<<num_blocks, block_size>>>(
            static_cast<int64>(size[0]), static_cast<int64>(size[1]), fn,
            map_to_device(args)...);
    }
}

diff --git a/common/cuda_hip/base/kernel_launch_reduction.hpp.inc b/common/cuda_hip/base/kernel_launch_reduction.hpp.inc
new file mode 100644
index 00000000000..28c9d8ba4fd
--- /dev/null
+++ b/common/cuda_hip/base/kernel_launch_reduction.hpp.inc
@@ -0,0 +1,507 @@
/*************************************************************
Copyright (c) 2017-2022, the Ginkgo authors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*************************************************************/

template <typename ValueType, typename KernelFunction, typename ReductionOp,
          typename FinalizeOp, typename... KernelArgs>
__global__ __launch_bounds__(
    default_block_size) void generic_kernel_reduction_1d(int64 size,
                                                         KernelFunction fn,
                                                         ReductionOp op,
                                                         FinalizeOp finalize,
                                                         ValueType identity,
                                                         ValueType* storage,
                                                         KernelArgs... args)
{
    __shared__
        UninitializedArray<ValueType, default_block_size / config::warp_size>
            warp_partial;
    static_assert(default_block_size / config::warp_size <= config::warp_size,
                  "needs third reduction level");
    auto tidx = thread::get_thread_id_flat<int64>();
    auto grid_size = thread::get_thread_num_flat<int64>();
    auto warp =
        group::tiled_partition<config::warp_size>(group::this_thread_block());
    auto partial = identity;
    for (int64 i = tidx; i < size; i += grid_size) {
        partial = op(partial, fn(i, args...));
    }
    partial = reduce(warp, partial, op);
    if (warp.thread_rank() == 0) {
        warp_partial[threadIdx.x / config::warp_size] = partial;
    }
    __syncthreads();
    if (threadIdx.x < config::warp_size) {
        partial = reduce(warp,
                         threadIdx.x < default_block_size / config::warp_size
                             ? warp_partial[threadIdx.x]
                             : identity,
                         op);
        if (threadIdx.x == 0) {
            storage[blockIdx.x] = finalize(partial);
        }
    }
}
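
// Shape of the reduction above (and of its 2D counterpart below), as a
// step-by-step sketch:
// 1. thread level: each thread folds its grid-strided share of the input
//    into `partial` via `op`;
// 2. warp level: reduce(warp, partial, op) combines the partials of a warp,
//    and lane 0 stores the warp result to the shared array `warp_partial`;
// 3. block level: after __syncthreads(), the first warp reduces
//    `warp_partial`, and thread 0 writes finalize(partial) to
//    storage[blockIdx.x], leaving one value per block.
// The static_assert above guarantees that the per-block warp count fits
// into a single warp, so these two in-block levels are sufficient.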


template <typename ValueType, typename KernelFunction, typename ReductionOp,
          typename FinalizeOp, typename... KernelArgs>
__global__ __launch_bounds__(
    default_block_size) void generic_kernel_reduction_2d(int64 rows, int64 cols,
                                                         KernelFunction fn,
                                                         ReductionOp op,
                                                         FinalizeOp finalize,
                                                         ValueType identity,
                                                         ValueType* storage,
                                                         KernelArgs... args)
{
    __shared__
        UninitializedArray<ValueType, default_block_size / config::warp_size>
            warp_partial;
    static_assert(default_block_size / config::warp_size <= config::warp_size,
                  "needs third reduction level");
    auto tidx = thread::get_thread_id_flat<int64>();
    auto grid_size = thread::get_thread_num_flat<int64>();
    auto warp =
        group::tiled_partition<config::warp_size>(group::this_thread_block());
    auto partial = identity;
    for (int64 i = tidx; i < rows * cols; i += grid_size) {
        const auto row = i / cols;
        const auto col = i % cols;
        partial = op(partial, fn(row, col, args...));
    }
    partial = reduce(warp, partial, op);
    if (warp.thread_rank() == 0) {
        warp_partial[threadIdx.x / config::warp_size] = partial;
    }
    __syncthreads();
    if (threadIdx.x < config::warp_size) {
        partial = reduce(warp,
                         threadIdx.x < default_block_size / config::warp_size
                             ? warp_partial[threadIdx.x]
                             : identity,
                         op);
        if (threadIdx.x == 0) {
            storage[blockIdx.x] = finalize(partial);
        }
    }
}


template <typename ValueType, typename KernelFunction, typename ReductionOp,
          typename FinalizeOp, typename... KernelArgs>
void run_kernel_reduction(std::shared_ptr<const DefaultExecutor> exec,
                          KernelFunction fn, ReductionOp op,
                          FinalizeOp finalize, ValueType identity,
                          ValueType* result, size_type size,
                          KernelArgs&&... args)
{
    constexpr int oversubscription = 16;
    constexpr auto block_size = default_block_size;
    const auto num_blocks = std::min<int64>(
        ceildiv(size, block_size), exec->get_num_warps() * oversubscription);
    if (num_blocks > 1) {
        Array<ValueType> partial{exec, static_cast<size_type>(num_blocks)};
        generic_kernel_reduction_1d<<<num_blocks, block_size>>>(
            static_cast<int64>(size), fn, op,
            [] __device__(auto v) { return v; }, as_device_type(identity),
            as_device_type(partial.get_data()), map_to_device(args)...);
        generic_kernel_reduction_1d<<<1, block_size>>>(
            static_cast<int64>(num_blocks),
            [] __device__(auto i, auto v) { return v[i]; }, op, finalize,
            as_device_type(identity), as_device_type(result),
            as_device_type(partial.get_const_data()));
    } else {
        generic_kernel_reduction_1d<<<1, block_size>>>(
            static_cast<int64>(size), fn, op, finalize,
            as_device_type(identity), as_device_type(result),
            map_to_device(args)...);
    }
}


template <typename ValueType, typename KernelFunction, typename ReductionOp,
          typename FinalizeOp, typename... KernelArgs>
void run_kernel_reduction(std::shared_ptr<const DefaultExecutor> exec,
                          KernelFunction fn, ReductionOp op,
                          FinalizeOp finalize, ValueType identity,
                          ValueType* result, dim<2> size, KernelArgs&&... 
args) +{ + constexpr int oversubscription = 16; + constexpr auto block_size = default_block_size; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_blocks = + std::min(ceildiv(rows * cols, block_size), + exec->get_num_warps() * oversubscription); + if (num_blocks > 1) { + Array partial{exec, static_cast(num_blocks)}; + generic_kernel_reduction_2d<<>>( + rows, cols, fn, op, [] __device__(auto v) { return v; }, + as_device_type(identity), as_device_type(partial.get_data()), + map_to_device(args)...); + generic_kernel_reduction_1d<<<1, block_size>>>( + static_cast(num_blocks), + [] __device__(auto i, auto v) { return v[i]; }, op, finalize, + as_device_type(identity), as_device_type(result), + as_device_type(partial.get_const_data())); + } else { + generic_kernel_reduction_2d<<<1, block_size>>>( + rows, cols, fn, op, finalize, as_device_type(identity), + as_device_type(result), map_to_device(args)...); + } +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( + int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, ValueType identity, + ValueType* result, int64 result_stride, KernelArgs... args) +{ + const auto idx = thread::get_subwarp_id_flat(); + const auto row = idx % rows; + const auto col_block = idx / rows; + if (col_block >= col_blocks) { + return; + } + const auto cols_per_part = + ceildiv(ceildiv(cols, subwarp_size), col_blocks) * subwarp_size; + const auto begin = cols_per_part * col_block; + const auto end = min(begin + cols_per_part, cols); + auto subwarp = + group::tiled_partition(group::this_thread_block()); + auto partial = identity; + for (auto col = begin + subwarp.thread_rank(); col < end; + col += subwarp_size) { + partial = op(partial, fn(row, col, args...)); + } + partial = reduce(subwarp, partial, op); + if (subwarp.thread_rank() == 0) { + result[(row + col_block * rows) * result_stride] = finalize(partial); + } +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, ValueType* result, + KernelArgs... 
args) +{ + constexpr auto warp_size = config::warp_size; + constexpr auto warps_per_block = default_block_size / warp_size; + // stores the subwarp_size partial sums from each warp, grouped by warp + constexpr auto shared_storage = warps_per_block * subwarp_size; + __shared__ UninitializedArray block_partial; + const auto subwarp_id = thread::get_subwarp_id_flat(); + const auto local_warp_id = threadIdx.x / warp_size; + const auto local_subwarp_id = threadIdx.x % warp_size / subwarp_size; + const auto subwarp_num = + thread::get_subwarp_num_flat(); + const auto block = group::this_thread_block(); + const auto warp = group::tiled_partition(block); + const auto warp_rank = warp.thread_rank(); + const auto subwarp_rank = warp_rank % subwarp_size; + const auto col = static_cast(subwarp_rank); + auto partial = identity; + // accumulate within a thread + if (col < cols) { + for (auto row = subwarp_id; row < rows; row += subwarp_num) { + partial = op(partial, fn(row, col, args...)); + } + } + // accumulate between all subwarps in the warp +#pragma unroll + for (unsigned i = subwarp_size; i < warp_size; i *= 2) { + partial = op(partial, warp.shfl_xor(partial, i)); + } // store the result to shared memory + if (local_subwarp_id == 0) { + block_partial[local_warp_id * subwarp_size + subwarp_rank] = partial; + } + block.sync(); + // in a single thread: accumulate the results + if (local_warp_id == 0) { + partial = identity; + // accumulate the partial results within a thread + if (shared_storage >= warp_size) { +#pragma unroll + for (int i = 0; i < shared_storage; i += warp_size) { + partial = op(partial, block_partial[i + warp_rank]); + } + } else if (warp_rank < shared_storage) { + partial = op(partial, block_partial[warp_rank]); + } + // accumulate between all subwarps in the warp +#pragma unroll + for (unsigned i = subwarp_size; i < warp_size; i *= 2) { + partial = op(partial, warp.shfl_xor(partial, i)); + } + if (warp_rank < cols) { + result[warp_rank + blockIdx.x * cols] = finalize(partial); + } + } +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, ValueType* result, + KernelArgs... 
args) +{ + constexpr auto warp_size = config::warp_size; + __shared__ UninitializedArray block_partial; + const auto warp_id = thread::get_subwarp_id_flat(); + const auto warp_num = thread::get_subwarp_num_flat(); + const auto block = group::this_thread_block(); + const auto warp = group::tiled_partition(block); + const auto warp_rank = warp.thread_rank(); + const auto col = warp_rank + static_cast(blockIdx.y) * warp_size; + auto partial = identity; + // accumulate within a thread + if (col < cols) { + for (auto row = warp_id; row < rows; row += warp_num) { + partial = op(partial, fn(row, col, args...)); + } + } + block_partial[threadIdx.x] = partial; + block.sync(); + // in a single warp: accumulate the results + if (threadIdx.x < warp_size) { + partial = identity; + // accumulate the partial results within a thread +#pragma unroll + for (int i = 0; i < default_block_size; i += warp_size) { + partial = op(partial, block_partial[i + warp_rank]); + } + if (col < cols) { + result[col + blockIdx.x * cols] = finalize(partial); + } + } +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( + int64 num_results, int64 num_blocks, ReductionOp op, + FinalizeOp finalize, ValueType identity, const ValueType* input, + int64 result_stride, ValueType* result) +{ + const auto idx = thread::get_thread_id_flat(); + if (idx >= num_results) { + return; + } + auto partial = identity; + for (int64 block = 0; block < num_blocks; block++) { + partial = op(partial, input[idx + block * num_results]); + } + result[idx * result_stride] = finalize(partial); +} + + +namespace { + + +template +void run_generic_kernel_row_reduction(syn::value_list, + int64 rows, int64 cols, int64 col_blocks, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, int64 result_stride, + KernelArgs... args) +{ + const auto num_blocks = + ceildiv(rows * col_blocks * subwarp_size, default_block_size); + if (num_blocks > 0) { + generic_kernel_row_reduction_2d + <<>>( + rows, cols, col_blocks, fn, op, finalize, + as_device_type(identity), as_device_type(result), result_stride, + args...); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, + run_generic_kernel_row_reduction); + + +template +void run_generic_col_reduction_small( + syn::value_list, int64 max_blocks, + std::shared_ptr exec, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, + dim<2> size, MappedKernelArgs... 
args) +{ + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_blocks = std::min( + ceildiv(rows * subwarp_size, default_block_size), max_blocks); + if (num_blocks <= 1) { + generic_kernel_col_reduction_2d_small + <<<1, default_block_size>>>(rows, cols, fn, op, finalize, + as_device_type(identity), + as_device_type(result), args...); + } else { + Array tmp_storage{exec, + static_cast(num_blocks * cols)}; + generic_kernel_col_reduction_2d_small + <<>>( + rows, cols, fn, op, [] __device__(auto v) { return v; }, + as_device_type(identity), + as_device_type(tmp_storage.get_data()), args...); + if (cols > 0) { + generic_kernel_reduction_finalize_2d<<< + ceildiv(cols, default_block_size), default_block_size>>>( + cols, num_blocks, op, finalize, as_device_type(identity), + as_device_type(tmp_storage.get_const_data()), 1, + as_device_type(result)); + } + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_col_reduction_small, + run_generic_col_reduction_small); + + +} // namespace + + +template +void run_kernel_row_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type result_stride, + dim<2> size, KernelArgs&&... args) +{ + using subwarp_sizes = + syn::value_list; + constexpr int oversubscription = 16; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto resources = + exec->get_num_warps() * config::warp_size * oversubscription; + if (rows * cols > resources && rows < cols) { + const auto col_blocks = ceildiv(rows * cols, resources); + Array partial{exec, + static_cast(col_blocks * rows)}; + const auto num_blocks = + ceildiv(rows * col_blocks * config::warp_size, default_block_size); + // no need to guard this kernel, as rows * cols > resources + generic_kernel_row_reduction_2d + <<>>( + rows, cols, col_blocks, fn, op, + [] __device__(auto v) { return v; }, as_device_type(identity), + as_device_type(partial.get_data()), 1, map_to_device(args)...); + const auto num_finalize_blocks = ceildiv(rows, default_block_size); + generic_kernel_reduction_finalize_2d<<>>( + rows, col_blocks, op, finalize, as_device_type(identity), + as_device_type(partial.get_const_data()), + static_cast(result_stride), as_device_type(result)); + } else { + select_run_generic_kernel_row_reduction( + subwarp_sizes(), + [cols](int compiled_subwarp_size) { + return compiled_subwarp_size >= cols || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), rows, cols, 1, fn, op, + finalize, identity, result, static_cast(result_stride), + map_to_device(args)...); + } +} + + +template +void run_kernel_col_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + KernelArgs&&... 
args) +{ + using subwarp_sizes = + syn::value_list; + constexpr int oversubscription = 16; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto max_blocks = exec->get_num_warps() * config::warp_size * + oversubscription / default_block_size; + if (cols <= config::warp_size) { + select_generic_col_reduction_small( + subwarp_sizes(), + [cols](int compiled_subwarp_size) { + return compiled_subwarp_size >= cols || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), max_blocks, exec, fn, + op, finalize, identity, result, size, map_to_device(args)...); + } else { + const auto col_blocks = ceildiv(cols, config::warp_size); + const auto row_blocks = + ceildiv(std::min( + ceildiv(rows * config::warp_size, default_block_size), + max_blocks), + col_blocks); + if (row_blocks <= 1) { + generic_kernel_col_reduction_2d_blocked<<>>( + rows, cols, fn, op, finalize, as_device_type(identity), + as_device_type(result), map_to_device(args)...); + } else { + Array tmp_storage{ + exec, static_cast(row_blocks * cols)}; + // no need to guard this kernel, as cols > warp_size, row_blocks > 1 + generic_kernel_col_reduction_2d_blocked<<< + dim3(row_blocks, col_blocks), default_block_size>>>( + rows, cols, fn, op, [] __device__(auto v) { return v; }, + as_device_type(identity), + as_device_type(tmp_storage.get_data()), map_to_device(args)...); + generic_kernel_reduction_finalize_2d<<< + ceildiv(cols, default_block_size), default_block_size>>>( + cols, row_blocks, op, finalize, as_device_type(identity), + as_device_type(tmp_storage.get_const_data()), 1, + as_device_type(result)); + } + } +} diff --git a/common/cuda_hip/base/kernel_launch_solver.hpp.inc b/common/cuda_hip/base/kernel_launch_solver.hpp.inc new file mode 100644 index 00000000000..0d7a0cbce5d --- /dev/null +++ b/common/cuda_hip/base/kernel_launch_solver.hpp.inc @@ -0,0 +1,61 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +template +__global__ __launch_bounds__(default_block_size) void generic_kernel_2d_solver( + int64 rows, int64 cols, int64 default_stride, KernelFunction fn, + KernelArgs... args) +{ + auto tidx = thread::get_thread_id_flat(); + auto col = tidx % cols; + auto row = tidx / cols; + if (row >= rows) { + return; + } + fn(row, col, + device_unpack_solver_impl::unpack(args, default_stride)...); +} + + +template +void run_kernel_solver(std::shared_ptr exec, + KernelFunction fn, dim<2> size, size_type default_stride, + KernelArgs&&... args) +{ + if (size[0] > 0 && size[1] > 0) { + constexpr auto block_size = default_block_size; + auto num_blocks = ceildiv(size[0] * size[1], block_size); + generic_kernel_2d_solver<<>>( + static_cast(size[0]), static_cast(size[1]), + static_cast(default_stride), fn, map_to_device(args)...); + } +} diff --git a/cuda/base/kernel_launch.cuh b/cuda/base/kernel_launch.cuh index 749a9016282..257948b3d4d 100644 --- a/cuda/base/kernel_launch.cuh +++ b/cuda/base/kernel_launch.cuh @@ -54,56 +54,7 @@ namespace device_std = thrust; constexpr int default_block_size = 512; -template -__global__ __launch_bounds__(default_block_size) void generic_kernel_1d( - int64 size, KernelFunction fn, KernelArgs... args) -{ - auto tidx = thread::get_thread_id_flat(); - if (tidx >= size) { - return; - } - fn(tidx, args...); -} - - -template -__global__ __launch_bounds__(default_block_size) void generic_kernel_2d( - int64 rows, int64 cols, KernelFunction fn, KernelArgs... args) -{ - auto tidx = thread::get_thread_id_flat(); - auto col = tidx % cols; - auto row = tidx / cols; - if (row >= rows) { - return; - } - fn(row, col, args...); -} - - -template -void run_kernel(std::shared_ptr exec, KernelFunction fn, - size_type size, KernelArgs&&... args) -{ - if (size > 0) { - constexpr auto block_size = default_block_size; - auto num_blocks = ceildiv(size, block_size); - generic_kernel_1d<<>>( - static_cast(size), fn, map_to_device(args)...); - } -} - -template -void run_kernel(std::shared_ptr exec, KernelFunction fn, - dim<2> size, KernelArgs&&... args) -{ - if (size[0] > 0 && size[1] > 0) { - constexpr auto block_size = default_block_size; - auto num_blocks = ceildiv(size[0] * size[1], block_size); - generic_kernel_2d<<>>( - static_cast(size[0]), static_cast(size[1]), fn, - map_to_device(args)...); - } -} +#include "common/cuda_hip/base/kernel_launch.hpp.inc" } // namespace cuda diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh index db7040b07f5..e6da25a1706 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -48,482 +48,7 @@ namespace kernels { namespace cuda { -template -__global__ __launch_bounds__( - default_block_size) void generic_kernel_reduction_1d(int64 size, - KernelFunction fn, - ReductionOp op, - FinalizeOp finalize, - ValueType identity, - ValueType* storage, - KernelArgs... 
args) -{ - __shared__ - UninitializedArray - warp_partial; - static_assert(default_block_size / config::warp_size <= config::warp_size, - "needs third reduction level"); - auto tidx = thread::get_thread_id_flat(); - auto grid_size = thread::get_thread_num_flat(); - auto warp = - group::tiled_partition(group::this_thread_block()); - auto partial = identity; - for (int64 i = tidx; i < size; i += grid_size) { - partial = op(partial, fn(i, args...)); - } - partial = reduce(warp, partial, op); - if (warp.thread_rank() == 0) { - warp_partial[threadIdx.x / config::warp_size] = partial; - } - __syncthreads(); - if (threadIdx.x < config::warp_size) { - partial = reduce(warp, - threadIdx.x < default_block_size / config::warp_size - ? warp_partial[threadIdx.x] - : identity, - op); - if (threadIdx.x == 0) { - storage[blockIdx.x] = finalize(partial); - } - } -} - - -template -__global__ __launch_bounds__( - default_block_size) void generic_kernel_reduction_2d(int64 rows, int64 cols, - KernelFunction fn, - ReductionOp op, - FinalizeOp finalize, - ValueType identity, - ValueType* storage, - KernelArgs... args) -{ - __shared__ - UninitializedArray - warp_partial; - static_assert(default_block_size / config::warp_size <= config::warp_size, - "needs third reduction level"); - auto tidx = thread::get_thread_id_flat(); - auto grid_size = thread::get_thread_num_flat(); - auto warp = - group::tiled_partition(group::this_thread_block()); - auto partial = identity; - for (int64 i = tidx; i < rows * cols; i += grid_size) { - const auto row = i / cols; - const auto col = i % cols; - partial = op(partial, fn(row, col, args...)); - } - partial = reduce(warp, partial, op); - if (warp.thread_rank() == 0) { - warp_partial[threadIdx.x / config::warp_size] = partial; - } - __syncthreads(); - if (threadIdx.x < config::warp_size) { - partial = reduce(warp, - threadIdx.x < default_block_size / config::warp_size - ? warp_partial[threadIdx.x] - : identity, - op); - if (threadIdx.x == 0) { - storage[blockIdx.x] = finalize(partial); - } - } -} - - -template -void run_kernel_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, size_type size, - KernelArgs&&... args) -{ - constexpr int oversubscription = 16; - constexpr auto block_size = default_block_size; - const auto num_blocks = std::min( - ceildiv(size, block_size), exec->get_num_warps() * oversubscription); - if (num_blocks > 1) { - Array partial{exec, static_cast(num_blocks)}; - generic_kernel_reduction_1d<<>>( - static_cast(size), fn, op, - [] __device__(auto v) { return v; }, as_cuda_type(identity), - as_cuda_type(partial.get_data()), map_to_device(args)...); - generic_kernel_reduction_1d<<<1, block_size>>>( - static_cast(num_blocks), - [] __device__(auto i, auto v) { return v[i]; }, op, finalize, - as_cuda_type(identity), as_cuda_type(result), - as_cuda_type(partial.get_const_data())); - } else { - generic_kernel_reduction_1d<<<1, block_size>>>( - static_cast(size), fn, op, finalize, as_cuda_type(identity), - as_cuda_type(result), map_to_device(args)...); - } -} - - -template -void run_kernel_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, dim<2> size, KernelArgs&&... 
args) -{ - constexpr int oversubscription = 16; - constexpr auto block_size = default_block_size; - const auto rows = static_cast(size[0]); - const auto cols = static_cast(size[1]); - const auto num_blocks = - std::min(ceildiv(rows * cols, block_size), - exec->get_num_warps() * oversubscription); - if (num_blocks > 1) { - Array partial{exec, static_cast(num_blocks)}; - generic_kernel_reduction_2d<<>>( - rows, cols, fn, op, [] __device__(auto v) { return v; }, - as_cuda_type(identity), as_cuda_type(partial.get_data()), - map_to_device(args)...); - generic_kernel_reduction_1d<<<1, block_size>>>( - static_cast(num_blocks), - [] __device__(auto i, auto v) { return v[i]; }, op, finalize, - as_cuda_type(identity), as_cuda_type(result), - as_cuda_type(partial.get_const_data())); - } else { - generic_kernel_reduction_2d<<<1, block_size>>>( - rows, cols, fn, op, finalize, as_cuda_type(identity), - as_cuda_type(result), map_to_device(args)...); - } -} - - -template -__global__ - __launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( - int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, - ReductionOp op, FinalizeOp finalize, ValueType identity, - ValueType* result, int64 result_stride, KernelArgs... args) -{ - const auto idx = thread::get_subwarp_id_flat(); - const auto row = idx % rows; - const auto col_block = idx / rows; - if (col_block >= col_blocks) { - return; - } - const auto cols_per_part = - ceildiv(ceildiv(cols, subwarp_size), col_blocks) * subwarp_size; - const auto begin = cols_per_part * col_block; - const auto end = min(begin + cols_per_part, cols); - auto subwarp = - group::tiled_partition(group::this_thread_block()); - auto partial = identity; - for (auto col = begin + subwarp.thread_rank(); col < end; - col += subwarp_size) { - partial = op(partial, fn(row, col, args...)); - } - partial = reduce(subwarp, partial, op); - if (subwarp.thread_rank() == 0) { - result[(row + col_block * rows) * result_stride] = finalize(partial); - } -} - - -template -__global__ - __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small( - int64 rows, int64 cols, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, ValueType* result, - KernelArgs... 
args) -{ - constexpr auto warp_size = config::warp_size; - constexpr auto warps_per_block = default_block_size / warp_size; - // stores the subwarp_size partial sums from each warp, grouped by warp - constexpr auto shared_storage = warps_per_block * subwarp_size; - __shared__ UninitializedArray block_partial; - const auto subwarp_id = thread::get_subwarp_id_flat(); - const auto local_warp_id = threadIdx.x / warp_size; - const auto local_subwarp_id = threadIdx.x % warp_size / subwarp_size; - const auto subwarp_num = - thread::get_subwarp_num_flat(); - const auto block = group::this_thread_block(); - const auto warp = group::tiled_partition(block); - const auto warp_rank = warp.thread_rank(); - const auto subwarp_rank = warp_rank % subwarp_size; - const auto col = static_cast(subwarp_rank); - auto partial = identity; - // accumulate within a thread - if (col < cols) { - for (auto row = subwarp_id; row < rows; row += subwarp_num) { - partial = op(partial, fn(row, col, args...)); - } - } - // accumulate between all subwarps in the warp -#pragma unroll - for (unsigned i = subwarp_size; i < warp_size; i *= 2) { - partial = op(partial, warp.shfl_xor(partial, i)); - } // store the result to shared memory - if (local_subwarp_id == 0) { - block_partial[local_warp_id * subwarp_size + subwarp_rank] = partial; - } - block.sync(); - // in a single thread: accumulate the results - if (local_warp_id == 0) { - partial = identity; - // accumulate the partial results within a thread - if (shared_storage >= warp_size) { -#pragma unroll - for (int i = 0; i < shared_storage; i += warp_size) { - partial = op(partial, block_partial[i + warp_rank]); - } - } else if (warp_rank < shared_storage) { - partial = op(partial, block_partial[warp_rank]); - } - // accumulate between all subwarps in the warp -#pragma unroll - for (unsigned i = subwarp_size; i < warp_size; i *= 2) { - partial = op(partial, warp.shfl_xor(partial, i)); - } - if (warp_rank < cols) { - result[warp_rank + blockIdx.x * cols] = finalize(partial); - } - } -} - - -template -__global__ - __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked( - int64 rows, int64 cols, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, ValueType* result, - KernelArgs... 
args) -{ - constexpr auto warp_size = config::warp_size; - __shared__ UninitializedArray block_partial; - const auto warp_id = thread::get_subwarp_id_flat(); - const auto warp_num = thread::get_subwarp_num_flat(); - const auto block = group::this_thread_block(); - const auto warp = group::tiled_partition(block); - const auto warp_rank = warp.thread_rank(); - const auto col = warp_rank + static_cast(blockIdx.y) * warp_size; - auto partial = identity; - // accumulate within a thread - if (col < cols) { - for (auto row = warp_id; row < rows; row += warp_num) { - partial = op(partial, fn(row, col, args...)); - } - } - block_partial[threadIdx.x] = partial; - block.sync(); - // in a single warp: accumulate the results - if (threadIdx.x < warp_size) { - partial = identity; - // accumulate the partial results within a thread -#pragma unroll - for (int i = 0; i < default_block_size; i += warp_size) { - partial = op(partial, block_partial[i + warp_rank]); - } - if (col < cols) { - result[col + blockIdx.x * cols] = finalize(partial); - } - } -} - - -template -__global__ - __launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( - int64 num_results, int64 num_blocks, ReductionOp op, - FinalizeOp finalize, ValueType identity, const ValueType* input, - int64 result_stride, ValueType* result) -{ - const auto idx = thread::get_thread_id_flat(); - if (idx >= num_results) { - return; - } - auto partial = identity; - for (int64 block = 0; block < num_blocks; block++) { - partial = op(partial, input[idx + block * num_results]); - } - result[idx * result_stride] = finalize(partial); -} - - -namespace { - - -template -void run_generic_kernel_row_reduction(syn::value_list, - int64 rows, int64 cols, int64 col_blocks, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, int64 result_stride, - KernelArgs... args) -{ - const auto num_blocks = - ceildiv(rows * col_blocks * subwarp_size, default_block_size); - if (num_blocks > 0) { - generic_kernel_row_reduction_2d - <<>>( - rows, cols, col_blocks, fn, op, finalize, - as_cuda_type(identity), as_cuda_type(result), result_stride, - args...); - } -} - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, - run_generic_kernel_row_reduction); - - -template -void run_generic_col_reduction_small(syn::value_list, - int64 max_blocks, - std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, dim<2> size, - MappedKernelArgs... 
args) -{ - const auto rows = static_cast(size[0]); - const auto cols = static_cast(size[1]); - const auto num_blocks = std::min( - ceildiv(rows * subwarp_size, default_block_size), max_blocks); - if (num_blocks <= 1) { - generic_kernel_col_reduction_2d_small - <<<1, default_block_size>>>(rows, cols, fn, op, finalize, - as_cuda_type(identity), - as_cuda_type(result), args...); - } else { - Array tmp_storage{exec, - static_cast(num_blocks * cols)}; - generic_kernel_col_reduction_2d_small - <<>>( - rows, cols, fn, op, [] __device__(auto v) { return v; }, - as_cuda_type(identity), as_cuda_type(tmp_storage.get_data()), - args...); - if (cols > 0) { - generic_kernel_reduction_finalize_2d<<< - ceildiv(cols, default_block_size), default_block_size>>>( - cols, num_blocks, op, finalize, as_cuda_type(identity), - as_cuda_type(tmp_storage.get_const_data()), 1, - as_cuda_type(result)); - } - } -} - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_col_reduction_small, - run_generic_col_reduction_small); - - -} // namespace - - -template -void run_kernel_row_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, size_type result_stride, - dim<2> size, KernelArgs&&... args) -{ - using subwarp_sizes = - syn::value_list; - constexpr int oversubscription = 16; - const auto rows = static_cast(size[0]); - const auto cols = static_cast(size[1]); - const auto resources = - exec->get_num_warps() * config::warp_size * oversubscription; - if (rows * cols > resources && rows < cols) { - const auto col_blocks = ceildiv(rows * cols, resources); - Array partial{exec, - static_cast(col_blocks * rows)}; - const auto num_blocks = - ceildiv(rows * col_blocks * config::warp_size, default_block_size); - // no need to guard this kernel, as rows * cols > resources - generic_kernel_row_reduction_2d - <<>>( - rows, cols, col_blocks, fn, op, - [] __device__(auto v) { return v; }, as_cuda_type(identity), - as_cuda_type(partial.get_data()), 1, map_to_device(args)...); - const auto num_finalize_blocks = ceildiv(rows, default_block_size); - generic_kernel_reduction_finalize_2d<<>>( - rows, col_blocks, op, finalize, as_cuda_type(identity), - as_cuda_type(partial.get_const_data()), - static_cast(result_stride), as_cuda_type(result)); - } else { - select_run_generic_kernel_row_reduction( - subwarp_sizes(), - [cols](int compiled_subwarp_size) { - return compiled_subwarp_size >= cols || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), rows, cols, 1, fn, op, - finalize, identity, result, static_cast(result_stride), - map_to_device(args)...); - } -} - - -template -void run_kernel_col_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, dim<2> size, - KernelArgs&&... 
args) -{ - using subwarp_sizes = - syn::value_list; - constexpr int oversubscription = 16; - const auto rows = static_cast(size[0]); - const auto cols = static_cast(size[1]); - const auto max_blocks = exec->get_num_warps() * config::warp_size * - oversubscription / default_block_size; - if (cols <= config::warp_size) { - select_generic_col_reduction_small( - subwarp_sizes(), - [cols](int compiled_subwarp_size) { - return compiled_subwarp_size >= cols || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), max_blocks, exec, fn, - op, finalize, identity, result, size, map_to_device(args)...); - } else { - const auto col_blocks = ceildiv(cols, config::warp_size); - const auto row_blocks = - ceildiv(std::min( - ceildiv(rows * config::warp_size, default_block_size), - max_blocks), - col_blocks); - if (row_blocks <= 1) { - generic_kernel_col_reduction_2d_blocked<<>>( - rows, cols, fn, op, finalize, as_cuda_type(identity), - as_cuda_type(result), map_to_device(args)...); - } else { - Array tmp_storage{ - exec, static_cast(row_blocks * cols)}; - // no need to guard this kernel, as cols > warp_size, row_blocks > 1 - generic_kernel_col_reduction_2d_blocked<<< - dim3(row_blocks, col_blocks), default_block_size>>>( - rows, cols, fn, op, [] __device__(auto v) { return v; }, - as_cuda_type(identity), as_cuda_type(tmp_storage.get_data()), - map_to_device(args)...); - generic_kernel_reduction_finalize_2d<<< - ceildiv(cols, default_block_size), default_block_size>>>( - cols, row_blocks, op, finalize, as_cuda_type(identity), - as_cuda_type(tmp_storage.get_const_data()), 1, - as_cuda_type(result)); - } - } -} +#include "common/cuda_hip/base/kernel_launch_reduction.hpp.inc" } // namespace cuda diff --git a/cuda/base/kernel_launch_solver.cuh b/cuda/base/kernel_launch_solver.cuh index 38dc0afffff..117c47a1102 100644 --- a/cuda/base/kernel_launch_solver.cuh +++ b/cuda/base/kernel_launch_solver.cuh @@ -41,35 +41,7 @@ namespace kernels { namespace cuda { -template -__global__ __launch_bounds__(default_block_size) void generic_kernel_2d_solver( - int64 rows, int64 cols, int64 default_stride, KernelFunction fn, - KernelArgs... args) -{ - auto tidx = thread::get_thread_id_flat(); - auto col = tidx % cols; - auto row = tidx / cols; - if (row >= rows) { - return; - } - fn(row, col, - device_unpack_solver_impl::unpack(args, default_stride)...); -} - - -template -void run_kernel_solver(std::shared_ptr exec, - KernelFunction fn, dim<2> size, size_type default_stride, - KernelArgs&&... args) -{ - if (size[0] > 0 && size[1] > 0) { - constexpr auto block_size = default_block_size; - auto num_blocks = ceildiv(size[0] * size[1], block_size); - generic_kernel_2d_solver<<>>( - static_cast(size[0]), static_cast(size[1]), - static_cast(default_stride), fn, map_to_device(args)...); - } -} +#include "common/cuda_hip/base/kernel_launch_solver.hpp.inc" } // namespace cuda diff --git a/hip/base/kernel_launch.hip.hpp b/hip/base/kernel_launch.hip.hpp index 07c11a98d7f..7d5c0a33303 100644 --- a/hip/base/kernel_launch.hip.hpp +++ b/hip/base/kernel_launch.hip.hpp @@ -55,58 +55,7 @@ namespace device_std = thrust; constexpr int default_block_size = 512; -template -__global__ __launch_bounds__(default_block_size) void generic_kernel_1d( - int64 size, KernelFunction fn, KernelArgs... 
args) -{ - auto tidx = thread::get_thread_id_flat(); - if (tidx >= size) { - return; - } - fn(tidx, args...); -} - - -template -__global__ __launch_bounds__(default_block_size) void generic_kernel_2d( - int64 rows, int64 cols, KernelFunction fn, KernelArgs... args) -{ - auto tidx = thread::get_thread_id_flat(); - auto col = tidx % cols; - auto row = tidx / cols; - if (row >= rows) { - return; - } - fn(row, col, args...); -} - - -template -void run_kernel(std::shared_ptr exec, KernelFunction fn, - size_type size, KernelArgs&&... args) -{ - if (size > 0) { - constexpr auto block_size = default_block_size; - auto num_blocks = ceildiv(size, block_size); - hipLaunchKernelGGL(generic_kernel_1d, num_blocks, block_size, 0, 0, - static_cast(size), fn, - map_to_device(args)...); - } -} - -template -void run_kernel(std::shared_ptr exec, KernelFunction fn, - dim<2> size, KernelArgs&&... args) -{ - if (size[0] > 0 && size[1] > 0) { - constexpr auto block_size = default_block_size; - auto num_blocks = ceildiv(size[0] * size[1], block_size); - hipLaunchKernelGGL(generic_kernel_2d, num_blocks, block_size, 0, 0, - static_cast(size[0]), - static_cast(size[1]), fn, - map_to_device(args)...); - } -} +#include "common/cuda_hip/base/kernel_launch.hpp.inc" } // namespace hip diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp index e93f811bfb1..575d772855b 100644 --- a/hip/base/kernel_launch_reduction.hip.hpp +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -48,493 +48,7 @@ namespace kernels { namespace hip { -template -__global__ __launch_bounds__( - default_block_size) void generic_kernel_reduction_1d(int64 size, - KernelFunction fn, - ReductionOp op, - FinalizeOp finalize, - ValueType identity, - ValueType* storage, - KernelArgs... args) -{ - __shared__ - UninitializedArray - warp_partial; - static_assert(default_block_size / config::warp_size <= config::warp_size, - "needs third reduction level"); - auto tidx = thread::get_thread_id_flat(); - auto grid_size = thread::get_thread_num_flat(); - auto warp = - group::tiled_partition(group::this_thread_block()); - auto partial = identity; - for (int64 i = tidx; i < size; i += grid_size) { - partial = op(partial, fn(i, args...)); - } - partial = reduce(warp, partial, op); - if (warp.thread_rank() == 0) { - warp_partial[threadIdx.x / config::warp_size] = partial; - } - __syncthreads(); - if (threadIdx.x < config::warp_size) { - partial = reduce(warp, - threadIdx.x < default_block_size / config::warp_size - ? warp_partial[threadIdx.x] - : identity, - op); - if (threadIdx.x == 0) { - storage[blockIdx.x] = finalize(partial); - } - } -} - - -template -__global__ __launch_bounds__( - default_block_size) void generic_kernel_reduction_2d(int64 rows, int64 cols, - KernelFunction fn, - ReductionOp op, - FinalizeOp finalize, - ValueType identity, - ValueType* storage, - KernelArgs... 
args) -{ - __shared__ - UninitializedArray - warp_partial; - static_assert(default_block_size / config::warp_size <= config::warp_size, - "needs third reduction level"); - auto tidx = thread::get_thread_id_flat(); - auto grid_size = thread::get_thread_num_flat(); - auto warp = - group::tiled_partition(group::this_thread_block()); - auto partial = identity; - for (int64 i = tidx; i < rows * cols; i += grid_size) { - const auto row = i / cols; - const auto col = i % cols; - partial = op(partial, fn(row, col, args...)); - } - partial = reduce(warp, partial, op); - if (warp.thread_rank() == 0) { - warp_partial[threadIdx.x / config::warp_size] = partial; - } - __syncthreads(); - if (threadIdx.x < config::warp_size) { - partial = reduce(warp, - threadIdx.x < default_block_size / config::warp_size - ? warp_partial[threadIdx.x] - : identity, - op); - if (threadIdx.x == 0) { - storage[blockIdx.x] = finalize(partial); - } - } -} - - -template -void run_kernel_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, size_type size, - KernelArgs&&... args) -{ - constexpr int oversubscription = 16; - constexpr auto block_size = default_block_size; - const auto num_blocks = std::min( - ceildiv(size, block_size), exec->get_num_warps() * oversubscription); - if (num_blocks > 1) { - Array partial{exec, static_cast(num_blocks)}; - hipLaunchKernelGGL( - generic_kernel_reduction_1d, num_blocks, block_size, 0, 0, - static_cast(size), fn, op, - [] __device__(auto v) { return v; }, as_hip_type(identity), - as_hip_type(partial.get_data()), map_to_device(args)...); - hipLaunchKernelGGL( - generic_kernel_reduction_1d, 1, block_size, 0, 0, - static_cast(num_blocks), - [] __device__(auto i, auto v) { return v[i]; }, op, finalize, - as_hip_type(identity), as_hip_type(result), - as_hip_type(partial.get_const_data())); - } else { - hipLaunchKernelGGL(generic_kernel_reduction_1d, 1, block_size, 0, 0, - static_cast(size), fn, op, finalize, - as_hip_type(identity), as_hip_type(result), - map_to_device(args)...); - } -} - - -template -void run_kernel_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, dim<2> size, KernelArgs&&... 
args) -{ - constexpr int oversubscription = 16; - constexpr auto block_size = default_block_size; - const auto rows = static_cast(size[0]); - const auto cols = static_cast(size[1]); - const auto num_blocks = - std::min(ceildiv(rows * cols, block_size), - exec->get_num_warps() * oversubscription); - if (num_blocks > 1) { - Array partial{exec, static_cast(num_blocks)}; - hipLaunchKernelGGL( - generic_kernel_reduction_2d, num_blocks, block_size, 0, 0, rows, - cols, fn, op, [] __device__(auto v) { return v; }, - as_hip_type(identity), as_hip_type(partial.get_data()), - map_to_device(args)...); - hipLaunchKernelGGL( - generic_kernel_reduction_1d, 1, block_size, 0, 0, - static_cast(num_blocks), - [] __device__(auto i, auto v) { return v[i]; }, op, finalize, - as_hip_type(identity), as_hip_type(result), - as_hip_type(partial.get_const_data())); - } else { - hipLaunchKernelGGL(generic_kernel_reduction_2d, 1, block_size, 0, 0, - rows, cols, fn, op, finalize, as_hip_type(identity), - as_hip_type(result), map_to_device(args)...); - } -} - - -template -__global__ - __launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( - int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, - ReductionOp op, FinalizeOp finalize, ValueType identity, - ValueType* result, int64 result_stride, KernelArgs... args) -{ - const auto idx = thread::get_subwarp_id_flat(); - const auto row = idx % rows; - const auto col_block = idx / rows; - if (col_block >= col_blocks) { - return; - } - const auto cols_per_part = - ceildiv(ceildiv(cols, subwarp_size), col_blocks) * subwarp_size; - const auto begin = cols_per_part * col_block; - const auto end = min(begin + cols_per_part, cols); - auto subwarp = - group::tiled_partition(group::this_thread_block()); - auto partial = identity; - for (auto col = begin + subwarp.thread_rank(); col < end; - col += subwarp_size) { - partial = op(partial, fn(row, col, args...)); - } - partial = reduce(subwarp, partial, op); - if (subwarp.thread_rank() == 0) { - result[(row + col_block * rows) * result_stride] = finalize(partial); - } -} - - -template -__global__ - __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small( - int64 rows, int64 cols, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, ValueType* result, - KernelArgs... 
args) -{ - constexpr auto warp_size = config::warp_size; - constexpr auto warps_per_block = default_block_size / warp_size; - // stores the subwarp_size partial sums from each warp, grouped by warp - constexpr auto shared_storage = warps_per_block * subwarp_size; - __shared__ UninitializedArray block_partial; - const auto subwarp_id = thread::get_subwarp_id_flat(); - const auto local_warp_id = threadIdx.x / warp_size; - const auto local_subwarp_id = threadIdx.x % warp_size / subwarp_size; - const auto subwarp_num = - thread::get_subwarp_num_flat(); - const auto block = group::this_thread_block(); - const auto warp = group::tiled_partition(block); - const auto warp_rank = warp.thread_rank(); - const auto subwarp_rank = warp_rank % subwarp_size; - const auto col = static_cast(subwarp_rank); - auto partial = identity; - // accumulate within a thread - if (col < cols) { - for (auto row = subwarp_id; row < rows; row += subwarp_num) { - partial = op(partial, fn(row, col, args...)); - } - } - // accumulate between all subwarps in the warp -#pragma unroll - for (unsigned i = subwarp_size; i < warp_size; i *= 2) { - partial = op(partial, warp.shfl_xor(partial, i)); - } // store the result to shared memory - if (local_subwarp_id == 0) { - block_partial[local_warp_id * subwarp_size + subwarp_rank] = partial; - } - block.sync(); - // in a single thread: accumulate the results - if (local_warp_id == 0) { - partial = identity; - // accumulate the partial results within a thread - if (shared_storage >= warp_size) { -#pragma unroll - for (int i = 0; i < shared_storage; i += warp_size) { - partial = op(partial, block_partial[i + warp_rank]); - } - } else if (warp_rank < shared_storage) { - partial = op(partial, block_partial[warp_rank]); - } - // accumulate between all subwarps in the warp -#pragma unroll - for (unsigned i = subwarp_size; i < warp_size; i *= 2) { - partial = op(partial, warp.shfl_xor(partial, i)); - } - if (warp_rank < cols) { - result[warp_rank + blockIdx.x * cols] = finalize(partial); - } - } -} - - -template -__global__ - __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked( - int64 rows, int64 cols, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, ValueType* result, - KernelArgs... 
args) -{ - constexpr auto warp_size = config::warp_size; - __shared__ UninitializedArray block_partial; - const auto warp_id = thread::get_subwarp_id_flat(); - const auto warp_num = thread::get_subwarp_num_flat(); - const auto block = group::this_thread_block(); - const auto warp = group::tiled_partition(block); - const auto warp_rank = warp.thread_rank(); - const auto col = warp_rank + static_cast(blockIdx.y) * warp_size; - auto partial = identity; - // accumulate within a thread - if (col < cols) { - for (auto row = warp_id; row < rows; row += warp_num) { - partial = op(partial, fn(row, col, args...)); - } - } - block_partial[threadIdx.x] = partial; - block.sync(); - // in a single warp: accumulate the results - if (threadIdx.x < warp_size) { - partial = identity; - // accumulate the partial results within a thread -#pragma unroll - for (int i = 0; i < default_block_size; i += warp_size) { - partial = op(partial, block_partial[i + warp_rank]); - } - if (col < cols) { - result[col + blockIdx.x * cols] = finalize(partial); - } - } -} - - -template -__global__ - __launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( - int64 num_results, int64 num_blocks, ReductionOp op, - FinalizeOp finalize, ValueType identity, const ValueType* input, - int64 result_stride, ValueType* result) -{ - const auto idx = thread::get_thread_id_flat(); - if (idx >= num_results) { - return; - } - auto partial = identity; - for (int64 block = 0; block < num_blocks; block++) { - partial = op(partial, input[idx + block * num_results]); - } - result[idx * result_stride] = finalize(partial); -} - - -namespace { - - -template -void run_generic_kernel_row_reduction(syn::value_list, - int64 rows, int64 cols, int64 col_blocks, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, int64 result_stride, - KernelArgs... args) -{ - const auto num_blocks = - ceildiv(rows * col_blocks * subwarp_size, default_block_size); - if (num_blocks > 0) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(generic_kernel_row_reduction_2d), - num_blocks, default_block_size, 0, 0, rows, cols, col_blocks, fn, - op, finalize, as_hip_type(identity), as_hip_type(result), - result_stride, args...); - } -} - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, - run_generic_kernel_row_reduction); - - -template -void run_generic_col_reduction_small(syn::value_list, - int64 max_blocks, - std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, dim<2> size, - MappedKernelArgs... 
args) -{ - const auto rows = static_cast(size[0]); - const auto cols = static_cast(size[1]); - const auto num_blocks = std::min( - ceildiv(rows * subwarp_size, default_block_size), max_blocks); - if (num_blocks <= 1) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - generic_kernel_col_reduction_2d_small), - 1, default_block_size, 0, 0, rows, cols, fn, op, finalize, - as_hip_type(identity), as_hip_type(result), args...); - } else { - Array tmp_storage{exec, - static_cast(num_blocks * cols)}; - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - generic_kernel_col_reduction_2d_small), - num_blocks, default_block_size, 0, 0, rows, cols, fn, op, - [] __device__(auto v) { return v; }, as_hip_type(identity), - as_hip_type(tmp_storage.get_data()), args...); - if (cols > 0) { - hipLaunchKernelGGL(generic_kernel_reduction_finalize_2d, - ceildiv(cols, default_block_size), - default_block_size, 0, 0, cols, num_blocks, op, - finalize, as_hip_type(identity), - as_hip_type(tmp_storage.get_const_data()), 1, - as_hip_type(result)); - } - } -} - -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_col_reduction_small, - run_generic_col_reduction_small); - - -} // namespace - - -template -void run_kernel_row_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, size_type result_stride, - dim<2> size, KernelArgs&&... args) -{ - using subwarp_sizes = - syn::value_list; - constexpr int oversubscription = 16; - const auto rows = static_cast(size[0]); - const auto cols = static_cast(size[1]); - const auto resources = - exec->get_num_warps() * config::warp_size * oversubscription; - if (rows * cols > resources && rows < cols) { - const auto col_blocks = ceildiv(rows * cols, resources); - Array partial{exec, - static_cast(col_blocks * rows)}; - const auto num_blocks = - ceildiv(rows * col_blocks * config::warp_size, default_block_size); - // no need to guard this kernel, as rows * cols > resources - hipLaunchKernelGGL( - HIP_KERNEL_NAME(generic_kernel_row_reduction_2d), - num_blocks, default_block_size, 0, 0, rows, cols, col_blocks, fn, - op, [] __device__(auto v) { return v; }, as_hip_type(identity), - as_hip_type(partial.get_data()), 1, map_to_device(args)...); - const auto num_finalize_blocks = ceildiv(rows, default_block_size); - hipLaunchKernelGGL( - generic_kernel_reduction_finalize_2d, num_finalize_blocks, - default_block_size, 0, 0, rows, col_blocks, op, finalize, - as_hip_type(identity), as_hip_type(partial.get_const_data()), - static_cast(result_stride), as_hip_type(result)); - } else { - select_run_generic_kernel_row_reduction( - subwarp_sizes(), - [cols](int compiled_subwarp_size) { - return compiled_subwarp_size >= cols || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), rows, cols, 1, fn, op, - finalize, identity, result, static_cast(result_stride), - map_to_device(args)...); - } -} - - -template -void run_kernel_col_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, dim<2> size, - KernelArgs&&... 
args) -{ - using subwarp_sizes = - syn::value_list; - constexpr int oversubscription = 16; - const auto rows = static_cast(size[0]); - const auto cols = static_cast(size[1]); - const auto max_blocks = exec->get_num_warps() * config::warp_size * - oversubscription / default_block_size; - if (cols <= config::warp_size) { - select_generic_col_reduction_small( - subwarp_sizes(), - [cols](int compiled_subwarp_size) { - return compiled_subwarp_size >= cols || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), max_blocks, exec, fn, - op, finalize, identity, result, size, map_to_device(args)...); - } else { - const auto col_blocks = ceildiv(cols, config::warp_size); - const auto row_blocks = - ceildiv(std::min( - ceildiv(rows * config::warp_size, default_block_size), - max_blocks), - col_blocks); - if (row_blocks <= 1) { - hipLaunchKernelGGL(generic_kernel_col_reduction_2d_blocked, - dim3(1, col_blocks), default_block_size, 0, 0, - rows, cols, fn, op, finalize, - as_hip_type(identity), as_hip_type(result), - map_to_device(args)...); - } else { - Array tmp_storage{ - exec, static_cast(row_blocks * cols)}; - // no need to guard this kernel, as cols > warp_size, row_blocks > 1 - hipLaunchKernelGGL( - generic_kernel_col_reduction_2d_blocked, - dim3(row_blocks, col_blocks), default_block_size, 0, 0, rows, - cols, fn, op, [] __device__(auto v) { return v; }, - as_hip_type(identity), as_hip_type(tmp_storage.get_data()), - map_to_device(args)...); - hipLaunchKernelGGL(generic_kernel_reduction_finalize_2d, - ceildiv(cols, default_block_size), - default_block_size, 0, 0, cols, row_blocks, op, - finalize, as_hip_type(identity), - as_hip_type(tmp_storage.get_const_data()), 1, - as_hip_type(result)); - } - } -} +#include "common/cuda_hip/base/kernel_launch_reduction.hpp.inc" } // namespace hip diff --git a/hip/base/kernel_launch_solver.hip.hpp b/hip/base/kernel_launch_solver.hip.hpp index adbe5046818..250c0f83074 100644 --- a/hip/base/kernel_launch_solver.hip.hpp +++ b/hip/base/kernel_launch_solver.hip.hpp @@ -44,37 +44,7 @@ namespace kernels { namespace hip { -template -__global__ __launch_bounds__(default_block_size) void generic_kernel_2d_solver( - int64 rows, int64 cols, int64 default_stride, KernelFunction fn, - KernelArgs... args) -{ - auto tidx = thread::get_thread_id_flat(); - auto col = tidx % cols; - auto row = tidx / cols; - if (row >= rows) { - return; - } - fn(row, col, - device_unpack_solver_impl::unpack(args, default_stride)...); -} - - -template -void run_kernel_solver(std::shared_ptr exec, - KernelFunction fn, dim<2> size, size_type default_stride, - KernelArgs&&... 
args)
-{
-    if (size[0] > 0 && size[1] > 0) {
-        constexpr auto block_size = kernels::hip::default_block_size;
-        auto num_blocks = ceildiv(size[0] * size[1], block_size);
-        hipLaunchKernelGGL(kernels::hip::generic_kernel_2d_solver, num_blocks,
-                           block_size, 0, 0, static_cast<int64>(size[0]),
-                           static_cast<int64>(size[1]),
-                           static_cast<int64>(default_stride), fn,
-                           kernels::hip::map_to_device(args)...);
-    }
-}
+#include "common/cuda_hip/base/kernel_launch_solver.hpp.inc"
 
 
 }  // namespace hip

From 73c67a4cee461014b60ad1f77b34386695973cf6 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Mon, 21 Mar 2022 13:45:26 +0100
Subject: [PATCH 2/5] add Dense reduction kernels using tmp array

---
 .../base/kernel_launch_reduction.hpp.inc      | 109 +++---
 .../unified/base/kernel_launch_reduction.hpp  |  67 +++++
 common/unified/matrix/dense_kernels.cpp       |  26 +-
 core/device_hooks/common_kernels.inc.cpp      |   3 -
 core/matrix/dense.cpp                         | 145 +++++++++-
 core/matrix/dense_kernels.hpp                 |  45 +--
 core/synthesizer/implementation_selection.hpp |   8 +-
 cuda/matrix/dense_kernels.cu                  |  70 -----
 cuda/solver/cb_gmres_kernels.cu               |   4 +-
 cuda/solver/gmres_kernels.cu                  |   3 +-
 cuda/test/base/kernel_launch.cu               | 100 +++++--
 dpcpp/base/kernel_launch_reduction.dp.hpp     | 130 +++++----
 dpcpp/matrix/dense_kernels.dp.cpp             |  41 ---
 dpcpp/solver/cb_gmres_kernels.dp.cpp          |   3 +-
 dpcpp/solver/gmres_kernels.dp.cpp             |   3 +-
 dpcpp/test/base/kernel_launch.dp.cpp          | 104 +++++--
 hip/matrix/dense_kernels.hip.cpp              |  70 -----
 hip/solver/cb_gmres_kernels.hip.cpp           |   3 +-
 hip/solver/gmres_kernels.hip.cpp              |   3 +-
 hip/test/base/kernel_launch.hip.cpp           | 256 ++++++++++++------
 include/ginkgo/core/matrix/dense.hpp          | 134 ++++-----
 omp/base/kernel_launch_reduction.hpp          | 127 +++++----
 omp/matrix/dense_kernels.cpp                  |  39 ---
 omp/test/base/kernel_launch.cpp               | 112 +++++---
 reference/matrix/dense_kernels.cpp            |  48 +---
 test/matrix/dense_kernels.cpp                 | 121 ++++++++-
 26 files changed, 1051 insertions(+), 723 deletions(-)

diff --git a/common/cuda_hip/base/kernel_launch_reduction.hpp.inc b/common/cuda_hip/base/kernel_launch_reduction.hpp.inc
index 28c9d8ba4fd..4ecf9c9979d 100644
--- a/common/cuda_hip/base/kernel_launch_reduction.hpp.inc
+++ b/common/cuda_hip/base/kernel_launch_reduction.hpp.inc
@@ -118,27 +118,33 @@ __global__ __launch_bounds__(
 
 template <typename ValueType, typename KernelFunction, typename ReductionOp,
           typename FinalizeOp, typename... KernelArgs>
-void run_kernel_reduction(std::shared_ptr<const DefaultExecutor> exec,
-                          KernelFunction fn, ReductionOp op,
-                          FinalizeOp finalize, ValueType identity,
-                          ValueType* result, size_type size,
-                          KernelArgs&&... args)
+void run_kernel_reduction_cached(std::shared_ptr<const DefaultExecutor> exec,
+                                 KernelFunction fn, ReductionOp op,
+                                 FinalizeOp finalize, ValueType identity,
+                                 ValueType* result, size_type size,
+                                 Array<char>& tmp, KernelArgs&&...
args) { constexpr int oversubscription = 16; constexpr auto block_size = default_block_size; const auto num_blocks = std::min( ceildiv(size, block_size), exec->get_num_warps() * oversubscription); if (num_blocks > 1) { - Array partial{exec, static_cast(num_blocks)}; + const auto required_storage = sizeof(ValueType) * num_blocks; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } generic_kernel_reduction_1d<<>>( static_cast(size), fn, op, [] __device__(auto v) { return v; }, as_device_type(identity), - as_device_type(partial.get_data()), map_to_device(args)...); + as_device_type(reinterpret_cast(tmp.get_data())), + map_to_device(args)...); generic_kernel_reduction_1d<<<1, block_size>>>( static_cast(num_blocks), [] __device__(auto i, auto v) { return v[i]; }, op, finalize, as_device_type(identity), as_device_type(result), - as_device_type(partial.get_const_data())); + as_device_type( + reinterpret_cast(tmp.get_const_data()))); } else { generic_kernel_reduction_1d<<<1, block_size>>>( static_cast(size), fn, op, finalize, @@ -150,10 +156,11 @@ void run_kernel_reduction(std::shared_ptr exec, template -void run_kernel_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, dim<2> size, KernelArgs&&... args) +void run_kernel_reduction_cached(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + Array& tmp, KernelArgs&&... args) { constexpr int oversubscription = 16; constexpr auto block_size = default_block_size; @@ -163,16 +170,22 @@ void run_kernel_reduction(std::shared_ptr exec, std::min(ceildiv(rows * cols, block_size), exec->get_num_warps() * oversubscription); if (num_blocks > 1) { - Array partial{exec, static_cast(num_blocks)}; + const auto required_storage = sizeof(ValueType) * num_blocks; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } generic_kernel_reduction_2d<<>>( rows, cols, fn, op, [] __device__(auto v) { return v; }, - as_device_type(identity), as_device_type(partial.get_data()), + as_device_type(identity), + as_device_type(reinterpret_cast(tmp.get_data())), map_to_device(args)...); generic_kernel_reduction_1d<<<1, block_size>>>( static_cast(num_blocks), [] __device__(auto i, auto v) { return v[i]; }, op, finalize, as_device_type(identity), as_device_type(result), - as_device_type(partial.get_const_data())); + as_device_type( + reinterpret_cast(tmp.get_const_data()))); } else { generic_kernel_reduction_2d<<<1, block_size>>>( rows, cols, fn, op, finalize, as_device_type(identity), @@ -369,7 +382,7 @@ void run_generic_col_reduction_small( syn::value_list, int64 max_blocks, std::shared_ptr exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, - dim<2> size, MappedKernelArgs... args) + dim<2> size, Array& tmp, MappedKernelArgs... 
args) { const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); @@ -381,19 +394,24 @@ void run_generic_col_reduction_small( as_device_type(identity), as_device_type(result), args...); } else { - Array tmp_storage{exec, - static_cast(num_blocks * cols)}; + const auto required_storage = sizeof(ValueType) * num_blocks * cols; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } generic_kernel_col_reduction_2d_small <<>>( rows, cols, fn, op, [] __device__(auto v) { return v; }, as_device_type(identity), - as_device_type(tmp_storage.get_data()), args...); + as_device_type(reinterpret_cast(tmp.get_data())), + args...); if (cols > 0) { generic_kernel_reduction_finalize_2d<<< ceildiv(cols, default_block_size), default_block_size>>>( cols, num_blocks, op, finalize, as_device_type(identity), - as_device_type(tmp_storage.get_const_data()), 1, - as_device_type(result)); + as_device_type( + reinterpret_cast(tmp.get_const_data())), + 1, as_device_type(result)); } } } @@ -407,11 +425,11 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_col_reduction_small, template -void run_kernel_row_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, size_type result_stride, - dim<2> size, KernelArgs&&... args) +void run_kernel_row_reduction_cached( + std::shared_ptr exec, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, + size_type result_stride, dim<2> size, Array& tmp, + KernelArgs&&... args) { using subwarp_sizes = syn::value_list; @@ -422,8 +440,11 @@ void run_kernel_row_reduction(std::shared_ptr exec, exec->get_num_warps() * config::warp_size * oversubscription; if (rows * cols > resources && rows < cols) { const auto col_blocks = ceildiv(rows * cols, resources); - Array partial{exec, - static_cast(col_blocks * rows)}; + const auto required_storage = sizeof(ValueType) * col_blocks * rows; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } const auto num_blocks = ceildiv(rows * col_blocks * config::warp_size, default_block_size); // no need to guard this kernel, as rows * cols > resources @@ -431,12 +452,14 @@ void run_kernel_row_reduction(std::shared_ptr exec, <<>>( rows, cols, col_blocks, fn, op, [] __device__(auto v) { return v; }, as_device_type(identity), - as_device_type(partial.get_data()), 1, map_to_device(args)...); + as_device_type(reinterpret_cast(tmp.get_data())), 1, + map_to_device(args)...); const auto num_finalize_blocks = ceildiv(rows, default_block_size); generic_kernel_reduction_finalize_2d<<>>( rows, col_blocks, op, finalize, as_device_type(identity), - as_device_type(partial.get_const_data()), + as_device_type( + reinterpret_cast(tmp.get_const_data())), static_cast(result_stride), as_device_type(result)); } else { select_run_generic_kernel_row_reduction( @@ -454,11 +477,10 @@ void run_kernel_row_reduction(std::shared_ptr exec, template -void run_kernel_col_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, dim<2> size, - KernelArgs&&... args) +void run_kernel_col_reduction_cached( + std::shared_ptr exec, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, + dim<2> size, Array& tmp, KernelArgs&&... 
args) { using subwarp_sizes = syn::value_list; @@ -475,7 +497,7 @@ void run_kernel_col_reduction(std::shared_ptr exec, compiled_subwarp_size == config::warp_size; }, syn::value_list(), syn::type_list<>(), max_blocks, exec, fn, - op, finalize, identity, result, size, map_to_device(args)...); + op, finalize, identity, result, size, tmp, map_to_device(args)...); } else { const auto col_blocks = ceildiv(cols, config::warp_size); const auto row_blocks = @@ -489,19 +511,24 @@ void run_kernel_col_reduction(std::shared_ptr exec, rows, cols, fn, op, finalize, as_device_type(identity), as_device_type(result), map_to_device(args)...); } else { - Array tmp_storage{ - exec, static_cast(row_blocks * cols)}; + const auto required_storage = sizeof(ValueType) * row_blocks * cols; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } // no need to guard this kernel, as cols > warp_size, row_blocks > 1 generic_kernel_col_reduction_2d_blocked<<< dim3(row_blocks, col_blocks), default_block_size>>>( rows, cols, fn, op, [] __device__(auto v) { return v; }, as_device_type(identity), - as_device_type(tmp_storage.get_data()), map_to_device(args)...); + as_device_type(reinterpret_cast(tmp.get_data())), + map_to_device(args)...); generic_kernel_reduction_finalize_2d<<< ceildiv(cols, default_block_size), default_block_size>>>( cols, row_blocks, op, finalize, as_device_type(identity), - as_device_type(tmp_storage.get_const_data()), 1, - as_device_type(result)); + as_device_type( + reinterpret_cast(tmp.get_const_data())), + 1, as_device_type(result)); } } } diff --git a/common/unified/base/kernel_launch_reduction.hpp b/common/unified/base/kernel_launch_reduction.hpp index 3e64ddc2819..f70845efb5e 100644 --- a/common/unified/base/kernel_launch_reduction.hpp +++ b/common/unified/base/kernel_launch_reduction.hpp @@ -58,4 +58,71 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type size, + KernelArgs&&... args) +{ + Array cache{exec}; + run_kernel_reduction_cached(exec, fn, op, finalize, identity, result, size, + cache, std::forward(args)...); +} + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, KernelArgs&&... args) +{ + Array cache{exec}; + run_kernel_reduction_cached(exec, fn, op, finalize, identity, result, size, + cache, std::forward(args)...); +} + + +template +void run_kernel_row_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type result_stride, + dim<2> size, KernelArgs&&... args) +{ + Array cache{exec}; + run_kernel_row_reduction_cached(exec, fn, op, finalize, identity, result, + result_stride, size, cache, + std::forward(args)...); +} + + +template +void run_kernel_col_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + KernelArgs&&... 
args)
+{
+    Array<char> cache{exec};
+    run_kernel_col_reduction_cached(exec, fn, op, finalize, identity, result,
+                                    size, cache,
+                                    std::forward<KernelArgs>(args)...);
+}
+
+
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
+
+
 #endif  // GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_
diff --git a/common/unified/matrix/dense_kernels.cpp b/common/unified/matrix/dense_kernels.cpp
index 51a411daf3b..a328c5b56ad 100644
--- a/common/unified/matrix/dense_kernels.cpp
+++ b/common/unified/matrix/dense_kernels.cpp
@@ -246,15 +246,15 @@ template <typename ValueType>
 void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
                  const matrix::Dense<ValueType>* x,
                  const matrix::Dense<ValueType>* y,
-                 matrix::Dense<ValueType>* result)
+                 matrix::Dense<ValueType>* result, Array<char>& tmp)
 {
-    run_kernel_col_reduction(
+    run_kernel_col_reduction_cached(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto x, auto y) {
             return x(i, j) * y(i, j);
         },
         GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(),
-        x, y);
+        tmp, x, y);
 }
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
@@ -264,15 +264,15 @@ template <typename ValueType>
 void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
                       const matrix::Dense<ValueType>* x,
                       const matrix::Dense<ValueType>* y,
-                      matrix::Dense<ValueType>* result)
+                      matrix::Dense<ValueType>* result, Array<char>& tmp)
 {
-    run_kernel_col_reduction(
+    run_kernel_col_reduction_cached(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto x, auto y) {
             return conj(x(i, j)) * y(i, j);
         },
         GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(),
-        x, y);
+        tmp, x, y);
 }
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
@@ -281,14 +281,15 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
 template <typename ValueType>
 void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
                    const matrix::Dense<ValueType>* x,
-                   matrix::Dense<remove_complex<ValueType>>* result)
+                   matrix::Dense<remove_complex<ValueType>>* result,
+                   Array<char>& tmp)
 {
-    run_kernel_col_reduction(
+    run_kernel_col_reduction_cached(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto x) { return squared_norm(x(i, j)); },
         [] GKO_KERNEL(auto a, auto b) { return a + b; },
         [] GKO_KERNEL(auto a) { return sqrt(a); }, remove_complex<ValueType>{},
-        result->get_values(), x->get_size(), x);
+        result->get_values(), x->get_size(), tmp, x);
 }
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
@@ -296,12 +297,13 @@ template <typename ValueType>
 void compute_norm1(std::shared_ptr<const DefaultExecutor> exec,
                    const matrix::Dense<ValueType>* x,
-                   matrix::Dense<remove_complex<ValueType>>* result)
+                   matrix::Dense<remove_complex<ValueType>>* result,
+                   Array<char>& tmp)
 {
-    run_kernel_col_reduction(
+    run_kernel_col_reduction_cached(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto x) { return abs(x(i, j)); },
         GKO_KERNEL_REDUCE_SUM(remove_complex<ValueType>), result->get_values(),
-        x->get_size(), x);
+        x->get_size(), tmp, x);
 }
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
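// Usage sketch for the *_cached interface above, assuming only the signatures
// introduced in this patch; `exec`, `x`, `result`, `size`, `num_iters` and the
// surrounding loop are hypothetical:
//
//     Array<char> tmp{exec};  // type-erased workspace, reused across calls
//     for (int iter = 0; iter < num_iters; ++iter) {
//         run_kernel_reduction_cached(
//             exec, [] GKO_KERNEL(auto i, auto x) { return abs(x[i]); },
//             [] GKO_KERNEL(auto a, auto b) { return a + b; },
//             [] GKO_KERNEL(auto a) { return a; }, double{}, result, size,
//             tmp, x);
//     }
//
// Because the workspace is an Array<char>, one buffer can back reductions of
// different value types; a call reallocates only when the required
// sizeof(ValueType) * num_blocks bytes exceed the buffer's current size.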
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index a89d1f3c140..13dae15583f 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -256,11 +256,8 @@ GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp
index 49f128a126d..afa0de1f0e9 100644
--- a/core/matrix/dense.cpp
+++ b/core/matrix/dense.cpp
@@ -77,12 +77,8 @@ GKO_REGISTER_OPERATION(sub_scaled, dense::sub_scaled);
 GKO_REGISTER_OPERATION(add_scaled_diag, dense::add_scaled_diag);
 GKO_REGISTER_OPERATION(sub_scaled_diag, dense::sub_scaled_diag);
 GKO_REGISTER_OPERATION(compute_dot, dense::compute_dot);
-GKO_REGISTER_OPERATION(compute_dot_dispatch, dense::compute_dot_dispatch);
 GKO_REGISTER_OPERATION(compute_conj_dot, dense::compute_conj_dot);
-GKO_REGISTER_OPERATION(compute_conj_dot_dispatch,
-                       dense::compute_conj_dot_dispatch);
 GKO_REGISTER_OPERATION(compute_norm2, dense::compute_norm2);
-GKO_REGISTER_OPERATION(compute_norm2_dispatch, dense::compute_norm2_dispatch);
 GKO_REGISTER_OPERATION(compute_norm1, dense::compute_norm1);
 GKO_REGISTER_OPERATION(compute_max_nnz_per_row, dense::compute_max_nnz_per_row);
 GKO_REGISTER_OPERATION(compute_hybrid_coo_row_ptrs,
@@ -154,6 +150,75 @@ void Dense<ValueType>::fill(const ValueType value)
 }
 
 
+template <typename ValueType>
+void Dense<ValueType>::scale(const LinOp* alpha)
+{
+    auto exec = this->get_executor();
+    this->scale_impl(make_temporary_clone(exec, alpha).get());
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::inv_scale(const LinOp* alpha)
+{
+    auto exec = this->get_executor();
+    this->inv_scale_impl(make_temporary_clone(exec, alpha).get());
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::add_scaled(const LinOp* alpha, const LinOp* b)
+{
+    auto exec = this->get_executor();
+    this->add_scaled_impl(make_temporary_clone(exec, alpha).get(),
+                          make_temporary_clone(exec, b).get());
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::sub_scaled(const LinOp* alpha, const LinOp* b)
+{
+    auto exec = this->get_executor();
+    this->sub_scaled_impl(make_temporary_clone(exec, alpha).get(),
+                          make_temporary_clone(exec, b).get());
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::compute_dot(const LinOp* b, LinOp* result) const
+{
+    auto exec = this->get_executor();
+    this->compute_dot_impl(make_temporary_clone(exec, b).get(),
+                           make_temporary_output_clone(exec, result).get());
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::compute_conj_dot(const LinOp* b, LinOp* result) const
+{
+    auto exec = this->get_executor();
+    this->compute_conj_dot_impl(
+        make_temporary_clone(exec, b).get(),
+        make_temporary_output_clone(exec, result).get());
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::compute_norm2(LinOp* result) const
+{
+    auto exec = this->get_executor();
+    this->compute_norm2_impl(make_temporary_output_clone(exec, result).get());
+}
+
+
+template <typename ValueType>
+void Dense<ValueType>::compute_norm1(LinOp* result) const
+{
+    auto exec = this->get_executor();
+    this->compute_norm1_impl(make_temporary_output_clone(exec, result).get());
+}
+
+
 template <typename ValueType>
 void Dense<ValueType>::inv_scale_impl(const LinOp* alpha)
 {
@@ -269,6 +334,22 @@ void Dense<ValueType>::sub_scaled_impl(const LinOp* alpha, const LinOp* b)
 }
 
 
+template <typename ValueType>
+void Dense<ValueType>::compute_dot(const LinOp* b, LinOp* result,
+                                   Array<char>& tmp) const
+{
+    GKO_ASSERT_EQUAL_DIMENSIONS(this, b);
+    GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1]));
+    auto exec = this->get_executor();
+    auto local_b = make_temporary_clone(exec, b);
+    auto local_res = make_temporary_clone(exec, result);
+    auto dense_b = make_temporary_conversion<ValueType>(local_b.get());
+    auto dense_res = make_temporary_conversion<ValueType>(local_res.get());
+    exec->run(
dense::make_compute_dot(this, dense_b.get(), dense_res.get(), tmp)); +} + + template void Dense::compute_dot_impl(const LinOp* b, LinOp* result) const { @@ -277,8 +358,25 @@ void Dense::compute_dot_impl(const LinOp* b, LinOp* result) const auto exec = this->get_executor(); auto dense_b = make_temporary_conversion(b); auto dense_res = make_temporary_conversion(result); + Array tmp{exec}; exec->run( - dense::make_compute_dot_dispatch(this, dense_b.get(), dense_res.get())); + dense::make_compute_dot(this, dense_b.get(), dense_res.get(), tmp)); +} + + +template +void Dense::compute_conj_dot(const LinOp* b, LinOp* result, + Array& tmp) const +{ + GKO_ASSERT_EQUAL_DIMENSIONS(this, b); + GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); + auto exec = this->get_executor(); + auto local_b = make_temporary_clone(exec, b); + auto local_res = make_temporary_clone(exec, result); + auto dense_b = make_temporary_conversion(local_b.get()); + auto dense_res = make_temporary_conversion(local_res.get()); + exec->run(dense::make_compute_conj_dot(this, dense_b.get(), dense_res.get(), + tmp)); } @@ -291,8 +389,21 @@ void Dense::compute_conj_dot_impl(const LinOp* b, auto exec = this->get_executor(); auto dense_b = make_temporary_conversion(b); auto dense_res = make_temporary_conversion(result); - exec->run(dense::make_compute_conj_dot_dispatch(this, dense_b.get(), - dense_res.get())); + Array tmp{exec}; + exec->run(dense::make_compute_conj_dot(this, dense_b.get(), dense_res.get(), + tmp)); +} + + +template +void Dense::compute_norm2(LinOp* result, Array& tmp) const +{ + GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); + auto exec = this->get_executor(); + auto local_result = make_temporary_clone(exec, result); + auto dense_res = make_temporary_conversion>( + local_result.get()); + exec->run(dense::make_compute_norm2(this, dense_res.get(), tmp)); } @@ -303,9 +414,23 @@ void Dense::compute_norm2_impl(LinOp* result) const auto exec = this->get_executor(); auto dense_res = make_temporary_conversion>(result); - exec->run(dense::make_compute_norm2_dispatch(this, dense_res.get())); + Array tmp{exec}; + exec->run(dense::make_compute_norm2(this, dense_res.get(), tmp)); +} + + +template +void Dense::compute_norm1(LinOp* result, Array& tmp) const +{ + GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); + auto exec = this->get_executor(); + auto local_result = make_temporary_clone(exec, result); + auto dense_res = make_temporary_conversion>( + local_result.get()); + exec->run(dense::make_compute_norm1(this, dense_res.get(), tmp)); } + template void Dense::compute_norm1_impl(LinOp* result) const { @@ -313,9 +438,11 @@ void Dense::compute_norm1_impl(LinOp* result) const auto exec = this->get_executor(); auto dense_res = make_temporary_conversion>(result); - exec->run(dense::make_compute_norm1(this, dense_res.get())); + Array tmp{exec}; + exec->run(dense::make_compute_norm1(this, dense_res.get(), tmp)); } + template void Dense::convert_to(Dense* result) const { diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp index b5eded2ceb4..1ef142eb4d7 100644 --- a/core/matrix/dense_kernels.hpp +++ b/core/matrix/dense_kernels.hpp @@ -104,44 +104,29 @@ namespace kernels { const matrix::Diagonal<_type>* x, \ matrix::Dense<_type>* y) -#define GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL(_type) \ - void compute_dot_dispatch(std::shared_ptr exec, \ - const matrix::Dense<_type>* x, \ - const matrix::Dense<_type>* y, \ - matrix::Dense<_type>* result) - #define 
GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL(_type) \ void compute_dot(std::shared_ptr exec, \ const matrix::Dense<_type>* x, \ const matrix::Dense<_type>* y, \ - matrix::Dense<_type>* result) - -#define GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL(_type) \ - void compute_conj_dot_dispatch( \ - std::shared_ptr exec, \ - const matrix::Dense<_type>* x, const matrix::Dense<_type>* y, \ - matrix::Dense<_type>* result) + matrix::Dense<_type>* result, Array& tmp) #define GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL(_type) \ void compute_conj_dot(std::shared_ptr exec, \ const matrix::Dense<_type>* x, \ const matrix::Dense<_type>* y, \ - matrix::Dense<_type>* result) - -#define GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL(_type) \ - void compute_norm2(std::shared_ptr exec, \ - const matrix::Dense<_type>* x, \ - matrix::Dense>* result) + matrix::Dense<_type>* result, Array& tmp) -#define GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL(_type) \ - void compute_norm2_dispatch(std::shared_ptr exec, \ - const matrix::Dense<_type>* x, \ - matrix::Dense>* result) +#define GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL(_type) \ + void compute_norm2(std::shared_ptr exec, \ + const matrix::Dense<_type>* x, \ + matrix::Dense>* result, \ + Array& tmp) -#define GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(_type) \ - void compute_norm1(std::shared_ptr exec, \ - const matrix::Dense<_type>* x, \ - matrix::Dense>* result) +#define GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(_type) \ + void compute_norm1(std::shared_ptr exec, \ + const matrix::Dense<_type>* x, \ + matrix::Dense>* result, \ + Array& tmp) #define GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(_type, _prec) \ void fill_in_matrix_data(std::shared_ptr exec, \ @@ -325,16 +310,10 @@ namespace kernels { template \ GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL(ValueType); \ template \ - GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL(ValueType); \ - template \ GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL(ValueType); \ template \ - GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL(ValueType); \ - template \ GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL(ValueType); \ template \ - GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL(ValueType); \ - template \ GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(ValueType); \ template \ GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \ diff --git a/core/synthesizer/implementation_selection.hpp b/core/synthesizer/implementation_selection.hpp index a78ac391d2e..a2cf335840e 100644 --- a/core/synthesizer/implementation_selection.hpp +++ b/core/synthesizer/implementation_selection.hpp @@ -50,7 +50,7 @@ namespace syn { typename... InferredArgs> \ inline void _name(::gko::syn::value_list, Predicate, \ ::gko::syn::value_list, \ - ::gko::syn::type_list, InferredArgs...) \ + ::gko::syn::type_list, InferredArgs&&...) \ GKO_KERNEL_NOT_FOUND; \ \ template , Predicate is_eligible, \ ::gko::syn::value_list int_args, \ - ::gko::syn::type_list type_args, InferredArgs... args) \ + ::gko::syn::type_list type_args, InferredArgs&&... args) \ { \ if (is_eligible(K)) { \ _callable( \ @@ -81,7 +81,7 @@ namespace syn { ::gko::syn::value_list, \ ::gko::syn::value_list, \ ::gko::syn::value_list, \ - ::gko::syn::type_list, InferredArgs...) \ + ::gko::syn::type_list, InferredArgs&&...) \ GKO_KERNEL_NOT_FOUND; \ \ template bool_args, \ ::gko::syn::value_list int_args, \ ::gko::syn::value_list size_args, \ - ::gko::syn::type_list type_args, InferredArgs... args) \ + ::gko::syn::type_list type_args, InferredArgs&&... 
args) \ { \ if (is_eligible(K)) { \ _callable( \ diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu index 0d39cbb9fbc..063b0f069d3 100644 --- a/cuda/matrix/dense_kernels.cu +++ b/cuda/matrix/dense_kernels.cu @@ -73,76 +73,6 @@ constexpr int default_block_size = 512; #include "common/cuda_hip/matrix/dense_kernels.hpp.inc" -template -void compute_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { - if (cublas::is_supported::value) { - auto handle = exec->get_cublas_handle(); - cublas::dot(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), y->get_stride(), - result->get_values()); - } else { - compute_dot(exec, x, y, result); - } - } else { - compute_dot(exec, x, y, result); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); - - -template -void compute_conj_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { - if (cublas::is_supported::value) { - auto handle = exec->get_cublas_handle(); - cublas::conj_dot(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), - y->get_stride(), result->get_values()); - } else { - compute_conj_dot(exec, x, y, result); - } - } else { - compute_conj_dot(exec, x, y, result); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); - - -template -void compute_norm2_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - if (x->get_size()[1] == 1) { - if (cublas::is_supported::value) { - auto handle = exec->get_cublas_handle(); - cublas::norm2(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), result->get_values()); - } else { - compute_norm2(exec, x, result); - } - } else { - compute_norm2(exec, x, result); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); - - template void simple_apply(std::shared_ptr exec, const matrix::Dense* a, diff --git a/cuda/solver/cb_gmres_kernels.cu b/cuda/solver/cb_gmres_kernels.cu index f494fa760b8..7bee0750390 100644 --- a/cuda/solver/cb_gmres_kernels.cu +++ b/cuda/solver/cb_gmres_kernels.cu @@ -137,12 +137,14 @@ void initialize_2(std::shared_ptr exec, constexpr auto block_size = default_block_size; const auto stride_arnoldi = arnoldi_norm->get_stride(); + Array tmp{exec}; + initialize_2_1_kernel<<>>( residual->get_size()[0], residual->get_size()[1], krylov_dim, acc::as_cuda_range(krylov_bases), as_cuda_type(residual_norm_collection->get_values()), residual_norm_collection->get_stride()); - kernels::cuda::dense::compute_norm2(exec, residual, residual_norm); + kernels::cuda::dense::compute_norm2(exec, residual, residual_norm, tmp); if (use_scalar) { components::fill_array(exec, diff --git a/cuda/solver/gmres_kernels.cu b/cuda/solver/gmres_kernels.cu index 5e946868fdf..d228b47dd37 100644 --- a/cuda/solver/gmres_kernels.cu +++ b/cuda/solver/gmres_kernels.cu @@ -117,8 +117,9 @@ void initialize_2(std::shared_ptr exec, default_block_size); const auto block_dim = default_block_size; constexpr auto block_size = default_block_size; + Array tmp{exec}; - kernels::cuda::dense::compute_norm2(exec, residual, residual_norm); + kernels::cuda::dense::compute_norm2(exec, residual, residual_norm, tmp); const auto grid_dim_2 = 
ceildiv(num_rows * num_rhs, default_block_size); initialize_2_2_kernel<<>>( diff --git a/cuda/test/base/kernel_launch.cu b/cuda/test/base/kernel_launch.cu index 647d42120ad..af1a3d471f9 100644 --- a/cuda/test/base/kernel_launch.cu +++ b/cuda/test/base/kernel_launch.cu @@ -51,15 +51,42 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/test/utils.hpp" -namespace { - - using gko::dim; using gko::int64; using gko::size_type; using std::is_same; +struct move_only_type { + move_only_type() {} + + move_only_type(move_only_type&&) {} + + move_only_type(const move_only_type&) = delete; +}; + + +move_only_type move_only_val{}; + + +namespace gko { +namespace kernels { +namespace cuda { + + +template <> +struct to_device_type_impl { + using type = int64; + + static type map_to_device(move_only_type&) { return 0; } +}; + + +} // namespace cuda +} // namespace kernels +} // namespace gko + + class KernelLaunch : public ::testing::Test { protected: KernelLaunch() @@ -105,12 +132,13 @@ void run1d(std::shared_ptr exec, size_type dim, int* data) { gko::kernels::cuda::run_kernel( exec, - [] GKO_KERNEL(auto i, auto d) { + [] GKO_KERNEL(auto i, auto d, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); d[i] = i; }, - dim, data); + dim, data, move_only_val); } TEST_F(KernelLaunch, Runs1D) @@ -125,17 +153,18 @@ void run1d(std::shared_ptr exec, gko::Array& data) { gko::kernels::cuda::run_kernel( exec, - [] GKO_KERNEL(auto i, auto d, auto d_ptr) { + [] GKO_KERNEL(auto i, auto d, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); if (d == d_ptr) { d[i] = i; } else { d[i] = 0; } }, - data.get_num_elems(), data, data.get_const_data()); + data.get_num_elems(), data, data.get_const_data(), move_only_val); } TEST_F(KernelLaunch, Runs1DArray) @@ -150,13 +179,14 @@ void run1d(std::shared_ptr exec, gko::matrix::Dense<>* m) { gko::kernels::cuda::run_kernel( exec, - [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr) { + [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); bool pointers_correct = d.data == d_ptr && d2.data == d_ptr; bool strides_correct = d.stride == 5 && d2.stride == 5; bool accessors_2d_correct = @@ -171,7 +201,7 @@ void run1d(std::shared_ptr exec, gko::matrix::Dense<>* m) } }, 16, m, static_cast*>(m), - m->get_const_values()); + m->get_const_values(), move_only_val); } TEST_F(KernelLaunch, Runs1DDense) @@ -186,13 +216,14 @@ void run2d(std::shared_ptr exec, int* data) { gko::kernels::cuda::run_kernel( exec, - [] GKO_KERNEL(auto i, auto j, auto d) { + [] GKO_KERNEL(auto i, auto j, auto d, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); d[i + 4 * j] = 4 * i + j; }, - dim<2>{4, 4}, data); + dim<2>{4, 4}, data, move_only_val); } TEST_F(KernelLaunch, Runs2D) @@ -207,18 +238,19 @@ void run2d(std::shared_ptr exec, gko::Array& data) { gko::kernels::cuda::run_kernel( exec, - [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr) { + [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); 
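            // `dummy` is the device-mapped form of `move_only_val`: the
            // to_device_type_impl specialization above maps move_only_type to
            // int64, which the decltype(dummy) assertion below verifies.
            // Because move_only_type has a deleted copy constructor, these
            // kernels also fail to compile if the launch path ever copies a
            // kernel argument on the host side.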
static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); if (d == d_ptr) { d[i + 4 * j] = 4 * i + j; } else { d[i + 4 * j] = 0; } }, - dim<2>{4, 4}, data, data.get_const_data()); + dim<2>{4, 4}, data, data.get_const_data(), move_only_val); } TEST_F(KernelLaunch, Runs2DArray) @@ -235,7 +267,7 @@ void run2d(std::shared_ptr exec, gko::matrix::Dense<>* m1, gko::kernels::cuda::run_kernel_solver( exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3, - auto d4, auto d2_ptr, auto d3_ptr) { + auto d4, auto d2_ptr, auto d3_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, @@ -246,6 +278,7 @@ void run2d(std::shared_ptr exec, gko::matrix::Dense<>* m1, static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); bool pointers_correct = d.data == d_ptr && d2.data == d_ptr && d3.data == d2_ptr && d4 == d3_ptr; bool strides_correct = @@ -266,7 +299,8 @@ void run2d(std::shared_ptr exec, gko::matrix::Dense<>* m1, dim<2>{4, 4}, m2->get_stride(), m1, static_cast*>(m1), m1->get_const_values(), gko::kernels::cuda::default_stride(m2), - gko::kernels::cuda::row_vector(m3), m2->get_values(), m3->get_values()); + gko::kernels::cuda::row_vector(m3), m2->get_values(), m3->get_values(), + move_only_val); } TEST_F(KernelLaunch, Runs2DDense) @@ -283,23 +317,25 @@ void run1d_reduction(std::shared_ptr exec) gko::kernels::cuda::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto a) { + [] GKO_KERNEL(auto i, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return i + 1; }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), - size_type{100000}, output); + size_type{100000}, output, move_only_val); // 2 * sum i=0...99999 (i+1) ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000LL); gko::kernels::cuda::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto a) { + [] GKO_KERNEL(auto i, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return i + 1; }, [] GKO_KERNEL(auto i, auto j) { @@ -311,7 +347,7 @@ void run1d_reduction(std::shared_ptr exec) static_assert(is_same::value, "value"); return j * 2; }, - int64{}, output.get_data(), size_type{100}, output); + int64{}, output.get_data(), size_type{100}, output, move_only_val); // 2 * sum i=0...99 (i+1) ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100LL); @@ -326,10 +362,11 @@ void run2d_reduction(std::shared_ptr exec) gko::kernels::cuda::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { @@ -341,17 +378,19 @@ void run2d_reduction(std::shared_ptr exec) static_assert(is_same::value, "value"); return j * 4; }, - int64{}, output.get_data(), gko::dim<2>{1000, 100}, output); + int64{}, output.get_data(), gko::dim<2>{1000, 100}, output, + move_only_val); // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) 
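    // explicitly: sum i=1..1000 of i is 500500, sum j=1..100 of j is 5050,
    // and 4 * 500500 * 5050 = 10110100000, matching the assertion below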
ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000LL); gko::kernels::cuda::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { @@ -363,7 +402,7 @@ void run2d_reduction(std::shared_ptr exec) static_assert(is_same::value, "value"); return j * 4; }, - int64{}, output.get_data(), gko::dim<2>{10, 10}, output); + int64{}, output.get_data(), gko::dim<2>{10, 10}, output, move_only_val); // 4 * sum i=0...9 sum j=0...9 of (i+1)*(j+1) ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100LL); @@ -391,10 +430,12 @@ void run2d_row_reduction(std::shared_ptr exec) gko::kernels::cuda::run_kernel_row_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, + "dummy"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { @@ -409,7 +450,7 @@ void run2d_row_reduction(std::shared_ptr exec) int64{}, output.get_data(), 2, gko::dim<2>{static_cast(num_rows), static_cast(num_cols)}, - output); + output, move_only_val); GKO_ASSERT_ARRAY_EQ(host_ref, output); } @@ -440,10 +481,12 @@ void run2d_col_reduction(std::shared_ptr exec) gko::kernels::cuda::run_kernel_col_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, + "dummy"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { @@ -458,7 +501,7 @@ void run2d_col_reduction(std::shared_ptr exec) int64{}, output.get_data(), gko::dim<2>{static_cast(num_rows), static_cast(num_cols)}, - output); + output, move_only_val); GKO_ASSERT_ARRAY_EQ(host_ref, output); } @@ -466,6 +509,3 @@ void run2d_col_reduction(std::shared_ptr exec) } TEST_F(KernelLaunch, ReductionCol2D) { run2d_col_reduction(exec); } - - -} // namespace diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp index 0df00f58c6e..330e5aa7f44 100644 --- a/dpcpp/base/kernel_launch_reduction.dp.hpp +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -168,7 +168,7 @@ void run_kernel_reduction_impl(std::shared_ptr exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, size_type size, - MappedKernelArgs... args) + Array& tmp, MappedKernelArgs... 
args) { constexpr int oversubscription = 4; constexpr auto wg_size = KCFG_1D::decode<0>(cfg); @@ -178,18 +178,23 @@ void run_kernel_reduction_impl(std::shared_ptr exec, exec->get_num_computing_units() * oversubscription); auto queue = exec->get_queue(); if (num_workgroups > 1) { - Array partial{exec, static_cast(num_workgroups)}; + const auto required_storage = sizeof(ValueType) * num_workgroups; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_1d( cgh, static_cast(size), num_workgroups, fn, op, - [](auto v) { return v; }, identity, partial.get_data(), - args...); + [](auto v) { return v; }, identity, + reinterpret_cast(tmp.get_data()), args...); }); queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_1d( cgh, static_cast(num_workgroups), 1, [](auto i, auto v) { return v[i]; }, op, finalize, identity, - result, partial.get_const_data()); + result, + reinterpret_cast(tmp.get_const_data())); }); } else { queue->submit([&](sycl::handler& cgh) { @@ -207,7 +212,7 @@ template exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, - ValueType* result, dim<2> size, + ValueType* result, dim<2> size, Array& tmp, MappedKernelArgs... args) { constexpr int oversubscription = 4; @@ -221,18 +226,23 @@ void run_kernel_reduction_impl(std::shared_ptr exec, exec->get_num_computing_units() * oversubscription); auto queue = exec->get_queue(); if (num_workgroups > 1) { - Array partial{exec, static_cast(num_workgroups)}; + const auto required_storage = sizeof(ValueType) * num_workgroups; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_2d( cgh, rows, cols, num_workgroups, fn, op, - [](auto v) { return v; }, identity, partial.get_data(), - args...); + [](auto v) { return v; }, identity, + reinterpret_cast(tmp.get_data()), args...); }); queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_1d( cgh, static_cast(num_workgroups), 1, [](auto i, auto v) { return v[i]; }, op, finalize, identity, - result, partial.get_const_data()); + result, + reinterpret_cast(tmp.get_const_data())); }); } else { queue->submit([&](sycl::handler& cgh) { @@ -249,10 +259,11 @@ GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(select_run_kernel_reduction, template -void run_kernel_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, dim<2> size, KernelArgs&&... args) +void run_kernel_reduction_cached(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + Array& tmp, KernelArgs&&... args) { const auto desired_cfg = get_first_cfg( as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { @@ -264,17 +275,17 @@ void run_kernel_reduction(std::shared_ptr exec, [&](std::uint32_t cfg) { return cfg == desired_cfg; }, syn::value_list(), syn::value_list(), syn::value_list(), syn::type_list<>(), exec, fn, op, - finalize, identity, result, size, map_to_device(args)...); + finalize, identity, result, size, tmp, map_to_device(args)...); } template -void run_kernel_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, size_type size, - KernelArgs&&... 
args) +void run_kernel_reduction_cached(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type size, + Array& tmp, KernelArgs&&... args) { const auto desired_cfg = get_first_cfg( as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { @@ -286,7 +297,7 @@ void run_kernel_reduction(std::shared_ptr exec, [&](std::uint32_t cfg) { return cfg == desired_cfg; }, syn::value_list(), syn::value_list(), syn::value_list(), syn::type_list<>(), exec, fn, op, - finalize, identity, result, size, map_to_device(args)...); + finalize, identity, result, size, tmp, map_to_device(args)...); } @@ -501,7 +512,8 @@ void run_generic_col_reduction_small(syn::value_list, int64 max_workgroups, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, - dim<2> size, MappedKernelArgs... args) + dim<2> size, Array& tmp, + MappedKernelArgs... args) { constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); @@ -518,17 +530,22 @@ void run_generic_col_reduction_small(syn::value_list, args...); }); } else { - Array tmp_storage{exec, - static_cast(row_blocks * cols)}; + const auto required_storage = sizeof(ValueType) * row_blocks * cols; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } queue->submit([&](sycl::handler& cgh) { generic_kernel_col_reduction_2d_small( cgh, rows, cols, row_blocks, fn, op, [](auto v) { return v; }, - identity, tmp_storage.get_data(), args...); + identity, reinterpret_cast(tmp.get_data()), + args...); }); queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_finalize_2d( cgh, cols, row_blocks, op, finalize, identity, - tmp_storage.get_const_data(), 1, result); + reinterpret_cast(tmp.get_const_data()), 1, + result); }); } } @@ -544,7 +561,8 @@ void run_kernel_row_reduction_stage1(std::shared_ptr exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, size_type result_stride, - dim<2> size, MappedKernelArgs... args) + dim<2> size, Array& tmp, + MappedKernelArgs... args) { constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); @@ -559,17 +577,20 @@ void run_kernel_row_reduction_stage1(std::shared_ptr exec, auto queue = exec->get_queue(); if (rows * cols > resources && rows < cols) { const auto col_blocks = ceildiv(rows * cols, resources); - Array partial{exec, - static_cast(col_blocks * rows)}; + const auto required_storage = sizeof(ValueType) * col_blocks * rows; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } generic_kernel_row_reduction_2d( syn::value_list{}, exec, rows, cols, col_blocks, fn, - op, [](auto v) { return v; }, identity, partial.get_data(), 1, - args...); + op, [](auto v) { return v; }, identity, + reinterpret_cast(tmp.get_data()), 1, args...); queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_finalize_2d( cgh, rows, col_blocks, op, finalize, identity, - partial.get_const_data(), static_cast(result_stride), - result); + reinterpret_cast(tmp.get_const_data()), + static_cast(result_stride), result); }); } else { select_generic_kernel_row_reduction_2d( @@ -595,7 +616,7 @@ void run_kernel_col_reduction_stage1(std::shared_ptr exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, - MappedKernelArgs... 
args) + Array& tmp, MappedKernelArgs... args) { constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); @@ -615,7 +636,7 @@ void run_kernel_col_reduction_stage1(std::shared_ptr exec, compiled_ssg_size == sg_size; }, syn::value_list(), syn::type_list<>(), exec, max_blocks, - fn, op, finalize, identity, result, size, args...); + fn, op, finalize, identity, result, size, tmp, args...); } else { const auto col_blocks = ceildiv(cols, sg_size); const auto row_blocks = ceildiv( @@ -629,18 +650,22 @@ void run_kernel_col_reduction_stage1(std::shared_ptr exec, result, args...); }); } else { - Array tmp_storage{ - exec, static_cast(row_blocks * cols)}; + const auto required_storage = sizeof(ValueType) * row_blocks * cols; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } queue->submit([&](sycl::handler& cgh) { generic_kernel_col_reduction_2d_blocked( cgh, rows, cols, row_blocks, col_blocks, fn, op, - [](auto v) { return v; }, identity, tmp_storage.get_data(), - args...); + [](auto v) { return v; }, identity, + reinterpret_cast(tmp.get_data()), args...); }); queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_finalize_2d( cgh, cols, row_blocks, op, finalize, identity, - tmp_storage.get_const_data(), 1, result); + reinterpret_cast(tmp.get_const_data()), 1, + result); }); } } @@ -655,11 +680,12 @@ GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(select_kernel_col_reduction_stage1, template -void run_kernel_row_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, size_type result_stride, - dim<2> size, KernelArgs&&... args) +void run_kernel_row_reduction_cached(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type result_stride, + dim<2> size, Array& tmp, + KernelArgs&&... args) { const auto desired_cfg = get_first_cfg( as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { @@ -671,18 +697,18 @@ void run_kernel_row_reduction(std::shared_ptr exec, [&](std::uint32_t cfg) { return cfg == desired_cfg; }, syn::value_list(), syn::value_list(), syn::value_list(), syn::type_list<>(), exec, fn, op, - finalize, identity, result, result_stride, size, + finalize, identity, result, result_stride, size, tmp, map_to_device(args)...); } template -void run_kernel_col_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, dim<2> size, - KernelArgs&&... args) +void run_kernel_col_reduction_cached(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + Array& tmp, KernelArgs&&... 
args) { const auto desired_cfg = get_first_cfg( as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { @@ -694,7 +720,7 @@ void run_kernel_col_reduction(std::shared_ptr exec, [&](std::uint32_t cfg) { return cfg == desired_cfg; }, syn::value_list(), syn::value_list(), syn::value_list(), syn::type_list<>(), exec, fn, op, - finalize, identity, result, size, map_to_device(args)...); + finalize, identity, result, size, tmp, map_to_device(args)...); } diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 97c01e0fac5..39c2b0c54e4 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -189,47 +189,6 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(conj_transpose_call, conj_transpose, } // namespace kernel -template -void compute_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - // TODO Add onemkl for single column ? - compute_dot(exec, x, y, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); - - -template -void compute_conj_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - // TODO Add onemkl for single column ? - compute_conj_dot(exec, x, y, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); - - -template -void compute_norm2_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - // TODO Add onemkl for single column ? - compute_norm2(exec, x, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); - - template void simple_apply(std::shared_ptr exec, const matrix::Dense* a, diff --git a/dpcpp/solver/cb_gmres_kernels.dp.cpp b/dpcpp/solver/cb_gmres_kernels.dp.cpp index dc94bbb92ff..5e29c7aa612 100644 --- a/dpcpp/solver/cb_gmres_kernels.dp.cpp +++ b/dpcpp/solver/cb_gmres_kernels.dp.cpp @@ -1006,13 +1006,14 @@ void initialize_2(std::shared_ptr exec, const dim3 block_dim(default_block_size, 1, 1); constexpr auto block_size = default_block_size; const auto stride_arnoldi = arnoldi_norm->get_stride(); + Array tmp{exec}; initialize_2_1_kernel( grid_dim_1, block_dim, 0, exec->get_queue(), residual->get_size()[0], residual->get_size()[1], krylov_dim, krylov_bases, residual_norm_collection->get_values(), residual_norm_collection->get_stride()); - kernels::dpcpp::dense::compute_norm2(exec, residual, residual_norm); + kernels::dpcpp::dense::compute_norm2(exec, residual, residual_norm, tmp); if (use_scalar) { components::fill_array(exec, diff --git a/dpcpp/solver/gmres_kernels.dp.cpp b/dpcpp/solver/gmres_kernels.dp.cpp index 2fe0d63bcbf..5c68e1580d5 100644 --- a/dpcpp/solver/gmres_kernels.dp.cpp +++ b/dpcpp/solver/gmres_kernels.dp.cpp @@ -470,8 +470,9 @@ void initialize_2(std::shared_ptr exec, 1, 1); const dim3 block_dim(default_block_size, 1, 1); constexpr auto block_size = default_block_size; + Array tmp{exec}; - kernels::dpcpp::dense::compute_norm2(exec, residual, residual_norm); + kernels::dpcpp::dense::compute_norm2(exec, residual, residual_norm, tmp); const dim3 grid_dim_2(ceildiv(num_rows * num_rhs, default_block_size), 1, 1); diff --git a/dpcpp/test/base/kernel_launch.dp.cpp b/dpcpp/test/base/kernel_launch.dp.cpp index a0f2d58806b..6cad2c705f4 100644 --- a/dpcpp/test/base/kernel_launch.dp.cpp +++ b/dpcpp/test/base/kernel_launch.dp.cpp @@ -51,15 +51,42 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
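// Note: this DPC++ test mirrors the CUDA kernel-launch test changes above;
// its fixture additionally guards part of its setup behind
// GINKGO_DPCPP_SINGLE_MODE (see below), since double-precision support is
// optional for SYCL devices.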
#include "core/test/utils.hpp" -namespace { - - using gko::dim; using gko::int64; using gko::size_type; using std::is_same; +struct move_only_type { + move_only_type() {} + + move_only_type(move_only_type&&) {} + + move_only_type(const move_only_type&) = delete; +}; + + +move_only_type move_only_val{}; + + +namespace gko { +namespace kernels { +namespace dpcpp { + + +template <> +struct to_device_type_impl { + using type = int64; + + static type map_to_device(move_only_type&) { return 0; } +}; + + +} // namespace dpcpp +} // namespace kernels +} // namespace gko + + class KernelLaunch : public ::testing::Test { protected: #if GINKGO_DPCPP_SINGLE_MODE @@ -111,12 +138,13 @@ TEST_F(KernelLaunch, Runs1D) { gko::kernels::dpcpp::run_kernel( exec, - [] GKO_KERNEL(auto i, auto d) { + [] GKO_KERNEL(auto i, auto d, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); d[i] = i; }, - zero_array.get_num_elems(), zero_array.get_data()); + zero_array.get_num_elems(), zero_array.get_data(), move_only_val); GKO_ASSERT_ARRAY_EQ(zero_array, iota_array); } @@ -126,17 +154,19 @@ TEST_F(KernelLaunch, Runs1DArray) { gko::kernels::dpcpp::run_kernel( exec, - [] GKO_KERNEL(auto i, auto d, auto d_ptr) { + [] GKO_KERNEL(auto i, auto d, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); if (d == d_ptr) { d[i] = i; } else { d[i] = 0; } }, - zero_array.get_num_elems(), zero_array, zero_array.get_const_data()); + zero_array.get_num_elems(), zero_array, zero_array.get_const_data(), + move_only_val); GKO_ASSERT_ARRAY_EQ(zero_array, iota_array); } @@ -146,7 +176,7 @@ TEST_F(KernelLaunch, Runs1DDense) { gko::kernels::dpcpp::run_kernel( exec, - [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr) { + [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); @@ -154,6 +184,7 @@ TEST_F(KernelLaunch, Runs1DDense) "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); bool pointers_correct = d.data == d_ptr && d2.data == d_ptr; bool strides_correct = d.stride == 5 && d2.stride == 5; bool accessors_2d_correct = @@ -168,7 +199,7 @@ TEST_F(KernelLaunch, Runs1DDense) } }, 16, zero_dense2.get(), static_cast(zero_dense2.get()), - zero_dense2->get_const_values()); + zero_dense2->get_const_values(), move_only_val); GKO_ASSERT_MTX_NEAR(zero_dense2, iota_dense, 0.0); } @@ -178,13 +209,14 @@ TEST_F(KernelLaunch, Runs2D) { gko::kernels::dpcpp::run_kernel( exec, - [] GKO_KERNEL(auto i, auto j, auto d) { + [] GKO_KERNEL(auto i, auto j, auto d, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); d[i + 4 * j] = 4 * i + j; }, - dim<2>{4, 4}, zero_array.get_data()); + dim<2>{4, 4}, zero_array.get_data(), move_only_val); GKO_ASSERT_ARRAY_EQ(zero_array, iota_transp_array); } @@ -194,18 +226,19 @@ TEST_F(KernelLaunch, Runs2DArray) { gko::kernels::dpcpp::run_kernel( exec, - [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr) { + [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); if (d == d_ptr) 
{ d[i + 4 * j] = 4 * i + j; } else { d[i + 4 * j] = 0; } }, - dim<2>{4, 4}, zero_array, zero_array.get_const_data()); + dim<2>{4, 4}, zero_array, zero_array.get_const_data(), move_only_val); GKO_ASSERT_ARRAY_EQ(zero_array, iota_transp_array); } @@ -216,7 +249,7 @@ TEST_F(KernelLaunch, Runs2DDense) gko::kernels::dpcpp::run_kernel_solver( exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3, - auto d4, auto d2_ptr, auto d3_ptr) { + auto d4, auto d2_ptr, auto d3_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); @@ -229,6 +262,7 @@ TEST_F(KernelLaunch, Runs2DDense) "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); bool pointers_correct = d.data == d_ptr && d2.data == d_ptr && d3.data == d2_ptr && d4 == d3_ptr; bool strides_correct = @@ -251,7 +285,7 @@ TEST_F(KernelLaunch, Runs2DDense) zero_dense2->get_const_values(), gko::kernels::dpcpp::default_stride(zero_dense.get()), gko::kernels::dpcpp::row_vector(vec_dense.get()), - zero_dense->get_values(), vec_dense->get_values()); + zero_dense->get_values(), vec_dense->get_values(), move_only_val); GKO_ASSERT_MTX_NEAR(zero_dense2, iota_dense, 0.0); } @@ -263,9 +297,10 @@ TEST_F(KernelLaunch, Reduction1D) gko::kernels::dpcpp::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto a) { + [] GKO_KERNEL(auto i, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return i + 1; }, [] GKO_KERNEL(auto i, auto j) { @@ -277,16 +312,17 @@ TEST_F(KernelLaunch, Reduction1D) static_assert(is_same::value, "j"); return j * 2; }, - int64{}, output.get_data(), size_type{100000}, output); + int64{}, output.get_data(), size_type{100000}, output, move_only_val); // 2 * sum i=0...99999 (i+1) EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000LL); gko::kernels::dpcpp::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto a) { + [] GKO_KERNEL(auto i, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return i + 1; }, [] GKO_KERNEL(auto i, auto j) { @@ -298,7 +334,7 @@ TEST_F(KernelLaunch, Reduction1D) static_assert(is_same::value, "j"); return j * 2; }, - int64{}, output.get_data(), size_type{100}, output); + int64{}, output.get_data(), size_type{100}, output, move_only_val); // 2 * sum i=0...99 (i+1) EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100LL); @@ -311,9 +347,11 @@ TEST_F(KernelLaunch, Reduction2D) gko::kernels::dpcpp::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { @@ -325,16 +363,19 @@ TEST_F(KernelLaunch, Reduction2D) static_assert(is_same::value, "j"); return j * 4; }, - int64{}, output.get_data(), gko::dim<2>{1000, 100}, output); + int64{}, output.get_data(), gko::dim<2>{1000, 100}, output, + move_only_val); // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000LL); gko::kernels::dpcpp::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); + 
static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { @@ -346,7 +387,7 @@ TEST_F(KernelLaunch, Reduction2D) static_assert(is_same::value, "j"); return j * 4; }, - int64{}, output.get_data(), gko::dim<2>{10, 10}, output); + int64{}, output.get_data(), gko::dim<2>{10, 10}, output, move_only_val); // 4 * sum i=0...9 sum j=0...9 of (i+1)*(j+1) ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100LL); @@ -372,9 +413,12 @@ TEST_F(KernelLaunch, ReductionRow2D) gko::kernels::dpcpp::run_kernel_row_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, + "dummy"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, @@ -382,7 +426,7 @@ TEST_F(KernelLaunch, ReductionRow2D) output.get_data(), 2, gko::dim<2>{static_cast(num_rows), static_cast(num_cols)}, - output); + output, move_only_val); GKO_ASSERT_ARRAY_EQ(host_ref, output); } @@ -409,9 +453,12 @@ TEST_F(KernelLaunch, ReductionCol2D) gko::kernels::dpcpp::run_kernel_col_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, + "dummy"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, @@ -419,12 +466,9 @@ TEST_F(KernelLaunch, ReductionCol2D) output.get_data(), gko::dim<2>{static_cast(num_rows), static_cast(num_cols)}, - output); + output, move_only_val); GKO_ASSERT_ARRAY_EQ(host_ref, output); } } } - - -} // namespace diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp index d883f415770..d193af9a4fb 100644 --- a/hip/matrix/dense_kernels.hip.cpp +++ b/hip/matrix/dense_kernels.hip.cpp @@ -76,76 +76,6 @@ constexpr int default_block_size = 512; #include "common/cuda_hip/matrix/dense_kernels.hpp.inc" -template -void compute_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { - if (hipblas::is_supported::value) { - auto handle = exec->get_hipblas_handle(); - hipblas::dot(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), - y->get_stride(), result->get_values()); - } else { - compute_dot(exec, x, y, result); - } - } else { - compute_dot(exec, x, y, result); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); - - -template -void compute_conj_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (x->get_size()[1] == 1 && y->get_size()[1] == 1) { - if (hipblas::is_supported::value) { - auto handle = exec->get_hipblas_handle(); - hipblas::conj_dot(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), - y->get_stride(), result->get_values()); - } else { - compute_conj_dot(exec, x, y, result); - } - } else { - compute_conj_dot(exec, x, y, result); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); - - -template -void compute_norm2_dispatch(std::shared_ptr exec, - const 
matrix::Dense* x, - matrix::Dense>* result) -{ - if (x->get_size()[1] == 1) { - if (hipblas::is_supported::value) { - auto handle = exec->get_hipblas_handle(); - hipblas::norm2(handle, x->get_size()[0], x->get_const_values(), - x->get_stride(), result->get_values()); - } else { - compute_norm2(exec, x, result); - } - } else { - compute_norm2(exec, x, result); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); - - template void simple_apply(std::shared_ptr exec, const matrix::Dense* a, diff --git a/hip/solver/cb_gmres_kernels.hip.cpp b/hip/solver/cb_gmres_kernels.hip.cpp index 1ab7dc0f554..09f206d355b 100644 --- a/hip/solver/cb_gmres_kernels.hip.cpp +++ b/hip/solver/cb_gmres_kernels.hip.cpp @@ -139,13 +139,14 @@ void initialize_2(std::shared_ptr exec, const auto block_dim = default_block_size; constexpr auto block_size = default_block_size; const auto stride_arnoldi = arnoldi_norm->get_stride(); + Array tmp{exec}; hipLaunchKernelGGL(initialize_2_1_kernel, grid_dim_1, block_dim, 0, 0, residual->get_size()[0], residual->get_size()[1], krylov_dim, acc::as_hip_range(krylov_bases), as_hip_type(residual_norm_collection->get_values()), residual_norm_collection->get_stride()); - kernels::hip::dense::compute_norm2(exec, residual, residual_norm); + kernels::hip::dense::compute_norm2(exec, residual, residual_norm, tmp); if (use_scalar) { components::fill_array(exec, diff --git a/hip/solver/gmres_kernels.hip.cpp b/hip/solver/gmres_kernels.hip.cpp index 5945385e732..6c4c405f0de 100644 --- a/hip/solver/gmres_kernels.hip.cpp +++ b/hip/solver/gmres_kernels.hip.cpp @@ -121,8 +121,9 @@ void initialize_2(std::shared_ptr exec, default_block_size); const auto block_dim = default_block_size; constexpr auto block_size = default_block_size; + Array tmp{exec}; - kernels::hip::dense::compute_norm2(exec, residual, residual_norm); + kernels::hip::dense::compute_norm2(exec, residual, residual_norm, tmp); const auto grid_dim_2 = ceildiv(num_rows * num_rhs, default_block_size); hipLaunchKernelGGL( diff --git a/hip/test/base/kernel_launch.hip.cpp b/hip/test/base/kernel_launch.hip.cpp index 1ebf38cd1ba..e757cb6ef18 100644 --- a/hip/test/base/kernel_launch.hip.cpp +++ b/hip/test/base/kernel_launch.hip.cpp @@ -51,15 +51,42 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
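// The tests in this file extend every device lambda with a trailing
// `dummy` parameter, matched by passing `move_only_val` as the last
// argument of each kernel launch. Every extra argument handed to
// run_kernel goes through map_to_device, which delegates to a
// to_device_type_impl specialization (specialized for move_only_type
// just below), so the fact that a move-only type compiles at all shows
// that the launch machinery forwards arguments by reference instead of
// copying them. A minimal caller-side sketch, assuming only the APIs
// exercised in this file (`exec`, `size` and `device_ptr` are
// illustrative placeholders, not part of the patch):
//
//     gko::kernels::hip::run_kernel(
//         exec,
//         [] GKO_KERNEL(auto i, auto d, auto dummy) {
//             // dummy arrives as the int64 returned by the
//             // to_device_type_impl specialization's map_to_device
//             d[i] = i;
//         },
//         size, device_ptr, move_only_val);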
#include "core/test/utils.hpp" -namespace { - - using gko::dim; using gko::int64; using gko::size_type; using std::is_same; +struct move_only_type { + move_only_type() {} + + move_only_type(move_only_type&&) {} + + move_only_type(const move_only_type&) = delete; +}; + + +move_only_type move_only_val{}; + + +namespace gko { +namespace kernels { +namespace hip { + + +template <> +struct to_device_type_impl { + using type = int64; + + static type map_to_device(move_only_type&) { return 0; } +}; + + +} // namespace hip +} // namespace kernels +} // namespace gko + + class KernelLaunch : public ::testing::Test { protected: KernelLaunch() @@ -104,12 +131,13 @@ void run1d(std::shared_ptr exec, size_type dim, int* data) { gko::kernels::hip::run_kernel( exec, - [] GKO_KERNEL(auto i, auto d) { + [] GKO_KERNEL(auto i, auto d, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); d[i] = i; }, - dim, data); + dim, data, move_only_val); } TEST_F(KernelLaunch, Runs1D) @@ -124,17 +152,18 @@ void run1d(std::shared_ptr exec, gko::Array& data) { gko::kernels::hip::run_kernel( exec, - [] GKO_KERNEL(auto i, auto d, auto d_ptr) { + [] GKO_KERNEL(auto i, auto d, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); if (d == d_ptr) { d[i] = i; } else { d[i] = 0; } }, - data.get_num_elems(), data, data.get_const_data()); + data.get_num_elems(), data, data.get_const_data(), move_only_val); } TEST_F(KernelLaunch, Runs1DArray) @@ -149,13 +178,14 @@ void run1d(std::shared_ptr exec, gko::matrix::Dense<>* m) { gko::kernels::hip::run_kernel( exec, - [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr) { + [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); bool pointers_correct = d.data == d_ptr && d2.data == d_ptr; bool strides_correct = d.stride == 5 && d2.stride == 5; bool accessors_2d_correct = @@ -170,7 +200,7 @@ void run1d(std::shared_ptr exec, gko::matrix::Dense<>* m) } }, 16, m, static_cast*>(m), - m->get_const_values()); + m->get_const_values(), move_only_val); } TEST_F(KernelLaunch, Runs1DDense) @@ -185,13 +215,14 @@ void run2d(std::shared_ptr exec, int* data) { gko::kernels::hip::run_kernel( exec, - [] GKO_KERNEL(auto i, auto j, auto d) { + [] GKO_KERNEL(auto i, auto j, auto d, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); d[i + 4 * j] = 4 * i + j; }, - dim<2>{4, 4}, data); + dim<2>{4, 4}, data, move_only_val); } TEST_F(KernelLaunch, Runs2D) @@ -206,18 +237,19 @@ void run2d(std::shared_ptr exec, gko::Array& data) { gko::kernels::hip::run_kernel( exec, - [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr) { + [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); if (d == d_ptr) { d[i + 4 * j] = 4 * i + j; } else { d[i + 4 * j] = 0; } }, - dim<2>{4, 4}, data, data.get_const_data()); + dim<2>{4, 4}, data, data.get_const_data(), move_only_val); } TEST_F(KernelLaunch, 
Runs2DArray) @@ -234,7 +266,7 @@ void run2d(std::shared_ptr exec, gko::matrix::Dense<>* m1, gko::kernels::hip::run_kernel_solver( exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3, - auto d4, auto d2_ptr, auto d3_ptr) { + auto d4, auto d2_ptr, auto d3_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, @@ -245,6 +277,7 @@ void run2d(std::shared_ptr exec, gko::matrix::Dense<>* m1, static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); bool pointers_correct = d.data == d_ptr && d2.data == d_ptr && d3.data == d2_ptr && d4 == d3_ptr; bool strides_correct = @@ -265,7 +298,8 @@ void run2d(std::shared_ptr exec, gko::matrix::Dense<>* m1, dim<2>{4, 4}, m2->get_stride(), m1, static_cast*>(m1), m1->get_const_values(), gko::kernels::hip::default_stride(m2), - gko::kernels::hip::row_vector(m3), m2->get_values(), m3->get_values()); + gko::kernels::hip::row_vector(m3), m2->get_values(), m3->get_values(), + move_only_val); } TEST_F(KernelLaunch, Runs2DDense) @@ -282,28 +316,37 @@ void run1d_reduction(std::shared_ptr exec) gko::kernels::hip::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto a) { + [] GKO_KERNEL(auto i, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return i + 1; }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), - size_type{100000}, output); + size_type{100000}, output, move_only_val); // 2 * sum i=0...99999 (i+1) ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000LL); gko::kernels::hip::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto a) { + [] GKO_KERNEL(auto i, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return i + 1; }, - [] GKO_KERNEL(auto i, auto j) { return i + j; }, - [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), - size_type{100}, output); + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 2; + }, + int64{}, output.get_data(), size_type{100}, output, move_only_val); // 2 * sum i=0...99 (i+1) ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100LL); @@ -318,28 +361,47 @@ void run2d_reduction(std::shared_ptr exec) gko::kernels::hip::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return (i + 1) * (j + 1); }, - [] GKO_KERNEL(auto i, auto j) { return i + j; }, - [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{1000, 100}, output); + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 4; + }, + int64{}, output.get_data(), gko::dim<2>{1000, 100}, output, + move_only_val); // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 
10110100000LL); gko::kernels::hip::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return (i + 1) * (j + 1); }, - [] GKO_KERNEL(auto i, auto j) { return i + j; }, - [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{10, 10}, output); + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 4; + }, + int64{}, output.get_data(), gko::dim<2>{10, 10}, output, move_only_val); // 4 * sum i=0...9 sum j=0...9 of (i+1)*(j+1) ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100LL); @@ -350,32 +412,48 @@ TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } void run2d_row_reduction(std::shared_ptr exec) { - int num_rows = 1000; - int num_cols = 100; - gko::Array host_ref{exec->get_master(), - static_cast(2 * num_rows)}; - std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); - gko::Array output{exec, host_ref}; - for (int i = 0; i < num_rows; i++) { - // we are computing 2 * sum {j=0, j::value, "index"); - static_assert(is_same::value, "value"); - return (i + 1) * (j + 1); - }, - [] GKO_KERNEL(auto i, auto j) { return i + j; }, - [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2, - gko::dim<2>{static_cast(num_rows), - static_cast(num_cols)}, - output); + for (auto num_rows : {0, 100, 1000, 10000}) { + for (auto num_cols : {0, 10, 100, 1000, 10000}) { + SCOPED_TRACE(std::to_string(num_rows) + " rows, " + + std::to_string(num_cols) + " cols"); + gko::Array host_ref{exec->get_master(), + static_cast(2 * num_rows)}; + std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); + gko::Array output{exec, host_ref}; + for (int64 i = 0; i < num_rows; i++) { + // we are computing 2 * sum {j=0, j(num_cols) * (num_cols + 1) * (i + 1); + } - GKO_ASSERT_ARRAY_EQ(host_ref, output); + gko::kernels::hip::run_kernel_row_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + static_assert(is_same::value, + "dummy"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 2; + }, + int64{}, output.get_data(), 2, + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}, + output, move_only_val); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); + } + } } TEST_F(KernelLaunch, ReductionRow2D) { run2d_row_reduction(exec); } @@ -383,34 +461,50 @@ TEST_F(KernelLaunch, ReductionRow2D) { run2d_row_reduction(exec); } void run2d_col_reduction(std::shared_ptr exec) { - int num_rows = 1000; - int num_cols = 100; - gko::Array host_ref{exec->get_master(), - static_cast(num_cols)}; - gko::Array output{exec, static_cast(num_cols)}; - for (int i = 0; i < num_cols; i++) { - // we are computing 2 * sum {j=0, j::value, "index"); - static_assert(is_same::value, "value"); - return (i + 1) * (j + 1); - }, - [] GKO_KERNEL(auto i, auto j) { return i + j; }, - [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), - gko::dim<2>{static_cast(num_rows), - 
static_cast(num_cols)}, - output); + // empty, most threads idle, most threads busy, multiple blocks + for (auto num_rows : {0, 10, 100, 1000, 10000}) { + // check different edge cases: subwarp sizes, blocked mode + for (auto num_cols : + {0, 1, 2, 3, 4, 5, 7, 8, 9, 16, 31, 32, 63, 127, 128, 129}) { + SCOPED_TRACE(std::to_string(num_rows) + " rows, " + + std::to_string(num_cols) + " cols"); + gko::Array host_ref{exec->get_master(), + static_cast(num_cols)}; + gko::Array output{exec, static_cast(num_cols)}; + for (int64 i = 0; i < num_cols; i++) { + // we are computing 2 * sum {j=0, j<num_rows} of (i+1)*(j+1), so + // host_ref[i] = num_rows * (num_rows + 1) * (i + 1) + host_ref.get_data()[i] = + static_cast<int64>(num_rows) * (num_rows + 1) * (i + 1); + } - GKO_ASSERT_ARRAY_EQ(host_ref, output); + gko::kernels::hip::run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + static_assert(is_same::value, + "dummy"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 2; + }, + int64{}, output.get_data(), + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}, + output, move_only_val); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); + } + } } TEST_F(KernelLaunch, ReductionCol2D) { run2d_col_reduction(exec); } - - -} // namespace diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index 2b3d17fee13..210c67941a8 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -693,11 +693,7 @@ class Dense * element of alpha (the number of columns of alpha has to * match the number of columns of the matrix). */ - void scale(const LinOp* alpha) - { - auto exec = this->get_executor(); - this->scale_impl(make_temporary_clone(exec, alpha).get()); - } + void scale(const LinOp* alpha); /** * Scales the matrix with the inverse of a scalar. @@ -708,11 +704,7 @@ class Dense * of the i-th element of alpha (the number of columns of * alpha has to match the number of columns of the matrix). */ - void inv_scale(const LinOp* alpha) - { - auto exec = this->get_executor(); - this->inv_scale_impl(make_temporary_clone(exec, alpha).get()); - } + void inv_scale(const LinOp* alpha); /** * Adds `b` scaled by `alpha` to the matrix (aka: BLAS axpy). @@ -724,12 +716,7 @@ class Dense * match the number of columns of the matrix). * @param b a matrix of the same dimension as this */ - void add_scaled(const LinOp* alpha, const LinOp* b) - { - auto exec = this->get_executor(); - this->add_scaled_impl(make_temporary_clone(exec, alpha).get(), - make_temporary_clone(exec, b).get()); - } + void add_scaled(const LinOp* alpha, const LinOp* b); /** * Subtracts `b` scaled by `alpha` from the matrix (aka: BLAS axpy). @@ -741,12 +728,7 @@ class Dense * match the number of columns of the matrix). * @param b a matrix of the same dimension as this */ - void sub_scaled(const LinOp* alpha, const LinOp* b) - { - auto exec = this->get_executor(); - this->sub_scaled_impl(make_temporary_clone(exec, alpha).get(), - make_temporary_clone(exec, b).get()); - } + void sub_scaled(const LinOp* alpha, const LinOp* b); /** * Computes the column-wise dot product of this matrix and `b`.
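// The hunks below add a second overload of each Dense reduction entry
// point (compute_dot, compute_conj_dot, compute_norm2, compute_norm1)
// that takes a scratch buffer, so callers in tight loops reuse a single
// allocation instead of allocating per call. A hedged usage sketch,
// assuming the buffer is the Array<char> byte array used by the cached
// kernels (sizes and value type are illustrative only):
//
//     auto x = gko::matrix::Dense<double>::create(exec, gko::dim<2>{n, 1});
//     auto y = gko::matrix::Dense<double>::create(exec, gko::dim<2>{n, 1});
//     auto dot = gko::matrix::Dense<double>::create(exec, gko::dim<2>{1, 1});
//     gko::Array<char> tmp{exec};  // grown on first use, then reused
//     for (int it = 0; it < num_iterations; it++) {
//         x->compute_dot(y.get(), dot.get(), tmp);
//     }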
@@ -756,12 +738,17 @@ class Dense * (the number of columns in the vector must match the number * of columns of this) */ - void compute_dot(const LinOp* b, LinOp* result) const - { - auto exec = this->get_executor(); - this->compute_dot_impl(make_temporary_clone(exec, b).get(), - make_temporary_output_clone(exec, result).get()); - } + void compute_dot(const LinOp* b, LinOp* result) const; + + /** + * Computes the column-wise dot product of this matrix and `b`. + * + * @param b a Dense matrix of same dimension as this + * @param result a Dense row vector, used to store the dot product + * (the number of columns in the vector must match the number + * of columns of this) + */ + void compute_dot(const LinOp* b, LinOp* result, Array& tmp) const; /** * Computes the column-wise dot product of `conj(this matrix)` and `b`. @@ -771,13 +758,18 @@ class Dense * (the number of columns in the vector must match the number * of columns of this) */ - void compute_conj_dot(const LinOp* b, LinOp* result) const - { - auto exec = this->get_executor(); - this->compute_conj_dot_impl( - make_temporary_clone(exec, b).get(), - make_temporary_output_clone(exec, result).get()); - } + void compute_conj_dot(const LinOp* b, LinOp* result) const; + + /** + * Computes the column-wise dot product of `conj(this matrix)` and `b`. + * + * @param b a Dense matrix of same dimension as this + * @param result a Dense row vector, used to store the dot product + * (the number of columns in the vector must match the number + * of columns of this) + */ + void compute_conj_dot(const LinOp* b, LinOp* result, + Array& tmp) const; /** * Computes the column-wise Euclidean (L^2) norm of this matrix. @@ -786,12 +778,16 @@ class Dense * (the number of columns in the vector must match the number * of columns of this) */ - void compute_norm2(LinOp* result) const - { - auto exec = this->get_executor(); - this->compute_norm2_impl( - make_temporary_output_clone(exec, result).get()); - } + void compute_norm2(LinOp* result) const; + + /** + * Computes the column-wise Euclidean (L^2) norm of this matrix. + * + * @param result a Dense row vector, used to store the norm + * (the number of columns in the vector must match the + * number of columns of this) + */ + void compute_norm2(LinOp* result, Array& tmp) const; /** * Computes the column-wise (L^1) norm of this matrix. @@ -800,12 +796,16 @@ class Dense * (the number of columns in the vector must match the number * of columns of this) */ - void compute_norm1(LinOp* result) const - { - auto exec = this->get_executor(); - this->compute_norm1_impl( - make_temporary_output_clone(exec, result).get()); - } + void compute_norm1(LinOp* result) const; + + /** + * Computes the column-wise (L^1) norm of this matrix. + * + * @param result a Dense row vector, used to store the norm + * (the number of columns in the vector must match the + * number of columns of this) + */ + void compute_norm1(LinOp* result, Array& tmp) const; /** * Create a submatrix from the original matrix. @@ -1012,64 +1012,64 @@ class Dense /** * @copydoc scale(const LinOp *) * - * @note Other implementations of dense should override this function - * instead of scale(const LinOp *alpha). + * @deprecated This function will be removed in the future, + * we will instead always use Ginkgo's implementation. */ virtual void scale_impl(const LinOp* alpha); /** * @copydoc inv_scale(const LinOp *) * - * @note Other implementations of dense should override this function - * instead of inv_scale(const LinOp *alpha).
+ * @deprecated This function will be removed in the future, + * we will instead always use Ginkgo's implementation. */ virtual void inv_scale_impl(const LinOp* alpha); /** * @copydoc add_scaled(const LinOp *, const LinOp *) * - * @note Other implementations of dense should override this function - * instead of add_scale(const LinOp *alpha, const LinOp *b). + * @deprecated This function will be removed in the future, + * we will instead always use Ginkgo's implementation. */ virtual void add_scaled_impl(const LinOp* alpha, const LinOp* b); /** * @copydoc sub_scaled(const LinOp *, const LinOp *) * - * @note Other implementations of dense should override this function - * instead of sub_scale(const LinOp *alpha, const LinOp *b). + * @deprecated This function will be removed in the future, + * we will instead always use Ginkgo's implementation. */ virtual void sub_scaled_impl(const LinOp* alpha, const LinOp* b); /** - * @copydoc compute_dot(const LinOp *, LinOp *) const + * @copydoc compute_dot(const LinOp*, LinOp*) const * - * @note Other implementations of dense should override this function - * instead of compute_dot(const LinOp *b, LinOp *result). + * @deprecated This function will be removed in the future, + * we will instead always use Ginkgo's implementation. */ virtual void compute_dot_impl(const LinOp* b, LinOp* result) const; /** - * @copydoc compute_conj_dot(const LinOp *, LinOp *) const + * @copydoc compute_conj_dot(const LinOp*, LinOp*) const * - * @note Other implementations of dense should override this function - * instead of compute_conj_dot(const LinOp *b, LinOp *result). + * @deprecated This function will be removed in the future, + * we will instead always use Ginkgo's implementation. */ virtual void compute_conj_dot_impl(const LinOp* b, LinOp* result) const; /** - * @copydoc compute_norm2(LinOp *) const + * @copydoc compute_norm2(LinOp*) const * - * @note Other implementations of dense should override this function - * instead of compute_norm2(LinOp *result). + * @deprecated This function will be removed in the future, + * we will instead always use Ginkgo's implementation. */ virtual void compute_norm2_impl(LinOp* result) const; /** - * @copydoc compute_norm1(LinOp *) const + * @copydoc compute_norm1(LinOp*) const * - * @note Other implementations of dense should override this function - * instead of compute_norm1(LinOp *result). + * @deprecated This function will be removed in the future, + * we will instead always use Ginkgo's implementation. */ virtual void compute_norm1_impl(LinOp* result) const; diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index d4a489a258f..55046cae832 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -60,12 +60,17 @@ void run_kernel_reduction_impl(std::shared_ptr exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, size_type size, - MappedKernelArgs... args) + Array& tmp, MappedKernelArgs... 
args) { const auto num_threads = static_cast(omp_get_max_threads()); const auto ssize = static_cast(size); const auto work_per_thread = ceildiv(ssize, num_threads); - Array partial{exec, static_cast(num_threads)}; + const auto required_storage = sizeof(ValueType) * num_threads; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } + const auto partial = reinterpret_cast(tmp.get_data()); #pragma omp parallel num_threads(num_threads) { const auto thread_id = omp_get_thread_num(); @@ -76,11 +81,10 @@ void run_kernel_reduction_impl(std::shared_ptr exec, for (auto i = begin; i < end; i++) { local_partial = op(local_partial, fn(i, map_to_device(args)...)); } - partial.get_data()[thread_id] = local_partial; + partial[thread_id] = local_partial; } - *result = finalize(std::accumulate(partial.get_const_data(), - partial.get_const_data() + num_threads, - identity, op)); + *result = + finalize(std::accumulate(partial, partial + num_threads, identity, op)); } @@ -92,13 +96,18 @@ void run_kernel_reduction_sized_impl(syn::value_list, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, - MappedKernelArgs... args) + Array& tmp, MappedKernelArgs... args) { const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); const auto num_threads = static_cast(omp_get_max_threads()); const auto work_per_thread = ceildiv(rows, num_threads); - Array partial{exec, static_cast(num_threads)}; + const auto required_storage = sizeof(ValueType) * num_threads; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } + const auto partial = reinterpret_cast(tmp.get_data()); static_assert(remainder_cols < block_size, "remainder too large"); const auto rounded_cols = cols / block_size * block_size; GKO_ASSERT(rounded_cols + remainder_cols == cols); @@ -138,11 +147,10 @@ void run_kernel_reduction_sized_impl(syn::value_list, } } } - partial.get_data()[thread_id] = local_partial; + partial[thread_id] = local_partial; } - *result = finalize(std::accumulate(partial.get_const_data(), - partial.get_const_data() + num_threads, - identity, op)); + *result = + finalize(std::accumulate(partial, partial + num_threads, identity, op)); } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_sized, @@ -154,23 +162,24 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_sized, template -void run_kernel_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, size_type size, - KernelArgs&&... args) +void run_kernel_reduction_cached(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type size, + Array& tmp, KernelArgs&&... args) { run_kernel_reduction_impl(exec, fn, op, finalize, identity, result, size, - map_to_device(args)...); + tmp, map_to_device(args)...); } template -void run_kernel_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, dim<2> size, KernelArgs&&... args) +void run_kernel_reduction_cached(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + Array& tmp, KernelArgs&&... 
args) { const auto cols = static_cast(size[1]); constexpr int block_size = 8; @@ -184,7 +193,7 @@ void run_kernel_reduction(std::shared_ptr exec, remainders(), [&](int remainder) { return remainder == cols % block_size; }, syn::value_list(), syn::type_list<>(), exec, fn, op, - finalize, identity, result, size, map_to_device(args)...); + finalize, identity, result, size, tmp, map_to_device(args)...); } @@ -197,7 +206,8 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, size_type result_stride, - dim<2> size, MappedKernelArgs... args) + dim<2> size, Array& tmp, + MappedKernelArgs... args) { constexpr int block_size = 8; const auto rows = static_cast(size[0]); @@ -222,8 +232,12 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, } else { // small number of rows and large reduction sizes: do partial sum first const auto work_per_thread = ceildiv(cols, num_threads); - Array partial{exec, - static_cast(rows * num_threads)}; + const auto required_storage = sizeof(ValueType) * rows * num_threads; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } + const auto partial = reinterpret_cast(tmp.get_data()); #pragma omp parallel num_threads(num_threads) { const auto thread_id = static_cast(omp_get_thread_num()); @@ -236,8 +250,7 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, return fn(row, col, args...); }()); } - partial.get_data()[row * num_threads + thread_id] = - local_partial; + partial[row * num_threads + thread_id] = local_partial; } } // then accumulate the partial sums and write to result @@ -247,10 +260,8 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, auto local_partial = identity; for (int64 thread_id = 0; thread_id < num_threads; thread_id++) { - local_partial = op( - local_partial, - partial - .get_const_data()[row * num_threads + thread_id]); + local_partial = op(local_partial, + partial[row * num_threads + thread_id]); } result[row * result_stride] = finalize(local_partial); }(); @@ -290,7 +301,7 @@ void run_kernel_col_reduction_sized_impl( syn::value_list, std::shared_ptr exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, - MappedKernelArgs... args) + Array& tmp, MappedKernelArgs... 
args) { const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); @@ -319,8 +330,12 @@ void run_kernel_col_reduction_sized_impl( const auto reduction_size = ceildiv(reduction_kernel_oversubscription * num_threads, cols); const auto rows_per_thread = ceildiv(rows, reduction_size); - Array partial{exec, - static_cast(reduction_size * cols)}; + const auto required_storage = sizeof(ValueType) * rows * reduction_size; + tmp.set_executor(exec); + if (tmp.get_num_elems() < required_storage) { + tmp.resize_and_reset(required_storage); + } + const auto partial = reinterpret_cast(tmp.get_data()); #pragma omp parallel for for (int64 i = 0; i < reduction_size * num_col_blocks; i++) { const auto col_block = i % num_col_blocks; @@ -331,14 +346,12 @@ void run_kernel_col_reduction_sized_impl( const auto identity_fn = [](auto i) { return i; }; if (base_col + block_size <= cols) { run_kernel_col_reduction_sized_block_impl( - fn, op, identity_fn, identity, - partial.get_data() + cols * row_block, begin, end, base_col, - args...); + fn, op, identity_fn, identity, partial + cols * row_block, + begin, end, base_col, args...); } else { run_kernel_col_reduction_sized_block_impl( - fn, op, identity_fn, identity, - partial.get_data() + cols * row_block, begin, end, base_col, - args...); + fn, op, identity_fn, identity, partial + cols * row_block, + begin, end, base_col, args...); } } #pragma omp parallel for @@ -347,9 +360,7 @@ void run_kernel_col_reduction_sized_impl( auto total = identity; for (int64 row_block = 0; row_block < reduction_size; row_block++) { - total = - op(total, - partial.get_const_data()[col + cols * row_block]); + total = op(total, partial[col + cols * row_block]); } result[col] = finalize(total); }(); @@ -365,25 +376,27 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_col_reduction_sized, template -void run_kernel_row_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, size_type result_stride, - dim<2> size, MappedKernelArgs... args) + typename FinalizeOp, typename... KernelArgs> +void run_kernel_row_reduction_cached(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, size_type result_stride, + dim<2> size, Array& tmp, + KernelArgs&&... args) { run_kernel_row_reduction_impl(exec, fn, op, finalize, identity, result, - result_stride, size, map_to_device(args)...); + result_stride, size, tmp, + map_to_device(args)...); } template -void run_kernel_col_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, - ValueType* result, dim<2> size, - KernelArgs&&... args) +void run_kernel_col_reduction_cached(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, + ValueType* result, dim<2> size, + Array& tmp, KernelArgs&&... 
args) { constexpr auto block_size = 8; using remainders = syn::as_list>; @@ -396,7 +409,7 @@ void run_kernel_col_reduction(std::shared_ptr exec, remainders(), [&](int remainder) { return remainder == cols % block_size; }, syn::value_list(), syn::type_list<>(), exec, fn, op, - finalize, identity, result, size, map_to_device(args)...); + finalize, identity, result, size, tmp, map_to_device(args)...); } diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp index 6a7e3fa7b01..dd3a290b342 100644 --- a/omp/matrix/dense_kernels.cpp +++ b/omp/matrix/dense_kernels.cpp @@ -68,45 +68,6 @@ namespace omp { namespace dense { -template -void compute_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - // OpenMP uses the unified kernel. - compute_dot(exec, x, y, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); - - -template -void compute_conj_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - compute_conj_dot(exec, x, y, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); - - -template -void compute_norm2_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - compute_norm2(exec, x, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); - - template void simple_apply(std::shared_ptr exec, const matrix::Dense* a, diff --git a/omp/test/base/kernel_launch.cpp b/omp/test/base/kernel_launch.cpp index fbc14a1ec79..5666e5cfd3f 100644 --- a/omp/test/base/kernel_launch.cpp +++ b/omp/test/base/kernel_launch.cpp @@ -51,15 +51,42 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
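// The *_cached reduction kernels in omp/base/kernel_launch_reduction.hpp
// above all share one temporary-storage idiom: a caller-owned byte buffer
// is grown on demand and reinterpreted as the reduction's value type, so
// repeated reductions allocate at most once. A minimal sketch of that
// pattern, assuming only the Array<char> calls that appear in the hunks
// above (prepare_partial_storage is a hypothetical helper named here for
// illustration; the kernels inline the same steps):
//
//     template <typename ValueType>
//     ValueType* prepare_partial_storage(
//         std::shared_ptr<const gko::Executor> exec, gko::Array<char>& tmp,
//         gko::size_type num_entries)
//     {
//         const auto required_storage = sizeof(ValueType) * num_entries;
//         tmp.set_executor(exec);
//         if (tmp.get_num_elems() < required_storage) {
//             // resize_and_reset discards previous contents, which is
//             // fine for scratch space that is written before being read
//             tmp.resize_and_reset(required_storage);
//         }
//         return reinterpret_cast<ValueType*>(tmp.get_data());
//     }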
#include "core/test/utils.hpp" -namespace { - - using gko::dim; using gko::int64; using gko::size_type; using std::is_same; +struct move_only_type { + move_only_type() {} + + move_only_type(move_only_type&&) {} + + move_only_type(const move_only_type&) = delete; +}; + + +move_only_type move_only_val{}; + + +namespace gko { +namespace kernels { +namespace omp { + + +template <> +struct to_device_type_impl { + using type = int64; + + static type map_to_device(move_only_type&) { return 0; } +}; + + +} // namespace omp +} // namespace kernels +} // namespace gko + + class KernelLaunch : public ::testing::Test { protected: KernelLaunch() @@ -97,12 +124,13 @@ TEST_F(KernelLaunch, Runs1D) { gko::kernels::omp::run_kernel( exec, - [] GKO_KERNEL(auto i, auto d) { + [] GKO_KERNEL(auto i, auto d, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); d[i] = i; }, - zero_array.get_num_elems(), zero_array.get_data()); + zero_array.get_num_elems(), zero_array.get_data(), move_only_val); GKO_ASSERT_ARRAY_EQ(zero_array, iota_array); } @@ -112,17 +140,19 @@ TEST_F(KernelLaunch, Runs1DArray) { gko::kernels::omp::run_kernel( exec, - [] GKO_KERNEL(auto i, auto d, auto d_ptr) { + [] GKO_KERNEL(auto i, auto d, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); if (d == d_ptr) { d[i] = i; } else { d[i] = 0; } }, - zero_array.get_num_elems(), zero_array, zero_array.get_const_data()); + zero_array.get_num_elems(), zero_array, zero_array.get_const_data(), + move_only_val); GKO_ASSERT_ARRAY_EQ(zero_array, iota_array); } @@ -132,13 +162,14 @@ TEST_F(KernelLaunch, Runs1DDense) { gko::kernels::omp::run_kernel( exec, - [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr) { + [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); bool pointers_correct = d.data == d_ptr && d2.data == d_ptr; bool strides_correct = d.stride == 5 && d2.stride == 5; bool accessors_2d_correct = @@ -154,7 +185,7 @@ TEST_F(KernelLaunch, Runs1DDense) }, 16, zero_dense2.get(), static_cast*>(zero_dense2.get()), - zero_dense2->get_const_values()); + zero_dense2->get_const_values(), move_only_val); GKO_ASSERT_MTX_NEAR(zero_dense2, iota_dense, 0.0); } @@ -164,13 +195,14 @@ TEST_F(KernelLaunch, Runs2D) { gko::kernels::omp::run_kernel( exec, - [] GKO_KERNEL(auto i, auto j, auto d) { + [] GKO_KERNEL(auto i, auto j, auto d, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); d[i + 4 * j] = 4 * i + j; }, - dim<2>{4, 4}, zero_array.get_data()); + dim<2>{4, 4}, zero_array.get_data(), move_only_val); GKO_ASSERT_ARRAY_EQ(zero_array, iota_transp_array); } @@ -180,18 +212,19 @@ TEST_F(KernelLaunch, Runs2DArray) { gko::kernels::omp::run_kernel( exec, - [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr) { + [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); if (d == d_ptr) { d[i + 4 * j] = 4 * i + j; } else { d[i + 4 * j] 
= 0; } }, - dim<2>{4, 4}, zero_array, zero_array.get_const_data()); + dim<2>{4, 4}, zero_array, zero_array.get_const_data(), move_only_val); GKO_ASSERT_ARRAY_EQ(zero_array, iota_transp_array); } @@ -202,7 +235,7 @@ TEST_F(KernelLaunch, Runs2DDense) gko::kernels::omp::run_kernel_solver( exec, [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3, - auto d4, auto d2_ptr, auto d3_ptr) { + auto d4, auto d2_ptr, auto d3_ptr, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "type"); static_assert(is_same::value, @@ -213,6 +246,7 @@ TEST_F(KernelLaunch, Runs2DDense) static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); static_assert(is_same::value, "type"); + static_assert(is_same::value, "dummy"); bool pointers_correct = d.data == d_ptr && d2.data == d_ptr && d3.data == d2_ptr && d4 == d3_ptr; bool strides_correct = @@ -235,7 +269,7 @@ TEST_F(KernelLaunch, Runs2DDense) zero_dense2->get_const_values(), gko::kernels::omp::default_stride(zero_dense.get()), gko::kernels::omp::row_vector(vec_dense.get()), - zero_dense->get_values(), vec_dense->get_values()); + zero_dense->get_values(), vec_dense->get_values(), move_only_val); GKO_ASSERT_MTX_NEAR(zero_dense2, iota_dense, 0.0); } @@ -246,28 +280,30 @@ TEST_F(KernelLaunch, Reduction1D) gko::kernels::omp::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto a) { + [] GKO_KERNEL(auto i, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return i + 1; }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), - size_type{100000}, output); + size_type{100000}, output, move_only_val); // 2 * sum i=0...99999 (i+1) ASSERT_EQ(*output.get_const_data(), 10000100000LL); gko::kernels::omp::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto a) { + [] GKO_KERNEL(auto i, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return i + 1; }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), - size_type{10}, output); + size_type{10}, output, move_only_val); // 2 * sum i=0...9 (i+1) ASSERT_EQ(*output.get_const_data(), 110LL); @@ -281,14 +317,16 @@ TEST_F(KernelLaunch, Reduction2DSmallRows) for (int cols = 0; cols < 17; cols++) { gko::kernels::omp::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{10, cols}, output); + gko::dim<2>{10, cols}, output, move_only_val); // 4 * sum i=0...9 sum j=0...cols-1 of (i+1)*(j+1) ASSERT_EQ(*output.get_const_data(), 110LL * cols * (cols + 1)); @@ -303,14 +341,16 @@ TEST_F(KernelLaunch, Reduction2DLargeRows) for (int cols = 0; cols < 17; cols++) { gko::kernels::omp::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return (i + 
1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{1000, cols}, output); + gko::dim<2>{1000, cols}, output, move_only_val); // 4 * sum i=0...999 sum j=0...cols-1 of (i+1)*(j+1) ASSERT_EQ(*output.get_const_data(), 1001000LL * cols * (cols + 1)); @@ -324,14 +364,15 @@ TEST_F(KernelLaunch, Reduction2D) gko::kernels::omp::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{1000, 100}, output); + gko::dim<2>{1000, 100}, output, move_only_val); // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) @@ -358,16 +399,18 @@ TEST_F(KernelLaunch, ReductionRow2DSmall) gko::kernels::omp::run_kernel_row_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2, gko::dim<2>{static_cast(num_rows), static_cast(num_cols)}, - output); + output, move_only_val); GKO_ASSERT_ARRAY_EQ(host_ref, output); } @@ -390,16 +433,17 @@ TEST_F(KernelLaunch, ReductionRow2D) gko::kernels::omp::run_kernel_row_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, "dummy"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2, gko::dim<2>{static_cast(num_rows), static_cast(num_cols)}, - output); + output, move_only_val); GKO_ASSERT_ARRAY_EQ(host_ref, output); } @@ -422,9 +466,12 @@ TEST_F(KernelLaunch, ReductionCol2D) gko::kernels::omp::run_kernel_col_reduction( exec, - [] GKO_KERNEL(auto i, auto j, auto a) { + [] GKO_KERNEL(auto i, auto j, auto a, auto dummy) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); + static_assert(is_same::value, + "dummy"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, @@ -432,12 +479,9 @@ TEST_F(KernelLaunch, ReductionCol2D) output.get_data(), gko::dim<2>{static_cast(num_rows), static_cast(num_cols)}, - output); + output, move_only_val); GKO_ASSERT_ARRAY_EQ(host_ref, output); } } } - - -} // namespace diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp index 1bf28847374..648bc8a6e7e 100644 --- a/reference/matrix/dense_kernels.cpp +++ b/reference/matrix/dense_kernels.cpp @@ -282,7 +282,7 @@ template void compute_dot(std::shared_ptr exec, const matrix::Dense* x, const matrix::Dense* y, - matrix::Dense* result) + matrix::Dense* result, Array&) { for (size_type j = 0; j < x->get_size()[1]; ++j) { result->at(0, j) = zero(); @@ -297,24 +297,11 @@ void compute_dot(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); -template -void 
compute_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - compute_dot(exec, x, y, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); - - template void compute_conj_dot(std::shared_ptr exec, const matrix::Dense* x, const matrix::Dense* y, - matrix::Dense* result) + matrix::Dense* result, Array&) { for (size_type j = 0; j < x->get_size()[1]; ++j) { result->at(0, j) = zero(); @@ -329,23 +316,11 @@ void compute_conj_dot(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); -template -void compute_conj_dot_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - compute_conj_dot(exec, x, y, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); - - template void compute_norm2(std::shared_ptr exec, const matrix::Dense* x, - matrix::Dense>* result) + matrix::Dense>* result, + Array&) { for (size_type j = 0; j < x->get_size()[1]; ++j) { result->at(0, j) = zero>(); @@ -363,22 +338,11 @@ void compute_norm2(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); -template -void compute_norm2_dispatch(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - compute_norm2(exec, x, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); - - template void compute_norm1(std::shared_ptr exec, const matrix::Dense* x, - matrix::Dense>* result) + matrix::Dense>* result, + Array&) { for (size_type j = 0; j < x->get_size()[1]; ++j) { result->at(0, j) = zero>(); diff --git a/test/matrix/dense_kernels.cpp b/test/matrix/dense_kernels.cpp index 5a118feb36a..b843f3ec6a0 100644 --- a/test/matrix/dense_kernels.cpp +++ b/test/matrix/dense_kernels.cpp @@ -998,18 +998,135 @@ TEST_F(Dense, ExtractDiagonalOnShortFatIntoDenseCrossExecutor) } +TEST_F(Dense, ComputeDotIsEquivalentToRef) +{ + set_up_vector_data(2); + + auto dot_size = gko::dim<2>{1, x->get_size()[1]}; + auto dot_expected = Mtx::create(ref, dot_size); + auto ddot = Mtx::create(ref, dot_size); + + // all parameters are on ref to check cross-executor calls + x->compute_dot(y.get(), dot_expected.get()); + dx->compute_dot(y.get(), ddot.get()); + + GKO_ASSERT_MTX_NEAR(ddot, dot_expected, r::value); +} + + +TEST_F(Dense, ComputeDotWithTmpIsEquivalentToRef) +{ + set_up_vector_data(40); + + auto dot_size = gko::dim<2>{1, x->get_size()[1]}; + auto dot_expected = Mtx::create(ref, dot_size); + auto ddot = Mtx::create(ref, dot_size); + gko::Array tmp{ref}; + + // all parameters are on ref to check cross-executor calls + x->compute_dot(y.get(), dot_expected.get(), tmp); + dx->compute_dot(y.get(), ddot.get(), tmp); + + GKO_ASSERT_MTX_NEAR(ddot, dot_expected, r::value); +} + + +TEST_F(Dense, ComputeConjDotIsEquivalentToRef) +{ + set_up_vector_data(13); + + auto dot_size = gko::dim<2>{1, x->get_size()[1]}; + auto dot_expected = Mtx::create(ref, dot_size); + auto ddot = Mtx::create(ref, dot_size); + + // all parameters are on ref to check cross-executor calls + x->compute_conj_dot(y.get(), dot_expected.get()); + dx->compute_conj_dot(y.get(), ddot.get()); + + GKO_ASSERT_MTX_NEAR(ddot, dot_expected, r::value); +} + + +TEST_F(Dense, ComputeConjDotWithTmpIsEquivalentToRef) +{ + set_up_vector_data(65); + + auto dot_size = gko::dim<2>{1, x->get_size()[1]}; + auto dot_expected = Mtx::create(ref, 
dot_size); + auto ddot = Mtx::create(ref, dot_size); + gko::Array tmp{ref}; + + // all parameters are on ref to check cross-executor calls + x->compute_conj_dot(y.get(), dot_expected.get(), tmp); + dx->compute_conj_dot(y.get(), ddot.get(), tmp); + + GKO_ASSERT_MTX_NEAR(ddot, dot_expected, r::value); +} + + TEST_F(Dense, ComputeNorm1IsEquivalentToRef) { - set_up_apply_data(); + set_up_vector_data(2); auto norm_size = gko::dim<2>{1, x->get_size()[1]}; auto norm_expected = NormVector::create(ref, norm_size); auto dnorm = NormVector::create(ref, norm_size); + // all parameters are on ref to check cross-executor calls x->compute_norm1(norm_expected.get()); dx->compute_norm1(dnorm.get()); - GKO_ASSERT_MTX_NEAR(x, dx, r::value); + GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, r::value); +} + + +TEST_F(Dense, ComputeNorm1WithTmpIsEquivalentToRef) +{ + set_up_vector_data(10); + + auto norm_size = gko::dim<2>{1, x->get_size()[1]}; + auto norm_expected = NormVector::create(ref, norm_size); + auto dnorm = NormVector::create(ref, norm_size); + gko::Array tmp{ref}; + + // all parameters are on ref to check cross-executor calls + x->compute_norm1(norm_expected.get(), tmp); + dx->compute_norm1(dnorm.get(), tmp); + + GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, r::value); +} + + +TEST_F(Dense, ComputeNorm2IsEquivalentToRef) +{ + set_up_vector_data(5); + + auto norm_size = gko::dim<2>{1, x->get_size()[1]}; + auto norm_expected = NormVector::create(ref, norm_size); + auto dnorm = NormVector::create(ref, norm_size); + + // all parameters are on ref to check cross-executor calls + x->compute_norm2(norm_expected.get()); + dx->compute_norm2(dnorm.get()); + + GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, r::value); +} + + +TEST_F(Dense, ComputeNorm2WithTmpIsEquivalentToRef) +{ + set_up_vector_data(3); + + auto norm_size = gko::dim<2>{1, x->get_size()[1]}; + auto norm_expected = NormVector::create(ref, norm_size); + auto dnorm = NormVector::create(ref, norm_size); + gko::Array tmp{ref}; + + // all parameters are on ref to check cross-executor calls + x->compute_norm2(norm_expected.get(), tmp); + dx->compute_norm2(dnorm.get(), tmp); + + GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, r::value); } From ee00f4968a39a9383de4b0328c115a1b97c813d8 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 29 Mar 2022 19:31:07 +0200 Subject: [PATCH 3/5] restore reduction dispatch kernels for vendor BLAS --- core/device_hooks/common_kernels.inc.cpp | 3 + core/matrix/dense_kernels.hpp | 24 ++++++++ cuda/matrix/dense_kernels.cu | 72 ++++++++++++++++++++++++ dpcpp/matrix/dense_kernels.dp.cpp | 43 ++++++++++++++ hip/matrix/dense_kernels.hip.cpp | 72 ++++++++++++++++++++++++ omp/matrix/dense_kernels.cpp | 41 ++++++++++++++ reference/matrix/dense_kernels.cpp | 40 +++++++++++++ 7 files changed, 295 insertions(+) diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 13dae15583f..a89d1f3c140 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -256,8 +256,11 @@ GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 13dae15583f..a89d1f3c140 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -256,8 +256,11 @@ GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_IDENTITY_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp
index 1ef142eb4d7..3cf15052826 100644
--- a/core/matrix/dense_kernels.hpp
+++ b/core/matrix/dense_kernels.hpp
@@ -104,12 +104,24 @@ namespace kernels {
                            const matrix::Diagonal<_type>* x,               \
                            matrix::Dense<_type>* y)
 
+#define GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL(_type)               \
+    void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec, \
+                              const matrix::Dense<_type>* x,               \
+                              const matrix::Dense<_type>* y,               \
+                              matrix::Dense<_type>* result, Array<char>& tmp)
+
 #define GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL(_type)                        \
     void compute_dot(std::shared_ptr<const DefaultExecutor> exec,          \
                      const matrix::Dense<_type>* x,                        \
                      const matrix::Dense<_type>* y,                        \
                      matrix::Dense<_type>* result, Array<char>& tmp)
 
+#define GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL(_type)          \
+    void compute_conj_dot_dispatch(                                        \
+        std::shared_ptr<const DefaultExecutor> exec,                       \
+        const matrix::Dense<_type>* x, const matrix::Dense<_type>* y,      \
+        matrix::Dense<_type>* result, Array<char>& tmp)
+
 #define GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL(_type)                   \
     void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,     \
                           const matrix::Dense<_type>* x,                   \
@@ -122,6 +134,12 @@ namespace kernels {
                        matrix::Dense<remove_complex<_type>>* result,       \
                        Array<char>& tmp)
 
+#define GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL(_type)             \
+    void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec, \
+                                const matrix::Dense<_type>* x,             \
+                                matrix::Dense<remove_complex<_type>>* result, \
+                                Array<char>& tmp)
+
 #define GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(_type)                      \
     void compute_norm1(std::shared_ptr<const DefaultExecutor> exec,        \
                        const matrix::Dense<_type>* x,                      \
@@ -310,10 +328,16 @@ namespace kernels {
     template <typename ValueType>                                          \
     GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL(ValueType);                       \
     template <typename ValueType>                                          \
+    GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL(ValueType);              \
+    template <typename ValueType>                                          \
     GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL(ValueType);                  \
     template <typename ValueType>                                          \
+    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL(ValueType);         \
+    template <typename ValueType>                                          \
     GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL(ValueType);                     \
     template <typename ValueType>                                          \
+    GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL(ValueType);            \
+    template <typename ValueType>                                          \
     GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(ValueType);                     \
     template <typename ValueType, typename IndexType>                      \
     GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType);    \
diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu
index 063b0f069d3..7eeebd82357 100644
--- a/cuda/matrix/dense_kernels.cu
+++ b/cuda/matrix/dense_kernels.cu
@@ -73,6 +73,78 @@ constexpr int default_block_size = 512;
 #include "common/cuda_hip/matrix/dense_kernels.hpp.inc"
 
 
+template <typename ValueType>
+void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                          const matrix::Dense<ValueType>* x,
+                          const matrix::Dense<ValueType>* y,
+                          matrix::Dense<ValueType>* result, Array<char>& tmp)
+{
+    if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
+        if (cublas::is_supported<ValueType>::value) {
+            auto handle = exec->get_cublas_handle();
+            cublas::dot(handle, x->get_size()[0], x->get_const_values(),
+                        x->get_stride(), y->get_const_values(),
+                        y->get_stride(), result->get_values());
+        } else {
+            compute_dot(exec, x, y, result, tmp);
+        }
+    } else {
+        compute_dot(exec, x, y, result, tmp);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
+
+
+template <typename ValueType>
+void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                               const matrix::Dense<ValueType>* x,
+                               const matrix::Dense<ValueType>* y,
+                               matrix::Dense<ValueType>* result,
+                               Array<char>& tmp)
+{
+    if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
+        if (cublas::is_supported<ValueType>::value) {
+            auto handle = exec->get_cublas_handle();
+            cublas::conj_dot(handle, x->get_size()[0], x->get_const_values(),
+                             x->get_stride(), y->get_const_values(),
+                             y->get_stride(), result->get_values());
+        } else {
+            compute_conj_dot(exec, x, y, result, tmp);
+        }
+    } else {
+        compute_conj_dot(exec, x, y, result, tmp);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
+
+
+template <typename ValueType>
+void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                            const matrix::Dense<ValueType>* x,
+                            matrix::Dense<remove_complex<ValueType>>* result,
+                            Array<char>& tmp)
+{
+    if (x->get_size()[1] == 1) {
+        if (cublas::is_supported<ValueType>::value) {
+            auto handle = exec->get_cublas_handle();
+            cublas::norm2(handle, x->get_size()[0], x->get_const_values(),
+                          x->get_stride(), result->get_values());
+        } else {
+            compute_norm2(exec, x, result, tmp);
+        }
+    } else {
+        compute_norm2(exec, x, result, tmp);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
+
+
 template <typename ValueType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const matrix::Dense<ValueType>* a,
diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp
index 39c2b0c54e4..0503b490e47 100644
--- a/dpcpp/matrix/dense_kernels.dp.cpp
+++ b/dpcpp/matrix/dense_kernels.dp.cpp
@@ -189,6 +189,49 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(conj_transpose_call, conj_transpose,
 }  // namespace kernel
 
 
+template <typename ValueType>
+void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                          const matrix::Dense<ValueType>* x,
+                          const matrix::Dense<ValueType>* y,
+                          matrix::Dense<ValueType>* result, Array<char>& tmp)
+{
+    // TODO Add onemkl for single column?
+    compute_dot(exec, x, y, result, tmp);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
+
+
+template <typename ValueType>
+void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                               const matrix::Dense<ValueType>* x,
+                               const matrix::Dense<ValueType>* y,
+                               matrix::Dense<ValueType>* result,
+                               Array<char>& tmp)
+{
+    // TODO Add onemkl for single column?
+    compute_conj_dot(exec, x, y, result, tmp);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
+
+
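The `cublas::is_supported<ValueType>::value` guard in the CUDA (and, below, HIP) dispatch kernels is a compile-time whitelist: only value types with a matching vendor BLAS routine take the fast path, and everything else falls through to the unified reduction kernel. In spirit it looks like the following (illustrative stand-in, not the actual wrapper header):

    #include <complex>
    #include <type_traits>

    // Illustrative stand-in for the vendor BLAS support trait: false by
    // default, specialized to true for every type the vendor library covers.
    template <typename T>
    struct is_supported : std::false_type {};

    template <>
    struct is_supported<float> : std::true_type {};

    template <>
    struct is_supported<double> : std::true_type {};

    template <>
    struct is_supported<std::complex<float>> : std::true_type {};

    template <>
    struct is_supported<std::complex<double>> : std::true_type {};
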
+template <typename ValueType>
+void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                            const matrix::Dense<ValueType>* x,
+                            matrix::Dense<remove_complex<ValueType>>* result,
+                            Array<char>& tmp)
+{
+    // TODO Add onemkl for single column?
+    compute_norm2(exec, x, result, tmp);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
+
+
 template <typename ValueType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const matrix::Dense<ValueType>* a,
diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp
index d193af9a4fb..0f1d2c455da 100644
--- a/hip/matrix/dense_kernels.hip.cpp
+++ b/hip/matrix/dense_kernels.hip.cpp
@@ -76,6 +76,78 @@ constexpr int default_block_size = 512;
 #include "common/cuda_hip/matrix/dense_kernels.hpp.inc"
 
 
+template <typename ValueType>
+void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                          const matrix::Dense<ValueType>* x,
+                          const matrix::Dense<ValueType>* y,
+                          matrix::Dense<ValueType>* result, Array<char>& tmp)
+{
+    if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
+        if (hipblas::is_supported<ValueType>::value) {
+            auto handle = exec->get_hipblas_handle();
+            hipblas::dot(handle, x->get_size()[0], x->get_const_values(),
+                         x->get_stride(), y->get_const_values(),
+                         y->get_stride(), result->get_values());
+        } else {
+            compute_dot(exec, x, y, result, tmp);
+        }
+    } else {
+        compute_dot(exec, x, y, result, tmp);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
+
+
+template <typename ValueType>
+void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                               const matrix::Dense<ValueType>* x,
+                               const matrix::Dense<ValueType>* y,
+                               matrix::Dense<ValueType>* result,
+                               Array<char>& tmp)
+{
+    if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
+        if (hipblas::is_supported<ValueType>::value) {
+            auto handle = exec->get_hipblas_handle();
+            hipblas::conj_dot(handle, x->get_size()[0], x->get_const_values(),
+                              x->get_stride(), y->get_const_values(),
+                              y->get_stride(), result->get_values());
+        } else {
+            compute_conj_dot(exec, x, y, result, tmp);
+        }
+    } else {
+        compute_conj_dot(exec, x, y, result, tmp);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
+
+
+template <typename ValueType>
+void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                            const matrix::Dense<ValueType>* x,
+                            matrix::Dense<remove_complex<ValueType>>* result,
+                            Array<char>& tmp)
+{
+    if (x->get_size()[1] == 1) {
+        if (hipblas::is_supported<ValueType>::value) {
+            auto handle = exec->get_hipblas_handle();
+            hipblas::norm2(handle, x->get_size()[0], x->get_const_values(),
+                           x->get_stride(), result->get_values());
+        } else {
+            compute_norm2(exec, x, result, tmp);
+        }
+    } else {
+        compute_norm2(exec, x, result, tmp);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
+
+
 template <typename ValueType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const matrix::Dense<ValueType>* a,
diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp
index dd3a290b342..e09c54b2522 100644
--- a/omp/matrix/dense_kernels.cpp
+++ b/omp/matrix/dense_kernels.cpp
@@ -68,6 +68,47 @@ namespace omp {
 namespace dense {
 
 
+template <typename ValueType>
+void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                          const matrix::Dense<ValueType>* x,
+                          const matrix::Dense<ValueType>* y,
+                          matrix::Dense<ValueType>* result, Array<char>& tmp)
+{
+    // OpenMP uses the unified kernel.
+ compute_dot(exec, x, y, result, tmp); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); + + +template +void compute_conj_dot_dispatch(std::shared_ptr exec, + const matrix::Dense* x, + const matrix::Dense* y, + matrix::Dense* result, + Array& tmp) +{ + compute_conj_dot(exec, x, y, result, tmp); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); + + +template +void compute_norm2_dispatch(std::shared_ptr exec, + const matrix::Dense* x, + matrix::Dense>* result, + Array& tmp) +{ + compute_norm2(exec, x, result, tmp); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); + + template void simple_apply(std::shared_ptr exec, const matrix::Dense* a, diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp index 648bc8a6e7e..317b3599ae3 100644 --- a/reference/matrix/dense_kernels.cpp +++ b/reference/matrix/dense_kernels.cpp @@ -297,6 +297,19 @@ void compute_dot(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); +template +void compute_dot_dispatch(std::shared_ptr exec, + const matrix::Dense* x, + const matrix::Dense* y, + matrix::Dense* result, Array& tmp) +{ + compute_dot(exec, x, y, result, tmp); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL); + + template void compute_conj_dot(std::shared_ptr exec, const matrix::Dense* x, @@ -316,6 +329,20 @@ void compute_conj_dot(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); +template +void compute_conj_dot_dispatch(std::shared_ptr exec, + const matrix::Dense* x, + const matrix::Dense* y, + matrix::Dense* result, + Array& tmp) +{ + compute_conj_dot(exec, x, y, result, tmp); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); + + template void compute_norm2(std::shared_ptr exec, const matrix::Dense* x, @@ -338,6 +365,19 @@ void compute_norm2(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); +template +void compute_norm2_dispatch(std::shared_ptr exec, + const matrix::Dense* x, + matrix::Dense>* result, + Array& tmp) +{ + compute_norm2(exec, x, result, tmp); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); + + template void compute_norm1(std::shared_ptr exec, const matrix::Dense* x, From e71958ece0e574c043bed911c6f2229bb158930a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 31 Mar 2022 14:35:37 +0200 Subject: [PATCH 4/5] review updates * handle executor of tmp storage in core, not device * add missing documentation Co-authored-by: Pratik Nayak --- .../base/kernel_launch_reduction.hpp.inc | 5 -- core/matrix/dense.cpp | 16 ++++ dpcpp/base/kernel_launch_reduction.dp.hpp | 5 -- include/ginkgo/core/matrix/dense.hpp | 16 +++- omp/base/kernel_launch_reduction.hpp | 4 - test/matrix/dense_kernels.cpp | 80 +++++++++++++++++-- 6 files changed, 104 insertions(+), 22 deletions(-) diff --git a/common/cuda_hip/base/kernel_launch_reduction.hpp.inc b/common/cuda_hip/base/kernel_launch_reduction.hpp.inc index 4ecf9c9979d..e32312e4eb4 100644 --- a/common/cuda_hip/base/kernel_launch_reduction.hpp.inc +++ b/common/cuda_hip/base/kernel_launch_reduction.hpp.inc @@ -130,7 +130,6 @@ void run_kernel_reduction_cached(std::shared_ptr exec, ceildiv(size, block_size), exec->get_num_warps() * oversubscription); if (num_blocks > 1) { const auto 
required_storage = sizeof(ValueType) * num_blocks; - tmp.set_executor(exec); if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } @@ -171,7 +170,6 @@ void run_kernel_reduction_cached(std::shared_ptr exec, exec->get_num_warps() * oversubscription); if (num_blocks > 1) { const auto required_storage = sizeof(ValueType) * num_blocks; - tmp.set_executor(exec); if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } @@ -395,7 +393,6 @@ void run_generic_col_reduction_small( as_device_type(result), args...); } else { const auto required_storage = sizeof(ValueType) * num_blocks * cols; - tmp.set_executor(exec); if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } @@ -441,7 +438,6 @@ void run_kernel_row_reduction_cached( if (rows * cols > resources && rows < cols) { const auto col_blocks = ceildiv(rows * cols, resources); const auto required_storage = sizeof(ValueType) * col_blocks * rows; - tmp.set_executor(exec); if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } @@ -512,7 +508,6 @@ void run_kernel_col_reduction_cached( as_device_type(result), map_to_device(args)...); } else { const auto required_storage = sizeof(ValueType) * row_blocks * cols; - tmp.set_executor(exec); if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index afa0de1f0e9..611925231cc 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -341,6 +341,10 @@ void Dense::compute_dot(const LinOp* b, LinOp* result, GKO_ASSERT_EQUAL_DIMENSIONS(this, b); GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); auto exec = this->get_executor(); + if (tmp.get_executor() != exec) { + tmp.clear(); + tmp.set_executor(exec); + } auto local_b = make_temporary_clone(exec, b); auto local_res = make_temporary_clone(exec, result); auto dense_b = make_temporary_conversion(local_b.get()); @@ -371,6 +375,10 @@ void Dense::compute_conj_dot(const LinOp* b, LinOp* result, GKO_ASSERT_EQUAL_DIMENSIONS(this, b); GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); auto exec = this->get_executor(); + if (tmp.get_executor() != exec) { + tmp.clear(); + tmp.set_executor(exec); + } auto local_b = make_temporary_clone(exec, b); auto local_res = make_temporary_clone(exec, result); auto dense_b = make_temporary_conversion(local_b.get()); @@ -400,6 +408,10 @@ void Dense::compute_norm2(LinOp* result, Array& tmp) const { GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); auto exec = this->get_executor(); + if (tmp.get_executor() != exec) { + tmp.clear(); + tmp.set_executor(exec); + } auto local_result = make_temporary_clone(exec, result); auto dense_res = make_temporary_conversion>( local_result.get()); @@ -424,6 +436,10 @@ void Dense::compute_norm1(LinOp* result, Array& tmp) const { GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); auto exec = this->get_executor(); + if (tmp.get_executor() != exec) { + tmp.clear(); + tmp.set_executor(exec); + } auto local_result = make_temporary_clone(exec, result); auto dense_res = make_temporary_conversion>( local_result.get()); diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp index 330e5aa7f44..601527c6cae 100644 --- a/dpcpp/base/kernel_launch_reduction.dp.hpp +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -179,7 +179,6 @@ void run_kernel_reduction_impl(std::shared_ptr 
exec, auto queue = exec->get_queue(); if (num_workgroups > 1) { const auto required_storage = sizeof(ValueType) * num_workgroups; - tmp.set_executor(exec); if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } @@ -227,7 +226,6 @@ void run_kernel_reduction_impl(std::shared_ptr exec, auto queue = exec->get_queue(); if (num_workgroups > 1) { const auto required_storage = sizeof(ValueType) * num_workgroups; - tmp.set_executor(exec); if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } @@ -531,7 +529,6 @@ void run_generic_col_reduction_small(syn::value_list, }); } else { const auto required_storage = sizeof(ValueType) * row_blocks * cols; - tmp.set_executor(exec); if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } @@ -578,7 +575,6 @@ void run_kernel_row_reduction_stage1(std::shared_ptr exec, if (rows * cols > resources && rows < cols) { const auto col_blocks = ceildiv(rows * cols, resources); const auto required_storage = sizeof(ValueType) * col_blocks * rows; - tmp.set_executor(exec); if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } @@ -651,7 +647,6 @@ void run_kernel_col_reduction_stage1(std::shared_ptr exec, }); } else { const auto required_storage = sizeof(ValueType) * row_blocks * cols; - tmp.set_executor(exec); if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index 210c67941a8..bcaba840fdf 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -747,6 +747,9 @@ class Dense * @param result a Dense row vector, used to store the dot product * (the number of column in the vector must match the number * of columns of this) + * @param tmp the temporary storage to use for partial sums during the + * reduction computation. It may be resized and/or reset to the + * correct executor. */ void compute_dot(const LinOp* b, LinOp* result, Array& tmp) const; @@ -767,6 +770,9 @@ class Dense * @param result a Dense row vector, used to store the dot product * (the number of column in the vector must match the number * of columns of this) + * @param tmp the temporary storage to use for partial sums during the + * reduction computation. It may be resized and/or reset to the + * correct executor. */ void compute_conj_dot(const LinOp* b, LinOp* result, Array& tmp) const; @@ -785,7 +791,10 @@ class Dense * * @param result a Dense row vector, used to store the norm * (the number of columns in the vector must match the - * number of columns of this) + * number of columns of this) + * @param tmp the temporary storage to use for partial sums during the + * reduction computation. It may be resized and/or reset to the + * correct executor. */ void compute_norm2(LinOp* result, Array& tmp) const; @@ -803,7 +812,10 @@ class Dense * * @param result a Dense row vector, used to store the norm * (the number of columns in the vector must match the - * number of columns of this) + * number of columns of this) + * @param tmp the temporary storage to use for partial sums during the + * reduction computation. It may be resized and/or reset to the + * correct executor. 
 */
 void compute_norm1(LinOp* result, Array<char>& tmp) const;
 
diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp
index 55046cae832..c95a9caa5de 100644
--- a/omp/base/kernel_launch_reduction.hpp
+++ b/omp/base/kernel_launch_reduction.hpp
@@ -66,7 +66,6 @@ void run_kernel_reduction_impl(std::shared_ptr<const OmpExecutor> exec,
     const auto ssize = static_cast<int64>(size);
     const auto work_per_thread = ceildiv(ssize, num_threads);
     const auto required_storage = sizeof(ValueType) * num_threads;
-    tmp.set_executor(exec);
     if (tmp.get_num_elems() < required_storage) {
         tmp.resize_and_reset(required_storage);
     }
@@ -103,7 +102,6 @@ void run_kernel_reduction_sized_impl(syn::value_list<int, remainder_cols>,
     const auto num_threads = static_cast<int64>(omp_get_max_threads());
     const auto work_per_thread = ceildiv(rows, num_threads);
     const auto required_storage = sizeof(ValueType) * num_threads;
-    tmp.set_executor(exec);
     if (tmp.get_num_elems() < required_storage) {
         tmp.resize_and_reset(required_storage);
     }
@@ -233,7 +231,6 @@ void run_kernel_row_reduction_impl(std::shared_ptr<const OmpExecutor> exec,
         // small number of rows and large reduction sizes: do partial sum first
         const auto work_per_thread = ceildiv(cols, num_threads);
         const auto required_storage = sizeof(ValueType) * rows * num_threads;
-        tmp.set_executor(exec);
         if (tmp.get_num_elems() < required_storage) {
             tmp.resize_and_reset(required_storage);
         }
@@ -331,7 +328,6 @@ void run_kernel_col_reduction_sized_impl(
         ceildiv(reduction_kernel_oversubscription * num_threads, cols);
     const auto rows_per_thread = ceildiv(rows, reduction_size);
     const auto required_storage = sizeof(ValueType) * rows * reduction_size;
-    tmp.set_executor(exec);
     if (tmp.get_num_elems() < required_storage) {
         tmp.resize_and_reset(required_storage);
     }
diff --git a/test/matrix/dense_kernels.cpp b/test/matrix/dense_kernels.cpp
index b843f3ec6a0..5ea753c225a 100644
--- a/test/matrix/dense_kernels.cpp
+++ b/test/matrix/dense_kernels.cpp
@@ -1014,6 +1014,23 @@ TEST_F(Dense, ComputeDotIsEquivalentToRef)
 }
 
 
+TEST_F(Dense, ComputeDotWithPreallocatedTmpIsEquivalentToRef)
+{
+    set_up_vector_data(42);
+
+    auto dot_size = gko::dim<2>{1, x->get_size()[1]};
+    auto dot_expected = Mtx::create(ref, dot_size);
+    auto ddot = Mtx::create(ref, dot_size);
+    gko::Array<char> tmp{exec, 12345};
+
+    // all parameters are on ref to check cross-executor calls
+    x->compute_dot(y.get(), dot_expected.get());
+    dx->compute_dot(y.get(), ddot.get(), tmp);
+
+    GKO_ASSERT_MTX_NEAR(ddot, dot_expected, r<vtype>::value);
+}
+
+
 TEST_F(Dense, ComputeDotWithTmpIsEquivalentToRef)
 {
     set_up_vector_data(40);
@@ -1021,10 +1038,10 @@ TEST_F(Dense, ComputeDotWithTmpIsEquivalentToRef)
     auto dot_size = gko::dim<2>{1, x->get_size()[1]};
     auto dot_expected = Mtx::create(ref, dot_size);
     auto ddot = Mtx::create(ref, dot_size);
-    gko::Array<char> tmp{ref};
+    gko::Array<char> tmp{exec};
 
     // all parameters are on ref to check cross-executor calls
-    x->compute_dot(y.get(), dot_expected.get(), tmp);
+    x->compute_dot(y.get(), dot_expected.get());
     dx->compute_dot(y.get(), ddot.get(), tmp);
 
     GKO_ASSERT_MTX_NEAR(ddot, dot_expected, r<vtype>::value);
@@ -1047,6 +1064,23 @@ TEST_F(Dense, ComputeConjDotIsEquivalentToRef)
 }
 
 
+TEST_F(Dense, ComputeConjDotWithPreallocatedTmpIsEquivalentToRef)
+{
+    set_up_vector_data(36);
+
+    auto dot_size = gko::dim<2>{1, x->get_size()[1]};
+    auto dot_expected = Mtx::create(ref, dot_size);
+    auto ddot = Mtx::create(ref, dot_size);
+    gko::Array<char> tmp{exec, 12345};
+
+    // all parameters are on ref to check cross-executor calls
+    x->compute_conj_dot(y.get(), dot_expected.get());
+    dx->compute_conj_dot(y.get(), ddot.get(), tmp);
+
+    GKO_ASSERT_MTX_NEAR(ddot, dot_expected, r<vtype>::value);
+}
+
+
 TEST_F(Dense, ComputeConjDotWithTmpIsEquivalentToRef)
 {
     set_up_vector_data(65);
@@ -1057,7 +1091,7 @@ TEST_F(Dense, ComputeConjDotWithTmpIsEquivalentToRef)
     gko::Array<char> tmp{ref};
 
     // all parameters are on ref to check cross-executor calls
-    x->compute_conj_dot(y.get(), dot_expected.get(), tmp);
+    x->compute_conj_dot(y.get(), dot_expected.get());
     dx->compute_conj_dot(y.get(), ddot.get(), tmp);
 
     GKO_ASSERT_MTX_NEAR(ddot, dot_expected, r<vtype>::value);
@@ -1080,6 +1114,23 @@ TEST_F(Dense, ComputeNorm1IsEquivalentToRef)
 }
 
 
+TEST_F(Dense, ComputeNorm1WithPreallocatedTmpIsEquivalentToRef)
+{
+    set_up_vector_data(7);
+
+    auto norm_size = gko::dim<2>{1, x->get_size()[1]};
+    auto norm_expected = NormVector::create(ref, norm_size);
+    auto dnorm = NormVector::create(ref, norm_size);
+    gko::Array<char> tmp{exec, 12345};
+
+    // all parameters are on ref to check cross-executor calls
+    x->compute_norm1(norm_expected.get());
+    dx->compute_norm1(dnorm.get(), tmp);
+
+    GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, r<vtype>::value);
+}
+
+
 TEST_F(Dense, ComputeNorm1WithTmpIsEquivalentToRef)
 {
     set_up_vector_data(10);
@@ -1090,7 +1141,7 @@ TEST_F(Dense, ComputeNorm1WithTmpIsEquivalentToRef)
     gko::Array<char> tmp{ref};
 
     // all parameters are on ref to check cross-executor calls
-    x->compute_norm1(norm_expected.get(), tmp);
+    x->compute_norm1(norm_expected.get());
     dx->compute_norm1(dnorm.get(), tmp);
 
     GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, r<vtype>::value);
@@ -1113,7 +1164,7 @@
 }
 
 
-TEST_F(Dense, ComputeNorm2WithTmpIsEquivalentToRef)
+TEST_F(Dense, ComputeNorm2WithPreallocatedTmpIsEquivalentToRef)
 {
     set_up_vector_data(3);
 
@@ -1123,7 +1174,24 @@
     gko::Array<char> tmp{ref};
 
     // all parameters are on ref to check cross-executor calls
-    x->compute_norm2(norm_expected.get(), tmp);
+    x->compute_norm2(norm_expected.get());
+    dx->compute_norm2(dnorm.get(), tmp);
+
+    GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, r<vtype>::value);
+}
+
+
+TEST_F(Dense, ComputeNorm2WithTmpIsEquivalentToRef)
+{
+    set_up_vector_data(14);
+
+    auto norm_size = gko::dim<2>{1, x->get_size()[1]};
+    auto norm_expected = NormVector::create(ref, norm_size);
+    auto dnorm = NormVector::create(ref, norm_size);
+    gko::Array<char> tmp{exec, 12345};
+
+    // all parameters are on ref to check cross-executor calls
+    x->compute_norm2(norm_expected.get());
     dx->compute_norm2(dnorm.get(), tmp);
 
     GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, r<vtype>::value);
 }
 

From c6fe8561dd2919bb6fb10b849286af9d99a856a4 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Thu, 7 Apr 2022 11:25:56 +0200
Subject: [PATCH 5/5] review updates

* use *_dispatch reduction kernels in core and GMRES
* test vendor kernels

Co-authored-by: Yuhsiang M.
Tsai Co-authored-by: Pratik Nayak --- core/matrix/dense.cpp | 6 +++--- cuda/solver/cb_gmres_kernels.cu | 3 ++- cuda/solver/gmres_kernels.cu | 3 ++- dpcpp/solver/cb_gmres_kernels.dp.cpp | 3 ++- dpcpp/solver/gmres_kernels.dp.cpp | 3 ++- hip/solver/cb_gmres_kernels.hip.cpp | 3 ++- hip/solver/gmres_kernels.hip.cpp | 3 ++- test/matrix/dense_kernels.cpp | 6 +++--- 8 files changed, 18 insertions(+), 12 deletions(-) diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 611925231cc..b2a92cbbe95 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -76,9 +76,9 @@ GKO_REGISTER_OPERATION(add_scaled, dense::add_scaled); GKO_REGISTER_OPERATION(sub_scaled, dense::sub_scaled); GKO_REGISTER_OPERATION(add_scaled_diag, dense::add_scaled_diag); GKO_REGISTER_OPERATION(sub_scaled_diag, dense::sub_scaled_diag); -GKO_REGISTER_OPERATION(compute_dot, dense::compute_dot); -GKO_REGISTER_OPERATION(compute_conj_dot, dense::compute_conj_dot); -GKO_REGISTER_OPERATION(compute_norm2, dense::compute_norm2); +GKO_REGISTER_OPERATION(compute_dot, dense::compute_dot_dispatch); +GKO_REGISTER_OPERATION(compute_conj_dot, dense::compute_conj_dot_dispatch); +GKO_REGISTER_OPERATION(compute_norm2, dense::compute_norm2_dispatch); GKO_REGISTER_OPERATION(compute_norm1, dense::compute_norm1); GKO_REGISTER_OPERATION(compute_max_nnz_per_row, dense::compute_max_nnz_per_row); GKO_REGISTER_OPERATION(compute_hybrid_coo_row_ptrs, diff --git a/cuda/solver/cb_gmres_kernels.cu b/cuda/solver/cb_gmres_kernels.cu index 7bee0750390..0abdb07db28 100644 --- a/cuda/solver/cb_gmres_kernels.cu +++ b/cuda/solver/cb_gmres_kernels.cu @@ -144,7 +144,8 @@ void initialize_2(std::shared_ptr exec, acc::as_cuda_range(krylov_bases), as_cuda_type(residual_norm_collection->get_values()), residual_norm_collection->get_stride()); - kernels::cuda::dense::compute_norm2(exec, residual, residual_norm, tmp); + kernels::cuda::dense::compute_norm2_dispatch(exec, residual, residual_norm, + tmp); if (use_scalar) { components::fill_array(exec, diff --git a/cuda/solver/gmres_kernels.cu b/cuda/solver/gmres_kernels.cu index d228b47dd37..dc3d5ef93a8 100644 --- a/cuda/solver/gmres_kernels.cu +++ b/cuda/solver/gmres_kernels.cu @@ -119,7 +119,8 @@ void initialize_2(std::shared_ptr exec, constexpr auto block_size = default_block_size; Array tmp{exec}; - kernels::cuda::dense::compute_norm2(exec, residual, residual_norm, tmp); + kernels::cuda::dense::compute_norm2_dispatch(exec, residual, residual_norm, + tmp); const auto grid_dim_2 = ceildiv(num_rows * num_rhs, default_block_size); initialize_2_2_kernel<<>>( diff --git a/dpcpp/solver/cb_gmres_kernels.dp.cpp b/dpcpp/solver/cb_gmres_kernels.dp.cpp index 5e29c7aa612..2f048e1828f 100644 --- a/dpcpp/solver/cb_gmres_kernels.dp.cpp +++ b/dpcpp/solver/cb_gmres_kernels.dp.cpp @@ -1013,7 +1013,8 @@ void initialize_2(std::shared_ptr exec, residual->get_size()[1], krylov_dim, krylov_bases, residual_norm_collection->get_values(), residual_norm_collection->get_stride()); - kernels::dpcpp::dense::compute_norm2(exec, residual, residual_norm, tmp); + kernels::dpcpp::dense::compute_norm2_dispatch(exec, residual, residual_norm, + tmp); if (use_scalar) { components::fill_array(exec, diff --git a/dpcpp/solver/gmres_kernels.dp.cpp b/dpcpp/solver/gmres_kernels.dp.cpp index 5c68e1580d5..608b956ba15 100644 --- a/dpcpp/solver/gmres_kernels.dp.cpp +++ b/dpcpp/solver/gmres_kernels.dp.cpp @@ -472,7 +472,8 @@ void initialize_2(std::shared_ptr exec, constexpr auto block_size = default_block_size; Array tmp{exec}; - 
kernels::dpcpp::dense::compute_norm2(exec, residual, residual_norm, tmp); + kernels::dpcpp::dense::compute_norm2_dispatch(exec, residual, residual_norm, + tmp); const dim3 grid_dim_2(ceildiv(num_rows * num_rhs, default_block_size), 1, 1); diff --git a/hip/solver/cb_gmres_kernels.hip.cpp b/hip/solver/cb_gmres_kernels.hip.cpp index 09f206d355b..b86c8bdc34a 100644 --- a/hip/solver/cb_gmres_kernels.hip.cpp +++ b/hip/solver/cb_gmres_kernels.hip.cpp @@ -146,7 +146,8 @@ void initialize_2(std::shared_ptr exec, krylov_dim, acc::as_hip_range(krylov_bases), as_hip_type(residual_norm_collection->get_values()), residual_norm_collection->get_stride()); - kernels::hip::dense::compute_norm2(exec, residual, residual_norm, tmp); + kernels::hip::dense::compute_norm2_dispatch(exec, residual, residual_norm, + tmp); if (use_scalar) { components::fill_array(exec, diff --git a/hip/solver/gmres_kernels.hip.cpp b/hip/solver/gmres_kernels.hip.cpp index 6c4c405f0de..4272eadbc12 100644 --- a/hip/solver/gmres_kernels.hip.cpp +++ b/hip/solver/gmres_kernels.hip.cpp @@ -123,7 +123,8 @@ void initialize_2(std::shared_ptr exec, constexpr auto block_size = default_block_size; Array tmp{exec}; - kernels::hip::dense::compute_norm2(exec, residual, residual_norm, tmp); + kernels::hip::dense::compute_norm2_dispatch(exec, residual, residual_norm, + tmp); const auto grid_dim_2 = ceildiv(num_rows * num_rhs, default_block_size); hipLaunchKernelGGL( diff --git a/test/matrix/dense_kernels.cpp b/test/matrix/dense_kernels.cpp index 5ea753c225a..eba5be5fd9d 100644 --- a/test/matrix/dense_kernels.cpp +++ b/test/matrix/dense_kernels.cpp @@ -1000,7 +1000,7 @@ TEST_F(Dense, ExtractDiagonalOnShortFatIntoDenseCrossExecutor) TEST_F(Dense, ComputeDotIsEquivalentToRef) { - set_up_vector_data(2); + set_up_vector_data(1); auto dot_size = gko::dim<2>{1, x->get_size()[1]}; auto dot_expected = Mtx::create(ref, dot_size); @@ -1050,7 +1050,7 @@ TEST_F(Dense, ComputeDotWithTmpIsEquivalentToRef) TEST_F(Dense, ComputeConjDotIsEquivalentToRef) { - set_up_vector_data(13); + set_up_vector_data(1); auto dot_size = gko::dim<2>{1, x->get_size()[1]}; auto dot_expected = Mtx::create(ref, dot_size); @@ -1150,7 +1150,7 @@ TEST_F(Dense, ComputeNorm1WithTmpIsEquivalentToRef) TEST_F(Dense, ComputeNorm2IsEquivalentToRef) { - set_up_vector_data(5); + set_up_vector_data(1); auto norm_size = gko::dim<2>{1, x->get_size()[1]}; auto norm_expected = NormVector::create(ref, norm_size);
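Taken together, the series leaves the user-facing reductions in the following state: core routes compute_dot, compute_conj_dot, and compute_norm2 through the *_dispatch kernels, the workspace array is rebound to the vectors' executor when needed, and single-column vectors hit vendor BLAS directly. A usage sketch of the resulting API (editorial illustration, not part of the patch; assumes a CUDA build):

    #include <ginkgo/ginkgo.hpp>

    int main()
    {
        auto ref = gko::ReferenceExecutor::create();
        auto exec = gko::CudaExecutor::create(0, ref);
        auto x = gko::matrix::Dense<double>::create(exec, gko::dim<2>{1000, 1});
        auto y = gko::matrix::Dense<double>::create(exec, gko::dim<2>{1000, 1});
        auto res = gko::matrix::Dense<double>::create(exec, gko::dim<2>{1, 1});
        x->fill(1.0);
        y->fill(2.0);

        // One workspace reused across iterations: it is sized on first use
        // and only regrows if a later reduction needs more bytes. With a
        // single column, compute_dot dispatches to cublas::dot, so tmp can
        // even stay empty on this path.
        gko::Array<char> tmp{exec};
        for (int i = 0; i < 10; ++i) {
            x->compute_dot(y.get(), res.get(), tmp);
        }
    }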