From 631b4f6950963d7511f69d78703784b9118e5c51 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Mon, 12 Jul 2021 15:44:09 +0200
Subject: [PATCH 01/25] use signed integers in simple kernels

---
 common/unified/base/kernel_launch.hpp        | 13 +++++----
 common/unified/base/kernel_launch_solver.hpp |  6 ++--
 cuda/base/kernel_launch.cuh                  | 15 +++++-----
 cuda/base/kernel_launch_solver.cuh           |  7 +++--
 cuda/test/base/kernel_launch.cu              | 17 ++++++------
 dpcpp/base/kernel_launch.dp.hpp              | 17 +++++++-----
 dpcpp/base/kernel_launch_solver.dp.hpp       | 13 +++++----
 dpcpp/test/base/kernel_launch.dp.cpp         | 19 ++++++-------
 hip/base/kernel_launch.hip.hpp               | 17 ++++++------
 hip/base/kernel_launch_solver.hip.hpp        |  8 ++++--
 hip/test/base/kernel_launch.hip.cpp          | 17 ++++++------
 omp/base/kernel_launch.hpp                   | 29 ++++++++++----------
 omp/base/kernel_launch_solver.hpp            |  7 +++--
 omp/test/base/kernel_launch.cpp              | 17 ++++++------
 14 files changed, 108 insertions(+), 94 deletions(-)

diff --git a/common/unified/base/kernel_launch.hpp b/common/unified/base/kernel_launch.hpp
index 6b3a698768c..bf403d3a656 100644
--- a/common/unified/base/kernel_launch.hpp
+++ b/common/unified/base/kernel_launch.hpp
@@ -170,14 +170,13 @@ namespace GKO_DEVICE_NAMESPACE {

 template <typename ValueType>
 struct matrix_accessor {
     ValueType* data;
-    size_type stride;
+    int64 stride;

     /**
      * @internal
      * Returns a reference to the element at position (row, col).
      */
-    GKO_INLINE GKO_ATTRIBUTES ValueType& operator()(size_type row,
-                                                    size_type col)
+    GKO_INLINE GKO_ATTRIBUTES ValueType& operator()(int64 row, int64 col)
     {
         return data[row * stride + col];
     }
@@ -187,7 +186,7 @@ struct matrix_accessor {
      * Returns a reference to the element at position idx in the underlying
      * storage.
      */
-    GKO_INLINE GKO_ATTRIBUTES ValueType& operator[](size_type idx)
+    GKO_INLINE GKO_ATTRIBUTES ValueType& operator[](int64 idx)
     {
         return data[idx];
     }
@@ -223,7 +222,8 @@ struct to_device_type_impl<matrix::Dense<ValueType>*&> {
     using type = matrix_accessor<device_type<ValueType>>;
     static type map_to_device(matrix::Dense<ValueType>* mtx)
     {
-        return {as_device_type(mtx->get_values()), mtx->get_stride()};
+        return {as_device_type(mtx->get_values()),
+                static_cast<int64>(mtx->get_stride())};
     }
 };

@@ -232,7 +232,8 @@ struct to_device_type_impl<const matrix::Dense<ValueType>*&> {
     using type = matrix_accessor<const device_type<ValueType>>;
     static type map_to_device(const matrix::Dense<ValueType>* mtx)
     {
-        return {as_device_type(mtx->get_const_values()), mtx->get_stride()};
+        return {as_device_type(mtx->get_const_values()),
+                static_cast<int64>(mtx->get_stride())};
     }
 };

diff --git a/common/unified/base/kernel_launch_solver.hpp b/common/unified/base/kernel_launch_solver.hpp
index 6c8a1296b83..716bd94a093 100644
--- a/common/unified/base/kernel_launch_solver.hpp
+++ b/common/unified/base/kernel_launch_solver.hpp
@@ -63,7 +63,7 @@ struct default_stride_dense_wrapper {
 template <typename T>
 struct device_unpack_solver_impl {
     using type = T;
-    static GKO_INLINE GKO_ATTRIBUTES type unpack(T param, size_type)
+    static GKO_INLINE GKO_ATTRIBUTES type unpack(T param, int64)
     {
         return param;
     }
@@ -72,8 +72,8 @@ struct device_unpack_solver_impl {
 template <typename ValueType>
 struct device_unpack_solver_impl<default_stride_dense_wrapper<ValueType>> {
     using type = matrix_accessor<ValueType>;
-    static GKO_INLINE GKO_ATTRIBUTES type unpack(
-        default_stride_dense_wrapper<ValueType> param, size_type default_stride)
+    static GKO_INLINE GKO_ATTRIBUTES type
+    unpack(default_stride_dense_wrapper<ValueType> param, int64 default_stride)
     {
         return {param.data, default_stride};
     }

diff --git a/cuda/base/kernel_launch.cuh b/cuda/base/kernel_launch.cuh
index d55faed5053..5179a5cc27d 100644
--- a/cuda/base/kernel_launch.cuh
+++ b/cuda/base/kernel_launch.cuh
@@ -51,9 +51,9 @@ constexpr int default_block_size = 512;

 template <typename KernelFunction, typename... KernelArgs>
 __global__ __launch_bounds__(default_block_size) void generic_kernel_1d(
-    size_type size, KernelFunction fn, KernelArgs... args)
+    int64 size, KernelFunction fn, KernelArgs... args)
 {
-    auto tidx = thread::get_thread_id_flat();
+    auto tidx = thread::get_thread_id_flat<int64>();
     if (tidx >= size) {
         return;
     }
@@ -63,9 +63,9 @@ __global__ __launch_bounds__(default_block_size) void generic_kernel_1d(

 template <typename KernelFunction, typename... KernelArgs>
 __global__ __launch_bounds__(default_block_size) void generic_kernel_2d(
-    size_type rows, size_type cols, KernelFunction fn, KernelArgs... args)
+    int64 rows, int64 cols, KernelFunction fn, KernelArgs... args)
 {
-    auto tidx = thread::get_thread_id_flat();
+    auto tidx = thread::get_thread_id_flat<int64>();
     auto col = tidx % cols;
     auto row = tidx / cols;
     if (row >= rows) {
@@ -82,7 +82,7 @@ void run_kernel(std::shared_ptr<const CudaExecutor> exec, KernelFunction fn,
     gko::cuda::device_guard guard{exec->get_device_id()};
     constexpr auto block_size = default_block_size;
     auto num_blocks = ceildiv(size, block_size);
-    generic_kernel_1d<<<num_blocks, block_size>>>(size, fn,
+    generic_kernel_1d<<<num_blocks, block_size>>>(static_cast<int64>(size), fn,
                                                   map_to_device(args)...);
 }

@@ -93,8 +93,9 @@ void run_kernel(std::shared_ptr<const CudaExecutor> exec, KernelFunction fn,
     gko::cuda::device_guard guard{exec->get_device_id()};
     constexpr auto block_size = default_block_size;
     auto num_blocks = ceildiv(size[0] * size[1], block_size);
-    generic_kernel_2d<<<num_blocks, block_size>>>(size[0], size[1], fn,
-                                                  map_to_device(args)...);
+    generic_kernel_2d<<<num_blocks, block_size>>>(static_cast<int64>(size[0]),
+                                                  static_cast<int64>(size[1]),
+                                                  fn, map_to_device(args)...);
 }

diff --git a/cuda/base/kernel_launch_solver.cuh b/cuda/base/kernel_launch_solver.cuh
index bf2f6e1a995..f4da60ddede 100644
--- a/cuda/base/kernel_launch_solver.cuh
+++ b/cuda/base/kernel_launch_solver.cuh
@@ -43,10 +43,10 @@ namespace cuda {

 template <typename KernelFunction, typename... KernelArgs>
 __global__ __launch_bounds__(default_block_size) void generic_kernel_2d_solver(
-    size_type rows, size_type cols, size_type default_stride, KernelFunction fn,
+    int64 rows, int64 cols, int64 default_stride, KernelFunction fn,
     KernelArgs... args)
 {
-    auto tidx = thread::get_thread_id_flat();
+    auto tidx = thread::get_thread_id_flat<int64>();
     auto col = tidx % cols;
     auto row = tidx / cols;
     if (row >= rows) {
@@ -66,7 +66,8 @@ void run_kernel_solver(std::shared_ptr<const CudaExecutor> exec,
     constexpr auto block_size = default_block_size;
     auto num_blocks = ceildiv(size[0] * size[1], block_size);
     generic_kernel_2d_solver<<<num_blocks, block_size>>>(
-        size[0], size[1], default_stride, fn, map_to_device(args)...);
+        static_cast<int64>(size[0]), static_cast<int64>(size[1]),
+        static_cast<int64>(default_stride), fn, map_to_device(args)...);
 }

diff --git a/cuda/test/base/kernel_launch.cu b/cuda/test/base/kernel_launch.cu
index abd4775290c..adf443445a5 100644
--- a/cuda/test/base/kernel_launch.cu
+++ b/cuda/test/base/kernel_launch.cu
@@ -54,6 +54,7 @@ namespace {

 using gko::dim;
+using gko::int64;
 using gko::size_type;
 using std::is_same;

@@ -104,7 +105,7 @@ void run1d(std::shared_ptr<gko::CudaExecutor> exec, size_type dim, int* data)
     gko::kernels::cuda::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             d[i] = i;
         },
@@ -124,7 +125,7 @@ void run1d(std::shared_ptr<gko::CudaExecutor> exec, gko::Array<int>& data)
     gko::kernels::cuda::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d, auto d_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             static_assert(is_same<const int*, decltype(d_ptr)>::value, "type");
             if (d == d_ptr) {
@@ -149,7 +150,7 @@ void run1d(std::shared_ptr<gko::CudaExecutor> exec, gko::matrix::Dense<>* m)
     gko::kernels::cuda::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<double*, decltype(d)>::value, "type");
             static_assert(is_same<const double*, decltype(d2)>::value, "type");
@@ -185,8 +186,8 @@ void run2d(std::shared_ptr<gko::CudaExecutor> exec, int* data)
     gko::kernels::cuda::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
-            static_assert(is_same<size_type, decltype(j)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(j)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             d[i + 4 * j] = 4 * i + j;
         },
@@ -206,8 +207,8 @@ void run2d(std::shared_ptr<gko::CudaExecutor> exec, gko::Array<int>& data)
     gko::kernels::cuda::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
-            static_assert(is_same<size_type, decltype(j)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(j)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             static_assert(is_same<const int*, decltype(d_ptr)>::value, "type");
             if (d == d_ptr) {
@@ -234,7 +235,7 @@ void run2d(std::shared_ptr<gko::CudaExecutor> exec, gko::matrix::Dense<>* m1,
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3,
                       auto d4, auto d2_ptr, auto d3_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<double&, decltype(d(0, 0))>::value, "type");
             static_assert(is_same<double&, decltype(d2(0, 0))>::value, "type");

diff --git a/dpcpp/base/kernel_launch.dp.hpp b/dpcpp/base/kernel_launch.dp.hpp
index 4fe161ff320..0941fc7d524 100644
--- a/dpcpp/base/kernel_launch.dp.hpp
+++ b/dpcpp/base/kernel_launch.dp.hpp
@@ -45,23 +45,23 @@ namespace dpcpp {

 template <typename KernelFunction, typename... KernelArgs>
-void generic_kernel_1d(sycl::handler& cgh, size_type size, KernelFunction fn,
+void generic_kernel_1d(sycl::handler& cgh, int64 size, KernelFunction fn,
                        KernelArgs... args)
 {
     cgh.parallel_for(sycl::range<1>{size}, [=](sycl::id<1> idx_id) {
-        auto idx = static_cast<size_type>(idx_id[0]);
+        auto idx = static_cast<int64>(idx_id[0]);
         fn(idx, args...);
     });
 }

 template <typename KernelFunction, typename... KernelArgs>
-void generic_kernel_2d(sycl::handler& cgh, size_type rows, size_type cols,
+void generic_kernel_2d(sycl::handler& cgh, int64 rows, int64 cols,
                        KernelFunction fn, KernelArgs... args)
 {
     cgh.parallel_for(sycl::range<2>{rows, cols}, [=](sycl::id<2> idx) {
-        auto row = static_cast<size_type>(idx[0]);
-        auto col = static_cast<size_type>(idx[1]);
+        auto row = static_cast<int64>(idx[0]);
+        auto col = static_cast<int64>(idx[1]);
         fn(row, col, args...);
     });
 }
@@ -72,7 +72,8 @@ void run_kernel(std::shared_ptr<const DpcppExecutor> exec, KernelFunction fn,
                 size_type size, KernelArgs&&... args)
 {
     exec->get_queue()->submit([&](sycl::handler& cgh) {
-        generic_kernel_1d(cgh, size, fn, map_to_device(args)...);
+        generic_kernel_1d(cgh, static_cast<int64>(size), fn,
+                          map_to_device(args)...);
     });
 }

@@ -81,7 +82,9 @@ void run_kernel(std::shared_ptr<const DpcppExecutor> exec, KernelFunction fn,
                 dim<2> size, KernelArgs&&... args)
 {
     exec->get_queue()->submit([&](sycl::handler& cgh) {
-        generic_kernel_2d(cgh, size[0], size[1], fn, map_to_device(args)...);
+        generic_kernel_2d(cgh, static_cast<int64>(size[0]),
+                          static_cast<int64>(size[1]), fn,
+                          map_to_device(args)...);
     });
 }

diff --git a/dpcpp/base/kernel_launch_solver.dp.hpp b/dpcpp/base/kernel_launch_solver.dp.hpp
index 5cec5b55d79..aa25d167bf3 100644
--- a/dpcpp/base/kernel_launch_solver.dp.hpp
+++ b/dpcpp/base/kernel_launch_solver.dp.hpp
@@ -42,13 +42,13 @@ namespace dpcpp {

 template <typename KernelFunction, typename... KernelArgs>
-void generic_kernel_2d_solver(sycl::handler& cgh, size_type rows,
-                              size_type cols, size_type default_stride,
-                              KernelFunction fn, KernelArgs... args)
+void generic_kernel_2d_solver(sycl::handler& cgh, int64 rows, int64 cols,
+                              int64 default_stride, KernelFunction fn,
+                              KernelArgs... args)
 {
     cgh.parallel_for(sycl::range<2>{rows, cols}, [=](sycl::id<2> idx) {
-        auto row = static_cast<size_type>(idx[0]);
-        auto col = static_cast<size_type>(idx[1]);
+        auto row = static_cast<int64>(idx[0]);
+        auto col = static_cast<int64>(idx[1]);
         fn(row, col,
            device_unpack_solver_impl<KernelArgs>::unpack(args,
                                                          default_stride)...);
@@ -63,7 +63,8 @@ void run_kernel_solver(std::shared_ptr<const DpcppExecutor> exec,
 {
     exec->get_queue()->submit([&](sycl::handler& cgh) {
         kernels::dpcpp::generic_kernel_2d_solver(
-            cgh, size[0], size[1], default_stride, fn,
+            cgh, static_cast<int64>(size[0]), static_cast<int64>(size[1]),
+            static_cast<int64>(default_stride), fn,
             kernels::dpcpp::map_to_device(args)...);
     });
 }

diff --git a/dpcpp/test/base/kernel_launch.dp.cpp b/dpcpp/test/base/kernel_launch.dp.cpp
index 27d3f1abd12..decd2e8c64a 100644
--- a/dpcpp/test/base/kernel_launch.dp.cpp
+++ b/dpcpp/test/base/kernel_launch.dp.cpp
@@ -54,6 +54,7 @@ namespace {

 using gko::dim;
+using gko::int64;
 using gko::size_type;
 using std::is_same;

@@ -110,7 +111,7 @@ TEST_F(KernelLaunch, Runs1D)
     gko::kernels::dpcpp::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             d[i] = i;
         },
@@ -125,7 +126,7 @@ TEST_F(KernelLaunch, Runs1DArray)
     gko::kernels::dpcpp::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d, auto d_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             static_assert(is_same<const int*, decltype(d_ptr)>::value, "type");
             if (d == d_ptr) {
@@ -145,7 +146,7 @@ TEST_F(KernelLaunch, Runs1DDense)
     gko::kernels::dpcpp::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<double*, decltype(d)>::value, "type");
             static_assert(is_same<const double*, decltype(d2)>::value,
                           "type");
@@ -177,8 +178,8 @@ TEST_F(KernelLaunch, Runs2D)
     gko::kernels::dpcpp::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
-            static_assert(is_same<size_type, decltype(j)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(j)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             d[i + 4 * j] = 4 * i + j;
         },
@@ -193,8 +194,8 @@ TEST_F(KernelLaunch, Runs2DArray)
     gko::kernels::dpcpp::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
-            static_assert(is_same<size_type, decltype(j)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(j)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             static_assert(is_same<const int*, decltype(d_ptr)>::value, "type");
             if (d == d_ptr) {
@@ -215,11 +216,9 @@ TEST_F(KernelLaunch, Runs2DDense)
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3,
                       auto d4, auto d2_ptr, auto d3_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<double&, decltype(d(0, 0))>::value, "type");
-            static_assert(is_same<size_type, decltype(d.stride)>::value,
-                          "type");
             static_assert(is_same<double&, decltype(d2(0, 0))>::value, "type");
             static_assert(is_same<const double*, decltype(d_ptr)>::value,

diff --git a/hip/base/kernel_launch.hip.hpp b/hip/base/kernel_launch.hip.hpp
index 8967ee5597d..6c627838fea 100644
--- a/hip/base/kernel_launch.hip.hpp
+++ b/hip/base/kernel_launch.hip.hpp
@@ -54,9 +54,9 @@ constexpr int default_block_size = 512;

 template <typename KernelFunction, typename... KernelArgs>
 __global__ __launch_bounds__(default_block_size) void generic_kernel_1d(
-    size_type size, KernelFunction fn, KernelArgs... args)
+    int64 size, KernelFunction fn, KernelArgs... args)
 {
-    auto tidx = thread::get_thread_id_flat();
+    auto tidx = thread::get_thread_id_flat<int64>();
     if (tidx >= size) {
         return;
     }
@@ -66,9 +66,9 @@ __global__ __launch_bounds__(default_block_size) void generic_kernel_1d(

 template <typename KernelFunction, typename... KernelArgs>
 __global__ __launch_bounds__(default_block_size) void generic_kernel_2d(
-    size_type rows, size_type cols, KernelFunction fn, KernelArgs... args)
+    int64 rows, int64 cols, KernelFunction fn, KernelArgs... args)
 {
-    auto tidx = thread::get_thread_id_flat();
+    auto tidx = thread::get_thread_id_flat<int64>();
     auto col = tidx % cols;
     auto row = tidx / cols;
     if (row >= rows) {
@@ -85,8 +85,8 @@ void run_kernel(std::shared_ptr<const HipExecutor> exec, KernelFunction fn,
     gko::hip::device_guard guard{exec->get_device_id()};
     constexpr auto block_size = default_block_size;
     auto num_blocks = ceildiv(size, block_size);
-    hipLaunchKernelGGL(generic_kernel_1d, num_blocks, block_size, 0, 0, size,
-                       fn, map_to_device(args)...);
+    hipLaunchKernelGGL(generic_kernel_1d, num_blocks, block_size, 0, 0,
+                       static_cast<int64>(size), fn, map_to_device(args)...);
 }

@@ -96,8 +96,9 @@ void run_kernel(std::shared_ptr<const HipExecutor> exec, KernelFunction fn,
     gko::hip::device_guard guard{exec->get_device_id()};
     constexpr auto block_size = default_block_size;
     auto num_blocks = ceildiv(size[0] * size[1], block_size);
-    hipLaunchKernelGGL(generic_kernel_2d, num_blocks, block_size, 0, 0, size[0],
-                       size[1], fn, map_to_device(args)...);
+    hipLaunchKernelGGL(generic_kernel_2d, num_blocks, block_size, 0, 0,
+                       static_cast<int64>(size[0]), static_cast<int64>(size[1]),
+                       fn, map_to_device(args)...);
 }

diff --git a/hip/base/kernel_launch_solver.hip.hpp b/hip/base/kernel_launch_solver.hip.hpp
index a8335851a0e..9798f6c4fbc 100644
--- a/hip/base/kernel_launch_solver.hip.hpp
+++ b/hip/base/kernel_launch_solver.hip.hpp
@@ -46,10 +46,10 @@ namespace hip {

 template <typename KernelFunction, typename... KernelArgs>
 __global__ __launch_bounds__(default_block_size) void generic_kernel_2d_solver(
-    size_type rows, size_type cols, size_type default_stride, KernelFunction fn,
+    int64 rows, int64 cols, int64 default_stride, KernelFunction fn,
     KernelArgs... args)
 {
-    auto tidx = thread::get_thread_id_flat();
+    auto tidx = thread::get_thread_id_flat<int64>();
     auto col = tidx % cols;
     auto row = tidx / cols;
     if (row >= rows) {
@@ -69,7 +69,9 @@ void run_kernel_solver(std::shared_ptr<const HipExecutor> exec,
     constexpr auto block_size = kernels::hip::default_block_size;
     auto num_blocks = ceildiv(size[0] * size[1], block_size);
     hipLaunchKernelGGL(kernels::hip::generic_kernel_2d_solver, num_blocks,
-                       block_size, 0, 0, size[0], size[1], default_stride, fn,
+                       block_size, 0, 0, static_cast<int64>(size[0]),
+                       static_cast<int64>(size[1]),
+                       static_cast<int64>(default_stride), fn,
                        kernels::hip::map_to_device(args)...);
 }

diff --git a/hip/test/base/kernel_launch.hip.cpp b/hip/test/base/kernel_launch.hip.cpp
index 55ddb3fd01e..ad3ba3cc643 100644
--- a/hip/test/base/kernel_launch.hip.cpp
+++ b/hip/test/base/kernel_launch.hip.cpp
@@ -54,6 +54,7 @@ namespace {

 using gko::dim;
+using gko::int64;
 using gko::size_type;
 using std::is_same;

@@ -103,7 +104,7 @@ void run1d(std::shared_ptr<gko::HipExecutor> exec, size_type dim, int* data)
     gko::kernels::hip::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             d[i] = i;
         },
@@ -123,7 +124,7 @@ void run1d(std::shared_ptr<gko::HipExecutor> exec, gko::Array<int>& data)
     gko::kernels::hip::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d, auto d_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             static_assert(is_same<const int*, decltype(d_ptr)>::value, "type");
             if (d == d_ptr) {
@@ -148,7 +149,7 @@ void run1d(std::shared_ptr<gko::HipExecutor> exec, gko::matrix::Dense<>* m)
     gko::kernels::hip::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<double*, decltype(d)>::value, "type");
             static_assert(is_same<const double*, decltype(d2)>::value, "type");
@@ -184,8 +185,8 @@ void run2d(std::shared_ptr<gko::HipExecutor> exec, int* data)
     gko::kernels::hip::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
-            static_assert(is_same<size_type, decltype(j)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(j)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             d[i + 4 * j] = 4 * i + j;
         },
@@ -205,8 +206,8 @@ void run2d(std::shared_ptr<gko::HipExecutor> exec, gko::Array<int>& data)
     gko::kernels::hip::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
-            static_assert(is_same<size_type, decltype(j)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(j)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             static_assert(is_same<const int*, decltype(d_ptr)>::value, "type");
             if (d == d_ptr) {
@@ -233,7 +234,7 @@ void run2d(std::shared_ptr<gko::HipExecutor> exec, gko::matrix::Dense<>* m1,
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3,
                       auto d4, auto d2_ptr, auto d3_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<double&, decltype(d(0, 0))>::value, "type");
             static_assert(is_same<double&, decltype(d2(0, 0))>::value, "type");

diff --git a/omp/base/kernel_launch.hpp b/omp/base/kernel_launch.hpp
index 7df6ff4c313..155ba8fd88b 100644
--- a/omp/base/kernel_launch.hpp
+++ b/omp/base/kernel_launch.hpp
@@ -46,48 +46,49 @@ void run_kernel(std::shared_ptr<const OmpExecutor> exec, KernelFunction fn,
                 size_type size, KernelArgs&&... args)
 {
 #pragma omp parallel for
-    for (size_type i = 0; i < size; i++) {
+    for (int64 i = 0; i < static_cast<int64>(size); i++) {
         [&]() { fn(i, map_to_device(args)...); }();
     }
 }

-template <size_type cols, typename KernelFunction,
-          typename... MappedKernelArgs>
+
+template <int64 cols, typename KernelFunction, typename... MappedKernelArgs>
 void run_kernel_fixed_cols_impl(std::shared_ptr<const OmpExecutor> exec,
                                 KernelFunction fn, dim<2> size,
                                 MappedKernelArgs... args)
 {
-    const auto rows = size[0];
+    const auto rows = static_cast<int64>(size[0]);
 #pragma omp parallel for
-    for (size_type row = 0; row < rows; row++) {
+    for (int64 row = 0; row < rows; row++) {
 #pragma unroll
-        for (size_type col = 0; col < cols; col++) {
+        for (int64 col = 0; col < cols; col++) {
             [&]() { fn(row, col, args...); }();
         }
     }
 }

-template <size_type remainder_cols, size_type block_size,
-          typename KernelFunction, typename... MappedKernelArgs>
+template <int64 remainder_cols, int64 block_size, typename KernelFunction,
+          typename... MappedKernelArgs>
 void run_kernel_blocked_cols_impl(std::shared_ptr<const OmpExecutor> exec,
                                   KernelFunction fn, dim<2> size,
                                   MappedKernelArgs... args)
 {
     static_assert(remainder_cols < block_size, "remainder too large");
-    const auto rows = size[0];
-    const auto cols = size[1];
+    const auto rows = static_cast<int64>(size[0]);
+    const auto cols = static_cast<int64>(size[1]);
     const auto rounded_cols = cols / block_size * block_size;
     GKO_ASSERT(rounded_cols + remainder_cols == cols);
 #pragma omp parallel for
-    for (size_type row = 0; row < rows; row++) {
-        for (size_type base_col = 0; base_col < rounded_cols;
+    for (int64 row = 0; row < rows; row++) {
+        for (int64 base_col = 0; base_col < rounded_cols;
              base_col += block_size) {
 #pragma unroll
-            for (size_type i = 0; i < block_size; i++) {
+            for (int64 i = 0; i < block_size; i++) {
                 [&]() { fn(row, base_col + i, args...); }();
             }
         }
 #pragma unroll
-        for (size_type i = 0; i < remainder_cols; i++) {
+        for (int64 i = 0; i < remainder_cols; i++) {
             [&]() { fn(row, rounded_cols + i, args...); }();
         }
     }
@@ -99,7 +100,7 @@ void run_kernel_impl(std::shared_ptr<const OmpExecutor> exec, KernelFunction fn,
 {
     const auto rows = size[0];
     const auto cols = size[1];
-    constexpr size_type block_size = 4;
+    constexpr int64 block_size = 4;
     if (cols <= 0) {
         return;
     }

diff --git a/omp/base/kernel_launch_solver.hpp b/omp/base/kernel_launch_solver.hpp
index dd85ba21915..b5c936c847b 100644
--- a/omp/base/kernel_launch_solver.hpp
+++ b/omp/base/kernel_launch_solver.hpp
@@ -43,7 +43,7 @@ namespace omp {
 template <typename T>
 typename device_unpack_solver_impl<typename to_device_type_impl<T>::type>::type
-map_to_device_solver(T&& param, size_type default_stride)
+map_to_device_solver(T&& param, int64 default_stride)
 {
     return device_unpack_solver_impl<typename to_device_type_impl<T>::type>::
         unpack(to_device_type_impl<T>::map_to_device(param), default_stride);
@@ -55,8 +55,9 @@ void run_kernel_solver(std::shared_ptr<const OmpExecutor> exec,
                        KernelFunction fn, dim<2> size, size_type default_stride,
                        KernelArgs&&... args)
 {
-    run_kernel_impl(exec, fn, size,
-                    map_to_device_solver(args, default_stride)...);
+    run_kernel_impl(
+        exec, fn, size,
+        map_to_device_solver(args, static_cast<int64>(default_stride))...);
 }

diff --git a/omp/test/base/kernel_launch.cpp b/omp/test/base/kernel_launch.cpp
index 2c4712cfa52..dfdf85c3e0e 100644
--- a/omp/test/base/kernel_launch.cpp
+++ b/omp/test/base/kernel_launch.cpp
@@ -54,6 +54,7 @@ namespace {

 using gko::dim;
+using gko::int64;
 using gko::size_type;
 using std::is_same;

@@ -96,7 +97,7 @@ TEST_F(KernelLaunch, Runs1D)
     gko::kernels::omp::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             d[i] = i;
         },
@@ -111,7 +112,7 @@ TEST_F(KernelLaunch, Runs1DArray)
     gko::kernels::omp::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d, auto d_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             static_assert(is_same<const int*, decltype(d_ptr)>::value, "type");
             if (d == d_ptr) {
@@ -131,7 +132,7 @@ TEST_F(KernelLaunch, Runs1DDense)
     gko::kernels::omp::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto d, auto d2, auto d_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<double*, decltype(d)>::value, "type");
             static_assert(is_same<const double*, decltype(d2)>::value, "type");
@@ -163,8 +164,8 @@ TEST_F(KernelLaunch, Runs2D)
     gko::kernels::omp::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
-            static_assert(is_same<size_type, decltype(j)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(j)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             d[i + 4 * j] = 4 * i + j;
         },
@@ -179,8 +180,8 @@ TEST_F(KernelLaunch, Runs2DArray)
     gko::kernels::omp::run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d, auto d_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
-            static_assert(is_same<size_type, decltype(j)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(j)>::value, "index");
             static_assert(is_same<int*, decltype(d)>::value, "type");
             static_assert(is_same<const int*, decltype(d_ptr)>::value, "type");
             if (d == d_ptr) {
@@ -201,7 +202,7 @@ TEST_F(KernelLaunch, Runs2DDense)
         exec,
         [] GKO_KERNEL(auto i, auto j, auto d, auto d2, auto d_ptr, auto d3,
                       auto d4, auto d2_ptr, auto d3_ptr) {
-            static_assert(is_same<size_type, decltype(i)>::value, "index");
+            static_assert(is_same<int64, decltype(i)>::value, "index");
             static_assert(is_same<double&, decltype(d(0, 0))>::value, "type");
             static_assert(is_same<double&, decltype(d2(0, 0))>::value, "type");

From 7124077c66ba0146930c2bd2389517abcc48876e Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Mon, 12 Jul 2021 16:28:25 +0200
Subject: [PATCH 02/25] use synthesizer for omp simple kernels

---
 omp/base/kernel_launch.hpp | 117 +++++++++++++++++--------------------
 1 file changed, 53 insertions(+), 64 deletions(-)

diff --git a/omp/base/kernel_launch.hpp b/omp/base/kernel_launch.hpp
index 155ba8fd88b..79c65ef868e 100644
--- a/omp/base/kernel_launch.hpp
+++ b/omp/base/kernel_launch.hpp
@@ -36,6 +36,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif

+#include "core/synthesizer/implementation_selection.hpp"
+
+
 namespace gko {
 namespace kernels {
 namespace omp {
@@ -52,96 +55,82 @@ void run_kernel(std::shared_ptr<const OmpExecutor> exec, KernelFunction fn,
 }

-template <int64 cols, typename KernelFunction, typename... MappedKernelArgs>
-void run_kernel_fixed_cols_impl(std::shared_ptr<const OmpExecutor> exec,
-                                KernelFunction fn, dim<2> size,
-                                MappedKernelArgs... args)
-{
-    const auto rows = static_cast<int64>(size[0]);
-#pragma omp parallel for
-    for (int64 row = 0; row < rows; row++) {
-#pragma unroll
-        for (int64 col = 0; col < cols; col++) {
-            [&]() { fn(row, col, args...); }();
-        }
-    }
-}
+namespace {

-template <int64 remainder_cols, int64 block_size, typename KernelFunction,
-          typename... MappedKernelArgs>
-void run_kernel_blocked_cols_impl(std::shared_ptr<const OmpExecutor> exec,
-                                  KernelFunction fn, dim<2> size,
-                                  MappedKernelArgs... args)
+
+template <int block_size, int remainder_cols, typename KernelFunction,
+          typename... MappedKernelArgs>
+void run_kernel_sized_impl(syn::value_list<int, remainder_cols>,
+                           std::shared_ptr<const OmpExecutor> exec,
+                           KernelFunction fn, dim<2> size,
+                           MappedKernelArgs... args)
 {
-    static_assert(remainder_cols < block_size, "remainder too large");
     const auto rows = static_cast<int64>(size[0]);
     const auto cols = static_cast<int64>(size[1]);
+    static_assert(remainder_cols < block_size, "remainder too large");
     const auto rounded_cols = cols / block_size * block_size;
     GKO_ASSERT(rounded_cols + remainder_cols == cols);
+    if (rounded_cols == 0) {
 #pragma omp parallel for
-    for (int64 row = 0; row < rows; row++) {
+        for (int64 row = 0; row < rows; row++) {
 #pragma unroll
-        for (int64 base_col = 0; base_col < rounded_cols;
-             base_col += block_size) {
+            for (int64 col = 0; col < remainder_cols; col++) {
+                [&]() { fn(row, col, args...); }();
+            }
+        }
+    } else if (cols == block_size) {
+#pragma omp parallel for
+        for (int64 row = 0; row < rows; row++) {
 #pragma unroll
-            for (int64 i = 0; i < block_size; i++) {
-                [&]() { fn(row, base_col + i, args...); }();
+            for (int64 col = 0; col < block_size; col++) {
+                [&]() { fn(row, col, args...); }();
+            }
+        }
+    } else {
+#pragma omp parallel for
+        for (int64 row = 0; row < rows; row++) {
+            for (int64 base_col = 0; base_col < rounded_cols;
+                 base_col += block_size) {
+#pragma unroll
+                for (int64 i = 0; i < block_size; i++) {
+                    [&]() { fn(row, base_col + i, args...); }();
+                }
             }
-        }
 #pragma unroll
-        for (int64 i = 0; i < remainder_cols; i++) {
-            [&]() { fn(row, rounded_cols + i, args...); }();
+            for (int64 i = 0; i < remainder_cols; i++) {
+                [&]() { fn(row, rounded_cols + i, args...); }();
+            }
         }
     }
 }

+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_sized,
+                                    run_kernel_sized_impl)
+
+
 template <typename KernelFunction, typename... MappedKernelArgs>
 void run_kernel_impl(std::shared_ptr<const OmpExecutor> exec, KernelFunction fn,
                      dim<2> size, MappedKernelArgs... args)
 {
-    const auto rows = size[0];
-    const auto cols = size[1];
-    constexpr int64 block_size = 4;
+    const auto cols = static_cast<int64>(size[1]);
+    constexpr int block_size = 8;
+    using remainders = syn::as_list<syn::range<0, block_size>>;
+
     if (cols <= 0) {
         return;
     }
-    if (cols == 1) {
-        run_kernel_fixed_cols_impl<1>(exec, fn, size, args...);
-        return;
-    }
-    if (cols == 2) {
-        run_kernel_fixed_cols_impl<2>(exec, fn, size, args...);
-        return;
-    }
-    if (cols == 3) {
-        run_kernel_fixed_cols_impl<3>(exec, fn, size, args...);
-        return;
-    }
-    if (cols == 4) {
-        run_kernel_fixed_cols_impl<4>(exec, fn, size, args...);
-        return;
-    }
-    const auto rem_cols = cols % block_size;
-    if (rem_cols == 0) {
-        run_kernel_blocked_cols_impl<0, block_size>(exec, fn, size, args...);
-        return;
-    }
-    if (rem_cols == 1) {
-        run_kernel_blocked_cols_impl<1, block_size>(exec, fn, size, args...);
-        return;
-    }
-    if (rem_cols == 2) {
-        run_kernel_blocked_cols_impl<2, block_size>(exec, fn, size, args...);
-        return;
-    }
-    if (rem_cols == 3) {
-        run_kernel_blocked_cols_impl<3, block_size>(exec, fn, size, args...);
-        return;
-    }
-    // should be unreachable
-    GKO_ASSERT(false);
+    select_run_kernel_sized(
+        remainders(),
+        [&](int remainder) { return remainder == cols % block_size; },
+        syn::value_list<int, block_size>(), syn::type_list<>(), exec, fn, size,
+        args...);
 }

+
+}  // namespace
+
+
 template <typename KernelFunction, typename... KernelArgs>
 void run_kernel(std::shared_ptr<const OmpExecutor> exec, KernelFunction fn,
                 dim<2> size, KernelArgs&&... args)

From 789a46c6f33adf70065e046e356e5d6875128a91 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Tue, 13 Jul 2021 18:47:01 +0200
Subject: [PATCH 03/25] add simple reduction kernels

---
 common/cuda_hip/components/reduction.hpp.inc |   7 +-
 common/unified/base/kernel_launch.hpp        |   2 -
 .../unified/base/kernel_launch_reduction.hpp |  51 +++++
 cuda/base/kernel_launch_reduction.cuh        | 193 +++++++++++++++++
 cuda/components/reduction.cuh                |  13 +-
 cuda/test/base/kernel_launch.cu              |  59 ++++++
 hip/base/kernel_launch_reduction.hip.hpp     | 197 ++++++++++++++++++
 hip/components/reduction.hip.hpp             |  21 +-
 hip/test/base/kernel_launch.hip.cpp          |  59 ++++++
 omp/base/kernel_launch.hpp                   |  17 +-
 omp/base/kernel_launch_reduction.hpp         | 182 ++++++++++++++++
 omp/test/base/kernel_launch.cpp              |  79 +++++++
 12 files changed, 849 insertions(+), 31 deletions(-)
 create mode 100644 common/unified/base/kernel_launch_reduction.hpp
 create mode 100644 cuda/base/kernel_launch_reduction.cuh
 create mode 100644 hip/base/kernel_launch_reduction.hip.hpp
 create mode 100644 omp/base/kernel_launch_reduction.hpp

diff --git a/common/cuda_hip/components/reduction.hpp.inc b/common/cuda_hip/components/reduction.hpp.inc
index 9b4ed4cc8c7..3853fca6d44 100644
--- a/common/cuda_hip/components/reduction.hpp.inc
+++ b/common/cuda_hip/components/reduction.hpp.inc
@@ -208,14 +208,15 @@ __device__ void reduce_array(size_type size,
  *
  * Computes a reduction using the add operation (+) on an array
  * `source` of any size. Has to be called a second time on `result` to reduce
- * an array larger than `default_block_size`.
+ * an array larger than `default_reduce_block_size`.
  */
 template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void reduce_add_array(
+__global__ __launch_bounds__(default_reduce_block_size) void reduce_add_array(
     size_type size, const ValueType* __restrict__ source,
     ValueType* __restrict__ result)
 {
-    __shared__ UninitializedArray<ValueType, default_block_size> block_sum;
+    __shared__ UninitializedArray<ValueType, default_reduce_block_size>
+        block_sum;
     reduce_array(size, source, static_cast<ValueType*>(block_sum),
                  [](const ValueType& x, const ValueType& y) { return x + y; });

diff --git a/common/unified/base/kernel_launch.hpp b/common/unified/base/kernel_launch.hpp
index bf403d3a656..0e25671c58a 100644
--- a/common/unified/base/kernel_launch.hpp
+++ b/common/unified/base/kernel_launch.hpp
@@ -268,8 +268,6 @@ typename to_device_type_impl<T>::type map_to_device(T&& param)
 }  // namespace gko

-// these files include this file again to make inclusion work from both sides,
-// this does not lead to issues due to the header guards.
 #if defined(GKO_COMPILING_CUDA)
 #include "cuda/base/kernel_launch.cuh"
 #elif defined(GKO_COMPILING_HIP)

diff --git a/common/unified/base/kernel_launch_reduction.hpp b/common/unified/base/kernel_launch_reduction.hpp
new file mode 100644
index 00000000000..78de06466aa
--- /dev/null
+++ b/common/unified/base/kernel_launch_reduction.hpp
@@ -0,0 +1,51 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_
+#define GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_
+
+
+#include "common/unified/base/kernel_launch.hpp"
+
+
+#if defined(GKO_COMPILING_CUDA)
+#include "cuda/base/kernel_launch_reduction.cuh"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/base/kernel_launch_reduction.hip.hpp"
+#elif defined(GKO_COMPILING_DPCPP)
+#include "dpcpp/base/kernel_launch_reduction.dp.hpp"
+#elif defined(GKO_COMPILING_OMP)
+#include "omp/base/kernel_launch_reduction.hpp"
+#endif
+
+
+#endif  // GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_

diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh
new file mode 100644
index 00000000000..3a661366b53
--- /dev/null
+++ b/cuda/base/kernel_launch_reduction.cuh
@@ -0,0 +1,193 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_
+#error \
+    "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp"
+#endif
+
+
+#include "cuda/base/device_guard.hpp"
+#include "cuda/base/types.hpp"
+#include "cuda/components/cooperative_groups.cuh"
+#include "cuda/components/reduction.cuh"
+#include "cuda/components/thread_ids.cuh"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+
+
+template <typename ValueType, typename KernelFunction, typename ReductionOp,
+          typename FinalizeOp, typename... KernelArgs>
+__global__ __launch_bounds__(
+    default_block_size) void generic_kernel_reduction_1d(int64 size,
+                                                         KernelFunction fn,
+                                                         ReductionOp op,
+                                                         FinalizeOp finalize,
+                                                         ValueType init,
+                                                         ValueType* storage,
+                                                         KernelArgs... args)
+{
+    __shared__
+        UninitializedArray<ValueType, default_block_size / config::warp_size>
+            warp_partial;
+    static_assert(default_block_size / config::warp_size <= config::warp_size,
+                  "needs third reduction level");
+    auto tidx = thread::get_thread_id_flat<int64>();
+    auto grid_size = thread::get_thread_num_flat<int64>();
+    auto warp =
+        group::tiled_partition<config::warp_size>(group::this_thread_block());
+    auto partial = init;
+    for (int64 i = tidx; i < size; i += grid_size) {
+        partial = op(partial, fn(i, args...));
+    }
+    partial = reduce(warp, partial, op);
+    if (warp.thread_rank() == 0) {
+        warp_partial[threadIdx.x / config::warp_size] = partial;
+    }
+    __syncthreads();
+    if (threadIdx.x < config::warp_size) {
+        storage[blockIdx.x] =
+            finalize(reduce(warp, warp_partial[threadIdx.x], op));
+    }
+}
+
+
+template <typename ValueType, typename KernelFunction, typename ReductionOp,
+          typename FinalizeOp, typename... KernelArgs>
+__global__ __launch_bounds__(
+    default_block_size) void generic_kernel_reduction_2d(int64 rows, int64 cols,
+                                                         KernelFunction fn,
+                                                         ReductionOp op,
+                                                         FinalizeOp finalize,
+                                                         ValueType init,
+                                                         ValueType* storage,
+                                                         KernelArgs... args)
+{
+    __shared__
+        UninitializedArray<ValueType, default_block_size / config::warp_size>
+            warp_partial;
+    static_assert(default_block_size / config::warp_size <= config::warp_size,
+                  "needs third reduction level");
+    auto tidx = thread::get_thread_id_flat<int64>();
+    auto grid_size = thread::get_thread_num_flat<int64>();
+    auto warp =
+        group::tiled_partition<config::warp_size>(group::this_thread_block());
+    auto partial = init;
+    for (int64 i = tidx; i < rows * cols; i += grid_size) {
+        const auto row = i / cols;
+        const auto col = i % cols;
+        partial = op(partial, fn(row, col, args...));
+    }
+    partial = reduce(warp, partial, op);
+    if (warp.thread_rank() == 0) {
+        warp_partial[threadIdx.x / config::warp_size] = partial;
+    }
+    __syncthreads();
+    if (threadIdx.x < config::warp_size) {
+        storage[blockIdx.x] =
+            finalize(reduce(warp, warp_partial[threadIdx.x], op));
+    }
+}
+
+
+template <typename ValueType, typename KernelFunction, typename ReductionOp,
+          typename FinalizeOp, typename... KernelArgs>
+void run_kernel_reduction(std::shared_ptr<const CudaExecutor> exec,
+                          KernelFunction fn, ReductionOp op,
+                          FinalizeOp finalize, ValueType init,
+                          ValueType* result, size_type size,
+                          KernelArgs&&... args)
+{
+    constexpr int oversubscription = 4;
+    gko::cuda::device_guard guard{exec->get_device_id()};
+    constexpr auto block_size = default_block_size;
+    const auto num_blocks = std::min<int64>(
+        ceildiv(size, block_size), exec->get_num_warps() * oversubscription);
+    if (num_blocks > 1) {
+        Array<ValueType> partial{exec, static_cast<size_type>(num_blocks)};
+        generic_kernel_reduction_1d<<<num_blocks, block_size>>>(
+            static_cast<int64>(size), fn, op,
+            [] __device__(auto v) { return v; }, as_cuda_type(init),
+            as_cuda_type(partial.get_data()), map_to_device(args)...);
+        generic_kernel_reduction_1d<<<1, block_size>>>(
+            static_cast<int64>(num_blocks),
+            [] __device__(auto i, auto v) { return v[i]; }, op, finalize,
+            as_cuda_type(init), as_cuda_type(result),
+            as_cuda_type(partial.get_const_data()));
+    } else {
+        generic_kernel_reduction_1d<<<1, block_size>>>(
+            static_cast<int64>(size), fn, op, finalize, as_cuda_type(init),
+            as_cuda_type(result), map_to_device(args)...);
+    }
+}
+
+
+template <typename ValueType, typename KernelFunction, typename ReductionOp,
+          typename FinalizeOp, typename... KernelArgs>
+void run_kernel_reduction(std::shared_ptr<const CudaExecutor> exec,
+                          KernelFunction fn, ReductionOp op,
+                          FinalizeOp finalize, ValueType init,
+                          ValueType* result, dim<2> size, KernelArgs&&... args)
+{
+    constexpr int oversubscription = 4;
+    gko::cuda::device_guard guard{exec->get_device_id()};
+    constexpr auto block_size = default_block_size;
+    const auto rows = static_cast<int64>(size[0]);
+    const auto cols = static_cast<int64>(size[1]);
+    const auto num_blocks =
+        std::min<int64>(ceildiv(rows * cols, block_size),
+                        exec->get_num_warps() * oversubscription);
+    if (num_blocks > 1) {
+        Array<ValueType> partial{exec, static_cast<size_type>(num_blocks)};
+        generic_kernel_reduction_2d<<<num_blocks, block_size>>>(
+            rows, cols, fn, op, [] __device__(auto v) { return v; },
+            as_cuda_type(init), as_cuda_type(partial.get_data()),
+            map_to_device(args)...);
+        generic_kernel_reduction_1d<<<1, block_size>>>(
+            static_cast<int64>(num_blocks),
+            [] __device__(auto i, auto v) { return v[i]; }, op, finalize,
+            as_cuda_type(init), as_cuda_type(result),
+            as_cuda_type(partial.get_const_data()));
+    } else {
+        generic_kernel_reduction_2d<<<1, block_size>>>(
+            rows, cols, fn, op, finalize, as_cuda_type(init),
+            as_cuda_type(result), map_to_device(args)...);
+    }
+}
+
+
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko

diff --git a/cuda/components/reduction.cuh b/cuda/components/reduction.cuh
index 95ac3d8a417..8e0a962145e 100644
--- a/cuda/components/reduction.cuh
+++ b/cuda/components/reduction.cuh
@@ -53,7 +53,7 @@ namespace kernels {
 namespace cuda {

-constexpr int default_block_size = 512;
+constexpr int default_reduce_block_size = 512;

 #include "common/cuda_hip/components/reduction.hpp.inc"
@@ -75,13 +75,14 @@ __host__ ValueType reduce_add_array(std::shared_ptr<const CudaExecutor> exec,
     auto block_results_val = source;
     size_type grid_dim = size;
     auto block_results = Array<ValueType>(exec);
-    if (size > default_block_size) {
-        const auto n = ceildiv(size, default_block_size);
-        grid_dim = (n <= default_block_size) ? n : default_block_size;
+    if (size > default_reduce_block_size) {
+        const auto n = ceildiv(size, default_reduce_block_size);
+        grid_dim =
+            (n <= default_reduce_block_size) ? n : default_reduce_block_size;

         block_results.resize_and_reset(grid_dim);

-        reduce_add_array<<<grid_dim, default_block_size>>>(
+        reduce_add_array<<<grid_dim, default_reduce_block_size>>>(
             size, as_cuda_type(source),
             as_cuda_type(block_results.get_data()));

         block_results_val = block_results.get_const_data();
@@ -89,7 +90,7 @@ __host__ ValueType reduce_add_array(std::shared_ptr<const CudaExecutor> exec,

     auto d_result = Array<ValueType>(exec, 1);

-    reduce_add_array<<<1, default_block_size>>>(
+    reduce_add_array<<<1, default_reduce_block_size>>>(
         grid_dim, as_cuda_type(block_results_val),
         as_cuda_type(d_result.get_data()));
     auto answer = exec->copy_val_to_host(d_result.get_const_data());

diff --git a/cuda/test/base/kernel_launch.cu b/cuda/test/base/kernel_launch.cu
index adf443445a5..8e78e3ee830 100644
--- a/cuda/test/base/kernel_launch.cu
+++ b/cuda/test/base/kernel_launch.cu
@@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include +#include "common/unified/base/kernel_launch_reduction.hpp" #include "common/unified/base/kernel_launch_solver.hpp" #include "core/test/utils.hpp" @@ -276,4 +277,62 @@ TEST_F(KernelLaunch, Runs2DDense) } +void run1d_reduction(std::shared_ptr exec) +{ + gko::Array output{exec, 1}; + gko::kernels::cuda::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i) { + static_assert(is_same::value, "index"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), + size_type{100000}); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000ll); + + gko::kernels::cuda::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i) { + static_assert(is_same::value, "index"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), + size_type{100}); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100ll); +} + +TEST_F(KernelLaunch, Reduction1D) { run1d_reduction(exec); } + + +void run2d_reduction(std::shared_ptr exec) +{ + gko::Array output{exec, 1}; + gko::kernels::cuda::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "index"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), + gko::dim<2>{1000, 100}); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000ll); + + gko::kernels::cuda::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "index"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), + gko::dim<2>{10, 10}); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100ll); +} + +TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } + + } // namespace diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp new file mode 100644 index 00000000000..1075acc8198 --- /dev/null +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -0,0 +1,197 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_COMMON_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ +#error \ + "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp" +#endif + + +#include "hip/base/device_guard.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/reduction.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { + + +template +__global__ __launch_bounds__( + default_block_size) void generic_kernel_reduction_1d(int64 size, + KernelFunction fn, + ReductionOp op, + FinalizeOp finalize, + ValueType init, + ValueType* storage, + KernelArgs... args) +{ + __shared__ + UninitializedArray + warp_partial; + static_assert(default_block_size / config::warp_size <= config::warp_size, + "needs third reduction level"); + auto tidx = thread::get_thread_id_flat(); + auto grid_size = thread::get_thread_num_flat(); + auto warp = + group::tiled_partition(group::this_thread_block()); + auto partial = init; + for (int64 i = tidx; i < size; i += grid_size) { + partial = op(partial, fn(i, args...)); + } + partial = reduce(warp, partial, op); + if (warp.thread_rank() == 0) { + warp_partial[threadIdx.x / config::warp_size] = partial; + } + __syncthreads(); + if (threadIdx.x < config::warp_size) { + storage[blockIdx.x] = + finalize(reduce(warp, warp_partial[threadIdx.x], op)); + } +} + + +template +__global__ __launch_bounds__( + default_block_size) void generic_kernel_reduction_2d(int64 rows, int64 cols, + KernelFunction fn, + ReductionOp op, + FinalizeOp finalize, + ValueType init, + ValueType* storage, + KernelArgs... args) +{ + __shared__ + UninitializedArray + warp_partial; + static_assert(default_block_size / config::warp_size <= config::warp_size, + "needs third reduction level"); + auto tidx = thread::get_thread_id_flat(); + auto grid_size = thread::get_thread_num_flat(); + auto warp = + group::tiled_partition(group::this_thread_block()); + auto partial = init; + for (int64 i = tidx; i < rows * cols; i += grid_size) { + const auto row = i / cols; + const auto col = i % cols; + partial = op(partial, fn(row, col, args...)); + } + partial = reduce(warp, partial, op); + if (warp.thread_rank() == 0) { + warp_partial[threadIdx.x / config::warp_size] = partial; + } + __syncthreads(); + if (threadIdx.x < config::warp_size) { + storage[blockIdx.x] = + finalize(reduce(warp, warp_partial[threadIdx.x], op)); + } +} + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, size_type size, + KernelArgs&&... 
args) +{ + constexpr int oversubscription = 4; + gko::hip::device_guard guard{exec->get_device_id()}; + constexpr auto block_size = default_block_size; + const auto num_blocks = std::min( + ceildiv(size, block_size), exec->get_num_warps() * oversubscription); + if (num_blocks > 1) { + Array partial{exec, static_cast(num_blocks)}; + hipLaunchKernelGGL( + generic_kernel_reduction_1d, num_blocks, block_size, 0, 0, + static_cast(size), fn, op, + [] __device__(auto v) { return v; }, as_hip_type(init), + as_hip_type(partial.get_data()), map_to_device(args)...); + hipLaunchKernelGGL( + generic_kernel_reduction_1d, 1, block_size, 0, 0, + static_cast(num_blocks), + [] __device__(auto i, auto v) { return v[i]; }, op, finalize, + as_hip_type(init), as_hip_type(result), + as_hip_type(partial.get_const_data())); + } else { + hipLaunchKernelGGL(generic_kernel_reduction_1d, 1, block_size, 0, 0, + static_cast(size), fn, op, finalize, + as_hip_type(init), as_hip_type(result), + map_to_device(args)...); + } +} + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, dim<2> size, KernelArgs&&... args) +{ + constexpr int oversubscription = 4; + gko::hip::device_guard guard{exec->get_device_id()}; + constexpr auto block_size = default_block_size; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_blocks = + std::min(ceildiv(rows * cols, block_size), + exec->get_num_warps() * oversubscription); + if (num_blocks > 1) { + Array partial{exec, static_cast(num_blocks)}; + generic_kernel_reduction_2d<<>>( + rows, cols, fn, op, [] __device__(auto v) { return v; }, + as_hip_type(init), as_hip_type(partial.get_data()), + map_to_device(args)...); + hipLaunchKernelGGL( + generic_kernel_reduction_1d, 1, block_size, 0, 0, + static_cast(num_blocks), + [] __device__(auto i, auto v) { return v[i]; }, op, finalize, + as_hip_type(init), as_hip_type(result), + as_hip_type(partial.get_const_data())); + } else { + hipLaunchKernelGGL(generic_kernel_reduction_2d, 1, block_size, 0, 0, + rows, cols, fn, op, finalize, as_hip_type(init), + as_hip_type(result), map_to_device(args)...); + } +} + + +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/components/reduction.hip.hpp b/hip/components/reduction.hip.hpp index 87d6b518123..7850b9f65a1 100644 --- a/hip/components/reduction.hip.hpp +++ b/hip/components/reduction.hip.hpp @@ -55,7 +55,7 @@ namespace kernels { namespace hip { -constexpr int default_block_size = 512; +constexpr int default_reduce_block_size = 512; #include "common/cuda_hip/components/reduction.hpp.inc" @@ -77,23 +77,26 @@ __host__ ValueType reduce_add_array(std::shared_ptr exec, auto block_results_val = source; size_type grid_dim = size; auto block_results = Array(exec); - if (size > default_block_size) { - const auto n = ceildiv(size, default_block_size); - grid_dim = (n <= default_block_size) ? n : default_block_size; + if (size > default_reduce_block_size) { + const auto n = ceildiv(size, default_reduce_block_size); + grid_dim = + (n <= default_reduce_block_size) ? 
n : default_reduce_block_size; block_results.resize_and_reset(grid_dim); - hipLaunchKernelGGL( - reduce_add_array, dim3(grid_dim), dim3(default_block_size), 0, 0, - size, as_hip_type(source), as_hip_type(block_results.get_data())); + hipLaunchKernelGGL(reduce_add_array, dim3(grid_dim), + dim3(default_reduce_block_size), 0, 0, size, + as_hip_type(source), + as_hip_type(block_results.get_data())); block_results_val = block_results.get_const_data(); } auto d_result = Array(exec, 1); - hipLaunchKernelGGL(reduce_add_array, dim3(1), dim3(default_block_size), 0, - 0, grid_dim, as_hip_type(block_results_val), + hipLaunchKernelGGL(reduce_add_array, dim3(1), + dim3(default_reduce_block_size), 0, 0, grid_dim, + as_hip_type(block_results_val), as_hip_type(d_result.get_data())); auto answer = exec->copy_val_to_host(d_result.get_const_data()); return answer; diff --git a/hip/test/base/kernel_launch.hip.cpp b/hip/test/base/kernel_launch.hip.cpp index ad3ba3cc643..4fb5ef0a4dc 100644 --- a/hip/test/base/kernel_launch.hip.cpp +++ b/hip/test/base/kernel_launch.hip.cpp @@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "common/unified/base/kernel_launch_reduction.hpp" #include "common/unified/base/kernel_launch_solver.hpp" #include "core/test/utils.hpp" @@ -275,4 +276,62 @@ TEST_F(KernelLaunch, Runs2DDense) } +void run1d_reduction(std::shared_ptr exec) +{ + gko::Array output{exec, 1}; + gko::kernels::hip::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i) { + static_assert(is_same::value, "index"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), + size_type{100000}); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000ll); + + gko::kernels::hip::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i) { + static_assert(is_same::value, "index"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), + size_type{100}); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100ll); +} + +TEST_F(KernelLaunch, Reduction1D) { run1d_reduction(exec); } + + +void run2d_reduction(std::shared_ptr exec) +{ + gko::Array output{exec, 1}; + gko::kernels::hip::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "index"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), + gko::dim<2>{1000, 100}); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000ll); + + gko::kernels::hip::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "index"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), + gko::dim<2>{10, 10}); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100ll); +} + +TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } + + } // namespace diff --git a/omp/base/kernel_launch.hpp b/omp/base/kernel_launch.hpp index 79c65ef868e..98432c2d155 100644 --- a/omp/base/kernel_launch.hpp +++ b/omp/base/kernel_launch.hpp @@ -70,23 +70,19 @@ void run_kernel_sized_impl(syn::value_list, static_assert(remainder_cols < block_size, "remainder too large"); const auto rounded_cols = cols / block_size * block_size; GKO_ASSERT(rounded_cols + 
remainder_cols == cols); - if (rounded_cols == 0) { + if (rounded_cols == 0 || cols == block_size) { + // we group all sizes <= block_size here and unroll explicitly + constexpr auto local_cols = + remainder_cols == 0 ? block_size : remainder_cols; #pragma omp parallel for for (int64 row = 0; row < rows; row++) { #pragma unroll - for (int64 col = 0; col < remainder_cols; col++) { - [&]() { fn(row, col, args...); }(); - } - } - } else if (cols == block_size) { -#pragma omp parallel for - for (int64 row = 0; row < rows; row++) { -#pragma unroll - for (int64 col = 0; col < block_size; col++) { + for (int64 col = 0; col < local_cols; col++) { [&]() { fn(row, col, args...); }(); } } } else { + // we operate in block_size blocks plus an explicitly unrolled remainder #pragma omp parallel for for (int64 row = 0; row < rows; row++) { for (int64 base_col = 0; base_col < rounded_cols; @@ -104,7 +100,6 @@ void run_kernel_sized_impl(syn::value_list, } } - GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_sized, run_kernel_sized_impl) diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp new file mode 100644 index 00000000000..4981d4ed902 --- /dev/null +++ b/omp/base/kernel_launch_reduction.hpp @@ -0,0 +1,182 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_COMMON_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ +#error \ + "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp" +#endif + + +#include + + +#include + + +namespace gko { +namespace kernels { +namespace omp { + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, size_type size, + KernelArgs&&... 
args) +{ + const auto num_threads = static_cast(omp_get_max_threads()); + const auto ssize = static_cast(size); + const auto work_per_thread = ceildiv(ssize, num_threads); + Array partial{exec, static_cast(num_threads)}; +#pragma omp parallel num_threads(num_threads) + { + const auto thread_id = omp_get_thread_num(); + const auto begin = thread_id * work_per_thread; + const auto end = std::min(ssize, begin + work_per_thread); + + auto local_partial = init; + for (auto i = begin; i < end; i++) { + local_partial = op(local_partial, [&]() { + return fn(i, map_to_device(args)...); + }()); + } + partial.get_data()[thread_id] = local_partial; + } + *result = finalize(std::accumulate(partial.get_const_data(), + partial.get_const_data() + num_threads, + init, op)); +} + + +namespace { + + +template +void run_kernel_reduction_sized_impl(syn::value_list, + std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, dim<2> size, + MappedKernelArgs... args) +{ + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_threads = static_cast(omp_get_max_threads()); + const auto work_per_thread = ceildiv(rows, num_threads); + Array partial{exec, static_cast(num_threads)}; + static_assert(remainder_cols < block_size, "remainder too large"); + const auto rounded_cols = cols / block_size * block_size; + GKO_ASSERT(rounded_cols + remainder_cols == cols); +#pragma omp parallel + { + const auto thread_id = omp_get_thread_num(); + const auto begin = thread_id * work_per_thread; + const auto end = std::min(rows, begin + work_per_thread); + + auto local_partial = init; + if (rounded_cols == 0 || cols == block_size) { + // we group all sizes <= block_size here and unroll explicitly + constexpr auto local_cols = + remainder_cols == 0 ? block_size : remainder_cols; + for (auto row = begin; row < end; row++) { +#pragma unroll + for (int64 col = 0; col < local_cols; col++) { + local_partial = op(local_partial, [&]() { + return fn(row, col, args...); + }()); + } + } + } else { + // we operate in block_size blocks plus an explicitly unrolled + // remainder + for (auto row = begin; row < end; row++) { + for (int64 base_col = 0; base_col < rounded_cols; + base_col += block_size) { +#pragma unroll + for (int64 i = 0; i < block_size; i++) { + local_partial = op(local_partial, [&]() { + return fn(row, base_col + i, args...); + }()); + } + } +#pragma unroll + for (int64 i = 0; i < remainder_cols; i++) { + local_partial = op(local_partial, [&]() { + return fn(row, rounded_cols + i, args...); + }()); + } + } + } + partial.get_data()[thread_id] = local_partial; + } + *result = finalize(std::accumulate(partial.get_const_data(), + partial.get_const_data() + num_threads, + init, op)); +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_sized, + run_kernel_reduction_sized_impl) + + +} // namespace + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, dim<2> size, KernelArgs&&... 
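+    // editorial note (not in the original patch): the 2D variant dispatches
+    // on cols % block_size so the innermost column loop of the sized kernel
+    // has a compile-time trip count and can be fully unrolled. Conceptually
+    // the selection below behaves like a switch over precompiled
+    // instantiations, roughly:
+    //   switch (cols % block_size) {
+    //   case 0: run_kernel_reduction_sized_impl<8, 0>(...); break;
+    //   case 1: run_kernel_reduction_sized_impl<8, 1>(...); break;
+    //   /* ... through case 7 */
+    //   }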
args)
+{
+    const auto cols = static_cast<int64>(size[1]);
+    constexpr int block_size = 8;
+    using remainders = syn::as_list<syn::range<0, block_size>>;
+
+    if (cols <= 0) {
+        *result = finalize(init);
+        return;
+    }
+    select_run_kernel_reduction_sized(
+        remainders(),
+        [&](int remainder) { return remainder == cols % block_size; },
+        syn::value_list<int, block_size>(), syn::type_list<>(), exec, fn, op,
+        finalize, init, result, size, args...);
+}
+
+
+} // namespace omp
+} // namespace kernels
+} // namespace gko
diff --git a/omp/test/base/kernel_launch.cpp b/omp/test/base/kernel_launch.cpp
index dfdf85c3e0e..6f13797b85f 100644
--- a/omp/test/base/kernel_launch.cpp
+++ b/omp/test/base/kernel_launch.cpp
@@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include
 
 
+#include "common/unified/base/kernel_launch_reduction.hpp"
 #include "common/unified/base/kernel_launch_solver.hpp"
 #include "core/test/utils.hpp"
 
@@ -239,5 +240,83 @@ TEST_F(KernelLaunch, Runs2DDense)
     GKO_ASSERT_MTX_NEAR(zero_dense2, iota_dense, 0.0);
 }
 
+TEST_F(KernelLaunch, Reduction1D)
+{
+    gko::Array<gko::int64> output{exec, 1};
+    gko::kernels::omp::run_kernel_reduction(
+        exec,
+        [] GKO_KERNEL(auto i) {
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            return i + 1;
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(),
+        size_type{100000});
+    ASSERT_EQ(*output.get_const_data(), 10000100000ll);
+
+    gko::kernels::omp::run_kernel_reduction(
+        exec,
+        [] GKO_KERNEL(auto i) {
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            return i + 1;
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(),
+        size_type{10});
+    ASSERT_EQ(*output.get_const_data(), 110ll);
+}
+
+
+TEST_F(KernelLaunch, Reduction2DSmallRows)
+{
+    gko::Array<gko::int64> output{exec, 1};
+    for (int cols = 0; cols < 17; cols++) {
+        gko::kernels::omp::run_kernel_reduction(
+            exec,
+            [] GKO_KERNEL(auto i, auto j) {
+                static_assert(is_same<int64, decltype(i)>::value, "index");
+                return (i + 1) * (j + 1);
+            },
+            [] GKO_KERNEL(auto i, auto j) { return i + j; },
+            [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(),
+            gko::dim<2>{10, cols});
+        ASSERT_EQ(*output.get_const_data(), 110ll * cols * (cols + 1));
+    }
+}
+
+
+TEST_F(KernelLaunch, Reduction2DLargeRows)
+{
+    gko::Array<gko::int64> output{exec, 1};
+    for (int cols = 0; cols < 17; cols++) {
+        gko::kernels::omp::run_kernel_reduction(
+            exec,
+            [] GKO_KERNEL(auto i, auto j) {
+                static_assert(is_same<int64, decltype(i)>::value, "index");
+                return (i + 1) * (j + 1);
+            },
+            [] GKO_KERNEL(auto i, auto j) { return i + j; },
+            [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(),
+            gko::dim<2>{1000, cols});
+        ASSERT_EQ(*output.get_const_data(), 1001000ll * cols * (cols + 1));
+    }
+}
+
+
+TEST_F(KernelLaunch, Reduction2D)
+{
+    gko::Array<gko::int64> output{exec, 1};
+    gko::kernels::omp::run_kernel_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto j) {
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            return (i + 1) * (j + 1);
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(),
+        gko::dim<2>{1000, 100});
+    ASSERT_EQ(*output.get_const_data(), 10110100000ll);
+}
+
 
 } // namespace
From 12cb03c3f4e06410d37f867f09984f3e7db88e56 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Wed, 14 Jul 2021 02:30:43 +0200
Subject: [PATCH 04/25] add row and column reduction kernels

---
 cuda/base/kernel_launch_reduction.cuh | 235 +++++++++++++++++++++-
 cuda/test/base/kernel_launch.cu       |  57 +++++++
 2 files changed, 288 insertions(+), 
4 deletions(-) diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh index 3a661366b53..08849a90a4a 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/device_guard.hpp" #include "cuda/base/types.hpp" #include "cuda/components/cooperative_groups.cuh" @@ -78,8 +79,10 @@ __global__ __launch_bounds__( } __syncthreads(); if (threadIdx.x < config::warp_size) { - storage[blockIdx.x] = - finalize(reduce(warp, warp_partial[threadIdx.x], op)); + partial = reduce(warp, warp_partial[threadIdx.x], op); + if (threadIdx.x == 0) { + storage[blockIdx.x] = finalize(partial); + } } } @@ -116,8 +119,10 @@ __global__ __launch_bounds__( } __syncthreads(); if (threadIdx.x < config::warp_size) { - storage[blockIdx.x] = - finalize(reduce(warp, warp_partial[threadIdx.x], op)); + partial = reduce(warp, warp_partial[threadIdx.x], op); + if (threadIdx.x == 0) { + storage[blockIdx.x] = finalize(partial); + } } } @@ -188,6 +193,228 @@ void run_kernel_reduction(std::shared_ptr exec, } +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( + int64 rows, int64 cols, int64 col_parts, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* result, + int64 result_stride, KernelArgs... args) +{ + const auto idx = thread::get_subwarp_id_flat(); + const auto row = idx % rows; + const auto col_part = idx / rows; + if (col_part >= col_parts) { + return; + } + const auto cols_per_part = ceildiv(cols, col_parts); + // TODO use boundaries divisible by subwarp_size + const auto begin = cols_per_part * col_part; + const auto end = min(begin + cols_per_part, cols); + auto subwarp = + group::tiled_partition(group::this_thread_block()); + auto partial = init; + for (auto col = begin + subwarp.thread_rank(); col < end; + col += subwarp_size) { + partial = op(partial, fn(row, col, args...)); + } + partial = reduce(subwarp, partial, op); + result[(row * col_parts + col_part) * result_stride] = finalize(partial); +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d( + int64 rows, int64 cols, int64 row_parts, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* result, + KernelArgs... 
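+// editorial note (not in the original patch): each thread of this kernel
+// owns one (column, row block) pair: the flat thread id decomposes into
+// col = idx % cols and row_part = idx / cols, and the thread then walks
+// rows [begin, end) of its column serially. With row_parts == 1 the value
+// written to result[col] is final; otherwise the partials stored as
+// result[col * row_parts + row_part] are combined by the finalize kernel
+// below. For example, rows = 1000 with row_parts = 4 yields four partials
+// per column covering 250 rows each.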
args) +{ + const auto idx = thread::get_thread_id_flat(); + const auto col = idx % cols; + const auto row_part = idx / cols; + if (row_part >= row_parts) { + return; + } + const auto rows_per_part = ceildiv(rows, row_parts); + const auto begin = rows_per_part * row_part; + const auto end = min(begin + rows_per_part, rows); + auto partial = init; + for (auto row = begin; row < end; row++) { + partial = op(partial, fn(row, col, args...)); + } + result[col * row_parts + row_part] = finalize(partial); +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( + int64 num_results, int64 num_parts, ReductionOp op, FinalizeOp finalize, + ValueType init, const ValueType* input, int64 result_stride, + ValueType* result) +{ + const auto idx = thread::get_subwarp_id_flat(); + if (idx >= num_results) { + return; + } + auto subwarp = + group::tiled_partition(group::this_thread_block()); + auto partial = init; + for (int64 part = subwarp.thread_rank(); part < num_parts; + part += subwarp_size) { + partial = op(partial, input[idx * num_parts + part]); + } + partial = reduce(subwarp, partial, op); + if (subwarp.thread_rank() == 0) { + result[idx * result_stride] = finalize(partial); + } +} + + +namespace { + + +template +void run_generic_kernel_row_reduction(syn::value_list, + int64 rows, int64 cols, int64 col_parts, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, int64 result_stride, + KernelArgs... args) +{ + constexpr auto block_size = default_block_size; + const auto num_blocks = ceildiv(rows * cols * subwarp_size, block_size); + generic_kernel_row_reduction_2d<<>>( + rows, cols, col_parts, fn, op, finalize, as_cuda_type(init), + as_cuda_type(result), result_stride, args...); +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, + run_generic_kernel_row_reduction) + + +template +void run_kernel_reduction_finalize(syn::value_list, + int64 num_results, int64 num_parts, + ReductionOp op, FinalizeOp finalize, + ValueType init, const ValueType* input, + int64 result_stride, ValueType* result) +{ + constexpr auto block_size = default_block_size; + const auto num_blocks = ceildiv(num_results * subwarp_size, block_size); + generic_kernel_reduction_finalize_2d + <<>>(num_results, num_parts, op, finalize, + as_cuda_type(init), as_cuda_type(input), + static_cast(result_stride), + as_cuda_type(result)); +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_finalize, + run_kernel_reduction_finalize) + + +} // namespace + + +template +void run_kernel_row_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, size_type result_stride, + dim<2> size, KernelArgs&&... 
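+// editorial note (not in the original patch): row reductions use one subwarp
+// per row; the predicate passed to the selection below picks the smallest
+// precompiled subwarp size that covers `cols`, falling back to a full warp
+// for wide rows. `col_parts` is fixed to 1 until tuned (see the TODO), so
+// the multi-part branch with the extra finalize pass is currently unused.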
args) +{ + using subwarp_sizes = + syn::value_list; + constexpr int oversubscription = 4; + gko::cuda::device_guard guard{exec->get_device_id()}; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto resources = exec->get_num_warps() * oversubscription; + const auto col_parts = 1; // TODO tune + if (col_parts > 1) { + Array partial{exec, + static_cast(col_parts * rows)}; + select_run_generic_kernel_row_reduction( + subwarp_sizes(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= cols || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), rows, cols, col_parts, + fn, op, [] __device__(auto i) { return i; }, init, + partial.get_data(), 1, map_to_device(args)...); + select_run_kernel_reduction_finalize( + subwarp_sizes(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= col_parts || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), rows, col_parts, op, + finalize, init, partial.get_const_data(), + static_cast(result_stride), result); + } else { + select_run_generic_kernel_row_reduction( + subwarp_sizes(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= cols || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), rows, cols, 1, fn, op, + finalize, init, result, static_cast(result_stride), + map_to_device(args)...); + } +} + + +template +void run_kernel_col_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, dim<2> size, + KernelArgs&&... args) +{ + constexpr int oversubscription = 4; + gko::cuda::device_guard guard{exec->get_device_id()}; + constexpr auto block_size = default_block_size; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto resources = + exec->get_num_warps() * config::warp_size * oversubscription; + const auto num_blocks = ceildiv(rows * cols, block_size); + const auto row_parts = 1; // TODO tune + if (row_parts > 1) { + Array partial{exec, + static_cast(row_parts * cols)}; + generic_kernel_col_reduction_2d<<>>( + rows, cols, row_parts, fn, op, [] __device__(auto i) { return i; }, + as_cuda_type(init), as_cuda_type(partial.get_data()), + map_to_device(args)...); + using subwarp_sizes = + syn::value_list; + select_run_kernel_reduction_finalize( + subwarp_sizes(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= row_parts || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), cols, row_parts, op, + finalize, as_cuda_type(init), + as_cuda_type(partial.get_const_data()), 1, as_cuda_type(result)); + } else { + generic_kernel_col_reduction_2d<<>>( + rows, cols, 1, fn, op, finalize, as_cuda_type(init), + as_cuda_type(result), map_to_device(args)...); + } +} + + } // namespace cuda } // namespace kernels } // namespace gko diff --git a/cuda/test/base/kernel_launch.cu b/cuda/test/base/kernel_launch.cu index 8e78e3ee830..c6a4f1c679f 100644 --- a/cuda/test/base/kernel_launch.cu +++ b/cuda/test/base/kernel_launch.cu @@ -335,4 +335,61 @@ void run2d_reduction(std::shared_ptr exec) TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } +void run2d_row_reduction(std::shared_ptr exec) +{ + int num_rows = 1000; + int num_cols = 100; + gko::Array host_ref{exec->get_master(), + static_cast(2 * num_rows)}; + std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); + gko::Array output{exec, 
host_ref}; + for (int i = 0; i < num_rows; i++) { + host_ref.get_data()[2 * i] = num_cols * (num_cols + 1) * (i + 1); + } + + gko::kernels::cuda::run_kernel_row_reduction( + exec, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "index"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2, + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); +} + +TEST_F(KernelLaunch, ReductionRow2D) { run2d_row_reduction(exec); } + + +void run2d_col_reduction(std::shared_ptr exec) +{ + int num_rows = 1000; + int num_cols = 100; + gko::Array host_ref{exec->get_master(), + static_cast(num_cols)}; + gko::Array output{exec, static_cast(num_cols)}; + for (int i = 0; i < num_cols; i++) { + host_ref.get_data()[i] = num_rows * (num_rows + 1) * (i + 1); + } + + gko::kernels::cuda::run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "index"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); +} + +TEST_F(KernelLaunch, ReductionCol2D) { run2d_col_reduction(exec); } + + } // namespace From 2232a44ec6a0b770625b984e335fc261b3ec7e70 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 14 Jul 2021 13:43:46 +0200 Subject: [PATCH 05/25] add HIP reduction support --- hip/base/kernel_launch_reduction.hip.hpp | 237 ++++++++++++++++++++++- hip/test/base/kernel_launch.hip.cpp | 57 ++++++ 2 files changed, 290 insertions(+), 4 deletions(-) diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp index 1075acc8198..fe4b697bc30 100644 --- a/hip/base/kernel_launch_reduction.hip.hpp +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#include "core/synthesizer/implementation_selection.hpp" #include "hip/base/device_guard.hip.hpp" #include "hip/base/types.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" @@ -78,8 +79,10 @@ __global__ __launch_bounds__( } __syncthreads(); if (threadIdx.x < config::warp_size) { - storage[blockIdx.x] = - finalize(reduce(warp, warp_partial[threadIdx.x], op)); + partial = reduce(warp, warp_partial[threadIdx.x], op); + if (threadIdx.x == 0) { + storage[blockIdx.x] = finalize(partial); + } } } @@ -174,8 +177,9 @@ void run_kernel_reduction(std::shared_ptr exec, exec->get_num_warps() * oversubscription); if (num_blocks > 1) { Array partial{exec, static_cast(num_blocks)}; - generic_kernel_reduction_2d<<>>( - rows, cols, fn, op, [] __device__(auto v) { return v; }, + hipLaunchKernelGGL( + generic_kernel_reduction_2d, num_blocks, block_size, 0, 0, rows, + cols, fn, op, [] __device__(auto v) { return v; }, as_hip_type(init), as_hip_type(partial.get_data()), map_to_device(args)...); hipLaunchKernelGGL( @@ -192,6 +196,231 @@ void run_kernel_reduction(std::shared_ptr exec, } +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( + int64 rows, int64 cols, int64 col_parts, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* result, + int64 result_stride, KernelArgs... 
args) +{ + const auto idx = thread::get_subwarp_id_flat(); + const auto row = idx % rows; + const auto col_part = idx / rows; + if (col_part >= col_parts) { + return; + } + const auto cols_per_part = ceildiv(cols, col_parts); + // TODO use boundaries divisible by subwarp_size + const auto begin = cols_per_part * col_part; + const auto end = min(begin + cols_per_part, cols); + auto subwarp = + group::tiled_partition(group::this_thread_block()); + auto partial = init; + for (auto col = begin + subwarp.thread_rank(); col < end; + col += subwarp_size) { + partial = op(partial, fn(row, col, args...)); + } + partial = reduce(subwarp, partial, op); + result[(row * col_parts + col_part) * result_stride] = finalize(partial); +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d( + int64 rows, int64 cols, int64 row_parts, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* result, + KernelArgs... args) +{ + const auto idx = thread::get_thread_id_flat(); + const auto col = idx % cols; + const auto row_part = idx / cols; + if (row_part >= row_parts) { + return; + } + const auto rows_per_part = ceildiv(rows, row_parts); + const auto begin = rows_per_part * row_part; + const auto end = min(begin + rows_per_part, rows); + auto partial = init; + for (auto row = begin; row < end; row++) { + partial = op(partial, fn(row, col, args...)); + } + result[col * row_parts + row_part] = finalize(partial); +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( + int64 num_results, int64 num_parts, ReductionOp op, FinalizeOp finalize, + ValueType init, const ValueType* input, int64 result_stride, + ValueType* result) +{ + const auto idx = thread::get_subwarp_id_flat(); + if (idx >= num_results) { + return; + } + auto subwarp = + group::tiled_partition(group::this_thread_block()); + auto partial = init; + for (int64 part = subwarp.thread_rank(); part < num_parts; + part += subwarp_size) { + partial = op(partial, input[idx * num_parts + part]); + } + partial = reduce(subwarp, partial, op); + if (subwarp.thread_rank() == 0) { + result[idx * result_stride] = finalize(partial); + } +} + + +namespace { + + +template +void run_generic_kernel_row_reduction(syn::value_list, + int64 rows, int64 cols, int64 col_parts, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, int64 result_stride, + KernelArgs... 
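+// editorial note (not in the original patch): in contrast to the CUDA
+// backend's <<<grid, block>>> launches, HIP goes through hipLaunchKernelGGL,
+// and HIP_KERNEL_NAME wraps the kernel so that the commas in its template
+// argument list are not misread as macro argument separators.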
args) +{ + constexpr auto block_size = default_block_size; + const auto num_blocks = ceildiv(rows * cols * subwarp_size, block_size); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(generic_kernel_row_reduction_2d), + num_blocks, block_size, 0, 0, rows, cols, col_parts, fn, op, finalize, + as_hip_type(init), as_hip_type(result), result_stride, args...); +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, + run_generic_kernel_row_reduction) + + +template +void run_kernel_reduction_finalize(syn::value_list, + int64 num_results, int64 num_parts, + ReductionOp op, FinalizeOp finalize, + ValueType init, const ValueType* input, + int64 result_stride, ValueType* result) +{ + constexpr auto block_size = default_block_size; + const auto num_blocks = ceildiv(num_results * subwarp_size, block_size); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(generic_kernel_reduction_finalize_2d), + num_blocks, block_size, 0, 0, num_results, num_parts, op, finalize, + as_hip_type(init), as_hip_type(input), + static_cast(result_stride), as_hip_type(result)); +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_finalize, + run_kernel_reduction_finalize) + + +} // namespace + + +template +void run_kernel_row_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, size_type result_stride, + dim<2> size, KernelArgs&&... args) +{ + using subwarp_sizes = + syn::value_list; + constexpr int oversubscription = 4; + gko::hip::device_guard guard{exec->get_device_id()}; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto resources = exec->get_num_warps() * oversubscription; + const auto col_parts = 1; // TODO tune + if (col_parts > 1) { + Array partial{exec, + static_cast(col_parts * rows)}; + select_run_generic_kernel_row_reduction( + subwarp_sizes(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= cols || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), rows, cols, col_parts, + fn, op, [] __device__(auto i) { return i; }, init, + partial.get_data(), 1, map_to_device(args)...); + select_run_kernel_reduction_finalize( + subwarp_sizes(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= col_parts || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), rows, col_parts, op, + finalize, init, partial.get_const_data(), + static_cast(result_stride), result); + } else { + select_run_generic_kernel_row_reduction( + subwarp_sizes(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= cols || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), rows, cols, 1, fn, op, + finalize, init, result, static_cast(result_stride), + map_to_device(args)...); + } +} + + +template +void run_kernel_col_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, dim<2> size, + KernelArgs&&... 
args) +{ + constexpr int oversubscription = 4; + gko::hip::device_guard guard{exec->get_device_id()}; + constexpr auto block_size = default_block_size; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto resources = + exec->get_num_warps() * config::warp_size * oversubscription; + const auto num_blocks = ceildiv(rows * cols, block_size); + const auto row_parts = 1; // TODO tune + if (row_parts > 1) { + Array partial{exec, + static_cast(row_parts * cols)}; + hipLaunchKernelGGL( + generic_kernel_col_reduction_2d, num_blocks, block_size, 0, 0, rows, + cols, row_parts, fn, op, [] __device__(auto i) { return i; }, + as_hip_type(init), as_hip_type(partial.get_data()), + map_to_device(args)...); + using subwarp_sizes = + syn::value_list; + select_run_kernel_reduction_finalize( + subwarp_sizes(), + [&](int compiled_subwarp_size) { + return compiled_subwarp_size >= row_parts || + compiled_subwarp_size == config::warp_size; + }, + syn::value_list(), syn::type_list<>(), cols, row_parts, op, + finalize, as_hip_type(init), as_hip_type(partial.get_const_data()), + 1, as_hip_type(result)); + } else { + hipLaunchKernelGGL(generic_kernel_col_reduction_2d, num_blocks, + block_size, 0, 0, rows, cols, 1, fn, op, finalize, + as_hip_type(init), as_hip_type(result), + map_to_device(args)...); + } +} + + } // namespace hip } // namespace kernels } // namespace gko diff --git a/hip/test/base/kernel_launch.hip.cpp b/hip/test/base/kernel_launch.hip.cpp index 4fb5ef0a4dc..849d8b45161 100644 --- a/hip/test/base/kernel_launch.hip.cpp +++ b/hip/test/base/kernel_launch.hip.cpp @@ -334,4 +334,61 @@ void run2d_reduction(std::shared_ptr exec) TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } +void run2d_row_reduction(std::shared_ptr exec) +{ + int num_rows = 1000; + int num_cols = 100; + gko::Array host_ref{exec->get_master(), + static_cast(2 * num_rows)}; + std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); + gko::Array output{exec, host_ref}; + for (int i = 0; i < num_rows; i++) { + host_ref.get_data()[2 * i] = num_cols * (num_cols + 1) * (i + 1); + } + + gko::kernels::hip::run_kernel_row_reduction( + exec, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "index"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2, + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); +} + +TEST_F(KernelLaunch, ReductionRow2D) { run2d_row_reduction(exec); } + + +void run2d_col_reduction(std::shared_ptr exec) +{ + int num_rows = 1000; + int num_cols = 100; + gko::Array host_ref{exec->get_master(), + static_cast(num_cols)}; + gko::Array output{exec, static_cast(num_cols)}; + for (int i = 0; i < num_cols; i++) { + host_ref.get_data()[i] = num_rows * (num_rows + 1) * (i + 1); + } + + gko::kernels::hip::run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "index"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); +} + +TEST_F(KernelLaunch, ReductionCol2D) { run2d_col_reduction(exec); } + + } // namespace From 98bc7df16266e906a058e04d495a0ecd5673a682 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 15 Jul 2021 21:38:48 +0200 Subject: [PATCH 06/25] 
add row and column OpenMP reduction kernels --- omp/base/kernel_launch_reduction.hpp | 193 +++++++++++++++++++++++++++ omp/test/base/kernel_launch.cpp | 86 ++++++++++++ 2 files changed, 279 insertions(+) diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index 4981d4ed902..dbc055fffd6 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -47,6 +47,10 @@ namespace kernels { namespace omp { +// how many more reduction tasks we launch relative to the number of threads +constexpr int reduction_kernel_oversubscription = 4; + + template void run_kernel_reduction(std::shared_ptr exec, @@ -177,6 +181,195 @@ void run_kernel_reduction(std::shared_ptr exec, } +template +void run_kernel_row_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, size_type result_stride, + dim<2> size, KernelArgs&&... args) +{ + constexpr int block_size = 8; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_threads = static_cast(omp_get_max_threads()); + if (rows <= 0) { + return; + } + // enough work to keep all threads busy or only very small reduction sizes + if (rows >= reduction_kernel_oversubscription * num_threads || + cols < rows) { +#pragma omp parallel for + for (int64 row = 0; row < rows; row++) { + auto partial = init; + for (int64 col = 0; col < cols; col++) { + partial = + op(partial, [&]() { return fn(row, col, args...); }()); + } + result[result_stride * row] = finalize(partial); + } + } else { + // small number of rows and large reduction sizes: do partial sum first + const auto work_per_thread = ceildiv(cols, num_threads); + Array partial{exec, + static_cast(rows * num_threads)}; +#pragma omp parallel num_threads(num_threads) + { + const auto thread_id = static_cast(omp_get_thread_num()); + const auto begin = thread_id * work_per_thread; + const auto end = std::min(begin + work_per_thread, cols); + for (int64 row = 0; row < rows; row++) { + auto local_partial = init; + for (int64 col = begin; col < end; col++) { + local_partial = op(local_partial, [&]() { + return fn(row, col, args...); + }()); + } + partial.get_data()[row * num_threads + thread_id] = + local_partial; + } + } + // then accumulate the partial sums and write to result +#pragma omp parallel for + for (int64 row = 0; row < rows; row++) { + auto local_partial = init; + for (int64 thread_id = 0; thread_id < num_threads; thread_id++) { + local_partial = + op(local_partial, + partial.get_const_data()[row * num_threads + thread_id]); + } + result[row * result_stride] = finalize(local_partial); + } + } +} + + +namespace { + + +template +void run_kernel_col_reduction_sized_block_impl( + KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType init, + ValueType* result, int64 row_begin, int64 row_end, int64 base_col, + MappedKernelArgs... 
args) +{ + std::array partial; + partial.fill(init); + for (auto row = row_begin; row < row_end; row++) { +#pragma unroll + for (int64 rel_col = 0; rel_col < local_cols; rel_col++) { + partial[rel_col] = op(partial[rel_col], [&]() { + return fn(row, base_col + rel_col, args...); + }()); + } + } +#pragma unroll + for (int64 rel_col = 0; rel_col < local_cols; rel_col++) { + result[base_col + rel_col] = finalize(partial[rel_col]); + } +} + + +template +void run_kernel_col_reduction_sized_impl( + syn::value_list, + std::shared_ptr exec, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, ValueType* result, dim<2> size, + MappedKernelArgs... args) +{ + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_threads = static_cast(omp_get_max_threads()); + static_assert(remainder_cols < block_size, "remainder too large"); + GKO_ASSERT(cols % block_size == remainder_cols); + const auto num_col_blocks = ceildiv(cols, block_size); + // enough work to keep all threads busy or only very small reduction sizes + if (cols >= reduction_kernel_oversubscription * num_threads || + rows < cols) { +#pragma omp parallel for + for (int64 col_block = 0; col_block < num_col_blocks; col_block++) { + const auto base_col = col_block * block_size; + if (base_col + block_size <= cols) { + run_kernel_col_reduction_sized_block_impl( + fn, op, finalize, init, result, 0, rows, base_col); + } else { + run_kernel_col_reduction_sized_block_impl( + fn, op, finalize, init, result, 0, rows, base_col); + } + } + } else { + // number of blocks that need to be reduced afterwards + const auto reduction_size = + ceildiv(reduction_kernel_oversubscription * num_threads, cols); + const auto rows_per_thread = ceildiv(rows, reduction_size); + Array partial{exec, + static_cast(reduction_size * cols)}; +#pragma omp parallel for + for (int64 i = 0; i < reduction_size * num_col_blocks; i++) { + const auto col_block = i % num_col_blocks; + const auto row_block = i / num_col_blocks; + const auto begin = row_block * rows_per_thread; + const auto end = std::min(begin + rows_per_thread, rows); + const auto base_col = col_block * block_size; + const auto identity = [](auto i) { return i; }; + if (base_col + block_size <= cols) { + run_kernel_col_reduction_sized_block_impl( + fn, op, identity, init, + partial.get_data() + cols * row_block, begin, end, + base_col); + } else { + run_kernel_col_reduction_sized_block_impl( + fn, op, identity, init, + partial.get_data() + cols * row_block, begin, end, + base_col); + } + } +#pragma omp parallel for + for (int64 col = 0; col < cols; col++) { + auto total = init; + for (int64 row_block = 0; row_block < reduction_size; row_block++) { + total = + op(total, partial.get_const_data()[col + cols * row_block]); + } + result[col] = finalize(total); + } + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_col_reduction_sized, + run_kernel_col_reduction_sized_impl) + + +} // namespace + + +template +void run_kernel_col_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, dim<2> size, + KernelArgs&&... 
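+    // editorial note (not in the original patch): the same cols % block_size
+    // trick as in run_kernel_reduction above selects a precompiled remainder.
+    // Inside, work is parallelized over column blocks when
+    // cols >= reduction_kernel_oversubscription * num_threads (or when
+    // rows < cols); otherwise the rows are additionally split and a second
+    // combining pass accumulates the partial results per column.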
args)
+{
+    constexpr auto block_size = 8;
+    using remainders = syn::as_list<syn::range<0, block_size>>;
+    const auto rows = static_cast<int64>(size[0]);
+    const auto cols = static_cast<int64>(size[1]);
+    if (cols <= 0) {
+        return;
+    }
+    select_run_kernel_col_reduction_sized(
+        remainders(),
+        [&](int remainder) { return remainder == cols % block_size; },
+        syn::value_list<int, block_size>(), syn::type_list<>(), exec, fn, op,
+        finalize, init, result, size, args...);
+}
+
+
+} // namespace omp
+} // namespace kernels
+} // namespace gko
diff --git a/omp/test/base/kernel_launch.cpp b/omp/test/base/kernel_launch.cpp
index 6f13797b85f..7184649c3ae 100644
--- a/omp/test/base/kernel_launch.cpp
+++ b/omp/test/base/kernel_launch.cpp
@@ -319,4 +319,90 @@ TEST_F(KernelLaunch, Reduction2D)
 }
 
 
+TEST_F(KernelLaunch, ReductionRow2DSmall)
+{
+    // 4 rows, with oversubscription this means we use multiple threads per row
+    // if OMP_NUM_THREADS >= 2
+    int num_rows = 4;
+    int num_cols = 100;
+    gko::Array<gko::int64> host_ref{exec->get_master(),
+                                    static_cast<gko::size_type>(2 * num_rows)};
+    std::fill_n(host_ref.get_data(), 2 * num_rows, 1234);
+    gko::Array<gko::int64> output{exec, host_ref};
+    for (int i = 0; i < num_rows; i++) {
+        host_ref.get_data()[2 * i] = num_cols * (num_cols + 1) * (i + 1);
+    }
+
+    gko::kernels::omp::run_kernel_row_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto j) {
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            return (i + 1) * (j + 1);
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2,
+        gko::dim<2>{static_cast<gko::size_type>(num_rows),
+                    static_cast<gko::size_type>(num_cols)});
+
+    GKO_ASSERT_ARRAY_EQ(host_ref, output);
+}
+
+
+TEST_F(KernelLaunch, ReductionRow2D)
+{
+    int num_rows = 1000;
+    int num_cols = 100;
+    gko::Array<gko::int64> host_ref{exec->get_master(),
+                                    static_cast<gko::size_type>(2 * num_rows)};
+    std::fill_n(host_ref.get_data(), 2 * num_rows, 1234);
+    gko::Array<gko::int64> output{exec, host_ref};
+    for (int i = 0; i < num_rows; i++) {
+        host_ref.get_data()[2 * i] = num_cols * (num_cols + 1) * (i + 1);
+    }
+
+    gko::kernels::omp::run_kernel_row_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto j) {
+            static_assert(is_same<int64, decltype(i)>::value, "index");
+            return (i + 1) * (j + 1);
+        },
+        [] GKO_KERNEL(auto i, auto j) { return i + j; },
+        [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2,
+        gko::dim<2>{static_cast<gko::size_type>(num_rows),
+                    static_cast<gko::size_type>(num_cols)});
+
+    GKO_ASSERT_ARRAY_EQ(host_ref, output);
+}
+
+
+TEST_F(KernelLaunch, ReductionCol2D)
+{
+    for (int num_rows : {0, 1, 10, 100, 1000}) {
+        for (int num_cols :
+             {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 40, 100, 1000}) {
+            gko::Array<gko::int64> host_ref{
+                exec->get_master(), static_cast<gko::size_type>(num_cols)};
+            gko::Array<gko::int64> output{
+                exec, static_cast<gko::size_type>(num_cols)};
+            for (int i = 0; i < num_cols; i++) {
+                host_ref.get_data()[i] = num_rows * (num_rows + 1) * (i + 1);
+            }
+
+            gko::kernels::omp::run_kernel_col_reduction(
+                exec,
+                [] GKO_KERNEL(auto i, auto j) {
+                    static_assert(is_same<int64, decltype(i)>::value, "index");
+                    return (i + 1) * (j + 1);
+                },
+                [] GKO_KERNEL(auto i, auto j) { return i + j; },
+                [] GKO_KERNEL(auto j) { return j * 2; }, int64{},
+                output.get_data(),
+                gko::dim<2>{static_cast<gko::size_type>(num_rows),
+                            static_cast<gko::size_type>(num_cols)});
+
+            GKO_ASSERT_ARRAY_EQ(host_ref, output);
+        }
+    }
+}
+
 
 } // namespace
From 41d92b4bf56896734dc79ee69f464727c5218d5b Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Thu, 15 Jul 2021 23:56:42 +0200
Subject: [PATCH 07/25] fix types and tests

---
 cuda/test/base/kernel_launch.cu      | 38 +++++++++-----
 hip/test/base/kernel_launch.hip.cpp  | 38 +++++++++-----
 omp/base/kernel_launch.hpp           | 22 +++++---
omp/base/kernel_launch_reduction.hpp | 78 ++++++++++++++++++---------- omp/test/base/kernel_launch.cpp | 52 +++++++++++++------ 5 files changed, 155 insertions(+), 73 deletions(-) diff --git a/cuda/test/base/kernel_launch.cu b/cuda/test/base/kernel_launch.cu index c6a4f1c679f..6a5494e03fa 100644 --- a/cuda/test/base/kernel_launch.cu +++ b/cuda/test/base/kernel_launch.cu @@ -280,26 +280,31 @@ TEST_F(KernelLaunch, Runs2DDense) void run1d_reduction(std::shared_ptr exec) { gko::Array output{exec, 1}; + gko::kernels::cuda::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i) { + [] GKO_KERNEL(auto i, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return i + 1; }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), - size_type{100000}); + size_type{100000}, output); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000ll); gko::kernels::cuda::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i) { + [] GKO_KERNEL(auto i, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return i + 1; }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), - size_type{100}); + size_type{100}, output); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100ll); } @@ -309,26 +314,31 @@ TEST_F(KernelLaunch, Reduction1D) { run1d_reduction(exec); } void run2d_reduction(std::shared_ptr exec) { gko::Array output{exec, 1}; + gko::kernels::cuda::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j) { + [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{1000, 100}); + gko::dim<2>{1000, 100}, output); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000ll); gko::kernels::cuda::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j) { + [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{10, 10}); + gko::dim<2>{10, 10}, output); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100ll); } @@ -349,14 +359,16 @@ void run2d_row_reduction(std::shared_ptr exec) gko::kernels::cuda::run_kernel_row_reduction( exec, - [] GKO_KERNEL(auto i, auto j) { + [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2, gko::dim<2>{static_cast(num_rows), - static_cast(num_cols)}); + static_cast(num_cols)}, + output); GKO_ASSERT_ARRAY_EQ(host_ref, output); } @@ -377,14 +389,16 @@ void run2d_col_reduction(std::shared_ptr exec) gko::kernels::cuda::run_kernel_col_reduction( exec, - [] GKO_KERNEL(auto i, auto j) { + [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 
gko::dim<2>{static_cast(num_rows), - static_cast(num_cols)}); + static_cast(num_cols)}, + output); GKO_ASSERT_ARRAY_EQ(host_ref, output); } diff --git a/hip/test/base/kernel_launch.hip.cpp b/hip/test/base/kernel_launch.hip.cpp index 849d8b45161..755f8b3834d 100644 --- a/hip/test/base/kernel_launch.hip.cpp +++ b/hip/test/base/kernel_launch.hip.cpp @@ -279,26 +279,31 @@ TEST_F(KernelLaunch, Runs2DDense) void run1d_reduction(std::shared_ptr exec) { gko::Array output{exec, 1}; + gko::kernels::hip::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i) { + [] GKO_KERNEL(auto i, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return i + 1; }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), - size_type{100000}); + size_type{100000}, output); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000ll); gko::kernels::hip::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i) { + [] GKO_KERNEL(auto i, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return i + 1; }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), - size_type{100}); + size_type{100}, output); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100ll); } @@ -308,26 +313,31 @@ TEST_F(KernelLaunch, Reduction1D) { run1d_reduction(exec); } void run2d_reduction(std::shared_ptr exec) { gko::Array output{exec, 1}; + gko::kernels::hip::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j) { + [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{1000, 100}); + gko::dim<2>{1000, 100}, output); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000ll); gko::kernels::hip::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j) { + [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{10, 10}); + gko::dim<2>{10, 10}, output); + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100ll); } @@ -348,14 +358,16 @@ void run2d_row_reduction(std::shared_ptr exec) gko::kernels::hip::run_kernel_row_reduction( exec, - [] GKO_KERNEL(auto i, auto j) { + [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2, gko::dim<2>{static_cast(num_rows), - static_cast(num_cols)}); + static_cast(num_cols)}, + output); GKO_ASSERT_ARRAY_EQ(host_ref, output); } @@ -376,14 +388,16 @@ void run2d_col_reduction(std::shared_ptr exec) gko::kernels::hip::run_kernel_col_reduction( exec, - [] GKO_KERNEL(auto i, auto j) { + [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), gko::dim<2>{static_cast(num_rows), - 
static_cast(num_cols)}); + static_cast(num_cols)}, + output); GKO_ASSERT_ARRAY_EQ(host_ref, output); } diff --git a/omp/base/kernel_launch.hpp b/omp/base/kernel_launch.hpp index 98432c2d155..b6ef373fdee 100644 --- a/omp/base/kernel_launch.hpp +++ b/omp/base/kernel_launch.hpp @@ -44,20 +44,20 @@ namespace kernels { namespace omp { -template -void run_kernel(std::shared_ptr exec, KernelFunction fn, - size_type size, KernelArgs&&... args) +namespace { + + +template +void run_kernel_impl(std::shared_ptr exec, KernelFunction fn, + size_type size, MappedKernelArgs... args) { #pragma omp parallel for for (int64 i = 0; i < static_cast(size); i++) { - [&]() { fn(i, map_to_device(args)...); }(); + [&]() { fn(i, args...); }(); } } -namespace { - - template void run_kernel_sized_impl(syn::value_list, @@ -126,6 +126,14 @@ void run_kernel_impl(std::shared_ptr exec, KernelFunction fn, } // namespace +template +void run_kernel(std::shared_ptr exec, KernelFunction fn, + size_type size, KernelArgs&&... args) +{ + run_kernel_impl(exec, fn, size, map_to_device(args)...); +} + + template void run_kernel(std::shared_ptr exec, KernelFunction fn, dim<2> size, KernelArgs&&... args) diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index dbc055fffd6..84758549918 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -51,13 +51,16 @@ namespace omp { constexpr int reduction_kernel_oversubscription = 4; +namespace { + + template -void run_kernel_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, - ValueType* result, size_type size, - KernelArgs&&... args) + typename FinalizeOp, typename... MappedKernelArgs> +void run_kernel_reduction_impl(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, size_type size, + MappedKernelArgs... args) { const auto num_threads = static_cast(omp_get_max_threads()); const auto ssize = static_cast(size); @@ -83,9 +86,6 @@ void run_kernel_reduction(std::shared_ptr exec, } -namespace { - - template @@ -158,6 +158,19 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_sized, } // namespace +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, size_type size, + KernelArgs&&... args) +{ + run_kernel_reduction_impl(exec, fn, op, finalize, init, result, size, + map_to_device(args)...); +} + + template void run_kernel_reduction(std::shared_ptr exec, @@ -177,17 +190,20 @@ void run_kernel_reduction(std::shared_ptr exec, remainders(), [&](int remainder) { return remainder == cols % block_size; }, syn::value_list(), syn::type_list<>(), exec, fn, op, - finalize, init, result, size, args...); + finalize, init, result, size, map_to_device(args)...); } +namespace { + + template -void run_kernel_row_reduction(std::shared_ptr exec, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, - ValueType* result, size_type result_stride, - dim<2> size, KernelArgs&&... args) + typename FinalizeOp, typename... MappedKernelArgs> +void run_kernel_row_reduction_impl(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, size_type result_stride, + dim<2> size, MappedKernelArgs... 
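+    // editorial note (not in the original patch): two regimes are handled
+    // here: with enough rows to keep all threads busy (or when rows
+    // outnumber columns), each row is reduced by a single thread in one
+    // parallel loop; with few long rows, the threads instead split the
+    // columns, write rows * num_threads partial results, and a second
+    // parallel pass combines them per row. `result_stride` is measured in
+    // elements of `result`.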
args)
 {
     constexpr int block_size = 8;
     const auto rows = static_cast<int64>(size[0]);
@@ -244,9 +260,6 @@ void run_kernel_row_reduction(std::shared_ptr<const DefaultExecutor> exec,
     }
 }
 
-namespace {
-
 
 template <typename ValueType, typename KernelFunction, typename ReductionOp,
@@ -295,10 +308,10 @@ void run_kernel_col_reduction_sized_impl(
             const auto base_col = col_block * block_size;
             if (base_col + block_size <= cols) {
                 run_kernel_col_reduction_sized_block_impl<block_size>(
-                    fn, op, finalize, init, result, 0, rows, base_col);
+                    fn, op, finalize, init, result, 0, rows, base_col, args...);
             } else {
                 run_kernel_col_reduction_sized_block_impl<remainder_cols>(
-                    fn, op, finalize, init, result, 0, rows, base_col);
+                    fn, op, finalize, init, result, 0, rows, base_col, args...);
             }
         }
     } else {
@@ -319,13 +332,13 @@ void run_kernel_col_reduction_sized_impl(
             if (base_col + block_size <= cols) {
                 run_kernel_col_reduction_sized_block_impl<block_size>(
                     fn, op, identity, init,
-                    partial.get_data() + cols * row_block, begin, end,
-                    base_col);
+                    partial.get_data() + cols * row_block, begin, end, base_col,
+                    args...);
             } else {
                 run_kernel_col_reduction_sized_block_impl<remainder_cols>(
                     fn, op, identity, init,
-                    partial.get_data() + cols * row_block, begin, end,
-                    base_col);
+                    partial.get_data() + cols * row_block, begin, end, base_col,
+                    args...);
             }
         }
 #pragma omp parallel for
@@ -347,6 +360,19 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_col_reduction_sized,
 } // namespace
 
 
+template <typename ValueType, typename KernelFunction, typename ReductionOp,
+          typename FinalizeOp, typename... KernelArgs>
+void run_kernel_row_reduction(std::shared_ptr<const DefaultExecutor> exec,
+                              KernelFunction fn, ReductionOp op,
+                              FinalizeOp finalize, ValueType init,
+                              ValueType* result, size_type result_stride,
+                              dim<2> size, KernelArgs&&... args)
+{
+    run_kernel_row_reduction_impl(exec, fn, op, finalize, init, result,
+                                  result_stride, size, map_to_device(args)...);
+}
+
+
 template <typename ValueType, typename KernelFunction, typename ReductionOp,
@@ -366,7 +392,7 @@ void run_kernel_col_reduction(std::shared_ptr<const DefaultExecutor> exec,
         remainders(),
         [&](int remainder) { return remainder == cols % block_size; },
        syn::value_list<int, block_size>(), syn::type_list<>(), exec, fn, op,
-        finalize, init, result, size, args...);
+        finalize, init, result, size, map_to_device(args)...);
 }
 
 
diff --git a/omp/test/base/kernel_launch.cpp b/omp/test/base/kernel_launch.cpp
index 7184649c3ae..01c39514cdb 100644
--- a/omp/test/base/kernel_launch.cpp
+++ b/omp/test/base/kernel_launch.cpp
@@ -243,26 +243,31 @@
 TEST_F(KernelLaunch, Reduction1D)
 {
     gko::Array<gko::int64> output{exec, 1};
+
     gko::kernels::omp::run_kernel_reduction(
         exec,
-        [] GKO_KERNEL(auto i) {
+        [] GKO_KERNEL(auto i, auto a) {
             static_assert(is_same<int64, decltype(i)>::value, "index");
+            static_assert(is_same<int64*, decltype(a)>::value, "value");
             return i + 1;
         },
         [] GKO_KERNEL(auto i, auto j) { return i + j; },
         [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(),
-        size_type{100000});
+        size_type{100000}, output);
+
     ASSERT_EQ(*output.get_const_data(), 10000100000ll);
 
     gko::kernels::omp::run_kernel_reduction(
         exec,
-        [] GKO_KERNEL(auto i) {
+        [] GKO_KERNEL(auto i, auto a) {
             static_assert(is_same<int64, decltype(i)>::value, "index");
+            static_assert(is_same<int64*, decltype(a)>::value, "value");
             return i + 1;
         },
         [] GKO_KERNEL(auto i, auto j) { return i + j; },
         [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(),
-        size_type{10});
+        size_type{10}, output);
+
     ASSERT_EQ(*output.get_const_data(), 110ll);
 }
 
@@ -270,16 +275,19 @@
 TEST_F(KernelLaunch, Reduction2DSmallRows)
 {
     gko::Array<gko::int64> output{exec, 1};
+
     for (int cols = 0; cols < 17; cols++) {
         gko::kernels::omp::run_kernel_reduction(
             exec,
-            [] GKO_KERNEL(auto i, auto j) {
+            [] GKO_KERNEL(auto i, auto j, auto a) {
                 static_assert(is_same<int64, decltype(i)>::value, "index");
+                
static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{10, cols}); + gko::dim<2>{10, cols}, output); + ASSERT_EQ(*output.get_const_data(), 110ll * cols * (cols + 1)); } } @@ -288,16 +296,19 @@ TEST_F(KernelLaunch, Reduction2DSmallRows) TEST_F(KernelLaunch, Reduction2DLargeRows) { gko::Array output{exec, 1}; + for (int cols = 0; cols < 17; cols++) { gko::kernels::omp::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j) { + [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{1000, cols}); + gko::dim<2>{1000, cols}, output); + ASSERT_EQ(*output.get_const_data(), 1001000ll * cols * (cols + 1)); } } @@ -306,15 +317,18 @@ TEST_F(KernelLaunch, Reduction2DLargeRows) TEST_F(KernelLaunch, Reduction2D) { gko::Array output{exec, 1}; + gko::kernels::omp::run_kernel_reduction( exec, - [] GKO_KERNEL(auto i, auto j) { + [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{1000, 100}); + gko::dim<2>{1000, 100}, output); + ASSERT_EQ(*output.get_const_data(), 10110100000ll); } @@ -335,14 +349,16 @@ TEST_F(KernelLaunch, ReductionRow2DSmall) gko::kernels::omp::run_kernel_row_reduction( exec, - [] GKO_KERNEL(auto i, auto j) { + [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2, gko::dim<2>{static_cast(num_rows), - static_cast(num_cols)}); + static_cast(num_cols)}, + output); GKO_ASSERT_ARRAY_EQ(host_ref, output); } @@ -362,14 +378,16 @@ TEST_F(KernelLaunch, ReductionRow2D) gko::kernels::omp::run_kernel_row_reduction( exec, - [] GKO_KERNEL(auto i, auto j) { + [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2, gko::dim<2>{static_cast(num_rows), - static_cast(num_cols)}); + static_cast(num_cols)}, + output); GKO_ASSERT_ARRAY_EQ(host_ref, output); } @@ -389,15 +407,17 @@ TEST_F(KernelLaunch, ReductionCol2D) gko::kernels::omp::run_kernel_col_reduction( exec, - [] GKO_KERNEL(auto i, auto j) { + [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, [] GKO_KERNEL(auto i, auto j) { return i + j; }, [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), gko::dim<2>{static_cast(num_rows), - static_cast(num_cols)}); + static_cast(num_cols)}, + output); GKO_ASSERT_ARRAY_EQ(host_ref, output); } From 513972683e44f0ee9e1d4ab3a0b3f52099a0ff23 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 15 Jul 2021 23:57:11 +0200 Subject: [PATCH 08/25] add dense reduction kernels --- common/unified/matrix/dense_kernels.cpp | 55 ++++++++++++++++++++ omp/matrix/dense_kernels.cpp | 67 
------------------------- 2 files changed, 55 insertions(+), 67 deletions(-) diff --git a/common/unified/matrix/dense_kernels.cpp b/common/unified/matrix/dense_kernels.cpp index a06d8e1eef2..a3e90576ced 100644 --- a/common/unified/matrix/dense_kernels.cpp +++ b/common/unified/matrix/dense_kernels.cpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" namespace gko { @@ -220,6 +221,60 @@ void sub_scaled_diag(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL); +template +void compute_dot(std::shared_ptr exec, + const matrix::Dense* x, + const matrix::Dense* y, + matrix::Dense* result) +{ + run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto x, auto y) { + return x(i, j) * y(i, j); + }, + [] GKO_KERNEL(auto a, auto b) { return a + b; }, + [] GKO_KERNEL(auto a) { return a; }, ValueType{}, result->get_values(), + x->get_size(), x, y); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); + + +template +void compute_conj_dot(std::shared_ptr exec, + const matrix::Dense* x, + const matrix::Dense* y, + matrix::Dense* result) +{ + run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto x, auto y) { + return conj(x(i, j)) * y(i, j); + }, + [] GKO_KERNEL(auto a, auto b) { return a + b; }, + [] GKO_KERNEL(auto a) { return a; }, ValueType{}, result->get_values(), + x->get_size(), x, y); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); + + +template +void compute_norm2(std::shared_ptr exec, + const matrix::Dense* x, + matrix::Dense>* result) +{ + run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto x) { return squared_norm(x(i, j)); }, + [] GKO_KERNEL(auto a, auto b) { return a + b; }, + [] GKO_KERNEL(auto a) { return sqrt(a); }, remove_complex{}, + result->get_values(), x->get_size(), x); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); + + template void symm_permute(std::shared_ptr exec, const Array* permutation_indices, diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp index c0e4ca75ae3..b9df5fddf24 100644 --- a/omp/matrix/dense_kernels.cpp +++ b/omp/matrix/dense_kernels.cpp @@ -127,73 +127,6 @@ void apply(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); -template -void compute_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ -#pragma omp parallel for - for (size_type j = 0; j < x->get_size()[1]; ++j) { - result->at(0, j) = zero(); - } -#pragma omp parallel for - for (size_type j = 0; j < x->get_size()[1]; ++j) { - for (size_type i = 0; i < x->get_size()[0]; ++i) { - result->at(0, j) += x->at(i, j) * y->at(i, j); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); - - -template -void compute_conj_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ -#pragma omp parallel for - for (size_type j = 0; j < x->get_size()[1]; ++j) { - result->at(0, j) = zero(); - } -#pragma omp parallel for - for (size_type j = 0; j < x->get_size()[1]; ++j) { - for (size_type i = 0; i < x->get_size()[0]; ++i) { - result->at(0, j) += conj(x->at(i, j)) * y->at(i, j); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); - - -template -void 
compute_norm2(std::shared_ptr<const OmpExecutor> exec,
-                   const matrix::Dense<ValueType>* x,
-                   matrix::Dense<remove_complex<ValueType>>* result)
-{
-    using norm_type = remove_complex<ValueType>;
-#pragma omp parallel for
-    for (size_type j = 0; j < x->get_size()[1]; ++j) {
-        result->at(0, j) = zero<norm_type>();
-    }
-#pragma omp parallel for
-    for (size_type j = 0; j < x->get_size()[1]; ++j) {
-        for (size_type i = 0; i < x->get_size()[0]; ++i) {
-            result->at(0, j) += squared_norm(x->at(i, j));
-        }
-    }
-#pragma omp parallel for
-    for (size_type j = 0; j < x->get_size()[1]; ++j) {
-        result->at(0, j) = sqrt(result->at(0, j));
-    }
-}
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
-
-
 template <typename ValueType, typename IndexType>
 void convert_to_coo(std::shared_ptr<const OmpExecutor> exec,
                     const matrix::Dense<ValueType>* source,
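
The three lambdas passed to run_kernel_col_reduction in PATCH 08/25 play
fixed roles: the first maps an entry (i, j) to a value, the second combines
two partial results, and the third is applied once to each finished column
result. A minimal host-side sketch of these semantics (illustrative only;
host_col_reduction and its parameters are not part of the patch, and the
real kernels parallelize and tile this loop nest):

    #include <cstdint>

    // Reference semantics of a column reduction: one independent
    // reduction per column of a rows x cols index space.
    template <typename T, typename Fn, typename Op, typename Fin>
    void host_col_reduction(std::int64_t rows, std::int64_t cols, Fn fn,
                            Op op, Fin fin, T init, T* result)
    {
        for (std::int64_t col = 0; col < cols; col++) {
            auto partial = init;
            for (std::int64_t row = 0; row < rows; row++) {
                partial = op(partial, fn(row, col));
            }
            result[col] = fin(partial);
        }
    }

With fn = conj(x(i, j)) * y(i, j), op = plus and fin = identity this
reproduces compute_conj_dot; with fn = squared_norm(x(i, j)) and fin = sqrt
it reproduces compute_norm2, which is why the hand-rolled OpenMP loop nests
above can be deleted.
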
From 5b07de5697ae547617fb8bd1d33b0519e77da87f Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Thu, 22 Jul 2021 22:58:56 +0200
Subject: [PATCH 09/25] fix dpcpp simple kernel indexing

---
 dpcpp/base/kernel_launch.dp.hpp        | 20 +++++++++++---------
 dpcpp/base/kernel_launch_solver.dp.hpp | 15 ++++++++-------
 2 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/dpcpp/base/kernel_launch.dp.hpp b/dpcpp/base/kernel_launch.dp.hpp
index 0941fc7d524..2b99e98ac36 100644
--- a/dpcpp/base/kernel_launch.dp.hpp
+++ b/dpcpp/base/kernel_launch.dp.hpp
@@ -48,10 +48,11 @@ template <typename KernelFunction, typename... KernelArgs>
 void generic_kernel_1d(sycl::handler& cgh, int64 size, KernelFunction fn,
                        KernelArgs... args)
 {
-    cgh.parallel_for(sycl::range<1>{size}, [=](sycl::id<1> idx_id) {
-        auto idx = static_cast<int64>(idx_id[0]);
-        fn(idx, args...);
-    });
+    cgh.parallel_for(sycl::range<1>{static_cast<std::size_t>(size)},
+                     [=](sycl::id<1> idx_id) {
+                         auto idx = static_cast<int64>(idx_id[0]);
+                         fn(idx, args...);
+                     });
 }
@@ -59,11 +60,12 @@ template <typename KernelFunction, typename... KernelArgs>
 void generic_kernel_2d(sycl::handler& cgh, int64 rows, int64 cols,
                        KernelFunction fn, KernelArgs... args)
 {
-    cgh.parallel_for(sycl::range<2>{rows, cols}, [=](sycl::id<2> idx) {
-        auto row = static_cast<int64>(idx[0]);
-        auto col = static_cast<int64>(idx[1]);
-        fn(row, col, args...);
-    });
+    cgh.parallel_for(sycl::range<1>{static_cast<std::size_t>(rows * cols)},
+                     [=](sycl::id<1> idx) {
+                         auto row = static_cast<int64>(idx[0]) / cols;
+                         auto col = static_cast<int64>(idx[0]) % cols;
+                         fn(row, col, args...);
+                     });
 }
diff --git a/dpcpp/base/kernel_launch_solver.dp.hpp b/dpcpp/base/kernel_launch_solver.dp.hpp
index aa25d167bf3..68ef10ac1fe 100644
--- a/dpcpp/base/kernel_launch_solver.dp.hpp
+++ b/dpcpp/base/kernel_launch_solver.dp.hpp
@@ -46,13 +46,14 @@ void generic_kernel_2d_solver(sycl::handler& cgh, int64 rows, int64 cols,
                               int64 default_stride, KernelFunction fn,
                               KernelArgs... args)
 {
-    cgh.parallel_for(sycl::range<2>{rows, cols}, [=](sycl::id<2> idx) {
-        auto row = static_cast<int64>(idx[0]);
-        auto col = static_cast<int64>(idx[1]);
-        fn(row, col,
-           device_unpack_solver_impl<KernelArgs>::unpack(args,
-                                                         default_stride)...);
-    });
+    cgh.parallel_for(sycl::range<1>{static_cast<std::size_t>(rows * cols)},
+                     [=](sycl::id<1> idx) {
+                         auto row = static_cast<int64>(idx[0] / cols);
+                         auto col = static_cast<int64>(idx[0] % cols);
+                         fn(row, col,
+                            device_unpack_solver_impl<KernelArgs>::unpack(
+                                args, default_stride)...);
+                     });
 }
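
The indexing fix above replaces the two-dimensional sycl::range<2> launch
with a flat one-dimensional range of rows * cols work items, so that work
item i always maps to (row, col) = (i / cols, i % cols), the same row-major
mapping the CUDA and HIP kernels use. A small self-contained check of that
round trip (an editorial sketch, not code from the patch):

    #include <cassert>
    #include <cstdint>

    int main()
    {
        const std::int64_t rows = 3, cols = 5;
        for (std::int64_t row = 0; row < rows; row++) {
            for (std::int64_t col = 0; col < cols; col++) {
                // Row-major flattening and its inverse.
                const std::int64_t flat = row * cols + col;
                assert(flat / cols == row);
                assert(flat % cols == col);
            }
        }
        return 0;
    }
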
From e9ee66627e74686dd017b3446b7cddcfb1b41f90 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Thu, 22 Jul 2021 23:01:05 +0200
Subject: [PATCH 10/25] add reduction kernels

---
 dpcpp/base/helper.hpp                     |   2 +-
 dpcpp/base/kernel_launch_reduction.dp.hpp | 291 ++++++++++++++++++++++
 dpcpp/test/base/kernel_launch.dp.cpp      |  93 +++++++
 3 files changed, 385 insertions(+), 1 deletion(-)
 create mode 100644 dpcpp/base/kernel_launch_reduction.dp.hpp

diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp
index 90ec1cc05fe..f215864e01d 100644
--- a/dpcpp/base/helper.hpp
+++ b/dpcpp/base/helper.hpp
@@ -166,7 +166,7 @@ bool validate(sycl::queue* queue, unsigned workgroup_size,
  * @return the first valid config
  */
 template <typename IterArr, typename Validate>
-std::uint32_t get_first_cfg(IterArr& arr, Validate verify)
+std::uint32_t get_first_cfg(const IterArr& arr, Validate verify)
 {
     for (auto& cfg : arr) {
         if (verify(cfg)) {
diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp
new file mode 100644
index 00000000000..cbf3e3d7158
--- /dev/null
+++ b/dpcpp/base/kernel_launch_reduction.dp.hpp
@@ -0,0 +1,291 @@
+/*************************************************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+#ifndef GKO_COMMON_BASE_KERNEL_LAUNCH_REDUCTION_HPP_
+#error \
+    "This file can only be used from inside common/base/kernel_launch_reduction.hpp"
+#endif
+
+
+#include "core/synthesizer/implementation_selection.hpp"
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/components/cooperative_groups.dp.hpp"
+#include "dpcpp/components/reduction.dp.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/components/uninitialized_array.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+
+
+using KCFG_1D = ConfigSet<11, 7>;
+constexpr auto kcfg_1d_list_simple_reduction =
+    syn::value_list<std::uint32_t,
+                    static_cast<std::uint32_t>(KCFG_1D::encode(512, 64)),
+                    static_cast<std::uint32_t>(KCFG_1D::encode(512, 32)),
+                    static_cast<std::uint32_t>(KCFG_1D::encode(512, 16)),
+                    static_cast<std::uint32_t>(KCFG_1D::encode(256, 32)),
+                    static_cast<std::uint32_t>(KCFG_1D::encode(256, 16)),
+                    static_cast<std::uint32_t>(KCFG_1D::encode(256, 8))>();
+
+
+template <std::uint32_t cfg, typename ValueType, typename KernelFunction,
+          typename ReductionOp, typename FinalizeOp, typename... KernelArgs>
+void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size,
+                                 int64 num_workgroups, KernelFunction fn,
+                                 ReductionOp op, FinalizeOp finalize,
+                                 ValueType init, ValueType* storage,
+                                 KernelArgs... args)
+{
+    constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
+    constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
+    constexpr auto num_partials = wg_size / sg_size;
+    sycl::accessor<UninitializedArray<ValueType, num_partials>, 1,
+                   sycl::access_mode::read_write, sycl::access::target::local>
+        subgroup_partial_acc(sycl::range<1>{1}, cgh);
+    const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size));
+    const auto global_size = num_workgroups * wg_size;
+
+    cgh.parallel_for(
+        range, [=
+    ](sycl::nd_item<3> idx) [[intel::reqd_sub_group_size(sg_size)]] {
+            auto subgroup_partial = &subgroup_partial_acc[0][0];
+            const auto tidx = thread::get_thread_id_flat<int64>(idx);
+            const auto local_tidx = static_cast<int64>(tidx % wg_size);
+            auto subgroup = group::tiled_partition<sg_size>(
+                group::this_thread_block(idx));
+            auto partial = init;
+            for (int64 i = tidx; i < size; i += global_size) {
+                partial = op(partial, fn(i, args...));
+            }
+            partial = ::gko::kernels::dpcpp::reduce(subgroup, partial, op);
+            if (subgroup.thread_rank() == 0) {
+                subgroup_partial[local_tidx / sg_size] = partial;
+            }
+            idx.barrier(sycl::access::fence_space::local_space);
+            if (local_tidx < sg_size) {
+                partial = init;
+                for (int64 i = local_tidx; i < num_partials; i += sg_size) {
+                    partial = op(partial, subgroup_partial[i]);
+                }
+                partial = ::gko::kernels::dpcpp::reduce(subgroup, partial, op);
+                if (subgroup.thread_rank() == 0) {
+                    storage[tidx / wg_size] = finalize(partial);
+                }
+            }
+        });
+}
+
+
+template <std::uint32_t cfg, typename ValueType, typename KernelFunction,
+          typename ReductionOp, typename FinalizeOp, typename... KernelArgs>
+void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols,
+                                 int64 num_workgroups, KernelFunction fn,
+                                 ReductionOp op, FinalizeOp finalize,
+                                 ValueType init, ValueType* storage,
+                                 KernelArgs...
args) +{ + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + constexpr auto num_partials = wg_size / sg_size; + sycl::accessor, 1, + sycl::access_mode::read_write, sycl::access::target::local> + subgroup_partial_acc(sycl::range<1>{1}, cgh); + const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size)); + const auto global_size = num_workgroups * wg_size; + + cgh.parallel_for( + range, [= + ](sycl::nd_item<3> idx) [[intel::reqd_sub_group_size(sg_size)]] { + auto subgroup_partial = &subgroup_partial_acc[0][0]; + const auto tidx = thread::get_thread_id_flat(idx); + const auto local_tidx = static_cast(tidx % wg_size); + auto subgroup = + group::tiled_partition(group::this_thread_block(idx)); + auto partial = init; + for (int64 i = tidx; i < rows * cols; i += global_size) { + const auto row = i / cols; + const auto col = i % cols; + partial = op(partial, fn(row, col, args...)); + } + partial = ::gko::kernels::dpcpp::reduce(subgroup, partial, op); + if (subgroup.thread_rank() == 0) { + subgroup_partial[local_tidx / sg_size] = partial; + } + idx.barrier(sycl::access::fence_space::local_space); + if (local_tidx < sg_size) { + partial = init; + for (int64 i = local_tidx; i < num_partials; i += sg_size) { + partial = op(partial, subgroup_partial[i]); + } + partial = ::gko::kernels::dpcpp::reduce(subgroup, partial, op); + if (subgroup.thread_rank() == 0) { + storage[tidx / wg_size] = finalize(partial); + } + } + }); +} + + +template +void run_kernel_reduction_impl(syn::value_list, + std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, size_type size, + KernelArgs... args) +{ + constexpr auto cfg = static_cast(icfg); + constexpr int oversubscription = 4; + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + const auto num_workgroups = + std::min(ceildiv(size, wg_size), + exec->get_num_computing_units() * oversubscription); + auto queue = exec->get_queue(); + if (num_workgroups > 1) { + Array partial{exec, static_cast(num_workgroups)}; + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_1d( + cgh, static_cast(size), num_workgroups, fn, op, + [](auto v) { return v; }, init, partial.get_data(), args...); + }); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_1d( + cgh, static_cast(num_workgroups), 1, + [](auto i, auto v) { return v[i]; }, op, finalize, init, result, + partial.get_const_data()); + }); + } else { + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_1d(cgh, static_cast(size), + num_workgroups, fn, op, finalize, init, + result, args...); + }); + } +} + + +template +void run_kernel_reduction_impl(syn::value_list, + std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, dim<2> size, + KernelArgs... 
args) +{ + constexpr auto cfg = static_cast(icfg); + constexpr int oversubscription = 4; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto flat_size = rows * cols; + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + const auto num_workgroups = + std::min(ceildiv(flat_size, wg_size), + exec->get_num_computing_units() * oversubscription); + auto queue = exec->get_queue(); + if (num_workgroups > 1) { + Array partial{exec, static_cast(num_workgroups)}; + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_2d( + cgh, rows, cols, num_workgroups, fn, op, + [](auto v) { return v; }, init, partial.get_data(), args...); + }); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_1d( + cgh, static_cast(num_workgroups), 1, + [](auto i, auto v) { return v[i]; }, op, finalize, init, result, + partial.get_const_data()); + }); + } else { + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_2d(cgh, rows, cols, num_workgroups, fn, op, + finalize, init, result, args...); + }); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction, + run_kernel_reduction_impl) + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, dim<2> size, KernelArgs&&... args) +{ + const auto desired_icfg = static_cast(get_first_cfg( + as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + })); + select_run_kernel_reduction( + kcfg_1d_list_simple_reduction, + [&](int icfg) { return icfg == desired_icfg; }, syn::value_list(), + syn::type_list<>(), exec, fn, op, finalize, init, result, size, + map_to_device(args)...); +} + + +template +void run_kernel_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, size_type size, + KernelArgs&&... args) +{ + const auto desired_icfg = static_cast(get_first_cfg( + as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + })); + select_run_kernel_reduction( + kcfg_1d_list_simple_reduction, + [&](int icfg) { return icfg == desired_icfg; }, syn::value_list(), + syn::type_list<>(), exec, fn, op, finalize, init, result, size, + map_to_device(args)...); +} + + +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/test/base/kernel_launch.dp.cpp b/dpcpp/test/base/kernel_launch.dp.cpp index decd2e8c64a..592ce9b934c 100644 --- a/dpcpp/test/base/kernel_launch.dp.cpp +++ b/dpcpp/test/base/kernel_launch.dp.cpp @@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
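// Editorial note on the tests that follow (a sketch of the expected values,
// not part of the original patch): the asserted constants are closed-form
// sums. With fn(i) = i + 1 over n elements and finalize(j) = j * 2, the
// result is 2 * (1 + 2 + ... + n) = n * (n + 1), giving
// 100000 * 100001 = 10000100000 for n = 100000 and 100 * 101 = 10100 for
// n = 100. With fn(i, j) = (i + 1) * (j + 1) over R x C elements and
// finalize(j) = j * 4, it is 4 * (R * (R + 1) / 2) * (C * (C + 1) / 2),
// giving 10110100000 for R = 1000, C = 100 and 12100 for R = C = 10.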
#include +#include "common/unified/base/kernel_launch_reduction.hpp" #include "common/unified/base/kernel_launch_solver.hpp" #include "core/test/utils.hpp" @@ -256,4 +257,96 @@ TEST_F(KernelLaunch, Runs2DDense) } +TEST_F(KernelLaunch, Reduction1D) +{ + gko::Array output{exec, 1}; + + gko::kernels::dpcpp::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "i"); + static_assert(is_same::value, "j"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "j"); + return j * 2; + }, + int64{}, output.get_data(), size_type{100000}, output); + + EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000ll); + + gko::kernels::dpcpp::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "i"); + static_assert(is_same::value, "j"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "j"); + return j * 2; + }, + int64{}, output.get_data(), size_type{100}, output); + + EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100ll); +} + + +TEST_F(KernelLaunch, Reduction2D) +{ + gko::Array output{exec, 1}; + + gko::kernels::dpcpp::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "i"); + static_assert(is_same::value, "j"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "j"); + return j * 4; + }, + int64{}, output.get_data(), gko::dim<2>{1000, 100}, output); + + EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000ll); + + gko::kernels::dpcpp::run_kernel_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "i"); + static_assert(is_same::value, "j"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "j"); + return j * 4; + }, + int64{}, output.get_data(), gko::dim<2>{10, 10}, output); + + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100ll); +} + + } // namespace From 923913910779d4282ac5001a0ddfe0e30f62470e Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 27 Jul 2021 23:50:37 +0200 Subject: [PATCH 11/25] improve simple reductions for CUDA --- cuda/base/kernel_launch_reduction.cuh | 335 +++++++++++++++++--------- cuda/test/base/kernel_launch.cu | 163 ++++++++----- 2 files changed, 333 insertions(+), 165 deletions(-) diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh index 08849a90a4a..c2560ded334 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -79,7 +79,11 @@ __global__ __launch_bounds__( } __syncthreads(); if (threadIdx.x < config::warp_size) { - partial = reduce(warp, warp_partial[threadIdx.x], op); + partial = reduce(warp, + threadIdx.x < default_block_size / config::warp_size + ? 
warp_partial[threadIdx.x] + : init, + op); if (threadIdx.x == 0) { storage[blockIdx.x] = finalize(partial); } @@ -119,7 +123,11 @@ __global__ __launch_bounds__( } __syncthreads(); if (threadIdx.x < config::warp_size) { - partial = reduce(warp, warp_partial[threadIdx.x], op); + partial = reduce(warp, + threadIdx.x < default_block_size / config::warp_size + ? warp_partial[threadIdx.x] + : init, + op); if (threadIdx.x == 0) { storage[blockIdx.x] = finalize(partial); } @@ -135,7 +143,7 @@ void run_kernel_reduction(std::shared_ptr exec, ValueType* result, size_type size, KernelArgs&&... args) { - constexpr int oversubscription = 4; + constexpr int oversubscription = 16; gko::cuda::device_guard guard{exec->get_device_id()}; constexpr auto block_size = default_block_size; const auto num_blocks = std::min( @@ -144,7 +152,7 @@ void run_kernel_reduction(std::shared_ptr exec, Array partial{exec, static_cast(num_blocks)}; generic_kernel_reduction_1d<<>>( static_cast(size), fn, op, - [] __device__(auto v) { return v; }, as_cuda_type(init), + [] GKO_KERNEL(auto v) { return v; }, as_cuda_type(init), as_cuda_type(partial.get_data()), map_to_device(args)...); generic_kernel_reduction_1d<<<1, block_size>>>( static_cast(num_blocks), @@ -166,7 +174,7 @@ void run_kernel_reduction(std::shared_ptr exec, FinalizeOp finalize, ValueType init, ValueType* result, dim<2> size, KernelArgs&&... args) { - constexpr int oversubscription = 4; + constexpr int oversubscription = 16; gko::cuda::device_guard guard{exec->get_device_id()}; constexpr auto block_size = default_block_size; const auto rows = static_cast(size[0]); @@ -177,7 +185,7 @@ void run_kernel_reduction(std::shared_ptr exec, if (num_blocks > 1) { Array partial{exec, static_cast(num_blocks)}; generic_kernel_reduction_2d<<>>( - rows, cols, fn, op, [] __device__(auto v) { return v; }, + rows, cols, fn, op, [] GKO_KERNEL(auto v) { return v; }, as_cuda_type(init), as_cuda_type(partial.get_data()), map_to_device(args)...); generic_kernel_reduction_1d<<<1, block_size>>>( @@ -197,19 +205,19 @@ template __global__ __launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( - int64 rows, int64 cols, int64 col_parts, KernelFunction fn, + int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* result, int64 result_stride, KernelArgs... args) { const auto idx = thread::get_subwarp_id_flat(); const auto row = idx % rows; - const auto col_part = idx / rows; - if (col_part >= col_parts) { + const auto col_block = idx / rows; + if (col_block >= col_blocks) { return; } - const auto cols_per_part = ceildiv(cols, col_parts); - // TODO use boundaries divisible by subwarp_size - const auto begin = cols_per_part * col_part; + const auto cols_per_part = + ceildiv(ceildiv(cols, subwarp_size), col_blocks) * subwarp_size; + const auto begin = cols_per_part * col_block; const auto end = min(begin + cols_per_part, cols); auto subwarp = group::tiled_partition(group::this_thread_block()); @@ -219,58 +227,135 @@ __global__ partial = op(partial, fn(row, col, args...)); } partial = reduce(subwarp, partial, op); - result[(row * col_parts + col_part) * result_stride] = finalize(partial); + result[(row + col_block * rows) * result_stride] = finalize(partial); +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, ValueType* result, + KernelArgs... 
args) +{ + constexpr auto warp_size = config::warp_size; + constexpr auto warps_per_block = default_block_size / warp_size; + // stores the subwarp_size partial sums from each warp, grouped by warp + constexpr auto shared_storage = warps_per_block * subwarp_size; + __shared__ UninitializedArray block_partial; + const auto subwarp_id = thread::get_subwarp_id_flat(); + const auto local_warp_id = threadIdx.x / warp_size; + const auto local_subwarp_id = threadIdx.x % warp_size / subwarp_size; + const auto subwarp_num = + thread::get_subwarp_num_flat(); + const auto block = group::this_thread_block(); + // + if (threadIdx.x < shared_storage) { + block_partial[threadIdx.x] = init; + } + block.sync(); + // + const auto warp = group::tiled_partition(block); + const auto warp_rank = warp.thread_rank(); + const auto subwarp_rank = warp_rank % subwarp_size; + const auto col = static_cast(subwarp_rank); + auto partial = init; + // accumulate within a thread + if (col < cols) { + for (auto row = subwarp_id; row < rows; row += subwarp_num) { + partial = op(partial, fn(row, col, args...)); + } + } + // accumulate between all subwarps in the warp +#pragma unroll + for (unsigned i = subwarp_size; i < warp_size; i *= 2) { + partial = op(partial, warp.shfl_xor(partial, i)); + } // store the result to shared memory + if (local_subwarp_id == 0) { + block_partial[local_warp_id * subwarp_size + subwarp_rank] = partial; + } + block.sync(); + // in a single thread: accumulate the results + if (local_warp_id == 0) { + partial = init; + // accumulate the partial results within a thread + if (shared_storage >= warp_size) { +#pragma unroll + for (int i = 0; i < shared_storage; i += warp_size) { + partial = op(partial, block_partial[i + warp_rank]); + } + } else if (warp_rank < shared_storage) { + partial = op(partial, block_partial[warp_rank]); + } + // accumulate between all subwarps in the warp +#pragma unroll + for (unsigned i = subwarp_size; i < warp_size; i *= 2) { + partial = op(partial, warp.shfl_xor(partial, i)); + } + if (warp_rank < cols) { + result[warp_rank + blockIdx.x * cols] = finalize(partial); + } + } } template __global__ - __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d( - int64 rows, int64 cols, int64 row_parts, KernelFunction fn, - ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* result, + __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, ValueType* result, KernelArgs... 
args) { - const auto idx = thread::get_thread_id_flat(); - const auto col = idx % cols; - const auto row_part = idx / cols; - if (row_part >= row_parts) { - return; - } - const auto rows_per_part = ceildiv(rows, row_parts); - const auto begin = rows_per_part * row_part; - const auto end = min(begin + rows_per_part, rows); + constexpr auto warp_size = config::warp_size; + __shared__ UninitializedArray block_partial; + const auto warp_id = thread::get_subwarp_id_flat(); + const auto warp_num = thread::get_subwarp_num_flat(); + const auto block = group::this_thread_block(); + const auto warp = group::tiled_partition(block); + const auto warp_rank = warp.thread_rank(); + const auto col = warp_rank + static_cast(blockIdx.y) * warp_size; auto partial = init; - for (auto row = begin; row < end; row++) { - partial = op(partial, fn(row, col, args...)); + // accumulate within a thread + if (col < cols) { + for (auto row = warp_id; row < rows; row += warp_num) { + partial = op(partial, fn(row, col, args...)); + } + } + block_partial[threadIdx.x] = partial; + block.sync(); + // in a single warp: accumulate the results + if (threadIdx.x < warp_size) { + partial = init; + // accumulate the partial results within a thread +#pragma unroll + for (int i = 0; i < default_block_size; i += warp_size) { + partial = op(partial, block_partial[i + warp_rank]); + } + if (col < cols) { + result[col + blockIdx.x * cols] = finalize(partial); + } } - result[col * row_parts + row_part] = finalize(partial); } -template +template __global__ __launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( - int64 num_results, int64 num_parts, ReductionOp op, FinalizeOp finalize, - ValueType init, const ValueType* input, int64 result_stride, - ValueType* result) + int64 num_results, int64 num_blocks, ReductionOp op, + FinalizeOp finalize, ValueType init, const ValueType* input, + int64 result_stride, ValueType* result) { - const auto idx = thread::get_subwarp_id_flat(); + const auto idx = thread::get_thread_id_flat(); if (idx >= num_results) { return; } - auto subwarp = - group::tiled_partition(group::this_thread_block()); auto partial = init; - for (int64 part = subwarp.thread_rank(); part < num_parts; - part += subwarp_size) { - partial = op(partial, input[idx * num_parts + part]); - } - partial = reduce(subwarp, partial, op); - if (subwarp.thread_rank() == 0) { - result[idx * result_stride] = finalize(partial); + for (int64 block = 0; block < num_blocks; block++) { + partial = op(partial, input[idx + block * num_results]); } + result[idx * result_stride] = finalize(partial); } @@ -280,42 +365,62 @@ namespace { template void run_generic_kernel_row_reduction(syn::value_list, - int64 rows, int64 cols, int64 col_parts, + int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* result, int64 result_stride, KernelArgs... 
args) { - constexpr auto block_size = default_block_size; - const auto num_blocks = ceildiv(rows * cols * subwarp_size, block_size); - generic_kernel_row_reduction_2d<<>>( - rows, cols, col_parts, fn, op, finalize, as_cuda_type(init), - as_cuda_type(result), result_stride, args...); + const auto num_blocks = + ceildiv(rows * col_blocks * subwarp_size, default_block_size); + generic_kernel_row_reduction_2d + <<>>( + rows, cols, col_blocks, fn, op, finalize, as_cuda_type(init), + as_cuda_type(result), result_stride, args...); } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, run_generic_kernel_row_reduction) -template -void run_kernel_reduction_finalize(syn::value_list, - int64 num_results, int64 num_parts, - ReductionOp op, FinalizeOp finalize, - ValueType init, const ValueType* input, - int64 result_stride, ValueType* result) +template +void run_generic_col_reduction_small(syn::value_list, + int64 max_blocks, + std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, dim<2> size, + MappedKernelArgs... args) { - constexpr auto block_size = default_block_size; - const auto num_blocks = ceildiv(num_results * subwarp_size, block_size); - generic_kernel_reduction_finalize_2d - <<>>(num_results, num_parts, op, finalize, - as_cuda_type(init), as_cuda_type(input), - static_cast(result_stride), - as_cuda_type(result)); + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_blocks = std::min( + ceildiv(rows * subwarp_size, default_block_size), max_blocks); + if (num_blocks <= 1) { + generic_kernel_col_reduction_2d_small + <<<1, default_block_size>>>(rows, cols, fn, op, finalize, + as_cuda_type(init), + as_cuda_type(result), args...); + } else { + Array tmp_storage{exec, + static_cast(num_blocks * cols)}; + generic_kernel_col_reduction_2d_small + <<>>( + rows, cols, fn, op, [] GKO_KERNEL(auto v) { return v; }, + as_cuda_type(init), as_cuda_type(tmp_storage.get_data()), + args...); + generic_kernel_reduction_finalize_2d<<< + ceildiv(cols, default_block_size), default_block_size>>>( + cols, num_blocks, op, finalize, as_cuda_type(init), + as_cuda_type(tmp_storage.get_const_data()), 1, + as_cuda_type(result)); + } } -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_finalize, - run_kernel_reduction_finalize) +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_col_reduction_small, + run_generic_col_reduction_small); } // namespace @@ -331,33 +436,29 @@ void run_kernel_row_reduction(std::shared_ptr exec, { using subwarp_sizes = syn::value_list; - constexpr int oversubscription = 4; + constexpr int oversubscription = 16; gko::cuda::device_guard guard{exec->get_device_id()}; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); - const auto resources = exec->get_num_warps() * oversubscription; - const auto col_parts = 1; // TODO tune - if (col_parts > 1) { + const auto resources = + exec->get_num_warps() * config::warp_size * oversubscription; + if (rows * cols > resources && rows < cols) { + const auto col_blocks = ceildiv(rows * cols, resources); Array partial{exec, - static_cast(col_parts * rows)}; - select_run_generic_kernel_row_reduction( - subwarp_sizes(), - [&](int compiled_subwarp_size) { - return compiled_subwarp_size >= cols || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), rows, cols, col_parts, - fn, op, [] __device__(auto i) { return i; }, init, - partial.get_data(), 1, 
map_to_device(args)...); - select_run_kernel_reduction_finalize( - subwarp_sizes(), - [&](int compiled_subwarp_size) { - return compiled_subwarp_size >= col_parts || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), rows, col_parts, op, - finalize, init, partial.get_const_data(), - static_cast(result_stride), result); + static_cast(col_blocks * rows)}; + const auto num_blocks = + ceildiv(rows * col_blocks * config::warp_size, default_block_size); + generic_kernel_row_reduction_2d + <<>>( + rows, cols, col_blocks, fn, op, + [] GKO_KERNEL(auto v) { return v; }, as_cuda_type(init), + as_cuda_type(partial.get_data()), 1, map_to_device(args)...); + const auto num_finalize_blocks = ceildiv(rows, default_block_size); + generic_kernel_reduction_finalize_2d<<>>( + rows, col_blocks, op, finalize, as_cuda_type(init), + as_cuda_type(partial.get_const_data()), + static_cast(result_stride), as_cuda_type(result)); } else { select_run_generic_kernel_row_reduction( subwarp_sizes(), @@ -380,37 +481,49 @@ void run_kernel_col_reduction(std::shared_ptr exec, ValueType* result, dim<2> size, KernelArgs&&... args) { - constexpr int oversubscription = 4; + using subwarp_sizes = + syn::value_list; + constexpr int oversubscription = 16; gko::cuda::device_guard guard{exec->get_device_id()}; - constexpr auto block_size = default_block_size; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); - const auto resources = - exec->get_num_warps() * config::warp_size * oversubscription; - const auto num_blocks = ceildiv(rows * cols, block_size); - const auto row_parts = 1; // TODO tune - if (row_parts > 1) { - Array partial{exec, - static_cast(row_parts * cols)}; - generic_kernel_col_reduction_2d<<>>( - rows, cols, row_parts, fn, op, [] __device__(auto i) { return i; }, - as_cuda_type(init), as_cuda_type(partial.get_data()), - map_to_device(args)...); - using subwarp_sizes = - syn::value_list; - select_run_kernel_reduction_finalize( + const auto max_blocks = exec->get_num_warps() * config::warp_size * + oversubscription / default_block_size; + if (cols <= config::warp_size) { + select_generic_col_reduction_small( subwarp_sizes(), [&](int compiled_subwarp_size) { - return compiled_subwarp_size >= row_parts || + return compiled_subwarp_size >= cols || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), cols, row_parts, op, - finalize, as_cuda_type(init), - as_cuda_type(partial.get_const_data()), 1, as_cuda_type(result)); + syn::value_list(), syn::type_list<>(), max_blocks, exec, fn, + op, finalize, init, result, size, map_to_device(args)...); } else { - generic_kernel_col_reduction_2d<<>>( - rows, cols, 1, fn, op, finalize, as_cuda_type(init), - as_cuda_type(result), map_to_device(args)...); + const auto col_blocks = ceildiv(cols, config::warp_size); + const auto row_blocks = + ceildiv(std::min( + ceildiv(rows * config::warp_size, default_block_size), + max_blocks), + col_blocks); + if (row_blocks <= 1) { + generic_kernel_col_reduction_2d_blocked<<>>( + rows, cols, fn, op, finalize, as_cuda_type(init), + as_cuda_type(result), map_to_device(args)...); + } else { + Array tmp_storage{ + exec, static_cast(row_blocks * cols)}; + generic_kernel_col_reduction_2d_blocked<<< + dim3(row_blocks, col_blocks), default_block_size>>>( + rows, cols, fn, op, [] GKO_KERNEL(auto v) { return v; }, + as_cuda_type(init), as_cuda_type(tmp_storage.get_data()), + map_to_device(args)...); + generic_kernel_reduction_finalize_2d<<< + ceildiv(cols, 
default_block_size), default_block_size>>>( + cols, row_blocks, op, finalize, as_cuda_type(init), + as_cuda_type(tmp_storage.get_const_data()), 1, + as_cuda_type(result)); + } } } diff --git a/cuda/test/base/kernel_launch.cu b/cuda/test/base/kernel_launch.cu index 6a5494e03fa..2df6ee4ade7 100644 --- a/cuda/test/base/kernel_launch.cu +++ b/cuda/test/base/kernel_launch.cu @@ -301,9 +301,16 @@ void run1d_reduction(std::shared_ptr exec) static_assert(is_same::value, "value"); return i + 1; }, - [] GKO_KERNEL(auto i, auto j) { return i + j; }, - [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), - size_type{100}, output); + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 2; + }, + int64{}, output.get_data(), size_type{100}, output); ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100ll); } @@ -319,12 +326,20 @@ void run2d_reduction(std::shared_ptr exec) exec, [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, - [] GKO_KERNEL(auto i, auto j) { return i + j; }, - [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{1000, 100}, output); + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 4; + }, + int64{}, output.get_data(), gko::dim<2>{1000, 100}, output); ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000ll); @@ -332,12 +347,20 @@ void run2d_reduction(std::shared_ptr exec) exec, [] GKO_KERNEL(auto i, auto j, auto a) { static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); static_assert(is_same::value, "value"); return (i + 1) * (j + 1); }, - [] GKO_KERNEL(auto i, auto j) { return i + j; }, - [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), - gko::dim<2>{10, 10}, output); + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 4; + }, + int64{}, output.get_data(), gko::dim<2>{10, 10}, output); ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100ll); } @@ -347,30 +370,45 @@ TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } void run2d_row_reduction(std::shared_ptr exec) { - int num_rows = 1000; - int num_cols = 100; - gko::Array host_ref{exec->get_master(), - static_cast(2 * num_rows)}; - std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); - gko::Array output{exec, host_ref}; - for (int i = 0; i < num_rows; i++) { - host_ref.get_data()[2 * i] = num_cols * (num_cols + 1) * (i + 1); - } - - gko::kernels::cuda::run_kernel_row_reduction( - exec, - [] GKO_KERNEL(auto i, auto j, auto a) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "value"); - return (i + 1) * (j + 1); - }, - [] GKO_KERNEL(auto i, auto j) { return i + j; }, - [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), 2, - gko::dim<2>{static_cast(num_rows), - static_cast(num_cols)}, - output); + for (auto num_rows : {0, 100, 1000, 10000}) { + for (auto num_cols : {0, 10, 100, 1000, 10000}) { + SCOPED_TRACE(std::to_string(num_rows) + " rows, " 
+ + std::to_string(num_cols) + " cols"); + gko::Array host_ref{exec->get_master(), + static_cast(2 * num_rows)}; + std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); + gko::Array output{exec, host_ref}; + for (int64 i = 0; i < num_rows; i++) { + host_ref.get_data()[2 * i] = + num_cols * (num_cols + 1) * (i + 1); + } - GKO_ASSERT_ARRAY_EQ(host_ref, output); + gko::kernels::cuda::run_kernel_row_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, + "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 2; + }, + int64{}, output.get_data(), 2, + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}, + output); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); + } + } } TEST_F(KernelLaunch, ReductionRow2D) { run2d_row_reduction(exec); } @@ -378,29 +416,46 @@ TEST_F(KernelLaunch, ReductionRow2D) { run2d_row_reduction(exec); } void run2d_col_reduction(std::shared_ptr exec) { - int num_rows = 1000; - int num_cols = 100; - gko::Array host_ref{exec->get_master(), - static_cast(num_cols)}; - gko::Array output{exec, static_cast(num_cols)}; - for (int i = 0; i < num_cols; i++) { - host_ref.get_data()[i] = num_rows * (num_rows + 1) * (i + 1); - } - - gko::kernels::cuda::run_kernel_col_reduction( - exec, - [] GKO_KERNEL(auto i, auto j, auto a) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "value"); - return (i + 1) * (j + 1); - }, - [] GKO_KERNEL(auto i, auto j) { return i + j; }, - [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), - gko::dim<2>{static_cast(num_rows), - static_cast(num_cols)}, - output); + // empty, most threads idle, most threads busy, multiple blocks + for (auto num_rows : {0, 10, 100, 1000, 10000}) { + // check different edge cases: subwarp sizes, blocked mode + for (auto num_cols : + {0, 1, 2, 3, 4, 5, 7, 8, 9, 16, 31, 32, 63, 127, 128, 129}) { + SCOPED_TRACE(std::to_string(num_rows) + " rows, " + + std::to_string(num_cols) + " cols"); + gko::Array host_ref{exec->get_master(), + static_cast(num_cols)}; + gko::Array output{exec, static_cast(num_cols)}; + for (int64 i = 0; i < num_cols; i++) { + host_ref.get_data()[i] = num_rows * (num_rows + 1) * (i + 1); + } - GKO_ASSERT_ARRAY_EQ(host_ref, output); + gko::kernels::cuda::run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + static_assert(is_same::value, + "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "a"); + static_assert(is_same::value, "b"); + return i + j; + }, + [] GKO_KERNEL(auto j) { + static_assert(is_same::value, "value"); + return j * 2; + }, + int64{}, output.get_data(), + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}, + output); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); + } + } } TEST_F(KernelLaunch, ReductionCol2D) { run2d_col_reduction(exec); } From 45202f045c2d4874c25332c5475b7e9f8b79c266 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 27 Jul 2021 23:53:02 +0200 Subject: [PATCH 12/25] move CUDA reduction kernels to common entirely --- cuda/matrix/dense_kernels.cu | 127 ----------------------------------- 1 file changed, 127 deletions(-) diff --git 
a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu index 7a18bb06e39..2b8fb8157c6 100644 --- a/cuda/matrix/dense_kernels.cu +++ b/cuda/matrix/dense_kernels.cu @@ -117,133 +117,6 @@ void apply(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); -template -void compute_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (cublas::is_supported::value) { - // TODO: write a custom kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - cublas::dot(exec->get_cublas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); - } - } else { - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - constexpr int block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_dot<<>>( - x->get_size()[0], as_cuda_type(x->get_const_values() + col), - x->get_stride(), as_cuda_type(y->get_const_values() + col), - y->get_stride(), as_cuda_type(work.get_data())); - kernel::finalize_sum_reduce_computation - <<<1, block_dim>>>(grid_dim.x, - as_cuda_type(work.get_const_data()), - as_cuda_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); - - -template -void compute_conj_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (cublas::is_supported::value) { - // TODO: write a custom kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - cublas::conj_dot(exec->get_cublas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); - } - } else { - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - constexpr int block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_conj_dot - <<>>( - x->get_size()[0], as_cuda_type(x->get_const_values() + col), - x->get_stride(), as_cuda_type(y->get_const_values() + col), - y->get_stride(), as_cuda_type(work.get_data())); - kernel::finalize_sum_reduce_computation - <<<1, block_dim>>>(grid_dim.x, - as_cuda_type(work.get_const_data()), - as_cuda_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); - - -template -void compute_norm2(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - if 
(cublas::is_supported::value) { - for (size_type col = 0; col < x->get_size()[1]; ++col) { - cublas::norm2(exec->get_cublas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - result->get_values() + col); - } - } else { - using norm_type = remove_complex; - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - constexpr int block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_norm2<<>>( - x->get_size()[0], as_cuda_type(x->get_const_values() + col), - x->get_stride(), as_cuda_type(work.get_data())); - kernel::finalize_sqrt_reduce_computation - <<<1, block_dim>>>(grid_dim.x, - as_cuda_type(work.get_const_data()), - as_cuda_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); - - template void convert_to_coo(std::shared_ptr exec, const matrix::Dense* source, From 879c678e4a46eb35d8a9a8a4d6106fd064566ee4 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 28 Jul 2021 00:34:45 +0200 Subject: [PATCH 13/25] add HIP kernels --- cuda/base/kernel_launch_reduction.cuh | 10 +- hip/base/kernel_launch_reduction.hip.hpp | 339 +++++++++++++++-------- hip/matrix/dense_kernels.hip.cpp | 138 --------- 3 files changed, 233 insertions(+), 254 deletions(-) diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh index c2560ded334..a857e0d5035 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -152,7 +152,7 @@ void run_kernel_reduction(std::shared_ptr exec, Array partial{exec, static_cast(num_blocks)}; generic_kernel_reduction_1d<<>>( static_cast(size), fn, op, - [] GKO_KERNEL(auto v) { return v; }, as_cuda_type(init), + [] __device__(auto v) { return v; }, as_cuda_type(init), as_cuda_type(partial.get_data()), map_to_device(args)...); generic_kernel_reduction_1d<<<1, block_size>>>( static_cast(num_blocks), @@ -185,7 +185,7 @@ void run_kernel_reduction(std::shared_ptr exec, if (num_blocks > 1) { Array partial{exec, static_cast(num_blocks)}; generic_kernel_reduction_2d<<>>( - rows, cols, fn, op, [] GKO_KERNEL(auto v) { return v; }, + rows, cols, fn, op, [] __device__(auto v) { return v; }, as_cuda_type(init), as_cuda_type(partial.get_data()), map_to_device(args)...); generic_kernel_reduction_1d<<<1, block_size>>>( @@ -408,7 +408,7 @@ void run_generic_col_reduction_small(syn::value_list, static_cast(num_blocks * cols)}; generic_kernel_col_reduction_2d_small <<>>( - rows, cols, fn, op, [] GKO_KERNEL(auto v) { return v; }, + rows, cols, fn, op, [] __device__(auto v) { return v; }, as_cuda_type(init), as_cuda_type(tmp_storage.get_data()), args...); generic_kernel_reduction_finalize_2d<<< @@ -451,7 +451,7 @@ void run_kernel_row_reduction(std::shared_ptr exec, generic_kernel_row_reduction_2d <<>>( rows, cols, col_blocks, fn, op, - [] GKO_KERNEL(auto v) { return v; }, as_cuda_type(init), + [] __device__(auto v) { return v; }, as_cuda_type(init), as_cuda_type(partial.get_data()), 1, map_to_device(args)...); const auto num_finalize_blocks = ceildiv(rows, 
default_block_size); generic_kernel_reduction_finalize_2d<< exec, exec, static_cast(row_blocks * cols)}; generic_kernel_col_reduction_2d_blocked<<< dim3(row_blocks, col_blocks), default_block_size>>>( - rows, cols, fn, op, [] GKO_KERNEL(auto v) { return v; }, + rows, cols, fn, op, [] __device__(auto v) { return v; }, as_cuda_type(init), as_cuda_type(tmp_storage.get_data()), map_to_device(args)...); generic_kernel_reduction_finalize_2d<<< diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp index fe4b697bc30..7a875491899 100644 --- a/hip/base/kernel_launch_reduction.hip.hpp +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -79,7 +79,11 @@ __global__ __launch_bounds__( } __syncthreads(); if (threadIdx.x < config::warp_size) { - partial = reduce(warp, warp_partial[threadIdx.x], op); + partial = reduce(warp, + threadIdx.x < default_block_size / config::warp_size + ? warp_partial[threadIdx.x] + : init, + op); if (threadIdx.x == 0) { storage[blockIdx.x] = finalize(partial); } @@ -119,8 +123,14 @@ __global__ __launch_bounds__( } __syncthreads(); if (threadIdx.x < config::warp_size) { - storage[blockIdx.x] = - finalize(reduce(warp, warp_partial[threadIdx.x], op)); + partial = reduce(warp, + threadIdx.x < default_block_size / config::warp_size + ? warp_partial[threadIdx.x] + : init, + op); + if (threadIdx.x == 0) { + storage[blockIdx.x] = finalize(partial); + } } } @@ -133,7 +143,7 @@ void run_kernel_reduction(std::shared_ptr exec, ValueType* result, size_type size, KernelArgs&&... args) { - constexpr int oversubscription = 4; + constexpr int oversubscription = 16; gko::hip::device_guard guard{exec->get_device_id()}; constexpr auto block_size = default_block_size; const auto num_blocks = std::min( @@ -167,7 +177,7 @@ void run_kernel_reduction(std::shared_ptr exec, FinalizeOp finalize, ValueType init, ValueType* result, dim<2> size, KernelArgs&&... args) { - constexpr int oversubscription = 4; + constexpr int oversubscription = 16; gko::hip::device_guard guard{exec->get_device_id()}; constexpr auto block_size = default_block_size; const auto rows = static_cast(size[0]); @@ -200,19 +210,19 @@ template __global__ __launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( - int64 rows, int64 cols, int64 col_parts, KernelFunction fn, + int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* result, int64 result_stride, KernelArgs... 
args) { const auto idx = thread::get_subwarp_id_flat(); const auto row = idx % rows; - const auto col_part = idx / rows; - if (col_part >= col_parts) { + const auto col_block = idx / rows; + if (col_block >= col_blocks) { return; } - const auto cols_per_part = ceildiv(cols, col_parts); - // TODO use boundaries divisible by subwarp_size - const auto begin = cols_per_part * col_part; + const auto cols_per_part = + ceildiv(ceildiv(cols, subwarp_size), col_blocks) * subwarp_size; + const auto begin = cols_per_part * col_block; const auto end = min(begin + cols_per_part, cols); auto subwarp = group::tiled_partition(group::this_thread_block()); @@ -222,58 +232,135 @@ __global__ partial = op(partial, fn(row, col, args...)); } partial = reduce(subwarp, partial, op); - result[(row * col_parts + col_part) * result_stride] = finalize(partial); + result[(row + col_block * rows) * result_stride] = finalize(partial); +} + + +template +__global__ + __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, ValueType* result, + KernelArgs... args) +{ + constexpr auto warp_size = config::warp_size; + constexpr auto warps_per_block = default_block_size / warp_size; + // stores the subwarp_size partial sums from each warp, grouped by warp + constexpr auto shared_storage = warps_per_block * subwarp_size; + __shared__ UninitializedArray block_partial; + const auto subwarp_id = thread::get_subwarp_id_flat(); + const auto local_warp_id = threadIdx.x / warp_size; + const auto local_subwarp_id = threadIdx.x % warp_size / subwarp_size; + const auto subwarp_num = + thread::get_subwarp_num_flat(); + const auto block = group::this_thread_block(); + // + if (threadIdx.x < shared_storage) { + block_partial[threadIdx.x] = init; + } + block.sync(); + // + const auto warp = group::tiled_partition(block); + const auto warp_rank = warp.thread_rank(); + const auto subwarp_rank = warp_rank % subwarp_size; + const auto col = static_cast(subwarp_rank); + auto partial = init; + // accumulate within a thread + if (col < cols) { + for (auto row = subwarp_id; row < rows; row += subwarp_num) { + partial = op(partial, fn(row, col, args...)); + } + } + // accumulate between all subwarps in the warp +#pragma unroll + for (unsigned i = subwarp_size; i < warp_size; i *= 2) { + partial = op(partial, warp.shfl_xor(partial, i)); + } // store the result to shared memory + if (local_subwarp_id == 0) { + block_partial[local_warp_id * subwarp_size + subwarp_rank] = partial; + } + block.sync(); + // in a single thread: accumulate the results + if (local_warp_id == 0) { + partial = init; + // accumulate the partial results within a thread + if (shared_storage >= warp_size) { +#pragma unroll + for (int i = 0; i < shared_storage; i += warp_size) { + partial = op(partial, block_partial[i + warp_rank]); + } + } else if (warp_rank < shared_storage) { + partial = op(partial, block_partial[warp_rank]); + } + // accumulate between all subwarps in the warp +#pragma unroll + for (unsigned i = subwarp_size; i < warp_size; i *= 2) { + partial = op(partial, warp.shfl_xor(partial, i)); + } + if (warp_rank < cols) { + result[warp_rank + blockIdx.x * cols] = finalize(partial); + } + } } template __global__ - __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d( - int64 rows, int64 cols, int64 row_parts, KernelFunction fn, - ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* result, + 
__launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, ValueType* result, KernelArgs... args) { - const auto idx = thread::get_thread_id_flat(); - const auto col = idx % cols; - const auto row_part = idx / cols; - if (row_part >= row_parts) { - return; - } - const auto rows_per_part = ceildiv(rows, row_parts); - const auto begin = rows_per_part * row_part; - const auto end = min(begin + rows_per_part, rows); + constexpr auto warp_size = config::warp_size; + __shared__ UninitializedArray block_partial; + const auto warp_id = thread::get_subwarp_id_flat(); + const auto warp_num = thread::get_subwarp_num_flat(); + const auto block = group::this_thread_block(); + const auto warp = group::tiled_partition(block); + const auto warp_rank = warp.thread_rank(); + const auto col = warp_rank + static_cast(blockIdx.y) * warp_size; auto partial = init; - for (auto row = begin; row < end; row++) { - partial = op(partial, fn(row, col, args...)); + // accumulate within a thread + if (col < cols) { + for (auto row = warp_id; row < rows; row += warp_num) { + partial = op(partial, fn(row, col, args...)); + } + } + block_partial[threadIdx.x] = partial; + block.sync(); + // in a single warp: accumulate the results + if (threadIdx.x < warp_size) { + partial = init; + // accumulate the partial results within a thread +#pragma unroll + for (int i = 0; i < default_block_size; i += warp_size) { + partial = op(partial, block_partial[i + warp_rank]); + } + if (col < cols) { + result[col + blockIdx.x * cols] = finalize(partial); + } } - result[col * row_parts + row_part] = finalize(partial); } -template +template __global__ __launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( - int64 num_results, int64 num_parts, ReductionOp op, FinalizeOp finalize, - ValueType init, const ValueType* input, int64 result_stride, - ValueType* result) + int64 num_results, int64 num_blocks, ReductionOp op, + FinalizeOp finalize, ValueType init, const ValueType* input, + int64 result_stride, ValueType* result) { - const auto idx = thread::get_subwarp_id_flat(); + const auto idx = thread::get_thread_id_flat(); if (idx >= num_results) { return; } - auto subwarp = - group::tiled_partition(group::this_thread_block()); auto partial = init; - for (int64 part = subwarp.thread_rank(); part < num_parts; - part += subwarp_size) { - partial = op(partial, input[idx * num_parts + part]); - } - partial = reduce(subwarp, partial, op); - if (subwarp.thread_rank() == 0) { - result[idx * result_stride] = finalize(partial); + for (int64 block = 0; block < num_blocks; block++) { + partial = op(partial, input[idx + block * num_results]); } + result[idx * result_stride] = finalize(partial); } @@ -283,43 +370,65 @@ namespace { template void run_generic_kernel_row_reduction(syn::value_list, - int64 rows, int64 cols, int64 col_parts, + int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* result, int64 result_stride, KernelArgs... 
args) { - constexpr auto block_size = default_block_size; - const auto num_blocks = ceildiv(rows * cols * subwarp_size, block_size); + const auto num_blocks = + ceildiv(rows * col_blocks * subwarp_size, default_block_size); hipLaunchKernelGGL( HIP_KERNEL_NAME(generic_kernel_row_reduction_2d), - num_blocks, block_size, 0, 0, rows, cols, col_parts, fn, op, finalize, - as_hip_type(init), as_hip_type(result), result_stride, args...); + num_blocks, default_block_size, 0, 0, rows, cols, col_blocks, fn, op, + finalize, as_hip_type(init), as_hip_type(result), result_stride, + args...); } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, run_generic_kernel_row_reduction) -template -void run_kernel_reduction_finalize(syn::value_list, - int64 num_results, int64 num_parts, - ReductionOp op, FinalizeOp finalize, - ValueType init, const ValueType* input, - int64 result_stride, ValueType* result) +template +void run_generic_col_reduction_small(syn::value_list, + int64 max_blocks, + std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, dim<2> size, + MappedKernelArgs... args) { - constexpr auto block_size = default_block_size; - const auto num_blocks = ceildiv(num_results * subwarp_size, block_size); - hipLaunchKernelGGL( - HIP_KERNEL_NAME(generic_kernel_reduction_finalize_2d), - num_blocks, block_size, 0, 0, num_results, num_parts, op, finalize, - as_hip_type(init), as_hip_type(input), - static_cast(result_stride), as_hip_type(result)); + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto num_blocks = std::min( + ceildiv(rows * subwarp_size, default_block_size), max_blocks); + if (num_blocks <= 1) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + generic_kernel_col_reduction_2d_small), + 1, default_block_size, 0, 0, rows, cols, fn, op, finalize, + as_hip_type(init), as_hip_type(result), args...); + } else { + Array tmp_storage{exec, + static_cast(num_blocks * cols)}; + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + generic_kernel_col_reduction_2d_small), + num_blocks, default_block_size, 0, 0, rows, cols, fn, op, + [] __device__(auto v) { return v; }, as_hip_type(init), + as_hip_type(tmp_storage.get_data()), args...); + hipLaunchKernelGGL( + generic_kernel_reduction_finalize_2d, + ceildiv(cols, default_block_size), default_block_size, 0, 0, cols, + num_blocks, op, finalize, as_hip_type(init), + as_hip_type(tmp_storage.get_const_data()), 1, as_hip_type(result)); + } } -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_finalize, - run_kernel_reduction_finalize) +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_col_reduction_small, + run_generic_col_reduction_small); } // namespace @@ -335,33 +444,29 @@ void run_kernel_row_reduction(std::shared_ptr exec, { using subwarp_sizes = syn::value_list; - constexpr int oversubscription = 4; + constexpr int oversubscription = 16; gko::hip::device_guard guard{exec->get_device_id()}; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); - const auto resources = exec->get_num_warps() * oversubscription; - const auto col_parts = 1; // TODO tune - if (col_parts > 1) { + const auto resources = + exec->get_num_warps() * config::warp_size * oversubscription; + if (rows * cols > resources && rows < cols) { + const auto col_blocks = ceildiv(rows * cols, resources); Array partial{exec, - static_cast(col_parts * rows)}; - select_run_generic_kernel_row_reduction( - subwarp_sizes(), - [&](int compiled_subwarp_size) { 
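/* [Editorial note -- illustrative, not part of the patch; applies to
 * both the removed and the retained dispatch below.] The selection
 * predicate picks the smallest *compiled* subwarp size that still
 * covers all columns, falling back to a full warp: with
 * subwarp_sizes = 1, 2, 4, 8, 16, 32 and cols = 20, the first size
 * satisfying compiled_subwarp_size >= cols is 32, so each row is
 * reduced by a 32-thread subwarp; for cols = 1000 no compiled size
 * covers the row, and the compiled_subwarp_size == config::warp_size
 * fallback selects a full warp that strides over the columns instead. */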
- return compiled_subwarp_size >= cols || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), rows, cols, col_parts, - fn, op, [] __device__(auto i) { return i; }, init, - partial.get_data(), 1, map_to_device(args)...); - select_run_kernel_reduction_finalize( - subwarp_sizes(), - [&](int compiled_subwarp_size) { - return compiled_subwarp_size >= col_parts || - compiled_subwarp_size == config::warp_size; - }, - syn::value_list(), syn::type_list<>(), rows, col_parts, op, - finalize, init, partial.get_const_data(), - static_cast(result_stride), result); + static_cast(col_blocks * rows)}; + const auto num_blocks = + ceildiv(rows * col_blocks * config::warp_size, default_block_size); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(generic_kernel_row_reduction_2d), + num_blocks, default_block_size, 0, 0, rows, cols, col_blocks, fn, + op, [] __device__(auto v) { return v; }, as_hip_type(init), + as_hip_type(partial.get_data()), 1, map_to_device(args)...); + const auto num_finalize_blocks = ceildiv(rows, default_block_size); + hipLaunchKernelGGL( + generic_kernel_reduction_finalize_2d, num_finalize_blocks, + default_block_size, 0, 0, rows, col_blocks, op, finalize, + as_hip_type(init), as_hip_type(partial.get_const_data()), + static_cast(result_stride), as_hip_type(result)); } else { select_run_generic_kernel_row_reduction( subwarp_sizes(), @@ -384,39 +489,51 @@ void run_kernel_col_reduction(std::shared_ptr exec, ValueType* result, dim<2> size, KernelArgs&&... args) { - constexpr int oversubscription = 4; + using subwarp_sizes = + syn::value_list; + constexpr int oversubscription = 16; gko::hip::device_guard guard{exec->get_device_id()}; - constexpr auto block_size = default_block_size; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); - const auto resources = - exec->get_num_warps() * config::warp_size * oversubscription; - const auto num_blocks = ceildiv(rows * cols, block_size); - const auto row_parts = 1; // TODO tune - if (row_parts > 1) { - Array partial{exec, - static_cast(row_parts * cols)}; - hipLaunchKernelGGL( - generic_kernel_col_reduction_2d, num_blocks, block_size, 0, 0, rows, - cols, row_parts, fn, op, [] __device__(auto i) { return i; }, - as_hip_type(init), as_hip_type(partial.get_data()), - map_to_device(args)...); - using subwarp_sizes = - syn::value_list; - select_run_kernel_reduction_finalize( + const auto max_blocks = exec->get_num_warps() * config::warp_size * + oversubscription / default_block_size; + if (cols <= config::warp_size) { + select_generic_col_reduction_small( subwarp_sizes(), [&](int compiled_subwarp_size) { - return compiled_subwarp_size >= row_parts || + return compiled_subwarp_size >= cols || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), cols, row_parts, op, - finalize, as_hip_type(init), as_hip_type(partial.get_const_data()), - 1, as_hip_type(result)); + syn::value_list(), syn::type_list<>(), max_blocks, exec, fn, + op, finalize, init, result, size, map_to_device(args)...); } else { - hipLaunchKernelGGL(generic_kernel_col_reduction_2d, num_blocks, - block_size, 0, 0, rows, cols, 1, fn, op, finalize, - as_hip_type(init), as_hip_type(result), - map_to_device(args)...); + const auto col_blocks = ceildiv(cols, config::warp_size); + const auto row_blocks = + ceildiv(std::min( + ceildiv(rows * config::warp_size, default_block_size), + max_blocks), + col_blocks); + if (row_blocks <= 1) { + hipLaunchKernelGGL(generic_kernel_col_reduction_2d_blocked, + dim3(1, 
col_blocks), default_block_size, 0, 0, + rows, cols, fn, op, finalize, as_hip_type(init), + as_hip_type(result), map_to_device(args)...); + } else { + Array tmp_storage{ + exec, static_cast(row_blocks * cols)}; + hipLaunchKernelGGL( + generic_kernel_col_reduction_2d_blocked, + dim3(row_blocks, col_blocks), default_block_size, 0, 0, rows, + cols, fn, op, [] __device__(auto v) { return v; }, + as_hip_type(init), as_hip_type(tmp_storage.get_data()), + map_to_device(args)...); + hipLaunchKernelGGL(generic_kernel_reduction_finalize_2d, + ceildiv(cols, default_block_size), + default_block_size, 0, 0, cols, row_blocks, op, + finalize, as_hip_type(init), + as_hip_type(tmp_storage.get_const_data()), 1, + as_hip_type(result)); + } } } diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp index 56ed5c327b9..d4c815c9539 100644 --- a/hip/matrix/dense_kernels.hip.cpp +++ b/hip/matrix/dense_kernels.hip.cpp @@ -120,144 +120,6 @@ void apply(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); -template -void compute_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (hipblas::is_supported::value) { - // TODO: write a custom kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - hipblas::dot(exec->get_hipblas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); - } - } else { - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - constexpr int block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::compute_partial_dot), - dim3(grid_dim), dim3(block_dim), 0, 0, x->get_size()[0], - as_hip_type(x->get_const_values() + col), x->get_stride(), - as_hip_type(y->get_const_values() + col), y->get_stride(), - as_hip_type(work.get_data())); - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel::finalize_sum_reduce_computation), - dim3(1), dim3(block_dim), 0, 0, grid_dim.x, - as_hip_type(work.get_const_data()), - as_hip_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); - - -template -void compute_conj_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (hipblas::is_supported::value) { - // TODO: write a custom kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - hipblas::conj_dot(exec->get_hipblas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), - result->get_values() + col); - } - } else { - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - constexpr int block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 
grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::compute_partial_conj_dot), - dim3(grid_dim), dim3(block_dim), 0, 0, x->get_size()[0], - as_hip_type(x->get_const_values() + col), x->get_stride(), - as_hip_type(y->get_const_values() + col), y->get_stride(), - as_hip_type(work.get_data())); - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel::finalize_sum_reduce_computation), - dim3(1), dim3(block_dim), 0, 0, grid_dim.x, - as_hip_type(work.get_const_data()), - as_hip_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); - - -template -void compute_norm2(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - if (hipblas::is_supported::value) { - for (size_type col = 0; col < x->get_size()[1]; ++col) { - hipblas::norm2(exec->get_hipblas_handle(), x->get_size()[0], - x->get_const_values() + col, x->get_stride(), - result->get_values() + col); - } - } else { - using norm_type = remove_complex; - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - constexpr int block_size = 1024; - - constexpr auto work_per_block = work_per_thread * block_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{config::warp_size, 1, - block_size / config::warp_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::compute_partial_norm2), - dim3(grid_dim), dim3(block_dim), 0, 0, x->get_size()[0], - as_hip_type(x->get_const_values() + col), x->get_stride(), - as_hip_type(work.get_data())); - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel::finalize_sqrt_reduce_computation), - dim3(1), dim3(block_dim), 0, 0, grid_dim.x, - as_hip_type(work.get_const_data()), - as_hip_type(result->get_values() + col)); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); - - template void convert_to_coo(std::shared_ptr exec, const matrix::Dense* source, From 69068184070ad769fb1fffc17852a8fe123229b0 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 11 Aug 2021 10:00:36 +0200 Subject: [PATCH 14/25] fix overflows in reduction tests --- cuda/test/base/kernel_launch.cu | 5 +++-- omp/test/base/kernel_launch.cpp | 9 ++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cuda/test/base/kernel_launch.cu b/cuda/test/base/kernel_launch.cu index 2df6ee4ade7..1d43293d553 100644 --- a/cuda/test/base/kernel_launch.cu +++ b/cuda/test/base/kernel_launch.cu @@ -380,7 +380,7 @@ void run2d_row_reduction(std::shared_ptr exec) gko::Array output{exec, host_ref}; for (int64 i = 0; i < num_rows; i++) { host_ref.get_data()[2 * i] = - num_cols * (num_cols + 1) * (i + 1); + static_cast(num_cols) * (num_cols + 1) * (i + 1); } gko::kernels::cuda::run_kernel_row_reduction( @@ -427,7 +427,8 @@ void run2d_col_reduction(std::shared_ptr exec) static_cast(num_cols)}; gko::Array output{exec, static_cast(num_cols)}; for (int64 i = 0; i < num_cols; i++) { - host_ref.get_data()[i] = num_rows * (num_rows + 1) * (i + 
1); + host_ref.get_data()[i] = + static_cast(num_rows) * (num_rows + 1) * (i + 1); } gko::kernels::cuda::run_kernel_col_reduction( diff --git a/omp/test/base/kernel_launch.cpp b/omp/test/base/kernel_launch.cpp index 01c39514cdb..a615c452f64 100644 --- a/omp/test/base/kernel_launch.cpp +++ b/omp/test/base/kernel_launch.cpp @@ -344,7 +344,8 @@ TEST_F(KernelLaunch, ReductionRow2DSmall) std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); gko::Array output{exec, host_ref}; for (int i = 0; i < num_rows; i++) { - host_ref.get_data()[2 * i] = num_cols * (num_cols + 1) * (i + 1); + host_ref.get_data()[2 * i] = + static_cast(num_cols) * (num_cols + 1) * (i + 1); } gko::kernels::omp::run_kernel_row_reduction( @@ -373,7 +374,8 @@ TEST_F(KernelLaunch, ReductionRow2D) std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); gko::Array output{exec, host_ref}; for (int i = 0; i < num_rows; i++) { - host_ref.get_data()[2 * i] = num_cols * (num_cols + 1) * (i + 1); + host_ref.get_data()[2 * i] = + static_cast(num_cols) * (num_cols + 1) * (i + 1); } gko::kernels::omp::run_kernel_row_reduction( @@ -402,7 +404,8 @@ TEST_F(KernelLaunch, ReductionCol2D) static_cast(num_cols)}; gko::Array output{exec, static_cast(num_cols)}; for (int i = 0; i < num_cols; i++) { - host_ref.get_data()[i] = num_rows * (num_rows + 1) * (i + 1); + host_ref.get_data()[i] = + static_cast(num_rows) * (num_rows + 1) * (i + 1); } gko::kernels::omp::run_kernel_col_reduction( From ddbe7fd9c2c6c169d32dbc42a2034d095168d8b1 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 11 Aug 2021 10:03:10 +0200 Subject: [PATCH 15/25] avoid duplicate writes in reductions --- cuda/base/kernel_launch_reduction.cuh | 4 +++- hip/base/kernel_launch_reduction.hip.hpp | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh index a857e0d5035..49a6ca95f87 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -227,7 +227,9 @@ __global__ partial = op(partial, fn(row, col, args...)); } partial = reduce(subwarp, partial, op); - result[(row + col_block * rows) * result_stride] = finalize(partial); + if (subwarp.thread_rank() == 0) { + result[(row + col_block * rows) * result_stride] = finalize(partial); + } } diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp index 7a875491899..aa3f3384ca6 100644 --- a/hip/base/kernel_launch_reduction.hip.hpp +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -232,7 +232,9 @@ __global__ partial = op(partial, fn(row, col, args...)); } partial = reduce(subwarp, partial, op); - result[(row + col_block * rows) * result_stride] = finalize(partial); + if (subwarp.thread_rank() == 0) { + result[(row + col_block * rows) * result_stride] = finalize(partial); + } } From 7103627befece41b0580100d0c5a356f985ab6c5 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 11 Aug 2021 10:03:44 +0200 Subject: [PATCH 16/25] add DPC++ reduction kernels --- dpcpp/base/kernel_launch_reduction.dp.hpp | 478 +++++++++++++++++++++- dpcpp/matrix/dense_kernels.dp.cpp | 416 ------------------- dpcpp/test/base/kernel_launch.dp.cpp | 70 ++++ 3 files changed, 526 insertions(+), 438 deletions(-) diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp index cbf3e3d7158..ca82a897269 100644 --- a/dpcpp/base/kernel_launch_reduction.dp.hpp +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -30,12 +30,15 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, 
STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_COMMON_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ +#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ #error \ "This file can only be used from inside common/base/kernel_launch_reduction.hpp" #endif +#include + + #include "core/synthesizer/implementation_selection.hpp" #include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" @@ -60,14 +63,14 @@ constexpr auto kcfg_1d_list_simple_reduction = static_cast(KCFG_1D::encode(256, 8))>(); -template +template void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size, int64 num_workgroups, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* storage, - KernelArgs... args) + MappedKernelArgs... args) { constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); @@ -109,14 +112,14 @@ void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size, } -template +template void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols, int64 num_workgroups, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* storage, - KernelArgs... args) + MappedKernelArgs... args) { constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); @@ -161,13 +164,14 @@ void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols, template + typename ReductionOp, typename FinalizeOp, + typename... MappedKernelArgs> void run_kernel_reduction_impl(syn::value_list, std::shared_ptr exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* result, size_type size, - KernelArgs... args) + MappedKernelArgs... args) { constexpr auto cfg = static_cast(icfg); constexpr int oversubscription = 4; @@ -180,34 +184,35 @@ void run_kernel_reduction_impl(syn::value_list, if (num_workgroups > 1) { Array partial{exec, static_cast(num_workgroups)}; queue->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_1d( + generic_kernel_reduction_1d( cgh, static_cast(size), num_workgroups, fn, op, [](auto v) { return v; }, init, partial.get_data(), args...); }); queue->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_1d( + generic_kernel_reduction_1d( cgh, static_cast(num_workgroups), 1, [](auto i, auto v) { return v[i]; }, op, finalize, init, result, partial.get_const_data()); }); } else { queue->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_1d(cgh, static_cast(size), - num_workgroups, fn, op, finalize, init, - result, args...); + generic_kernel_reduction_1d(cgh, static_cast(size), + num_workgroups, fn, op, finalize, + init, result, args...); }); } } template + typename ReductionOp, typename FinalizeOp, + typename... MappedKernelArgs> void run_kernel_reduction_impl(syn::value_list, std::shared_ptr exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType init, ValueType* result, dim<2> size, - KernelArgs... args) + MappedKernelArgs... 
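/* [Editorial sketch -- not part of the patch; hypothetical values.] The
 * 1D launcher above illustrates the general two-pass scheme: pass 1
 * reduces the input to one partial value per workgroup using the
 * *identity* as finalize (finalize must be applied exactly once, at the
 * very end), pass 2 reduces those partials with the real finalize. A
 * host-side model, assuming a sum reduction with finalize(x) = 2 * x:
 *
 *   #include <cassert>
 *   #include <numeric>
 *   #include <vector>
 *
 *   int main()
 *   {
 *       const std::vector<int> input{1, 2, 3, 4, 5, 6, 7, 8};
 *       const int num_workgroups = 2;  // pretend each handles 4 elements
 *       std::vector<int> partial(num_workgroups);
 *       for (int wg = 0; wg < num_workgroups; wg++) {
 *           // pass 1: partial reduction, identity finalize
 *           partial[wg] = std::accumulate(input.begin() + 4 * wg,
 *                                         input.begin() + 4 * (wg + 1), 0);
 *       }
 *       // pass 2: reduce the partials and apply finalize exactly once
 *       const int result =
 *           2 * std::accumulate(partial.begin(), partial.end(), 0);
 *       assert(result == 72);  // 2 * (1 + 2 + ... + 8)
 *   }
 */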
args) { constexpr auto cfg = static_cast(icfg); constexpr int oversubscription = 4; @@ -223,20 +228,21 @@ void run_kernel_reduction_impl(syn::value_list, if (num_workgroups > 1) { Array partial{exec, static_cast(num_workgroups)}; queue->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_2d( + generic_kernel_reduction_2d( cgh, rows, cols, num_workgroups, fn, op, [](auto v) { return v; }, init, partial.get_data(), args...); }); queue->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_1d( + generic_kernel_reduction_1d( cgh, static_cast(num_workgroups), 1, [](auto i, auto v) { return v[i]; }, op, finalize, init, result, partial.get_const_data()); }); } else { queue->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_2d(cgh, rows, cols, num_workgroups, fn, op, - finalize, init, result, args...); + generic_kernel_reduction_2d(cgh, rows, cols, num_workgroups, + fn, op, finalize, init, result, + args...); }); } } @@ -286,6 +292,434 @@ void run_kernel_reduction(std::shared_ptr exec, } +namespace { + + +template +void generic_kernel_row_reduction_2d(syn::value_list, + std::shared_ptr exec, + int64 rows, int64 cols, int64 col_blocks, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, int64 result_stride, + MappedKernelArgs... args) +{ + constexpr auto wg_size = + KCFG_1D::decode<0>(static_cast(icfg)); + constexpr auto sg_size = + KCFG_1D::decode<1>(static_cast(icfg)); + static_assert(ssg_size <= sg_size, "ssg must be smaller than sg"); + const auto num_workgroups = ceildiv(rows * col_blocks * ssg_size, wg_size); + const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size)); + exec->get_queue()->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + range, [= + ](sycl::nd_item<3> id) [[intel::reqd_sub_group_size(sg_size)]] { + const auto idx = + thread::get_subwarp_id_flat(id); + const auto row = idx % rows; + const auto col_block = idx / rows; + auto partial = init; + auto subgroup = group::tiled_partition( + group::this_thread_block(id)); + auto ssg_rank = + static_cast(subgroup.thread_rank() % ssg_size); + if (col_block < col_blocks) { + const auto cols_per_part = + ceildiv(ceildiv(cols, ssg_size), col_blocks) * ssg_size; + const auto begin = cols_per_part * col_block; + const auto end = min(begin + cols_per_part, cols); + for (auto col = begin + ssg_rank; col < end; + col += ssg_size) { + partial = op(partial, fn(row, col, args...)); + } + } +// since we do a sub-subgroup reduction, we can't use reduce +#pragma unroll + for (int i = 1; i < ssg_size; i *= 2) { + partial = op(partial, subgroup.shfl_xor(partial, i)); + } + if (col_block < col_blocks && ssg_rank == 0) { + result[(row + col_block * rows) * result_stride] = + finalize(partial); + } + }); + }); +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_kernel_row_reduction_2d, + generic_kernel_row_reduction_2d); + + +template +void generic_kernel_col_reduction_2d_small(sycl::handler& cgh, int64 rows, + int64 cols, int64 row_blocks, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, + MappedKernelArgs... 
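/* [Editorial note -- illustrative, not part of the patch.] The
 * cols_per_part computation in the row-reduction kernel above resolves
 * the old "TODO use boundaries divisible by subwarp_size" from the
 * CUDA/HIP kernels: rounding the per-block column count up to a
 * multiple of ssg_size keeps every block boundary aligned, so no
 * sub-subgroup ever straddles two column blocks. For example, with
 * cols = 100, ssg_size = 8 and col_blocks = 3:
 *   ceildiv(100, 8) = 13 eight-column slices,
 *   ceildiv(13, 3)  = 5 slices per block,
 *   cols_per_part   = 5 * 8 = 40,
 * giving column ranges [0, 40), [40, 80) and [80, 100) -- the first two
 * aligned to 8, the last clipped by min(begin + cols_per_part, cols). */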
args) +{ + constexpr auto wg_size = + KCFG_1D::decode<0>(static_cast(icfg)); + constexpr auto sg_size = + KCFG_1D::decode<1>(static_cast(icfg)); + static_assert(ssg_size <= sg_size, "ssg must be smaller than sg"); + constexpr auto subgroups_per_workgroup = wg_size / sg_size; + // stores the subwarp_size partial sums from each warp, grouped by warp + constexpr auto shared_storage = subgroups_per_workgroup * ssg_size; + sycl::accessor, 1, + sycl::access_mode::read_write, sycl::access::target::local> + block_partial_acc(sycl::range<1>{1}, cgh); + const auto range = sycl_nd_range(dim3(row_blocks), dim3(wg_size)); + cgh.parallel_for( + range, [= + ](sycl::nd_item<3> id) [[intel::reqd_sub_group_size(sg_size)]] { + auto block_partial = &block_partial_acc[0][0]; + const auto ssg_id = + thread::get_subwarp_id_flat(id); + const auto local_sg_id = id.get_local_id(2) / sg_size; + const auto local_ssg_id = id.get_local_id(2) % sg_size / ssg_size; + const auto ssg_num = + thread::get_subwarp_num_flat(id); + const auto workgroup = group::this_thread_block(id); + // TODO remove + if (id.get_local_id(2) < shared_storage) { + block_partial[id.get_local_id(2)] = init; + } + workgroup.sync(); + // TODO end + const auto subgroup = group::tiled_partition(workgroup); + const auto sg_rank = subgroup.thread_rank(); + const auto ssg_rank = sg_rank % ssg_size; + const auto col = static_cast(ssg_rank); + auto partial = init; + // accumulate within a thread + if (col < cols) { + for (auto row = ssg_id; row < rows; row += ssg_num) { + partial = op(partial, fn(row, col, args...)); + } + } + // accumulate between all subsubgroups in the subgroup +#pragma unroll + for (unsigned i = ssg_size; i < sg_size; i *= 2) { + partial = op(partial, subgroup.shfl_xor(partial, i)); + } + // store the result to shared memory + if (local_ssg_id == 0) { + block_partial[local_sg_id * ssg_size + ssg_rank] = partial; + } + workgroup.sync(); + // in a single thread: accumulate the results + if (local_sg_id == 0) { + partial = init; + // accumulate the partial results within a thread + if (shared_storage >= sg_size) { +#pragma unroll + for (int i = 0; i < shared_storage; i += sg_size) { + partial = op(partial, block_partial[i + sg_rank]); + } + } else if (sg_rank < shared_storage) { + partial = op(partial, block_partial[sg_rank]); + } + // accumulate between all subsubgroups in the subgroup +#pragma unroll + for (unsigned i = ssg_size; i < sg_size; i *= 2) { + partial = op(partial, subgroup.shfl_xor(partial, i)); + } + if (sg_rank < cols) { + result[sg_rank + id.get_group(2) * cols] = + finalize(partial); + } + } + }); +} + + +template +void generic_kernel_col_reduction_2d_blocked( + sycl::handler& cgh, int64 rows, int64 cols, int64 row_blocks, + int64 col_blocks, KernelFunction fn, ReductionOp op, FinalizeOp finalize, + ValueType init, ValueType* result, MappedKernelArgs... 
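/* [Editorial sketch -- not part of the patch; hypothetical sizes.] The
 * shfl_xor loop above is a butterfly reduction across sub-subgroups:
 * starting at stride ssg_size and doubling, it only flips lane-id bits
 * at or above log2(ssg_size), so lanes holding the same column (same
 * lane id modulo ssg_size) are combined while distinct columns never
 * mix. A host-side model of the step sequence, assuming a sum over
 * sg_size = 8 lanes with ssg_size = 2:
 *
 *   #include <array>
 *   #include <cassert>
 *
 *   int main()
 *   {
 *       std::array<int, 8> lane{0, 1, 2, 3, 4, 5, 6, 7};
 *       for (unsigned stride = 2; stride < 8; stride *= 2) {
 *           std::array<int, 8> next{};
 *           for (unsigned id = 0; id < 8; id++) {
 *               // every lane reads its XOR partner, like shfl_xor
 *               next[id] = lane[id] + lane[id ^ stride];
 *           }
 *           lane = next;
 *       }
 *       // even lanes hold the sum of all even entries, odd lanes of all odd
 *       assert(lane[0] == 0 + 2 + 4 + 6 && lane[1] == 1 + 3 + 5 + 7);
 *   }
 */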
args) +{ + constexpr auto cfg = static_cast(icfg); + constexpr auto wg_size = KCFG_1D::decode<0>(cfg); + constexpr auto sg_size = KCFG_1D::decode<1>(cfg); + const auto range = + sycl_nd_range(dim3(row_blocks, col_blocks), dim3(wg_size)); + sycl::accessor, 1, + sycl::access_mode::read_write, sycl::access::target::local> + block_partial_acc(sycl::range<1>{1}, cgh); + cgh.parallel_for( + range, [= + ](sycl::nd_item<3> id) [[intel::reqd_sub_group_size(sg_size)]] { + const auto sg_id = thread::get_subwarp_id_flat(id); + const auto sg_num = + thread::get_subwarp_num_flat(id); + const auto workgroup = group::this_thread_block(id); + const auto subgroup = group::tiled_partition(workgroup); + const auto sg_rank = subgroup.thread_rank(); + const auto col = + sg_rank + static_cast(id.get_group(1)) * sg_size; + auto block_partial = &block_partial_acc[0][0]; + auto partial = init; + // accumulate within a thread + if (col < cols) { + for (auto row = sg_id; row < rows; row += sg_num) { + partial = op(partial, fn(row, col, args...)); + } + } + block_partial[id.get_local_id(2)] = partial; + workgroup.sync(); + // in a single warp: accumulate the results + if (id.get_local_id(2) < sg_size) { + partial = init; + // accumulate the partial results within a thread +#pragma unroll + for (int i = 0; i < wg_size; i += sg_size) { + partial = op(partial, block_partial[i + sg_rank]); + } + if (col < cols) { + result[col + id.get_group(2) * cols] = finalize(partial); + } + } + }); +} + + +template +void generic_kernel_reduction_finalize_2d(sycl::handler& cgh, int64 num_results, + int64 num_blocks, ReductionOp op, + FinalizeOp finalize, ValueType init, + const ValueType* input, + int64 result_stride, + ValueType* result) +{ + cgh.parallel_for(sycl::range<1>{static_cast(num_results)}, + [=](sycl::id<1> id) { + auto partial = init; + for (int64 block = 0; block < num_blocks; block++) { + partial = op(partial, + input[id[0] + block * num_results]); + } + result[id[0] * result_stride] = finalize(partial); + }); +} + + +template +void run_generic_col_reduction_small(syn::value_list, + std::shared_ptr exec, + int64 max_workgroups, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, + ValueType init, ValueType* result, + dim<2> size, MappedKernelArgs... 
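/* [Editorial sketch -- not part of the patch; hypothetical sizes.] In
 * the blocked kernel above, every thread stages its partial in shared
 * memory and the first subgroup then folds the wg_size entries in
 * strides of sg_size, so lane r accumulates block_partial[r],
 * block_partial[r + sg_size], ... -- exactly the partials produced for
 * its own column by the other subgroups. A host-side model, assuming
 * wg_size = 8, sg_size = 2 and a sum reduction:
 *
 *   #include <array>
 *   #include <cassert>
 *
 *   int main()
 *   {
 *       constexpr int wg_size = 8, sg_size = 2;
 *       // partials staged by 4 subgroups of 2 lanes each
 *       const std::array<int, wg_size> block_partial{1, 2, 3, 4, 5, 6, 7, 8};
 *       for (int rank = 0; rank < sg_size; rank++) {
 *           int partial = 0;
 *           for (int i = 0; i < wg_size; i += sg_size) {
 *               partial += block_partial[i + rank];
 *           }
 *           assert(partial == (rank == 0 ? 1 + 3 + 5 + 7 : 2 + 4 + 6 + 8));
 *       }
 *   }
 */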
args) +{ + constexpr auto wg_size = + KCFG_1D::decode<0>(static_cast(icfg)); + constexpr auto sg_size = + KCFG_1D::decode<1>(static_cast(icfg)); + static_assert(ssg_size <= sg_size, "ssg must be smaller than sg"); + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto row_blocks = + std::min(ceildiv(rows * ssg_size, wg_size), max_workgroups); + auto queue = exec->get_queue(); + if (row_blocks <= 1) { + queue->submit([&](sycl::handler& cgh) { + generic_kernel_col_reduction_2d_small( + cgh, rows, cols, 1, fn, op, finalize, init, result, args...); + }); + } else { + Array tmp_storage{exec, + static_cast(row_blocks * cols)}; + queue->submit([&](sycl::handler& cgh) { + generic_kernel_col_reduction_2d_small( + cgh, rows, cols, row_blocks, fn, op, [](auto v) { return v; }, + init, tmp_storage.get_data(), args...); + }); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_finalize_2d( + cgh, cols, row_blocks, op, finalize, init, + tmp_storage.get_const_data(), 1, result); + }); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_col_reduction_small, + run_generic_col_reduction_small); + + +template +void run_kernel_row_reduction_stage1(syn::value_list, + std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, size_type result_stride, + dim<2> size, MappedKernelArgs... args) +{ + constexpr auto wg_size = + KCFG_1D::decode<0>(static_cast(icfg)); + constexpr auto sg_size = + KCFG_1D::decode<1>(static_cast(icfg)); + using subsubgroup_sizes = + syn::value_list(16, sg_size), + std::min(32, sg_size), sg_size>; + constexpr int oversubscription = 16; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto resources = + exec->get_num_computing_units() * sg_size * oversubscription; + auto queue = exec->get_queue(); + if (rows * cols > resources && rows < cols) { + const auto col_blocks = ceildiv(rows * cols, resources); + Array partial{exec, + static_cast(col_blocks * rows)}; + generic_kernel_row_reduction_2d( + syn::value_list{}, exec, rows, cols, col_blocks, fn, + op, [](auto v) { return v; }, init, partial.get_data(), 1, args...); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_finalize_2d( + cgh, rows, col_blocks, op, finalize, init, + partial.get_const_data(), static_cast(result_stride), + result); + }); + } else { + select_generic_kernel_row_reduction_2d( + subsubgroup_sizes(), + [&](int compiled_ssg_size) { + return compiled_ssg_size >= cols || + compiled_ssg_size == sg_size; + }, + syn::value_list(), syn::type_list<>(), exec, rows, cols, + 1, fn, op, finalize, init, result, + static_cast(result_stride), args...); + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_kernel_row_reduction_stage1, + run_kernel_row_reduction_stage1); + + +template +void run_kernel_col_reduction_stage1(syn::value_list, + std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, dim<2> size, + MappedKernelArgs... 
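/* [Editorial note -- illustrative, not part of the patch; the device
 * numbers are made up.] The branch in run_kernel_row_reduction_stage1
 * above targets the awkward "few long rows" shape: splitting columns
 * into col_blocks only pays off when the input is large enough to fill
 * the oversubscribed device (rows * cols > resources) *and* there are
 * fewer rows than columns, so one subgroup per row alone would leave
 * most of the device idle. Plugging in round numbers: with 64 computing
 * units, sg_size = 32 and oversubscription = 16, resources = 32768; a
 * 10 x 100000 input then uses
 * col_blocks = ceildiv(10 * 100000, 32768) = 31 partial results per
 * row, which generic_kernel_reduction_finalize_2d folds afterwards,
 * while a tall 100000 x 10 input takes the one-pass else-branch. */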
args) +{ + constexpr auto wg_size = + KCFG_1D::decode<0>(static_cast(icfg)); + constexpr auto sg_size = + KCFG_1D::decode<1>(static_cast(icfg)); + using subsubgroup_sizes = + syn::value_list(16, sg_size), + std::min(32, sg_size), sg_size>; + constexpr int oversubscription = 16; + const auto rows = static_cast(size[0]); + const auto cols = static_cast(size[1]); + const auto max_blocks = + exec->get_num_computing_units() * sg_size * oversubscription / wg_size; + if (cols <= sg_size) { + select_generic_col_reduction_small( + subsubgroup_sizes(), + [&](int compiled_ssg_size) { + return compiled_ssg_size >= cols || + compiled_ssg_size == sg_size; + }, + syn::value_list(), syn::type_list<>(), exec, max_blocks, + fn, op, finalize, init, result, size, args...); + } else { + const auto col_blocks = ceildiv(cols, sg_size); + const auto row_blocks = ceildiv( + std::min(ceildiv(rows * sg_size, wg_size), max_blocks), + col_blocks); + auto queue = exec->get_queue(); + if (row_blocks <= 1) { + queue->submit([&](sycl::handler& cgh) { + generic_kernel_col_reduction_2d_blocked( + cgh, rows, cols, 1, col_blocks, fn, op, finalize, init, + result, args...); + }); + } else { + Array tmp_storage{ + exec, static_cast(row_blocks * cols)}; + queue->submit([&](sycl::handler& cgh) { + generic_kernel_col_reduction_2d_blocked( + cgh, rows, cols, row_blocks, col_blocks, fn, op, + [](auto v) { return v; }, init, tmp_storage.get_data(), + args...); + }); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_finalize_2d( + cgh, cols, row_blocks, op, finalize, init, + tmp_storage.get_const_data(), 1, result); + }); + } + } +} + +GKO_ENABLE_IMPLEMENTATION_SELECTION(select_kernel_col_reduction_stage1, + run_kernel_col_reduction_stage1); + + +} // namespace + + +template +void run_kernel_row_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, size_type result_stride, + dim<2> size, KernelArgs&&... args) +{ + const auto desired_icfg = static_cast(get_first_cfg( + as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + })); + select_kernel_row_reduction_stage1( + kcfg_1d_list_simple_reduction, + [&](int icfg) { return icfg == desired_icfg; }, syn::value_list(), + syn::type_list<>(), exec, fn, op, finalize, init, result, result_stride, + size, map_to_device(args)...); +} + + +template +void run_kernel_col_reduction(std::shared_ptr exec, + KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType init, + ValueType* result, dim<2> size, + KernelArgs&&... 
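/* [Editorial note -- illustrative, not part of the patch; example sizes
 * are hypothetical.] The column dispatch above mirrors the CUDA/HIP
 * logic: narrow outputs (cols <= sg_size) take the "small" kernel,
 * where a sub-subgroup per group of rows keeps all lanes busy; wider
 * outputs get one subgroup per sg_size-wide column block. With
 * sg_size = 32, wg_size = 512, max_blocks = 1024, rows = 10000 and
 * cols = 100:
 *   col_blocks = ceildiv(100, 32) = 4,
 *   row_blocks = ceildiv(min(ceildiv(10000 * 32, 512), 1024), 4)
 *              = ceildiv(min(625, 1024), 4) = 157,
 * so a 157 x 4 grid of workgroups writes 157 partials per column into
 * tmp_storage, and generic_kernel_reduction_finalize_2d folds them
 * into the final result. */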
args) +{ + const auto desired_icfg = static_cast(get_first_cfg( + as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + })); + select_kernel_col_reduction_stage1( + kcfg_1d_list_simple_reduction, + [&](int icfg) { return icfg == desired_icfg; }, syn::value_list(), + syn::type_list<>(), exec, fn, op, finalize, init, result, size, + map_to_device(args)...); +} + + } // namespace dpcpp } // namespace kernels } // namespace gko diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 9a86ab9cd15..7873a687e4b 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -84,284 +84,6 @@ constexpr int default_block_size = 256; namespace kernel { -template -void compute_partial_reduce( - size_type num_rows, OutType* __restrict__ work, CallableGetValue get_value, - CallableReduce reduce_op, sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& tmp_work) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - constexpr auto sg_size = KCFG_1D::decode<1>(cfg); - - constexpr auto warps_per_block = wg_size / sg_size; - - const auto num_blocks = item_ct1.get_group_range(2); - const auto local_id = thread::get_local_thread_id(item_ct1); - const auto global_id = - thread::get_thread_id(item_ct1); - - OutType* tmp_work_array = tmp_work; - auto tmp = zero(); - for (auto i = global_id; i < num_rows; i += wg_size * num_blocks) { - tmp = reduce_op(tmp, get_value(i)); - } - - tmp_work_array[local_id] = tmp; - - ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), - tmp_work_array, reduce_op); - - if (local_id == 0) { - work[thread::get_block_id(item_ct1)] = tmp_work_array[0]; - } -} - - -template -void finalize_reduce_computation( - size_type size, const ValueType* work, ValueType* result, - CallableReduce reduce_op, CallableFinalize finalize_op, - sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& tmp_work) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - constexpr auto sg_size = KCFG_1D::decode<1>(cfg); - - const auto local_id = thread::get_local_thread_id(item_ct1); - - ValueType tmp = zero(); - for (auto i = local_id; i < size; i += wg_size) { - tmp = reduce_op(tmp, work[i]); - } - ValueType* tmp_work_array = tmp_work; - tmp_work_array[local_id] = tmp; - - ::gko::kernels::dpcpp::reduce(group::this_thread_block(item_ct1), - tmp_work_array, reduce_op); - - if (local_id == 0) { - *result = finalize_op(tmp_work_array[0]); - } -} - - -template -void compute_partial_dot( - size_type num_rows, const ValueType* __restrict__ x, size_type stride_x, - const ValueType* __restrict__ y, size_type stride_y, - ValueType* __restrict__ work, sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& tmp_work) -{ - compute_partial_reduce( - num_rows, work, - [x, stride_x, y, stride_y](size_type i) { - return x[i * stride_x] * y[i * stride_y]; - }, - [](const ValueType& x, const ValueType& y) { return x + y; }, item_ct1, - tmp_work); -} - -template -void compute_partial_dot(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_rows, - const ValueType* x, size_type stride_x, - const ValueType* y, size_type stride_y, - ValueType* work) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access::mode::read_write, - sycl::access::target::local> - tmp_work_acc_ct1(cgh); - - cgh.parallel_for( - sycl_nd_range(grid, block), 
[=](sycl::nd_item<3> item_ct1) { - compute_partial_dot(num_rows, x, stride_x, y, stride_y, - work, item_ct1, - *tmp_work_acc_ct1.get_pointer()); - }); - }); -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_dot, - compute_partial_dot) -GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_dot_call, compute_partial_dot, - kcfg_1d_list) - - -template -void compute_partial_conj_dot( - size_type num_rows, const ValueType* __restrict__ x, size_type stride_x, - const ValueType* __restrict__ y, size_type stride_y, - ValueType* __restrict__ work, sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& tmp_work) -{ - compute_partial_reduce( - num_rows, work, - [x, stride_x, y, stride_y](size_type i) { - return conj(x[i * stride_x]) * y[i * stride_y]; - }, - [](const ValueType& x, const ValueType& y) { return x + y; }, item_ct1, - tmp_work); -} - -template -void compute_partial_conj_dot(dim3 grid, dim3 block, - size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_rows, - const ValueType* x, size_type stride_x, - const ValueType* y, size_type stride_y, - ValueType* work) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access::mode::read_write, - sycl::access::target::local> - tmp_work_acc_ct1(cgh); - - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - compute_partial_conj_dot(num_rows, x, stride_x, y, - stride_y, work, item_ct1, - *tmp_work_acc_ct1.get_pointer()); - }); - }); -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_conj_dot, - compute_partial_conj_dot) -GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_conj_dot_call, - compute_partial_conj_dot, kcfg_1d_list) - - -template -void finalize_sum_reduce_computation( - size_type size, const ValueType* work, ValueType* result, - sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& tmp_work) -{ - finalize_reduce_computation( - size, work, result, - [](const ValueType& x, const ValueType& y) { return x + y; }, - [](const ValueType& x) { return x; }, item_ct1, tmp_work); -} - -template -void finalize_sum_reduce_computation(dim3 grid, dim3 block, - size_type dynamic_shared_memory, - sycl::queue* queue, size_type size, - const ValueType* work, ValueType* result) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access::mode::read_write, - sycl::access::target::local> - tmp_work_acc_ct1(cgh); - - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - finalize_sum_reduce_computation( - size, work, result, item_ct1, - *tmp_work_acc_ct1.get_pointer()); - }); - }); -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_sum_reduce_computation, - finalize_sum_reduce_computation) -GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_sum_reduce_computation_call, - finalize_sum_reduce_computation, kcfg_1d_list) - - -template -void compute_partial_norm2( - size_type num_rows, const ValueType* __restrict__ x, size_type stride_x, - remove_complex* __restrict__ work, sycl::nd_item<3> item_ct1, - UninitializedArray, KCFG_1D::decode<0>(cfg)>& - tmp_work) -{ - using norm_type = remove_complex; - compute_partial_reduce( - num_rows, work, - [x, stride_x](size_type i) { return squared_norm(x[i * stride_x]); }, - [](const norm_type& x, const norm_type& y) { return x + y; }, item_ct1, - tmp_work); -} - -template -void compute_partial_norm2(dim3 grid, dim3 block, - size_type dynamic_shared_memory, sycl::queue* queue, - size_type num_rows, 
const ValueType* x, - size_type stride_x, remove_complex* work) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, wg_size>, - 0, sycl::access::mode::read_write, - sycl::access::target::local> - tmp_work_acc_ct1(cgh); - - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - compute_partial_norm2(num_rows, x, stride_x, work, - item_ct1, - *tmp_work_acc_ct1.get_pointer()); - }); - }); -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(compute_partial_norm2, - compute_partial_norm2) -GKO_ENABLE_DEFAULT_CONFIG_CALL(compute_partial_norm2_call, - compute_partial_norm2, kcfg_1d_list) - - -template -void finalize_sqrt_reduce_computation( - size_type size, const ValueType* work, ValueType* result, - sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& tmp_work) -{ - finalize_reduce_computation( - size, work, result, - [](const ValueType& x, const ValueType& y) { return x + y; }, - [](const ValueType& x) { return std::sqrt(x); }, item_ct1, tmp_work); -} - -template -void finalize_sqrt_reduce_computation(dim3 grid, dim3 block, - size_type dynamic_shared_memory, - sycl::queue* queue, size_type size, - const ValueType* work, ValueType* result) -{ - constexpr auto wg_size = KCFG_1D::decode<0>(cfg); - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access::mode::read_write, - sycl::access::target::local> - tmp_work_acc_ct1(cgh); - - - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - finalize_sqrt_reduce_computation( - size, work, result, item_ct1, - *tmp_work_acc_ct1.get_pointer()); - }); - }); -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(finalize_sqrt_reduce_computation, - finalize_sqrt_reduce_computation) -GKO_ENABLE_DEFAULT_CONFIG_CALL(finalize_sqrt_reduce_computation_call, - finalize_sqrt_reduce_computation, kcfg_1d_list) - - template void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride, const size_type* __restrict__ row_ptrs, @@ -812,144 +534,6 @@ void apply(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL); -template -void compute_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (x->get_size()[1] == 1) { - // TODO: write a custom kernel which does this more efficiently - onemkl::dot(*exec->get_queue(), x->get_size()[0], x->get_const_values(), - x->get_stride(), y->get_const_values(), y->get_stride(), - result->get_values()); - } else { - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const std::uint32_t cfg = - get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); - const auto wg_size = KCFG_1D::decode<0>(cfg); - const auto sg_size = KCFG_1D::decode<1>(cfg); - const auto work_per_block = work_per_thread * wg_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{sg_size, 1, wg_size / sg_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_dot_call( - cfg, grid_dim, block_dim, 0, exec->get_queue(), - x->get_size()[0], x->get_const_values() 
+ col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), work.get_data()); - kernel::finalize_sum_reduce_computation_call( - cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, - work.get_const_data(), result->get_values() + col); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); - - -template -void compute_conj_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - if (x->get_size()[1] == 1) { - // TODO: write a custom kernel which does this more efficiently - onemkl::conj_dot(*exec->get_queue(), x->get_size()[0], - x->get_const_values(), x->get_stride(), - y->get_const_values(), y->get_stride(), - result->get_values()); - - } else { - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const std::uint32_t cfg = - get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); - const auto wg_size = KCFG_1D::decode<0>(cfg); - const auto sg_size = KCFG_1D::decode<1>(cfg); - - const auto work_per_block = work_per_thread * wg_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{sg_size, 1, wg_size / sg_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_conj_dot_call( - cfg, grid_dim, block_dim, 0, exec->get_queue(), - x->get_size()[0], x->get_const_values() + col, x->get_stride(), - y->get_const_values() + col, y->get_stride(), work.get_data()); - kernel::finalize_sum_reduce_computation_call( - cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, - work.get_const_data(), result->get_values() + col); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); - - -template -void compute_norm2(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - if (x->get_size()[1] == 1) { - oneapi::mkl::blas::row_major::nrm2( - *exec->get_queue(), x->get_size()[0], x->get_const_values(), - x->get_stride(), result->get_values()); - } else { - using norm_type = remove_complex; - // TODO: these are tuning parameters obtained experimentally, once - // we decide how to handle this uniformly, they should be modified - // appropriately - constexpr int work_per_thread = 32; - auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const std::uint32_t cfg = - get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); - const auto wg_size = KCFG_1D::decode<0>(cfg); - const auto sg_size = KCFG_1D::decode<1>(cfg); - - const auto work_per_block = work_per_thread * wg_size; - const dim3 grid_dim = ceildiv(x->get_size()[0], work_per_block); - const dim3 block_dim{sg_size, 1, wg_size / sg_size}; - Array work(exec, grid_dim.x); - // TODO: write a kernel which does this more efficiently - for (size_type col = 0; col < x->get_size()[1]; ++col) { - kernel::compute_partial_norm2_call( - cfg, grid_dim, block_dim, 0, exec->get_queue(), - x->get_size()[0], x->get_const_values() + col, x->get_stride(), - work.get_data()); - kernel::finalize_sqrt_reduce_computation_call( - 
cfg, 1, block_dim, 0, exec->get_queue(), grid_dim.x, - work.get_const_data(), result->get_values() + col); - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); - - template void convert_to_coo(std::shared_ptr exec, const matrix::Dense* source, diff --git a/dpcpp/test/base/kernel_launch.dp.cpp b/dpcpp/test/base/kernel_launch.dp.cpp index 592ce9b934c..8a70d9bc1cb 100644 --- a/dpcpp/test/base/kernel_launch.dp.cpp +++ b/dpcpp/test/base/kernel_launch.dp.cpp @@ -349,4 +349,74 @@ TEST_F(KernelLaunch, Reduction2D) } +TEST_F(KernelLaunch, ReductionRow2D) +{ + for (auto num_rows : {0, 1, 10, 100, 1000, 10000}) { + for (auto num_cols : {0, 1, 10, 100, 1000, 10000}) { + SCOPED_TRACE(std::to_string(num_rows) + " rows, " + + std::to_string(num_cols) + " cols"); + gko::Array host_ref{exec->get_master(), + static_cast(2 * num_rows)}; + std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); + gko::Array output{exec, host_ref}; + for (int i = 0; i < num_rows; i++) { + host_ref.get_data()[2 * i] = + static_cast(num_cols) * (num_cols + 1) * (i + 1); + } + + gko::kernels::dpcpp::run_kernel_row_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return 2 * j; }, int64{}, + output.get_data(), 2, + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}, + output); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); + } + } +} + + +TEST_F(KernelLaunch, ReductionCol2D) +{ + for (int num_rows : {0, 1, 10, 100, 1000, 10000}) { + for (int num_cols : + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 40, 100, 1000}) { + SCOPED_TRACE(std::to_string(num_rows) + " rows, " + + std::to_string(num_cols) + " cols"); + gko::Array host_ref{exec->get_master(), + static_cast(num_cols)}; + gko::Array output{exec, static_cast(num_cols)}; + for (int i = 0; i < num_cols; i++) { + host_ref.get_data()[i] = + static_cast(num_rows) * (num_rows + 1) * (i + 1); + } + + gko::kernels::dpcpp::run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto a) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "value"); + return (i + 1) * (j + 1); + }, + [] GKO_KERNEL(auto i, auto j) { return i + j; }, + [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, + output.get_data(), + gko::dim<2>{static_cast(num_rows), + static_cast(num_cols)}, + output); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); + } + } +} + + } // namespace From 7eec0c51aeb1e7d260519362eab49dc2037297e9 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 11 Aug 2021 10:42:55 +0200 Subject: [PATCH 17/25] fix include guards for kernel launch reduction --- common/unified/base/kernel_launch_reduction.hpp | 2 +- cuda/base/kernel_launch_reduction.cuh | 2 +- hip/base/kernel_launch_reduction.hip.hpp | 2 +- omp/base/kernel_launch_reduction.hpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/common/unified/base/kernel_launch_reduction.hpp b/common/unified/base/kernel_launch_reduction.hpp index 78de06466aa..9eb65216416 100644 --- a/common/unified/base/kernel_launch_reduction.hpp +++ b/common/unified/base/kernel_launch_reduction.hpp @@ -48,4 +48,4 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#endif // GKO_COMMON_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ +#endif // GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh index 49a6ca95f87..a083a92eac5 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_COMMON_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ +#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ #error \ "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp" #endif diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp index aa3f3384ca6..502a87cc3fd 100644 --- a/hip/base/kernel_launch_reduction.hip.hpp +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_COMMON_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ +#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ #error \ "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp" #endif diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index 84758549918..bcab938449f 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_COMMON_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ +#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_ #error \ "This file can only be used from inside common/unified/base/kernel_launch_reduction.hpp" #endif From 53ba972021dace92fae73063650f747d7e549be2 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 11 Aug 2021 10:43:13 +0200 Subject: [PATCH 18/25] run header formatting on common kernels --- .github/bot-pr-format-base.sh | 4 ++-- dev_tools/scripts/config | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/bot-pr-format-base.sh b/.github/bot-pr-format-base.sh index e72539a9d61..10be68353b9 100644 --- a/.github/bot-pr-format-base.sh +++ b/.github/bot-pr-format-base.sh @@ -3,8 +3,8 @@ source .github/bot-pr-base.sh EXTENSION_REGEX='\.(cuh?|hpp|hpp\.inc?|cpp)$' -FORMAT_HEADER_REGEX='^(benchmark|core|cuda|hip|include/ginkgo/core|omp|reference|dpcpp)/' -FORMAT_REGEX='^(common|examples|test)/' +FORMAT_HEADER_REGEX='^(benchmark|core|cuda|hip|include/ginkgo/core|omp|reference|dpcpp|common/unified)/' +FORMAT_REGEX='^(common/cuda_hip|examples|test)/' echo "Retrieving PR file list" PR_FILES=$(bot_get_all_changed_files ${PR_URL}) diff --git a/dev_tools/scripts/config b/dev_tools/scripts/config index 696d886d489..a8b0cb3841a 100644 --- a/dev_tools/scripts/config +++ b/dev_tools/scripts/config @@ -30,6 +30,9 @@ - FixInclude: "common/unified/base/kernel_launch.hpp" - "(cuda|hip|dpcpp|omp)/base/kernel_launch_solver\." 
- FixInclude: "common/unified/base/kernel_launch_solver.hpp" +- "common/unified/.*.cpp" + - PathIgnore: "2" + - PathPrefix: "core" - "core/test/base/(extended_float|iterator_factory)" - RemoveTest: "true" - "core/test/base/allocator" From 5889bbc23205f87a839de1dbf19d733e0287191a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 11 Aug 2021 11:57:17 +0200 Subject: [PATCH 19/25] work around Intel compiler bug icpc has issues with some combination of pragma omp parallel for and lambdas called inside the loop: internal error: assertion failed: find_assoc_pragma: pragma not found Putting the entire scope into an immediately evaluated lambda helps --- omp/base/kernel_launch_reduction.hpp | 70 ++++++++++++++-------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index bcab938449f..0c5acf0ebe2 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -74,9 +74,7 @@ void run_kernel_reduction_impl(std::shared_ptr exec, auto local_partial = init; for (auto i = begin; i < end; i++) { - local_partial = op(local_partial, [&]() { - return fn(i, map_to_device(args)...); - }()); + local_partial = op(local_partial, fn(i, map_to_device(args)...)); } partial.get_data()[thread_id] = local_partial; } @@ -118,9 +116,7 @@ void run_kernel_reduction_sized_impl(syn::value_list, for (auto row = begin; row < end; row++) { #pragma unroll for (int64 col = 0; col < local_cols; col++) { - local_partial = op(local_partial, [&]() { - return fn(row, col, args...); - }()); + local_partial = op(local_partial, fn(row, col, args...)); } } } else { @@ -131,16 +127,14 @@ void run_kernel_reduction_sized_impl(syn::value_list, base_col += block_size) { #pragma unroll for (int64 i = 0; i < block_size; i++) { - local_partial = op(local_partial, [&]() { - return fn(row, base_col + i, args...); - }()); + local_partial = + op(local_partial, fn(row, base_col + i, args...)); } } #pragma unroll for (int64 i = 0; i < remainder_cols; i++) { - local_partial = op(local_partial, [&]() { - return fn(row, rounded_cols + i, args...); - }()); + local_partial = + op(local_partial, fn(row, rounded_cols + i, args...)); } } } @@ -217,12 +211,13 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, cols < rows) { #pragma omp parallel for for (int64 row = 0; row < rows; row++) { - auto partial = init; - for (int64 col = 0; col < cols; col++) { - partial = - op(partial, [&]() { return fn(row, col, args...); }()); - } - result[result_stride * row] = finalize(partial); + [&]() { + auto partial = init; + for (int64 col = 0; col < cols; col++) { + partial = op(partial, fn(row, col, args...)); + } + result[result_stride * row] = finalize(partial); + }(); } } else { // small number of rows and large reduction sizes: do partial sum first @@ -248,13 +243,17 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, // then accumulate the partial sums and write to result #pragma omp parallel for for (int64 row = 0; row < rows; row++) { - auto local_partial = init; - for (int64 thread_id = 0; thread_id < num_threads; thread_id++) { - local_partial = - op(local_partial, - partial.get_const_data()[row * num_threads + thread_id]); - } - result[row * result_stride] = finalize(local_partial); + [&] { + auto local_partial = init; + for (int64 thread_id = 0; thread_id < num_threads; + thread_id++) { + local_partial = op( + local_partial, + partial + .get_const_data()[row * num_threads + thread_id]); + } + result[row * 
result_stride] = finalize(local_partial); + }(); } } } @@ -273,9 +272,8 @@ void run_kernel_col_reduction_sized_block_impl( for (auto row = row_begin; row < row_end; row++) { #pragma unroll for (int64 rel_col = 0; rel_col < local_cols; rel_col++) { - partial[rel_col] = op(partial[rel_col], [&]() { - return fn(row, base_col + rel_col, args...); - }()); + partial[rel_col] = + op(partial[rel_col], fn(row, base_col + rel_col, args...)); } } #pragma unroll @@ -343,12 +341,16 @@ void run_kernel_col_reduction_sized_impl( } #pragma omp parallel for for (int64 col = 0; col < cols; col++) { - auto total = init; - for (int64 row_block = 0; row_block < reduction_size; row_block++) { - total = - op(total, partial.get_const_data()[col + cols * row_block]); - } - result[col] = finalize(total); + [&] { + auto total = init; + for (int64 row_block = 0; row_block < reduction_size; + row_block++) { + total = + op(total, + partial.get_const_data()[col + cols * row_block]); + } + result[col] = finalize(total); + }(); } } } From 00e61103095f4ad966f0281ddf68eb1650c35632 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 4 Oct 2021 12:23:40 +0200 Subject: [PATCH 20/25] review updates * remove unnecessary shmem init * add test comments --- cuda/base/kernel_launch_reduction.cuh | 6 ------ cuda/test/base/kernel_launch.cu | 16 ++++++++++++---- dpcpp/base/kernel_launch_reduction.dp.hpp | 6 ------ dpcpp/test/base/kernel_launch.dp.cpp | 16 ++++++++++++---- hip/base/kernel_launch_reduction.hip.hpp | 6 ------ hip/test/base/kernel_launch.hip.cpp | 16 ++++++++++++---- omp/test/base/kernel_launch.cpp | 22 +++++++++++++++++----- 7 files changed, 53 insertions(+), 35 deletions(-) diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh index a083a92eac5..d98bb878672 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -252,12 +252,6 @@ __global__ const auto subwarp_num = thread::get_subwarp_num_flat(); const auto block = group::this_thread_block(); - // - if (threadIdx.x < shared_storage) { - block_partial[threadIdx.x] = init; - } - block.sync(); - // const auto warp = group::tiled_partition(block); const auto warp_rank = warp.thread_rank(); const auto subwarp_rank = warp_rank % subwarp_size; diff --git a/cuda/test/base/kernel_launch.cu b/cuda/test/base/kernel_launch.cu index 1d43293d553..66fc3d9e94d 100644 --- a/cuda/test/base/kernel_launch.cu +++ b/cuda/test/base/kernel_launch.cu @@ -292,7 +292,8 @@ void run1d_reduction(std::shared_ptr exec) [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), size_type{100000}, output); - ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000ll); + // 2 * sum i=0...99999 (i+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000LL); gko::kernels::cuda::run_kernel_reduction( exec, @@ -312,7 +313,8 @@ void run1d_reduction(std::shared_ptr exec) }, int64{}, output.get_data(), size_type{100}, output); - ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100ll); + // 2 * sum i=0...99 (i+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100LL); } TEST_F(KernelLaunch, Reduction1D) { run1d_reduction(exec); } @@ -341,7 +343,8 @@ void run2d_reduction(std::shared_ptr exec) }, int64{}, output.get_data(), gko::dim<2>{1000, 100}, output); - ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000ll); + // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000LL); 
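
The constants asserted in these tests have simple closed forms: the kernels sum (i+1) in the 1D case and (i+1)*(j+1) in the 2D case, and the finalize step multiplies by 2 or 4. A minimal standalone sketch checking the two values above at compile time; the helper sum_1_to is introduced only for this illustration and is not part of the patch:

#include <cstdint>

// closed form for 1 + 2 + ... + n
constexpr std::int64_t sum_1_to(std::int64_t n) { return n * (n + 1) / 2; }

// 1D case: 2 * sum i=0...99999 (i+1) = 2 * sum_1_to(100000)
static_assert(2 * sum_1_to(100000) == 10000100000LL, "1D reduction value");
// 2D case: 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1)
//          = 4 * sum_1_to(1000) * sum_1_to(100) = 4 * 500500 * 5050
static_assert(4 * sum_1_to(1000) * sum_1_to(100) == 10110100000LL,
              "2D reduction value");
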
gko::kernels::cuda::run_kernel_reduction( exec, @@ -362,7 +365,8 @@ void run2d_reduction(std::shared_ptr exec) }, int64{}, output.get_data(), gko::dim<2>{10, 10}, output); - ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100ll); + // 4 * sum i=0...9 sum j=0...9 of (i+1)*(j+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100LL); } TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } @@ -379,6 +383,8 @@ void run2d_row_reduction(std::shared_ptr exec) std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); gko::Array output{exec, host_ref}; for (int64 i = 0; i < num_rows; i++) { + // we are computing 2 * sum {j=0, j<num_cols} (i+1)*(j+1) + // = (i+1) * num_cols * (num_cols+1) host_ref.get_data()[2 * i] = static_cast<int64>(num_cols) * (num_cols + 1) * (i + 1); } @@ -427,6 +433,8 @@ void run2d_col_reduction(std::shared_ptr exec) static_cast(num_cols)}; gko::Array output{exec, static_cast(num_cols)}; for (int64 i = 0; i < num_cols; i++) { + // we are computing 2 * sum {j=0, j<num_rows} (i+1)*(j+1) + // = (i+1) * num_rows * (num_rows+1) host_ref.get_data()[i] = static_cast<int64>(num_rows) * (num_rows + 1) * (i + 1); } diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp index ca82a897269..4bda0422178 100644 --- a/dpcpp/base/kernel_launch_reduction.dp.hpp +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -386,12 +386,6 @@ void generic_kernel_col_reduction_2d_small(sycl::handler& cgh, int64 rows, const auto ssg_num = thread::get_subwarp_num_flat(id); const auto workgroup = group::this_thread_block(id); - // TODO remove - if (id.get_local_id(2) < shared_storage) { - block_partial[id.get_local_id(2)] = init; - } - workgroup.sync(); - // TODO end const auto subgroup = group::tiled_partition(workgroup); const auto sg_rank = subgroup.thread_rank(); const auto ssg_rank = sg_rank % ssg_size; diff --git a/dpcpp/test/base/kernel_launch.dp.cpp index 8a70d9bc1cb..2c7a08cdb36 100644 --- a/dpcpp/test/base/kernel_launch.dp.cpp +++ b/dpcpp/test/base/kernel_launch.dp.cpp @@ -279,7 +279,8 @@ TEST_F(KernelLaunch, Reduction1D) }, int64{}, output.get_data(), size_type{100000}, output); - EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000ll); + // 2 * sum i=0...99999 (i+1) + EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000LL); gko::kernels::dpcpp::run_kernel_reduction( exec, @@ -299,7 +300,8 @@ TEST_F(KernelLaunch, Reduction1D) }, int64{}, output.get_data(), size_type{100}, output); - EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100ll); + // 2 * sum i=0...99 (i+1) + EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100LL); } @@ -325,7 +327,8 @@ TEST_F(KernelLaunch, Reduction2D) }, int64{}, output.get_data(), gko::dim<2>{1000, 100}, output); - EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000ll); + // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) + EXPECT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000LL); gko::kernels::dpcpp::run_kernel_reduction( exec, @@ -345,7 +348,8 @@ TEST_F(KernelLaunch, Reduction2D) }, int64{}, output.get_data(), gko::dim<2>{10, 10}, output); - ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100ll); + // 4 * sum i=0...9 sum j=0...9 of (i+1)*(j+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100LL); } @@ -360,6 +364,8 @@ TEST_F(KernelLaunch, ReductionRow2D) std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); gko::Array output{exec, host_ref}; for (int i = 0; i < num_rows; i++) { + // we are computing 2 * sum {j=0, j<num_cols} (i+1)*(j+1) + // = (i+1) * num_cols * (num_cols+1) host_ref.get_data()[2 * i] = static_cast<int64>(num_cols) * (num_cols + 1) * (i + 1); } @@ -395,6 +401,8 @@ TEST_F(KernelLaunch, ReductionCol2D) static_cast(num_cols)}; gko::Array
output{exec, static_cast(num_cols)}; for (int i = 0; i < num_cols; i++) { + // we are computing 2 * sum {j=0, j<num_rows} (i+1)*(j+1) + // = (i+1) * num_rows * (num_rows+1) host_ref.get_data()[i] = static_cast<int64>(num_rows) * (num_rows + 1) * (i + 1); } diff --git a/hip/base/kernel_launch_reduction.hip.hpp index 502a87cc3fd..47b33f411ac 100644 --- a/hip/base/kernel_launch_reduction.hip.hpp +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -257,12 +257,6 @@ __global__ const auto subwarp_num = thread::get_subwarp_num_flat(); const auto block = group::this_thread_block(); - // - if (threadIdx.x < shared_storage) { - block_partial[threadIdx.x] = init; - } - block.sync(); - // const auto warp = group::tiled_partition(block); const auto warp_rank = warp.thread_rank(); const auto subwarp_rank = warp_rank % subwarp_size; diff --git a/hip/test/base/kernel_launch.hip.cpp index 755f8b3834d..c7add9ddca8 100644 --- a/hip/test/base/kernel_launch.hip.cpp +++ b/hip/test/base/kernel_launch.hip.cpp @@ -291,7 +291,8 @@ void run1d_reduction(std::shared_ptr exec) [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), size_type{100000}, output); - ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000ll); + // 2 * sum i=0...99999 (i+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10000100000LL); gko::kernels::hip::run_kernel_reduction( exec, @@ -304,7 +305,8 @@ void run1d_reduction(std::shared_ptr exec) [] GKO_KERNEL(auto j) { return j * 2; }, int64{}, output.get_data(), size_type{100}, output); - ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100ll); + // 2 * sum i=0...99 (i+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10100LL); } TEST_F(KernelLaunch, Reduction1D) { run1d_reduction(exec); } @@ -325,7 +327,8 @@ void run2d_reduction(std::shared_ptr exec) [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), gko::dim<2>{1000, 100}, output); - ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000ll); + // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 10110100000LL); gko::kernels::hip::run_kernel_reduction( exec, @@ -338,7 +341,8 @@ void run2d_reduction(std::shared_ptr exec) [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), gko::dim<2>{10, 10}, output); - ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100ll); + // 4 * sum i=0...9 sum j=0...9 of (i+1)*(j+1) + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), 12100LL); } TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } @@ -353,6 +357,8 @@ void run2d_row_reduction(std::shared_ptr exec) std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); gko::Array output{exec, host_ref}; for (int i = 0; i < num_rows; i++) { + // we are computing 2 * sum {j=0, j exec) static_cast(num_cols)}; gko::Array output{exec, static_cast(num_cols)}; for (int i = 0; i < num_cols; i++) { + // we are computing 2 * sum {j=0, j{10, cols}, output); - ASSERT_EQ(*output.get_const_data(), 110ll * cols * (cols + 1)); + // 4 * sum i=0...9 sum j=0...cols-1 of (i+1)*(j+1) + ASSERT_EQ(*output.get_const_data(), 110LL * cols * (cols + 1)); } } @@ -309,7 +312,8 @@ TEST_F(KernelLaunch, Reduction2DLargeRows) [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), gko::dim<2>{1000, cols}, output); - ASSERT_EQ(*output.get_const_data(), 1001000ll * cols * (cols + 1)); + // 4 * sum i=0...999 sum j=0...cols-1 of (i+1)*(j+1) + ASSERT_EQ(*output.get_const_data(), 1001000LL * cols * (cols + 1)); } } @@
-329,7 +333,9 @@ TEST_F(KernelLaunch, Reduction2D) [] GKO_KERNEL(auto j) { return j * 4; }, int64{}, output.get_data(), gko::dim<2>{1000, 100}, output); - ASSERT_EQ(*output.get_const_data(), 10110100000ll); + + // 4 * sum i=0...999 sum j=0...99 of (i+1)*(j+1) + ASSERT_EQ(*output.get_const_data(), 10110100000LL); } @@ -344,6 +350,8 @@ TEST_F(KernelLaunch, ReductionRow2DSmall) std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); gko::Array output{exec, host_ref}; for (int i = 0; i < num_rows; i++) { + // we are computing 2 * sum {j=0, j<num_cols} (i+1)*(j+1) + // = (i+1) * num_cols * (num_cols+1) host_ref.get_data()[2 * i] = static_cast<int64>(num_cols) * (num_cols + 1) * (i + 1); } @@ -374,6 +382,8 @@ TEST_F(KernelLaunch, ReductionRow2D) std::fill_n(host_ref.get_data(), 2 * num_rows, 1234); gko::Array output{exec, host_ref}; for (int i = 0; i < num_rows; i++) { + // we are computing 2 * sum {j=0, j<num_cols} (i+1)*(j+1) + // = (i+1) * num_cols * (num_cols+1) host_ref.get_data()[2 * i] = static_cast<int64>(num_cols) * (num_cols + 1) * (i + 1); } @@ -404,6 +414,8 @@ TEST_F(KernelLaunch, ReductionCol2D) static_cast(num_cols)}; gko::Array output{exec, static_cast(num_cols)}; for (int i = 0; i < num_cols; i++) { + // we are computing 2 * sum {j=0, j<num_rows} (i+1)*(j+1) + // = (i+1) * num_rows * (num_rows+1) host_ref.get_data()[i] = static_cast<int64>(num_rows) * (num_rows + 1) * (i + 1); } From c0093c8ca9003dab88848b8046820f7d98e9f8a5 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 5 Oct 2021 12:07:48 +0200 Subject: [PATCH 21/25] rename init to identity in reduction interface --- cuda/base/kernel_launch_reduction.cuh | 80 +++++++-------- dpcpp/base/kernel_launch_reduction.dp.hpp | 116 +++++++++++----------- hip/base/kernel_launch_reduction.hip.hpp | 83 ++++++++-------- omp/base/kernel_launch_reduction.hpp | 56 ++++++----- 4 files changed, 169 insertions(+), 166 deletions(-) diff --git a/cuda/base/kernel_launch_reduction.cuh index d98bb878672..30f7fa1ba96 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -56,7 +56,7 @@ __global__ __launch_bounds__( KernelFunction fn, ReductionOp op, FinalizeOp finalize, - ValueType init, + ValueType identity, ValueType* storage, KernelArgs... args) { @@ -69,7 +69,7 @@ __global__ __launch_bounds__( auto grid_size = thread::get_thread_num_flat(); auto warp = group::tiled_partition(group::this_thread_block()); - auto partial = init; + auto partial = identity; for (int64 i = tidx; i < size; i += grid_size) { partial = op(partial, fn(i, args...)); } @@ -82,7 +82,7 @@ __global__ __launch_bounds__( partial = reduce(warp, threadIdx.x < default_block_size / config::warp_size ? warp_partial[threadIdx.x] - : init, + : identity, op); if (threadIdx.x == 0) { storage[blockIdx.x] = finalize(partial); @@ -98,7 +98,7 @@ __global__ __launch_bounds__( KernelFunction fn, ReductionOp op, FinalizeOp finalize, - ValueType init, + ValueType identity, ValueType* storage, KernelArgs... args) { @@ -111,7 +111,7 @@ __global__ __launch_bounds__( auto grid_size = thread::get_thread_num_flat(); auto warp = group::tiled_partition(group::this_thread_block()); - auto partial = init; + auto partial = identity; for (int64 i = tidx; i < rows * cols; i += grid_size) { const auto row = i / cols; const auto col = i % cols; @@ -126,7 +126,7 @@ __global__ __launch_bounds__( partial = reduce(warp, threadIdx.x < default_block_size / config::warp_size ? warp_partial[threadIdx.x] - : init, + : identity, op); if (threadIdx.x == 0) { storage[blockIdx.x] = finalize(partial); @@ -139,7 +139,7 @@ template void run_kernel_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, size_type size, KernelArgs&&...
args) { @@ -152,16 +152,16 @@ void run_kernel_reduction(std::shared_ptr exec, Array partial{exec, static_cast(num_blocks)}; generic_kernel_reduction_1d<<>>( static_cast(size), fn, op, - [] __device__(auto v) { return v; }, as_cuda_type(init), + [] __device__(auto v) { return v; }, as_cuda_type(identity), as_cuda_type(partial.get_data()), map_to_device(args)...); generic_kernel_reduction_1d<<<1, block_size>>>( static_cast(num_blocks), [] __device__(auto i, auto v) { return v[i]; }, op, finalize, - as_cuda_type(init), as_cuda_type(result), + as_cuda_type(identity), as_cuda_type(result), as_cuda_type(partial.get_const_data())); } else { generic_kernel_reduction_1d<<<1, block_size>>>( - static_cast(size), fn, op, finalize, as_cuda_type(init), + static_cast(size), fn, op, finalize, as_cuda_type(identity), as_cuda_type(result), map_to_device(args)...); } } @@ -171,7 +171,7 @@ template void run_kernel_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, KernelArgs&&... args) { constexpr int oversubscription = 16; @@ -186,16 +186,16 @@ void run_kernel_reduction(std::shared_ptr exec, Array partial{exec, static_cast(num_blocks)}; generic_kernel_reduction_2d<<>>( rows, cols, fn, op, [] __device__(auto v) { return v; }, - as_cuda_type(init), as_cuda_type(partial.get_data()), + as_cuda_type(identity), as_cuda_type(partial.get_data()), map_to_device(args)...); generic_kernel_reduction_1d<<<1, block_size>>>( static_cast(num_blocks), [] __device__(auto i, auto v) { return v[i]; }, op, finalize, - as_cuda_type(init), as_cuda_type(result), + as_cuda_type(identity), as_cuda_type(result), as_cuda_type(partial.get_const_data())); } else { generic_kernel_reduction_2d<<<1, block_size>>>( - rows, cols, fn, op, finalize, as_cuda_type(init), + rows, cols, fn, op, finalize, as_cuda_type(identity), as_cuda_type(result), map_to_device(args)...); } } @@ -206,8 +206,8 @@ template (); const auto row = idx % rows; @@ -221,7 +221,7 @@ __global__ const auto end = min(begin + cols_per_part, cols); auto subwarp = group::tiled_partition(group::this_thread_block()); - auto partial = init; + auto partial = identity; for (auto col = begin + subwarp.thread_rank(); col < end; col += subwarp_size) { partial = op(partial, fn(row, col, args...)); @@ -238,7 +238,7 @@ template (subwarp_rank); - auto partial = init; + auto partial = identity; // accumulate within a thread if (col < cols) { for (auto row = subwarp_id; row < rows; row += subwarp_num) { @@ -274,7 +274,7 @@ __global__ block.sync(); // in a single thread: accumulate the results if (local_warp_id == 0) { - partial = init; + partial = identity; // accumulate the partial results within a thread if (shared_storage >= warp_size) { #pragma unroll @@ -301,7 +301,7 @@ template (block); const auto warp_rank = warp.thread_rank(); const auto col = warp_rank + static_cast(blockIdx.y) * warp_size; - auto partial = init; + auto partial = identity; // accumulate within a thread if (col < cols) { for (auto row = warp_id; row < rows; row += warp_num) { @@ -323,7 +323,7 @@ __global__ block.sync(); // in a single warp: accumulate the results if (threadIdx.x < warp_size) { - partial = init; + partial = identity; // accumulate the partial results within a thread #pragma unroll for (int i = 0; i < default_block_size; i += warp_size) { @@ -340,14 +340,14 @@ template __global__ __launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( int64 
num_results, int64 num_blocks, ReductionOp op, - FinalizeOp finalize, ValueType init, const ValueType* input, + FinalizeOp finalize, ValueType identity, const ValueType* input, int64 result_stride, ValueType* result) { const auto idx = thread::get_thread_id_flat(); if (idx >= num_results) { return; } - auto partial = init; + auto partial = identity; for (int64 block = 0; block < num_blocks; block++) { partial = op(partial, input[idx + block * num_results]); } @@ -363,7 +363,7 @@ template , int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, int64 result_stride, KernelArgs... args) { @@ -371,7 +371,7 @@ void run_generic_kernel_row_reduction(syn::value_list, ceildiv(rows * col_blocks * subwarp_size, default_block_size); generic_kernel_row_reduction_2d <<>>( - rows, cols, col_blocks, fn, op, finalize, as_cuda_type(init), + rows, cols, col_blocks, fn, op, finalize, as_cuda_type(identity), as_cuda_type(result), result_stride, args...); } @@ -386,7 +386,7 @@ void run_generic_col_reduction_small(syn::value_list, int64 max_blocks, std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, MappedKernelArgs... args) { @@ -397,7 +397,7 @@ void run_generic_col_reduction_small(syn::value_list, if (num_blocks <= 1) { generic_kernel_col_reduction_2d_small <<<1, default_block_size>>>(rows, cols, fn, op, finalize, - as_cuda_type(init), + as_cuda_type(identity), as_cuda_type(result), args...); } else { Array tmp_storage{exec, @@ -405,11 +405,11 @@ void run_generic_col_reduction_small(syn::value_list, generic_kernel_col_reduction_2d_small <<>>( rows, cols, fn, op, [] __device__(auto v) { return v; }, - as_cuda_type(init), as_cuda_type(tmp_storage.get_data()), + as_cuda_type(identity), as_cuda_type(tmp_storage.get_data()), args...); generic_kernel_reduction_finalize_2d<<< ceildiv(cols, default_block_size), default_block_size>>>( - cols, num_blocks, op, finalize, as_cuda_type(init), + cols, num_blocks, op, finalize, as_cuda_type(identity), as_cuda_type(tmp_storage.get_const_data()), 1, as_cuda_type(result)); } @@ -426,7 +426,7 @@ template void run_kernel_row_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, size_type result_stride, dim<2> size, KernelArgs&&... 
args) { @@ -447,12 +447,12 @@ void run_kernel_row_reduction(std::shared_ptr exec, generic_kernel_row_reduction_2d <<>>( rows, cols, col_blocks, fn, op, - [] __device__(auto v) { return v; }, as_cuda_type(init), + [] __device__(auto v) { return v; }, as_cuda_type(identity), as_cuda_type(partial.get_data()), 1, map_to_device(args)...); const auto num_finalize_blocks = ceildiv(rows, default_block_size); generic_kernel_reduction_finalize_2d<<>>( - rows, col_blocks, op, finalize, as_cuda_type(init), + rows, col_blocks, op, finalize, as_cuda_type(identity), as_cuda_type(partial.get_const_data()), static_cast(result_stride), as_cuda_type(result)); } else { @@ -463,7 +463,7 @@ void run_kernel_row_reduction(std::shared_ptr exec, compiled_subwarp_size == config::warp_size; }, syn::value_list(), syn::type_list<>(), rows, cols, 1, fn, op, - finalize, init, result, static_cast(result_stride), + finalize, identity, result, static_cast(result_stride), map_to_device(args)...); } } @@ -473,7 +473,7 @@ template void run_kernel_col_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, KernelArgs&&... args) { @@ -493,7 +493,7 @@ void run_kernel_col_reduction(std::shared_ptr exec, compiled_subwarp_size == config::warp_size; }, syn::value_list(), syn::type_list<>(), max_blocks, exec, fn, - op, finalize, init, result, size, map_to_device(args)...); + op, finalize, identity, result, size, map_to_device(args)...); } else { const auto col_blocks = ceildiv(cols, config::warp_size); const auto row_blocks = @@ -504,7 +504,7 @@ void run_kernel_col_reduction(std::shared_ptr exec, if (row_blocks <= 1) { generic_kernel_col_reduction_2d_blocked<<>>( - rows, cols, fn, op, finalize, as_cuda_type(init), + rows, cols, fn, op, finalize, as_cuda_type(identity), as_cuda_type(result), map_to_device(args)...); } else { Array tmp_storage{ @@ -512,11 +512,11 @@ void run_kernel_col_reduction(std::shared_ptr exec, generic_kernel_col_reduction_2d_blocked<<< dim3(row_blocks, col_blocks), default_block_size>>>( rows, cols, fn, op, [] __device__(auto v) { return v; }, - as_cuda_type(init), as_cuda_type(tmp_storage.get_data()), + as_cuda_type(identity), as_cuda_type(tmp_storage.get_data()), map_to_device(args)...); generic_kernel_reduction_finalize_2d<<< ceildiv(cols, default_block_size), default_block_size>>>( - cols, row_blocks, op, finalize, as_cuda_type(init), + cols, row_blocks, op, finalize, as_cuda_type(identity), as_cuda_type(tmp_storage.get_const_data()), 1, as_cuda_type(result)); } diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp index 4bda0422178..4b29a5af55e 100644 --- a/dpcpp/base/kernel_launch_reduction.dp.hpp +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -69,7 +69,7 @@ template (cfg); @@ -89,7 +89,7 @@ void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size, const auto local_tidx = static_cast(tidx % wg_size); auto subgroup = group::tiled_partition(group::this_thread_block(idx)); - auto partial = init; + auto partial = identity; for (int64 i = tidx; i < size; i += global_size) { partial = op(partial, fn(i, args...)); } @@ -99,7 +99,7 @@ void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size, } idx.barrier(sycl::access::fence_space::local_space); if (local_tidx < sg_size) { - partial = init; + partial = identity; for (int64 i = local_tidx; i < num_partials; i += sg_size) { partial = op(partial, subgroup_partial[i]); } @@ -118,7 
+118,7 @@ template (cfg); @@ -138,7 +138,7 @@ void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols, const auto local_tidx = static_cast(tidx % wg_size); auto subgroup = group::tiled_partition(group::this_thread_block(idx)); - auto partial = init; + auto partial = identity; for (int64 i = tidx; i < rows * cols; i += global_size) { const auto row = i / cols; const auto col = i % cols; @@ -150,7 +150,7 @@ void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols, } idx.barrier(sycl::access::fence_space::local_space); if (local_tidx < sg_size) { - partial = init; + partial = identity; for (int64 i = local_tidx; i < num_partials; i += sg_size) { partial = op(partial, subgroup_partial[i]); } @@ -169,7 +169,7 @@ template , std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, size_type size, MappedKernelArgs... args) { @@ -186,19 +186,20 @@ void run_kernel_reduction_impl(syn::value_list, queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_1d( cgh, static_cast(size), num_workgroups, fn, op, - [](auto v) { return v; }, init, partial.get_data(), args...); + [](auto v) { return v; }, identity, partial.get_data(), + args...); }); queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_1d( cgh, static_cast(num_workgroups), 1, - [](auto i, auto v) { return v[i]; }, op, finalize, init, result, - partial.get_const_data()); + [](auto i, auto v) { return v[i]; }, op, finalize, identity, + result, partial.get_const_data()); }); } else { queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_1d(cgh, static_cast(size), num_workgroups, fn, op, finalize, - init, result, args...); + identity, result, args...); }); } } @@ -210,7 +211,7 @@ template , std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, MappedKernelArgs... args) { @@ -230,18 +231,19 @@ void run_kernel_reduction_impl(syn::value_list, queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_2d( cgh, rows, cols, num_workgroups, fn, op, - [](auto v) { return v; }, init, partial.get_data(), args...); + [](auto v) { return v; }, identity, partial.get_data(), + args...); }); queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_1d( cgh, static_cast(num_workgroups), 1, - [](auto i, auto v) { return v[i]; }, op, finalize, init, result, - partial.get_const_data()); + [](auto i, auto v) { return v[i]; }, op, finalize, identity, + result, partial.get_const_data()); }); } else { queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_2d(cgh, rows, cols, num_workgroups, - fn, op, finalize, init, result, + fn, op, finalize, identity, result, args...); }); } @@ -255,7 +257,7 @@ template void run_kernel_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, KernelArgs&&... 
args) { const auto desired_icfg = static_cast(get_first_cfg( @@ -266,7 +268,7 @@ void run_kernel_reduction(std::shared_ptr exec, select_run_kernel_reduction( kcfg_1d_list_simple_reduction, [&](int icfg) { return icfg == desired_icfg; }, syn::value_list(), - syn::type_list<>(), exec, fn, op, finalize, init, result, size, + syn::type_list<>(), exec, fn, op, finalize, identity, result, size, map_to_device(args)...); } @@ -275,7 +277,7 @@ template void run_kernel_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, size_type size, KernelArgs&&... args) { @@ -287,7 +289,7 @@ void run_kernel_reduction(std::shared_ptr exec, select_run_kernel_reduction( kcfg_1d_list_simple_reduction, [&](int icfg) { return icfg == desired_icfg; }, syn::value_list(), - syn::type_list<>(), exec, fn, op, finalize, init, result, size, + syn::type_list<>(), exec, fn, op, finalize, identity, result, size, map_to_device(args)...); } @@ -302,7 +304,7 @@ void generic_kernel_row_reduction_2d(syn::value_list, std::shared_ptr exec, int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, int64 result_stride, MappedKernelArgs... args) { @@ -321,7 +323,7 @@ void generic_kernel_row_reduction_2d(syn::value_list, thread::get_subwarp_id_flat(id); const auto row = idx % rows; const auto col_block = idx / rows; - auto partial = init; + auto partial = identity; auto subgroup = group::tiled_partition( group::this_thread_block(id)); auto ssg_rank = @@ -356,12 +358,10 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_kernel_row_reduction_2d, template -void generic_kernel_col_reduction_2d_small(sycl::handler& cgh, int64 rows, - int64 cols, int64 row_blocks, - KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, - ValueType* result, - MappedKernelArgs... args) +void generic_kernel_col_reduction_2d_small( + sycl::handler& cgh, int64 rows, int64 cols, int64 row_blocks, + KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, + ValueType* result, MappedKernelArgs... 
args) { constexpr auto wg_size = KCFG_1D::decode<0>(static_cast(icfg)); @@ -390,7 +390,7 @@ void generic_kernel_col_reduction_2d_small(sycl::handler& cgh, int64 rows, const auto sg_rank = subgroup.thread_rank(); const auto ssg_rank = sg_rank % ssg_size; const auto col = static_cast(ssg_rank); - auto partial = init; + auto partial = identity; // accumulate within a thread if (col < cols) { for (auto row = ssg_id; row < rows; row += ssg_num) { @@ -409,7 +409,7 @@ void generic_kernel_col_reduction_2d_small(sycl::handler& cgh, int64 rows, workgroup.sync(); // in a single thread: accumulate the results if (local_sg_id == 0) { - partial = init; + partial = identity; // accumulate the partial results within a thread if (shared_storage >= sg_size) { #pragma unroll @@ -439,7 +439,7 @@ template (icfg); constexpr auto wg_size = KCFG_1D::decode<0>(cfg); @@ -461,7 +461,7 @@ void generic_kernel_col_reduction_2d_blocked( const auto col = sg_rank + static_cast(id.get_group(1)) * sg_size; auto block_partial = &block_partial_acc[0][0]; - auto partial = init; + auto partial = identity; // accumulate within a thread if (col < cols) { for (auto row = sg_id; row < rows; row += sg_num) { @@ -472,7 +472,7 @@ void generic_kernel_col_reduction_2d_blocked( workgroup.sync(); // in a single warp: accumulate the results if (id.get_local_id(2) < sg_size) { - partial = init; + partial = identity; // accumulate the partial results within a thread #pragma unroll for (int i = 0; i < wg_size; i += sg_size) { @@ -487,16 +487,14 @@ void generic_kernel_col_reduction_2d_blocked( template -void generic_kernel_reduction_finalize_2d(sycl::handler& cgh, int64 num_results, - int64 num_blocks, ReductionOp op, - FinalizeOp finalize, ValueType init, - const ValueType* input, - int64 result_stride, - ValueType* result) +void generic_kernel_reduction_finalize_2d( + sycl::handler& cgh, int64 num_results, int64 num_blocks, ReductionOp op, + FinalizeOp finalize, ValueType identity, const ValueType* input, + int64 result_stride, ValueType* result) { cgh.parallel_for(sycl::range<1>{static_cast(num_results)}, [=](sycl::id<1> id) { - auto partial = init; + auto partial = identity; for (int64 block = 0; block < num_blocks; block++) { partial = op(partial, input[id[0] + block * num_results]); @@ -513,7 +511,7 @@ void run_generic_col_reduction_small(syn::value_list, std::shared_ptr exec, int64 max_workgroups, KernelFunction fn, ReductionOp op, FinalizeOp finalize, - ValueType init, ValueType* result, + ValueType identity, ValueType* result, dim<2> size, MappedKernelArgs... 
args) { constexpr auto wg_size = @@ -529,7 +527,8 @@ void run_generic_col_reduction_small(syn::value_list, if (row_blocks <= 1) { queue->submit([&](sycl::handler& cgh) { generic_kernel_col_reduction_2d_small( - cgh, rows, cols, 1, fn, op, finalize, init, result, args...); + cgh, rows, cols, 1, fn, op, finalize, identity, result, + args...); }); } else { Array tmp_storage{exec, @@ -537,11 +536,11 @@ void run_generic_col_reduction_small(syn::value_list, queue->submit([&](sycl::handler& cgh) { generic_kernel_col_reduction_2d_small( cgh, rows, cols, row_blocks, fn, op, [](auto v) { return v; }, - init, tmp_storage.get_data(), args...); + identity, tmp_storage.get_data(), args...); }); queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_finalize_2d( - cgh, cols, row_blocks, op, finalize, init, + cgh, cols, row_blocks, op, finalize, identity, tmp_storage.get_const_data(), 1, result); }); } @@ -557,7 +556,7 @@ template , std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, size_type result_stride, dim<2> size, MappedKernelArgs... args) { @@ -580,10 +579,11 @@ void run_kernel_row_reduction_stage1(syn::value_list, static_cast(col_blocks * rows)}; generic_kernel_row_reduction_2d( syn::value_list{}, exec, rows, cols, col_blocks, fn, - op, [](auto v) { return v; }, init, partial.get_data(), 1, args...); + op, [](auto v) { return v; }, identity, partial.get_data(), 1, + args...); queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_finalize_2d( - cgh, rows, col_blocks, op, finalize, init, + cgh, rows, col_blocks, op, finalize, identity, partial.get_const_data(), static_cast(result_stride), result); }); @@ -595,7 +595,7 @@ void run_kernel_row_reduction_stage1(syn::value_list, compiled_ssg_size == sg_size; }, syn::value_list(), syn::type_list<>(), exec, rows, cols, - 1, fn, op, finalize, init, result, + 1, fn, op, finalize, identity, result, static_cast(result_stride), args...); } } @@ -610,7 +610,7 @@ template , std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, MappedKernelArgs... 
args) { @@ -634,7 +634,7 @@ void run_kernel_col_reduction_stage1(syn::value_list, compiled_ssg_size == sg_size; }, syn::value_list(), syn::type_list<>(), exec, max_blocks, - fn, op, finalize, init, result, size, args...); + fn, op, finalize, identity, result, size, args...); } else { const auto col_blocks = ceildiv(cols, sg_size); const auto row_blocks = ceildiv( @@ -644,7 +644,7 @@ void run_kernel_col_reduction_stage1(syn::value_list, if (row_blocks <= 1) { queue->submit([&](sycl::handler& cgh) { generic_kernel_col_reduction_2d_blocked( - cgh, rows, cols, 1, col_blocks, fn, op, finalize, init, + cgh, rows, cols, 1, col_blocks, fn, op, finalize, identity, result, args...); }); } else { @@ -653,12 +653,12 @@ void run_kernel_col_reduction_stage1(syn::value_list, queue->submit([&](sycl::handler& cgh) { generic_kernel_col_reduction_2d_blocked( cgh, rows, cols, row_blocks, col_blocks, fn, op, - [](auto v) { return v; }, init, tmp_storage.get_data(), + [](auto v) { return v; }, identity, tmp_storage.get_data(), args...); }); queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_finalize_2d( - cgh, cols, row_blocks, op, finalize, init, + cgh, cols, row_blocks, op, finalize, identity, tmp_storage.get_const_data(), 1, result); }); } @@ -676,7 +676,7 @@ template void run_kernel_row_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, size_type result_stride, dim<2> size, KernelArgs&&... args) { @@ -688,8 +688,8 @@ void run_kernel_row_reduction(std::shared_ptr exec, select_kernel_row_reduction_stage1( kcfg_1d_list_simple_reduction, [&](int icfg) { return icfg == desired_icfg; }, syn::value_list(), - syn::type_list<>(), exec, fn, op, finalize, init, result, result_stride, - size, map_to_device(args)...); + syn::type_list<>(), exec, fn, op, finalize, identity, result, + result_stride, size, map_to_device(args)...); } @@ -697,7 +697,7 @@ template void run_kernel_col_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, KernelArgs&&... args) { @@ -709,7 +709,7 @@ void run_kernel_col_reduction(std::shared_ptr exec, select_kernel_col_reduction_stage1( kcfg_1d_list_simple_reduction, [&](int icfg) { return icfg == desired_icfg; }, syn::value_list(), - syn::type_list<>(), exec, fn, op, finalize, init, result, size, + syn::type_list<>(), exec, fn, op, finalize, identity, result, size, map_to_device(args)...); } diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp index 47b33f411ac..40e4268dccb 100644 --- a/hip/base/kernel_launch_reduction.hip.hpp +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -56,7 +56,7 @@ __global__ __launch_bounds__( KernelFunction fn, ReductionOp op, FinalizeOp finalize, - ValueType init, + ValueType identity, ValueType* storage, KernelArgs... args) { @@ -69,7 +69,7 @@ __global__ __launch_bounds__( auto grid_size = thread::get_thread_num_flat(); auto warp = group::tiled_partition(group::this_thread_block()); - auto partial = init; + auto partial = identity; for (int64 i = tidx; i < size; i += grid_size) { partial = op(partial, fn(i, args...)); } @@ -82,7 +82,7 @@ __global__ __launch_bounds__( partial = reduce(warp, threadIdx.x < default_block_size / config::warp_size ? 
warp_partial[threadIdx.x] - : init, + : identity, op); if (threadIdx.x == 0) { storage[blockIdx.x] = finalize(partial); @@ -98,7 +98,7 @@ __global__ __launch_bounds__( KernelFunction fn, ReductionOp op, FinalizeOp finalize, - ValueType init, + ValueType identity, ValueType* storage, KernelArgs... args) { @@ -111,7 +111,7 @@ __global__ __launch_bounds__( auto grid_size = thread::get_thread_num_flat(); auto warp = group::tiled_partition(group::this_thread_block()); - auto partial = init; + auto partial = identity; for (int64 i = tidx; i < rows * cols; i += grid_size) { const auto row = i / cols; const auto col = i % cols; @@ -126,7 +126,7 @@ __global__ __launch_bounds__( partial = reduce(warp, threadIdx.x < default_block_size / config::warp_size ? warp_partial[threadIdx.x] - : init, + : identity, op); if (threadIdx.x == 0) { storage[blockIdx.x] = finalize(partial); @@ -139,7 +139,7 @@ template void run_kernel_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, size_type size, KernelArgs&&... args) { @@ -153,18 +153,18 @@ void run_kernel_reduction(std::shared_ptr exec, hipLaunchKernelGGL( generic_kernel_reduction_1d, num_blocks, block_size, 0, 0, static_cast(size), fn, op, - [] __device__(auto v) { return v; }, as_hip_type(init), + [] __device__(auto v) { return v; }, as_hip_type(identity), as_hip_type(partial.get_data()), map_to_device(args)...); hipLaunchKernelGGL( generic_kernel_reduction_1d, 1, block_size, 0, 0, static_cast(num_blocks), [] __device__(auto i, auto v) { return v[i]; }, op, finalize, - as_hip_type(init), as_hip_type(result), + as_hip_type(identity), as_hip_type(result), as_hip_type(partial.get_const_data())); } else { hipLaunchKernelGGL(generic_kernel_reduction_1d, 1, block_size, 0, 0, static_cast(size), fn, op, finalize, - as_hip_type(init), as_hip_type(result), + as_hip_type(identity), as_hip_type(result), map_to_device(args)...); } } @@ -174,7 +174,7 @@ template void run_kernel_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, KernelArgs&&... 
args) { constexpr int oversubscription = 16; @@ -190,17 +190,17 @@ void run_kernel_reduction(std::shared_ptr exec, hipLaunchKernelGGL( generic_kernel_reduction_2d, num_blocks, block_size, 0, 0, rows, cols, fn, op, [] __device__(auto v) { return v; }, - as_hip_type(init), as_hip_type(partial.get_data()), + as_hip_type(identity), as_hip_type(partial.get_data()), map_to_device(args)...); hipLaunchKernelGGL( generic_kernel_reduction_1d, 1, block_size, 0, 0, static_cast(num_blocks), [] __device__(auto i, auto v) { return v[i]; }, op, finalize, - as_hip_type(init), as_hip_type(result), + as_hip_type(identity), as_hip_type(result), as_hip_type(partial.get_const_data())); } else { hipLaunchKernelGGL(generic_kernel_reduction_2d, 1, block_size, 0, 0, - rows, cols, fn, op, finalize, as_hip_type(init), + rows, cols, fn, op, finalize, as_hip_type(identity), as_hip_type(result), map_to_device(args)...); } } @@ -211,8 +211,8 @@ template (); const auto row = idx % rows; @@ -226,7 +226,7 @@ __global__ const auto end = min(begin + cols_per_part, cols); auto subwarp = group::tiled_partition(group::this_thread_block()); - auto partial = init; + auto partial = identity; for (auto col = begin + subwarp.thread_rank(); col < end; col += subwarp_size) { partial = op(partial, fn(row, col, args...)); @@ -243,7 +243,7 @@ template (subwarp_rank); - auto partial = init; + auto partial = identity; // accumulate within a thread if (col < cols) { for (auto row = subwarp_id; row < rows; row += subwarp_num) { @@ -279,7 +279,7 @@ __global__ block.sync(); // in a single thread: accumulate the results if (local_warp_id == 0) { - partial = init; + partial = identity; // accumulate the partial results within a thread if (shared_storage >= warp_size) { #pragma unroll @@ -306,7 +306,7 @@ template (block); const auto warp_rank = warp.thread_rank(); const auto col = warp_rank + static_cast(blockIdx.y) * warp_size; - auto partial = init; + auto partial = identity; // accumulate within a thread if (col < cols) { for (auto row = warp_id; row < rows; row += warp_num) { @@ -328,7 +328,7 @@ __global__ block.sync(); // in a single warp: accumulate the results if (threadIdx.x < warp_size) { - partial = init; + partial = identity; // accumulate the partial results within a thread #pragma unroll for (int i = 0; i < default_block_size; i += warp_size) { @@ -345,14 +345,14 @@ template __global__ __launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( int64 num_results, int64 num_blocks, ReductionOp op, - FinalizeOp finalize, ValueType init, const ValueType* input, + FinalizeOp finalize, ValueType identity, const ValueType* input, int64 result_stride, ValueType* result) { const auto idx = thread::get_thread_id_flat(); if (idx >= num_results) { return; } - auto partial = init; + auto partial = identity; for (int64 block = 0; block < num_blocks; block++) { partial = op(partial, input[idx + block * num_results]); } @@ -368,7 +368,7 @@ template , int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, int64 result_stride, KernelArgs... 
args) { @@ -377,7 +377,7 @@ void run_generic_kernel_row_reduction(syn::value_list, hipLaunchKernelGGL( HIP_KERNEL_NAME(generic_kernel_row_reduction_2d), num_blocks, default_block_size, 0, 0, rows, cols, col_blocks, fn, op, - finalize, as_hip_type(init), as_hip_type(result), result_stride, + finalize, as_hip_type(identity), as_hip_type(result), result_stride, args...); } @@ -392,7 +392,7 @@ void run_generic_col_reduction_small(syn::value_list, int64 max_blocks, std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, MappedKernelArgs... args) { @@ -405,7 +405,7 @@ void run_generic_col_reduction_small(syn::value_list, HIP_KERNEL_NAME( generic_kernel_col_reduction_2d_small), 1, default_block_size, 0, 0, rows, cols, fn, op, finalize, - as_hip_type(init), as_hip_type(result), args...); + as_hip_type(identity), as_hip_type(result), args...); } else { Array tmp_storage{exec, static_cast(num_blocks * cols)}; @@ -413,12 +413,12 @@ void run_generic_col_reduction_small(syn::value_list, HIP_KERNEL_NAME( generic_kernel_col_reduction_2d_small), num_blocks, default_block_size, 0, 0, rows, cols, fn, op, - [] __device__(auto v) { return v; }, as_hip_type(init), + [] __device__(auto v) { return v; }, as_hip_type(identity), as_hip_type(tmp_storage.get_data()), args...); hipLaunchKernelGGL( generic_kernel_reduction_finalize_2d, ceildiv(cols, default_block_size), default_block_size, 0, 0, cols, - num_blocks, op, finalize, as_hip_type(init), + num_blocks, op, finalize, as_hip_type(identity), as_hip_type(tmp_storage.get_const_data()), 1, as_hip_type(result)); } } @@ -434,7 +434,7 @@ template void run_kernel_row_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, size_type result_stride, dim<2> size, KernelArgs&&... args) { @@ -455,13 +455,13 @@ void run_kernel_row_reduction(std::shared_ptr exec, hipLaunchKernelGGL( HIP_KERNEL_NAME(generic_kernel_row_reduction_2d), num_blocks, default_block_size, 0, 0, rows, cols, col_blocks, fn, - op, [] __device__(auto v) { return v; }, as_hip_type(init), + op, [] __device__(auto v) { return v; }, as_hip_type(identity), as_hip_type(partial.get_data()), 1, map_to_device(args)...); const auto num_finalize_blocks = ceildiv(rows, default_block_size); hipLaunchKernelGGL( generic_kernel_reduction_finalize_2d, num_finalize_blocks, default_block_size, 0, 0, rows, col_blocks, op, finalize, - as_hip_type(init), as_hip_type(partial.get_const_data()), + as_hip_type(identity), as_hip_type(partial.get_const_data()), static_cast(result_stride), as_hip_type(result)); } else { select_run_generic_kernel_row_reduction( @@ -471,7 +471,7 @@ void run_kernel_row_reduction(std::shared_ptr exec, compiled_subwarp_size == config::warp_size; }, syn::value_list(), syn::type_list<>(), rows, cols, 1, fn, op, - finalize, init, result, static_cast(result_stride), + finalize, identity, result, static_cast(result_stride), map_to_device(args)...); } } @@ -481,7 +481,7 @@ template void run_kernel_col_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, KernelArgs&&... 
args) { @@ -501,7 +501,7 @@ void run_kernel_col_reduction(std::shared_ptr exec, compiled_subwarp_size == config::warp_size; }, syn::value_list(), syn::type_list<>(), max_blocks, exec, fn, - op, finalize, init, result, size, map_to_device(args)...); + op, finalize, identity, result, size, map_to_device(args)...); } else { const auto col_blocks = ceildiv(cols, config::warp_size); const auto row_blocks = @@ -512,8 +512,9 @@ void run_kernel_col_reduction(std::shared_ptr exec, if (row_blocks <= 1) { hipLaunchKernelGGL(generic_kernel_col_reduction_2d_blocked, dim3(1, col_blocks), default_block_size, 0, 0, - rows, cols, fn, op, finalize, as_hip_type(init), - as_hip_type(result), map_to_device(args)...); + rows, cols, fn, op, finalize, + as_hip_type(identity), as_hip_type(result), + map_to_device(args)...); } else { Array tmp_storage{ exec, static_cast(row_blocks * cols)}; @@ -521,12 +522,12 @@ void run_kernel_col_reduction(std::shared_ptr exec, generic_kernel_col_reduction_2d_blocked, dim3(row_blocks, col_blocks), default_block_size, 0, 0, rows, cols, fn, op, [] __device__(auto v) { return v; }, - as_hip_type(init), as_hip_type(tmp_storage.get_data()), + as_hip_type(identity), as_hip_type(tmp_storage.get_data()), map_to_device(args)...); hipLaunchKernelGGL(generic_kernel_reduction_finalize_2d, ceildiv(cols, default_block_size), default_block_size, 0, 0, cols, row_blocks, op, - finalize, as_hip_type(init), + finalize, as_hip_type(identity), as_hip_type(tmp_storage.get_const_data()), 1, as_hip_type(result)); } diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index 0c5acf0ebe2..4f9e8267633 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -58,7 +58,7 @@ template void run_kernel_reduction_impl(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, size_type size, MappedKernelArgs... args) { @@ -72,7 +72,7 @@ void run_kernel_reduction_impl(std::shared_ptr exec, const auto begin = thread_id * work_per_thread; const auto end = std::min(ssize, begin + work_per_thread); - auto local_partial = init; + auto local_partial = identity; for (auto i = begin; i < end; i++) { local_partial = op(local_partial, fn(i, map_to_device(args)...)); } @@ -80,7 +80,7 @@ void run_kernel_reduction_impl(std::shared_ptr exec, } *result = finalize(std::accumulate(partial.get_const_data(), partial.get_const_data() + num_threads, - init, op)); + identity, op)); } @@ -90,7 +90,7 @@ template , std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, MappedKernelArgs... 
args) { @@ -108,7 +108,7 @@ void run_kernel_reduction_sized_impl(syn::value_list, const auto begin = thread_id * work_per_thread; const auto end = std::min(rows, begin + work_per_thread); - auto local_partial = init; + auto local_partial = identity; if (rounded_cols == 0 || cols == block_size) { // we group all sizes <= block_size here and unroll explicitly constexpr auto local_cols = @@ -142,7 +142,7 @@ void run_kernel_reduction_sized_impl(syn::value_list, } *result = finalize(std::accumulate(partial.get_const_data(), partial.get_const_data() + num_threads, - init, op)); + identity, op)); } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_sized, @@ -156,11 +156,11 @@ template void run_kernel_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, size_type size, KernelArgs&&... args) { - run_kernel_reduction_impl(exec, fn, op, finalize, init, result, size, + run_kernel_reduction_impl(exec, fn, op, finalize, identity, result, size, map_to_device(args)...); } @@ -169,7 +169,7 @@ template void run_kernel_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, KernelArgs&&... args) { const auto cols = static_cast(size[1]); @@ -177,14 +177,14 @@ void run_kernel_reduction(std::shared_ptr exec, using remainders = syn::as_list>; if (cols <= 0) { - *result = init; + *result = identity; return; } select_run_kernel_reduction_sized( remainders(), [&](int remainder) { return remainder == cols % block_size; }, syn::value_list(), syn::type_list<>(), exec, fn, op, - finalize, init, result, size, map_to_device(args)...); + finalize, identity, result, size, map_to_device(args)...); } @@ -195,7 +195,7 @@ template void run_kernel_row_reduction_impl(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, size_type result_stride, dim<2> size, MappedKernelArgs... args) { @@ -212,7 +212,7 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, #pragma omp parallel for for (int64 row = 0; row < rows; row++) { [&]() { - auto partial = init; + auto partial = identity; for (int64 col = 0; col < cols; col++) { partial = op(partial, fn(row, col, args...)); } @@ -230,7 +230,7 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, const auto begin = thread_id * work_per_thread; const auto end = std::min(begin + work_per_thread, cols); for (int64 row = 0; row < rows; row++) { - auto local_partial = init; + auto local_partial = identity; for (int64 col = begin; col < end; col++) { local_partial = op(local_partial, [&]() { return fn(row, col, args...); @@ -244,7 +244,7 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, #pragma omp parallel for for (int64 row = 0; row < rows; row++) { [&] { - auto local_partial = init; + auto local_partial = identity; for (int64 thread_id = 0; thread_id < num_threads; thread_id++) { local_partial = op( @@ -263,12 +263,12 @@ template void run_kernel_col_reduction_sized_block_impl( - KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType init, + KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, int64 row_begin, int64 row_end, int64 base_col, MappedKernelArgs... 
args) { std::array partial; - partial.fill(init); + partial.fill(identity); for (auto row = row_begin; row < row_end; row++) { #pragma unroll for (int64 rel_col = 0; rel_col < local_cols; rel_col++) { @@ -289,7 +289,7 @@ template , std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, ValueType* result, dim<2> size, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, MappedKernelArgs... args) { const auto rows = static_cast(size[0]); @@ -306,10 +306,12 @@ void run_kernel_col_reduction_sized_impl( const auto base_col = col_block * block_size; if (base_col + block_size <= cols) { run_kernel_col_reduction_sized_block_impl( - fn, op, finalize, init, result, 0, rows, base_col, args...); + fn, op, finalize, identity, result, 0, rows, base_col, + args...); } else { run_kernel_col_reduction_sized_block_impl( - fn, op, finalize, init, result, 0, rows, base_col, args...); + fn, op, finalize, identity, result, 0, rows, base_col, + args...); } } } else { @@ -329,12 +331,12 @@ void run_kernel_col_reduction_sized_impl( const auto identity = [](auto i) { return i; }; if (base_col + block_size <= cols) { run_kernel_col_reduction_sized_block_impl( - fn, op, identity, init, + fn, op, identity, identity, partial.get_data() + cols * row_block, begin, end, base_col, args...); } else { run_kernel_col_reduction_sized_block_impl( - fn, op, identity, init, + fn, op, identity, identity, partial.get_data() + cols * row_block, begin, end, base_col, args...); } @@ -342,7 +344,7 @@ void run_kernel_col_reduction_sized_impl( #pragma omp parallel for for (int64 col = 0; col < cols; col++) { [&] { - auto total = init; + auto total = identity; for (int64 row_block = 0; row_block < reduction_size; row_block++) { total = @@ -366,11 +368,11 @@ template void run_kernel_row_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, size_type result_stride, dim<2> size, MappedKernelArgs... args) { - run_kernel_row_reduction_impl(exec, fn, op, finalize, init, result, + run_kernel_row_reduction_impl(exec, fn, op, finalize, identity, result, result_stride, size, map_to_device(args)...); } @@ -379,7 +381,7 @@ template void run_kernel_col_reduction(std::shared_ptr exec, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType init, + FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, KernelArgs&&... 
args) { @@ -394,7 +396,7 @@ void run_kernel_col_reduction(std::shared_ptr exec, remainders(), [&](int remainder) { return remainder == cols % block_size; }, syn::value_list(), syn::type_list<>(), exec, fn, op, - finalize, init, result, size, map_to_device(args)...); + finalize, identity, result, size, map_to_device(args)...); } From d076d5fa8aa7b84574130c4175f910399df1295e Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 5 Oct 2021 14:13:32 +0200 Subject: [PATCH 22/25] simplify DPCPP kernel selection --- dpcpp/base/kernel_launch_reduction.dp.hpp | 148 ++++++++++------------ 1 file changed, 67 insertions(+), 81 deletions(-) diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp index 4b29a5af55e..c4b8d32642a 100644 --- a/dpcpp/base/kernel_launch_reduction.dp.hpp +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -55,12 +55,10 @@ namespace dpcpp { using KCFG_1D = ConfigSet<11, 7>; constexpr auto kcfg_1d_list_simple_reduction = - syn::value_list(KCFG_1D::encode(512, 64)), - static_cast(KCFG_1D::encode(512, 32)), - static_cast(KCFG_1D::encode(512, 16)), - static_cast(KCFG_1D::encode(256, 32)), - static_cast(KCFG_1D::encode(256, 16)), - static_cast(KCFG_1D::encode(256, 8))>(); + syn::value_list(); template -void run_kernel_reduction_impl(syn::value_list, - std::shared_ptr exec, +void run_kernel_reduction_impl(std::shared_ptr exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, size_type size, MappedKernelArgs... args) { - constexpr auto cfg = static_cast(icfg); constexpr int oversubscription = 4; constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); @@ -205,17 +201,15 @@ void run_kernel_reduction_impl(syn::value_list, } -template -void run_kernel_reduction_impl(syn::value_list, - std::shared_ptr exec, +void run_kernel_reduction_impl(std::shared_ptr exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, MappedKernelArgs... args) { - constexpr auto cfg = static_cast(icfg); constexpr int oversubscription = 4; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); @@ -249,8 +243,8 @@ void run_kernel_reduction_impl(syn::value_list, } } -GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction, - run_kernel_reduction_impl) +GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(select_run_kernel_reduction, + run_kernel_reduction_impl) template exec, FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, KernelArgs&&... args) { - const auto desired_icfg = static_cast(get_first_cfg( + const auto desired_cfg = get_first_cfg( as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); - })); + }); select_run_kernel_reduction( kcfg_1d_list_simple_reduction, - [&](int icfg) { return icfg == desired_icfg; }, syn::value_list(), - syn::type_list<>(), exec, fn, op, finalize, identity, result, size, - map_to_device(args)...); + [&](std::uint32_t cfg) { return cfg == desired_cfg; }, + syn::value_list(), syn::value_list(), + syn::value_list(), syn::type_list<>(), exec, fn, op, + finalize, identity, result, size, map_to_device(args)...); } @@ -281,24 +276,25 @@ void run_kernel_reduction(std::shared_ptr exec, ValueType* result, size_type size, KernelArgs&&... 
                           args)
 {
-    const auto desired_icfg = static_cast(get_first_cfg(
+    const auto desired_cfg = get_first_cfg(
         as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) {
             return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg),
                             KCFG_1D::decode<1>(cfg));
-        }));
+        });
     select_run_kernel_reduction(
         kcfg_1d_list_simple_reduction,
-        [&](int icfg) { return icfg == desired_icfg; }, syn::value_list(),
-        syn::type_list<>(), exec, fn, op, finalize, identity, result, size,
-        map_to_device(args)...);
+        [&](std::uint32_t cfg) { return cfg == desired_cfg; },
+        syn::value_list(), syn::value_list(),
+        syn::value_list(), syn::type_list<>(), exec, fn, op,
+        finalize, identity, result, size, map_to_device(args)...);
 }

 namespace {

-template
 void generic_kernel_row_reduction_2d(syn::value_list,
                                      std::shared_ptr exec,
@@ -308,10 +304,8 @@ void generic_kernel_row_reduction_2d(syn::value_list,
                                      ValueType* result, int64 result_stride,
                                      MappedKernelArgs... args)
 {
-    constexpr auto wg_size =
-        KCFG_1D::decode<0>(static_cast(icfg));
-    constexpr auto sg_size =
-        KCFG_1D::decode<1>(static_cast(icfg));
+    constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
+    constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
     static_assert(ssg_size <= sg_size, "ssg must be smaller than sg");
     const auto num_workgroups = ceildiv(rows * col_blocks * ssg_size, wg_size);
     const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size));
@@ -355,18 +349,16 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_kernel_row_reduction_2d,
                                     generic_kernel_row_reduction_2d);

-template
 void generic_kernel_col_reduction_2d_small(
     sycl::handler& cgh, int64 rows, int64 cols, int64 row_blocks,
     KernelFunction fn, ReductionOp op, FinalizeOp finalize,
     ValueType identity, ValueType* result, MappedKernelArgs... args)
 {
-    constexpr auto wg_size =
-        KCFG_1D::decode<0>(static_cast(icfg));
-    constexpr auto sg_size =
-        KCFG_1D::decode<1>(static_cast(icfg));
+    constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
+    constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
     static_assert(ssg_size <= sg_size, "ssg must be smaller than sg");
     constexpr auto subgroups_per_workgroup = wg_size / sg_size;
     // stores the subwarp_size partial sums from each warp, grouped by warp
@@ -433,7 +425,7 @@ void generic_kernel_col_reduction_2d_small(
 }

-template
 void generic_kernel_col_reduction_2d_blocked(
@@ -441,7 +433,6 @@ void generic_kernel_col_reduction_2d_blocked(
     int64 col_blocks, KernelFunction fn, ReductionOp op, FinalizeOp finalize,
     ValueType identity, ValueType* result, MappedKernelArgs... args)
 {
-    constexpr auto cfg = static_cast(icfg);
     constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
     constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
     const auto range =
@@ -504,8 +495,8 @@ void generic_kernel_reduction_finalize_2d(
 }

-template
 void run_generic_col_reduction_small(syn::value_list,
                                      std::shared_ptr exec,
@@ -514,10 +505,8 @@ void run_generic_col_reduction_small(syn::value_list,
                                      ValueType identity, ValueType* result,
                                      dim<2> size, MappedKernelArgs... args)
 {
-    constexpr auto wg_size =
-        KCFG_1D::decode<0>(static_cast(icfg));
-    constexpr auto sg_size =
-        KCFG_1D::decode<1>(static_cast(icfg));
+    constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
+    constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
     static_assert(ssg_size <= sg_size, "ssg must be smaller than sg");
     const auto rows = static_cast<int64>(size[0]);
     const auto cols = static_cast<int64>(size[1]);
@@ -526,7 +515,7 @@ void run_generic_col_reduction_small(syn::value_list,
     auto queue = exec->get_queue();
     if (row_blocks <= 1) {
         queue->submit([&](sycl::handler& cgh) {
-            generic_kernel_col_reduction_2d_small(
+            generic_kernel_col_reduction_2d_small(
                 cgh, rows, cols, 1, fn, op, finalize, identity, result,
                 args...);
         });
     } else {
         Array<ValueType> tmp_storage{
             exec, static_cast<size_type>(row_blocks * cols)};
         queue->submit([&](sycl::handler& cgh) {
-            generic_kernel_col_reduction_2d_small(
+            generic_kernel_col_reduction_2d_small(
                 cgh, rows, cols, row_blocks, fn, op, [](auto v) { return v; },
                 identity, tmp_storage.get_data(), args...);
         });
@@ -550,20 +539,17 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_col_reduction_small,
                                     run_generic_col_reduction_small);

-template
-void run_kernel_row_reduction_stage1(syn::value_list,
-                                     std::shared_ptr exec,
+void run_kernel_row_reduction_stage1(std::shared_ptr exec,
                                      KernelFunction fn, ReductionOp op,
                                      FinalizeOp finalize, ValueType identity,
                                      ValueType* result, size_type result_stride,
                                      dim<2> size, MappedKernelArgs... args)
 {
-    constexpr auto wg_size =
-        KCFG_1D::decode<0>(static_cast(icfg));
-    constexpr auto sg_size =
-        KCFG_1D::decode<1>(static_cast(icfg));
+    constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
+    constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
     using subsubgroup_sizes =
         syn::value_list(16, sg_size),
                         std::min(32, sg_size), sg_size>;
@@ -577,7 +563,7 @@ void run_kernel_row_reduction_stage1(syn::value_list,
         const auto col_blocks = ceildiv(rows * cols, resources);
         Array<ValueType> partial{
             exec, static_cast<size_type>(col_blocks * rows)};
-        generic_kernel_row_reduction_2d(
+        generic_kernel_row_reduction_2d(
             syn::value_list{}, exec, rows, cols, col_blocks, fn, op,
             [](auto v) { return v; }, identity, partial.get_data(), 1,
             args...);
@@ -594,30 +580,27 @@ void run_kernel_row_reduction_stage1(syn::value_list,
                 return compiled_ssg_size >= cols ||
                        compiled_ssg_size == sg_size;
             },
-            syn::value_list(), syn::type_list<>(), exec, rows, cols,
+            syn::value_list(), syn::type_list<>(), exec, rows, cols,
             1, fn, op, finalize, identity, result,
             static_cast<int64>(result_stride), args...);
     }
 }

-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_kernel_row_reduction_stage1,
-                                    run_kernel_row_reduction_stage1);
+GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(select_kernel_row_reduction_stage1,
+                                           run_kernel_row_reduction_stage1);

-template
-void run_kernel_col_reduction_stage1(syn::value_list,
-                                     std::shared_ptr exec,
+void run_kernel_col_reduction_stage1(std::shared_ptr exec,
                                      KernelFunction fn, ReductionOp op,
                                      FinalizeOp finalize, ValueType identity,
                                      ValueType* result, dim<2> size,
                                      MappedKernelArgs...
                                      args)
 {
-    constexpr auto wg_size =
-        KCFG_1D::decode<0>(static_cast(icfg));
-    constexpr auto sg_size =
-        KCFG_1D::decode<1>(static_cast(icfg));
+    constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
+    constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
     using subsubgroup_sizes =
         syn::value_list(16, sg_size),
                         std::min(32, sg_size), sg_size>;
@@ -633,7 +616,7 @@ void run_kernel_col_reduction_stage1(syn::value_list,
                 return compiled_ssg_size >= cols ||
                        compiled_ssg_size == sg_size;
             },
-            syn::value_list(), syn::type_list<>(), exec, max_blocks,
+            syn::value_list(), syn::type_list<>(), exec, max_blocks,
             fn, op, finalize, identity, result, size, args...);
     } else {
         const auto col_blocks = ceildiv(cols, sg_size);
@@ -643,7 +626,7 @@ void run_kernel_col_reduction_stage1(syn::value_list,
         auto queue = exec->get_queue();
         if (row_blocks <= 1) {
             queue->submit([&](sycl::handler& cgh) {
-                generic_kernel_col_reduction_2d_blocked(
+                generic_kernel_col_reduction_2d_blocked(
                     cgh, rows, cols, 1, col_blocks, fn, op, finalize,
                     identity, result, args...);
             });
         } else {
             Array<ValueType> tmp_storage{
                 exec, static_cast<size_type>(row_blocks * cols)};
             queue->submit([&](sycl::handler& cgh) {
-                generic_kernel_col_reduction_2d_blocked(
+                generic_kernel_col_reduction_2d_blocked(
                     cgh, rows, cols, row_blocks, col_blocks, fn, op,
                     [](auto v) { return v; }, identity,
                     tmp_storage.get_data(), args...);
@@ -665,8 +648,8 @@ void run_kernel_col_reduction_stage1(syn::value_list,
         }
     }
 }

-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_kernel_col_reduction_stage1,
-                                    run_kernel_col_reduction_stage1);
+GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(select_kernel_col_reduction_stage1,
+                                           run_kernel_col_reduction_stage1);

 }  // namespace

@@ -680,16 +663,18 @@ void run_kernel_row_reduction(std::shared_ptr exec,
                               ValueType* result, size_type result_stride,
                               dim<2> size, KernelArgs&&... args)
 {
-    const auto desired_icfg = static_cast(get_first_cfg(
+    const auto desired_cfg = get_first_cfg(
         as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) {
             return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg),
                             KCFG_1D::decode<1>(cfg));
-        }));
+        });
     select_kernel_row_reduction_stage1(
         kcfg_1d_list_simple_reduction,
-        [&](int icfg) { return icfg == desired_icfg; }, syn::value_list(),
-        syn::type_list<>(), exec, fn, op, finalize, identity, result,
-        result_stride, size, map_to_device(args)...);
+        [&](std::uint32_t cfg) { return cfg == desired_cfg; },
+        syn::value_list(), syn::value_list(),
+        syn::value_list(), syn::type_list<>(), exec, fn, op,
+        finalize, identity, result, result_stride, size,
+        map_to_device(args)...);
 }

@@ -701,16 +686,17 @@ void run_kernel_col_reduction(std::shared_ptr exec,
                               ValueType* result, dim<2> size,
                               KernelArgs&&... args)
 {
-    const auto desired_icfg = static_cast(get_first_cfg(
+    const auto desired_cfg = get_first_cfg(
         as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) {
             return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg),
                             KCFG_1D::decode<1>(cfg));
-        }));
+        });
     select_kernel_col_reduction_stage1(
         kcfg_1d_list_simple_reduction,
-        [&](int icfg) { return icfg == desired_icfg; }, syn::value_list(),
-        syn::type_list<>(), exec, fn, op, finalize, identity, result, size,
-        map_to_device(args)...);
+        [&](std::uint32_t cfg) { return cfg == desired_cfg; },
+        syn::value_list(), syn::value_list(),
+        syn::value_list(), syn::type_list<>(), exec, fn, op,
+        finalize, identity, result, size, map_to_device(args)...);
 }

From ceaa76b7b45234c3a19928a185f249972ce93fc2 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Tue, 5 Oct 2021 14:36:12 +0200
Subject: [PATCH 23/25] simplify dpcpp local memory usage

---
 dpcpp/base/kernel_launch_reduction.dp.hpp | 24 +++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp
index c4b8d32642a..5ebf06b0f71 100644
--- a/dpcpp/base/kernel_launch_reduction.dp.hpp
+++ b/dpcpp/base/kernel_launch_reduction.dp.hpp
@@ -73,16 +73,16 @@ void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size,
     constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
     constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
     constexpr auto num_partials = wg_size / sg_size;
-    sycl::accessor, 1,
+    sycl::accessor, 0,
                    sycl::access_mode::read_write, sycl::access::target::local>
-        subgroup_partial_acc(sycl::range<1>{1}, cgh);
+        subgroup_partial_acc(cgh);
     const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size));
     const auto global_size = num_workgroups * wg_size;

     cgh.parallel_for(
         range,
         [=](sycl::nd_item<3> idx) [[intel::reqd_sub_group_size(sg_size)]] {
-            auto subgroup_partial = &subgroup_partial_acc[0][0];
+            auto subgroup_partial = &(*subgroup_partial_acc.get_pointer())[0];
             const auto tidx = thread::get_thread_id_flat(idx);
             const auto local_tidx = static_cast<int64>(tidx % wg_size);
             auto subgroup =
@@ -122,16 +122,16 @@ void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols,
     constexpr auto wg_size = KCFG_1D::decode<0>(cfg);
     constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
     constexpr auto num_partials = wg_size / sg_size;
-    sycl::accessor, 1,
+    sycl::accessor, 0,
                    sycl::access_mode::read_write, sycl::access::target::local>
-        subgroup_partial_acc(sycl::range<1>{1}, cgh);
+        subgroup_partial_acc(cgh);
     const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size));
     const auto global_size = num_workgroups * wg_size;

     cgh.parallel_for(
         range,
         [=](sycl::nd_item<3> idx) [[intel::reqd_sub_group_size(sg_size)]] {
-            auto subgroup_partial = &subgroup_partial_acc[0][0];
+            auto subgroup_partial = &(*subgroup_partial_acc.get_pointer())[0];
             const auto tidx = thread::get_thread_id_flat(idx);
             const auto local_tidx = static_cast<int64>(tidx % wg_size);
             auto subgroup =
@@ -363,14 +363,14 @@ void generic_kernel_col_reduction_2d_small(
     constexpr auto subgroups_per_workgroup = wg_size / sg_size;
     // stores the subwarp_size partial sums from each warp, grouped by warp
     constexpr auto shared_storage = subgroups_per_workgroup * ssg_size;
-    sycl::accessor, 1,
+    sycl::accessor, 0,
                    sycl::access_mode::read_write, sycl::access::target::local>
-        block_partial_acc(sycl::range<1>{1}, cgh);
+        block_partial_acc(cgh);
     const auto range = sycl_nd_range(dim3(row_blocks), dim3(wg_size));
     cgh.parallel_for(
         range,
         [=](sycl::nd_item<3> id) [[intel::reqd_sub_group_size(sg_size)]] {
-            auto block_partial = &block_partial_acc[0][0];
+            auto block_partial = &(*block_partial_acc.get_pointer())[0];
             const auto ssg_id = thread::get_subwarp_id_flat(id);
             const auto local_sg_id = id.get_local_id(2) / sg_size;
@@ -437,9 +437,9 @@ void generic_kernel_col_reduction_2d_blocked(
     constexpr auto sg_size = KCFG_1D::decode<1>(cfg);
     const auto range =
         sycl_nd_range(dim3(row_blocks, col_blocks), dim3(wg_size));
-    sycl::accessor, 1,
+    sycl::accessor, 0,
                    sycl::access_mode::read_write, sycl::access::target::local>
-        block_partial_acc(sycl::range<1>{1}, cgh);
+        block_partial_acc(cgh);
     cgh.parallel_for(
         range,
         [=](sycl::nd_item<3> id) [[intel::reqd_sub_group_size(sg_size)]] {
@@ -451,7 +451,7 @@ void generic_kernel_col_reduction_2d_blocked(
             const auto sg_rank = subgroup.thread_rank();
             const auto col =
                 sg_rank + static_cast<int64>(id.get_group(1)) * sg_size;
-            auto block_partial = &block_partial_acc[0][0];
+            auto block_partial = &(*block_partial_acc.get_pointer())[0];
             auto partial = identity;
             // accumulate within a thread
             if (col < cols) {

From 817d1d2d24de3926d46c18a5ff425c5173d0b0f3 Mon Sep 17 00:00:00 2001
From: ginkgo-bot
Date: Tue, 5 Oct 2021 12:43:55 +0000
Subject: [PATCH 24/25] Format files

Co-authored-by: Tobias Ribizel
---
 cuda/test/base/kernel_launch.cu | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/cuda/test/base/kernel_launch.cu b/cuda/test/base/kernel_launch.cu
index 66fc3d9e94d..e2f6583f930 100644
--- a/cuda/test/base/kernel_launch.cu
+++ b/cuda/test/base/kernel_launch.cu
@@ -285,7 +285,7 @@ void run1d_reduction(std::shared_ptr exec)
         exec,
         [] GKO_KERNEL(auto i, auto a) {
             static_assert(is_same::value, "index");
-            static_assert(is_same::value, "value");
+            static_assert(is_same::value, "value");
             return i + 1;
         },
         [] GKO_KERNEL(auto i, auto j) { return i + j; },
@@ -299,7 +299,7 @@ void run1d_reduction(std::shared_ptr exec)
         exec,
         [] GKO_KERNEL(auto i, auto a) {
             static_assert(is_same::value, "index");
-            static_assert(is_same::value, "value");
+            static_assert(is_same::value, "value");
             return i + 1;
         },
         [] GKO_KERNEL(auto i, auto j) {
@@ -329,7 +329,7 @@ void run2d_reduction(std::shared_ptr exec)
         [] GKO_KERNEL(auto i, auto j, auto a) {
             static_assert(is_same::value, "index");
             static_assert(is_same::value, "index");
-            static_assert(is_same::value, "value");
+            static_assert(is_same::value, "value");
             return (i + 1) * (j + 1);
         },
         [] GKO_KERNEL(auto i, auto j) {
@@ -351,7 +351,7 @@ void run2d_reduction(std::shared_ptr exec)
         [] GKO_KERNEL(auto i, auto j, auto a) {
             static_assert(is_same::value, "index");
             static_assert(is_same::value, "index");
-            static_assert(is_same::value, "value");
+            static_assert(is_same::value, "value");
             return (i + 1) * (j + 1);
         },
         [] GKO_KERNEL(auto i, auto j) {
@@ -394,8 +394,7 @@ void run2d_row_reduction(std::shared_ptr exec)
         [] GKO_KERNEL(auto i, auto j, auto a) {
             static_assert(is_same::value, "index");
             static_assert(is_same::value, "index");
-            static_assert(is_same::value,
-                          "value");
+            static_assert(is_same::value, "value");
             return (i + 1) * (j + 1);
         },
         [] GKO_KERNEL(auto i, auto j) {
@@ -444,8 +443,7 @@ void run2d_col_reduction(std::shared_ptr exec)
         [] GKO_KERNEL(auto i, auto j, auto a) {
             static_assert(is_same::value, "index");
             static_assert(is_same::value, "index");
-            static_assert(is_same::value,
-                          "value");
+            static_assert(is_same::value, "value");
             return (i + 1) * (j + 1);
         },
         [] GKO_KERNEL(auto i, auto j) {

From dfee616e12102434e6b293d5efcd0ac2278a0b51 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Wed, 6 Oct 2021 08:15:03 +0200
Subject: [PATCH 25/25] fix omp name collision

---
 omp/base/kernel_launch_reduction.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp
index 4f9e8267633..030da93c245 100644
--- a/omp/base/kernel_launch_reduction.hpp
+++ b/omp/base/kernel_launch_reduction.hpp
@@ -328,15 +328,15 @@ void run_kernel_col_reduction_sized_impl(
             const auto begin = row_block * rows_per_thread;
             const auto end = std::min(begin + rows_per_thread, rows);
             const auto base_col = col_block * block_size;
-            const auto identity = [](auto i) { return i; };
+            const auto identity_fn = [](auto i) { return i; };
             if (base_col + block_size <= cols) {
                 run_kernel_col_reduction_sized_block_impl(
-                    fn, op, identity, identity,
+                    fn, op, identity_fn, identity,
                     partial.get_data() + cols * row_block, begin, end,
                     base_col, args...);
             } else {
                 run_kernel_col_reduction_sized_block_impl(
-                    fn, op, identity, identity,
+                    fn, op, identity_fn, identity,
                     partial.get_data() + cols * row_block, begin, end,
                     base_col, args...);
             }
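A note on the collision fixed in PATCH 25: after PATCH 21 renamed the reduction's neutral-element parameter from init to identity, the local pass-through lambda const auto identity = [](auto i) { return i; }; shadowed that parameter inside run_kernel_col_reduction_sized_impl, so the argument list fn, op, identity, identity passed the lambda both as the finalize functor and as the ValueType value. A minimal standalone sketch of the failure mode and the fix follows; the names reduce_stub and caller are hypothetical and are not part of the Ginkgo sources.

    #include <iostream>

    template <typename FinalizeOp, typename ValueType>
    ValueType reduce_stub(FinalizeOp finalize, ValueType identity)
    {
        // stand-in for the real reduction: just finalize the identity element
        return finalize(identity);
    }

    double caller(double identity)  // reduction identity, e.g. 0.0 for sums
    {
        // BAD: `const auto identity = [](auto i) { return i; };` would shadow
        // the parameter, so `reduce_stub(identity, identity)` would deduce
        // ValueType as the lambda type and fail to compile.
        const auto identity_fn = [](auto i) { return i; };  // PATCH 25's rename
        return reduce_stub(identity_fn, identity);
    }

    int main()
    {
        std::cout << caller(0.0) << '\n';  // prints 0
    }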
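For background on the cfg values that PATCH 22 threads through the selection macros: KCFG_1D = ConfigSet<11, 7> packs a work-group size and a sub-group size into a single std::uint32_t, and get_first_cfg picks the first entry of kcfg_1d_list_simple_reduction that validate accepts for the device's queue. Below is a rough sketch of that packing and selection, assuming the <11, 7> parameters are the bit widths of the two fields with the work-group size in the high bits; the real ConfigSet and get_first_cfg are more general, so this is illustrative only.

    #include <array>
    #include <cstdint>
    #include <stdexcept>

    // Assumed layout: an 11-bit work-group-size field above a 7-bit
    // sub-group-size field, mirroring KCFG_1D::encode/decode usage above.
    struct kcfg_1d_sketch {
        static constexpr std::uint32_t encode(std::uint32_t wg, std::uint32_t sg)
        {
            return (wg << 7) | sg;
        }
        template <int field>
        static constexpr std::uint32_t decode(std::uint32_t cfg)
        {
            return field == 0 ? (cfg >> 7) & 0x7ffu  // work-group size
                              : cfg & 0x7fu;         // sub-group size
        }
    };

    static_assert(kcfg_1d_sketch::decode<0>(kcfg_1d_sketch::encode(512, 64)) ==
                      512,
                  "work-group size survives a round trip");
    static_assert(kcfg_1d_sketch::decode<1>(kcfg_1d_sketch::encode(512, 64)) ==
                      64,
                  "sub-group size survives a round trip");

    // Sketch of the selection idiom: return the first encoded config whose
    // decoded sizes the device accepts, like the validate(queue, wg, sg)
    // calls in the patch.
    template <typename Validator>
    std::uint32_t first_valid_cfg(const std::array<std::uint32_t, 6>& cfgs,
                                  Validator validate)
    {
        for (auto cfg : cfgs) {
            if (validate(kcfg_1d_sketch::decode<0>(cfg),
                         kcfg_1d_sketch::decode<1>(cfg))) {
                return cfg;
            }
        }
        throw std::runtime_error("no valid configuration found");
    }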
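PATCH 23's local-memory change replaces a one-element 1D local accessor, which needed a sycl::range<1>{1} constructor argument and a [0] dereference in the kernel, with a 0-dimensional accessor that wraps exactly one object and is reached through get_pointer(). A reduced sketch of the same pattern outside Ginkgo follows; the plain_array wrapper stands in for UninitializedArray, and wg_size and the USM-accessible result pointer are illustrative assumptions.

    #include <sycl/sycl.hpp>

    constexpr int wg_size = 64;

    template <typename T, int size>
    struct plain_array {  // minimal stand-in for gko::UninitializedArray
        T& operator[](int i) { return data[i]; }
        T data[size];
    };

    void local_partial_sums(sycl::queue& queue, float* result)
    {
        queue.submit([&](sycl::handler& cgh) {
            // 0-dimensional local accessor: exactly one object, no range needed
            sycl::accessor<plain_array<float, wg_size>, 0,
                           sycl::access_mode::read_write,
                           sycl::access::target::local>
                partial_acc(cgh);
            cgh.parallel_for(
                sycl::nd_range<1>{wg_size, wg_size}, [=](sycl::nd_item<1> id) {
                    // get_pointer() yields a pointer to the single wrapped
                    // object, matching the &(*acc.get_pointer())[0] idiom
                    // introduced by the patch
                    auto partial = &(*partial_acc.get_pointer())[0];
                    const auto i = static_cast<int>(id.get_local_linear_id());
                    partial[i] = static_cast<float>(i);
                    id.barrier(sycl::access::fence_space::local_space);
                    if (i == 0) {
                        float sum = 0.0f;
                        for (int j = 0; j < wg_size; j++) {
                            sum += partial[j];
                        }
                        *result = sum;  // result must be USM device-accessible
                    }
                });
        });
        queue.wait();
    }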