Simple kernel reduction #833

Merged · 25 commits · Oct 13, 2021
4 changes: 2 additions & 2 deletions .github/bot-pr-format-base.sh
@@ -3,8 +3,8 @@
 source .github/bot-pr-base.sh
 
 EXTENSION_REGEX='\.(cuh?|hpp|hpp\.inc?|cpp)$'
-FORMAT_HEADER_REGEX='^(benchmark|core|cuda|hip|include/ginkgo/core|omp|reference|dpcpp)/'
-FORMAT_REGEX='^(common|examples|test)/'
+FORMAT_HEADER_REGEX='^(benchmark|core|cuda|hip|include/ginkgo/core|omp|reference|dpcpp|common/unified)/'
+FORMAT_REGEX='^(common/cuda_hip|examples|test)/'
 
 echo "Retrieving PR file list"
 PR_FILES=$(bot_get_all_changed_files ${PR_URL})
7 changes: 4 additions & 3 deletions common/cuda_hip/components/reduction.hpp.inc
@@ -208,14 +208,15 @@ __device__ void reduce_array(size_type size,
  *
  * Computes a reduction using the add operation (+) on an array
  * `source` of any size. Has to be called a second time on `result` to reduce
- * an array larger than `default_block_size`.
+ * an array larger than `default_reduce_block_size`.
  */
 template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void reduce_add_array(
+__global__ __launch_bounds__(default_reduce_block_size) void reduce_add_array(
     size_type size, const ValueType* __restrict__ source,
     ValueType* __restrict__ result)
 {
-    __shared__ UninitializedArray<ValueType, default_block_size> block_sum;
+    __shared__ UninitializedArray<ValueType, default_reduce_block_size>
+        block_sum;
     reduce_array(size, source, static_cast<ValueType*>(block_sum),
                  [](const ValueType& x, const ValueType& y) { return x + y; });
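The doc comment above describes the intended two-pass usage: the first launch leaves one partial sum per block in its output, and a second launch over those partials produces the final value. A minimal host-side sketch of that calling pattern, assuming the first grid is capped at `default_reduce_block_size` blocks so the second pass fits in a single block; the driver name and the cap are illustrative, not Ginkgo's exact API:

```cpp
// Illustrative two-pass driver; everything except reduce_add_array and
// default_reduce_block_size is an assumption. Relies on gko::size_type and
// gko::ceildiv from the surrounding codebase, plus std::min from <algorithm>.
template <typename ValueType>
void reduce_add_array_twice(size_type size, const ValueType* source,
                            ValueType* tmp, ValueType* result)
{
    // Cap the grid so that the partial sums fit into one block in pass 2.
    auto num_blocks = std::min<size_type>(
        ceildiv(size, default_reduce_block_size), default_reduce_block_size);
    // Pass 1: each block writes one partial sum into tmp[block_id].
    reduce_add_array<<<num_blocks, default_reduce_block_size>>>(size, source,
                                                                tmp);
    // Pass 2: a single block reduces the num_blocks partials into result[0].
    reduce_add_array<<<1, default_reduce_block_size>>>(num_blocks, tmp, result);
}
```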
15 changes: 7 additions & 8 deletions common/unified/base/kernel_launch.hpp
@@ -170,14 +170,13 @@ namespace GKO_DEVICE_NAMESPACE {
 template <typename ValueType>
 struct matrix_accessor {
     ValueType* data;
-    size_type stride;
+    int64 stride;
 
     /**
      * @internal
      * Returns a reference to the element at position (row, col).
      */
-    GKO_INLINE GKO_ATTRIBUTES ValueType& operator()(size_type row,
-                                                    size_type col)
+    GKO_INLINE GKO_ATTRIBUTES ValueType& operator()(int64 row, int64 col)
     {
         return data[row * stride + col];
     }
@@ -187,7 +186,7 @@ struct matrix_accessor {
      * Returns a reference to the element at position idx in the underlying
      * storage.
      */
-    GKO_INLINE GKO_ATTRIBUTES ValueType& operator[](size_type idx)
+    GKO_INLINE GKO_ATTRIBUTES ValueType& operator[](int64 idx)
     {
         return data[idx];
     }
@@ -223,7 +222,8 @@ struct to_device_type_impl<matrix::Dense<ValueType>*&> {
     using type = matrix_accessor<device_type<ValueType>>;
     static type map_to_device(matrix::Dense<ValueType>* mtx)
     {
-        return {as_device_type(mtx->get_values()), mtx->get_stride()};
+        return {as_device_type(mtx->get_values()),
+                static_cast<int64>(mtx->get_stride())};
     }
 };

@@ -232,7 +232,8 @@ struct to_device_type_impl<const matrix::Dense<ValueType>*&> {
     using type = matrix_accessor<const device_type<ValueType>>;
     static type map_to_device(const matrix::Dense<ValueType>* mtx)
     {
-        return {as_device_type(mtx->get_const_values()), mtx->get_stride()};
+        return {as_device_type(mtx->get_const_values()),
+                static_cast<int64>(mtx->get_stride())};
     }
 };

@@ -267,8 +268,6 @@ typename to_device_type_impl<T>::type map_to_device(T&& param)
 }  // namespace gko
 
 
-// these files include this file again to make inclusion work from both sides,
-// this does not lead to issues due to the header guards.
 #if defined(GKO_COMPILING_CUDA)
 #include "cuda/base/kernel_launch.cuh"
 #elif defined(GKO_COMPILING_HIP)
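Both `to_device_type_impl` specializations above now cast `Dense::get_stride()`, which still returns the unsigned `size_type`, down to the signed `int64` that `matrix_accessor` stores. A standalone sketch of what the accessor computes, with simplified stand-in types (illustrative only, not the exact Ginkgo declarations):

```cpp
#include <cstdint>

// Simplified stand-in for the matrix_accessor above, for illustration only.
template <typename ValueType>
struct accessor_sketch {
    ValueType* data;
    std::int64_t stride;  // distance in elements between consecutive rows

    // Element (row, col) of a row-major matrix whose rows are padded.
    ValueType& operator()(std::int64_t row, std::int64_t col)
    {
        return data[row * stride + col];
    }
};

int main()
{
    // A 2x2 matrix stored with stride 3: rows begin at offsets 0 and 3.
    double buf[] = {1.0, 2.0, -1.0, 3.0, 4.0, -1.0};
    accessor_sketch<double> acc{buf, 3};
    return acc(1, 1) == 4.0 ? 0 : 1;  // reads buf[1 * 3 + 1]
}
```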
51 changes: 51 additions & 0 deletions common/unified/base/kernel_launch_reduction.hpp
@@ -0,0 +1,51 @@
/*******************************<GINKGO LICENSE>******************************
Copyright (c) 2017-2021, the Ginkgo authors
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
******************************<GINKGO LICENSE>*******************************/

#ifndef GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_
#define GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_


#include "common/unified/base/kernel_launch.hpp"


#if defined(GKO_COMPILING_CUDA)
#include "cuda/base/kernel_launch_reduction.cuh"
#elif defined(GKO_COMPILING_HIP)
#include "hip/base/kernel_launch_reduction.hip.hpp"
#elif defined(GKO_COMPILING_DPCPP)
#include "dpcpp/base/kernel_launch_reduction.dp.hpp"
#elif defined(GKO_COMPILING_OMP)
#include "omp/base/kernel_launch_reduction.hpp"
#endif


#endif // GKO_COMMON_UNIFIED_BASE_KERNEL_LAUNCH_REDUCTION_HPP_
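This header is a compile-time dispatch point in the same style as `kernel_launch.hpp`: a unified kernel file includes it once and picks up the implementation matching whichever `GKO_COMPILING_*` macro is defined. A hedged sketch of a consumer, assuming a 1D `run_kernel_reduction` overload analogous to the `run_kernel_col_reduction` calls in `dense_kernels.cpp` below; the file and kernel names are made up for illustration:

```cpp
// Hypothetical consumer, e.g. a common/unified/.../sum_kernels.cpp.
#include "common/unified/base/kernel_launch_reduction.hpp"


namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {


// Sums an array into result[0] on whichever backend this file is built for.
template <typename ValueType>
void sum_array(std::shared_ptr<const DefaultExecutor> exec,
               const ValueType* data, size_type size, ValueType* result)
{
    run_kernel_reduction(
        exec, [] GKO_KERNEL(auto i, auto data) { return data[i]; },
        [] GKO_KERNEL(auto a, auto b) { return a + b; },
        [] GKO_KERNEL(auto a) { return a; }, ValueType{}, result, size, data);
}


}  // namespace GKO_DEVICE_NAMESPACE
}  // namespace kernels
}  // namespace gko
```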
6 changes: 3 additions & 3 deletions common/unified/base/kernel_launch_solver.hpp
@@ -63,7 +63,7 @@ struct default_stride_dense_wrapper {
 template <typename T>
 struct device_unpack_solver_impl {
     using type = T;
-    static GKO_INLINE GKO_ATTRIBUTES type unpack(T param, size_type)
+    static GKO_INLINE GKO_ATTRIBUTES type unpack(T param, int64)
     {
         return param;
     }
@@ -72,8 +72,8 @@ struct device_unpack_solver_impl {
 template <typename ValueType>
 struct device_unpack_solver_impl<default_stride_dense_wrapper<ValueType>> {
     using type = matrix_accessor<ValueType>;
-    static GKO_INLINE GKO_ATTRIBUTES type unpack(
-        default_stride_dense_wrapper<ValueType> param, size_type default_stride)
+    static GKO_INLINE GKO_ATTRIBUTES type
+    unpack(default_stride_dense_wrapper<ValueType> param, int64 default_stride)
     {
         return {param.data, default_stride};
     }
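`default_stride_dense_wrapper` carries a Dense operand whose stride equals the solver-wide default, so the stride is dropped on the host and re-attached, now as an `int64`, when the kernel unpacks its arguments. A rough illustration of that round trip with simplified stand-in types (not the exact Ginkgo declarations):

```cpp
#include <cstdint>

using int64 = std::int64_t;

// Host side: the stride is omitted because it equals the common default.
template <typename ValueType>
struct wrapper_sketch {
    ValueType* data;
};

// Device side: the accessor needs the stride back to index (row, col).
template <typename ValueType>
struct accessor_sketch {
    ValueType* data;
    int64 stride;
};

// unpack re-injects the shared default stride when building the accessor.
template <typename ValueType>
accessor_sketch<ValueType> unpack_sketch(wrapper_sketch<ValueType> param,
                                         int64 default_stride)
{
    return {param.data, default_stride};
}
```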
55 changes: 55 additions & 0 deletions common/unified/matrix/dense_kernels.cpp
@@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "common/unified/base/kernel_launch.hpp"
+#include "common/unified/base/kernel_launch_reduction.hpp"
 
 
 namespace gko {
@@ -220,6 +221,60 @@ void sub_scaled_diag(std::shared_ptr<const DefaultExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
 
 
+template <typename ValueType>
+void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
+                 const matrix::Dense<ValueType>* x,
+                 const matrix::Dense<ValueType>* y,
+                 matrix::Dense<ValueType>* result)
+{
+    run_kernel_col_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto x, auto y) {
+            return x(i, j) * y(i, j);
+        },
+        [] GKO_KERNEL(auto a, auto b) { return a + b; },
+        [] GKO_KERNEL(auto a) { return a; }, ValueType{}, result->get_values(),
+        x->get_size(), x, y);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
+
+
+template <typename ValueType>
+void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
+                      const matrix::Dense<ValueType>* x,
+                      const matrix::Dense<ValueType>* y,
+                      matrix::Dense<ValueType>* result)
+{
+    run_kernel_col_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto x, auto y) {
+            return conj(x(i, j)) * y(i, j);
+        },
+        [] GKO_KERNEL(auto a, auto b) { return a + b; },
+        [] GKO_KERNEL(auto a) { return a; }, ValueType{}, result->get_values(),
+        x->get_size(), x, y);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
+
+
+template <typename ValueType>
+void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
+                   const matrix::Dense<ValueType>* x,
+                   matrix::Dense<remove_complex<ValueType>>* result)
+{
+    run_kernel_col_reduction(
+        exec,
+        [] GKO_KERNEL(auto i, auto j, auto x) { return squared_norm(x(i, j)); },
+        [] GKO_KERNEL(auto a, auto b) { return a + b; },
+        [] GKO_KERNEL(auto a) { return sqrt(a); }, remove_complex<ValueType>{},
+        result->get_values(), x->get_size(), x);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
+
+
 template <typename ValueType, typename IndexType>
 void symm_permute(std::shared_ptr<const DefaultExecutor> exec,
                   const Array<IndexType>* permutation_indices,
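All three kernels share one shape: an entry-wise operation, an associative combine, and a finalize step applied once per column, so `result` receives one value per right-hand side. A sequential reference sketch of the semantics that `run_kernel_col_reduction` provides (illustrative only; the real backends reduce in parallel):

```cpp
#include <cstdint>

// result[col] = finalize(reduce over all rows of op(row, col)).
template <typename Op, typename Reduce, typename Finalize, typename ValueType>
void col_reduction_reference(Op op, Reduce reduce, Finalize finalize,
                             ValueType identity, ValueType* result,
                             std::int64_t num_rows, std::int64_t num_cols)
{
    for (std::int64_t col = 0; col < num_cols; ++col) {
        auto partial = identity;  // e.g. ValueType{} for the sums above
        for (std::int64_t row = 0; row < num_rows; ++row) {
            partial = reduce(partial, op(row, col));
        }
        result[col] = finalize(partial);  // e.g. sqrt for compute_norm2
    }
}
```

For `compute_norm2` this evaluates to sqrt(sum_i |x(i, j)|^2) for each column j, i.e. one Euclidean norm per column.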
15 changes: 8 additions & 7 deletions cuda/base/kernel_launch.cuh
@@ -51,9 +51,9 @@ constexpr int default_block_size = 512;
 
 template <typename KernelFunction, typename... KernelArgs>
 __global__ __launch_bounds__(default_block_size) void generic_kernel_1d(
-    size_type size, KernelFunction fn, KernelArgs... args)
+    int64 size, KernelFunction fn, KernelArgs... args)
 {
-    auto tidx = thread::get_thread_id_flat();
+    auto tidx = thread::get_thread_id_flat<int64>();
     if (tidx >= size) {
         return;
     }
@@ -63,9 +63,9 @@ __global__ __launch_bounds__(default_block_size) void generic_kernel_1d(
 
 template <typename KernelFunction, typename... KernelArgs>
 __global__ __launch_bounds__(default_block_size) void generic_kernel_2d(
-    size_type rows, size_type cols, KernelFunction fn, KernelArgs... args)
+    int64 rows, int64 cols, KernelFunction fn, KernelArgs... args)
 {
-    auto tidx = thread::get_thread_id_flat();
+    auto tidx = thread::get_thread_id_flat<int64>();
     auto col = tidx % cols;
     auto row = tidx / cols;
     if (row >= rows) {
@@ -82,7 +82,7 @@ void run_kernel(std::shared_ptr<const CudaExecutor> exec, KernelFunction fn,
     gko::cuda::device_guard guard{exec->get_device_id()};
     constexpr auto block_size = default_block_size;
     auto num_blocks = ceildiv(size, block_size);
-    generic_kernel_1d<<<num_blocks, block_size>>>(size, fn,
+    generic_kernel_1d<<<num_blocks, block_size>>>(static_cast<int64>(size), fn,
                                                   map_to_device(args)...);
 }

@@ -93,8 +93,9 @@ void run_kernel(std::shared_ptr<const CudaExecutor> exec, KernelFunction fn,
     gko::cuda::device_guard guard{exec->get_device_id()};
     constexpr auto block_size = default_block_size;
     auto num_blocks = ceildiv(size[0] * size[1], block_size);
-    generic_kernel_2d<<<num_blocks, block_size>>>(size[0], size[1], fn,
-                                                  map_to_device(args)...);
+    generic_kernel_2d<<<num_blocks, block_size>>>(static_cast<int64>(size[0]),
+                                                  static_cast<int64>(size[1]),
+                                                  fn, map_to_device(args)...);
 }


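Both overloads round the grid size up with `ceildiv`, so the last block generally contains surplus threads; the `tidx >= size` and `row >= rows` guards in the kernels above make those threads return immediately, while the `static_cast<int64>` calls keep the index arithmetic signed end to end. A small standalone sketch of the launch math (illustrative):

```cpp
#include <cstdint>

// Same semantics as gko::ceildiv: the smallest q with q * den >= num.
std::int64_t ceildiv_sketch(std::int64_t num, std::int64_t den)
{
    return (num + den - 1) / den;
}

int main()
{
    const std::int64_t size = 1000;       // total work items, e.g. rows * cols
    const std::int64_t block_size = 512;  // default_block_size in this file
    auto num_blocks = ceildiv_sketch(size, block_size);  // 2 blocks = 1024 threads
    // Threads 1000..1023 fail the tidx < size guard and exit immediately.
    return num_blocks == 2 ? 0 : 1;
}
```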