improve document, fix auto usage in for, shared_memory usage

Co-authored-by: Thomas Grützmacher <thomas.gruetzmacher@kit.edu>
ginkgo-project · Jul 20, 2021 · 6169c2f · 6169c2f
1 parent 7ac6d09
commit 6169c2f
Show file tree

Hide file tree

Showing 26 changed files with 131 additions and 116 deletions.
diff --git a/common/components/prefix_sum.hpp.inc b/common/components/prefix_sum.hpp.inc
@@ -57,7 +57,7 @@ __forceinline__ __device__ void subwarp_prefix_sum(ValueType element,
     total_sum = element;
 #pragma unroll
     // hypercube prefix sum
-    for (auto step = 1; step < subwarp.size(); step *= 2) {
+    for (int step = 1; step < subwarp.size(); step *= 2) {
         auto neighbor = subwarp.shfl_xor(total_sum, step);
         total_sum += neighbor;
         prefix_sum += bool(subwarp.thread_rank() & step) ? neighbor : 0;

diff --git a/common/components/sorting.hpp.inc b/common/components/sorting.hpp.inc
@@ -70,7 +70,7 @@ struct bitonic_local {
                                                           bool reverse)
     {
         auto els_mid = els + (num_elements / 2);
-        for (auto i = 0; i < num_elements / 2; ++i) {
+        for (int i = 0; i < num_elements / 2; ++i) {
             bitonic_cas(els[i], els_mid[i], reverse);
         }
         half::merge(els, reverse);
@@ -131,7 +131,7 @@ struct bitonic_warp {
         auto tile =
             group::tiled_partition<num_threads>(group::this_thread_block());
         auto new_reverse = reverse != upper_half();
-        for (auto i = 0; i < num_local; ++i) {
+        for (int i = 0; i < num_local; ++i) {
             auto other = tile.shfl_xor(els[i], num_threads / 2);
             bitonic_cas(els[i], other, new_reverse);
         }
@@ -206,7 +206,7 @@ struct bitonic_global {
         auto upper_shared_els = shared_els + (num_groups * num_threads / 2);
         // only the lower group executes the CAS
         if (!upper_half()) {
-            for (auto i = 0; i < num_local; ++i) {
+            for (int i = 0; i < num_local; ++i) {
                 auto j = shared_idx(i);
                 bitonic_cas(shared_els[j], upper_shared_els[j], reverse);
             }
@@ -241,11 +241,11 @@ struct bitonic_global<ValueType, num_local, num_threads, 1, num_total_threads> {
                                                  bool reverse)
     {
         group::this_thread_block().sync();
-        for (auto i = 0; i < num_local; ++i) {
+        for (int i = 0; i < num_local; ++i) {
             local_els[i] = shared_els[shared_idx(i)];
         }
         warp::merge(local_els, reverse);
-        for (auto i = 0; i < num_local; ++i) {
+        for (int i = 0; i < num_local; ++i) {
             shared_els[shared_idx(i)] = local_els[i];
         }
     }
@@ -258,7 +258,7 @@ struct bitonic_global<ValueType, num_local, num_threads, 1, num_total_threads> {
         // This is the first step, so we don't need to load from shared memory
         warp::sort(local_els, reverse);
         // store the sorted elements in shared memory
-        for (auto i = 0; i < num_local; ++i) {
+        for (int i = 0; i < num_local; ++i) {
             shared_els[shared_idx(i)] = local_els[i];
         }
     }

diff --git a/common/components/uninitialized_array.hpp.inc b/common/components/uninitialized_array.hpp.inc
@@ -34,7 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /**
  * Stores an array with uninitialized contents.
  *
- * This class needed for datatypes that do have a non-empty constructor when`
+ * This class is needed for datatypes that do have a non-empty constructor when
  * using them as shared memory, for example `thrust::complex<float>`.
  *
  * @tparam ValueType the type of values
@@ -49,7 +49,7 @@ public:
      *
      * @return the constexpr pointer to the first entry of the array.
      */
-    constexpr GKO_ATTRIBUTES operator ValueType *() const noexcept
+    constexpr GKO_ATTRIBUTES operator const ValueType *() const noexcept
     {
         return &(*this)[0];
     }
@@ -70,7 +70,8 @@ public:
      *
      * @return a reference to the array entry at the given index.
      */
-    constexpr GKO_ATTRIBUTES ValueType &operator[](size_type pos) const noexcept
+    constexpr GKO_ATTRIBUTES const ValueType &operator[](size_type pos) const
+        noexcept
     {
         return reinterpret_cast<const ValueType *>(data_)[pos];
     }

diff --git a/common/factorization/par_ilut_filter_kernels.hpp.inc b/common/factorization/par_ilut_filter_kernels.hpp.inc
@@ -55,7 +55,7 @@ __device__ void abstract_filter_impl(const IndexType *row_ptrs,
     auto end = row_ptrs[row + 1];
     begin_cb(row);
     auto num_steps = ceildiv(end - begin, subwarp_size);
-    for (auto step = 0; step < num_steps; ++step) {
+    for (IndexType step = 0; step < num_steps; ++step) {
         auto idx = begin + lane + step * subwarp_size;
         auto keep = idx < end && pred(idx, begin, end);
         auto mask = subwarp.ballot(keep);
@@ -189,4 +189,4 @@ __global__ __launch_bounds__(default_block_size) void bucket_filter(
 }
 
 
-}  // namespace kernel
+}  // namespace kernel
diff --git a/common/factorization/par_ilut_select_kernels.hpp.inc b/common/factorization/par_ilut_select_kernels.hpp.inc
@@ -62,7 +62,8 @@ __global__ __launch_bounds__(searchtree_width) void build_searchtree(
     // assuming rounding towards zero
     auto stride = double(size) / sample_size;
 #pragma unroll
-    for (auto i = 0; i < sampleselect_oversampling; ++i) {
+    for (auto i = decltype(sampleselect_oversampling){0};
+         i < sampleselect_oversampling; ++i) {
         auto lidx = idx * sampleselect_oversampling + i;
         auto val = input[static_cast<IndexType>(lidx * stride)];
         samples[i] = abs(val);
@@ -119,7 +120,8 @@ __global__ __launch_bounds__(default_block_size) void count_buckets(
         auto el = abs(input[i]);
         IndexType tree_idx{};
 #pragma unroll
-        for (auto level = 0; level < sampleselect_searchtree_height; ++level) {
+        for (auto level = decltype(sampleselect_searchtree_height){0};
+             level < sampleselect_searchtree_height; ++level) {
             auto cmp = !(el < sh_tree[tree_idx]);
             tree_idx = 2 * tree_idx + 1 + cmp;
         }
@@ -168,7 +170,7 @@ __global__ __launch_bounds__(default_block_size) void block_prefix_sum(
     // compute prefix sum over warp-sized blocks
     IndexType total{};
     auto base_idx = warp_idx * work_per_warp * warp.size();
-    for (auto step = 0; step < work_per_warp; ++step) {
+    for (auto step = decltype(work_per_warp){0}; step < work_per_warp; ++step) {
         auto idx = warp_lane + step * warp.size() + base_idx;
         auto val = idx < num_blocks ? local_counters[idx] : zero<IndexType>();
         IndexType warp_total{};
@@ -207,7 +209,7 @@ __global__ __launch_bounds__(default_block_size) void block_prefix_sum(
     // add block prefix sum to each warp's block of data
     block.sync();
     auto warp_prefixsum = warp_sums[warp_idx];
-    for (auto step = 0; step < work_per_warp; ++step) {
+    for (IndexType step = 0; step < work_per_warp; ++step) {
         auto idx = warp_lane + step * warp.size() + base_idx;
         auto val = idx < num_blocks ? local_counters[idx] : zero<IndexType>();
         if (idx < num_blocks) {

diff --git a/common/matrix/dense_kernels.hpp.inc b/common/matrix/dense_kernels.hpp.inc
@@ -211,7 +211,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr(
 
     if (tidx < num_rows) {
         auto write_to = row_ptrs[tidx];
-        for (auto i = 0; i < num_cols; i++) {
+        for (size_type i = 0; i < num_cols; i++) {
             if (source[stride * tidx + i] != zero<ValueType>()) {
                 values[write_to] = source[stride * tidx + i];
                 col_idxs[write_to] = i;

diff --git a/common/matrix/ell_kernels.hpp.inc b/common/matrix/ell_kernels.hpp.inc
@@ -179,7 +179,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_dense(
 {
     const auto tidx = thread::get_thread_id_flat();
     if (tidx < num_rows) {
-        for (auto col = 0; col < nnz; col++) {
+        for (size_type col = 0; col < nnz; col++) {
             result[tidx * result_stride +
                    col_idxs[tidx + col * source_stride]] +=
                 values[tidx + col * source_stride];
@@ -226,7 +226,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr(
 
     if (tidx < num_rows) {
         auto write_to = result_row_ptrs[tidx];
-        for (auto i = 0; i < max_nnz_per_row; i++) {
+        for (size_type i = 0; i < max_nnz_per_row; i++) {
             const auto source_idx = tidx + stride * i;
             if (source_values[source_idx] != zero<ValueType>()) {
                 result_values[write_to] = source_values[source_idx];

diff --git a/common/matrix/hybrid_kernels.hpp.inc b/common/matrix/hybrid_kernels.hpp.inc
@@ -108,7 +108,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr(
 
     if (tidx < num_rows) {
         auto write_to = result_row_ptrs[tidx];
-        for (auto i = 0; i < max_nnz_per_row; i++) {
+        for (size_type i = 0; i < max_nnz_per_row; i++) {
             const auto source_idx = tidx + stride * i;
             if (ell_val[source_idx] != zero<ValueType>()) {
                 result_values[write_to] = ell_val[source_idx];

diff --git a/cuda/components/prefix_sum.cu b/cuda/components/prefix_sum.cu
@@ -49,16 +49,16 @@ template <typename IndexType>
 void prefix_sum(std::shared_ptr<const CudaExecutor> exec, IndexType *counts,
                 size_type num_entries)
 {
-    // prefix_sum should be on the valid array
+    // prefix_sum should only be performed on a valid array
     if (num_entries > 0) {
         auto num_blocks = ceildiv(num_entries, prefix_sum_block_size);
         Array<IndexType> block_sum_array(exec, num_blocks - 1);
         auto block_sums = block_sum_array.get_data();
         start_prefix_sum<prefix_sum_block_size>
             <<<num_blocks, prefix_sum_block_size>>>(num_entries, counts,
                                                     block_sums);
-        // add the total sum of the previous block only when the number of block
-        // is larger than 1.
+        // add the total sum of the previous block only when the number of
+        // blocks is larger than 1.
         if (num_blocks > 1) {
             finalize_prefix_sum<prefix_sum_block_size>
                 <<<num_blocks, prefix_sum_block_size>>>(num_entries, counts,

diff --git a/cuda/test/components/sorting_kernels.cu b/cuda/test/components/sorting_kernels.cu
@@ -99,7 +99,7 @@ protected:
     {
         // we want some duplicate elements
         std::uniform_int_distribution<gko::int32> dist(0, num_elements / 2);
-        for (auto i = 0; i < num_elements; ++i) {
+        for (auto i = decltype(num_elements){0}; i < num_elements; ++i) {
             ref_shared.get_data()[i] = dist(rng);
         }
         ddata = gko::Array<gko::int32>{cuda, ref_shared};

diff --git a/cuda/test/matrix/dense_kernels.cpp b/cuda/test/matrix/dense_kernels.cpp
@@ -550,7 +550,7 @@ TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef)
                                                           &dnnz_per_row);
 
     auto tmp = gko::Array<gko::size_type>(ref, dnnz_per_row);
-    for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) {
+    for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) {
         ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]);
     }
 }

diff --git a/cuda/test/matrix/ell_kernels.cpp b/cuda/test/matrix/ell_kernels.cpp
@@ -585,7 +585,7 @@ TEST_F(Ell, CalculateNNZPerRowIsEquivalentToRef)
                                                         &dnnz_per_row);
 
     auto tmp = gko::Array<gko::size_type>(ref, dnnz_per_row);
-    for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) {
+    for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) {
         ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]);
     }
 }

diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
@@ -60,13 +60,15 @@ target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP)
 
 set(GINKGO_DPCPP_FLAGS ${GINKGO_DPCPP_FLAGS} PARENT_SCOPE)
 target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}")
-# Note. add MKL via PRIVATE not PUBLIC (MKL example shows) to avoid find_package(MKL) everywhere when link ginkgo
+# Note: add MKL as PRIVATE not PUBLIC (MKL example shows) to avoid propagating
+# find_package(MKL) everywhere when linking ginkgo (see the MKL example
+# https://software.intel.com/content/www/us/en/develop/documentation/onemkl-windows-developer-guide/top/getting-started/cmake-config-for-onemkl.html)
 target_compile_options(ginkgo_dpcpp PRIVATE $<TARGET_PROPERTY:MKL::MKL_DPCPP,INTERFACE_COMPILE_OPTIONS>)
 target_compile_features(ginkgo_dpcpp PRIVATE cxx_std_17)
 target_include_directories(ginkgo_dpcpp PRIVATE $<TARGET_PROPERTY:MKL::MKL_DPCPP,INTERFACE_INCLUDE_DIRECTORIES>)
 target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-lib=all)
 # When building ginkgo as a static library, we need to use dpcpp and per_kernel
-# link option when the program uses dpcpp related function.
+# link option when the program uses a dpcpp related function.
 if (BUILD_SHARED_LIBS)
     target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel)
 else ()

diff --git a/dpcpp/base/config.hpp b/dpcpp/base/config.hpp
@@ -49,12 +49,6 @@ struct config {
      */
     using lane_mask_type = uint64;
 
-
-    /**
-     * The number of threads within a CUDA warp.
-     */
-    static constexpr uint32 warp_size = 16;
-
     /**
      * The bitmask of the entire warp.
      */

diff --git a/dpcpp/base/helper.dp.cpp b/dpcpp/base/helper.dp.cpp
@@ -44,18 +44,16 @@ namespace dpcpp {
 bool validate(sycl::queue *queue, unsigned int workgroup_size,
               unsigned int subgroup_size)
 {
-    {
-        auto device = queue->get_device();
-        auto subgroup_size_list =
-            device.get_info<cl::sycl::info::device::sub_group_sizes>();
-        auto max_workgroup_size =
-            device.get_info<sycl::info::device::max_work_group_size>();
-        bool allowed = false;
-        for (auto &i : subgroup_size_list) {
-            allowed |= (i == subgroup_size);
-        }
-        return allowed && (workgroup_size <= max_workgroup_size);
+    auto device = queue->get_device();
+    auto subgroup_size_list =
+        device.get_info<cl::sycl::info::device::sub_group_sizes>();
+    auto max_workgroup_size =
+        device.get_info<sycl::info::device::max_work_group_size>();
+    bool allowed = false;
+    for (auto &i : subgroup_size_list) {
+        allowed |= (i == subgroup_size);
     }
+    return allowed && (workgroup_size <= max_workgroup_size);
 }
 
 

diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp
@@ -51,44 +51,44 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /**
  * GKO_ENABLE_DEFAULT_HOST gives a default host implementation for those
  * kernels which require encoded config but do not need explicit template
- * parameter and share memory
+ * parameter and shared memory
  *
  * @param name_  the name of the host function with config
  * @param kernel_  the kernel name
  */
-#define GKO_ENABLE_DEFAULT_HOST(name_, kernel_)                     \
-    template <typename... InferredArgs>                             \
-    void name_(dim3 grid, dim3 block, size_t dynamic_shared_memory, \
-               sycl::queue *queue, InferredArgs... args)            \
-    {                                                               \
-        queue->submit([&](sycl::handler &cgh) {                     \
-            cgh.parallel_for(sycl_nd_range(grid, block),            \
-                             [=](sycl::nd_item<3> item_ct1) {       \
-                                 kernel_(args..., item_ct1);        \
-                             });                                    \
-        });                                                         \
+#define GKO_ENABLE_DEFAULT_HOST(name_, kernel_)                   \
+    template <typename... InferredArgs>                           \
+    void name_(dim3 grid, dim3 block, size_t, sycl::queue *queue, \
+               InferredArgs... args)                              \
+    {                                                             \
+        queue->submit([&](sycl::handler &cgh) {                   \
+            cgh.parallel_for(sycl_nd_range(grid, block),          \
+                             [=](sycl::nd_item<3> item_ct1) {     \
+                                 kernel_(args..., item_ct1);      \
+                             });                                  \
+        });                                                       \
     }
 
 
 /**
  * GKO_ENABLE_DEFAULT_HOST_CONFIG gives a default host implementation for those
  * kernels which require encoded config but do not need explicit template
- * parameter and share memory
+ * parameter and shared memory
  *
  * @param name_  the name of the host function with config
  * @param kernel_  the kernel name
  */
-#define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_)                     \
-    template <std::uint32_t encoded, typename... InferredArgs>             \
-    inline void name_(dim3 grid, dim3 block, size_t dynamic_shared_memory, \
-                      sycl::queue *queue, InferredArgs... args)            \
-    {                                                                      \
-        queue->submit([&](sycl::handler &cgh) {                            \
-            cgh.parallel_for(sycl_nd_range(grid, block),                   \
-                             [=](sycl::nd_item<3> item_ct1) {              \
-                                 kernel_<encoded>(args..., item_ct1);      \
-                             });                                           \
-        });                                                                \
+#define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_)                   \
+    template <std::uint32_t encoded, typename... InferredArgs>           \
+    inline void name_(dim3 grid, dim3 block, size_t, sycl::queue *queue, \
+                      InferredArgs... args)                              \
+    {                                                                    \
+        queue->submit([&](sycl::handler &cgh) {                          \
+            cgh.parallel_for(sycl_nd_range(grid, block),                 \
+                             [=](sycl::nd_item<3> item_ct1) {            \
+                                 kernel_<encoded>(args..., item_ct1);    \
+                             });                                         \
+        });                                                              \
     }
 
 /**
@@ -138,10 +138,33 @@ namespace kernels {
 namespace dpcpp {
 
 
+/**
+ * This is the validate function for common check. It checks the workgroup size
+ * is below device max workgroup size and subgroup size is in the supported
+ * subgroup size.
+ *
+ * @param queue  the sycl queue pointer
+ * @param workgroup_size  the workgroup size (block size in cuda sense)
+ * @param subgroup_size  the subgroup size (warp size in cuda sense)
+ *
+ * @return the given arguments are valid or not in given queue.
+ */
 bool validate(sycl::queue *queue, unsigned workgroup_size,
               unsigned subgroup_size);
 
 
+/**
+ * get_first_cfg will return the first valid config by validate function from
+ * given config array.
+ *
+ * @tparam IterArr  the iteratable array type
+ * @tparam Validate  the validate function type
+ *
+ * @param arr  the config array
+ * @param verify  the validate function
+ *
+ * @return the first valid config
+ */
 template <typename IterArr, typename Validate>
 std::uint32_t get_first_cfg(IterArr &arr, Validate verify)
 {

diff --git a/dpcpp/base/onemkl_bindings.hpp b/dpcpp/base/onemkl_bindings.hpp
@@ -34,6 +34,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GKO_DPCPP_BASE_ONEMKL_BINDINGS_HPP_
 
 
+#include <type_traits>
+
+
 #include <CL/sycl.hpp>
 #include <oneapi/mkl.hpp>