From a1316602547b4bc999d3e30e5297aafc181b87b0 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Tue, 20 Jul 2021 11:33:27 +0200 Subject: [PATCH] improve document, fix auto usage in for loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Thomas Grützmacher --- common/components/prefix_sum.hpp.inc | 2 +- common/components/sorting.hpp.inc | 12 +- common/components/uninitialized_array.hpp.inc | 7 +- .../par_ilut_filter_kernels.hpp.inc | 4 +- .../par_ilut_select_kernels.hpp.inc | 327 +++++++++--------- common/matrix/dense_kernels.hpp.inc | 2 +- common/matrix/ell_kernels.hpp.inc | 4 +- common/matrix/hybrid_kernels.hpp.inc | 2 +- cuda/components/prefix_sum.cu | 6 +- cuda/test/components/sorting_kernels.cu | 2 +- cuda/test/matrix/dense_kernels.cpp | 2 +- cuda/test/matrix/ell_kernels.cpp | 2 +- dpcpp/CMakeLists.txt | 6 +- dpcpp/base/helper.dp.cpp | 20 +- dpcpp/base/helper.hpp | 71 ++-- dpcpp/base/onemkl_bindings.hpp | 1 + dpcpp/components/prefix_sum.dp.cpp | 6 +- dpcpp/components/prefix_sum.dp.hpp | 2 +- dpcpp/components/thread_ids.dp.hpp | 2 +- dpcpp/components/uninitialized_array.hpp | 6 +- dpcpp/matrix/dense_kernels.dp.cpp | 2 +- dpcpp/test/matrix/dense_kernels.cpp | 2 +- hip/components/prefix_sum.hip.cpp | 6 +- .../ginkgo/core/synthesizer/containers.hpp | 18 +- 24 files changed, 275 insertions(+), 239 deletions(-) diff --git a/common/components/prefix_sum.hpp.inc b/common/components/prefix_sum.hpp.inc index 8f759b1dc95..1d57c20b2e5 100644 --- a/common/components/prefix_sum.hpp.inc +++ b/common/components/prefix_sum.hpp.inc @@ -57,7 +57,7 @@ __forceinline__ __device__ void subwarp_prefix_sum(ValueType element, total_sum = element; #pragma unroll // hypercube prefix sum - for (auto step = 1; step < subwarp.size(); step *= 2) { + for (int step = 1; step < subwarp.size(); step *= 2) { auto neighbor = subwarp.shfl_xor(total_sum, step); total_sum += neighbor; prefix_sum += bool(subwarp.thread_rank() & step) ? 
neighbor : 0; diff --git a/common/components/sorting.hpp.inc b/common/components/sorting.hpp.inc index ef5bd690937..cd772e08adb 100644 --- a/common/components/sorting.hpp.inc +++ b/common/components/sorting.hpp.inc @@ -70,7 +70,7 @@ struct bitonic_local { bool reverse) { auto els_mid = els + (num_elements / 2); - for (auto i = 0; i < num_elements / 2; ++i) { + for (int i = 0; i < num_elements / 2; ++i) { bitonic_cas(els[i], els_mid[i], reverse); } half::merge(els, reverse); @@ -131,7 +131,7 @@ struct bitonic_warp { auto tile = group::tiled_partition(group::this_thread_block()); auto new_reverse = reverse != upper_half(); - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { auto other = tile.shfl_xor(els[i], num_threads / 2); bitonic_cas(els[i], other, new_reverse); } @@ -206,7 +206,7 @@ struct bitonic_global { auto upper_shared_els = shared_els + (num_groups * num_threads / 2); // only the lower group executes the CAS if (!upper_half()) { - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { auto j = shared_idx(i); bitonic_cas(shared_els[j], upper_shared_els[j], reverse); } @@ -241,11 +241,11 @@ struct bitonic_global { bool reverse) { group::this_thread_block().sync(); - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { local_els[i] = shared_els[shared_idx(i)]; } warp::merge(local_els, reverse); - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { shared_els[shared_idx(i)] = local_els[i]; } } @@ -258,7 +258,7 @@ struct bitonic_global { // This is the first step, so we don't need to load from shared memory warp::sort(local_els, reverse); // store the sorted elements in shared memory - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { shared_els[shared_idx(i)] = local_els[i]; } } diff --git a/common/components/uninitialized_array.hpp.inc b/common/components/uninitialized_array.hpp.inc index 3a8b3796c12..e951cf06860 100644 --- a/common/components/uninitialized_array.hpp.inc +++ b/common/components/uninitialized_array.hpp.inc @@ -34,7 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /** * Stores an array with uninitialized contents. * - * This class needed for datatypes that do have a non-empty constructor when` + * This class is needed for datatypes that do have a non-empty constructor when * using them as shared memory, for example `thrust::complex`. * * @tparam ValueType the type of values @@ -49,7 +49,7 @@ public: * * @return the constexpr pointer to the first entry of the array. */ - constexpr GKO_ATTRIBUTES operator ValueType *() const noexcept + constexpr GKO_ATTRIBUTES operator const ValueType *() const noexcept { return &(*this)[0]; } @@ -70,7 +70,8 @@ public: * * @return a reference to the array entry at the given index. 
*/ - constexpr GKO_ATTRIBUTES ValueType &operator[](size_type pos) const noexcept + constexpr GKO_ATTRIBUTES const ValueType &operator[](size_type pos) const + noexcept { return reinterpret_cast<const ValueType *>(data_)[pos]; } diff --git a/common/factorization/par_ilut_filter_kernels.hpp.inc b/common/factorization/par_ilut_filter_kernels.hpp.inc index 25b43e789ee..b5f7d43db67 100644 --- a/common/factorization/par_ilut_filter_kernels.hpp.inc +++ b/common/factorization/par_ilut_filter_kernels.hpp.inc @@ -55,7 +55,7 @@ __device__ void abstract_filter_impl(const IndexType *row_ptrs, auto end = row_ptrs[row + 1]; begin_cb(row); auto num_steps = ceildiv(end - begin, subwarp_size); - for (auto step = 0; step < num_steps; ++step) { + for (IndexType step = 0; step < num_steps; ++step) { auto idx = begin + lane + step * subwarp_size; auto keep = idx < end && pred(idx, begin, end); auto mask = subwarp.ballot(keep); @@ -189,4 +189,4 @@ __global__ __launch_bounds__(default_block_size) void bucket_filter( } -} // namespace kernel \ No newline at end of file +} // namespace kernel diff --git a/common/factorization/par_ilut_select_kernels.hpp.inc b/common/factorization/par_ilut_select_kernels.hpp.inc index 059069faf41..518fd0c1c15 100644 --- a/common/factorization/par_ilut_select_kernels.hpp.inc +++ b/common/factorization/par_ilut_select_kernels.hpp.inc @@ -62,23 +62,27 @@ __global__ __launch_bounds__(searchtree_width) void build_searchtree( // assuming rounding towards zero auto stride = double(size) / sample_size; #pragma unroll - for (auto i = 0; i < sampleselect_oversampling; ++i) { - auto lidx = idx * sampleselect_oversampling + i; - auto val = input[static_cast<IndexType>(lidx * stride)]; - samples[i] = abs(val); - } - __shared__ AbsType sh_samples[sample_size]; - bitonic_sort<sample_size, sampleselect_oversampling>(samples, sh_samples); - if (idx > 0) { - // root has level 0 - auto level = sampleselect_searchtree_height - ffs(threadIdx.x); - // we get the in-level index by removing trailing 10000... - auto idx_in_level = threadIdx.x >> ffs(threadIdx.x); - // we get the global index by adding previous levels - auto previous_levels = (1 << level) - 1; - tree_output[idx_in_level + previous_levels] = samples[0]; - } - tree_output[threadIdx.x + searchtree_inner_size] = samples[0]; + for (int i = 0; i < sampleselect_oversampling; ++i) { + auto lidx = idx * sampleselect_oversampling + i; + auto val = input[static_cast<IndexType>(lidx * stride)]; + samples[i] = abs(val); + } + __shared__ AbsType sh_samples[sample_size]; + bitonic_sort<sample_size, sampleselect_oversampling>(samples, + sh_samples); + if (idx > 0) { + // root has level 0 + auto level = sampleselect_searchtree_height - ffs(threadIdx.x); + // we get the in-level index by removing trailing 10000...
+ auto idx_in_level = threadIdx.x >> ffs(threadIdx.x); + // we get the global index by adding previous levels + auto previous_levels = (1 << level) - 1; + tree_output[idx_in_level + previous_levels] = samples[0]; + } + tree_output[threadIdx.x + searchtree_inner_size] = samples[0]; } @@ -96,45 +100,48 @@ __global__ __launch_bounds__(default_block_size) void count_buckets( const remove_complex<ValueType> *__restrict__ tree, IndexType *counter, unsigned char *oracles, int items_per_thread) { - // load tree into shared memory, initialize counters - __shared__ remove_complex<ValueType> sh_tree[searchtree_inner_size]; - __shared__ IndexType sh_counter[searchtree_width]; - if (threadIdx.x < searchtree_inner_size) { - sh_tree[threadIdx.x] = tree[threadIdx.x]; - } - if (threadIdx.x < searchtree_width) { - sh_counter[threadIdx.x] = 0; - } - group::this_thread_block().sync(); - - // work distribution: each thread block gets a consecutive index range - auto begin = threadIdx.x + default_block_size * - static_cast<IndexType>(blockIdx.x) * - items_per_thread; - auto block_end = default_block_size * - static_cast<IndexType>(blockIdx.x + 1) * items_per_thread; - auto end = min(block_end, size); - for (IndexType i = begin; i < end; i += default_block_size) { - // traverse the search tree with the input element - auto el = abs(input[i]); - IndexType tree_idx{}; + // load tree into shared memory, initialize counters + __shared__ remove_complex<ValueType> sh_tree[searchtree_inner_size]; + __shared__ IndexType sh_counter[searchtree_width]; + if (threadIdx.x < searchtree_inner_size) { + sh_tree[threadIdx.x] = tree[threadIdx.x]; + } + if (threadIdx.x < searchtree_width) { + sh_counter[threadIdx.x] = 0; + } + group::this_thread_block().sync(); + + // work distribution: each thread block gets a consecutive index range + auto begin = threadIdx.x + default_block_size * + static_cast<IndexType>(blockIdx.x) * + items_per_thread; + auto block_end = default_block_size * + static_cast<IndexType>(blockIdx.x + 1) * + items_per_thread; + auto end = min(block_end, size); + for (IndexType i = begin; i < end; i += default_block_size) { + // traverse the search tree with the input element + auto el = abs(input[i]); + IndexType tree_idx{}; #pragma unroll - for (auto level = 0; level < sampleselect_searchtree_height; ++level) { - auto cmp = !(el < sh_tree[tree_idx]); - tree_idx = 2 * tree_idx + 1 + cmp; + for (int level = 0; level < sampleselect_searchtree_height; ++level) { + auto cmp = !(el < sh_tree[tree_idx]); + tree_idx = 2 * tree_idx + 1 + cmp; + } + // increment the bucket counter and store the bucket index + uint32 bucket = tree_idx - searchtree_inner_size; + // post-condition: sample[bucket] <= el < sample[bucket + 1] + atomic_add(sh_counter + bucket, 1); + oracles[i] = bucket; + } + group::this_thread_block().sync(); + + // write back the block-wide counts to global memory + if (threadIdx.x < searchtree_width) { + counter[blockIdx.x + threadIdx.x * gridDim.x] = + sh_counter[threadIdx.x]; } - // increment the bucket counter and store the bucket index - uint32 bucket = tree_idx - searchtree_inner_size; - // post-condition: sample[bucket] <= el < sample[bucket + 1] - atomic_add(sh_counter + bucket, 1); - oracles[i] = bucket; - } - group::this_thread_block().sync(); - - // write back the block-wide counts to global memory - if (threadIdx.x < searchtree_width) { - counter[blockIdx.x + threadIdx.x * gridDim.x] = sh_counter[threadIdx.x]; - } } @@ -151,69 +158,72 @@ __global__ __launch_bounds__(default_block_size) void block_prefix_sum( IndexType *__restrict__ counters,
IndexType *__restrict__ totals, IndexType num_blocks) { - constexpr auto num_warps = default_block_size / config::warp_size; - static_assert(num_warps < config::warp_size, - "block size needs to be smaller"); - __shared__ IndexType warp_sums[num_warps]; - - auto block = group::this_thread_block(); - auto warp = group::tiled_partition<config::warp_size>(block); - - auto bucket = blockIdx.x; - auto local_counters = counters + num_blocks * bucket; - auto work_per_warp = ceildiv(num_blocks, warp.size()); - auto warp_idx = threadIdx.x / warp.size(); - auto warp_lane = warp.thread_rank(); - - // compute prefix sum over warp-sized blocks - IndexType total{}; - auto base_idx = warp_idx * work_per_warp * warp.size(); - for (auto step = 0; step < work_per_warp; ++step) { - auto idx = warp_lane + step * warp.size() + base_idx; - auto val = idx < num_blocks ? local_counters[idx] : zero<IndexType>(); - IndexType warp_total{}; - IndexType warp_prefix{}; - // compute inclusive prefix sum - subwarp_prefix_sum(val, warp_prefix, warp_total, warp); - - if (idx < num_blocks) { - local_counters[idx] = warp_prefix + total; - } - total += warp_total; - } - - // store total sum - if (warp_lane == 0) { - warp_sums[warp_idx] = total; - } - - // compute prefix sum over all warps in a single warp - block.sync(); - if (warp_idx == 0) { - auto in_bounds = warp_lane < num_warps; - auto val = in_bounds ? warp_sums[warp_lane] : zero<IndexType>(); - IndexType prefix_sum{}; - IndexType total_sum{}; - // compute inclusive prefix sum - subwarp_prefix_sum(val, prefix_sum, total_sum, warp); - if (in_bounds) { - warp_sums[warp_lane] = prefix_sum; + constexpr auto num_warps = default_block_size / config::warp_size; + static_assert(num_warps < config::warp_size, + "block size needs to be smaller"); + __shared__ IndexType warp_sums[num_warps]; + + auto block = group::this_thread_block(); + auto warp = group::tiled_partition<config::warp_size>(block); + + auto bucket = blockIdx.x; + auto local_counters = counters + num_blocks * bucket; + auto work_per_warp = ceildiv(num_blocks, warp.size()); + auto warp_idx = threadIdx.x / warp.size(); + auto warp_lane = warp.thread_rank(); + + // compute prefix sum over warp-sized blocks + IndexType total{}; + auto base_idx = warp_idx * work_per_warp * warp.size(); + for (IndexType step = 0; step < work_per_warp; ++step) { + auto idx = warp_lane + step * warp.size() + base_idx; + auto val = + idx < num_blocks ? local_counters[idx] : zero<IndexType>(); + IndexType warp_total{}; + IndexType warp_prefix{}; + // compute inclusive prefix sum + subwarp_prefix_sum(val, warp_prefix, warp_total, warp); + + if (idx < num_blocks) { + local_counters[idx] = warp_prefix + total; + } + total += warp_total; } + + // store total sum if (warp_lane == 0) { - totals[bucket] = total_sum; + warp_sums[warp_idx] = total; } - } - - // add block prefix sum to each warp's block of data - block.sync(); - auto warp_prefixsum = warp_sums[warp_idx]; - for (auto step = 0; step < work_per_warp; ++step) { - auto idx = warp_lane + step * warp.size() + base_idx; - auto val = idx < num_blocks ? local_counters[idx] : zero<IndexType>(); - if (idx < num_blocks) { - local_counters[idx] += warp_prefixsum; + + // compute prefix sum over all warps in a single warp + block.sync(); + if (warp_idx == 0) { + auto in_bounds = warp_lane < num_warps; + auto val = in_bounds ?
warp_sums[warp_lane] : zero<IndexType>(); + IndexType prefix_sum{}; + IndexType total_sum{}; + // compute inclusive prefix sum + subwarp_prefix_sum(val, prefix_sum, total_sum, warp); + if (in_bounds) { + warp_sums[warp_lane] = prefix_sum; + } + if (warp_lane == 0) { + totals[bucket] = total_sum; + } + } + + // add block prefix sum to each warp's block of data + block.sync(); + auto warp_prefixsum = warp_sums[warp_idx]; + for (IndexType step = 0; step < work_per_warp; ++step) { + auto idx = warp_lane + step * warp.size() + base_idx; + auto val = + idx < num_blocks ? local_counters[idx] : zero<IndexType>(); + if (idx < num_blocks) { + local_counters[idx] += warp_prefixsum; + } } - } } @@ -229,28 +239,29 @@ __global__ __launch_bounds__(default_block_size) void filter_bucket( const unsigned char *oracles, const IndexType *block_offsets, remove_complex<ValueType> *__restrict__ output, int items_per_thread) { - // initialize the counter with the block prefix sum. - __shared__ IndexType counter; - if (threadIdx.x == 0) { - counter = block_offsets[blockIdx.x + bucket * gridDim.x]; - } - group::this_thread_block().sync(); - - // same work-distribution as in count_buckets - auto begin = threadIdx.x + default_block_size * - static_cast<IndexType>(blockIdx.x) * - items_per_thread; - auto block_end = default_block_size * - static_cast<IndexType>(blockIdx.x + 1) * items_per_thread; - auto end = min(block_end, size); - for (IndexType i = begin; i < end; i += default_block_size) { - // only copy the element when it belongs to the target bucket - auto found = bucket == oracles[i]; - auto ofs = atomic_add(&counter, found); - if (found) { - output[ofs] = abs(input[i]); + // initialize the counter with the block prefix sum. + __shared__ IndexType counter; + if (threadIdx.x == 0) { + counter = block_offsets[blockIdx.x + bucket * gridDim.x]; + } + group::this_thread_block().sync(); + + // same work-distribution as in count_buckets + auto begin = threadIdx.x + default_block_size * + static_cast<IndexType>(blockIdx.x) * + items_per_thread; + auto block_end = default_block_size * + static_cast<IndexType>(blockIdx.x + 1) * + items_per_thread; + auto end = min(block_end, size); + for (IndexType i = begin; i < end; i += default_block_size) { + // only copy the element when it belongs to the target bucket + auto found = bucket == oracles[i]; + auto ofs = atomic_add(&counter, found); + if (found) { + output[ofs] = abs(input[i]); + } } - } } @@ -264,17 +275,17 @@ __global__ __launch_bounds__(basecase_block_size) void basecase_select( const ValueType *__restrict__ input, IndexType size, IndexType rank, ValueType *__restrict__ out) { - constexpr auto sentinel = device_numeric_limits<ValueType>::inf; - ValueType local[basecase_local_size]; - __shared__ ValueType sh_local[basecase_size]; - for (int i = 0; i < basecase_local_size; ++i) { - auto idx = threadIdx.x + i * basecase_block_size; - local[i] = idx < size ? input[idx] : sentinel; - } - bitonic_sort<basecase_size, basecase_local_size>(local, sh_local); - if (threadIdx.x == rank / basecase_local_size) { - *out = local[rank % basecase_local_size]; - } + constexpr auto sentinel = device_numeric_limits<ValueType>::inf; + ValueType local[basecase_local_size]; + __shared__ ValueType sh_local[basecase_size]; + for (int i = 0; i < basecase_local_size; ++i) { + auto idx = threadIdx.x + i * basecase_block_size; + local[i] = idx < size ?
input[idx] : sentinel; + } + bitonic_sort<basecase_size, basecase_local_size>(local, sh_local); + if (threadIdx.x == rank / basecase_local_size) { + *out = local[rank % basecase_local_size]; + } } @@ -289,19 +300,19 @@ template <typename IndexType> __global__ __launch_bounds__(config::warp_size) void find_bucket( IndexType *prefix_sum, IndexType rank) { - auto warp = - group::tiled_partition<config::warp_size>(group::this_thread_block()); - auto idx = group_wide_search(0, searchtree_width, warp, [&](int i) { - return prefix_sum[i + 1] > rank; - }); - if (warp.thread_rank() == 0) { - auto base = prefix_sum[idx]; - auto size = prefix_sum[idx + 1] - base; - // don't overwrite anything before having loaded everything! - prefix_sum[0] = idx; - prefix_sum[1] = base; - prefix_sum[2] = size; - } + auto warp = group::tiled_partition<config::warp_size>( + group::this_thread_block()); + auto idx = group_wide_search(0, searchtree_width, warp, [&](int i) { + return prefix_sum[i + 1] > rank; + }); + if (warp.thread_rank() == 0) { + auto base = prefix_sum[idx]; + auto size = prefix_sum[idx + 1] - base; + // don't overwrite anything before having loaded everything! + prefix_sum[0] = idx; + prefix_sum[1] = base; + prefix_sum[2] = size; + } } diff --git a/common/matrix/dense_kernels.hpp.inc b/common/matrix/dense_kernels.hpp.inc index d46b202a8ff..c7ebafd0627 100644 --- a/common/matrix/dense_kernels.hpp.inc +++ b/common/matrix/dense_kernels.hpp.inc @@ -211,7 +211,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr( if (tidx < num_rows) { auto write_to = row_ptrs[tidx]; - for (auto i = 0; i < num_cols; i++) { + for (size_type i = 0; i < num_cols; i++) { if (source[stride * tidx + i] != zero<ValueType>()) { values[write_to] = source[stride * tidx + i]; col_idxs[write_to] = i; diff --git a/common/matrix/ell_kernels.hpp.inc b/common/matrix/ell_kernels.hpp.inc index 2323d512258..399dd5070ac 100644 --- a/common/matrix/ell_kernels.hpp.inc +++ b/common/matrix/ell_kernels.hpp.inc @@ -179,7 +179,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_dense( { const auto tidx = thread::get_thread_id_flat(); if (tidx < num_rows) { - for (auto col = 0; col < nnz; col++) { + for (size_type col = 0; col < nnz; col++) { result[tidx * result_stride + col_idxs[tidx + col * source_stride]] += values[tidx + col * source_stride]; @@ -226,7 +226,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr( if (tidx < num_rows) { auto write_to = result_row_ptrs[tidx]; - for (auto i = 0; i < max_nnz_per_row; i++) { + for (size_type i = 0; i < max_nnz_per_row; i++) { const auto source_idx = tidx + stride * i; if (source_values[source_idx] != zero<ValueType>()) { result_values[write_to] = source_values[source_idx]; diff --git a/common/matrix/hybrid_kernels.hpp.inc b/common/matrix/hybrid_kernels.hpp.inc index b6af7c2be36..c7c192189e0 100644 --- a/common/matrix/hybrid_kernels.hpp.inc +++ b/common/matrix/hybrid_kernels.hpp.inc @@ -108,7 +108,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr( if (tidx < num_rows) { auto write_to = result_row_ptrs[tidx]; - for (auto i = 0; i < max_nnz_per_row; i++) { + for (size_type i = 0; i < max_nnz_per_row; i++) { const auto source_idx = tidx + stride * i; if (ell_val[source_idx] != zero<ValueType>()) { result_values[write_to] = ell_val[source_idx]; diff --git a/cuda/components/prefix_sum.cu b/cuda/components/prefix_sum.cu index 54739c783c8..ce108fa8cf9 100644 --- a/cuda/components/prefix_sum.cu +++ b/cuda/components/prefix_sum.cu @@ -49,7 +49,7 @@ template <typename IndexType> void prefix_sum(std::shared_ptr<const CudaExecutor> exec, IndexType *counts, size_type num_entries) { - // prefix_sum should be on
the valid array + // prefix_sum should only be performed on a valid array if (num_entries > 0) { auto num_blocks = ceildiv(num_entries, prefix_sum_block_size); Array<IndexType> block_sum_array(exec, num_blocks - 1); @@ -57,8 +57,8 @@ void prefix_sum(std::shared_ptr<const CudaExecutor> exec, IndexType *counts, start_prefix_sum<prefix_sum_block_size> <<<num_blocks, prefix_sum_block_size>>>(num_entries, counts, block_sums); - // add the total sum of the previous block only when the number of block - // is larger than 1. + // add the total sum of the previous block only when the number of + // blocks is larger than 1. if (num_blocks > 1) { finalize_prefix_sum<prefix_sum_block_size> <<<num_blocks, prefix_sum_block_size>>>(num_entries, counts, diff --git a/cuda/test/components/sorting_kernels.cu b/cuda/test/components/sorting_kernels.cu index e973cc0f650..9659ecbc7a8 100644 --- a/cuda/test/components/sorting_kernels.cu +++ b/cuda/test/components/sorting_kernels.cu @@ -99,7 +99,7 @@ protected: { // we want some duplicate elements std::uniform_int_distribution dist(0, num_elements / 2); - for (auto i = 0; i < num_elements; ++i) { + for (int i = 0; i < num_elements; ++i) { ref_shared.get_data()[i] = dist(rng); } ddata = gko::Array{cuda, ref_shared}; diff --git a/cuda/test/matrix/dense_kernels.cpp b/cuda/test/matrix/dense_kernels.cpp index 6e40ce5b5a3..de96d27d823 100644 --- a/cuda/test/matrix/dense_kernels.cpp +++ b/cuda/test/matrix/dense_kernels.cpp @@ -550,7 +550,7 @@ TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef) &dnnz_per_row); auto tmp = gko::Array<gko::size_type>(ref, dnnz_per_row); - for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) { + for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) { ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]); } } diff --git a/cuda/test/matrix/ell_kernels.cpp b/cuda/test/matrix/ell_kernels.cpp index 2df1c397f4c..51c12fab531 100644 --- a/cuda/test/matrix/ell_kernels.cpp +++ b/cuda/test/matrix/ell_kernels.cpp @@ -585,7 +585,7 @@ TEST_F(Ell, CalculateNNZPerRowIsEquivalentToRef) &dnnz_per_row); auto tmp = gko::Array<gko::size_type>(ref, dnnz_per_row); - for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) { + for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) { ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]); } } diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 443d180b172..7729588d363 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -60,13 +60,15 @@ target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP) set(GINKGO_DPCPP_FLAGS ${GINKGO_DPCPP_FLAGS} PARENT_SCOPE) target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}") -# Note. add MKL via PRIVATE not PUBLIC (MKL example shows) to avoid find_package(MKL) everywhere when link ginkgo +# Note: add MKL as PRIVATE not PUBLIC to avoid propagating +# find_package(MKL) everywhere when linking ginkgo (see the MKL example +# https://software.intel.com/content/www/us/en/develop/documentation/onemkl-windows-developer-guide/top/getting-started/cmake-config-for-onemkl.html) target_compile_options(ginkgo_dpcpp PRIVATE $) target_compile_features(ginkgo_dpcpp PRIVATE cxx_std_17) target_include_directories(ginkgo_dpcpp PRIVATE $) target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-lib=all) # When building ginkgo as a static library, we need to use dpcpp and per_kernel # link option when the program uses a dpcpp related function.
if (BUILD_SHARED_LIBS) target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel) else () diff --git a/dpcpp/base/helper.dp.cpp b/dpcpp/base/helper.dp.cpp index ae453dd937d..5e6c1a579f5 100644 --- a/dpcpp/base/helper.dp.cpp +++ b/dpcpp/base/helper.dp.cpp @@ -44,18 +44,16 @@ namespace dpcpp { bool validate(sycl::queue *queue, unsigned int workgroup_size, unsigned int subgroup_size) { - { - auto device = queue->get_device(); - auto subgroup_size_list = - device.get_info<sycl::info::device::sub_group_sizes>(); - auto max_workgroup_size = - device.get_info<sycl::info::device::max_work_group_size>(); - bool allowed = false; - for (auto &i : subgroup_size_list) { - allowed |= (i == subgroup_size); - } - return allowed && (workgroup_size <= max_workgroup_size); + auto device = queue->get_device(); + auto subgroup_size_list = + device.get_info<sycl::info::device::sub_group_sizes>(); + auto max_workgroup_size = + device.get_info<sycl::info::device::max_work_group_size>(); + bool allowed = false; + for (auto &i : subgroup_size_list) { + allowed |= (i == subgroup_size); } + return allowed && (workgroup_size <= max_workgroup_size); } diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp index 8c7f45e5174..16d91c2ef8d 100644 --- a/dpcpp/base/helper.hpp +++ b/dpcpp/base/helper.hpp @@ -51,44 +51,44 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /** * GKO_ENABLE_DEFAULT_HOST gives a default host implementation for those * kernels which require encoded config but do not need explicit template - * parameter and share memory + * parameter and shared memory * * @param name_ the name of the host function with config * @param kernel_ the kernel name */ -#define GKO_ENABLE_DEFAULT_HOST(name_, kernel_) \ - template <typename... InferredArgs> \ - void name_(dim3 grid, dim3 block, size_t dynamic_shared_memory, \ - sycl::queue *queue, InferredArgs... args) \ - { \ - queue->submit([&](sycl::handler &cgh) { \ - cgh.parallel_for(sycl_nd_range(grid, block), \ - [=](sycl::nd_item<3> item_ct1) { \ - kernel_(args..., item_ct1); \ - }); \ - }); \ +#define GKO_ENABLE_DEFAULT_HOST(name_, kernel_) \ + template <typename... InferredArgs> \ + void name_(dim3 grid, dim3 block, size_t, sycl::queue *queue, \ + InferredArgs... args) \ + { \ + queue->submit([&](sycl::handler &cgh) { \ + cgh.parallel_for(sycl_nd_range(grid, block), \ + [=](sycl::nd_item<3> item_ct1) { \ + kernel_(args..., item_ct1); \ + }); \ + }); \ } /** * GKO_ENABLE_DEFAULT_HOST_CONFIG gives a default host implementation for those * kernels which require encoded config but do not need explicit template - * parameter and share memory + * parameter and shared memory * * @param name_ the name of the host function with config * @param kernel_ the kernel name */ -#define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_) \ - template <typename... InferredArgs> \ - inline void name_(dim3 grid, dim3 block, size_t dynamic_shared_memory, \ - sycl::queue *queue, InferredArgs... args) \ - { \ - queue->submit([&](sycl::handler &cgh) { \ - cgh.parallel_for(sycl_nd_range(grid, block), \ - [=](sycl::nd_item<3> item_ct1) { \ - kernel_(args..., item_ct1); \ - }); \ - }); \ +#define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_) \ + template <typename... InferredArgs> \ + inline void name_(dim3 grid, dim3 block, size_t, sycl::queue *queue, \ + InferredArgs... args) \ + { \ + queue->submit([&](sycl::handler &cgh) { \ + cgh.parallel_for(sycl_nd_range(grid, block), \ + [=](sycl::nd_item<3> item_ct1) { \ + kernel_(args..., item_ct1); \ + }); \ + }); \ } /** @@ -138,10 +138,33 @@ namespace kernels { namespace dpcpp { +/** + * Checks whether a given kernel configuration is valid on the queue's device: + * the workgroup size must not exceed the device's maximum workgroup size, and + * the subgroup size must be among the device's supported subgroup sizes.
+ * + * @param queue the sycl queue pointer + * @param workgroup_size the workgroup size (block size in cuda sense) + * @param subgroup_size the subgroup size (warp size in cuda sense) + * + * @return whether the given arguments are valid for the given queue. + */ bool validate(sycl::queue *queue, unsigned workgroup_size, unsigned subgroup_size); +/** + * get_first_cfg returns the first config from the given config array that + * passes the validate function. + * + * @tparam IterArr the iterable array type + * @tparam Validate the validate function type + * + * @param arr the config array + * @param verify the validate function + * + * @return the first valid config + */ template <typename IterArr, typename Validate> std::uint32_t get_first_cfg(IterArr &arr, Validate verify) { diff --git a/dpcpp/base/onemkl_bindings.hpp b/dpcpp/base/onemkl_bindings.hpp index 6456a048d23..d2a13f9eab7 100644 --- a/dpcpp/base/onemkl_bindings.hpp +++ b/dpcpp/base/onemkl_bindings.hpp @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include namespace gko { diff --git a/dpcpp/components/prefix_sum.dp.cpp b/dpcpp/components/prefix_sum.dp.cpp index 07cdb5b38aa..63f33e9ba35 100644 --- a/dpcpp/components/prefix_sum.dp.cpp +++ b/dpcpp/components/prefix_sum.dp.cpp @@ -70,7 +70,7 @@ template <typename IndexType> void prefix_sum(std::shared_ptr<const DpcppExecutor> exec, IndexType *counts, size_type num_entries) { - // prefix_sum should be on the valid array + // prefix_sum should only be performed on a valid array if (num_entries > 0) { auto queue = exec->get_queue(); constexpr auto block_cfg_array = as_array(block_cfg_list); @@ -84,8 +84,8 @@ void prefix_sum(std::shared_ptr<const DpcppExecutor> exec, IndexType *counts, auto block_sums = block_sum_array.get_data(); start_prefix_sum_call(cfg, num_blocks, wg_size, 0, exec->get_queue(), num_entries, counts, block_sums); - // add the total sum of the previous block only when the number of block - // is larger than 1. + // add the total sum of the previous block only when the number of + // blocks is larger than 1. if (num_blocks > 1) { finalize_prefix_sum_call(cfg, num_blocks, wg_size, 0, exec->get_queue(), num_entries, counts, diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp index 22e6139dd84..cb407118aed 100644 --- a/dpcpp/components/prefix_sum.dp.hpp +++ b/dpcpp/components/prefix_sum.dp.hpp @@ -78,7 +78,7 @@ __dpct_inline__ void subwarp_prefix_sum(ValueType element, total_sum = element; #pragma unroll // hypercube prefix sum - for (auto step = 1; step < subwarp.size(); step *= 2) { + for (int step = 1; step < subwarp.size(); step *= 2) { auto neighbor = subwarp.shfl_xor(total_sum, step); total_sum += neighbor; prefix_sum += bool(subwarp.thread_rank() & step) ?
neighbor : 0; diff --git a/dpcpp/components/thread_ids.dp.hpp b/dpcpp/components/thread_ids.dp.hpp index 47abf3c7b72..f9decfd989d 100644 --- a/dpcpp/components/thread_ids.dp.hpp +++ b/dpcpp/components/thread_ids.dp.hpp @@ -195,7 +195,7 @@ __dpct_inline__ size_type get_warp_id(sycl::nd_item<3> item_ct1) template __dpct_inline__ size_type get_subwarp_id(sycl::nd_item<3> item_ct1) { - // dpcpp dose not have subwarp + // dpcpp does not have subwarp constexpr auto subwarps_per_warp = subwarp_size / subwarp_size; return get_warp_id(item_ct1) * subwarps_per_warp + item_ct1.get_local_id(1); diff --git a/dpcpp/components/uninitialized_array.hpp b/dpcpp/components/uninitialized_array.hpp index b10457df217..d9d423c9c94 100644 --- a/dpcpp/components/uninitialized_array.hpp +++ b/dpcpp/components/uninitialized_array.hpp @@ -48,7 +48,7 @@ namespace dpcpp { /** * Stores an array with uninitialized contents. * - * This class needed for datatypes that do have a non-empty constructor when` + * This class is needed for datatypes that do have a non-empty constructor when * using them as shared memory, for example `thrust::complex`. * * @tparam ValueType the type of values @@ -63,7 +63,7 @@ class UninitializedArray { * * @return the constexpr pointer to the first entry of the array. */ - constexpr __dpct_inline__ operator ValueType *() const noexcept + constexpr __dpct_inline__ operator const ValueType *() const noexcept { return &(*this)[0]; } @@ -84,7 +84,7 @@ class UninitializedArray { * * @return a reference to the array entry at the given index. */ - constexpr __dpct_inline__ ValueType &operator[](size_type pos) const + constexpr __dpct_inline__ const ValueType &operator[](size_type pos) const noexcept { return data_[pos]; diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 32eef01af63..3d7af2024fc 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -433,7 +433,7 @@ void fill_in_csr(size_type num_rows, size_type num_cols, size_type stride, if (tidx < num_rows) { auto write_to = row_ptrs[tidx]; - for (auto i = 0; i < num_cols; i++) { + for (size_type i = 0; i < num_cols; i++) { if (source[stride * tidx + i] != zero()) { values[write_to] = source[stride * tidx + i]; col_idxs[write_to] = i; diff --git a/dpcpp/test/matrix/dense_kernels.cpp b/dpcpp/test/matrix/dense_kernels.cpp index 43ce9bad547..257ee6fbc6a 100644 --- a/dpcpp/test/matrix/dense_kernels.cpp +++ b/dpcpp/test/matrix/dense_kernels.cpp @@ -697,7 +697,7 @@ TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef) &dnnz_per_row); auto tmp = gko::Array(ref, dnnz_per_row); - for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) { + for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) { ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]); } } diff --git a/hip/components/prefix_sum.hip.cpp b/hip/components/prefix_sum.hip.cpp index 28cd01b4fb5..9302fc07b9a 100644 --- a/hip/components/prefix_sum.hip.cpp +++ b/hip/components/prefix_sum.hip.cpp @@ -49,7 +49,7 @@ template void prefix_sum(std::shared_ptr exec, IndexType *counts, size_type num_entries) { - // prefix_sum should be on the valid array + // prefix_sum should only be performed on a valid array if (num_entries > 0) { auto num_blocks = ceildiv(num_entries, prefix_sum_block_size); Array block_sum_array(exec, num_blocks - 1); @@ -58,8 +58,8 @@ void prefix_sum(std::shared_ptr exec, IndexType *counts, HIP_KERNEL_NAME(start_prefix_sum), dim3(num_blocks), dim3(prefix_sum_block_size), 0, 0, num_entries, counts, 
block_sums); - // add the total sum of the previous block only when the number of block - // is larger than 1. + // add the total sum of the previous block only when the number of + // blocks is larger than 1. if (num_blocks > 1) { hipLaunchKernelGGL( HIP_KERNEL_NAME(finalize_prefix_sum), diff --git a/include/ginkgo/core/synthesizer/containers.hpp b/include/ginkgo/core/synthesizer/containers.hpp index 10e8c1031a1..0e9570540fa 100644 --- a/include/ginkgo/core/synthesizer/containers.hpp +++ b/include/ginkgo/core/synthesizer/containers.hpp @@ -51,7 +51,7 @@ namespace syn { * value_list records several values with the same type in template. * * @tparam T the value type of the list - * @tparam T... the values in the list + * @tparam Values the values in the list */ template <typename T, T... Values> struct value_list {}; @@ -60,7 +60,7 @@ struct value_list {}; /** * type_list records several types in template * - * @tparam ...Types the types in the list + * @tparam Types the types in the list */ template <typename... Types> struct type_list {}; @@ -69,9 +69,9 @@ struct type_list {}; /** * range records start, end, step in template * - * @tparam int start of range - * @tparam int end of range - * @tparam int step of range. default is 1 + * @tparam Start start of range + * @tparam End end of range + * @tparam Step step of range. default is 1 */ template <int Start, int End, int Step = 1> struct range {}; @@ -93,8 +93,8 @@ struct concatenate_impl; * concatenate_impl specializes for two value_list with the same value type. * * @tparam T the value type of two value_list - * @tparam T... the values of the first list - * @tparam T... the values of the second list + * @tparam Values1 the values of the first list + * @tparam Values2 the values of the second list */ template <typename T, T... Values1, T... Values2> struct concatenate_impl<value_list<T, Values1...>, value_list<T, Values2...>> { @@ -130,7 +130,7 @@ struct as_list_impl; * as_list_impl specializes for the value_list * * @tparam T the value_list type - * @tparam T... the values of value_list + * @tparam Values the values of value_list */ template <typename T, T... Values> struct as_list_impl<value_list<T, Values...>> { @@ -193,7 +193,7 @@ using as_list = typename detail::as_list_impl<T>::type; * for in runtime on the array. * * @tparam T the type of value_list - * @tparam T... the values of value_list + * @tparam Value the values of value_list * * @param value_list the input value_list *
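
A note on the recurring loop-index changes in this patch: in `for (auto i = 0; ...)`, `auto` deduces `int` from the literal `0`, so when the loop bound has a 64-bit (and often unsigned) type such as `gko::size_type`, the comparison mixes signedness and the index can overflow before ever reaching the bound. The following standalone sketch (not part of the patch; `size_type` here is a stand-in for `gko::size_type`) illustrates the pitfall and the fix:

    #include <cstdint>
    #include <iostream>

    using size_type = std::uint64_t;  // stand-in for gko::size_type

    int main()
    {
        // a bound that does not fit into a 32-bit int
        const size_type num_entries = (size_type{1} << 32) + 3;

        // Broken pattern: `auto i = 0` deduces `int`, so incrementing `i`
        // past INT_MAX is undefined behavior, and `i < num_entries`
        // compares a signed 32-bit index against an unsigned 64-bit bound.
        // for (auto i = 0; i < num_entries; ++i) { ... }

        // Fixed pattern, as in this patch: the index uses the bound's own
        // type (striding in large steps so the example finishes quickly).
        size_type iterations = 0;
        for (size_type i = 0; i < num_entries; i += size_type{1} << 30) {
            ++iterations;
        }
        std::cout << "performed " << iterations << " iterations\n";
    }

This is also why the loops above spell out `int` or `IndexType` rather than taking `decltype` of a `constexpr` bound: `decltype` applied to a `constexpr` variable yields a `const`-qualified type, which cannot serve as a mutable loop index.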