From a1316602547b4bc999d3e30e5297aafc181b87b0 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Tue, 20 Jul 2021 11:33:27 +0200 Subject: [PATCH] improve document, fix auto usage in for loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Thomas Grützmacher --- common/components/prefix_sum.hpp.inc | 2 +- common/components/sorting.hpp.inc | 12 +- common/components/uninitialized_array.hpp.inc | 7 +- .../par_ilut_filter_kernels.hpp.inc | 4 +- .../par_ilut_select_kernels.hpp.inc | 327 +++++++++--------- common/matrix/dense_kernels.hpp.inc | 2 +- common/matrix/ell_kernels.hpp.inc | 4 +- common/matrix/hybrid_kernels.hpp.inc | 2 +- cuda/components/prefix_sum.cu | 6 +- cuda/test/components/sorting_kernels.cu | 2 +- cuda/test/matrix/dense_kernels.cpp | 2 +- cuda/test/matrix/ell_kernels.cpp | 2 +- dpcpp/CMakeLists.txt | 6 +- dpcpp/base/helper.dp.cpp | 20 +- dpcpp/base/helper.hpp | 71 ++-- dpcpp/base/onemkl_bindings.hpp | 1 + dpcpp/components/prefix_sum.dp.cpp | 6 +- dpcpp/components/prefix_sum.dp.hpp | 2 +- dpcpp/components/thread_ids.dp.hpp | 2 +- dpcpp/components/uninitialized_array.hpp | 6 +- dpcpp/matrix/dense_kernels.dp.cpp | 2 +- dpcpp/test/matrix/dense_kernels.cpp | 2 +- hip/components/prefix_sum.hip.cpp | 6 +- .../ginkgo/core/synthesizer/containers.hpp | 18 +- 24 files changed, 275 insertions(+), 239 deletions(-) diff --git a/common/components/prefix_sum.hpp.inc b/common/components/prefix_sum.hpp.inc index 8f759b1dc95..1d57c20b2e5 100644 --- a/common/components/prefix_sum.hpp.inc +++ b/common/components/prefix_sum.hpp.inc @@ -57,7 +57,7 @@ __forceinline__ __device__ void subwarp_prefix_sum(ValueType element, total_sum = element; #pragma unroll // hypercube prefix sum - for (auto step = 1; step < subwarp.size(); step *= 2) { + for (int step = 1; step < subwarp.size(); step *= 2) { auto neighbor = subwarp.shfl_xor(total_sum, step); total_sum += neighbor; prefix_sum += bool(subwarp.thread_rank() & step) ? 
neighbor : 0; diff --git a/common/components/sorting.hpp.inc b/common/components/sorting.hpp.inc index ef5bd690937..cd772e08adb 100644 --- a/common/components/sorting.hpp.inc +++ b/common/components/sorting.hpp.inc @@ -70,7 +70,7 @@ struct bitonic_local { bool reverse) { auto els_mid = els + (num_elements / 2); - for (auto i = 0; i < num_elements / 2; ++i) { + for (int i = 0; i < num_elements / 2; ++i) { bitonic_cas(els[i], els_mid[i], reverse); } half::merge(els, reverse); @@ -131,7 +131,7 @@ struct bitonic_warp { auto tile = group::tiled_partition(group::this_thread_block()); auto new_reverse = reverse != upper_half(); - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { auto other = tile.shfl_xor(els[i], num_threads / 2); bitonic_cas(els[i], other, new_reverse); } @@ -206,7 +206,7 @@ struct bitonic_global { auto upper_shared_els = shared_els + (num_groups * num_threads / 2); // only the lower group executes the CAS if (!upper_half()) { - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { auto j = shared_idx(i); bitonic_cas(shared_els[j], upper_shared_els[j], reverse); } @@ -241,11 +241,11 @@ struct bitonic_global { bool reverse) { group::this_thread_block().sync(); - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { local_els[i] = shared_els[shared_idx(i)]; } warp::merge(local_els, reverse); - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { shared_els[shared_idx(i)] = local_els[i]; } } @@ -258,7 +258,7 @@ struct bitonic_global { // This is the first step, so we don't need to load from shared memory warp::sort(local_els, reverse); // store the sorted elements in shared memory - for (auto i = 0; i < num_local; ++i) { + for (int i = 0; i < num_local; ++i) { shared_els[shared_idx(i)] = local_els[i]; } } diff --git a/common/components/uninitialized_array.hpp.inc b/common/components/uninitialized_array.hpp.inc index 3a8b3796c12..e951cf06860 100644 --- a/common/components/uninitialized_array.hpp.inc +++ b/common/components/uninitialized_array.hpp.inc @@ -34,7 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /** * Stores an array with uninitialized contents. * - * This class needed for datatypes that do have a non-empty constructor when` + * This class is needed for datatypes that do have a non-empty constructor when * using them as shared memory, for example `thrust::complex`. * * @tparam ValueType the type of values @@ -49,7 +49,7 @@ public: * * @return the constexpr pointer to the first entry of the array. */ - constexpr GKO_ATTRIBUTES operator ValueType *() const noexcept + constexpr GKO_ATTRIBUTES operator const ValueType *() const noexcept { return &(*this)[0]; } @@ -70,7 +70,8 @@ public: * * @return a reference to the array entry at the given index. 
*/ - constexpr GKO_ATTRIBUTES ValueType &operator[](size_type pos) const noexcept + constexpr GKO_ATTRIBUTES const ValueType &operator[](size_type pos) const + noexcept { return reinterpret_cast<const ValueType *>(data_)[pos]; } diff --git a/common/factorization/par_ilut_filter_kernels.hpp.inc b/common/factorization/par_ilut_filter_kernels.hpp.inc index 25b43e789ee..b5f7d43db67 100644 --- a/common/factorization/par_ilut_filter_kernels.hpp.inc +++ b/common/factorization/par_ilut_filter_kernels.hpp.inc @@ -55,7 +55,7 @@ __device__ void abstract_filter_impl(const IndexType *row_ptrs, auto end = row_ptrs[row + 1]; begin_cb(row); auto num_steps = ceildiv(end - begin, subwarp_size); - for (auto step = 0; step < num_steps; ++step) { + for (IndexType step = 0; step < num_steps; ++step) { auto idx = begin + lane + step * subwarp_size; auto keep = idx < end && pred(idx, begin, end); auto mask = subwarp.ballot(keep); @@ -189,4 +189,4 @@ __global__ __launch_bounds__(default_block_size) void bucket_filter( } -} // namespace kernel \ No newline at end of file +} // namespace kernel diff --git a/common/factorization/par_ilut_select_kernels.hpp.inc b/common/factorization/par_ilut_select_kernels.hpp.inc index 059069faf41..518fd0c1c15 100644 --- a/common/factorization/par_ilut_select_kernels.hpp.inc +++ b/common/factorization/par_ilut_select_kernels.hpp.inc @@ -62,23 +62,27 @@ __global__ __launch_bounds__(searchtree_width) void build_searchtree( // assuming rounding towards zero auto stride = double(size) / sample_size; #pragma unroll - for (auto i = 0; i < sampleselect_oversampling; ++i) { - auto lidx = idx * sampleselect_oversampling + i; - auto val = input[static_cast<IndexType>(lidx * stride)]; - samples[i] = abs(val); - } - __shared__ AbsType sh_samples[sample_size]; - bitonic_sort<sample_size, sampleselect_oversampling>(samples, sh_samples); - if (idx > 0) { - // root has level 0 - auto level = sampleselect_searchtree_height - ffs(threadIdx.x); - // we get the in-level index by removing trailing 10000... - auto idx_in_level = threadIdx.x >> ffs(threadIdx.x); - // we get the global index by adding previous levels - auto previous_levels = (1 << level) - 1; - tree_output[idx_in_level + previous_levels] = samples[0]; - } - tree_output[threadIdx.x + searchtree_inner_size] = samples[0]; + for (int i = 0; i < sampleselect_oversampling; ++i) { + auto lidx = idx * sampleselect_oversampling + i; + auto val = input[static_cast<IndexType>(lidx * stride)]; + samples[i] = abs(val); + } + __shared__ AbsType sh_samples[sample_size]; + bitonic_sort<sample_size, sampleselect_oversampling>(samples, + sh_samples); + if (idx > 0) { + // root has level 0 + auto level = sampleselect_searchtree_height - ffs(threadIdx.x); + // we get the in-level index by removing trailing 10000...
+ auto idx_in_level = threadIdx.x >> ffs(threadIdx.x); + // we get the global index by adding previous levels + auto previous_levels = (1 << level) - 1; + tree_output[idx_in_level + previous_levels] = samples[0]; + } + tree_output[threadIdx.x + searchtree_inner_size] = samples[0]; } @@ -96,45 +100,48 @@ __global__ __launch_bounds__(default_block_size) void count_buckets( const remove_complex<ValueType> *__restrict__ tree, IndexType *counter, unsigned char *oracles, int items_per_thread) { - // load tree into shared memory, initialize counters - __shared__ remove_complex<ValueType> sh_tree[searchtree_inner_size]; - __shared__ IndexType sh_counter[searchtree_width]; - if (threadIdx.x < searchtree_inner_size) { - sh_tree[threadIdx.x] = tree[threadIdx.x]; - } - if (threadIdx.x < searchtree_width) { - sh_counter[threadIdx.x] = 0; - } - group::this_thread_block().sync(); - - // work distribution: each thread block gets a consecutive index range - auto begin = threadIdx.x + default_block_size * - static_cast<IndexType>(blockIdx.x) * - items_per_thread; - auto block_end = default_block_size * - static_cast<IndexType>(blockIdx.x + 1) * items_per_thread; - auto end = min(block_end, size); - for (IndexType i = begin; i < end; i += default_block_size) { - // traverse the search tree with the input element - auto el = abs(input[i]); - IndexType tree_idx{}; + // load tree into shared memory, initialize counters + __shared__ remove_complex<ValueType> sh_tree[searchtree_inner_size]; + __shared__ IndexType sh_counter[searchtree_width]; + if (threadIdx.x < searchtree_inner_size) { + sh_tree[threadIdx.x] = tree[threadIdx.x]; + } + if (threadIdx.x < searchtree_width) { + sh_counter[threadIdx.x] = 0; + } + group::this_thread_block().sync(); + + // work distribution: each thread block gets a consecutive index range + auto begin = threadIdx.x + default_block_size * + static_cast<IndexType>(blockIdx.x) * + items_per_thread; + auto block_end = default_block_size * + static_cast<IndexType>(blockIdx.x + 1) * + items_per_thread; + auto end = min(block_end, size); + for (IndexType i = begin; i < end; i += default_block_size) { + // traverse the search tree with the input element + auto el = abs(input[i]); + IndexType tree_idx{}; #pragma unroll - for (auto level = 0; level < sampleselect_searchtree_height; ++level) { - auto cmp = !(el < sh_tree[tree_idx]); - tree_idx = 2 * tree_idx + 1 + cmp; + for (int level = 0; level < sampleselect_searchtree_height; ++level) { + auto cmp = !(el < sh_tree[tree_idx]); + tree_idx = 2 * tree_idx + 1 + cmp; + } + // increment the bucket counter and store the bucket index + uint32 bucket = tree_idx - searchtree_inner_size; + // post-condition: sample[bucket] <= el < sample[bucket + 1] + atomic_add(sh_counter + bucket, 1); + oracles[i] = bucket; + } + group::this_thread_block().sync(); + + // write back the block-wide counts to global memory + if (threadIdx.x < searchtree_width) { + counter[blockIdx.x + threadIdx.x * gridDim.x] = + sh_counter[threadIdx.x]; } - // increment the bucket counter and store the bucket index - uint32 bucket = tree_idx - searchtree_inner_size; - // post-condition: sample[bucket] <= el < sample[bucket + 1] - atomic_add(sh_counter + bucket, 1); - oracles[i] = bucket; - } - group::this_thread_block().sync(); - - // write back the block-wide counts to global memory - if (threadIdx.x < searchtree_width) { - counter[blockIdx.x + threadIdx.x * gridDim.x] = sh_counter[threadIdx.x]; - } } @@ -151,69 +158,72 @@ __global__ __launch_bounds__(default_block_size) void block_prefix_sum( IndexType *__restrict__ counters,
IndexType *__restrict__ totals, IndexType num_blocks) { - constexpr auto num_warps = default_block_size / config::warp_size; - static_assert(num_warps < config::warp_size, - "block size needs to be smaller"); - __shared__ IndexType warp_sums[num_warps]; - - auto block = group::this_thread_block(); - auto warp = group::tiled_partition<config::warp_size>(block); - - auto bucket = blockIdx.x; - auto local_counters = counters + num_blocks * bucket; - auto work_per_warp = ceildiv(num_blocks, warp.size()); - auto warp_idx = threadIdx.x / warp.size(); - auto warp_lane = warp.thread_rank(); - - // compute prefix sum over warp-sized blocks - IndexType total{}; - auto base_idx = warp_idx * work_per_warp * warp.size(); - for (auto step = 0; step < work_per_warp; ++step) { - auto idx = warp_lane + step * warp.size() + base_idx; - auto val = idx < num_blocks ? local_counters[idx] : zero<IndexType>(); - IndexType warp_total{}; - IndexType warp_prefix{}; - // compute inclusive prefix sum - subwarp_prefix_sum(val, warp_prefix, warp_total, warp); - - if (idx < num_blocks) { - local_counters[idx] = warp_prefix + total; - } - total += warp_total; - } - - // store total sum - if (warp_lane == 0) { - warp_sums[warp_idx] = total; - } - - // compute prefix sum over all warps in a single warp - block.sync(); - if (warp_idx == 0) { - auto in_bounds = warp_lane < num_warps; - auto val = in_bounds ? warp_sums[warp_lane] : zero<IndexType>(); - IndexType prefix_sum{}; - IndexType total_sum{}; - // compute inclusive prefix sum - subwarp_prefix_sum(val, prefix_sum, total_sum, warp); - if (in_bounds) { - warp_sums[warp_lane] = prefix_sum; + constexpr auto num_warps = default_block_size / config::warp_size; + static_assert(num_warps < config::warp_size, + "block size needs to be smaller"); + __shared__ IndexType warp_sums[num_warps]; + + auto block = group::this_thread_block(); + auto warp = group::tiled_partition<config::warp_size>(block); + + auto bucket = blockIdx.x; + auto local_counters = counters + num_blocks * bucket; + auto work_per_warp = ceildiv(num_blocks, warp.size()); + auto warp_idx = threadIdx.x / warp.size(); + auto warp_lane = warp.thread_rank(); + + // compute prefix sum over warp-sized blocks + IndexType total{}; + auto base_idx = warp_idx * work_per_warp * warp.size(); + for (IndexType step = 0; step < work_per_warp; ++step) { + auto idx = warp_lane + step * warp.size() + base_idx; + auto val = + idx < num_blocks ? local_counters[idx] : zero<IndexType>(); + IndexType warp_total{}; + IndexType warp_prefix{}; + // compute inclusive prefix sum + subwarp_prefix_sum(val, warp_prefix, warp_total, warp); + + if (idx < num_blocks) { + local_counters[idx] = warp_prefix + total; + } + total += warp_total; } + + // store total sum if (warp_lane == 0) { - totals[bucket] = total_sum; + warp_sums[warp_idx] = total; } - } - - // add block prefix sum to each warp's block of data - block.sync(); - auto warp_prefixsum = warp_sums[warp_idx]; - for (auto step = 0; step < work_per_warp; ++step) { - auto idx = warp_lane + step * warp.size() + base_idx; - auto val = idx < num_blocks ? local_counters[idx] : zero<IndexType>(); - if (idx < num_blocks) { - local_counters[idx] += warp_prefixsum; + + // compute prefix sum over all warps in a single warp + block.sync(); + if (warp_idx == 0) { + auto in_bounds = warp_lane < num_warps; + auto val = in_bounds ?
warp_sums[warp_lane] : zero<IndexType>(); + IndexType prefix_sum{}; + IndexType total_sum{}; + // compute inclusive prefix sum + subwarp_prefix_sum(val, prefix_sum, total_sum, warp); + if (in_bounds) { + warp_sums[warp_lane] = prefix_sum; + } + if (warp_lane == 0) { + totals[bucket] = total_sum; + } + } + + // add block prefix sum to each warp's block of data + block.sync(); + auto warp_prefixsum = warp_sums[warp_idx]; + for (IndexType step = 0; step < work_per_warp; ++step) { + auto idx = warp_lane + step * warp.size() + base_idx; + auto val = + idx < num_blocks ? local_counters[idx] : zero<IndexType>(); + if (idx < num_blocks) { + local_counters[idx] += warp_prefixsum; + } } - } } @@ -229,28 +239,29 @@ __global__ __launch_bounds__(default_block_size) void filter_bucket( const unsigned char *oracles, const IndexType *block_offsets, remove_complex<ValueType> *__restrict__ output, int items_per_thread) { - // initialize the counter with the block prefix sum. - __shared__ IndexType counter; - if (threadIdx.x == 0) { - counter = block_offsets[blockIdx.x + bucket * gridDim.x]; - } - group::this_thread_block().sync(); - - // same work-distribution as in count_buckets - auto begin = threadIdx.x + default_block_size * - static_cast<IndexType>(blockIdx.x) * - items_per_thread; - auto block_end = default_block_size * - static_cast<IndexType>(blockIdx.x + 1) * items_per_thread; - auto end = min(block_end, size); - for (IndexType i = begin; i < end; i += default_block_size) { - // only copy the element when it belongs to the target bucket - auto found = bucket == oracles[i]; - auto ofs = atomic_add(&counter, found); - if (found) { - output[ofs] = abs(input[i]); + // initialize the counter with the block prefix sum. + __shared__ IndexType counter; + if (threadIdx.x == 0) { + counter = block_offsets[blockIdx.x + bucket * gridDim.x]; + } + group::this_thread_block().sync(); + + // same work-distribution as in count_buckets + auto begin = threadIdx.x + default_block_size * + static_cast<IndexType>(blockIdx.x) * + items_per_thread; + auto block_end = default_block_size * + static_cast<IndexType>(blockIdx.x + 1) * + items_per_thread; + auto end = min(block_end, size); + for (IndexType i = begin; i < end; i += default_block_size) { + // only copy the element when it belongs to the target bucket + auto found = bucket == oracles[i]; + auto ofs = atomic_add(&counter, found); + if (found) { + output[ofs] = abs(input[i]); + } } - } } @@ -264,17 +275,17 @@ __global__ __launch_bounds__(basecase_block_size) void basecase_select( const ValueType *__restrict__ input, IndexType size, IndexType rank, ValueType *__restrict__ out) { - constexpr auto sentinel = device_numeric_limits<ValueType>::inf; - ValueType local[basecase_local_size]; - __shared__ ValueType sh_local[basecase_size]; - for (int i = 0; i < basecase_local_size; ++i) { - auto idx = threadIdx.x + i * basecase_block_size; - local[i] = idx < size ? input[idx] : sentinel; - } - bitonic_sort<basecase_size, basecase_local_size>(local, sh_local); - if (threadIdx.x == rank / basecase_local_size) { - *out = local[rank % basecase_local_size]; - } + constexpr auto sentinel = device_numeric_limits<ValueType>::inf; + ValueType local[basecase_local_size]; + __shared__ ValueType sh_local[basecase_size]; + for (int i = 0; i < basecase_local_size; ++i) { + auto idx = threadIdx.x + i * basecase_block_size; + local[i] = idx < size ?
input[idx] : sentinel; + } + bitonic_sort<basecase_size, basecase_local_size>(local, sh_local); + if (threadIdx.x == rank / basecase_local_size) { + *out = local[rank % basecase_local_size]; + } } @@ -289,19 +300,19 @@ template <typename IndexType> __global__ __launch_bounds__(config::warp_size) void find_bucket( IndexType *prefix_sum, IndexType rank) { - auto warp = - group::tiled_partition<config::warp_size>(group::this_thread_block()); - auto idx = group_wide_search(0, searchtree_width, warp, [&](int i) { - return prefix_sum[i + 1] > rank; - }); - if (warp.thread_rank() == 0) { - auto base = prefix_sum[idx]; - auto size = prefix_sum[idx + 1] - base; - // don't overwrite anything before having loaded everything! - prefix_sum[0] = idx; - prefix_sum[1] = base; - prefix_sum[2] = size; - } + auto warp = group::tiled_partition<config::warp_size>( + group::this_thread_block()); + auto idx = group_wide_search(0, searchtree_width, warp, [&](int i) { + return prefix_sum[i + 1] > rank; + }); + if (warp.thread_rank() == 0) { + auto base = prefix_sum[idx]; + auto size = prefix_sum[idx + 1] - base; + // don't overwrite anything before having loaded everything! + prefix_sum[0] = idx; + prefix_sum[1] = base; + prefix_sum[2] = size; + } } diff --git a/common/matrix/dense_kernels.hpp.inc b/common/matrix/dense_kernels.hpp.inc index d46b202a8ff..c7ebafd0627 100644 --- a/common/matrix/dense_kernels.hpp.inc +++ b/common/matrix/dense_kernels.hpp.inc @@ -211,7 +211,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr( if (tidx < num_rows) { auto write_to = row_ptrs[tidx]; - for (auto i = 0; i < num_cols; i++) { + for (size_type i = 0; i < num_cols; i++) { if (source[stride * tidx + i] != zero<ValueType>()) { values[write_to] = source[stride * tidx + i]; col_idxs[write_to] = i; diff --git a/common/matrix/ell_kernels.hpp.inc b/common/matrix/ell_kernels.hpp.inc index 2323d512258..399dd5070ac 100644 --- a/common/matrix/ell_kernels.hpp.inc +++ b/common/matrix/ell_kernels.hpp.inc @@ -179,7 +179,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_dense( { const auto tidx = thread::get_thread_id_flat(); if (tidx < num_rows) { - for (auto col = 0; col < nnz; col++) { + for (size_type col = 0; col < nnz; col++) { result[tidx * result_stride + col_idxs[tidx + col * source_stride]] += values[tidx + col * source_stride]; @@ -226,7 +226,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr( if (tidx < num_rows) { auto write_to = result_row_ptrs[tidx]; - for (auto i = 0; i < max_nnz_per_row; i++) { + for (size_type i = 0; i < max_nnz_per_row; i++) { const auto source_idx = tidx + stride * i; if (source_values[source_idx] != zero<ValueType>()) { result_values[write_to] = source_values[source_idx]; diff --git a/common/matrix/hybrid_kernels.hpp.inc b/common/matrix/hybrid_kernels.hpp.inc index b6af7c2be36..c7c192189e0 100644 --- a/common/matrix/hybrid_kernels.hpp.inc +++ b/common/matrix/hybrid_kernels.hpp.inc @@ -108,7 +108,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr( if (tidx < num_rows) { auto write_to = result_row_ptrs[tidx]; - for (auto i = 0; i < max_nnz_per_row; i++) { + for (size_type i = 0; i < max_nnz_per_row; i++) { const auto source_idx = tidx + stride * i; if (ell_val[source_idx] != zero<ValueType>()) { result_values[write_to] = ell_val[source_idx]; diff --git a/cuda/components/prefix_sum.cu b/cuda/components/prefix_sum.cu index 54739c783c8..ce108fa8cf9 100644 --- a/cuda/components/prefix_sum.cu +++ b/cuda/components/prefix_sum.cu @@ -49,7 +49,7 @@ template <typename IndexType> void prefix_sum(std::shared_ptr<const CudaExecutor> exec, IndexType *counts, size_type num_entries) { - // prefix_sum should be on
the valid array + // prefix_sum should only be performed on a valid array if (num_entries > 0) { auto num_blocks = ceildiv(num_entries, prefix_sum_block_size); Array<IndexType> block_sum_array(exec, num_blocks - 1); @@ -57,8 +57,8 @@ void prefix_sum(std::shared_ptr<const CudaExecutor> exec, IndexType *counts, start_prefix_sum<prefix_sum_block_size> <<<num_blocks, prefix_sum_block_size>>>(num_entries, counts, block_sums); - // add the total sum of the previous block only when the number of block - // is larger than 1. + // add the total sum of the previous block only when the number of + // blocks is larger than 1. if (num_blocks > 1) { finalize_prefix_sum<prefix_sum_block_size> <<<num_blocks, prefix_sum_block_size>>>(num_entries, counts, diff --git a/cuda/test/components/sorting_kernels.cu b/cuda/test/components/sorting_kernels.cu index e973cc0f650..9659ecbc7a8 100644 --- a/cuda/test/components/sorting_kernels.cu +++ b/cuda/test/components/sorting_kernels.cu @@ -99,7 +99,7 @@ protected: { // we want some duplicate elements std::uniform_int_distribution dist(0, num_elements / 2); - for (auto i = 0; i < num_elements; ++i) { + for (int i = 0; i < num_elements; ++i) { ref_shared.get_data()[i] = dist(rng); } ddata = gko::Array{cuda, ref_shared}; diff --git a/cuda/test/matrix/dense_kernels.cpp b/cuda/test/matrix/dense_kernels.cpp index 6e40ce5b5a3..de96d27d823 100644 --- a/cuda/test/matrix/dense_kernels.cpp +++ b/cuda/test/matrix/dense_kernels.cpp @@ -550,7 +550,7 @@ TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef) &dnnz_per_row); auto tmp = gko::Array<gko::size_type>(ref, dnnz_per_row); - for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) { + for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) { ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]); } } diff --git a/cuda/test/matrix/ell_kernels.cpp b/cuda/test/matrix/ell_kernels.cpp index 2df1c397f4c..51c12fab531 100644 --- a/cuda/test/matrix/ell_kernels.cpp +++ b/cuda/test/matrix/ell_kernels.cpp @@ -585,7 +585,7 @@ TEST_F(Ell, CalculateNNZPerRowIsEquivalentToRef) &dnnz_per_row); auto tmp = gko::Array<gko::size_type>(ref, dnnz_per_row); - for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) { + for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) { ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]); } } diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 443d180b172..7729588d363 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -60,13 +60,15 @@ target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP) set(GINKGO_DPCPP_FLAGS ${GINKGO_DPCPP_FLAGS} PARENT_SCOPE) target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}") -# Note. add MKL via PRIVATE not PUBLIC (MKL example shows) to avoid find_package(MKL) everywhere when link ginkgo +# Note: add MKL as PRIVATE not PUBLIC to avoid propagating +# find_package(MKL) everywhere when linking ginkgo (see the MKL example +# https://software.intel.com/content/www/us/en/develop/documentation/onemkl-windows-developer-guide/top/getting-started/cmake-config-for-onemkl.html) target_compile_options(ginkgo_dpcpp PRIVATE $) target_compile_features(ginkgo_dpcpp PRIVATE cxx_std_17) target_include_directories(ginkgo_dpcpp PRIVATE $) target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-lib=all) # When building ginkgo as a static library, we need to use dpcpp and per_kernel # link option when the program uses a dpcpp related function.
if (BUILD_SHARED_LIBS) target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel) else () diff --git a/dpcpp/base/helper.dp.cpp b/dpcpp/base/helper.dp.cpp index ae453dd937d..5e6c1a579f5 100644 --- a/dpcpp/base/helper.dp.cpp +++ b/dpcpp/base/helper.dp.cpp @@ -44,18 +44,16 @@ namespace dpcpp { bool validate(sycl::queue *queue, unsigned int workgroup_size, unsigned int subgroup_size) { - { - auto device = queue->get_device(); - auto subgroup_size_list = - device.get_info<sycl::info::device::sub_group_sizes>(); - auto max_workgroup_size = - device.get_info<sycl::info::device::max_work_group_size>(); - bool allowed = false; - for (auto &i : subgroup_size_list) { - allowed |= (i == subgroup_size); - } - return allowed && (workgroup_size <= max_workgroup_size); + auto device = queue->get_device(); + auto subgroup_size_list = + device.get_info<sycl::info::device::sub_group_sizes>(); + auto max_workgroup_size = + device.get_info<sycl::info::device::max_work_group_size>(); + bool allowed = false; + for (auto &i : subgroup_size_list) { + allowed |= (i == subgroup_size); } + return allowed && (workgroup_size <= max_workgroup_size); } diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp index 8c7f45e5174..16d91c2ef8d 100644 --- a/dpcpp/base/helper.hpp +++ b/dpcpp/base/helper.hpp @@ -51,44 +51,44 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /** * GKO_ENABLE_DEFAULT_HOST gives a default host implementation for those * kernels which require encoded config but do not need explicit template - * parameter and share memory + * parameter and shared memory * * @param name_ the name of the host function with config * @param kernel_ the kernel name */ -#define GKO_ENABLE_DEFAULT_HOST(name_, kernel_) \ - template <typename... InferredArgs> \ - void name_(dim3 grid, dim3 block, size_t dynamic_shared_memory, \ - sycl::queue *queue, InferredArgs... args) \ - { \ - queue->submit([&](sycl::handler &cgh) { \ - cgh.parallel_for(sycl_nd_range(grid, block), \ - [=](sycl::nd_item<3> item_ct1) { \ - kernel_(args..., item_ct1); \ - }); \ - }); \ +#define GKO_ENABLE_DEFAULT_HOST(name_, kernel_) \ + template <typename... InferredArgs> \ + void name_(dim3 grid, dim3 block, size_t, sycl::queue *queue, \ + InferredArgs... args) \ + { \ + queue->submit([&](sycl::handler &cgh) { \ + cgh.parallel_for(sycl_nd_range(grid, block), \ + [=](sycl::nd_item<3> item_ct1) { \ + kernel_(args..., item_ct1); \ + }); \ + }); \ } /** * GKO_ENABLE_DEFAULT_HOST_CONFIG gives a default host implementation for those * kernels which require encoded config but do not need explicit template - * parameter and share memory + * parameter and shared memory * * @param name_ the name of the host function with config * @param kernel_ the kernel name */ -#define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_) \ - template <typename... InferredArgs> \ - inline void name_(dim3 grid, dim3 block, size_t dynamic_shared_memory, \ - sycl::queue *queue, InferredArgs... args) \ - { \ - queue->submit([&](sycl::handler &cgh) { \ - cgh.parallel_for(sycl_nd_range(grid, block), \ - [=](sycl::nd_item<3> item_ct1) { \ - kernel_(args..., item_ct1); \ - }); \ - }); \ +#define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_) \ + template <typename... InferredArgs> \ + inline void name_(dim3 grid, dim3 block, size_t, sycl::queue *queue, \ + InferredArgs... args) \ + { \ + queue->submit([&](sycl::handler &cgh) { \ + cgh.parallel_for(sycl_nd_range(grid, block), \ + [=](sycl::nd_item<3> item_ct1) { \ + kernel_(args..., item_ct1); \ + }); \ + }); \ } /** @@ -138,10 +138,33 @@ namespace kernels { namespace dpcpp { +/** + * Checks whether a given kernel configuration is valid on the queue's device: + * the workgroup size must not exceed the device's maximum workgroup size, and + * the subgroup size must be among the device's supported subgroup sizes.
+ * + * @param queue the sycl queue pointer + * @param workgroup_size the workgroup size (block size in cuda sense) + * @param subgroup_size the subgroup size (warp size in cuda sense) + * + * @return whether the given arguments are valid for the given queue. + */ bool validate(sycl::queue *queue, unsigned workgroup_size, unsigned subgroup_size); +/** + * get_first_cfg returns the first config from the given config array that + * passes the validate function. + * + * @tparam IterArr the iterable array type + * @tparam Validate the validate function type + * + * @param arr the config array + * @param verify the validate function + * + * @return the first valid config + */ template <typename IterArr, typename Validate> std::uint32_t get_first_cfg(IterArr &arr, Validate verify) { diff --git a/dpcpp/base/onemkl_bindings.hpp b/dpcpp/base/onemkl_bindings.hpp index 6456a048d23..d2a13f9eab7 100644 --- a/dpcpp/base/onemkl_bindings.hpp +++ b/dpcpp/base/onemkl_bindings.hpp @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include namespace gko { diff --git a/dpcpp/components/prefix_sum.dp.cpp b/dpcpp/components/prefix_sum.dp.cpp index 07cdb5b38aa..63f33e9ba35 100644 --- a/dpcpp/components/prefix_sum.dp.cpp +++ b/dpcpp/components/prefix_sum.dp.cpp @@ -70,7 +70,7 @@ template <typename IndexType> void prefix_sum(std::shared_ptr<const DpcppExecutor> exec, IndexType *counts, size_type num_entries) { - // prefix_sum should be on the valid array + // prefix_sum should only be performed on a valid array if (num_entries > 0) { auto queue = exec->get_queue(); constexpr auto block_cfg_array = as_array(block_cfg_list); @@ -84,8 +84,8 @@ void prefix_sum(std::shared_ptr<const DpcppExecutor> exec, IndexType *counts, auto block_sums = block_sum_array.get_data(); start_prefix_sum_call(cfg, num_blocks, wg_size, 0, exec->get_queue(), num_entries, counts, block_sums); - // add the total sum of the previous block only when the number of block - // is larger than 1. + // add the total sum of the previous block only when the number of + // blocks is larger than 1. if (num_blocks > 1) { finalize_prefix_sum_call(cfg, num_blocks, wg_size, 0, exec->get_queue(), num_entries, counts, diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp index 22e6139dd84..cb407118aed 100644 --- a/dpcpp/components/prefix_sum.dp.hpp +++ b/dpcpp/components/prefix_sum.dp.hpp @@ -78,7 +78,7 @@ __dpct_inline__ void subwarp_prefix_sum(ValueType element, total_sum = element; #pragma unroll // hypercube prefix sum - for (auto step = 1; step < subwarp.size(); step *= 2) { + for (int step = 1; step < subwarp.size(); step *= 2) { auto neighbor = subwarp.shfl_xor(total_sum, step); total_sum += neighbor; prefix_sum += bool(subwarp.thread_rank() & step) ?
neighbor : 0; diff --git a/dpcpp/components/thread_ids.dp.hpp b/dpcpp/components/thread_ids.dp.hpp index 47abf3c7b72..f9decfd989d 100644 --- a/dpcpp/components/thread_ids.dp.hpp +++ b/dpcpp/components/thread_ids.dp.hpp @@ -195,7 +195,7 @@ __dpct_inline__ size_type get_warp_id(sycl::nd_item<3> item_ct1) template __dpct_inline__ size_type get_subwarp_id(sycl::nd_item<3> item_ct1) { - // dpcpp dose not have subwarp + // dpcpp does not have subwarp constexpr auto subwarps_per_warp = subwarp_size / subwarp_size; return get_warp_id(item_ct1) * subwarps_per_warp + item_ct1.get_local_id(1); diff --git a/dpcpp/components/uninitialized_array.hpp b/dpcpp/components/uninitialized_array.hpp index b10457df217..d9d423c9c94 100644 --- a/dpcpp/components/uninitialized_array.hpp +++ b/dpcpp/components/uninitialized_array.hpp @@ -48,7 +48,7 @@ namespace dpcpp { /** * Stores an array with uninitialized contents. * - * This class needed for datatypes that do have a non-empty constructor when` + * This class is needed for datatypes that do have a non-empty constructor when * using them as shared memory, for example `thrust::complex`. * * @tparam ValueType the type of values @@ -63,7 +63,7 @@ class UninitializedArray { * * @return the constexpr pointer to the first entry of the array. */ - constexpr __dpct_inline__ operator ValueType *() const noexcept + constexpr __dpct_inline__ operator const ValueType *() const noexcept { return &(*this)[0]; } @@ -84,7 +84,7 @@ class UninitializedArray { * * @return a reference to the array entry at the given index. */ - constexpr __dpct_inline__ ValueType &operator[](size_type pos) const + constexpr __dpct_inline__ const ValueType &operator[](size_type pos) const noexcept { return data_[pos]; diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 32eef01af63..3d7af2024fc 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -433,7 +433,7 @@ void fill_in_csr(size_type num_rows, size_type num_cols, size_type stride, if (tidx < num_rows) { auto write_to = row_ptrs[tidx]; - for (auto i = 0; i < num_cols; i++) { + for (size_type i = 0; i < num_cols; i++) { if (source[stride * tidx + i] != zero()) { values[write_to] = source[stride * tidx + i]; col_idxs[write_to] = i; diff --git a/dpcpp/test/matrix/dense_kernels.cpp b/dpcpp/test/matrix/dense_kernels.cpp index 43ce9bad547..257ee6fbc6a 100644 --- a/dpcpp/test/matrix/dense_kernels.cpp +++ b/dpcpp/test/matrix/dense_kernels.cpp @@ -697,7 +697,7 @@ TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef) &dnnz_per_row); auto tmp = gko::Array(ref, dnnz_per_row); - for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) { + for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) { ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]); } } diff --git a/hip/components/prefix_sum.hip.cpp b/hip/components/prefix_sum.hip.cpp index 28cd01b4fb5..9302fc07b9a 100644 --- a/hip/components/prefix_sum.hip.cpp +++ b/hip/components/prefix_sum.hip.cpp @@ -49,7 +49,7 @@ template void prefix_sum(std::shared_ptr exec, IndexType *counts, size_type num_entries) { - // prefix_sum should be on the valid array + // prefix_sum should only be performed on a valid array if (num_entries > 0) { auto num_blocks = ceildiv(num_entries, prefix_sum_block_size); Array block_sum_array(exec, num_blocks - 1); @@ -58,8 +58,8 @@ void prefix_sum(std::shared_ptr exec, IndexType *counts, HIP_KERNEL_NAME(start_prefix_sum), dim3(num_blocks), dim3(prefix_sum_block_size), 0, 0, num_entries, counts, 
block_sums); - // add the total sum of the previous block only when the number of block - // is larger than 1. + // add the total sum of the previous block only when the number of + // blocks is larger than 1. if (num_blocks > 1) { hipLaunchKernelGGL( HIP_KERNEL_NAME(finalize_prefix_sum), diff --git a/include/ginkgo/core/synthesizer/containers.hpp b/include/ginkgo/core/synthesizer/containers.hpp index 10e8c1031a1..0e9570540fa 100644 --- a/include/ginkgo/core/synthesizer/containers.hpp +++ b/include/ginkgo/core/synthesizer/containers.hpp @@ -51,7 +51,7 @@ namespace syn { * value_list records several values with the same type in template. * * @tparam T the value type of the list - * @tparam T... the values in the list + * @tparam Values the values in the list */ template <typename T, T... Values> struct value_list {}; @@ -60,7 +60,7 @@ struct value_list {}; /** * type_list records several types in template * - * @tparam ...Types the types in the list + * @tparam Types the types in the list */ template <typename... Types> struct type_list {}; @@ -69,9 +69,9 @@ struct type_list {}; /** * range records start, end, step in template * - * @tparam int start of range - * @tparam int end of range - * @tparam int step of range. default is 1 + * @tparam Start start of range + * @tparam End end of range + * @tparam Step step of range. default is 1 */ template <int Start, int End, int Step = 1> struct range {}; @@ -93,8 +93,8 @@ struct concatenate_impl; * concatenate_impl specializes for two value_list with the same value type. * * @tparam T the value type of two value_list - * @tparam T... the values of the first list - * @tparam T... the values of the second list + * @tparam Values1 the values of the first list + * @tparam Values2 the values of the second list */ template <typename T, T... Values1, T... Values2> struct concatenate_impl<value_list<T, Values1...>, value_list<T, Values2...>> { @@ -130,7 +130,7 @@ struct as_list_impl; * as_list_impl specializes for the value_list * * @tparam T the value_list type - * @tparam T... the values of value_list + * @tparam Values the values of value_list */ template <typename T, T... Values> struct as_list_impl<value_list<T, Values...>> { @@ -193,7 +193,7 @@ using as_list = typename detail::as_list_impl<T>::type; * for in runtime on the array. * * @tparam T the type of value_list - * @tparam T... the values of value_list + * @tparam Value the values of value_list * * @param value_list the input value_list *
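
A note on the recurring loop-index changes in this patch: in `for (auto i = 0; ...)`, `auto` deduces `int` from the literal `0`, so when the loop bound has a 64-bit (and often unsigned) type such as `gko::size_type`, the comparison mixes signedness and the index can overflow before ever reaching the bound. The following standalone sketch (not part of the patch; `size_type` here is a stand-in for `gko::size_type`) illustrates the pitfall and the fix:

    #include <cstdint>
    #include <iostream>

    using size_type = std::uint64_t;  // stand-in for gko::size_type

    int main()
    {
        // a bound that does not fit into a 32-bit int
        const size_type num_entries = (size_type{1} << 32) + 3;

        // Broken pattern: `auto i = 0` deduces `int`, so incrementing `i`
        // past INT_MAX is undefined behavior, and `i < num_entries`
        // compares a signed 32-bit index against an unsigned 64-bit bound.
        // for (auto i = 0; i < num_entries; ++i) { ... }

        // Fixed pattern, as in this patch: the index uses the bound's own
        // type (striding in large steps so the example finishes quickly).
        size_type iterations = 0;
        for (size_type i = 0; i < num_entries; i += size_type{1} << 30) {
            ++iterations;
        }
        std::cout << "performed " << iterations << " iterations\n";
    }

This is also why the loops above spell out `int` or `IndexType` rather than taking `decltype` of a `constexpr` bound: `decltype` applied to a `constexpr` variable yields a `const`-qualified type, which cannot serve as a mutable loop index.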