diff --git a/common/cuda_hip/components/segment_scan.hpp.inc b/common/cuda_hip/components/segment_scan.hpp.inc index 2bd2e3eec90..cc873d5850d 100644 --- a/common/cuda_hip/components/segment_scan.hpp.inc +++ b/common/cuda_hip/components/segment_scan.hpp.inc @@ -38,25 +38,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * performs suffix sum. Works on the source array and returns whether the thread * is the first element of its segment with same `ind`. */ -template +template __device__ __forceinline__ bool segment_scan( const group::thread_block_tile& group, const IndexType ind, - ValueType* __restrict__ val) + ValueType& val, Operator op) { bool head = true; #pragma unroll for (int i = 1; i < subwarp_size; i <<= 1) { const IndexType add_ind = group.shfl_up(ind, i); - ValueType add_val = zero(); - if (add_ind == ind && threadIdx.x >= i) { - add_val = *val; + ValueType add_val{}; + if (add_ind == ind && group.thread_rank() >= i) { + add_val = val; if (i == 1) { head = false; } } add_val = group.shfl_down(add_val, i); - if (threadIdx.x < subwarp_size - i) { - *val += add_val; + if (group.thread_rank() < subwarp_size - i) { + val = op(val, add_val); } } return head; diff --git a/common/cuda_hip/matrix/coo_kernels.hpp.inc b/common/cuda_hip/matrix/coo_kernels.hpp.inc index a13068c1ce0..a5dd70bc95a 100644 --- a/common/cuda_hip/matrix/coo_kernels.hpp.inc +++ b/common/cuda_hip/matrix/coo_kernels.hpp.inc @@ -82,8 +82,9 @@ __device__ void spmv_kernel(const size_type nnz, const size_type num_lines, (ind + subwarp_size < nnz) ? row[ind + subwarp_size] : row[nnz - 1]; // segmented scan if (tile_block.any(curr_row != next_row)) { - bool is_first_in_segment = - segment_scan(tile_block, curr_row, &temp_val); + bool is_first_in_segment = segment_scan( + tile_block, curr_row, temp_val, + [](ValueType a, ValueType b) { return a + b; }); if (is_first_in_segment) { atomic_add(&(c[curr_row * c_stride + column_id]), scale(temp_val)); @@ -97,8 +98,9 @@ __device__ void spmv_kernel(const size_type nnz, const size_type num_lines, temp_val += (ind < nnz) ? 
val[ind] * b[col[ind] * b_stride + column_id] : zero(); // segmented scan - bool is_first_in_segment = - segment_scan(tile_block, curr_row, &temp_val); + bool is_first_in_segment = segment_scan( + tile_block, curr_row, temp_val, + [](ValueType a, ValueType b) { return a + b; }); if (is_first_in_segment) { atomic_add(&(c[curr_row * c_stride + column_id]), scale(temp_val)); } diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index a754971db77..3cdf7f24d6d 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -68,22 +68,21 @@ __device__ __forceinline__ bool block_segment_scan_reverse( template __device__ __forceinline__ void find_next_row( const IndexType num_rows, const IndexType data_size, const IndexType ind, - IndexType* __restrict__ row, IndexType* __restrict__ row_end, - const IndexType row_predict, const IndexType row_predict_end, - const IndexType* __restrict__ row_ptr) + IndexType& row, IndexType& row_end, const IndexType row_predict, + const IndexType row_predict_end, const IndexType* __restrict__ row_ptr) { if (!overflow || ind < data_size) { - if (ind >= *row_end) { - *row = row_predict; - *row_end = row_predict_end; - while (ind >= *row_end) { - *row_end = row_ptr[++*row + 1]; + if (ind >= row_end) { + row = row_predict; + row_end = row_predict_end; + while (ind >= row_end) { + row_end = row_ptr[++row + 1]; } } } else { - *row = num_rows - 1; - *row_end = data_size; + row = num_rows - 1; + row_end = data_size; } } @@ -92,16 +91,17 @@ template __device__ __forceinline__ void warp_atomic_add( const group::thread_block_tile& group, bool force_write, - ValueType* __restrict__ val, const IndexType row, ValueType* __restrict__ c, + ValueType& val, const IndexType row, ValueType* __restrict__ c, const size_type c_stride, const IndexType column_id, Closure scale) { // do a local scan to avoid atomic collisions - const bool need_write = segment_scan(group, row, val); + const bool need_write = segment_scan( + group, row, val, [](ValueType a, ValueType b) { return a + b; }); if (need_write && force_write) { - atomic_add(&(c[row * c_stride + column_id]), scale(*val)); + atomic_add(&(c[row * c_stride + column_id]), scale(val)); } if (!need_write || force_write) { - *val = zero(); + val = zero(); } } @@ -111,28 +111,27 @@ template & group, const IndexType num_rows, const IndexType data_size, const IndexType ind, - IndexType* __restrict__ row, IndexType* __restrict__ row_end, - IndexType* __restrict__ nrow, IndexType* __restrict__ nrow_end, - ValueType* __restrict__ temp_val, const ValueType* __restrict__ val, + IndexType& row, IndexType& row_end, IndexType& nrow, IndexType& nrow_end, + ValueType& temp_val, const ValueType* __restrict__ val, const IndexType* __restrict__ col_idxs, const IndexType* __restrict__ row_ptrs, const ValueType* __restrict__ b, const size_type b_stride, ValueType* __restrict__ c, const size_type c_stride, const IndexType column_id, Closure scale) { - const IndexType curr_row = *row; - find_next_row(num_rows, data_size, ind, row, row_end, *nrow, - *nrow_end, row_ptrs); + const auto curr_row = row; + find_next_row(num_rows, data_size, ind, row, row_end, nrow, nrow_end, + row_ptrs); // segmented scan - if (group.any(curr_row != *row)) { - warp_atomic_add(group, curr_row != *row, temp_val, curr_row, c, - c_stride, column_id, scale); - *nrow = group.shfl(*row, subwarp_size - 1); - *nrow_end = group.shfl(*row_end, subwarp_size - 1); + if (group.any(curr_row != row)) { + 
warp_atomic_add(group, curr_row != row, temp_val, curr_row, c, c_stride, + column_id, scale); + nrow = group.shfl(row, subwarp_size - 1); + nrow_end = group.shfl(row_end, subwarp_size - 1); } if (!last || ind < data_size) { const auto col = col_idxs[ind]; - *temp_val += val[ind] * b[col * b_stride + column_id]; + temp_val += val[ind] * b[col * b_stride + column_id]; } } @@ -171,21 +170,21 @@ __device__ __forceinline__ void spmv_kernel( auto nrow_end = row_end; ValueType temp_val = zero(); IndexType ind = start + threadIdx.x; - find_next_row(num_rows, data_size, ind, &row, &row_end, nrow, - nrow_end, row_ptrs); + find_next_row(num_rows, data_size, ind, row, row_end, nrow, nrow_end, + row_ptrs); const IndexType ind_end = end - wsize; const auto tile_block = group::tiled_partition(group::this_thread_block()); for (; ind < ind_end; ind += wsize) { - process_window(tile_block, num_rows, data_size, ind, &row, - &row_end, &nrow, &nrow_end, &temp_val, val, - col_idxs, row_ptrs, b, b_stride, c, c_stride, - column_id, scale); - } - process_window(tile_block, num_rows, data_size, ind, &row, &row_end, - &nrow, &nrow_end, &temp_val, val, col_idxs, row_ptrs, - b, b_stride, c, c_stride, column_id, scale); - warp_atomic_add(tile_block, true, &temp_val, row, c, c_stride, column_id, + process_window(tile_block, num_rows, data_size, ind, row, + row_end, nrow, nrow_end, temp_val, val, col_idxs, + row_ptrs, b, b_stride, c, c_stride, column_id, + scale); + } + process_window(tile_block, num_rows, data_size, ind, row, row_end, + nrow, nrow_end, temp_val, val, col_idxs, row_ptrs, b, + b_stride, c, c_stride, column_id, scale); + warp_atomic_add(tile_block, true, temp_val, row, c, c_stride, column_id, scale); } @@ -915,3 +914,187 @@ void convert_to_fbcsr(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); + + +namespace kernel { + + +template +__global__ __launch_bounds__(default_block_size) void build_csr_lookup( + const IndexType* __restrict__ row_ptrs, + const IndexType* __restrict__ col_idxs, size_type num_rows, + matrix::csr::sparsity_type allowed, const IndexType* storage_offsets, + int64* __restrict__ row_desc, int32* __restrict__ storage) +{ + using matrix::csr::sparsity_type; + constexpr int bitmap_block_size = matrix::csr::sparsity_bitmap_block_size; + auto row = thread::get_subwarp_id_flat(); + if (row >= num_rows) { + return; + } + const auto subwarp = + group::tiled_partition(group::this_thread_block()); + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto storage_begin = storage_offsets[row]; + const auto available_storage = storage_offsets[row + 1] - storage_begin; + const auto local_storage = storage + storage_begin; + const auto local_cols = col_idxs + row_begin; + const auto lane = subwarp.thread_rank(); + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? 
local_cols[row_len - 1] - min_col + 1 : 0; + + // full column range + if (col_range == row_len && + csr_lookup_allowed(allowed, sparsity_type::full)) { + if (lane == 0) { + row_desc[row] = static_cast(sparsity_type::full); + } + return; + } + // dense bitmap storage + const auto num_blocks = + static_cast(ceildiv(col_range, bitmap_block_size)); + if (num_blocks * 2 <= available_storage && + csr_lookup_allowed(allowed, sparsity_type::bitmap)) { + if (lane == 0) { + row_desc[row] = (static_cast(num_blocks) << 32) | + static_cast(sparsity_type::bitmap); + } + const auto block_ranks = local_storage; + const auto block_bitmaps = + reinterpret_cast(block_ranks + num_blocks); + // fill bitmaps with zeros + for (int32 i = lane; i < num_blocks; i += subwarp_size) { + block_bitmaps[i] = 0; + } + // fill bitmaps with sparsity pattern + for (IndexType base_i = 0; base_i < row_len; base_i += subwarp_size) { + const auto i = base_i + lane; + const auto col = i < row_len + ? local_cols[i] + : device_numeric_limits::max; + const auto rel_col = static_cast(col - min_col); + const auto block = rel_col / bitmap_block_size; + const auto col_in_block = rel_col % bitmap_block_size; + auto local_bitmap = uint32{i < row_len ? 1u : 0u} << col_in_block; + bool is_first = + segment_scan(subwarp, block, local_bitmap, + [](config::lane_mask_type a, + config::lane_mask_type b) { return a | b; }); + // memory barrier - just to be sure + subwarp.sync(); + if (is_first && i < row_len) { + block_bitmaps[block] |= local_bitmap; + } + } + // compute bitmap ranks + int32 block_partial_sum{}; + for (int32 base_i = 0; base_i < num_blocks; base_i += subwarp_size) { + const auto i = base_i + lane; + const auto bitmap = i < num_blocks ? block_bitmaps[i] : 0; + int32 local_partial_sum{}; + int32 local_total_sum{}; + subwarp_prefix_sum(popcnt(bitmap), local_partial_sum, + local_total_sum, subwarp); + if (i < num_blocks) { + block_ranks[i] = local_partial_sum + block_partial_sum; + } + block_partial_sum += local_total_sum; + } + return; + } + // sparse hashmap storage + // we need at least one unfilled entry to avoid infinite loops on search + GKO_ASSERT(row_len < available_storage); + constexpr double inv_golden_ratio = 0.61803398875; + // use golden ratio as approximation for hash parameter that spreads + // consecutive values as far apart as possible. Ensure lowest bit is set + // otherwise we skip odd hashtable entries + const auto hash_parameter = + 1u | static_cast(available_storage * inv_golden_ratio); + if (lane == 0) { + row_desc[row] = (static_cast(hash_parameter) << 32) | + static_cast(sparsity_type::hash); + } + // fill hashmap with sentinel + constexpr int32 empty = invalid_index(); + for (int32 i = lane; i < available_storage; i += subwarp_size) { + local_storage[i] = empty; + } + // memory barrier + subwarp.sync(); + // fill with actual entries + for (IndexType base_i = 0; base_i < row_len; base_i += subwarp_size) { + const auto i = base_i + lane; + const auto col = i < row_len ? local_cols[i] : 0; + // make sure that each idle thread gets a unique out-of-bounds value + auto hash = i < row_len ? 
(static_cast(col) * hash_parameter) % + static_cast(available_storage) + : static_cast(available_storage + lane); + // collision resolution +#if !defined(__HIP_DEVICE_COMPILE__) && defined(__CUDA_ARCH__) && \ + (__CUDA_ARCH__ >= 700) + const auto this_lane_mask = config::lane_mask_type{1} << lane; + const auto lane_prefix_mask = this_lane_mask - 1; + // memory barrier to previous loop iteration + subwarp.sync(); + int32 entry = i < row_len ? local_storage[hash] : empty; + // find all threads in the subwarp with the same hash key + auto colliding = subwarp.match_any(hash); + // if there are any collisions with previously filled buckets + while (subwarp.any(entry != empty || colliding != this_lane_mask)) { + if (entry != empty || colliding != this_lane_mask) { + // assign consecutive indices to matching threads + hash += (entry == empty ? 0 : 1) + + popcnt(colliding & lane_prefix_mask); + // cheap modulo replacement + if (hash >= available_storage) { + hash -= available_storage; + } + // this could only fail for available_storage < warp_size, as + // popcnt(colliding) is at most warp_size. At the same time, we + // only increase hash by row_length at most, so this is still + // safe. + GKO_ASSERT(hash < available_storage); + entry = local_storage[hash]; + } + colliding = subwarp.match_any(hash); + } + if (i < row_len) { + local_storage[hash] = i; + } +#else + if (i < row_len) { + while (atomicCAS(local_storage + hash, empty, i) != empty) { + hash++; + if (hash >= available_storage) { + hash = 0; + } + } + } +#endif + } +} + + +} // namespace kernel + + +template +void build_lookup(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, matrix::csr::sparsity_type allowed, + const IndexType* storage_offsets, int64* row_desc, + int32* storage) +{ + const auto num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + kernel::build_csr_lookup + <<>>(row_ptrs, col_idxs, num_rows, + allowed, storage_offsets, row_desc, + storage); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); diff --git a/common/unified/matrix/csr_kernels.cpp b/common/unified/matrix/csr_kernels.cpp index d6a81453b86..690f96debdc 100644 --- a/common/unified/matrix/csr_kernels.cpp +++ b/common/unified/matrix/csr_kernels.cpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common/unified/base/kernel_launch.hpp" +#include "core/components/prefix_sum_kernels.hpp" namespace gko { @@ -243,6 +244,49 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL); +template +void build_lookup_offsets(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, + matrix::csr::sparsity_type allowed, + IndexType* storage_offsets) +{ + using matrix::csr::sparsity_bitmap_block_size; + using matrix::csr::sparsity_type; + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto row_ptrs, auto col_idxs, auto num_rows, + auto allowed, auto storage_offsets) { + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto local_cols = col_idxs + row_begin; + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? local_cols[row_len - 1] - min_col + 1 : 0; + if (csr_lookup_allowed(allowed, sparsity_type::full) && + row_len == col_range) { + storage_offsets[row] = 0; + } else { + const auto hashmap_storage = row_len == 0 ? 
1 : 2 * row_len; + const auto bitmap_num_blocks = static_cast( + ceildiv(col_range, sparsity_bitmap_block_size)); + const auto bitmap_storage = 2 * bitmap_num_blocks; + if (csr_lookup_allowed(allowed, sparsity_type::bitmap) && + bitmap_storage <= hashmap_storage) { + storage_offsets[row] = bitmap_storage; + } else { + storage_offsets[row] = hashmap_storage; + } + } + }, + num_rows, row_ptrs, col_idxs, num_rows, allowed, storage_offsets); + components::prefix_sum(exec, storage_offsets, num_rows + 1); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_CSR_BUILD_LOOKUP_OFFSETS_KERNEL); + + } // namespace csr } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp index e9ee48608ed..bd965ad4bc3 100644 --- a/core/matrix/csr_kernels.hpp +++ b/core/matrix/csr_kernels.hpp @@ -50,6 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/kernel_declaration.hpp" +#include "core/matrix/csr_lookup.hpp" namespace gko { @@ -224,6 +225,20 @@ namespace kernels { const matrix::Dense* beta, \ matrix::Csr* mtx) +#define GKO_DECLARE_CSR_BUILD_LOOKUP_OFFSETS_KERNEL(IndexType) \ + void build_lookup_offsets(std::shared_ptr exec, \ + const IndexType* row_ptrs, \ + const IndexType* col_idxs, size_type num_rows, \ + matrix::csr::sparsity_type allowed, \ + IndexType* storage_offsets) + +#define GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL(IndexType) \ + void build_lookup(std::shared_ptr exec, \ + const IndexType* row_ptrs, const IndexType* col_idxs, \ + size_type num_rows, matrix::csr::sparsity_type allowed, \ + const IndexType* storage_offsets, int64* row_desc, \ + int32* storage) + #define GKO_DECLARE_ALL_AS_TEMPLATES \ template \ @@ -283,7 +298,11 @@ namespace kernels { template \ GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST(ValueType, IndexType); \ template \ - GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL(ValueType, IndexType) + GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_BUILD_LOOKUP_OFFSETS_KERNEL(IndexType); \ + template \ + GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL(IndexType) GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(csr, GKO_DECLARE_ALL_AS_TEMPLATES); diff --git a/core/matrix/csr_lookup.hpp b/core/matrix/csr_lookup.hpp new file mode 100644 index 00000000000..e4d46e25ee5 --- /dev/null +++ b/core/matrix/csr_lookup.hpp @@ -0,0 +1,327 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_MATRIX_CSR_LOOKUP_HPP_ +#define GKO_CORE_MATRIX_CSR_LOOKUP_HPP_ + + +#include + + +#include +#include +#include + + +namespace gko { +namespace matrix { +namespace csr { + + +/** + * Type describing which kind of lookup structure is used to find entries in a + * single row of a Csr matrix. It is also uses as a mask, so each value has only + * a single bit set. + */ +enum class sparsity_type { + /** + * The row is dense, i.e. it contains all entries in + * `[min_col, min_col + storage_size)`. + * This means that the relative output index is `col - min_col`. + */ + full = 1, + /** + * The row is sufficiently dense that its sparsity pattern can be stored in + * a dense bitmap consisting of `storage_size` blocks of size `block_size`. + * Each block stores its sparsity pattern as a bitmask and the number of + * columns before the block as integer. This means that the relative output + * index can be computed as + * ``` + * auto block = (col - min_col) / sparsity_bitmap_block_size; + * auto local_col = (col - min_col) % sparsity_bitmap_block_size; + * auto prefix_mask = (block_type{1} << local_col) - 1; + * auto output_idx = base[block] + popcount(bitmap[block] & prefix_mask); + * ``` + */ + bitmap = 2, + /** + * The row is sparse, so it is best represented using a hashtable. + * The hashtable has size `storage_size` and stores the relative output + * index directly, i.e. + * ``` + * auto hash_key = col; + * auto hash_bucket = hash(hash_key); + * while (local_cols[hashtable[hash_bucket]] != col) { + * hash_bucket = (hash_bucket + 1) % storage_size; // linear probing + * } + * auto output_idx = hashtable[hash_bucket]; + * ``` + */ + hash = 4, +}; + + +GKO_ATTRIBUTES GKO_INLINE sparsity_type operator|(sparsity_type a, + sparsity_type b) +{ + return static_cast(static_cast(a) | + static_cast(b)); +} + + +GKO_ATTRIBUTES GKO_INLINE bool csr_lookup_allowed(sparsity_type allowed, + sparsity_type type) +{ + return ((static_cast(allowed) & static_cast(type)) != 0); +} + + +/** Number of bits in a block_type entry. 
*/ +static constexpr int sparsity_bitmap_block_size = 32; + + +template +struct device_sparsity_lookup { + /** + * Set up a device_sparsity_lookup structure from local data + * + * @param local_cols the column array slice for the local row, it contains + * row_nnz entries + * @param row_nnz the number of entries in this row + * @param local_storage the local lookup structure storage array slice, it + * needs to be set up using the storage_offsets array + * @param storage_size the number of int32 entries in the local lookup + * structure + * @param desc the lookup structure descriptor for this row + */ + GKO_ATTRIBUTES GKO_INLINE device_sparsity_lookup( + const IndexType* local_cols, IndexType row_nnz, + const int32* local_storage, IndexType storage_size, int64 desc) + : local_cols{local_cols}, + row_nnz{row_nnz}, + local_storage{local_storage}, + storage_size{storage_size}, + desc{desc} + {} + + /** + * Set up a device_sparsity_lookup structure from global data + * + * @param row_ptrs the CSR row pointers + * @param col_idxs the CSR column indices + * @param storage_offsets the storage offset array for the lookup structure + * @param storage the storage array for the lookup data structure + * @param descs the lookup structure descriptor array + * @param row the row index to build the lookup structure for + */ + GKO_ATTRIBUTES GKO_INLINE device_sparsity_lookup( + const IndexType* row_ptrs, const IndexType* col_idxs, + const IndexType* storage_offsets, const int32* storage, + const int64* descs, size_type row) + { + const auto row_begin = row_ptrs[row]; + const auto row_end = row_ptrs[row + 1]; + const auto storage_begin = storage_offsets[row]; + const auto storage_end = storage_offsets[row + 1]; + local_cols = col_idxs + row_begin; + row_nnz = row_end - row_begin; + local_storage = storage + storage_begin; + storage_size = storage_end - storage_begin; + desc = descs[row]; + } + + /** + * Returns the row-local index of the entry with the given column index, if + * it is present. + * + * @param col the column index + * @return the row-local index of the given entry, or + * invalid_index() if it is not present. + */ + GKO_ATTRIBUTES GKO_INLINE IndexType operator[](IndexType col) const + { + switch (static_cast(desc & 0xF)) { + case sparsity_type::full: + return lookup_full(col); + case sparsity_type::bitmap: + return lookup_bitmap(col); + case sparsity_type::hash: + return lookup_hash(col); + } + GKO_ASSERT(false); + } + + /** + * Returns the row-local index of the entry with the given column index, + * assuming it is present. + * + * @param col the column index + * @return the row-local index of the given entry. + * @note the function fails with an assertion (debug builds) or can crash if + * the column index is not present in the row! 
+ */ + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_unsafe(IndexType col) const + { + IndexType result{}; + switch (static_cast(desc & 0xF)) { + case sparsity_type::full: + result = lookup_full_unsafe(col); + break; + case sparsity_type::bitmap: + result = lookup_bitmap_unsafe(col); + break; + case sparsity_type::hash: + result = lookup_hash_unsafe(col); + break; + default: + GKO_ASSERT(false); + } + GKO_ASSERT(local_cols[result] == col); + return result; + } + +private: + using unsigned_index_type = typename std::make_unsigned::type; + + const IndexType* local_cols; + IndexType row_nnz; + const int32* local_storage; + IndexType storage_size; + int64 desc; + + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_full_unsafe(IndexType col) const + { + const auto min_col = local_cols[0]; + const auto out_idx = col - min_col; + GKO_ASSERT(out_idx >= 0 && out_idx < row_nnz); + return out_idx; + } + + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_full(IndexType col) const + { + const auto min_col = local_cols[0]; + const auto out_idx = col - min_col; + return out_idx >= 0 && out_idx < row_nnz ? out_idx + : invalid_index(); + } + + GKO_ATTRIBUTES GKO_INLINE IndexType + lookup_bitmap_unsafe(IndexType col) const + { + const auto min_col = local_cols[0]; + const auto num_blocks = static_cast(desc >> 32); + const auto block_bases = local_storage; + const auto block_bitmaps = + reinterpret_cast(block_bases + num_blocks); + const auto rel_col = col - min_col; + const auto block = rel_col / sparsity_bitmap_block_size; + const auto col_in_block = rel_col % sparsity_bitmap_block_size; + const auto prefix_mask = (uint32{1} << col_in_block) - 1; + GKO_ASSERT(rel_col >= 0); + GKO_ASSERT(block < num_blocks); + GKO_ASSERT(block_bitmaps[block] & (uint32{1} << col_in_block)); + const auto out_idx = + block_bases[block] + + gko::detail::popcount(block_bitmaps[block] & prefix_mask); + GKO_ASSERT(local_cols[out_idx] == col); + return out_idx; + } + + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_bitmap(IndexType col) const + { + const auto min_col = local_cols[0]; + const auto num_blocks = static_cast(desc >> 32); + const auto block_bases = local_storage; + const auto block_bitmaps = + reinterpret_cast(block_bases + num_blocks); + const auto rel_col = col - min_col; + const auto block = rel_col / sparsity_bitmap_block_size; + const auto col_in_block = rel_col % sparsity_bitmap_block_size; + if (rel_col < 0 || block >= num_blocks || + !(block_bitmaps[block] & (uint32{1} << col_in_block))) { + return invalid_index(); + } + const auto prefix_mask = (uint32{1} << col_in_block) - 1; + return block_bases[block] + + gko::detail::popcount(block_bitmaps[block] & prefix_mask); + } + + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_hash_unsafe(IndexType col) const + { + const auto hashmap_size = static_cast(storage_size); + const auto hash_param = static_cast(desc >> 32); + const auto hashmap = local_storage; + auto hash = + (static_cast(col) * hash_param) % hashmap_size; + GKO_ASSERT(hashmap[hash] >= 0); + GKO_ASSERT(hashmap[hash] < row_nnz); + while (local_cols[hashmap[hash]] != col) { + hash++; + if (hash >= hashmap_size) { + hash = 0; + } + GKO_ASSERT(hashmap[hash] >= 0); + GKO_ASSERT(hashmap[hash] < row_nnz); + } + const auto out_idx = hashmap[hash]; + return out_idx; + } + + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_hash(IndexType col) const + { + const auto hashmap_size = static_cast(storage_size); + const auto hash_param = static_cast(desc >> 32); + const auto hashmap = local_storage; + auto hash = + (static_cast(col) * hash_param) % 
hashmap_size; + GKO_ASSERT(hashmap[hash] < row_nnz); + auto out_idx = hashmap[hash]; + // linear probing with invalid_index sentinel to avoid infinite loop + while (out_idx >= 0 && local_cols[out_idx] != col) { + hash++; + if (hash >= hashmap_size) { + hash = 0; + } + out_idx = hashmap[hash]; + GKO_ASSERT(hashmap[hash] < row_nnz); + } + // out_idx is either correct or invalid_index, the hashmap sentinel + return out_idx; + } +}; + + +} // namespace csr +} // namespace matrix +} // namespace gko + +#endif // GKO_CORE_MATRIX_CSR_LOOKUP_HPP_ diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index f63622e7031..9bdb7e53c2a 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -59,6 +59,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/format_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_lookup.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/config.hpp" @@ -71,6 +72,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/components/format_conversion.cuh" #include "cuda/components/intrinsics.cuh" #include "cuda/components/merging.cuh" +#include "cuda/components/prefix_sum.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/segment_scan.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu index ab229b81c37..dfd352d4996 100644 --- a/cuda/matrix/fbcsr_kernels.cu +++ b/cuda/matrix/fbcsr_kernels.cu @@ -57,6 +57,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/device_matrix_data_kernels.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" +#include "core/matrix/csr_lookup.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/config.hpp" @@ -69,6 +70,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/components/atomic.cuh" #include "cuda/components/cooperative_groups.cuh" #include "cuda/components/merging.cuh" +#include "cuda/components/prefix_sum.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" diff --git a/dev_tools/scripts/config b/dev_tools/scripts/config index 7f43fb78089..86ab73b4a8b 100644 --- a/dev_tools/scripts/config +++ b/dev_tools/scripts/config @@ -32,6 +32,8 @@ - FixInclude: "common/unified/base/kernel_launch_solver.hpp" - "(cuda|hip|dpcpp|omp)/base/kernel_launch_solver\." 
- FixInclude: "common/unified/base/kernel_launch_solver.hpp" +- "test/base/kernel_launch_generic.cpp" + - FixInclude: "common/unified/base/kernel_launch.hpp" - "elimination_forest\.cpp" - FixInclude: "core/factorization/elimination_forest.hpp" - "common/unified/.*.cpp" diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 9ed089478b3..2381a98e463 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -2270,6 +2270,128 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); +template +bool csr_lookup_try_full(IndexType row_len, IndexType col_range, + matrix::csr::sparsity_type allowed, int64& row_desc) +{ + using matrix::csr::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::full); + if (is_allowed && row_len == col_range) { + row_desc = static_cast(sparsity_type::full); + return true; + } + return false; +} + + +template +bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, + IndexType min_col, IndexType available_storage, + matrix::csr::sparsity_type allowed, int64& row_desc, + int32* local_storage, const IndexType* cols) +{ + using matrix::csr::sparsity_bitmap_block_size; + using matrix::csr::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::bitmap); + const auto num_blocks = + static_cast(ceildiv(col_range, sparsity_bitmap_block_size)); + if (is_allowed && num_blocks * 2 <= available_storage) { + row_desc = (static_cast(num_blocks) << 32) | + static_cast(sparsity_type::bitmap); + const auto block_ranks = local_storage; + const auto block_bitmaps = + reinterpret_cast(block_ranks + num_blocks); + std::fill_n(block_bitmaps, num_blocks, 0); + for (auto col_it = cols; col_it < cols + row_len; col_it++) { + const auto rel_col = *col_it - min_col; + const auto block = rel_col / sparsity_bitmap_block_size; + const auto col_in_block = rel_col % sparsity_bitmap_block_size; + block_bitmaps[block] |= uint32{1} << col_in_block; + } + int32 partial_sum{}; + for (int32 block = 0; block < num_blocks; block++) { + block_ranks[block] = partial_sum; + partial_sum += gko::detail::popcount(block_bitmaps[block]); + } + return true; + } + return false; +} + + +template +void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, + int64& row_desc, int32* local_storage, + const IndexType* cols) +{ + // we need at least one unfilled entry to avoid infinite loops on search + GKO_ASSERT(row_len < available_storage); +#if GINKGO_DPCPP_SINGLE_MODE + constexpr float inv_golden_ratio = 0.61803398875f; +#else + constexpr double inv_golden_ratio = 0.61803398875; +#endif + // use golden ratio as approximation for hash parameter that spreads + // consecutive values as far apart as possible. 
Ensure lowest bit is set + // otherwise we skip odd hashtable entries + const auto hash_parameter = + 1u | static_cast(available_storage * inv_golden_ratio); + row_desc = (static_cast(hash_parameter) << 32) | + static_cast(matrix::csr::sparsity_type::hash); + std::fill_n(local_storage, available_storage, invalid_index()); + for (int32 nz = 0; nz < row_len; nz++) { + auto hash = (static_cast(cols[nz]) * hash_parameter) % + static_cast(available_storage); + // linear probing: find the next empty entry + while (local_storage[hash] != invalid_index()) { + hash++; + if (hash >= available_storage) { + hash = 0; + } + } + local_storage[hash] = nz; + } +} + + +template +void build_lookup(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, matrix::csr::sparsity_type allowed, + const IndexType* storage_offsets, int64* row_desc, + int32* storage) +{ + exec->get_queue()->submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) { + const auto row = static_cast(idx[0]); + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto storage_begin = storage_offsets[row]; + const auto available_storage = + storage_offsets[row + 1] - storage_begin; + const auto local_storage = storage + storage_begin; + const auto local_cols = col_idxs + row_begin; + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? local_cols[row_len - 1] - min_col + 1 : 0; + bool done = + csr_lookup_try_full(row_len, col_range, allowed, row_desc[row]); + if (!done) { + done = csr_lookup_try_bitmap( + row_len, col_range, min_col, available_storage, allowed, + row_desc[row], local_storage, local_cols); + } + if (!done) { + csr_lookup_build_hash(row_len, available_storage, row_desc[row], + local_storage, local_cols); + } + }); + }); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); + + } // namespace csr } // namespace dpcpp } // namespace kernels diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp index 1e387f71cf3..9025d8c493b 100644 --- a/hip/matrix/csr_kernels.hip.cpp +++ b/hip/matrix/csr_kernels.hip.cpp @@ -60,6 +60,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/format_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_lookup.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/config.hip.hpp" @@ -71,6 +72,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/intrinsics.hip.hpp" #include "hip/components/merging.hip.hpp" +#include "hip/components/prefix_sum.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/segment_scan.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/matrix/fbcsr_kernels.hip.cpp b/hip/matrix/fbcsr_kernels.hip.cpp index 9e57f6ece55..8fbf9a05ac9 100644 --- a/hip/matrix/fbcsr_kernels.hip.cpp +++ b/hip/matrix/fbcsr_kernels.hip.cpp @@ -58,6 +58,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/base/device_matrix_data_kernels.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" +#include "core/matrix/csr_lookup.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/config.hip.hpp" @@ -70,6 +71,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "hip/components/atomic.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/merging.hip.hpp" +#include "hip/components/prefix_sum.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" diff --git a/include/ginkgo/core/base/intrinsics.hpp b/include/ginkgo/core/base/intrinsics.hpp new file mode 100644 index 00000000000..279c9cb0c3e --- /dev/null +++ b/include/ginkgo/core/base/intrinsics.hpp @@ -0,0 +1,78 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_BASE_INTRINSICS_HPP_ +#define GKO_PUBLIC_CORE_BASE_INTRINSICS_HPP_ + + +#include + + +#include + + +namespace gko { +namespace detail { + + +/** + * Returns the number of set bits in the given bitmask. + */ +GKO_ATTRIBUTES GKO_INLINE int popcount(uint32 bitmask) +{ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __popc(bitmask); +#else + std::bitset<32> bits{bitmask}; + return bits.count(); +#endif +} + + +/** + * Returns the number of set bits in the given bitmask. 
+ */ +GKO_ATTRIBUTES GKO_INLINE int popcount(uint64 bitmask) +{ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __popcll(bitmask); +#else + std::bitset<64> bits{bitmask}; + return bits.count(); +#endif +} + + +} // namespace detail +} // namespace gko + +#endif // GKO_PUBLIC_CORE_BASE_INTRINSICS_HPP_ diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp index b29f78c21f3..453bfae6e3a 100644 --- a/include/ginkgo/ginkgo.hpp +++ b/include/ginkgo/ginkgo.hpp @@ -48,6 +48,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp index a88e6e791a6..f7226662b8e 100644 --- a/omp/matrix/csr_kernels.cpp +++ b/omp/matrix/csr_kernels.cpp @@ -1129,6 +1129,122 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); +template +bool csr_lookup_try_full(IndexType row_len, IndexType col_range, + matrix::csr::sparsity_type allowed, int64& row_desc) +{ + using matrix::csr::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::full); + if (is_allowed && row_len == col_range) { + row_desc = static_cast(sparsity_type::full); + return true; + } + return false; +} + + +template +bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, + IndexType min_col, IndexType available_storage, + matrix::csr::sparsity_type allowed, int64& row_desc, + int32* local_storage, const IndexType* cols) +{ + using matrix::csr::sparsity_bitmap_block_size; + using matrix::csr::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::bitmap); + const auto num_blocks = ceildiv(col_range, sparsity_bitmap_block_size); + if (is_allowed && num_blocks * 2 <= available_storage) { + row_desc = (static_cast(num_blocks) << 32) | + static_cast(sparsity_type::bitmap); + const auto block_ranks = local_storage; + const auto block_bitmaps = + reinterpret_cast(block_ranks + num_blocks); + std::fill_n(block_bitmaps, num_blocks, 0); + for (auto col_it = cols; col_it < cols + row_len; col_it++) { + const auto rel_col = *col_it - min_col; + const auto block = rel_col / sparsity_bitmap_block_size; + const auto col_in_block = rel_col % sparsity_bitmap_block_size; + block_bitmaps[block] |= uint32{1} << col_in_block; + } + int32 partial_sum{}; + for (int32 block = 0; block < num_blocks; block++) { + block_ranks[block] = partial_sum; + partial_sum += gko::detail::popcount(block_bitmaps[block]); + } + return true; + } + return false; +} + + +template +void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, + int64& row_desc, int32* local_storage, + const IndexType* cols) +{ + // we need at least one unfilled entry to avoid infinite loops on search + GKO_ASSERT(row_len < available_storage); + constexpr double inv_golden_ratio = 0.61803398875; + // use golden ratio as approximation for hash parameter that spreads + // consecutive values as far apart as possible. 
Ensure lowest bit is set + // otherwise we skip odd hashtable entries + const auto hash_parameter = + 1u | static_cast(available_storage * inv_golden_ratio); + row_desc = (static_cast(hash_parameter) << 32) | + static_cast(matrix::csr::sparsity_type::hash); + std::fill_n(local_storage, available_storage, invalid_index()); + for (int32 nz = 0; nz < row_len; nz++) { + auto hash = (static_cast::type>( + cols[nz]) * + hash_parameter) % + static_cast(available_storage); + // linear probing: find the next empty entry + while (local_storage[hash] != invalid_index()) { + hash++; + if (hash >= available_storage) { + hash = 0; + } + } + local_storage[hash] = nz; + } +} + + +template +void build_lookup(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, matrix::csr::sparsity_type allowed, + const IndexType* storage_offsets, int64* row_desc, + int32* storage) +{ +#pragma omp parallel for + for (size_type row = 0; row < num_rows; row++) { + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto storage_begin = storage_offsets[row]; + const auto available_storage = storage_offsets[row + 1] - storage_begin; + const auto local_storage = storage + storage_begin; + const auto local_cols = col_idxs + row_begin; + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? local_cols[row_len - 1] - min_col + 1 : 0; + bool done = + csr_lookup_try_full(row_len, col_range, allowed, row_desc[row]); + if (!done) { + done = csr_lookup_try_bitmap( + row_len, col_range, min_col, available_storage, allowed, + row_desc[row], local_storage, local_cols); + } + if (!done) { + csr_lookup_build_hash(row_len, available_storage, row_desc[row], + local_storage, local_cols); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); + + } // namespace csr } // namespace omp } // namespace kernels diff --git a/omp/test/matrix/csr_kernels.cpp b/omp/test/matrix/csr_kernels.cpp index f308bf40fe6..521a923e55d 100644 --- a/omp/test/matrix/csr_kernels.cpp +++ b/omp/test/matrix/csr_kernels.cpp @@ -53,6 +53,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_kernels.hpp" +#include "core/matrix/csr_lookup.hpp" #include "core/test/utils.hpp" #include "core/test/utils/unsort_matrix.hpp" #include "core/utils/matrix_utils.hpp" diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp index 6dbbe528162..b438f59da80 100644 --- a/reference/matrix/csr_kernels.cpp +++ b/reference/matrix/csr_kernels.cpp @@ -1118,6 +1118,160 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); +template +void build_lookup_offsets(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, + matrix::csr::sparsity_type allowed, + IndexType* storage_offsets) +{ + using matrix::csr::sparsity_bitmap_block_size; + using matrix::csr::sparsity_type; + for (size_type row = 0; row < num_rows; row++) { + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto local_cols = col_idxs + row_begin; + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? 
local_cols[row_len - 1] - min_col + 1 : 0; + if (csr_lookup_allowed(allowed, sparsity_type::full) && + row_len == col_range) { + storage_offsets[row] = 0; + } else { + const auto hashmap_storage = std::max(2 * row_len, 1); + const auto bitmap_num_blocks = static_cast( + ceildiv(col_range, sparsity_bitmap_block_size)); + const auto bitmap_storage = 2 * bitmap_num_blocks; + if (csr_lookup_allowed(allowed, sparsity_type::bitmap) && + bitmap_storage <= hashmap_storage) { + storage_offsets[row] = bitmap_storage; + } else { + storage_offsets[row] = hashmap_storage; + } + } + } + components::prefix_sum(exec, storage_offsets, num_rows + 1); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_CSR_BUILD_LOOKUP_OFFSETS_KERNEL); + + +template +bool csr_lookup_try_full(IndexType row_len, IndexType col_range, + matrix::csr::sparsity_type allowed, int64& row_desc) +{ + using matrix::csr::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::full); + if (is_allowed && row_len == col_range) { + row_desc = static_cast(sparsity_type::full); + return true; + } + return false; +} + + +template +bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, + IndexType min_col, IndexType available_storage, + matrix::csr::sparsity_type allowed, int64& row_desc, + int32* local_storage, const IndexType* cols) +{ + using matrix::csr::sparsity_bitmap_block_size; + using matrix::csr::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::bitmap); + const auto num_blocks = + static_cast(ceildiv(col_range, sparsity_bitmap_block_size)); + if (is_allowed && num_blocks * 2 <= available_storage) { + row_desc = (static_cast(num_blocks) << 32) | + static_cast(sparsity_type::bitmap); + const auto block_ranks = local_storage; + const auto block_bitmaps = + reinterpret_cast(block_ranks + num_blocks); + std::fill_n(block_bitmaps, num_blocks, 0); + for (auto col_it = cols; col_it < cols + row_len; col_it++) { + const auto rel_col = *col_it - min_col; + const auto block = rel_col / sparsity_bitmap_block_size; + const auto col_in_block = rel_col % sparsity_bitmap_block_size; + block_bitmaps[block] |= uint32{1} << col_in_block; + } + int32 partial_sum{}; + for (int32 block = 0; block < num_blocks; block++) { + block_ranks[block] = partial_sum; + partial_sum += gko::detail::popcount(block_bitmaps[block]); + } + return true; + } + return false; +} + + +template +void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, + int64& row_desc, int32* local_storage, + const IndexType* cols) +{ + // we need at least one unfilled entry to avoid infinite loops on search + GKO_ASSERT(row_len < available_storage); + constexpr double inv_golden_ratio = 0.61803398875; + // use golden ratio as approximation for hash parameter that spreads + // consecutive values as far apart as possible. 
Ensure lowest bit is set + // otherwise we skip odd hashtable entries + const auto hash_parameter = + 1u | static_cast(available_storage * inv_golden_ratio); + row_desc = (static_cast(hash_parameter) << 32) | + static_cast(matrix::csr::sparsity_type::hash); + std::fill_n(local_storage, available_storage, invalid_index()); + for (int32 nz = 0; nz < row_len; nz++) { + auto hash = (static_cast>(cols[nz]) * + hash_parameter) % + static_cast(available_storage); + // linear probing: find the next empty entry + while (local_storage[hash] != invalid_index()) { + hash++; + if (hash >= available_storage) { + hash = 0; + } + } + local_storage[hash] = nz; + } +} + + +template +void build_lookup(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, matrix::csr::sparsity_type allowed, + const IndexType* storage_offsets, int64* row_desc, + int32* storage) +{ + for (size_type row = 0; row < num_rows; row++) { + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto storage_begin = storage_offsets[row]; + const auto available_storage = storage_offsets[row + 1] - storage_begin; + const auto local_storage = storage + storage_begin; + const auto local_cols = col_idxs + row_begin; + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? local_cols[row_len - 1] - min_col + 1 : 0; + bool done = + csr_lookup_try_full(row_len, col_range, allowed, row_desc[row]); + if (!done) { + done = csr_lookup_try_bitmap( + row_len, col_range, min_col, available_storage, allowed, + row_desc[row], local_storage, local_cols); + } + if (!done) { + csr_lookup_build_hash(row_len, available_storage, row_desc[row], + local_storage, local_cols); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); + + } // namespace csr } // namespace reference } // namespace kernels diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp index ec610081884..f7899cf7c89 100644 --- a/reference/test/matrix/csr_kernels.cpp +++ b/reference/test/matrix/csr_kernels.cpp @@ -54,6 +54,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/matrix/csr_kernels.hpp" +#include "core/matrix/csr_lookup.hpp" #include "core/test/utils.hpp" @@ -1891,4 +1892,154 @@ TYPED_TEST(Csr, CanGetSubmatrixWithindex_set) } +template +class CsrLookup : public ::testing::Test { +protected: + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Mtx = gko::matrix::Csr; + + CsrLookup() : exec(gko::ReferenceExecutor::create()) + { + mtx = Mtx::create(this->exec); + typename Mtx::mat_data data{gko::dim<2>{6, 65}}; + for (int i = 0; i < 65; i++) { + // row 0: empty row + // row 1: full row + data.nonzeros.emplace_back(1, i, 1.0); + // row 2: pretty dense row + if (i % 3 == 0) { + data.nonzeros.emplace_back(2, i, 1.0); + } + // row 4-5: contiguous row + if (i >= 10 && i < 30) { + data.nonzeros.emplace_back(4, i, 1.0); + } + if (i >= 2 && i < 6) { + data.nonzeros.emplace_back(5, i, 1.0); + } + } + // row 3: very sparse + data.nonzeros.emplace_back(3, 0, 1.0); + data.nonzeros.emplace_back(3, 64, 1.0); + data.ensure_row_major_order(); + // 1000 as min-sentinel + full_sizes = {0, 0, 1000, 1000, 0, 0}; + bitmap_sizes = {0, 6, 4, 6, 2, 2}; + hash_sizes = {1, 130, 44, 4, 40, 8}; + mtx->read(data); + } + + std::shared_ptr exec; + std::unique_ptr mtx; + std::vector full_sizes; + std::vector bitmap_sizes; + std::vector hash_sizes; + index_type invalid_index = gko::invalid_index(); +}; + +TYPED_TEST_SUITE(CsrLookup, gko::test::ValueIndexTypes, + PairTypenameNameGenerator); + +TYPED_TEST(CsrLookup, GeneratesLookupDataOffsets) +{ + using IndexType = typename TestFixture::index_type; + using gko::matrix::csr::sparsity_type; + const auto num_rows = this->mtx->get_size()[0]; + gko::array storage_offset_array(this->exec, num_rows + 1); + const auto storage_offsets = storage_offset_array.get_data(); + const auto row_ptrs = this->mtx->get_const_row_ptrs(); + const auto col_idxs = this->mtx->get_const_col_idxs(); + + for (auto allowed : + {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { + gko::kernels::reference::csr::build_lookup_offsets( + this->exec, row_ptrs, col_idxs, num_rows, allowed, storage_offsets); + bool allow_full = + gko::matrix::csr::csr_lookup_allowed(allowed, sparsity_type::full); + bool allow_bitmap = gko::matrix::csr::csr_lookup_allowed( + allowed, sparsity_type::bitmap); + + for (gko::size_type row = 0; row < num_rows; row++) { + const auto expected_size = + std::min(allow_full ? this->full_sizes[row] : 1000, + std::min(allow_bitmap ? 
this->bitmap_sizes[row] : 1000, + this->hash_sizes[row])); + const auto size = storage_offsets[row + 1] - storage_offsets[row]; + + ASSERT_EQ(size, expected_size); + } + } +} + + +TYPED_TEST(CsrLookup, GeneratesLookupData) +{ + using IndexType = typename TestFixture::index_type; + using gko::matrix::csr::sparsity_type; + const auto num_rows = this->mtx->get_size()[0]; + const auto num_cols = this->mtx->get_size()[1]; + gko::array row_desc_array(this->exec, num_rows); + gko::array storage_offset_array(this->exec, num_rows + 1); + const auto row_descs = row_desc_array.get_data(); + const auto storage_offsets = storage_offset_array.get_data(); + const auto row_ptrs = this->mtx->get_const_row_ptrs(); + const auto col_idxs = this->mtx->get_const_col_idxs(); + for (auto allowed : + {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { + gko::kernels::reference::csr::build_lookup_offsets( + this->exec, row_ptrs, col_idxs, num_rows, allowed, storage_offsets); + gko::array storage_array(this->exec, + storage_offsets[num_rows]); + const auto storage = storage_array.get_data(); + const auto bitmap_equivalent = + csr_lookup_allowed(allowed, sparsity_type::bitmap) + ? sparsity_type::bitmap + : sparsity_type::hash; + const auto full_equivalent = + csr_lookup_allowed(allowed, sparsity_type::full) + ? sparsity_type::full + : bitmap_equivalent; + + gko::kernels::reference::csr::build_lookup( + this->exec, row_ptrs, col_idxs, num_rows, allowed, storage_offsets, + row_descs, storage); + + for (int row = 0; row < num_rows; row++) { + const auto row_begin = row_ptrs[row]; + const auto row_end = row_ptrs[row + 1]; + gko::matrix::csr::device_sparsity_lookup lookup{ + row_ptrs, col_idxs, storage_offsets, + storage, row_descs, static_cast(row)}; + for (auto nz = row_begin; nz < row_end; nz++) { + const auto col = col_idxs[nz]; + ASSERT_EQ(lookup.lookup_unsafe(col) + row_begin, nz); + } + auto nz = row_begin; + for (int col = 0; col < num_cols; col++) { + auto found_nz = lookup[col]; + if (nz < row_end && col_idxs[nz] == col) { + ASSERT_EQ(found_nz, nz - row_begin); + nz++; + } else { + ASSERT_EQ(found_nz, this->invalid_index); + } + } + } + ASSERT_EQ(row_descs[0] & 0xF, static_cast(full_equivalent)); + ASSERT_EQ(row_descs[1] & 0xF, static_cast(full_equivalent)); + ASSERT_EQ(row_descs[2] & 0xF, static_cast(bitmap_equivalent)); + ASSERT_EQ(row_descs[3] & 0xF, static_cast(sparsity_type::hash)); + ASSERT_EQ(row_descs[4] & 0xF, static_cast(full_equivalent)); + ASSERT_EQ(row_descs[5] & 0xF, static_cast(full_equivalent)); + } +} + + } // namespace diff --git a/test/matrix/csr_kernels.cpp b/test/matrix/csr_kernels.cpp index 2be151a55de..3c1e06e89c5 100644 --- a/test/matrix/csr_kernels.cpp +++ b/test/matrix/csr_kernels.cpp @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include +#include "core/matrix/csr_kernels.hpp" #include @@ -42,11 +42,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include #include "core/components/fill_array_kernels.hpp" -#include "core/matrix/csr_kernels.hpp" #include "core/test/utils.hpp" #include "test/utils/executor.hpp" @@ -133,4 +133,146 @@ TEST_F(Csr, InvScaleIsEquivalentToRef) } +template +class CsrLookup : public ::testing::Test { +protected: + using value_type = float; + using index_type = IndexType; + using Mtx = gko::matrix::Csr; + + CsrLookup() : rand_engine(15) {} + + void SetUp() + { + ref = gko::ReferenceExecutor::create(); + init_executor(ref, exec); + auto data = + gko::test::generate_random_matrix_data( + 628, 923, std::uniform_int_distribution(10, 300), + std::normal_distribution>(-1.0, + 1.0), + rand_engine); + // create a few empty rows + data.nonzeros.erase( + std::remove_if(data.nonzeros.begin(), data.nonzeros.end(), + [](auto entry) { return entry.row % 200 == 21; }), + data.nonzeros.end()); + // insert a full row and a pretty dense row + for (int i = 0; i < 100; i++) { + data.nonzeros.emplace_back(221, i + 100, 1.0); + data.nonzeros.emplace_back(421, i * 3 + 100, 2.0); + } + data.ensure_row_major_order(); + // initialize the matrices + mtx = Mtx::create(ref); + mtx->read(data); + dmtx = gko::clone(exec, mtx); + } + + void TearDown() + { + if (exec != nullptr) { + ASSERT_NO_THROW(exec->synchronize()); + } + } + + std::default_random_engine rand_engine; + std::shared_ptr ref; + std::shared_ptr exec; + std::unique_ptr mtx; + std::unique_ptr dmtx; + index_type invalid_index = gko::invalid_index(); +}; + +TYPED_TEST_SUITE(CsrLookup, gko::test::IndexTypes, TypenameNameGenerator); + + +TYPED_TEST(CsrLookup, BuildLookupWorks) +{ + using index_type = typename TestFixture::index_type; + using gko::matrix::csr::sparsity_type; + const auto num_rows = this->mtx->get_size()[0]; + const auto num_cols = this->mtx->get_size()[1]; + gko::array row_desc_array(this->ref, num_rows); + gko::array drow_desc_array(this->exec, num_rows); + gko::array storage_offset_array(this->ref, num_rows + 1); + gko::array dstorage_offset_array(this->exec, num_rows + 1); + const auto row_descs = row_desc_array.get_data(); + const auto drow_descs = drow_desc_array.get_data(); + const auto row_ptrs = this->mtx->get_const_row_ptrs(); + const auto col_idxs = this->mtx->get_const_col_idxs(); + const auto drow_ptrs = this->dmtx->get_const_row_ptrs(); + const auto dcol_idxs = this->dmtx->get_const_col_idxs(); + const auto storage_offsets = storage_offset_array.get_data(); + const auto dstorage_offsets = dstorage_offset_array.get_data(); + for (auto allowed : + {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { + // check that storage offsets are calculated correctly + // otherwise things might crash + gko::kernels::reference::csr::build_lookup_offsets( + this->ref, row_ptrs, col_idxs, num_rows, allowed, storage_offsets); + gko::kernels::EXEC_NAMESPACE::csr::build_lookup_offsets( + this->exec, drow_ptrs, dcol_idxs, num_rows, allowed, + dstorage_offsets); + + GKO_ASSERT_ARRAY_EQ(storage_offset_array, dstorage_offset_array); + + gko::array storage_array(this->ref, + storage_offsets[num_rows]); + gko::array dstorage_array(this->exec, + storage_offsets[num_rows]); + const auto storage = storage_array.get_data(); + const auto dstorage = dstorage_array.get_data(); + const auto bitmap_equivalent = + csr_lookup_allowed(allowed, sparsity_type::bitmap) + ? 
sparsity_type::bitmap + : sparsity_type::hash; + const auto full_equivalent = + csr_lookup_allowed(allowed, sparsity_type::full) + ? sparsity_type::full + : bitmap_equivalent; + + gko::kernels::reference::csr::build_lookup( + this->ref, row_ptrs, col_idxs, num_rows, allowed, storage_offsets, + row_descs, storage); + gko::kernels::EXEC_NAMESPACE::csr::build_lookup( + this->exec, drow_ptrs, dcol_idxs, num_rows, allowed, + dstorage_offsets, drow_descs, dstorage); + + gko::array host_row_descs(this->ref, drow_desc_array); + gko::array host_storage_array(this->ref, dstorage_array); + for (int row = 0; row < num_rows; row++) { + const auto row_begin = row_ptrs[row]; + const auto row_end = row_ptrs[row + 1]; + const auto row_nnz = row_end - row_begin; + gko::matrix::csr::device_sparsity_lookup lookup{ + row_ptrs, + col_idxs, + storage_offsets, + host_storage_array.get_const_data(), + host_row_descs.get_data(), + static_cast(row)}; + ASSERT_EQ(host_row_descs.get_const_data()[row] & 0xF, + row_descs[row] & 0xF); + for (auto nz = row_begin; nz < row_end; nz++) { + const auto col = col_idxs[nz]; + ASSERT_EQ(lookup.lookup_unsafe(col) + row_begin, nz); + } + auto nz = row_begin; + for (int col = 0; col < num_cols; col++) { + auto found_nz = lookup[col]; + if (nz < row_end && col_idxs[nz] == col) { + ASSERT_EQ(found_nz, nz - row_begin); + nz++; + } else { + ASSERT_EQ(found_nz, this->invalid_index); + } + } + } + } +} + + } // namespace
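
For reference, the hash-based row lookup that `csr_lookup_build_hash` (and the hash branch of `kernel::build_csr_lookup`) constructs can be summarized by the following host-side sketch. This is a simplified illustration rather than part of the patch: the names `build_row_hashtable` and `lookup_in_hashtable` and the use of `std::vector` are placeholders, while the golden-ratio hash parameter, the empty-bucket sentinel, and the linear probing mirror the kernels in the diff above.

#include <cassert>
#include <cstdint>
#include <vector>

// Build the per-row hashtable: each bucket stores the row-local index (nz)
// of one column entry, or `empty` if the bucket is unused.
std::vector<std::int32_t> build_row_hashtable(
    const std::vector<std::int32_t>& cols, std::int32_t available_storage)
{
    // at least one empty bucket is required so unsuccessful probes terminate
    assert(static_cast<std::int32_t>(cols.size()) < available_storage);
    // the inverse golden ratio spreads consecutive columns far apart;
    // forcing the lowest bit keeps odd buckets reachable
    constexpr double inv_golden_ratio = 0.61803398875;
    const auto hash_parameter =
        1u | static_cast<std::uint32_t>(available_storage * inv_golden_ratio);
    constexpr std::int32_t empty = -1;  // stands in for the invalid_index sentinel
    std::vector<std::int32_t> hashtable(available_storage, empty);
    for (std::int32_t nz = 0; nz < static_cast<std::int32_t>(cols.size()); nz++) {
        auto hash = (static_cast<std::uint32_t>(cols[nz]) * hash_parameter) %
                    static_cast<std::uint32_t>(available_storage);
        // linear probing: advance (with wrap-around) until an empty bucket
        while (hashtable[hash] != empty) {
            hash = (hash + 1) % static_cast<std::uint32_t>(available_storage);
        }
        hashtable[hash] = nz;
    }
    return hashtable;
}

// Query the hashtable for a column; returns the row-local index, or -1 if the
// column is not present in the row.
std::int32_t lookup_in_hashtable(const std::vector<std::int32_t>& cols,
                                 const std::vector<std::int32_t>& hashtable,
                                 std::int32_t col)
{
    constexpr std::int32_t empty = -1;
    constexpr double inv_golden_ratio = 0.61803398875;
    const auto size = static_cast<std::uint32_t>(hashtable.size());
    const auto hash_parameter =
        1u | static_cast<std::uint32_t>(hashtable.size() * inv_golden_ratio);
    auto hash = (static_cast<std::uint32_t>(col) * hash_parameter) % size;
    // probe until the stored entry matches the column or we hit an empty bucket
    while (hashtable[hash] != empty && cols[hashtable[hash]] != col) {
        hash = (hash + 1) % size;
    }
    return hashtable[hash];
}

In the device code the hash parameter is not recomputed at query time: `build_lookup` packs it into the upper 32 bits of the row descriptor, `device_sparsity_lookup::lookup_hash` reads it back from there, and the low 4 bits of the descriptor select between the full, bitmap, and hash strategies.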