From 57c2999605b13b57c96e5b5029df04847c2217f1 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 8 Feb 2022 20:50:24 +0100 Subject: [PATCH] add kernels for Csr sparsity pattern lookup --- .../cuda_hip/components/segment_scan.hpp.inc | 15 +- common/cuda_hip/matrix/coo_kernels.hpp.inc | 10 +- common/cuda_hip/matrix/csr_kernels.hpp.inc | 221 +++++++++++++++--- core/matrix/csr_kernels.hpp | 11 +- cuda/matrix/csr_kernels.cu | 22 +- cuda/matrix/fbcsr_kernels.cu | 2 + cuda/test/matrix/csr_kernels.cpp | 58 +++++ dpcpp/matrix/csr_kernels.dp.cpp | 121 ++++++++++ dpcpp/test/matrix/csr_kernels.cpp | 58 +++++ hip/matrix/csr_kernels.hip.cpp | 18 ++ hip/matrix/fbcsr_kernels.hip.cpp | 2 + hip/test/matrix/csr_kernels.hip.cpp | 58 +++++ include/ginkgo/core/base/intrinsics.hpp | 78 +++++++ include/ginkgo/core/matrix/csr_lookup.hpp | 185 +++++++++++++++ include/ginkgo/ginkgo.hpp | 2 + omp/matrix/csr_kernels.cpp | 120 ++++++++++ omp/test/matrix/csr_kernels.cpp | 58 +++++ reference/matrix/csr_kernels.cpp | 118 ++++++++++ reference/test/matrix/csr_kernels.cpp | 77 ++++++ 19 files changed, 1183 insertions(+), 51 deletions(-) create mode 100644 include/ginkgo/core/base/intrinsics.hpp create mode 100644 include/ginkgo/core/matrix/csr_lookup.hpp diff --git a/common/cuda_hip/components/segment_scan.hpp.inc b/common/cuda_hip/components/segment_scan.hpp.inc index 2bd2e3eec90..cc873d5850d 100644 --- a/common/cuda_hip/components/segment_scan.hpp.inc +++ b/common/cuda_hip/components/segment_scan.hpp.inc @@ -38,25 +38,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * performs suffix sum. Works on the source array and returns whether the thread * is the first element of its segment with same `ind`. */ -template +template __device__ __forceinline__ bool segment_scan( const group::thread_block_tile& group, const IndexType ind, - ValueType* __restrict__ val) + ValueType& val, Operator op) { bool head = true; #pragma unroll for (int i = 1; i < subwarp_size; i <<= 1) { const IndexType add_ind = group.shfl_up(ind, i); - ValueType add_val = zero(); - if (add_ind == ind && threadIdx.x >= i) { - add_val = *val; + ValueType add_val{}; + if (add_ind == ind && group.thread_rank() >= i) { + add_val = val; if (i == 1) { head = false; } } add_val = group.shfl_down(add_val, i); - if (threadIdx.x < subwarp_size - i) { - *val += add_val; + if (group.thread_rank() < subwarp_size - i) { + val = op(val, add_val); } } return head; diff --git a/common/cuda_hip/matrix/coo_kernels.hpp.inc b/common/cuda_hip/matrix/coo_kernels.hpp.inc index a13068c1ce0..a5dd70bc95a 100644 --- a/common/cuda_hip/matrix/coo_kernels.hpp.inc +++ b/common/cuda_hip/matrix/coo_kernels.hpp.inc @@ -82,8 +82,9 @@ __device__ void spmv_kernel(const size_type nnz, const size_type num_lines, (ind + subwarp_size < nnz) ? row[ind + subwarp_size] : row[nnz - 1]; // segmented scan if (tile_block.any(curr_row != next_row)) { - bool is_first_in_segment = - segment_scan(tile_block, curr_row, &temp_val); + bool is_first_in_segment = segment_scan( + tile_block, curr_row, temp_val, + [](ValueType a, ValueType b) { return a + b; }); if (is_first_in_segment) { atomic_add(&(c[curr_row * c_stride + column_id]), scale(temp_val)); @@ -97,8 +98,9 @@ __device__ void spmv_kernel(const size_type nnz, const size_type num_lines, temp_val += (ind < nnz) ? 
val[ind] * b[col[ind] * b_stride + column_id] : zero(); // segmented scan - bool is_first_in_segment = - segment_scan(tile_block, curr_row, &temp_val); + bool is_first_in_segment = segment_scan( + tile_block, curr_row, temp_val, + [](ValueType a, ValueType b) { return a + b; }); if (is_first_in_segment) { atomic_add(&(c[curr_row * c_stride + column_id]), scale(temp_val)); } diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index 075e1607ace..01df076f78a 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -68,22 +68,21 @@ __device__ __forceinline__ bool block_segment_scan_reverse( template __device__ __forceinline__ void find_next_row( const IndexType num_rows, const IndexType data_size, const IndexType ind, - IndexType* __restrict__ row, IndexType* __restrict__ row_end, - const IndexType row_predict, const IndexType row_predict_end, - const IndexType* __restrict__ row_ptr) + IndexType& row, IndexType& row_end, const IndexType row_predict, + const IndexType row_predict_end, const IndexType* __restrict__ row_ptr) { if (!overflow || ind < data_size) { - if (ind >= *row_end) { - *row = row_predict; - *row_end = row_predict_end; - while (ind >= *row_end) { - *row_end = row_ptr[++*row + 1]; + if (ind >= row_end) { + row = row_predict; + row_end = row_predict_end; + while (ind >= row_end) { + row_end = row_ptr[++row + 1]; } } } else { - *row = num_rows - 1; - *row_end = data_size; + row = num_rows - 1; + row_end = data_size; } } @@ -92,16 +91,17 @@ template __device__ __forceinline__ void warp_atomic_add( const group::thread_block_tile& group, bool force_write, - ValueType* __restrict__ val, const IndexType row, ValueType* __restrict__ c, + ValueType& val, const IndexType row, ValueType* __restrict__ c, const size_type c_stride, const IndexType column_id, Closure scale) { // do a local scan to avoid atomic collisions - const bool need_write = segment_scan(group, row, val); + const bool need_write = segment_scan( + group, row, val, [](ValueType a, ValueType b) { return a + b; }); if (need_write && force_write) { - atomic_add(&(c[row * c_stride + column_id]), scale(*val)); + atomic_add(&(c[row * c_stride + column_id]), scale(val)); } if (!need_write || force_write) { - *val = zero(); + val = zero(); } } @@ -111,28 +111,27 @@ template & group, const IndexType num_rows, const IndexType data_size, const IndexType ind, - IndexType* __restrict__ row, IndexType* __restrict__ row_end, - IndexType* __restrict__ nrow, IndexType* __restrict__ nrow_end, - ValueType* __restrict__ temp_val, const ValueType* __restrict__ val, + IndexType& row, IndexType& row_end, IndexType& nrow, IndexType& nrow_end, + ValueType& temp_val, const ValueType* __restrict__ val, const IndexType* __restrict__ col_idxs, const IndexType* __restrict__ row_ptrs, const ValueType* __restrict__ b, const size_type b_stride, ValueType* __restrict__ c, const size_type c_stride, const IndexType column_id, Closure scale) { - const IndexType curr_row = *row; - find_next_row(num_rows, data_size, ind, row, row_end, *nrow, - *nrow_end, row_ptrs); + const auto curr_row = row; + find_next_row(num_rows, data_size, ind, row, row_end, nrow, nrow_end, + row_ptrs); // segmented scan - if (group.any(curr_row != *row)) { - warp_atomic_add(group, curr_row != *row, temp_val, curr_row, c, - c_stride, column_id, scale); - *nrow = group.shfl(*row, subwarp_size - 1); - *nrow_end = group.shfl(*row_end, subwarp_size - 1); + if (group.any(curr_row != row)) { + 
warp_atomic_add(group, curr_row != row, temp_val, curr_row, c, c_stride, + column_id, scale); + nrow = group.shfl(row, subwarp_size - 1); + nrow_end = group.shfl(row_end, subwarp_size - 1); } if (!last || ind < data_size) { const auto col = col_idxs[ind]; - *temp_val += val[ind] * b[col * b_stride + column_id]; + temp_val += val[ind] * b[col * b_stride + column_id]; } } @@ -171,21 +170,21 @@ __device__ __forceinline__ void spmv_kernel( auto nrow_end = row_end; ValueType temp_val = zero(); IndexType ind = start + threadIdx.x; - find_next_row(num_rows, data_size, ind, &row, &row_end, nrow, - nrow_end, row_ptrs); + find_next_row(num_rows, data_size, ind, row, row_end, nrow, nrow_end, + row_ptrs); const IndexType ind_end = end - wsize; const auto tile_block = group::tiled_partition(group::this_thread_block()); for (; ind < ind_end; ind += wsize) { - process_window(tile_block, num_rows, data_size, ind, &row, - &row_end, &nrow, &nrow_end, &temp_val, val, - col_idxs, row_ptrs, b, b_stride, c, c_stride, - column_id, scale); - } - process_window(tile_block, num_rows, data_size, ind, &row, &row_end, - &nrow, &nrow_end, &temp_val, val, col_idxs, row_ptrs, - b, b_stride, c, c_stride, column_id, scale); - warp_atomic_add(tile_block, true, &temp_val, row, c, c_stride, column_id, + process_window(tile_block, num_rows, data_size, ind, row, + row_end, nrow, nrow_end, temp_val, val, col_idxs, + row_ptrs, b, b_stride, c, c_stride, column_id, + scale); + } + process_window(tile_block, num_rows, data_size, ind, row, row_end, + nrow, nrow_end, temp_val, val, col_idxs, row_ptrs, b, + b_stride, c, c_stride, column_id, scale); + warp_atomic_add(tile_block, true, temp_val, row, c, c_stride, column_id, scale); } @@ -902,4 +901,152 @@ __global__ __launch_bounds__(default_block_size) void check_diagonal_entries( } +template +__global__ __launch_bounds__(default_block_size) void build_csr_lookup( + const IndexType* __restrict__ row_ptrs, + const IndexType* __restrict__ col_idxs, size_type num_rows, + gko::matrix::sparsity_type allowed, int64* __restrict__ row_desc, + int32* __restrict__ storage) +{ + constexpr int block_size = 32; + auto row = thread::get_subwarp_id_flat(); + if (row >= num_rows) { + return; + } + const auto subwarp = + group::tiled_partition(group::this_thread_block()); + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto storage_begin = 2 * row_begin; + const auto available_storage = 2 * row_len; + const auto local_storage = storage + storage_begin; + const auto local_cols = col_idxs + row_begin; + const auto lane = subwarp.thread_rank(); + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? 
local_cols[row_len - 1] - min_col + 1 : 0; + + // full column range + if (col_range == row_len && + ((static_cast(allowed) & + static_cast(matrix::sparsity_type::full)) != 0)) { + if (lane == 0) { + row_desc[row] = static_cast(matrix::sparsity_type::full); + } + return; + } + // dense bitmap storage + const auto num_blocks = static_cast(ceildiv(col_range, block_size)); + if (num_blocks * 2 <= available_storage && + ((static_cast(allowed) & + static_cast(matrix::sparsity_type::bitmap)) != 0)) { + if (lane == 0) { + row_desc[row] = (static_cast(num_blocks) << 32) | + static_cast(matrix::sparsity_type::bitmap); + } + const auto block_ranks = local_storage; + const auto block_bitmaps = + reinterpret_cast(block_ranks + num_blocks); + // fill bitmaps with zeros + for (int32 i = lane; i < num_blocks; i += subwarp_size) { + block_bitmaps[i] = 0; + } + // fill bitmaps with sparsity pattern + for (IndexType base_i = 0; base_i < row_len; base_i += subwarp_size) { + const auto i = base_i + lane; + const auto col = i < row_len ? local_cols[i] : INT_MAX; + const auto rel_col = static_cast(col - min_col); + const auto block = rel_col / block_size; + const auto col_in_block = rel_col % block_size; + auto local_bitmap = uint32{i < row_len ? 1u : 0u} << col_in_block; + bool is_first = + segment_scan(subwarp, block, local_bitmap, + [](config::lane_mask_type a, + config::lane_mask_type b) { return a | b; }); + if (is_first && i < row_len) { + block_bitmaps[block] |= local_bitmap; + } + } + // compute bitmap ranks + int32 block_partial_sum{}; + for (int32 base_i = 0; base_i < num_blocks; base_i += subwarp_size) { + const auto i = base_i + lane; + const auto bitmap = i < num_blocks ? block_bitmaps[i] : 0; + int32 local_partial_sum{}; + int32 local_total_sum{}; + subwarp_prefix_sum(popcnt(bitmap), local_partial_sum, + local_total_sum, subwarp); + if (i < num_blocks) { + block_ranks[i] = local_partial_sum + block_partial_sum; + } + block_partial_sum += local_total_sum; + } + return; + } + // sparse hashmap storage + constexpr double inv_golden_ratio = 0.61803398875; + // use golden ratio as approximation for hash parameter that spreads + // consecutive values as far apart as possible. Ensure lowest bit is set + // otherwise we skip odd hashtable entries + const auto hash_parameter = + 1u | static_cast(available_storage * inv_golden_ratio); + if (lane == 0) { + row_desc[row] = (static_cast(hash_parameter) << 32) | + static_cast(matrix::sparsity_type::hash); + } + // fill hashmap with sentinel + constexpr int32 empty = -1; + for (int32 i = lane; i < available_storage; i += subwarp_size) { + local_storage[i] = empty; + } + // memory barrier + subwarp.sync(); + // fill with actual entries + for (IndexType base_i = 0; base_i < row_len; base_i += subwarp_size) { + const auto i = base_i + lane; + const auto col = i < row_len ? local_cols[i] : 0; + auto hash = i < row_len ? (static_cast(col) * hash_parameter) % + static_cast(available_storage) + : available_storage; + // collision resolution +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700) + const auto this_lane_mask = config::lane_mask_type{1} << lane; + const auto lane_prefix_mask = this_lane_mask - 1; + // memory barrier to previous loop iteration + subwarp.sync(); + int32 entry = i < row_len ? 
local_storage[hash] : empty_bucket; + // find all threads in the subwarp with the same hash key + auto colliding = subwarp.match_any(hash); + // if there are any collisions with previously filled buckets + while (subwarp.any(entry != empty || colliding != this_lane_mask)) { + if (entry != empty_bucket) { + // assign consecutive indices to matching threads + hash += 1 + popcnt(colliding & lane_prefix_mask); + // cheap modulo replacement + if (hash >= available_storage) { + hash -= available_storage; + } + // this can only fail for row_length < 16 + assert(hash < available_storage); + entry = local_storage[hash]; + } + colliding = subwarp.match_any(hash); + } + if (i < row_len) { + local_storage[hash] = i; + } +#else + if (i < row_len) { + while (atomicCAS(local_storage + hash, empty, i) != empty) { + hash++; + if (hash >= available_storage) { + hash = 0; + } + } + } +#endif + } +} + + } // namespace kernel diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp index c67d5e0730d..1cfd822190e 100644 --- a/core/matrix/csr_kernels.hpp +++ b/core/matrix/csr_kernels.hpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -206,6 +207,12 @@ namespace kernels { const matrix::Dense* beta, \ matrix::Csr* mtx) +#define GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL(IndexType) \ + void build_lookup(std::shared_ptr exec, \ + const IndexType* row_ptrs, const IndexType* col_idxs, \ + size_type num_rows, matrix::sparsity_type allowed, \ + int64* row_desc, int32* storage) + #define GKO_DECLARE_ALL_AS_TEMPLATES \ template \ @@ -259,7 +266,9 @@ namespace kernels { template \ GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST(ValueType, IndexType); \ template \ - GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL(ValueType, IndexType) + GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL(IndexType) GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(csr, GKO_DECLARE_ALL_AS_TEMPLATES); diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index b1e892ebf48..3b40992d0b3 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -61,6 +62,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
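The build_csr_lookup kernel above packs each row's lookup metadata into a single 64-bit descriptor: the low bits carry the sparsity_type tag, the upper 32 bits carry the mode-specific payload (the bitmap block count or the hash parameter). A minimal host-side model of that packing, with purely illustrative helper names, might look as follows; it only restates what the kernel writes and what device_sparsity_lookup later reads via `desc & 0xF` and `desc >> 32`:

    #include <cassert>
    #include <cstdint>

    enum class sparsity_type : int { full = 1, bitmap = 2, hash = 4 };

    // payload (bitmap block count or hash parameter) in the upper 32 bits,
    // type tag in the low bits, matching row_desc[row] written above
    std::int64_t pack_row_desc(sparsity_type type, std::uint32_t payload)
    {
        return (static_cast<std::int64_t>(payload) << 32) |
               static_cast<std::int64_t>(type);
    }

    sparsity_type unpack_type(std::int64_t desc)
    {
        return static_cast<sparsity_type>(desc & 0xF);
    }

    std::uint32_t unpack_payload(std::int64_t desc)
    {
        return static_cast<std::uint32_t>(desc >> 32);
    }

    int main()
    {
        const auto desc = pack_row_desc(sparsity_type::bitmap, 3);
        assert(unpack_type(desc) == sparsity_type::bitmap);
        assert(unpack_payload(desc) == 3);
    }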
#include "cuda/components/format_conversion.cuh" #include "cuda/components/intrinsics.cuh" #include "cuda/components/merging.cuh" +#include "cuda/components/prefix_sum.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/segment_scan.cuh" #include "cuda/components/thread_ids.cuh" @@ -602,7 +604,7 @@ void spgemm(std::shared_ptr exec, cusparse::destroy(b_descr); cusparse::destroy(a_descr); -#else // CUDA_VERSION >= 11000 +#else // CUDA_VERSION >= 11000 const auto beta = zero(); auto spgemm_descr = cusparse::create_spgemm_descr(); auto a_descr = cusparse::create_csr( @@ -775,7 +777,7 @@ void advanced_spgemm(std::shared_ptr exec, cusparse::destroy(c_descr); cusparse::destroy(b_descr); cusparse::destroy(a_descr); -#else // CUDA_VERSION >= 11000 +#else // CUDA_VERSION >= 11000 auto null_value = static_cast(nullptr); auto null_index = static_cast(nullptr); auto one_val = one(); @@ -1310,6 +1312,22 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); +template +void build_lookup(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, matrix::sparsity_type allowed, + int64* row_desc, int32* storage) +{ + const auto num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + kernel::build_csr_lookup + <<>>(row_ptrs, col_idxs, num_rows, + allowed, row_desc, storage); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); + + } // namespace csr } // namespace cuda } // namespace kernels diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu index c9c4782d262..79f75cf2dac 100644 --- a/cuda/matrix/fbcsr_kernels.cu +++ b/cuda/matrix/fbcsr_kernels.cu @@ -49,6 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include @@ -68,6 +69,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/components/atomic.cuh" #include "cuda/components/cooperative_groups.cuh" #include "cuda/components/merging.cuh" +#include "cuda/components/prefix_sum.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/test/matrix/csr_kernels.cpp b/cuda/test/matrix/csr_kernels.cpp index affd108ea6f..fd07c59bd2b 100644 --- a/cuda/test/matrix/csr_kernels.cpp +++ b/cuda/test/matrix/csr_kernels.cpp @@ -1012,4 +1012,62 @@ TEST_F(Csr, AddScaledIdentityToNonSquare) } +TEST_F(Csr, BuildLookupDataWorks) +{ + set_up_apply_data(std::make_shared()); + using gko::matrix::sparsity_type; + gko::Array row_descs(ref, mtx->get_size()[0]); + gko::Array lookup_info(ref, mtx->get_num_stored_elements() * 2); + gko::Array drow_descs(cuda, mtx->get_size()[0]); + gko::Array dlookup_info(cuda, + mtx->get_num_stored_elements() * 2); + for (auto allowed_methods : + {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { + const auto full_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::full)); + const auto bitmap_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::bitmap)); + const auto bitmap_equivalent = + bitmap_allowed ? sparsity_type::bitmap : sparsity_type::hash; + const auto full_equivalent = + full_allowed ? 
sparsity_type::full : bitmap_equivalent; + SCOPED_TRACE("full: " + std::to_string(full_allowed) + + " bitmap: " + std::to_string(bitmap_allowed)); + + gko::kernels::reference::csr::build_lookup( + ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + mtx->get_size()[0], allowed_methods, row_descs.get_data(), + lookup_info.get_data()); + gko::kernels::cuda::csr::build_lookup( + cuda, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), + dmtx->get_size()[0], allowed_methods, drow_descs.get_data(), + dlookup_info.get_data()); + + gko::Array host_row_descs(ref, drow_descs); + gko::Array host_lookup_info(ref, dlookup_info); + for (int row = 0; row < dmtx->get_size()[0]; row++) { + SCOPED_TRACE("row: " + std::to_string(row)); + const auto row_begin = mtx->get_const_row_ptrs()[row]; + const auto row_nnz = mtx->get_const_row_ptrs()[row + 1] - row_begin; + gko::matrix::device_sparsity_lookup lookup{ + mtx->get_const_col_idxs() + row_begin, row_nnz, + host_lookup_info.get_const_data() + (row_begin * 2), + host_row_descs.get_const_data()[row]}; + + ASSERT_EQ(host_row_descs.get_const_data()[row] & 0xFFFF, + row_descs.get_const_data()[row] & 0xFFFF); + for (int nz = 0; nz < row_nnz; nz++) { + SCOPED_TRACE("nz: " + std::to_string(nz)); + const auto col = mtx->get_const_col_idxs()[nz + row_begin]; + ASSERT_EQ(nz, lookup[col]); + } + } + } +} + + } // namespace diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index d475a104dbf..85a3c54052d 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -2207,6 +2207,127 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); +bool csr_lookup_allowed(matrix::sparsity_type allowed, + matrix::sparsity_type type) +{ + return ((static_cast(allowed) & static_cast(type)) != 0); +} + + +template +bool csr_lookup_try_full(IndexType row_len, IndexType col_range, + matrix::sparsity_type allowed, int64& row_desc) +{ + using matrix::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::full); + if (is_allowed && row_len == col_range) { + row_desc = static_cast(sparsity_type::full); + return true; + } + return false; +} + + +template +bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, + IndexType min_col, IndexType available_storage, + matrix::sparsity_type allowed, int64& row_desc, + int32* local_storage, const IndexType* cols) +{ + constexpr static int block_size = + gko::matrix::device_sparsity_lookup::block_size; + using matrix::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::bitmap); + const auto num_blocks = ceildiv(col_range, block_size); + if (is_allowed && num_blocks * 2 <= available_storage) { + row_desc = (static_cast(num_blocks) << 32) | + static_cast(sparsity_type::bitmap); + const auto block_ranks = local_storage; + const auto block_bitmaps = + reinterpret_cast(block_ranks + num_blocks); + std::fill_n(block_bitmaps, num_blocks, 0); + for (auto col_it = cols; col_it < cols + row_len; col_it++) { + const auto rel_col = *col_it - min_col; + const auto block = rel_col / block_size; + const auto col_in_block = rel_col % block_size; + block_bitmaps[block] |= uint32{1} << col_in_block; + } + int32 partial_sum{}; + for (int32 block = 0; block < num_blocks; block++) { + block_ranks[block] = partial_sum; + partial_sum += gko::detail::popcount(block_bitmaps[block]); + } + return true; + } + return false; +} + + +template +void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, + 
int64& row_desc, int32* local_storage, + const IndexType* cols) +{ + constexpr double inv_golden_ratio = 0.61803398875; + // use golden ratio as approximation for hash parameter that spreads + // consecutive values as far apart as possible. Ensure lowest bit is set + // otherwise we skip odd hashtable entries + const auto hash_parameter = + 1u | static_cast(available_storage * inv_golden_ratio); + row_desc = (static_cast(hash_parameter) << 32) | + static_cast(matrix::sparsity_type::hash); + std::fill_n(local_storage, available_storage, -1); + for (int32 nz = 0; nz < row_len; nz++) { + auto hash = (static_cast(cols[nz]) * hash_parameter) % + static_cast(available_storage); + // linear probing: find the next empty entry + while (local_storage[hash] != -1) { + hash++; + if (hash >= available_storage) { + hash = 0; + } + } + local_storage[hash] = nz; + } +} + + +template +void build_lookup(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, matrix::sparsity_type allowed, + int64* row_desc, int32* storage) +{ + exec->get_queue()->submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) { + const auto row = static_cast(idx[0]); + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto storage_begin = 2 * row_begin; + const auto available_storage = 2 * row_len; + const auto local_storage = storage + storage_begin; + const auto local_cols = col_idxs + row_begin; + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? local_cols[row_len - 1] - min_col + 1 : 0; + bool done = + csr_lookup_try_full(row_len, col_range, allowed, row_desc[row]); + if (!done) { + done = csr_lookup_try_bitmap( + row_len, col_range, min_col, available_storage, allowed, + row_desc[row], local_storage, local_cols); + } + if (!done) { + csr_lookup_build_hash(row_len, available_storage, row_desc[row], + local_storage, local_cols); + } + }); + }); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); + + } // namespace csr } // namespace dpcpp } // namespace kernels diff --git a/dpcpp/test/matrix/csr_kernels.cpp b/dpcpp/test/matrix/csr_kernels.cpp index 3f3aa505fbc..afed432b5e6 100644 --- a/dpcpp/test/matrix/csr_kernels.cpp +++ b/dpcpp/test/matrix/csr_kernels.cpp @@ -964,4 +964,62 @@ TEST_F(Csr, CreateSubMatrixIsEquivalentToRef) } +TEST_F(Csr, BuildLookupDataWorks) +{ + set_up_apply_data(std::make_shared()); + using gko::matrix::sparsity_type; + gko::Array row_descs(ref, mtx->get_size()[0]); + gko::Array lookup_info(ref, mtx->get_num_stored_elements() * 2); + gko::Array drow_descs(dpcpp, mtx->get_size()[0]); + gko::Array dlookup_info(dpcpp, + mtx->get_num_stored_elements() * 2); + for (auto allowed_methods : + {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { + const auto full_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::full)); + const auto bitmap_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::bitmap)); + const auto bitmap_equivalent = + bitmap_allowed ? sparsity_type::bitmap : sparsity_type::hash; + const auto full_equivalent = + full_allowed ? 
sparsity_type::full : bitmap_equivalent; + SCOPED_TRACE("full: " + std::to_string(full_allowed) + + " bitmap: " + std::to_string(bitmap_allowed)); + + gko::kernels::reference::csr::build_lookup( + ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + mtx->get_size()[0], allowed_methods, row_descs.get_data(), + lookup_info.get_data()); + gko::kernels::dpcpp::csr::build_lookup( + dpcpp, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), + dmtx->get_size()[0], allowed_methods, drow_descs.get_data(), + dlookup_info.get_data()); + + gko::Array host_row_descs(ref, drow_descs); + gko::Array host_lookup_info(ref, dlookup_info); + for (int row = 0; row < dmtx->get_size()[0]; row++) { + SCOPED_TRACE("row: " + std::to_string(row)); + const auto row_begin = mtx->get_const_row_ptrs()[row]; + const auto row_nnz = mtx->get_const_row_ptrs()[row + 1] - row_begin; + gko::matrix::device_sparsity_lookup lookup{ + mtx->get_const_col_idxs() + row_begin, row_nnz, + host_lookup_info.get_const_data() + (row_begin * 2), + host_row_descs.get_const_data()[row]}; + + ASSERT_EQ(host_row_descs.get_const_data()[row] & 0xFFFF, + row_descs.get_const_data()[row] & 0xFFFF); + for (int nz = 0; nz < row_nnz; nz++) { + SCOPED_TRACE("nz: " + std::to_string(nz)); + const auto col = mtx->get_const_col_idxs()[nz + row_begin]; + ASSERT_EQ(nz, lookup[col]); + } + } + } +} + + } // namespace diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp index d1d980c60a5..1a62ec3e8b1 100644 --- a/hip/matrix/csr_kernels.hip.cpp +++ b/hip/matrix/csr_kernels.hip.cpp @@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -63,6 +64,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/intrinsics.hip.hpp" #include "hip/components/merging.hip.hpp" +#include "hip/components/prefix_sum.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/segment_scan.hip.hpp" #include "hip/components/thread_ids.hip.hpp" @@ -1093,6 +1095,22 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); +template +void build_lookup(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, matrix::sparsity_type allowed, + int64* row_desc, int32* storage) +{ + const auto num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + kernel::build_csr_lookup + <<>>(row_ptrs, col_idxs, num_rows, + allowed, row_desc, storage); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); + + } // namespace csr } // namespace hip } // namespace kernels diff --git a/hip/matrix/fbcsr_kernels.hip.cpp b/hip/matrix/fbcsr_kernels.hip.cpp index 33ed4a08e2e..58421baf8d8 100644 --- a/hip/matrix/fbcsr_kernels.hip.cpp +++ b/hip/matrix/fbcsr_kernels.hip.cpp @@ -50,6 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include @@ -59,6 +60,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
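The hash variant built by csr_lookup_build_hash above (and by the OpenMP and reference kernels further below) relies on multiplicative hashing with an odd, golden-ratio-derived parameter and linear probing. The following self-contained, host-only sketch models one row's build and lookup round trip; the names and the example column set are illustrative, not part of the library:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main()
    {
        const std::vector<std::int32_t> cols{1, 7, 30, 62};  // one sparse row
        const auto row_len = static_cast<std::int32_t>(cols.size());
        const std::int32_t table_size = 2 * row_len;  // "available_storage"
        // odd hash parameter derived from the golden ratio
        const std::uint32_t hash_parameter =
            1u | static_cast<std::uint32_t>(table_size * 0.61803398875);
        std::vector<std::int32_t> table(table_size, -1);  // -1 marks empty
        // build: store each nonzero's in-row index at the first free bucket
        for (std::int32_t nz = 0; nz < row_len; nz++) {
            auto hash = (static_cast<std::uint32_t>(cols[nz]) * hash_parameter) %
                        static_cast<std::uint32_t>(table_size);
            while (table[hash] != -1) {
                hash = (hash + 1) % static_cast<std::uint32_t>(table_size);
            }
            table[hash] = nz;
        }
        // lookup: probe from the home bucket until the stored index matches
        for (std::int32_t nz = 0; nz < row_len; nz++) {
            auto hash = (static_cast<std::uint32_t>(cols[nz]) * hash_parameter) %
                        static_cast<std::uint32_t>(table_size);
            while (cols[table[hash]] != cols[nz]) {
                hash = (hash + 1) % static_cast<std::uint32_t>(table_size);
            }
            assert(table[hash] == nz);
        }
    }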
#include "core/components/format_conversion_kernels.hpp" #include "hip/base/config.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/prefix_sum.hip.hpp" namespace gko { namespace kernels { diff --git a/hip/test/matrix/csr_kernels.hip.cpp b/hip/test/matrix/csr_kernels.hip.cpp index c4f2a8f101b..c80c1827e2d 100644 --- a/hip/test/matrix/csr_kernels.hip.cpp +++ b/hip/test/matrix/csr_kernels.hip.cpp @@ -1022,4 +1022,62 @@ TEST_F(Csr, AddScaledIdentityToNonSquare) } +TEST_F(Csr, BuildLookupDataWorks) +{ + set_up_apply_data(std::make_shared()); + using gko::matrix::sparsity_type; + gko::Array row_descs(ref, mtx->get_size()[0]); + gko::Array lookup_info(ref, mtx->get_num_stored_elements() * 2); + gko::Array drow_descs(hip, mtx->get_size()[0]); + gko::Array dlookup_info(hip, + mtx->get_num_stored_elements() * 2); + for (auto allowed_methods : + {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { + const auto full_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::full)); + const auto bitmap_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::bitmap)); + const auto bitmap_equivalent = + bitmap_allowed ? sparsity_type::bitmap : sparsity_type::hash; + const auto full_equivalent = + full_allowed ? sparsity_type::full : bitmap_equivalent; + SCOPED_TRACE("full: " + std::to_string(full_allowed) + + " bitmap: " + std::to_string(bitmap_allowed)); + + gko::kernels::reference::csr::build_lookup( + ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + mtx->get_size()[0], allowed_methods, row_descs.get_data(), + lookup_info.get_data()); + gko::kernels::hip::csr::build_lookup( + hip, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), + dmtx->get_size()[0], allowed_methods, drow_descs.get_data(), + dlookup_info.get_data()); + + gko::Array host_row_descs(ref, drow_descs); + gko::Array host_lookup_info(ref, dlookup_info); + for (int row = 0; row < dmtx->get_size()[0]; row++) { + SCOPED_TRACE("row: " + std::to_string(row)); + const auto row_begin = mtx->get_const_row_ptrs()[row]; + const auto row_nnz = mtx->get_const_row_ptrs()[row + 1] - row_begin; + gko::matrix::device_sparsity_lookup lookup{ + mtx->get_const_col_idxs() + row_begin, row_nnz, + host_lookup_info.get_const_data() + (row_begin * 2), + host_row_descs.get_const_data()[row]}; + + ASSERT_EQ(host_row_descs.get_const_data()[row] & 0xFFFF, + row_descs.get_const_data()[row] & 0xFFFF); + for (int nz = 0; nz < row_nnz; nz++) { + SCOPED_TRACE("nz: " + std::to_string(nz)); + const auto col = mtx->get_const_col_idxs()[nz + row_begin]; + ASSERT_EQ(nz, lookup[col]); + } + } + } +} + + } // namespace diff --git a/include/ginkgo/core/base/intrinsics.hpp b/include/ginkgo/core/base/intrinsics.hpp new file mode 100644 index 00000000000..ba3894475da --- /dev/null +++ b/include/ginkgo/core/base/intrinsics.hpp @@ -0,0 +1,78 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_BASE_INTRINSICS_HPP_ +#define GKO_PUBLIC_CORE_BASE_INTRINSICS_HPP_ + + +#include + + +#include + + +namespace gko { +namespace detail { + + +/** + * Returns the number of set bits in the given bitmask. + */ +GKO_ATTRIBUTES GKO_INLINE int popcount(uint32 bitmask) +{ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __popc(bitmask); +#else + std::bitset<32> bits{bitmask}; + return bits.count(); +#endif +} + + +/** + * Returns the number of set bits in the given bitmask. + */ +GKO_ATTRIBUTES GKO_INLINE int popcount(uint64 bitmask) +{ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __popcll(bitmask); +#else + std::bitset<64> bits{bitmask}; + return bits.count(); +#endif +} + + +} // namespace detail +} // namespace gko + +#endif // GKO_PUBLIC_CORE_BASE_INTRINSICS_HPP_ \ No newline at end of file diff --git a/include/ginkgo/core/matrix/csr_lookup.hpp b/include/ginkgo/core/matrix/csr_lookup.hpp new file mode 100644 index 00000000000..a409851cde0 --- /dev/null +++ b/include/ginkgo/core/matrix/csr_lookup.hpp @@ -0,0 +1,185 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_MATRIX_CSR_LOOKUP_HPP_ +#define GKO_PUBLIC_CORE_MATRIX_CSR_LOOKUP_HPP_ + +#include + + +#include +#include +#include + + +namespace gko { +namespace matrix { + + +/** + * Type describing which kind of lookup structure is used to find entries in a + * single row of a Csr matrix. + */ +enum class sparsity_type : int { + /** + * The row is dense, i.e. it contains all entries in + * `[min_col, min_col + storage_size)`. + * This means that the relative output index is `col - min_col`. + */ + full = 1, + /** + * The row is sufficiently dense that its sparsity pattern can be stored in + * a dense bitmap consisting of `storage_size` blocks of size `block_size`, + * with a total of `storage_size` blocks. Each block stores its sparsity + * pattern and the number of columns before. This means that the relative + * output index can be computed as + * ``` + * auto block = (col - min_col) / block_size; + * auto local_col = (col - min_col) % block_size; + * auto prefix_mask = (block_type{1} << local_col) - 1; + * auto output_idx = base[block] + popcount(bitmap[block] & prefix_mask); + * ``` + */ + bitmap = 2, + /** + * The row is sparse, so it is best represented using a hashtable. + * The hashtable has size `storage_size` and stores the relative output + * index directly, i.e. + * ``` + * auto hash_key = col - min_col; + * auto hash_bucket = hash(hash_key); + * while (col_idxs[hashtable[hash_bucket]] != col) { + * hash_bucket = (hash_bucket + 1) % storage_size; // linear probing + * } + * auto output_idx = hashtable[hash_bucket]; + * ``` + */ + hash = 4, +}; + + +inline sparsity_type operator|(sparsity_type a, sparsity_type b) +{ + return static_cast(static_cast(a) | + static_cast(b)); +} + + +template +class csr_sparsity_lookup { + const IndexType* row_ptrs; + const IndexType* col_idxs; + Array row_desc; + Array storage; +}; + + +template +struct device_sparsity_lookup { + /** Number of bits in a block_type entry. 
*/ + static constexpr int block_size = 32; + + const IndexType* col_idxs; + const IndexType row_nnz; + const int32* storage; + const int64 desc; + + GKO_ATTRIBUTES GKO_INLINE IndexType operator[](IndexType col) const + { + switch (static_cast(desc & 0xF)) { + case sparsity_type::full: + return lookup_full(col); + case sparsity_type::bitmap: + return lookup_bitmap(col); + case sparsity_type::hash: + return lookup_hash(col); + } + assert(false); + } + + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_full(IndexType col) const + { + const auto min_col = col_idxs[0]; + const auto out_idx = col - min_col; + assert(out_idx < row_nnz); + assert(col_idxs[out_idx] == col); + return out_idx; + } + + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_bitmap(IndexType col) const + { + const auto min_col = col_idxs[0]; + const auto num_blocks = static_cast(desc >> 32); + const auto block_bases = storage; + const auto block_bitmaps = + reinterpret_cast(block_bases + num_blocks); + const auto rel_col = col - min_col; + const auto block = rel_col / block_size; + const auto col_in_block = rel_col % block_size; + const auto prefix_mask = (uint32{1} << col_in_block) - 1; + assert(block < num_blocks); + assert(block_bitmaps[block] & (uint32{1} << col_in_block)); + const auto out_idx = + block_bases[block] + + gko::detail::popcount(block_bitmaps[block] & prefix_mask); + assert(col_idxs[out_idx] == col); + return out_idx; + } + + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_hash(IndexType col) const + { + const auto hashmap_size = static_cast(2 * row_nnz); + const auto hash_param = static_cast(desc >> 32); + const auto hashmap = storage; + auto hash = (static_cast(col) * hash_param) % hashmap_size; + assert(hashmap[hash] >= 0); + assert(hashmap[hash] < row_nnz); + // linear probing with sentinel to avoid infinite loop + while (col_idxs[hashmap[hash]] != col) { + hash++; + if (hash >= hashmap_size) { + hash = 0; + } + assert(hashmap[hash] >= 0); + assert(hashmap[hash] < row_nnz); + } + const auto out_idx = hashmap[hash]; + assert(col_idxs[out_idx] == col); + return out_idx; + } +}; + + +} // namespace matrix +} // namespace gko + +#endif // GKO_PUBLIC_CORE_MATRIX_CSR_LOOKUP_HPP_ \ No newline at end of file diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp index a7756d40df3..7788cdb89b8 100644 --- a/include/ginkgo/ginkgo.hpp +++ b/include/ginkgo/ginkgo.hpp @@ -48,6 +48,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -86,6 +87,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
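The bitmap path of device_sparsity_lookup above stores, per block of 32 columns, an exclusive rank (the number of nonzeros in preceding blocks) followed by the block's bitmap, and recovers the output index as that rank plus the popcount of the bitmap bits below the queried column. A small host-side model of this layout, with illustrative names and std::bitset standing in for the device popcount:

    #include <bitset>
    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main()
    {
        constexpr int block_size = 32;
        // one row with columns 0, 3, 34, 40, already relative to min_col
        const std::vector<std::int32_t> cols{0, 3, 34, 40};
        const int num_blocks = 2;  // covers the column range [0, 41)
        // per-block bitmaps of the sparsity pattern
        std::vector<std::uint32_t> bitmaps(num_blocks, 0);
        for (const auto col : cols) {
            bitmaps[col / block_size] |= std::uint32_t{1} << (col % block_size);
        }
        // block ranks: exclusive prefix sum over the per-block popcounts
        std::vector<std::int32_t> ranks(num_blocks);
        std::int32_t partial_sum{};
        for (int block = 0; block < num_blocks; block++) {
            ranks[block] = partial_sum;
            partial_sum += static_cast<std::int32_t>(
                std::bitset<32>{bitmaps[block]}.count());
        }
        // lookup of column 40: rank of block 1 plus bits set below bit 8
        const std::int32_t col = 40;
        const auto block = col / block_size;
        const auto col_in_block = col % block_size;
        const auto prefix_mask = (std::uint32_t{1} << col_in_block) - 1;
        const auto out_idx =
            ranks[block] + static_cast<std::int32_t>(
                std::bitset<32>{bitmaps[block] & prefix_mask}.count());
        assert(cols[out_idx] == col);  // out_idx == 3
    }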
#include #include +#include #include #include #include diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp index 53d0e7ee4e5..3de68b3c19b 100644 --- a/omp/matrix/csr_kernels.cpp +++ b/omp/matrix/csr_kernels.cpp @@ -1020,6 +1020,126 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); +bool csr_lookup_allowed(matrix::sparsity_type allowed, + matrix::sparsity_type type) +{ + return ((static_cast(allowed) & static_cast(type)) != 0); +} + + +template +bool csr_lookup_try_full(IndexType row_len, IndexType col_range, + matrix::sparsity_type allowed, int64& row_desc) +{ + using matrix::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::full); + if (is_allowed && row_len == col_range) { + row_desc = static_cast(sparsity_type::full); + return true; + } + return false; +} + + +constexpr static int csr_lookup_block_size = 32; + + +template +bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, + IndexType min_col, IndexType available_storage, + matrix::sparsity_type allowed, int64& row_desc, + int32* local_storage, const IndexType* cols) +{ + using matrix::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::bitmap); + const auto num_blocks = ceildiv(col_range, csr_lookup_block_size); + if (is_allowed && num_blocks * 2 <= available_storage) { + row_desc = (static_cast(num_blocks) << 32) | + static_cast(sparsity_type::bitmap); + const auto block_ranks = local_storage; + const auto block_bitmaps = + reinterpret_cast(block_ranks + num_blocks); + std::fill_n(block_bitmaps, num_blocks, 0); + for (auto col_it = cols; col_it < cols + row_len; col_it++) { + const auto rel_col = *col_it - min_col; + const auto block = rel_col / csr_lookup_block_size; + const auto col_in_block = rel_col % csr_lookup_block_size; + block_bitmaps[block] |= uint32{1} << col_in_block; + } + int32 partial_sum{}; + for (int32 block = 0; block < num_blocks; block++) { + block_ranks[block] = partial_sum; + partial_sum += gko::detail::popcount(block_bitmaps[block]); + } + return true; + } + return false; +} + + +template +void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, + int64& row_desc, int32* local_storage, + const IndexType* cols) +{ + constexpr double inv_golden_ratio = 0.61803398875; + // use golden ratio as approximation for hash parameter that spreads + // consecutive values as far apart as possible. 
Ensure lowest bit is set + // otherwise we skip odd hashtable entries + const auto hash_parameter = + 1u | static_cast(available_storage * inv_golden_ratio); + row_desc = (static_cast(hash_parameter) << 32) | + static_cast(matrix::sparsity_type::hash); + std::fill_n(local_storage, available_storage, -1); + for (int32 nz = 0; nz < row_len; nz++) { + auto hash = (static_cast(cols[nz]) * hash_parameter) % + static_cast(available_storage); + // linear probing: find the next empty entry + while (local_storage[hash] != -1) { + hash++; + if (hash >= available_storage) { + hash = 0; + } + } + local_storage[hash] = nz; + } +} + + +template +void build_lookup(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, matrix::sparsity_type allowed, + int64* row_desc, int32* storage) +{ +#pragma omp parallel for + for (size_type row = 0; row < num_rows; row++) { + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto storage_begin = 2 * row_begin; + const auto available_storage = 2 * row_len; + const auto local_storage = storage + storage_begin; + const auto local_cols = col_idxs + row_begin; + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? local_cols[row_len - 1] - min_col + 1 : 0; + bool done = + csr_lookup_try_full(row_len, col_range, allowed, row_desc[row]); + if (!done) { + done = csr_lookup_try_bitmap( + row_len, col_range, min_col, available_storage, allowed, + row_desc[row], local_storage, local_cols); + } + if (!done) { + csr_lookup_build_hash(row_len, available_storage, row_desc[row], + local_storage, local_cols); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); + + } // namespace csr } // namespace omp } // namespace kernels diff --git a/omp/test/matrix/csr_kernels.cpp b/omp/test/matrix/csr_kernels.cpp index d7a8a55bf4c..b1fc6d825fe 100644 --- a/omp/test/matrix/csr_kernels.cpp +++ b/omp/test/matrix/csr_kernels.cpp @@ -56,6 +56,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/test/utils.hpp" #include "core/test/utils/matrix_utils.hpp" #include "core/test/utils/unsort_matrix.hpp" +#include "ginkgo/core/matrix/csr_lookup.hpp" namespace { @@ -816,4 +817,61 @@ TEST_F(Csr, AddScaledIdentityToNonSquare) } +TEST_F(Csr, BuildLookupDataWorks) +{ + set_up_apply_data(); + using gko::matrix::sparsity_type; + gko::Array row_descs(ref, mtx->get_size()[0]); + gko::Array lookup_info(ref, mtx->get_num_stored_elements() * 2); + gko::Array drow_descs(omp, mtx->get_size()[0]); + gko::Array dlookup_info(omp, + mtx->get_num_stored_elements() * 2); + for (auto allowed_methods : + {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { + const auto full_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::full)); + const auto bitmap_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::bitmap)); + const auto bitmap_equivalent = + bitmap_allowed ? sparsity_type::bitmap : sparsity_type::hash; + const auto full_equivalent = + full_allowed ? 
sparsity_type::full : bitmap_equivalent; + SCOPED_TRACE("full: " + std::to_string(full_allowed) + + " bitmap: " + std::to_string(bitmap_allowed)); + + gko::kernels::reference::csr::build_lookup( + ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + mtx->get_size()[0], allowed_methods, row_descs.get_data(), + lookup_info.get_data()); + gko::kernels::omp::csr::build_lookup( + omp, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), + dmtx->get_size()[0], allowed_methods, drow_descs.get_data(), + dlookup_info.get_data()); + + for (int row = 0; row < dmtx->get_size()[0]; row++) { + SCOPED_TRACE("row: " + std::to_string(row)); + const auto row_begin = mtx->get_const_row_ptrs()[row]; + const auto row_nnz = mtx->get_const_row_ptrs()[row + 1] - row_begin; + gko::matrix::device_sparsity_lookup lookup{ + mtx->get_const_col_idxs() + row_begin, row_nnz, + dlookup_info.get_const_data() + (row_begin * 2), + drow_descs.get_const_data()[row]}; + + ASSERT_EQ( + static_cast(drow_descs.get_const_data()[row]) & 0xFFFF, + static_cast(row_descs.get_const_data()[row]) & 0xFFFF); + for (int nz = 0; nz < row_nnz; nz++) { + SCOPED_TRACE("nz: " + std::to_string(nz)); + const auto col = mtx->get_const_col_idxs()[nz + row_begin]; + ASSERT_EQ(nz, lookup[col]); + } + } + } +} + + } // namespace diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp index 2e913b128a3..a343695d90c 100644 --- a/reference/matrix/csr_kernels.cpp +++ b/reference/matrix/csr_kernels.cpp @@ -1016,6 +1016,124 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); +bool csr_lookup_allowed(matrix::sparsity_type allowed, + matrix::sparsity_type type) +{ + return ((static_cast(allowed) & static_cast(type)) != 0); +} + + +template +bool csr_lookup_try_full(IndexType row_len, IndexType col_range, + matrix::sparsity_type allowed, int64& row_desc) +{ + using matrix::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::full); + if (is_allowed && row_len == col_range) { + row_desc = static_cast(sparsity_type::full); + return true; + } + return false; +} + + +template +bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, + IndexType min_col, IndexType available_storage, + matrix::sparsity_type allowed, int64& row_desc, + int32* local_storage, const IndexType* cols) +{ + constexpr static int block_size = + gko::matrix::device_sparsity_lookup::block_size; + using matrix::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::bitmap); + const auto num_blocks = ceildiv(col_range, block_size); + if (is_allowed && num_blocks * 2 <= available_storage) { + row_desc = (static_cast(num_blocks) << 32) | + static_cast(sparsity_type::bitmap); + const auto block_ranks = local_storage; + const auto block_bitmaps = + reinterpret_cast(block_ranks + num_blocks); + std::fill_n(block_bitmaps, num_blocks, 0); + for (auto col_it = cols; col_it < cols + row_len; col_it++) { + const auto rel_col = *col_it - min_col; + const auto block = rel_col / block_size; + const auto col_in_block = rel_col % block_size; + block_bitmaps[block] |= uint32{1} << col_in_block; + } + int32 partial_sum{}; + for (int32 block = 0; block < num_blocks; block++) { + block_ranks[block] = partial_sum; + partial_sum += gko::detail::popcount(block_bitmaps[block]); + } + return true; + } + return false; +} + + +template +void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, + int64& row_desc, int32* local_storage, + const IndexType* cols) +{ + 
constexpr double inv_golden_ratio = 0.61803398875; + // use golden ratio as approximation for hash parameter that spreads + // consecutive values as far apart as possible. Ensure lowest bit is set + // otherwise we skip odd hashtable entries + const auto hash_parameter = + 1u | static_cast(available_storage * inv_golden_ratio); + row_desc = (static_cast(hash_parameter) << 32) | + static_cast(matrix::sparsity_type::hash); + std::fill_n(local_storage, available_storage, -1); + for (int32 nz = 0; nz < row_len; nz++) { + auto hash = (static_cast(cols[nz]) * hash_parameter) % + static_cast(available_storage); + // linear probing: find the next empty entry + while (local_storage[hash] != -1) { + hash++; + if (hash >= available_storage) { + hash = 0; + } + } + local_storage[hash] = nz; + } +} + + +template +void build_lookup(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, matrix::sparsity_type allowed, + int64* row_desc, int32* storage) +{ + for (size_type row = 0; row < num_rows; row++) { + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto storage_begin = 2 * row_begin; + const auto available_storage = 2 * row_len; + const auto local_storage = storage + storage_begin; + const auto local_cols = col_idxs + row_begin; + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? local_cols[row_len - 1] - min_col + 1 : 0; + bool done = + csr_lookup_try_full(row_len, col_range, allowed, row_desc[row]); + if (!done) { + done = csr_lookup_try_bitmap( + row_len, col_range, min_col, available_storage, allowed, + row_desc[row], local_storage, local_cols); + } + if (!done) { + csr_lookup_build_hash(row_len, available_storage, row_desc[row], + local_storage, local_cols); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); + + } // namespace csr } // namespace reference } // namespace kernels diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp index 7341c6347fe..a5cbf975daf 100644 --- a/reference/test/matrix/csr_kernels.cpp +++ b/reference/test/matrix/csr_kernels.cpp @@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
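For reference, the choice the serial build_lookup above makes per row (assuming all three sparsity types are allowed) depends only on the row length, the column range, and the two int32 storage words available per nonzero. The helper below is illustrative, not a library function; the checked values correspond to the rows of the 6x65 matrix constructed in the GeneratesLookupData test that follows:

    #include <cassert>
    #include <cstdint>

    enum class sparsity_type : int { full = 1, bitmap = 2, hash = 4 };

    sparsity_type classify(std::int64_t row_len, std::int64_t col_range)
    {
        if (row_len == col_range) {
            return sparsity_type::full;  // contiguous (or empty) column range
        }
        const auto num_blocks = (col_range + 31) / 32;  // ceildiv by block_size
        if (num_blocks * 2 <= 2 * row_len) {            // fits available_storage
            return sparsity_type::bitmap;
        }
        return sparsity_type::hash;
    }

    int main()
    {
        assert(classify(0, 0) == sparsity_type::full);      // row 0: empty
        assert(classify(65, 65) == sparsity_type::full);    // row 1: full row
        assert(classify(22, 64) == sparsity_type::bitmap);  // row 2: every 3rd column
        assert(classify(2, 65) == sparsity_type::hash);     // row 3: columns 0 and 64
        assert(classify(20, 20) == sparsity_type::full);    // row 4: columns 10..29
        assert(classify(4, 4) == sparsity_type::full);      // row 5: columns 2..5
    }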
#include #include #include +#include #include #include #include @@ -1760,4 +1761,80 @@ TYPED_TEST(Csr, CanGetSubmatrix2) } +TYPED_TEST(Csr, GeneratesLookupData) +{ + using Mtx = typename TestFixture::Mtx; + using IndexType = typename Mtx::index_type; + using gko::matrix::sparsity_type; + auto mtx = Mtx::create(this->exec); + typename Mtx::mat_data data{gko::dim<2>{6, 65}}; + for (int i = 0; i < 65; i++) { + // row 0: empty row + // row 1: full row + data.nonzeros.emplace_back(1, i, 1.0); + // row 2: pretty dense row + if (i % 3 == 0) { + data.nonzeros.emplace_back(2, i, 1.0); + } + // row 4-5: contiguous row + if (i >= 10 && i < 30) { + data.nonzeros.emplace_back(4, i, 1.0); + } + if (i >= 2 && i < 6) { + data.nonzeros.emplace_back(5, i, 1.0); + } + } + // row 3: very sparse + data.nonzeros.emplace_back(3, 0, 1.0); + data.nonzeros.emplace_back(3, 64, 1.0); + data.ensure_row_major_order(); + mtx->read(data); + gko::Array row_descs(this->exec, mtx->get_size()[0]); + gko::Array lookup_info(this->exec, + mtx->get_num_stored_elements() * 2); + for (auto allowed_methods : + {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { + const auto full_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::full)); + const auto bitmap_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::bitmap)); + const auto bitmap_equivalent = + bitmap_allowed ? sparsity_type::bitmap : sparsity_type::hash; + const auto full_equivalent = + full_allowed ? sparsity_type::full : bitmap_equivalent; + SCOPED_TRACE("full: " + std::to_string(full_allowed) + + " bitmap: " + std::to_string(bitmap_allowed)); + + gko::kernels::reference::csr::build_lookup( + this->exec, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + mtx->get_size()[0], allowed_methods, row_descs.get_data(), + lookup_info.get_data()); + + const auto descs = row_descs.get_const_data(); + for (auto entry : data.nonzeros) { + const auto row = entry.row; + const auto col = entry.column; + const auto row_begin = mtx->get_const_row_ptrs()[row]; + const auto row_nnz = mtx->get_const_row_ptrs()[row + 1] - row_begin; + gko::matrix::device_sparsity_lookup lookup{ + mtx->get_const_col_idxs() + row_begin, row_nnz, + lookup_info.get_const_data() + (row_begin * 2), descs[row]}; + + const auto nz = lookup[col] + row_begin; + ASSERT_EQ(mtx->get_const_col_idxs()[nz], col); + } + ASSERT_EQ(descs[0] & 0xFFFF, static_cast(full_equivalent)); + ASSERT_EQ(descs[1] & 0xFFFF, static_cast(full_equivalent)); + ASSERT_EQ(descs[2] & 0xFFFF, static_cast(bitmap_equivalent)); + ASSERT_EQ(descs[3] & 0xFFFF, static_cast(sparsity_type::hash)); + ASSERT_EQ(descs[4] & 0xFFFF, static_cast(full_equivalent)); + ASSERT_EQ(descs[5] & 0xFFFF, static_cast(full_equivalent)); + } +} + + } // namespace
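Taken together, a host-side caller would drive the new kernels roughly the way the tests above do. The fragment below is a sketch rather than canonical usage: it assumes exec, mtx (a Csr matrix), row and col are already defined, and it picks gko::int32 as the index type purely for illustration; the array sizes and the per-row wrapping follow the test code:

    // restrict the generated lookup structures via the new operator|
    const auto allowed =
        gko::matrix::sparsity_type::bitmap | gko::matrix::sparsity_type::hash;
    // one 64-bit descriptor per row, two int32 storage words per nonzero
    gko::Array<gko::int64> row_descs(exec, mtx->get_size()[0]);
    gko::Array<gko::int32> storage(exec, mtx->get_num_stored_elements() * 2);
    gko::kernels::reference::csr::build_lookup(
        exec, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(),
        mtx->get_size()[0], allowed, row_descs.get_data(), storage.get_data());
    // wrap a single row and map a column index to its position in that row
    const auto row_begin = mtx->get_const_row_ptrs()[row];
    const auto row_nnz = mtx->get_const_row_ptrs()[row + 1] - row_begin;
    gko::matrix::device_sparsity_lookup<gko::int32> lookup{
        mtx->get_const_col_idxs() + row_begin, row_nnz,
        storage.get_const_data() + 2 * row_begin,
        row_descs.get_const_data()[row]};
    const auto nz_within_row = lookup[col];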