From b7ba34920a0bf8aa988591d0e3747703894fe761 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 8 Feb 2022 20:50:24 +0100 Subject: [PATCH 1/8] add kernels for Csr sparsity pattern lookup --- .../cuda_hip/components/segment_scan.hpp.inc | 15 +- common/cuda_hip/matrix/coo_kernels.hpp.inc | 10 +- common/cuda_hip/matrix/csr_kernels.hpp.inc | 270 ++++++++++++++---- core/matrix/csr_kernels.hpp | 11 +- cuda/matrix/csr_kernels.cu | 2 + cuda/matrix/fbcsr_kernels.cu | 2 + cuda/test/matrix/csr_kernels.cpp | 58 ++++ dpcpp/matrix/csr_kernels.dp.cpp | 121 ++++++++ dpcpp/test/matrix/csr_kernels.cpp | 58 ++++ hip/matrix/csr_kernels.hip.cpp | 2 + hip/matrix/fbcsr_kernels.hip.cpp | 2 + hip/test/matrix/csr_kernels.hip.cpp | 58 ++++ include/ginkgo/core/base/intrinsics.hpp | 78 +++++ include/ginkgo/core/matrix/csr_lookup.hpp | 186 ++++++++++++ include/ginkgo/ginkgo.hpp | 2 + omp/matrix/csr_kernels.cpp | 120 ++++++++ omp/test/matrix/csr_kernels.cpp | 58 ++++ reference/matrix/csr_kernels.cpp | 118 ++++++++ reference/test/matrix/csr_kernels.cpp | 77 +++++ 19 files changed, 1185 insertions(+), 63 deletions(-) create mode 100644 include/ginkgo/core/base/intrinsics.hpp create mode 100644 include/ginkgo/core/matrix/csr_lookup.hpp diff --git a/common/cuda_hip/components/segment_scan.hpp.inc b/common/cuda_hip/components/segment_scan.hpp.inc index 2bd2e3eec90..cc873d5850d 100644 --- a/common/cuda_hip/components/segment_scan.hpp.inc +++ b/common/cuda_hip/components/segment_scan.hpp.inc @@ -38,25 +38,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * performs suffix sum. Works on the source array and returns whether the thread * is the first element of its segment with same `ind`. */ -template +template __device__ __forceinline__ bool segment_scan( const group::thread_block_tile& group, const IndexType ind, - ValueType* __restrict__ val) + ValueType& val, Operator op) { bool head = true; #pragma unroll for (int i = 1; i < subwarp_size; i <<= 1) { const IndexType add_ind = group.shfl_up(ind, i); - ValueType add_val = zero(); - if (add_ind == ind && threadIdx.x >= i) { - add_val = *val; + ValueType add_val{}; + if (add_ind == ind && group.thread_rank() >= i) { + add_val = val; if (i == 1) { head = false; } } add_val = group.shfl_down(add_val, i); - if (threadIdx.x < subwarp_size - i) { - *val += add_val; + if (group.thread_rank() < subwarp_size - i) { + val = op(val, add_val); } } return head; diff --git a/common/cuda_hip/matrix/coo_kernels.hpp.inc b/common/cuda_hip/matrix/coo_kernels.hpp.inc index a13068c1ce0..a5dd70bc95a 100644 --- a/common/cuda_hip/matrix/coo_kernels.hpp.inc +++ b/common/cuda_hip/matrix/coo_kernels.hpp.inc @@ -82,8 +82,9 @@ __device__ void spmv_kernel(const size_type nnz, const size_type num_lines, (ind + subwarp_size < nnz) ? row[ind + subwarp_size] : row[nnz - 1]; // segmented scan if (tile_block.any(curr_row != next_row)) { - bool is_first_in_segment = - segment_scan(tile_block, curr_row, &temp_val); + bool is_first_in_segment = segment_scan( + tile_block, curr_row, temp_val, + [](ValueType a, ValueType b) { return a + b; }); if (is_first_in_segment) { atomic_add(&(c[curr_row * c_stride + column_id]), scale(temp_val)); @@ -97,8 +98,9 @@ __device__ void spmv_kernel(const size_type nnz, const size_type num_lines, temp_val += (ind < nnz) ? 
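// (lanes with ind >= nnz contribute zero, leaving the segmented scan below unaffected)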
val[ind] * b[col[ind] * b_stride + column_id] : zero(); // segmented scan - bool is_first_in_segment = - segment_scan(tile_block, curr_row, &temp_val); + bool is_first_in_segment = segment_scan( + tile_block, curr_row, temp_val, + [](ValueType a, ValueType b) { return a + b; }); if (is_first_in_segment) { atomic_add(&(c[curr_row * c_stride + column_id]), scale(temp_val)); } diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index a754971db77..e6a1f7341d7 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -68,22 +68,21 @@ __device__ __forceinline__ bool block_segment_scan_reverse( template __device__ __forceinline__ void find_next_row( const IndexType num_rows, const IndexType data_size, const IndexType ind, - IndexType* __restrict__ row, IndexType* __restrict__ row_end, - const IndexType row_predict, const IndexType row_predict_end, - const IndexType* __restrict__ row_ptr) + IndexType& row, IndexType& row_end, const IndexType row_predict, + const IndexType row_predict_end, const IndexType* __restrict__ row_ptr) { if (!overflow || ind < data_size) { - if (ind >= *row_end) { - *row = row_predict; - *row_end = row_predict_end; - while (ind >= *row_end) { - *row_end = row_ptr[++*row + 1]; + if (ind >= row_end) { + row = row_predict; + row_end = row_predict_end; + while (ind >= row_end) { + row_end = row_ptr[++row + 1]; } } } else { - *row = num_rows - 1; - *row_end = data_size; + row = num_rows - 1; + row_end = data_size; } } @@ -92,16 +91,17 @@ template __device__ __forceinline__ void warp_atomic_add( const group::thread_block_tile& group, bool force_write, - ValueType* __restrict__ val, const IndexType row, ValueType* __restrict__ c, + ValueType& val, const IndexType row, ValueType* __restrict__ c, const size_type c_stride, const IndexType column_id, Closure scale) { // do a local scan to avoid atomic collisions - const bool need_write = segment_scan(group, row, val); + const bool need_write = segment_scan( + group, row, val, [](ValueType a, ValueType b) { return a + b; }); if (need_write && force_write) { - atomic_add(&(c[row * c_stride + column_id]), scale(*val)); + atomic_add(&(c[row * c_stride + column_id]), scale(val)); } if (!need_write || force_write) { - *val = zero(); + val = zero(); } } @@ -111,28 +111,27 @@ template & group, const IndexType num_rows, const IndexType data_size, const IndexType ind, - IndexType* __restrict__ row, IndexType* __restrict__ row_end, - IndexType* __restrict__ nrow, IndexType* __restrict__ nrow_end, - ValueType* __restrict__ temp_val, const ValueType* __restrict__ val, + IndexType& row, IndexType& row_end, IndexType& nrow, IndexType& nrow_end, + ValueType& temp_val, const ValueType* __restrict__ val, const IndexType* __restrict__ col_idxs, const IndexType* __restrict__ row_ptrs, const ValueType* __restrict__ b, const size_type b_stride, ValueType* __restrict__ c, const size_type c_stride, const IndexType column_id, Closure scale) { - const IndexType curr_row = *row; - find_next_row(num_rows, data_size, ind, row, row_end, *nrow, - *nrow_end, row_ptrs); + const auto curr_row = row; + find_next_row(num_rows, data_size, ind, row, row_end, nrow, nrow_end, + row_ptrs); // segmented scan - if (group.any(curr_row != *row)) { - warp_atomic_add(group, curr_row != *row, temp_val, curr_row, c, - c_stride, column_id, scale); - *nrow = group.shfl(*row, subwarp_size - 1); - *nrow_end = group.shfl(*row_end, subwarp_size - 1); + if (group.any(curr_row != row)) { + 
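// some lane advanced to a new row: flush the partial sums of the finished rows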
warp_atomic_add(group, curr_row != row, temp_val, curr_row, c, c_stride, + column_id, scale); + nrow = group.shfl(row, subwarp_size - 1); + nrow_end = group.shfl(row_end, subwarp_size - 1); } if (!last || ind < data_size) { const auto col = col_idxs[ind]; - *temp_val += val[ind] * b[col * b_stride + column_id]; + temp_val += val[ind] * b[col * b_stride + column_id]; } } @@ -171,21 +170,21 @@ __device__ __forceinline__ void spmv_kernel( auto nrow_end = row_end; ValueType temp_val = zero(); IndexType ind = start + threadIdx.x; - find_next_row(num_rows, data_size, ind, &row, &row_end, nrow, - nrow_end, row_ptrs); + find_next_row(num_rows, data_size, ind, row, row_end, nrow, nrow_end, + row_ptrs); const IndexType ind_end = end - wsize; const auto tile_block = group::tiled_partition(group::this_thread_block()); for (; ind < ind_end; ind += wsize) { - process_window(tile_block, num_rows, data_size, ind, &row, - &row_end, &nrow, &nrow_end, &temp_val, val, - col_idxs, row_ptrs, b, b_stride, c, c_stride, - column_id, scale); - } - process_window(tile_block, num_rows, data_size, ind, &row, &row_end, - &nrow, &nrow_end, &temp_val, val, col_idxs, row_ptrs, - b, b_stride, c, c_stride, column_id, scale); - warp_atomic_add(tile_block, true, &temp_val, row, c, c_stride, column_id, + process_window(tile_block, num_rows, data_size, ind, row, + row_end, nrow, nrow_end, temp_val, val, col_idxs, + row_ptrs, b, b_stride, c, c_stride, column_id, + scale); + } + process_window(tile_block, num_rows, data_size, ind, row, row_end, + nrow, nrow_end, temp_val, val, col_idxs, row_ptrs, b, + b_stride, c, c_stride, column_id, scale); + warp_atomic_add(tile_block, true, temp_val, row, c, c_stride, column_id, scale); } @@ -737,15 +736,15 @@ __global__ __launch_bounds__(default_block_size) void inv_symm_permute( template __global__ - __launch_bounds__(default_block_size) void compute_submatrix_idxs_and_vals( - const size_type num_rows, const size_type num_cols, - const size_type row_offset, const size_type col_offset, - const IndexType* __restrict__ source_row_ptrs, - const IndexType* __restrict__ source_col_idxs, - const ValueType* __restrict__ source_values, - const IndexType* __restrict__ result_row_ptrs, - IndexType* __restrict__ result_col_idxs, - ValueType* __restrict__ result_values) +__launch_bounds__(default_block_size) void compute_submatrix_idxs_and_vals( + const size_type num_rows, const size_type num_cols, + const size_type row_offset, const size_type col_offset, + const IndexType* __restrict__ source_row_ptrs, + const IndexType* __restrict__ source_col_idxs, + const ValueType* __restrict__ source_values, + const IndexType* __restrict__ result_row_ptrs, + IndexType* __restrict__ result_col_idxs, + ValueType* __restrict__ result_values) { const auto res_row = thread::get_thread_id_flat(); if (res_row < num_rows) { @@ -767,11 +766,10 @@ __global__ template __global__ - __launch_bounds__(default_block_size) void calculate_nnz_per_row_in_span( - const span row_span, const span col_span, - const IndexType* __restrict__ row_ptrs, - const IndexType* __restrict__ col_idxs, - IndexType* __restrict__ nnz_per_row) +__launch_bounds__(default_block_size) void calculate_nnz_per_row_in_span( + const span row_span, const span col_span, + const IndexType* __restrict__ row_ptrs, + const IndexType* __restrict__ col_idxs, IndexType* __restrict__ nnz_per_row) { const auto src_row = thread::get_thread_id_flat() + row_span.begin; if (src_row < row_span.end) { @@ -915,3 +913,173 @@ void convert_to_fbcsr(std::shared_ptr exec, 
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); + + +namespace kernel { + + +template +__global__ __launch_bounds__(default_block_size) void build_csr_lookup( + const IndexType* __restrict__ row_ptrs, + const IndexType* __restrict__ col_idxs, size_type num_rows, + gko::matrix::sparsity_type allowed, int64* __restrict__ row_desc, + int32* __restrict__ storage) +{ + constexpr int block_size = 32; + auto row = thread::get_subwarp_id_flat(); + if (row >= num_rows) { + return; + } + const auto subwarp = + group::tiled_partition(group::this_thread_block()); + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto storage_begin = 2 * row_begin; + const auto available_storage = 2 * row_len; + const auto local_storage = storage + storage_begin; + const auto local_cols = col_idxs + row_begin; + const auto lane = subwarp.thread_rank(); + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? local_cols[row_len - 1] - min_col + 1 : 0; + + // full column range + if (col_range == row_len && + ((static_cast(allowed) & + static_cast(matrix::sparsity_type::full)) != 0)) { + if (lane == 0) { + row_desc[row] = static_cast(matrix::sparsity_type::full); + } + return; + } + // dense bitmap storage + const auto num_blocks = static_cast(ceildiv(col_range, block_size)); + if (num_blocks * 2 <= available_storage && + ((static_cast(allowed) & + static_cast(matrix::sparsity_type::bitmap)) != 0)) { + if (lane == 0) { + row_desc[row] = (static_cast(num_blocks) << 32) | + static_cast(matrix::sparsity_type::bitmap); + } + const auto block_ranks = local_storage; + const auto block_bitmaps = + reinterpret_cast(block_ranks + num_blocks); + // fill bitmaps with zeros + for (int32 i = lane; i < num_blocks; i += subwarp_size) { + block_bitmaps[i] = 0; + } + // fill bitmaps with sparsity pattern + for (IndexType base_i = 0; base_i < row_len; base_i += subwarp_size) { + const auto i = base_i + lane; + const auto col = i < row_len ? local_cols[i] : INT_MAX; + const auto rel_col = static_cast(col - min_col); + const auto block = rel_col / block_size; + const auto col_in_block = rel_col % block_size; + auto local_bitmap = uint32{i < row_len ? 1u : 0u} << col_in_block; + bool is_first = + segment_scan(subwarp, block, local_bitmap, + [](config::lane_mask_type a, + config::lane_mask_type b) { return a | b; }); + if (is_first && i < row_len) { + block_bitmaps[block] |= local_bitmap; + } + } + // compute bitmap ranks + int32 block_partial_sum{}; + for (int32 base_i = 0; base_i < num_blocks; base_i += subwarp_size) { + const auto i = base_i + lane; + const auto bitmap = i < num_blocks ? block_bitmaps[i] : 0; + int32 local_partial_sum{}; + int32 local_total_sum{}; + subwarp_prefix_sum(popcnt(bitmap), local_partial_sum, + local_total_sum, subwarp); + if (i < num_blocks) { + block_ranks[i] = local_partial_sum + block_partial_sum; + } + block_partial_sum += local_total_sum; + } + return; + } + // sparse hashmap storage + constexpr double inv_golden_ratio = 0.61803398875; + // use golden ratio as approximation for hash parameter that spreads + // consecutive values as far apart as possible. 
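// (illustrative: available_storage = 20 yields hash_parameter = 1 | 12 = 13,
// so columns 0, 1, 2, 3 map to buckets 0, 13, 6, 19.)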
Ensure lowest bit is set + // otherwise we skip odd hashtable entries + const auto hash_parameter = + 1u | static_cast(available_storage * inv_golden_ratio); + if (lane == 0) { + row_desc[row] = (static_cast(hash_parameter) << 32) | + static_cast(matrix::sparsity_type::hash); + } + // fill hashmap with sentinel + constexpr int32 empty = -1; + for (int32 i = lane; i < available_storage; i += subwarp_size) { + local_storage[i] = empty; + } + // memory barrier + subwarp.sync(); + // fill with actual entries + for (IndexType base_i = 0; base_i < row_len; base_i += subwarp_size) { + const auto i = base_i + lane; + const auto col = i < row_len ? local_cols[i] : 0; + auto hash = i < row_len ? (static_cast(col) * hash_parameter) % + static_cast(available_storage) + : available_storage; + // collision resolution +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700) + const auto this_lane_mask = config::lane_mask_type{1} << lane; + const auto lane_prefix_mask = this_lane_mask - 1; + // memory barrier to previous loop iteration + subwarp.sync(); + int32 entry = i < row_len ? local_storage[hash] : empty; + // find all threads in the subwarp with the same hash key + auto colliding = subwarp.match_any(hash); + // if there are any collisions with previously filled buckets + while (subwarp.any(entry != empty || colliding != this_lane_mask)) { + if (entry != empty) { + // assign consecutive indices to matching threads + hash += 1 + popcnt(colliding & lane_prefix_mask); + // cheap modulo replacement + if (hash >= available_storage) { + hash -= available_storage; + } + // this can only fail for row_length < 16 + assert(hash < available_storage); + entry = local_storage[hash]; + } + colliding = subwarp.match_any(hash); + } + if (i < row_len) { + local_storage[hash] = i; + } +#else + if (i < row_len) { + while (atomicCAS(local_storage + hash, empty, i) != empty) { + hash++; + if (hash >= available_storage) { + hash = 0; + } + } + } +#endif + } +} + + +} // namespace kernel + + +template +void build_lookup(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, matrix::sparsity_type allowed, + int64* row_desc, int32* storage) +{ + const auto num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + kernel::build_csr_lookup + <<>>(row_ptrs, col_idxs, num_rows, + allowed, row_desc, storage); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp index e9ee48608ed..7b473b915b5 100644 --- a/core/matrix/csr_kernels.hpp +++ b/core/matrix/csr_kernels.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include #include #include @@ -224,6 +225,12 @@ namespace kernels { const matrix::Dense* beta, \ matrix::Csr* mtx) +#define GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL(IndexType) \ + void build_lookup(std::shared_ptr exec, \ + const IndexType* row_ptrs, const IndexType* col_idxs, \ + size_type num_rows, matrix::sparsity_type allowed, \ + int64* row_desc, int32* storage) + #define GKO_DECLARE_ALL_AS_TEMPLATES \ template \ @@ -283,7 +290,9 @@ namespace kernels { template \ GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST(ValueType, IndexType); \ template \ - GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL(ValueType, IndexType) + GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL(IndexType) GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(csr, GKO_DECLARE_ALL_AS_TEMPLATES); diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index f63622e7031..c4da4687edd 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -49,6 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -71,6 +72,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/components/format_conversion.cuh" #include "cuda/components/intrinsics.cuh" #include "cuda/components/merging.cuh" +#include "cuda/components/prefix_sum.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/segment_scan.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu index ab229b81c37..fb7830fbeba 100644 --- a/cuda/matrix/fbcsr_kernels.cu +++ b/cuda/matrix/fbcsr_kernels.cu @@ -49,6 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include @@ -69,6 +70,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/components/atomic.cuh" #include "cuda/components/cooperative_groups.cuh" #include "cuda/components/merging.cuh" +#include "cuda/components/prefix_sum.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" diff --git a/cuda/test/matrix/csr_kernels.cpp b/cuda/test/matrix/csr_kernels.cpp index 483c678d7fb..ab591555815 100644 --- a/cuda/test/matrix/csr_kernels.cpp +++ b/cuda/test/matrix/csr_kernels.cpp @@ -1012,4 +1012,62 @@ TEST_F(Csr, AddScaledIdentityToNonSquare) } +TEST_F(Csr, BuildLookupDataWorks) +{ + set_up_apply_data(std::make_shared()); + using gko::matrix::sparsity_type; + gko::Array row_descs(ref, mtx->get_size()[0]); + gko::Array lookup_info(ref, mtx->get_num_stored_elements() * 2); + gko::Array drow_descs(cuda, mtx->get_size()[0]); + gko::Array dlookup_info(cuda, + mtx->get_num_stored_elements() * 2); + for (auto allowed_methods : + {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { + const auto full_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::full)); + const auto bitmap_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::bitmap)); + const auto bitmap_equivalent = + bitmap_allowed ? sparsity_type::bitmap : sparsity_type::hash; + const auto full_equivalent = + full_allowed ? 
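// densest representation the kernel is allowed to pick for a contiguous row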
sparsity_type::full : bitmap_equivalent; + SCOPED_TRACE("full: " + std::to_string(full_allowed) + + " bitmap: " + std::to_string(bitmap_allowed)); + + gko::kernels::reference::csr::build_lookup( + ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + mtx->get_size()[0], allowed_methods, row_descs.get_data(), + lookup_info.get_data()); + gko::kernels::cuda::csr::build_lookup( + cuda, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), + dmtx->get_size()[0], allowed_methods, drow_descs.get_data(), + dlookup_info.get_data()); + + gko::Array host_row_descs(ref, drow_descs); + gko::Array host_lookup_info(ref, dlookup_info); + for (int row = 0; row < dmtx->get_size()[0]; row++) { + SCOPED_TRACE("row: " + std::to_string(row)); + const auto row_begin = mtx->get_const_row_ptrs()[row]; + const auto row_nnz = mtx->get_const_row_ptrs()[row + 1] - row_begin; + gko::matrix::device_sparsity_lookup lookup{ + mtx->get_const_col_idxs() + row_begin, row_nnz, + host_lookup_info.get_const_data() + (row_begin * 2), + host_row_descs.get_const_data()[row]}; + + ASSERT_EQ(host_row_descs.get_const_data()[row] & 0xFFFF, + row_descs.get_const_data()[row] & 0xFFFF); + for (int nz = 0; nz < row_nnz; nz++) { + SCOPED_TRACE("nz: " + std::to_string(nz)); + const auto col = mtx->get_const_col_idxs()[nz + row_begin]; + ASSERT_EQ(nz, lookup[col]); + } + } + } +} + + } // namespace diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 9ed089478b3..5a79765626e 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -2270,6 +2270,127 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); +bool csr_lookup_allowed(matrix::sparsity_type allowed, + matrix::sparsity_type type) +{ + return ((static_cast(allowed) & static_cast(type)) != 0); +} + + +template +bool csr_lookup_try_full(IndexType row_len, IndexType col_range, + matrix::sparsity_type allowed, int64& row_desc) +{ + using matrix::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::full); + if (is_allowed && row_len == col_range) { + row_desc = static_cast(sparsity_type::full); + return true; + } + return false; +} + + +template +bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, + IndexType min_col, IndexType available_storage, + matrix::sparsity_type allowed, int64& row_desc, + int32* local_storage, const IndexType* cols) +{ + constexpr static int block_size = + gko::matrix::device_sparsity_lookup::block_size; + using matrix::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::bitmap); + const auto num_blocks = ceildiv(col_range, block_size); + if (is_allowed && num_blocks * 2 <= available_storage) { + row_desc = (static_cast(num_blocks) << 32) | + static_cast(sparsity_type::bitmap); + const auto block_ranks = local_storage; + const auto block_bitmaps = + reinterpret_cast(block_ranks + num_blocks); + std::fill_n(block_bitmaps, num_blocks, 0); + for (auto col_it = cols; col_it < cols + row_len; col_it++) { + const auto rel_col = *col_it - min_col; + const auto block = rel_col / block_size; + const auto col_in_block = rel_col % block_size; + block_bitmaps[block] |= uint32{1} << col_in_block; + } + int32 partial_sum{}; + for (int32 block = 0; block < num_blocks; block++) { + block_ranks[block] = partial_sum; + partial_sum += gko::detail::popcount(block_bitmaps[block]); + } + return true; + } + return false; +} + + +template +void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, + 
int64& row_desc, int32* local_storage, + const IndexType* cols) +{ + constexpr double inv_golden_ratio = 0.61803398875; + // use golden ratio as approximation for hash parameter that spreads + // consecutive values as far apart as possible. Ensure lowest bit is set + // otherwise we skip odd hashtable entries + const auto hash_parameter = + 1u | static_cast(available_storage * inv_golden_ratio); + row_desc = (static_cast(hash_parameter) << 32) | + static_cast(matrix::sparsity_type::hash); + std::fill_n(local_storage, available_storage, -1); + for (int32 nz = 0; nz < row_len; nz++) { + auto hash = (static_cast(cols[nz]) * hash_parameter) % + static_cast(available_storage); + // linear probing: find the next empty entry + while (local_storage[hash] != -1) { + hash++; + if (hash >= available_storage) { + hash = 0; + } + } + local_storage[hash] = nz; + } +} + + +template +void build_lookup(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, matrix::sparsity_type allowed, + int64* row_desc, int32* storage) +{ + exec->get_queue()->submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) { + const auto row = static_cast(idx[0]); + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto storage_begin = 2 * row_begin; + const auto available_storage = 2 * row_len; + const auto local_storage = storage + storage_begin; + const auto local_cols = col_idxs + row_begin; + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? local_cols[row_len - 1] - min_col + 1 : 0; + bool done = + csr_lookup_try_full(row_len, col_range, allowed, row_desc[row]); + if (!done) { + done = csr_lookup_try_bitmap( + row_len, col_range, min_col, available_storage, allowed, + row_desc[row], local_storage, local_cols); + } + if (!done) { + csr_lookup_build_hash(row_len, available_storage, row_desc[row], + local_storage, local_cols); + } + }); + }); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); + + } // namespace csr } // namespace dpcpp } // namespace kernels diff --git a/dpcpp/test/matrix/csr_kernels.cpp b/dpcpp/test/matrix/csr_kernels.cpp index 375f7d93a93..3de309b336e 100644 --- a/dpcpp/test/matrix/csr_kernels.cpp +++ b/dpcpp/test/matrix/csr_kernels.cpp @@ -964,4 +964,62 @@ TEST_F(Csr, CreateSubMatrixIsEquivalentToRef) } +TEST_F(Csr, BuildLookupDataWorks) +{ + set_up_apply_data(std::make_shared()); + using gko::matrix::sparsity_type; + gko::Array row_descs(ref, mtx->get_size()[0]); + gko::Array lookup_info(ref, mtx->get_num_stored_elements() * 2); + gko::Array drow_descs(dpcpp, mtx->get_size()[0]); + gko::Array dlookup_info(dpcpp, + mtx->get_num_stored_elements() * 2); + for (auto allowed_methods : + {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { + const auto full_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::full)); + const auto bitmap_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::bitmap)); + const auto bitmap_equivalent = + bitmap_allowed ? sparsity_type::bitmap : sparsity_type::hash; + const auto full_equivalent = + full_allowed ? 
sparsity_type::full : bitmap_equivalent; + SCOPED_TRACE("full: " + std::to_string(full_allowed) + + " bitmap: " + std::to_string(bitmap_allowed)); + + gko::kernels::reference::csr::build_lookup( + ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + mtx->get_size()[0], allowed_methods, row_descs.get_data(), + lookup_info.get_data()); + gko::kernels::dpcpp::csr::build_lookup( + dpcpp, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), + dmtx->get_size()[0], allowed_methods, drow_descs.get_data(), + dlookup_info.get_data()); + + gko::Array host_row_descs(ref, drow_descs); + gko::Array host_lookup_info(ref, dlookup_info); + for (int row = 0; row < dmtx->get_size()[0]; row++) { + SCOPED_TRACE("row: " + std::to_string(row)); + const auto row_begin = mtx->get_const_row_ptrs()[row]; + const auto row_nnz = mtx->get_const_row_ptrs()[row + 1] - row_begin; + gko::matrix::device_sparsity_lookup lookup{ + mtx->get_const_col_idxs() + row_begin, row_nnz, + host_lookup_info.get_const_data() + (row_begin * 2), + host_row_descs.get_const_data()[row]}; + + ASSERT_EQ(host_row_descs.get_const_data()[row] & 0xFFFF, + row_descs.get_const_data()[row] & 0xFFFF); + for (int nz = 0; nz < row_nnz; nz++) { + SCOPED_TRACE("nz: " + std::to_string(nz)); + const auto col = mtx->get_const_col_idxs()[nz + row_begin]; + ASSERT_EQ(nz, lookup[col]); + } + } + } +} + + } // namespace diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp index 1e387f71cf3..9f5ed7143c9 100644 --- a/hip/matrix/csr_kernels.hip.cpp +++ b/hip/matrix/csr_kernels.hip.cpp @@ -50,6 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -71,6 +72,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/intrinsics.hip.hpp" #include "hip/components/merging.hip.hpp" +#include "hip/components/prefix_sum.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/segment_scan.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/matrix/fbcsr_kernels.hip.cpp b/hip/matrix/fbcsr_kernels.hip.cpp index 9e57f6ece55..df65fed3277 100644 --- a/hip/matrix/fbcsr_kernels.hip.cpp +++ b/hip/matrix/fbcsr_kernels.hip.cpp @@ -50,6 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include @@ -70,6 +71,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "hip/components/atomic.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/merging.hip.hpp" +#include "hip/components/prefix_sum.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" diff --git a/hip/test/matrix/csr_kernels.hip.cpp b/hip/test/matrix/csr_kernels.hip.cpp index bb6567f97d3..670fb28fc94 100644 --- a/hip/test/matrix/csr_kernels.hip.cpp +++ b/hip/test/matrix/csr_kernels.hip.cpp @@ -1022,4 +1022,62 @@ TEST_F(Csr, AddScaledIdentityToNonSquare) } +TEST_F(Csr, BuildLookupDataWorks) +{ + set_up_apply_data(std::make_shared()); + using gko::matrix::sparsity_type; + gko::Array row_descs(ref, mtx->get_size()[0]); + gko::Array lookup_info(ref, mtx->get_num_stored_elements() * 2); + gko::Array drow_descs(hip, mtx->get_size()[0]); + gko::Array dlookup_info(hip, + mtx->get_num_stored_elements() * 2); + for (auto allowed_methods : + {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { + const auto full_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::full)); + const auto bitmap_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::bitmap)); + const auto bitmap_equivalent = + bitmap_allowed ? sparsity_type::bitmap : sparsity_type::hash; + const auto full_equivalent = + full_allowed ? sparsity_type::full : bitmap_equivalent; + SCOPED_TRACE("full: " + std::to_string(full_allowed) + + " bitmap: " + std::to_string(bitmap_allowed)); + + gko::kernels::reference::csr::build_lookup( + ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + mtx->get_size()[0], allowed_methods, row_descs.get_data(), + lookup_info.get_data()); + gko::kernels::hip::csr::build_lookup( + hip, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), + dmtx->get_size()[0], allowed_methods, drow_descs.get_data(), + dlookup_info.get_data()); + + gko::Array host_row_descs(ref, drow_descs); + gko::Array host_lookup_info(ref, dlookup_info); + for (int row = 0; row < dmtx->get_size()[0]; row++) { + SCOPED_TRACE("row: " + std::to_string(row)); + const auto row_begin = mtx->get_const_row_ptrs()[row]; + const auto row_nnz = mtx->get_const_row_ptrs()[row + 1] - row_begin; + gko::matrix::device_sparsity_lookup lookup{ + mtx->get_const_col_idxs() + row_begin, row_nnz, + host_lookup_info.get_const_data() + (row_begin * 2), + host_row_descs.get_const_data()[row]}; + + ASSERT_EQ(host_row_descs.get_const_data()[row] & 0xFFFF, + row_descs.get_const_data()[row] & 0xFFFF); + for (int nz = 0; nz < row_nnz; nz++) { + SCOPED_TRACE("nz: " + std::to_string(nz)); + const auto col = mtx->get_const_col_idxs()[nz + row_begin]; + ASSERT_EQ(nz, lookup[col]); + } + } + } +} + + } // namespace diff --git a/include/ginkgo/core/base/intrinsics.hpp b/include/ginkgo/core/base/intrinsics.hpp new file mode 100644 index 00000000000..279c9cb0c3e --- /dev/null +++ b/include/ginkgo/core/base/intrinsics.hpp @@ -0,0 +1,78 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_BASE_INTRINSICS_HPP_ +#define GKO_PUBLIC_CORE_BASE_INTRINSICS_HPP_ + + +#include + + +#include + + +namespace gko { +namespace detail { + + +/** + * Returns the number of set bits in the given bitmask. + */ +GKO_ATTRIBUTES GKO_INLINE int popcount(uint32 bitmask) +{ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __popc(bitmask); +#else + std::bitset<32> bits{bitmask}; + return bits.count(); +#endif +} + + +/** + * Returns the number of set bits in the given bitmask. + */ +GKO_ATTRIBUTES GKO_INLINE int popcount(uint64 bitmask) +{ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __popcll(bitmask); +#else + std::bitset<64> bits{bitmask}; + return bits.count(); +#endif +} + + +} // namespace detail +} // namespace gko + +#endif // GKO_PUBLIC_CORE_BASE_INTRINSICS_HPP_ diff --git a/include/ginkgo/core/matrix/csr_lookup.hpp b/include/ginkgo/core/matrix/csr_lookup.hpp new file mode 100644 index 00000000000..b6b06be3bc7 --- /dev/null +++ b/include/ginkgo/core/matrix/csr_lookup.hpp @@ -0,0 +1,186 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_MATRIX_CSR_LOOKUP_HPP_ +#define GKO_PUBLIC_CORE_MATRIX_CSR_LOOKUP_HPP_ + + +#include + + +#include +#include +#include + + +namespace gko { +namespace matrix { + + +/** + * Type describing which kind of lookup structure is used to find entries in a + * single row of a Csr matrix. + */ +enum class sparsity_type : int { + /** + * The row is dense, i.e. it contains all entries in + * `[min_col, min_col + storage_size)`. + * This means that the relative output index is `col - min_col`. + */ + full = 1, + /** + * The row is sufficiently dense that its sparsity pattern can be stored in + * a dense bitmap consisting of `storage_size` blocks of size `block_size`, + * with a total of `storage_size` blocks. Each block stores its sparsity + * pattern and the number of columns before. This means that the relative + * output index can be computed as + * ``` + * auto block = (col - min_col) / block_size; + * auto local_col = (col - min_col) % block_size; + * auto prefix_mask = (block_type{1} << local_col) - 1; + * auto output_idx = base[block] + popcount(bitmap[block] & prefix_mask); + * ``` + */ + bitmap = 2, + /** + * The row is sparse, so it is best represented using a hashtable. + * The hashtable has size `storage_size` and stores the relative output + * index directly, i.e. + * ``` + * auto hash_key = col - min_col; + * auto hash_bucket = hash(hash_key); + * while (col_idxs[hashtable[hash_bucket]] != col) { + * hash_bucket = (hash_bucket + 1) % storage_size; // linear probing + * } + * auto output_idx = hashtable[hash_bucket]; + * ``` + */ + hash = 4, +}; + + +inline sparsity_type operator|(sparsity_type a, sparsity_type b) +{ + return static_cast(static_cast(a) | + static_cast(b)); +} + + +template +class csr_sparsity_lookup { + const IndexType* row_ptrs; + const IndexType* col_idxs; + Array row_desc; + Array storage; +}; + + +template +struct device_sparsity_lookup { + /** Number of bits in a block_type entry. 
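A relative column `r` is stored in bitmap block `r / block_size` at bit `r % block_size`.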
*/ + static constexpr int block_size = 32; + + const IndexType* col_idxs; + const IndexType row_nnz; + const int32* storage; + const int64 desc; + + GKO_ATTRIBUTES GKO_INLINE IndexType operator[](IndexType col) const + { + switch (static_cast(desc & 0xF)) { + case sparsity_type::full: + return lookup_full(col); + case sparsity_type::bitmap: + return lookup_bitmap(col); + case sparsity_type::hash: + return lookup_hash(col); + } + assert(false); + } + + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_full(IndexType col) const + { + const auto min_col = col_idxs[0]; + const auto out_idx = col - min_col; + assert(out_idx < row_nnz); + assert(col_idxs[out_idx] == col); + return out_idx; + } + + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_bitmap(IndexType col) const + { + const auto min_col = col_idxs[0]; + const auto num_blocks = static_cast(desc >> 32); + const auto block_bases = storage; + const auto block_bitmaps = + reinterpret_cast(block_bases + num_blocks); + const auto rel_col = col - min_col; + const auto block = rel_col / block_size; + const auto col_in_block = rel_col % block_size; + const auto prefix_mask = (uint32{1} << col_in_block) - 1; + assert(block < num_blocks); + assert(block_bitmaps[block] & (uint32{1} << col_in_block)); + const auto out_idx = + block_bases[block] + + gko::detail::popcount(block_bitmaps[block] & prefix_mask); + assert(col_idxs[out_idx] == col); + return out_idx; + } + + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_hash(IndexType col) const + { + const auto hashmap_size = static_cast(2 * row_nnz); + const auto hash_param = static_cast(desc >> 32); + const auto hashmap = storage; + auto hash = (static_cast(col) * hash_param) % hashmap_size; + assert(hashmap[hash] >= 0); + assert(hashmap[hash] < row_nnz); + // linear probing with sentinel to avoid infinite loop + while (col_idxs[hashmap[hash]] != col) { + hash++; + if (hash >= hashmap_size) { + hash = 0; + } + assert(hashmap[hash] >= 0); + assert(hashmap[hash] < row_nnz); + } + const auto out_idx = hashmap[hash]; + assert(col_idxs[out_idx] == col); + return out_idx; + } +}; + + +} // namespace matrix +} // namespace gko + +#endif // GKO_PUBLIC_CORE_MATRIX_CSR_LOOKUP_HPP_ diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp index b29f78c21f3..9794b3d4b42 100644 --- a/include/ginkgo/ginkgo.hpp +++ b/include/ginkgo/ginkgo.hpp @@ -48,6 +48,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -86,6 +87,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include #include #include #include diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp index a88e6e791a6..5d3b6be211b 100644 --- a/omp/matrix/csr_kernels.cpp +++ b/omp/matrix/csr_kernels.cpp @@ -1129,6 +1129,126 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); +bool csr_lookup_allowed(matrix::sparsity_type allowed, + matrix::sparsity_type type) +{ + return ((static_cast(allowed) & static_cast(type)) != 0); +} + + +template +bool csr_lookup_try_full(IndexType row_len, IndexType col_range, + matrix::sparsity_type allowed, int64& row_desc) +{ + using matrix::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::full); + if (is_allowed && row_len == col_range) { + row_desc = static_cast(sparsity_type::full); + return true; + } + return false; +} + + +constexpr static int csr_lookup_block_size = 32; + + +template +bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, + IndexType min_col, IndexType available_storage, + matrix::sparsity_type allowed, int64& row_desc, + int32* local_storage, const IndexType* cols) +{ + using matrix::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::bitmap); + const auto num_blocks = ceildiv(col_range, csr_lookup_block_size); + if (is_allowed && num_blocks * 2 <= available_storage) { + row_desc = (static_cast(num_blocks) << 32) | + static_cast(sparsity_type::bitmap); + const auto block_ranks = local_storage; + const auto block_bitmaps = + reinterpret_cast(block_ranks + num_blocks); + std::fill_n(block_bitmaps, num_blocks, 0); + for (auto col_it = cols; col_it < cols + row_len; col_it++) { + const auto rel_col = *col_it - min_col; + const auto block = rel_col / csr_lookup_block_size; + const auto col_in_block = rel_col % csr_lookup_block_size; + block_bitmaps[block] |= uint32{1} << col_in_block; + } + int32 partial_sum{}; + for (int32 block = 0; block < num_blocks; block++) { + block_ranks[block] = partial_sum; + partial_sum += gko::detail::popcount(block_bitmaps[block]); + } + return true; + } + return false; +} + + +template +void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, + int64& row_desc, int32* local_storage, + const IndexType* cols) +{ + constexpr double inv_golden_ratio = 0.61803398875; + // use golden ratio as approximation for hash parameter that spreads + // consecutive values as far apart as possible. 
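// (here available_storage = 2 * row_len is even, so an even parameter would
// only ever produce even bucket indices.)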
Ensure lowest bit is set + // otherwise we skip odd hashtable entries + const auto hash_parameter = + 1u | static_cast(available_storage * inv_golden_ratio); + row_desc = (static_cast(hash_parameter) << 32) | + static_cast(matrix::sparsity_type::hash); + std::fill_n(local_storage, available_storage, -1); + for (int32 nz = 0; nz < row_len; nz++) { + auto hash = (static_cast(cols[nz]) * hash_parameter) % + static_cast(available_storage); + // linear probing: find the next empty entry + while (local_storage[hash] != -1) { + hash++; + if (hash >= available_storage) { + hash = 0; + } + } + local_storage[hash] = nz; + } +} + + +template +void build_lookup(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, matrix::sparsity_type allowed, + int64* row_desc, int32* storage) +{ +#pragma omp parallel for + for (size_type row = 0; row < num_rows; row++) { + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto storage_begin = 2 * row_begin; + const auto available_storage = 2 * row_len; + const auto local_storage = storage + storage_begin; + const auto local_cols = col_idxs + row_begin; + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? local_cols[row_len - 1] - min_col + 1 : 0; + bool done = + csr_lookup_try_full(row_len, col_range, allowed, row_desc[row]); + if (!done) { + done = csr_lookup_try_bitmap( + row_len, col_range, min_col, available_storage, allowed, + row_desc[row], local_storage, local_cols); + } + if (!done) { + csr_lookup_build_hash(row_len, available_storage, row_desc[row], + local_storage, local_cols); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); + + } // namespace csr } // namespace omp } // namespace kernels diff --git a/omp/test/matrix/csr_kernels.cpp b/omp/test/matrix/csr_kernels.cpp index f308bf40fe6..2a00fbeb220 100644 --- a/omp/test/matrix/csr_kernels.cpp +++ b/omp/test/matrix/csr_kernels.cpp @@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -902,4 +903,61 @@ TEST_F(Csr, CreateSubMatrixFromindex_setIsEquivalentToRef) } +TEST_F(Csr, BuildLookupDataWorks) +{ + set_up_apply_data(); + using gko::matrix::sparsity_type; + gko::Array row_descs(ref, mtx->get_size()[0]); + gko::Array lookup_info(ref, mtx->get_num_stored_elements() * 2); + gko::Array drow_descs(omp, mtx->get_size()[0]); + gko::Array dlookup_info(omp, + mtx->get_num_stored_elements() * 2); + for (auto allowed_methods : + {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { + const auto full_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::full)); + const auto bitmap_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::bitmap)); + const auto bitmap_equivalent = + bitmap_allowed ? sparsity_type::bitmap : sparsity_type::hash; + const auto full_equivalent = + full_allowed ? 
sparsity_type::full : bitmap_equivalent; + SCOPED_TRACE("full: " + std::to_string(full_allowed) + + " bitmap: " + std::to_string(bitmap_allowed)); + + gko::kernels::reference::csr::build_lookup( + ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + mtx->get_size()[0], allowed_methods, row_descs.get_data(), + lookup_info.get_data()); + gko::kernels::omp::csr::build_lookup( + omp, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), + dmtx->get_size()[0], allowed_methods, drow_descs.get_data(), + dlookup_info.get_data()); + + for (int row = 0; row < dmtx->get_size()[0]; row++) { + SCOPED_TRACE("row: " + std::to_string(row)); + const auto row_begin = mtx->get_const_row_ptrs()[row]; + const auto row_nnz = mtx->get_const_row_ptrs()[row + 1] - row_begin; + gko::matrix::device_sparsity_lookup lookup{ + mtx->get_const_col_idxs() + row_begin, row_nnz, + dlookup_info.get_const_data() + (row_begin * 2), + drow_descs.get_const_data()[row]}; + + ASSERT_EQ( + static_cast(drow_descs.get_const_data()[row]) & 0xFFFF, + static_cast(row_descs.get_const_data()[row]) & 0xFFFF); + for (int nz = 0; nz < row_nnz; nz++) { + SCOPED_TRACE("nz: " + std::to_string(nz)); + const auto col = mtx->get_const_col_idxs()[nz + row_begin]; + ASSERT_EQ(nz, lookup[col]); + } + } + } +} + + } // namespace diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp index 6dbbe528162..a5fafe42cb5 100644 --- a/reference/matrix/csr_kernels.cpp +++ b/reference/matrix/csr_kernels.cpp @@ -1118,6 +1118,124 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); +bool csr_lookup_allowed(matrix::sparsity_type allowed, + matrix::sparsity_type type) +{ + return ((static_cast(allowed) & static_cast(type)) != 0); +} + + +template +bool csr_lookup_try_full(IndexType row_len, IndexType col_range, + matrix::sparsity_type allowed, int64& row_desc) +{ + using matrix::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::full); + if (is_allowed && row_len == col_range) { + row_desc = static_cast(sparsity_type::full); + return true; + } + return false; +} + + +template +bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, + IndexType min_col, IndexType available_storage, + matrix::sparsity_type allowed, int64& row_desc, + int32* local_storage, const IndexType* cols) +{ + constexpr static int block_size = + gko::matrix::device_sparsity_lookup::block_size; + using matrix::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::bitmap); + const auto num_blocks = ceildiv(col_range, block_size); + if (is_allowed && num_blocks * 2 <= available_storage) { + row_desc = (static_cast(num_blocks) << 32) | + static_cast(sparsity_type::bitmap); + const auto block_ranks = local_storage; + const auto block_bitmaps = + reinterpret_cast(block_ranks + num_blocks); + std::fill_n(block_bitmaps, num_blocks, 0); + for (auto col_it = cols; col_it < cols + row_len; col_it++) { + const auto rel_col = *col_it - min_col; + const auto block = rel_col / block_size; + const auto col_in_block = rel_col % block_size; + block_bitmaps[block] |= uint32{1} << col_in_block; + } + int32 partial_sum{}; + for (int32 block = 0; block < num_blocks; block++) { + block_ranks[block] = partial_sum; + partial_sum += gko::detail::popcount(block_bitmaps[block]); + } + return true; + } + return false; +} + + +template +void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, + int64& row_desc, int32* local_storage, + const IndexType* cols) +{ + 
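// build an open-addressing hashtable mapping each column index to its local
// nonzero position, using multiplicative hashing with linear probing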
constexpr double inv_golden_ratio = 0.61803398875; + // use golden ratio as approximation for hash parameter that spreads + // consecutive values as far apart as possible. Ensure lowest bit is set + // otherwise we skip odd hashtable entries + const auto hash_parameter = + 1u | static_cast(available_storage * inv_golden_ratio); + row_desc = (static_cast(hash_parameter) << 32) | + static_cast(matrix::sparsity_type::hash); + std::fill_n(local_storage, available_storage, -1); + for (int32 nz = 0; nz < row_len; nz++) { + auto hash = (static_cast(cols[nz]) * hash_parameter) % + static_cast(available_storage); + // linear probing: find the next empty entry + while (local_storage[hash] != -1) { + hash++; + if (hash >= available_storage) { + hash = 0; + } + } + local_storage[hash] = nz; + } +} + + +template +void build_lookup(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, matrix::sparsity_type allowed, + int64* row_desc, int32* storage) +{ + for (size_type row = 0; row < num_rows; row++) { + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto storage_begin = 2 * row_begin; + const auto available_storage = 2 * row_len; + const auto local_storage = storage + storage_begin; + const auto local_cols = col_idxs + row_begin; + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? local_cols[row_len - 1] - min_col + 1 : 0; + bool done = + csr_lookup_try_full(row_len, col_range, allowed, row_desc[row]); + if (!done) { + done = csr_lookup_try_bitmap( + row_len, col_range, min_col, available_storage, allowed, + row_desc[row], local_storage, local_cols); + } + if (!done) { + csr_lookup_build_hash(row_len, available_storage, row_desc[row], + local_storage, local_cols); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); + + } // namespace csr } // namespace reference } // namespace kernels diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp index ec610081884..6115189db22 100644 --- a/reference/test/matrix/csr_kernels.cpp +++ b/reference/test/matrix/csr_kernels.cpp @@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include #include #include @@ -1891,4 +1892,80 @@ TYPED_TEST(Csr, CanGetSubmatrixWithindex_set) } +TYPED_TEST(Csr, GeneratesLookupData) +{ + using Mtx = typename TestFixture::Mtx; + using IndexType = typename Mtx::index_type; + using gko::matrix::sparsity_type; + auto mtx = Mtx::create(this->exec); + typename Mtx::mat_data data{gko::dim<2>{6, 65}}; + for (int i = 0; i < 65; i++) { + // row 0: empty row + // row 1: full row + data.nonzeros.emplace_back(1, i, 1.0); + // row 2: pretty dense row + if (i % 3 == 0) { + data.nonzeros.emplace_back(2, i, 1.0); + } + // row 4-5: contiguous row + if (i >= 10 && i < 30) { + data.nonzeros.emplace_back(4, i, 1.0); + } + if (i >= 2 && i < 6) { + data.nonzeros.emplace_back(5, i, 1.0); + } + } + // row 3: very sparse + data.nonzeros.emplace_back(3, 0, 1.0); + data.nonzeros.emplace_back(3, 64, 1.0); + data.ensure_row_major_order(); + mtx->read(data); + gko::Array row_descs(this->exec, mtx->get_size()[0]); + gko::Array lookup_info(this->exec, + mtx->get_num_stored_elements() * 2); + for (auto allowed_methods : + {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { + const auto full_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::full)); + const auto bitmap_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::bitmap)); + const auto bitmap_equivalent = + bitmap_allowed ? sparsity_type::bitmap : sparsity_type::hash; + const auto full_equivalent = + full_allowed ? sparsity_type::full : bitmap_equivalent; + SCOPED_TRACE("full: " + std::to_string(full_allowed) + + " bitmap: " + std::to_string(bitmap_allowed)); + + gko::kernels::reference::csr::build_lookup( + this->exec, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + mtx->get_size()[0], allowed_methods, row_descs.get_data(), + lookup_info.get_data()); + + const auto descs = row_descs.get_const_data(); + for (auto entry : data.nonzeros) { + const auto row = entry.row; + const auto col = entry.column; + const auto row_begin = mtx->get_const_row_ptrs()[row]; + const auto row_nnz = mtx->get_const_row_ptrs()[row + 1] - row_begin; + gko::matrix::device_sparsity_lookup lookup{ + mtx->get_const_col_idxs() + row_begin, row_nnz, + lookup_info.get_const_data() + (row_begin * 2), descs[row]}; + + const auto nz = lookup[col] + row_begin; + ASSERT_EQ(mtx->get_const_col_idxs()[nz], col); + } + ASSERT_EQ(descs[0] & 0xFFFF, static_cast(full_equivalent)); + ASSERT_EQ(descs[1] & 0xFFFF, static_cast(full_equivalent)); + ASSERT_EQ(descs[2] & 0xFFFF, static_cast(bitmap_equivalent)); + ASSERT_EQ(descs[3] & 0xFFFF, static_cast(sparsity_type::hash)); + ASSERT_EQ(descs[4] & 0xFFFF, static_cast(full_equivalent)); + ASSERT_EQ(descs[5] & 0xFFFF, static_cast(full_equivalent)); + } +} + + } // namespace From 3ffec4f64a9c63c4603dbe066695a575f4fae653 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 24 Mar 2022 10:49:49 +0100 Subject: [PATCH 2/8] fix hashmap collision resolution on Volta --- common/cuda_hip/matrix/csr_kernels.hpp.inc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index e6a1f7341d7..1ff21194279 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -1022,9 +1022,10 @@ __global__ __launch_bounds__(default_block_size) 
void build_csr_lookup( for (IndexType base_i = 0; base_i < row_len; base_i += subwarp_size) { const auto i = base_i + lane; const auto col = i < row_len ? local_cols[i] : 0; + // make sure that each idle thread gets a unique out-of-bounds value auto hash = i < row_len ? (static_cast(col) * hash_parameter) % static_cast(available_storage) - : available_storage; + : static_cast(available_storage + lane); // collision resolution #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700) const auto this_lane_mask = config::lane_mask_type{1} << lane; @@ -1036,9 +1037,10 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup( auto colliding = subwarp.match_any(hash); // if there are any collisions with previously filled buckets while (subwarp.any(entry != empty || colliding != this_lane_mask)) { - if (entry != empty) { + if (entry != empty || colliding != this_lane_mask) { // assign consecutive indices to matching threads - hash += 1 + popcnt(colliding & lane_prefix_mask); + hash += (entry == empty ? 0 : 1) + + popcnt(colliding & lane_prefix_mask); // cheap modulo replacement if (hash >= available_storage) { hash -= available_storage; From 5f35da7fadd7afbbdb3c835fcfce8b3ea023a193 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 24 Mar 2022 10:56:08 +0100 Subject: [PATCH 3/8] add memory barrier for safety --- common/cuda_hip/matrix/csr_kernels.hpp.inc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index 1ff21194279..a8944c2f696 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -968,6 +968,8 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup( for (int32 i = lane; i < num_blocks; i += subwarp_size) { block_bitmaps[i] = 0; } + // memory barrier - just to be sure + subwarp.sync(); // fill bitmaps with sparsity pattern for (IndexType base_i = 0; base_i < row_len; base_i += subwarp_size) { const auto i = base_i + lane; From 15f28cd96b35875470eca34e029077297a4dfb03 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 29 Mar 2022 18:26:27 +0200 Subject: [PATCH 4/8] move memory barrier --- common/cuda_hip/matrix/csr_kernels.hpp.inc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index a8944c2f696..b3296ceeb92 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -968,8 +968,6 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup( for (int32 i = lane; i < num_blocks; i += subwarp_size) { block_bitmaps[i] = 0; } - // memory barrier - just to be sure - subwarp.sync(); // fill bitmaps with sparsity pattern for (IndexType base_i = 0; base_i < row_len; base_i += subwarp_size) { const auto i = base_i + lane; @@ -982,6 +980,8 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup( segment_scan(subwarp, block, local_bitmap, [](config::lane_mask_type a, config::lane_mask_type b) { return a | b; }); + // memory barrier - just to be sure + subwarp.sync(); if (is_first && i < row_len) { block_bitmaps[block] |= local_bitmap; } From f759a3315d09256ea94ecfd538e6207acbb78d71 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 6 May 2022 15:12:01 +0200 Subject: [PATCH 5/8] review updates * allow lookups of non-existent column indices * store separate offsets for lookup data to reduce memory footprint * simplify tests * fix 
DPC++ execution on float-only devices * clarify comments on cheap modulo replacement Co-authored-by: Yuhsiang M. Tsai Co-authored-by: Marcel Koch --- common/cuda_hip/matrix/csr_kernels.hpp.inc | 72 +++-- common/unified/matrix/csr_kernels.cpp | 44 +++ core/matrix/csr_kernels.hpp | 14 +- core/matrix/csr_lookup.hpp | 323 +++++++++++++++++++++ cuda/matrix/csr_kernels.cu | 2 +- cuda/matrix/fbcsr_kernels.cu | 2 +- cuda/test/matrix/csr_kernels.cpp | 58 ---- dpcpp/matrix/csr_kernels.dp.cpp | 27 +- dpcpp/test/matrix/csr_kernels.cpp | 58 ---- hip/matrix/csr_kernels.hip.cpp | 2 +- hip/matrix/fbcsr_kernels.hip.cpp | 2 +- hip/test/matrix/csr_kernels.hip.cpp | 58 ---- include/ginkgo/core/matrix/csr_lookup.hpp | 186 ------------ include/ginkgo/ginkgo.hpp | 1 - omp/matrix/csr_kernels.cpp | 35 +-- omp/test/matrix/csr_kernels.cpp | 59 +--- reference/matrix/csr_kernels.cpp | 57 +++- reference/test/matrix/csr_kernels.cpp | 194 +++++++++---- test/matrix/csr_kernels.cpp | 146 +++++++++- 19 files changed, 780 insertions(+), 560 deletions(-) create mode 100644 core/matrix/csr_lookup.hpp delete mode 100644 include/ginkgo/core/matrix/csr_lookup.hpp diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index b3296ceeb92..efb0afd46aa 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -736,15 +736,15 @@ __global__ __launch_bounds__(default_block_size) void inv_symm_permute( template __global__ -__launch_bounds__(default_block_size) void compute_submatrix_idxs_and_vals( - const size_type num_rows, const size_type num_cols, - const size_type row_offset, const size_type col_offset, - const IndexType* __restrict__ source_row_ptrs, - const IndexType* __restrict__ source_col_idxs, - const ValueType* __restrict__ source_values, - const IndexType* __restrict__ result_row_ptrs, - IndexType* __restrict__ result_col_idxs, - ValueType* __restrict__ result_values) + __launch_bounds__(default_block_size) void compute_submatrix_idxs_and_vals( + const size_type num_rows, const size_type num_cols, + const size_type row_offset, const size_type col_offset, + const IndexType* __restrict__ source_row_ptrs, + const IndexType* __restrict__ source_col_idxs, + const ValueType* __restrict__ source_values, + const IndexType* __restrict__ result_row_ptrs, + IndexType* __restrict__ result_col_idxs, + ValueType* __restrict__ result_values) { const auto res_row = thread::get_thread_id_flat(); if (res_row < num_rows) { @@ -766,10 +766,11 @@ __launch_bounds__(default_block_size) void compute_submatrix_idxs_and_vals( template __global__ -__launch_bounds__(default_block_size) void calculate_nnz_per_row_in_span( - const span row_span, const span col_span, - const IndexType* __restrict__ row_ptrs, - const IndexType* __restrict__ col_idxs, IndexType* __restrict__ nnz_per_row) + __launch_bounds__(default_block_size) void calculate_nnz_per_row_in_span( + const span row_span, const span col_span, + const IndexType* __restrict__ row_ptrs, + const IndexType* __restrict__ col_idxs, + IndexType* __restrict__ nnz_per_row) { const auto src_row = thread::get_thread_id_flat() + row_span.begin; if (src_row < row_span.end) { @@ -922,10 +923,11 @@ template __global__ __launch_bounds__(default_block_size) void build_csr_lookup( const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ col_idxs, size_type num_rows, - gko::matrix::sparsity_type allowed, int64* __restrict__ row_desc, - int32* __restrict__ storage) + gko::matrix::sparsity_type allowed, const 
IndexType* storage_offsets, + int64* __restrict__ row_desc, int32* __restrict__ storage) { - constexpr int block_size = 32; + constexpr int bitmap_block_size = + gko::matrix::device_sparsity_lookup::block_size; auto row = thread::get_subwarp_id_flat(); if (row >= num_rows) { return; @@ -934,8 +936,8 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup( group::tiled_partition(group::this_thread_block()); const auto row_begin = row_ptrs[row]; const auto row_len = row_ptrs[row + 1] - row_begin; - const auto storage_begin = 2 * row_begin; - const auto available_storage = 2 * row_len; + const auto storage_begin = storage_offsets[row]; + const auto available_storage = storage_offsets[row + 1] - storage_begin; const auto local_storage = storage + storage_begin; const auto local_cols = col_idxs + row_begin; const auto lane = subwarp.thread_rank(); @@ -945,18 +947,17 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup( // full column range if (col_range == row_len && - ((static_cast(allowed) & - static_cast(matrix::sparsity_type::full)) != 0)) { + csr_lookup_allowed(allowed, matrix::sparsity_type::full)) { if (lane == 0) { row_desc[row] = static_cast(matrix::sparsity_type::full); } return; } // dense bitmap storage - const auto num_blocks = static_cast(ceildiv(col_range, block_size)); + const auto num_blocks = + static_cast(ceildiv(col_range, bitmap_block_size)); if (num_blocks * 2 <= available_storage && - ((static_cast(allowed) & - static_cast(matrix::sparsity_type::bitmap)) != 0)) { + csr_lookup_allowed(allowed, matrix::sparsity_type::bitmap)) { if (lane == 0) { row_desc[row] = (static_cast(num_blocks) << 32) | static_cast(matrix::sparsity_type::bitmap); @@ -971,10 +972,12 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup( // fill bitmaps with sparsity pattern for (IndexType base_i = 0; base_i < row_len; base_i += subwarp_size) { const auto i = base_i + lane; - const auto col = i < row_len ? local_cols[i] : INT_MAX; + const auto col = i < row_len + ? local_cols[i] + : device_numeric_limits::max; const auto rel_col = static_cast(col - min_col); - const auto block = rel_col / block_size; - const auto col_in_block = rel_col % block_size; + const auto block = rel_col / bitmap_block_size; + const auto col_in_block = rel_col % bitmap_block_size; auto local_bitmap = uint32{i < row_len ? 1u : 0u} << col_in_block; bool is_first = segment_scan(subwarp, block, local_bitmap, @@ -1003,6 +1006,8 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup( return; } // sparse hashmap storage + // we need at least one unfilled entry to avoid infinite loops on search + GKO_ASSERT(row_len < available_storage); constexpr double inv_golden_ratio = 0.61803398875; // use golden ratio as approximation for hash parameter that spreads // consecutive values as far apart as possible. 
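Note: the hashmap construction above (and its sequential counterparts later in this patch) uses multiplicative hashing with an inverse-golden-ratio parameter followed by linear probing. A minimal host-side sketch of the same scheme, assuming 32-bit column indices; build_row_hashmap is an illustrative name, not part of the patch:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Maps each column of one row to its row-local index via a hashmap
    // that is kept at most half full, so probing always terminates.
    std::vector<std::int32_t> build_row_hashmap(
        const std::vector<std::int32_t>& cols, std::int32_t available_storage)
    {
        assert(static_cast<std::int32_t>(cols.size()) < available_storage);
        constexpr double inv_golden_ratio = 0.61803398875;
        // force the lowest bit so odd buckets remain reachable
        const auto hash_parameter =
            std::uint32_t{1} |
            static_cast<std::uint32_t>(available_storage * inv_golden_ratio);
        std::vector<std::int32_t> hashmap(available_storage, -1);
        for (std::int32_t nz = 0; nz < static_cast<std::int32_t>(cols.size());
             nz++) {
            auto hash =
                (static_cast<std::uint64_t>(cols[nz]) * hash_parameter) %
                static_cast<std::uint64_t>(available_storage);
            // linear probing: find the next empty bucket
            while (hashmap[hash] != -1) {
                hash = (hash + 1) % available_storage;
            }
            hashmap[hash] = nz;  // store the row-local output index
        }
        return hashmap;
    }

The warp-parallel kernel differs only in how insertion collisions are resolved: lanes that hash to the same bucket are detected via match_any and assigned consecutive buckets using a popcount over the lower-lane mask.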
Ensure lowest bit is set @@ -1014,7 +1019,7 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup( static_cast(matrix::sparsity_type::hash); } // fill hashmap with sentinel - constexpr int32 empty = -1; + constexpr int32 empty = invalid_index(); for (int32 i = lane; i < available_storage; i += subwarp_size) { local_storage[i] = empty; } @@ -1047,8 +1052,11 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup( if (hash >= available_storage) { hash -= available_storage; } - // this can only fail for row_length < 16 - assert(hash < available_storage); + // this could only fail for available_storage < warp_size, as + // popcnt(colliding) is at most warp_size. At the same time, we + // only increase hash by row_length at most, so this is still + // safe. + GKO_ASSERT(hash < available_storage); entry = local_storage[hash]; } colliding = subwarp.match_any(hash); @@ -1077,13 +1085,15 @@ template void build_lookup(std::shared_ptr exec, const IndexType* row_ptrs, const IndexType* col_idxs, size_type num_rows, matrix::sparsity_type allowed, - int64* row_desc, int32* storage) + const IndexType* storage_offsets, int64* row_desc, + int32* storage) { const auto num_blocks = ceildiv(num_rows, default_block_size / config::warp_size); kernel::build_csr_lookup <<>>(row_ptrs, col_idxs, num_rows, - allowed, row_desc, storage); + allowed, storage_offsets, row_desc, + storage); } GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); diff --git a/common/unified/matrix/csr_kernels.cpp b/common/unified/matrix/csr_kernels.cpp index d6a81453b86..d861b1a16e7 100644 --- a/common/unified/matrix/csr_kernels.cpp +++ b/common/unified/matrix/csr_kernels.cpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common/unified/base/kernel_launch.hpp" +#include "core/components/prefix_sum_kernels.hpp" namespace gko { @@ -243,6 +244,49 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL); +template +void build_lookup_offsets(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, matrix::sparsity_type allowed, + IndexType* storage_offsets) +{ + using matrix::sparsity_type; + constexpr static int block_size = + gko::matrix::device_sparsity_lookup::block_size; + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto row_ptrs, auto col_idxs, auto num_rows, + auto allowed, auto storage_offsets) { + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto local_cols = col_idxs + row_begin; + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? local_cols[row_len - 1] - min_col + 1 : 0; + if (csr_lookup_allowed(allowed, sparsity_type::full) && + row_len == col_range) { + storage_offsets[row] = 0; + } else { + const auto hashmap_storage = row_len == 0 ? 
1 : 2 * row_len; + const auto bitmap_num_blocks = + static_cast(ceildiv(col_range, block_size)); + const auto bitmap_storage = 2 * bitmap_num_blocks; + if (csr_lookup_allowed(allowed, sparsity_type::bitmap) && + bitmap_storage <= hashmap_storage) { + storage_offsets[row] = bitmap_storage; + } else { + storage_offsets[row] = hashmap_storage; + } + } + }, + num_rows, row_ptrs, col_idxs, num_rows, allowed, storage_offsets); + components::prefix_sum(exec, storage_offsets, num_rows + 1); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_CSR_BUILD_LOOKUP_OFFSETS_KERNEL); + + } // namespace csr } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp index 7b473b915b5..5e910385f20 100644 --- a/core/matrix/csr_kernels.hpp +++ b/core/matrix/csr_kernels.hpp @@ -41,7 +41,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include #include #include #include @@ -51,6 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/kernel_declaration.hpp" +#include "core/matrix/csr_lookup.hpp" namespace gko { @@ -225,11 +225,19 @@ namespace kernels { const matrix::Dense* beta, \ matrix::Csr* mtx) +#define GKO_DECLARE_CSR_BUILD_LOOKUP_OFFSETS_KERNEL(IndexType) \ + void build_lookup_offsets(std::shared_ptr exec, \ + const IndexType* row_ptrs, \ + const IndexType* col_idxs, size_type num_rows, \ + matrix::sparsity_type allowed, \ + IndexType* storage_offsets) + #define GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL(IndexType) \ void build_lookup(std::shared_ptr exec, \ const IndexType* row_ptrs, const IndexType* col_idxs, \ size_type num_rows, matrix::sparsity_type allowed, \ - int64* row_desc, int32* storage) + const IndexType* storage_offsets, int64* row_desc, \ + int32* storage) #define GKO_DECLARE_ALL_AS_TEMPLATES \ @@ -292,6 +300,8 @@ namespace kernels { template \ GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL(ValueType, IndexType); \ template \ + GKO_DECLARE_CSR_BUILD_LOOKUP_OFFSETS_KERNEL(IndexType); \ + template \ GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL(IndexType) diff --git a/core/matrix/csr_lookup.hpp b/core/matrix/csr_lookup.hpp new file mode 100644 index 00000000000..9dc61028065 --- /dev/null +++ b/core/matrix/csr_lookup.hpp @@ -0,0 +1,323 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
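Note: the storage sizing rule implemented by build_lookup_offsets above can be read as follows, with illustrative names and assuming the bitmap block size of 32 used throughout this patch:

    #include <algorithm>
    #include <cstdint>

    // int32 entries needed by one row: 0 if the row is contiguous (full),
    // two entries per 32-column bitmap block (bases + bitmaps), otherwise
    // a twice-oversized hashmap with at least one bucket.
    std::int64_t row_storage_size(std::int64_t row_len, std::int64_t col_range,
                                  bool allow_full, bool allow_bitmap)
    {
        if (allow_full && row_len == col_range) {
            return 0;
        }
        const auto hashmap_storage = std::max<std::int64_t>(2 * row_len, 1);
        const auto num_blocks = (col_range + 31) / 32;  // ceildiv
        const auto bitmap_storage = 2 * num_blocks;
        return allow_bitmap && bitmap_storage <= hashmap_storage
                   ? bitmap_storage
                   : hashmap_storage;
    }

The prefix sum over these per-row sizes turns them into storage_offsets; oversizing the hashmap by 2x keeps the expected probe length short and guarantees the empty bucket that the search uses as sentinel.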
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_MATRIX_CSR_LOOKUP_HPP_ +#define GKO_CORE_MATRIX_CSR_LOOKUP_HPP_ + + +#include + + +#include +#include +#include + + +namespace gko { +namespace matrix { + + +/** + * Type describing which kind of lookup structure is used to find entries in a + * single row of a Csr matrix. + */ +enum class sparsity_type : int { + /** + * The row is dense, i.e. it contains all entries in + * `[min_col, min_col + storage_size)`. + * This means that the relative output index is `col - min_col`. + */ + full = 1, + /** + * The row is sufficiently dense that its sparsity pattern can be stored in + * a dense bitmap consisting of `storage_size` blocks of size `block_size`. + * Each block stores its sparsity pattern as a bitmask and the number of + * columns before the block as integer. This means that the relative output + * index can be computed as + * ``` + * auto block = (col - min_col) / block_size; + * auto local_col = (col - min_col) % block_size; + * auto prefix_mask = (block_type{1} << local_col) - 1; + * auto output_idx = base[block] + popcount(bitmap[block] & prefix_mask); + * ``` + */ + bitmap = 2, + /** + * The row is sparse, so it is best represented using a hashtable. + * The hashtable has size `storage_size` and stores the relative output + * index directly, i.e. + * ``` + * auto hash_key = col; + * auto hash_bucket = hash(hash_key); + * while (local_cols[hashtable[hash_bucket]] != col) { + * hash_bucket = (hash_bucket + 1) % storage_size; // linear probing + * } + * auto output_idx = hashtable[hash_bucket]; + * ``` + */ + hash = 4, +}; + + +GKO_ATTRIBUTES GKO_INLINE sparsity_type operator|(sparsity_type a, + sparsity_type b) +{ + return static_cast(static_cast(a) | + static_cast(b)); +} + + +GKO_ATTRIBUTES GKO_INLINE bool csr_lookup_allowed(matrix::sparsity_type allowed, + matrix::sparsity_type type) +{ + return ((static_cast(allowed) & static_cast(type)) != 0); +} + + +template +struct device_sparsity_lookup { + /** Number of bits in a block_type entry. 
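Note: a concrete instance of the bitmap lookup formula documented above, runnable on its own (C++20 for std::popcount; the constants are example values only):

    #include <bit>
    #include <cstdint>
    #include <iostream>

    int main()
    {
        // one bitmap block storing relative columns {1, 4, 5},
        // with base[0] = 0 entries in preceding blocks
        const std::uint32_t bitmap = 0b110010;
        const std::int32_t base = 0;
        const std::int32_t local_col = 5;
        const auto prefix_mask = (std::uint32_t{1} << local_col) - 1;
        // columns 1 and 4 precede column 5, so the output index is 2
        std::cout << base + std::popcount(bitmap & prefix_mask) << '\n';
    }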
*/ + static constexpr int block_size = 32; + + /** + * Set up a device_sparsity_lookup structure from local data + * + * @param local_cols the column array slice for the local row, it contains + * row_nnz entries + * @param row_nnz the number of entries in this row + * @param local_storage the local lookup structure storage array slice, it + * needs to be set up using the storage_offsets array + * @param storage_size the number of int32 entries in the local lookup + * structure + * @param desc the lookup structure descriptor for this row + */ + GKO_ATTRIBUTES GKO_INLINE device_sparsity_lookup( + const IndexType* local_cols, IndexType row_nnz, + const int32* local_storage, IndexType storage_size, int64 desc) + : local_cols{local_cols}, + row_nnz{row_nnz}, + local_storage{local_storage}, + storage_size{storage_size}, + desc{desc} + {} + + /** + * Set up a device_sparsity_lookup structure from global data + * + * @param row_ptrs the CSR row pointers + * @param col_idxs the CSR column indices + * @param storage_offsets the storage offset array for the lookup structure + * @param storage the storage array for the lookup data structure + * @param descs the lookup structure descriptor array + * @param row the row index to build the lookup structure for + */ + GKO_ATTRIBUTES GKO_INLINE device_sparsity_lookup( + const IndexType* row_ptrs, const IndexType* col_idxs, + const IndexType* storage_offsets, const int32* storage, + const int64* descs, size_type row) + { + const auto row_begin = row_ptrs[row]; + const auto row_end = row_ptrs[row + 1]; + const auto storage_begin = storage_offsets[row]; + const auto storage_end = storage_offsets[row + 1]; + local_cols = col_idxs + row_begin; + row_nnz = row_end - row_begin; + local_storage = storage + storage_begin; + storage_size = storage_end - storage_begin; + desc = descs[row]; + } + + /** + * Returns the row-local index of the entry with the given column index, if + * it is present. + * + * @param col the column index + * @return the row-local index of the given entry, or + * invalid_index() if it is not present. + */ + GKO_ATTRIBUTES GKO_INLINE IndexType operator[](IndexType col) const + { + switch (static_cast(desc & 0xF)) { + case sparsity_type::full: + return lookup_full(col); + case sparsity_type::bitmap: + return lookup_bitmap(col); + case sparsity_type::hash: + return lookup_hash(col); + } + GKO_ASSERT(false); + } + + /** + * Returns the row-local index of the entry with the given column index, + * assuming it is present. + * + * @param col the column index + * @return the row-local index of the given entry. + * @note the function fails with an assertion (debug builds) or can crash if + * the column index is not present in the row! 
+ */ + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_unsafe(IndexType col) const + { + IndexType result{}; + switch (static_cast(desc & 0xF)) { + case sparsity_type::full: + result = lookup_full_unsafe(col); + break; + case sparsity_type::bitmap: + result = lookup_bitmap_unsafe(col); + break; + case sparsity_type::hash: + result = lookup_hash_unsafe(col); + break; + default: + GKO_ASSERT(false); + } + GKO_ASSERT(local_cols[result] == col); + return result; + } + +private: + using unsigned_index_type = typename std::make_unsigned::type; + + const IndexType* local_cols; + IndexType row_nnz; + const int32* local_storage; + IndexType storage_size; + int64 desc; + + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_full_unsafe(IndexType col) const + { + const auto min_col = local_cols[0]; + const auto out_idx = col - min_col; + GKO_ASSERT(out_idx >= 0 && out_idx < row_nnz); + return out_idx; + } + + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_full(IndexType col) const + { + const auto min_col = local_cols[0]; + const auto out_idx = col - min_col; + return out_idx >= 0 && out_idx < row_nnz ? out_idx + : invalid_index(); + } + + GKO_ATTRIBUTES GKO_INLINE IndexType + lookup_bitmap_unsafe(IndexType col) const + { + const auto min_col = local_cols[0]; + const auto num_blocks = static_cast(desc >> 32); + const auto block_bases = local_storage; + const auto block_bitmaps = + reinterpret_cast(block_bases + num_blocks); + const auto rel_col = col - min_col; + const auto block = rel_col / block_size; + const auto col_in_block = rel_col % block_size; + const auto prefix_mask = (uint32{1} << col_in_block) - 1; + GKO_ASSERT(rel_col >= 0); + GKO_ASSERT(block < num_blocks); + GKO_ASSERT(block_bitmaps[block] & (uint32{1} << col_in_block)); + const auto out_idx = + block_bases[block] + + gko::detail::popcount(block_bitmaps[block] & prefix_mask); + GKO_ASSERT(local_cols[out_idx] == col); + return out_idx; + } + + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_bitmap(IndexType col) const + { + const auto min_col = local_cols[0]; + const auto num_blocks = static_cast(desc >> 32); + const auto block_bases = local_storage; + const auto block_bitmaps = + reinterpret_cast(block_bases + num_blocks); + const auto rel_col = col - min_col; + const auto block = rel_col / block_size; + const auto col_in_block = rel_col % block_size; + if (rel_col < 0 || block >= num_blocks || + !(block_bitmaps[block] & (uint32{1} << col_in_block))) { + return invalid_index(); + } + const auto prefix_mask = (uint32{1} << col_in_block) - 1; + return block_bases[block] + + gko::detail::popcount(block_bitmaps[block] & prefix_mask); + } + + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_hash_unsafe(IndexType col) const + { + const auto hashmap_size = static_cast(storage_size); + const auto hash_param = static_cast(desc >> 32); + const auto hashmap = local_storage; + auto hash = + (static_cast(col) * hash_param) % hashmap_size; + GKO_ASSERT(hashmap[hash] >= 0); + GKO_ASSERT(hashmap[hash] < row_nnz); + while (local_cols[hashmap[hash]] != col) { + hash++; + if (hash >= hashmap_size) { + hash = 0; + } + GKO_ASSERT(hashmap[hash] >= 0); + GKO_ASSERT(hashmap[hash] < row_nnz); + } + const auto out_idx = hashmap[hash]; + return out_idx; + } + + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_hash(IndexType col) const + { + const auto hashmap_size = static_cast(storage_size); + const auto hash_param = static_cast(desc >> 32); + const auto hashmap = local_storage; + auto hash = + (static_cast(col) * hash_param) % hashmap_size; + GKO_ASSERT(hashmap[hash] < row_nnz); + auto out_idx 
= hashmap[hash]; + // linear probing with invalid_index sentinel to avoid infinite loop + while (out_idx >= 0 && local_cols[out_idx] != col) { + hash++; + if (hash >= hashmap_size) { + hash = 0; + } + out_idx = hashmap[hash]; + GKO_ASSERT(hashmap[hash] < row_nnz); + } + // out_idx is either correct or invalid_index, the hashmap sentinel + return out_idx; + } +}; + + +} // namespace matrix +} // namespace gko + +#endif // GKO_CORE_MATRIX_CSR_LOOKUP_HPP_ diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index c4da4687edd..9bdb7e53c2a 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -49,7 +49,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include #include #include #include @@ -60,6 +59,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/format_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_lookup.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/config.hpp" diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu index fb7830fbeba..dfd352d4996 100644 --- a/cuda/matrix/fbcsr_kernels.cu +++ b/cuda/matrix/fbcsr_kernels.cu @@ -49,7 +49,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include #include @@ -58,6 +57,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/device_matrix_data_kernels.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" +#include "core/matrix/csr_lookup.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/config.hpp" diff --git a/cuda/test/matrix/csr_kernels.cpp b/cuda/test/matrix/csr_kernels.cpp index ab591555815..483c678d7fb 100644 --- a/cuda/test/matrix/csr_kernels.cpp +++ b/cuda/test/matrix/csr_kernels.cpp @@ -1012,62 +1012,4 @@ TEST_F(Csr, AddScaledIdentityToNonSquare) } -TEST_F(Csr, BuildLookupDataWorks) -{ - set_up_apply_data(std::make_shared()); - using gko::matrix::sparsity_type; - gko::Array row_descs(ref, mtx->get_size()[0]); - gko::Array lookup_info(ref, mtx->get_num_stored_elements() * 2); - gko::Array drow_descs(cuda, mtx->get_size()[0]); - gko::Array dlookup_info(cuda, - mtx->get_num_stored_elements() * 2); - for (auto allowed_methods : - {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, - sparsity_type::bitmap | sparsity_type::hash, - sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { - const auto full_allowed = - static_cast(static_cast(allowed_methods) & - static_cast(sparsity_type::full)); - const auto bitmap_allowed = - static_cast(static_cast(allowed_methods) & - static_cast(sparsity_type::bitmap)); - const auto bitmap_equivalent = - bitmap_allowed ? sparsity_type::bitmap : sparsity_type::hash; - const auto full_equivalent = - full_allowed ? 
sparsity_type::full : bitmap_equivalent; - SCOPED_TRACE("full: " + std::to_string(full_allowed) + - " bitmap: " + std::to_string(bitmap_allowed)); - - gko::kernels::reference::csr::build_lookup( - ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), - mtx->get_size()[0], allowed_methods, row_descs.get_data(), - lookup_info.get_data()); - gko::kernels::cuda::csr::build_lookup( - cuda, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), - dmtx->get_size()[0], allowed_methods, drow_descs.get_data(), - dlookup_info.get_data()); - - gko::Array host_row_descs(ref, drow_descs); - gko::Array host_lookup_info(ref, dlookup_info); - for (int row = 0; row < dmtx->get_size()[0]; row++) { - SCOPED_TRACE("row: " + std::to_string(row)); - const auto row_begin = mtx->get_const_row_ptrs()[row]; - const auto row_nnz = mtx->get_const_row_ptrs()[row + 1] - row_begin; - gko::matrix::device_sparsity_lookup lookup{ - mtx->get_const_col_idxs() + row_begin, row_nnz, - host_lookup_info.get_const_data() + (row_begin * 2), - host_row_descs.get_const_data()[row]}; - - ASSERT_EQ(host_row_descs.get_const_data()[row] & 0xFFFF, - row_descs.get_const_data()[row] & 0xFFFF); - for (int nz = 0; nz < row_nnz; nz++) { - SCOPED_TRACE("nz: " + std::to_string(nz)); - const auto col = mtx->get_const_col_idxs()[nz + row_begin]; - ASSERT_EQ(nz, lookup[col]); - } - } - } -} - - } // namespace diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 5a79765626e..30d29bcd002 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -2270,13 +2270,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); -bool csr_lookup_allowed(matrix::sparsity_type allowed, - matrix::sparsity_type type) -{ - return ((static_cast(allowed) & static_cast(type)) != 0); -} - - template bool csr_lookup_try_full(IndexType row_len, IndexType col_range, matrix::sparsity_type allowed, int64& row_desc) @@ -2301,7 +2294,7 @@ bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, gko::matrix::device_sparsity_lookup::block_size; using matrix::sparsity_type; bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::bitmap); - const auto num_blocks = ceildiv(col_range, block_size); + const auto num_blocks = static_cast(ceildiv(col_range, block_size)); if (is_allowed && num_blocks * 2 <= available_storage) { row_desc = (static_cast(num_blocks) << 32) | static_cast(sparsity_type::bitmap); @@ -2331,7 +2324,13 @@ void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, int64& row_desc, int32* local_storage, const IndexType* cols) { + // we need at least one unfilled entry to avoid infinite loops on search + GKO_ASSERT(row_len < available_storage); +#if GINKGO_DPCPP_SINGLE_MODE + constexpr float inv_golden_ratio = 0.61803398875f; +#else constexpr double inv_golden_ratio = 0.61803398875; +#endif // use golden ratio as approximation for hash parameter that spreads // consecutive values as far apart as possible. 
Ensure lowest bit is set // otherwise we skip odd hashtable entries @@ -2339,12 +2338,12 @@ void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, 1u | static_cast(available_storage * inv_golden_ratio); row_desc = (static_cast(hash_parameter) << 32) | static_cast(matrix::sparsity_type::hash); - std::fill_n(local_storage, available_storage, -1); + std::fill_n(local_storage, available_storage, invalid_index()); for (int32 nz = 0; nz < row_len; nz++) { auto hash = (static_cast(cols[nz]) * hash_parameter) % static_cast(available_storage); // linear probing: find the next empty entry - while (local_storage[hash] != -1) { + while (local_storage[hash] != invalid_index()) { hash++; if (hash >= available_storage) { hash = 0; @@ -2359,15 +2358,17 @@ template void build_lookup(std::shared_ptr exec, const IndexType* row_ptrs, const IndexType* col_idxs, size_type num_rows, matrix::sparsity_type allowed, - int64* row_desc, int32* storage) + const IndexType* storage_offsets, int64* row_desc, + int32* storage) { exec->get_queue()->submit([&](sycl::handler& cgh) { cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) { const auto row = static_cast(idx[0]); const auto row_begin = row_ptrs[row]; const auto row_len = row_ptrs[row + 1] - row_begin; - const auto storage_begin = 2 * row_begin; - const auto available_storage = 2 * row_len; + const auto storage_begin = storage_offsets[row]; + const auto available_storage = + storage_offsets[row + 1] - storage_begin; const auto local_storage = storage + storage_begin; const auto local_cols = col_idxs + row_begin; const auto min_col = row_len > 0 ? local_cols[0] : 0; diff --git a/dpcpp/test/matrix/csr_kernels.cpp b/dpcpp/test/matrix/csr_kernels.cpp index 3de309b336e..375f7d93a93 100644 --- a/dpcpp/test/matrix/csr_kernels.cpp +++ b/dpcpp/test/matrix/csr_kernels.cpp @@ -964,62 +964,4 @@ TEST_F(Csr, CreateSubMatrixIsEquivalentToRef) } -TEST_F(Csr, BuildLookupDataWorks) -{ - set_up_apply_data(std::make_shared()); - using gko::matrix::sparsity_type; - gko::Array row_descs(ref, mtx->get_size()[0]); - gko::Array lookup_info(ref, mtx->get_num_stored_elements() * 2); - gko::Array drow_descs(dpcpp, mtx->get_size()[0]); - gko::Array dlookup_info(dpcpp, - mtx->get_num_stored_elements() * 2); - for (auto allowed_methods : - {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, - sparsity_type::bitmap | sparsity_type::hash, - sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { - const auto full_allowed = - static_cast(static_cast(allowed_methods) & - static_cast(sparsity_type::full)); - const auto bitmap_allowed = - static_cast(static_cast(allowed_methods) & - static_cast(sparsity_type::bitmap)); - const auto bitmap_equivalent = - bitmap_allowed ? sparsity_type::bitmap : sparsity_type::hash; - const auto full_equivalent = - full_allowed ? 
sparsity_type::full : bitmap_equivalent; - SCOPED_TRACE("full: " + std::to_string(full_allowed) + - " bitmap: " + std::to_string(bitmap_allowed)); - - gko::kernels::reference::csr::build_lookup( - ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), - mtx->get_size()[0], allowed_methods, row_descs.get_data(), - lookup_info.get_data()); - gko::kernels::dpcpp::csr::build_lookup( - dpcpp, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), - dmtx->get_size()[0], allowed_methods, drow_descs.get_data(), - dlookup_info.get_data()); - - gko::Array host_row_descs(ref, drow_descs); - gko::Array host_lookup_info(ref, dlookup_info); - for (int row = 0; row < dmtx->get_size()[0]; row++) { - SCOPED_TRACE("row: " + std::to_string(row)); - const auto row_begin = mtx->get_const_row_ptrs()[row]; - const auto row_nnz = mtx->get_const_row_ptrs()[row + 1] - row_begin; - gko::matrix::device_sparsity_lookup lookup{ - mtx->get_const_col_idxs() + row_begin, row_nnz, - host_lookup_info.get_const_data() + (row_begin * 2), - host_row_descs.get_const_data()[row]}; - - ASSERT_EQ(host_row_descs.get_const_data()[row] & 0xFFFF, - row_descs.get_const_data()[row] & 0xFFFF); - for (int nz = 0; nz < row_nnz; nz++) { - SCOPED_TRACE("nz: " + std::to_string(nz)); - const auto col = mtx->get_const_col_idxs()[nz + row_begin]; - ASSERT_EQ(nz, lookup[col]); - } - } - } -} - - } // namespace diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp index 9f5ed7143c9..9025d8c493b 100644 --- a/hip/matrix/csr_kernels.hip.cpp +++ b/hip/matrix/csr_kernels.hip.cpp @@ -50,7 +50,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include #include #include #include @@ -61,6 +60,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/format_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_lookup.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/config.hip.hpp" diff --git a/hip/matrix/fbcsr_kernels.hip.cpp b/hip/matrix/fbcsr_kernels.hip.cpp index df65fed3277..8fbf9a05ac9 100644 --- a/hip/matrix/fbcsr_kernels.hip.cpp +++ b/hip/matrix/fbcsr_kernels.hip.cpp @@ -50,7 +50,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include #include @@ -59,6 +58,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
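Note: the backend-specific BuildLookupDataWorks tests deleted here are superseded by the common test added at the end of this patch. For reference, a host-side sketch of the resulting lookup API on the reference executor, written against the signatures introduced in this patch (not a complete program):

    #include <ginkgo/ginkgo.hpp>

    #include "core/matrix/csr_kernels.hpp"
    #include "core/matrix/csr_lookup.hpp"

    // Returns the row-local index of (row, col), or invalid_index<IndexType>()
    // if the entry is not stored.
    template <typename IndexType>
    IndexType lookup_entry(std::shared_ptr<const gko::ReferenceExecutor> exec,
                           const IndexType* row_ptrs, const IndexType* col_idxs,
                           gko::size_type num_rows, gko::size_type row,
                           IndexType col)
    {
        const auto allowed = gko::matrix::sparsity_type::full |
                             gko::matrix::sparsity_type::bitmap |
                             gko::matrix::sparsity_type::hash;
        gko::array<IndexType> offsets{exec, num_rows + 1};
        gko::kernels::reference::csr::build_lookup_offsets(
            exec, row_ptrs, col_idxs, num_rows, allowed, offsets.get_data());
        gko::array<gko::int64> descs{exec, num_rows};
        gko::array<gko::int32> storage{
            exec, static_cast<gko::size_type>(offsets.get_data()[num_rows])};
        gko::kernels::reference::csr::build_lookup(
            exec, row_ptrs, col_idxs, num_rows, allowed,
            offsets.get_const_data(), descs.get_data(), storage.get_data());
        gko::matrix::device_sparsity_lookup<IndexType> lookup{
            row_ptrs,                 col_idxs,
            offsets.get_const_data(), storage.get_const_data(),
            descs.get_const_data(),   row};
        return lookup[col];
    }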
#include "core/base/device_matrix_data_kernels.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" +#include "core/matrix/csr_lookup.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/config.hip.hpp" diff --git a/hip/test/matrix/csr_kernels.hip.cpp b/hip/test/matrix/csr_kernels.hip.cpp index 670fb28fc94..bb6567f97d3 100644 --- a/hip/test/matrix/csr_kernels.hip.cpp +++ b/hip/test/matrix/csr_kernels.hip.cpp @@ -1022,62 +1022,4 @@ TEST_F(Csr, AddScaledIdentityToNonSquare) } -TEST_F(Csr, BuildLookupDataWorks) -{ - set_up_apply_data(std::make_shared()); - using gko::matrix::sparsity_type; - gko::Array row_descs(ref, mtx->get_size()[0]); - gko::Array lookup_info(ref, mtx->get_num_stored_elements() * 2); - gko::Array drow_descs(hip, mtx->get_size()[0]); - gko::Array dlookup_info(hip, - mtx->get_num_stored_elements() * 2); - for (auto allowed_methods : - {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, - sparsity_type::bitmap | sparsity_type::hash, - sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { - const auto full_allowed = - static_cast(static_cast(allowed_methods) & - static_cast(sparsity_type::full)); - const auto bitmap_allowed = - static_cast(static_cast(allowed_methods) & - static_cast(sparsity_type::bitmap)); - const auto bitmap_equivalent = - bitmap_allowed ? sparsity_type::bitmap : sparsity_type::hash; - const auto full_equivalent = - full_allowed ? sparsity_type::full : bitmap_equivalent; - SCOPED_TRACE("full: " + std::to_string(full_allowed) + - " bitmap: " + std::to_string(bitmap_allowed)); - - gko::kernels::reference::csr::build_lookup( - ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), - mtx->get_size()[0], allowed_methods, row_descs.get_data(), - lookup_info.get_data()); - gko::kernels::hip::csr::build_lookup( - hip, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), - dmtx->get_size()[0], allowed_methods, drow_descs.get_data(), - dlookup_info.get_data()); - - gko::Array host_row_descs(ref, drow_descs); - gko::Array host_lookup_info(ref, dlookup_info); - for (int row = 0; row < dmtx->get_size()[0]; row++) { - SCOPED_TRACE("row: " + std::to_string(row)); - const auto row_begin = mtx->get_const_row_ptrs()[row]; - const auto row_nnz = mtx->get_const_row_ptrs()[row + 1] - row_begin; - gko::matrix::device_sparsity_lookup lookup{ - mtx->get_const_col_idxs() + row_begin, row_nnz, - host_lookup_info.get_const_data() + (row_begin * 2), - host_row_descs.get_const_data()[row]}; - - ASSERT_EQ(host_row_descs.get_const_data()[row] & 0xFFFF, - row_descs.get_const_data()[row] & 0xFFFF); - for (int nz = 0; nz < row_nnz; nz++) { - SCOPED_TRACE("nz: " + std::to_string(nz)); - const auto col = mtx->get_const_col_idxs()[nz + row_begin]; - ASSERT_EQ(nz, lookup[col]); - } - } - } -} - - } // namespace diff --git a/include/ginkgo/core/matrix/csr_lookup.hpp b/include/ginkgo/core/matrix/csr_lookup.hpp deleted file mode 100644 index b6b06be3bc7..00000000000 --- a/include/ginkgo/core/matrix/csr_lookup.hpp +++ /dev/null @@ -1,186 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. 
Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*************************************************************/ - -#ifndef GKO_PUBLIC_CORE_MATRIX_CSR_LOOKUP_HPP_ -#define GKO_PUBLIC_CORE_MATRIX_CSR_LOOKUP_HPP_ - - -#include - - -#include -#include -#include - - -namespace gko { -namespace matrix { - - -/** - * Type describing which kind of lookup structure is used to find entries in a - * single row of a Csr matrix. - */ -enum class sparsity_type : int { - /** - * The row is dense, i.e. it contains all entries in - * `[min_col, min_col + storage_size)`. - * This means that the relative output index is `col - min_col`. - */ - full = 1, - /** - * The row is sufficiently dense that its sparsity pattern can be stored in - * a dense bitmap consisting of `storage_size` blocks of size `block_size`, - * with a total of `storage_size` blocks. Each block stores its sparsity - * pattern and the number of columns before. This means that the relative - * output index can be computed as - * ``` - * auto block = (col - min_col) / block_size; - * auto local_col = (col - min_col) % block_size; - * auto prefix_mask = (block_type{1} << local_col) - 1; - * auto output_idx = base[block] + popcount(bitmap[block] & prefix_mask); - * ``` - */ - bitmap = 2, - /** - * The row is sparse, so it is best represented using a hashtable. - * The hashtable has size `storage_size` and stores the relative output - * index directly, i.e. - * ``` - * auto hash_key = col - min_col; - * auto hash_bucket = hash(hash_key); - * while (col_idxs[hashtable[hash_bucket]] != col) { - * hash_bucket = (hash_bucket + 1) % storage_size; // linear probing - * } - * auto output_idx = hashtable[hash_bucket]; - * ``` - */ - hash = 4, -}; - - -inline sparsity_type operator|(sparsity_type a, sparsity_type b) -{ - return static_cast(static_cast(a) | - static_cast(b)); -} - - -template -class csr_sparsity_lookup { - const IndexType* row_ptrs; - const IndexType* col_idxs; - Array row_desc; - Array storage; -}; - - -template -struct device_sparsity_lookup { - /** Number of bits in a block_type entry. 
*/ - static constexpr int block_size = 32; - - const IndexType* col_idxs; - const IndexType row_nnz; - const int32* storage; - const int64 desc; - - GKO_ATTRIBUTES GKO_INLINE IndexType operator[](IndexType col) const - { - switch (static_cast(desc & 0xF)) { - case sparsity_type::full: - return lookup_full(col); - case sparsity_type::bitmap: - return lookup_bitmap(col); - case sparsity_type::hash: - return lookup_hash(col); - } - assert(false); - } - - GKO_ATTRIBUTES GKO_INLINE IndexType lookup_full(IndexType col) const - { - const auto min_col = col_idxs[0]; - const auto out_idx = col - min_col; - assert(out_idx < row_nnz); - assert(col_idxs[out_idx] == col); - return out_idx; - } - - GKO_ATTRIBUTES GKO_INLINE IndexType lookup_bitmap(IndexType col) const - { - const auto min_col = col_idxs[0]; - const auto num_blocks = static_cast(desc >> 32); - const auto block_bases = storage; - const auto block_bitmaps = - reinterpret_cast(block_bases + num_blocks); - const auto rel_col = col - min_col; - const auto block = rel_col / block_size; - const auto col_in_block = rel_col % block_size; - const auto prefix_mask = (uint32{1} << col_in_block) - 1; - assert(block < num_blocks); - assert(block_bitmaps[block] & (uint32{1} << col_in_block)); - const auto out_idx = - block_bases[block] + - gko::detail::popcount(block_bitmaps[block] & prefix_mask); - assert(col_idxs[out_idx] == col); - return out_idx; - } - - GKO_ATTRIBUTES GKO_INLINE IndexType lookup_hash(IndexType col) const - { - const auto hashmap_size = static_cast(2 * row_nnz); - const auto hash_param = static_cast(desc >> 32); - const auto hashmap = storage; - auto hash = (static_cast(col) * hash_param) % hashmap_size; - assert(hashmap[hash] >= 0); - assert(hashmap[hash] < row_nnz); - // linear probing with sentinel to avoid infinite loop - while (col_idxs[hashmap[hash]] != col) { - hash++; - if (hash >= hashmap_size) { - hash = 0; - } - assert(hashmap[hash] >= 0); - assert(hashmap[hash] < row_nnz); - } - const auto out_idx = hashmap[hash]; - assert(col_idxs[out_idx] == col); - return out_idx; - } -}; - - -} // namespace matrix -} // namespace gko - -#endif // GKO_PUBLIC_CORE_MATRIX_CSR_LOOKUP_HPP_ diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp index 9794b3d4b42..453bfae6e3a 100644 --- a/include/ginkgo/ginkgo.hpp +++ b/include/ginkgo/ginkgo.hpp @@ -87,7 +87,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
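Note: the removed hash lookup above is only safe for columns known to be present in the row, while the replacement in core/matrix/csr_lookup.hpp terminates for absent columns by treating an empty bucket as invalid_index sentinel. The probe loop in isolation (hash_lookup is an illustrative name):

    #include <cstdint>
    #include <vector>

    // Returns the row-local index of col, or -1 (the empty-bucket sentinel)
    // if col is absent; terminates because the hashmap always keeps at
    // least one empty bucket.
    std::int32_t hash_lookup(const std::vector<std::int32_t>& cols,
                             const std::vector<std::int32_t>& hashmap,
                             std::uint32_t hash_parameter, std::int32_t col)
    {
        const auto size = hashmap.size();
        auto hash = (static_cast<std::uint64_t>(col) * hash_parameter) % size;
        auto out_idx = hashmap[hash];
        while (out_idx >= 0 && cols[out_idx] != col) {
            hash = (hash + 1) % size;
            out_idx = hashmap[hash];
        }
        return out_idx;
    }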
#include #include -#include #include #include #include diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp index 5d3b6be211b..2a8cfa5bec0 100644 --- a/omp/matrix/csr_kernels.cpp +++ b/omp/matrix/csr_kernels.cpp @@ -1129,13 +1129,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); -bool csr_lookup_allowed(matrix::sparsity_type allowed, - matrix::sparsity_type type) -{ - return ((static_cast(allowed) & static_cast(type)) != 0); -} - - template bool csr_lookup_try_full(IndexType row_len, IndexType col_range, matrix::sparsity_type allowed, int64& row_desc) @@ -1150,9 +1143,6 @@ bool csr_lookup_try_full(IndexType row_len, IndexType col_range, } -constexpr static int csr_lookup_block_size = 32; - - template bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, IndexType min_col, IndexType available_storage, @@ -1161,7 +1151,9 @@ bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, { using matrix::sparsity_type; bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::bitmap); - const auto num_blocks = ceildiv(col_range, csr_lookup_block_size); + constexpr auto bitmap_block_size = + gko::matrix::device_sparsity_lookup::block_size; + const auto num_blocks = ceildiv(col_range, bitmap_block_size); if (is_allowed && num_blocks * 2 <= available_storage) { row_desc = (static_cast(num_blocks) << 32) | static_cast(sparsity_type::bitmap); @@ -1171,8 +1163,8 @@ bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, std::fill_n(block_bitmaps, num_blocks, 0); for (auto col_it = cols; col_it < cols + row_len; col_it++) { const auto rel_col = *col_it - min_col; - const auto block = rel_col / csr_lookup_block_size; - const auto col_in_block = rel_col % csr_lookup_block_size; + const auto block = rel_col / bitmap_block_size; + const auto col_in_block = rel_col % bitmap_block_size; block_bitmaps[block] |= uint32{1} << col_in_block; } int32 partial_sum{}; @@ -1191,6 +1183,8 @@ void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, int64& row_desc, int32* local_storage, const IndexType* cols) { + // we need at least one unfilled entry to avoid infinite loops on search + GKO_ASSERT(row_len < available_storage); constexpr double inv_golden_ratio = 0.61803398875; // use golden ratio as approximation for hash parameter that spreads // consecutive values as far apart as possible. 
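Note: csr_lookup_try_bitmap above lays out the bitmap storage as num_blocks block bases followed by num_blocks bitmaps, where each base counts the nonzeros of all preceding blocks. A sequential sketch of that layout, assuming sorted, non-empty column indices (build_row_bitmap is an illustrative name; C++20 for std::popcount):

    #include <bit>
    #include <cstdint>
    #include <vector>

    void build_row_bitmap(const std::vector<std::int32_t>& cols,
                          std::vector<std::int32_t>& block_bases,
                          std::vector<std::uint32_t>& block_bitmaps)
    {
        const auto min_col = cols.front();
        const auto num_blocks = (cols.back() - min_col) / 32 + 1;
        block_bases.assign(num_blocks, 0);
        block_bitmaps.assign(num_blocks, 0);
        // mark every column in its 32-wide block
        for (const auto col : cols) {
            const auto rel_col = col - min_col;
            block_bitmaps[rel_col / 32] |= std::uint32_t{1} << (rel_col % 32);
        }
        // exclusive prefix sum over the block population counts
        std::int32_t partial_sum{};
        for (std::int32_t block = 0; block < num_blocks; block++) {
            block_bases[block] = partial_sum;
            partial_sum += std::popcount(block_bitmaps[block]);
        }
    }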
Ensure lowest bit is set @@ -1199,12 +1193,14 @@ void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, 1u | static_cast(available_storage * inv_golden_ratio); row_desc = (static_cast(hash_parameter) << 32) | static_cast(matrix::sparsity_type::hash); - std::fill_n(local_storage, available_storage, -1); + std::fill_n(local_storage, available_storage, invalid_index()); for (int32 nz = 0; nz < row_len; nz++) { - auto hash = (static_cast(cols[nz]) * hash_parameter) % + auto hash = (static_cast::type>( + cols[nz]) * + hash_parameter) % static_cast(available_storage); // linear probing: find the next empty entry - while (local_storage[hash] != -1) { + while (local_storage[hash] != invalid_index()) { hash++; if (hash >= available_storage) { hash = 0; @@ -1219,14 +1215,15 @@ template void build_lookup(std::shared_ptr exec, const IndexType* row_ptrs, const IndexType* col_idxs, size_type num_rows, matrix::sparsity_type allowed, - int64* row_desc, int32* storage) + const IndexType* storage_offsets, int64* row_desc, + int32* storage) { #pragma omp parallel for for (size_type row = 0; row < num_rows; row++) { const auto row_begin = row_ptrs[row]; const auto row_len = row_ptrs[row + 1] - row_begin; - const auto storage_begin = 2 * row_begin; - const auto available_storage = 2 * row_len; + const auto storage_begin = storage_offsets[row]; + const auto available_storage = storage_offsets[row + 1] - storage_begin; const auto local_storage = storage + storage_begin; const auto local_cols = col_idxs + row_begin; const auto min_col = row_len > 0 ? local_cols[0] : 0; diff --git a/omp/test/matrix/csr_kernels.cpp b/omp/test/matrix/csr_kernels.cpp index 2a00fbeb220..521a923e55d 100644 --- a/omp/test/matrix/csr_kernels.cpp +++ b/omp/test/matrix/csr_kernels.cpp @@ -45,7 +45,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include #include #include #include @@ -54,6 +53,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_kernels.hpp" +#include "core/matrix/csr_lookup.hpp" #include "core/test/utils.hpp" #include "core/test/utils/unsort_matrix.hpp" #include "core/utils/matrix_utils.hpp" @@ -903,61 +903,4 @@ TEST_F(Csr, CreateSubMatrixFromindex_setIsEquivalentToRef) } -TEST_F(Csr, BuildLookupDataWorks) -{ - set_up_apply_data(); - using gko::matrix::sparsity_type; - gko::Array row_descs(ref, mtx->get_size()[0]); - gko::Array lookup_info(ref, mtx->get_num_stored_elements() * 2); - gko::Array drow_descs(omp, mtx->get_size()[0]); - gko::Array dlookup_info(omp, - mtx->get_num_stored_elements() * 2); - for (auto allowed_methods : - {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, - sparsity_type::bitmap | sparsity_type::hash, - sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { - const auto full_allowed = - static_cast(static_cast(allowed_methods) & - static_cast(sparsity_type::full)); - const auto bitmap_allowed = - static_cast(static_cast(allowed_methods) & - static_cast(sparsity_type::bitmap)); - const auto bitmap_equivalent = - bitmap_allowed ? sparsity_type::bitmap : sparsity_type::hash; - const auto full_equivalent = - full_allowed ? 
sparsity_type::full : bitmap_equivalent; - SCOPED_TRACE("full: " + std::to_string(full_allowed) + - " bitmap: " + std::to_string(bitmap_allowed)); - - gko::kernels::reference::csr::build_lookup( - ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), - mtx->get_size()[0], allowed_methods, row_descs.get_data(), - lookup_info.get_data()); - gko::kernels::omp::csr::build_lookup( - omp, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), - dmtx->get_size()[0], allowed_methods, drow_descs.get_data(), - dlookup_info.get_data()); - - for (int row = 0; row < dmtx->get_size()[0]; row++) { - SCOPED_TRACE("row: " + std::to_string(row)); - const auto row_begin = mtx->get_const_row_ptrs()[row]; - const auto row_nnz = mtx->get_const_row_ptrs()[row + 1] - row_begin; - gko::matrix::device_sparsity_lookup lookup{ - mtx->get_const_col_idxs() + row_begin, row_nnz, - dlookup_info.get_const_data() + (row_begin * 2), - drow_descs.get_const_data()[row]}; - - ASSERT_EQ( - static_cast(drow_descs.get_const_data()[row]) & 0xFFFF, - static_cast(row_descs.get_const_data()[row]) & 0xFFFF); - for (int nz = 0; nz < row_nnz; nz++) { - SCOPED_TRACE("nz: " + std::to_string(nz)); - const auto col = mtx->get_const_col_idxs()[nz + row_begin]; - ASSERT_EQ(nz, lookup[col]); - } - } - } -} - - } // namespace diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp index a5fafe42cb5..a0e50da6452 100644 --- a/reference/matrix/csr_kernels.cpp +++ b/reference/matrix/csr_kernels.cpp @@ -1118,12 +1118,44 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); -bool csr_lookup_allowed(matrix::sparsity_type allowed, - matrix::sparsity_type type) +template +void build_lookup_offsets(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, matrix::sparsity_type allowed, + IndexType* storage_offsets) { - return ((static_cast(allowed) & static_cast(type)) != 0); + using matrix::sparsity_type; + constexpr static int block_size = + gko::matrix::device_sparsity_lookup::block_size; + for (size_type row = 0; row < num_rows; row++) { + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto local_cols = col_idxs + row_begin; + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? 
local_cols[row_len - 1] - min_col + 1 : 0; + if (csr_lookup_allowed(allowed, sparsity_type::full) && + row_len == col_range) { + storage_offsets[row] = 0; + } else { + const auto hashmap_storage = std::max(2 * row_len, 1); + const auto bitmap_num_blocks = + static_cast(ceildiv(col_range, block_size)); + const auto bitmap_storage = 2 * bitmap_num_blocks; + if (csr_lookup_allowed(allowed, sparsity_type::bitmap) && + bitmap_storage <= hashmap_storage) { + storage_offsets[row] = bitmap_storage; + } else { + storage_offsets[row] = hashmap_storage; + } + } + } + components::prefix_sum(exec, storage_offsets, num_rows + 1); } +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_CSR_BUILD_LOOKUP_OFFSETS_KERNEL); + template bool csr_lookup_try_full(IndexType row_len, IndexType col_range, @@ -1149,7 +1181,7 @@ bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, gko::matrix::device_sparsity_lookup::block_size; using matrix::sparsity_type; bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::bitmap); - const auto num_blocks = ceildiv(col_range, block_size); + const auto num_blocks = static_cast(ceildiv(col_range, block_size)); if (is_allowed && num_blocks * 2 <= available_storage) { row_desc = (static_cast(num_blocks) << 32) | static_cast(sparsity_type::bitmap); @@ -1179,6 +1211,8 @@ void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, int64& row_desc, int32* local_storage, const IndexType* cols) { + // we need at least one unfilled entry to avoid infinite loops on search + GKO_ASSERT(row_len < available_storage); constexpr double inv_golden_ratio = 0.61803398875; // use golden ratio as approximation for hash parameter that spreads // consecutive values as far apart as possible. Ensure lowest bit is set @@ -1187,12 +1221,14 @@ void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, 1u | static_cast(available_storage * inv_golden_ratio); row_desc = (static_cast(hash_parameter) << 32) | static_cast(matrix::sparsity_type::hash); - std::fill_n(local_storage, available_storage, -1); + std::fill_n(local_storage, available_storage, invalid_index()); for (int32 nz = 0; nz < row_len; nz++) { - auto hash = (static_cast(cols[nz]) * hash_parameter) % + auto hash = (static_cast::type>( + cols[nz]) * + hash_parameter) % static_cast(available_storage); // linear probing: find the next empty entry - while (local_storage[hash] != -1) { + while (local_storage[hash] != invalid_index()) { hash++; if (hash >= available_storage) { hash = 0; @@ -1207,13 +1243,14 @@ template void build_lookup(std::shared_ptr exec, const IndexType* row_ptrs, const IndexType* col_idxs, size_type num_rows, matrix::sparsity_type allowed, - int64* row_desc, int32* storage) + const IndexType* storage_offsets, int64* row_desc, + int32* storage) { for (size_type row = 0; row < num_rows; row++) { const auto row_begin = row_ptrs[row]; const auto row_len = row_ptrs[row + 1] - row_begin; - const auto storage_begin = 2 * row_begin; - const auto available_storage = 2 * row_len; + const auto storage_begin = storage_offsets[row]; + const auto available_storage = storage_offsets[row + 1] - storage_begin; const auto local_storage = storage + storage_begin; const auto local_cols = col_idxs + row_begin; const auto min_col = row_len > 0 ? 
local_cols[0] : 0; diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp index 6115189db22..a3d6c47efc5 100644 --- a/reference/test/matrix/csr_kernels.cpp +++ b/reference/test/matrix/csr_kernels.cpp @@ -44,7 +44,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include #include #include #include @@ -55,6 +54,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/csr_kernels.hpp" +#include "core/matrix/csr_lookup.hpp" #include "core/test/utils.hpp" @@ -1892,78 +1892,152 @@ TYPED_TEST(Csr, CanGetSubmatrixWithindex_set) } -TYPED_TEST(Csr, GeneratesLookupData) +template +class CsrLookup : public ::testing::Test { +protected: + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Mtx = gko::matrix::Csr; + + CsrLookup() : exec(gko::ReferenceExecutor::create()) + { + mtx = Mtx::create(this->exec); + typename Mtx::mat_data data{gko::dim<2>{6, 65}}; + for (int i = 0; i < 65; i++) { + // row 0: empty row + // row 1: full row + data.nonzeros.emplace_back(1, i, 1.0); + // row 2: pretty dense row + if (i % 3 == 0) { + data.nonzeros.emplace_back(2, i, 1.0); + } + // row 4-5: contiguous row + if (i >= 10 && i < 30) { + data.nonzeros.emplace_back(4, i, 1.0); + } + if (i >= 2 && i < 6) { + data.nonzeros.emplace_back(5, i, 1.0); + } + } + // row 3: very sparse + data.nonzeros.emplace_back(3, 0, 1.0); + data.nonzeros.emplace_back(3, 64, 1.0); + data.ensure_row_major_order(); + // 1000 as min-sentinel + full_sizes = {0, 0, 1000, 1000, 0, 0}; + bitmap_sizes = {0, 6, 4, 6, 2, 2}; + hash_sizes = {1, 130, 44, 4, 40, 8}; + mtx->read(data); + } + + std::shared_ptr exec; + std::unique_ptr mtx; + std::vector full_sizes; + std::vector bitmap_sizes; + std::vector hash_sizes; + index_type invalid_index = gko::invalid_index(); +}; + +TYPED_TEST_SUITE(CsrLookup, gko::test::ValueIndexTypes, + PairTypenameNameGenerator); + +TYPED_TEST(CsrLookup, GeneratesLookupDataOffsets) { - using Mtx = typename TestFixture::Mtx; - using IndexType = typename Mtx::index_type; + using IndexType = typename TestFixture::index_type; using gko::matrix::sparsity_type; - auto mtx = Mtx::create(this->exec); - typename Mtx::mat_data data{gko::dim<2>{6, 65}}; - for (int i = 0; i < 65; i++) { - // row 0: empty row - // row 1: full row - data.nonzeros.emplace_back(1, i, 1.0); - // row 2: pretty dense row - if (i % 3 == 0) { - data.nonzeros.emplace_back(2, i, 1.0); - } - // row 4-5: contiguous row - if (i >= 10 && i < 30) { - data.nonzeros.emplace_back(4, i, 1.0); - } - if (i >= 2 && i < 6) { - data.nonzeros.emplace_back(5, i, 1.0); + const auto num_rows = this->mtx->get_size()[0]; + gko::array storage_offset_array(this->exec, num_rows + 1); + const auto storage_offsets = storage_offset_array.get_data(); + const auto row_ptrs = this->mtx->get_const_row_ptrs(); + const auto col_idxs = this->mtx->get_const_col_idxs(); + + for (auto allowed : + {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { + gko::kernels::reference::csr::build_lookup_offsets( + this->exec, row_ptrs, col_idxs, num_rows, allowed, storage_offsets); + bool allow_full = + gko::matrix::csr_lookup_allowed(allowed, sparsity_type::full); + bool allow_bitmap = + gko::matrix::csr_lookup_allowed(allowed, 
sparsity_type::bitmap); + + for (gko::size_type row = 0; row < num_rows; row++) { + const auto expected_size = + std::min(allow_full ? this->full_sizes[row] : 1000, + std::min(allow_bitmap ? this->bitmap_sizes[row] : 1000, + this->hash_sizes[row])); + const auto size = storage_offsets[row + 1] - storage_offsets[row]; + + ASSERT_EQ(size, expected_size); } } - // row 3: very sparse - data.nonzeros.emplace_back(3, 0, 1.0); - data.nonzeros.emplace_back(3, 64, 1.0); - data.ensure_row_major_order(); - mtx->read(data); - gko::Array row_descs(this->exec, mtx->get_size()[0]); - gko::Array lookup_info(this->exec, - mtx->get_num_stored_elements() * 2); - for (auto allowed_methods : +} + + +TYPED_TEST(CsrLookup, GeneratesLookupData) +{ + using IndexType = typename TestFixture::index_type; + using gko::matrix::sparsity_type; + const auto num_rows = this->mtx->get_size()[0]; + const auto num_cols = this->mtx->get_size()[1]; + gko::array row_desc_array(this->exec, num_rows); + gko::array storage_offset_array(this->exec, num_rows + 1); + const auto row_descs = row_desc_array.get_data(); + const auto storage_offsets = storage_offset_array.get_data(); + const auto row_ptrs = this->mtx->get_const_row_ptrs(); + const auto col_idxs = this->mtx->get_const_col_idxs(); + for (auto allowed : {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, sparsity_type::bitmap | sparsity_type::hash, sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { - const auto full_allowed = - static_cast(static_cast(allowed_methods) & - static_cast(sparsity_type::full)); - const auto bitmap_allowed = - static_cast(static_cast(allowed_methods) & - static_cast(sparsity_type::bitmap)); + gko::kernels::reference::csr::build_lookup_offsets( + this->exec, row_ptrs, col_idxs, num_rows, allowed, storage_offsets); + gko::array storage_array(this->exec, + storage_offsets[num_rows]); + const auto storage = storage_array.get_data(); const auto bitmap_equivalent = - bitmap_allowed ? sparsity_type::bitmap : sparsity_type::hash; + csr_lookup_allowed(allowed, sparsity_type::bitmap) + ? sparsity_type::bitmap + : sparsity_type::hash; const auto full_equivalent = - full_allowed ? sparsity_type::full : bitmap_equivalent; - SCOPED_TRACE("full: " + std::to_string(full_allowed) + - " bitmap: " + std::to_string(bitmap_allowed)); + csr_lookup_allowed(allowed, sparsity_type::full) + ? 
sparsity_type::full + : bitmap_equivalent; gko::kernels::reference::csr::build_lookup( - this->exec, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), - mtx->get_size()[0], allowed_methods, row_descs.get_data(), - lookup_info.get_data()); - - const auto descs = row_descs.get_const_data(); - for (auto entry : data.nonzeros) { - const auto row = entry.row; - const auto col = entry.column; - const auto row_begin = mtx->get_const_row_ptrs()[row]; - const auto row_nnz = mtx->get_const_row_ptrs()[row + 1] - row_begin; - gko::matrix::device_sparsity_lookup lookup{ - mtx->get_const_col_idxs() + row_begin, row_nnz, - lookup_info.get_const_data() + (row_begin * 2), descs[row]}; + this->exec, row_ptrs, col_idxs, num_rows, allowed, storage_offsets, + row_descs, storage); - const auto nz = lookup[col] + row_begin; - ASSERT_EQ(mtx->get_const_col_idxs()[nz], col); + for (int row = 0; row < num_rows; row++) { + const auto row_begin = row_ptrs[row]; + const auto row_end = row_ptrs[row + 1]; + gko::matrix::device_sparsity_lookup lookup{ + row_ptrs, col_idxs, storage_offsets, + storage, row_descs, static_cast(row)}; + for (auto nz = row_begin; nz < row_end; nz++) { + const auto col = col_idxs[nz]; + ASSERT_EQ(lookup.lookup_unsafe(col) + row_begin, nz); + } + auto nz = row_begin; + for (int col = 0; col < num_cols; col++) { + auto found_nz = lookup[col]; + if (nz < row_end && col_idxs[nz] == col) { + ASSERT_EQ(found_nz, nz - row_begin); + nz++; + } else { + ASSERT_EQ(found_nz, this->invalid_index); + } + } } - ASSERT_EQ(descs[0] & 0xFFFF, static_cast(full_equivalent)); - ASSERT_EQ(descs[1] & 0xFFFF, static_cast(full_equivalent)); - ASSERT_EQ(descs[2] & 0xFFFF, static_cast(bitmap_equivalent)); - ASSERT_EQ(descs[3] & 0xFFFF, static_cast(sparsity_type::hash)); - ASSERT_EQ(descs[4] & 0xFFFF, static_cast(full_equivalent)); - ASSERT_EQ(descs[5] & 0xFFFF, static_cast(full_equivalent)); + ASSERT_EQ(row_descs[0] & 0xF, static_cast(full_equivalent)); + ASSERT_EQ(row_descs[1] & 0xF, static_cast(full_equivalent)); + ASSERT_EQ(row_descs[2] & 0xF, static_cast(bitmap_equivalent)); + ASSERT_EQ(row_descs[3] & 0xF, static_cast(sparsity_type::hash)); + ASSERT_EQ(row_descs[4] & 0xF, static_cast(full_equivalent)); + ASSERT_EQ(row_descs[5] & 0xF, static_cast(full_equivalent)); } } diff --git a/test/matrix/csr_kernels.cpp b/test/matrix/csr_kernels.cpp index 2be151a55de..a2ab2b8f707 100644 --- a/test/matrix/csr_kernels.cpp +++ b/test/matrix/csr_kernels.cpp @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include +#include "core/matrix/csr_kernels.hpp" #include @@ -42,11 +42,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include #include "core/components/fill_array_kernels.hpp" -#include "core/matrix/csr_kernels.hpp" #include "core/test/utils.hpp" #include "test/utils/executor.hpp" @@ -133,4 +133,146 @@ TEST_F(Csr, InvScaleIsEquivalentToRef) } +template +class CsrLookup : public ::testing::Test { +protected: + using value_type = float; + using index_type = IndexType; + using Mtx = gko::matrix::Csr; + + CsrLookup() : rand_engine(15) {} + + void SetUp() + { + ref = gko::ReferenceExecutor::create(); + init_executor(ref, exec); + auto data = + gko::test::generate_random_matrix_data( + 628, 923, std::uniform_int_distribution(10, 300), + std::normal_distribution>(-1.0, + 1.0), + rand_engine); + // create a few empty rows + data.nonzeros.erase( + std::remove_if(data.nonzeros.begin(), data.nonzeros.end(), + [](auto entry) { return entry.row % 200 == 21; }), + data.nonzeros.end()); + // insert a full row and a pretty dense row + for (int i = 0; i < 100; i++) { + data.nonzeros.emplace_back(221, i + 100, 1.0); + data.nonzeros.emplace_back(421, i * 3 + 100, 2.0); + } + data.ensure_row_major_order(); + // initialize the matrices + mtx = Mtx::create(ref); + mtx->read(data); + dmtx = gko::clone(exec, mtx); + } + + void TearDown() + { + if (exec != nullptr) { + ASSERT_NO_THROW(exec->synchronize()); + } + } + + std::default_random_engine rand_engine; + std::shared_ptr ref; + std::shared_ptr exec; + std::unique_ptr mtx; + std::unique_ptr dmtx; + index_type invalid_index = gko::invalid_index(); +}; + +TYPED_TEST_SUITE(CsrLookup, gko::test::IndexTypes, TypenameNameGenerator); + + +TYPED_TEST(CsrLookup, BuildLookupWorks) +{ + using index_type = typename TestFixture::index_type; + using gko::matrix::sparsity_type; + const auto num_rows = this->mtx->get_size()[0]; + const auto num_cols = this->mtx->get_size()[1]; + gko::array row_desc_array(this->ref, num_rows); + gko::array drow_desc_array(this->exec, num_rows); + gko::array storage_offset_array(this->ref, num_rows + 1); + gko::array dstorage_offset_array(this->exec, num_rows + 1); + const auto row_descs = row_desc_array.get_data(); + const auto drow_descs = drow_desc_array.get_data(); + const auto row_ptrs = this->mtx->get_const_row_ptrs(); + const auto col_idxs = this->mtx->get_const_col_idxs(); + const auto drow_ptrs = this->dmtx->get_const_row_ptrs(); + const auto dcol_idxs = this->dmtx->get_const_col_idxs(); + const auto storage_offsets = storage_offset_array.get_data(); + const auto dstorage_offsets = dstorage_offset_array.get_data(); + for (auto allowed : + {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { + // check that storage offsets are calculated correctly + // otherwise things might crash + gko::kernels::reference::csr::build_lookup_offsets( + this->ref, row_ptrs, col_idxs, num_rows, allowed, storage_offsets); + gko::kernels::EXEC_NAMESPACE::csr::build_lookup_offsets( + this->exec, drow_ptrs, dcol_idxs, num_rows, allowed, + dstorage_offsets); + + GKO_ASSERT_ARRAY_EQ(storage_offset_array, dstorage_offset_array); + + gko::array storage_array(this->ref, + storage_offsets[num_rows]); + gko::array dstorage_array(this->exec, + storage_offsets[num_rows]); + const auto storage = storage_array.get_data(); + const auto dstorage = dstorage_array.get_data(); + const auto bitmap_equivalent = + csr_lookup_allowed(allowed, sparsity_type::bitmap) + ? 
sparsity_type::bitmap + : sparsity_type::hash; + const auto full_equivalent = + csr_lookup_allowed(allowed, sparsity_type::full) + ? sparsity_type::full + : bitmap_equivalent; + + gko::kernels::reference::csr::build_lookup( + this->ref, row_ptrs, col_idxs, num_rows, allowed, storage_offsets, + row_descs, storage); + gko::kernels::EXEC_NAMESPACE::csr::build_lookup( + this->exec, drow_ptrs, dcol_idxs, num_rows, allowed, + dstorage_offsets, drow_descs, dstorage); + + gko::array host_row_descs(this->ref, drow_desc_array); + gko::array host_storage_array(this->ref, dstorage_array); + for (int row = 0; row < num_rows; row++) { + const auto row_begin = row_ptrs[row]; + const auto row_end = row_ptrs[row + 1]; + const auto row_nnz = row_end - row_begin; + gko::matrix::device_sparsity_lookup lookup{ + row_ptrs, + col_idxs, + storage_offsets, + host_storage_array.get_const_data(), + host_row_descs.get_data(), + static_cast(row)}; + ASSERT_EQ(host_row_descs.get_const_data()[row] & 0xF, + row_descs[row] & 0xF); + for (auto nz = row_begin; nz < row_end; nz++) { + const auto col = col_idxs[nz]; + ASSERT_EQ(lookup.lookup_unsafe(col) + row_begin, nz); + } + auto nz = row_begin; + for (int col = 0; col < num_cols; col++) { + auto found_nz = lookup[col]; + if (nz < row_end && col_idxs[nz] == col) { + ASSERT_EQ(found_nz, nz - row_begin); + nz++; + } else { + ASSERT_EQ(found_nz, this->invalid_index); + } + } + } + } +} + + } // namespace From 046438067347ff144af5a9d060de35f6bb781408 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 14 May 2022 11:06:45 +0200 Subject: [PATCH 6/8] fix ROCm + gcc build issues Somehow, the static constexpr member variable trips up the linker. Move it to a global constant instead --- common/cuda_hip/matrix/csr_kernels.hpp.inc | 3 +-- common/unified/matrix/csr_kernels.cpp | 7 +++---- core/matrix/csr_lookup.hpp | 19 ++++++++++--------- dpcpp/matrix/csr_kernels.dp.cpp | 10 +++++----- omp/matrix/csr_kernels.cpp | 9 ++++----- reference/matrix/csr_kernels.cpp | 17 ++++++++--------- 6 files changed, 31 insertions(+), 34 deletions(-) diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index efb0afd46aa..c2d47b0c248 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -926,8 +926,7 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup( gko::matrix::sparsity_type allowed, const IndexType* storage_offsets, int64* __restrict__ row_desc, int32* __restrict__ storage) { - constexpr int bitmap_block_size = - gko::matrix::device_sparsity_lookup::block_size; + constexpr int bitmap_block_size = gko::matrix::sparsity_bitmap_block_size; auto row = thread::get_subwarp_id_flat(); if (row >= num_rows) { return; diff --git a/common/unified/matrix/csr_kernels.cpp b/common/unified/matrix/csr_kernels.cpp index d861b1a16e7..c35c47a144c 100644 --- a/common/unified/matrix/csr_kernels.cpp +++ b/common/unified/matrix/csr_kernels.cpp @@ -250,9 +250,8 @@ void build_lookup_offsets(std::shared_ptr exec, size_type num_rows, matrix::sparsity_type allowed, IndexType* storage_offsets) { + using matrix::sparsity_bitmap_block_size; using matrix::sparsity_type; - constexpr static int block_size = - gko::matrix::device_sparsity_lookup::block_size; run_kernel( exec, [] GKO_KERNEL(auto row, auto row_ptrs, auto col_idxs, auto num_rows, @@ -268,8 +267,8 @@ void build_lookup_offsets(std::shared_ptr exec, storage_offsets[row] = 0; } else { const auto hashmap_storage = row_len == 0 ? 
1 : 2 * row_len; - const auto bitmap_num_blocks = - static_cast(ceildiv(col_range, block_size)); + const auto bitmap_num_blocks = static_cast( + ceildiv(col_range, sparsity_bitmap_block_size)); const auto bitmap_storage = 2 * bitmap_num_blocks; if (csr_lookup_allowed(allowed, sparsity_type::bitmap) && bitmap_storage <= hashmap_storage) { diff --git a/core/matrix/csr_lookup.hpp b/core/matrix/csr_lookup.hpp index 9dc61028065..c7f0215844c 100644 --- a/core/matrix/csr_lookup.hpp +++ b/core/matrix/csr_lookup.hpp @@ -64,8 +64,8 @@ enum class sparsity_type : int { * columns before the block as integer. This means that the relative output * index can be computed as * ``` - * auto block = (col - min_col) / block_size; - * auto local_col = (col - min_col) % block_size; + * auto block = (col - min_col) / sparsity_bitmap_block_size; + * auto local_col = (col - min_col) % sparsity_bitmap_block_size; * auto prefix_mask = (block_type{1} << local_col) - 1; * auto output_idx = base[block] + popcount(bitmap[block] & prefix_mask); * ``` @@ -103,11 +103,12 @@ GKO_ATTRIBUTES GKO_INLINE bool csr_lookup_allowed(matrix::sparsity_type allowed, } +/** Number of bits in a block_type entry. */ +static constexpr int sparsity_bitmap_block_size = 32; + + template struct device_sparsity_lookup { - /** Number of bits in a block_type entry. */ - static constexpr int block_size = 32; - /** * Set up a device_sparsity_lookup structure from local data * @@ -240,8 +241,8 @@ struct device_sparsity_lookup { const auto block_bitmaps = reinterpret_cast(block_bases + num_blocks); const auto rel_col = col - min_col; - const auto block = rel_col / block_size; - const auto col_in_block = rel_col % block_size; + const auto block = rel_col / sparsity_bitmap_block_size; + const auto col_in_block = rel_col % sparsity_bitmap_block_size; const auto prefix_mask = (uint32{1} << col_in_block) - 1; GKO_ASSERT(rel_col >= 0); GKO_ASSERT(block < num_blocks); @@ -261,8 +262,8 @@ struct device_sparsity_lookup { const auto block_bitmaps = reinterpret_cast(block_bases + num_blocks); const auto rel_col = col - min_col; - const auto block = rel_col / block_size; - const auto col_in_block = rel_col % block_size; + const auto block = rel_col / sparsity_bitmap_block_size; + const auto col_in_block = rel_col % sparsity_bitmap_block_size; if (rel_col < 0 || block >= num_blocks || !(block_bitmaps[block] & (uint32{1} << col_in_block))) { return invalid_index(); diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 30d29bcd002..9deaeb0f22b 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -2290,11 +2290,11 @@ bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, matrix::sparsity_type allowed, int64& row_desc, int32* local_storage, const IndexType* cols) { - constexpr static int block_size = - gko::matrix::device_sparsity_lookup::block_size; + using matrix::sparsity_bitmap_block_size; using matrix::sparsity_type; bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::bitmap); - const auto num_blocks = static_cast(ceildiv(col_range, block_size)); + const auto num_blocks = + static_cast(ceildiv(col_range, sparsity_bitmap_block_size)); if (is_allowed && num_blocks * 2 <= available_storage) { row_desc = (static_cast(num_blocks) << 32) | static_cast(sparsity_type::bitmap); @@ -2304,8 +2304,8 @@ bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, std::fill_n(block_bitmaps, num_blocks, 0); for (auto col_it = cols; col_it < cols + row_len; col_it++) { const auto rel_col = 
*col_it - min_col; - const auto block = rel_col / block_size; - const auto col_in_block = rel_col % block_size; + const auto block = rel_col / sparsity_bitmap_block_size; + const auto col_in_block = rel_col % sparsity_bitmap_block_size; block_bitmaps[block] |= uint32{1} << col_in_block; } int32 partial_sum{}; diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp index 2a8cfa5bec0..fd34ee43d84 100644 --- a/omp/matrix/csr_kernels.cpp +++ b/omp/matrix/csr_kernels.cpp @@ -1149,11 +1149,10 @@ bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, matrix::sparsity_type allowed, int64& row_desc, int32* local_storage, const IndexType* cols) { + using matrix::sparsity_bitmap_block_size; using matrix::sparsity_type; bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::bitmap); - constexpr auto bitmap_block_size = - gko::matrix::device_sparsity_lookup::block_size; - const auto num_blocks = ceildiv(col_range, bitmap_block_size); + const auto num_blocks = ceildiv(col_range, sparsity_bitmap_block_size); if (is_allowed && num_blocks * 2 <= available_storage) { row_desc = (static_cast(num_blocks) << 32) | static_cast(sparsity_type::bitmap); @@ -1163,8 +1162,8 @@ bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, std::fill_n(block_bitmaps, num_blocks, 0); for (auto col_it = cols; col_it < cols + row_len; col_it++) { const auto rel_col = *col_it - min_col; - const auto block = rel_col / bitmap_block_size; - const auto col_in_block = rel_col % bitmap_block_size; + const auto block = rel_col / sparsity_bitmap_block_size; + const auto col_in_block = rel_col % sparsity_bitmap_block_size; block_bitmaps[block] |= uint32{1} << col_in_block; } int32 partial_sum{}; diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp index a0e50da6452..33a38a80efd 100644 --- a/reference/matrix/csr_kernels.cpp +++ b/reference/matrix/csr_kernels.cpp @@ -1124,9 +1124,8 @@ void build_lookup_offsets(std::shared_ptr exec, size_type num_rows, matrix::sparsity_type allowed, IndexType* storage_offsets) { + using matrix::sparsity_bitmap_block_size; using matrix::sparsity_type; - constexpr static int block_size = - gko::matrix::device_sparsity_lookup::block_size; for (size_type row = 0; row < num_rows; row++) { const auto row_begin = row_ptrs[row]; const auto row_len = row_ptrs[row + 1] - row_begin; @@ -1139,8 +1138,8 @@ void build_lookup_offsets(std::shared_ptr exec, storage_offsets[row] = 0; } else { const auto hashmap_storage = std::max(2 * row_len, 1); - const auto bitmap_num_blocks = - static_cast(ceildiv(col_range, block_size)); + const auto bitmap_num_blocks = static_cast( + ceildiv(col_range, sparsity_bitmap_block_size)); const auto bitmap_storage = 2 * bitmap_num_blocks; if (csr_lookup_allowed(allowed, sparsity_type::bitmap) && bitmap_storage <= hashmap_storage) { @@ -1177,11 +1176,11 @@ bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, matrix::sparsity_type allowed, int64& row_desc, int32* local_storage, const IndexType* cols) { - constexpr static int block_size = - gko::matrix::device_sparsity_lookup::block_size; + using matrix::sparsity_bitmap_block_size; using matrix::sparsity_type; bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::bitmap); - const auto num_blocks = static_cast(ceildiv(col_range, block_size)); + const auto num_blocks = + static_cast(ceildiv(col_range, sparsity_bitmap_block_size)); if (is_allowed && num_blocks * 2 <= available_storage) { row_desc = (static_cast(num_blocks) << 32) | 
static_cast(sparsity_type::bitmap); @@ -1191,8 +1190,8 @@ bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, std::fill_n(block_bitmaps, num_blocks, 0); for (auto col_it = cols; col_it < cols + row_len; col_it++) { const auto rel_col = *col_it - min_col; - const auto block = rel_col / block_size; - const auto col_in_block = rel_col % block_size; + const auto block = rel_col / sparsity_bitmap_block_size; + const auto col_in_block = rel_col % sparsity_bitmap_block_size; block_bitmaps[block] |= uint32{1} << col_in_block; } int32 partial_sum{}; From 78642dd8d496d2e88f7e67213b9e6ebda686d3f4 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Mon, 23 May 2022 14:46:44 +0200 Subject: [PATCH 7/8] review updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * move sparsity lookup to csr namespace * improve conversions for sparsity_type Co-authored-by: Thomas Grützmacher Co-authored-by: Yuhsiang M. Tsai --- common/cuda_hip/matrix/csr_kernels.hpp.inc | 17 ++++++------- common/unified/matrix/csr_kernels.cpp | 7 +++--- core/matrix/csr_kernels.hpp | 12 +++++----- core/matrix/csr_lookup.hpp | 11 +++++---- dev_tools/scripts/config | 2 ++ dpcpp/matrix/csr_kernels.dp.cpp | 14 +++++------ omp/matrix/csr_kernels.cpp | 14 +++++------ reference/matrix/csr_kernels.cpp | 28 +++++++++++----------- reference/test/matrix/csr_kernels.cpp | 12 +++++----- test/matrix/csr_kernels.cpp | 4 ++-- 10 files changed, 64 insertions(+), 57 deletions(-) diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index c2d47b0c248..f7fe907697a 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -923,10 +923,11 @@ template __global__ __launch_bounds__(default_block_size) void build_csr_lookup( const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ col_idxs, size_type num_rows, - gko::matrix::sparsity_type allowed, const IndexType* storage_offsets, + matrix::csr::sparsity_type allowed, const IndexType* storage_offsets, int64* __restrict__ row_desc, int32* __restrict__ storage) { - constexpr int bitmap_block_size = gko::matrix::sparsity_bitmap_block_size; + using matrix::csr::sparsity_type; + constexpr int bitmap_block_size = matrix::csr::sparsity_bitmap_block_size; auto row = thread::get_subwarp_id_flat(); if (row >= num_rows) { return; @@ -946,9 +947,9 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup( // full column range if (col_range == row_len && - csr_lookup_allowed(allowed, matrix::sparsity_type::full)) { + csr_lookup_allowed(allowed, sparsity_type::full)) { if (lane == 0) { - row_desc[row] = static_cast(matrix::sparsity_type::full); + row_desc[row] = static_cast(sparsity_type::full); } return; } @@ -956,10 +957,10 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup( const auto num_blocks = static_cast(ceildiv(col_range, bitmap_block_size)); if (num_blocks * 2 <= available_storage && - csr_lookup_allowed(allowed, matrix::sparsity_type::bitmap)) { + csr_lookup_allowed(allowed, sparsity_type::bitmap)) { if (lane == 0) { row_desc[row] = (static_cast(num_blocks) << 32) | - static_cast(matrix::sparsity_type::bitmap); + static_cast(sparsity_type::bitmap); } const auto block_ranks = local_storage; const auto block_bitmaps = @@ -1015,7 +1016,7 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup( 1u | static_cast(available_storage * inv_golden_ratio); if (lane == 0) { row_desc[row] = 
(static_cast(hash_parameter) << 32) | - static_cast(matrix::sparsity_type::hash); + static_cast(sparsity_type::hash); } // fill hashmap with sentinel constexpr int32 empty = invalid_index();
@@ -1083,7 +1084,7 @@ template void build_lookup(std::shared_ptr exec, const IndexType* row_ptrs, const IndexType* col_idxs, - size_type num_rows, matrix::sparsity_type allowed, + size_type num_rows, matrix::csr::sparsity_type allowed, const IndexType* storage_offsets, int64* row_desc, int32* storage) {
diff --git a/common/unified/matrix/csr_kernels.cpp b/common/unified/matrix/csr_kernels.cpp index c35c47a144c..690f96debdc 100644 --- a/common/unified/matrix/csr_kernels.cpp +++ b/common/unified/matrix/csr_kernels.cpp
@@ -247,11 +247,12 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void build_lookup_offsets(std::shared_ptr exec, const IndexType* row_ptrs, const IndexType* col_idxs, - size_type num_rows, matrix::sparsity_type allowed, + size_type num_rows, + matrix::csr::sparsity_type allowed, IndexType* storage_offsets) { - using matrix::sparsity_bitmap_block_size; - using matrix::sparsity_type; + using matrix::csr::sparsity_bitmap_block_size; + using matrix::csr::sparsity_type; run_kernel( exec, [] GKO_KERNEL(auto row, auto row_ptrs, auto col_idxs, auto num_rows,
diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp index 5e910385f20..bd965ad4bc3 100644 --- a/core/matrix/csr_kernels.hpp +++ b/core/matrix/csr_kernels.hpp
@@ -229,14 +229,14 @@ namespace kernels { void build_lookup_offsets(std::shared_ptr exec, \ const IndexType* row_ptrs, \ const IndexType* col_idxs, size_type num_rows, \ - matrix::sparsity_type allowed, \ + matrix::csr::sparsity_type allowed, \ IndexType* storage_offsets) -#define GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL(IndexType) \ - void build_lookup(std::shared_ptr exec, \ - const IndexType* row_ptrs, const IndexType* col_idxs, \ - size_type num_rows, matrix::sparsity_type allowed, \ - const IndexType* storage_offsets, int64* row_desc, \ +#define GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL(IndexType) \ + void build_lookup(std::shared_ptr exec, \ + const IndexType* row_ptrs, const IndexType* col_idxs, \ + size_type num_rows, matrix::csr::sparsity_type allowed, \ + const IndexType* storage_offsets, int64* row_desc, \ int32* storage)
diff --git a/core/matrix/csr_lookup.hpp b/core/matrix/csr_lookup.hpp index c7f0215844c..e4d46e25ee5 100644 --- a/core/matrix/csr_lookup.hpp +++ b/core/matrix/csr_lookup.hpp
@@ -44,13 +44,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { namespace matrix { +namespace csr { /** * Type describing which kind of lookup structure is used to find entries in a - * single row of a Csr matrix. + * single row of a Csr matrix. It is also used as a mask, so each value has only + * a single bit set. */ -enum class sparsity_type : int { +enum class sparsity_type { /** * The row is dense, i.e. it contains all entries in * `[min_col, min_col + storage_size)`. 
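Since each `sparsity_type` value occupies a single bit, the enum doubles as a bitmask: expressions like `sparsity_type::bitmap | sparsity_type::hash` in the tests select several allowed lookup structures at once, and `csr_lookup_allowed` (changed in the next hunk) tests membership via bitwise AND. As an illustration outside the patch, a minimal standalone sketch of this flag-enum pattern; the literal values 1/2/4 are an assumption for the example, the authoritative definitions live in `core/matrix/csr_lookup.hpp`:

```
// Standalone sketch of the flag-enum pattern behind sparsity_type.
// Assumption: power-of-two enumerator values (full = 1, bitmap = 2,
// hash = 4), so bitwise OR combines flags and bitwise AND tests them.
#include <iostream>

enum class sparsity_type { full = 1, bitmap = 2, hash = 4 };

sparsity_type operator|(sparsity_type a, sparsity_type b)
{
    return static_cast<sparsity_type>(static_cast<int>(a) |
                                      static_cast<int>(b));
}

bool csr_lookup_allowed(sparsity_type allowed, sparsity_type type)
{
    // a nonzero intersection means `type` is one of the allowed structures
    return (static_cast<int>(allowed) & static_cast<int>(type)) != 0;
}

int main()
{
    const auto allowed = sparsity_type::bitmap | sparsity_type::hash;
    std::cout << csr_lookup_allowed(allowed, sparsity_type::full)    // 0
              << csr_lookup_allowed(allowed, sparsity_type::bitmap)  // 1
              << csr_lookup_allowed(allowed, sparsity_type::hash);   // 1
}
```

This is why the doc comment above insists on one bit per value: overlapping bit patterns would make one flag imply another when masks are combined.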
@@ -96,8 +98,8 @@ GKO_ATTRIBUTES GKO_INLINE sparsity_type operator|(sparsity_type a, } -GKO_ATTRIBUTES GKO_INLINE bool csr_lookup_allowed(matrix::sparsity_type allowed, - matrix::sparsity_type type) +GKO_ATTRIBUTES GKO_INLINE bool csr_lookup_allowed(sparsity_type allowed, + sparsity_type type) { return ((static_cast(allowed) & static_cast(type)) != 0); } @@ -318,6 +320,7 @@ struct device_sparsity_lookup { }; +} // namespace csr } // namespace matrix } // namespace gko diff --git a/dev_tools/scripts/config b/dev_tools/scripts/config index 7f43fb78089..86ab73b4a8b 100644 --- a/dev_tools/scripts/config +++ b/dev_tools/scripts/config @@ -32,6 +32,8 @@ - FixInclude: "common/unified/base/kernel_launch_solver.hpp" - "(cuda|hip|dpcpp|omp)/base/kernel_launch_solver\." - FixInclude: "common/unified/base/kernel_launch_solver.hpp" +- "test/base/kernel_launch_generic.cpp" + - FixInclude: "common/unified/base/kernel_launch.hpp" - "elimination_forest\.cpp" - FixInclude: "core/factorization/elimination_forest.hpp" - "common/unified/.*.cpp" diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 9deaeb0f22b..c9575e3f2be 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -2272,9 +2272,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template bool csr_lookup_try_full(IndexType row_len, IndexType col_range, - matrix::sparsity_type allowed, int64& row_desc) + matrix::csr::sparsity_type allowed, int64& row_desc) { - using matrix::sparsity_type; + using matrix::csr::sparsity_type; bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::full); if (is_allowed && row_len == col_range) { row_desc = static_cast(sparsity_type::full); @@ -2287,11 +2287,11 @@ bool csr_lookup_try_full(IndexType row_len, IndexType col_range, template bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, IndexType min_col, IndexType available_storage, - matrix::sparsity_type allowed, int64& row_desc, + matrix::csr::sparsity_type allowed, int64& row_desc, int32* local_storage, const IndexType* cols) { - using matrix::sparsity_bitmap_block_size; - using matrix::sparsity_type; + using matrix::csr::sparsity_bitmap_block_size; + using matrix::csr::sparsity_type; bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::bitmap); const auto num_blocks = static_cast(ceildiv(col_range, sparsity_bitmap_block_size)); @@ -2337,7 +2337,7 @@ void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, const auto hash_parameter = 1u | static_cast(available_storage * inv_golden_ratio); row_desc = (static_cast(hash_parameter) << 32) | - static_cast(matrix::sparsity_type::hash); + static_cast(matrix::csr::sparsity_type::hash); std::fill_n(local_storage, available_storage, invalid_index()); for (int32 nz = 0; nz < row_len; nz++) { auto hash = (static_cast(cols[nz]) * hash_parameter) % @@ -2357,7 +2357,7 @@ void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, template void build_lookup(std::shared_ptr exec, const IndexType* row_ptrs, const IndexType* col_idxs, - size_type num_rows, matrix::sparsity_type allowed, + size_type num_rows, matrix::csr::sparsity_type allowed, const IndexType* storage_offsets, int64* row_desc, int32* storage) { diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp index fd34ee43d84..6e3260eb5c9 100644 --- a/omp/matrix/csr_kernels.cpp +++ b/omp/matrix/csr_kernels.cpp @@ -1131,9 +1131,9 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template bool csr_lookup_try_full(IndexType row_len, IndexType 
col_range, - matrix::sparsity_type allowed, int64& row_desc) + matrix::csr::sparsity_type allowed, int64& row_desc) { - using matrix::sparsity_type; + using matrix::csr::sparsity_type; bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::full); if (is_allowed && row_len == col_range) { row_desc = static_cast(sparsity_type::full); @@ -1146,11 +1146,11 @@ bool csr_lookup_try_full(IndexType row_len, IndexType col_range, template bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, IndexType min_col, IndexType available_storage, - matrix::sparsity_type allowed, int64& row_desc, + matrix::csr::sparsity_type allowed, int64& row_desc, int32* local_storage, const IndexType* cols) { - using matrix::sparsity_bitmap_block_size; - using matrix::sparsity_type; + using matrix::csr::sparsity_bitmap_block_size; + using matrix::csr::sparsity_type; bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::bitmap); const auto num_blocks = ceildiv(col_range, sparsity_bitmap_block_size); if (is_allowed && num_blocks * 2 <= available_storage) { @@ -1191,7 +1191,7 @@ void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, const auto hash_parameter = 1u | static_cast(available_storage * inv_golden_ratio); row_desc = (static_cast(hash_parameter) << 32) | - static_cast(matrix::sparsity_type::hash); + static_cast(matrix::csr::sparsity_type::hash); std::fill_n(local_storage, available_storage, invalid_index()); for (int32 nz = 0; nz < row_len; nz++) { auto hash = (static_cast::type>( @@ -1213,7 +1213,7 @@ void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, template void build_lookup(std::shared_ptr exec, const IndexType* row_ptrs, const IndexType* col_idxs, - size_type num_rows, matrix::sparsity_type allowed, + size_type num_rows, matrix::csr::sparsity_type allowed, const IndexType* storage_offsets, int64* row_desc, int32* storage) { diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp index 33a38a80efd..b438f59da80 100644 --- a/reference/matrix/csr_kernels.cpp +++ b/reference/matrix/csr_kernels.cpp @@ -1121,11 +1121,12 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template void build_lookup_offsets(std::shared_ptr exec, const IndexType* row_ptrs, const IndexType* col_idxs, - size_type num_rows, matrix::sparsity_type allowed, + size_type num_rows, + matrix::csr::sparsity_type allowed, IndexType* storage_offsets) { - using matrix::sparsity_bitmap_block_size; - using matrix::sparsity_type; + using matrix::csr::sparsity_bitmap_block_size; + using matrix::csr::sparsity_type; for (size_type row = 0; row < num_rows; row++) { const auto row_begin = row_ptrs[row]; const auto row_len = row_ptrs[row + 1] - row_begin; @@ -1158,12 +1159,12 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( template bool csr_lookup_try_full(IndexType row_len, IndexType col_range, - matrix::sparsity_type allowed, int64& row_desc) + matrix::csr::sparsity_type allowed, int64& row_desc) { - using matrix::sparsity_type; + using matrix::csr::sparsity_type; bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::full); if (is_allowed && row_len == col_range) { - row_desc = static_cast(sparsity_type::full); + row_desc = static_cast(sparsity_type::full); return true; } return false; @@ -1173,17 +1174,17 @@ bool csr_lookup_try_full(IndexType row_len, IndexType col_range, template bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, IndexType min_col, IndexType available_storage, - matrix::sparsity_type allowed, int64& row_desc, + 
matrix::csr::sparsity_type allowed, int64& row_desc, int32* local_storage, const IndexType* cols) { - using matrix::sparsity_bitmap_block_size; - using matrix::sparsity_type; + using matrix::csr::sparsity_bitmap_block_size; + using matrix::csr::sparsity_type; bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::bitmap); const auto num_blocks = static_cast(ceildiv(col_range, sparsity_bitmap_block_size)); if (is_allowed && num_blocks * 2 <= available_storage) { row_desc = (static_cast(num_blocks) << 32) | - static_cast(sparsity_type::bitmap); + static_cast(sparsity_type::bitmap); const auto block_ranks = local_storage; const auto block_bitmaps = reinterpret_cast(block_ranks + num_blocks); @@ -1219,11 +1220,10 @@ void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, const auto hash_parameter = 1u | static_cast(available_storage * inv_golden_ratio); row_desc = (static_cast(hash_parameter) << 32) | - static_cast(matrix::sparsity_type::hash); + static_cast(matrix::csr::sparsity_type::hash); std::fill_n(local_storage, available_storage, invalid_index()); for (int32 nz = 0; nz < row_len; nz++) { - auto hash = (static_cast::type>( - cols[nz]) * + auto hash = (static_cast>(cols[nz]) * hash_parameter) % static_cast(available_storage); // linear probing: find the next empty entry @@ -1241,7 +1241,7 @@ void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, template void build_lookup(std::shared_ptr exec, const IndexType* row_ptrs, const IndexType* col_idxs, - size_type num_rows, matrix::sparsity_type allowed, + size_type num_rows, matrix::csr::sparsity_type allowed, const IndexType* storage_offsets, int64* row_desc, int32* storage) { diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp index a3d6c47efc5..f7899cf7c89 100644 --- a/reference/test/matrix/csr_kernels.cpp +++ b/reference/test/matrix/csr_kernels.cpp @@ -1946,7 +1946,7 @@ TYPED_TEST_SUITE(CsrLookup, gko::test::ValueIndexTypes, TYPED_TEST(CsrLookup, GeneratesLookupDataOffsets) { using IndexType = typename TestFixture::index_type; - using gko::matrix::sparsity_type; + using gko::matrix::csr::sparsity_type; const auto num_rows = this->mtx->get_size()[0]; gko::array storage_offset_array(this->exec, num_rows + 1); const auto storage_offsets = storage_offset_array.get_data(); @@ -1960,9 +1960,9 @@ TYPED_TEST(CsrLookup, GeneratesLookupDataOffsets) gko::kernels::reference::csr::build_lookup_offsets( this->exec, row_ptrs, col_idxs, num_rows, allowed, storage_offsets); bool allow_full = - gko::matrix::csr_lookup_allowed(allowed, sparsity_type::full); - bool allow_bitmap = - gko::matrix::csr_lookup_allowed(allowed, sparsity_type::bitmap); + gko::matrix::csr::csr_lookup_allowed(allowed, sparsity_type::full); + bool allow_bitmap = gko::matrix::csr::csr_lookup_allowed( + allowed, sparsity_type::bitmap); for (gko::size_type row = 0; row < num_rows; row++) { const auto expected_size = @@ -1980,7 +1980,7 @@ TYPED_TEST(CsrLookup, GeneratesLookupDataOffsets) TYPED_TEST(CsrLookup, GeneratesLookupData) { using IndexType = typename TestFixture::index_type; - using gko::matrix::sparsity_type; + using gko::matrix::csr::sparsity_type; const auto num_rows = this->mtx->get_size()[0]; const auto num_cols = this->mtx->get_size()[1]; gko::array row_desc_array(this->exec, num_rows); @@ -2014,7 +2014,7 @@ TYPED_TEST(CsrLookup, GeneratesLookupData) for (int row = 0; row < num_rows; row++) { const auto row_begin = row_ptrs[row]; const auto row_end = row_ptrs[row + 1]; - 
gko::matrix::device_sparsity_lookup lookup{ + gko::matrix::csr::device_sparsity_lookup lookup{ row_ptrs, col_idxs, storage_offsets, storage, row_descs, static_cast(row)}; for (auto nz = row_begin; nz < row_end; nz++) { diff --git a/test/matrix/csr_kernels.cpp b/test/matrix/csr_kernels.cpp index a2ab2b8f707..3c1e06e89c5 100644 --- a/test/matrix/csr_kernels.cpp +++ b/test/matrix/csr_kernels.cpp @@ -190,7 +190,7 @@ TYPED_TEST_SUITE(CsrLookup, gko::test::IndexTypes, TypenameNameGenerator); TYPED_TEST(CsrLookup, BuildLookupWorks) { using index_type = typename TestFixture::index_type; - using gko::matrix::sparsity_type; + using gko::matrix::csr::sparsity_type; const auto num_rows = this->mtx->get_size()[0]; const auto num_cols = this->mtx->get_size()[1]; gko::array row_desc_array(this->ref, num_rows); @@ -247,7 +247,7 @@ TYPED_TEST(CsrLookup, BuildLookupWorks) const auto row_begin = row_ptrs[row]; const auto row_end = row_ptrs[row + 1]; const auto row_nnz = row_end - row_begin; - gko::matrix::device_sparsity_lookup lookup{ + gko::matrix::csr::device_sparsity_lookup lookup{ row_ptrs, col_idxs, storage_offsets, From a851a72074f9e55e5f181ea2be49505d27e7d617 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 24 May 2022 12:30:51 +0200 Subject: [PATCH 8/8] review updates and fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * consistently cast to int64 for row_desc * fix compilation issues on Volta with HIP Co-authored-by: Thomas Grützmacher --- common/cuda_hip/matrix/csr_kernels.hpp.inc | 9 +++++---- dpcpp/matrix/csr_kernels.dp.cpp | 4 ++-- omp/matrix/csr_kernels.cpp | 4 ++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index f7fe907697a..3cdf7f24d6d 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -949,7 +949,7 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup( if (col_range == row_len && csr_lookup_allowed(allowed, sparsity_type::full)) { if (lane == 0) { - row_desc[row] = static_cast(sparsity_type::full); + row_desc[row] = static_cast(sparsity_type::full); } return; } @@ -960,7 +960,7 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup( csr_lookup_allowed(allowed, sparsity_type::bitmap)) { if (lane == 0) { row_desc[row] = (static_cast(num_blocks) << 32) | - static_cast(sparsity_type::bitmap); + static_cast(sparsity_type::bitmap); } const auto block_ranks = local_storage; const auto block_bitmaps = @@ -1016,7 +1016,7 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup( 1u | static_cast(available_storage * inv_golden_ratio); if (lane == 0) { row_desc[row] = (static_cast(hash_parameter) << 32) | - static_cast(sparsity_type::hash); + static_cast(sparsity_type::hash); } // fill hashmap with sentinel constexpr int32 empty = invalid_index(); @@ -1034,7 +1034,8 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup( static_cast(available_storage) : static_cast(available_storage + lane); // collision resolution -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700) +#if !defined(__HIP_DEVICE_COMPILE__) && defined(__CUDA_ARCH__) && \ + (__CUDA_ARCH__ >= 700) const auto this_lane_mask = config::lane_mask_type{1} << lane; const auto lane_prefix_mask = this_lane_mask - 1; // memory barrier to previous loop iteration diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 
c9575e3f2be..2381a98e463 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -2277,7 +2277,7 @@ bool csr_lookup_try_full(IndexType row_len, IndexType col_range, using matrix::csr::sparsity_type; bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::full); if (is_allowed && row_len == col_range) { - row_desc = static_cast(sparsity_type::full); + row_desc = static_cast(sparsity_type::full); return true; } return false; @@ -2297,7 +2297,7 @@ bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, static_cast(ceildiv(col_range, sparsity_bitmap_block_size)); if (is_allowed && num_blocks * 2 <= available_storage) { row_desc = (static_cast(num_blocks) << 32) | - static_cast(sparsity_type::bitmap); + static_cast(sparsity_type::bitmap); const auto block_ranks = local_storage; const auto block_bitmaps = reinterpret_cast(block_ranks + num_blocks); diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp index 6e3260eb5c9..f7226662b8e 100644 --- a/omp/matrix/csr_kernels.cpp +++ b/omp/matrix/csr_kernels.cpp @@ -1136,7 +1136,7 @@ bool csr_lookup_try_full(IndexType row_len, IndexType col_range, using matrix::csr::sparsity_type; bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::full); if (is_allowed && row_len == col_range) { - row_desc = static_cast(sparsity_type::full); + row_desc = static_cast(sparsity_type::full); return true; } return false; @@ -1155,7 +1155,7 @@ bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, const auto num_blocks = ceildiv(col_range, sparsity_bitmap_block_size); if (is_allowed && num_blocks * 2 <= available_storage) { row_desc = (static_cast(num_blocks) << 32) | - static_cast(sparsity_type::bitmap); + static_cast(sparsity_type::bitmap); const auto block_ranks = local_storage; const auto block_bitmaps = reinterpret_cast(block_ranks + num_blocks);
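The hash lookup path is spread across the CUDA/HIP, DPC++, OpenMP, and reference backends above, so a compact host-side sketch of the technique (multiplicative hashing with the inverse golden ratio plus linear probing) may help. It is an illustration only, not Ginkgo's API: `build_row_hashmap`, `lookup_col`, and the `std::vector` storage are invented for the example, and indices are simplified to 32 bits. As in the patch, termination of the probe loops relies on at least one slot staying empty, which `hashmap_storage = max(2 * row_len, 1)` together with `GKO_ASSERT(row_len < available_storage)` guarantees:

```
// Host-side sketch of the row-local hash lookup (illustration only).
// Assumptions: 32-bit column indices, -1 as the empty sentinel, table
// size max(2 * row_len, 1) so at least one slot stays empty.
#include <algorithm>
#include <cstdint>
#include <vector>

using int32 = std::int32_t;
using uint32 = std::uint32_t;

// Build a table mapping column index -> position within the row.
std::vector<int32> build_row_hashmap(const std::vector<int32>& cols,
                                     uint32& hash_parameter)
{
    const auto available_storage =
        std::max<int32>(2 * static_cast<int32>(cols.size()), 1);
    // the golden-ratio multiplier spreads consecutive column indices far
    // apart; setting the lowest bit keeps the multiplier odd
    constexpr double inv_golden_ratio = 0.61803398875;
    hash_parameter =
        1u | static_cast<uint32>(available_storage * inv_golden_ratio);
    std::vector<int32> table(available_storage, -1);
    for (int32 nz = 0; nz < static_cast<int32>(cols.size()); nz++) {
        auto hash = (static_cast<uint32>(cols[nz]) * hash_parameter) %
                    static_cast<uint32>(available_storage);
        // linear probing: advance to the next empty slot
        while (table[hash] != -1) {
            hash = (hash + 1) % available_storage;
        }
        table[hash] = nz;
    }
    return table;
}

// Query: probe until the column is found or an empty slot proves absence.
int32 lookup_col(const std::vector<int32>& table,
                 const std::vector<int32>& cols, int32 col,
                 uint32 hash_parameter)
{
    const auto size = static_cast<uint32>(table.size());
    auto hash = (static_cast<uint32>(col) * hash_parameter) % size;
    while (table[hash] != -1 && cols[table[hash]] != col) {
        hash = (hash + 1) % size;
    }
    return table[hash];  // in-row position, or -1 if absent
}
```

On the device side, the same construction runs per row with the table embedded in the shared `storage` array at the row's `storage_offsets` slice, and `hash_parameter` packed into the upper 32 bits of the 64-bit row descriptor alongside the `sparsity_type::hash` tag in the lower bits.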