diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc
index 652e8b0df6e..b9d1cc85195 100644
--- a/common/cuda_hip/matrix/csr_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc
@@ -736,15 +736,15 @@ __global__ __launch_bounds__(default_block_size) void inv_symm_permute(

 template <typename ValueType, typename IndexType>
 __global__
-    __launch_bounds__(default_block_size) void compute_submatrix_idxs_and_vals(
-        const size_type num_rows, const size_type num_cols,
-        const size_type row_offset, const size_type col_offset,
-        const IndexType* __restrict__ source_row_ptrs,
-        const IndexType* __restrict__ source_col_idxs,
-        const ValueType* __restrict__ source_values,
-        const IndexType* __restrict__ result_row_ptrs,
-        IndexType* __restrict__ result_col_idxs,
-        ValueType* __restrict__ result_values)
+__launch_bounds__(default_block_size) void compute_submatrix_idxs_and_vals(
+    const size_type num_rows, const size_type num_cols,
+    const size_type row_offset, const size_type col_offset,
+    const IndexType* __restrict__ source_row_ptrs,
+    const IndexType* __restrict__ source_col_idxs,
+    const ValueType* __restrict__ source_values,
+    const IndexType* __restrict__ result_row_ptrs,
+    IndexType* __restrict__ result_col_idxs,
+    ValueType* __restrict__ result_values)
 {
     const auto res_row = thread::get_thread_id_flat();
     if (res_row < num_rows) {
@@ -766,11 +766,10 @@ __global__

 template <typename IndexType>
 __global__
-    __launch_bounds__(default_block_size) void calculate_nnz_per_row_in_span(
-        const span row_span, const span col_span,
-        const IndexType* __restrict__ row_ptrs,
-        const IndexType* __restrict__ col_idxs,
-        IndexType* __restrict__ nnz_per_row)
+__launch_bounds__(default_block_size) void calculate_nnz_per_row_in_span(
+    const span row_span, const span col_span,
+    const IndexType* __restrict__ row_ptrs,
+    const IndexType* __restrict__ col_idxs, IndexType* __restrict__ nnz_per_row)
 {
     const auto src_row = thread::get_thread_id_flat() + row_span.begin;
     if (src_row < row_span.end) {
@@ -923,8 +922,8 @@ template <typename IndexType>
 __global__ __launch_bounds__(default_block_size) void build_csr_lookup(
     const IndexType* __restrict__ row_ptrs,
     const IndexType* __restrict__ col_idxs, size_type num_rows,
-    gko::matrix::sparsity_type allowed, int64* __restrict__ row_desc,
-    int32* __restrict__ storage)
+    gko::matrix::sparsity_type allowed, const IndexType* storage_offsets,
+    int64* __restrict__ row_desc, int32* __restrict__ storage)
 {
     constexpr int bitmap_block_size =
         gko::matrix::device_sparsity_lookup<IndexType>::block_size;
@@ -936,8 +935,8 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup(
         group::tiled_partition<subwarp_size>(group::this_thread_block());
     const auto row_begin = row_ptrs[row];
     const auto row_len = row_ptrs[row + 1] - row_begin;
-    const auto storage_begin = 2 * row_begin;
-    const auto available_storage = 2 * row_len;
+    const auto storage_begin = storage_offsets[row];
+    const auto available_storage = storage_offsets[row + 1] - storage_begin;
     const auto local_storage = storage + storage_begin;
     const auto local_cols = col_idxs + row_begin;
     const auto lane = subwarp.thread_rank();
@@ -947,8 +946,7 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup(

     // full column range
     if (col_range == row_len &&
-        ((static_cast<int>(allowed) &
-          static_cast<int>(matrix::sparsity_type::full)) != 0)) {
+        csr_lookup_allowed(allowed, matrix::sparsity_type::full)) {
         if (lane == 0) {
             row_desc[row] = static_cast<int64>(matrix::sparsity_type::full);
         }
@@ -958,8 +956,7 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup(
     const auto num_blocks =
         static_cast<int32>(ceildiv(col_range, bitmap_block_size));
     if (num_blocks * 2 <= available_storage &&
-        ((static_cast<int>(allowed) &
-          static_cast<int>(matrix::sparsity_type::bitmap)) != 0)) {
+        csr_lookup_allowed(allowed, matrix::sparsity_type::bitmap)) {
         if (lane == 0) {
             row_desc[row] = (static_cast<int64>(num_blocks) << 32) |
                             static_cast<int64>(matrix::sparsity_type::bitmap);
@@ -974,8 +971,9 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup(
         // fill bitmaps with sparsity pattern
         for (IndexType base_i = 0; base_i < row_len; base_i += subwarp_size) {
             const auto i = base_i + lane;
-            const auto col =
-                i < row_len ? local_cols[i] : device_numeric_limits<IndexType>::max;
+            const auto col = i < row_len
+                                 ? local_cols[i]
+                                 : device_numeric_limits<IndexType>::max;
             const auto rel_col = static_cast<int32>(col - min_col);
             const auto block = rel_col / bitmap_block_size;
             const auto col_in_block = rel_col % bitmap_block_size;
@@ -1007,6 +1005,8 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup(
         return;
     }
     // sparse hashmap storage
+    // we need at least one unfilled entry to avoid infinite loops on search
+    GKO_ASSERT(row_len < available_storage);
     constexpr double inv_golden_ratio = 0.61803398875;
     // use golden ratio as approximation for hash parameter that spreads
     // consecutive values as far apart as possible. Ensure lowest bit is set
@@ -1051,9 +1051,10 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup(
             if (hash >= available_storage) {
                 hash -= available_storage;
             }
-            // this can only fail for row_length < 16
-            // because then available_storage < 32 and
-            // colliding can have more than available_storage bits set.
+            // this could only fail for available_storage < warp_size, as
+            // popcnt(colliding) is at most warp_size. At the same time, we
+            // only increase hash by row_length at most, so this is still
+            // safe.
             GKO_ASSERT(hash < available_storage);
             entry = local_storage[hash];
         }
@@ -1083,13 +1084,15 @@ template <typename IndexType>
 void build_lookup(std::shared_ptr<const DefaultExecutor> exec,
                   const IndexType* row_ptrs, const IndexType* col_idxs,
                   size_type num_rows, matrix::sparsity_type allowed,
-                  int64* row_desc, int32* storage)
+                  const IndexType* storage_offsets, int64* row_desc,
+                  int32* storage)
 {
     const auto num_blocks =
         ceildiv(num_rows, default_block_size / config::warp_size);
     kernel::build_csr_lookup
         <<<num_blocks, default_block_size>>>(row_ptrs, col_idxs, num_rows,
-                                             allowed, row_desc, storage);
+                                             allowed, storage_offsets, row_desc,
+                                             storage);
 }

 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL);
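For orientation (not part of the patch): the 64-bit row descriptor written by these kernels keeps the sparsity type in its low bits and a type-specific payload in the upper 32 bits (the bitmap block count, or the hash parameter). A minimal host-side sketch, with illustrative helper names and the enum values assumed from the bit-flag usage in the tests:

```cpp
#include <cstdint>
#include <iostream>

// assumed flag values, matching the bitwise or/and usage in the patch
enum class sparsity_type : int { full = 1, bitmap = 2, hash = 4 };

// hypothetical helper: pack payload and type like the kernels above do
std::int64_t encode_desc(sparsity_type type, std::uint32_t payload)
{
    return (static_cast<std::int64_t>(payload) << 32) |
           static_cast<std::int64_t>(type);
}

int main()
{
    const auto desc = encode_desc(sparsity_type::bitmap, 3);  // 3 bitmap blocks
    // decoding mirrors `desc & 0xF` and `desc >> 32` in the lookup code
    const auto type = static_cast<sparsity_type>(desc & 0xF);
    const auto payload = static_cast<std::uint32_t>(desc >> 32);
    std::cout << static_cast<int>(type) << ' ' << payload << '\n';  // 2 3
}
```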
#include "common/unified/base/kernel_launch.hpp" +#include "core/components/prefix_sum_kernels.hpp" namespace gko { @@ -243,6 +244,49 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL); +template +void build_lookup_offsets(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, matrix::sparsity_type allowed, + IndexType* storage_offsets) +{ + using matrix::sparsity_type; + constexpr static int block_size = + gko::matrix::device_sparsity_lookup::block_size; + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto row_ptrs, auto col_idxs, auto num_rows, + auto allowed, auto storage_offsets) { + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto local_cols = col_idxs + row_begin; + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? local_cols[row_len - 1] - min_col + 1 : 0; + if (csr_lookup_allowed(allowed, sparsity_type::full) && + row_len == col_range) { + storage_offsets[row] = 0; + } else { + const auto hashmap_storage = row_len == 0 ? 1 : 2 * row_len; + const auto bitmap_num_blocks = + static_cast(ceildiv(col_range, block_size)); + const auto bitmap_storage = 2 * bitmap_num_blocks; + if (csr_lookup_allowed(allowed, sparsity_type::bitmap) && + bitmap_storage <= hashmap_storage) { + storage_offsets[row] = bitmap_storage; + } else { + storage_offsets[row] = hashmap_storage; + } + } + }, + num_rows, row_ptrs, col_idxs, num_rows, allowed, storage_offsets); + components::prefix_sum(exec, storage_offsets, num_rows + 1); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_CSR_BUILD_LOOKUP_OFFSETS_KERNEL); + + } // namespace csr } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp index 3b427808107..5e910385f20 100644 --- a/core/matrix/csr_kernels.hpp +++ b/core/matrix/csr_kernels.hpp @@ -225,11 +225,19 @@ namespace kernels { const matrix::Dense* beta, \ matrix::Csr* mtx) +#define GKO_DECLARE_CSR_BUILD_LOOKUP_OFFSETS_KERNEL(IndexType) \ + void build_lookup_offsets(std::shared_ptr exec, \ + const IndexType* row_ptrs, \ + const IndexType* col_idxs, size_type num_rows, \ + matrix::sparsity_type allowed, \ + IndexType* storage_offsets) + #define GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL(IndexType) \ void build_lookup(std::shared_ptr exec, \ const IndexType* row_ptrs, const IndexType* col_idxs, \ size_type num_rows, matrix::sparsity_type allowed, \ - int64* row_desc, int32* storage) + const IndexType* storage_offsets, int64* row_desc, \ + int32* storage) #define GKO_DECLARE_ALL_AS_TEMPLATES \ @@ -292,6 +300,8 @@ namespace kernels { template \ GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL(ValueType, IndexType); \ template \ + GKO_DECLARE_CSR_BUILD_LOOKUP_OFFSETS_KERNEL(IndexType); \ + template \ GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL(IndexType) diff --git a/core/matrix/csr_lookup.hpp b/core/matrix/csr_lookup.hpp index 120fcbfb20f..9dc61028065 100644 --- a/core/matrix/csr_lookup.hpp +++ b/core/matrix/csr_lookup.hpp @@ -78,7 +78,7 @@ enum class sparsity_type : int { * ``` * auto hash_key = col; * auto hash_bucket = hash(hash_key); - * while (col_idxs[hashtable[hash_bucket]] != col) { + * while (local_cols[hashtable[hash_bucket]] != col) { * hash_bucket = (hash_bucket + 1) % storage_size; // linear probing * } * auto output_idx = hashtable[hash_bucket]; @@ -108,11 +108,62 @@ struct device_sparsity_lookup { /** Number of bits in a block_type entry. 
diff --git a/core/matrix/csr_lookup.hpp b/core/matrix/csr_lookup.hpp
index 120fcbfb20f..9dc61028065 100644
--- a/core/matrix/csr_lookup.hpp
+++ b/core/matrix/csr_lookup.hpp
@@ -78,7 +78,7 @@ enum class sparsity_type : int {
 * ```
 * auto hash_key = col;
 * auto hash_bucket = hash(hash_key);
- * while (col_idxs[hashtable[hash_bucket]] != col) {
+ * while (local_cols[hashtable[hash_bucket]] != col) {
 *     hash_bucket = (hash_bucket + 1) % storage_size; // linear probing
 * }
 * auto output_idx = hashtable[hash_bucket];
@@ -108,11 +108,62 @@ struct device_sparsity_lookup {
     /** Number of bits in a block_type entry. */
     static constexpr int block_size = 32;

-    const IndexType* col_idxs;
-    const IndexType row_nnz;
-    const int32* storage;
-    const int64 desc;
+    /**
+     * Set up a device_sparsity_lookup structure from local data
+     *
+     * @param local_cols  the column array slice for the local row, it
+     *                    contains row_nnz entries
+     * @param row_nnz  the number of entries in this row
+     * @param local_storage  the local lookup structure storage array slice,
+     *                       it needs to be set up using the storage_offsets
+     *                       array
+     * @param storage_size  the number of int32 entries in the local lookup
+     *                      structure
+     * @param desc  the lookup structure descriptor for this row
+     */
+    GKO_ATTRIBUTES GKO_INLINE device_sparsity_lookup(
+        const IndexType* local_cols, IndexType row_nnz,
+        const int32* local_storage, IndexType storage_size, int64 desc)
+        : local_cols{local_cols},
+          row_nnz{row_nnz},
+          local_storage{local_storage},
+          storage_size{storage_size},
+          desc{desc}
+    {}
+
+    /**
+     * Set up a device_sparsity_lookup structure from global data
+     *
+     * @param row_ptrs  the CSR row pointers
+     * @param col_idxs  the CSR column indices
+     * @param storage_offsets  the storage offset array for the lookup
+     *                         structure
+     * @param storage  the storage array for the lookup data structure
+     * @param descs  the lookup structure descriptor array
+     * @param row  the row index to build the lookup structure for
+     */
+    GKO_ATTRIBUTES GKO_INLINE device_sparsity_lookup(
+        const IndexType* row_ptrs, const IndexType* col_idxs,
+        const IndexType* storage_offsets, const int32* storage,
+        const int64* descs, size_type row)
+    {
+        const auto row_begin = row_ptrs[row];
+        const auto row_end = row_ptrs[row + 1];
+        const auto storage_begin = storage_offsets[row];
+        const auto storage_end = storage_offsets[row + 1];
+        local_cols = col_idxs + row_begin;
+        row_nnz = row_end - row_begin;
+        local_storage = storage + storage_begin;
+        storage_size = storage_end - storage_begin;
+        desc = descs[row];
+    }

+    /**
+     * Returns the row-local index of the entry with the given column index,
+     * if it is present.
+     *
+     * @param col  the column index
+     * @return  the row-local index of the given entry, or
+     *          invalid_index<IndexType>() if it is not present.
+     */
     GKO_ATTRIBUTES GKO_INLINE IndexType operator[](IndexType col) const
     {
         switch (static_cast<sparsity_type>(desc & 0xF)) {
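To make the index arithmetic of the new global-data constructor explicit, here is a plain-C++ mirror of the slicing it performs (illustrative names, not the Ginkgo type): every member is a non-owning view into the shared CSR and lookup arrays.

```cpp
#include <cstddef>
#include <cstdint>

// hypothetical stand-in for device_sparsity_lookup's members
template <typename IndexType>
struct row_lookup_view {
    const IndexType* local_cols;
    IndexType row_nnz;
    const std::int32_t* local_storage;
    IndexType storage_size;
    std::int64_t desc;
};

// mirrors the slicing done by the global-data constructor above
template <typename IndexType>
row_lookup_view<IndexType> make_row_lookup_view(
    const IndexType* row_ptrs, const IndexType* col_idxs,
    const IndexType* storage_offsets, const std::int32_t* storage,
    const std::int64_t* descs, std::size_t row)
{
    const auto row_begin = row_ptrs[row];
    const auto storage_begin = storage_offsets[row];
    return {col_idxs + row_begin, row_ptrs[row + 1] - row_begin,
            storage + storage_begin,
            storage_offsets[row + 1] - storage_begin, descs[row]};
}
```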
@@ -126,45 +177,111 @@ struct device_sparsity_lookup {
         GKO_ASSERT(false);
     }

-    GKO_ATTRIBUTES GKO_INLINE IndexType lookup_full(IndexType col) const
+    /**
+     * Returns the row-local index of the entry with the given column index,
+     * assuming it is present.
+     *
+     * @param col  the column index
+     * @return  the row-local index of the given entry.
+     * @note  the function fails with an assertion (debug builds) or can
+     *        crash if the column index is not present in the row!
+     */
+    GKO_ATTRIBUTES GKO_INLINE IndexType lookup_unsafe(IndexType col) const
     {
-        const auto min_col = col_idxs[0];
+        IndexType result{};
+        switch (static_cast<sparsity_type>(desc & 0xF)) {
+        case sparsity_type::full:
+            result = lookup_full_unsafe(col);
+            break;
+        case sparsity_type::bitmap:
+            result = lookup_bitmap_unsafe(col);
+            break;
+        case sparsity_type::hash:
+            result = lookup_hash_unsafe(col);
+            break;
+        default:
+            GKO_ASSERT(false);
+        }
+        GKO_ASSERT(local_cols[result] == col);
+        return result;
+    }
+
+private:
+    using unsigned_index_type = typename std::make_unsigned<IndexType>::type;
+
+    const IndexType* local_cols;
+    IndexType row_nnz;
+    const int32* local_storage;
+    IndexType storage_size;
+    int64 desc;
+
+    GKO_ATTRIBUTES GKO_INLINE IndexType lookup_full_unsafe(IndexType col) const
+    {
+        const auto min_col = local_cols[0];
         const auto out_idx = col - min_col;
-        GKO_ASSERT(out_idx < row_nnz);
-        GKO_ASSERT(col_idxs[out_idx] == col);
+        GKO_ASSERT(out_idx >= 0 && out_idx < row_nnz);
         return out_idx;
     }

-    GKO_ATTRIBUTES GKO_INLINE IndexType lookup_bitmap(IndexType col) const
+    GKO_ATTRIBUTES GKO_INLINE IndexType lookup_full(IndexType col) const
+    {
+        const auto min_col = local_cols[0];
+        const auto out_idx = col - min_col;
+        return out_idx >= 0 && out_idx < row_nnz ? out_idx
+                                                 : invalid_index<IndexType>();
+    }
+
+    GKO_ATTRIBUTES GKO_INLINE IndexType
+    lookup_bitmap_unsafe(IndexType col) const
     {
-        const auto min_col = col_idxs[0];
+        const auto min_col = local_cols[0];
         const auto num_blocks = static_cast<int32>(desc >> 32);
-        const auto block_bases = storage;
+        const auto block_bases = local_storage;
         const auto block_bitmaps =
             reinterpret_cast<const uint32*>(block_bases + num_blocks);
         const auto rel_col = col - min_col;
         const auto block = rel_col / block_size;
         const auto col_in_block = rel_col % block_size;
         const auto prefix_mask = (uint32{1} << col_in_block) - 1;
+        GKO_ASSERT(rel_col >= 0);
         GKO_ASSERT(block < num_blocks);
         GKO_ASSERT(block_bitmaps[block] & (uint32{1} << col_in_block));
         const auto out_idx =
             block_bases[block] +
             gko::detail::popcount(block_bitmaps[block] & prefix_mask);
-        GKO_ASSERT(col_idxs[out_idx] == col);
+        GKO_ASSERT(local_cols[out_idx] == col);
         return out_idx;
     }

-    GKO_ATTRIBUTES GKO_INLINE IndexType lookup_hash(IndexType col) const
+    GKO_ATTRIBUTES GKO_INLINE IndexType lookup_bitmap(IndexType col) const
+    {
+        const auto min_col = local_cols[0];
+        const auto num_blocks = static_cast<int32>(desc >> 32);
+        const auto block_bases = local_storage;
+        const auto block_bitmaps =
+            reinterpret_cast<const uint32*>(block_bases + num_blocks);
+        const auto rel_col = col - min_col;
+        const auto block = rel_col / block_size;
+        const auto col_in_block = rel_col % block_size;
+        if (rel_col < 0 || block >= num_blocks ||
+            !(block_bitmaps[block] & (uint32{1} << col_in_block))) {
+            return invalid_index<IndexType>();
+        }
+        const auto prefix_mask = (uint32{1} << col_in_block) - 1;
+        return block_bases[block] +
+               gko::detail::popcount(block_bitmaps[block] & prefix_mask);
+    }
+
+    GKO_ATTRIBUTES GKO_INLINE IndexType lookup_hash_unsafe(IndexType col) const
     {
-        const auto hashmap_size = static_cast<unsigned_index_type>(2 * row_nnz);
+        const auto hashmap_size = static_cast<unsigned_index_type>(storage_size);
         const auto hash_param = static_cast<unsigned_index_type>(desc >> 32);
-        const auto hashmap = storage;
-        auto hash = (static_cast<unsigned_index_type>(col) * hash_param) % hashmap_size;
+        const auto hashmap = local_storage;
+        auto hash =
+            (static_cast<unsigned_index_type>(col) * hash_param) % hashmap_size;
         GKO_ASSERT(hashmap[hash] >= 0);
         GKO_ASSERT(hashmap[hash] < row_nnz);
-        // linear probing with sentinel to avoid infinite loop
-        while (col_idxs[hashmap[hash]] != col) {
+        while (local_cols[hashmap[hash]] != col) {
             hash++;
             if (hash >= hashmap_size) {
                 hash = 0;
@@ -173,7 +290,28 @@ struct device_sparsity_lookup {
             GKO_ASSERT(hashmap[hash] < row_nnz);
         }
         const auto out_idx = hashmap[hash];
-        GKO_ASSERT(col_idxs[out_idx] == col);
+        return out_idx;
+    }
+
+    GKO_ATTRIBUTES GKO_INLINE IndexType lookup_hash(IndexType col) const
+    {
+        const auto hashmap_size = static_cast<unsigned_index_type>(storage_size);
+        const auto hash_param = static_cast<unsigned_index_type>(desc >> 32);
+        const auto hashmap = local_storage;
+        auto hash =
+            (static_cast<unsigned_index_type>(col) * hash_param) % hashmap_size;
+        GKO_ASSERT(hashmap[hash] < row_nnz);
+        auto out_idx = hashmap[hash];
+        // linear probing with invalid_index sentinel to avoid infinite loop
+        while (out_idx >= 0 && local_cols[out_idx] != col) {
+            hash++;
+            if (hash >= hashmap_size) {
+                hash = 0;
+            }
+            out_idx = hashmap[hash];
+            GKO_ASSERT(hashmap[hash] < row_nnz);
+        }
+        // out_idx is either correct or invalid_index, the hashmap sentinel
         return out_idx;
     }
 };
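A hedged usage sketch for the two lookup flavors added above: `operator[]` is the safe variant that reports missing columns via the sentinel, while `lookup_unsafe` skips that check and is only valid when the entry is known to exist. The function below is illustrative; it assumes arrays produced by `build_lookup_offsets` + `build_lookup`, and it includes the internal (non-public) header from this patch.

```cpp
#include "core/matrix/csr_lookup.hpp"  // internal Ginkgo header, not public API

// Hypothetical helper: accumulate (col, val) pairs into an existing CSR
// sparsity pattern of one row via the lookup structure.
template <typename ValueType, typename IndexType>
void add_into_row(const IndexType* row_ptrs, const IndexType* col_idxs,
                  const IndexType* storage_offsets, const gko::int32* storage,
                  const gko::int64* descs, gko::size_type row,
                  const IndexType* cols, const ValueType* vals,
                  IndexType num_entries, ValueType* csr_values)
{
    const gko::matrix::device_sparsity_lookup<IndexType> lookup{
        row_ptrs, col_idxs, storage_offsets, storage, descs, row};
    const auto row_begin = row_ptrs[row];
    for (IndexType i = 0; i < num_entries; i++) {
        // safe variant: tolerates columns missing from the pattern
        const auto local_idx = lookup[cols[i]];
        if (local_idx != gko::invalid_index<IndexType>()) {
            csr_values[row_begin + local_idx] += vals[i];
        }
        // unsafe variant (only if cols[i] is guaranteed to be present):
        // csr_values[row_begin + lookup.lookup_unsafe(cols[i])] += vals[i];
    }
}
```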
diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp
index 677a2627897..30d29bcd002 100644
--- a/dpcpp/matrix/csr_kernels.dp.cpp
+++ b/dpcpp/matrix/csr_kernels.dp.cpp
@@ -2324,7 +2324,13 @@ void csr_lookup_build_hash(IndexType row_len, IndexType available_storage,
                            int64& row_desc, int32* local_storage,
                            const IndexType* cols)
 {
+    // we need at least one unfilled entry to avoid infinite loops on search
+    GKO_ASSERT(row_len < available_storage);
+#if GINKGO_DPCPP_SINGLE_MODE
+    constexpr float inv_golden_ratio = 0.61803398875f;
+#else
     constexpr double inv_golden_ratio = 0.61803398875;
+#endif
     // use golden ratio as approximation for hash parameter that spreads
     // consecutive values as far apart as possible. Ensure lowest bit is set
     // otherwise we skip odd hashtable entries
@@ -2352,15 +2358,17 @@ template <typename IndexType>
 void build_lookup(std::shared_ptr<const DpcppExecutor> exec,
                   const IndexType* row_ptrs, const IndexType* col_idxs,
                   size_type num_rows, matrix::sparsity_type allowed,
-                  int64* row_desc, int32* storage)
+                  const IndexType* storage_offsets, int64* row_desc,
+                  int32* storage)
 {
     exec->get_queue()->submit([&](sycl::handler& cgh) {
         cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) {
             const auto row = static_cast<size_type>(idx[0]);
             const auto row_begin = row_ptrs[row];
             const auto row_len = row_ptrs[row + 1] - row_begin;
-            const auto storage_begin = 2 * row_begin;
-            const auto available_storage = 2 * row_len;
+            const auto storage_begin = storage_offsets[row];
+            const auto available_storage =
+                storage_offsets[row + 1] - storage_begin;
             const auto local_storage = storage + storage_begin;
             const auto local_cols = col_idxs + row_begin;
             const auto min_col = row_len > 0 ? local_cols[0] : 0;
diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp
index d7263b63a5e..2a8cfa5bec0 100644
--- a/omp/matrix/csr_kernels.cpp
+++ b/omp/matrix/csr_kernels.cpp
@@ -1183,6 +1183,8 @@ void csr_lookup_build_hash(IndexType row_len, IndexType available_storage,
                            int64& row_desc, int32* local_storage,
                            const IndexType* cols)
 {
+    // we need at least one unfilled entry to avoid infinite loops on search
+    GKO_ASSERT(row_len < available_storage);
     constexpr double inv_golden_ratio = 0.61803398875;
     // use golden ratio as approximation for hash parameter that spreads
     // consecutive values as far apart as possible. Ensure lowest bit is set
@@ -1213,14 +1215,15 @@ template <typename IndexType>
 void build_lookup(std::shared_ptr<const OmpExecutor> exec,
                   const IndexType* row_ptrs, const IndexType* col_idxs,
                   size_type num_rows, matrix::sparsity_type allowed,
-                  int64* row_desc, int32* storage)
+                  const IndexType* storage_offsets, int64* row_desc,
+                  int32* storage)
 {
 #pragma omp parallel for
     for (size_type row = 0; row < num_rows; row++) {
         const auto row_begin = row_ptrs[row];
         const auto row_len = row_ptrs[row + 1] - row_begin;
-        const auto storage_begin = 2 * row_begin;
-        const auto available_storage = 2 * row_len;
+        const auto storage_begin = storage_offsets[row];
+        const auto available_storage = storage_offsets[row + 1] - storage_begin;
         const auto local_storage = storage + storage_begin;
         const auto local_cols = col_idxs + row_begin;
         const auto min_col = row_len > 0 ? local_cols[0] : 0;
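For context on the golden-ratio comment repeated in these kernels: a standalone model of the multiplicative hashing, assuming the parameter is derived as `available_storage / golden_ratio` with the lowest bit forced to 1 so that, under linear probing, odd hashtable slots remain reachable (the exact derivation line is outside the shown hunks).

```cpp
#include <cstdint>
#include <iostream>

int main()
{
    constexpr double inv_golden_ratio = 0.61803398875;
    const std::uint32_t available_storage = 16;  // 2 * row_len for row_len = 8
    // force the lowest bit to 1, as the kernel comments require
    const auto hash_param =
        1u | static_cast<std::uint32_t>(available_storage * inv_golden_ratio);
    for (std::uint32_t col : {10, 11, 12, 13}) {
        // consecutive columns are spread apart instead of clustering
        std::cout << col << " -> " << (col * hash_param) % available_storage
                  << '\n';
    }
}
```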
diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp
index 1397d069b59..a0e50da6452 100644
--- a/reference/matrix/csr_kernels.cpp
+++ b/reference/matrix/csr_kernels.cpp
@@ -1118,6 +1118,45 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL);


+template <typename IndexType>
+void build_lookup_offsets(std::shared_ptr<const ReferenceExecutor> exec,
+                          const IndexType* row_ptrs, const IndexType* col_idxs,
+                          size_type num_rows, matrix::sparsity_type allowed,
+                          IndexType* storage_offsets)
+{
+    using matrix::sparsity_type;
+    constexpr static int block_size =
+        gko::matrix::device_sparsity_lookup<IndexType>::block_size;
+    for (size_type row = 0; row < num_rows; row++) {
+        const auto row_begin = row_ptrs[row];
+        const auto row_len = row_ptrs[row + 1] - row_begin;
+        const auto local_cols = col_idxs + row_begin;
+        const auto min_col = row_len > 0 ? local_cols[0] : 0;
+        const auto col_range =
+            row_len > 0 ? local_cols[row_len - 1] - min_col + 1 : 0;
+        if (csr_lookup_allowed(allowed, sparsity_type::full) &&
+            row_len == col_range) {
+            storage_offsets[row] = 0;
+        } else {
+            const auto hashmap_storage = std::max<IndexType>(2 * row_len, 1);
+            const auto bitmap_num_blocks =
+                static_cast<int32>(ceildiv(col_range, block_size));
+            const auto bitmap_storage = 2 * bitmap_num_blocks;
+            if (csr_lookup_allowed(allowed, sparsity_type::bitmap) &&
+                bitmap_storage <= hashmap_storage) {
+                storage_offsets[row] = bitmap_storage;
+            } else {
+                storage_offsets[row] = hashmap_storage;
+            }
+        }
+    }
+    components::prefix_sum(exec, storage_offsets, num_rows + 1);
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
+    GKO_DECLARE_CSR_BUILD_LOOKUP_OFFSETS_KERNEL);
+
+
 template <typename IndexType>
 bool csr_lookup_try_full(IndexType row_len, IndexType col_range,
                          matrix::sparsity_type allowed, int64& row_desc)
@@ -1172,6 +1211,8 @@ void csr_lookup_build_hash(IndexType row_len, IndexType available_storage,
                            int64& row_desc, int32* local_storage,
                            const IndexType* cols)
 {
+    // we need at least one unfilled entry to avoid infinite loops on search
+    GKO_ASSERT(row_len < available_storage);
     constexpr double inv_golden_ratio = 0.61803398875;
     // use golden ratio as approximation for hash parameter that spreads
     // consecutive values as far apart as possible. Ensure lowest bit is set
@@ -1202,13 +1243,14 @@ template <typename IndexType>
 void build_lookup(std::shared_ptr<const ReferenceExecutor> exec,
                   const IndexType* row_ptrs, const IndexType* col_idxs,
                   size_type num_rows, matrix::sparsity_type allowed,
-                  int64* row_desc, int32* storage)
+                  const IndexType* storage_offsets, int64* row_desc,
+                  int32* storage)
 {
     for (size_type row = 0; row < num_rows; row++) {
         const auto row_begin = row_ptrs[row];
         const auto row_len = row_ptrs[row + 1] - row_begin;
-        const auto storage_begin = 2 * row_begin;
-        const auto available_storage = 2 * row_len;
+        const auto storage_begin = storage_offsets[row];
+        const auto available_storage = storage_offsets[row + 1] - storage_begin;
         const auto local_storage = storage + storage_begin;
         const auto local_cols = col_idxs + row_begin;
         const auto min_col = row_len > 0 ? local_cols[0] : 0;
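The `GKO_ASSERT(row_len < available_storage)` added in all four backends guarantees the probing loops terminate. A self-contained model of why: linear probing over a table with no empty (sentinel) slot would loop forever when searching for a missing key; with at least one empty slot, every probe sequence stops. Here `-1` plays the role of the `invalid_index` sentinel.

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    const std::vector<int> table{2, -1, 0, 1};  // one empty slot at index 1
    const std::vector<int> cols{5, 7, 9};       // columns stored in this row
    const int wanted = 8;                       // not present in the row
    std::size_t hash = 3 % table.size();
    while (table[hash] >= 0 && cols[table[hash]] != wanted) {
        hash = (hash + 1) % table.size();  // must eventually hit the -1 slot
    }
    std::cout << (table[hash] < 0 ? "not found" : "found") << '\n';
}
```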
diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp
index 09a341ccfae..a3d6c47efc5 100644
--- a/reference/test/matrix/csr_kernels.cpp
+++ b/reference/test/matrix/csr_kernels.cpp
@@ -1892,78 +1892,152 @@ TYPED_TEST(Csr, CanGetSubmatrixWithindex_set)
 }


-TYPED_TEST(Csr, GeneratesLookupData)
+template <typename ValueIndexType>
+class CsrLookup : public ::testing::Test {
+protected:
+    using value_type =
+        typename std::tuple_element<0, decltype(ValueIndexType())>::type;
+    using index_type =
+        typename std::tuple_element<1, decltype(ValueIndexType())>::type;
+    using Mtx = gko::matrix::Csr<value_type, index_type>;
+
+    CsrLookup() : exec(gko::ReferenceExecutor::create())
+    {
+        mtx = Mtx::create(this->exec);
+        typename Mtx::mat_data data{gko::dim<2>{6, 65}};
+        for (int i = 0; i < 65; i++) {
+            // row 0: empty row
+            // row 1: full row
+            data.nonzeros.emplace_back(1, i, 1.0);
+            // row 2: pretty dense row
+            if (i % 3 == 0) {
+                data.nonzeros.emplace_back(2, i, 1.0);
+            }
+            // row 4-5: contiguous row
+            if (i >= 10 && i < 30) {
+                data.nonzeros.emplace_back(4, i, 1.0);
+            }
+            if (i >= 2 && i < 6) {
+                data.nonzeros.emplace_back(5, i, 1.0);
+            }
+        }
+        // row 3: very sparse
+        data.nonzeros.emplace_back(3, 0, 1.0);
+        data.nonzeros.emplace_back(3, 64, 1.0);
+        data.ensure_row_major_order();
+        // 1000 as min-sentinel
+        full_sizes = {0, 0, 1000, 1000, 0, 0};
+        bitmap_sizes = {0, 6, 4, 6, 2, 2};
+        hash_sizes = {1, 130, 44, 4, 40, 8};
+        mtx->read(data);
+    }
+
+    std::shared_ptr<const gko::ReferenceExecutor> exec;
+    std::unique_ptr<Mtx> mtx;
+    std::vector<index_type> full_sizes;
+    std::vector<index_type> bitmap_sizes;
+    std::vector<index_type> hash_sizes;
+    index_type invalid_index = gko::invalid_index<index_type>();
+};
+
+TYPED_TEST_SUITE(CsrLookup, gko::test::ValueIndexTypes,
+                 PairTypenameNameGenerator);
+
+TYPED_TEST(CsrLookup, GeneratesLookupDataOffsets)
 {
-    using Mtx = typename TestFixture::Mtx;
-    using IndexType = typename Mtx::index_type;
+    using IndexType = typename TestFixture::index_type;
     using gko::matrix::sparsity_type;
-    auto mtx = Mtx::create(this->exec);
-    typename Mtx::mat_data data{gko::dim<2>{6, 65}};
-    for (int i = 0; i < 65; i++) {
-        // row 0: empty row
-        // row 1: full row
-        data.nonzeros.emplace_back(1, i, 1.0);
-        // row 2: pretty dense row
-        if (i % 3 == 0) {
-            data.nonzeros.emplace_back(2, i, 1.0);
-        }
-        // row 4-5: contiguous row
-        if (i >= 10 && i < 30) {
-            data.nonzeros.emplace_back(4, i, 1.0);
-        }
-        if (i >= 2 && i < 6) {
-            data.nonzeros.emplace_back(5, i, 1.0);
+    const auto num_rows = this->mtx->get_size()[0];
+    gko::array<IndexType> storage_offset_array(this->exec, num_rows + 1);
+    const auto storage_offsets = storage_offset_array.get_data();
+    const auto row_ptrs = this->mtx->get_const_row_ptrs();
+    const auto col_idxs = this->mtx->get_const_col_idxs();
+
+    for (auto allowed :
+         {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash,
+          sparsity_type::bitmap | sparsity_type::hash,
+          sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) {
+        gko::kernels::reference::csr::build_lookup_offsets(
+            this->exec, row_ptrs, col_idxs, num_rows, allowed, storage_offsets);
+        bool allow_full =
+            gko::matrix::csr_lookup_allowed(allowed, sparsity_type::full);
+        bool allow_bitmap =
+            gko::matrix::csr_lookup_allowed(allowed, sparsity_type::bitmap);
+
+        for (gko::size_type row = 0; row < num_rows; row++) {
+            const auto expected_size =
+                std::min(allow_full ? this->full_sizes[row] : 1000,
+                         std::min(allow_bitmap ? this->bitmap_sizes[row] : 1000,
+                                  this->hash_sizes[row]));
+            const auto size = storage_offsets[row + 1] - storage_offsets[row];
+
+            ASSERT_EQ(size, expected_size);
         }
     }
-    // row 3: very sparse
-    data.nonzeros.emplace_back(3, 0, 1.0);
-    data.nonzeros.emplace_back(3, 64, 1.0);
-    data.ensure_row_major_order();
-    mtx->read(data);
-    gko::array<gko::int64> row_descs(this->exec, mtx->get_size()[0]);
-    gko::array<gko::int32> lookup_info(this->exec,
-                                       mtx->get_num_stored_elements() * 2);
-    for (auto allowed_methods :
+}
+
+
+TYPED_TEST(CsrLookup, GeneratesLookupData)
+{
+    using IndexType = typename TestFixture::index_type;
+    using gko::matrix::sparsity_type;
+    const auto num_rows = this->mtx->get_size()[0];
+    const auto num_cols = this->mtx->get_size()[1];
+    gko::array<gko::int64> row_desc_array(this->exec, num_rows);
+    gko::array<IndexType> storage_offset_array(this->exec, num_rows + 1);
+    const auto row_descs = row_desc_array.get_data();
+    const auto storage_offsets = storage_offset_array.get_data();
+    const auto row_ptrs = this->mtx->get_const_row_ptrs();
+    const auto col_idxs = this->mtx->get_const_col_idxs();
+    for (auto allowed :
         {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash,
          sparsity_type::bitmap | sparsity_type::hash,
          sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) {
-        const auto full_allowed =
-            static_cast<bool>(static_cast<int>(allowed_methods) &
-                              static_cast<int>(sparsity_type::full));
-        const auto bitmap_allowed =
-            static_cast<bool>(static_cast<int>(allowed_methods) &
-                              static_cast<int>(sparsity_type::bitmap));
+        gko::kernels::reference::csr::build_lookup_offsets(
+            this->exec, row_ptrs, col_idxs, num_rows, allowed, storage_offsets);
+        gko::array<gko::int32> storage_array(this->exec,
+                                             storage_offsets[num_rows]);
+        const auto storage = storage_array.get_data();
         const auto bitmap_equivalent =
-            bitmap_allowed ? sparsity_type::bitmap : sparsity_type::hash;
+            csr_lookup_allowed(allowed, sparsity_type::bitmap)
+                ? sparsity_type::bitmap
+                : sparsity_type::hash;
         const auto full_equivalent =
-            full_allowed ? sparsity_type::full : bitmap_equivalent;
-        SCOPED_TRACE("full: " + std::to_string(full_allowed) +
-                     " bitmap: " + std::to_string(bitmap_allowed));
+            csr_lookup_allowed(allowed, sparsity_type::full)
+                ? sparsity_type::full
+                : bitmap_equivalent;
         gko::kernels::reference::csr::build_lookup(
-            this->exec, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(),
-            mtx->get_size()[0], allowed_methods, row_descs.get_data(),
-            lookup_info.get_data());
-
-        const auto descs = row_descs.get_const_data();
-        for (auto entry : data.nonzeros) {
-            const auto row = entry.row;
-            const auto col = entry.column;
-            const auto row_begin = mtx->get_const_row_ptrs()[row];
-            const auto row_nnz = mtx->get_const_row_ptrs()[row + 1] - row_begin;
-            gko::matrix::device_sparsity_lookup<IndexType> lookup{
-                mtx->get_const_col_idxs() + row_begin, row_nnz,
-                lookup_info.get_const_data() + (row_begin * 2), descs[row]};
+            this->exec, row_ptrs, col_idxs, num_rows, allowed, storage_offsets,
+            row_descs, storage);

-            const auto nz = lookup[col] + row_begin;
-            ASSERT_EQ(mtx->get_const_col_idxs()[nz], col);
+        for (int row = 0; row < num_rows; row++) {
+            const auto row_begin = row_ptrs[row];
+            const auto row_end = row_ptrs[row + 1];
+            gko::matrix::device_sparsity_lookup<IndexType> lookup{
+                row_ptrs, col_idxs, storage_offsets,
+                storage,  row_descs, static_cast<gko::size_type>(row)};
+            for (auto nz = row_begin; nz < row_end; nz++) {
+                const auto col = col_idxs[nz];
+                ASSERT_EQ(lookup.lookup_unsafe(col) + row_begin, nz);
+            }
+            auto nz = row_begin;
+            for (int col = 0; col < num_cols; col++) {
+                auto found_nz = lookup[col];
+                if (nz < row_end && col_idxs[nz] == col) {
+                    ASSERT_EQ(found_nz, nz - row_begin);
+                    nz++;
+                } else {
+                    ASSERT_EQ(found_nz, this->invalid_index);
+                }
+            }
         }
-        ASSERT_EQ(descs[0] & 0xFFFF, static_cast<int64>(full_equivalent));
-        ASSERT_EQ(descs[1] & 0xFFFF, static_cast<int64>(full_equivalent));
-        ASSERT_EQ(descs[2] & 0xFFFF, static_cast<int64>(bitmap_equivalent));
-        ASSERT_EQ(descs[3] & 0xFFFF, static_cast<int64>(sparsity_type::hash));
-        ASSERT_EQ(descs[4] & 0xFFFF, static_cast<int64>(full_equivalent));
-        ASSERT_EQ(descs[5] & 0xFFFF, static_cast<int64>(full_equivalent));
+        ASSERT_EQ(row_descs[0] & 0xF, static_cast<int64>(full_equivalent));
+        ASSERT_EQ(row_descs[1] & 0xF, static_cast<int64>(full_equivalent));
+        ASSERT_EQ(row_descs[2] & 0xF, static_cast<int64>(bitmap_equivalent));
+        ASSERT_EQ(row_descs[3] & 0xF, static_cast<int64>(sparsity_type::hash));
+        ASSERT_EQ(row_descs[4] & 0xF, static_cast<int64>(full_equivalent));
+        ASSERT_EQ(row_descs[5] & 0xF, static_cast<int64>(full_equivalent));
     }
 }
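The expected sizes in the fixture follow directly from the storage-selection rules of `build_lookup_offsets`. A small self-checking example for row 2 of the 6x65 test matrix (columns 0, 3, ..., 63):

```cpp
#include <cassert>

int main()
{
    const int row_len = 22;    // nonzeros at columns 0, 3, ..., 63
    const int col_range = 64;  // 63 - 0 + 1
    const int block_size = 32;
    const int bitmap_storage = 2 * ((col_range + block_size - 1) / block_size);
    const int hash_storage = 2 * row_len;
    assert(row_len != col_range);  // "full" does not apply -> 1000 sentinel
    assert(bitmap_storage == 4);   // matches bitmap_sizes[2]
    assert(hash_storage == 44);    // matches hash_sizes[2]
}
```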
diff --git a/test/matrix/csr_kernels.cpp b/test/matrix/csr_kernels.cpp
index a19fa1ad8c8..a2ab2b8f707 100644
--- a/test/matrix/csr_kernels.cpp
+++ b/test/matrix/csr_kernels.cpp
@@ -132,72 +132,143 @@ TEST_F(Csr, InvScaleIsEquivalentToRef)
     GKO_ASSERT_MTX_NEAR(dx, x, r<value_type>::value);
 }

-TEST_F(Csr, BuildLookupDataWorks)
+
+template <typename IndexType>
+class CsrLookup : public ::testing::Test {
+protected:
+    using value_type = float;
+    using index_type = IndexType;
+    using Mtx = gko::matrix::Csr<value_type, index_type>;
+
+    CsrLookup() : rand_engine(15) {}
+
+    void SetUp()
+    {
+        ref = gko::ReferenceExecutor::create();
+        init_executor(ref, exec);
+        auto data =
+            gko::test::generate_random_matrix_data<value_type, index_type>(
+                628, 923, std::uniform_int_distribution<index_type>(10, 300),
+                std::normal_distribution<gko::remove_complex<value_type>>(-1.0,
+                                                                          1.0),
+                rand_engine);
+        // create a few empty rows
+        data.nonzeros.erase(
+            std::remove_if(data.nonzeros.begin(), data.nonzeros.end(),
+                           [](auto entry) { return entry.row % 200 == 21; }),
+            data.nonzeros.end());
+        // insert a full row and a pretty dense row
+        for (int i = 0; i < 100; i++) {
+            data.nonzeros.emplace_back(221, i + 100, 1.0);
+            data.nonzeros.emplace_back(421, i * 3 + 100, 2.0);
+        }
+        data.ensure_row_major_order();
+        // initialize the matrices
+        mtx = Mtx::create(ref);
+        mtx->read(data);
+        dmtx = gko::clone(exec, mtx);
+    }
+
+    void TearDown()
+    {
+        if (exec != nullptr) {
+            ASSERT_NO_THROW(exec->synchronize());
+        }
+    }
+
+    std::default_random_engine rand_engine;
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::EXEC_TYPE> exec;
+    std::unique_ptr<Mtx> mtx;
+    std::unique_ptr<Mtx> dmtx;
+    index_type invalid_index = gko::invalid_index<index_type>();
+};
+
+TYPED_TEST_SUITE(CsrLookup, gko::test::IndexTypes, TypenameNameGenerator);
+
+
+TYPED_TEST(CsrLookup, BuildLookupWorks)
 {
+    using index_type = typename TestFixture::index_type;
     using gko::matrix::sparsity_type;
-    auto data = gko::test::generate_random_matrix_data<value_type, index_type>(
-        628, 923, std::uniform_int_distribution<>(10, 300),
-        std::normal_distribution<value_type>(-1.0, 1.0), rand_engine);
-    // create a few empty rows
-    data.nonzeros.erase(
-        std::remove_if(data.nonzeros.begin(), data.nonzeros.end(),
-                       [](auto entry) { return entry.row % 200 == 21; }),
-        data.nonzeros.end());
-    // insert a full row and a pretty dense row
-    for (int i = 0; i < 100; i++) {
-        data.nonzeros.emplace_back(221, i + 100, 1.0);
-        data.nonzeros.emplace_back(421, i * 3 + 100, 2.0);
-    }
-    data.ensure_row_major_order();
-    // initialize the matrices
-    auto mtx = Mtx::create(ref);
-    mtx->read(data);
-    auto dmtx = gko::clone(exec, mtx);
-    gko::array<gko::int64> row_descs(ref, mtx->get_size()[0]);
-    gko::array<gko::int32> lookup_info(ref, mtx->get_num_stored_elements() * 2);
-    gko::array<gko::int64> drow_descs(exec, mtx->get_size()[0]);
-    gko::array<gko::int32> dlookup_info(exec,
-                                        mtx->get_num_stored_elements() * 2);
-    for (auto allowed_methods :
+    const auto num_rows = this->mtx->get_size()[0];
+    const auto num_cols = this->mtx->get_size()[1];
+    gko::array<gko::int64> row_desc_array(this->ref, num_rows);
+    gko::array<gko::int64> drow_desc_array(this->exec, num_rows);
+    gko::array<index_type> storage_offset_array(this->ref, num_rows + 1);
+    gko::array<index_type> dstorage_offset_array(this->exec, num_rows + 1);
+    const auto row_descs = row_desc_array.get_data();
+    const auto drow_descs = drow_desc_array.get_data();
+    const auto row_ptrs = this->mtx->get_const_row_ptrs();
+    const auto col_idxs = this->mtx->get_const_col_idxs();
+    const auto drow_ptrs = this->dmtx->get_const_row_ptrs();
+    const auto dcol_idxs = this->dmtx->get_const_col_idxs();
+    const auto storage_offsets = storage_offset_array.get_data();
+    const auto dstorage_offsets = dstorage_offset_array.get_data();
+    for (auto allowed :
         {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash,
          sparsity_type::bitmap | sparsity_type::hash,
          sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) {
-        const auto full_allowed =
-            static_cast<bool>(static_cast<int>(allowed_methods) &
-                              static_cast<int>(sparsity_type::full));
-        const auto bitmap_allowed =
-            static_cast<bool>(static_cast<int>(allowed_methods) &
-                              static_cast<int>(sparsity_type::bitmap));
+        // check that storage offsets are calculated correctly
+        // otherwise things might crash
+        gko::kernels::reference::csr::build_lookup_offsets(
+            this->ref, row_ptrs, col_idxs, num_rows, allowed, storage_offsets);
+        gko::kernels::EXEC_NAMESPACE::csr::build_lookup_offsets(
+            this->exec, drow_ptrs, dcol_idxs, num_rows, allowed,
+            dstorage_offsets);
+
+        GKO_ASSERT_ARRAY_EQ(storage_offset_array, dstorage_offset_array);
+
+        gko::array<gko::int32> storage_array(this->ref,
+                                             storage_offsets[num_rows]);
+        gko::array<gko::int32> dstorage_array(this->exec,
+                                              storage_offsets[num_rows]);
+        const auto storage = storage_array.get_data();
+        const auto dstorage = dstorage_array.get_data();
         const auto bitmap_equivalent =
-            bitmap_allowed ? sparsity_type::bitmap : sparsity_type::hash;
+            csr_lookup_allowed(allowed, sparsity_type::bitmap)
+                ? sparsity_type::bitmap
+                : sparsity_type::hash;
         const auto full_equivalent =
-            full_allowed ? sparsity_type::full : bitmap_equivalent;
-        SCOPED_TRACE("full: " + std::to_string(full_allowed) +
-                     " bitmap: " + std::to_string(bitmap_allowed));
+            csr_lookup_allowed(allowed, sparsity_type::full)
+                ? sparsity_type::full
+                : bitmap_equivalent;
         gko::kernels::reference::csr::build_lookup(
-            ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(),
-            mtx->get_size()[0], allowed_methods, row_descs.get_data(),
-            lookup_info.get_data());
+            this->ref, row_ptrs, col_idxs, num_rows, allowed, storage_offsets,
+            row_descs, storage);
         gko::kernels::EXEC_NAMESPACE::csr::build_lookup(
-            exec, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(),
-            dmtx->get_size()[0], allowed_methods, drow_descs.get_data(),
-            dlookup_info.get_data());
-
-        gko::array<gko::int64> host_row_descs(ref, drow_descs);
-        gko::array<gko::int32> host_lookup_info(ref, dlookup_info);
-        for (int row = 0; row < dmtx->get_size()[0]; row++) {
-            const auto row_begin = mtx->get_const_row_ptrs()[row];
-            const auto row_nnz = mtx->get_const_row_ptrs()[row + 1] - row_begin;
-            gko::matrix::device_sparsity_lookup<index_type> lookup{
-                mtx->get_const_col_idxs() + row_begin, row_nnz,
-                host_lookup_info.get_const_data() + (row_begin * 2),
-                host_row_descs.get_const_data()[row]};
-            ASSERT_EQ(host_row_descs.get_const_data()[row] & 0xFFFF,
-                      row_descs.get_const_data()[row] & 0xFFFF);
-            for (int nz = 0; nz < row_nnz; nz++) {
-                const auto col = mtx->get_const_col_idxs()[nz + row_begin];
-                ASSERT_EQ(nz, lookup[col]);
+            this->exec, drow_ptrs, dcol_idxs, num_rows, allowed,
+            dstorage_offsets, drow_descs, dstorage);
+
+        gko::array<gko::int64> host_row_descs(this->ref, drow_desc_array);
+        gko::array<gko::int32> host_storage_array(this->ref, dstorage_array);
+        for (int row = 0; row < num_rows; row++) {
+            const auto row_begin = row_ptrs[row];
+            const auto row_end = row_ptrs[row + 1];
+            const auto row_nnz = row_end - row_begin;
+            gko::matrix::device_sparsity_lookup<index_type> lookup{
+                row_ptrs,
+                col_idxs,
+                storage_offsets,
+                host_storage_array.get_const_data(),
+                host_row_descs.get_data(),
+                static_cast<gko::size_type>(row)};
+            ASSERT_EQ(host_row_descs.get_const_data()[row] & 0xF,
+                      row_descs[row] & 0xF);
+            for (auto nz = row_begin; nz < row_end; nz++) {
+                const auto col = col_idxs[nz];
+                ASSERT_EQ(lookup.lookup_unsafe(col) + row_begin, nz);
+            }
+            auto nz = row_begin;
+            for (int col = 0; col < num_cols; col++) {
+                auto found_nz = lookup[col];
+                if (nz < row_end && col_idxs[nz] == col) {
+                    ASSERT_EQ(found_nz, nz - row_begin);
+                    nz++;
+                } else {
+                    ASSERT_EQ(found_nz, this->invalid_index);
+                }
             }
         }
     }
 }
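To summarize the new calling convention exercised by these tests, here is a hedged end-to-end sketch on the reference executor: compute per-row storage offsets first, size the storage array from `storage_offsets[num_rows]`, then build descriptors and storage. It uses the internal kernel headers from this patch (not a public-facing API) and otherwise mirrors the tests above.

```cpp
#include <ginkgo/ginkgo.hpp>

#include "core/matrix/csr_kernels.hpp"  // internal: kernel declarations
#include "core/matrix/csr_lookup.hpp"   // internal: sparsity_type, lookup

int main()
{
    using IndexType = gko::int32;
    auto exec = gko::ReferenceExecutor::create();
    auto mtx = gko::initialize<gko::matrix::Csr<double, IndexType>>(
        {{1.0, 0.0, 2.0}, {0.0, 3.0, 0.0}, {4.0, 5.0, 6.0}}, exec);
    const auto num_rows = mtx->get_size()[0];
    const auto allowed = gko::matrix::sparsity_type::full |
                         gko::matrix::sparsity_type::bitmap |
                         gko::matrix::sparsity_type::hash;

    // step 1: per-row storage sizes + prefix sum
    gko::array<IndexType> offsets(exec, num_rows + 1);
    gko::kernels::reference::csr::build_lookup_offsets(
        exec, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), num_rows,
        allowed, offsets.get_data());
    // step 2: allocate exactly as much storage as the offsets require
    gko::array<gko::int64> descs(exec, num_rows);
    gko::array<gko::int32> storage(exec, offsets.get_const_data()[num_rows]);
    // step 3: build descriptors and per-row lookup storage
    gko::kernels::reference::csr::build_lookup(
        exec, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), num_rows,
        allowed, offsets.get_const_data(), descs.get_data(),
        storage.get_data());
}
```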