From 57c2999605b13b57c96e5b5029df04847c2217f1 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 8 Feb 2022 20:50:24 +0100 Subject: [PATCH] add kernels for Csr sparsity pattern lookup --- .../cuda_hip/components/segment_scan.hpp.inc | 15 +- common/cuda_hip/matrix/coo_kernels.hpp.inc | 10 +- common/cuda_hip/matrix/csr_kernels.hpp.inc | 221 +++++++++++++++--- core/matrix/csr_kernels.hpp | 11 +- cuda/matrix/csr_kernels.cu | 22 +- cuda/matrix/fbcsr_kernels.cu | 2 + cuda/test/matrix/csr_kernels.cpp | 58 +++++ dpcpp/matrix/csr_kernels.dp.cpp | 121 ++++++++++ dpcpp/test/matrix/csr_kernels.cpp | 58 +++++ hip/matrix/csr_kernels.hip.cpp | 18 ++ hip/matrix/fbcsr_kernels.hip.cpp | 2 + hip/test/matrix/csr_kernels.hip.cpp | 58 +++++ include/ginkgo/core/base/intrinsics.hpp | 78 +++++++ include/ginkgo/core/matrix/csr_lookup.hpp | 185 +++++++++++++++ include/ginkgo/ginkgo.hpp | 2 + omp/matrix/csr_kernels.cpp | 120 ++++++++++ omp/test/matrix/csr_kernels.cpp | 58 +++++ reference/matrix/csr_kernels.cpp | 118 ++++++++++ reference/test/matrix/csr_kernels.cpp | 77 ++++++ 19 files changed, 1183 insertions(+), 51 deletions(-) create mode 100644 include/ginkgo/core/base/intrinsics.hpp create mode 100644 include/ginkgo/core/matrix/csr_lookup.hpp diff --git a/common/cuda_hip/components/segment_scan.hpp.inc b/common/cuda_hip/components/segment_scan.hpp.inc index 2bd2e3eec90..cc873d5850d 100644 --- a/common/cuda_hip/components/segment_scan.hpp.inc +++ b/common/cuda_hip/components/segment_scan.hpp.inc @@ -38,25 +38,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * performs suffix sum. Works on the source array and returns whether the thread * is the first element of its segment with same `ind`. */ -template +template __device__ __forceinline__ bool segment_scan( const group::thread_block_tile& group, const IndexType ind, - ValueType* __restrict__ val) + ValueType& val, Operator op) { bool head = true; #pragma unroll for (int i = 1; i < subwarp_size; i <<= 1) { const IndexType add_ind = group.shfl_up(ind, i); - ValueType add_val = zero(); - if (add_ind == ind && threadIdx.x >= i) { - add_val = *val; + ValueType add_val{}; + if (add_ind == ind && group.thread_rank() >= i) { + add_val = val; if (i == 1) { head = false; } } add_val = group.shfl_down(add_val, i); - if (threadIdx.x < subwarp_size - i) { - *val += add_val; + if (group.thread_rank() < subwarp_size - i) { + val = op(val, add_val); } } return head; diff --git a/common/cuda_hip/matrix/coo_kernels.hpp.inc b/common/cuda_hip/matrix/coo_kernels.hpp.inc index a13068c1ce0..a5dd70bc95a 100644 --- a/common/cuda_hip/matrix/coo_kernels.hpp.inc +++ b/common/cuda_hip/matrix/coo_kernels.hpp.inc @@ -82,8 +82,9 @@ __device__ void spmv_kernel(const size_type nnz, const size_type num_lines, (ind + subwarp_size < nnz) ? row[ind + subwarp_size] : row[nnz - 1]; // segmented scan if (tile_block.any(curr_row != next_row)) { - bool is_first_in_segment = - segment_scan(tile_block, curr_row, &temp_val); + bool is_first_in_segment = segment_scan( + tile_block, curr_row, temp_val, + [](ValueType a, ValueType b) { return a + b; }); if (is_first_in_segment) { atomic_add(&(c[curr_row * c_stride + column_id]), scale(temp_val)); @@ -97,8 +98,9 @@ __device__ void spmv_kernel(const size_type nnz, const size_type num_lines, temp_val += (ind < nnz) ? 
val[ind] * b[col[ind] * b_stride + column_id] : zero(); // segmented scan - bool is_first_in_segment = - segment_scan(tile_block, curr_row, &temp_val); + bool is_first_in_segment = segment_scan( + tile_block, curr_row, temp_val, + [](ValueType a, ValueType b) { return a + b; }); if (is_first_in_segment) { atomic_add(&(c[curr_row * c_stride + column_id]), scale(temp_val)); } diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index 075e1607ace..01df076f78a 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -68,22 +68,21 @@ __device__ __forceinline__ bool block_segment_scan_reverse( template __device__ __forceinline__ void find_next_row( const IndexType num_rows, const IndexType data_size, const IndexType ind, - IndexType* __restrict__ row, IndexType* __restrict__ row_end, - const IndexType row_predict, const IndexType row_predict_end, - const IndexType* __restrict__ row_ptr) + IndexType& row, IndexType& row_end, const IndexType row_predict, + const IndexType row_predict_end, const IndexType* __restrict__ row_ptr) { if (!overflow || ind < data_size) { - if (ind >= *row_end) { - *row = row_predict; - *row_end = row_predict_end; - while (ind >= *row_end) { - *row_end = row_ptr[++*row + 1]; + if (ind >= row_end) { + row = row_predict; + row_end = row_predict_end; + while (ind >= row_end) { + row_end = row_ptr[++row + 1]; } } } else { - *row = num_rows - 1; - *row_end = data_size; + row = num_rows - 1; + row_end = data_size; } } @@ -92,16 +91,17 @@ template __device__ __forceinline__ void warp_atomic_add( const group::thread_block_tile& group, bool force_write, - ValueType* __restrict__ val, const IndexType row, ValueType* __restrict__ c, + ValueType& val, const IndexType row, ValueType* __restrict__ c, const size_type c_stride, const IndexType column_id, Closure scale) { // do a local scan to avoid atomic collisions - const bool need_write = segment_scan(group, row, val); + const bool need_write = segment_scan( + group, row, val, [](ValueType a, ValueType b) { return a + b; }); if (need_write && force_write) { - atomic_add(&(c[row * c_stride + column_id]), scale(*val)); + atomic_add(&(c[row * c_stride + column_id]), scale(val)); } if (!need_write || force_write) { - *val = zero(); + val = zero(); } } @@ -111,28 +111,27 @@ template & group, const IndexType num_rows, const IndexType data_size, const IndexType ind, - IndexType* __restrict__ row, IndexType* __restrict__ row_end, - IndexType* __restrict__ nrow, IndexType* __restrict__ nrow_end, - ValueType* __restrict__ temp_val, const ValueType* __restrict__ val, + IndexType& row, IndexType& row_end, IndexType& nrow, IndexType& nrow_end, + ValueType& temp_val, const ValueType* __restrict__ val, const IndexType* __restrict__ col_idxs, const IndexType* __restrict__ row_ptrs, const ValueType* __restrict__ b, const size_type b_stride, ValueType* __restrict__ c, const size_type c_stride, const IndexType column_id, Closure scale) { - const IndexType curr_row = *row; - find_next_row(num_rows, data_size, ind, row, row_end, *nrow, - *nrow_end, row_ptrs); + const auto curr_row = row; + find_next_row(num_rows, data_size, ind, row, row_end, nrow, nrow_end, + row_ptrs); // segmented scan - if (group.any(curr_row != *row)) { - warp_atomic_add(group, curr_row != *row, temp_val, curr_row, c, - c_stride, column_id, scale); - *nrow = group.shfl(*row, subwarp_size - 1); - *nrow_end = group.shfl(*row_end, subwarp_size - 1); + if (group.any(curr_row != row)) { + 
warp_atomic_add(group, curr_row != row, temp_val, curr_row, c, c_stride, + column_id, scale); + nrow = group.shfl(row, subwarp_size - 1); + nrow_end = group.shfl(row_end, subwarp_size - 1); } if (!last || ind < data_size) { const auto col = col_idxs[ind]; - *temp_val += val[ind] * b[col * b_stride + column_id]; + temp_val += val[ind] * b[col * b_stride + column_id]; } } @@ -171,21 +170,21 @@ __device__ __forceinline__ void spmv_kernel( auto nrow_end = row_end; ValueType temp_val = zero(); IndexType ind = start + threadIdx.x; - find_next_row(num_rows, data_size, ind, &row, &row_end, nrow, - nrow_end, row_ptrs); + find_next_row(num_rows, data_size, ind, row, row_end, nrow, nrow_end, + row_ptrs); const IndexType ind_end = end - wsize; const auto tile_block = group::tiled_partition(group::this_thread_block()); for (; ind < ind_end; ind += wsize) { - process_window(tile_block, num_rows, data_size, ind, &row, - &row_end, &nrow, &nrow_end, &temp_val, val, - col_idxs, row_ptrs, b, b_stride, c, c_stride, - column_id, scale); - } - process_window(tile_block, num_rows, data_size, ind, &row, &row_end, - &nrow, &nrow_end, &temp_val, val, col_idxs, row_ptrs, - b, b_stride, c, c_stride, column_id, scale); - warp_atomic_add(tile_block, true, &temp_val, row, c, c_stride, column_id, + process_window(tile_block, num_rows, data_size, ind, row, + row_end, nrow, nrow_end, temp_val, val, col_idxs, + row_ptrs, b, b_stride, c, c_stride, column_id, + scale); + } + process_window(tile_block, num_rows, data_size, ind, row, row_end, + nrow, nrow_end, temp_val, val, col_idxs, row_ptrs, b, + b_stride, c, c_stride, column_id, scale); + warp_atomic_add(tile_block, true, temp_val, row, c, c_stride, column_id, scale); } @@ -902,4 +901,152 @@ __global__ __launch_bounds__(default_block_size) void check_diagonal_entries( } +template +__global__ __launch_bounds__(default_block_size) void build_csr_lookup( + const IndexType* __restrict__ row_ptrs, + const IndexType* __restrict__ col_idxs, size_type num_rows, + gko::matrix::sparsity_type allowed, int64* __restrict__ row_desc, + int32* __restrict__ storage) +{ + constexpr int block_size = 32; + auto row = thread::get_subwarp_id_flat(); + if (row >= num_rows) { + return; + } + const auto subwarp = + group::tiled_partition(group::this_thread_block()); + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto storage_begin = 2 * row_begin; + const auto available_storage = 2 * row_len; + const auto local_storage = storage + storage_begin; + const auto local_cols = col_idxs + row_begin; + const auto lane = subwarp.thread_rank(); + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? 
local_cols[row_len - 1] - min_col + 1 : 0; + + // full column range + if (col_range == row_len && + ((static_cast(allowed) & + static_cast(matrix::sparsity_type::full)) != 0)) { + if (lane == 0) { + row_desc[row] = static_cast(matrix::sparsity_type::full); + } + return; + } + // dense bitmap storage + const auto num_blocks = static_cast(ceildiv(col_range, block_size)); + if (num_blocks * 2 <= available_storage && + ((static_cast(allowed) & + static_cast(matrix::sparsity_type::bitmap)) != 0)) { + if (lane == 0) { + row_desc[row] = (static_cast(num_blocks) << 32) | + static_cast(matrix::sparsity_type::bitmap); + } + const auto block_ranks = local_storage; + const auto block_bitmaps = + reinterpret_cast(block_ranks + num_blocks); + // fill bitmaps with zeros + for (int32 i = lane; i < num_blocks; i += subwarp_size) { + block_bitmaps[i] = 0; + } + // fill bitmaps with sparsity pattern + for (IndexType base_i = 0; base_i < row_len; base_i += subwarp_size) { + const auto i = base_i + lane; + const auto col = i < row_len ? local_cols[i] : INT_MAX; + const auto rel_col = static_cast(col - min_col); + const auto block = rel_col / block_size; + const auto col_in_block = rel_col % block_size; + auto local_bitmap = uint32{i < row_len ? 1u : 0u} << col_in_block; + bool is_first = + segment_scan(subwarp, block, local_bitmap, + [](config::lane_mask_type a, + config::lane_mask_type b) { return a | b; }); + if (is_first && i < row_len) { + block_bitmaps[block] |= local_bitmap; + } + } + // compute bitmap ranks + int32 block_partial_sum{}; + for (int32 base_i = 0; base_i < num_blocks; base_i += subwarp_size) { + const auto i = base_i + lane; + const auto bitmap = i < num_blocks ? block_bitmaps[i] : 0; + int32 local_partial_sum{}; + int32 local_total_sum{}; + subwarp_prefix_sum(popcnt(bitmap), local_partial_sum, + local_total_sum, subwarp); + if (i < num_blocks) { + block_ranks[i] = local_partial_sum + block_partial_sum; + } + block_partial_sum += local_total_sum; + } + return; + } + // sparse hashmap storage + constexpr double inv_golden_ratio = 0.61803398875; + // use golden ratio as approximation for hash parameter that spreads + // consecutive values as far apart as possible. Ensure lowest bit is set + // otherwise we skip odd hashtable entries + const auto hash_parameter = + 1u | static_cast(available_storage * inv_golden_ratio); + if (lane == 0) { + row_desc[row] = (static_cast(hash_parameter) << 32) | + static_cast(matrix::sparsity_type::hash); + } + // fill hashmap with sentinel + constexpr int32 empty = -1; + for (int32 i = lane; i < available_storage; i += subwarp_size) { + local_storage[i] = empty; + } + // memory barrier + subwarp.sync(); + // fill with actual entries + for (IndexType base_i = 0; base_i < row_len; base_i += subwarp_size) { + const auto i = base_i + lane; + const auto col = i < row_len ? local_cols[i] : 0; + auto hash = i < row_len ? (static_cast(col) * hash_parameter) % + static_cast(available_storage) + : available_storage; + // collision resolution +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700) + const auto this_lane_mask = config::lane_mask_type{1} << lane; + const auto lane_prefix_mask = this_lane_mask - 1; + // memory barrier to previous loop iteration + subwarp.sync(); + int32 entry = i < row_len ? 
local_storage[hash] : empty_bucket; + // find all threads in the subwarp with the same hash key + auto colliding = subwarp.match_any(hash); + // if there are any collisions with previously filled buckets + while (subwarp.any(entry != empty || colliding != this_lane_mask)) { + if (entry != empty_bucket) { + // assign consecutive indices to matching threads + hash += 1 + popcnt(colliding & lane_prefix_mask); + // cheap modulo replacement + if (hash >= available_storage) { + hash -= available_storage; + } + // this can only fail for row_length < 16 + assert(hash < available_storage); + entry = local_storage[hash]; + } + colliding = subwarp.match_any(hash); + } + if (i < row_len) { + local_storage[hash] = i; + } +#else + if (i < row_len) { + while (atomicCAS(local_storage + hash, empty, i) != empty) { + hash++; + if (hash >= available_storage) { + hash = 0; + } + } + } +#endif + } +} + + } // namespace kernel diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp index c67d5e0730d..1cfd822190e 100644 --- a/core/matrix/csr_kernels.hpp +++ b/core/matrix/csr_kernels.hpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -206,6 +207,12 @@ namespace kernels { const matrix::Dense* beta, \ matrix::Csr* mtx) +#define GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL(IndexType) \ + void build_lookup(std::shared_ptr exec, \ + const IndexType* row_ptrs, const IndexType* col_idxs, \ + size_type num_rows, matrix::sparsity_type allowed, \ + int64* row_desc, int32* storage) + #define GKO_DECLARE_ALL_AS_TEMPLATES \ template \ @@ -259,7 +266,9 @@ namespace kernels { template \ GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST(ValueType, IndexType); \ template \ - GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL(ValueType, IndexType) + GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL(IndexType) GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(csr, GKO_DECLARE_ALL_AS_TEMPLATES); diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index b1e892ebf48..3b40992d0b3 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -61,6 +62,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
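The build_csr_lookup kernel above packs each row's lookup metadata into a single 64-bit descriptor: the low bits carry the sparsity_type tag, the upper 32 bits carry the mode-specific payload (the bitmap block count or the hash parameter). A minimal host-side model of that packing, with purely illustrative helper names, might look as follows; it only restates what the kernel writes and what device_sparsity_lookup later reads via `desc & 0xF` and `desc >> 32`:

    #include <cassert>
    #include <cstdint>

    enum class sparsity_type : int { full = 1, bitmap = 2, hash = 4 };

    // payload (bitmap block count or hash parameter) in the upper 32 bits,
    // type tag in the low bits, matching row_desc[row] written above
    std::int64_t pack_row_desc(sparsity_type type, std::uint32_t payload)
    {
        return (static_cast<std::int64_t>(payload) << 32) |
               static_cast<std::int64_t>(type);
    }

    sparsity_type unpack_type(std::int64_t desc)
    {
        return static_cast<sparsity_type>(desc & 0xF);
    }

    std::uint32_t unpack_payload(std::int64_t desc)
    {
        return static_cast<std::uint32_t>(desc >> 32);
    }

    int main()
    {
        const auto desc = pack_row_desc(sparsity_type::bitmap, 3);
        assert(unpack_type(desc) == sparsity_type::bitmap);
        assert(unpack_payload(desc) == 3);
    }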
#include "cuda/components/format_conversion.cuh" #include "cuda/components/intrinsics.cuh" #include "cuda/components/merging.cuh" +#include "cuda/components/prefix_sum.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/segment_scan.cuh" #include "cuda/components/thread_ids.cuh" @@ -602,7 +604,7 @@ void spgemm(std::shared_ptr exec, cusparse::destroy(b_descr); cusparse::destroy(a_descr); -#else // CUDA_VERSION >= 11000 +#else // CUDA_VERSION >= 11000 const auto beta = zero(); auto spgemm_descr = cusparse::create_spgemm_descr(); auto a_descr = cusparse::create_csr( @@ -775,7 +777,7 @@ void advanced_spgemm(std::shared_ptr exec, cusparse::destroy(c_descr); cusparse::destroy(b_descr); cusparse::destroy(a_descr); -#else // CUDA_VERSION >= 11000 +#else // CUDA_VERSION >= 11000 auto null_value = static_cast(nullptr); auto null_index = static_cast(nullptr); auto one_val = one(); @@ -1310,6 +1312,22 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); +template +void build_lookup(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, matrix::sparsity_type allowed, + int64* row_desc, int32* storage) +{ + const auto num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + kernel::build_csr_lookup + <<>>(row_ptrs, col_idxs, num_rows, + allowed, row_desc, storage); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); + + } // namespace csr } // namespace cuda } // namespace kernels diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu index c9c4782d262..79f75cf2dac 100644 --- a/cuda/matrix/fbcsr_kernels.cu +++ b/cuda/matrix/fbcsr_kernels.cu @@ -49,6 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include @@ -68,6 +69,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/components/atomic.cuh" #include "cuda/components/cooperative_groups.cuh" #include "cuda/components/merging.cuh" +#include "cuda/components/prefix_sum.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/test/matrix/csr_kernels.cpp b/cuda/test/matrix/csr_kernels.cpp index affd108ea6f..fd07c59bd2b 100644 --- a/cuda/test/matrix/csr_kernels.cpp +++ b/cuda/test/matrix/csr_kernels.cpp @@ -1012,4 +1012,62 @@ TEST_F(Csr, AddScaledIdentityToNonSquare) } +TEST_F(Csr, BuildLookupDataWorks) +{ + set_up_apply_data(std::make_shared()); + using gko::matrix::sparsity_type; + gko::Array row_descs(ref, mtx->get_size()[0]); + gko::Array lookup_info(ref, mtx->get_num_stored_elements() * 2); + gko::Array drow_descs(cuda, mtx->get_size()[0]); + gko::Array dlookup_info(cuda, + mtx->get_num_stored_elements() * 2); + for (auto allowed_methods : + {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { + const auto full_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::full)); + const auto bitmap_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::bitmap)); + const auto bitmap_equivalent = + bitmap_allowed ? sparsity_type::bitmap : sparsity_type::hash; + const auto full_equivalent = + full_allowed ? 
sparsity_type::full : bitmap_equivalent; + SCOPED_TRACE("full: " + std::to_string(full_allowed) + + " bitmap: " + std::to_string(bitmap_allowed)); + + gko::kernels::reference::csr::build_lookup( + ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + mtx->get_size()[0], allowed_methods, row_descs.get_data(), + lookup_info.get_data()); + gko::kernels::cuda::csr::build_lookup( + cuda, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), + dmtx->get_size()[0], allowed_methods, drow_descs.get_data(), + dlookup_info.get_data()); + + gko::Array host_row_descs(ref, drow_descs); + gko::Array host_lookup_info(ref, dlookup_info); + for (int row = 0; row < dmtx->get_size()[0]; row++) { + SCOPED_TRACE("row: " + std::to_string(row)); + const auto row_begin = mtx->get_const_row_ptrs()[row]; + const auto row_nnz = mtx->get_const_row_ptrs()[row + 1] - row_begin; + gko::matrix::device_sparsity_lookup lookup{ + mtx->get_const_col_idxs() + row_begin, row_nnz, + host_lookup_info.get_const_data() + (row_begin * 2), + host_row_descs.get_const_data()[row]}; + + ASSERT_EQ(host_row_descs.get_const_data()[row] & 0xFFFF, + row_descs.get_const_data()[row] & 0xFFFF); + for (int nz = 0; nz < row_nnz; nz++) { + SCOPED_TRACE("nz: " + std::to_string(nz)); + const auto col = mtx->get_const_col_idxs()[nz + row_begin]; + ASSERT_EQ(nz, lookup[col]); + } + } + } +} + + } // namespace diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index d475a104dbf..85a3c54052d 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -2207,6 +2207,127 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); +bool csr_lookup_allowed(matrix::sparsity_type allowed, + matrix::sparsity_type type) +{ + return ((static_cast(allowed) & static_cast(type)) != 0); +} + + +template +bool csr_lookup_try_full(IndexType row_len, IndexType col_range, + matrix::sparsity_type allowed, int64& row_desc) +{ + using matrix::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::full); + if (is_allowed && row_len == col_range) { + row_desc = static_cast(sparsity_type::full); + return true; + } + return false; +} + + +template +bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, + IndexType min_col, IndexType available_storage, + matrix::sparsity_type allowed, int64& row_desc, + int32* local_storage, const IndexType* cols) +{ + constexpr static int block_size = + gko::matrix::device_sparsity_lookup::block_size; + using matrix::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::bitmap); + const auto num_blocks = ceildiv(col_range, block_size); + if (is_allowed && num_blocks * 2 <= available_storage) { + row_desc = (static_cast(num_blocks) << 32) | + static_cast(sparsity_type::bitmap); + const auto block_ranks = local_storage; + const auto block_bitmaps = + reinterpret_cast(block_ranks + num_blocks); + std::fill_n(block_bitmaps, num_blocks, 0); + for (auto col_it = cols; col_it < cols + row_len; col_it++) { + const auto rel_col = *col_it - min_col; + const auto block = rel_col / block_size; + const auto col_in_block = rel_col % block_size; + block_bitmaps[block] |= uint32{1} << col_in_block; + } + int32 partial_sum{}; + for (int32 block = 0; block < num_blocks; block++) { + block_ranks[block] = partial_sum; + partial_sum += gko::detail::popcount(block_bitmaps[block]); + } + return true; + } + return false; +} + + +template +void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, + 
int64& row_desc, int32* local_storage, + const IndexType* cols) +{ + constexpr double inv_golden_ratio = 0.61803398875; + // use golden ratio as approximation for hash parameter that spreads + // consecutive values as far apart as possible. Ensure lowest bit is set + // otherwise we skip odd hashtable entries + const auto hash_parameter = + 1u | static_cast(available_storage * inv_golden_ratio); + row_desc = (static_cast(hash_parameter) << 32) | + static_cast(matrix::sparsity_type::hash); + std::fill_n(local_storage, available_storage, -1); + for (int32 nz = 0; nz < row_len; nz++) { + auto hash = (static_cast(cols[nz]) * hash_parameter) % + static_cast(available_storage); + // linear probing: find the next empty entry + while (local_storage[hash] != -1) { + hash++; + if (hash >= available_storage) { + hash = 0; + } + } + local_storage[hash] = nz; + } +} + + +template +void build_lookup(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, matrix::sparsity_type allowed, + int64* row_desc, int32* storage) +{ + exec->get_queue()->submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) { + const auto row = static_cast(idx[0]); + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto storage_begin = 2 * row_begin; + const auto available_storage = 2 * row_len; + const auto local_storage = storage + storage_begin; + const auto local_cols = col_idxs + row_begin; + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? local_cols[row_len - 1] - min_col + 1 : 0; + bool done = + csr_lookup_try_full(row_len, col_range, allowed, row_desc[row]); + if (!done) { + done = csr_lookup_try_bitmap( + row_len, col_range, min_col, available_storage, allowed, + row_desc[row], local_storage, local_cols); + } + if (!done) { + csr_lookup_build_hash(row_len, available_storage, row_desc[row], + local_storage, local_cols); + } + }); + }); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); + + } // namespace csr } // namespace dpcpp } // namespace kernels diff --git a/dpcpp/test/matrix/csr_kernels.cpp b/dpcpp/test/matrix/csr_kernels.cpp index 3f3aa505fbc..afed432b5e6 100644 --- a/dpcpp/test/matrix/csr_kernels.cpp +++ b/dpcpp/test/matrix/csr_kernels.cpp @@ -964,4 +964,62 @@ TEST_F(Csr, CreateSubMatrixIsEquivalentToRef) } +TEST_F(Csr, BuildLookupDataWorks) +{ + set_up_apply_data(std::make_shared()); + using gko::matrix::sparsity_type; + gko::Array row_descs(ref, mtx->get_size()[0]); + gko::Array lookup_info(ref, mtx->get_num_stored_elements() * 2); + gko::Array drow_descs(dpcpp, mtx->get_size()[0]); + gko::Array dlookup_info(dpcpp, + mtx->get_num_stored_elements() * 2); + for (auto allowed_methods : + {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { + const auto full_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::full)); + const auto bitmap_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::bitmap)); + const auto bitmap_equivalent = + bitmap_allowed ? sparsity_type::bitmap : sparsity_type::hash; + const auto full_equivalent = + full_allowed ? 
sparsity_type::full : bitmap_equivalent; + SCOPED_TRACE("full: " + std::to_string(full_allowed) + + " bitmap: " + std::to_string(bitmap_allowed)); + + gko::kernels::reference::csr::build_lookup( + ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + mtx->get_size()[0], allowed_methods, row_descs.get_data(), + lookup_info.get_data()); + gko::kernels::dpcpp::csr::build_lookup( + dpcpp, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), + dmtx->get_size()[0], allowed_methods, drow_descs.get_data(), + dlookup_info.get_data()); + + gko::Array host_row_descs(ref, drow_descs); + gko::Array host_lookup_info(ref, dlookup_info); + for (int row = 0; row < dmtx->get_size()[0]; row++) { + SCOPED_TRACE("row: " + std::to_string(row)); + const auto row_begin = mtx->get_const_row_ptrs()[row]; + const auto row_nnz = mtx->get_const_row_ptrs()[row + 1] - row_begin; + gko::matrix::device_sparsity_lookup lookup{ + mtx->get_const_col_idxs() + row_begin, row_nnz, + host_lookup_info.get_const_data() + (row_begin * 2), + host_row_descs.get_const_data()[row]}; + + ASSERT_EQ(host_row_descs.get_const_data()[row] & 0xFFFF, + row_descs.get_const_data()[row] & 0xFFFF); + for (int nz = 0; nz < row_nnz; nz++) { + SCOPED_TRACE("nz: " + std::to_string(nz)); + const auto col = mtx->get_const_col_idxs()[nz + row_begin]; + ASSERT_EQ(nz, lookup[col]); + } + } + } +} + + } // namespace diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp index d1d980c60a5..1a62ec3e8b1 100644 --- a/hip/matrix/csr_kernels.hip.cpp +++ b/hip/matrix/csr_kernels.hip.cpp @@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -63,6 +64,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/intrinsics.hip.hpp" #include "hip/components/merging.hip.hpp" +#include "hip/components/prefix_sum.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/segment_scan.hip.hpp" #include "hip/components/thread_ids.hip.hpp" @@ -1093,6 +1095,22 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); +template +void build_lookup(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, matrix::sparsity_type allowed, + int64* row_desc, int32* storage) +{ + const auto num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + kernel::build_csr_lookup + <<>>(row_ptrs, col_idxs, num_rows, + allowed, row_desc, storage); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); + + } // namespace csr } // namespace hip } // namespace kernels diff --git a/hip/matrix/fbcsr_kernels.hip.cpp b/hip/matrix/fbcsr_kernels.hip.cpp index 33ed4a08e2e..58421baf8d8 100644 --- a/hip/matrix/fbcsr_kernels.hip.cpp +++ b/hip/matrix/fbcsr_kernels.hip.cpp @@ -50,6 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include @@ -59,6 +60,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
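The hash variant built by csr_lookup_build_hash above (and by the OpenMP and reference kernels further below) relies on multiplicative hashing with an odd, golden-ratio-derived parameter and linear probing. The following self-contained, host-only sketch models one row's build and lookup round trip; the names and the example column set are illustrative, not part of the library:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main()
    {
        const std::vector<std::int32_t> cols{1, 7, 30, 62};  // one sparse row
        const auto row_len = static_cast<std::int32_t>(cols.size());
        const std::int32_t table_size = 2 * row_len;  // "available_storage"
        // odd hash parameter derived from the golden ratio
        const std::uint32_t hash_parameter =
            1u | static_cast<std::uint32_t>(table_size * 0.61803398875);
        std::vector<std::int32_t> table(table_size, -1);  // -1 marks empty
        // build: store each nonzero's in-row index at the first free bucket
        for (std::int32_t nz = 0; nz < row_len; nz++) {
            auto hash = (static_cast<std::uint32_t>(cols[nz]) * hash_parameter) %
                        static_cast<std::uint32_t>(table_size);
            while (table[hash] != -1) {
                hash = (hash + 1) % static_cast<std::uint32_t>(table_size);
            }
            table[hash] = nz;
        }
        // lookup: probe from the home bucket until the stored index matches
        for (std::int32_t nz = 0; nz < row_len; nz++) {
            auto hash = (static_cast<std::uint32_t>(cols[nz]) * hash_parameter) %
                        static_cast<std::uint32_t>(table_size);
            while (cols[table[hash]] != cols[nz]) {
                hash = (hash + 1) % static_cast<std::uint32_t>(table_size);
            }
            assert(table[hash] == nz);
        }
    }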
#include "core/components/format_conversion_kernels.hpp" #include "hip/base/config.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/prefix_sum.hip.hpp" namespace gko { namespace kernels { diff --git a/hip/test/matrix/csr_kernels.hip.cpp b/hip/test/matrix/csr_kernels.hip.cpp index c4f2a8f101b..c80c1827e2d 100644 --- a/hip/test/matrix/csr_kernels.hip.cpp +++ b/hip/test/matrix/csr_kernels.hip.cpp @@ -1022,4 +1022,62 @@ TEST_F(Csr, AddScaledIdentityToNonSquare) } +TEST_F(Csr, BuildLookupDataWorks) +{ + set_up_apply_data(std::make_shared()); + using gko::matrix::sparsity_type; + gko::Array row_descs(ref, mtx->get_size()[0]); + gko::Array lookup_info(ref, mtx->get_num_stored_elements() * 2); + gko::Array drow_descs(hip, mtx->get_size()[0]); + gko::Array dlookup_info(hip, + mtx->get_num_stored_elements() * 2); + for (auto allowed_methods : + {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { + const auto full_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::full)); + const auto bitmap_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::bitmap)); + const auto bitmap_equivalent = + bitmap_allowed ? sparsity_type::bitmap : sparsity_type::hash; + const auto full_equivalent = + full_allowed ? sparsity_type::full : bitmap_equivalent; + SCOPED_TRACE("full: " + std::to_string(full_allowed) + + " bitmap: " + std::to_string(bitmap_allowed)); + + gko::kernels::reference::csr::build_lookup( + ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + mtx->get_size()[0], allowed_methods, row_descs.get_data(), + lookup_info.get_data()); + gko::kernels::hip::csr::build_lookup( + hip, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), + dmtx->get_size()[0], allowed_methods, drow_descs.get_data(), + dlookup_info.get_data()); + + gko::Array host_row_descs(ref, drow_descs); + gko::Array host_lookup_info(ref, dlookup_info); + for (int row = 0; row < dmtx->get_size()[0]; row++) { + SCOPED_TRACE("row: " + std::to_string(row)); + const auto row_begin = mtx->get_const_row_ptrs()[row]; + const auto row_nnz = mtx->get_const_row_ptrs()[row + 1] - row_begin; + gko::matrix::device_sparsity_lookup lookup{ + mtx->get_const_col_idxs() + row_begin, row_nnz, + host_lookup_info.get_const_data() + (row_begin * 2), + host_row_descs.get_const_data()[row]}; + + ASSERT_EQ(host_row_descs.get_const_data()[row] & 0xFFFF, + row_descs.get_const_data()[row] & 0xFFFF); + for (int nz = 0; nz < row_nnz; nz++) { + SCOPED_TRACE("nz: " + std::to_string(nz)); + const auto col = mtx->get_const_col_idxs()[nz + row_begin]; + ASSERT_EQ(nz, lookup[col]); + } + } + } +} + + } // namespace diff --git a/include/ginkgo/core/base/intrinsics.hpp b/include/ginkgo/core/base/intrinsics.hpp new file mode 100644 index 00000000000..ba3894475da --- /dev/null +++ b/include/ginkgo/core/base/intrinsics.hpp @@ -0,0 +1,78 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_BASE_INTRINSICS_HPP_ +#define GKO_PUBLIC_CORE_BASE_INTRINSICS_HPP_ + + +#include + + +#include + + +namespace gko { +namespace detail { + + +/** + * Returns the number of set bits in the given bitmask. + */ +GKO_ATTRIBUTES GKO_INLINE int popcount(uint32 bitmask) +{ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __popc(bitmask); +#else + std::bitset<32> bits{bitmask}; + return bits.count(); +#endif +} + + +/** + * Returns the number of set bits in the given bitmask. + */ +GKO_ATTRIBUTES GKO_INLINE int popcount(uint64 bitmask) +{ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __popcll(bitmask); +#else + std::bitset<64> bits{bitmask}; + return bits.count(); +#endif +} + + +} // namespace detail +} // namespace gko + +#endif // GKO_PUBLIC_CORE_BASE_INTRINSICS_HPP_ \ No newline at end of file diff --git a/include/ginkgo/core/matrix/csr_lookup.hpp b/include/ginkgo/core/matrix/csr_lookup.hpp new file mode 100644 index 00000000000..a409851cde0 --- /dev/null +++ b/include/ginkgo/core/matrix/csr_lookup.hpp @@ -0,0 +1,185 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_MATRIX_CSR_LOOKUP_HPP_ +#define GKO_PUBLIC_CORE_MATRIX_CSR_LOOKUP_HPP_ + +#include + + +#include +#include +#include + + +namespace gko { +namespace matrix { + + +/** + * Type describing which kind of lookup structure is used to find entries in a + * single row of a Csr matrix. + */ +enum class sparsity_type : int { + /** + * The row is dense, i.e. it contains all entries in + * `[min_col, min_col + storage_size)`. + * This means that the relative output index is `col - min_col`. + */ + full = 1, + /** + * The row is sufficiently dense that its sparsity pattern can be stored in + * a dense bitmap consisting of `storage_size` blocks of size `block_size`, + * with a total of `storage_size` blocks. Each block stores its sparsity + * pattern and the number of columns before. This means that the relative + * output index can be computed as + * ``` + * auto block = (col - min_col) / block_size; + * auto local_col = (col - min_col) % block_size; + * auto prefix_mask = (block_type{1} << local_col) - 1; + * auto output_idx = base[block] + popcount(bitmap[block] & prefix_mask); + * ``` + */ + bitmap = 2, + /** + * The row is sparse, so it is best represented using a hashtable. + * The hashtable has size `storage_size` and stores the relative output + * index directly, i.e. + * ``` + * auto hash_key = col - min_col; + * auto hash_bucket = hash(hash_key); + * while (col_idxs[hashtable[hash_bucket]] != col) { + * hash_bucket = (hash_bucket + 1) % storage_size; // linear probing + * } + * auto output_idx = hashtable[hash_bucket]; + * ``` + */ + hash = 4, +}; + + +inline sparsity_type operator|(sparsity_type a, sparsity_type b) +{ + return static_cast(static_cast(a) | + static_cast(b)); +} + + +template +class csr_sparsity_lookup { + const IndexType* row_ptrs; + const IndexType* col_idxs; + Array row_desc; + Array storage; +}; + + +template +struct device_sparsity_lookup { + /** Number of bits in a block_type entry. 
*/ + static constexpr int block_size = 32; + + const IndexType* col_idxs; + const IndexType row_nnz; + const int32* storage; + const int64 desc; + + GKO_ATTRIBUTES GKO_INLINE IndexType operator[](IndexType col) const + { + switch (static_cast(desc & 0xF)) { + case sparsity_type::full: + return lookup_full(col); + case sparsity_type::bitmap: + return lookup_bitmap(col); + case sparsity_type::hash: + return lookup_hash(col); + } + assert(false); + } + + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_full(IndexType col) const + { + const auto min_col = col_idxs[0]; + const auto out_idx = col - min_col; + assert(out_idx < row_nnz); + assert(col_idxs[out_idx] == col); + return out_idx; + } + + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_bitmap(IndexType col) const + { + const auto min_col = col_idxs[0]; + const auto num_blocks = static_cast(desc >> 32); + const auto block_bases = storage; + const auto block_bitmaps = + reinterpret_cast(block_bases + num_blocks); + const auto rel_col = col - min_col; + const auto block = rel_col / block_size; + const auto col_in_block = rel_col % block_size; + const auto prefix_mask = (uint32{1} << col_in_block) - 1; + assert(block < num_blocks); + assert(block_bitmaps[block] & (uint32{1} << col_in_block)); + const auto out_idx = + block_bases[block] + + gko::detail::popcount(block_bitmaps[block] & prefix_mask); + assert(col_idxs[out_idx] == col); + return out_idx; + } + + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_hash(IndexType col) const + { + const auto hashmap_size = static_cast(2 * row_nnz); + const auto hash_param = static_cast(desc >> 32); + const auto hashmap = storage; + auto hash = (static_cast(col) * hash_param) % hashmap_size; + assert(hashmap[hash] >= 0); + assert(hashmap[hash] < row_nnz); + // linear probing with sentinel to avoid infinite loop + while (col_idxs[hashmap[hash]] != col) { + hash++; + if (hash >= hashmap_size) { + hash = 0; + } + assert(hashmap[hash] >= 0); + assert(hashmap[hash] < row_nnz); + } + const auto out_idx = hashmap[hash]; + assert(col_idxs[out_idx] == col); + return out_idx; + } +}; + + +} // namespace matrix +} // namespace gko + +#endif // GKO_PUBLIC_CORE_MATRIX_CSR_LOOKUP_HPP_ \ No newline at end of file diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp index a7756d40df3..7788cdb89b8 100644 --- a/include/ginkgo/ginkgo.hpp +++ b/include/ginkgo/ginkgo.hpp @@ -48,6 +48,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -86,6 +87,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
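The bitmap path of device_sparsity_lookup above stores, per block of 32 columns, an exclusive rank (the number of nonzeros in preceding blocks) followed by the block's bitmap, and recovers the output index as that rank plus the popcount of the bitmap bits below the queried column. A small host-side model of this layout, with illustrative names and std::bitset standing in for the device popcount:

    #include <bitset>
    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main()
    {
        constexpr int block_size = 32;
        // one row with columns 0, 3, 34, 40, already relative to min_col
        const std::vector<std::int32_t> cols{0, 3, 34, 40};
        const int num_blocks = 2;  // covers the column range [0, 41)
        // per-block bitmaps of the sparsity pattern
        std::vector<std::uint32_t> bitmaps(num_blocks, 0);
        for (const auto col : cols) {
            bitmaps[col / block_size] |= std::uint32_t{1} << (col % block_size);
        }
        // block ranks: exclusive prefix sum over the per-block popcounts
        std::vector<std::int32_t> ranks(num_blocks);
        std::int32_t partial_sum{};
        for (int block = 0; block < num_blocks; block++) {
            ranks[block] = partial_sum;
            partial_sum += static_cast<std::int32_t>(
                std::bitset<32>{bitmaps[block]}.count());
        }
        // lookup of column 40: rank of block 1 plus bits set below bit 8
        const std::int32_t col = 40;
        const auto block = col / block_size;
        const auto col_in_block = col % block_size;
        const auto prefix_mask = (std::uint32_t{1} << col_in_block) - 1;
        const auto out_idx =
            ranks[block] + static_cast<std::int32_t>(
                std::bitset<32>{bitmaps[block] & prefix_mask}.count());
        assert(cols[out_idx] == col);  // out_idx == 3
    }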
#include #include +#include #include #include #include diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp index 53d0e7ee4e5..3de68b3c19b 100644 --- a/omp/matrix/csr_kernels.cpp +++ b/omp/matrix/csr_kernels.cpp @@ -1020,6 +1020,126 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); +bool csr_lookup_allowed(matrix::sparsity_type allowed, + matrix::sparsity_type type) +{ + return ((static_cast(allowed) & static_cast(type)) != 0); +} + + +template +bool csr_lookup_try_full(IndexType row_len, IndexType col_range, + matrix::sparsity_type allowed, int64& row_desc) +{ + using matrix::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::full); + if (is_allowed && row_len == col_range) { + row_desc = static_cast(sparsity_type::full); + return true; + } + return false; +} + + +constexpr static int csr_lookup_block_size = 32; + + +template +bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, + IndexType min_col, IndexType available_storage, + matrix::sparsity_type allowed, int64& row_desc, + int32* local_storage, const IndexType* cols) +{ + using matrix::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::bitmap); + const auto num_blocks = ceildiv(col_range, csr_lookup_block_size); + if (is_allowed && num_blocks * 2 <= available_storage) { + row_desc = (static_cast(num_blocks) << 32) | + static_cast(sparsity_type::bitmap); + const auto block_ranks = local_storage; + const auto block_bitmaps = + reinterpret_cast(block_ranks + num_blocks); + std::fill_n(block_bitmaps, num_blocks, 0); + for (auto col_it = cols; col_it < cols + row_len; col_it++) { + const auto rel_col = *col_it - min_col; + const auto block = rel_col / csr_lookup_block_size; + const auto col_in_block = rel_col % csr_lookup_block_size; + block_bitmaps[block] |= uint32{1} << col_in_block; + } + int32 partial_sum{}; + for (int32 block = 0; block < num_blocks; block++) { + block_ranks[block] = partial_sum; + partial_sum += gko::detail::popcount(block_bitmaps[block]); + } + return true; + } + return false; +} + + +template +void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, + int64& row_desc, int32* local_storage, + const IndexType* cols) +{ + constexpr double inv_golden_ratio = 0.61803398875; + // use golden ratio as approximation for hash parameter that spreads + // consecutive values as far apart as possible. 
Ensure lowest bit is set + // otherwise we skip odd hashtable entries + const auto hash_parameter = + 1u | static_cast(available_storage * inv_golden_ratio); + row_desc = (static_cast(hash_parameter) << 32) | + static_cast(matrix::sparsity_type::hash); + std::fill_n(local_storage, available_storage, -1); + for (int32 nz = 0; nz < row_len; nz++) { + auto hash = (static_cast(cols[nz]) * hash_parameter) % + static_cast(available_storage); + // linear probing: find the next empty entry + while (local_storage[hash] != -1) { + hash++; + if (hash >= available_storage) { + hash = 0; + } + } + local_storage[hash] = nz; + } +} + + +template +void build_lookup(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, matrix::sparsity_type allowed, + int64* row_desc, int32* storage) +{ +#pragma omp parallel for + for (size_type row = 0; row < num_rows; row++) { + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto storage_begin = 2 * row_begin; + const auto available_storage = 2 * row_len; + const auto local_storage = storage + storage_begin; + const auto local_cols = col_idxs + row_begin; + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? local_cols[row_len - 1] - min_col + 1 : 0; + bool done = + csr_lookup_try_full(row_len, col_range, allowed, row_desc[row]); + if (!done) { + done = csr_lookup_try_bitmap( + row_len, col_range, min_col, available_storage, allowed, + row_desc[row], local_storage, local_cols); + } + if (!done) { + csr_lookup_build_hash(row_len, available_storage, row_desc[row], + local_storage, local_cols); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); + + } // namespace csr } // namespace omp } // namespace kernels diff --git a/omp/test/matrix/csr_kernels.cpp b/omp/test/matrix/csr_kernels.cpp index d7a8a55bf4c..b1fc6d825fe 100644 --- a/omp/test/matrix/csr_kernels.cpp +++ b/omp/test/matrix/csr_kernels.cpp @@ -56,6 +56,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/test/utils.hpp" #include "core/test/utils/matrix_utils.hpp" #include "core/test/utils/unsort_matrix.hpp" +#include "ginkgo/core/matrix/csr_lookup.hpp" namespace { @@ -816,4 +817,61 @@ TEST_F(Csr, AddScaledIdentityToNonSquare) } +TEST_F(Csr, BuildLookupDataWorks) +{ + set_up_apply_data(); + using gko::matrix::sparsity_type; + gko::Array row_descs(ref, mtx->get_size()[0]); + gko::Array lookup_info(ref, mtx->get_num_stored_elements() * 2); + gko::Array drow_descs(omp, mtx->get_size()[0]); + gko::Array dlookup_info(omp, + mtx->get_num_stored_elements() * 2); + for (auto allowed_methods : + {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { + const auto full_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::full)); + const auto bitmap_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::bitmap)); + const auto bitmap_equivalent = + bitmap_allowed ? sparsity_type::bitmap : sparsity_type::hash; + const auto full_equivalent = + full_allowed ? 
sparsity_type::full : bitmap_equivalent; + SCOPED_TRACE("full: " + std::to_string(full_allowed) + + " bitmap: " + std::to_string(bitmap_allowed)); + + gko::kernels::reference::csr::build_lookup( + ref, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + mtx->get_size()[0], allowed_methods, row_descs.get_data(), + lookup_info.get_data()); + gko::kernels::omp::csr::build_lookup( + omp, dmtx->get_const_row_ptrs(), dmtx->get_const_col_idxs(), + dmtx->get_size()[0], allowed_methods, drow_descs.get_data(), + dlookup_info.get_data()); + + for (int row = 0; row < dmtx->get_size()[0]; row++) { + SCOPED_TRACE("row: " + std::to_string(row)); + const auto row_begin = mtx->get_const_row_ptrs()[row]; + const auto row_nnz = mtx->get_const_row_ptrs()[row + 1] - row_begin; + gko::matrix::device_sparsity_lookup lookup{ + mtx->get_const_col_idxs() + row_begin, row_nnz, + dlookup_info.get_const_data() + (row_begin * 2), + drow_descs.get_const_data()[row]}; + + ASSERT_EQ( + static_cast(drow_descs.get_const_data()[row]) & 0xFFFF, + static_cast(row_descs.get_const_data()[row]) & 0xFFFF); + for (int nz = 0; nz < row_nnz; nz++) { + SCOPED_TRACE("nz: " + std::to_string(nz)); + const auto col = mtx->get_const_col_idxs()[nz + row_begin]; + ASSERT_EQ(nz, lookup[col]); + } + } + } +} + + } // namespace diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp index 2e913b128a3..a343695d90c 100644 --- a/reference/matrix/csr_kernels.cpp +++ b/reference/matrix/csr_kernels.cpp @@ -1016,6 +1016,124 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); +bool csr_lookup_allowed(matrix::sparsity_type allowed, + matrix::sparsity_type type) +{ + return ((static_cast(allowed) & static_cast(type)) != 0); +} + + +template +bool csr_lookup_try_full(IndexType row_len, IndexType col_range, + matrix::sparsity_type allowed, int64& row_desc) +{ + using matrix::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::full); + if (is_allowed && row_len == col_range) { + row_desc = static_cast(sparsity_type::full); + return true; + } + return false; +} + + +template +bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, + IndexType min_col, IndexType available_storage, + matrix::sparsity_type allowed, int64& row_desc, + int32* local_storage, const IndexType* cols) +{ + constexpr static int block_size = + gko::matrix::device_sparsity_lookup::block_size; + using matrix::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::bitmap); + const auto num_blocks = ceildiv(col_range, block_size); + if (is_allowed && num_blocks * 2 <= available_storage) { + row_desc = (static_cast(num_blocks) << 32) | + static_cast(sparsity_type::bitmap); + const auto block_ranks = local_storage; + const auto block_bitmaps = + reinterpret_cast(block_ranks + num_blocks); + std::fill_n(block_bitmaps, num_blocks, 0); + for (auto col_it = cols; col_it < cols + row_len; col_it++) { + const auto rel_col = *col_it - min_col; + const auto block = rel_col / block_size; + const auto col_in_block = rel_col % block_size; + block_bitmaps[block] |= uint32{1} << col_in_block; + } + int32 partial_sum{}; + for (int32 block = 0; block < num_blocks; block++) { + block_ranks[block] = partial_sum; + partial_sum += gko::detail::popcount(block_bitmaps[block]); + } + return true; + } + return false; +} + + +template +void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, + int64& row_desc, int32* local_storage, + const IndexType* cols) +{ + 
constexpr double inv_golden_ratio = 0.61803398875; + // use golden ratio as approximation for hash parameter that spreads + // consecutive values as far apart as possible. Ensure lowest bit is set + // otherwise we skip odd hashtable entries + const auto hash_parameter = + 1u | static_cast(available_storage * inv_golden_ratio); + row_desc = (static_cast(hash_parameter) << 32) | + static_cast(matrix::sparsity_type::hash); + std::fill_n(local_storage, available_storage, -1); + for (int32 nz = 0; nz < row_len; nz++) { + auto hash = (static_cast(cols[nz]) * hash_parameter) % + static_cast(available_storage); + // linear probing: find the next empty entry + while (local_storage[hash] != -1) { + hash++; + if (hash >= available_storage) { + hash = 0; + } + } + local_storage[hash] = nz; + } +} + + +template +void build_lookup(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, matrix::sparsity_type allowed, + int64* row_desc, int32* storage) +{ + for (size_type row = 0; row < num_rows; row++) { + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto storage_begin = 2 * row_begin; + const auto available_storage = 2 * row_len; + const auto local_storage = storage + storage_begin; + const auto local_cols = col_idxs + row_begin; + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? local_cols[row_len - 1] - min_col + 1 : 0; + bool done = + csr_lookup_try_full(row_len, col_range, allowed, row_desc[row]); + if (!done) { + done = csr_lookup_try_bitmap( + row_len, col_range, min_col, available_storage, allowed, + row_desc[row], local_storage, local_cols); + } + if (!done) { + csr_lookup_build_hash(row_len, available_storage, row_desc[row], + local_storage, local_cols); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); + + } // namespace csr } // namespace reference } // namespace kernels diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp index 7341c6347fe..a5cbf975daf 100644 --- a/reference/test/matrix/csr_kernels.cpp +++ b/reference/test/matrix/csr_kernels.cpp @@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
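For reference, the choice the serial build_lookup above makes per row (assuming all three sparsity types are allowed) depends only on the row length, the column range, and the two int32 storage words available per nonzero. The helper below is illustrative, not a library function; the checked values correspond to the rows of the 6x65 matrix constructed in the GeneratesLookupData test that follows:

    #include <cassert>
    #include <cstdint>

    enum class sparsity_type : int { full = 1, bitmap = 2, hash = 4 };

    sparsity_type classify(std::int64_t row_len, std::int64_t col_range)
    {
        if (row_len == col_range) {
            return sparsity_type::full;  // contiguous (or empty) column range
        }
        const auto num_blocks = (col_range + 31) / 32;  // ceildiv by block_size
        if (num_blocks * 2 <= 2 * row_len) {            // fits available_storage
            return sparsity_type::bitmap;
        }
        return sparsity_type::hash;
    }

    int main()
    {
        assert(classify(0, 0) == sparsity_type::full);      // row 0: empty
        assert(classify(65, 65) == sparsity_type::full);    // row 1: full row
        assert(classify(22, 64) == sparsity_type::bitmap);  // row 2: every 3rd column
        assert(classify(2, 65) == sparsity_type::hash);     // row 3: columns 0 and 64
        assert(classify(20, 20) == sparsity_type::full);    // row 4: columns 10..29
        assert(classify(4, 4) == sparsity_type::full);      // row 5: columns 2..5
    }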
#include #include #include +#include #include #include #include @@ -1760,4 +1761,80 @@ TYPED_TEST(Csr, CanGetSubmatrix2) } +TYPED_TEST(Csr, GeneratesLookupData) +{ + using Mtx = typename TestFixture::Mtx; + using IndexType = typename Mtx::index_type; + using gko::matrix::sparsity_type; + auto mtx = Mtx::create(this->exec); + typename Mtx::mat_data data{gko::dim<2>{6, 65}}; + for (int i = 0; i < 65; i++) { + // row 0: empty row + // row 1: full row + data.nonzeros.emplace_back(1, i, 1.0); + // row 2: pretty dense row + if (i % 3 == 0) { + data.nonzeros.emplace_back(2, i, 1.0); + } + // row 4-5: contiguous row + if (i >= 10 && i < 30) { + data.nonzeros.emplace_back(4, i, 1.0); + } + if (i >= 2 && i < 6) { + data.nonzeros.emplace_back(5, i, 1.0); + } + } + // row 3: very sparse + data.nonzeros.emplace_back(3, 0, 1.0); + data.nonzeros.emplace_back(3, 64, 1.0); + data.ensure_row_major_order(); + mtx->read(data); + gko::Array row_descs(this->exec, mtx->get_size()[0]); + gko::Array lookup_info(this->exec, + mtx->get_num_stored_elements() * 2); + for (auto allowed_methods : + {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { + const auto full_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::full)); + const auto bitmap_allowed = + static_cast(static_cast(allowed_methods) & + static_cast(sparsity_type::bitmap)); + const auto bitmap_equivalent = + bitmap_allowed ? sparsity_type::bitmap : sparsity_type::hash; + const auto full_equivalent = + full_allowed ? sparsity_type::full : bitmap_equivalent; + SCOPED_TRACE("full: " + std::to_string(full_allowed) + + " bitmap: " + std::to_string(bitmap_allowed)); + + gko::kernels::reference::csr::build_lookup( + this->exec, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + mtx->get_size()[0], allowed_methods, row_descs.get_data(), + lookup_info.get_data()); + + const auto descs = row_descs.get_const_data(); + for (auto entry : data.nonzeros) { + const auto row = entry.row; + const auto col = entry.column; + const auto row_begin = mtx->get_const_row_ptrs()[row]; + const auto row_nnz = mtx->get_const_row_ptrs()[row + 1] - row_begin; + gko::matrix::device_sparsity_lookup lookup{ + mtx->get_const_col_idxs() + row_begin, row_nnz, + lookup_info.get_const_data() + (row_begin * 2), descs[row]}; + + const auto nz = lookup[col] + row_begin; + ASSERT_EQ(mtx->get_const_col_idxs()[nz], col); + } + ASSERT_EQ(descs[0] & 0xFFFF, static_cast(full_equivalent)); + ASSERT_EQ(descs[1] & 0xFFFF, static_cast(full_equivalent)); + ASSERT_EQ(descs[2] & 0xFFFF, static_cast(bitmap_equivalent)); + ASSERT_EQ(descs[3] & 0xFFFF, static_cast(sparsity_type::hash)); + ASSERT_EQ(descs[4] & 0xFFFF, static_cast(full_equivalent)); + ASSERT_EQ(descs[5] & 0xFFFF, static_cast(full_equivalent)); + } +} + + } // namespace
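Taken together, a host-side caller would drive the new kernels roughly the way the tests above do. The fragment below is a sketch rather than canonical usage: it assumes exec, mtx (a Csr matrix), row and col are already defined, and it picks gko::int32 as the index type purely for illustration; the array sizes and the per-row wrapping follow the test code:

    // restrict the generated lookup structures via the new operator|
    const auto allowed =
        gko::matrix::sparsity_type::bitmap | gko::matrix::sparsity_type::hash;
    // one 64-bit descriptor per row, two int32 storage words per nonzero
    gko::Array<gko::int64> row_descs(exec, mtx->get_size()[0]);
    gko::Array<gko::int32> storage(exec, mtx->get_num_stored_elements() * 2);
    gko::kernels::reference::csr::build_lookup(
        exec, mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(),
        mtx->get_size()[0], allowed, row_descs.get_data(), storage.get_data());
    // wrap a single row and map a column index to its position in that row
    const auto row_begin = mtx->get_const_row_ptrs()[row];
    const auto row_nnz = mtx->get_const_row_ptrs()[row + 1] - row_begin;
    gko::matrix::device_sparsity_lookup<gko::int32> lookup{
        mtx->get_const_col_idxs() + row_begin, row_nnz,
        storage.get_const_data() + 2 * row_begin,
        row_descs.get_const_data()[row]};
    const auto nz_within_row = lookup[col];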