diff --git a/common/cuda_hip/components/segment_scan.hpp.inc b/common/cuda_hip/components/segment_scan.hpp.inc index 2bd2e3eec90..cc873d5850d 100644 --- a/common/cuda_hip/components/segment_scan.hpp.inc +++ b/common/cuda_hip/components/segment_scan.hpp.inc @@ -38,25 +38,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * performs suffix sum. Works on the source array and returns whether the thread * is the first element of its segment with same `ind`. */ -template +template __device__ __forceinline__ bool segment_scan( const group::thread_block_tile& group, const IndexType ind, - ValueType* __restrict__ val) + ValueType& val, Operator op) { bool head = true; #pragma unroll for (int i = 1; i < subwarp_size; i <<= 1) { const IndexType add_ind = group.shfl_up(ind, i); - ValueType add_val = zero(); - if (add_ind == ind && threadIdx.x >= i) { - add_val = *val; + ValueType add_val{}; + if (add_ind == ind && group.thread_rank() >= i) { + add_val = val; if (i == 1) { head = false; } } add_val = group.shfl_down(add_val, i); - if (threadIdx.x < subwarp_size - i) { - *val += add_val; + if (group.thread_rank() < subwarp_size - i) { + val = op(val, add_val); } } return head; diff --git a/common/cuda_hip/matrix/coo_kernels.hpp.inc b/common/cuda_hip/matrix/coo_kernels.hpp.inc index a13068c1ce0..a5dd70bc95a 100644 --- a/common/cuda_hip/matrix/coo_kernels.hpp.inc +++ b/common/cuda_hip/matrix/coo_kernels.hpp.inc @@ -82,8 +82,9 @@ __device__ void spmv_kernel(const size_type nnz, const size_type num_lines, (ind + subwarp_size < nnz) ? row[ind + subwarp_size] : row[nnz - 1]; // segmented scan if (tile_block.any(curr_row != next_row)) { - bool is_first_in_segment = - segment_scan(tile_block, curr_row, &temp_val); + bool is_first_in_segment = segment_scan( + tile_block, curr_row, temp_val, + [](ValueType a, ValueType b) { return a + b; }); if (is_first_in_segment) { atomic_add(&(c[curr_row * c_stride + column_id]), scale(temp_val)); @@ -97,8 +98,9 @@ __device__ void spmv_kernel(const size_type nnz, const size_type num_lines, temp_val += (ind < nnz) ? 
val[ind] * b[col[ind] * b_stride + column_id] : zero(); // segmented scan - bool is_first_in_segment = - segment_scan(tile_block, curr_row, &temp_val); + bool is_first_in_segment = segment_scan( + tile_block, curr_row, temp_val, + [](ValueType a, ValueType b) { return a + b; }); if (is_first_in_segment) { atomic_add(&(c[curr_row * c_stride + column_id]), scale(temp_val)); } diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index a754971db77..3cdf7f24d6d 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -68,22 +68,21 @@ __device__ __forceinline__ bool block_segment_scan_reverse( template __device__ __forceinline__ void find_next_row( const IndexType num_rows, const IndexType data_size, const IndexType ind, - IndexType* __restrict__ row, IndexType* __restrict__ row_end, - const IndexType row_predict, const IndexType row_predict_end, - const IndexType* __restrict__ row_ptr) + IndexType& row, IndexType& row_end, const IndexType row_predict, + const IndexType row_predict_end, const IndexType* __restrict__ row_ptr) { if (!overflow || ind < data_size) { - if (ind >= *row_end) { - *row = row_predict; - *row_end = row_predict_end; - while (ind >= *row_end) { - *row_end = row_ptr[++*row + 1]; + if (ind >= row_end) { + row = row_predict; + row_end = row_predict_end; + while (ind >= row_end) { + row_end = row_ptr[++row + 1]; } } } else { - *row = num_rows - 1; - *row_end = data_size; + row = num_rows - 1; + row_end = data_size; } } @@ -92,16 +91,17 @@ template __device__ __forceinline__ void warp_atomic_add( const group::thread_block_tile& group, bool force_write, - ValueType* __restrict__ val, const IndexType row, ValueType* __restrict__ c, + ValueType& val, const IndexType row, ValueType* __restrict__ c, const size_type c_stride, const IndexType column_id, Closure scale) { // do a local scan to avoid atomic collisions - const bool need_write = segment_scan(group, row, val); + const bool need_write = segment_scan( + group, row, val, [](ValueType a, ValueType b) { return a + b; }); if (need_write && force_write) { - atomic_add(&(c[row * c_stride + column_id]), scale(*val)); + atomic_add(&(c[row * c_stride + column_id]), scale(val)); } if (!need_write || force_write) { - *val = zero(); + val = zero(); } } @@ -111,28 +111,27 @@ template & group, const IndexType num_rows, const IndexType data_size, const IndexType ind, - IndexType* __restrict__ row, IndexType* __restrict__ row_end, - IndexType* __restrict__ nrow, IndexType* __restrict__ nrow_end, - ValueType* __restrict__ temp_val, const ValueType* __restrict__ val, + IndexType& row, IndexType& row_end, IndexType& nrow, IndexType& nrow_end, + ValueType& temp_val, const ValueType* __restrict__ val, const IndexType* __restrict__ col_idxs, const IndexType* __restrict__ row_ptrs, const ValueType* __restrict__ b, const size_type b_stride, ValueType* __restrict__ c, const size_type c_stride, const IndexType column_id, Closure scale) { - const IndexType curr_row = *row; - find_next_row(num_rows, data_size, ind, row, row_end, *nrow, - *nrow_end, row_ptrs); + const auto curr_row = row; + find_next_row(num_rows, data_size, ind, row, row_end, nrow, nrow_end, + row_ptrs); // segmented scan - if (group.any(curr_row != *row)) { - warp_atomic_add(group, curr_row != *row, temp_val, curr_row, c, - c_stride, column_id, scale); - *nrow = group.shfl(*row, subwarp_size - 1); - *nrow_end = group.shfl(*row_end, subwarp_size - 1); + if (group.any(curr_row != row)) { + 
warp_atomic_add(group, curr_row != row, temp_val, curr_row, c, c_stride, + column_id, scale); + nrow = group.shfl(row, subwarp_size - 1); + nrow_end = group.shfl(row_end, subwarp_size - 1); } if (!last || ind < data_size) { const auto col = col_idxs[ind]; - *temp_val += val[ind] * b[col * b_stride + column_id]; + temp_val += val[ind] * b[col * b_stride + column_id]; } } @@ -171,21 +170,21 @@ __device__ __forceinline__ void spmv_kernel( auto nrow_end = row_end; ValueType temp_val = zero(); IndexType ind = start + threadIdx.x; - find_next_row(num_rows, data_size, ind, &row, &row_end, nrow, - nrow_end, row_ptrs); + find_next_row(num_rows, data_size, ind, row, row_end, nrow, nrow_end, + row_ptrs); const IndexType ind_end = end - wsize; const auto tile_block = group::tiled_partition(group::this_thread_block()); for (; ind < ind_end; ind += wsize) { - process_window(tile_block, num_rows, data_size, ind, &row, - &row_end, &nrow, &nrow_end, &temp_val, val, - col_idxs, row_ptrs, b, b_stride, c, c_stride, - column_id, scale); - } - process_window(tile_block, num_rows, data_size, ind, &row, &row_end, - &nrow, &nrow_end, &temp_val, val, col_idxs, row_ptrs, - b, b_stride, c, c_stride, column_id, scale); - warp_atomic_add(tile_block, true, &temp_val, row, c, c_stride, column_id, + process_window(tile_block, num_rows, data_size, ind, row, + row_end, nrow, nrow_end, temp_val, val, col_idxs, + row_ptrs, b, b_stride, c, c_stride, column_id, + scale); + } + process_window(tile_block, num_rows, data_size, ind, row, row_end, + nrow, nrow_end, temp_val, val, col_idxs, row_ptrs, b, + b_stride, c, c_stride, column_id, scale); + warp_atomic_add(tile_block, true, temp_val, row, c, c_stride, column_id, scale); } @@ -915,3 +914,187 @@ void convert_to_fbcsr(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL); + + +namespace kernel { + + +template +__global__ __launch_bounds__(default_block_size) void build_csr_lookup( + const IndexType* __restrict__ row_ptrs, + const IndexType* __restrict__ col_idxs, size_type num_rows, + matrix::csr::sparsity_type allowed, const IndexType* storage_offsets, + int64* __restrict__ row_desc, int32* __restrict__ storage) +{ + using matrix::csr::sparsity_type; + constexpr int bitmap_block_size = matrix::csr::sparsity_bitmap_block_size; + auto row = thread::get_subwarp_id_flat(); + if (row >= num_rows) { + return; + } + const auto subwarp = + group::tiled_partition(group::this_thread_block()); + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto storage_begin = storage_offsets[row]; + const auto available_storage = storage_offsets[row + 1] - storage_begin; + const auto local_storage = storage + storage_begin; + const auto local_cols = col_idxs + row_begin; + const auto lane = subwarp.thread_rank(); + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? 
local_cols[row_len - 1] - min_col + 1 : 0; + + // full column range + if (col_range == row_len && + csr_lookup_allowed(allowed, sparsity_type::full)) { + if (lane == 0) { + row_desc[row] = static_cast(sparsity_type::full); + } + return; + } + // dense bitmap storage + const auto num_blocks = + static_cast(ceildiv(col_range, bitmap_block_size)); + if (num_blocks * 2 <= available_storage && + csr_lookup_allowed(allowed, sparsity_type::bitmap)) { + if (lane == 0) { + row_desc[row] = (static_cast(num_blocks) << 32) | + static_cast(sparsity_type::bitmap); + } + const auto block_ranks = local_storage; + const auto block_bitmaps = + reinterpret_cast(block_ranks + num_blocks); + // fill bitmaps with zeros + for (int32 i = lane; i < num_blocks; i += subwarp_size) { + block_bitmaps[i] = 0; + } + // fill bitmaps with sparsity pattern + for (IndexType base_i = 0; base_i < row_len; base_i += subwarp_size) { + const auto i = base_i + lane; + const auto col = i < row_len + ? local_cols[i] + : device_numeric_limits::max; + const auto rel_col = static_cast(col - min_col); + const auto block = rel_col / bitmap_block_size; + const auto col_in_block = rel_col % bitmap_block_size; + auto local_bitmap = uint32{i < row_len ? 1u : 0u} << col_in_block; + bool is_first = + segment_scan(subwarp, block, local_bitmap, + [](config::lane_mask_type a, + config::lane_mask_type b) { return a | b; }); + // memory barrier - just to be sure + subwarp.sync(); + if (is_first && i < row_len) { + block_bitmaps[block] |= local_bitmap; + } + } + // compute bitmap ranks + int32 block_partial_sum{}; + for (int32 base_i = 0; base_i < num_blocks; base_i += subwarp_size) { + const auto i = base_i + lane; + const auto bitmap = i < num_blocks ? block_bitmaps[i] : 0; + int32 local_partial_sum{}; + int32 local_total_sum{}; + subwarp_prefix_sum(popcnt(bitmap), local_partial_sum, + local_total_sum, subwarp); + if (i < num_blocks) { + block_ranks[i] = local_partial_sum + block_partial_sum; + } + block_partial_sum += local_total_sum; + } + return; + } + // sparse hashmap storage + // we need at least one unfilled entry to avoid infinite loops on search + GKO_ASSERT(row_len < available_storage); + constexpr double inv_golden_ratio = 0.61803398875; + // use golden ratio as approximation for hash parameter that spreads + // consecutive values as far apart as possible. Ensure lowest bit is set + // otherwise we skip odd hashtable entries + const auto hash_parameter = + 1u | static_cast(available_storage * inv_golden_ratio); + if (lane == 0) { + row_desc[row] = (static_cast(hash_parameter) << 32) | + static_cast(sparsity_type::hash); + } + // fill hashmap with sentinel + constexpr int32 empty = invalid_index(); + for (int32 i = lane; i < available_storage; i += subwarp_size) { + local_storage[i] = empty; + } + // memory barrier + subwarp.sync(); + // fill with actual entries + for (IndexType base_i = 0; base_i < row_len; base_i += subwarp_size) { + const auto i = base_i + lane; + const auto col = i < row_len ? local_cols[i] : 0; + // make sure that each idle thread gets a unique out-of-bounds value + auto hash = i < row_len ? 
(static_cast(col) * hash_parameter) % + static_cast(available_storage) + : static_cast(available_storage + lane); + // collision resolution +#if !defined(__HIP_DEVICE_COMPILE__) && defined(__CUDA_ARCH__) && \ + (__CUDA_ARCH__ >= 700) + const auto this_lane_mask = config::lane_mask_type{1} << lane; + const auto lane_prefix_mask = this_lane_mask - 1; + // memory barrier to previous loop iteration + subwarp.sync(); + int32 entry = i < row_len ? local_storage[hash] : empty; + // find all threads in the subwarp with the same hash key + auto colliding = subwarp.match_any(hash); + // if there are any collisions with previously filled buckets + while (subwarp.any(entry != empty || colliding != this_lane_mask)) { + if (entry != empty || colliding != this_lane_mask) { + // assign consecutive indices to matching threads + hash += (entry == empty ? 0 : 1) + + popcnt(colliding & lane_prefix_mask); + // cheap modulo replacement + if (hash >= available_storage) { + hash -= available_storage; + } + // this could only fail for available_storage < warp_size, as + // popcnt(colliding) is at most warp_size. At the same time, we + // only increase hash by row_length at most, so this is still + // safe. + GKO_ASSERT(hash < available_storage); + entry = local_storage[hash]; + } + colliding = subwarp.match_any(hash); + } + if (i < row_len) { + local_storage[hash] = i; + } +#else + if (i < row_len) { + while (atomicCAS(local_storage + hash, empty, i) != empty) { + hash++; + if (hash >= available_storage) { + hash = 0; + } + } + } +#endif + } +} + + +} // namespace kernel + + +template +void build_lookup(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, matrix::csr::sparsity_type allowed, + const IndexType* storage_offsets, int64* row_desc, + int32* storage) +{ + const auto num_blocks = + ceildiv(num_rows, default_block_size / config::warp_size); + kernel::build_csr_lookup + <<>>(row_ptrs, col_idxs, num_rows, + allowed, storage_offsets, row_desc, + storage); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); diff --git a/common/unified/matrix/csr_kernels.cpp b/common/unified/matrix/csr_kernels.cpp index d6a81453b86..690f96debdc 100644 --- a/common/unified/matrix/csr_kernels.cpp +++ b/common/unified/matrix/csr_kernels.cpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common/unified/base/kernel_launch.hpp" +#include "core/components/prefix_sum_kernels.hpp" namespace gko { @@ -243,6 +244,49 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_CONVERT_TO_HYBRID_KERNEL); +template +void build_lookup_offsets(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, + matrix::csr::sparsity_type allowed, + IndexType* storage_offsets) +{ + using matrix::csr::sparsity_bitmap_block_size; + using matrix::csr::sparsity_type; + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto row_ptrs, auto col_idxs, auto num_rows, + auto allowed, auto storage_offsets) { + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto local_cols = col_idxs + row_begin; + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? local_cols[row_len - 1] - min_col + 1 : 0; + if (csr_lookup_allowed(allowed, sparsity_type::full) && + row_len == col_range) { + storage_offsets[row] = 0; + } else { + const auto hashmap_storage = row_len == 0 ? 
1 : 2 * row_len; + const auto bitmap_num_blocks = static_cast( + ceildiv(col_range, sparsity_bitmap_block_size)); + const auto bitmap_storage = 2 * bitmap_num_blocks; + if (csr_lookup_allowed(allowed, sparsity_type::bitmap) && + bitmap_storage <= hashmap_storage) { + storage_offsets[row] = bitmap_storage; + } else { + storage_offsets[row] = hashmap_storage; + } + } + }, + num_rows, row_ptrs, col_idxs, num_rows, allowed, storage_offsets); + components::prefix_sum(exec, storage_offsets, num_rows + 1); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_CSR_BUILD_LOOKUP_OFFSETS_KERNEL); + + } // namespace csr } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp index e9ee48608ed..bd965ad4bc3 100644 --- a/core/matrix/csr_kernels.hpp +++ b/core/matrix/csr_kernels.hpp @@ -50,6 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/kernel_declaration.hpp" +#include "core/matrix/csr_lookup.hpp" namespace gko { @@ -224,6 +225,20 @@ namespace kernels { const matrix::Dense* beta, \ matrix::Csr* mtx) +#define GKO_DECLARE_CSR_BUILD_LOOKUP_OFFSETS_KERNEL(IndexType) \ + void build_lookup_offsets(std::shared_ptr exec, \ + const IndexType* row_ptrs, \ + const IndexType* col_idxs, size_type num_rows, \ + matrix::csr::sparsity_type allowed, \ + IndexType* storage_offsets) + +#define GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL(IndexType) \ + void build_lookup(std::shared_ptr exec, \ + const IndexType* row_ptrs, const IndexType* col_idxs, \ + size_type num_rows, matrix::csr::sparsity_type allowed, \ + const IndexType* storage_offsets, int64* row_desc, \ + int32* storage) + #define GKO_DECLARE_ALL_AS_TEMPLATES \ template \ @@ -283,7 +298,11 @@ namespace kernels { template \ GKO_DECLARE_CSR_CHECK_DIAGONAL_ENTRIES_EXIST(ValueType, IndexType); \ template \ - GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL(ValueType, IndexType) + GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_CSR_BUILD_LOOKUP_OFFSETS_KERNEL(IndexType); \ + template \ + GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL(IndexType) GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(csr, GKO_DECLARE_ALL_AS_TEMPLATES); diff --git a/core/matrix/csr_lookup.hpp b/core/matrix/csr_lookup.hpp new file mode 100644 index 00000000000..e4d46e25ee5 --- /dev/null +++ b/core/matrix/csr_lookup.hpp @@ -0,0 +1,327 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_MATRIX_CSR_LOOKUP_HPP_ +#define GKO_CORE_MATRIX_CSR_LOOKUP_HPP_ + + +#include + + +#include +#include +#include + + +namespace gko { +namespace matrix { +namespace csr { + + +/** + * Type describing which kind of lookup structure is used to find entries in a + * single row of a Csr matrix. It is also uses as a mask, so each value has only + * a single bit set. + */ +enum class sparsity_type { + /** + * The row is dense, i.e. it contains all entries in + * `[min_col, min_col + storage_size)`. + * This means that the relative output index is `col - min_col`. + */ + full = 1, + /** + * The row is sufficiently dense that its sparsity pattern can be stored in + * a dense bitmap consisting of `storage_size` blocks of size `block_size`. + * Each block stores its sparsity pattern as a bitmask and the number of + * columns before the block as integer. This means that the relative output + * index can be computed as + * ``` + * auto block = (col - min_col) / sparsity_bitmap_block_size; + * auto local_col = (col - min_col) % sparsity_bitmap_block_size; + * auto prefix_mask = (block_type{1} << local_col) - 1; + * auto output_idx = base[block] + popcount(bitmap[block] & prefix_mask); + * ``` + */ + bitmap = 2, + /** + * The row is sparse, so it is best represented using a hashtable. + * The hashtable has size `storage_size` and stores the relative output + * index directly, i.e. + * ``` + * auto hash_key = col; + * auto hash_bucket = hash(hash_key); + * while (local_cols[hashtable[hash_bucket]] != col) { + * hash_bucket = (hash_bucket + 1) % storage_size; // linear probing + * } + * auto output_idx = hashtable[hash_bucket]; + * ``` + */ + hash = 4, +}; + + +GKO_ATTRIBUTES GKO_INLINE sparsity_type operator|(sparsity_type a, + sparsity_type b) +{ + return static_cast(static_cast(a) | + static_cast(b)); +} + + +GKO_ATTRIBUTES GKO_INLINE bool csr_lookup_allowed(sparsity_type allowed, + sparsity_type type) +{ + return ((static_cast(allowed) & static_cast(type)) != 0); +} + + +/** Number of bits in a block_type entry. 
*/ +static constexpr int sparsity_bitmap_block_size = 32; + + +template +struct device_sparsity_lookup { + /** + * Set up a device_sparsity_lookup structure from local data + * + * @param local_cols the column array slice for the local row, it contains + * row_nnz entries + * @param row_nnz the number of entries in this row + * @param local_storage the local lookup structure storage array slice, it + * needs to be set up using the storage_offsets array + * @param storage_size the number of int32 entries in the local lookup + * structure + * @param desc the lookup structure descriptor for this row + */ + GKO_ATTRIBUTES GKO_INLINE device_sparsity_lookup( + const IndexType* local_cols, IndexType row_nnz, + const int32* local_storage, IndexType storage_size, int64 desc) + : local_cols{local_cols}, + row_nnz{row_nnz}, + local_storage{local_storage}, + storage_size{storage_size}, + desc{desc} + {} + + /** + * Set up a device_sparsity_lookup structure from global data + * + * @param row_ptrs the CSR row pointers + * @param col_idxs the CSR column indices + * @param storage_offsets the storage offset array for the lookup structure + * @param storage the storage array for the lookup data structure + * @param descs the lookup structure descriptor array + * @param row the row index to build the lookup structure for + */ + GKO_ATTRIBUTES GKO_INLINE device_sparsity_lookup( + const IndexType* row_ptrs, const IndexType* col_idxs, + const IndexType* storage_offsets, const int32* storage, + const int64* descs, size_type row) + { + const auto row_begin = row_ptrs[row]; + const auto row_end = row_ptrs[row + 1]; + const auto storage_begin = storage_offsets[row]; + const auto storage_end = storage_offsets[row + 1]; + local_cols = col_idxs + row_begin; + row_nnz = row_end - row_begin; + local_storage = storage + storage_begin; + storage_size = storage_end - storage_begin; + desc = descs[row]; + } + + /** + * Returns the row-local index of the entry with the given column index, if + * it is present. + * + * @param col the column index + * @return the row-local index of the given entry, or + * invalid_index() if it is not present. + */ + GKO_ATTRIBUTES GKO_INLINE IndexType operator[](IndexType col) const + { + switch (static_cast(desc & 0xF)) { + case sparsity_type::full: + return lookup_full(col); + case sparsity_type::bitmap: + return lookup_bitmap(col); + case sparsity_type::hash: + return lookup_hash(col); + } + GKO_ASSERT(false); + } + + /** + * Returns the row-local index of the entry with the given column index, + * assuming it is present. + * + * @param col the column index + * @return the row-local index of the given entry. + * @note the function fails with an assertion (debug builds) or can crash if + * the column index is not present in the row! 
+ */ + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_unsafe(IndexType col) const + { + IndexType result{}; + switch (static_cast(desc & 0xF)) { + case sparsity_type::full: + result = lookup_full_unsafe(col); + break; + case sparsity_type::bitmap: + result = lookup_bitmap_unsafe(col); + break; + case sparsity_type::hash: + result = lookup_hash_unsafe(col); + break; + default: + GKO_ASSERT(false); + } + GKO_ASSERT(local_cols[result] == col); + return result; + } + +private: + using unsigned_index_type = typename std::make_unsigned::type; + + const IndexType* local_cols; + IndexType row_nnz; + const int32* local_storage; + IndexType storage_size; + int64 desc; + + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_full_unsafe(IndexType col) const + { + const auto min_col = local_cols[0]; + const auto out_idx = col - min_col; + GKO_ASSERT(out_idx >= 0 && out_idx < row_nnz); + return out_idx; + } + + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_full(IndexType col) const + { + const auto min_col = local_cols[0]; + const auto out_idx = col - min_col; + return out_idx >= 0 && out_idx < row_nnz ? out_idx + : invalid_index(); + } + + GKO_ATTRIBUTES GKO_INLINE IndexType + lookup_bitmap_unsafe(IndexType col) const + { + const auto min_col = local_cols[0]; + const auto num_blocks = static_cast(desc >> 32); + const auto block_bases = local_storage; + const auto block_bitmaps = + reinterpret_cast(block_bases + num_blocks); + const auto rel_col = col - min_col; + const auto block = rel_col / sparsity_bitmap_block_size; + const auto col_in_block = rel_col % sparsity_bitmap_block_size; + const auto prefix_mask = (uint32{1} << col_in_block) - 1; + GKO_ASSERT(rel_col >= 0); + GKO_ASSERT(block < num_blocks); + GKO_ASSERT(block_bitmaps[block] & (uint32{1} << col_in_block)); + const auto out_idx = + block_bases[block] + + gko::detail::popcount(block_bitmaps[block] & prefix_mask); + GKO_ASSERT(local_cols[out_idx] == col); + return out_idx; + } + + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_bitmap(IndexType col) const + { + const auto min_col = local_cols[0]; + const auto num_blocks = static_cast(desc >> 32); + const auto block_bases = local_storage; + const auto block_bitmaps = + reinterpret_cast(block_bases + num_blocks); + const auto rel_col = col - min_col; + const auto block = rel_col / sparsity_bitmap_block_size; + const auto col_in_block = rel_col % sparsity_bitmap_block_size; + if (rel_col < 0 || block >= num_blocks || + !(block_bitmaps[block] & (uint32{1} << col_in_block))) { + return invalid_index(); + } + const auto prefix_mask = (uint32{1} << col_in_block) - 1; + return block_bases[block] + + gko::detail::popcount(block_bitmaps[block] & prefix_mask); + } + + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_hash_unsafe(IndexType col) const + { + const auto hashmap_size = static_cast(storage_size); + const auto hash_param = static_cast(desc >> 32); + const auto hashmap = local_storage; + auto hash = + (static_cast(col) * hash_param) % hashmap_size; + GKO_ASSERT(hashmap[hash] >= 0); + GKO_ASSERT(hashmap[hash] < row_nnz); + while (local_cols[hashmap[hash]] != col) { + hash++; + if (hash >= hashmap_size) { + hash = 0; + } + GKO_ASSERT(hashmap[hash] >= 0); + GKO_ASSERT(hashmap[hash] < row_nnz); + } + const auto out_idx = hashmap[hash]; + return out_idx; + } + + GKO_ATTRIBUTES GKO_INLINE IndexType lookup_hash(IndexType col) const + { + const auto hashmap_size = static_cast(storage_size); + const auto hash_param = static_cast(desc >> 32); + const auto hashmap = local_storage; + auto hash = + (static_cast(col) * hash_param) % 
hashmap_size; + GKO_ASSERT(hashmap[hash] < row_nnz); + auto out_idx = hashmap[hash]; + // linear probing with invalid_index sentinel to avoid infinite loop + while (out_idx >= 0 && local_cols[out_idx] != col) { + hash++; + if (hash >= hashmap_size) { + hash = 0; + } + out_idx = hashmap[hash]; + GKO_ASSERT(hashmap[hash] < row_nnz); + } + // out_idx is either correct or invalid_index, the hashmap sentinel + return out_idx; + } +}; + + +} // namespace csr +} // namespace matrix +} // namespace gko + +#endif // GKO_CORE_MATRIX_CSR_LOOKUP_HPP_ diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index f63622e7031..9bdb7e53c2a 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -59,6 +59,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/format_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_lookup.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/config.hpp" @@ -71,6 +72,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/components/format_conversion.cuh" #include "cuda/components/intrinsics.cuh" #include "cuda/components/merging.cuh" +#include "cuda/components/prefix_sum.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/segment_scan.cuh" #include "cuda/components/thread_ids.cuh" diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu index ab229b81c37..dfd352d4996 100644 --- a/cuda/matrix/fbcsr_kernels.cu +++ b/cuda/matrix/fbcsr_kernels.cu @@ -57,6 +57,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/device_matrix_data_kernels.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" +#include "core/matrix/csr_lookup.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "cuda/base/config.hpp" @@ -69,6 +70,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/components/atomic.cuh" #include "cuda/components/cooperative_groups.cuh" #include "cuda/components/merging.cuh" +#include "cuda/components/prefix_sum.cuh" #include "cuda/components/reduction.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" diff --git a/dev_tools/scripts/config b/dev_tools/scripts/config index 7f43fb78089..86ab73b4a8b 100644 --- a/dev_tools/scripts/config +++ b/dev_tools/scripts/config @@ -32,6 +32,8 @@ - FixInclude: "common/unified/base/kernel_launch_solver.hpp" - "(cuda|hip|dpcpp|omp)/base/kernel_launch_solver\." 
- FixInclude: "common/unified/base/kernel_launch_solver.hpp" +- "test/base/kernel_launch_generic.cpp" + - FixInclude: "common/unified/base/kernel_launch.hpp" - "elimination_forest\.cpp" - FixInclude: "core/factorization/elimination_forest.hpp" - "common/unified/.*.cpp" diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 9ed089478b3..2381a98e463 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -2270,6 +2270,128 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); +template +bool csr_lookup_try_full(IndexType row_len, IndexType col_range, + matrix::csr::sparsity_type allowed, int64& row_desc) +{ + using matrix::csr::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::full); + if (is_allowed && row_len == col_range) { + row_desc = static_cast(sparsity_type::full); + return true; + } + return false; +} + + +template +bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, + IndexType min_col, IndexType available_storage, + matrix::csr::sparsity_type allowed, int64& row_desc, + int32* local_storage, const IndexType* cols) +{ + using matrix::csr::sparsity_bitmap_block_size; + using matrix::csr::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::bitmap); + const auto num_blocks = + static_cast(ceildiv(col_range, sparsity_bitmap_block_size)); + if (is_allowed && num_blocks * 2 <= available_storage) { + row_desc = (static_cast(num_blocks) << 32) | + static_cast(sparsity_type::bitmap); + const auto block_ranks = local_storage; + const auto block_bitmaps = + reinterpret_cast(block_ranks + num_blocks); + std::fill_n(block_bitmaps, num_blocks, 0); + for (auto col_it = cols; col_it < cols + row_len; col_it++) { + const auto rel_col = *col_it - min_col; + const auto block = rel_col / sparsity_bitmap_block_size; + const auto col_in_block = rel_col % sparsity_bitmap_block_size; + block_bitmaps[block] |= uint32{1} << col_in_block; + } + int32 partial_sum{}; + for (int32 block = 0; block < num_blocks; block++) { + block_ranks[block] = partial_sum; + partial_sum += gko::detail::popcount(block_bitmaps[block]); + } + return true; + } + return false; +} + + +template +void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, + int64& row_desc, int32* local_storage, + const IndexType* cols) +{ + // we need at least one unfilled entry to avoid infinite loops on search + GKO_ASSERT(row_len < available_storage); +#if GINKGO_DPCPP_SINGLE_MODE + constexpr float inv_golden_ratio = 0.61803398875f; +#else + constexpr double inv_golden_ratio = 0.61803398875; +#endif + // use golden ratio as approximation for hash parameter that spreads + // consecutive values as far apart as possible. 
Ensure lowest bit is set + // otherwise we skip odd hashtable entries + const auto hash_parameter = + 1u | static_cast(available_storage * inv_golden_ratio); + row_desc = (static_cast(hash_parameter) << 32) | + static_cast(matrix::csr::sparsity_type::hash); + std::fill_n(local_storage, available_storage, invalid_index()); + for (int32 nz = 0; nz < row_len; nz++) { + auto hash = (static_cast(cols[nz]) * hash_parameter) % + static_cast(available_storage); + // linear probing: find the next empty entry + while (local_storage[hash] != invalid_index()) { + hash++; + if (hash >= available_storage) { + hash = 0; + } + } + local_storage[hash] = nz; + } +} + + +template +void build_lookup(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, matrix::csr::sparsity_type allowed, + const IndexType* storage_offsets, int64* row_desc, + int32* storage) +{ + exec->get_queue()->submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl::range<1>{num_rows}, [=](sycl::id<1> idx) { + const auto row = static_cast(idx[0]); + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto storage_begin = storage_offsets[row]; + const auto available_storage = + storage_offsets[row + 1] - storage_begin; + const auto local_storage = storage + storage_begin; + const auto local_cols = col_idxs + row_begin; + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? local_cols[row_len - 1] - min_col + 1 : 0; + bool done = + csr_lookup_try_full(row_len, col_range, allowed, row_desc[row]); + if (!done) { + done = csr_lookup_try_bitmap( + row_len, col_range, min_col, available_storage, allowed, + row_desc[row], local_storage, local_cols); + } + if (!done) { + csr_lookup_build_hash(row_len, available_storage, row_desc[row], + local_storage, local_cols); + } + }); + }); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); + + } // namespace csr } // namespace dpcpp } // namespace kernels diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp index 1e387f71cf3..9025d8c493b 100644 --- a/hip/matrix/csr_kernels.hip.cpp +++ b/hip/matrix/csr_kernels.hip.cpp @@ -60,6 +60,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/format_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" +#include "core/matrix/csr_lookup.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/config.hip.hpp" @@ -71,6 +72,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/intrinsics.hip.hpp" #include "hip/components/merging.hip.hpp" +#include "hip/components/prefix_sum.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/segment_scan.hip.hpp" #include "hip/components/thread_ids.hip.hpp" diff --git a/hip/matrix/fbcsr_kernels.hip.cpp b/hip/matrix/fbcsr_kernels.hip.cpp index 9e57f6ece55..8fbf9a05ac9 100644 --- a/hip/matrix/fbcsr_kernels.hip.cpp +++ b/hip/matrix/fbcsr_kernels.hip.cpp @@ -58,6 +58,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/base/device_matrix_data_kernels.hpp" #include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" +#include "core/matrix/csr_lookup.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "hip/base/config.hip.hpp" @@ -70,6 +71,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "hip/components/atomic.hip.hpp" #include "hip/components/cooperative_groups.hip.hpp" #include "hip/components/merging.hip.hpp" +#include "hip/components/prefix_sum.hip.hpp" #include "hip/components/reduction.hip.hpp" #include "hip/components/thread_ids.hip.hpp" #include "hip/components/uninitialized_array.hip.hpp" diff --git a/include/ginkgo/core/base/intrinsics.hpp b/include/ginkgo/core/base/intrinsics.hpp new file mode 100644 index 00000000000..279c9cb0c3e --- /dev/null +++ b/include/ginkgo/core/base/intrinsics.hpp @@ -0,0 +1,78 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_BASE_INTRINSICS_HPP_ +#define GKO_PUBLIC_CORE_BASE_INTRINSICS_HPP_ + + +#include + + +#include + + +namespace gko { +namespace detail { + + +/** + * Returns the number of set bits in the given bitmask. + */ +GKO_ATTRIBUTES GKO_INLINE int popcount(uint32 bitmask) +{ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __popc(bitmask); +#else + std::bitset<32> bits{bitmask}; + return bits.count(); +#endif +} + + +/** + * Returns the number of set bits in the given bitmask. 
+ */ +GKO_ATTRIBUTES GKO_INLINE int popcount(uint64 bitmask) +{ +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __popcll(bitmask); +#else + std::bitset<64> bits{bitmask}; + return bits.count(); +#endif +} + + +} // namespace detail +} // namespace gko + +#endif // GKO_PUBLIC_CORE_BASE_INTRINSICS_HPP_ diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp index b29f78c21f3..453bfae6e3a 100644 --- a/include/ginkgo/ginkgo.hpp +++ b/include/ginkgo/ginkgo.hpp @@ -48,6 +48,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp index a88e6e791a6..f7226662b8e 100644 --- a/omp/matrix/csr_kernels.cpp +++ b/omp/matrix/csr_kernels.cpp @@ -1129,6 +1129,122 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); +template +bool csr_lookup_try_full(IndexType row_len, IndexType col_range, + matrix::csr::sparsity_type allowed, int64& row_desc) +{ + using matrix::csr::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::full); + if (is_allowed && row_len == col_range) { + row_desc = static_cast(sparsity_type::full); + return true; + } + return false; +} + + +template +bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, + IndexType min_col, IndexType available_storage, + matrix::csr::sparsity_type allowed, int64& row_desc, + int32* local_storage, const IndexType* cols) +{ + using matrix::csr::sparsity_bitmap_block_size; + using matrix::csr::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::bitmap); + const auto num_blocks = ceildiv(col_range, sparsity_bitmap_block_size); + if (is_allowed && num_blocks * 2 <= available_storage) { + row_desc = (static_cast(num_blocks) << 32) | + static_cast(sparsity_type::bitmap); + const auto block_ranks = local_storage; + const auto block_bitmaps = + reinterpret_cast(block_ranks + num_blocks); + std::fill_n(block_bitmaps, num_blocks, 0); + for (auto col_it = cols; col_it < cols + row_len; col_it++) { + const auto rel_col = *col_it - min_col; + const auto block = rel_col / sparsity_bitmap_block_size; + const auto col_in_block = rel_col % sparsity_bitmap_block_size; + block_bitmaps[block] |= uint32{1} << col_in_block; + } + int32 partial_sum{}; + for (int32 block = 0; block < num_blocks; block++) { + block_ranks[block] = partial_sum; + partial_sum += gko::detail::popcount(block_bitmaps[block]); + } + return true; + } + return false; +} + + +template +void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, + int64& row_desc, int32* local_storage, + const IndexType* cols) +{ + // we need at least one unfilled entry to avoid infinite loops on search + GKO_ASSERT(row_len < available_storage); + constexpr double inv_golden_ratio = 0.61803398875; + // use golden ratio as approximation for hash parameter that spreads + // consecutive values as far apart as possible. 
Ensure lowest bit is set + // otherwise we skip odd hashtable entries + const auto hash_parameter = + 1u | static_cast(available_storage * inv_golden_ratio); + row_desc = (static_cast(hash_parameter) << 32) | + static_cast(matrix::csr::sparsity_type::hash); + std::fill_n(local_storage, available_storage, invalid_index()); + for (int32 nz = 0; nz < row_len; nz++) { + auto hash = (static_cast::type>( + cols[nz]) * + hash_parameter) % + static_cast(available_storage); + // linear probing: find the next empty entry + while (local_storage[hash] != invalid_index()) { + hash++; + if (hash >= available_storage) { + hash = 0; + } + } + local_storage[hash] = nz; + } +} + + +template +void build_lookup(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, matrix::csr::sparsity_type allowed, + const IndexType* storage_offsets, int64* row_desc, + int32* storage) +{ +#pragma omp parallel for + for (size_type row = 0; row < num_rows; row++) { + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto storage_begin = storage_offsets[row]; + const auto available_storage = storage_offsets[row + 1] - storage_begin; + const auto local_storage = storage + storage_begin; + const auto local_cols = col_idxs + row_begin; + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? local_cols[row_len - 1] - min_col + 1 : 0; + bool done = + csr_lookup_try_full(row_len, col_range, allowed, row_desc[row]); + if (!done) { + done = csr_lookup_try_bitmap( + row_len, col_range, min_col, available_storage, allowed, + row_desc[row], local_storage, local_cols); + } + if (!done) { + csr_lookup_build_hash(row_len, available_storage, row_desc[row], + local_storage, local_cols); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); + + } // namespace csr } // namespace omp } // namespace kernels diff --git a/omp/test/matrix/csr_kernels.cpp b/omp/test/matrix/csr_kernels.cpp index f308bf40fe6..521a923e55d 100644 --- a/omp/test/matrix/csr_kernels.cpp +++ b/omp/test/matrix/csr_kernels.cpp @@ -53,6 +53,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_kernels.hpp" +#include "core/matrix/csr_lookup.hpp" #include "core/test/utils.hpp" #include "core/test/utils/unsort_matrix.hpp" #include "core/utils/matrix_utils.hpp" diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp index 6dbbe528162..b438f59da80 100644 --- a/reference/matrix/csr_kernels.cpp +++ b/reference/matrix/csr_kernels.cpp @@ -1118,6 +1118,160 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_CSR_ADD_SCALED_IDENTITY_KERNEL); +template +void build_lookup_offsets(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, + matrix::csr::sparsity_type allowed, + IndexType* storage_offsets) +{ + using matrix::csr::sparsity_bitmap_block_size; + using matrix::csr::sparsity_type; + for (size_type row = 0; row < num_rows; row++) { + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto local_cols = col_idxs + row_begin; + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? 
local_cols[row_len - 1] - min_col + 1 : 0; + if (csr_lookup_allowed(allowed, sparsity_type::full) && + row_len == col_range) { + storage_offsets[row] = 0; + } else { + const auto hashmap_storage = std::max(2 * row_len, 1); + const auto bitmap_num_blocks = static_cast( + ceildiv(col_range, sparsity_bitmap_block_size)); + const auto bitmap_storage = 2 * bitmap_num_blocks; + if (csr_lookup_allowed(allowed, sparsity_type::bitmap) && + bitmap_storage <= hashmap_storage) { + storage_offsets[row] = bitmap_storage; + } else { + storage_offsets[row] = hashmap_storage; + } + } + } + components::prefix_sum(exec, storage_offsets, num_rows + 1); +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( + GKO_DECLARE_CSR_BUILD_LOOKUP_OFFSETS_KERNEL); + + +template +bool csr_lookup_try_full(IndexType row_len, IndexType col_range, + matrix::csr::sparsity_type allowed, int64& row_desc) +{ + using matrix::csr::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::full); + if (is_allowed && row_len == col_range) { + row_desc = static_cast(sparsity_type::full); + return true; + } + return false; +} + + +template +bool csr_lookup_try_bitmap(IndexType row_len, IndexType col_range, + IndexType min_col, IndexType available_storage, + matrix::csr::sparsity_type allowed, int64& row_desc, + int32* local_storage, const IndexType* cols) +{ + using matrix::csr::sparsity_bitmap_block_size; + using matrix::csr::sparsity_type; + bool is_allowed = csr_lookup_allowed(allowed, sparsity_type::bitmap); + const auto num_blocks = + static_cast(ceildiv(col_range, sparsity_bitmap_block_size)); + if (is_allowed && num_blocks * 2 <= available_storage) { + row_desc = (static_cast(num_blocks) << 32) | + static_cast(sparsity_type::bitmap); + const auto block_ranks = local_storage; + const auto block_bitmaps = + reinterpret_cast(block_ranks + num_blocks); + std::fill_n(block_bitmaps, num_blocks, 0); + for (auto col_it = cols; col_it < cols + row_len; col_it++) { + const auto rel_col = *col_it - min_col; + const auto block = rel_col / sparsity_bitmap_block_size; + const auto col_in_block = rel_col % sparsity_bitmap_block_size; + block_bitmaps[block] |= uint32{1} << col_in_block; + } + int32 partial_sum{}; + for (int32 block = 0; block < num_blocks; block++) { + block_ranks[block] = partial_sum; + partial_sum += gko::detail::popcount(block_bitmaps[block]); + } + return true; + } + return false; +} + + +template +void csr_lookup_build_hash(IndexType row_len, IndexType available_storage, + int64& row_desc, int32* local_storage, + const IndexType* cols) +{ + // we need at least one unfilled entry to avoid infinite loops on search + GKO_ASSERT(row_len < available_storage); + constexpr double inv_golden_ratio = 0.61803398875; + // use golden ratio as approximation for hash parameter that spreads + // consecutive values as far apart as possible. 
Ensure lowest bit is set + // otherwise we skip odd hashtable entries + const auto hash_parameter = + 1u | static_cast(available_storage * inv_golden_ratio); + row_desc = (static_cast(hash_parameter) << 32) | + static_cast(matrix::csr::sparsity_type::hash); + std::fill_n(local_storage, available_storage, invalid_index()); + for (int32 nz = 0; nz < row_len; nz++) { + auto hash = (static_cast>(cols[nz]) * + hash_parameter) % + static_cast(available_storage); + // linear probing: find the next empty entry + while (local_storage[hash] != invalid_index()) { + hash++; + if (hash >= available_storage) { + hash = 0; + } + } + local_storage[hash] = nz; + } +} + + +template +void build_lookup(std::shared_ptr exec, + const IndexType* row_ptrs, const IndexType* col_idxs, + size_type num_rows, matrix::csr::sparsity_type allowed, + const IndexType* storage_offsets, int64* row_desc, + int32* storage) +{ + for (size_type row = 0; row < num_rows; row++) { + const auto row_begin = row_ptrs[row]; + const auto row_len = row_ptrs[row + 1] - row_begin; + const auto storage_begin = storage_offsets[row]; + const auto available_storage = storage_offsets[row + 1] - storage_begin; + const auto local_storage = storage + storage_begin; + const auto local_cols = col_idxs + row_begin; + const auto min_col = row_len > 0 ? local_cols[0] : 0; + const auto col_range = + row_len > 0 ? local_cols[row_len - 1] - min_col + 1 : 0; + bool done = + csr_lookup_try_full(row_len, col_range, allowed, row_desc[row]); + if (!done) { + done = csr_lookup_try_bitmap( + row_len, col_range, min_col, available_storage, allowed, + row_desc[row], local_storage, local_cols); + } + if (!done) { + csr_lookup_build_hash(row_len, available_storage, row_desc[row], + local_storage, local_cols); + } + } +} + +GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL); + + } // namespace csr } // namespace reference } // namespace kernels diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp index ec610081884..f7899cf7c89 100644 --- a/reference/test/matrix/csr_kernels.cpp +++ b/reference/test/matrix/csr_kernels.cpp @@ -54,6 +54,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/matrix/csr_kernels.hpp" +#include "core/matrix/csr_lookup.hpp" #include "core/test/utils.hpp" @@ -1891,4 +1892,154 @@ TYPED_TEST(Csr, CanGetSubmatrixWithindex_set) } +template +class CsrLookup : public ::testing::Test { +protected: + using value_type = + typename std::tuple_element<0, decltype(ValueIndexType())>::type; + using index_type = + typename std::tuple_element<1, decltype(ValueIndexType())>::type; + using Mtx = gko::matrix::Csr; + + CsrLookup() : exec(gko::ReferenceExecutor::create()) + { + mtx = Mtx::create(this->exec); + typename Mtx::mat_data data{gko::dim<2>{6, 65}}; + for (int i = 0; i < 65; i++) { + // row 0: empty row + // row 1: full row + data.nonzeros.emplace_back(1, i, 1.0); + // row 2: pretty dense row + if (i % 3 == 0) { + data.nonzeros.emplace_back(2, i, 1.0); + } + // row 4-5: contiguous row + if (i >= 10 && i < 30) { + data.nonzeros.emplace_back(4, i, 1.0); + } + if (i >= 2 && i < 6) { + data.nonzeros.emplace_back(5, i, 1.0); + } + } + // row 3: very sparse + data.nonzeros.emplace_back(3, 0, 1.0); + data.nonzeros.emplace_back(3, 64, 1.0); + data.ensure_row_major_order(); + // 1000 as min-sentinel + full_sizes = {0, 0, 1000, 1000, 0, 0}; + bitmap_sizes = {0, 6, 4, 6, 2, 2}; + hash_sizes = {1, 130, 44, 4, 40, 8}; + mtx->read(data); + } + + std::shared_ptr exec; + std::unique_ptr mtx; + std::vector full_sizes; + std::vector bitmap_sizes; + std::vector hash_sizes; + index_type invalid_index = gko::invalid_index(); +}; + +TYPED_TEST_SUITE(CsrLookup, gko::test::ValueIndexTypes, + PairTypenameNameGenerator); + +TYPED_TEST(CsrLookup, GeneratesLookupDataOffsets) +{ + using IndexType = typename TestFixture::index_type; + using gko::matrix::csr::sparsity_type; + const auto num_rows = this->mtx->get_size()[0]; + gko::array storage_offset_array(this->exec, num_rows + 1); + const auto storage_offsets = storage_offset_array.get_data(); + const auto row_ptrs = this->mtx->get_const_row_ptrs(); + const auto col_idxs = this->mtx->get_const_col_idxs(); + + for (auto allowed : + {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { + gko::kernels::reference::csr::build_lookup_offsets( + this->exec, row_ptrs, col_idxs, num_rows, allowed, storage_offsets); + bool allow_full = + gko::matrix::csr::csr_lookup_allowed(allowed, sparsity_type::full); + bool allow_bitmap = gko::matrix::csr::csr_lookup_allowed( + allowed, sparsity_type::bitmap); + + for (gko::size_type row = 0; row < num_rows; row++) { + const auto expected_size = + std::min(allow_full ? this->full_sizes[row] : 1000, + std::min(allow_bitmap ? 
this->bitmap_sizes[row] : 1000, + this->hash_sizes[row])); + const auto size = storage_offsets[row + 1] - storage_offsets[row]; + + ASSERT_EQ(size, expected_size); + } + } +} + + +TYPED_TEST(CsrLookup, GeneratesLookupData) +{ + using IndexType = typename TestFixture::index_type; + using gko::matrix::csr::sparsity_type; + const auto num_rows = this->mtx->get_size()[0]; + const auto num_cols = this->mtx->get_size()[1]; + gko::array row_desc_array(this->exec, num_rows); + gko::array storage_offset_array(this->exec, num_rows + 1); + const auto row_descs = row_desc_array.get_data(); + const auto storage_offsets = storage_offset_array.get_data(); + const auto row_ptrs = this->mtx->get_const_row_ptrs(); + const auto col_idxs = this->mtx->get_const_col_idxs(); + for (auto allowed : + {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { + gko::kernels::reference::csr::build_lookup_offsets( + this->exec, row_ptrs, col_idxs, num_rows, allowed, storage_offsets); + gko::array storage_array(this->exec, + storage_offsets[num_rows]); + const auto storage = storage_array.get_data(); + const auto bitmap_equivalent = + csr_lookup_allowed(allowed, sparsity_type::bitmap) + ? sparsity_type::bitmap + : sparsity_type::hash; + const auto full_equivalent = + csr_lookup_allowed(allowed, sparsity_type::full) + ? sparsity_type::full + : bitmap_equivalent; + + gko::kernels::reference::csr::build_lookup( + this->exec, row_ptrs, col_idxs, num_rows, allowed, storage_offsets, + row_descs, storage); + + for (int row = 0; row < num_rows; row++) { + const auto row_begin = row_ptrs[row]; + const auto row_end = row_ptrs[row + 1]; + gko::matrix::csr::device_sparsity_lookup lookup{ + row_ptrs, col_idxs, storage_offsets, + storage, row_descs, static_cast(row)}; + for (auto nz = row_begin; nz < row_end; nz++) { + const auto col = col_idxs[nz]; + ASSERT_EQ(lookup.lookup_unsafe(col) + row_begin, nz); + } + auto nz = row_begin; + for (int col = 0; col < num_cols; col++) { + auto found_nz = lookup[col]; + if (nz < row_end && col_idxs[nz] == col) { + ASSERT_EQ(found_nz, nz - row_begin); + nz++; + } else { + ASSERT_EQ(found_nz, this->invalid_index); + } + } + } + ASSERT_EQ(row_descs[0] & 0xF, static_cast(full_equivalent)); + ASSERT_EQ(row_descs[1] & 0xF, static_cast(full_equivalent)); + ASSERT_EQ(row_descs[2] & 0xF, static_cast(bitmap_equivalent)); + ASSERT_EQ(row_descs[3] & 0xF, static_cast(sparsity_type::hash)); + ASSERT_EQ(row_descs[4] & 0xF, static_cast(full_equivalent)); + ASSERT_EQ(row_descs[5] & 0xF, static_cast(full_equivalent)); + } +} + + } // namespace diff --git a/test/matrix/csr_kernels.cpp b/test/matrix/csr_kernels.cpp index 2be151a55de..3c1e06e89c5 100644 --- a/test/matrix/csr_kernels.cpp +++ b/test/matrix/csr_kernels.cpp @@ -30,7 +30,7 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#include +#include "core/matrix/csr_kernels.hpp" #include @@ -42,11 +42,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include #include "core/components/fill_array_kernels.hpp" -#include "core/matrix/csr_kernels.hpp" #include "core/test/utils.hpp" #include "test/utils/executor.hpp" @@ -133,4 +133,146 @@ TEST_F(Csr, InvScaleIsEquivalentToRef) } +template +class CsrLookup : public ::testing::Test { +protected: + using value_type = float; + using index_type = IndexType; + using Mtx = gko::matrix::Csr; + + CsrLookup() : rand_engine(15) {} + + void SetUp() + { + ref = gko::ReferenceExecutor::create(); + init_executor(ref, exec); + auto data = + gko::test::generate_random_matrix_data( + 628, 923, std::uniform_int_distribution(10, 300), + std::normal_distribution>(-1.0, + 1.0), + rand_engine); + // create a few empty rows + data.nonzeros.erase( + std::remove_if(data.nonzeros.begin(), data.nonzeros.end(), + [](auto entry) { return entry.row % 200 == 21; }), + data.nonzeros.end()); + // insert a full row and a pretty dense row + for (int i = 0; i < 100; i++) { + data.nonzeros.emplace_back(221, i + 100, 1.0); + data.nonzeros.emplace_back(421, i * 3 + 100, 2.0); + } + data.ensure_row_major_order(); + // initialize the matrices + mtx = Mtx::create(ref); + mtx->read(data); + dmtx = gko::clone(exec, mtx); + } + + void TearDown() + { + if (exec != nullptr) { + ASSERT_NO_THROW(exec->synchronize()); + } + } + + std::default_random_engine rand_engine; + std::shared_ptr ref; + std::shared_ptr exec; + std::unique_ptr mtx; + std::unique_ptr dmtx; + index_type invalid_index = gko::invalid_index(); +}; + +TYPED_TEST_SUITE(CsrLookup, gko::test::IndexTypes, TypenameNameGenerator); + + +TYPED_TEST(CsrLookup, BuildLookupWorks) +{ + using index_type = typename TestFixture::index_type; + using gko::matrix::csr::sparsity_type; + const auto num_rows = this->mtx->get_size()[0]; + const auto num_cols = this->mtx->get_size()[1]; + gko::array row_desc_array(this->ref, num_rows); + gko::array drow_desc_array(this->exec, num_rows); + gko::array storage_offset_array(this->ref, num_rows + 1); + gko::array dstorage_offset_array(this->exec, num_rows + 1); + const auto row_descs = row_desc_array.get_data(); + const auto drow_descs = drow_desc_array.get_data(); + const auto row_ptrs = this->mtx->get_const_row_ptrs(); + const auto col_idxs = this->mtx->get_const_col_idxs(); + const auto drow_ptrs = this->dmtx->get_const_row_ptrs(); + const auto dcol_idxs = this->dmtx->get_const_col_idxs(); + const auto storage_offsets = storage_offset_array.get_data(); + const auto dstorage_offsets = dstorage_offset_array.get_data(); + for (auto allowed : + {sparsity_type::full | sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::bitmap | sparsity_type::hash, + sparsity_type::full | sparsity_type::hash, sparsity_type::hash}) { + // check that storage offsets are calculated correctly + // otherwise things might crash + gko::kernels::reference::csr::build_lookup_offsets( + this->ref, row_ptrs, col_idxs, num_rows, allowed, storage_offsets); + gko::kernels::EXEC_NAMESPACE::csr::build_lookup_offsets( + this->exec, drow_ptrs, dcol_idxs, num_rows, allowed, + dstorage_offsets); + + GKO_ASSERT_ARRAY_EQ(storage_offset_array, dstorage_offset_array); + + gko::array storage_array(this->ref, + storage_offsets[num_rows]); + gko::array dstorage_array(this->exec, + storage_offsets[num_rows]); + const auto storage = storage_array.get_data(); + const auto dstorage = dstorage_array.get_data(); + const auto bitmap_equivalent = + csr_lookup_allowed(allowed, sparsity_type::bitmap) + ? 
sparsity_type::bitmap + : sparsity_type::hash; + const auto full_equivalent = + csr_lookup_allowed(allowed, sparsity_type::full) + ? sparsity_type::full + : bitmap_equivalent; + + gko::kernels::reference::csr::build_lookup( + this->ref, row_ptrs, col_idxs, num_rows, allowed, storage_offsets, + row_descs, storage); + gko::kernels::EXEC_NAMESPACE::csr::build_lookup( + this->exec, drow_ptrs, dcol_idxs, num_rows, allowed, + dstorage_offsets, drow_descs, dstorage); + + gko::array host_row_descs(this->ref, drow_desc_array); + gko::array host_storage_array(this->ref, dstorage_array); + for (int row = 0; row < num_rows; row++) { + const auto row_begin = row_ptrs[row]; + const auto row_end = row_ptrs[row + 1]; + const auto row_nnz = row_end - row_begin; + gko::matrix::csr::device_sparsity_lookup lookup{ + row_ptrs, + col_idxs, + storage_offsets, + host_storage_array.get_const_data(), + host_row_descs.get_data(), + static_cast(row)}; + ASSERT_EQ(host_row_descs.get_const_data()[row] & 0xF, + row_descs[row] & 0xF); + for (auto nz = row_begin; nz < row_end; nz++) { + const auto col = col_idxs[nz]; + ASSERT_EQ(lookup.lookup_unsafe(col) + row_begin, nz); + } + auto nz = row_begin; + for (int col = 0; col < num_cols; col++) { + auto found_nz = lookup[col]; + if (nz < row_end && col_idxs[nz] == col) { + ASSERT_EQ(found_nz, nz - row_begin); + nz++; + } else { + ASSERT_EQ(found_nz, this->invalid_index); + } + } + } + } +} + + } // namespace
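
For reference, the hash-based row lookup that `csr_lookup_build_hash` (and the hash branch of `kernel::build_csr_lookup`) constructs can be summarized by the following host-side sketch. This is a simplified illustration rather than part of the patch: the names `build_row_hashtable` and `lookup_in_hashtable` and the use of `std::vector` are placeholders, while the golden-ratio hash parameter, the empty-bucket sentinel, and the linear probing mirror the kernels in the diff above.

#include <cassert>
#include <cstdint>
#include <vector>

// Build the per-row hashtable: each bucket stores the row-local index (nz)
// of one column entry, or `empty` if the bucket is unused.
std::vector<std::int32_t> build_row_hashtable(
    const std::vector<std::int32_t>& cols, std::int32_t available_storage)
{
    // at least one empty bucket is required so unsuccessful probes terminate
    assert(static_cast<std::int32_t>(cols.size()) < available_storage);
    // the inverse golden ratio spreads consecutive columns far apart;
    // forcing the lowest bit keeps odd buckets reachable
    constexpr double inv_golden_ratio = 0.61803398875;
    const auto hash_parameter =
        1u | static_cast<std::uint32_t>(available_storage * inv_golden_ratio);
    constexpr std::int32_t empty = -1;  // stands in for the invalid_index sentinel
    std::vector<std::int32_t> hashtable(available_storage, empty);
    for (std::int32_t nz = 0; nz < static_cast<std::int32_t>(cols.size()); nz++) {
        auto hash = (static_cast<std::uint32_t>(cols[nz]) * hash_parameter) %
                    static_cast<std::uint32_t>(available_storage);
        // linear probing: advance (with wrap-around) until an empty bucket
        while (hashtable[hash] != empty) {
            hash = (hash + 1) % static_cast<std::uint32_t>(available_storage);
        }
        hashtable[hash] = nz;
    }
    return hashtable;
}

// Query the hashtable for a column; returns the row-local index, or -1 if the
// column is not present in the row.
std::int32_t lookup_in_hashtable(const std::vector<std::int32_t>& cols,
                                 const std::vector<std::int32_t>& hashtable,
                                 std::int32_t col)
{
    constexpr std::int32_t empty = -1;
    constexpr double inv_golden_ratio = 0.61803398875;
    const auto size = static_cast<std::uint32_t>(hashtable.size());
    const auto hash_parameter =
        1u | static_cast<std::uint32_t>(hashtable.size() * inv_golden_ratio);
    auto hash = (static_cast<std::uint32_t>(col) * hash_parameter) % size;
    // probe until the stored entry matches the column or we hit an empty bucket
    while (hashtable[hash] != empty && cols[hashtable[hash]] != col) {
        hash = (hash + 1) % size;
    }
    return hashtable[hash];
}

In the device code the hash parameter is not recomputed at query time: `build_lookup` packs it into the upper 32 bits of the row descriptor, `device_sparsity_lookup::lookup_hash` reads it back from there, and the low 4 bits of the descriptor select between the full, bitmap, and hash strategies.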