Merge generic matrix format test

Adds a test that runs for all matrix formats and fixes a few small issues that were found on the way, mostly related to edge cases with zero matrix dimensions. Related PR: #904
ginkgo-project · Feb 3, 2022 · 083fb87 · 083fb87
2 parents 0436f3e + c6d4152
commit 083fb87
Show file tree

Hide file tree

Showing 33 changed files with 1,265 additions and 837 deletions.
diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake
@@ -186,13 +186,14 @@ function(ginkgo_create_common_test test_name)
         string(SUBSTRING ${exec} 0 1 exec_initial)
         string(SUBSTRING ${exec} 1 -1 exec_tail)
         string(TOUPPER ${exec_initial} exec_initial)
+        string(TOUPPER ${exec} exec_upper)
         set(exec_type ${exec_initial}${exec_tail}Executor)
         # set up actual test
         set(test_target_name ${test_target_name}_${exec})
         add_executable(${test_target_name} ${test_name}.cpp)
         target_compile_features(${test_target_name} PUBLIC cxx_std_14)
         target_compile_options(${test_target_name} PRIVATE ${GINKGO_COMPILER_FLAGS})
-        target_compile_definitions(${test_target_name} PRIVATE EXEC_TYPE=${exec_type} EXEC_NAMESPACE=${exec})
+        target_compile_definitions(${test_target_name} PRIVATE EXEC_TYPE=${exec_type} EXEC_NAMESPACE=${exec} GKO_COMPILING_${exec_upper})
         target_link_libraries(${test_target_name} PRIVATE ${common_test_ADDITIONAL_LIBRARIES})
         # use float for DPC++ if necessary
         if((exec STREQUAL "dpcpp") AND GINKGO_DPCPP_SINGLE_MODE)

diff --git a/common/cuda_hip/matrix/dense_kernels.hpp.inc b/common/cuda_hip/matrix/dense_kernels.hpp.inc
@@ -95,6 +95,35 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr(
 }
 
 
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void fill_in_sparsity_csr(
+    size_type num_rows, size_type num_cols, size_type stride,
+    const ValueType* __restrict__ source, IndexType* __restrict__ row_ptrs,
+    IndexType* __restrict__ col_idxs)
+{
+    const auto row = thread::get_subwarp_id_flat<config::warp_size>();
+
+    if (row < num_rows) {
+        auto warp = group::tiled_partition<config::warp_size>(
+            group::this_thread_block());
+        auto lane_prefix_mask =
+            (config::lane_mask_type(1) << warp.thread_rank()) - 1;
+        auto base_out_idx = row_ptrs[row];
+        for (size_type i = 0; i < num_cols; i += config::warp_size) {
+            const auto col = i + warp.thread_rank();
+            const auto pred =
+                col < num_cols ? is_nonzero(source[stride * row + col]) : false;
+            const auto mask = warp.ballot(pred);
+            const auto out_idx = base_out_idx + popcnt(mask & lane_prefix_mask);
+            if (pred) {
+                col_idxs[out_idx] = col;
+            }
+            base_out_idx += popcnt(mask);
+        }
+    }
+}
+
+
 template <typename ValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void fill_in_ell(
     size_type num_rows, size_type num_cols, size_type source_stride,

diff --git a/common/cuda_hip/matrix/hybrid_kernels.hpp.inc b/common/cuda_hip/matrix/hybrid_kernels.hpp.inc
diff --git a/common/unified/matrix/hybrid_kernels.cpp b/common/unified/matrix/hybrid_kernels.cpp
@@ -75,6 +75,50 @@ void compute_row_nnz(std::shared_ptr<const DefaultExecutor> exec,
 }
 
 
+template <typename ValueType, typename IndexType>
+void fill_in_matrix_data(std::shared_ptr<const DefaultExecutor> exec,
+                         const device_matrix_data<ValueType, IndexType>& data,
+                         const int64* row_ptrs, const int64* coo_row_ptrs,
+                         matrix::Hybrid<ValueType, IndexType>* result)
+{
+    using device_value_type = device_type<ValueType>;
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto row, auto row_ptrs, auto data, auto ell_stride,
+                      auto ell_max_nnz, auto ell_cols, auto ell_vals,
+                      auto coo_row_ptrs, auto coo_row_idxs, auto coo_col_idxs,
+                      auto coo_vals) {
+            const auto row_begin = row_ptrs[row];
+            const auto row_size = row_ptrs[row + 1] - row_begin;
+            for (int64 i = 0; i < ell_max_nnz; i++) {
+                const auto out_idx = row + ell_stride * i;
+                const auto in_idx = i + row_begin;
+                const bool use = i < row_size;
+                ell_cols[out_idx] = use ? data[in_idx].column : 0;
+                ell_vals[out_idx] = use ? unpack_member(data[in_idx].value)
+                                        : zero<device_value_type>();
+            }
+            const auto coo_begin = coo_row_ptrs[row];
+            for (int64 i = ell_max_nnz; i < row_size; i++) {
+                const auto in_idx = i + row_begin;
+                const auto out_idx =
+                    coo_begin + i - static_cast<int64>(ell_max_nnz);
+                coo_row_idxs[out_idx] = row;
+                coo_col_idxs[out_idx] = data[in_idx].column;
+                coo_vals[out_idx] = unpack_member(data[in_idx].value);
+            }
+        },
+        data.size[0], row_ptrs, data.nonzeros, result->get_ell_stride(),
+        result->get_ell_num_stored_elements_per_row(),
+        result->get_ell_col_idxs(), result->get_ell_values(), coo_row_ptrs,
+        result->get_coo_row_idxs(), result->get_coo_col_idxs(),
+        result->get_coo_values());
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_HYBRID_FILL_IN_MATRIX_DATA_KERNEL);
+
+
 template <typename ValueType, typename IndexType>
 void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Hybrid<ValueType, IndexType>* source,

diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
@@ -579,7 +579,7 @@ namespace hybrid {
 
 GKO_STUB(GKO_DECLARE_HYBRID_COMPUTE_COO_ROW_PTRS_KERNEL);
 GKO_STUB(GKO_DECLARE_HYBRID_COMPUTE_ROW_NNZ);
-GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL);
+GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_FILL_IN_MATRIX_DATA_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL);
 
 

diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp
@@ -182,10 +182,10 @@ template <typename ValueType, typename IndexType>
 void Coo<ValueType, IndexType>::convert_to(Dense<ValueType>* result) const
 {
     auto exec = this->get_executor();
-    result->resize(this->get_size());
-    result->fill(zero<ValueType>());
-    exec->run(coo::make_fill_in_dense(
-        this, make_temporary_output_clone(exec, result).get()));
+    auto tmp_result = make_temporary_output_clone(exec, result);
+    tmp_result->resize(this->get_size());
+    tmp_result->fill(zero<ValueType>());
+    exec->run(coo::make_fill_in_dense(this, tmp_result.get()));
 }
 
 

diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp
@@ -211,10 +211,10 @@ template <typename ValueType, typename IndexType>
 void Csr<ValueType, IndexType>::convert_to(Dense<ValueType>* result) const
 {
     auto exec = this->get_executor();
-    result->resize(this->get_size());
-    result->fill(zero<ValueType>());
-    exec->run(csr::make_fill_in_dense(
-        this, make_temporary_output_clone(exec, result).get()));
+    auto tmp_result = make_temporary_output_clone(exec, result);
+    tmp_result->resize(this->get_size());
+    tmp_result->fill(zero<ValueType>());
+    exec->run(csr::make_fill_in_dense(this, tmp_result.get()));
 }
 
 
@@ -230,15 +230,22 @@ void Csr<ValueType, IndexType>::convert_to(
     Hybrid<ValueType, IndexType>* result) const
 {
     auto exec = this->get_executor();
-    Array<size_type> row_nnz{exec, this->get_size()[0]};
-    Array<int64> coo_row_ptrs{exec, this->get_size()[0] + 1};
-    exec->run(csr::make_convert_ptrs_to_sizes(
-        this->get_const_row_ptrs(), this->get_size()[0], row_nnz.get_data()));
+    const auto num_rows = this->get_size()[0];
+    const auto num_cols = this->get_size()[1];
+    Array<size_type> row_nnz{exec, num_rows};
+    Array<int64> coo_row_ptrs{exec, num_rows + 1};
+    exec->run(csr::make_convert_ptrs_to_sizes(this->get_const_row_ptrs(),
+                                              num_rows, row_nnz.get_data()));
     size_type ell_lim{};
     size_type coo_nnz{};
     result->get_strategy()->compute_hybrid_config(row_nnz, &ell_lim, &coo_nnz);
+    if (ell_lim > num_cols) {
+        // TODO remove temporary fix after ELL gains true structural zeros
+        ell_lim = num_cols;
+    }
     exec->run(csr::make_compute_hybrid_coo_row_ptrs(row_nnz, ell_lim,
                                                     coo_row_ptrs.get_data()));
+    coo_nnz = exec->copy_val_to_host(coo_row_ptrs.get_const_data() + num_rows);
     auto tmp = make_temporary_clone(exec, result);
     tmp->resize(this->get_size(), ell_lim, coo_nnz);
     exec->run(csr::make_convert_to_hybrid(this, coo_row_ptrs.get_const_data(),

diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp
@@ -557,18 +557,23 @@ template <typename IndexType>
 void Dense<ValueType>::convert_impl(Hybrid<ValueType, IndexType>* result) const
 {
     auto exec = this->get_executor();
-    Array<size_type> row_nnz{exec, this->get_size()[0]};
-    Array<int64> coo_row_ptrs{exec, this->get_size()[0] + 1};
+    const auto num_rows = this->get_size()[0];
+    const auto num_cols = this->get_size()[1];
+    Array<size_type> row_nnz{exec, num_rows};
+    Array<int64> coo_row_ptrs{exec, num_rows + 1};
     exec->run(dense::make_count_nonzeros_per_row(this, row_nnz.get_data()));
     size_type ell_lim{};
     size_type coo_nnz{};
     result->get_strategy()->compute_hybrid_config(row_nnz, &ell_lim, &coo_nnz);
+    if (ell_lim > num_cols) {
+        // TODO remove temporary fix after ELL gains true structural zeros
+        ell_lim = num_cols;
+    }
     exec->run(dense::make_compute_hybrid_coo_row_ptrs(row_nnz, ell_lim,
                                                       coo_row_ptrs.get_data()));
+    coo_nnz = exec->copy_val_to_host(coo_row_ptrs.get_const_data() + num_rows);
     auto tmp = make_temporary_clone(exec, result);
-    tmp->ell_->resize(this->get_size(), ell_lim);
-    tmp->coo_->resize(this->get_size(), coo_nnz);
-    tmp->set_size(this->get_size());
+    tmp->resize(this->get_size(), ell_lim, coo_nnz);
     exec->run(dense::make_convert_to_hybrid(this, coo_row_ptrs.get_const_data(),
                                             tmp.get()));
 }

diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp
@@ -132,10 +132,10 @@ template <typename ValueType, typename IndexType>
 void Ell<ValueType, IndexType>::convert_to(Dense<ValueType>* result) const
 {
     auto exec = this->get_executor();
-    result->resize(this->get_size());
-    result->fill(zero<ValueType>());
-    exec->run(ell::make_fill_in_dense(
-        this, make_temporary_output_clone(exec, result).get()));
+    auto tmp_result = make_temporary_output_clone(exec, result);
+    tmp_result->resize(this->get_size());
+    tmp_result->fill(zero<ValueType>());
+    exec->run(ell::make_fill_in_dense(this, tmp_result.get()));
 }
 
 

diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp
@@ -155,10 +155,10 @@ void Fbcsr<ValueType, IndexType>::convert_to(
     Dense<ValueType>* const result) const
 {
     auto exec = this->get_executor();
-    result->resize(this->get_size());
-    result->fill(zero<ValueType>());
-    exec->run(fbcsr::make_fill_in_dense(
-        this, make_temporary_output_clone(exec, result).get()));
+    auto tmp_result = make_temporary_output_clone(exec, result);
+    tmp_result->resize(this->get_size());
+    tmp_result->fill(zero<ValueType>());
+    exec->run(fbcsr::make_fill_in_dense(this, tmp_result.get()));
 }
 
 

diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp
@@ -63,12 +63,13 @@ namespace {
 
 GKO_REGISTER_OPERATION(build_row_ptrs, components::build_row_ptrs);
 GKO_REGISTER_OPERATION(compute_row_nnz, hybrid::compute_row_nnz);
-GKO_REGISTER_OPERATION(split_matrix_data, hybrid::split_matrix_data);
+GKO_REGISTER_OPERATION(fill_in_matrix_data, hybrid::fill_in_matrix_data);
 GKO_REGISTER_OPERATION(ell_fill_in_dense, ell::fill_in_dense);
 GKO_REGISTER_OPERATION(coo_fill_in_dense, coo::fill_in_dense);
 GKO_REGISTER_OPERATION(ell_extract_diagonal, ell::extract_diagonal);
 GKO_REGISTER_OPERATION(coo_extract_diagonal, coo::extract_diagonal);
 GKO_REGISTER_OPERATION(ell_count_nonzeros_per_row, ell::count_nonzeros_per_row);
+GKO_REGISTER_OPERATION(compute_coo_row_ptrs, hybrid::compute_coo_row_ptrs);
 GKO_REGISTER_OPERATION(convert_idxs_to_ptrs, components::convert_idxs_to_ptrs);
 GKO_REGISTER_OPERATION(convert_to_csr, hybrid::convert_to_csr);
 GKO_REGISTER_OPERATION(fill_array, components::fill_array);
@@ -209,25 +210,29 @@ template <typename ValueType, typename IndexType>
 void Hybrid<ValueType, IndexType>::read(const device_mat_data& data)
 {
     auto exec = this->get_executor();
+    const auto num_rows = data.size[0];
+    const auto num_cols = data.size[1];
     auto local_data = make_temporary_clone(exec, &data.nonzeros);
-    Array<int64> row_ptrs{exec, data.size[0] + 1};
-    exec->run(hybrid::make_build_row_ptrs(*local_data, data.size[0],
+    Array<int64> row_ptrs{exec, num_rows + 1};
+    exec->run(hybrid::make_build_row_ptrs(*local_data, num_rows,
                                           row_ptrs.get_data()));
     Array<size_type> row_nnz{exec, data.size[0]};
     exec->run(hybrid::make_compute_row_nnz(row_ptrs, row_nnz.get_data()));
     size_type ell_max_nnz{};
     size_type coo_nnz{};
     this->get_strategy()->compute_hybrid_config(row_nnz, &ell_max_nnz,
                                                 &coo_nnz);
-    auto ell_nnz = data.nonzeros.get_num_elems() - coo_nnz;
-    device_mat_data ell_data{exec, data.size, ell_nnz};
-    device_mat_data coo_data{exec, data.size, coo_nnz};
-    exec->run(hybrid::make_split_matrix_data(
-        data.nonzeros, row_ptrs.get_const_data(), ell_max_nnz, data.size[0],
-        ell_data.nonzeros, coo_data.nonzeros));
-    this->set_size(data.size);
-    ell_->read(ell_data);
-    coo_->read(coo_data);
+    if (ell_max_nnz > num_cols) {
+        // TODO remove temporary fix after ELL gains true structural zeros
+        ell_max_nnz = num_cols;
+    }
+    Array<int64> coo_row_ptrs{exec, num_rows + 1};
+    exec->run(hybrid::make_compute_coo_row_ptrs(row_nnz, ell_max_nnz,
+                                                coo_row_ptrs.get_data()));
+    coo_nnz = exec->copy_val_to_host(coo_row_ptrs.get_const_data() + num_rows);
+    this->resize(data.size, ell_max_nnz, coo_nnz);
+    exec->run(hybrid::make_fill_in_matrix_data(
+        data, row_ptrs.get_const_data(), coo_row_ptrs.get_const_data(), this));
 }