diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake
index dccd16332e2..28685fdf232 100644
--- a/cmake/create_test.cmake
+++ b/cmake/create_test.cmake
@@ -186,13 +186,14 @@ function(ginkgo_create_common_test test_name)
         string(SUBSTRING ${exec} 0 1 exec_initial)
         string(SUBSTRING ${exec} 1 -1 exec_tail)
         string(TOUPPER ${exec_initial} exec_initial)
+        string(TOUPPER ${exec} exec_upper)
         set(exec_type ${exec_initial}${exec_tail}Executor)
         # set up actual test
         set(test_target_name ${test_target_name}_${exec})
         add_executable(${test_target_name} ${test_name}.cpp)
         target_compile_features(${test_target_name} PUBLIC cxx_std_14)
         target_compile_options(${test_target_name} PRIVATE ${GINKGO_COMPILER_FLAGS})
-        target_compile_definitions(${test_target_name} PRIVATE EXEC_TYPE=${exec_type} EXEC_NAMESPACE=${exec})
+        target_compile_definitions(${test_target_name} PRIVATE EXEC_TYPE=${exec_type} EXEC_NAMESPACE=${exec} GKO_COMPILING_${exec_upper})
         target_link_libraries(${test_target_name} PRIVATE ${common_test_ADDITIONAL_LIBRARIES})
         # use float for DPC++ if necessary
         if((exec STREQUAL "dpcpp") AND GINKGO_DPCPP_SINGLE_MODE)
diff --git a/common/cuda_hip/matrix/dense_kernels.hpp.inc b/common/cuda_hip/matrix/dense_kernels.hpp.inc
index 28eb7dd7332..c4cdc214e76 100644
--- a/common/cuda_hip/matrix/dense_kernels.hpp.inc
+++ b/common/cuda_hip/matrix/dense_kernels.hpp.inc
@@ -95,6 +95,35 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr(
 }
 
 
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void fill_in_sparsity_csr(
+    size_type num_rows, size_type num_cols, size_type stride,
+    const ValueType* __restrict__ source, IndexType* __restrict__ row_ptrs,
+    IndexType* __restrict__ col_idxs)
+{
+    const auto row = thread::get_subwarp_id_flat<config::warp_size>();
+
+    if (row < num_rows) {
+        auto warp = group::tiled_partition<config::warp_size>(
+            group::this_thread_block());
+        auto lane_prefix_mask =
+            (config::lane_mask_type(1) << warp.thread_rank()) - 1;
+        auto base_out_idx = row_ptrs[row];
+        for (size_type i = 0; i < num_cols; i += config::warp_size) {
+            const auto col = i + warp.thread_rank();
+            const auto pred =
+                col < num_cols ? is_nonzero(source[stride * row + col]) : false;
+            const auto mask = warp.ballot(pred);
+            const auto out_idx = base_out_idx + popcnt(mask & lane_prefix_mask);
+            if (pred) {
+                col_idxs[out_idx] = col;
+            }
+            base_out_idx += popcnt(mask);
+        }
+    }
+}
+
+
 template <typename ValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void fill_in_ell(
     size_type num_rows, size_type num_cols, size_type source_stride,
diff --git a/common/cuda_hip/matrix/hybrid_kernels.hpp.inc b/common/cuda_hip/matrix/hybrid_kernels.hpp.inc
deleted file mode 100644
index 04d52f6b0dc..00000000000
--- a/common/cuda_hip/matrix/hybrid_kernels.hpp.inc
+++ /dev/null
@@ -1,97 +0,0 @@
-/*************************************************************
-Copyright (c) 2017-2022, the Ginkgo authors
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions
-are met:
-
-1. Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright
-notice, this list of conditions and the following disclaimer in the
-documentation and/or other materials provided with the distribution.
-
-3. 
Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*************************************************************/ - - -namespace { - - -template -struct hybrid_tuple_unpack_functor { - using device_entry = device_type>; - device_entry __device__ - operator()(thrust::tuple e) const - { - return thrust::get<1>(e); - } -}; - - -} // anonymous namespace - - -template -void split_matrix_data( - std::shared_ptr exec, - const Array>& data, - const int64* row_ptrs, size_type ell_limit, size_type num_rows, - Array>& ell_data, - Array>& coo_data) -{ - using device_entry = device_type>; - auto iota = thrust::make_counting_iterator(size_type{}); - auto data_it = - thrust::device_pointer_cast(as_device_type(data.get_const_data())); - const auto nnz = data.get_num_elems(); - auto enumerated_data_it = - thrust::make_zip_iterator(thrust::make_tuple(iota, data_it)); - auto ell_predicate = [row_ptrs, ell_limit] __device__( - thrust::tuple e) { - const auto row_begin = row_ptrs[thrust::get<1>(e).row]; - const auto local_nz = thrust::get<0>(e) - row_begin; - return local_nz < ell_limit; - }; - auto coo_predicate = [row_ptrs, ell_limit] __device__( - thrust::tuple e) { - const auto row_begin = row_ptrs[thrust::get<1>(e).row]; - const auto local_nz = thrust::get<0>(e) - row_begin; - return local_nz >= ell_limit; - }; - const auto ell_nnz = static_cast( - thrust::count_if(thrust::device, enumerated_data_it, - enumerated_data_it + nnz, ell_predicate)); - const auto coo_nnz = nnz - ell_nnz; - ell_data.resize_and_reset(ell_nnz); - coo_data.resize_and_reset(coo_nnz); - auto ell_data_it = thrust::make_transform_output_iterator( - thrust::device_pointer_cast(as_device_type(ell_data.get_data())), - hybrid_tuple_unpack_functor{}); - auto coo_data_it = thrust::make_transform_output_iterator( - thrust::device_pointer_cast(as_device_type(coo_data.get_data())), - hybrid_tuple_unpack_functor{}); - thrust::copy_if(thrust::device, enumerated_data_it, - enumerated_data_it + nnz, ell_data_it, ell_predicate); - thrust::copy_if(thrust::device, enumerated_data_it, - enumerated_data_it + nnz, coo_data_it, coo_predicate); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL); diff --git a/common/unified/matrix/hybrid_kernels.cpp b/common/unified/matrix/hybrid_kernels.cpp index 8f654e7d157..3ed8de8b1c5 100644 --- a/common/unified/matrix/hybrid_kernels.cpp +++ b/common/unified/matrix/hybrid_kernels.cpp @@ -75,6 +75,50 @@ void compute_row_nnz(std::shared_ptr exec, } +template +void fill_in_matrix_data(std::shared_ptr exec, + const device_matrix_data& data, + const int64* row_ptrs, const int64* 
coo_row_ptrs, + matrix::Hybrid* result) +{ + using device_value_type = device_type; + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto row_ptrs, auto data, auto ell_stride, + auto ell_max_nnz, auto ell_cols, auto ell_vals, + auto coo_row_ptrs, auto coo_row_idxs, auto coo_col_idxs, + auto coo_vals) { + const auto row_begin = row_ptrs[row]; + const auto row_size = row_ptrs[row + 1] - row_begin; + for (int64 i = 0; i < ell_max_nnz; i++) { + const auto out_idx = row + ell_stride * i; + const auto in_idx = i + row_begin; + const bool use = i < row_size; + ell_cols[out_idx] = use ? data[in_idx].column : 0; + ell_vals[out_idx] = use ? unpack_member(data[in_idx].value) + : zero(); + } + const auto coo_begin = coo_row_ptrs[row]; + for (int64 i = ell_max_nnz; i < row_size; i++) { + const auto in_idx = i + row_begin; + const auto out_idx = + coo_begin + i - static_cast(ell_max_nnz); + coo_row_idxs[out_idx] = row; + coo_col_idxs[out_idx] = data[in_idx].column; + coo_vals[out_idx] = unpack_member(data[in_idx].value); + } + }, + data.size[0], row_ptrs, data.nonzeros, result->get_ell_stride(), + result->get_ell_num_stored_elements_per_row(), + result->get_ell_col_idxs(), result->get_ell_values(), coo_row_ptrs, + result->get_coo_row_idxs(), result->get_coo_col_idxs(), + result->get_coo_values()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_HYBRID_FILL_IN_MATRIX_DATA_KERNEL); + + template void convert_to_csr(std::shared_ptr exec, const matrix::Hybrid* source, diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 647c0c7ed12..462406ef4b5 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -579,7 +579,7 @@ namespace hybrid { GKO_STUB(GKO_DECLARE_HYBRID_COMPUTE_COO_ROW_PTRS_KERNEL); GKO_STUB(GKO_DECLARE_HYBRID_COMPUTE_ROW_NNZ); -GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL); +GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_FILL_IN_MATRIX_DATA_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL); diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp index 0b5781e7a5d..d84cbd9c003 100644 --- a/core/matrix/coo.cpp +++ b/core/matrix/coo.cpp @@ -182,10 +182,10 @@ template void Coo::convert_to(Dense* result) const { auto exec = this->get_executor(); - result->resize(this->get_size()); - result->fill(zero()); - exec->run(coo::make_fill_in_dense( - this, make_temporary_output_clone(exec, result).get())); + auto tmp_result = make_temporary_output_clone(exec, result); + tmp_result->resize(this->get_size()); + tmp_result->fill(zero()); + exec->run(coo::make_fill_in_dense(this, tmp_result.get())); } diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp index 54c735f89b6..31ba7d7c8bb 100644 --- a/core/matrix/csr.cpp +++ b/core/matrix/csr.cpp @@ -211,10 +211,10 @@ template void Csr::convert_to(Dense* result) const { auto exec = this->get_executor(); - result->resize(this->get_size()); - result->fill(zero()); - exec->run(csr::make_fill_in_dense( - this, make_temporary_output_clone(exec, result).get())); + auto tmp_result = make_temporary_output_clone(exec, result); + tmp_result->resize(this->get_size()); + tmp_result->fill(zero()); + exec->run(csr::make_fill_in_dense(this, tmp_result.get())); } @@ -230,15 +230,22 @@ void Csr::convert_to( Hybrid* result) const { auto exec = this->get_executor(); - Array row_nnz{exec, this->get_size()[0]}; - Array coo_row_ptrs{exec, this->get_size()[0] + 1}; - exec->run(csr::make_convert_ptrs_to_sizes( - 
this->get_const_row_ptrs(), this->get_size()[0], row_nnz.get_data())); + const auto num_rows = this->get_size()[0]; + const auto num_cols = this->get_size()[1]; + Array row_nnz{exec, num_rows}; + Array coo_row_ptrs{exec, num_rows + 1}; + exec->run(csr::make_convert_ptrs_to_sizes(this->get_const_row_ptrs(), + num_rows, row_nnz.get_data())); size_type ell_lim{}; size_type coo_nnz{}; result->get_strategy()->compute_hybrid_config(row_nnz, &ell_lim, &coo_nnz); + if (ell_lim > num_cols) { + // TODO remove temporary fix after ELL gains true structural zeros + ell_lim = num_cols; + } exec->run(csr::make_compute_hybrid_coo_row_ptrs(row_nnz, ell_lim, coo_row_ptrs.get_data())); + coo_nnz = exec->copy_val_to_host(coo_row_ptrs.get_const_data() + num_rows); auto tmp = make_temporary_clone(exec, result); tmp->resize(this->get_size(), ell_lim, coo_nnz); exec->run(csr::make_convert_to_hybrid(this, coo_row_ptrs.get_const_data(), diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 93ec5578700..b89d036ee73 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -557,18 +557,23 @@ template void Dense::convert_impl(Hybrid* result) const { auto exec = this->get_executor(); - Array row_nnz{exec, this->get_size()[0]}; - Array coo_row_ptrs{exec, this->get_size()[0] + 1}; + const auto num_rows = this->get_size()[0]; + const auto num_cols = this->get_size()[1]; + Array row_nnz{exec, num_rows}; + Array coo_row_ptrs{exec, num_rows + 1}; exec->run(dense::make_count_nonzeros_per_row(this, row_nnz.get_data())); size_type ell_lim{}; size_type coo_nnz{}; result->get_strategy()->compute_hybrid_config(row_nnz, &ell_lim, &coo_nnz); + if (ell_lim > num_cols) { + // TODO remove temporary fix after ELL gains true structural zeros + ell_lim = num_cols; + } exec->run(dense::make_compute_hybrid_coo_row_ptrs(row_nnz, ell_lim, coo_row_ptrs.get_data())); + coo_nnz = exec->copy_val_to_host(coo_row_ptrs.get_const_data() + num_rows); auto tmp = make_temporary_clone(exec, result); - tmp->ell_->resize(this->get_size(), ell_lim); - tmp->coo_->resize(this->get_size(), coo_nnz); - tmp->set_size(this->get_size()); + tmp->resize(this->get_size(), ell_lim, coo_nnz); exec->run(dense::make_convert_to_hybrid(this, coo_row_ptrs.get_const_data(), tmp.get())); } diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp index 684fb7e627d..0a9f22f8acc 100644 --- a/core/matrix/ell.cpp +++ b/core/matrix/ell.cpp @@ -132,10 +132,10 @@ template void Ell::convert_to(Dense* result) const { auto exec = this->get_executor(); - result->resize(this->get_size()); - result->fill(zero()); - exec->run(ell::make_fill_in_dense( - this, make_temporary_output_clone(exec, result).get())); + auto tmp_result = make_temporary_output_clone(exec, result); + tmp_result->resize(this->get_size()); + tmp_result->fill(zero()); + exec->run(ell::make_fill_in_dense(this, tmp_result.get())); } diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp index 688d7aa4b9f..e0aa4b8df3e 100644 --- a/core/matrix/fbcsr.cpp +++ b/core/matrix/fbcsr.cpp @@ -155,10 +155,10 @@ void Fbcsr::convert_to( Dense* const result) const { auto exec = this->get_executor(); - result->resize(this->get_size()); - result->fill(zero()); - exec->run(fbcsr::make_fill_in_dense( - this, make_temporary_output_clone(exec, result).get())); + auto tmp_result = make_temporary_output_clone(exec, result); + tmp_result->resize(this->get_size()); + tmp_result->fill(zero()); + exec->run(fbcsr::make_fill_in_dense(this, tmp_result.get())); } diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp index 
42cdce0f162..6850c71a4bb 100644 --- a/core/matrix/hybrid.cpp +++ b/core/matrix/hybrid.cpp @@ -63,12 +63,13 @@ namespace { GKO_REGISTER_OPERATION(build_row_ptrs, components::build_row_ptrs); GKO_REGISTER_OPERATION(compute_row_nnz, hybrid::compute_row_nnz); -GKO_REGISTER_OPERATION(split_matrix_data, hybrid::split_matrix_data); +GKO_REGISTER_OPERATION(fill_in_matrix_data, hybrid::fill_in_matrix_data); GKO_REGISTER_OPERATION(ell_fill_in_dense, ell::fill_in_dense); GKO_REGISTER_OPERATION(coo_fill_in_dense, coo::fill_in_dense); GKO_REGISTER_OPERATION(ell_extract_diagonal, ell::extract_diagonal); GKO_REGISTER_OPERATION(coo_extract_diagonal, coo::extract_diagonal); GKO_REGISTER_OPERATION(ell_count_nonzeros_per_row, ell::count_nonzeros_per_row); +GKO_REGISTER_OPERATION(compute_coo_row_ptrs, hybrid::compute_coo_row_ptrs); GKO_REGISTER_OPERATION(convert_idxs_to_ptrs, components::convert_idxs_to_ptrs); GKO_REGISTER_OPERATION(convert_to_csr, hybrid::convert_to_csr); GKO_REGISTER_OPERATION(fill_array, components::fill_array); @@ -209,9 +210,11 @@ template void Hybrid::read(const device_mat_data& data) { auto exec = this->get_executor(); + const auto num_rows = data.size[0]; + const auto num_cols = data.size[1]; auto local_data = make_temporary_clone(exec, &data.nonzeros); - Array row_ptrs{exec, data.size[0] + 1}; - exec->run(hybrid::make_build_row_ptrs(*local_data, data.size[0], + Array row_ptrs{exec, num_rows + 1}; + exec->run(hybrid::make_build_row_ptrs(*local_data, num_rows, row_ptrs.get_data())); Array row_nnz{exec, data.size[0]}; exec->run(hybrid::make_compute_row_nnz(row_ptrs, row_nnz.get_data())); @@ -219,15 +222,17 @@ void Hybrid::read(const device_mat_data& data) size_type coo_nnz{}; this->get_strategy()->compute_hybrid_config(row_nnz, &ell_max_nnz, &coo_nnz); - auto ell_nnz = data.nonzeros.get_num_elems() - coo_nnz; - device_mat_data ell_data{exec, data.size, ell_nnz}; - device_mat_data coo_data{exec, data.size, coo_nnz}; - exec->run(hybrid::make_split_matrix_data( - data.nonzeros, row_ptrs.get_const_data(), ell_max_nnz, data.size[0], - ell_data.nonzeros, coo_data.nonzeros)); - this->set_size(data.size); - ell_->read(ell_data); - coo_->read(coo_data); + if (ell_max_nnz > num_cols) { + // TODO remove temporary fix after ELL gains true structural zeros + ell_max_nnz = num_cols; + } + Array coo_row_ptrs{exec, num_rows + 1}; + exec->run(hybrid::make_compute_coo_row_ptrs(row_nnz, ell_max_nnz, + coo_row_ptrs.get_data())); + coo_nnz = exec->copy_val_to_host(coo_row_ptrs.get_const_data() + num_rows); + this->resize(data.size, ell_max_nnz, coo_nnz); + exec->run(hybrid::make_fill_in_matrix_data( + data, row_ptrs.get_const_data(), coo_row_ptrs.get_const_data(), this)); } diff --git a/core/matrix/hybrid_kernels.hpp b/core/matrix/hybrid_kernels.hpp index 9e9cc08f88d..ca298c9eabd 100644 --- a/core/matrix/hybrid_kernels.hpp +++ b/core/matrix/hybrid_kernels.hpp @@ -56,13 +56,12 @@ namespace kernels { const Array& row_nnz, \ size_type ell_lim, int64* coo_row_ptrs) -#define GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL(ValueType, IndexType) \ - void split_matrix_data( \ - std::shared_ptr exec, \ - const Array>& data, \ - const int64* row_ptrs, size_type num_rows, size_type ell_limit, \ - Array>& ell_data, \ - Array>& coo_data) +#define GKO_DECLARE_HYBRID_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType) \ + void fill_in_matrix_data( \ + std::shared_ptr exec, \ + const device_matrix_data& data, \ + const int64* row_ptrs, const int64* coo_row_ptrs, \ + matrix::Hybrid* result) #define 
GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL(ValueType, IndexType) \ void convert_to_csr(std::shared_ptr exec, \ @@ -72,12 +71,12 @@ namespace kernels { matrix::Csr* result) -#define GKO_DECLARE_ALL_AS_TEMPLATES \ - GKO_DECLARE_HYBRID_COMPUTE_ROW_NNZ; \ - GKO_DECLARE_HYBRID_COMPUTE_COO_ROW_PTRS_KERNEL; \ - template \ - GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL(ValueType, IndexType); \ - template \ +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + GKO_DECLARE_HYBRID_COMPUTE_ROW_NNZ; \ + GKO_DECLARE_HYBRID_COMPUTE_COO_ROW_PTRS_KERNEL; \ + template \ + GKO_DECLARE_HYBRID_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \ + template \ GKO_DECLARE_HYBRID_CONVERT_TO_CSR_KERNEL(ValueType, IndexType) diff --git a/core/matrix/sellp.cpp b/core/matrix/sellp.cpp index b5e70c23647..bbdf22df95e 100644 --- a/core/matrix/sellp.cpp +++ b/core/matrix/sellp.cpp @@ -127,10 +127,10 @@ template void Sellp::convert_to(Dense* result) const { auto exec = this->get_executor(); - result->resize(this->get_size()); - result->fill(zero()); - exec->run(sellp::make_fill_in_dense( - this, make_temporary_output_clone(exec, result).get())); + auto tmp_result = make_temporary_output_clone(exec, result); + tmp_result->resize(this->get_size()); + tmp_result->fill(zero()); + exec->run(sellp::make_fill_in_dense(this, tmp_result.get())); } diff --git a/core/matrix/sparsity_csr.cpp b/core/matrix/sparsity_csr.cpp index 8d32985abf1..ea02f43b7c2 100644 --- a/core/matrix/sparsity_csr.cpp +++ b/core/matrix/sparsity_csr.cpp @@ -127,10 +127,10 @@ void SparsityCsr::convert_to( Dense* result) const { auto exec = this->get_executor(); - auto tmp = make_temporary_output_clone(exec, result); - tmp->resize(this->get_size()); - tmp->fill(zero()); - exec->run(sparsity_csr::make_fill_in_dense(this, tmp.get())); + auto tmp_result = make_temporary_output_clone(exec, result); + tmp_result->resize(this->get_size()); + tmp_result->fill(zero()); + exec->run(sparsity_csr::make_fill_in_dense(this, tmp_result.get())); } diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 4945056790b..2ace138b485 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -95,7 +95,6 @@ target_sources(ginkgo_cuda matrix/ell_kernels.cu matrix/fbcsr_kernels.cu matrix/fft_kernels.cu - matrix/hybrid_kernels.cu matrix/sellp_kernels.cu matrix/sparsity_csr_kernels.cu multigrid/amgx_pgm_kernels.cu diff --git a/cuda/matrix/coo_kernels.cu b/cuda/matrix/coo_kernels.cu index 0c7540ca7b1..29291c6c93e 100644 --- a/cuda/matrix/coo_kernels.cu +++ b/cuda/matrix/coo_kernels.cu @@ -114,7 +114,7 @@ void spmv2(std::shared_ptr exec, const dim3 coo_block(config::warp_size, warps_in_block, 1); const auto nwarps = host_kernel::calculate_nwarps(exec, nnz); - if (nwarps > 0) { + if (nwarps > 0 && b_ncols > 0) { if (b_ncols < 4) { const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols); int num_lines = ceildiv(nnz, nwarps * config::warp_size); @@ -152,7 +152,7 @@ void advanced_spmv2(std::shared_ptr exec, const dim3 coo_block(config::warp_size, warps_in_block, 1); const auto b_ncols = b->get_size()[1]; - if (nwarps > 0) { + if (nwarps > 0 && b_ncols > 0) { if (b_ncols < 4) { int num_lines = ceildiv(nnz, nwarps * config::warp_size); const dim3 coo_grid(ceildiv(nwarps, warps_in_block), b_ncols); diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu index 60d2c119a03..7d2fc8defc7 100644 --- a/cuda/matrix/dense_kernels.cu +++ b/cuda/matrix/dense_kernels.cu @@ -300,7 +300,23 @@ template void convert_to_sparsity_csr(std::shared_ptr exec, const matrix::Dense* source, 
matrix::SparsityCsr* result) - GKO_NOT_IMPLEMENTED; +{ + auto num_rows = result->get_size()[0]; + auto num_cols = result->get_size()[1]; + + auto row_ptrs = result->get_row_ptrs(); + auto col_idxs = result->get_col_idxs(); + + auto stride = source->get_stride(); + + const auto grid_dim = + ceildiv(num_rows, default_block_size / config::warp_size); + if (grid_dim > 0) { + kernel::fill_in_sparsity_csr<<>>( + num_rows, num_cols, stride, + as_cuda_type(source->get_const_values()), row_ptrs, col_idxs); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL); diff --git a/cuda/matrix/hybrid_kernels.cu b/cuda/matrix/hybrid_kernels.cu deleted file mode 100644 index f1b05c4f605..00000000000 --- a/cuda/matrix/hybrid_kernels.cu +++ /dev/null @@ -1,64 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*************************************************************/ - -#include "core/matrix/hybrid_kernels.hpp" - - -#include -#include -#include -#include -#include - - -#include "common/unified/base/kernel_launch.hpp" -#include "cuda/base/types.hpp" - - -namespace gko { -namespace kernels { -namespace cuda { -/** - * @brief The Hybrid matrix format namespace. - * - * @ingroup hybrid - */ -namespace hybrid { - - -#include "common/cuda_hip/matrix/hybrid_kernels.hpp.inc" - - -} // namespace hybrid -} // namespace cuda -} // namespace kernels -} // namespace gko diff --git a/cuda/test/matrix/dense_kernels.cpp b/cuda/test/matrix/dense_kernels.cpp index 8b3ab2630f3..ff787f2be0c 100644 --- a/cuda/test/matrix/dense_kernels.cpp +++ b/cuda/test/matrix/dense_kernels.cpp @@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
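// Illustrative sketch (editorial, not part of the patch): the fill_in_sparsity_csr
// kernel launched by the CUDA/HIP convert_to_sparsity_csr functions above compacts
// the nonzero columns of each dense row with a warp ballot: every lane tests one
// column, the ballot mask records which lanes found a nonzero, and a popcount over
// the lower-ranked lanes gives each lane its write offset within the row. A minimal
// sequential model of that indexing, with hypothetical names and the 32-wide warp
// emulated by a plain loop, could look as follows; it is a sketch, not Ginkgo code.

#include <bitset>
#include <cstdint>
#include <vector>

// Writes the column indices of all nonzeros of one dense row into col_idxs,
// starting at base_out (that row's CSR row pointer). Returns how many it wrote.
inline int compact_row_columns(const double* row_vals, int num_cols,
                               int base_out, std::vector<int>& col_idxs)
{
    constexpr int warp_size = 32;  // stands in for config::warp_size
    int written = 0;
    for (int chunk = 0; chunk < num_cols; chunk += warp_size) {
        // "ballot": one bit per lane, set if that lane's column holds a nonzero
        std::uint32_t mask = 0;
        for (int lane = 0; lane < warp_size; lane++) {
            const int col = chunk + lane;
            if (col < num_cols && row_vals[col] != 0.0) {
                mask |= std::uint32_t{1} << lane;
            }
        }
        // every set lane writes at base + already written + popcount(lower lanes)
        for (int lane = 0; lane < warp_size; lane++) {
            const auto lane_bit = std::uint32_t{1} << lane;
            if (mask & lane_bit) {
                const auto lower = std::bitset<32>(mask & (lane_bit - 1)).count();
                col_idxs[base_out + written + static_cast<int>(lower)] =
                    chunk + lane;
            }
        }
        written += static_cast<int>(std::bitset<32>(mask).count());
    }
    return written;
}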
#include #include #include +#include #include "core/components/fill_array_kernels.hpp" @@ -437,6 +438,32 @@ TEST_F(Dense, MoveToCsrIsEquivalentToRef) } +TEST_F(Dense, ConvertToSparsityCsrIsEquivalentToRef) +{ + set_up_apply_data(); + auto sparsity_mtx = gko::matrix::SparsityCsr<>::create(ref); + auto d_sparsity_mtx = gko::matrix::SparsityCsr<>::create(cuda); + + x->convert_to(sparsity_mtx.get()); + dx->convert_to(d_sparsity_mtx.get()); + + GKO_ASSERT_MTX_NEAR(d_sparsity_mtx.get(), sparsity_mtx.get(), 0); +} + + +TEST_F(Dense, MoveToSparsityCsrIsEquivalentToRef) +{ + set_up_apply_data(); + auto sparsity_mtx = gko::matrix::SparsityCsr<>::create(ref); + auto d_sparsity_mtx = gko::matrix::SparsityCsr<>::create(cuda); + + x->move_to(sparsity_mtx.get()); + dx->move_to(d_sparsity_mtx.get()); + + GKO_ASSERT_MTX_NEAR(d_sparsity_mtx.get(), sparsity_mtx.get(), 0); +} + + TEST_F(Dense, ConvertToEllIsEquivalentToRef) { set_up_apply_data(); diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index f896ac396a8..3a18b0930d1 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -35,7 +35,6 @@ target_sources(ginkgo_dpcpp matrix/diagonal_kernels.dp.cpp matrix/ell_kernels.dp.cpp matrix/fft_kernels.dp.cpp - matrix/hybrid_kernels.dp.cpp matrix/sellp_kernels.dp.cpp matrix/sparsity_csr_kernels.dp.cpp multigrid/amgx_pgm_kernels.dp.cpp diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 626fd17dab0..cfd435d1dc1 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include @@ -84,316 +85,6 @@ constexpr int default_block_size = 256; namespace kernel { -template -void fill_in_coo(size_type num_rows, size_type num_cols, size_type stride, - const int64* __restrict__ row_ptrs, - const ValueType* __restrict__ source, - IndexType* __restrict__ row_idxs, - IndexType* __restrict__ col_idxs, - ValueType* __restrict__ values, sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - if (tidx < num_rows) { - size_type write_to = row_ptrs[tidx]; - - for (size_type i = 0; i < num_cols; i++) { - if (is_nonzero(source[stride * tidx + i])) { - values[write_to] = source[stride * tidx + i]; - col_idxs[write_to] = i; - row_idxs[write_to] = tidx; - write_to++; - } - } - } -} - -GKO_ENABLE_DEFAULT_HOST(fill_in_coo, fill_in_coo) - - -template -void count_nnz_per_row(size_type num_rows, size_type num_cols, size_type stride, - const ValueType* __restrict__ work, - IndexType* __restrict__ result, - sycl::nd_item<3> item_ct1) -{ - constexpr auto sg_size = KCFG_1D::decode<1>(cfg); - const auto row_idx = thread::get_subwarp_id_flat(item_ct1); - auto warp_tile = - group::tiled_partition(group::this_thread_block(item_ct1)); - - if (row_idx < num_rows) { - IndexType part_result{}; - for (auto i = warp_tile.thread_rank(); i < num_cols; i += sg_size) { - if (is_nonzero(work[stride * row_idx + i])) { - part_result += 1; - } - } - result[row_idx] = ::gko::kernels::dpcpp::reduce( - warp_tile, part_result, - [](const size_type& a, const size_type& b) { return a + b; }); - } -} - -GKO_ENABLE_DEFAULT_HOST_CONFIG(count_nnz_per_row, count_nnz_per_row) -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(count_nnz_per_row, count_nnz_per_row) -GKO_ENABLE_DEFAULT_CONFIG_CALL(count_nnz_per_row_call, count_nnz_per_row, - kcfg_1d_list) - - -template -void fill_in_csr(size_type num_rows, size_type num_cols, size_type stride, - 
const ValueType* __restrict__ source, - IndexType* __restrict__ row_ptrs, - IndexType* __restrict__ col_idxs, - ValueType* __restrict__ values, sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - - if (tidx < num_rows) { - auto write_to = row_ptrs[tidx]; - for (size_type i = 0; i < num_cols; i++) { - if (is_nonzero(source[stride * tidx + i])) { - values[write_to] = source[stride * tidx + i]; - col_idxs[write_to] = i; - write_to++; - } - } - } -} - -GKO_ENABLE_DEFAULT_HOST(fill_in_csr, fill_in_csr) - - -template -void fill_in_ell(size_type num_rows, size_type num_cols, - size_type source_stride, const ValueType* __restrict__ source, - size_type max_nnz_per_row, size_type result_stride, - IndexType* __restrict__ col_ptrs, - ValueType* __restrict__ values, sycl::nd_item<3> item_ct1) -{ - const auto tidx = thread::get_thread_id_flat(item_ct1); - if (tidx < num_rows) { - IndexType col_idx = 0; - for (size_type col = 0; col < num_cols; col++) { - if (is_nonzero(source[tidx * source_stride + col])) { - col_ptrs[col_idx * result_stride + tidx] = col; - values[col_idx * result_stride + tidx] = - source[tidx * source_stride + col]; - col_idx++; - } - } - for (size_type j = col_idx; j < max_nnz_per_row; j++) { - col_ptrs[j * result_stride + tidx] = 0; - values[j * result_stride + tidx] = zero(); - } - } else if (tidx < result_stride) { - for (size_type j = 0; j < max_nnz_per_row; j++) { - col_ptrs[j * result_stride + tidx] = 0; - values[j * result_stride + tidx] = zero(); - } - } -} - -GKO_ENABLE_DEFAULT_HOST(fill_in_ell, fill_in_ell) - - -template -void calculate_slice_lengths(size_type num_rows, size_type slice_size, - int slice_num, size_type stride_factor, - const size_type* __restrict__ nnz_per_row, - size_type* __restrict__ slice_lengths, - size_type* __restrict__ slice_sets, - sycl::nd_item<3> item_ct1) -{ - constexpr auto sg_size = cfg; - const auto sliceid = item_ct1.get_group(2); - const auto tid_in_warp = item_ct1.get_local_id(2); - const bool runable = sliceid * slice_size + tid_in_warp < num_rows; - size_type thread_result = 0; - for (size_type i = tid_in_warp; i < slice_size; i += sg_size) { - thread_result = - (i + slice_size * sliceid < num_rows) - ? 
max(thread_result, nnz_per_row[sliceid * slice_size + i]) - : thread_result; - } - - auto warp_tile = - group::tiled_partition(group::this_thread_block(item_ct1)); - auto warp_result = ::gko::kernels::dpcpp::reduce( - warp_tile, thread_result, - [](const size_type& a, const size_type& b) { return max(a, b); }); - - if (tid_in_warp == 0 && runable) { - auto slice_length = ceildiv(warp_result, stride_factor) * stride_factor; - slice_lengths[sliceid] = slice_length; - slice_sets[sliceid] = slice_length; - } -} - -GKO_ENABLE_DEFAULT_HOST_CONFIG(calculate_slice_lengths, calculate_slice_lengths) -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(calculate_slice_lengths, - calculate_slice_lengths) -GKO_ENABLE_DEFAULT_CONFIG_CALL(calculate_slice_lengths_call, - calculate_slice_lengths, subgroup_list) - - -template -void fill_in_sellp(size_type num_rows, size_type num_cols, size_type slice_size, - size_type stride, const ValueType* __restrict__ source, - size_type* __restrict__ slice_lengths, - size_type* __restrict__ slice_sets, - IndexType* __restrict__ col_idxs, - ValueType* __restrict__ vals, sycl::nd_item<3> item_ct1) -{ - const auto global_row = thread::get_thread_id_flat(item_ct1); - const auto row = global_row % slice_size; - const auto sliceid = global_row / slice_size; - - if (global_row < num_rows) { - size_type sellp_ind = slice_sets[sliceid] * slice_size + row; - - for (size_type col = 0; col < num_cols; col++) { - auto val = source[global_row * stride + col]; - if (is_nonzero(val)) { - col_idxs[sellp_ind] = col; - vals[sellp_ind] = val; - sellp_ind += slice_size; - } - } - for (size_type i = sellp_ind; - i < - (slice_sets[sliceid] + slice_lengths[sliceid]) * slice_size + row; - i += slice_size) { - col_idxs[i] = 0; - vals[i] = zero(); - } - } -} - -GKO_ENABLE_DEFAULT_HOST(fill_in_sellp, fill_in_sellp) - - -template -void reduce_max_nnz(size_type size, const size_type* __restrict__ nnz_per_row, - size_type* __restrict__ result, sycl::nd_item<3> item_ct1, - uint8_t* dpct_local) -{ - constexpr auto sg_size = KCFG_1D::decode<1>(cfg); - auto block_max = (size_type*)dpct_local; - - reduce_array( - size, nnz_per_row, block_max, item_ct1, - [](const size_type& x, const size_type& y) { return max(x, y); }); - - if (item_ct1.get_local_id(2) == 0) { - result[item_ct1.get_group(2)] = block_max[0]; - } -} - -template -void reduce_max_nnz(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type size, - const size_type* nnz_per_row, size_type* result) -{ - queue->submit([&](sycl::handler& cgh) { - sycl::accessor - dpct_local_acc_ct1(sycl::range<1>(dynamic_shared_memory), cgh); - - - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - reduce_max_nnz(size, nnz_per_row, result, item_ct1, - dpct_local_acc_ct1.get_pointer().get()); - }); - }); -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_max_nnz, reduce_max_nnz); -GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_call, reduce_max_nnz, - kcfg_1d_list) - - -template -void reduce_max_nnz_per_slice(size_type num_rows, size_type slice_size, - size_type stride_factor, - const size_type* __restrict__ nnz_per_row, - size_type* __restrict__ result, - sycl::nd_item<3> item_ct1) -{ - constexpr auto sg_size = KCFG_1D::decode<1>(cfg); - auto warp_tile = - group::tiled_partition(group::this_thread_block(item_ct1)); - const auto warpid = thread::get_subwarp_id_flat(item_ct1); - const auto tid_in_warp = warp_tile.thread_rank(); - const auto slice_num = ceildiv(num_rows, slice_size); - - size_type 
thread_result = 0; - for (size_type i = tid_in_warp; i < slice_size; i += sg_size) { - if (warpid * slice_size + i < num_rows) { - thread_result = - max(thread_result, nnz_per_row[warpid * slice_size + i]); - } - } - - auto warp_result = ::gko::kernels::dpcpp::reduce( - warp_tile, thread_result, - [](const size_type& a, const size_type& b) { return max(a, b); }); - - if (tid_in_warp == 0 && warpid < slice_num) { - result[warpid] = ceildiv(warp_result, stride_factor) * stride_factor; - } -} - -GKO_ENABLE_DEFAULT_HOST_CONFIG(reduce_max_nnz_per_slice, - reduce_max_nnz_per_slice) -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_max_nnz_per_slice, - reduce_max_nnz_per_slice) -GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_max_nnz_per_slice_call, - reduce_max_nnz_per_slice, kcfg_1d_list) - - -template -void reduce_total_cols(size_type num_slices, - const size_type* __restrict__ max_nnz_per_slice, - size_type* __restrict__ result, - sycl::nd_item<3> item_ct1, uint8_t* dpct_local) -{ - auto block_result = (size_type*)dpct_local; - constexpr auto sg_size = KCFG_1D::decode<1>(cfg); - reduce_array( - num_slices, max_nnz_per_slice, block_result, item_ct1, - [](const size_type& x, const size_type& y) { return x + y; }); - - if (item_ct1.get_local_id(2) == 0) { - result[item_ct1.get_group(2)] = block_result[0]; - } -} - -template -void reduce_total_cols(dim3 grid, dim3 block, size_type dynamic_shared_memory, - sycl::queue* queue, size_type num_slices, - const size_type* max_nnz_per_slice, size_type* result) -{ - queue->submit([&](sycl::handler& cgh) { - sycl::accessor - dpct_local_acc_ct1(sycl::range<1>(dynamic_shared_memory), cgh); - - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - reduce_total_cols(num_slices, max_nnz_per_slice, result, - item_ct1, - dpct_local_acc_ct1.get_pointer().get()); - }); - }); -} - -GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_total_cols, - reduce_total_cols); -GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_total_cols_call, reduce_total_cols, - kcfg_1d_list) - template void transpose(const size_type nrows, const size_type ncols, const ValueType* __restrict__ in, const size_type in_stride, @@ -540,29 +231,30 @@ void convert_to_coo(std::shared_ptr exec, const int64* row_ptrs, matrix::Coo* result) { - auto num_rows = result->get_size()[0]; - auto num_cols = result->get_size()[1]; - - auto row_idxs = result->get_row_idxs(); - auto col_idxs = result->get_col_idxs(); - auto values = result->get_values(); + const auto num_rows = result->get_size()[0]; + const auto num_cols = result->get_size()[1]; + const auto in_vals = source->get_const_values(); + const auto stride = source->get_stride(); - auto stride = source->get_stride(); + auto rows = result->get_row_idxs(); + auto cols = result->get_col_idxs(); + auto vals = result->get_values(); - auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const std::uint32_t cfg = - get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); + exec->get_queue()->submit([&](sycl::handler& cgh) { + cgh.parallel_for(num_rows, [=](sycl::item<1> item) { + const auto row = static_cast(item[0]); + auto write_to = row_ptrs[row]; + + for (size_type col = 0; col < num_cols; col++) { + if (is_nonzero(in_vals[stride * row + col])) { + vals[write_to] = in_vals[stride * row + col]; + cols[write_to] = static_cast(col); + rows[write_to] = static_cast(row); + write_to++; + } + } }); - const auto wg_size = 
KCFG_1D::decode<0>(cfg); - const auto sg_size = KCFG_1D::decode<1>(cfg); - size_type grid_dim = ceildiv(num_rows, wg_size); - - kernel::fill_in_coo(grid_dim, wg_size, 0, exec->get_queue(), num_rows, - num_cols, stride, row_ptrs, source->get_const_values(), - row_idxs, col_idxs, values); + }); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -574,30 +266,29 @@ void convert_to_csr(std::shared_ptr exec, const matrix::Dense* source, matrix::Csr* result) { - auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const std::uint32_t cfg = - get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); - const auto wg_size = KCFG_1D::decode<0>(cfg); - const auto sg_size = KCFG_1D::decode<1>(cfg); - - auto num_rows = result->get_size()[0]; - auto num_cols = result->get_size()[1]; - - auto row_ptrs = result->get_row_ptrs(); - auto col_idxs = result->get_col_idxs(); - auto values = result->get_values(); - - auto stride = source->get_stride(); + const auto num_rows = result->get_size()[0]; + const auto num_cols = result->get_size()[1]; + const auto in_vals = source->get_const_values(); + const auto stride = source->get_stride(); - size_type grid_dim = ceildiv(num_rows, wg_size); + const auto row_ptrs = result->get_const_row_ptrs(); + auto cols = result->get_col_idxs(); + auto vals = result->get_values(); - kernel::fill_in_csr(grid_dim, default_block_size, 0, exec->get_queue(), - num_rows, num_cols, stride, source->get_const_values(), - row_ptrs, col_idxs, values); + exec->get_queue()->submit([&](sycl::handler& cgh) { + cgh.parallel_for(num_rows, [=](sycl::item<1> item) { + const auto row = static_cast(item[0]); + auto write_to = row_ptrs[row]; + + for (size_type col = 0; col < num_cols; col++) { + if (is_nonzero(in_vals[stride * row + col])) { + vals[write_to] = in_vals[stride * row + col]; + cols[write_to] = static_cast(col); + write_to++; + } + } + }); + }); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -609,29 +300,34 @@ void convert_to_ell(std::shared_ptr exec, const matrix::Dense* source, matrix::Ell* result) { - auto num_rows = result->get_size()[0]; - auto num_cols = result->get_size()[1]; - auto max_nnz_per_row = result->get_num_stored_elements_per_row(); - - auto col_ptrs = result->get_col_idxs(); - auto values = result->get_values(); - - auto source_stride = source->get_stride(); - auto result_stride = result->get_stride(); + const auto num_rows = result->get_size()[0]; + const auto num_cols = result->get_size()[1]; + const auto max_nnz_per_row = result->get_num_stored_elements_per_row(); + const auto in_vals = source->get_const_values(); + const auto in_stride = source->get_stride(); - auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const std::uint32_t cfg = - get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); + auto cols = result->get_col_idxs(); + auto vals = result->get_values(); + const auto stride = result->get_stride(); + + exec->get_queue()->submit([&](sycl::handler& cgh) { + cgh.parallel_for(num_rows, [=](sycl::item<1> item) { + const auto row = static_cast(item[0]); + size_type col_idx = 0; + for (size_type col = 0; col < num_cols; col++) { + if (is_nonzero(in_vals[row * in_stride + col])) { + cols[col_idx * stride + row] = col; + vals[col_idx * stride + row] = + in_vals[row * in_stride + col]; + col_idx++; + } + } + for (; 
col_idx < max_nnz_per_row; col_idx++) { + cols[col_idx * stride + row] = 0; + vals[col_idx * stride + row] = zero(); + } }); - const auto wg_size = KCFG_1D::decode<0>(cfg); - const auto sg_size = KCFG_1D::decode<1>(cfg); - auto grid_dim = ceildiv(result_stride, wg_size); - kernel::fill_in_ell(grid_dim, wg_size, 0, exec->get_queue(), num_rows, - num_cols, source_stride, source->get_const_values(), - max_nnz_per_row, result_stride, col_ptrs, values); + }); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -659,10 +355,56 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( template -void convert_to_hybrid( - std::shared_ptr exec, - const matrix::Dense* source, const int64* coo_row_ptrs, - matrix::Hybrid* result) GKO_NOT_IMPLEMENTED; +void convert_to_hybrid(std::shared_ptr exec, + const matrix::Dense* source, + const int64* coo_row_ptrs, + matrix::Hybrid* result) +{ + const auto num_rows = result->get_size()[0]; + const auto num_cols = result->get_size()[1]; + const auto ell_lim = result->get_ell_num_stored_elements_per_row(); + const auto in_vals = source->get_const_values(); + const auto in_stride = source->get_stride(); + const auto ell_stride = result->get_ell_stride(); + auto ell_cols = result->get_ell_col_idxs(); + auto ell_vals = result->get_ell_values(); + auto coo_rows = result->get_coo_row_idxs(); + auto coo_cols = result->get_coo_col_idxs(); + auto coo_vals = result->get_coo_values(); + + exec->get_queue()->submit([&](sycl::handler& cgh) { + cgh.parallel_for(num_rows, [=](sycl::item<1> item) { + const auto row = static_cast(item[0]); + size_type ell_count = 0; + size_type col = 0; + auto ell_idx = row; + for (; col < num_cols && ell_count < ell_lim; col++) { + const auto val = in_vals[row * in_stride + col]; + if (is_nonzero(val)) { + ell_vals[ell_idx] = val; + ell_cols[ell_idx] = static_cast(col); + ell_count++; + ell_idx += ell_stride; + } + } + for (; ell_count < ell_lim; ell_count++) { + ell_vals[ell_idx] = zero(); + ell_cols[ell_idx] = 0; + ell_idx += ell_stride; + } + auto coo_idx = coo_row_ptrs[row]; + for (; col < num_cols; col++) { + const auto val = in_vals[row * in_stride + col]; + if (is_nonzero(val)) { + coo_vals[coo_idx] = val; + coo_cols[coo_idx] = static_cast(col); + coo_rows[coo_idx] = static_cast(row); + coo_idx++; + } + } + }); + }); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_HYBRID_KERNEL); @@ -673,50 +415,38 @@ void convert_to_sellp(std::shared_ptr exec, const matrix::Dense* source, matrix::Sellp* result) { - auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const std::uint32_t cfg = - get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); - const auto wg_size = KCFG_1D::decode<0>(cfg); - const auto sg_size = KCFG_1D::decode<1>(cfg); - - const auto stride = source->get_stride(); const auto num_rows = result->get_size()[0]; const auto num_cols = result->get_size()[1]; + const auto stride = source->get_stride(); + const auto in_vals = source->get_const_values(); + const auto slice_sets = result->get_const_slice_sets(); + const auto slice_size = result->get_slice_size(); auto vals = result->get_values(); auto col_idxs = result->get_col_idxs(); - auto slice_lengths = result->get_slice_lengths(); - auto slice_sets = result->get_slice_sets(); - - const auto slice_size = result->get_slice_size(); - const auto stride_factor = result->get_stride_factor(); - const int slice_num = ceildiv(num_rows, slice_size); 
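// Illustrative sketch (editorial, not part of the patch): the unified
// fill_in_matrix_data kernel and the SYCL convert_to_hybrid kernel above split each
// row the same way: the first ell_lim nonzeros go into the ELL block (stored
// column-major, so entry k of row r lives at k * ell_stride + r, padded with
// explicit zeros), and the remainder goes into the COO block starting at
// coo_row_ptrs[r]. A sequential model of that split for one dense row, with
// hypothetical names outside Ginkgo's API, might read:

#include <cstdint>
#include <vector>

struct hybrid_row_output {
    std::vector<std::int64_t>& ell_cols;  // size: ell_lim * ell_stride
    std::vector<double>& ell_vals;        // size: ell_lim * ell_stride
    std::vector<std::int64_t>& coo_rows;
    std::vector<std::int64_t>& coo_cols;
    std::vector<double>& coo_vals;
};

inline void split_dense_row(const double* row_vals, std::int64_t num_cols,
                            std::int64_t row, std::int64_t ell_lim,
                            std::int64_t ell_stride, std::int64_t coo_begin,
                            hybrid_row_output out)
{
    std::int64_t ell_count = 0;
    std::int64_t coo_idx = coo_begin;
    for (std::int64_t col = 0; col < num_cols; col++) {
        const auto val = row_vals[col];
        if (val == 0.0) {
            continue;
        }
        if (ell_count < ell_lim) {
            // ELL storage is column-major: consecutive entries of a row are
            // ell_stride apart
            out.ell_cols[ell_count * ell_stride + row] = col;
            out.ell_vals[ell_count * ell_stride + row] = val;
            ell_count++;
        } else {
            // overflow of this row goes to the COO part
            out.coo_rows[coo_idx] = row;
            out.coo_cols[coo_idx] = col;
            out.coo_vals[coo_idx] = val;
            coo_idx++;
        }
    }
    // pad the unused ELL slots of this row with explicit zeros
    for (; ell_count < ell_lim; ell_count++) {
        out.ell_cols[ell_count * ell_stride + row] = 0;
        out.ell_vals[ell_count * ell_stride + row] = 0.0;
    }
}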
- - auto nnz_per_row = Array(exec, num_rows); - count_nonzeros_per_row(exec, source, nnz_per_row.get_data()); - - auto grid_dim = slice_num; - if (grid_dim > 0) { - kernel::calculate_slice_lengths_call( - sg_size, grid_dim, sg_size, 0, exec->get_queue(), num_rows, - slice_size, slice_num, stride_factor, nnz_per_row.get_const_data(), - slice_lengths, slice_sets); - } - - components::prefix_sum(exec, slice_sets, slice_num + 1); - - grid_dim = ceildiv(num_rows, wg_size); - if (grid_dim > 0) { - kernel::fill_in_sellp(grid_dim, wg_size, 0, exec->get_queue(), num_rows, - num_cols, slice_size, stride, - source->get_const_values(), slice_lengths, - slice_sets, col_idxs, vals); - } + exec->get_queue()->submit([&](sycl::handler& cgh) { + cgh.parallel_for(num_rows, [=](sycl::item<1> item) { + const auto row = static_cast(item[0]); + const auto local_row = row % slice_size; + const auto slice = row / slice_size; + const auto slice_end = slice_sets[slice + 1] * slice_size; + auto out_idx = slice_sets[slice] * slice_size + local_row; + + for (size_type col = 0; col < num_cols; col++) { + const auto val = in_vals[row * stride + col]; + if (is_nonzero(val)) { + col_idxs[out_idx] = static_cast(col); + vals[out_idx] = val; + out_idx += slice_size; + } + } + for (; out_idx < slice_end; out_idx += slice_size) { + col_idxs[out_idx] = 0; + vals[out_idx] = zero(); + } + }); + }); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -727,7 +457,29 @@ template void convert_to_sparsity_csr(std::shared_ptr exec, const matrix::Dense* source, matrix::SparsityCsr* result) - GKO_NOT_IMPLEMENTED; +{ + const auto num_rows = result->get_size()[0]; + const auto num_cols = result->get_size()[1]; + const auto in_vals = source->get_const_values(); + const auto stride = source->get_stride(); + + const auto row_ptrs = result->get_const_row_ptrs(); + auto cols = result->get_col_idxs(); + + exec->get_queue()->submit([&](sycl::handler& cgh) { + cgh.parallel_for(num_rows, [=](sycl::item<1> item) { + const auto row = static_cast(item[0]); + auto write_to = row_ptrs[row]; + + for (size_type col = 0; col < num_cols; col++) { + if (is_nonzero(in_vals[stride * row + col])) { + cols[write_to] = static_cast(col); + write_to++; + } + } + }); + }); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL); diff --git a/dpcpp/test/matrix/dense_kernels.cpp b/dpcpp/test/matrix/dense_kernels.cpp index e8976754b98..6afc78f0d8f 100644 --- a/dpcpp/test/matrix/dense_kernels.cpp +++ b/dpcpp/test/matrix/dense_kernels.cpp @@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
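// Illustrative sketch (editorial, not part of the patch): the SYCL convert_to_sellp
// kernel above relies on the SELL-P storage rule that the k-th stored element of
// global row r sits at (slice_sets[r / slice_size] + k) * slice_size +
// (r % slice_size), with zero padding up to slice_sets[slice + 1] * slice_size.
// Two small helpers capturing just that index arithmetic (hypothetical names,
// outside Ginkgo's API) would be:

#include <cstdint>

// Storage index of the k-th stored element of row `row` in a SELL-P matrix with
// the given slice size and prefix-summed slice_sets array.
inline std::int64_t sellp_index(std::int64_t row, std::int64_t k,
                                std::int64_t slice_size,
                                const std::int64_t* slice_sets)
{
    const auto slice = row / slice_size;
    const auto local_row = row % slice_size;
    return (slice_sets[slice] + k) * slice_size + local_row;
}

// Number of stored elements per row of that slice, including zero padding.
inline std::int64_t sellp_slice_length(std::int64_t row, std::int64_t slice_size,
                                       const std::int64_t* slice_sets)
{
    const auto slice = row / slice_size;
    return slice_sets[slice + 1] - slice_sets[slice];
}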
#include #include #include +#include #include "core/components/fill_array_kernels.hpp" @@ -470,6 +471,32 @@ TEST_F(Dense, MoveToCsrIsEquivalentToRef) } +TEST_F(Dense, ConvertToSparsityCsrIsEquivalentToRef) +{ + set_up_apply_data(); + auto sparsity_mtx = gko::matrix::SparsityCsr::create(ref); + auto d_sparsity_mtx = gko::matrix::SparsityCsr::create(dpcpp); + + x->convert_to(sparsity_mtx.get()); + dx->convert_to(d_sparsity_mtx.get()); + + GKO_ASSERT_MTX_NEAR(d_sparsity_mtx.get(), sparsity_mtx.get(), 0); +} + + +TEST_F(Dense, MoveToSparsityCsrIsEquivalentToRef) +{ + set_up_apply_data(); + auto sparsity_mtx = gko::matrix::SparsityCsr::create(ref); + auto d_sparsity_mtx = gko::matrix::SparsityCsr::create(dpcpp); + + x->move_to(sparsity_mtx.get()); + dx->move_to(d_sparsity_mtx.get()); + + GKO_ASSERT_MTX_NEAR(d_sparsity_mtx.get(), sparsity_mtx.get(), 0); +} + + TEST_F(Dense, ConvertToEllIsEquivalentToRef) { set_up_apply_data(); diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 55b2c4605b8..c860d6abb2a 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -179,7 +179,6 @@ set(GINKGO_HIP_SOURCES matrix/diagonal_kernels.hip.cpp matrix/ell_kernels.hip.cpp matrix/fbcsr_kernels.hip.cpp - matrix/hybrid_kernels.hip.cpp matrix/sellp_kernels.hip.cpp matrix/sparsity_csr_kernels.hip.cpp multigrid/amgx_pgm_kernels.hip.cpp diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp index 188deeb7a0f..63e42fbea5b 100644 --- a/hip/matrix/dense_kernels.hip.cpp +++ b/hip/matrix/dense_kernels.hip.cpp @@ -308,7 +308,24 @@ template void convert_to_sparsity_csr(std::shared_ptr exec, const matrix::Dense* source, matrix::SparsityCsr* result) - GKO_NOT_IMPLEMENTED; +{ + auto num_rows = result->get_size()[0]; + auto num_cols = result->get_size()[1]; + + auto row_ptrs = result->get_row_ptrs(); + auto col_idxs = result->get_col_idxs(); + + auto stride = source->get_stride(); + + const auto grid_dim = + ceildiv(num_rows, default_block_size / config::warp_size); + if (grid_dim > 0) { + hipLaunchKernelGGL(kernel::fill_in_sparsity_csr, grid_dim, + default_block_size, 0, 0, num_rows, num_cols, stride, + as_hip_type(source->get_const_values()), + as_hip_type(row_ptrs), as_hip_type(col_idxs)); + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_CONVERT_TO_SPARSITY_CSR_KERNEL); diff --git a/hip/matrix/hybrid_kernels.hip.cpp b/hip/matrix/hybrid_kernels.hip.cpp deleted file mode 100644 index acd067ea28b..00000000000 --- a/hip/matrix/hybrid_kernels.hip.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*************************************************************/ - -#include "core/matrix/hybrid_kernels.hpp" - - -#include -#include -#include -#include -#include -#include - - -#include "common/unified/base/kernel_launch.hpp" -#include "hip/base/types.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The Hybrid matrix format namespace. - * - * @ingroup hybrid - */ -namespace hybrid { - - -#include "common/cuda_hip/matrix/hybrid_kernels.hpp.inc" - - -} // namespace hybrid -} // namespace hip -} // namespace kernels -} // namespace gko diff --git a/hip/test/matrix/dense_kernels.hip.cpp b/hip/test/matrix/dense_kernels.hip.cpp index e6978a68fa1..b9c3899a5fe 100644 --- a/hip/test/matrix/dense_kernels.hip.cpp +++ b/hip/test/matrix/dense_kernels.hip.cpp @@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include "core/components/fill_array_kernels.hpp" @@ -433,6 +434,32 @@ TEST_F(Dense, MoveToCsrIsEquivalentToRef) } +TEST_F(Dense, ConvertToSparsityCsrIsEquivalentToRef) +{ + set_up_apply_data(); + auto sparsity_mtx = gko::matrix::SparsityCsr<>::create(ref); + auto d_sparsity_mtx = gko::matrix::SparsityCsr<>::create(hip); + + x->convert_to(sparsity_mtx.get()); + dx->convert_to(d_sparsity_mtx.get()); + + GKO_ASSERT_MTX_NEAR(d_sparsity_mtx.get(), sparsity_mtx.get(), 0); +} + + +TEST_F(Dense, MoveToSparsityCsrIsEquivalentToRef) +{ + set_up_apply_data(); + auto sparsity_mtx = gko::matrix::SparsityCsr<>::create(ref); + auto d_sparsity_mtx = gko::matrix::SparsityCsr<>::create(hip); + + x->move_to(sparsity_mtx.get()); + dx->move_to(d_sparsity_mtx.get()); + + GKO_ASSERT_MTX_NEAR(d_sparsity_mtx.get(), sparsity_mtx.get(), 0); +} + + TEST_F(Dense, ConvertToEllIsEquivalentToRef) { set_up_apply_data(); diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index 79b29e053c3..179e6ee18a7 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -22,7 +22,6 @@ target_sources(ginkgo_omp matrix/ell_kernels.cpp matrix/fbcsr_kernels.cpp matrix/fft_kernels.cpp - matrix/hybrid_kernels.cpp matrix/sellp_kernels.cpp matrix/sparsity_csr_kernels.cpp multigrid/amgx_pgm_kernels.cpp diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp index 56735caa436..dd47a2b3ed7 100644 --- a/omp/matrix/dense_kernels.cpp +++ b/omp/matrix/dense_kernels.cpp @@ -283,8 +283,7 @@ void convert_to_hybrid(std::shared_ptr exec, { auto num_rows = result->get_size()[0]; auto num_cols = result->get_size()[1]; - auto strategy = result->get_strategy(); - auto ell_lim = strategy->get_ell_num_stored_elements_per_row(); + auto ell_lim = result->get_ell_num_stored_elements_per_row(); auto coo_val = result->get_coo_values(); auto coo_col = result->get_coo_col_idxs(); auto coo_row 
= result->get_coo_row_idxs(); diff --git a/omp/matrix/hybrid_kernels.cpp b/omp/matrix/hybrid_kernels.cpp deleted file mode 100644 index 083975cdc96..00000000000 --- a/omp/matrix/hybrid_kernels.cpp +++ /dev/null @@ -1,101 +0,0 @@ -/************************************************************* -Copyright (c) 2017-2022, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*************************************************************/ - -#include "core/matrix/hybrid_kernels.hpp" - - -#include - - -#include -#include -#include -#include - - -#include "core/components/format_conversion_kernels.hpp" -#include "core/components/prefix_sum_kernels.hpp" -#include "core/matrix/ell_kernels.hpp" - - -namespace gko { -namespace kernels { -namespace omp { -/** - * @brief The Hybrid matrix format namespace. 
- * - * @ingroup hybrid - */ -namespace hybrid { - - -template -void split_matrix_data( - std::shared_ptr exec, - const Array>& data, - const int64* row_ptrs, size_type ell_limit, size_type num_rows, - Array>& ell_data, - Array>& coo_data) -{ - auto data_ptr = data.get_const_data(); - size_type ell_nnz{}; - for (size_type row = 0; row < num_rows; row++) { - ell_nnz += - std::min(ell_limit, row_ptrs[row + 1] - row_ptrs[row]); - } - ell_data.resize_and_reset(ell_nnz); - coo_data.resize_and_reset(data.get_num_elems() - ell_nnz); - size_type ell_nz{}; - size_type coo_nz{}; - for (size_type row = 0; row < num_rows; row++) { - size_type local_ell_nnz{}; - for (auto i = row_ptrs[row]; i < row_ptrs[row + 1]; i++) { - if (local_ell_nnz < ell_limit) { - ell_data.get_data()[ell_nz] = data.get_const_data()[i]; - ell_nz++; - local_ell_nnz++; - } else { - coo_data.get_data()[coo_nz] = data.get_const_data()[i]; - coo_nz++; - } - } - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL); - - -} // namespace hybrid -} // namespace omp -} // namespace kernels -} // namespace gko diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp index 7ce43de53ed..1d8667e14ac 100644 --- a/reference/matrix/csr_kernels.cpp +++ b/reference/matrix/csr_kernels.cpp @@ -683,8 +683,7 @@ void convert_to_hybrid(std::shared_ptr exec, auto num_rows = result->get_size()[0]; auto num_cols = result->get_size()[1]; auto strategy = result->get_strategy(); - auto ell_lim = strategy->get_ell_num_stored_elements_per_row(); - auto coo_lim = strategy->get_coo_nnz(); + auto ell_lim = result->get_ell_num_stored_elements_per_row(); auto coo_val = result->get_coo_values(); auto coo_col = result->get_coo_col_idxs(); auto coo_row = result->get_coo_row_idxs(); diff --git a/reference/matrix/hybrid_kernels.cpp b/reference/matrix/hybrid_kernels.cpp index 3b2f9badc54..d2f12cfc65a 100644 --- a/reference/matrix/hybrid_kernels.cpp +++ b/reference/matrix/hybrid_kernels.cpp @@ -83,40 +83,38 @@ void compute_row_nnz(std::shared_ptr exec, template -void split_matrix_data( - std::shared_ptr exec, - const Array>& data, - const int64* row_ptrs, size_type ell_limit, size_type num_rows, - Array>& ell_data, - Array>& coo_data) +void fill_in_matrix_data(std::shared_ptr exec, + const device_matrix_data& data, + const int64* row_ptrs, const int64*, + matrix::Hybrid* result) { - auto data_ptr = data.get_const_data(); - size_type ell_nnz{}; - for (size_type row = 0; row < num_rows; row++) { - ell_nnz += - std::min(ell_limit, row_ptrs[row + 1] - row_ptrs[row]); - } - ell_data.resize_and_reset(ell_nnz); - coo_data.resize_and_reset(data.get_num_elems() - ell_nnz); - size_type ell_nz{}; + const auto num_rows = result->get_size()[0]; + const auto ell_max_nnz = result->get_ell_num_stored_elements_per_row(); + const auto nonzeros = data.nonzeros.get_const_data(); size_type coo_nz{}; for (size_type row = 0; row < num_rows; row++) { - size_type local_ell_nnz{}; - for (auto i = row_ptrs[row]; i < row_ptrs[row + 1]; i++) { - if (local_ell_nnz < ell_limit) { - ell_data.get_data()[ell_nz] = data.get_const_data()[i]; + size_type ell_nz{}; + for (auto nz = row_ptrs[row]; nz < row_ptrs[row + 1]; nz++) { + if (ell_nz < ell_max_nnz) { + result->ell_col_at(row, ell_nz) = nonzeros[nz].column; + result->ell_val_at(row, ell_nz) = nonzeros[nz].value; ell_nz++; - local_ell_nnz++; } else { - coo_data.get_data()[coo_nz] = data.get_const_data()[i]; + result->get_coo_row_idxs()[coo_nz] = nonzeros[nz].row; + 
+                result->get_coo_col_idxs()[coo_nz] = nonzeros[nz].column;
+                result->get_coo_values()[coo_nz] = nonzeros[nz].value;
                 coo_nz++;
             }
         }
+        for (; ell_nz < ell_max_nnz; ell_nz++) {
+            result->ell_col_at(row, ell_nz) = 0;
+            result->ell_val_at(row, ell_nz) = zero<ValueType>();
+        }
     }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_HYBRID_SPLIT_MATRIX_DATA_KERNEL);
+    GKO_DECLARE_HYBRID_FILL_IN_MATRIX_DATA_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
diff --git a/test/matrix/CMakeLists.txt b/test/matrix/CMakeLists.txt
index e118dcb71bf..2bed18bb9ed 100644
--- a/test/matrix/CMakeLists.txt
+++ b/test/matrix/CMakeLists.txt
@@ -1,2 +1,3 @@
 ginkgo_create_common_test(csr_kernels)
 ginkgo_create_common_test(dense_kernels)
+ginkgo_create_common_test(matrix)
\ No newline at end of file
diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp
new file mode 100644
index 00000000000..7d4b4a41fae
--- /dev/null
+++ b/test/matrix/matrix.cpp
@@ -0,0 +1,806 @@
+/*************************************************************
+Copyright (c) 2017-2022, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/ + +#include +#include +#include +#include + + +#include + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "core/test/utils.hpp" +#include "test/utils/executor.hpp" + + +#if GINKGO_DPCPP_SINGLE_MODE +using matrix_value_type = float; +#else +using matrix_value_type = double; +#endif // GINKGO_DPCPP_SINGLE_MODE + + +template +struct SimpleMatrixTest { + using matrix_type = MtxType; + + static bool preserves_zeros() { return true; } + + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create(exec->get_master(), size); + } + + static void check_property(const std::unique_ptr&) {} +}; + +struct DenseWithDefaultStride + : SimpleMatrixTest> { + static bool preserves_zeros() { return false; } +}; + +struct DenseWithCustomStride : DenseWithDefaultStride { + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create(exec, size, size[0] + 10); + } + + static void check_property(const std::unique_ptr& mtx) + { + ASSERT_EQ(mtx->get_stride(), mtx->get_size()[0] + 10); + } +}; + +struct Coo : SimpleMatrixTest> {}; + +struct CsrWithDefaultStrategy + : SimpleMatrixTest> {}; + + +#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) || \ + defined(GKO_COMPILING_DPCPP) + + +struct CsrWithClassicalStrategy + : SimpleMatrixTest> { + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create(exec, size, 0, + std::make_shared()); + } + + static void check_property(const std::unique_ptr& mtx) + { + ASSERT_TRUE(dynamic_cast( + mtx->get_strategy().get())); + } +}; + +struct CsrWithMergePathStrategy + : SimpleMatrixTest> { + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create(exec, size, 0, + std::make_shared()); + } + + static void check_property(const std::unique_ptr& mtx) + { + ASSERT_TRUE(dynamic_cast( + mtx->get_strategy().get())); + } +}; + +struct CsrWithSparselibStrategy + : SimpleMatrixTest> { + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create(exec, size, 0, + std::make_shared()); + } + + static void check_property(const std::unique_ptr& mtx) + { + ASSERT_TRUE(dynamic_cast( + mtx->get_strategy().get())); + } +}; + +struct CsrWithLoadBalanceStrategy + : SimpleMatrixTest> { + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create(exec, size, 0, + std::make_shared( + gko::EXEC_TYPE::create(0, exec))); + } + + static void check_property(const std::unique_ptr& mtx) + { + ASSERT_TRUE(dynamic_cast( + mtx->get_strategy().get())); + } +}; + +struct CsrWithAutomaticalStrategy + : SimpleMatrixTest> { + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create(exec, size, 0, + std::make_shared( + gko::EXEC_TYPE::create(0, exec))); + } + + static void check_property(const std::unique_ptr& mtx) + { + ASSERT_TRUE(dynamic_cast( + mtx->get_strategy().get())); + } +}; + + +#endif + + +struct Ell : SimpleMatrixTest> {}; + + +struct FbcsrBlocksize1 + : SimpleMatrixTest> { + static bool preserves_zeros() { return false; } + + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create(exec, size, 0, 1); + } + + static void check_property(const std::unique_ptr& mtx) + { + ASSERT_EQ(mtx->get_block_size(), 1); + 
} +}; + +struct FbcsrBlocksize2 + : SimpleMatrixTest> { + static bool preserves_zeros() { return false; } + + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create(exec, size, 0, 2); + } + + static void check_property(const std::unique_ptr& mtx) + { + ASSERT_EQ(mtx->get_block_size(), 2); + } +}; + + +struct SellpDefaultParameters + : SimpleMatrixTest> { + static void check_property(const std::unique_ptr& mtx) + { + ASSERT_EQ(mtx->get_stride_factor(), 1); + ASSERT_EQ(mtx->get_slice_size(), 64); + } +}; + +struct Sellp32Factor2 + : SimpleMatrixTest> { + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create(exec, size, 32, 2, 0); + } + + static void check_property(const std::unique_ptr& mtx) + { + ASSERT_EQ(mtx->get_stride_factor(), 2); + ASSERT_EQ(mtx->get_slice_size(), 32); + } +}; + + +struct HybridDefaultStrategy + : SimpleMatrixTest> {}; + +struct HybridColumnLimitStrategy + : SimpleMatrixTest> { + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create( + exec, size, 0, std::make_shared(10)); + } + + static void check_property(const std::unique_ptr& mtx) + { + auto strategy = dynamic_cast( + mtx->get_strategy().get()); + ASSERT_TRUE(strategy); + ASSERT_EQ(strategy->get_num_columns(), 10); + } +}; + +struct HybridImbalanceLimitStrategy + : SimpleMatrixTest> { + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create( + exec, size, 0, std::make_shared(0.5)); + } + + static void check_property(const std::unique_ptr& mtx) + { + auto strategy = dynamic_cast( + mtx->get_strategy().get()); + ASSERT_TRUE(strategy); + ASSERT_EQ(strategy->get_percentage(), 0.5); + } +}; + +struct HybridImbalanceBoundedLimitStrategy + : SimpleMatrixTest> { + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create( + exec, size, 0, + std::make_shared(0.5, 0.01)); + } + + static void check_property(const std::unique_ptr& mtx) + { + auto strategy = + dynamic_cast( + mtx->get_strategy().get()); + ASSERT_TRUE(strategy); + ASSERT_EQ(strategy->get_percentage(), 0.5); + ASSERT_EQ(strategy->get_ratio(), 0.01); + } +}; + +struct HybridMinStorageStrategy + : SimpleMatrixTest> { + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create( + exec, size, 0, + std::make_shared()); + } + + static void check_property(const std::unique_ptr& mtx) + { + auto strategy = dynamic_cast( + mtx->get_strategy().get()); + ASSERT_TRUE(strategy); + } +}; + +struct HybridAutomaticStrategy + : SimpleMatrixTest> { + static std::unique_ptr create( + std::shared_ptr exec, gko::dim<2> size) + { + return matrix_type::create(exec, size, 0, + std::make_shared()); + } + + static void check_property(const std::unique_ptr& mtx) + { + auto strategy = dynamic_cast( + mtx->get_strategy().get()); + ASSERT_TRUE(strategy); + } +}; + + +template +struct test_pair { + std::unique_ptr ref; + std::unique_ptr dev; + + test_pair(std::unique_ptr ref_obj, + std::shared_ptr exec) + : ref{std::move(ref_obj)}, dev{gko::clone(exec, ref)} + {} + + test_pair(std::unique_ptr ref_obj, + std::unique_ptr dev_obj) + : ref{std::move(ref_obj)}, dev{std::move(dev_obj)} + {} +}; + + +template +class Matrix : public ::testing::Test { +protected: + using Config = T; + using Mtx = typename T::matrix_type; + using index_type = typename Mtx::index_type; + using value_type = typename 
Mtx::value_type; + using mixed_value_type = gko::next_precision; + using Vec = gko::matrix::Dense; + using MixedVec = gko::matrix::Dense; + + Matrix() : rand_engine(15) {} + + void SetUp() + { + ref = gko::ReferenceExecutor::create(); + init_executor(ref, exec); + } + + void TearDown() + { + if (exec != nullptr) { + ASSERT_NO_THROW(exec->synchronize()); + } + } + + template + gko::matrix_data gen_mtx_data(int num_rows, + int num_cols, + DistType dist) + { + return gko::test::generate_random_matrix_data( + num_rows, num_cols, dist, std::normal_distribution<>(0.0, 1.0), + rand_engine); + } + + gko::matrix_data gen_mtx_data(int num_rows, + int num_cols, + int min_cols, + int max_cols) + { + return gen_mtx_data( + num_rows, num_cols, + std::uniform_int_distribution<>(min_cols, max_cols)); + } + + template + gko::matrix_data gen_dense_data(gko::dim<2> size) + { + return { + size, + std::normal_distribution>(0.0, 1.0), + rand_engine}; + } + + template + test_pair gen_in_vec(const test_pair& mtx, int nrhs, + int stride) + { + auto size = gko::dim<2>{mtx.ref->get_size()[1], + static_cast(nrhs)}; + auto result = VecType::create(ref, size, stride); + result->read(gen_dense_data(size)); + return {std::move(result), exec}; + } + + template + test_pair gen_scalar() + { + return {gko::initialize( + {gko::test::detail::get_rand_value< + typename VecType::value_type>( + std::normal_distribution< + gko::remove_complex>( + 0.0, 1.0), + rand_engine)}, + ref), + exec}; + } + + template + test_pair gen_out_vec(const test_pair& mtx, int nrhs, + int stride) + { + auto size = gko::dim<2>{mtx.ref->get_size()[0], + static_cast(nrhs)}; + auto result = VecType::create(ref, size, stride); + result->read(gen_dense_data(size)); + return {std::move(result), exec}; + } + + double tol() { return r::value; } + + double mixed_tol() { return r_mixed(); } + + template + void forall_matrix_data_scenarios(TestFunction fn) + { + auto guarded_fn = [&](auto mtx) { + try { + fn(std::move(mtx)); + } catch (std::exception& e) { + FAIL() << e.what(); + } + }; + { + SCOPED_TRACE("Zero matrix (0x0)"); + guarded_fn(gen_mtx_data(0, 0, 0, 0)); + } + { + SCOPED_TRACE("Zero matrix (0x2)"); + guarded_fn(gen_mtx_data(0, 2, 0, 0)); + } + { + SCOPED_TRACE("Zero matrix (2x0)"); + guarded_fn(gen_mtx_data(2, 0, 0, 0)); + } + { + SCOPED_TRACE("Zero matrix (200x100)"); + guarded_fn(gen_mtx_data(200, 100, 0, 0)); + } + { + SCOPED_TRACE("Sparse Matrix with some zeros rows (200x100)"); + guarded_fn(gen_mtx_data(200, 100, 0, 50)); + } + { + SCOPED_TRACE("Sparse Matrix with fixed row nnz (200x100)"); + guarded_fn(gen_mtx_data(200, 100, 50, 50)); + } + { + SCOPED_TRACE("Sparse Matrix with variable row nnz (200x100)"); + guarded_fn(gen_mtx_data(200, 100, 10, 50)); + } + { + SCOPED_TRACE( + "Sparse Matrix with heavily imbalanced row nnz (200x100)"); + guarded_fn( + gen_mtx_data(200, 100, std::poisson_distribution<>{1.5})); + } + { + SCOPED_TRACE("Dense matrix (200x100)"); + guarded_fn(gen_mtx_data(200, 100, 100, 100)); + } + } + + template + void forall_matrix_scenarios(TestFunction fn) + { + auto guarded_fn = [&](auto mtx) { + try { + T::check_property(mtx.ref); + T::check_property(mtx.dev); + fn(std::move(mtx)); + } catch (std::exception& e) { + FAIL() << e.what(); + } + }; + { + SCOPED_TRACE("Uninitialized matrix (0x0)"); + guarded_fn(test_pair{T::create(ref, gko::dim<2>{}), + T::create(exec, gko::dim<2>{})}); + } + { + SCOPED_TRACE("Uninitialized matrix (0x2)"); + guarded_fn(test_pair{T::create(ref, gko::dim<2>{0, 2}), + T::create(exec, gko::dim<2>{0, 
2})}); + } + { + SCOPED_TRACE("Uninitialized matrix (2x0)"); + guarded_fn(test_pair{T::create(ref, gko::dim<2>{2, 0}), + T::create(exec, gko::dim<2>{2, 0})}); + } + forall_matrix_data_scenarios([&](auto data) { + test_pair pair{T::create(ref, data.size), + T::create(exec, data.size)}; + pair.dev->read(data); + pair.ref->read(data); + guarded_fn(std::move(pair)); + }); + } + + template + void forall_vector_scenarios(const test_pair& mtx, TestFunction fn) + { + auto guarded_fn = [&](auto b, auto x) { + try { + fn(std::move(b), std::move(x)); + } catch (std::exception& e) { + FAIL() << e.what(); + } + }; + { + SCOPED_TRACE("Multivector with 0 columns"); + guarded_fn(gen_in_vec(mtx, 0, 0), + gen_out_vec(mtx, 0, 0)); + } + { + SCOPED_TRACE("Single vector"); + guarded_fn(gen_in_vec(mtx, 1, 1), + gen_out_vec(mtx, 1, 1)); + } + { + SCOPED_TRACE("Single strided vector"); + guarded_fn(gen_in_vec(mtx, 1, 2), + gen_out_vec(mtx, 1, 3)); + } + if (!gko::is_complex()) { + // check application of real matrix to complex vector + // viewed as interleaved real/imag vector + using complex_vec = gko::to_complex; + { + SCOPED_TRACE("Single strided complex vector"); + guarded_fn(gen_in_vec(mtx, 1, 2), + gen_out_vec(mtx, 1, 3)); + } + { + SCOPED_TRACE("Strided complex multivector with 2 columns"); + guarded_fn(gen_in_vec(mtx, 2, 3), + gen_out_vec(mtx, 2, 4)); + } + } + { + SCOPED_TRACE("Multivector with 2 columns"); + guarded_fn(gen_in_vec(mtx, 2, 2), + gen_out_vec(mtx, 2, 2)); + } + { + SCOPED_TRACE("Strided multivector with 2 columns"); + guarded_fn(gen_in_vec(mtx, 2, 3), + gen_out_vec(mtx, 2, 4)); + } + { + SCOPED_TRACE("Multivector with 40 columns"); + guarded_fn(gen_in_vec(mtx, 40, 40), + gen_out_vec(mtx, 40, 40)); + } + { + SCOPED_TRACE("Strided multivector with 40 columns"); + guarded_fn(gen_in_vec(mtx, 40, 43), + gen_out_vec(mtx, 40, 45)); + } + } + + std::shared_ptr ref; + std::shared_ptr exec; + + std::ranlux48 rand_engine; +}; + +using MatrixTypes = ::testing::Types< + DenseWithDefaultStride, DenseWithCustomStride, Coo, CsrWithDefaultStrategy, + // The strategies have issues with zero rows + /* + #if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) || \ + defined(GKO_COMPILING_DPCPP) + CsrWithClassicalStrategy, CsrWithMergePathStrategy, + CsrWithSparselibStrategy, CsrWithLoadBalanceStrategy, + CsrWithAutomaticalStrategy, + #endif + */ + Ell, + // Fbcsr is slightly broken + /*FbcsrBlocksize1, FbcsrBlocksize2,*/ + SellpDefaultParameters, Sellp32Factor2, HybridDefaultStrategy, + HybridColumnLimitStrategy, HybridImbalanceLimitStrategy, + HybridImbalanceBoundedLimitStrategy, HybridMinStorageStrategy, + HybridAutomaticStrategy>; + +TYPED_TEST_SUITE(Matrix, MatrixTypes, TypenameNameGenerator); + + +TYPED_TEST(Matrix, SpMVIsEquivalentToRef) +{ + this->forall_matrix_scenarios([&](auto mtx) { + this->forall_vector_scenarios(mtx, [&](auto b, auto x) { + mtx.ref->apply(b.ref.get(), x.ref.get()); + mtx.dev->apply(b.dev.get(), x.dev.get()); + + GKO_ASSERT_MTX_NEAR(x.ref, x.dev, this->tol()); + }); + }); +} + + +TYPED_TEST(Matrix, AdvancedSpMVIsEquivalentToRef) +{ + this->forall_matrix_scenarios([&](auto mtx) { + this->forall_vector_scenarios(mtx, [&](auto b, auto x) { + auto alpha = this->gen_scalar(); + auto beta = this->gen_scalar(); + + mtx.ref->apply(alpha.ref.get(), b.ref.get(), alpha.ref.get(), + x.ref.get()); + mtx.dev->apply(alpha.dev.get(), b.dev.get(), alpha.dev.get(), + x.dev.get()); + + GKO_ASSERT_MTX_NEAR(x.ref, x.dev, this->tol()); + }); + }); +} + + +#if !(GINKGO_DPCPP_SINGLE_MODE) 
+TYPED_TEST(Matrix, MixedSpMVIsEquivalentToRef) +{ + using MixedVec = typename TestFixture::MixedVec; + this->forall_matrix_scenarios([&](auto mtx) { + this->template forall_vector_scenarios( + mtx, [&](auto b, auto x) { + mtx.ref->apply(b.ref.get(), x.ref.get()); + mtx.dev->apply(b.dev.get(), x.dev.get()); + + GKO_ASSERT_MTX_NEAR(x.ref, x.dev, this->mixed_tol()); + }); + }); +} + + +TYPED_TEST(Matrix, MixedAdvancedSpMVIsEquivalentToRef) +{ + using MixedVec = typename TestFixture::MixedVec; + this->forall_matrix_scenarios([&](auto mtx) { + this->template forall_vector_scenarios( + mtx, [&](auto b, auto x) { + auto alpha = this->template gen_scalar(); + auto beta = this->template gen_scalar(); + + mtx.ref->apply(alpha.ref.get(), b.ref.get(), alpha.ref.get(), + x.ref.get()); + mtx.dev->apply(alpha.dev.get(), b.dev.get(), alpha.dev.get(), + x.dev.get()); + + GKO_ASSERT_MTX_NEAR(x.ref, x.dev, this->mixed_tol()); + }); + }); +} +#endif + + +TYPED_TEST(Matrix, ConvertToCsrIsEquivalentToRef) +{ + using Mtx = typename TestFixture::Mtx; + using Csr = + gko::matrix::Csr; + this->forall_matrix_scenarios([&](auto mtx) { + auto ref_result = Csr::create(this->ref); + auto dev_result = Csr::create(this->exec); + + mtx.ref->convert_to(ref_result.get()); + mtx.dev->convert_to(dev_result.get()); + + GKO_ASSERT_MTX_NEAR(ref_result, dev_result, 0.0); + GKO_ASSERT_MTX_EQ_SPARSITY(ref_result, dev_result); + }); +} + + +TYPED_TEST(Matrix, ConvertFromCsrIsEquivalentToRef) +{ + using TestConfig = typename TestFixture::Config; + using Mtx = typename TestFixture::Mtx; + using Csr = + gko::matrix::Csr; + this->forall_matrix_data_scenarios([&](auto data) { + auto ref_src = Csr::create(this->ref); + auto dev_src = Csr::create(this->exec); + ref_src->read(data); + dev_src->read(data); + auto ref_result = TestConfig::create(this->ref, data.size); + auto dev_result = TestConfig::create(this->exec, data.size); + + ref_src->convert_to(ref_result.get()); + dev_src->convert_to(dev_result.get()); + + GKO_ASSERT_MTX_NEAR(ref_result, dev_result, 0.0); + GKO_ASSERT_MTX_EQ_SPARSITY(ref_result, dev_result); + }); +} + + +TYPED_TEST(Matrix, ConvertToDenseIsEquivalentToRef) +{ + using Mtx = typename TestFixture::Mtx; + using Dense = gko::matrix::Dense; + this->forall_matrix_scenarios([&](auto mtx) { + auto ref_result = Dense::create(this->ref); + auto dev_result = Dense::create(this->exec); + + mtx.ref->convert_to(ref_result.get()); + mtx.dev->convert_to(dev_result.get()); + + GKO_ASSERT_MTX_NEAR(ref_result, dev_result, 0.0); + }); +} + + +TYPED_TEST(Matrix, ConvertFromDenseIsEquivalentToRef) +{ + using TestConfig = typename TestFixture::Config; + using Mtx = typename TestFixture::Mtx; + using Dense = gko::matrix::Dense; + this->forall_matrix_data_scenarios([&](auto data) { + auto ref_src = Dense::create(this->ref); + auto dev_src = Dense::create(this->exec); + ref_src->read(data); + dev_src->read(data); + auto ref_result = TestConfig::create(this->ref, data.size); + auto dev_result = TestConfig::create(this->exec, data.size); + + ref_src->convert_to(ref_result.get()); + dev_src->convert_to(dev_result.get()); + + GKO_ASSERT_MTX_NEAR(ref_result, dev_result, 0.0); + GKO_ASSERT_MTX_EQ_SPARSITY(ref_result, dev_result); + }); +} + + +TYPED_TEST(Matrix, ReadWriteRoundtrip) +{ + using TestConfig = typename TestFixture::Config; + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::index_type; + this->forall_matrix_data_scenarios([&](auto data) { + auto new_mtx = TestConfig::create(this->exec, 
data.size);
+        gko::matrix_data<value_type, index_type> out_data;
+
+        new_mtx->read(data);
+        new_mtx->write(out_data);
+
+        if (!TestConfig::preserves_zeros()) {
+            data.remove_zeros();
+            out_data.remove_zeros();
+        }
+        ASSERT_EQ(data.size, out_data.size);
+        ASSERT_EQ(data.nonzeros, out_data.nonzeros);
+    });
+}
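
Note on the advanced apply used in the tests above: Ginkgo's apply(alpha, b, beta, x) computes x = alpha * A * b + beta * x. In AdvancedSpMVIsEquivalentToRef and MixedAdvancedSpMVIsEquivalentToRef, a `beta` scalar is generated but `alpha` is passed in both scaling positions, so the beta path is never exercised (the reference/device comparison still holds, since both sides issue the same call). Below is a minimal standalone sketch of the convention with a distinct beta; it is not part of the patch, and the executor choice, matrix values, and printing are purely illustrative.

// Standalone illustration (not part of the patch above): advanced apply
// computes x = alpha * A * b + beta * x. All values are made up.
#include <iostream>

#include <ginkgo/ginkgo.hpp>

int main()
{
    auto exec = gko::ReferenceExecutor::create();
    using Dense = gko::matrix::Dense<double>;
    // A = [[1, 2], [3, 4]], b = [1, 1], x = [10, 10]
    auto A = gko::initialize<Dense>({{1.0, 2.0}, {3.0, 4.0}}, exec);
    auto b = gko::initialize<Dense>({1.0, 1.0}, exec);
    auto x = gko::initialize<Dense>({10.0, 10.0}, exec);
    auto alpha = gko::initialize<Dense>({2.0}, exec);
    auto beta = gko::initialize<Dense>({-1.0}, exec);

    // x = 2 * [3, 7] + (-1) * [10, 10] = [-4, 4]
    A->apply(alpha.get(), b.get(), beta.get(), x.get());

    gko::write(std::cout, x.get());  // print the updated x for inspection
}

Applied to the tests above, the presumably intended calls would pass beta.ref.get() / beta.dev.get() as the third apply argument instead of alpha.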