From 42739daba312060105d4557218b405b802d06b1e Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 27 Aug 2021 16:48:29 +0200 Subject: [PATCH 1/6] parallelize CUDA Jacobi kernel compilation --- cuda/CMakeLists.txt | 27 ++++ .../jacobi_advanced_apply_instantiate.inc.cu | 125 +++++++++++++++++ .../jacobi_advanced_apply_kernel.cu | 41 +----- .../jacobi_generate_instantiate.inc.cu | 132 ++++++++++++++++++ cuda/preconditioner/jacobi_generate_kernel.cu | 46 +----- .../jacobi_simple_apply_instantiate.inc.cu | 122 ++++++++++++++++ .../jacobi_simple_apply_kernel.cu | 40 +----- 7 files changed, 409 insertions(+), 124 deletions(-) create mode 100644 cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu create mode 100644 cuda/preconditioner/jacobi_generate_instantiate.inc.cu create mode 100644 cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 6824aee731d..2141be283d4 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -129,6 +129,33 @@ target_sources(ginkgo_cuda foreach(source_file IN LISTS GKO_CUDA_COMMON_SOURCES) set_source_files_properties(${source_file} PROPERTIES LANGUAGE CUDA) endforeach(source_file) +if(GINKGO_JACOBI_FULL_OPTIMIZATIONS) + # keep in sync with jacobi_common.hpp + set(GKO_CUDA_JACOBI_BLOCK_SIZES) + foreach(blocksize RANGE 1 32) + list(APPEND GKO_JACOBI_BLOCK_SIZES ${blocksize}) + endforeach() +else() + # keep in sync with jacobi_common.hpp + set(GKO_CUDA_JACOBI_BLOCK_SIZES 1 2 4 8 13 16 32) +endif() +set(GKO_CUDA_JACOBI_SOURCES) +foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_CUDA_JACOBI_BLOCK_SIZES) + configure_file( + preconditioner/jacobi_generate_instantiate.inc.cu + preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu) + configure_file( + preconditioner/jacobi_simple_apply_instantiate.inc.cu + preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu) + configure_file( + preconditioner/jacobi_advanced_apply_instantiate.inc.cu + preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu) + list(APPEND GKO_CUDA_JACOBI_SOURCES + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu) +endforeach() +target_sources(ginkgo_cuda PRIVATE ${GKO_CUDA_JACOBI_SOURCES}) if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA") # remove false positive CUDA warnings when calling one() and zero() diff --git a/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu b/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu new file mode 100644 index 00000000000..5db4d270ee0 --- /dev/null +++ b/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu @@ -0,0 +1,125 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/preconditioner/jacobi_kernels.hpp" + + +#include + + +#include "core/base/extended_float.hpp" +#include "core/matrix/dense_kernels.hpp" +#include "core/preconditioner/jacobi_utils.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "cuda/base/config.hpp" +#include "cuda/base/math.hpp" +#include "cuda/base/types.hpp" +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/thread_ids.cuh" +#include "cuda/components/warp_blas.cuh" +#include "cuda/preconditioner/jacobi_common.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The Jacobi preconditioner namespace. + * @ref Jacobi + * @ingroup jacobi + */ +namespace jacobi { + + +#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc" + + +// clang-format off +#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ +// clang-format on +// make things easier for IDEs +#ifndef GKO_JACOBI_BLOCK_SIZE +#define GKO_JACOBI_BLOCK_SIZE 1 +#endif + + +template +void advanced_apply( + syn::value_list, size_type num_blocks, + const precision_reduction* block_precisions, + const IndexType* block_pointers, const ValueType* blocks, + const preconditioner::block_interleaved_storage_scheme& + storage_scheme, + const ValueType* alpha, const ValueType* b, size_type b_stride, + ValueType* x, size_type x_stride) +{ + constexpr int subwarp_size = get_larger_power(max_block_size); + constexpr int blocks_per_warp = config::warp_size / subwarp_size; + const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), + 1, 1); + const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); + + if (block_precisions) { + kernel::advanced_adaptive_apply + <<>>( + as_cuda_type(blocks), storage_scheme, block_precisions, + block_pointers, num_blocks, as_cuda_type(alpha), + as_cuda_type(b), b_stride, as_cuda_type(x), x_stride); + } else { + kernel::advanced_apply + <<>>( + as_cuda_type(blocks), storage_scheme, block_pointers, + num_blocks, as_cuda_type(alpha), as_cuda_type(b), b_stride, + as_cuda_type(x), x_stride); + } +} + + +#define DECLARE_JACOBI_ADVANCED_APPLY_INSTANTIATION(ValueType, IndexType) \ + void advanced_apply( \ + syn::value_list, size_type, \ + const precision_reduction*, const IndexType* block_pointers, \ + const ValueType*, \ + const preconditioner::block_interleaved_storage_scheme&, \ + const ValueType*, const ValueType*, size_type, ValueType*, size_type) + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + DECLARE_JACOBI_ADVANCED_APPLY_INSTANTIATION); + + +} // namespace jacobi +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernel.cu b/cuda/preconditioner/jacobi_advanced_apply_kernel.cu index d4c1649d6fd..f86cf6e3ea1 100644 --- a/cuda/preconditioner/jacobi_advanced_apply_kernel.cu +++ b/cuda/preconditioner/jacobi_advanced_apply_kernel.cu @@ -36,16 +36,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" -#include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/warp_blas.cuh" #include "cuda/preconditioner/jacobi_common.hpp" @@ -60,12 +52,6 @@ namespace cuda { namespace jacobi { -#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc" - - -namespace { - - template void advanced_apply( @@ -75,36 +61,11 @@ void advanced_apply( const preconditioner::block_interleaved_storage_scheme& storage_scheme, const ValueType* alpha, const ValueType* b, size_type b_stride, - ValueType* x, size_type x_stride) -{ - constexpr int subwarp_size = get_larger_power(max_block_size); - constexpr int blocks_per_warp = config::warp_size / subwarp_size; - const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), - 1, 1); - const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); - - if (block_precisions) { - kernel::advanced_adaptive_apply - <<>>( - as_cuda_type(blocks), storage_scheme, block_precisions, - block_pointers, num_blocks, as_cuda_type(alpha), - as_cuda_type(b), b_stride, as_cuda_type(x), x_stride); - } else { - kernel::advanced_apply - <<>>( - as_cuda_type(blocks), storage_scheme, block_pointers, - num_blocks, as_cuda_type(alpha), as_cuda_type(b), b_stride, - as_cuda_type(x), x_stride); - } -} + ValueType* x, size_type x_stride); GKO_ENABLE_IMPLEMENTATION_SELECTION(select_advanced_apply, advanced_apply); -} // namespace - - template void apply(std::shared_ptr exec, size_type num_blocks, uint32 max_block_size, diff --git a/cuda/preconditioner/jacobi_generate_instantiate.inc.cu b/cuda/preconditioner/jacobi_generate_instantiate.inc.cu new file mode 100644 index 00000000000..9bdb77c1e6e --- /dev/null +++ b/cuda/preconditioner/jacobi_generate_instantiate.inc.cu @@ -0,0 +1,132 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/preconditioner/jacobi_kernels.hpp" + + +#include +#include + + +#include "core/base/extended_float.hpp" +#include "core/components/fill_array.hpp" +#include "core/preconditioner/jacobi_utils.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "cuda/base/config.hpp" +#include "cuda/base/math.hpp" +#include "cuda/base/types.hpp" +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/diagonal_block_manipulation.cuh" +#include "cuda/components/thread_ids.cuh" +#include "cuda/components/uninitialized_array.hpp" +#include "cuda/components/warp_blas.cuh" +#include "cuda/preconditioner/jacobi_common.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The Jacobi preconditioner namespace. + * @ref Jacobi + * @ingroup jacobi + */ +namespace jacobi { + + +#include "common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc" + + +// clang-format off +#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ +// clang-format on +// make things easier for IDEs +#ifndef GKO_JACOBI_BLOCK_SIZE +#define GKO_JACOBI_BLOCK_SIZE 1 +#endif + + +template +void generate(syn::value_list, + const matrix::Csr* mtx, + remove_complex accuracy, ValueType* block_data, + const preconditioner::block_interleaved_storage_scheme& + storage_scheme, + remove_complex* conditioning, + precision_reduction* block_precisions, + const IndexType* block_ptrs, size_type num_blocks) +{ + constexpr int subwarp_size = get_larger_power(max_block_size); + constexpr int blocks_per_warp = config::warp_size / subwarp_size; + const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), + 1, 1); + const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); + + if (block_precisions) { + kernel::adaptive_generate + <<>>( + mtx->get_size()[0], mtx->get_const_row_ptrs(), + mtx->get_const_col_idxs(), + as_cuda_type(mtx->get_const_values()), as_cuda_type(accuracy), + as_cuda_type(block_data), storage_scheme, + as_cuda_type(conditioning), block_precisions, block_ptrs, + num_blocks); + } else { + kernel::generate + <<>>( + mtx->get_size()[0], mtx->get_const_row_ptrs(), + mtx->get_const_col_idxs(), + as_cuda_type(mtx->get_const_values()), as_cuda_type(block_data), + storage_scheme, block_ptrs, num_blocks); + } +} + + +#define DECLARE_JACOBI_GENERATE_INSTANTIATION(ValueType, IndexType) \ + void generate( \ + syn::value_list, \ + const matrix::Csr*, remove_complex, \ + ValueType*, \ + const preconditioner::block_interleaved_storage_scheme&, \ + remove_complex*, precision_reduction*, const IndexType*, \ + size_type) + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + DECLARE_JACOBI_GENERATE_INSTANTIATION); + + +} // namespace jacobi +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/preconditioner/jacobi_generate_kernel.cu b/cuda/preconditioner/jacobi_generate_kernel.cu index 218026df629..ea62a6f9fb6 100644 --- a/cuda/preconditioner/jacobi_generate_kernel.cu +++ b/cuda/preconditioner/jacobi_generate_kernel.cu @@ -37,18 +37,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include "core/base/extended_float.hpp" #include "core/components/fill_array.hpp" -#include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/diagonal_block_manipulation.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" -#include "cuda/components/warp_blas.cuh" #include "cuda/preconditioner/jacobi_common.hpp" @@ -63,12 +53,6 @@ namespace cuda { namespace jacobi { -#include "common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc" - - -namespace { - - template void generate(syn::value_list, @@ -78,39 +62,11 @@ void generate(syn::value_list, storage_scheme, remove_complex* conditioning, precision_reduction* block_precisions, - const IndexType* block_ptrs, size_type num_blocks) -{ - constexpr int subwarp_size = get_larger_power(max_block_size); - constexpr int blocks_per_warp = config::warp_size / subwarp_size; - const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), - 1, 1); - const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); - - if (block_precisions) { - kernel::adaptive_generate - <<>>( - mtx->get_size()[0], mtx->get_const_row_ptrs(), - mtx->get_const_col_idxs(), - as_cuda_type(mtx->get_const_values()), as_cuda_type(accuracy), - as_cuda_type(block_data), storage_scheme, - as_cuda_type(conditioning), block_precisions, block_ptrs, - num_blocks); - } else { - kernel::generate - <<>>( - mtx->get_size()[0], mtx->get_const_row_ptrs(), - mtx->get_const_col_idxs(), - as_cuda_type(mtx->get_const_values()), as_cuda_type(block_data), - storage_scheme, block_ptrs, num_blocks); - } -} + const IndexType* block_ptrs, size_type num_blocks); GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generate, generate); -} // namespace - - template void generate(std::shared_ptr exec, const matrix::Csr* system_matrix, diff --git a/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu b/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu new file mode 100644 index 00000000000..4c32268983d --- /dev/null +++ b/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu @@ -0,0 +1,122 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/preconditioner/jacobi_kernels.hpp" + + +#include + + +#include "core/base/extended_float.hpp" +#include "core/matrix/dense_kernels.hpp" +#include "core/preconditioner/jacobi_utils.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "cuda/base/config.hpp" +#include "cuda/base/math.hpp" +#include "cuda/base/types.hpp" +#include "cuda/components/cooperative_groups.cuh" +#include "cuda/components/thread_ids.cuh" +#include "cuda/components/warp_blas.cuh" +#include "cuda/preconditioner/jacobi_common.hpp" + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The Jacobi preconditioner namespace. + * @ref Jacobi + * @ingroup jacobi + */ +namespace jacobi { + + +#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc" + + +// clang-format off +#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ +// clang-format on +// make things easier for IDEs +#ifndef GKO_JACOBI_BLOCK_SIZE +#define GKO_JACOBI_BLOCK_SIZE 1 +#endif + + +template +void apply(syn::value_list, size_type num_blocks, + const precision_reduction* block_precisions, + const IndexType* block_pointers, const ValueType* blocks, + const preconditioner::block_interleaved_storage_scheme& + storage_scheme, + const ValueType* b, size_type b_stride, ValueType* x, + size_type x_stride) +{ + constexpr int subwarp_size = get_larger_power(max_block_size); + constexpr int blocks_per_warp = config::warp_size / subwarp_size; + const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), + 1, 1); + const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); + + if (block_precisions) { + kernel::adaptive_apply + <<>>( + as_cuda_type(blocks), storage_scheme, block_precisions, + block_pointers, num_blocks, as_cuda_type(b), b_stride, + as_cuda_type(x), x_stride); + } else { + kernel::apply + <<>>( + as_cuda_type(blocks), storage_scheme, block_pointers, + num_blocks, as_cuda_type(b), b_stride, as_cuda_type(x), + x_stride); + } +} + + +#define DECLARE_JACOBI_SIMPLE_APPLY_INSTANTIATION(ValueType, IndexType) \ + void apply( \ + syn::value_list, size_type, \ + const precision_reduction*, const IndexType*, const ValueType*, \ + const preconditioner::block_interleaved_storage_scheme&, \ + const ValueType*, size_type, ValueType*, size_type) + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + DECLARE_JACOBI_SIMPLE_APPLY_INSTANTIATION); + + +} // namespace jacobi +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/preconditioner/jacobi_simple_apply_kernel.cu b/cuda/preconditioner/jacobi_simple_apply_kernel.cu index 70e33b8caaf..0c57031b357 100644 --- a/cuda/preconditioner/jacobi_simple_apply_kernel.cu +++ b/cuda/preconditioner/jacobi_simple_apply_kernel.cu @@ -36,16 +36,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include "core/base/extended_float.hpp" -#include "core/matrix/dense_kernels.hpp" #include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "cuda/base/config.hpp" -#include "cuda/base/math.hpp" -#include "cuda/base/types.hpp" -#include "cuda/components/cooperative_groups.cuh" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/warp_blas.cuh" #include "cuda/preconditioner/jacobi_common.hpp" @@ -60,12 +52,6 @@ namespace cuda { namespace jacobi { -#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc" - - -namespace { - - template void apply(syn::value_list, size_type num_blocks, @@ -74,35 +60,11 @@ void apply(syn::value_list, size_type num_blocks, const preconditioner::block_interleaved_storage_scheme& storage_scheme, const ValueType* b, size_type b_stride, ValueType* x, - size_type x_stride) -{ - constexpr int subwarp_size = get_larger_power(max_block_size); - constexpr int blocks_per_warp = config::warp_size / subwarp_size; - const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), - 1, 1); - const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); - - if (block_precisions) { - kernel::adaptive_apply - <<>>( - as_cuda_type(blocks), storage_scheme, block_precisions, - block_pointers, num_blocks, as_cuda_type(b), b_stride, - as_cuda_type(x), x_stride); - } else { - kernel::apply - <<>>( - as_cuda_type(blocks), storage_scheme, block_pointers, - num_blocks, as_cuda_type(b), b_stride, as_cuda_type(x), - x_stride); - } -} + size_type x_stride); GKO_ENABLE_IMPLEMENTATION_SELECTION(select_apply, apply); -} // namespace - - template void simple_apply( std::shared_ptr exec, size_type num_blocks, From 697aa1cdcb2e059c4f0f469c101ff04c506387e6 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 27 Aug 2021 17:49:33 +0200 Subject: [PATCH 2/6] parallelize HIP Jacobi kernel compilation --- hip/CMakeLists.txt | 31 ++++ ...obi_advanced_apply_instantiate.inc.hip.cpp | 131 +++++++++++++++++ .../jacobi_advanced_apply_kernel.hip.cpp | 47 +----- .../jacobi_generate_instantiate.inc.hip.cpp | 134 ++++++++++++++++++ .../jacobi_generate_kernel.hip.cpp | 35 +---- ...acobi_simple_apply_instantiate.inc.hip.cpp | 124 ++++++++++++++++ .../jacobi_simple_apply_kernel.hip.cpp | 31 +--- 7 files changed, 423 insertions(+), 110 deletions(-) create mode 100644 hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp create mode 100644 hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp create mode 100644 hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index f476ac29160..58a19ea1532 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -200,6 +200,37 @@ set(GINKGO_HIP_SOURCES ../common/unified/solver/ir_kernels.cpp ) +if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}") + set(GKO_HIP_JACOBI_MAX_BLOCK_SIZE 32) +else() + set(GKO_HIP_JACOBI_MAX_BLOCK_SIZE 64) +endif() +if(GINKGO_JACOBI_FULL_OPTIMIZATIONS) + # keep in sync with jacobi_common.hpp + set(GKO_HIP_JACOBI_BLOCK_SIZES) + foreach(blocksize RANGE 1 ${GKO_HIP_JACOBI_MAX_BLOCK_SIZE}) + list(APPEND GKO_HIP_JACOBI_BLOCK_SIZES ${blocksize}) + endforeach() +else() + # keep in sync with jacobi_common.hpp + set(GKO_HIP_JACOBI_BLOCK_SIZES 1 2 4 8 13 16 32 ${GKO_HIP_JACOBI_MAX_BLOCK_SIZE}) +endif() +foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_HIP_JACOBI_BLOCK_SIZES) + configure_file( + preconditioner/jacobi_generate_instantiate.inc.hip.cpp + preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) + configure_file( + preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp + preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) + configure_file( + preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp + preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) + list(APPEND GINKGO_HIP_SOURCES + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_generate_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp + ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) +endforeach() + set(GINKGO_HIP_NVCC_ARCH "") if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}") if (NOT CMAKE_CUDA_HOST_COMPILER AND NOT GINKGO_CUDA_DEFAULT_HOST_COMPILER) diff --git a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp new file mode 100644 index 00000000000..01899cd70cc --- /dev/null +++ b/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp @@ -0,0 +1,131 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/preconditioner/jacobi_kernels.hpp" + + +#include + + +#include + + +#include "core/base/extended_float.hpp" +#include "core/matrix/dense_kernels.hpp" +#include "core/preconditioner/jacobi_utils.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/components/warp_blas.hip.hpp" +#include "hip/preconditioner/jacobi_common.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The Jacobi preconditioner namespace. + * @ref Jacobi + * @ingroup jacobi + */ +namespace jacobi { + + +#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc" + + +// clang-format off +#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ +// clang-format on +// make things easier for IDEs +#ifndef GKO_JACOBI_BLOCK_SIZE +#define GKO_JACOBI_BLOCK_SIZE 1 +#endif + + +template +void advanced_apply( + syn::value_list, size_type num_blocks, + const precision_reduction* block_precisions, + const IndexType* block_pointers, const ValueType* blocks, + const preconditioner::block_interleaved_storage_scheme& + storage_scheme, + const ValueType* alpha, const ValueType* b, size_type b_stride, + ValueType* x, size_type x_stride) +{ + constexpr int subwarp_size = get_larger_power(max_block_size); + constexpr int blocks_per_warp = config::warp_size / subwarp_size; + const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), + 1, 1); + const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); + + if (block_precisions) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + kernel::advanced_adaptive_apply), + dim3(grid_size), dim3(block_size), 0, 0, as_hip_type(blocks), + storage_scheme, block_precisions, block_pointers, num_blocks, + as_hip_type(alpha), as_hip_type(b), b_stride, as_hip_type(x), + x_stride); + } else { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::advanced_apply), + dim3(grid_size), dim3(block_size), 0, 0, as_hip_type(blocks), + storage_scheme, block_pointers, num_blocks, as_hip_type(alpha), + as_hip_type(b), b_stride, as_hip_type(x), x_stride); + } +} + + +#define DECLARE_JACOBI_ADVANCED_APPLY_INSTANTIATION(ValueType, IndexType) \ + void advanced_apply( \ + syn::value_list, size_type, \ + const precision_reduction*, const IndexType* block_pointers, \ + const ValueType*, \ + const preconditioner::block_interleaved_storage_scheme&, \ + const ValueType*, const ValueType*, size_type, ValueType*, size_type) + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + DECLARE_JACOBI_ADVANCED_APPLY_INSTANTIATION); + + +} // namespace jacobi +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp index a3b210a0fdd..47425ff3cd1 100644 --- a/hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp +++ b/hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp @@ -33,22 +33,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/preconditioner/jacobi_kernels.hpp" -#include - - #include -#include "core/base/extended_float.hpp" #include "core/matrix/dense_kernels.hpp" -#include "core/preconditioner/jacobi_utils.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "hip/base/config.hip.hpp" -#include "hip/base/math.hip.hpp" -#include "hip/base/types.hip.hpp" -#include "hip/components/cooperative_groups.hip.hpp" -#include "hip/components/thread_ids.hip.hpp" -#include "hip/components/warp_blas.hip.hpp" #include "hip/preconditioner/jacobi_common.hip.hpp" @@ -63,12 +52,6 @@ namespace hip { namespace jacobi { -#include "common/cuda_hip/preconditioner/jacobi_advanced_apply_kernel.hpp.inc" - - -namespace { - - template void advanced_apply( @@ -78,39 +61,11 @@ void advanced_apply( const preconditioner::block_interleaved_storage_scheme& storage_scheme, const ValueType* alpha, const ValueType* b, size_type b_stride, - ValueType* x, size_type x_stride) -{ - constexpr int subwarp_size = get_larger_power(max_block_size); - constexpr int blocks_per_warp = config::warp_size / subwarp_size; - const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), - 1, 1); - const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); - - if (block_precisions) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel::advanced_adaptive_apply), - dim3(grid_size), dim3(block_size), 0, 0, as_hip_type(blocks), - storage_scheme, block_precisions, block_pointers, num_blocks, - as_hip_type(alpha), as_hip_type(b), b_stride, as_hip_type(x), - x_stride); - } else { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::advanced_apply), - dim3(grid_size), dim3(block_size), 0, 0, as_hip_type(blocks), - storage_scheme, block_pointers, num_blocks, as_hip_type(alpha), - as_hip_type(b), b_stride, as_hip_type(x), x_stride); - } -} + ValueType* x, size_type x_stride); GKO_ENABLE_IMPLEMENTATION_SELECTION(select_advanced_apply, advanced_apply); -} // namespace - - template void apply(std::shared_ptr exec, size_type num_blocks, uint32 max_block_size, diff --git a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp new file mode 100644 index 00000000000..6b55422ab6e --- /dev/null +++ b/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp @@ -0,0 +1,134 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/preconditioner/jacobi_kernels.hpp" + + +#include +#include + + +#include "core/base/extended_float.hpp" +#include "core/components/fill_array.hpp" +#include "core/preconditioner/jacobi_utils.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/diagonal_block_manipulation.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/components/uninitialized_array.hip.hpp" +#include "hip/components/warp_blas.hip.hpp" +#include "hip/preconditioner/jacobi_common.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The Jacobi preconditioner namespace. + * @ref Jacobi + * @ingroup jacobi + */ +namespace jacobi { + + +#include "common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc" + + +// clang-format off +#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ +// clang-format on +// make things easier for IDEs +#ifndef GKO_JACOBI_BLOCK_SIZE +#define GKO_JACOBI_BLOCK_SIZE 1 +#endif + + +template +void generate(syn::value_list, + const matrix::Csr* mtx, + remove_complex accuracy, ValueType* block_data, + const preconditioner::block_interleaved_storage_scheme& + storage_scheme, + remove_complex* conditioning, + precision_reduction* block_precisions, + const IndexType* block_ptrs, size_type num_blocks) +{ + constexpr int subwarp_size = get_larger_power(max_block_size); + constexpr int blocks_per_warp = config::warp_size / subwarp_size; + const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), + 1, 1); + const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); + + if (block_precisions) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + kernel::adaptive_generate), + dim3(grid_size), dim3(block_size), 0, 0, mtx->get_size()[0], + mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + as_hip_type(mtx->get_const_values()), as_hip_type(accuracy), + as_hip_type(block_data), storage_scheme, as_hip_type(conditioning), + block_precisions, block_ptrs, num_blocks); + } else { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::generate), + dim3(grid_size), dim3(block_size), 0, 0, mtx->get_size()[0], + mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), + as_hip_type(mtx->get_const_values()), as_hip_type(block_data), + storage_scheme, block_ptrs, num_blocks); + } +} + + +#define DECLARE_JACOBI_GENERATE_INSTANTIATION(ValueType, IndexType) \ + void generate( \ + syn::value_list, \ + const matrix::Csr*, remove_complex, \ + ValueType*, \ + const preconditioner::block_interleaved_storage_scheme&, \ + remove_complex*, precision_reduction*, const IndexType*, \ + size_type) + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + DECLARE_JACOBI_GENERATE_INSTANTIATION); + + +} // namespace jacobi +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/preconditioner/jacobi_generate_kernel.hip.cpp b/hip/preconditioner/jacobi_generate_kernel.hip.cpp index 4b398b4f75d..829c8b1dbe2 100644 --- a/hip/preconditioner/jacobi_generate_kernel.hip.cpp +++ b/hip/preconditioner/jacobi_generate_kernel.hip.cpp @@ -69,9 +69,6 @@ namespace jacobi { #include "common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc" -namespace { - - template void generate(syn::value_list, @@ -81,41 +78,11 @@ void generate(syn::value_list, storage_scheme, remove_complex* conditioning, precision_reduction* block_precisions, - const IndexType* block_ptrs, size_type num_blocks) -{ - constexpr int subwarp_size = get_larger_power(max_block_size); - constexpr int blocks_per_warp = config::warp_size / subwarp_size; - const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), - 1, 1); - const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); - - if (block_precisions) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel::adaptive_generate), - dim3(grid_size), dim3(block_size), 0, 0, mtx->get_size()[0], - mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), - as_hip_type(mtx->get_const_values()), as_hip_type(accuracy), - as_hip_type(block_data), storage_scheme, as_hip_type(conditioning), - block_precisions, block_ptrs, num_blocks); - } else { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::generate), - dim3(grid_size), dim3(block_size), 0, 0, mtx->get_size()[0], - mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), - as_hip_type(mtx->get_const_values()), as_hip_type(block_data), - storage_scheme, block_ptrs, num_blocks); - } -} + const IndexType* block_ptrs, size_type num_blocks); GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generate, generate); -} // namespace - - template void generate(std::shared_ptr exec, const matrix::Csr* system_matrix, diff --git a/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp new file mode 100644 index 00000000000..3b33cc21bd3 --- /dev/null +++ b/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp @@ -0,0 +1,124 @@ +/************************************************************* +Copyright (c) 2017-2021, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/preconditioner/jacobi_kernels.hpp" + + +#include + + +#include "core/base/extended_float.hpp" +#include "core/matrix/dense_kernels.hpp" +#include "core/preconditioner/jacobi_utils.hpp" +#include "core/synthesizer/implementation_selection.hpp" +#include "hip/base/config.hip.hpp" +#include "hip/base/math.hip.hpp" +#include "hip/base/types.hip.hpp" +#include "hip/components/cooperative_groups.hip.hpp" +#include "hip/components/thread_ids.hip.hpp" +#include "hip/components/warp_blas.hip.hpp" +#include "hip/preconditioner/jacobi_common.hip.hpp" + + +namespace gko { +namespace kernels { +namespace hip { +/** + * @brief The Jacobi preconditioner namespace. + * @ref Jacobi + * @ingroup jacobi + */ +namespace jacobi { + + +#include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc" + + +// clang-format off +#cmakedefine GKO_JACOBI_BLOCK_SIZE @GKO_JACOBI_BLOCK_SIZE@ +// clang-format on +// make things easier for IDEs +#ifndef GKO_JACOBI_BLOCK_SIZE +#define GKO_JACOBI_BLOCK_SIZE 1 +#endif + + +template +void apply(syn::value_list, size_type num_blocks, + const precision_reduction* block_precisions, + const IndexType* block_pointers, const ValueType* blocks, + const preconditioner::block_interleaved_storage_scheme& + storage_scheme, + const ValueType* b, size_type b_stride, ValueType* x, + size_type x_stride) +{ + constexpr int subwarp_size = get_larger_power(max_block_size); + constexpr int blocks_per_warp = config::warp_size / subwarp_size; + const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), + 1, 1); + const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); + + if (block_precisions) { + hipLaunchKernelGGL( + HIP_KERNEL_NAME(kernel::adaptive_apply), + dim3(grid_size), dim3(block_size), 0, 0, as_hip_type(blocks), + storage_scheme, block_precisions, block_pointers, num_blocks, + as_hip_type(b), b_stride, as_hip_type(x), x_stride); + } else { + hipLaunchKernelGGL( + HIP_KERNEL_NAME( + kernel::apply), + dim3(grid_size), dim3(block_size), 0, 0, as_hip_type(blocks), + storage_scheme, block_pointers, num_blocks, as_hip_type(b), + b_stride, as_hip_type(x), x_stride); + } +} + + +#define DECLARE_JACOBI_SIMPLE_APPLY_INSTANTIATION(ValueType, IndexType) \ + void apply( \ + syn::value_list, size_type, \ + const precision_reduction*, const IndexType*, const ValueType*, \ + const preconditioner::block_interleaved_storage_scheme&, \ + const ValueType*, size_type, ValueType*, size_type) + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + DECLARE_JACOBI_SIMPLE_APPLY_INSTANTIATION); + + +} // namespace jacobi +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp index c871f94043d..57f7af49fe6 100644 --- a/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp +++ b/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp @@ -66,9 +66,6 @@ namespace jacobi { #include "common/cuda_hip/preconditioner/jacobi_simple_apply_kernel.hpp.inc" -namespace { - - template void apply(syn::value_list, size_type num_blocks, @@ -77,37 +74,11 @@ void apply(syn::value_list, size_type num_blocks, const preconditioner::block_interleaved_storage_scheme& storage_scheme, const ValueType* b, size_type b_stride, ValueType* x, - size_type x_stride) -{ - constexpr int subwarp_size = get_larger_power(max_block_size); - constexpr int blocks_per_warp = config::warp_size / subwarp_size; - const dim3 grid_size(ceildiv(num_blocks, warps_per_block * blocks_per_warp), - 1, 1); - const dim3 block_size(subwarp_size, blocks_per_warp, warps_per_block); - - if (block_precisions) { - hipLaunchKernelGGL( - HIP_KERNEL_NAME(kernel::adaptive_apply), - dim3(grid_size), dim3(block_size), 0, 0, as_hip_type(blocks), - storage_scheme, block_precisions, block_pointers, num_blocks, - as_hip_type(b), b_stride, as_hip_type(x), x_stride); - } else { - hipLaunchKernelGGL( - HIP_KERNEL_NAME( - kernel::apply), - dim3(grid_size), dim3(block_size), 0, 0, as_hip_type(blocks), - storage_scheme, block_pointers, num_blocks, as_hip_type(b), - b_stride, as_hip_type(x), x_stride); - } -} + size_type x_stride); GKO_ENABLE_IMPLEMENTATION_SELECTION(select_apply, apply); -} // namespace - - template void simple_apply( std::shared_ptr exec, size_type num_blocks, From 788770892dba4728900ab5bbab76d0454d19a981 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 27 Aug 2021 18:58:04 +0200 Subject: [PATCH 3/6] fix hip-nvcc CMake source file generation --- hip/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 58a19ea1532..b20c8b25e5a 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -214,6 +214,7 @@ if(GINKGO_JACOBI_FULL_OPTIMIZATIONS) else() # keep in sync with jacobi_common.hpp set(GKO_HIP_JACOBI_BLOCK_SIZES 1 2 4 8 13 16 32 ${GKO_HIP_JACOBI_MAX_BLOCK_SIZE}) + list(REMOVE_DUPLICATES GKO_HIP_JACOBI_BLOCK_SIZES) endif() foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_HIP_JACOBI_BLOCK_SIZES) configure_file( From 67163d591e9be87a508f2f9fa9cecf075ef8934a Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 2 Sep 2021 14:24:48 +0200 Subject: [PATCH 4/6] generate jacobi_common header --- cuda/CMakeLists.txt | 7 +++++-- .../{jacobi_common.hpp => jacobi_common.hpp.in} | 14 +++++++++----- hip/CMakeLists.txt | 5 +++-- ...obi_common.hip.hpp => jacobi_common.hip.hpp.in} | 14 +++++++++----- 4 files changed, 26 insertions(+), 14 deletions(-) rename cuda/preconditioner/{jacobi_common.hpp => jacobi_common.hpp.in} (87%) rename hip/preconditioner/{jacobi_common.hip.hpp => jacobi_common.hip.hpp.in} (87%) diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 2141be283d4..64f46484904 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -130,13 +130,11 @@ foreach(source_file IN LISTS GKO_CUDA_COMMON_SOURCES) set_source_files_properties(${source_file} PROPERTIES LANGUAGE CUDA) endforeach(source_file) if(GINKGO_JACOBI_FULL_OPTIMIZATIONS) - # keep in sync with jacobi_common.hpp set(GKO_CUDA_JACOBI_BLOCK_SIZES) foreach(blocksize RANGE 1 32) list(APPEND GKO_JACOBI_BLOCK_SIZES ${blocksize}) endforeach() else() - # keep in sync with jacobi_common.hpp set(GKO_CUDA_JACOBI_BLOCK_SIZES 1 2 4 8 13 16 32) endif() set(GKO_CUDA_JACOBI_SOURCES) @@ -156,6 +154,8 @@ foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_CUDA_JACOBI_BLOCK_SIZES) ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.cu) endforeach() target_sources(ginkgo_cuda PRIVATE ${GKO_CUDA_JACOBI_SOURCES}) +string(REPLACE ";" "," GKO_CUDA_JACOBI_BLOCK_SIZES_CODE "${GKO_CUDA_JACOBI_BLOCK_SIZES}") +configure_file(preconditioner/jacobi_common.hpp.in preconditioner/jacobi_common.hpp) if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA") # remove false positive CUDA warnings when calling one() and zero() @@ -204,6 +204,9 @@ ginkgo_compile_features(ginkgo_cuda) target_compile_definitions(ginkgo_cuda PRIVATE GKO_COMPILING_CUDA) target_include_directories(ginkgo_cuda SYSTEM PRIVATE ${CUDA_INCLUDE_DIRS}) +# include path for generated headers like jacobi_common.hpp +target_include_directories(ginkgo_cuda + PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/..) target_link_libraries(ginkgo_cuda PRIVATE ${CUDA_RUNTIME_LIBS} ${CUBLAS} ${CUSPARSE} ${CURAND}) target_link_libraries(ginkgo_cuda PUBLIC ginkgo_device) target_compile_options(ginkgo_cuda diff --git a/cuda/preconditioner/jacobi_common.hpp b/cuda/preconditioner/jacobi_common.hpp.in similarity index 87% rename from cuda/preconditioner/jacobi_common.hpp rename to cuda/preconditioner/jacobi_common.hpp.in index 21920f4fe63..bfa596828ab 100644 --- a/cuda/preconditioner/jacobi_common.hpp +++ b/cuda/preconditioner/jacobi_common.hpp.in @@ -46,14 +46,18 @@ namespace jacobi { * A compile-time list of block sizes for which dedicated generate and apply * kernels should be compiled. */ -#ifdef GINKGO_JACOBI_FULL_OPTIMIZATIONS -using compiled_kernels = syn::as_list>; -#else -using compiled_kernels = - syn::value_list; +// clang-format off +#cmakedefine GKO_CUDA_JACOBI_BLOCK_SIZES_CODE @GKO_CUDA_JACOBI_BLOCK_SIZES_CODE@ +// clang-format on +// make things easier for IDEs +#ifndef GKO_CUDA_JACOBI_BLOCK_SIZES_CODE +#define GKO_CUDA_JACOBI_BLOCK_SIZES_CODE 1 #endif +using compiled_kernels = syn::value_list; + + constexpr int get_larger_power(int value, int guess = 1) { return guess >= value ? guess : get_larger_power(value, guess << 1); diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index b20c8b25e5a..6adce32388c 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -206,13 +206,11 @@ else() set(GKO_HIP_JACOBI_MAX_BLOCK_SIZE 64) endif() if(GINKGO_JACOBI_FULL_OPTIMIZATIONS) - # keep in sync with jacobi_common.hpp set(GKO_HIP_JACOBI_BLOCK_SIZES) foreach(blocksize RANGE 1 ${GKO_HIP_JACOBI_MAX_BLOCK_SIZE}) list(APPEND GKO_HIP_JACOBI_BLOCK_SIZES ${blocksize}) endforeach() else() - # keep in sync with jacobi_common.hpp set(GKO_HIP_JACOBI_BLOCK_SIZES 1 2 4 8 13 16 32 ${GKO_HIP_JACOBI_MAX_BLOCK_SIZE}) list(REMOVE_DUPLICATES GKO_HIP_JACOBI_BLOCK_SIZES) endif() @@ -231,6 +229,8 @@ foreach(GKO_JACOBI_BLOCK_SIZE IN LISTS GKO_HIP_JACOBI_BLOCK_SIZES) ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_simple_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp ${CMAKE_CURRENT_BINARY_DIR}/preconditioner/jacobi_advanced_apply_instantiate.${GKO_JACOBI_BLOCK_SIZE}.hip.cpp) endforeach() +string(REPLACE ";" "," GKO_HIP_JACOBI_BLOCK_SIZES_CODE "${GKO_HIP_JACOBI_BLOCK_SIZES}") +configure_file(preconditioner/jacobi_common.hip.hpp.in preconditioner/jacobi_common.hip.hpp) set(GINKGO_HIP_NVCC_ARCH "") if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_NVIDIA_REGEX}") @@ -286,6 +286,7 @@ target_include_directories(ginkgo_hip PUBLIC ${HIP_INCLUDE_DIRS} PRIVATE + ${CMAKE_CURRENT_BINARY_DIR}/.. # for generated headers like jacobi_common.hip.hpp ${GINKGO_HIP_THRUST_PATH} ${HIPBLAS_INCLUDE_DIRS} ${hiprand_INCLUDE_DIRS} diff --git a/hip/preconditioner/jacobi_common.hip.hpp b/hip/preconditioner/jacobi_common.hip.hpp.in similarity index 87% rename from hip/preconditioner/jacobi_common.hip.hpp rename to hip/preconditioner/jacobi_common.hip.hpp.in index 2c94e7ce3b4..29da74bbbc8 100644 --- a/hip/preconditioner/jacobi_common.hip.hpp +++ b/hip/preconditioner/jacobi_common.hip.hpp.in @@ -47,14 +47,18 @@ namespace jacobi { * A compile-time list of block sizes for which dedicated generate and apply * kernels should be compiled. */ -#ifdef GINKGO_JACOBI_FULL_OPTIMIZATIONS -using compiled_kernels = syn::as_list>; -#else -using compiled_kernels = - syn::value_list; +// clang-format off +#cmakedefine GKO_HIP_JACOBI_BLOCK_SIZES_CODE @GKO_HIP_JACOBI_BLOCK_SIZES_CODE@ +// clang-format on +// make things easier for IDEs +#ifndef GKO_HIP_JACOBI_BLOCK_SIZES_CODE +#define GKO_HIP_JACOBI_BLOCK_SIZES_CODE 1 #endif +using compiled_kernels = syn::value_list; + + constexpr int get_larger_power(int value, int guess = 1) { return guess >= value ? guess : get_larger_power(value, guess << 1); From 17dd3b549b01e7aa42d6ce6629c70be5e4763936 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 3 Sep 2021 13:28:24 +0200 Subject: [PATCH 5/6] fix cuda build with full Jacobi optimization Co-authored-by: Yuhsiang M. Tsai <19565938+yhmtsai@users.noreply.github.com> --- cuda/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 64f46484904..c592514bf5c 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -132,7 +132,7 @@ endforeach(source_file) if(GINKGO_JACOBI_FULL_OPTIMIZATIONS) set(GKO_CUDA_JACOBI_BLOCK_SIZES) foreach(blocksize RANGE 1 32) - list(APPEND GKO_JACOBI_BLOCK_SIZES ${blocksize}) + list(APPEND GKO_CUDA_JACOBI_BLOCK_SIZES ${blocksize}) endforeach() else() set(GKO_CUDA_JACOBI_BLOCK_SIZES 1 2 4 8 13 16 32) From 1a065ebc33cccee8bf9315da5cf607e10b87a150 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 3 Sep 2021 22:59:05 +0200 Subject: [PATCH 6/6] update formatting script to handle jacobi header --- dev_tools/scripts/format_header.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dev_tools/scripts/format_header.sh b/dev_tools/scripts/format_header.sh index 7a24f50bef6..36c1cab027e 100755 --- a/dev_tools/scripts/format_header.sh +++ b/dev_tools/scripts/format_header.sh @@ -4,6 +4,7 @@ CLANG_FORMAT=${CLANG_FORMAT:="clang-format"} convert_header () { local regex="^(#include )(<|\")(.*)(\"|>)$" + local jacobi_regex="^(cuda|hip)\/preconditioner\/jacobi_common(\.hip)?\.hpp" if [[ $@ =~ ${regex} ]]; then header_file="${BASH_REMATCH[3]}" if [ -f "${header_file}" ]; then @@ -14,6 +15,8 @@ convert_header () { fi elif [ "${header_file}" = "matrices/config.hpp" ]; then echo "#include \"${header_file}\"" + elif [[ "${header_file}" =~ ${jacobi_regex} ]]; then + echo "#include \"${header_file}\"" else echo "#include <${header_file}>" fi