diff --git a/cmake/std/PullRequestLinuxCommonTestingSettings.cmake b/cmake/std/PullRequestLinuxCommonTestingSettings.cmake index 877e47b7a868..ce31514945d6 100644 --- a/cmake/std/PullRequestLinuxCommonTestingSettings.cmake +++ b/cmake/std/PullRequestLinuxCommonTestingSettings.cmake @@ -20,8 +20,6 @@ set (Trilinos_ENABLE_TESTS ON CACHE BOOL "Set by default for PR testing") set (Trilinos_TEST_CATEGORIES BASIC CACHE STRING "Set by default for PR testing") set (Trilinos_ENABLE_CONFIGURE_TIMING ON CACHE BOOL "Set by default for PR testing") set (Trilinos_ENABLE_ALL_FORWARD_DEP_PACKAGES ON CACHE BOOL "Set by default for PR testing") -set (Trilinos_CTEST_USE_NEW_AAO_FEATURES ON CACHE BOOL "Set by default for PR testing") - # Options from cmake/std/MpiReleaseDebugSharedPtSettings.cmake diff --git a/cmake/std/PullRequestLinuxCuda9.2TestingSettings.cmake b/cmake/std/PullRequestLinuxCuda9.2TestingSettings.cmake index aa340b1bc06e..970c38ba05b0 100644 --- a/cmake/std/PullRequestLinuxCuda9.2TestingSettings.cmake +++ b/cmake/std/PullRequestLinuxCuda9.2TestingSettings.cmake @@ -13,34 +13,66 @@ set (CTEST_USE_LAUNCHERS ON CACHE BOOL "Set by default for PR testing") # Options necessary for CUDA build set (TPL_ENABLE_MPI ON CACHE BOOL "Set by default for CUDA PR testing") -set (TPL_ENABLE_CUDA ON CACHE BOOL "Set by default for CUDA PR testing") set (Kokkos_ENABLE_Cuda ON CACHE BOOL "Set by default for CUDA PR testing") set (Kokkos_ENABLE_Cuda_UVM ON CACHE BOOL "Set by default for CUDA PR testing") -set (KOKKOS_ARCH Power8 CACHE STRING "Set by default for CUDA PR testing") +set (KOKKOS_ARCH Power8,Kepler37 CACHE STRING "Set by default for CUDA PR testing") +set (Kokkos_ENABLE_Cuda_Relocatable_Device_Code OFF CACHE BOOL "Set by default for CUDA PR testing") +set (Sacado_ENABLE_HIERARCHICAL_DFAD ON CACHE BOOL "Set by default for CUDA PR testing") +set (Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA ON CACHE BOOL "Set by default for CUDA PR testing") +set (Kokkos_ENABLE_Cuda_Lambda ON CACHE BOOL "Set by default for CUDA PR testing") +set (Phalanx_KOKKOS_DEVICE_TYPE CUDA CACHE STRING "Set by default for CUDA PR testing") +set (MPI_EXEC_POST_NUMPROCS_FLAGS "-map-by;socket:PE=4" CACHE STRING "Set by default for CUDA PR testing") # Options set to match the ATDM build set (Trilinos_ENABLE_DEBUG OFF CACHE BOOL "Set by default for CUDA PR testing") set (Trilinos_ENABLE_DEBUG_SYMBOLS OFF CACHE BOOL "Set by default for CUDA PR testing") - -# TPL settings specific to CUDA build -set (TPL_BLAS_LIBRARIES "-L${BLAS_ROOT}/lib -lblas -lgfortran -lgomp -lm" CACHE STRING "Set by default for CUDA PR testing") -set (TPL_LAPACK_LIBRARIES "-L${LAPACK_ROOT}/lib -llapack -lgfortran -lgomp" CACHE STRING "Set by default for CUDA PR testing") set (BUILD_SHARED_LIBS OFF CACHE BOOL "Set by default for CUDA PR testing") set (Tpetra_INST_SERIAL ON CACHE BOOL "Set by default for CUDA PR testing") set (Trilinos_ENABLE_SECONDARY_TESTED_CODE OFF CACHE BOOL "Set by default for CUDA PR testing") -set (TPL_ENABLE_Scotch OFF CACHE BOOL "Set by default for CUDA PR testing") -# Parmetis is available on ride and could be enabled for the CUDA PR build +set (EpetraExt_ENABLE_HDF5 OFF CACHE BOOL "Set by default for CUDA PR testing") +set (Panzer_ENABLE_FADTYPE "Sacado::Fad::DFad" CACHE STRING "Set by default for CUDA PR testing") +set (Kokkos_ENABLE_Debug_Bounds_Check ON CACHE BOOL "Set by default for CUDA PR testing") +set (KOKKOS_ENABLE_DEBUG ON CACHE BOOL "Set by default for CUDA PR testing") + +# TPL settings specific to CUDA build +set (TPL_ENABLE_CUDA ON CACHE BOOL "Set by default for CUDA PR testing") +set (TPL_ENABLE_CUSPARSE ON CACHE BOOL "Set by default for CUDA PR testing") +set (TPL_ENABLE_Pthread OFF CACHE BOOL "Set by default for CUDA PR testing") +set (TPL_ENABLE_BinUtils OFF CACHE BOOL "Set by default for CUDA PR testing") +set (TPL_ENABLE_BLAS ON CACHE BOOL "Set by default for CUDA PR testing") +set (TPL_BLAS_LIBRARIES "-L$ENV{BLAS_ROOT}/lib;-lblas;-lgfortran;-lgomp;-lm" CACHE STRING "Set by default for CUDA PR testing") +set (TPL_ENABLE_Boost ON CACHE BOOL "Set by default for CUDA PR testing") +set (Boost_INCLUDE_DIRS "$ENV{BOOST_ROOT}/include" CACHE FILEPATH "Set by default for CUDA PR testing") +set (TPL_Boost_LIBRARIES "$ENV{BOOST_ROOT}/lib/libboost_program_options.a;$ENV{BOOST_ROOT}/lib/libboost_system.a" CACHE FILEPATH "Set by default for CUDA PR testing") +set (TPL_ENABLE_BoostLib ON CACHE BOOL "Set by default for CUDA PR testing") +set (BoostLib_INCLUDE_DIRS "$ENV{BOOST_ROOT}/include" CACHE FILEPATH "Set by default for CUDA PR testing") +set (TPL_BoostLib_LIBRARIES "$ENV{BOOST_ROOT}/lib/libboost_program_options.a;$ENV{BOOST_ROOT}/lib/libboost_system.a" CACHE FILEPATH "Set by default for CUDA PR testing") +set (TPL_ENABLE_METIS OFF CACHE BOOL "Set by default for CUDA PR testing") set (TPL_ENABLE_ParMETIS OFF CACHE BOOL "Set by default for CUDA PR testing") -set (TPL_Netcdf_LIBRARIES "-L${BOOST_ROOT}/lib;-L${NETCDF_ROOT}/lib;-L${NETCDF_ROOT}/lib;-L${PNETCDF_ROOT}/lib;-L${HDF5_ROOT}/lib;${BOOST_ROOT}/lib/libboost_program_options.a;${BOOST_ROOT}/lib/libboost_system.a;${NETCDF_ROOT}/lib/libnetcdf.a;${PNETCDF_ROOT}/lib/libpnetcdf.a;${HDF5_ROOT}/lib/libhdf5_hl.a;${HDF5_ROOT}/lib/libhdf5.a;-lz;-ldl" CACHE STRING "Set by default for CUDA PR testing") -# SuperLU is available on ride and could be enabled for the CUDA PR build +set (TPL_ENABLE_Scotch OFF CACHE BOOL "Set by default for CUDA PR testing") +set (TPL_ENABLE_HWLOC OFF CACHE BOOL "Set by default for CUDA PR testing") +set (TPL_ENABLE_LAPACK ON CACHE BOOL "Set by default for CUDA PR testing") +set (TPL_LAPACK_LIBRARIES "-L$ENV{LAPACK_ROOT}/lib;-llapack;-lgfortran;-lgomp" CACHE STRING "Set by default for CUDA PR testing") +set (TPL_ENABLE_Zlib OFF CACHE BOOL "Set by default for CUDA PR testing") +set (TPL_ENABLE_CGNS OFF CACHE BOOL "Set by default for CUDA PR testing") +set (TPL_ENABLE_HDF5 ON CACHE BOOL "Set by default for CUDA PR testing") +set (HDF5_INCLUDE_DIRS "$ENV{HDF5_ROOT}/include" CACHE FILEPATH "Set by default for CUDA PR testing") +set (TPL_HDF5_LIBRARIES "-L$ENV{HDF5_ROOT}/lib;$ENV{HDF5_ROOT}/lib/libhdf5_hl.a;$ENV{HDF5_ROOT}/lib/libhdf5.a;-lz;-ldl" CACHE FILEPATH "Set by default for CUDA PR testing") +set (TPL_ENABLE_Netcdf ON CACHE BOOL "Set by default for CUDA PR testing") +set (Netcdf_INCLUDE_DIRS "$ENV{NETCDF_ROOT}/include" CACHE FILEPATH "Set by default for CUDA PR testing") +set (TPL_Netcdf_LIBRARIES "-L$ENV{BOOST_ROOT}/lib;-L$ENV{NETCDF_ROOT}/lib;-L$ENV{NETCDF_ROOT}/lib;-L$ENV{PNETCDF_ROOT}/lib;-L$ENV{HDF5_ROOT}/lib;$ENV{BOOST_ROOT}/lib/libboost_program_options.a;$ENV{BOOST_ROOT}/lib/libboost_system.a;$ENV{NETCDF_ROOT}/lib/libnetcdf.a;$ENV{PNETCDF_ROOT}/lib/libpnetcdf.a;$ENV{HDF5_ROOT}/lib/libhdf5_hl.a;$ENV{HDF5_ROOT}/lib/libhdf5.a;-lz;-ldl" CACHE STRING "Set by default for CUDA PR testing") +# SuperLU and SuperLUDist is available on ride and could be enabled for the CUDA PR build set (TPL_ENABLE_SuperLU OFF CACHE BOOL "Set by default for CUDA PR testing") +set (TPL_ENABLE_SuperLUDist OFF CACHE BOOL "Set by default for CUDA PR testing") set (TPL_ENABLE_BoostLib OFF CACHE BOOL "Set by default for CUDA PR testing") -set (Trilinos_ENABLE_Moertel OFF CACHE BOOL "Disable for CUDA PR testing") -set (Trilinos_ENABLE_Komplex OFF CACHE BOOL "Disable for CUDA PR testing") +set (TPL_ENABLE_Matio OFF CACHE BOOL "Set by default for CUDA PR testing") +set (TPL_DLlib_LIBRARIES "-ldl" CACHE FILEPATH "Set by default for CUDA PR testing") # Temporary options to clean up build -set (Trilinos_ENABLE_SEACAS OFF CACHE BOOL "Temporary disable for CUDA PR testing") -set (STK_ENABLE_TESTS OFF CACHE BOOL "Temporary disable for CUDA PR testing") +set (Teko_ModALPreconditioner_MPI_1_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") +set (MueLu_ParameterListInterpreterTpetra_MPI_1_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") +set (MueLu_ParameterListInterpreterTpetraHeavy_MPI_1_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") +set (STKUnit_tests_stk_ngp_test_utest_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") set (ROL_example_PDE-OPT_0ld_adv-diff-react_example_01_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") set (ROL_example_PDE-OPT_0ld_adv-diff-react_example_02_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") set (ROL_example_PDE-OPT_0ld_poisson_example_01_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") @@ -53,15 +85,20 @@ set (ROL_example_PDE-OPT_obstacle_example_01_MPI_4_DISABLE ON CACHE BOOL "Tempor set (ROL_example_PDE-OPT_topo-opt_poisson_example_01_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") set (ROL_test_elementwise_TpetraMultiVector_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") set (ROL_NonlinearProblemTest_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") -set (TpetraCore_Core_initialize_where_tpetra_initializes_kokkos_MPI_1_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") -set (TpetraCore_Core_initialize_where_tpetra_initializes_mpi_and_user_initializes_kokkos_MPI_2_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") -set (TpetraCore_Core_initialize_where_user_initializes_kokkos_MPI_1_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") -set (TpetraCore_Core_initialize_where_user_initializes_mpi_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") -set (TpetraCore_Core_ScopeGuard_where_tpetra_initializes_kokkos_MPI_1_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") -set (TpetraCore_Core_ScopeGuard_where_tpetra_initializes_mpi_and_user_initializes_kokkos_MPI_2_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") -set (TpetraCore_Core_ScopeGuard_where_user_initializes_kokkos_MPI_1_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") -set (TpetraCore_Core_ScopeGuard_where_user_initializes_mpi_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") +set (PanzerAdaptersSTK_CurlLaplacianExample-ConvTest-Quad-Order-4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") +set (PanzerAdaptersSTK_MixedPoissonExample-ConvTest-Hex-Order-3_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") set (TrilinosCouplings_Example_Maxwell_MueLu_MPI_1_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") set (TrilinosCouplings_Example_Maxwell_MueLu_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") +# Disable some tests that should not need to be disabled but do because the +# Trilinos PR tester is using too high a parallel level for ctest. (If the +# ctest parallel test level is dropped from 29 to 8, all of these will pass.) +set (MueLu_UnitTestsIntrepid2Tpetra_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") +set (PanzerAdaptersSTK_main_driver_energy-ss-blocked-tp_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") +set (PanzerAdaptersSTK_MixedCurlLaplacianExample-ConvTest-Tri-Order-1_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") +set (PanzerAdaptersSTK_PoissonInterfaceExample_3d_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") +set (PanzerMiniEM_MiniEM-BlockPrec_Augmentation_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") +set (PanzerMiniEM_MiniEM-BlockPrec_RefMaxwell_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") +set (ROL_example_PDE-OPT_poisson-boltzmann_example_01_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing") + include("${CMAKE_CURRENT_LIST_DIR}/PullRequestLinuxCommonTestingSettings.cmake") diff --git a/cmake/std/sems/PullRequestCuda9.2TestingEnv.sh b/cmake/std/sems/PullRequestCuda9.2TestingEnv.sh index 0cde8caf18de..c5532defeab7 100644 --- a/cmake/std/sems/PullRequestCuda9.2TestingEnv.sh +++ b/cmake/std/sems/PullRequestCuda9.2TestingEnv.sh @@ -3,8 +3,6 @@ # usage: $ source PullRequestCUDA9.2TestingEnv.sh -#No SEMS NFS mount on ride -#source /projects/sems/modulefiles/utils/sems-modules-init.sh module load git/2.10.1 module load devpack/20180521/openmpi/2.1.2/gcc/7.2.0/cuda/9.2.88 module swap openblas/0.2.20/gcc/7.2.0 netlib/3.8.0/gcc/7.2.0 @@ -12,13 +10,9 @@ module swap openblas/0.2.20/gcc/7.2.0 netlib/3.8.0/gcc/7.2.0 export OMPI_CXX=$WORKSPACE/Trilinos/packages/kokkos/bin/nvcc_wrapper export OMPI_CC=`which gcc` export OMPI_FC=`which gfortran` -export NVCC_WRAPPER_DEFAULT_COMPILER=`which g++` export CUDA_LAUNCH_BLOCKING=1 +export CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 # Use manually installed cmake and ninja to try to avoid module loading # problems (see TRIL-208) export PATH=/ascldap/users/rabartl/install/white-ride/cmake-3.11.2/bin:/ascldap/users/rabartl/install/white-ride/ninja-1.8.2/bin:$PATH - -# add the OpenMP environment variable we need -export OMP_NUM_THREADS=2 - diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp index def34c796548..99b79c2e605f 100644 --- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp @@ -55,6 +55,8 @@ #include #include #include +#include +#include #include #include #include @@ -74,8 +76,8 @@ #include // need to interface this into cmake variable (or only use this flag when it is necessary) -//#define IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE -#undef IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE +#define IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE +//#undef IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE) #include "cuda_profiler_api.h" #endif @@ -107,6 +109,12 @@ namespace Ifpack2 { template using ConstUnmanaged = Const >; + template + using Scratch = Kokkos::View >; + /// /// cuda specialization /// @@ -266,7 +274,9 @@ namespace Ifpack2 { // packed data always use layout right typedef Kokkos::View vector_type_1d_view; typedef Kokkos::View vector_type_3d_view; + typedef Kokkos::View internal_vector_type_3d_view; typedef Kokkos::View internal_vector_type_4d_view; + typedef Kokkos::View impl_scalar_type_3d_view; typedef Kokkos::View impl_scalar_type_4d_view; }; @@ -771,6 +781,8 @@ namespace Ifpack2 { // space. But it's easy to detect, so it's recorded in case an optimization // can be made based on it. bool row_contiguous; + + local_ordinal_type max_partsz; }; /// @@ -808,7 +820,9 @@ namespace Ifpack2 { // permutation vector std::vector p; - if (!jacobi) { + if (jacobi) { + interf.max_partsz = 1; + } else { // reorder parts to maximize simd packing efficiency p.resize(nparts); @@ -822,6 +836,8 @@ namespace Ifpack2 { }); for (local_ordinal_type i=0;i struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo { typedef KokkosBatched::Experimental::Mode::Team mode_type; @@ -1439,10 +1468,7 @@ namespace Ifpack2 { static int recommended_team_size(const int blksize, const int vector_length, const int internal_vector_length) { - const int vector_size = vector_length/internal_vector_length; - const int total_team_size = blksize <= 8 ? 32 : blksize <= 16 ? 64 : 128; - // to match the old code behavior when internal vector length is 1 - return internal_vector_length == 1 ? 32/vector_size : total_team_size/vector_size; + return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length); } }; template<> @@ -1452,10 +1478,7 @@ namespace Ifpack2 { static int recommended_team_size(const int blksize, const int vector_length, const int internal_vector_length) { - const int vector_size = vector_length/internal_vector_length; - const int total_team_size = blksize <= 8 ? 32 : blksize <= 16 ? 64 : 128; - // to match the old code behavior when internal vector length is 1 - return internal_vector_length == 1 ? 32/vector_size : total_team_size/vector_size; + return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length); } }; #endif @@ -1482,6 +1505,8 @@ namespace Ifpack2 { using vector_type_3d_view = typename impl_type::vector_type_3d_view; using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view; using impl_scalar_type_4d_view = typename impl_type::impl_scalar_type_4d_view; + using internal_vector_scratch_type_3d_view = Scratch; + using impl_scalar_scratch_type_3d_view = Scratch; using internal_vector_type = typename impl_type::internal_vector_type; static constexpr int vector_length = impl_type::vector_length; @@ -1494,6 +1519,7 @@ namespace Ifpack2 { private: // part interface const ConstUnmanaged partptr, lclrow, packptr; + const local_ordinal_type max_partsz; // block crs matrix (it could be Kokkos::UVMSpace::size_type, which is int) using a_rowptr_value_type = typename Kokkos::ViewTraits::size_type; const ConstUnmanaged > A_rowptr; @@ -1514,11 +1540,12 @@ namespace Ifpack2 { ExtractAndFactorizeTridiags(const BlockTridiags &btdm_, const PartInterface &interf_, const Teuchos::RCP &A_, - const magnitude_type& tiny_) : + const magnitude_type& tiny_) : // interface partptr(interf_.partptr), lclrow(interf_.lclrow), packptr(interf_.packptr), + max_partsz(interf_.max_partsz), // block crs matrix A_rowptr(A_->getCrsGraph().getLocalGraph().row_map), A_values(const_cast(A_.get())->template getValues()), @@ -1547,10 +1574,8 @@ namespace Ifpack2 { KOKKOS_INLINE_FUNCTION void - extract(const local_ordinal_type &packidx) const { - local_ordinal_type partidx = packptr(packidx); - local_ordinal_type npacks = packptr(packidx+1) - partidx; - + extract(local_ordinal_type partidx, + local_ordinal_type npacks) const { const size_type kps = pack_td_ptr(partidx); local_ordinal_type kfs[vector_length] = {}; local_ordinal_type ri0[vector_length] = {}; @@ -1594,37 +1619,56 @@ namespace Ifpack2 { KOKKOS_INLINE_FUNCTION void extract(const member_type &member, - const local_ordinal_type &partidx, - const local_ordinal_type &v) const { - const size_type kps = pack_td_ptr(partidx); - const local_ordinal_type kfs = flat_td_ptr(partidx); - const local_ordinal_type ri0 = partptr(partidx); - const local_ordinal_type nrows = partptr(partidx+1) - ri0; - for (local_ordinal_type tr=0,j=0;tr + template KOKKOS_INLINE_FUNCTION void factorize(const member_type &member, const local_ordinal_type &i0, const local_ordinal_type &nrows, const local_ordinal_type &v, - const AAViewType &AA) const { + const AAViewType &AA, + const WWViewType &WW) const { namespace KB = KokkosBatched::Experimental; typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo @@ -1636,9 +1680,7 @@ namespace Ifpack2 { const auto one = Kokkos::ArithTraits::one(); // subview pattern - auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), 0); - //Kokkos::subview(internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), 0); - A.assign_data( &AA(i0,0,0,v) ); + auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v); KB::LU ::invoke(member, A , tiny); @@ -1666,17 +1708,32 @@ namespace Ifpack2 { default_mode_type,KB::Algo::LU::Unblocked> ::invoke(member, A, tiny); } - } + } else { + // for block jacobi invert a matrix here + auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v); + KB::Copy + ::invoke(member, A, W); + KB::SetIdentity + ::invoke(member, A); + member.team_barrier(); + KB::Trsm + ::invoke(member, 1.0, W, A); + KB::Trsm + ::invoke(member, 1.0, W, A); + } } public: - struct ExtractAndFactorizeScalarTag {}; - struct ExtractAndFactorizeInternalVectorTag {}; + struct ExtractAndFactorizeTag {}; KOKKOS_INLINE_FUNCTION void - operator() (const ExtractAndFactorizeScalarTag &, const member_type &member) const { + operator() (const ExtractAndFactorizeTag &, const member_type &member) const { // btdm is packed and sorted from largest one const local_ordinal_type packidx = member.league_rank(); @@ -1684,39 +1741,18 @@ namespace Ifpack2 { const local_ordinal_type npacks = packptr(packidx+1) - partidx; const local_ordinal_type i0 = pack_td_ptr(partidx); const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx); - if (vector_length_value == 1) { - extract(packidx); - factorize(member, i0, nrows, 0, scalar_values); - } else { - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, vector_length_value), [&](const local_ordinal_type &v) { - if (v < npacks) extract(member, partidx+v, v); - factorize(member, i0, nrows, v, scalar_values); - }); - } - } - KOKKOS_INLINE_FUNCTION - void - operator() (const ExtractAndFactorizeInternalVectorTag &, const member_type &member) const { - // btdm is packed and sorted from largest one - const local_ordinal_type packidx = member.league_rank(); - - const local_ordinal_type partidx = packptr(packidx); - const local_ordinal_type npacks = packptr(packidx+1) - partidx; - const local_ordinal_type i0 = pack_td_ptr(partidx); - const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx); + internal_vector_scratch_type_3d_view + WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size); if (vector_loop_size == 1) { - extract(packidx); - factorize(member, i0, nrows, 0, internal_vector_values); + extract(partidx, npacks); + factorize(member, i0, nrows, 0, internal_vector_values, WW); } else { Kokkos::parallel_for (Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const local_ordinal_type &v) { const local_ordinal_type vbeg = v*internal_vector_length; - const local_ordinal_type vend = vbeg + internal_vector_length; - for (local_ordinal_type vv=vbeg;vv:: + const local_ordinal_type team_size = + ExtractAndFactorizeTridiagsDefaultModeAndAlgo:: recommended_team_size(blocksize, vector_length, internal_vector_length); - Kokkos::TeamPolicy + const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view:: + shmem_size(blocksize, blocksize, vector_loop_size); + + Kokkos::TeamPolicy policy(packptr.extent(0)-1, team_size, vector_loop_size); - Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run", - //policy.set_scratch_size(0,Kokkos::PerTeam(10000)), - policy, *this); + Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run", + policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE) cudaProfilerStop(); @@ -2033,7 +2071,7 @@ namespace Ifpack2 { typedef KokkosBatched::Experimental::Mode::Serial mode_type; typedef KokkosBatched::Experimental::Algo::Level2::Unblocked single_vector_algo_type; #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__) - typedef KokkosBatched::Experimental::Algo::Level3::CompactMKL algo_type; + typedef KokkosBatched::Experimental::Algo::Level3::CompactMKL multi_vector_algo_type; #else typedef KokkosBatched::Experimental::Algo::Level3::Blocked multi_vector_algo_type; #endif @@ -2045,6 +2083,20 @@ namespace Ifpack2 { }; #if defined(KOKKOS_ENABLE_CUDA) + static inline int SolveTridiagsRecommendedCudaTeamSize(const int blksize, + const int vector_length, + const int internal_vector_length) { + const int vector_size = vector_length/internal_vector_length; + int total_team_size(0); + if (blksize <= 5) total_team_size = 32; + else if (blksize <= 9) total_team_size = 64; + else if (blksize <= 12) total_team_size = 96; + else if (blksize <= 16) total_team_size = 128; + else if (blksize <= 20) total_team_size = 160; + else total_team_size = 160; + return total_team_size/vector_size; + } + template<> struct SolveTridiagsDefaultModeAndAlgo { typedef KokkosBatched::Experimental::Mode::Team mode_type; @@ -2053,9 +2105,7 @@ namespace Ifpack2 { static int recommended_team_size(const int blksize, const int vector_length, const int internal_vector_length) { - const int vector_size = vector_length/internal_vector_length; - const int total_team_size = blksize <= 8 ? 32 : blksize <= 16 ? 64 : 128; - return internal_vector_length == 1 ? 32/vector_size : total_team_size/vector_size; + return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length); } }; template<> @@ -2066,10 +2116,7 @@ namespace Ifpack2 { static int recommended_team_size(const int blksize, const int vector_length, const int internal_vector_length) { - const int vector_size = vector_length/internal_vector_length; - const int total_team_size = blksize <= 8 ? 32 : blksize <= 16 ? 64 : 128; - // to match the old code behavior when internal vector length is 1 - return internal_vector_length == 1 ? 32/vector_size : total_team_size/vector_size; + return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length); } }; #endif @@ -2092,6 +2139,8 @@ namespace Ifpack2 { using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view; using impl_scalar_type_4d_view = typename impl_type::impl_scalar_type_4d_view; + using internal_vector_scratch_type_3d_view = Scratch; + using internal_vector_type =typename impl_type::internal_vector_type; static constexpr int vector_length = impl_type::vector_length; static constexpr int internal_vector_length = impl_type::internal_vector_length; @@ -2142,6 +2191,7 @@ namespace Ifpack2 { /// /// cuda team vectorization /// + template KOKKOS_INLINE_FUNCTION void solveSingleVector(const member_type &member, @@ -2149,7 +2199,8 @@ namespace Ifpack2 { const local_ordinal_type &i0, const local_ordinal_type &r0, const local_ordinal_type &nrows, - const local_ordinal_type &v) const { + const local_ordinal_type &v, + const WWViewType &WW) const { namespace KB = KokkosBatched::Experimental; typedef SolveTridiagsDefaultModeAndAlgo default_mode_and_algo_type; @@ -2162,6 +2213,7 @@ namespace Ifpack2 { // constant const auto one = Kokkos::ArithTraits::one(); + const auto zero = Kokkos::ArithTraits::zero(); //const local_ordinal_type num_vectors = X_scalar_values.extent(2); // const local_ordinal_type blocksize = D_scalar_values.extent(1); @@ -2180,7 +2232,7 @@ namespace Ifpack2 { X += r0*xstep + v; //for (local_ordinal_type col=0;col 1) { // solve Lx = x KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE (default_mode_type,default_algo_type, @@ -2250,9 +2302,25 @@ namespace Ifpack2 { } // for multiple rhs //X += xs1; - } + } else { + const local_ordinal_type ws0 = WW.stride_0(); + auto W = WW.data() + v; + KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE + (default_mode_type, + member, blocksize, X, xs0, W, ws0); + KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE + (default_mode_type,default_algo_type, + member, + blocksize, blocksize, + one, + A, as0, as1, + W, xs0, + zero, + X, xs0); + } } - + + template KOKKOS_INLINE_FUNCTION void solveMultiVector(const member_type &member, @@ -2260,7 +2328,8 @@ namespace Ifpack2 { const local_ordinal_type &i0, const local_ordinal_type &r0, const local_ordinal_type &nrows, - const local_ordinal_type &v) const { + const local_ordinal_type &v, + const WWViewType &WW) const { namespace KB = KokkosBatched::Experimental; typedef SolveTridiagsDefaultModeAndAlgo default_mode_and_algo_type; @@ -2269,56 +2338,67 @@ namespace Ifpack2 { // constant const auto one = Kokkos::ArithTraits::one(); + const auto zero = Kokkos::ArithTraits::zero(); // subview pattern - auto A = Kokkos::subview(D_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), 0); - auto X1 = Kokkos::subview(X_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), 0); + auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v); + auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v); auto X2 = X1; local_ordinal_type i = i0, r = r0; - // solve Lx = x - A.assign_data( &D_internal_vector_values(i,0,0,v) ); - X1.assign_data( &X_internal_vector_values(r,0,0,v) ); - KB::Trsm - ::invoke(member, one, A, X1); - for (local_ordinal_type tr=1;tr - ::invoke(member, -one, A, X1, one, X2); - A.assign_data( &D_internal_vector_values(i+3,0,0,v) ); - KB::Trsm - ::invoke(member, one, A, X2); - X1.assign_data( X2.data() ); - } - - // solve Ux = x - KB::Trsm - ::invoke(member, one, A, X1); - for (local_ordinal_type tr=nrows;tr>1;--tr) { - i -= 3; - A.assign_data( &D_internal_vector_values(i+1,0,0,v) ); - X2.assign_data( &X_internal_vector_values(--r,0,0,v) ); - KB::Gemm - ::invoke(member, -one, A, X1, one, X2); - A.assign_data( &D_internal_vector_values(i,0,0,v) ); - KB::Trsm - ::invoke(member, one, A, X2); - X1.assign_data( X2.data() ); - } + + if (nrows > 1) { + // solve Lx = x + KB::Trsm + ::invoke(member, one, A, X1); + for (local_ordinal_type tr=1;tr + ::invoke(member, -one, A, X1, one, X2); + A.assign_data( &D_internal_vector_values(i+3,0,0,v) ); + KB::Trsm + ::invoke(member, one, A, X2); + X1.assign_data( X2.data() ); + } + + // solve Ux = x + KB::Trsm + ::invoke(member, one, A, X1); + for (local_ordinal_type tr=nrows;tr>1;--tr) { + i -= 3; + A.assign_data( &D_internal_vector_values(i+1,0,0,v) ); + X2.assign_data( &X_internal_vector_values(--r,0,0,v) ); + KB::Gemm + ::invoke(member, -one, A, X1, one, X2); + A.assign_data( &D_internal_vector_values(i,0,0,v) ); + KB::Trsm + ::invoke(member, one, A, X2); + X1.assign_data( X2.data() ); + } + } else { + // matrix is already inverted + auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v); + KB::Copy + ::invoke(member, X1, W); + KB::Gemm + ::invoke(member, one, A, W, zero, X1); + } } template struct SingleVectorTag {}; @@ -2334,9 +2414,12 @@ namespace Ifpack2 { const local_ordinal_type r0 = part2packrowidx0(partidx); const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx); const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B); + + internal_vector_scratch_type_3d_view + WW(member.team_scratch(0), blocksize, 1, vector_loop_size); Kokkos::parallel_for (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) { - solveSingleVector(member, blocksize, i0, r0, nrows, v); + solveSingleVector(member, blocksize, i0, r0, nrows, v, WW); }); } @@ -2350,9 +2433,13 @@ namespace Ifpack2 { const local_ordinal_type r0 = part2packrowidx0(partidx); const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx); const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B); + const local_ordinal_type num_vectors = X_internal_vector_values.extent(2); + + internal_vector_scratch_type_3d_view + WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size); Kokkos::parallel_for (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) { - solveMultiVector(member, blocksize, i0, r0, nrows, v); + solveMultiVector(member, blocksize, i0, r0, nrows, v, WW); }); } @@ -2371,18 +2458,22 @@ namespace Ifpack2 { const local_ordinal_type team_size = SolveTridiagsDefaultModeAndAlgo:: recommended_team_size(blocksize, vector_length, internal_vector_length); + const int per_team_scratch = internal_vector_scratch_type_3d_view + ::shmem_size(blocksize, num_vectors, vector_loop_size); #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \ if (num_vectors == 1) { \ const Kokkos::TeamPolicy > \ policy(packptr.extent(0) - 1, team_size, vector_loop_size); \ Kokkos::parallel_for \ - ("SolveTridiags::TeamPolicy::run", policy, *this); \ + ("SolveTridiags::TeamPolicy::run", \ + policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \ } else { \ const Kokkos::TeamPolicy > \ policy(packptr.extent(0) - 1, team_size, vector_loop_size); \ Kokkos::parallel_for \ - ("SolveTridiags::TeamPolicy::run", policy, *this); \ + ("SolveTridiags::TeamPolicy::run", \ + policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \ } break switch (blocksize) { @@ -2409,6 +2500,18 @@ namespace Ifpack2 { /// /// compute local residula vector y = b - R x /// + static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize, + const int team_size) { + int total_team_size(0); + if (blksize <= 5) total_team_size = 32; + else if (blksize <= 9) total_team_size = 64; + else if (blksize <= 12) total_team_size = 96; + else if (blksize <= 16) total_team_size = 128; + else if (blksize <= 20) total_team_size = 160; + else total_team_size = 160; + return total_team_size/team_size; + } + template struct ComputeResidualVector { public: @@ -2492,10 +2595,10 @@ namespace Ifpack2 { inline void - serialGemv(const local_ordinal_type &blocksize, - const impl_scalar_type * const __restrict__ AA, - const impl_scalar_type * const __restrict__ xx, - /* */ impl_scalar_type * __restrict__ yy) const { + SerialGemv(const local_ordinal_type &blocksize, + const impl_scalar_type * const KOKKOS_RESTRICT AA, + const impl_scalar_type * const KOKKOS_RESTRICT xx, + /* */ impl_scalar_type * KOKKOS_RESTRICT yy) const { for (local_ordinal_type k0=0;k0 KOKKOS_INLINE_FUNCTION void - vectorCopy(const member_type &member, + VectorCopy(const member_type &member, const local_ordinal_type &blocksize, const bbViewType &bb, const yyViewType &yy) const { @@ -2526,7 +2629,7 @@ namespace Ifpack2 { template KOKKOS_INLINE_FUNCTION void - teamGemv(const member_type &member, + TeamVectorGemv(const member_type &member, const local_ordinal_type &blocksize, const AAViewType &AA, const xxViewType &xx, @@ -2535,17 +2638,53 @@ namespace Ifpack2 { (Kokkos::TeamThreadRange(member, blocksize), [&](const local_ordinal_type &k0) { impl_scalar_type val = 0; - Kokkos::parallel_reduce + Kokkos::parallel_for (Kokkos::ThreadVectorRange(member, blocksize), - [&](const local_ordinal_type &k1, impl_scalar_type &update) { - update += AA(k0,k1)*xx(k1); - }, val); - Kokkos::single(Kokkos::PerThread(member), [&]() { - yy(k0) -= val; + [&](const local_ordinal_type &k1) { + val += AA(k0,k1)*xx(k1); }); + Kokkos::atomic_fetch_add(&yy(k0), -val); }); } + template + KOKKOS_INLINE_FUNCTION + void + VectorGemv(const member_type &member, + const local_ordinal_type &blocksize, + const AAViewType &AA, + const xxViewType &xx, + const yyViewType &yy) const { + Kokkos::parallel_for + (Kokkos::ThreadVectorRange(member, blocksize), + [&](const local_ordinal_type &k0) { + impl_scalar_type val(0); + for (local_ordinal_type k1=0;k1 + // KOKKOS_INLINE_FUNCTION + // void + // VectorGemv(const member_type &member, + // const local_ordinal_type &blocksize, + // const AAViewType &AA, + // const xxViewType &xx, + // const yyViewType &yy) const { + // for (local_ordinal_type k0=0;k0 block_range(0, blocksize); const local_ordinal_type num_vectors = y.extent(1); @@ -2594,24 +2733,26 @@ namespace Ifpack2 { auto yy = Kokkos::subview(y, block_range, 0); auto A_block = ConstUnmanaged(NULL, blocksize, blocksize); - const local_ordinal_type row = i*blocksize; + const local_ordinal_type row = lr*blocksize; for (local_ordinal_type col=0;col @@ -2650,11 +2791,11 @@ namespace Ifpack2 { if (A_colind_at_j < num_local_rows) { const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j; const impl_scalar_type * const xx = &x(loc*blocksize, col); - serialGemv(blocksize, AA,xx,yy); + SerialGemv(blocksize, AA,xx,yy); } else { const auto loc = A_colind_at_j - num_local_rows; const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col); - serialGemv(blocksize, AA,xx_remote,yy); + SerialGemv(blocksize, AA,xx_remote,yy); } } // move yy to y_packed @@ -2694,26 +2835,28 @@ namespace Ifpack2 { bb.assign_data(&b(row, col)); yy.assign_data(&y_packed_scalar(pri, 0, col, v)); if (member.team_rank() == 0) - vectorCopy(member, blocksize, bb, yy); + VectorCopy(member, blocksize, bb, yy); member.team_barrier(); // y -= Rx const size_type A_k0 = A_rowptr[lr]; - for (size_type k=rowptr[lr];k(NULL, blocksize, blocksize); auto colindsub_used = (P == 0 ? colindsub : colindsub_remote); @@ -2812,27 +2955,29 @@ namespace Ifpack2 { // y := b bb.assign_data(&b(row, col)); if (member.team_rank() == 0) - vectorCopy(member, blocksize, bb, yy); + VectorCopy(member, blocksize, bb, yy); member.team_barrier(); } // y -= Rx const size_type A_k0 = A_rowptr[lr]; - for (size_type k=rowptr_used[lr];k::value) { #if defined(KOKKOS_ENABLE_CUDA) - const Kokkos::TeamPolicy policy(rowptr.extent(0) - 1, Kokkos::AUTO()); + const local_ordinal_type blocksize = blocksize_requested; + const local_ordinal_type team_size = 8; + const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size); + const Kokkos::TeamPolicy policy(rowptr.extent(0) - 1, team_size, vector_size); Kokkos::parallel_for ("ComputeResidual::TeamPolicy::run", policy, *this); #endif @@ -2902,12 +3050,16 @@ namespace Ifpack2 { if (is_cuda::value) { #if defined(KOKKOS_ENABLE_CUDA) - local_ordinal_type vl_power_of_two = 1; - for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2); - vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1); - const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two; + const local_ordinal_type blocksize = blocksize_requested; + const local_ordinal_type team_size = 8; + const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size); + // local_ordinal_type vl_power_of_two = 1; + // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2); + // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1); + // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two; #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \ - const Kokkos::TeamPolicy > policy(rowidx2part.extent(0), Kokkos::AUTO(), vl); \ + const Kokkos::TeamPolicy > \ + policy(rowidx2part.extent(0), team_size, vector_size); \ Kokkos::parallel_for \ ("ComputeResidual::TeamPolicy::run", \ policy, *this); } break @@ -2985,17 +3137,22 @@ namespace Ifpack2 { if (is_cuda::value) { #if defined(KOKKOS_ENABLE_CUDA) - local_ordinal_type vl_power_of_two = 1; - for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2); - vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1); - const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two; + const local_ordinal_type blocksize = blocksize_requested; + const local_ordinal_type team_size = 8; + const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size); + // local_ordinal_type vl_power_of_two = 1; + // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2); + // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1); + // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two; #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \ if (compute_owned) { \ - const Kokkos::TeamPolicy > policy(rowidx2part.extent(0), Kokkos::AUTO(), vl); \ + const Kokkos::TeamPolicy > \ + policy(rowidx2part.extent(0), team_size, vector_size); \ Kokkos::parallel_for \ ("ComputeResidual::TeamPolicy::run >", policy, *this); \ } else { \ - const Kokkos::TeamPolicy > policy(rowidx2part.extent(0), Kokkos::AUTO(), vl); \ + const Kokkos::TeamPolicy > \ + policy(rowidx2part.extent(0), team_size, vector_size); \ Kokkos::parallel_for \ ("ComputeResidual::TeamPolicy::run >", policy, *this); \ } break @@ -3019,11 +3176,13 @@ namespace Ifpack2 { #else #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \ if (compute_owned) { \ - const Kokkos::RangePolicy > policy(0, rowidx2part.extent(0)); \ + const Kokkos::RangePolicy > \ + policy(0, rowidx2part.extent(0)); \ Kokkos::parallel_for \ ("ComputeResidual::RangePolicy::run >", policy, *this); \ } else { \ - const Kokkos::RangePolicy > policy(0, rowidx2part.extent(0)); \ + const Kokkos::RangePolicy > \ + policy(0, rowidx2part.extent(0)); \ Kokkos::parallel_for \ ("ComputeResidual::RangePolicy::run >", policy, *this); \ } break @@ -3316,6 +3475,7 @@ namespace Ifpack2 { multivector_converter.to_packed_multivector(YY); } else { // fused y := x - R y and pmv := y(lclrow); + // real use case does not use overlap comp and comm if (overlap_communication_and_computation || !is_async_importer_active) { if (is_async_importer_active) async_importer->asyncSendRecv(YY); compute_residual_vector.run(pmv, XX, YY, remote_multivector, true); diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainer.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainer.cpp index ef14d38cb0a1..87b296b9a4ab 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainer.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainer.cpp @@ -328,6 +328,10 @@ static Teuchos::RCP > make_comm () { } int main (int argc, char** argv) { +#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE) + cudaProfilerStop(); +#endif + Tpetra::ScopeGuard tpetraScope(&argc, &argv); int ret = 0; diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_AddRadial_Decl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_AddRadial_Decl.hpp index c4d6adfe1eb7..fbe4fa2c17ff 100644 --- a/packages/kokkos-kernels/src/batched/KokkosBatched_AddRadial_Decl.hpp +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_AddRadial_Decl.hpp @@ -4,6 +4,8 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Vector.hpp" namespace KokkosBatched { namespace Experimental { diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyGivens_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyGivens_Serial_Internal.hpp new file mode 100644 index 000000000000..70528f5af1de --- /dev/null +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyGivens_Serial_Internal.hpp @@ -0,0 +1,184 @@ +#ifndef __KOKKOSBATCHED_APPLY_GIVENS_SERIAL_INTERNAL_HPP__ +#define __KOKKOSBATCHED_APPLY_GIVENS_SERIAL_INTERNAL_HPP__ + + +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "KokkosBatched_Util.hpp" + + +namespace KokkosBatched { + namespace Experimental { + /// + /// Serial Internal Impl + /// ==================== + /// + /// this impl follows the flame interface of householder transformation + /// + struct SerialApplyLeftGivensInternal { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const Kokkos::pair G, + const int n, + /* */ ValueType * a1t, const int a1ts, + /* */ ValueType * a2t, const int a2ts) { + typedef ValueType value_type; + if (n == 0) return 0; // quick return + if (G.first == value_type(1) && G.second == value_type(0)) return 0; + /// G = [ gamma -sigma; + /// sigma gamma ]; + /// A := G A + /// where gamma is G.first and sigma is G.second + + const value_type gamma = G.first; + const value_type sigma = G.second; +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int j=0;j + KOKKOS_INLINE_FUNCTION + static int + invoke(const Kokkos::pair G, + const int m, + /* */ ValueType * a1, const int a1s, + /* */ ValueType * a2, const int a2s) { + typedef ValueType value_type; + if (m == 0) return 0; // quick return + if (G.first == value_type(1) && G.second == value_type(0)) return 0; + /// G = [ gamma -sigma; + /// sigma gamma ]; + /// A := A G' + /// where gamma is G.first and sigma is G.second + + const value_type gamma = G.first; + const value_type sigma = G.second; +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i=0;i + KOKKOS_INLINE_FUNCTION + static int + invoke(const Kokkos::pair &G12, + const int &m, const int &n, + /* */ ValueType *__restrict__ a1t, + /* */ ValueType *__restrict__ a2t, + /* */ ValueType *__restrict__ a1, + /* */ ValueType *__restrict__ a2, + const int &as0, const int &as1) { + typedef ValueType value_type; + if (G12.first == value_type(1) && G12.second == value_type(0)) return 0; + if (m == 0 && n == 0) return 0; // quick return + + const value_type gamma12 = G12.first; + const value_type sigma12 = G12.second; + +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int j=0;j + KOKKOS_INLINE_FUNCTION + static int + invoke(const Kokkos::pair &G12, + const Kokkos::pair &G13, + const int &m, const int &n, + /* */ ValueType *__restrict__ a1t, + /* */ ValueType *__restrict__ a2t, + /* */ ValueType *__restrict__ a3t, + /* */ ValueType *__restrict__ a1, + /* */ ValueType *__restrict__ a2, + /* */ ValueType *__restrict__ a3, + const int &as0, const int &as1) { + typedef ValueType value_type; + if (m == 0 && n == 0) return 0; // quick return + + const value_type gamma12 = G12.first; + const value_type sigma12 = G12.second; + const value_type gamma13 = G13.first; + const value_type sigma13 = G13.second; + +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int j=0;j + struct SerialApplyHouseholder { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const uViewType &u2, + const tauViewType &tau, + const AViewType + const wViewType &w); + }; + + } +} + +#endif diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyHouseholder_Serial_Impl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyHouseholder_Serial_Impl.hpp new file mode 100644 index 000000000000..c04fbc510dec --- /dev/null +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyHouseholder_Serial_Impl.hpp @@ -0,0 +1,64 @@ +#ifndef __KOKKOSBATCHED_APPLY_HOUSEHOLDER_SERIAL_IMPL_HPP__ +#define __KOKKOSBATCHED_APPLY_HOUSEHOLDER_SERIAL_IMPL_HPP__ + + +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Householder_Serial_Internal.hpp" + + +namespace KokkosBatched { + namespace Experimental { + + /// + /// Serial Impl + /// =========== + + template<> + template + KOKKOS_INLINE_FUNCTION + int + SerialApplyHouseholder:: + invoke(const uViewType &u2, + const tauViewType &tau, + const AViewType &A, + const wViewType &w) { + return SerialApplyLeftHouseholderInternal:: + invoke(A.extent(0)-1, A.extent(1), + tau.data(), + u2.data(), u2.stride(0), + A.data(), A.stride(1), + A.data()+A.stride(0), A.stride(0), A.stride(1), + w.data()); + } + + template<> + template + KOKKOS_INLINE_FUNCTION + int + SerialApplyHouseholder:: + invoke(const uViewType &u2, + const tauViewType &tau, + const AViewType &A, + const wViewType &w) { + return SerialApplyRightHouseholderInternal:: + invoke(A.extent(0), A.extent(1)-1, + tau.data(), + u2.data(), u2.stride(0), + A.data(), A.stride(0), + A.data()+A.stride(1), A.stride(0), A.stride(1), + w.data()); + } + + } +} + + +#endif diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp new file mode 100644 index 000000000000..53bb8f8f4da5 --- /dev/null +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp @@ -0,0 +1,116 @@ +#ifndef __KOKKOSBATCHED_APPLY_HOUSEHOLDER_SERIAL_INTERNAL_HPP__ +#define __KOKKOSBATCHED_APPLY_HOUSEHOLDER_SERIAL_INTERNAL_HPP__ + + +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "KokkosBatched_Util.hpp" + + +namespace KokkosBatched { + namespace Experimental { + /// + /// Serial Internal Impl + /// ==================== + /// + /// this impl follows the flame interface of householder transformation + /// + struct SerialApplyLeftHouseholderInternal { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const int m, + const int n, + const ValueType * tau, + /* */ ValueType * u2, const int u2s, + /* */ ValueType * a1t, const int a1ts, + /* */ ValueType * A2, const int as0, const int as1, + /* */ ValueType * w1t) { + typedef ValueType value_type; + + /// u2 m x 1 + /// a1t 1 x n + /// A2 m x n + + // apply a single householder transform H from the left to a row vector a1t + // and a matrix A2 + const value_type inv_tau = value_type(1)/(*tau); + + // compute the followings: + // a1t -= inv(tau)(a1t + u2'A2) + // A2 -= u2 inv(tau)(a1t + u2'A2) + + // w1t = a1t + u2'A2 = A2^T conj(u2) + // w1t /= tau + for (int j=0;j::conj(u2[i*u2s])*A2[i*as0+j*as1]; + w1t[j] = tmp*inv_tau; // /= (*tau); + } + + // a1t -= w1t (axpy) + for (int j=0;j + KOKKOS_INLINE_FUNCTION + static int + invoke(const int m, + const int n, + const ValueType * tau, + /* */ ValueType * u2, const int u2s, + /* */ ValueType * a1, const int a1s, + /* */ ValueType * A2, const int as0, const int as1, + /* */ ValueType * w1) { + typedef ValueType value_type; + /// u2 n x 1 + /// a1 m x 1 + /// A2 m x n + + // apply a single householder transform H from the left to a row vector a1t + // and a matrix A2 + const value_type inv_tau = value_type(1)/(*tau); + + // compute the followings: + // a1 -= inv(tau)(a1 + A2 u2) + // A2 -= inv(tau)(a1 + A2 u2) u2' + + // w1 = a1 + A2 u2 + // w1 /= tau + for (int i=0;i::conj(u2[j*u2s]); + + return 0; + } + }; + + }// end namespace Experimental +} // end namespace KokkosBatched + + +#endif diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyQ_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyQ_Serial_Internal.hpp new file mode 100644 index 000000000000..190fee0c7225 --- /dev/null +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyQ_Serial_Internal.hpp @@ -0,0 +1,149 @@ +#ifndef __KOKKOSBATCHED_APPLY_Q_SERIAL_INTERNAL_HPP__ +#define __KOKKOSBATCHED_APPLY_Q_SERIAL_INTERNAL_HPP__ + + +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_ApplyHouseholder_Serial_Internal.hpp" + +namespace KokkosBatched { + namespace Experimental { + /// + /// Serial Internal Impl + /// ==================== + /// + /// this impl follows the flame interface of householder transformation + /// + + + struct SerialApplyQ_LeftNoTransForwardInternal { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const int m, + const int n, + const int k, + /* */ ValueType * A, const int as0, const int as1, + /* */ ValueType * t, const int ts, + /* */ ValueType * B, const int bs0, const int bs1, + /* */ ValueType * w) { + typedef ValueType value_type; + + /// Given a matrix A that includes a series of householder vectors, + /// it applies a unitary matrix Q to B from left without transpose + /// B = Q B = (H0 H1 H2 H3 ... H(k-1)) B + /// where + /// A is m x k (holding H0, H1 ... H(k-1) + /// t is k x 1 + /// B is m x n + + // partitions used for loop iteration + Partition2x2 A_part2x2(as0, as1); + Partition3x3 A_part3x3(as0, as1); + + Partition2x1 t_part2x1(ts); + Partition3x1 t_part3x1(ts); + + Partition2x1 B_part2x1(bs0); + Partition3x1 B_part3x1(bs0); + + // initial partition of A where ATL has a zero dimension + A_part2x2.partWithABR(A, m, k, m-k, 0); + t_part2x1.partWithAB (t, k, 0 ); + B_part2x1.partWithAB (B, m, m-k ); + + for (int m_A0=(k-1);m_A0>=0;--m_A0) { + // part 2x2 into 3x3 + A_part3x3.partWithATL(A_part2x2, 1, 1); + t_part3x1.partWithAT (t_part2x1, 1); + value_type *tau = t_part3x1.A1; + + B_part3x1.partWithAT (B_part2x1, 1); + const int m_A2 = m - m_A0 - 1; + /// ----------------------------------------------------- + // left apply householder to partitioned B1 and B2 + SerialApplyLeftHouseholderInternal::invoke(m_A2, n, + tau, + A_part3x3.A21, as0, + B_part3x1.A1, bs1, + B_part3x1.A2, bs0, bs1, + w); + + /// ----------------------------------------------------- + A_part2x2.mergeToABR(A_part3x3); + t_part2x1.mergeToAB (t_part3x1); + B_part2x1.mergeToAB (B_part3x1); + } + return 0; + } + }; + + struct SerialApplyQ_RightNoTransForwardInternal { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const int m, + const int n, + const int k, + /* */ ValueType * A, const int as0, const int as1, + /* */ ValueType * t, const int ts, + /* */ ValueType * B, const int bs0, const int bs1, + /* */ ValueType * w) { + typedef ValueType value_type; + + /// Given a matrix A that includes a series of householder vectors, + /// it applies a unitary matrix Q to B from left without transpose + /// B = B Q = B (H0 H1 H2 H3 ... H(k-1)) + /// where + /// A is n x k (holding H0, H1 ... H(k-1) + /// t is k x 1 + /// B is m x n + + // partitions used for loop iteration + Partition2x2 A_part2x2(as0, as1); + Partition3x3 A_part3x3(as0, as1); + + Partition2x1 t_part2x1(ts); + Partition3x1 t_part3x1(ts); + + Partition1x2 B_part1x2(bs1); + Partition1x3 B_part1x3(bs1); + + // initial partition of A where ATL has a zero dimension + A_part2x2.partWithATL(A, n, k, 0, 0); + t_part2x1.partWithAT (t, k, 0 ); + B_part1x2.partWithAL (B, n, 0 ); + + for (int n_A0=0;n_A0 + struct Copy { + template + KOKKOS_FORCEINLINE_FUNCTION + static int + invoke(const MemberType &member, + const AViewType &A, + const BViewType &B) { + int r_val = 0; + if (std::is_same::value) { + r_val = SerialCopy::invoke(A, B); + } else if (std::is_same::value) { + r_val = TeamCopy::invoke(member, A, B); + } + return r_val; + } + }; + + } } +#define KOKKOSBATCHED_SERIAL_COPY_MATRIX_NO_TRANSPOSE_INTERNAL_INVOKE(M,N,A,AS0,AS1,B,BS0,BS1) \ + KokkosBatched::Experimental::SerialCopyInternal \ + ::invoke(M, N, A, AS0, AS1, B, BS0, BS1) + +#define KOKKOSBATCHED_TEAM_COPY_MATRIX_NO_TRANSPOSE_INTERNAL_INVOKE(MEMBER,M,N,A,AS0,AS1,B,BS0,BS1) \ + KokkosBatched::Experimental::TeamCopyInternal \ + ::invoke(MEMBER, M, N, A, AS0, AS1, B, BS0, BS1) + +#define KOKKOSBATCHED_SERIAL_COPY_VECTOR_INTERNAL_INVOKE(M,A,AS,B,BS) \ + KokkosBatched::Experimental::SerialCopyInternal \ + ::invoke(M, A, AS, B, BS) + +#define KOKKOSBATCHED_TEAM_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(MEMBER,M,A,AS,B,BS) \ + KokkosBatched::Experimental::TeamCopyInternal \ + ::invoke(MEMBER, M, A, AS, B, BS) + +#define KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(MODETYPE,MEMBER,M,A,AS,B,BS) \ + if (std::is_same::value) { \ + KOKKOSBATCHED_SERIAL_COPY_VECTOR_INTERNAL_INVOKE(M,A,AS,B,BS); \ + } else if (std::is_same::value) { \ + KOKKOSBATCHED_TEAM_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(MEMBER,M,A,AS,B,BS); \ + } + #endif diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_Decl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_Decl.hpp new file mode 100644 index 000000000000..1a36d243a791 --- /dev/null +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_Decl.hpp @@ -0,0 +1,68 @@ +#ifndef __KOKKOSBATCHED_EIGENDECOMPOSITION_DECL_HPP__ +#define __KOKKOSBATCHED_EIGENDECOMPOSITION_DECL_HPP__ + + +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Vector.hpp" + +namespace KokkosBatched { + namespace Experimental { + /// Given a general nonsymmetric matrix A (m x m), it performs eigendecomposition + /// of the matrix. + /// + /// Parameters: + /// [in] member + /// Team interface only has this argument. Partial specialization can be applied for + /// a different type of team member. + /// [in/out]A + /// Real general nonsymmetric rank 2 view A(m x m). + /// A is first condensed to a upper Hessenberg form. Then, the Francis + /// double shift QR algorithm is applied to compute its Schur form. + /// On exit, A stores a quasi upper triangular matrix of the Schur decomposition. + /// [out]er, [out]ei + /// A real and imaginary eigenvalues, which forms er(m)+ei(m)i + /// For a complex eigen pair, it stores a+bi and a-bi consecutively. + /// [out]UL, [out]UR + /// Left/right eigenvectors are stored in (m x m) matrices. If zero span view is provided, + /// it does not compute the corresponding eigenvectors. However, both UL and UR cannot have + /// zero span. If eigenvalues are only requested, use the Eigenvalue interface which + /// simplifies computations + /// [out]W + /// 1D contiguous workspace. The minimum size is (2*m*m+5*m) where m is the dimension of matrix A. + + struct SerialEigendecomposition { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const AViewType &A, + const EViewType &er, const EViewType &ei, + const UViewType &UL, const UViewType &UR, + const WViewType &W); + }; + + template + struct TeamEigendecomposition { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, + const AViewType &A, + const EViewType &er, const EViewType &ei, + const UViewType &UL, const UViewType &UR, + const WViewType &W); + }; + + + }/// end namespace Experimental +} /// end namespace KokkosBatched + + +#endif diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_Serial_Impl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_Serial_Impl.hpp new file mode 100644 index 000000000000..4b010dd325a0 --- /dev/null +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_Serial_Impl.hpp @@ -0,0 +1,55 @@ +#ifndef __KOKKOSBATCHED_EIGENDECOMPOSITION_SERIAL_IMPL_HPP__ +#define __KOKKOSBATCHED_EIGENDECOMPOSITION_SERIAL_IMPL_HPP__ + + +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Eigendecomposition_Serial_Internal.hpp" + +namespace KokkosBatched { + namespace Experimental { + /// + /// Serial Impl + /// =========== + + template + KOKKOS_INLINE_FUNCTION + int + SerialEigendecomposition:: + invoke(const AViewType &A, + const EViewType &er, const EViewType &ei, + const UViewType &UL, const UViewType &UR, + const WViewType &W) { + /// view checking + const int m = A.extent(0); + assert(m == int(A.extent(1)) && "Eigendecomposition: A is not square"); + assert(m == int(er.extent(0)) && "Eigendecomposition: Length of er does not match to A's dimension"); + assert(m == int(ei.extent(0)) && "Eigendecomposition: Length of ei does not match to A's dimension"); + assert(m == int(UL.extent(0)) && "Eigendecomposition: Length of UL does not match to A's dimension"); + assert(m == int(UL.extent(1)) && "Eigendecomposition: Width of UL does not match to A's dimension"); + assert(m == int(UR.extent(0)) && "Eigendecomposition: Length of UR does not match to A's dimension"); + assert(m == int(UR.extent(1)) && "Eigendecomposition: Width of UR does not match to A's dimension"); + assert(int(W.extent(0)) >= int(2*m*m+5*m) && "Eigendecomposition: workspace size is too small"); + assert(int(W.stride(0)) == int(1) && "Eigendecomposition: Provided workspace is not contiguous"); + + /// static assert A,er,ei,UL,UR,W has the same value_type + /// static assert all views have the same memory space + return m ? SerialEigendecompositionInternal + ::invoke(A.extent(0), + A.data(), A.stride(0), A.stride(1), + er.data(), er.stride(0), + ei.data(), ei.stride(0), + UL.data(), UL.stride(0), UL.stride(1), + UR.data(), UR.stride(0), UR.stride(1), + W.data(), W.extent(0)) : 0; + } + + }/// end namespace Experimental +} /// end namespace KokkosBatched + + +#endif diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_Serial_Internal.hpp new file mode 100644 index 000000000000..93a13e62f8a1 --- /dev/null +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_Serial_Internal.hpp @@ -0,0 +1,298 @@ +#ifndef __KOKKOSBATCHED_EIGENDECOMPOSITION_SERIAL_INTERNAL_HPP__ +#define __KOKKOSBATCHED_EIGENDECOMPOSITION_SERIAL_INTERNAL_HPP__ + + +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_SetIdentity_Internal.hpp" +#include "KokkosBatched_SetTriangular_Internal.hpp" +#include "KokkosBatched_Normalize_Internal.hpp" +#include "KokkosBatched_Hessenberg_Serial_Internal.hpp" +#include "KokkosBatched_ApplyQ_Serial_Internal.hpp" +#include "KokkosBatched_Schur_Serial_Internal.hpp" +#include "KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp" +#include "KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp" +#include "KokkosBatched_Gemm_Serial_Internal.hpp" + +namespace KokkosBatched { + namespace Experimental { + /// + /// Serial Internal Impl + /// ==================== + + struct SerialEigendecompositionInternal { + /// Given a general nonsymmetric matrix A (m x m), it performs eigendecomposition + /// of the matrix. + /// + /// Parameters: + /// [in]m + /// A dimension of the square matrix H. + /// [in/out]A, [in]as0, [in]as1 + /// Real general nonsymmetric matrix A(m x m) with strides as0 and as1. + /// A is first condensed to a upper Hessenberg form. Then, the Francis + /// double shift QR algorithm is applied to compute its Schur form. + /// On exit, A stores a quasi upper triangular matrix of the Schur decomposition. + /// [out]er, [in]ers, [out]ei, [in]eis + /// A complex vector er(m)+ei(m)i with a stride ers and eis to store computed eigenvalues. + /// For a complex eigen pair, it stores a+bi and a-bi consecutively. + /// [out]UL, [in]uls0, [in] uls1 + /// Left eigenvectors UL(m x m) with strides uls0 and uls1. When UL is NULL, the left + /// eigenvectors are not computed. + /// [out]UR, [in]urs0, [in] urs1 + /// Right eigenvectors UR(m x m) with strides urs0 and urs1. When UR is NULL, the right + /// eigenvectors are not computed. + /// [out]w, [in]wlen + /// Workspace + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const int m, + RealType * A, const int as0, const int as1, + RealType * er, const int ers, + RealType * ei, const int eis, + RealType * UL, const int uls0, const int uls1, + RealType * UR, const int urs0, const int urs1, + RealType * w, const int wlen) { + typedef RealType real_type; + typedef Kokkos::Details::ArithTraits ats; + + const real_type one(1), zero(0), tol = 1e2*ats::epsilon(); + //const Kokkos::pair identity(one, zero); + + /// step 0: input checking + assert( (wlen >= (2*m*m+5*m)) && "Eigendecomposition: workspace size is too small"); + real_type *w_now = w; + int wlen_now = wlen; + assert( (wlen_now >= 0) && "Eigendecomposition: workspace size is negative"); + + const bool is_UL = UL != NULL, is_UR = UR != NULL; + const bool is_U = is_UL || is_UR; + assert( is_U && "Eigendecomposition: eigenvectors are not requested; consider to use SerialEigenvalueInternal"); + + real_type *QZ = w_now; w_now += (m*m); wlen_now -= (m*m); + assert( (wlen_now >= 0) && "Eigendecomposition: QZ allocation fails"); + + const int qzs0 = m, qzs1 = 1, qzs = m+1; /// row major + const int as = as0+as1; + + /// step 1: Hessenberg reduction A = Q H Q^H + /// Q is stored in QZ +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) + { + real_type *t = w_now; w_now += m; wlen_now -= m; + assert( (wlen_now >= 0) && "Eigendecomposition: Hessenberg reduction workspace t allocation fail"); + + if (as0 == 1 || as1 == 1) { /// if mkl can be interfaced, use it + const auto matrix_layout = ( as1 == 1 ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR ); + LAPACKE_dgehrd(matrix_layout, m, 1, m, A, m, t); + + SerialCopyInternal::invoke(m, m, QZ, qzs0, qzs1); + LAPACKE_dorghr(matrix_layout, m, 1, m, QZ, m, t); + } else { /// for arbitrary strides, there is no choice to use tpls + real_type *ww = w_now; w_now += m; wlen_now -= m; + assert( (wlen_now >= 0) && "Eigendecomposition: Hessenberg reduction workspace ww allocation fail"); + + SerialHessenbergInternal::invoke(m, m, + A, as0, as1, + t, 1, + ww); + + SerialSetIdentityInternal::invoke(m, QZ, qzs0, qzs1); + SerialApplyQ_LeftNoTransForwardInternal::invoke(m-1, m-1, m-1, + A+as0, as0, as1, + t, 1, + QZ+qzs, qzs0, qzs1, + ww); + /// recovery of workspace for ww + w_now -= m; wlen_now += m; + } + /// recovery of workspace for t + w_now -= m; wlen_now += m; + + /// clean up H + SerialSetLowerTriangularInternal::invoke(m, m, + 2, + zero, + A, as0, as1); + } +#else + { + real_type *t = w_now; w_now += m; wlen_now -= m; + real_type *ww = w_now; w_now += m; wlen_now -= m; + assert( (wlen_now >= 0) && "Eigendecomposition: Hessenberg reduction workspace t and ww allocation fail"); + + SerialHessenbergInternal::invoke(m, m, + A, as0, as1, + t, 1, + ww); + + SerialSetIdentityInternal::invoke(m, QZ, qzs0, qzs1); + SerialApplyQ_LeftNoTransForwardInternal::invoke(m-1, m-1, m-1, + A+as0, as0, as1, + t, 1, + QZ+qzs, qzs0, qzs1, + ww); + + /// clean up H + SerialSetLowerTriangularInternal::invoke(m, m, + 2, + zero, + A, as0, as1); + + /// recover workspace + w_now -= (2*m); wlen_now += (2*m); + } +#endif + /// step 2: Schur decomposition H = Z T Z^H + /// Z is applied to QZ + { + int r_val = 0; + real_type *ww = w_now; w_now += (5*m); wlen_now -= (5*m); + assert( (wlen_now >= 0) && "Eigendecomposition: Schur decomposition workspace ww allocation fails"); + do { + const bool restart = (r_val < 0); + r_val = SerialSchurInternal::invoke(m, + A, as0, as1, + QZ, qzs0, qzs1, + ww, 5*m, + restart); + } while (r_val < 0 && false); + // for a testing purpose, we run the Schur decomposition with a finite number of Francis iterations + w_now -= (5*m); wlen_now += (5*m); + } + + /// Step 3: Extract iigenvalues and eigenvectors from T = V S V^-1 + /// + { + /// extract eigenvalues + real_type *AA = A-as1; + int *blks = (int*)w_now; w_now += m; wlen_now -= m; + assert( (wlen_now >= 0) && "Eigendecomposition: Eigenvector workspace blks allocation fails"); + + { + int i=0; + for (;i<(m-1);) { + const real_type subdiag = ats::abs(AA[(i+1)*as]); + const real_type diag = A[i*as]; + if (subdiag < tol) { + er[i*ers] = diag; + ei[i*eis] = zero; + blks[i] = 1; + i+=1; + } else { + const real_type offdiag = ats::abs(A[i*as+as1]); + const real_type sqrt_mult_suboffdiags = ats::sqrt(subdiag*offdiag); + er[(i )*ers] = diag; + er[(i+1)*ers] = diag; + ei[(i )*eis] = sqrt_mult_suboffdiags; + ei[(i+1)*eis] = -sqrt_mult_suboffdiags; + blks[i ] = 2; + blks[i+1] = 2; /// consider backward iteration + i+=2; + } + } + if (i= 0) && "Eigendecomposition: Eigenvector workspace V allocation fails"); + + const int vs0 = 1, vs1 = m; + real_type *ww = w_now; w_now += 2*m; wlen_now -= 2*m; + assert( (wlen_now >= 0) && "Eigendecomposition: Eigenvector workspace w allocation fails"); + + /// Right eigenvectors V + if (is_UR) { + SerialRightEigenvectorFromSchurInternal + ::invoke(m, + A, as0, as1, + V, vs0, vs1, + ww, + blks); + + /// QZ V + SerialGemmInternal:: + invoke(m, m, m, + one, + QZ, qzs0, qzs1, + V, vs0, vs1, + zero, + UR, urs0, urs1); + int j=0; + for (;j:: + invoke(m, m, m, + one, + V, vs0, vs1, + QZ, qzs1, qzs0, + zero, + UL, uls0, uls1); + + int i=0; + for (;i + struct TeamEigendecomposition { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, + const AViewType &A, + const EViewType &er, const EViewType &ei, + const UViewType &UL, const UViewType &UR, + const WViewType &W) { + /// view checking + const int m = A.extent(0); + assert(m == A.extent(1) && "Eigendecomposition: A is not square"); + assert(m == er.extent(0) && "Eigendecomposition: Length of er does not match to A's dimension"); + assert(m == ei.extent(0) && "Eigendecomposition: Length of ei does not match to A's dimension"); + assert(m == UL.extent(0) && "Eigendecomposition: Length of UL does not match to A's dimension"); + assert(m == UL.extent(1) && "Eigendecomposition: Width of UL does not match to A's dimension"); + assert(m == UR.extent(0) && "Eigendecomposition: Length of UR does not match to A's dimension"); + assert(m == UR.extent(1) && "Eigendecomposition: Width of UR does not match to A's dimension"); + assert(W.extent(0) >= (2*m*m+5*m) && "Eigendecomposition: workspace size is too small"); + assert(W.stride(0) == 1 && "Eigendecomposition: Provided workspace is not contiguous"); + + return 0; + } + }; + + + }/// end namespace Experimental +} /// end namespace KokkosBatched + + +#endif diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Eigenvalue_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Eigenvalue_Serial_Internal.hpp new file mode 100644 index 000000000000..bd1cecb5c832 --- /dev/null +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Eigenvalue_Serial_Internal.hpp @@ -0,0 +1,225 @@ +#ifndef __KOKKOSBATCHED_EIGENVALUE_SERIAL_INTERNAL_HPP__ +#define __KOKKOSBATCHED_EIGENVALUE_SERIAL_INTERNAL_HPP__ + + +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_WilkinsonShift_Serial_Internal.hpp" +#include "KokkosBatched_Schur2x2_Serial_Internal.hpp" +#include "KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp" +#include "KokkosBatched_Francis_Serial_Internal.hpp" + +namespace KokkosBatched { + namespace Experimental { + /// + /// Serial Internal Impl + /// ==================== + /// + /// this impl follows the flame interface of householder transformation + /// + struct SerialEigenvalueInternal { + /// Given a strictly Hessenberg matrix H (m x m), this computes all eigenvalues + /// using the Francis method and stores them into a vector e. This routine does + /// not scale nor balance the matrix for the numerical stability. + /// + /// Parameters: + /// [in]m + /// A dimension of the square matrix H. + /// [in/out]H, [in]hs0, [in]hs1 + /// Real Hessenberg matrix H(m x m) with strides hs0 and hs1. + /// Entering the routine, H is assumed to have a upper Hessenberg form, where + /// all subdiagonals are zero. The matrix is overwritten on exit. + /// [out]er, [in]ers, [out]ei, [in]eis + /// A complex vector er(m)+ei(m)i with a stride ers and eis to store computed eigenvalues. + /// For a complex eigen pair, it stores a+bi and a-bi consecutively. + /// [in]restart(false) + /// With a restart option, the routine assume that the matrix H and the vector e + /// contain the partial results from the previous run. When m = 1 or 2, this option + /// won't work as the routine always computes the all eigenvalues. + /// [in]max_iteration(300) + /// Unlike LAPACK which uses various methods for different types of matrices, + /// this routine uses the Francis method only. A user can set the maximum number + /// of iterations. When it reaches the maximum iteration counts without converging + /// all eigenvalues, the routine returns -1. + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const int m, + /* */ RealType * H, const int hs0, const int hs1, + /* */ RealType * er, const int ers, + /* */ RealType * ei, const int eis, + const bool restart = false, + const int user_max_iteration = -1) { + typedef RealType real_type; + typedef Kokkos::Details::ArithTraits ats; + const real_type zero(0), nan(ats::nan()), tol = 1e2*ats::epsilon(); + const int max_iteration = user_max_iteration < 0 ? 300 : user_max_iteration; + + int r_val = 0; + if (restart) { + if (m <= 2) { + Kokkos::abort("Error: restart option cannot be used for m=1 or m=2"); + } + } else { + for (int i=0;i lambda1, lambda2; + SerialWilkinsonShiftInternal::invoke(H[0], H[hs1], + H[hs0], H[hs], + &lambda1, &lambda2, + &is_complex); + er[0] = lambda1.real(); ei[0] = lambda1.imag(); + er[1] = lambda2.real(); ei[1] = lambda2.imag(); + break; + } + default: { + /// Francis method + int iter(0); /// iteration count + bool converge = false; /// bool to check all eigenvalues are converged + + while (!converge && iter < max_iteration) { + /// Step 1: find a set of unrevealed eigenvalues + int cnt = 1; + + /// find mbeg (first nonzero subdiag value) + for (;cnt tol) break; + } + const int mbeg = cnt-1; + + /// find mend (first zero subdiag value) + for (;cnt lambda1, lambda2; + bool is_complex; + real_type *sub2x2 = H+(mend-2)*hs; + if (2 == mdiff) { + SerialWilkinsonShiftInternal::invoke(sub2x2[0], sub2x2[hs1], + sub2x2[hs0], sub2x2[hs], + &lambda1, &lambda2, + &is_complex); + sub2x2[hs0] = zero; + + /// eigenvalues are from wilkinson shift + er[(mbeg+0)*ers] = lambda1.real(); ei[(mbeg+0)*eis] = lambda1.imag(); + er[(mbeg+1)*ers] = lambda2.real(); ei[(mbeg+1)*eis] = lambda2.imag(); + } else { + SerialWilkinsonShiftInternal::invoke(sub2x2[0], sub2x2[hs1], + sub2x2[hs0], sub2x2[hs], + &lambda1, &lambda2, + &is_complex); + + SerialFrancisInternal::invoke(0, mdiff, mdiff, + H+hs*mbeg, hs0, hs1, + lambda1, lambda2, + is_complex); + /* */ auto &val1 = *(sub2x2+hs0); + /* */ auto &val2 = *(sub2x2-hs1); + const auto abs_val1 = ats::abs(val1); + const auto abs_val2 = ats::abs(val2); + + /// convergence check + if (abs_val1 < tol) { + er[(mend-1)*ers] = sub2x2[hs]; ei[(mend-1)*eis] = zero; + val1 = zero; + } else if (abs_val2 < tol) { + er[(mend-2)*ers] = lambda1.real(); ei[(mend-2)*eis] = lambda1.imag(); + er[(mend-1)*ers] = lambda2.real(); ei[(mend-1)*eis] = lambda2.imag(); + + val1 = zero; + val2 = zero; + } + } + } +# endif + + } else { + /// all eigenvalues are converged + converge = true; + } + ++iter; + } + /// Step 3: record missing real eigenvalues from the diagonals + if (converge) { + // record undetected eigenvalues + for (int i=0;i + KOKKOS_INLINE_FUNCTION + static int + invoke(const int m, + /* */ RealType * H, const int hs0, const int hs1, + /* */ Kokkos::complex * e, const int es, + const int max_iteration = 300, + const RealType user_tolerence = RealType(-1), + const bool restart = false) { + RealType * er = (RealType*)e; + RealType * ei = er+1; + const int two_es = 2*es; + return invoke(m, + H, hs0, hs1, + er, two_es, + ei, two_es, + user_tolerence, + restart, + max_iteration); + } + }; + + + + }/// end namespace Experimental +} /// end namespace KokkosBatched + + +#endif diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Francis_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Francis_Serial_Internal.hpp new file mode 100644 index 000000000000..b1e55a543e0e --- /dev/null +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Francis_Serial_Internal.hpp @@ -0,0 +1,215 @@ +#ifndef __KOKKOSBATCHED_FRANCIS_SERIAL_INTERNAL_HPP__ +#define __KOKKOSBATCHED_FRANCIS_SERIAL_INTERNAL_HPP__ + + +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Givens_Serial_Internal.hpp" +#include "KokkosBatched_ApplyGivens_Serial_Internal.hpp" + +namespace KokkosBatched { + namespace Experimental { + /// + /// Serial Internal Impl + /// ==================== + /// + /// this impl follows the flame interface of householder transformation + /// + struct SerialFrancisInternal { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const int mbeg, const int mend, const int morg, + /* */ ValueType * HH, const int hs0, const int hs1, + const Kokkos::complex lambda1, + const Kokkos::complex lambda2, + const bool is_complex, + /* */ Kokkos::pair * GG, const bool request_schur) { + typedef ValueType value_type; + + const int hs = hs0+hs1; + const value_type one(1), zero(0); + const Kokkos::pair identity(one,zero); + + /// redefine variables + const int m = mend-mbeg, mrst = morg-mend, mbeg_mult_hs0 = mbeg*hs0; + value_type *H = HH+hs*mbeg; + + /// initialize Gs + Kokkos::pair *Gs = NULL; + if (request_schur) { + Gs = (Kokkos::pair *)(GG+mbeg*2); + for (int i=0;i G[2]; + + /// 0. compute 1st double shift vector + value_type v[3]; + { + // this needs m>=3 + // v = M e_1 = (H*H - 2 Re(lambda) H + |lambda|^2 I)e_1 + value_type s, t; + const value_type + h00 = H[0*hs0+0*hs1], h01 = H[0*hs0+1*hs1], + h10 = H[1*hs0+0*hs1], h11 = H[1*hs0+1*hs1], + /* */ h21 = H[2*hs0+1*hs1]; + if (is_complex) { + s = 2*lambda1.real(); + t = lambda1.real()*lambda1.real()+lambda1.imag()*lambda1.imag(); + } else { + const value_type val = H[(m-1)*hs]; + const auto dist_lambda1 = Kokkos::Details::ArithTraits::abs(lambda1.real() - val); + const auto dist_lambda2 = Kokkos::Details::ArithTraits::abs(lambda2.real() - val); + const value_type lambda = dist_lambda1 < dist_lambda2 ? lambda1.real() : lambda2.real(); + s = 2*lambda; + t = lambda*lambda; + } + v[0] = h00*h00+h01*h10 /* H^2 e_1 */ - s*h00 /* 2 Re(lambda) */ + t; + v[1] = h10*h00+h11*h10 /* */ - s*h10; + v[2] = h21*h10; + } + + /// 1. compute the first two givens rotations that introduces a bulge + { + SerialGivensInternal::invoke(v[0], v[1], + &G[0], + &v[0]); + SerialGivensInternal::invoke(v[0], v[2], + &G[1], + &v[0]); + // record + if (request_schur) { + Gs[0] = G[0]; + Gs[1] = G[1]; + } + + // apply G' from left and right + G[0].second = -G[0].second; + G[1].second = -G[1].second; + + const int mm = m < 4 ? m : 4, nn = m; + value_type *Hs = H-mbeg_mult_hs0; + SerialApplyLeftRightGivensInternal + ::invoke (G[0], G[1], + mm+mbeg, nn+mrst, + H, H +hs0,H +2*hs0, + Hs, Hs+hs1,Hs+2*hs1, + hs0, hs1); + } + + /// 1. chase the bulge + + // partitions used for loop iteration + Partition2x2 H_part2x2(hs0, hs1); + Partition3x3 H_part3x3(hs0, hs1); + + // initial partition of A where ATL has a zero dimension + int m_htl = 1; + H_part2x2.partWithATL(H, m, m, m_htl, m_htl); + for (;m_htl<(m-2);++m_htl) { + // part 2x2 into 3x3 + H_part3x3.partWithABR(H_part2x2, 1, 1); + /// ----------------------------------------------------- + value_type *chi1 = H_part3x3.A11-hs1; + value_type *chi2 = chi1+hs0; + value_type *chi3 = chi2+hs0; + + SerialGivensInternal::invoke(*chi1, *chi2, + &G[0], + chi1); *chi2 = zero; + SerialGivensInternal::invoke(*chi1, *chi3, + &G[1], + chi1); *chi3 = zero; + // record + if (request_schur) { + Gs[2*m_htl ] = G[0]; + Gs[2*m_htl+1] = G[1]; + } + + G[0].second = -G[0].second; + G[1].second = -G[1].second; + + const int nn = m-m_htl, mtmp = m_htl+4, mm = mtmp < m ? mtmp : m; + value_type *a1t = H_part3x3.A11; + value_type *a2t = a1t+hs0; + value_type *a3t = a2t+hs0; + value_type *a1 = H_part3x3.A01-mbeg_mult_hs0; + value_type *a2 = a1+hs1; + value_type *a3 = a2+hs1; + + SerialApplyLeftRightGivensInternal + ::invoke (G[0], G[1], + mm+mbeg, nn+mrst, + a1t, a2t, a3t, + a1, a2, a3, + hs0, hs1); + /// ----------------------------------------------------- + H_part2x2.mergeToATL(H_part3x3); + } + + // last 3x3 block + { + // part 2x2 into 3x3 + H_part3x3.partWithABR(H_part2x2, 1, 1); + /// ----------------------------------------------------- + value_type *chi1 = H_part3x3.A11-hs1; + value_type *chi2 = chi1+hs0; + SerialGivensInternal::invoke(*chi1, *chi2, + &G[0], + chi1); *chi2 = zero; + Gs[2*m_htl ] = G[0]; + Gs[2*m_htl+1] = identity; + + G[0].second = -G[0].second; + + const int mm = m, nn = 2; + value_type *a1t = H_part3x3.A11; + value_type *a2t = a1t+hs0; + value_type *a1 = H_part3x3.A01-mbeg_mult_hs0; + value_type *a2 = a1+hs1; + SerialApplyLeftRightGivensInternal + ::invoke (G[0], + mm+mbeg, nn+mrst, + a1t, a2t, + a1, a2, + hs0, hs1); + + /// ----------------------------------------------------- + H_part2x2.mergeToATL(H_part3x3); + } + + return 0; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + static int + invoke(const int mbeg, const int mend, const int morg, + /* */ ValueType * HH, const int hs0, const int hs1, + const Kokkos::complex lambda1, + const Kokkos::complex lambda2, + const bool is_complex) { + return invoke(mbeg, mend, morg, + HH, hs0, hs1, + lambda1, lambda2, is_complex, + (Kokkos::pair*)NULL, false); + } + }; + + }// end namespace Experimental +} // end namespace KokkosBatched + + +#endif diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Gemm_Decl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Gemm_Decl.hpp index cd5f1bd302ed..a4c2ead53f75 100644 --- a/packages/kokkos-kernels/src/batched/KokkosBatched_Gemm_Decl.hpp +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Gemm_Decl.hpp @@ -4,6 +4,8 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Vector.hpp" namespace KokkosBatched { namespace Experimental { diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Gemv_Decl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Gemv_Decl.hpp index ed910469dc52..98ab2916dbd3 100644 --- a/packages/kokkos-kernels/src/batched/KokkosBatched_Gemv_Decl.hpp +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Gemv_Decl.hpp @@ -4,6 +4,9 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Vector.hpp" + namespace KokkosBatched { namespace Experimental { diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Givens_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Givens_Serial_Internal.hpp new file mode 100644 index 000000000000..ebc6515cafe6 --- /dev/null +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Givens_Serial_Internal.hpp @@ -0,0 +1,70 @@ +#ifndef __KOKKOSBATCHED_GIVENS_SERIAL_INTERNAL_HPP__ +#define __KOKKOSBATCHED_GIVENS_SERIAL_INTERNAL_HPP__ + + +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "KokkosBatched_Util.hpp" + + +namespace KokkosBatched { + namespace Experimental { + /// + /// Serial Internal Impl + /// ==================== + /// + /// this impl follows the flame interface of householder transformation + /// + struct SerialGivensInternal { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const ValueType chi1, + const ValueType chi2, + /* */ Kokkos::pair * G, + /* */ ValueType * chi1_new) { + typedef ValueType value_type; + const value_type zero(0), one(1); + /// compute G = [ gamma -sigma; + /// sigma gamma ]; + /// G.first = gamma and G.second = sigma + /// this rotation satisfy the following + /// G' [chi1; = [ alpha; + /// chi2] zero ]; + value_type cs, sn, r; + if (chi2 == zero) { + r = chi1; + cs = one; + sn = zero; + } else if (chi1 == zero) { + r = chi2; + cs = zero; + sn = one; + } else { + // here we do not care overflow caused by the division although it is probable.... + r = Kokkos::Details::ArithTraits::sqrt(chi1*chi1 + chi2*chi2); + cs = chi1/r; + sn = chi2/r; + + if (Kokkos::Details::ArithTraits::abs(chi1) > + Kokkos::Details::ArithTraits::abs(chi2) && cs < zero) { + cs = -cs; + sn = -sn; + r = -r; + } + + } + + G->first = cs; + G->second = sn; + *chi1_new = r; + + return 0; + } + }; + + }// end namespace Experimental +} // end namespace KokkosBatched + + +#endif diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp new file mode 100644 index 000000000000..dc52f03486a2 --- /dev/null +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp @@ -0,0 +1,59 @@ +#ifndef __KOKKOSBATCHED_HESSENBERG_FORM_Q_SERIAL_INTERNAL_HPP__ +#define __KOKKOSBATCHED_HESSENBERG_FORM_Q_SERIAL_INTERNAL_HPP__ + + +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Set_Internal.hpp" +#include "KokkosBatched_SetIdentity_Internal.hpp" +#include "KokkosBatched_ApplyQ_Serial_Internal.hpp" + +namespace KokkosBatched { + namespace Experimental { + /// + /// Serial Internal Impl + /// ==================== + /// + /// this impl follows the flame interface of householder transformation + /// + struct SerialHessenbergFormQInternal { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const int m, + const int k, + /* */ ValueType * A, const int as0, const int as1, + /* */ ValueType * t, const int ts, + /* */ ValueType * Q, const int qs0, const int qs1, + /* */ ValueType * w, + const bool is_Q_zero = false) { + typedef ValueType value_type; + + /// Given a matrix A that includes Hessenberg factorization + /// it forms a unitary matrix Q + /// B = Q = (H0 H1 H2 H3 ... H(k-2)) I + /// where + /// A is m x k (holding H0, H1 ... H(k-2) + /// t is k x 1 + /// B is m x m + // set identity + if (is_Q_zero) + SerialSetInternal::invoke(m, value_type(1), Q, qs0+qs1); + else + SerialSetIdentityInternal::invoke(m, Q, qs0, qs1); + + return SerialApplyQ_LeftNoTransForwardInternal + ::invoke(m-1, m-1, k-1, + A+as0, as0, as1, + t, ts, + Q+qs0+qs1, qs1, qs0, + w); + } + }; + + }// end namespace Experimental +} // end namespace KokkosBatched + + +#endif diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp new file mode 100644 index 000000000000..a96661febf81 --- /dev/null +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp @@ -0,0 +1,139 @@ +#ifndef __KOKKOSBATCHED_HESSENBERG_QR_WITH_SHIFT_SERIAL_INTERNAL_HPP__ +#define __KOKKOSBATCHED_HESSENBERG_QR_WITH_SHIFT_SERIAL_INTERNAL_HPP__ + + +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Givens_Serial_Internal.hpp" +#include "KokkosBatched_ApplyGivens_Serial_Internal.hpp" + +namespace KokkosBatched { + namespace Experimental { + /// + /// Serial Internal Impl + /// ==================== + /// + /// this impl follows the flame interface of householder transformation + /// + struct SerialHessenbergQR_WithShiftInternal { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const int mbeg, const int mend, const int morg, + /* */ ValueType * HH, const int hs0, const int hs1, + const ValueType shift, + /* */ Kokkos::pair * GG, const bool request_schur) { + typedef ValueType value_type; + //typedef Kokkos::Details::ArithTraits ats; + + const int hs = hs0+hs1; + const value_type zero(0), one(1); + const Kokkos::pair identity(one,zero); + + /// redefine variables + const int m = mend-mbeg, mbeg_mult_hs0 = mbeg*hs0; + value_type *H = HH+mbeg*hs; + + /// initialize Gs + Kokkos::pair *Gs = NULL; + if (request_schur) { + for (int i=0;i G; + + /// 0. compute the first givens rotation that introduces a bulge + { + const value_type chi1 = H[0] - shift; + const value_type chi2 = H[hs0]; + /* */ value_type alpha; + SerialGivensInternal::invoke(chi1, chi2, + &G, + &alpha); + // record G + if (request_schur) Gs[0] = G; + + value_type *h11 = H; + value_type *h21 = H + hs0; + value_type *h12 = H + hs1; + + // apply G' from left + G.second = -G.second; // transpose G + const int nn = m; + SerialApplyLeftGivensInternal::invoke (G, nn+(morg-mend), + h11, hs1, + h21, hs1); + + // apply (G')' from right + const int mm = m < 3 ? m : 3; + SerialApplyRightGivensInternal::invoke(G, mm+mbeg, + h11-mbeg_mult_hs0, hs0, + h12-mbeg_mult_hs0, hs0); + } + + /// 1. chase the bulge + + // partitions used for loop iteration + Partition2x2 H_part2x2(hs0, hs1); + Partition3x3 H_part3x3(hs0, hs1); + + // initial partition of A where ATL has a zero dimension + H_part2x2.partWithATL(H, m, m, 1, 1); + + for (int m_htl=1;m_htl<(m-1);++m_htl) { + // part 2x2 into 3x3 + H_part3x3.partWithABR(H_part2x2, 1, 1); + //const int n_hbr = m - m_htl; + /// ----------------------------------------------------- + value_type *chi1 = H_part3x3.A11-hs1; + value_type *chi2 = H_part3x3.A21-hs1; + SerialGivensInternal::invoke(*chi1, *chi2, + &G, + chi1); *chi2 = zero; + // record G + if (request_schur) Gs[m_htl] = G; + + G.second = -G.second; // transpose G + + const int nn = m - m_htl; + SerialApplyLeftGivensInternal::invoke (G, nn+(morg-mend), + H_part3x3.A11, hs1, + H_part3x3.A21, hs1); + + const int mtmp = m_htl+3, mm = mtmp < m ? mtmp : m; + SerialApplyRightGivensInternal::invoke(G, mm+mbeg, + H_part3x3.A01-mbeg_mult_hs0, hs0, + H_part3x3.A02-mbeg_mult_hs0, hs0); + /// ----------------------------------------------------- + H_part2x2.mergeToATL(H_part3x3); + } + return 0; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + static int + invoke(const int mbeg, const int mend, const int morg, + /* */ ValueType * HH, const int hs0, const int hs1, + const ValueType shift) { + return invoke(mbeg, mend, morg, + HH, hs0, hs1, shift, + (Kokkos::pair*)NULL, false); + + } + }; + + }// end namespace Experimental +} // end namespace KokkosBatched + + +#endif diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Hessenberg_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Hessenberg_Serial_Internal.hpp new file mode 100644 index 000000000000..5bd6b0cd1339 --- /dev/null +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Hessenberg_Serial_Internal.hpp @@ -0,0 +1,107 @@ +#ifndef __KOKKOSBATCHED_HESSENBERG_SERIAL_INTERNAL_HPP__ +#define __KOKKOSBATCHED_HESSENBERG_SERIAL_INTERNAL_HPP__ + + +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Householder_Serial_Internal.hpp" +#include "KokkosBatched_ApplyHouseholder_Serial_Internal.hpp" + +namespace KokkosBatched { + namespace Experimental { + /// + /// Serial Internal Impl + /// ==================== + /// + /// this impl follows the flame interface of householder transformation + /// + struct SerialHessenbergInternal { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const int m, // m = NumRows(A) + const int n, // n = NumCols(A) + /* */ ValueType * A, const int as0, const int as1, + /* */ ValueType * t, const int ts, + /* */ ValueType * w) { + typedef ValueType value_type; + + /// Given a matrix A, it computes hessenberg decomposition of the matrix + /// - t is to store tau and w is for workspace + /// - H = Q^H A Q and A = Q H Q^H + + // partitions used for loop iteration + Partition2x2 A_part2x2(as0, as1); + Partition3x3 A_part3x3(as0, as1); + + Partition2x1 t_part2x1(ts); + Partition3x1 t_part3x1(ts); + + // partitions used in loop body + Partition2x1 A21_part2x1(as0); + Partition2x1 A22_part2x1(as0); + Partition1x2 A2_part1x2 (as1); + + // initial partition of A where ATL has a zero dimension + A_part2x2.partWithATL(A, m, n, 0, 0); + t_part2x1.partWithAT (t, m, 0 ); + + for (int m_atl=0;m_atl 0) { + // partition A21 into 2x1 + A21_part2x1.partWithAT(A_part3x3.A21, m_A22, 1); + + // perform householder transformation + const int m_A22_b = m_A22 - 1; + SerialLeftHouseholderInternal::invoke(m_A22_b, + A21_part2x1.AT, + A21_part2x1.AB, as0, + tau); + + // partition A22 into 2x1 + A22_part2x1.partWithAT(A_part3x3.A22, m_A22, 1); + + // left apply householder to partitioned A22 + SerialApplyLeftHouseholderInternal::invoke(m_A22_b, n_A22, + tau, + A21_part2x1.AB, as0, + A22_part2x1.AT, as1, + A22_part2x1.AB, as0, as1, + w); + + + // partition A*2 column into 1x2 + A2_part1x2.partWithAL(A_part3x3.A02, n_A22, 1); + + // right apply householder to A*2 colums + const int n_A22_r = n_A22 - 1; + SerialApplyRightHouseholderInternal::invoke(m, n_A22_r, + tau, + A21_part2x1.AB, as0, + A2_part1x2.AL, as0, + A2_part1x2.AR, as0, as1, + w); + } + /// ----------------------------------------------------- + A_part2x2.mergeToATL(A_part3x3); + t_part2x1.mergeToAT (t_part3x1); + } + return 0; + } + }; + + }// end namespace Experimental +} // end namespace KokkosBatched + + +#endif diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Householder_Decl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Householder_Decl.hpp new file mode 100644 index 000000000000..dc06dd2e776b --- /dev/null +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Householder_Decl.hpp @@ -0,0 +1,31 @@ +#ifndef __KOKKOSBATCHED_HOUSEHOLDER_DECL_HPP__ +#define __KOKKOSBATCHED_HOUSEHOLDER_DECL_HPP__ + + +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Vector.hpp" + + +namespace KokkosBatched { + namespace Experimental { + /// + /// Serial Householder + /// + + // level 1 operation (no blocking algorithm info avail) + template + struct SerialHouseholder { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const aViewType &a, + const tauViewType &tau); + }; + + } +} + +#endif diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Householder_Serial_Impl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Householder_Serial_Impl.hpp new file mode 100644 index 000000000000..ebc5da5393d4 --- /dev/null +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Householder_Serial_Impl.hpp @@ -0,0 +1,37 @@ +#ifndef __KOKKOSBATCHED_HOUSEHOLDER_SERIAL_IMPL_HPP__ +#define __KOKKOSBATCHED_HOUSEHOLDER_SERIAL_IMPL_HPP__ + + +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Householder_Serial_Internal.hpp" + + +namespace KokkosBatched { + namespace Experimental { + + /// + /// Serial Impl + /// =========== + + template<> + template + KOKKOS_INLINE_FUNCTION + int + SerialHouseholder:: + invoke(const aViewType &a, + const tauViewType &tau) { + return SerialLeftHouseholderInternal:: + invoke(a.extent(0)-1, + a.data(), + a.data()+a.stride(0), a.stride(0), + tau.data()); + } + + } +} + + +#endif diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Householder_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Householder_Serial_Internal.hpp new file mode 100644 index 000000000000..208cfaaa454d --- /dev/null +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Householder_Serial_Internal.hpp @@ -0,0 +1,82 @@ +#ifndef __KOKKOSBATCHED_HOUSEHOLDER_SERIAL_INTERNAL_HPP__ +#define __KOKKOSBATCHED_HOUSEHOLDER_SERIAL_INTERNAL_HPP__ + + +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "KokkosBatched_Util.hpp" + + +namespace KokkosBatched { + namespace Experimental { + /// + /// Serial Internal Impl + /// ==================== + /// + /// this impl follows the flame interface of householder transformation + /// + struct SerialLeftHouseholderInternal { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const int m_x2, + /* */ ValueType * chi1, + /* */ ValueType * x2, const int x2s, + /* */ ValueType * tau) { + typedef ValueType value_type; + typedef typename Kokkos::Details::ArithTraits::mag_type mag_type; + + const mag_type zero(0); + const mag_type half(0.5); + const mag_type one(1); + const mag_type minus_one(-1); + + /// compute the 2norm of x2 + mag_type norm_x2_square(0); + for (int i=0;i::abs(*chi1); + + /// compute 2 norm of x using norm_chi1 and norm_x2 + const mag_type norm_x = Kokkos::Details::ArithTraits::sqrt(norm_x2_square + norm_chi1*norm_chi1); + + /// compute alpha + const mag_type alpha = (*chi1 < 0 ? one : minus_one)*norm_x; + + /// overwrite x2 with u2 + const value_type chi1_minus_alpha = *chi1 - alpha; + const value_type inv_chi1_minus_alpha = one/chi1_minus_alpha; + for (int i=0;i struct SerialInverseLU { - // no piv version template + typename wViewType> KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, - const WViewType &W); + const wViewType &w) { + typedef typename wViewType::value_type value_type; + // workspace w is always 1D view; reinterpret it + Kokkos::View + W(w.data(), A.extent(0), A.extent(1)); + + int r_val[3] = {}; + r_val[0] = SerialCopy::invoke(A, W); + r_val[1] = SerialSetIdentity::invoke(A); + r_val[2] = SerialSolveLU::invoke(W, A); + return r_val[0]+r_val[1]+r_val[2]; + } }; template struct TeamInverseLU { - // no piv version template + typename wViewType> KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, - const WViewType &W); + const wViewType &w) { + typedef typename wViewType::value_type value_type; + // workspace w is always 1D view; reinterpret it + Kokkos::View + W(w.data(), A.extent(0), A.extent(1)); + + int r_val[3] = {}; + r_val[0] = TeamCopy::invoke(member, A, W); + r_val[1] = TeamSetIdentity::invoke(member, A); + r_val[2] = TeamSolveLU::invoke(member, W, A); + return r_val[0]+r_val[1]+r_val[2]; + } }; } diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_InverseLU_Serial_Impl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_InverseLU_Serial_Impl.hpp index 8a815de40732..b2ad7bedb32f 100644 --- a/packages/kokkos-kernels/src/batched/KokkosBatched_InverseLU_Serial_Impl.hpp +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_InverseLU_Serial_Impl.hpp @@ -68,90 +68,6 @@ namespace KokkosBatched { } #endif - template<> - template - KOKKOS_INLINE_FUNCTION - int - SerialInverseLU:: - invoke(const AViewType &A, - const WViewType &W) { - static_assert(AViewType::rank == 2, "A should have two dimensions"); - static_assert(WViewType::rank == 1, "W should have one dimension"); - static_assert(std::is_same::value, "A and W should be on the same memory space"); - static_assert(!std::is_same::value, "W should be an contiguous 1D array"); - assert(A.extent(0)*A.extent(1)*sizeof(typename AViewType::value_type) <= W.span()*sizeof(typename WViewType::value_type)); - assert(A.extent(0)==A.extent(1)); - - typedef typename AViewType::value_type ScalarType; - - auto B = Kokkos::View >(W.data(), A.extent(0), A.extent(1)); - - const ScalarType one(1.0); - -#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) -#pragma unroll -#endif - for (size_t i=0;i::invoke(A,B); - -#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) -#pragma unroll -#endif - for (size_t i=0;i - template - KOKKOS_INLINE_FUNCTION - int - SerialInverseLU:: - invoke(const AViewType &A, - const WViewType &W) { - static_assert(AViewType::rank == 2, "A should have two dimensions"); - static_assert(WViewType::rank == 1, "W should have one dimension"); - static_assert(std::is_same::value, "A and W should be on the same memory space"); - static_assert(!std::is_same::value, "W should be an contiguous 1D array"); - assert(A.extent(0)*A.extent(1)*sizeof(typename AViewType::value_type) <= W.span()*sizeof(typename WViewType::value_type)); - assert(A.extent(0)==A.extent(1)); - - typedef typename AViewType::value_type ScalarType; - - auto B = Kokkos::View >(W.data(), A.extent(0), A.extent(1)); - - const ScalarType one(1.0); - -#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) -#pragma unroll -#endif - for (size_t i=0;i::invoke(A,B); - -#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) -#pragma unroll -#endif - for (size_t i=0;i + KOKKOS_INLINE_FUNCTION + static int + invoke(const int m, + /* */ ValueType * S, const int ss0, const int ss1, + /* */ ValueType * V, const int vs0, const int vs1, + /* */ ValueType * w, + const int * blks) { + typedef ValueType value_type; + typedef Kokkos::Details::ArithTraits ats; + //typedef typename ats::mag_type mag_type; + typedef Kokkos::complex complex_type; + + const value_type zero(0), one(1); + /// SerialSetInternal::invoke(m, m, zero, V, vs0, vs1); + + value_type *b = w; // consider complex case + + /// partitions used for loop iteration + Partition2x2 S_part2x2(ss0, ss1); + Partition3x3 S_part3x3(ss0, ss1); + + Partition2x1 V_part2x1(vs0); + Partition3x1 V_part3x1(vs0); + + /// initial partition of S where ATL has a zero dimension + S_part2x2.partWithATL(S, m, m, 0, 0); + V_part2x1.partWithAT(V, m, 0); + + //const mag_type tol = ats::epsilon(); + int m_stl = 0; + for (;m_stl<(m-1);) { + /// part 2x2 into 3x3 + const int mA11 = blks[m_stl]; + assert( ((mA11 == 1) || (mA11 == 2)) && "LeftEigenvectorFromSchur: blk is not 1x1 nor 2x2"); + + S_part3x3.partWithABR(S_part2x2, mA11, mA11); + V_part3x1.partWithAB(V_part2x1, mA11); + + const int m_stl_plus_mA11 = m_stl+mA11; + if (mA11 == 1) { + /// real eigenvalue + const value_type lambda = *S_part3x3.A11; + + /// initialize a left hand side + b[m_stl] = one; + for (int j=0;j<(m-m_stl_plus_mA11);++j) + b[j+m_stl_plus_mA11] = -S_part3x3.A12[j*ss1]; + + /// perform shifted trsv (transposed) + SerialShiftedTrsvInternalLower::invoke(m-m_stl_plus_mA11, lambda, + S_part3x3.A22, ss1, ss0, + b+m_stl_plus_mA11, 1, + blks+m_stl_plus_mA11); + + /// copy back to V (row wise copy) + for (int j=0;j + KOKKOS_INLINE_FUNCTION + static int + invoke(const int m, + /* */ ValueType *__restrict__ v, const int vs) { + typedef ValueType value_type; + typedef Kokkos::Details::ArithTraits ats; + typedef typename ats::mag_type mag_type; + + mag_type norm(0); +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i=0;i::sqrt(norm); +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i=0;i + KOKKOS_INLINE_FUNCTION + static int + invoke(const int m, + /* */ RealType *__restrict__ vr, const int vrs, + /* */ RealType *__restrict__ vi, const int vis) { + typedef RealType real_type; + typedef Kokkos::Details::ArithTraits ats; + typedef typename ats::mag_type mag_type; + + mag_type norm(0); +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i=0;i::sqrt(norm); +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i=0;i + KOKKOS_INLINE_FUNCTION + static int + invoke(const int m, + const int k, + /* */ ValueType * A, const int as0, const int as1, + /* */ ValueType * t, const int ts, + /* */ ValueType * Q, const int qs0, const int qs1, + /* */ ValueType * w, + const bool is_Q_zero = false) { + typedef ValueType value_type; + + /// Given a matrix A that includes QR factorization + /// it forms a unitary matrix Q + /// B = Q = (H0 H1 H2 H3 ... H(k-1)) I + /// where + /// A is m x k (holding H0, H1 ... H(k-1) + /// t is k x 1 + /// B is m x m + + // set identity + if (is_Q_zero) + SerialSetInternal::invoke(m, value_type(1), Q, qs0+qs1); + else + SerialSetIdentityInternal::invoke(m, Q, qs0, qs1); + + return SerialApplyQ_LeftNoTransForwardInternal + ::invoke(m, m, k, + A, as0, as1, + t, ts, + Q, qs0, qs1, + w); + } + }; + + }// end namespace Experimental +} // end namespace KokkosBatched + + +#endif diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_QR_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_QR_Serial_Internal.hpp new file mode 100644 index 000000000000..2420ed2de8d1 --- /dev/null +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_QR_Serial_Internal.hpp @@ -0,0 +1,80 @@ +#ifndef __KOKKOSBATCHED_QR_SERIAL_INTERNAL_HPP__ +#define __KOKKOSBATCHED_QR_SERIAL_INTERNAL_HPP__ + + +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Householder_Serial_Internal.hpp" +#include "KokkosBatched_ApplyHouseholder_Serial_Internal.hpp" + +namespace KokkosBatched { + namespace Experimental { + /// + /// Serial Internal Impl + /// ==================== + /// + /// this impl follows the flame interface of householder transformation + /// + struct SerialQR_Internal { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const int m, // m = NumRows(A) + const int n, // n = NumCols(A) + /* */ ValueType * A, const int as0, const int as1, + /* */ ValueType * t, const int ts, + /* */ ValueType * w) { + typedef ValueType value_type; + + /// Given a matrix A, it computes QR decomposition of the matrix + /// - t is to store tau and w is for workspace + + // partitions used for loop iteration + Partition2x2 A_part2x2(as0, as1); + Partition3x3 A_part3x3(as0, as1); + + Partition2x1 t_part2x1(ts); + Partition3x1 t_part3x1(ts); + + // initial partition of A where ATL has a zero dimension + A_part2x2.partWithATL(A, m, n, 0, 0); + t_part2x1.partWithAT (t, m, 0); + + for (int m_atl=0;m_atl + KOKKOS_INLINE_FUNCTION + static int + invoke(const int m, + /* */ ValueType * S, const int ss0, const int ss1, + /* */ ValueType * V, const int vs0, const int vs1, + /* */ ValueType * w, + const int * blks) { + typedef ValueType value_type; + typedef Kokkos::Details::ArithTraits ats; + //typedef typename ats::mag_type mag_type; + typedef Kokkos::complex complex_type; + + const value_type zero(0), one(1); + //const int ss(ss0+ss1); + /// SerialSetInternal::invoke(m, m, zero, V, vs0, vs1); + + value_type *b = w; // consider complex case + + /// partitions used for loop iteration + Partition2x2 S_part2x2(ss0, ss1); + Partition3x3 S_part3x3(ss0, ss1); + + Partition1x2 V_part1x2(vs1); + Partition1x3 V_part1x3(vs1); + + /// initial partition of S where ABR has a zero dimension + S_part2x2.partWithABR(S, m, m, 0, 0); + V_part1x2.partWithAR(V, m, 0); + + //const mag_type tol = ats::epsilon(); + int m_stl = m; + for (;m_stl>0;) { + /// part 2x2 into 3x3 + const int mA11 = blks[m_stl-1]; + assert( ((mA11 == 1) || (mA11 == 2)) && "RightEigenvectorFromSchur: blk is not 1x1 nor 2x2"); + + S_part3x3.partWithATL(S_part2x2, mA11, mA11); + V_part1x3.partWithAL(V_part1x2, mA11); + + const int m_stl_minus_mA11 = m_stl - mA11; + if (mA11 == 1) { + /// real eigenvalue + const value_type lambda = *S_part3x3.A11; + + /// initialize a right eigen vector + for (int i=0;i + KOKKOS_INLINE_FUNCTION + static int + invoke(RealType * alpha00, RealType * alpha01, + RealType * alpha10, RealType * alpha11, + Kokkos::pair * G, + Kokkos::complex * lambda1, + Kokkos::complex * lambda2, + bool * is_complex) { + typedef RealType real_type; + typedef Kokkos::Details::ArithTraits ats; + const real_type zero(0), one(1), half(0.5), minus_one(-1); + /// compute G = [ gamma -sigma; + /// sigma gamma ]; + /// G.first = gamma and G.second = sigma + /// this rotation satisfy the following + /// G' [alpha00 alpha01 G = [ beta00 beta01; + /// alpha10 alpha11 ] beta10 beta11 ]; + /// where either + /// 1) beta00 = beta11 and beta01*beta10 < 0 + /// 2) beta10 = 0 + const real_type tol = ats::epsilon()*real_type(100); + if (ats::abs(*alpha10) < tol) { + /// no rotation + *G = Kokkos::pair(one, zero); + /// two real eigen values + *lambda1 = Kokkos::complex(*alpha00, zero); + *lambda2 = Kokkos::complex(*alpha11, zero); + *is_complex = false; + } else if (ats::abs(*alpha01) < tol) { + /// 90 degree rotation (permutation) + *G = Kokkos::pair(zero, one); + /// [ 0 1 ][alpha00 0 [ 0 -1 --> [ alpha11 -alpha10 + /// -1 0 ] alpha10 alpha11] 1 0] 0 alpha00] + const real_type tmp = *alpha00; *alpha00 = *alpha11; *alpha11 = tmp; + *alpha01 = -(*alpha10); *alpha10 = zero; + /// two real eigen values + *lambda1 = Kokkos::complex(*alpha00, zero); + *lambda2 = Kokkos::complex(*alpha11, zero); + *is_complex = false; + } else if (ats::abs(*alpha00-*alpha11) < tol && (*alpha01)*(*alpha10) > zero) { + // no rotation (already the standard schur form) + *G = Kokkos::pair(one, zero); + /// two real eigen values + *lambda1 = Kokkos::complex(*alpha00, zero); + *lambda2 = Kokkos::complex(*alpha11, zero); + *is_complex = false; + } else { + /// rotation to equalize diagonals + const real_type a = (*alpha00)-(*alpha11); + const real_type b = (*alpha01)+(*alpha10); + const real_type l = ats::sqrt(a*a+b*b); + const real_type c = ats::sqrt(half*(one+ats::abs(b)/l)); + const real_type s = -((half*a)/(l*c))*(b > zero ? one : minus_one); + *G = Kokkos::pair(c, s); + /// [ gamma sigma ][ alpha00 alpha01 [ gamma -sigma --> [ alpha11 -alpha10 + /// -sigma gamma ] alpha10 alpha11 ] sigma gamma ] 0 alpha00] + const real_type a00 = *alpha00, a01 = *alpha01; + const real_type a10 = *alpha10, a11 = *alpha11; + const real_type cc = c*c, cs = c*s, ss= s*s; + *alpha00 = cc*a00 + cs*a01 + cs*a10 + ss*a11; + *alpha01 = -cs*a00 + cc*a01 - ss*a10 + cs*a11; + *alpha10 = -cs*a00 - ss*a01 + cc*a10 + cs*a11; + *alpha11 = ss*a00 - cs*a01 - cs*a10 + cc*a11; + + const real_type tmp = (*alpha00 + *alpha11)*half; + *alpha00 = tmp; + *alpha11 = tmp; + + const real_type mult_alpha_offdiags = (*alpha10)*(*alpha01); + if (mult_alpha_offdiags > zero) { + /// transforms the matrix into a upper triangular + const real_type sqrt_mult_alpha_offdiags = ats::sqrt(mult_alpha_offdiags); + + /// redefine the rotation matrix + //const real_type sqrt_abs_alpha01 = ats::sqrt(ats::abs(*alpha01)); + //const real_type sqrt_abs_alpha10 = ats::sqrt(ats::abs(*alpha10)); + const real_type abs_sum_offidags = ats::abs((*alpha01)+(*alpha10)); + const real_type c1 = ats::sqrt(ats::abs(*alpha01)/abs_sum_offidags); + const real_type s1 = ats::sqrt(ats::abs(*alpha10)/abs_sum_offidags); + const real_type sign_alpha10 = *alpha10 > zero ? one : minus_one; + + *G = Kokkos::pair(c*c1-s*s1,c*s1+s*c1); + + /// apply rotation to 2x2 matrix so that alpha10 becomes zero + *alpha00 = tmp + sign_alpha10*sqrt_mult_alpha_offdiags; + *alpha11 = tmp - sign_alpha10*sqrt_mult_alpha_offdiags; + *alpha01 = (*alpha01)-(*alpha10); + *alpha10 = zero; + + // two real eigen values + *lambda1 = Kokkos::complex(*alpha00); + *lambda2 = Kokkos::complex(*alpha11); + *is_complex = false; + } else { + /// two complex eigen values + const real_type sqrt_mult_alpha_offdiags = ats::sqrt(-mult_alpha_offdiags); + *lambda1 = Kokkos::complex(tmp, sqrt_mult_alpha_offdiags); + *lambda2 = Kokkos::complex(lambda1->real(), -lambda1->imag()); + *is_complex = true; + } + } + return 0; + } + }; + + }// end namespace Experimental +} // end namespace KokkosBatched + + +#endif diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Schur_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Schur_Serial_Internal.hpp new file mode 100644 index 000000000000..a4a4aadbf08c --- /dev/null +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Schur_Serial_Internal.hpp @@ -0,0 +1,273 @@ +#ifndef __KOKKOSBATCHED_SCHUR_SERIAL_INTERNAL_HPP__ +#define __KOKKOSBATCHED_SCHUR_SERIAL_INTERNAL_HPP__ + + +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_WilkinsonShift_Serial_Internal.hpp" +#include "KokkosBatched_ApplyGivens_Serial_Internal.hpp" +#include "KokkosBatched_Schur2x2_Serial_Internal.hpp" +#include "KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp" +#include "KokkosBatched_Francis_Serial_Internal.hpp" + +namespace KokkosBatched { + namespace Experimental { + /// + /// Serial Internal Impl + /// ==================== + /// + /// this impl follows the flame interface of householder transformation + /// + struct SerialSchurInternal { + /// Given a strictly Hessenberg matrix H (m x m), this computes schur decomposition + /// using the Francis method and stores them into a vector e. This routine does + /// not scale nor balance the matrix for the numerical stability. + /// H = Z T Z^H and T = Z^H H Z + /// Parameters: + /// [in]m + /// A dimension of the square matrix H. + /// [in/out]H, [in]hs0, [in]hs1 + /// Real Hessenberg matrix H(m x m) with strides hs0 and hs1. + /// Entering the routine, H is assumed to have a upper Hessenberg form, where + /// all subdiagonals are zero. The matrix is overwritten as a upper triangular T + /// on exit. + /// [in/out]Z, [in]zs0, [in]zs1 + /// Unitary matrix resulting from Schur decomposition. With a restarting option, + /// the matrix may contain previous partial computation results. + /// [in/out]w, [in]wlen + /// Contiguous workspace of which size is wlen. When restart is true, this + /// workspace is not corrupted after the previous iteration. Temporarily, it stores + /// subdiag values and given rotations. wlen should be at least 3*m. + /// [in]restart(false) + /// With a restart option, the routine assume that the matrix H and the vector e + /// contain the partial results from the previous run. When m = 1 or 2, this option + /// won't work as the routine always computes the all eigenvalues. + /// [in]user_max_iteration(300) + /// Unlike LAPACK which uses various methods for different types of matrices, + /// this routine uses the Francis method only. A user can set the maximum number + /// of iterations. When it reaches the maximum iteration counts without converging + /// all eigenvalues, the routine returns -1. + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const int m, + /* */ RealType * H, const int hs0, const int hs1, + /* */ RealType * Z, const int zs0, const int zs1, + /* */ RealType * w, const int wlen, + const bool restart = false, + const int user_max_iteration = -1) { + typedef RealType real_type; + typedef Kokkos::Details::ArithTraits ats; + const real_type /* one(1), */zero(0), tol = 1e2*ats::epsilon(); + const int max_iteration = user_max_iteration < 0 ? 300 : user_max_iteration; + if (wlen < m*5) + Kokkos::abort("Error: provided workspace is smaller than 3*m"); + + int r_val = 0; + if (restart) { + if (m <= 2) + Kokkos::abort("Error: restart option cannot be used for m=1 or m=2"); + } else { + /// do not touch input + /// SerialSetIdentityInternal::invoke(m, Z, zs0, zs1); + } + + // workspaces + real_type *subdiags = w; + Kokkos::pair *Gs = (Kokkos::pair*)(w+m); + if (!restart) { + /// initialize workspace and Gs + for (int i=0;i lambda1, lambda2; + Kokkos::pair G; + SerialSchur2x2Internal::invoke(H, H+hs1, + H+hs0, H+hs, + &G, + &lambda1, &lambda2, + &is_complex); + + G.second = -G.second; // transpose + SerialApplyRightGivensInternal::invoke(G, 2, + Z, zs0, + Z+zs1, zs0); + break; + } + default: { + /// Francis method + int iter(0); /// iteration count + bool converge = false; /// bool to check all eigenvalues are converged + + while (!converge && iter < max_iteration) { + /// Step 1: find a set of unrevealed eigenvalues + int cnt = 1; + + /// find mbeg (first nonzero subdiag value) + for (;cnt tol) break; + } + const int mbeg = cnt-1; + + /// find mend (first zero subdiag value) + for (;cnt G(Gs[i].first, -Gs[i].second); + SerialApplyRightGivensInternal::invoke(G, m, + Z+i*zs1, zs0, + Z+i*zs1+zs1, zs0); + } + } +# endif + +# if 1 + { + /// find a complex eigen pair + Kokkos::complex lambda1, lambda2; + bool is_complex; + real_type *sub2x2 = H+(mend-2)*hs; + if (2 == mdiff) { + Kokkos::pair G; + SerialSchur2x2Internal::invoke(sub2x2, sub2x2+hs1, + sub2x2+hs0, sub2x2+hs, + &G, + &lambda1, &lambda2, + &is_complex); + subdiags[mend-1] = sub2x2[hs0]; + + /// apply G' from left + G.second = -G.second; + SerialApplyLeftGivensInternal::invoke (G, m-mend, + sub2x2 +2*hs1, hs1, + sub2x2+hs0+2*hs1, hs1); + + /// apply (G')' from right + SerialApplyRightGivensInternal::invoke(G, mend-2, + sub2x2 -mend_minus_two_mult_hs0, hs0, + sub2x2+hs1-mend_minus_two_mult_hs0, hs0); + sub2x2[hs0] = zero; + + /// apply (G')' from right to compute Z + SerialApplyRightGivensInternal::invoke(G, m, + Z+(mend-2)*zs1, zs0, + Z+(mend-1)*zs1, zs0); + + } else { + SerialWilkinsonShiftInternal::invoke(sub2x2[0], sub2x2[hs1], + sub2x2[hs0], sub2x2[hs], + &lambda1, &lambda2, + &is_complex); + + SerialFrancisInternal::invoke(mbeg, mend, m, + H, hs0, hs1, + lambda1, lambda2, + is_complex, + Gs, true); + /* */ auto &val1 = *(sub2x2+hs0); + /* */ auto &val2 = *(sub2x2-hs1); + const auto abs_val1 = ats::abs(val1); + const auto abs_val2 = ats::abs(val2); + + for (int i=mbeg;i<(mend-1);++i) { + const Kokkos::pair G0(Gs[2*i ].first, -Gs[2*i ].second); + const Kokkos::pair G1(Gs[2*i+1].first, -Gs[2*i+1].second); + SerialApplyRightGivensInternal::invoke(G0, m, + Z+i*zs1, zs0, + Z+i*zs1+1*zs1, zs0); + SerialApplyRightGivensInternal::invoke(G1, m, + Z+i*zs1, zs0, + Z+i*zs1+2*zs1, zs0); + } + + /// convergence check + if (abs_val1 < tol) { + val1 = zero; + } else if (abs_val2 < tol) { + /// preserve the standard schur form + Kokkos::pair G; + SerialSchur2x2Internal::invoke(sub2x2, sub2x2+hs1, + sub2x2+hs0, sub2x2+hs, + &G, + &lambda1, &lambda2, + &is_complex); + subdiags[mend-1] = val1; + + /// apply G' from left + G.second = -G.second; + SerialApplyLeftGivensInternal::invoke (G, m-mend, + sub2x2 +2*hs1, hs1, + sub2x2+hs0+2*hs1, hs1); + + // apply (G')' from right + SerialApplyRightGivensInternal::invoke(G, mend-2, + sub2x2 -mend_minus_two_mult_hs0, hs0, + sub2x2+hs1-mend_minus_two_mult_hs0, hs0); + val1 = zero; + val2 = zero; + + // apply (G')' from right + SerialApplyRightGivensInternal::invoke(G, m, + Z+(mend-2)*zs1, zs0, + Z+(mend-1)*zs1, zs0); + } + } + } +# endif + } else { + /// all eigenvalues are converged + converge = true; + } + ++iter; + } + /// Step 3: record missing real eigenvalues from the diagonals + if (converge) { + // recover subdiags + real_type *Hs = H-hs1; + for (int i=1;i + KOKKOS_INLINE_FUNCTION + static int + invoke(const AViewType &A); + }; + + /// + /// Team Set + /// + + template + struct TeamSetIdentity { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, + const AViewType &A); + }; + + + /// + /// Selective Interface + /// + template + struct SetIdentity { + template + KOKKOS_FORCEINLINE_FUNCTION + static int + invoke(const MemberType &member, + const AViewType &A) { + int r_val = 0; + if (std::is_same::value) { + r_val = SerialSetIdentity::invoke(A); + } else if (std::is_same::value) { + r_val = TeamSetIdentity::invoke(member, A); + } + return r_val; + } + }; + + } +} + + +#endif diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_SetIdentity_Impl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_SetIdentity_Impl.hpp new file mode 100644 index 000000000000..4cd7da82da4f --- /dev/null +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_SetIdentity_Impl.hpp @@ -0,0 +1,48 @@ +#ifndef __KOKKOSBATCHED_SET_IDENTITY_IMPL_HPP__ +#define __KOKKOSBATCHED_SET_IDENTITY_IMPL_HPP__ + + +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_SetIdentity_Internal.hpp" + + +namespace KokkosBatched { + namespace Experimental { + /// + /// Serial Impl + /// =========== + + template + KOKKOS_INLINE_FUNCTION + int + SerialSetIdentity:: + invoke(const AViewType &A) { + return SerialSetIdentityInternal:: + invoke(A.extent(0), + A.data(), A.stride_0(), A.stride_1()); + } + + /// + /// Team Impl + /// ========= + + template + template + KOKKOS_INLINE_FUNCTION + int + TeamSetIdentity:: + invoke(const MemberType &member, + const AViewType &A) { + return TeamSetIdentityInternal:: + invoke(member, + A.extent(0), + A.data(), A.stride_0(), A.stride_1()); + } + + } // end namespace Experimental +} //end namespace KokkosBatched + + +#endif diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_SetIdentity_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_SetIdentity_Internal.hpp new file mode 100644 index 000000000000..1c4e653b3154 --- /dev/null +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_SetIdentity_Internal.hpp @@ -0,0 +1,65 @@ +#ifndef __KOKKOSBATCHED_SET_IDENTITY_INTERNAL_HPP__ +#define __KOKKOSBATCHED_SET_IDENTITY_INTERNAL_HPP__ + + +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "KokkosBatched_Util.hpp" + + +namespace KokkosBatched { + namespace Experimental { + /// + /// Serial Internal Impl + /// ==================== + struct SerialSetIdentityInternal { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const int m, + /* */ ValueType *__restrict__ A, const int as0, const int as1) { + const ValueType one(1), zero(0); + for (int j=0;j + KOKKOS_INLINE_FUNCTION + static int + invoke(const MemberType &member, + const int m, + /* */ ValueType *__restrict__ A, const int as0, const int as1) { + const ValueType one(1), zero(0); + Kokkos::parallel_for + (Kokkos::TeamThreadRange(member,0,m), + [&](const int &i) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int j=0;j + KOKKOS_INLINE_FUNCTION + static int + invoke(const int m, const int n, + const int dist, + const ScalarType alpha, + /* */ ValueType *__restrict__ A, const int as0, const int as1) { + for (int j=0;j + KOKKOS_INLINE_FUNCTION + static int + invoke(const int m, + const ScalarType lambda, + const ValueTypeA *__restrict__ A, const int as0, const int as1, + /* */ ValueTypeB *__restrict__ b, const int bs0, + const int *__restrict__ blks) { + const int as = as0+as1; + + int p = 0; + for (;p + KOKKOS_INLINE_FUNCTION + static int + invoke(const int m, + const ScalarType lambda, + const ValueTypeA *__restrict__ A, const int as0, const int as1, + /**/ ValueTypeB *__restrict__ b, const int bs0, + const int *__restrict__ blks) { + const int as = as0+as1; + + ValueTypeB *__restrict__ b0 = b; + + int p = m-1; + for (;p>=0;) { + const int blk = blks[p], iend = p+1-blk; + assert( ((blk == 1) || (blk == 2)) && "ShiftedTrsvUpper: blocks are not 1x1 or 2x2"); + if (blk == 1) { + const auto alpha11 = A[p*as]-lambda; + /**/ ValueTypeB *__restrict__ beta1 = b+p*bs0; + + // with __restrict__ a compiler assumes that the pointer is not accessed by others + // op(/=) uses this pointer and changes the associated values, which brings a compiler problem + *beta1 = *beta1 / alpha11; + + if (iend) { + const ValueTypeA *__restrict__ a01 = A+p*as1; + for (int i=0;i + template struct SerialSolveLU { // no piv version template::value) { + //First, compute Y (= U*X) by solving the system L*Y = B for Y + r_val[0] = SerialTrsm::invoke(one, A, B); + //Second, compute X by solving the system U*X = Y for X + r_val[1] = SerialTrsm::invoke(one, A, B); + } else if (std::is_same::value || + std::is_same::value) { + //First, compute Y (= L'*X) by solving the system U'*Y = B for Y + r_val[0] = SerialTrsm::invoke(one, A, B); + //Second, compute X by solving the system L'*X = Y for X + r_val[1] = SerialTrsm::invoke(one, A, B); + } + return r_val[0]+r_val[1]; + } }; template + typename ArgTrans, + typename ArgAlgo> struct TeamSolveLU { // no piv version template::value) { + //First, compute Y (= U*X) by solving the system L*Y = B for Y + r_val[0] = TeamTrsm::invoke(member, one, A, B); + //Second, compute X by solving the system U*X = Y for X + r_val[1] = TeamTrsm::invoke(member, one, A, B); + } else if (std::is_same::value || + std::is_same::value) { + //First, compute Y (= L'*X) by solving the system U'*Y = B for Y + r_val[0] = TeamTrsm::invoke(member, one, A, B); + //Second, compute X by solving the system L'*X = Y for X + r_val[1] = TeamTrsm::invoke(member, one, A, B); + } + return r_val[0]+r_val[1]; + } }; + + /// + /// Selective Interface + /// + template + struct SolveLU { + // no piv version + template + KOKKOS_FORCEINLINE_FUNCTION + static int + invoke(const MemberType &member, + const AViewType &A, + const BViewType &B) { + int r_val = 0; + if (std::is_same::value) { + r_val = SerialSolveLU::invoke(A, B); + } else if (std::is_same::value) { + r_val = TeamSolveLU::invoke(member, A, B); + } + return r_val; + } + }; + } } diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Trsm_Decl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Trsm_Decl.hpp index dce98acac487..24f39dc6c2b0 100644 --- a/packages/kokkos-kernels/src/batched/KokkosBatched_Trsm_Decl.hpp +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Trsm_Decl.hpp @@ -4,6 +4,9 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Vector.hpp" + namespace KokkosBatched { namespace Experimental { diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Trsv_Decl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Trsv_Decl.hpp index f7d5391921b2..b31cddeaf2b3 100644 --- a/packages/kokkos-kernels/src/batched/KokkosBatched_Trsv_Decl.hpp +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Trsv_Decl.hpp @@ -4,6 +4,9 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Vector.hpp" + namespace KokkosBatched { namespace Experimental { diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_UpdateGivens_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_UpdateGivens_Internal.hpp new file mode 100644 index 000000000000..5c7967bf31ee --- /dev/null +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_UpdateGivens_Internal.hpp @@ -0,0 +1,37 @@ +#ifndef __KOKKOSBATCHED_UPDATE_GIVENS_INTERNAL_HPP__ +#define __KOKKOSBATCHED_UPDATE_GIVENS_INTERNAL_HPP__ + + +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "KokkosBatched_Util.hpp" + + +namespace KokkosBatched { + namespace Experimental { + /// + /// Serial Internal Impl + /// ==================== + /// + /// this impl follows the flame interface of householder transformation + /// + struct SerialUpdateGivensInternal { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const Kokkos::pair &S, + /* */ Kokkos::pair &G) { + const ValueType + tmp = S.first*G.first -S.second*G.second; + G.second = S.first*G.second+S.second*G.first; + G.first = tmp; + + return 0; + } + }; + + }// end namespace Experimental +} // end namespace KokkosBatched + + +#endif diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Util.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Util.hpp index 0c6bd67dba6d..95dcb3c9120f 100644 --- a/packages/kokkos-kernels/src/batched/KokkosBatched_Util.hpp +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Util.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -366,6 +367,201 @@ namespace KokkosBatched { }; + template struct Partition1x2; + template struct Partition1x3; + + template + struct Partition1x2 { + const int as1; + ValueType *AL, *AR; + + KOKKOS_INLINE_FUNCTION + Partition1x2(const int arg_as1) + : as1(arg_as1), AL(NULL), AR(NULL) {} + + KOKKOS_INLINE_FUNCTION + void partWithAL(ValueType *A, const int nA, const int nAL) { + AL = A; AR = AL+nAL*as1; + } + + KOKKOS_INLINE_FUNCTION + void partWithAR(ValueType *A, const int nA, const int nAR) { + AL = A; AR = AL+(nA-nAR)*as1; + } + + // A0 A1 are merged into AL + KOKKOS_INLINE_FUNCTION + void mergeToAL(const Partition1x3 &part) { + AL = part.A0; AR = part.A2; + } + + // A0 A1 are merged into AL + KOKKOS_INLINE_FUNCTION + void mergeToAR(const Partition1x3 &part) { + AL = part.A0; AR = part.A1; + } + }; + + template + struct Partition1x3 { + const int as1; + ValueType *A0, *A1, *A2; + + KOKKOS_INLINE_FUNCTION + Partition1x3(const int arg_as1) + : as1(arg_as1), A0(NULL), A1(NULL), A2(NULL) {} + + KOKKOS_INLINE_FUNCTION + void partWithAL(const Partition1x2 &part, const int mA1) { + A0 = part.AL; A2 = part.AR; A1 = A2 - mA1*as1; + } + KOKKOS_INLINE_FUNCTION + void partWithAR(const Partition1x2 &part, const int mA1) { + A0 = part.AL; A1 = part.AR; A2 = A1 + mA1*as1; + } + }; + + + template struct Partition2x1; + template struct Partition3x1; + + template + struct Partition2x1 { + const int as0; + ValueType *AT, *AB; + + KOKKOS_INLINE_FUNCTION + Partition2x1(const int arg_as0) + : as0(arg_as0), AT(NULL), AB(NULL) {} + + KOKKOS_INLINE_FUNCTION + void partWithAT(ValueType *A, const int mA, const int mAT) { + AT = A; + AB = AT+mAT*as0; + } + + KOKKOS_INLINE_FUNCTION + void partWithAB(ValueType *A, const int mA, const int mAB) { + partWithAT(A, mA, mA-mAB); + } + + // A0 + // A1 is merged into AT + KOKKOS_INLINE_FUNCTION + void mergeToAT(const Partition3x1 &part) { + AT = part.A0; + AB = part.A2; + } + + KOKKOS_INLINE_FUNCTION + void mergeToAB(const Partition3x1 &part) { + AT = part.A0; + AB = part.A1; + } + }; + + template + struct Partition3x1 { + const int as0; + ValueType *A0, + /* */ *A1, + /* */ *A2; + + KOKKOS_INLINE_FUNCTION + Partition3x1(const int arg_as0) + : as0(arg_as0), + A0(NULL), + A1(NULL), + A2(NULL) {} + + KOKKOS_INLINE_FUNCTION + void partWithAB(const Partition2x1 &part, const int mA1) { + A0 = part.AT; + A1 = part.AB; + A2 = A1 + mA1*as0; + } + + KOKKOS_INLINE_FUNCTION + void partWithAT(const Partition2x1 &part, const int mA1) { + A0 = part.AT; + A1 = part.AB - mA1*as0; + A2 = part.AB; + } + }; + + template struct Partition2x2; + template struct Partition3x3; + + template + struct Partition2x2 { + const int as0, as1; + ValueType *ATL, *ATR, *ABL, *ABR; + + KOKKOS_INLINE_FUNCTION + Partition2x2(const int arg_as0, const int arg_as1) + : as0(arg_as0), as1(arg_as1), ATL(NULL), ATR(NULL), ABL(NULL), ABR(NULL) {} + + KOKKOS_INLINE_FUNCTION + void partWithATL(ValueType *A, + const int mA, const int nA, + const int mATL, const int nATL) { + ATL = A; ATR = ATL+nATL*as1; + ABL = ATL+mATL*as0; ABR = ABL+nATL*as1; + } + + KOKKOS_INLINE_FUNCTION + void partWithABR(ValueType *A, + const int mA, const int nA, + const int mABR, const int nABR) { + partWithATL(A, mA, nA, mA-mABR, nA-nABR); + } + + // A00 A01 + // A10 A11 is merged into ATL + KOKKOS_INLINE_FUNCTION + void mergeToATL(const Partition3x3 &part) { + ATL = part.A00; ATR = part.A02; + ABL = part.A20; ABR = part.A22; + } + + KOKKOS_INLINE_FUNCTION + void mergeToABR(const Partition3x3 &part) { + ATL = part.A00; ATR = part.A01; + ABL = part.A10; ABR = part.A11; + } + }; + + template + struct Partition3x3 { + const int as0, as1; + ValueType *A00, *A01, *A02, + /* */ *A10, *A11, *A12, + /* */ *A20, *A21, *A22; + + + KOKKOS_INLINE_FUNCTION + Partition3x3(const int arg_as0, const int arg_as1) + : as0(arg_as0), as1(arg_as1), + A00(NULL), A01(NULL), A02(NULL), + A10(NULL), A11(NULL), A12(NULL), + A20(NULL), A21(NULL), A22(NULL) {} + + KOKKOS_INLINE_FUNCTION + void partWithABR(const Partition2x2 &part, const int mA11, const int nA11) { + A00 = part.ATL; A01 = part.ATR; A02 = part.ATR + nA11*as1; + A10 = part.ABL; A11 = part.ABR; A12 = part.ABR + nA11*as1; + A20 = part.ABL + mA11*as0; A21 = part.ABR + mA11*as0; A22 = part.ABR + mA11*as0 + nA11*as1; + } + + KOKKOS_INLINE_FUNCTION + void partWithATL(const Partition2x2 &part, const int mA11, const int nA11) { + A00 = part.ATL; A01 = part.ATR - nA11*as1; A02 = part.ATR; + A10 = part.ABL - mA11*as0; A11 = part.ABR - mA11*as0 - nA11*as1; A12 = part.ABR - mA11*as0; + A20 = part.ABL; A21 = part.ABR - nA11*as1; A22 = part.ABR; + } + }; + + } } diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Vector_SIMD_Misc.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Vector_SIMD_Misc.hpp index ca11d303919f..bebd29b62753 100644 --- a/packages/kokkos-kernels/src/batched/KokkosBatched_Vector_SIMD_Misc.hpp +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Vector_SIMD_Misc.hpp @@ -164,7 +164,8 @@ namespace KokkosBatched { T min(const Vector,l> &val) { return reduce(val, [](const T left, const T right) -> T { - return min(left,right); + const auto tmp = left < right; + return tmp*left + !tmp*right; }); } @@ -174,7 +175,8 @@ namespace KokkosBatched { T max(const Vector,l> &val) { return reduce(val, [](const T left, const T right) -> T { - return max(left,right); + const auto tmp = left > right; + return tmp*left + !tmp*right; }); } diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_WilkinsonShift_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_WilkinsonShift_Serial_Internal.hpp new file mode 100644 index 000000000000..5393580f636a --- /dev/null +++ b/packages/kokkos-kernels/src/batched/KokkosBatched_WilkinsonShift_Serial_Internal.hpp @@ -0,0 +1,63 @@ +#ifndef __KOKKOSBATCHED_WILKINSON_SHIFT_SERIAL_INTERNAL_HPP__ +#define __KOKKOSBATCHED_WILKINSON_SHIFT_SERIAL_INTERNAL_HPP__ + + +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "KokkosBatched_Util.hpp" + + +namespace KokkosBatched { + namespace Experimental { + /// + /// Serial Internal Impl + /// ==================== + /// + /// this impl follows the flame interface of householder transformation + /// + struct SerialWilkinsonShiftInternal { + template + KOKKOS_INLINE_FUNCTION + static int + invoke(const ValueType a, const ValueType b, + const ValueType c, const ValueType d, + /* */ Kokkos::complex * lambda1, + /* */ Kokkos::complex * lambda2, + /* */ bool * is_complex) { + /// compute eigenvalues of 2x2 system [a b; + /// c d] + /// when the system has a real complex values, + /// lambda1 and lambda2 are real eigenvalues + /// if the system has a complex eigenvalue pair, + /// then lambda1 and lambda2 are redefined as follows + /// lambda1 := lambda1 + lambda2 + /// lambda2 := lambda1 * lambda2 + typedef ValueType value_type; + + const value_type half(0.5); + const value_type p = (a+d)*half; + const value_type q = (b*c-a*d); + const value_type v = p*p+q; + + if (v < 0) { + // complex + const value_type sqrt_v = Kokkos::Details::ArithTraits::sqrt(-v); + *lambda1 = Kokkos::complex(p, sqrt_v); + *lambda2 = Kokkos::complex(p,-sqrt_v); + *is_complex = true; + } else { + // real + const value_type sqrt_v = Kokkos::Details::ArithTraits::sqrt(v); + *lambda1 = Kokkos::complex(p+sqrt_v); + *lambda2 = Kokkos::complex(p-sqrt_v); + *is_complex = false; + } + return 0; + } + }; + + }// end namespace Experimental +} // end namespace KokkosBatched + + +#endif