diff --git a/cmake/std/PullRequestLinuxCommonTestingSettings.cmake b/cmake/std/PullRequestLinuxCommonTestingSettings.cmake
index 877e47b7a868..ce31514945d6 100644
--- a/cmake/std/PullRequestLinuxCommonTestingSettings.cmake
+++ b/cmake/std/PullRequestLinuxCommonTestingSettings.cmake
@@ -20,8 +20,6 @@ set (Trilinos_ENABLE_TESTS ON CACHE BOOL "Set by default for PR testing")
 set (Trilinos_TEST_CATEGORIES BASIC CACHE STRING "Set by default for PR testing")
 set (Trilinos_ENABLE_CONFIGURE_TIMING ON CACHE BOOL "Set by default for PR testing")
 set (Trilinos_ENABLE_ALL_FORWARD_DEP_PACKAGES ON CACHE BOOL "Set by default for PR testing")
-set (Trilinos_CTEST_USE_NEW_AAO_FEATURES ON CACHE BOOL "Set by default for PR testing")
-
 
 # Options from cmake/std/MpiReleaseDebugSharedPtSettings.cmake
 
diff --git a/cmake/std/PullRequestLinuxCuda9.2TestingSettings.cmake b/cmake/std/PullRequestLinuxCuda9.2TestingSettings.cmake
index aa340b1bc06e..970c38ba05b0 100644
--- a/cmake/std/PullRequestLinuxCuda9.2TestingSettings.cmake
+++ b/cmake/std/PullRequestLinuxCuda9.2TestingSettings.cmake
@@ -13,34 +13,66 @@ set (CTEST_USE_LAUNCHERS ON CACHE BOOL "Set by default for PR testing")
 
 # Options necessary for CUDA build
 set (TPL_ENABLE_MPI ON CACHE BOOL "Set by default for CUDA PR testing")
-set (TPL_ENABLE_CUDA ON CACHE BOOL "Set by default for CUDA PR testing")
 set (Kokkos_ENABLE_Cuda ON CACHE BOOL "Set by default for CUDA PR testing")
 set (Kokkos_ENABLE_Cuda_UVM ON CACHE BOOL "Set by default for CUDA PR testing")
-set (KOKKOS_ARCH Power8 CACHE STRING "Set by default for CUDA PR testing")
+set (KOKKOS_ARCH Power8,Kepler37 CACHE STRING "Set by default for CUDA PR testing")
+set (Kokkos_ENABLE_Cuda_Relocatable_Device_Code OFF CACHE BOOL "Set by default for CUDA PR testing")
+set (Sacado_ENABLE_HIERARCHICAL_DFAD ON CACHE BOOL "Set by default for CUDA PR testing")
+set (Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA ON CACHE BOOL "Set by default for CUDA PR testing")
+set (Kokkos_ENABLE_Cuda_Lambda ON CACHE BOOL "Set by default for CUDA PR testing")
+set (Phalanx_KOKKOS_DEVICE_TYPE CUDA CACHE STRING "Set by default for CUDA PR testing")
+set (MPI_EXEC_POST_NUMPROCS_FLAGS "-map-by;socket:PE=4" CACHE STRING "Set by default for CUDA PR testing")
 
 # Options set to match the ATDM build
 set (Trilinos_ENABLE_DEBUG OFF CACHE BOOL "Set by default for CUDA PR testing")
 set (Trilinos_ENABLE_DEBUG_SYMBOLS OFF CACHE BOOL "Set by default for CUDA PR testing")
-
-# TPL settings specific to CUDA build
-set (TPL_BLAS_LIBRARIES "-L${BLAS_ROOT}/lib -lblas -lgfortran -lgomp -lm" CACHE STRING "Set by default for CUDA PR testing")
-set (TPL_LAPACK_LIBRARIES "-L${LAPACK_ROOT}/lib -llapack -lgfortran -lgomp" CACHE STRING "Set by default for CUDA PR testing")
 set (BUILD_SHARED_LIBS OFF CACHE BOOL "Set by default for CUDA PR testing")
 set (Tpetra_INST_SERIAL ON CACHE BOOL "Set by default for CUDA PR testing")
 set (Trilinos_ENABLE_SECONDARY_TESTED_CODE OFF CACHE BOOL "Set by default for CUDA PR testing")
-set (TPL_ENABLE_Scotch OFF CACHE BOOL "Set by default for CUDA PR testing")
-# Parmetis is available on ride and could be enabled for the CUDA PR build
+set (EpetraExt_ENABLE_HDF5 OFF CACHE BOOL "Set by default for CUDA PR testing")
+set (Panzer_ENABLE_FADTYPE "Sacado::Fad::DFad<RealType>" CACHE STRING "Set by default for CUDA PR testing")
+set (Kokkos_ENABLE_Debug_Bounds_Check ON CACHE BOOL "Set by default for CUDA PR testing")
+set (KOKKOS_ENABLE_DEBUG ON CACHE BOOL "Set by default for CUDA PR testing")
+
+# TPL settings specific to CUDA build
+set (TPL_ENABLE_CUDA ON CACHE BOOL "Set by default for CUDA PR testing")
+set (TPL_ENABLE_CUSPARSE ON CACHE BOOL "Set by default for CUDA PR testing")
+set (TPL_ENABLE_Pthread OFF CACHE BOOL "Set by default for CUDA PR testing")
+set (TPL_ENABLE_BinUtils OFF CACHE BOOL "Set by default for CUDA PR testing")
+set (TPL_ENABLE_BLAS ON CACHE BOOL "Set by default for CUDA PR testing")
+set (TPL_BLAS_LIBRARIES "-L$ENV{BLAS_ROOT}/lib;-lblas;-lgfortran;-lgomp;-lm" CACHE STRING "Set by default for CUDA PR testing")
+set (TPL_ENABLE_Boost ON CACHE BOOL "Set by default for CUDA PR testing")
+set (Boost_INCLUDE_DIRS "$ENV{BOOST_ROOT}/include" CACHE FILEPATH "Set by default for CUDA PR testing")
+set (TPL_Boost_LIBRARIES "$ENV{BOOST_ROOT}/lib/libboost_program_options.a;$ENV{BOOST_ROOT}/lib/libboost_system.a" CACHE FILEPATH  "Set by default for CUDA PR testing")
+set (TPL_ENABLE_BoostLib ON CACHE BOOL "Set by default for CUDA PR testing")
+set (BoostLib_INCLUDE_DIRS "$ENV{BOOST_ROOT}/include" CACHE FILEPATH "Set by default for CUDA PR testing")
+set (TPL_BoostLib_LIBRARIES "$ENV{BOOST_ROOT}/lib/libboost_program_options.a;$ENV{BOOST_ROOT}/lib/libboost_system.a" CACHE FILEPATH "Set by default for CUDA PR testing")
+set (TPL_ENABLE_METIS OFF CACHE BOOL "Set by default for CUDA PR testing")
 set (TPL_ENABLE_ParMETIS OFF CACHE BOOL "Set by default for CUDA PR testing")
-set (TPL_Netcdf_LIBRARIES "-L${BOOST_ROOT}/lib;-L${NETCDF_ROOT}/lib;-L${NETCDF_ROOT}/lib;-L${PNETCDF_ROOT}/lib;-L${HDF5_ROOT}/lib;${BOOST_ROOT}/lib/libboost_program_options.a;${BOOST_ROOT}/lib/libboost_system.a;${NETCDF_ROOT}/lib/libnetcdf.a;${PNETCDF_ROOT}/lib/libpnetcdf.a;${HDF5_ROOT}/lib/libhdf5_hl.a;${HDF5_ROOT}/lib/libhdf5.a;-lz;-ldl" CACHE STRING "Set by default for CUDA PR testing")
-# SuperLU is available on ride and could be enabled for the CUDA PR build
+set (TPL_ENABLE_Scotch OFF CACHE BOOL "Set by default for CUDA PR testing")
+set (TPL_ENABLE_HWLOC OFF CACHE BOOL "Set by default for CUDA PR testing")
+set (TPL_ENABLE_LAPACK ON CACHE BOOL "Set by default for CUDA PR testing")
+set (TPL_LAPACK_LIBRARIES "-L$ENV{LAPACK_ROOT}/lib;-llapack;-lgfortran;-lgomp" CACHE STRING "Set by default for CUDA PR testing")
+set (TPL_ENABLE_Zlib OFF CACHE BOOL "Set by default for CUDA PR testing")
+set (TPL_ENABLE_CGNS OFF CACHE BOOL "Set by default for CUDA PR testing")
+set (TPL_ENABLE_HDF5 ON CACHE BOOL "Set by default for CUDA PR testing")
+set (HDF5_INCLUDE_DIRS "$ENV{HDF5_ROOT}/include" CACHE FILEPATH "Set by default for CUDA PR testing")
+set (TPL_HDF5_LIBRARIES "-L$ENV{HDF5_ROOT}/lib;$ENV{HDF5_ROOT}/lib/libhdf5_hl.a;$ENV{HDF5_ROOT}/lib/libhdf5.a;-lz;-ldl" CACHE FILEPATH "Set by default for CUDA PR testing")
+set (TPL_ENABLE_Netcdf ON CACHE BOOL "Set by default for CUDA PR testing")
+set (Netcdf_INCLUDE_DIRS "$ENV{NETCDF_ROOT}/include" CACHE FILEPATH "Set by default for CUDA PR testing")
+set (TPL_Netcdf_LIBRARIES "-L$ENV{BOOST_ROOT}/lib;-L$ENV{NETCDF_ROOT}/lib;-L$ENV{NETCDF_ROOT}/lib;-L$ENV{PNETCDF_ROOT}/lib;-L$ENV{HDF5_ROOT}/lib;$ENV{BOOST_ROOT}/lib/libboost_program_options.a;$ENV{BOOST_ROOT}/lib/libboost_system.a;$ENV{NETCDF_ROOT}/lib/libnetcdf.a;$ENV{PNETCDF_ROOT}/lib/libpnetcdf.a;$ENV{HDF5_ROOT}/lib/libhdf5_hl.a;$ENV{HDF5_ROOT}/lib/libhdf5.a;-lz;-ldl" CACHE STRING "Set by default for CUDA PR testing")
+# SuperLU and SuperLUDist is available on ride and could be enabled for the CUDA PR build
 set (TPL_ENABLE_SuperLU OFF CACHE BOOL "Set by default for CUDA PR testing")
+set (TPL_ENABLE_SuperLUDist OFF CACHE BOOL "Set by default for CUDA PR testing")
 set (TPL_ENABLE_BoostLib OFF CACHE BOOL "Set by default for CUDA PR testing")
-set (Trilinos_ENABLE_Moertel OFF CACHE BOOL "Disable for CUDA PR testing")
-set (Trilinos_ENABLE_Komplex OFF CACHE BOOL "Disable for CUDA PR testing")
+set (TPL_ENABLE_Matio OFF CACHE BOOL "Set by default for CUDA PR testing")
+set (TPL_DLlib_LIBRARIES "-ldl" CACHE FILEPATH "Set by default for CUDA PR testing")
 
 # Temporary options to clean up build
-set (Trilinos_ENABLE_SEACAS OFF CACHE BOOL "Temporary disable for CUDA PR testing")
-set (STK_ENABLE_TESTS OFF CACHE BOOL "Temporary disable for CUDA PR testing")
+set (Teko_ModALPreconditioner_MPI_1_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
+set (MueLu_ParameterListInterpreterTpetra_MPI_1_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
+set (MueLu_ParameterListInterpreterTpetraHeavy_MPI_1_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
+set (STKUnit_tests_stk_ngp_test_utest_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
 set (ROL_example_PDE-OPT_0ld_adv-diff-react_example_01_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
 set (ROL_example_PDE-OPT_0ld_adv-diff-react_example_02_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
 set (ROL_example_PDE-OPT_0ld_poisson_example_01_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
@@ -53,15 +85,20 @@ set (ROL_example_PDE-OPT_obstacle_example_01_MPI_4_DISABLE ON CACHE BOOL "Tempor
 set (ROL_example_PDE-OPT_topo-opt_poisson_example_01_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
 set (ROL_test_elementwise_TpetraMultiVector_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
 set (ROL_NonlinearProblemTest_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
-set (TpetraCore_Core_initialize_where_tpetra_initializes_kokkos_MPI_1_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
-set (TpetraCore_Core_initialize_where_tpetra_initializes_mpi_and_user_initializes_kokkos_MPI_2_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
-set (TpetraCore_Core_initialize_where_user_initializes_kokkos_MPI_1_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
-set (TpetraCore_Core_initialize_where_user_initializes_mpi_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
-set (TpetraCore_Core_ScopeGuard_where_tpetra_initializes_kokkos_MPI_1_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
-set (TpetraCore_Core_ScopeGuard_where_tpetra_initializes_mpi_and_user_initializes_kokkos_MPI_2_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
-set (TpetraCore_Core_ScopeGuard_where_user_initializes_kokkos_MPI_1_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
-set (TpetraCore_Core_ScopeGuard_where_user_initializes_mpi_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
+set (PanzerAdaptersSTK_CurlLaplacianExample-ConvTest-Quad-Order-4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
+set (PanzerAdaptersSTK_MixedPoissonExample-ConvTest-Hex-Order-3_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
 set (TrilinosCouplings_Example_Maxwell_MueLu_MPI_1_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
 set (TrilinosCouplings_Example_Maxwell_MueLu_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
 
+# Disable some tests that should not need to be disabled but do because the
+# Trilinos PR tester is using too high a parallel level for ctest. (If the
+# ctest parallel test level is dropped from 29 to 8, all of these will pass.)
+set (MueLu_UnitTestsIntrepid2Tpetra_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
+set (PanzerAdaptersSTK_main_driver_energy-ss-blocked-tp_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
+set (PanzerAdaptersSTK_MixedCurlLaplacianExample-ConvTest-Tri-Order-1_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
+set (PanzerAdaptersSTK_PoissonInterfaceExample_3d_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
+set (PanzerMiniEM_MiniEM-BlockPrec_Augmentation_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
+set (PanzerMiniEM_MiniEM-BlockPrec_RefMaxwell_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
+set (ROL_example_PDE-OPT_poisson-boltzmann_example_01_MPI_4_DISABLE ON CACHE BOOL "Temporary disable for CUDA PR testing")
+
 include("${CMAKE_CURRENT_LIST_DIR}/PullRequestLinuxCommonTestingSettings.cmake")
diff --git a/cmake/std/sems/PullRequestCuda9.2TestingEnv.sh b/cmake/std/sems/PullRequestCuda9.2TestingEnv.sh
index 0cde8caf18de..c5532defeab7 100644
--- a/cmake/std/sems/PullRequestCuda9.2TestingEnv.sh
+++ b/cmake/std/sems/PullRequestCuda9.2TestingEnv.sh
@@ -3,8 +3,6 @@
 
 # usage: $ source PullRequestCUDA9.2TestingEnv.sh
 
-#No SEMS NFS mount on ride
-#source /projects/sems/modulefiles/utils/sems-modules-init.sh
 module load git/2.10.1
 module load devpack/20180521/openmpi/2.1.2/gcc/7.2.0/cuda/9.2.88
 module swap openblas/0.2.20/gcc/7.2.0 netlib/3.8.0/gcc/7.2.0
@@ -12,13 +10,9 @@ module swap openblas/0.2.20/gcc/7.2.0 netlib/3.8.0/gcc/7.2.0
 export OMPI_CXX=$WORKSPACE/Trilinos/packages/kokkos/bin/nvcc_wrapper
 export OMPI_CC=`which gcc`
 export OMPI_FC=`which gfortran`
-export NVCC_WRAPPER_DEFAULT_COMPILER=`which g++`
 export CUDA_LAUNCH_BLOCKING=1
+export CUDA_MANAGED_FORCE_DEVICE_ALLOC=1
 
 # Use manually installed cmake and ninja to try to avoid module loading
 # problems (see TRIL-208)
 export PATH=/ascldap/users/rabartl/install/white-ride/cmake-3.11.2/bin:/ascldap/users/rabartl/install/white-ride/ninja-1.8.2/bin:$PATH
-
-# add the OpenMP environment variable we need
-export OMP_NUM_THREADS=2
-
diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp
index def34c796548..99b79c2e605f 100644
--- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp
+++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp
@@ -55,6 +55,8 @@
 #include <KokkosBatched_Copy_Impl.hpp>
 #include <KokkosBatched_AddRadial_Decl.hpp>
 #include <KokkosBatched_AddRadial_Impl.hpp>
+#include <KokkosBatched_SetIdentity_Decl.hpp>
+#include <KokkosBatched_SetIdentity_Impl.hpp>
 #include <KokkosBatched_Gemm_Decl.hpp>
 #include <KokkosBatched_Gemm_Serial_Impl.hpp>
 #include <KokkosBatched_Gemm_Team_Impl.hpp>
@@ -74,8 +76,8 @@
 #include <memory>
 
 // need to interface this into cmake variable (or only use this flag when it is necessary)
-//#define IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
-#undef  IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
+#define IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
+//#undef  IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
 #include "cuda_profiler_api.h"
 #endif
@@ -107,6 +109,12 @@ namespace Ifpack2 {
     template <typename ViewType>
     using ConstUnmanaged = Const<Unmanaged<ViewType> >;    
 
+    template <typename ViewType>
+    using Scratch = Kokkos::View<typename ViewType::data_type,
+                                 typename ViewType::array_layout,
+                                 typename ViewType::execution_space::scratch_memory_space,
+                                 MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged> >;
+
     ///
     /// cuda specialization
     ///
@@ -266,7 +274,9 @@ namespace Ifpack2 {
       // packed data always use layout right
       typedef Kokkos::View<vector_type*,device_type> vector_type_1d_view;
       typedef Kokkos::View<vector_type***,Kokkos::LayoutRight,device_type> vector_type_3d_view;
+      typedef Kokkos::View<internal_vector_type***,Kokkos::LayoutRight,device_type> internal_vector_type_3d_view;
       typedef Kokkos::View<internal_vector_type****,Kokkos::LayoutRight,device_type> internal_vector_type_4d_view;
+      typedef Kokkos::View<impl_scalar_type***,Kokkos::LayoutRight,device_type> impl_scalar_type_3d_view;
       typedef Kokkos::View<impl_scalar_type****,Kokkos::LayoutRight,device_type> impl_scalar_type_4d_view;
     };
     
@@ -771,6 +781,8 @@ namespace Ifpack2 {
       // space. But it's easy to detect, so it's recorded in case an optimization
       // can be made based on it.
       bool row_contiguous;
+
+      local_ordinal_type max_partsz;
     };
 
     ///
@@ -808,7 +820,9 @@ namespace Ifpack2 {
       
       // permutation vector
       std::vector<local_ordinal_type> p;
-      if (!jacobi) {
+      if (jacobi) {
+	interf.max_partsz = 1;
+      } else {
         // reorder parts to maximize simd packing efficiency
         p.resize(nparts);
         
@@ -822,6 +836,8 @@ namespace Ifpack2 {
                   });
         for (local_ordinal_type i=0;i<nparts;++i)
           p[i] = partsz[i].second;
+
+	interf.max_partsz = partsz[0].first;
       }
 
       // allocate parts
@@ -1432,6 +1448,19 @@ namespace Ifpack2 {
     };
     
 #if defined(KOKKOS_ENABLE_CUDA) 
+    static inline int ExtractAndFactorizeRecommendedCudaTeamSize(const int blksize,
+                                                                 const int vector_length,
+                                                                 const int internal_vector_length) {      
+      const int vector_size = vector_length/internal_vector_length;
+      int total_team_size(0);
+      if      (blksize <=  5) total_team_size =  32;
+      else if (blksize <=  9) total_team_size =  64;
+      else if (blksize <= 12) total_team_size =  96;
+      else if (blksize <= 16) total_team_size = 128;
+      else if (blksize <= 20) total_team_size = 160;
+      else                    total_team_size = 160;
+      return 2*total_team_size/vector_size; 
+    }
     template<>
     struct ExtractAndFactorizeTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
       typedef KokkosBatched::Experimental::Mode::Team mode_type;
@@ -1439,10 +1468,7 @@ namespace Ifpack2 {
       static int recommended_team_size(const int blksize, 
     				       const int vector_length, 
     				       const int internal_vector_length) { 
-    	const int vector_size = vector_length/internal_vector_length;
-    	const int total_team_size = blksize <= 8 ? 32 : blksize <= 16 ? 64 : 128;
-	// to match the old code behavior when internal vector length is 1
-	return internal_vector_length == 1 ? 32/vector_size : total_team_size/vector_size; 
+	return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
       }
     };
     template<>
@@ -1452,10 +1478,7 @@ namespace Ifpack2 {
       static int recommended_team_size(const int blksize, 
     				       const int vector_length, 
     				       const int internal_vector_length) { 
-    	const int vector_size = vector_length/internal_vector_length;
-    	const int total_team_size = blksize <= 8 ? 32 : blksize <= 16 ? 64 : 128;
-	// to match the old code behavior when internal vector length is 1
-	return internal_vector_length == 1 ? 32/vector_size : total_team_size/vector_size; 
+	return ExtractAndFactorizeRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
       }
     };
 #endif
@@ -1482,6 +1505,8 @@ namespace Ifpack2 {
       using vector_type_3d_view = typename impl_type::vector_type_3d_view;
       using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
       using impl_scalar_type_4d_view = typename impl_type::impl_scalar_type_4d_view;
+      using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
+      using impl_scalar_scratch_type_3d_view = Scratch<typename impl_type::impl_scalar_type_3d_view>;
 
       using internal_vector_type = typename impl_type::internal_vector_type;
       static constexpr int vector_length = impl_type::vector_length;
@@ -1494,6 +1519,7 @@ namespace Ifpack2 {
     private:
       // part interface
       const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr;
+      const local_ordinal_type max_partsz;
       // block crs matrix (it could be Kokkos::UVMSpace::size_type, which is int)
       using a_rowptr_value_type = typename Kokkos::ViewTraits<local_ordinal_type*,typename impl_type::device_type>::size_type; 
       const ConstUnmanaged<Kokkos::View<a_rowptr_value_type*,typename impl_type::device_type> > A_rowptr;
@@ -1514,11 +1540,12 @@ namespace Ifpack2 {
       ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_, 
                                   const PartInterface<MatrixType> &interf_,
                                   const Teuchos::RCP<const block_crs_matrix_type> &A_,
-                                  const magnitude_type& tiny_) : 
+                                  const magnitude_type& tiny_) :
         // interface
         partptr(interf_.partptr), 
         lclrow(interf_.lclrow), 
         packptr(interf_.packptr),
+	max_partsz(interf_.max_partsz),
         // block crs matrix
         A_rowptr(A_->getCrsGraph().getLocalGraph().row_map), 
         A_values(const_cast<block_crs_matrix_type*>(A_.get())->template getValues<memory_space>()),
@@ -1547,10 +1574,8 @@ namespace Ifpack2 {
 
       KOKKOS_INLINE_FUNCTION 
       void 
-      extract(const local_ordinal_type &packidx) const {
-        local_ordinal_type partidx = packptr(packidx);
-        local_ordinal_type npacks = packptr(packidx+1) - partidx;
-
+      extract(local_ordinal_type partidx, 
+	      local_ordinal_type npacks) const {
         const size_type kps = pack_td_ptr(partidx);
         local_ordinal_type kfs[vector_length] = {};
         local_ordinal_type ri0[vector_length] = {};
@@ -1594,37 +1619,56 @@ namespace Ifpack2 {
       KOKKOS_INLINE_FUNCTION 
       void 
       extract(const member_type &member, 
-              const local_ordinal_type &partidx,
-	      const local_ordinal_type &v) const {
-        const size_type kps = pack_td_ptr(partidx);
-        const local_ordinal_type kfs = flat_td_ptr(partidx);
-        const local_ordinal_type ri0 = partptr(partidx);
-        const local_ordinal_type nrows = partptr(partidx+1) - ri0;
-        for (local_ordinal_type tr=0,j=0;tr<nrows;++tr) {          
-          local_ordinal_type lbeg = (tr == 0         ? 1 : 0);
-          local_ordinal_type lend = (tr == nrows - 1 ? 2 : 3);
-          for (local_ordinal_type l=lbeg;l<lend;++l,++j) { // l == 1, diagonal
-            const size_type Aj = A_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
-            const impl_scalar_type* block = &A_values(Aj*blocksize_square);
-            const size_type pi = kps + j;
-            Kokkos::parallel_for
-              (Kokkos::TeamThreadRange(member,blocksize_square), [&](const local_ordinal_type &ij) {
-                const local_ordinal_type ii = ij/blocksize;
-                const local_ordinal_type jj = ij%blocksize;
-                scalar_values(pi, ii, jj, v) = block[ii*blocksize + jj];
-              });
+              const local_ordinal_type &partidxbeg,
+	      const local_ordinal_type &npacks,
+	      const local_ordinal_type &vbeg) const {
+        local_ordinal_type kfs_vals[internal_vector_length] = {};
+        local_ordinal_type ri0_vals[internal_vector_length] = {};
+        local_ordinal_type nrows_vals[internal_vector_length] = {};
+
+        const size_type kps = pack_td_ptr(partidxbeg);
+        for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
+          kfs_vals[vi] = flat_td_ptr(partidxbeg+vi);
+          ri0_vals[vi] = partptr(partidxbeg+vi);
+          nrows_vals[vi] = partptr(partidxbeg+vi+1) - ri0_vals[vi];
+        }
+
+        local_ordinal_type j_vals[internal_vector_length] = {};	
+	for (local_ordinal_type tr=0;tr<nrows_vals[0];++tr) {          
+	  for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {	
+	    const local_ordinal_type nrows = nrows_vals[vi];
+	    if (tr < nrows) {
+	      auto &j = j_vals[vi];
+	      const local_ordinal_type kfs = kfs_vals[vi];
+	      const local_ordinal_type ri0 = ri0_vals[vi];
+	      const local_ordinal_type lbeg = (tr == 0         ? 1 : 0);
+	      const local_ordinal_type lend = (tr == nrows - 1 ? 2 : 3);
+	      for (local_ordinal_type l=lbeg;l<lend;++l,++j) {
+		const size_type Aj = A_rowptr(lclrow(ri0 + tr)) + A_colindsub(kfs + j);
+		const impl_scalar_type* block = &A_values(Aj*blocksize_square);
+		const size_type pi = kps + j;
+		Kokkos::parallel_for
+		  (Kokkos::TeamThreadRange(member,blocksize), 
+		   [&](const local_ordinal_type &ii) {
+		    for (local_ordinal_type jj=0;jj<blocksize;++jj)
+		      scalar_values(pi, ii, jj, v) = block[ii*blocksize + jj];
+		  });
+	      }
+	    }
           }
         }
       }
 
-      template<typename AAViewType>
+      template<typename AAViewType,
+	       typename WWViewType>
       KOKKOS_INLINE_FUNCTION 
       void 
       factorize(const member_type &member, 
 		const local_ordinal_type &i0,
 		const local_ordinal_type &nrows,
                 const local_ordinal_type &v,
-		const AAViewType &AA) const {
+		const AAViewType &AA,
+		const WWViewType &WW) const {
         namespace KB = KokkosBatched::Experimental;
 
         typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
@@ -1636,9 +1680,7 @@ namespace Ifpack2 {
         const auto one = Kokkos::ArithTraits<magnitude_type>::one();
 
         // subview pattern
-        auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), 0);
-	  //Kokkos::subview(internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), 0);
-        A.assign_data( &AA(i0,0,0,v) );
+        auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
         KB::LU<member_type,
                default_mode_type,KB::Algo::LU::Unblocked>
           ::invoke(member, A , tiny);
@@ -1666,17 +1708,32 @@ namespace Ifpack2 {
                    default_mode_type,KB::Algo::LU::Unblocked>
               ::invoke(member, A, tiny);
           }
-        }
+        } else {
+	  // for block jacobi invert a matrix here
+	  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
+	  KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
+            ::invoke(member, A, W);
+	  KB::SetIdentity<member_type,default_mode_type>
+            ::invoke(member, A);
+          member.team_barrier();
+	  KB::Trsm<member_type,
+		   KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
+		   default_mode_type,default_algo_type>
+            ::invoke(member, 1.0, W, A);
+	  KB::Trsm<member_type,
+		   KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
+		   default_mode_type,default_algo_type>
+            ::invoke(member, 1.0, W, A);
+	}
       }
 
     public:
 
-      struct ExtractAndFactorizeScalarTag {};
-      struct ExtractAndFactorizeInternalVectorTag {};
+      struct ExtractAndFactorizeTag {};
 
       KOKKOS_INLINE_FUNCTION 
       void 
-      operator() (const ExtractAndFactorizeScalarTag &, const member_type &member) const {
+      operator() (const ExtractAndFactorizeTag &, const member_type &member) const {
 	// btdm is packed and sorted from largest one 
 	const local_ordinal_type packidx = member.league_rank();
 
@@ -1684,39 +1741,18 @@ namespace Ifpack2 {
 	const local_ordinal_type npacks = packptr(packidx+1) - partidx;
 	const local_ordinal_type i0 = pack_td_ptr(partidx);
 	const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
-        if (vector_length_value == 1) {
-          extract(packidx);
-          factorize(member, i0, nrows, 0, scalar_values);
-        } else {
-          Kokkos::parallel_for
-            (Kokkos::ThreadVectorRange(member, vector_length_value), [&](const local_ordinal_type &v) {
-              if (v < npacks) extract(member, partidx+v, v);
-              factorize(member, i0, nrows, v, scalar_values);
-            });
-        }
-      }
 
-      KOKKOS_INLINE_FUNCTION 
-      void 
-      operator() (const ExtractAndFactorizeInternalVectorTag &, const member_type &member) const {
-	// btdm is packed and sorted from largest one 
-	const local_ordinal_type packidx = member.league_rank();
-
-	const local_ordinal_type partidx = packptr(packidx);
-	const local_ordinal_type npacks = packptr(packidx+1) - partidx;
-	const local_ordinal_type i0 = pack_td_ptr(partidx);
-	const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
+	internal_vector_scratch_type_3d_view
+	  WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
         if (vector_loop_size == 1) {
-          extract(packidx);
-          factorize(member, i0, nrows, 0, internal_vector_values);
+          extract(partidx, npacks);
+          factorize(member, i0, nrows, 0, internal_vector_values, WW);
         } else {          
           Kokkos::parallel_for
             (Kokkos::ThreadVectorRange(member, vector_loop_size), [&](const local_ordinal_type &v) {
               const local_ordinal_type vbeg = v*internal_vector_length;
-              const local_ordinal_type vend = vbeg + internal_vector_length;
-              for (local_ordinal_type vv=vbeg;vv<vend;++vv)
-                if (vv < npacks) extract(member, partidx+vv, vv);
-              factorize(member, i0, nrows, v, internal_vector_values);
+	      extract(member, partidx+vbeg, npacks, vbeg);
+              factorize(member, i0, nrows, v, internal_vector_values, WW);
             });
         }
       }
@@ -1725,14 +1761,16 @@ namespace Ifpack2 {
 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
         cudaProfilerStart();
 #endif
-	const local_ordinal_type blocksize = internal_vector_values.extent(1);
-	const local_ordinal_type team_size = ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
+	const local_ordinal_type team_size = 
+	  ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
 	  recommended_team_size(blocksize, vector_length, internal_vector_length);
-	Kokkos::TeamPolicy<execution_space,ExtractAndFactorizeInternalVectorTag>
+	const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
+	  shmem_size(blocksize, blocksize, vector_loop_size);
+
+	Kokkos::TeamPolicy<execution_space,ExtractAndFactorizeTag>
 	  policy(packptr.extent(0)-1, team_size, vector_loop_size); 
-	Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run", 
-			     //policy.set_scratch_size(0,Kokkos::PerTeam(10000)), 
-			     policy, *this);
+	Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeTag>", 
+			     policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this);
 	
 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
         cudaProfilerStop();
@@ -2033,7 +2071,7 @@ namespace Ifpack2 {
       typedef KokkosBatched::Experimental::Mode::Serial mode_type;
       typedef KokkosBatched::Experimental::Algo::Level2::Unblocked single_vector_algo_type;
 #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__) 
-      typedef KokkosBatched::Experimental::Algo::Level3::CompactMKL algo_type;
+      typedef KokkosBatched::Experimental::Algo::Level3::CompactMKL multi_vector_algo_type;
 #else
       typedef KokkosBatched::Experimental::Algo::Level3::Blocked multi_vector_algo_type;
 #endif
@@ -2045,6 +2083,20 @@ namespace Ifpack2 {
     };
     
 #if defined(KOKKOS_ENABLE_CUDA) 
+    static inline int SolveTridiagsRecommendedCudaTeamSize(const int blksize,
+                                                           const int vector_length,
+                                                           const int internal_vector_length) {      
+      const int vector_size = vector_length/internal_vector_length;
+      int total_team_size(0);
+      if      (blksize <=  5) total_team_size =  32;
+      else if (blksize <=  9) total_team_size =  64;
+      else if (blksize <= 12) total_team_size =  96;
+      else if (blksize <= 16) total_team_size = 128;
+      else if (blksize <= 20) total_team_size = 160;
+      else                    total_team_size = 160;
+      return total_team_size/vector_size; 
+    }
+
     template<>
     struct SolveTridiagsDefaultModeAndAlgo<Kokkos::CudaSpace> {
       typedef KokkosBatched::Experimental::Mode::Team mode_type;
@@ -2053,9 +2105,7 @@ namespace Ifpack2 {
       static int recommended_team_size(const int blksize, 
 				       const int vector_length, 
 				       const int internal_vector_length) { 
-	const int vector_size = vector_length/internal_vector_length;
-    	const int total_team_size = blksize <= 8 ? 32 : blksize <= 16 ? 64 : 128;
-	return internal_vector_length == 1 ? 32/vector_size : total_team_size/vector_size; 
+	return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
       }
     };
     template<>
@@ -2066,10 +2116,7 @@ namespace Ifpack2 {
       static int recommended_team_size(const int blksize, 
     				       const int vector_length, 
     				       const int internal_vector_length) { 
-    	const int vector_size = vector_length/internal_vector_length;
-    	const int total_team_size = blksize <= 8 ? 32 : blksize <= 16 ? 64 : 128;
-	// to match the old code behavior when internal vector length is 1
-	return internal_vector_length == 1 ? 32/vector_size : total_team_size/vector_size; 
+	return SolveTridiagsRecommendedCudaTeamSize(blksize, vector_length, internal_vector_length);
       }
     };
 #endif
@@ -2092,6 +2139,8 @@ namespace Ifpack2 {
       using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
       using impl_scalar_type_4d_view = typename impl_type::impl_scalar_type_4d_view;
 
+      using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
+
       using internal_vector_type =typename impl_type::internal_vector_type;
       static constexpr int vector_length = impl_type::vector_length;
       static constexpr int internal_vector_length = impl_type::internal_vector_length;
@@ -2142,6 +2191,7 @@ namespace Ifpack2 {
       ///
       /// cuda team vectorization
       ///
+      template<typename WWViewType>
       KOKKOS_INLINE_FUNCTION 
       void 
       solveSingleVector(const member_type &member, 
@@ -2149,7 +2199,8 @@ namespace Ifpack2 {
                         const local_ordinal_type &i0,
                         const local_ordinal_type &r0,
                         const local_ordinal_type &nrows,
-                        const local_ordinal_type &v) const {
+                        const local_ordinal_type &v,
+			const WWViewType &WW) const {
         namespace KB = KokkosBatched::Experimental;
         typedef SolveTridiagsDefaultModeAndAlgo
           <Kokkos::Impl::ActiveExecutionMemorySpace> default_mode_and_algo_type;
@@ -2162,6 +2213,7 @@ namespace Ifpack2 {
 
         // constant
         const auto one = Kokkos::ArithTraits<magnitude_type>::one();
+	const auto zero = Kokkos::ArithTraits<magnitude_type>::zero();
         //const local_ordinal_type num_vectors = X_scalar_values.extent(2);
 
         // const local_ordinal_type blocksize = D_scalar_values.extent(1);
@@ -2180,7 +2232,7 @@ namespace Ifpack2 {
         X += r0*xstep + v;
         
         //for (local_ordinal_type col=0;col<num_vectors;++col) 
-        {
+        if (nrows > 1) {
           // solve Lx = x
           KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
             (default_mode_type,default_algo_type, 
@@ -2250,9 +2302,25 @@ namespace Ifpack2 {
           }
           // for multiple rhs
           //X += xs1;
-        }
+	} else {
+	  const local_ordinal_type ws0 = WW.stride_0();
+	  auto W = WW.data() + v;
+          KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
+            (default_mode_type,
+             member, blocksize, X, xs0, W, ws0);
+          KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
+            (default_mode_type,default_algo_type,
+             member,
+             blocksize, blocksize,
+             one,
+             A, as0, as1,
+             W, xs0,
+	     zero,
+             X, xs0);
+	}
       }
-
+      
+      template<typename WWViewType>
       KOKKOS_INLINE_FUNCTION 
       void 
       solveMultiVector(const member_type &member, 
@@ -2260,7 +2328,8 @@ namespace Ifpack2 {
                        const local_ordinal_type &i0,
                        const local_ordinal_type &r0,
                        const local_ordinal_type &nrows,
-                       const local_ordinal_type &v) const {
+                       const local_ordinal_type &v,
+		       const WWViewType &WW) const {
         namespace KB = KokkosBatched::Experimental;
         typedef SolveTridiagsDefaultModeAndAlgo
           <Kokkos::Impl::ActiveExecutionMemorySpace> default_mode_and_algo_type;
@@ -2269,56 +2338,67 @@ namespace Ifpack2 {
         
         // constant
         const auto one = Kokkos::ArithTraits<magnitude_type>::one();
+	const auto zero = Kokkos::ArithTraits<magnitude_type>::zero();
 
         // subview pattern
-        auto A  = Kokkos::subview(D_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), 0);
-        auto X1 = Kokkos::subview(X_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), 0);
+        auto A  = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
+        auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
         auto X2 = X1;
 
         local_ordinal_type i = i0, r = r0;
 
-        // solve Lx = x
-        A.assign_data( &D_internal_vector_values(i,0,0,v) );
-        X1.assign_data( &X_internal_vector_values(r,0,0,v) );
-        KB::Trsm<member_type,
-                 KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
-                 default_mode_type,default_algo_type>
-          ::invoke(member, one, A, X1);
-        for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
-          A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
-          X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
-          KB::Gemm<member_type,
-                   KB::Trans::NoTranspose,KB::Trans::NoTranspose,
-                   default_mode_type,default_algo_type>
-            ::invoke(member, -one, A, X1, one, X2);
-          A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
-          KB::Trsm<member_type,
-                   KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
-                   default_mode_type,default_algo_type>
-            ::invoke(member, one, A, X2);
-          X1.assign_data( X2.data() );
-        }
-        
-        // solve Ux = x
-        KB::Trsm<member_type,
-                 KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
-                 default_mode_type,default_algo_type>
-          ::invoke(member, one, A, X1);
-        for (local_ordinal_type tr=nrows;tr>1;--tr) {
-          i -= 3;
-          A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
-          X2.assign_data( &X_internal_vector_values(--r,0,0,v) );          
-          KB::Gemm<member_type,
-                   KB::Trans::NoTranspose,KB::Trans::NoTranspose,
-                   default_mode_type,default_algo_type>                   
-            ::invoke(member, -one, A, X1, one, X2); 
-          A.assign_data( &D_internal_vector_values(i,0,0,v) );          
-          KB::Trsm<member_type,
-                   KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
-                   default_mode_type,default_algo_type>
-            ::invoke(member, one, A, X2);
-          X1.assign_data( X2.data() );
-        }
+
+	if (nrows > 1) {
+	  // solve Lx = x
+	  KB::Trsm<member_type,
+		   KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
+		   default_mode_type,default_algo_type>
+	    ::invoke(member, one, A, X1);
+	  for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
+	    A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
+	    X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
+	    KB::Gemm<member_type,
+		     KB::Trans::NoTranspose,KB::Trans::NoTranspose,
+		     default_mode_type,default_algo_type>
+	      ::invoke(member, -one, A, X1, one, X2);
+	    A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
+	    KB::Trsm<member_type,
+		     KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
+		     default_mode_type,default_algo_type>
+	      ::invoke(member, one, A, X2);
+	    X1.assign_data( X2.data() );
+	  }
+	  
+	  // solve Ux = x
+	  KB::Trsm<member_type,
+		   KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
+		   default_mode_type,default_algo_type>
+	    ::invoke(member, one, A, X1);
+	  for (local_ordinal_type tr=nrows;tr>1;--tr) {
+	    i -= 3;
+	    A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
+	    X2.assign_data( &X_internal_vector_values(--r,0,0,v) );          
+	    KB::Gemm<member_type,
+		     KB::Trans::NoTranspose,KB::Trans::NoTranspose,
+		     default_mode_type,default_algo_type>                   
+	      ::invoke(member, -one, A, X1, one, X2); 
+	    A.assign_data( &D_internal_vector_values(i,0,0,v) );          
+	    KB::Trsm<member_type,
+		     KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
+		     default_mode_type,default_algo_type>
+	      ::invoke(member, one, A, X2);
+	    X1.assign_data( X2.data() );
+	  }
+	} else {
+	  // matrix is already inverted
+	  auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
+	  KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
+            ::invoke(member, X1, W);
+	  KB::Gemm<member_type,
+		   KB::Trans::NoTranspose,KB::Trans::NoTranspose,
+		   default_mode_type,default_algo_type>
+            ::invoke(member, one, A, W, zero, X1);
+	}
       }
 
       template<int B> struct SingleVectorTag {};
@@ -2334,9 +2414,12 @@ namespace Ifpack2 {
 	const local_ordinal_type r0 = part2packrowidx0(partidx);
         const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
         const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);      
+
+       internal_vector_scratch_type_3d_view
+	 WW(member.team_scratch(0), blocksize, 1, vector_loop_size);
 	Kokkos::parallel_for
 	  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
-	    solveSingleVector(member, blocksize, i0, r0, nrows, v);
+	    solveSingleVector(member, blocksize, i0, r0, nrows, v, WW);
 	  });
       }      
 
@@ -2350,9 +2433,13 @@ namespace Ifpack2 {
 	const local_ordinal_type r0 = part2packrowidx0(partidx);
         const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
         const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
+	const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
+
+	internal_vector_scratch_type_3d_view
+          WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
 	Kokkos::parallel_for
 	  (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
-	    solveMultiVector(member, blocksize, i0, r0, nrows, v);
+	    solveMultiVector(member, blocksize, i0, r0, nrows, v, WW);
 	  });
       }      
 
@@ -2371,18 +2458,22 @@ namespace Ifpack2 {
 	const local_ordinal_type team_size = 
 	  SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
 	  recommended_team_size(blocksize, vector_length, internal_vector_length);
+	const int per_team_scratch = internal_vector_scratch_type_3d_view
+	  ::shmem_size(blocksize, num_vectors, vector_loop_size);
 	
 #define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B)			\
 	if (num_vectors == 1) {						\
 	  const Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
 	    policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
 	  Kokkos::parallel_for						\
-	    ("SolveTridiags::TeamPolicy::run<SingleVector>", policy, *this); \
+	    ("SolveTridiags::TeamPolicy::run<SingleVector>",		\
+	     policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
 	} else {							\
 	  const Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
 	    policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
 	  Kokkos::parallel_for						\
-	    ("SolveTridiags::TeamPolicy::run<MultiVector>", policy, *this); \
+	    ("SolveTridiags::TeamPolicy::run<MultiVector>",		\
+	     policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
 	} break
 	
 	switch (blocksize) {
@@ -2409,6 +2500,18 @@ namespace Ifpack2 {
     ///
     /// compute local residula vector y = b - R x 
     ///
+    static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize,
+                                                                     const int team_size) {
+      int total_team_size(0);
+      if      (blksize <=  5) total_team_size =  32;
+      else if (blksize <=  9) total_team_size =  64;
+      else if (blksize <= 12) total_team_size =  96;
+      else if (blksize <= 16) total_team_size = 128;
+      else if (blksize <= 20) total_team_size = 160;
+      else                    total_team_size = 160;
+      return total_team_size/team_size; 
+    }
+
     template<typename MatrixType>
     struct ComputeResidualVector {
     public:
@@ -2492,10 +2595,10 @@ namespace Ifpack2 {
 
       inline
       void 
-      serialGemv(const local_ordinal_type &blocksize,
-                 const impl_scalar_type * const __restrict__ AA, 
-                 const impl_scalar_type * const __restrict__ xx,
-                 /* */ impl_scalar_type * __restrict__ yy) const {
+      SerialGemv(const local_ordinal_type &blocksize,
+                 const impl_scalar_type * const KOKKOS_RESTRICT AA, 
+                 const impl_scalar_type * const KOKKOS_RESTRICT xx,
+                 /* */ impl_scalar_type * KOKKOS_RESTRICT yy) const {
         for (local_ordinal_type k0=0;k0<blocksize;++k0) {
           impl_scalar_type val = 0;
           const local_ordinal_type offset = k0*blocksize;
@@ -2514,7 +2617,7 @@ namespace Ifpack2 {
       template<typename bbViewType, typename yyViewType>
       KOKKOS_INLINE_FUNCTION       
       void 
-      vectorCopy(const member_type &member,
+      VectorCopy(const member_type &member,
                  const local_ordinal_type &blocksize,
                  const bbViewType &bb, 
                  const yyViewType &yy) const {
@@ -2526,7 +2629,7 @@ namespace Ifpack2 {
       template<typename AAViewType, typename xxViewType, typename yyViewType>
       KOKKOS_INLINE_FUNCTION       
       void 
-      teamGemv(const member_type &member, 
+      TeamVectorGemv(const member_type &member, 
                const local_ordinal_type &blocksize,
                const AAViewType &AA, 
                const xxViewType &xx, 
@@ -2535,17 +2638,53 @@ namespace Ifpack2 {
           (Kokkos::TeamThreadRange(member, blocksize), 
            [&](const local_ordinal_type &k0) {
             impl_scalar_type val = 0;
-            Kokkos::parallel_reduce
+            Kokkos::parallel_for
               (Kokkos::ThreadVectorRange(member, blocksize), 
-               [&](const local_ordinal_type &k1, impl_scalar_type &update) {
-                update += AA(k0,k1)*xx(k1);
-              }, val);
-            Kokkos::single(Kokkos::PerThread(member), [&]() {
-                yy(k0) -= val;
+               [&](const local_ordinal_type &k1) {
+                val += AA(k0,k1)*xx(k1);
               });
+	    Kokkos::atomic_fetch_add(&yy(k0), -val);
           });
       }
 
+      template<typename AAViewType, typename xxViewType, typename yyViewType>
+      KOKKOS_INLINE_FUNCTION       
+      void 
+      VectorGemv(const member_type &member, 
+      		 const local_ordinal_type &blocksize,
+      		 const AAViewType &AA, 
+      		 const xxViewType &xx, 
+      		 const yyViewType &yy) const { 
+      	Kokkos::parallel_for
+      	  (Kokkos::ThreadVectorRange(member, blocksize), 
+      	   [&](const local_ordinal_type &k0) {
+      	    impl_scalar_type val(0);
+      	    for (local_ordinal_type k1=0;k1<blocksize;++k1) {
+      	      val += AA(k0,k1)*xx(k1);
+      	    }
+      	    Kokkos::atomic_fetch_add(&yy(k0), -val);
+      	  });
+      }
+
+      // template<typename AAViewType, typename xxViewType, typename yyViewType>
+      // KOKKOS_INLINE_FUNCTION       
+      // void 
+      // VectorGemv(const member_type &member, 
+      // 		 const local_ordinal_type &blocksize,
+      // 		 const AAViewType &AA, 
+      // 		 const xxViewType &xx, 
+      // 		 const yyViewType &yy) const { 
+      // 	for (local_ordinal_type k0=0;k0<blocksize;++k0) {
+      // 	  impl_scalar_type val = 0;
+      // 	  Kokkos::parallel_for
+      // 	    (Kokkos::ThreadVectorRange(member, blocksize), 
+      // 	     [&](const local_ordinal_type &k1) {
+      // 	      val += AA(k0,k1)*xx(k1);
+      // 	    });
+      // 	  Kokkos::atomic_fetch_add(&yy(k0), -val);
+      // 	}
+      // }
+
       struct SeqTag {};
 
       inline
@@ -2570,7 +2709,7 @@ namespace Ifpack2 {
             const size_type j = A_k0 + colindsub[k];
             const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
             const impl_scalar_type * const xx = &x(A_colind[j]*blocksize, col);
-            serialGemv(blocksize,AA,xx,yy);
+            SerialGemv(blocksize,AA,xx,yy);
           }
         }
       }
@@ -2584,7 +2723,7 @@ namespace Ifpack2 {
         const local_ordinal_type blocksize = blocksize_requested;
         const local_ordinal_type blocksize_square = blocksize*blocksize;
 
-        const local_ordinal_type i = member.league_rank();
+        const local_ordinal_type lr = member.league_rank();
         const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
         const local_ordinal_type num_vectors = y.extent(1);
 
@@ -2594,24 +2733,26 @@ namespace Ifpack2 {
         auto yy = Kokkos::subview(y, block_range, 0);
         auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize); 
         
-        const local_ordinal_type row = i*blocksize;
+        const local_ordinal_type row = lr*blocksize;
         for (local_ordinal_type col=0;col<num_vectors;++col) {
           // y := b
           yy.assign_data(&y(row, col));
           bb.assign_data(&b(row, col));
 	  if (member.team_rank() == 0) 
-            vectorCopy(member, blocksize, bb, yy);
+            VectorCopy(member, blocksize, bb, yy);
           member.team_barrier();
 
           // y -= Rx
-          const size_type A_k0 = A_rowptr[i];              
-          for (size_type k=rowptr[i];k<rowptr[i+1];++k) {
-            const size_type j = A_k0 + colindsub[k];
-            A_block.assign_data( &tpetra_values(j*blocksize_square) );
-            xx.assign_data( &x(A_colind[j]*blocksize, col) );
-            teamGemv(member, blocksize, A_block, xx, yy);
-          }
-        }
+          const size_type A_k0 = A_rowptr[lr];              
+	  Kokkos::parallel_for
+	    (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]), 
+	     [&](const local_ordinal_type &k) {
+	      const size_type j = A_k0 + colindsub[k];
+	      A_block.assign_data( &tpetra_values(j*blocksize_square) );
+	      xx.assign_data( &x(A_colind[j]*blocksize, col) );
+	      VectorGemv(member, blocksize, A_block, xx, yy);
+	    });
+	}
       }
 
       template<int B>
@@ -2650,11 +2791,11 @@ namespace Ifpack2 {
             if (A_colind_at_j < num_local_rows) {
               const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
               const impl_scalar_type * const xx = &x(loc*blocksize, col);
-              serialGemv(blocksize, AA,xx,yy);
+              SerialGemv(blocksize, AA,xx,yy);
             } else {
               const auto loc = A_colind_at_j - num_local_rows;
               const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
-              serialGemv(blocksize, AA,xx_remote,yy);                
+              SerialGemv(blocksize, AA,xx_remote,yy);                
             }
           }
           // move yy to y_packed
@@ -2694,26 +2835,28 @@ namespace Ifpack2 {
           bb.assign_data(&b(row, col));
           yy.assign_data(&y_packed_scalar(pri, 0, col, v));
           if (member.team_rank() == 0) 
-            vectorCopy(member, blocksize, bb, yy);
+            VectorCopy(member, blocksize, bb, yy);
           member.team_barrier();
 
           // y -= Rx
           const size_type A_k0 = A_rowptr[lr];
-          for (size_type k=rowptr[lr];k<rowptr[lr+1];++k) {
-            const size_type j = A_k0 + colindsub[k];
-            A_block.assign_data( &tpetra_values(j*blocksize_square) );
+	  Kokkos::parallel_for
+	    (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]), 
+	     [&](const local_ordinal_type &k) {
+	      const size_type j = A_k0 + colindsub[k];
+	      A_block.assign_data( &tpetra_values(j*blocksize_square) );
               
-            const local_ordinal_type A_colind_at_j = A_colind[j];
-            if (A_colind_at_j < num_local_rows) {
-              const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
-              xx.assign_data( &x(loc*blocksize, col) );
-              teamGemv(member, blocksize, A_block, xx, yy);
-            } else {
-              const auto loc = A_colind_at_j - num_local_rows;
-              xx_remote.assign_data( &x_remote(loc*blocksize, col) );
-              teamGemv(member, blocksize, A_block, xx_remote, yy);
-            }
-          }
+	      const local_ordinal_type A_colind_at_j = A_colind[j];
+	      if (A_colind_at_j < num_local_rows) {
+		const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
+		xx.assign_data( &x(loc*blocksize, col) );
+		VectorGemv(member, blocksize, A_block, xx, yy);
+	      } else {
+		const auto loc = A_colind_at_j - num_local_rows;
+		xx_remote.assign_data( &x_remote(loc*blocksize, col) );
+		VectorGemv(member, blocksize, A_block, xx_remote, yy);
+	      }
+	    });
         }
       }
       
@@ -2760,11 +2903,11 @@ namespace Ifpack2 {
             if (P == 0) {
               const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
               const impl_scalar_type * const xx = &x(loc*blocksize, col);
-              serialGemv(blocksize,AA,xx,yy);
+              SerialGemv(blocksize,AA,xx,yy);
             } else {
               const auto loc = A_colind_at_j - num_local_rows;
               const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
-              serialGemv(blocksize,AA,xx_remote,yy);                
+              SerialGemv(blocksize,AA,xx_remote,yy);                
             }
           }
           // move yy to y_packed
@@ -2797,8 +2940,8 @@ namespace Ifpack2 {
 
         // subview pattern
         auto bb = Kokkos::subview(b, block_range, 0);
-        auto xx = Kokkos::subview(x, block_range, 0);
-        auto xx_remote = Kokkos::subview(x_remote, block_range, 0);
+        auto xx = bb; //Kokkos::subview(x, block_range, 0);
+        auto xx_remote = bb; //Kokkos::subview(x_remote, block_range, 0);
         auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
         auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
         auto colindsub_used = (P == 0 ? colindsub : colindsub_remote); 
@@ -2812,27 +2955,29 @@ namespace Ifpack2 {
             // y := b
             bb.assign_data(&b(row, col));
             if (member.team_rank() == 0) 
-              vectorCopy(member, blocksize, bb, yy);
+              VectorCopy(member, blocksize, bb, yy);
             member.team_barrier();            
           } 
           
           // y -= Rx
           const size_type A_k0 = A_rowptr[lr];
-          for (size_type k=rowptr_used[lr];k<rowptr_used[lr+1];++k) {
-            const size_type j = A_k0 + colindsub_used[k];
-            A_block.assign_data( &tpetra_values(j*blocksize_square) );
-
-            const local_ordinal_type A_colind_at_j = A_colind[j];
-            if (P == 0) {
-              const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
-              xx.assign_data( &x(loc*blocksize, col) );
-              teamGemv(member, blocksize, A_block, xx, yy);
-            } else {
-              const auto loc = A_colind_at_j - num_local_rows;
-              xx_remote.assign_data( &x_remote(loc*blocksize, col) );
-              teamGemv(member, blocksize, A_block, xx_remote, yy);
-            }
-          }
+	  Kokkos::parallel_for
+	    (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]), 
+	     [&](const local_ordinal_type &k) {
+	      const size_type j = A_k0 + colindsub_used[k];
+	      A_block.assign_data( &tpetra_values(j*blocksize_square) );
+	      
+	      const local_ordinal_type A_colind_at_j = A_colind[j];
+	      if (P == 0) {
+		const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
+		xx.assign_data( &x(loc*blocksize, col) );
+		VectorGemv(member, blocksize, A_block, xx, yy);
+	      } else {
+		const auto loc = A_colind_at_j - num_local_rows;
+		xx_remote.assign_data( &x_remote(loc*blocksize, col) );
+		VectorGemv(member, blocksize, A_block, xx_remote, yy);
+	      }
+	    });
         }
       }
 
@@ -2853,7 +2998,10 @@ namespace Ifpack2 {
         y = y_; b = b_; x = x_; 
         if (is_cuda<execution_space>::value) {
 #if defined(KOKKOS_ENABLE_CUDA)
-          const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, Kokkos::AUTO());
+	  const local_ordinal_type blocksize = blocksize_requested;	  
+	  const local_ordinal_type team_size = 8;
+	  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
+          const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
           Kokkos::parallel_for
             ("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
 #endif
@@ -2902,12 +3050,16 @@ namespace Ifpack2 {
 
         if (is_cuda<execution_space>::value) {
 #if defined(KOKKOS_ENABLE_CUDA)
-          local_ordinal_type vl_power_of_two = 1;
-          for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
-          vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
-	  const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
+	  const local_ordinal_type blocksize = blocksize_requested;	  
+	  const local_ordinal_type team_size = 8;
+	  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
+          // local_ordinal_type vl_power_of_two = 1;
+          // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
+          // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
+	  // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) {                \
-            const Kokkos::TeamPolicy<execution_space,AsyncTag<B> > policy(rowidx2part.extent(0), Kokkos::AUTO(), vl); \
+            const Kokkos::TeamPolicy<execution_space,AsyncTag<B> >	\
+	      policy(rowidx2part.extent(0), team_size, vector_size);	\
             Kokkos::parallel_for                                        \
               ("ComputeResidual::TeamPolicy::run<AsyncTag>",            \
                policy, *this); } break
@@ -2985,17 +3137,22 @@ namespace Ifpack2 {
 
         if (is_cuda<execution_space>::value) {
 #if defined(KOKKOS_ENABLE_CUDA)
-          local_ordinal_type vl_power_of_two = 1;
-          for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
-          vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
-	  const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
+	  const local_ordinal_type blocksize = blocksize_requested;	  
+	  const local_ordinal_type team_size = 8;
+	  const local_ordinal_type vector_size = ComputeResidualVectorRecommendedCudaVectorSize(blocksize, team_size);
+          // local_ordinal_type vl_power_of_two = 1;
+          // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
+          // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
+	  // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)  \
           if (compute_owned) {                                          \
-            const Kokkos::TeamPolicy<execution_space,OverlapTag<0,B> > policy(rowidx2part.extent(0), Kokkos::AUTO(), vl); \
+            const Kokkos::TeamPolicy<execution_space,OverlapTag<0,B> > \
+	      policy(rowidx2part.extent(0), team_size, vector_size);	\
             Kokkos::parallel_for                                        \
               ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
           } else {                                                      \
-            const Kokkos::TeamPolicy<execution_space,OverlapTag<1,B> > policy(rowidx2part.extent(0), Kokkos::AUTO(), vl); \
+            const Kokkos::TeamPolicy<execution_space,OverlapTag<1,B> > \
+	      policy(rowidx2part.extent(0), team_size, vector_size);	\
             Kokkos::parallel_for                                        \
               ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
           } break
@@ -3019,11 +3176,13 @@ namespace Ifpack2 {
 #else          
 #define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)  \
           if (compute_owned) {                                          \
-            const Kokkos::RangePolicy<execution_space,OverlapTag<0,B> > policy(0, rowidx2part.extent(0)); \
+            const Kokkos::RangePolicy<execution_space,OverlapTag<0,B> > \
+	      policy(0, rowidx2part.extent(0));				\
             Kokkos::parallel_for                                        \
               ("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
           } else {                                                      \
-            const Kokkos::RangePolicy<execution_space,OverlapTag<1,B> > policy(0, rowidx2part.extent(0)); \
+            const Kokkos::RangePolicy<execution_space,OverlapTag<1,B> > \
+	      policy(0, rowidx2part.extent(0));				\
             Kokkos::parallel_for                                        \
               ("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
           } break
@@ -3316,6 +3475,7 @@ namespace Ifpack2 {
             multivector_converter.to_packed_multivector(YY);
           } else {
             // fused y := x - R y and pmv := y(lclrow); 
+	    // real use case does not use overlap comp and comm
             if (overlap_communication_and_computation || !is_async_importer_active) {
               if (is_async_importer_active) async_importer->asyncSendRecv(YY);
               compute_residual_vector.run(pmv, XX, YY, remote_multivector, true);
diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainer.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainer.cpp
index ef14d38cb0a1..87b296b9a4ab 100644
--- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainer.cpp
+++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainer.cpp
@@ -328,6 +328,10 @@ static Teuchos::RCP<const Teuchos::Comm<int> > make_comm () {
 }
 
 int main (int argc, char** argv) {
+#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
+  cudaProfilerStop();
+#endif
+
   Tpetra::ScopeGuard tpetraScope(&argc, &argv);
 
   int ret = 0;
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_AddRadial_Decl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_AddRadial_Decl.hpp
index c4d6adfe1eb7..fbe4fa2c17ff 100644
--- a/packages/kokkos-kernels/src/batched/KokkosBatched_AddRadial_Decl.hpp
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_AddRadial_Decl.hpp
@@ -4,6 +4,8 @@
 
 /// \author Kyungjoo Kim (kyukim@sandia.gov)
 
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_Vector.hpp"
 
 namespace KokkosBatched {
   namespace Experimental {
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyGivens_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyGivens_Serial_Internal.hpp
new file mode 100644
index 000000000000..70528f5af1de
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyGivens_Serial_Internal.hpp
@@ -0,0 +1,184 @@
+#ifndef __KOKKOSBATCHED_APPLY_GIVENS_SERIAL_INTERNAL_HPP__
+#define __KOKKOSBATCHED_APPLY_GIVENS_SERIAL_INTERNAL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial Internal Impl
+    /// ==================== 
+    ///
+    /// this impl follows the flame interface of householder transformation
+    ///
+    struct SerialApplyLeftGivensInternal {
+      template<typename ValueType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const Kokkos::pair<ValueType,ValueType> G,
+             const int n, 
+             /* */ ValueType * a1t, const int a1ts,
+             /* */ ValueType * a2t, const int a2ts) {
+        typedef ValueType value_type;
+        if (n == 0) return 0; // quick return
+        if (G.first == value_type(1) && G.second == value_type(0)) return 0;
+        /// G = [ gamma -sigma;
+        ///       sigma  gamma ];
+        /// A := G A
+        /// where gamma is G.first and sigma is G.second 
+        
+        const value_type gamma = G.first;
+        const value_type sigma = G.second;
+#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
+#pragma unroll
+#endif
+        for (int j=0;j<n;++j) {
+          const value_type alpha1 = a1t[j*a1ts];
+          const value_type alpha2 = a2t[j*a2ts];
+          a1t[j*a1ts] = gamma*alpha1 - sigma*alpha2;
+          a2t[j*a1ts] = sigma*alpha1 + gamma*alpha2;
+        }
+        return 0;
+      }
+    };
+
+    struct SerialApplyRightGivensInternal {
+      template<typename ValueType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const Kokkos::pair<ValueType,ValueType> G,
+             const int m, 
+             /* */ ValueType * a1, const int a1s,
+             /* */ ValueType * a2, const int a2s) {
+        typedef ValueType value_type;
+        if (m == 0) return 0; // quick return
+        if (G.first == value_type(1) && G.second == value_type(0)) return 0;
+        /// G = [ gamma -sigma;
+        ///       sigma  gamma ];
+        /// A := A G'
+        /// where gamma is G.first and sigma is G.second 
+          
+        const value_type gamma = G.first;
+        const value_type sigma = G.second;
+#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
+#pragma unroll
+#endif        
+        for (int i=0;i<m;++i) {
+          const value_type alpha1 = a1[i*a1s];
+          const value_type alpha2 = a2[i*a2s];
+          a1[i*a1s] = gamma*alpha1 - sigma*alpha2;
+          a2[i*a1s] = sigma*alpha1 + gamma*alpha2;
+        }
+        return 0;
+      }
+    };
+
+    struct SerialApplyLeftRightGivensInternal {
+      template<typename ValueType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const Kokkos::pair<ValueType,ValueType> &G12,
+             const int &m, const int &n,
+             /* */ ValueType *__restrict__ a1t, 
+             /* */ ValueType *__restrict__ a2t, 
+             /* */ ValueType *__restrict__ a1, 
+             /* */ ValueType *__restrict__ a2, 
+             const int &as0, const int &as1) {
+        typedef ValueType value_type;
+        if (G12.first == value_type(1) && G12.second == value_type(0)) return 0;
+        if (m == 0 && n == 0) return 0; // quick return
+
+        const value_type gamma12 = G12.first;
+        const value_type sigma12 = G12.second;
+
+#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
+#pragma unroll
+#endif
+        for (int j=0;j<n;++j) {
+          const value_type alpha1 = a1t[j*as1];
+          const value_type alpha2 = a2t[j*as1];
+          a1t[j*as1] = ( gamma12*alpha1 - sigma12*alpha2 );
+          a2t[j*as1] = ( sigma12*alpha1 + gamma12*alpha2 );
+        }
+
+#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
+#pragma unroll
+#endif
+        for (int i=0;i<m;++i) {
+          const value_type alpha1 = a1[i*as0];
+          const value_type alpha2 = a2[i*as0];
+          a1[i*as0] = ( gamma12*alpha1 - sigma12*alpha2 );
+          a2[i*as0] = ( sigma12*alpha1 + gamma12*alpha2 );
+        }
+        return 0;
+      }
+
+      template<typename ValueType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const Kokkos::pair<ValueType,ValueType> &G12,
+             const Kokkos::pair<ValueType,ValueType> &G13,
+             const int &m, const int &n, 
+             /* */ ValueType *__restrict__ a1t, 
+             /* */ ValueType *__restrict__ a2t, 
+             /* */ ValueType *__restrict__ a3t, 
+             /* */ ValueType *__restrict__ a1, 
+             /* */ ValueType *__restrict__ a2, 
+             /* */ ValueType *__restrict__ a3, 
+             const int &as0, const int &as1) {
+        typedef ValueType value_type;
+        if (m == 0 && n == 0) return 0; // quick return
+
+        const value_type gamma12 = G12.first;
+        const value_type sigma12 = G12.second;
+        const value_type gamma13 = G13.first;
+        const value_type sigma13 = G13.second;
+
+#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
+#pragma unroll
+#endif
+        for (int j=0;j<n;++j) {
+          const value_type alpha2 = a2t[j*as1];
+          const value_type alpha3 = a3t[j*as1];
+          {
+            const value_type alpha1 = a1t[j*as1];
+            a1t[j*as1] = ( gamma12*alpha1 - sigma12*alpha2 );
+            a2t[j*as1] = ( sigma12*alpha1 + gamma12*alpha2 ); 
+          }
+          {
+            const value_type alpha1 = a1t[j*as1];
+            a1t[j*as1] = ( gamma13*alpha1 - sigma13*alpha3 );
+            a3t[j*as1] = ( sigma13*alpha1 + gamma13*alpha3 );
+          }
+        }
+
+#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
+#pragma unroll
+#endif
+        for (int i=0;i<m;++i) {
+          const value_type alpha2 = a2[i*as0];
+          const value_type alpha3 = a3[i*as0];
+          {
+            const value_type alpha1 = a1[i*as0];
+            a1[i*as0] = ( gamma12*alpha1 - sigma12*alpha2 );
+            a2[i*as0] = ( sigma12*alpha1 + gamma12*alpha2 );
+          }
+          {
+            const value_type alpha1 = a1[i*as0];
+            a1[i*as0] = ( gamma13*alpha1 - sigma13*alpha3 );
+            a3[i*as0] = ( sigma13*alpha1 + gamma13*alpha3 );
+          }
+        }
+        return 0;
+      }
+    };
+    
+  }//  end namespace Experimental
+} // end namespace KokkosBatched
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyHouseholder_Decl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyHouseholder_Decl.hpp
new file mode 100644
index 000000000000..df2c5297083b
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyHouseholder_Decl.hpp
@@ -0,0 +1,34 @@
+#ifndef __KOKKOSBATCHED_APPLY_HOUSEHOLDER_DECL_HPP__
+#define __KOKKOSBATCHED_APPLY_HOUSEHOLDER_DECL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_Vector.hpp"
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial Householder 
+    ///
+
+    // level 1 operation (no blocking algorithm info avail)
+    template<typename ArgSide>
+    struct SerialApplyHouseholder {
+      template<typename uViewType,
+               typename tauViewType,
+               typename AViewType,
+               typename wViewType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const uViewType &u2,
+             const tauViewType &tau,
+             const AViewType
+             const wViewType &w);
+    };
+
+  }
+}
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyHouseholder_Serial_Impl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyHouseholder_Serial_Impl.hpp
new file mode 100644
index 000000000000..c04fbc510dec
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyHouseholder_Serial_Impl.hpp
@@ -0,0 +1,64 @@
+#ifndef __KOKKOSBATCHED_APPLY_HOUSEHOLDER_SERIAL_IMPL_HPP__
+#define __KOKKOSBATCHED_APPLY_HOUSEHOLDER_SERIAL_IMPL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_Householder_Serial_Internal.hpp"
+
+
+namespace KokkosBatched {
+  namespace Experimental {
+
+    ///
+    /// Serial Impl
+    /// ===========
+
+    template<>
+    template<typename uViewType,
+             typename tauViewType,
+             typename AViewType,
+             typename wViewType>
+    KOKKOS_INLINE_FUNCTION
+    int
+    SerialApplyHouseholder<Side::Left>::
+    invoke(const uViewType &u2,
+           const tauViewType &tau,
+           const AViewType &A,
+           const wViewType &w) {
+      return SerialApplyLeftHouseholderInternal::
+        invoke(A.extent(0)-1, A.extent(1),
+               tau.data(),
+               u2.data(), u2.stride(0),
+               A.data(), A.stride(1), 
+               A.data()+A.stride(0), A.stride(0), A.stride(1),
+               w.data());
+    }
+
+    template<>
+    template<typename uViewType,
+             typename tauViewType,
+             typename AViewType,
+             typename wViewType>
+    KOKKOS_INLINE_FUNCTION
+    int
+    SerialApplyHouseholder<Side::Right>::
+    invoke(const uViewType &u2,
+           const tauViewType &tau,
+           const AViewType &A,
+           const wViewType &w) {
+      return SerialApplyRightHouseholderInternal::
+        invoke(A.extent(0), A.extent(1)-1,
+               tau.data(),
+               u2.data(), u2.stride(0),
+               A.data(), A.stride(0), 
+               A.data()+A.stride(1), A.stride(0), A.stride(1),
+               w.data());
+    }
+        
+  }
+}
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp
new file mode 100644
index 000000000000..53bb8f8f4da5
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp
@@ -0,0 +1,116 @@
+#ifndef __KOKKOSBATCHED_APPLY_HOUSEHOLDER_SERIAL_INTERNAL_HPP__
+#define __KOKKOSBATCHED_APPLY_HOUSEHOLDER_SERIAL_INTERNAL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial Internal Impl
+    /// ==================== 
+    ///
+    /// this impl follows the flame interface of householder transformation
+    ///
+    struct SerialApplyLeftHouseholderInternal {
+      template<typename ValueType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const int m,
+             const int n,
+             const ValueType * tau,
+             /* */ ValueType * u2,  const int u2s,
+             /* */ ValueType * a1t, const int a1ts,
+             /* */ ValueType * A2,  const int as0, const int as1,
+             /* */ ValueType * w1t) {
+        typedef ValueType value_type;
+
+        /// u2  m x 1
+        /// a1t 1 x n
+        /// A2  m x n
+
+        // apply a single householder transform H from the left to a row vector a1t 
+        // and a matrix A2
+        const value_type inv_tau = value_type(1)/(*tau);
+
+        // compute the followings:
+        // a1t -=    inv(tau)(a1t + u2'A2)
+        // A2  -= u2 inv(tau)(a1t + u2'A2)
+
+        // w1t = a1t + u2'A2 = A2^T conj(u2)
+        // w1t /= tau
+        for (int j=0;j<n;++j) {
+          value_type tmp = a1t[j*a1ts];
+          for (int i=0;i<m;++i) 
+            tmp += Kokkos::Details::ArithTraits<value_type>::conj(u2[i*u2s])*A2[i*as0+j*as1];
+          w1t[j] = tmp*inv_tau; // /= (*tau);
+        }
+
+        // a1t -= w1t    (axpy)
+        for (int j=0;j<n;++j) 
+          a1t[j*a1ts] -= w1t[j];
+
+        // A2  -= u2 w1t (ger)
+        for (int j=0;j<n;++j) 
+          for (int i=0;i<m;++i)
+            A2[i*as0+j*as1] -= u2[i*u2s]*w1t[j];
+
+        return 0;
+      }
+    };
+
+
+    struct SerialApplyRightHouseholderInternal {
+      template<typename ValueType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const int m,
+             const int n,
+             const ValueType * tau,
+             /* */ ValueType * u2,  const int u2s,
+             /* */ ValueType * a1,  const int a1s,
+             /* */ ValueType * A2,  const int as0, const int as1,
+             /* */ ValueType * w1) {
+        typedef ValueType value_type;
+        /// u2 n x 1
+        /// a1 m x 1
+        /// A2 m x n
+
+        // apply a single householder transform H from the left to a row vector a1t 
+        // and a matrix A2
+        const value_type inv_tau = value_type(1)/(*tau);
+        
+        // compute the followings:
+        // a1 -= inv(tau)(a1 + A2 u2)
+        // A2 -= inv(tau)(a1 + A2 u2) u2'
+
+        // w1 = a1 + A2 u2
+        // w1 /= tau
+        for (int i=0;i<m;++i) { 
+          value_type tmp = a1[i*a1s];
+          for (int j=0;j<n;++j) 
+            tmp += A2[i*as0+j*as1]*u2[j*u2s];
+          w1[i] = tmp*inv_tau; // \= (*tau);
+        }
+
+        // a1 -= w1 (axpy)
+        for (int i=0;i<m;++i) 
+          a1[i*a1s] -= w1[i];
+
+        // A2 -= w1 * u2' (ger with conjugate)
+        for (int j=0;j<n;++j)
+          for (int i=0;i<m;++i) 
+            A2[i*as0+j*as1] -= w1[i]*Kokkos::Details::ArithTraits<ValueType>::conj(u2[j*u2s]);
+
+        return 0;
+      }
+    };
+
+  }//  end namespace Experimental
+} // end namespace KokkosBatched
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyQ_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyQ_Serial_Internal.hpp
new file mode 100644
index 000000000000..190fee0c7225
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_ApplyQ_Serial_Internal.hpp
@@ -0,0 +1,149 @@
+#ifndef __KOKKOSBATCHED_APPLY_Q_SERIAL_INTERNAL_HPP__
+#define __KOKKOSBATCHED_APPLY_Q_SERIAL_INTERNAL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_ApplyHouseholder_Serial_Internal.hpp"
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial Internal Impl
+    /// ==================== 
+    ///
+    /// this impl follows the flame interface of householder transformation
+    ///
+
+
+    struct SerialApplyQ_LeftNoTransForwardInternal {
+      template<typename ValueType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const int m, 
+             const int n, 
+             const int k, 
+             /* */ ValueType * A, const int as0, const int as1,
+             /* */ ValueType * t, const int ts,
+             /* */ ValueType * B, const int bs0, const int bs1,
+             /* */ ValueType * w) {
+        typedef ValueType value_type;
+        
+        /// Given a matrix A that includes a series of householder vectors,
+        /// it applies a unitary matrix Q to B from left without transpose
+        ///   B = Q B = (H0 H1 H2 H3 ... H(k-1)) B
+        /// where
+        ///   A is m x k (holding H0, H1 ... H(k-1)
+        ///   t is k x 1 
+        ///   B is m x n
+
+        // partitions used for loop iteration 
+        Partition2x2<value_type> A_part2x2(as0, as1);
+        Partition3x3<value_type> A_part3x3(as0, as1);
+
+        Partition2x1<value_type> t_part2x1(ts);
+        Partition3x1<value_type> t_part3x1(ts);
+
+        Partition2x1<value_type> B_part2x1(bs0);
+        Partition3x1<value_type> B_part3x1(bs0);
+
+        // initial partition of A where ATL has a zero dimension
+        A_part2x2.partWithABR(A, m, k, m-k, 0);
+        t_part2x1.partWithAB (t, k,    0     );
+        B_part2x1.partWithAB (B, m,    m-k   );
+
+        for (int m_A0=(k-1);m_A0>=0;--m_A0) {
+          // part 2x2 into 3x3
+          A_part3x3.partWithATL(A_part2x2, 1, 1);
+          t_part3x1.partWithAT (t_part2x1, 1);
+          value_type *tau = t_part3x1.A1;
+
+          B_part3x1.partWithAT (B_part2x1, 1);
+          const int m_A2 = m - m_A0 - 1;
+          /// -----------------------------------------------------
+          // left apply householder to partitioned B1 and B2
+          SerialApplyLeftHouseholderInternal::invoke(m_A2, n,
+                                                     tau,
+                                                     A_part3x3.A21, as0,
+                                                     B_part3x1.A1,  bs1,
+                                                     B_part3x1.A2,  bs0, bs1, 
+                                                     w);            
+
+          /// -----------------------------------------------------
+          A_part2x2.mergeToABR(A_part3x3);
+          t_part2x1.mergeToAB (t_part3x1);
+          B_part2x1.mergeToAB (B_part3x1);
+        }
+        return 0;
+      }
+    };
+
+    struct SerialApplyQ_RightNoTransForwardInternal {
+      template<typename ValueType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const int m,
+             const int n,
+             const int k,
+             /* */ ValueType * A, const int as0, const int as1,
+             /* */ ValueType * t, const int ts,
+             /* */ ValueType * B, const int bs0, const int bs1,
+             /* */ ValueType * w) {
+        typedef ValueType value_type;
+        
+        /// Given a matrix A that includes a series of householder vectors,
+        /// it applies a unitary matrix Q to B from left without transpose
+        ///   B = B Q = B (H0 H1 H2 H3 ... H(k-1))
+        /// where
+        ///   A is n x k (holding H0, H1 ... H(k-1)
+        ///   t is k x 1 
+        ///   B is m x n
+
+        // partitions used for loop iteration 
+        Partition2x2<value_type> A_part2x2(as0, as1);
+        Partition3x3<value_type> A_part3x3(as0, as1);
+
+        Partition2x1<value_type> t_part2x1(ts);
+        Partition3x1<value_type> t_part3x1(ts);
+
+        Partition1x2<value_type> B_part1x2(bs1);
+        Partition1x3<value_type> B_part1x3(bs1);
+
+        // initial partition of A where ATL has a zero dimension
+        A_part2x2.partWithATL(A, n, k, 0, 0);
+        t_part2x1.partWithAT (t, k,    0   );
+        B_part1x2.partWithAL (B, n,    0   );
+
+        for (int n_A0=0;n_A0<k;++n_A0) {
+          // part 2x2 into 3x3
+          A_part3x3.partWithABR(A_part2x2, 1, 1);
+          t_part3x1.partWithAB (t_part2x1, 1);
+          value_type *tau = t_part3x1.A1;
+
+          B_part1x3.partWithAR (B_part1x2, 1);
+          const int n_B2 = n - n_A0 - 1;
+          /// -----------------------------------------------------
+          // right apply householder to partitioned B1 and B2
+          SerialApplyRightHouseholderInternal::invoke(m, n_B2,
+                                                      tau,
+                                                      A_part3x3.A21, as0,
+                                                      B_part1x3.A1,  bs0,
+                                                      B_part1x3.A2,  bs0, bs1, 
+                                                      w);
+          /// -----------------------------------------------------
+          A_part2x2.mergeToATL(A_part3x3);
+          t_part2x1.mergeToAT (t_part3x1);
+          B_part1x2.mergeToAL (B_part1x3);
+        }
+        return 0;
+      }
+    };
+
+
+
+  }//  end namespace Experimental
+} // end namespace KokkosBatched
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Copy_Decl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Copy_Decl.hpp
index 2b6958625153..f379e7756e39 100644
--- a/packages/kokkos-kernels/src/batched/KokkosBatched_Copy_Decl.hpp
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Copy_Decl.hpp
@@ -4,6 +4,8 @@
 
 /// \author Kyungjoo Kim (kyukim@sandia.gov)
 
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_Vector.hpp"
 
 namespace KokkosBatched {
   namespace Experimental {
@@ -36,8 +38,57 @@ namespace KokkosBatched {
              const BViewType &B);
     };
 
+
+    ///
+    /// Selective Interface
+    ///
+    template<typename MemberType,
+             typename ArgTrans,
+             typename ArgMode>
+    struct Copy {
+      template<typename AViewType,
+               typename BViewType>
+      KOKKOS_FORCEINLINE_FUNCTION
+      static int
+      invoke(const MemberType &member,
+             const AViewType &A,
+             const BViewType &B) {
+        int r_val = 0;
+        if (std::is_same<ArgMode,Mode::Serial>::value) {
+          r_val = SerialCopy<ArgTrans>::invoke(A, B);
+        } else if (std::is_same<ArgMode,Mode::Team>::value) {
+          r_val = TeamCopy<MemberType,ArgTrans>::invoke(member, A, B);
+        } 
+        return r_val;
+      }
+    };      
+
+
   }
 }
 
 
+#define KOKKOSBATCHED_SERIAL_COPY_MATRIX_NO_TRANSPOSE_INTERNAL_INVOKE(M,N,A,AS0,AS1,B,BS0,BS1) \
+  KokkosBatched::Experimental::SerialCopyInternal			\
+  ::invoke(M, N, A, AS0, AS1, B, BS0, BS1)
+
+#define KOKKOSBATCHED_TEAM_COPY_MATRIX_NO_TRANSPOSE_INTERNAL_INVOKE(MEMBER,M,N,A,AS0,AS1,B,BS0,BS1) \
+  KokkosBatched::Experimental::TeamCopyInternal				\
+  ::invoke(MEMBER, M, N, A, AS0, AS1, B, BS0, BS1)
+
+#define KOKKOSBATCHED_SERIAL_COPY_VECTOR_INTERNAL_INVOKE(M,A,AS,B,BS)	\
+  KokkosBatched::Experimental::SerialCopyInternal			\
+  ::invoke(M, A, AS, B, BS)
+
+#define KOKKOSBATCHED_TEAM_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(MEMBER,M,A,AS,B,BS) \
+  KokkosBatched::Experimental::TeamCopyInternal				\
+  ::invoke(MEMBER, M, A, AS, B, BS)
+
+#define KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(MODETYPE,MEMBER,M,A,AS,B,BS) \
+  if (std::is_same<MODETYPE,KokkosBatched::Experimental::Mode::Serial>::value) { \
+    KOKKOSBATCHED_SERIAL_COPY_VECTOR_INTERNAL_INVOKE(M,A,AS,B,BS);	\
+  } else if (std::is_same<MODETYPE,KokkosBatched::Experimental::Mode::Team>::value) { \
+    KOKKOSBATCHED_TEAM_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE(MEMBER,M,A,AS,B,BS); \
+  }
+
 #endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_Decl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_Decl.hpp
new file mode 100644
index 000000000000..1a36d243a791
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_Decl.hpp
@@ -0,0 +1,68 @@
+#ifndef __KOKKOSBATCHED_EIGENDECOMPOSITION_DECL_HPP__
+#define __KOKKOSBATCHED_EIGENDECOMPOSITION_DECL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_Vector.hpp"
+
+namespace KokkosBatched {
+  namespace Experimental {
+    /// Given a general nonsymmetric matrix A (m x m), it performs eigendecomposition 
+    /// of the matrix.
+    /// 
+    /// Parameters:
+    ///   [in] member
+    ///     Team interface only has this argument. Partial specialization can be applied for 
+    ///     a different type of team member.
+    ///   [in/out]A
+    ///     Real general nonsymmetric rank 2 view A(m x m).
+    ///     A is first condensed to a upper Hessenberg form. Then, the Francis 
+    ///     double shift QR algorithm is applied to compute its Schur form. 
+    ///     On exit, A stores a quasi upper triangular matrix of the Schur decomposition.
+    ///   [out]er, [out]ei
+    ///     A real and imaginary eigenvalues, which forms er(m)+ei(m)i
+    ///     For a complex eigen pair, it stores a+bi and a-bi consecutively. 
+    ///   [out]UL, [out]UR
+    ///     Left/right eigenvectors are stored in (m x m) matrices. If zero span view is provided,
+    ///     it does not compute the corresponding eigenvectors. However, both UL and UR cannot have 
+    ///     zero span. If eigenvalues are only requested, use the Eigenvalue interface which 
+    ///     simplifies computations
+    ///   [out]W
+    ///     1D contiguous workspace. The minimum size is (2*m*m+5*m) where m is the dimension of matrix A.
+
+    struct SerialEigendecomposition {
+      template<typename AViewType,
+               typename EViewType,
+               typename UViewType,
+               typename WViewType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const AViewType &A,
+             const EViewType &er, const EViewType &ei,
+             const UViewType &UL, const UViewType &UR,
+             const WViewType &W);
+    };
+
+    template<typename MemberType>
+    struct TeamEigendecomposition {
+      template<typename AViewType,
+               typename EViewType,
+               typename UViewType,
+               typename WViewType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const MemberType &member, 
+             const AViewType &A,
+             const EViewType &er, const EViewType &ei,
+             const UViewType &UL, const UViewType &UR,
+             const WViewType &W);
+    };
+
+
+  }/// end namespace Experimental
+} /// end namespace KokkosBatched
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_Serial_Impl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_Serial_Impl.hpp
new file mode 100644
index 000000000000..4b010dd325a0
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_Serial_Impl.hpp
@@ -0,0 +1,55 @@
+#ifndef __KOKKOSBATCHED_EIGENDECOMPOSITION_SERIAL_IMPL_HPP__
+#define __KOKKOSBATCHED_EIGENDECOMPOSITION_SERIAL_IMPL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_Eigendecomposition_Serial_Internal.hpp"
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial Impl
+    /// ===========
+
+    template<typename AViewType,
+             typename EViewType,
+             typename UViewType,
+             typename WViewType>
+    KOKKOS_INLINE_FUNCTION
+    int
+    SerialEigendecomposition::
+    invoke(const AViewType &A,
+           const EViewType &er, const EViewType &ei,
+           const UViewType &UL, const UViewType &UR,
+           const WViewType &W) {
+      /// view checking
+      const int m = A.extent(0);
+      assert(m == int(A.extent(1))  && "Eigendecomposition: A is not square");
+      assert(m == int(er.extent(0)) && "Eigendecomposition: Length of er does not match to A's dimension");
+      assert(m == int(ei.extent(0)) && "Eigendecomposition: Length of ei does not match to A's dimension");
+      assert(m == int(UL.extent(0)) && "Eigendecomposition: Length of UL does not match to A's dimension");
+      assert(m == int(UL.extent(1)) && "Eigendecomposition: Width of UL does not match to A's dimension");
+      assert(m == int(UR.extent(0)) && "Eigendecomposition: Length of UR does not match to A's dimension");
+      assert(m == int(UR.extent(1)) && "Eigendecomposition: Width of UR does not match to A's dimension");
+      assert(int(W.extent(0)) >= int(2*m*m+5*m) && "Eigendecomposition: workspace size is too small");
+      assert(int(W.stride(0)) == int(1)  && "Eigendecomposition: Provided workspace is not contiguous");
+      
+      /// static assert A,er,ei,UL,UR,W has the same value_type
+      /// static assert all views have the same memory space
+      return m ? SerialEigendecompositionInternal
+        ::invoke(A.extent(0),
+                 A.data(), A.stride(0), A.stride(1),
+                 er.data(), er.stride(0),
+                 ei.data(), ei.stride(0),
+                 UL.data(), UL.stride(0), UL.stride(1),
+                 UR.data(), UR.stride(0), UR.stride(1),
+                 W.data(), W.extent(0)) : 0;
+    }
+
+  }/// end namespace Experimental
+} /// end namespace KokkosBatched
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_Serial_Internal.hpp
new file mode 100644
index 000000000000..93a13e62f8a1
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_Serial_Internal.hpp
@@ -0,0 +1,298 @@
+#ifndef __KOKKOSBATCHED_EIGENDECOMPOSITION_SERIAL_INTERNAL_HPP__
+#define __KOKKOSBATCHED_EIGENDECOMPOSITION_SERIAL_INTERNAL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_SetIdentity_Internal.hpp"
+#include "KokkosBatched_SetTriangular_Internal.hpp"
+#include "KokkosBatched_Normalize_Internal.hpp"
+#include "KokkosBatched_Hessenberg_Serial_Internal.hpp"
+#include "KokkosBatched_ApplyQ_Serial_Internal.hpp"
+#include "KokkosBatched_Schur_Serial_Internal.hpp"
+#include "KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp"
+#include "KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp"
+#include "KokkosBatched_Gemm_Serial_Internal.hpp"
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial Internal Impl
+    /// ==================== 
+
+    struct SerialEigendecompositionInternal {
+      /// Given a general nonsymmetric matrix A (m x m), it performs eigendecomposition 
+      /// of the matrix.
+      /// 
+      /// Parameters:
+      ///   [in]m 
+      ///     A dimension of the square matrix H.
+      ///   [in/out]A, [in]as0, [in]as1 
+      ///     Real general nonsymmetric matrix A(m x m) with strides as0 and as1.
+      ///     A is first condensed to a upper Hessenberg form. Then, the Francis 
+      ///     double shift QR algorithm is applied to compute its Schur form. 
+      ///     On exit, A stores a quasi upper triangular matrix of the Schur decomposition.
+      ///   [out]er, [in]ers, [out]ei, [in]eis
+      ///     A complex vector er(m)+ei(m)i with a stride ers and eis to store computed eigenvalues.
+      ///     For a complex eigen pair, it stores a+bi and a-bi consecutively. 
+      ///   [out]UL, [in]uls0, [in] uls1
+      ///     Left eigenvectors UL(m x m) with strides uls0 and uls1. When UL is NULL, the left 
+      ///     eigenvectors are not computed.
+      ///   [out]UR, [in]urs0, [in] urs1
+      ///     Right eigenvectors UR(m x m) with strides urs0 and urs1. When UR is NULL, the right
+      ///     eigenvectors are not computed.
+      ///   [out]w, [in]wlen
+      ///     Workspace
+      template<typename RealType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const int m,
+             RealType * A, const int as0, const int as1,
+             RealType * er, const int ers,
+             RealType * ei, const int eis,
+             RealType * UL, const int uls0, const int uls1,
+             RealType * UR, const int urs0, const int urs1,
+             RealType * w,  const int wlen) {
+        typedef RealType real_type;
+        typedef Kokkos::Details::ArithTraits<real_type> ats;
+
+        const real_type one(1), zero(0), tol = 1e2*ats::epsilon();
+        //const Kokkos::pair<real_type,real_type> identity(one, zero);
+
+        /// step 0: input checking
+        assert( (wlen >= (2*m*m+5*m)) && "Eigendecomposition: workspace size is too small");
+        real_type *w_now = w;
+        int wlen_now = wlen;
+        assert( (wlen_now >= 0) && "Eigendecomposition: workspace size is negative");
+
+        const bool is_UL = UL != NULL, is_UR = UR != NULL;
+        const bool is_U  = is_UL || is_UR;
+        assert( is_U && "Eigendecomposition: eigenvectors are not requested; consider to use SerialEigenvalueInternal");
+        
+        real_type *QZ = w_now; w_now += (m*m); wlen_now -= (m*m);
+        assert( (wlen_now >= 0) && "Eigendecomposition: QZ allocation fails");
+
+        const int qzs0 = m, qzs1 = 1, qzs = m+1; /// row major
+        const int as = as0+as1;
+
+        /// step 1: Hessenberg reduction A = Q H Q^H
+        ///         Q is stored in QZ
+#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
+        {
+          real_type *t  = w_now; w_now += m; wlen_now -= m;
+          assert( (wlen_now >= 0) && "Eigendecomposition: Hessenberg reduction workspace t allocation fail");
+          
+          if (as0 == 1 || as1 == 1) { /// if mkl can be interfaced, use it 
+            const auto matrix_layout = ( as1 == 1 ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR );            
+            LAPACKE_dgehrd(matrix_layout, m, 1, m, A, m, t);
+            
+            SerialCopyInternal::invoke(m, m, QZ, qzs0, qzs1);          
+            LAPACKE_dorghr(matrix_layout, m, 1, m, QZ, m, t);
+          } else { /// for arbitrary strides, there is no choice to use tpls
+            real_type *ww = w_now; w_now += m; wlen_now -= m;
+            assert( (wlen_now >= 0) && "Eigendecomposition: Hessenberg reduction workspace ww allocation fail");
+
+            SerialHessenbergInternal::invoke(m, m,
+                                             A, as0, as1,
+                                             t, 1,
+                                             ww);
+            
+            SerialSetIdentityInternal::invoke(m, QZ, qzs0, qzs1);          
+            SerialApplyQ_LeftNoTransForwardInternal::invoke(m-1, m-1, m-1,
+                                                            A+as0, as0, as1,
+                                                            t, 1,
+                                                            QZ+qzs, qzs0, qzs1,
+                                                            ww);
+            /// recovery of workspace for ww
+            w_now -= m; wlen_now += m;
+          }
+          /// recovery of workspace for t
+          w_now -= m; wlen_now += m;
+
+          /// clean up H
+          SerialSetLowerTriangularInternal::invoke(m, m,
+                                                   2,
+                                                   zero,
+                                                   A, as0, as1);
+        }
+#else
+        {
+          real_type *t  = w_now; w_now += m; wlen_now -= m;
+          real_type *ww = w_now; w_now += m; wlen_now -= m;
+          assert( (wlen_now >= 0) && "Eigendecomposition: Hessenberg reduction workspace t and ww allocation fail");
+
+          SerialHessenbergInternal::invoke(m, m,
+                                           A, as0, as1,
+                                           t, 1,
+                                           ww);
+          
+          SerialSetIdentityInternal::invoke(m, QZ, qzs0, qzs1);          
+          SerialApplyQ_LeftNoTransForwardInternal::invoke(m-1, m-1, m-1,
+                                                          A+as0, as0, as1,
+                                                          t, 1,
+                                                          QZ+qzs, qzs0, qzs1,
+                                                          ww);
+
+          /// clean up H
+          SerialSetLowerTriangularInternal::invoke(m, m,
+                                                   2,
+                                                   zero,
+                                                   A, as0, as1);
+
+          /// recover workspace
+          w_now -= (2*m); wlen_now += (2*m);
+        }
+#endif   
+        /// step 2: Schur decomposition H = Z T Z^H
+        ///         Z is applied to QZ
+        {
+          int r_val = 0;
+          real_type *ww = w_now; w_now += (5*m); wlen_now -= (5*m);
+          assert( (wlen_now >= 0) && "Eigendecomposition: Schur decomposition workspace ww allocation fails");
+          do {
+            const bool restart = (r_val < 0);            
+            r_val = SerialSchurInternal::invoke(m,
+                                                A, as0, as1,
+                                                QZ, qzs0, qzs1,
+                                                ww, 5*m,
+                                                restart);
+          } while (r_val < 0 && false); 
+          // for a testing purpose, we run the Schur decomposition with a finite number of Francis iterations
+          w_now -= (5*m); wlen_now += (5*m);          
+        }
+        
+        /// Step 3: Extract iigenvalues and eigenvectors from T = V S V^-1
+        /// 
+        {
+          /// extract eigenvalues 
+          real_type *AA = A-as1;
+          int *blks = (int*)w_now; w_now += m; wlen_now -= m;
+          assert( (wlen_now >= 0) && "Eigendecomposition: Eigenvector workspace blks allocation fails");
+
+          {
+            int i=0;
+            for (;i<(m-1);) {
+              const real_type subdiag = ats::abs(AA[(i+1)*as]);
+              const real_type diag = A[i*as];
+              if (subdiag < tol) {
+                er[i*ers] = diag; 
+                ei[i*eis] = zero;
+                blks[i] = 1;
+                i+=1;
+              } else {
+                const real_type offdiag = ats::abs(A[i*as+as1]);
+                const real_type sqrt_mult_suboffdiags = ats::sqrt(subdiag*offdiag);
+                er[(i  )*ers] =  diag;
+                er[(i+1)*ers] =  diag;
+                ei[(i  )*eis] =  sqrt_mult_suboffdiags;
+                ei[(i+1)*eis] = -sqrt_mult_suboffdiags;
+                blks[i  ] = 2;
+                blks[i+1] = 2; /// consider backward iteration
+                i+=2;
+              }
+            }            
+            if (i<m) { /// take care the remainder
+              er[i*ers] = A[i*as];
+              ei[i*eis] = zero;
+              blks[i] = 1;
+            }
+          }
+
+          {
+            real_type *V = w_now; w_now += (m*m); wlen_now -= (m*m);
+            assert( (wlen_now >= 0) && "Eigendecomposition: Eigenvector workspace V allocation fails");
+
+            const int vs0 = 1, vs1 = m;
+            real_type *ww = w_now; w_now += 2*m; wlen_now -= 2*m;
+            assert( (wlen_now >= 0) && "Eigendecomposition: Eigenvector workspace w allocation fails");
+          
+            /// Right eigenvectors V 
+            if (is_UR) {
+              SerialRightEigenvectorFromSchurInternal
+                ::invoke(m, 
+                         A, as0, as1,
+                         V, vs0, vs1,
+                         ww,
+                         blks);
+              
+              /// QZ V
+              SerialGemmInternal<Algo::Gemm::Unblocked>::
+                invoke(m, m, m,
+                       one, 
+                       QZ, qzs0, qzs1,
+                       V, vs0, vs1,
+                       zero,
+                       UR, urs0, urs1);
+              int j=0;
+              for (;j<m;) {
+                if (ats::abs(ei[j*eis]) < tol) {
+                  /// a real eigenvalue
+                  SerialNormalizeInternal::invoke(m, 
+                                                  UR+j*urs1, urs0);
+                  j+=1;
+                } else {
+                  /// a complex pair of eigenvalues
+                  SerialNormalizeInternal::invoke(m, 
+                                                  UR+(j  )*urs1, urs0,
+                                                  UR+(j+1)*urs1, urs0);
+                  j+=2;
+                }
+              }
+            }
+            
+            /// Left eigenvectors V stores V
+            if (is_UL) {
+              SerialLeftEigenvectorFromSchurInternal
+                ::invoke(m, 
+                         A, as0, as1,
+                         V, vs0, vs1,
+                         ww,
+                         blks);
+                            
+              /// V^-1 (QZ)^H 
+              SerialGemmInternal<Algo::Gemm::Unblocked>::
+                invoke(m, m, m,
+                       one, 
+                       V, vs0, vs1,
+                       QZ, qzs1, qzs0,
+                       zero,
+                       UL, uls0, uls1);            
+              
+              int i=0;
+              for (;i<m;) {
+                /// normalize row vectors
+                if (ats::abs(ei[i*eis]) < tol) {
+                  /// a real eigenvalue
+                  SerialNormalizeInternal::invoke(m, 
+                                                  UL+i*uls0, uls1);
+                  i+=1;
+                } else {
+                  /// a complex pair of eigenvalues
+                  SerialNormalizeInternal::invoke(m, 
+                                                  UL+(i  )*uls0, uls1,
+                                                  UL+(i+1)*uls0, uls1);
+                  i+=2;
+                }
+              }              
+            }
+            w_now -= (m*m+2*m); wlen_now += (m*m+2*m);
+          }
+          // deallocate blks
+          w_now -= m; wlen_now += m;
+        }
+        /// deallocate QZ
+        w_now -= (m*m); wlen_now += (m*m);
+
+        assert( (w == w_now) && "Eigendecomposition: workspace tracking fails");
+        assert( (wlen == wlen_now) && "Eigendecomposition: workspace counting fails");
+        return 0;
+      }
+    };
+
+
+  }/// end namespace Experimental
+} /// end namespace KokkosBatched
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_Team_Impl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_Team_Impl.hpp
new file mode 100644
index 000000000000..0b5304b7d049
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Eigendecomposition_Team_Impl.hpp
@@ -0,0 +1,50 @@
+#ifndef __KOKKOSBATCHED_EIGENDECOMPOSITION_TEAM_IMPL_HPP__
+#define __KOKKOSBATCHED_EIGENDECOMPOSITION_TEAM_IMPL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_Eigendecomposition_Team_Internal.hpp"
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Team Impl
+    /// =========
+
+    template<typename MemberType>
+    struct TeamEigendecomposition {
+      template<typename AViewType,
+               typename EViewType,
+               typename UViewType,
+               typename WViewType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const MemberType &member, 
+             const AViewType &A,
+             const EViewType &er, const EViewType &ei,
+             const UViewType &UL, const UViewType &UR,
+             const WViewType &W) {
+        /// view checking
+        const int m = A.extent(0);
+        assert(m == A.extent(1)  && "Eigendecomposition: A is not square");
+        assert(m == er.extent(0) && "Eigendecomposition: Length of er does not match to A's dimension");
+        assert(m == ei.extent(0) && "Eigendecomposition: Length of ei does not match to A's dimension");
+        assert(m == UL.extent(0) && "Eigendecomposition: Length of UL does not match to A's dimension");
+        assert(m == UL.extent(1) && "Eigendecomposition: Width of UL does not match to A's dimension");
+        assert(m == UR.extent(0) && "Eigendecomposition: Length of UR does not match to A's dimension");
+        assert(m == UR.extent(1) && "Eigendecomposition: Width of UR does not match to A's dimension");
+        assert(W.extent(0) >= (2*m*m+5*m) && "Eigendecomposition: workspace size is too small");
+        assert(W.stride(0) == 1  && "Eigendecomposition: Provided workspace is not contiguous");
+    
+        return 0;
+      }
+    };
+
+
+  }/// end namespace Experimental
+} /// end namespace KokkosBatched
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Eigenvalue_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Eigenvalue_Serial_Internal.hpp
new file mode 100644
index 000000000000..bd1cecb5c832
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Eigenvalue_Serial_Internal.hpp
@@ -0,0 +1,225 @@
+#ifndef __KOKKOSBATCHED_EIGENVALUE_SERIAL_INTERNAL_HPP__
+#define __KOKKOSBATCHED_EIGENVALUE_SERIAL_INTERNAL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_WilkinsonShift_Serial_Internal.hpp"
+#include "KokkosBatched_Schur2x2_Serial_Internal.hpp"
+#include "KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp"
+#include "KokkosBatched_Francis_Serial_Internal.hpp"
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial Internal Impl
+    /// ==================== 
+    ///
+    /// this impl follows the flame interface of householder transformation
+    ///
+    struct SerialEigenvalueInternal {
+      /// Given a strictly Hessenberg matrix H (m x m), this computes all eigenvalues
+      /// using the Francis method and stores them into a vector e. This routine does 
+      /// not scale nor balance the matrix for the numerical stability.
+      /// 
+      /// Parameters:
+      ///   [in]m 
+      ///     A dimension of the square matrix H.
+      ///   [in/out]H, [in]hs0, [in]hs1 
+      ///     Real Hessenberg matrix H(m x m) with strides hs0 and hs1.
+      ///     Entering the routine, H is assumed to have a upper Hessenberg form, where
+      ///     all subdiagonals are zero. The matrix is overwritten on exit.
+      ///   [out]er, [in]ers, [out]ei, [in]eis
+      ///     A complex vector er(m)+ei(m)i with a stride ers and eis to store computed eigenvalues.
+      ///     For a complex eigen pair, it stores a+bi and a-bi consecutively. 
+      ///   [in]restart(false)
+      ///     With a restart option, the routine assume that the matrix H and the vector e 
+      ///     contain the partial results from the previous run. When m = 1 or 2, this option
+      ///     won't work as the routine always computes the all eigenvalues.
+      ///   [in]max_iteration(300)
+      ///     Unlike LAPACK which uses various methods for different types of matrices,
+      ///     this routine uses the Francis method only. A user can set the maximum number 
+      ///     of iterations. When it reaches the maximum iteration counts without converging 
+      ///     all eigenvalues, the routine returns -1. 
+      template<typename RealType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const int m,
+             /* */ RealType * H, const int hs0, const int hs1,
+             /* */ RealType * er, const int ers,
+             /* */ RealType * ei, const int eis,
+             const bool restart = false,
+             const int user_max_iteration = -1) {
+        typedef RealType real_type;
+        typedef Kokkos::Details::ArithTraits<real_type> ats;
+        const real_type zero(0), nan(ats::nan()), tol = 1e2*ats::epsilon();
+        const int max_iteration = user_max_iteration < 0 ? 300 : user_max_iteration;
+
+        int r_val = 0;
+        if (restart) {
+          if (m <= 2) {
+            Kokkos::abort("Error: restart option cannot be used for m=1 or m=2");
+          }
+        } else {
+          for (int i=0;i<m;++i) 
+            er[i*ers] = nan; 
+        }
+
+        const int hs = hs0+hs1;  /// diagonal stride        
+        switch (m) {
+        case 0: { /* do nothing */ break; }
+        case 1: { er[0] = H[0]; ei[0] = zero; break; }
+        case 2: {
+          /// compute eigenvalues from the characteristic determinant equation
+          bool is_complex;
+          Kokkos::complex<real_type> lambda1, lambda2;
+          SerialWilkinsonShiftInternal::invoke(H[0],   H[hs1], 
+                                               H[hs0], H[hs],
+                                               &lambda1, &lambda2,
+                                               &is_complex);
+          er[0] = lambda1.real(); ei[0] = lambda1.imag();
+          er[1] = lambda2.real(); ei[1] = lambda2.imag();
+          break;
+        }
+        default: {
+          /// Francis method 
+          int iter(0);             /// iteration count
+          bool converge = false;   /// bool to check all eigenvalues are converged
+
+          while (!converge && iter < max_iteration) {
+            /// Step 1: find a set of unrevealed eigenvalues 
+            int cnt = 1;
+            
+            /// find mbeg (first nonzero subdiag value)
+            for (;cnt<m;++cnt) {
+              const auto val = ats::abs(*(H+cnt*hs-hs1));
+              if (val > tol) break;
+            }
+            const int mbeg = cnt-1;
+            
+            /// find mend (first zero subdiag value)
+            for (;cnt<m;++cnt) {
+              const auto val = ats::abs(*(H+cnt*hs-hs1));
+              if (val < tol) break;              
+            }
+            const int mend = cnt;
+            const int mdiff = mend - mbeg;
+
+            /// Step 2: if there exist non-converged eigen values
+            if (1 < mdiff) {
+#             if 0 /// implicit QR with shift for testing 
+              {
+                /// Rayleigh quotient shift 
+                const real_type shift = *(H+(mend-1)*hs); 
+                SerialHessenbergQR_WithShiftInternal::invoke(0, mdiff, mdiff, 
+                                                             H+hs*mbeg, hs0, hs1,
+                                                             shift);
+                real_type *sub2x2 = H+(mend-2)*hs;
+                const auto     val = ats::abs(sub2x2[hs0]);
+                if (val < tol) { /// this eigenvalue converges
+                  er[(mend-1)*ers] = sub2x2[hs]; ei[(mend-1)*eis] = zero;
+                }
+              }
+#             endif
+
+#             if 1 /// Francis double shift method
+              {
+                /// find a complex eigen pair
+                Kokkos::complex<real_type> lambda1, lambda2;
+                bool is_complex;
+                real_type *sub2x2 = H+(mend-2)*hs;
+                if (2 == mdiff) {
+                  SerialWilkinsonShiftInternal::invoke(sub2x2[0],   sub2x2[hs1], 
+                                                       sub2x2[hs0], sub2x2[hs],
+                                                       &lambda1, &lambda2,
+                                                       &is_complex);                    
+                  sub2x2[hs0] = zero;
+
+                  /// eigenvalues are from wilkinson shift
+                  er[(mbeg+0)*ers] = lambda1.real(); ei[(mbeg+0)*eis] = lambda1.imag();
+                  er[(mbeg+1)*ers] = lambda2.real(); ei[(mbeg+1)*eis] = lambda2.imag();
+                } else {
+                  SerialWilkinsonShiftInternal::invoke(sub2x2[0],   sub2x2[hs1], 
+                                                       sub2x2[hs0], sub2x2[hs],
+                                                       &lambda1, &lambda2,
+                                                       &is_complex);                    
+                  
+                  SerialFrancisInternal::invoke(0, mdiff, mdiff, 
+                                                H+hs*mbeg, hs0, hs1,
+                                                lambda1, lambda2,
+                                                is_complex);
+                  /* */ auto    &val1 = *(sub2x2+hs0);
+                  /* */ auto    &val2 = *(sub2x2-hs1);
+                  const auto abs_val1 = ats::abs(val1);
+                  const auto abs_val2 = ats::abs(val2);
+
+                  /// convergence check
+                  if (abs_val1 < tol) { 
+                    er[(mend-1)*ers] = sub2x2[hs]; ei[(mend-1)*eis] = zero;
+                    val1 = zero;
+                  } else if (abs_val2 < tol) {
+                    er[(mend-2)*ers] = lambda1.real(); ei[(mend-2)*eis] = lambda1.imag();
+                    er[(mend-1)*ers] = lambda2.real(); ei[(mend-1)*eis] = lambda2.imag();
+
+                    val1 = zero;
+                    val2 = zero;
+                  }
+                }
+              }
+#             endif
+                                             
+            } else {
+              /// all eigenvalues are converged
+              converge = true;
+            }
+            ++iter;
+          }
+          /// Step 3: record missing real eigenvalues from the diagonals
+          if (converge) {
+            // record undetected eigenvalues
+            for (int i=0;i<m;++i) 
+              if (ats::isNan(er[i*ers])) { 
+                er[i*ers] = H[i*hs]; 
+                ei[i*eis] = zero;
+              }
+            r_val = 0;
+          } else {
+            r_val = -1;
+          }
+          break;
+        }          
+        }
+        return r_val;
+      }
+
+      /// complex interface
+      template<typename RealType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const int m,
+             /* */ RealType * H, const int hs0, const int hs1,
+             /* */ Kokkos::complex<RealType> * e, const int es,
+             const int max_iteration = 300,
+             const RealType user_tolerence = RealType(-1),
+             const bool restart = false) {
+        RealType * er = (RealType*)e; 
+        RealType * ei = er+1;
+        const int two_es = 2*es;
+        return invoke(m,
+                      H, hs0, hs1, 
+                      er, two_es,
+                      ei, two_es,
+                      user_tolerence,
+                      restart,
+                      max_iteration);
+      }
+    };
+
+
+
+  }/// end namespace Experimental
+} /// end namespace KokkosBatched
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Francis_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Francis_Serial_Internal.hpp
new file mode 100644
index 000000000000..b1e55a543e0e
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Francis_Serial_Internal.hpp
@@ -0,0 +1,215 @@
+#ifndef __KOKKOSBATCHED_FRANCIS_SERIAL_INTERNAL_HPP__
+#define __KOKKOSBATCHED_FRANCIS_SERIAL_INTERNAL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_Givens_Serial_Internal.hpp"
+#include "KokkosBatched_ApplyGivens_Serial_Internal.hpp"
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial Internal Impl
+    /// ====================
+    ///
+    /// this impl follows the flame interface of householder transformation
+    ///
+    struct SerialFrancisInternal {
+      template<typename ValueType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const int mbeg, const int mend, const int morg,
+             /* */ ValueType * HH, const int hs0, const int hs1,
+             const Kokkos::complex<ValueType> lambda1, 
+             const Kokkos::complex<ValueType> lambda2,
+             const bool is_complex,
+             /* */ Kokkos::pair<ValueType,ValueType> * GG, const bool request_schur) {
+        typedef ValueType value_type;
+        
+        const int hs = hs0+hs1;
+        const value_type one(1), zero(0);
+        const Kokkos::pair<value_type,value_type> identity(one,zero);
+
+        /// redefine variables
+        const int m = mend-mbeg, mrst = morg-mend, mbeg_mult_hs0 = mbeg*hs0;
+        value_type *H = HH+hs*mbeg;
+        
+        /// initialize Gs
+        Kokkos::pair<value_type,value_type> *Gs = NULL;
+        if (request_schur) {
+          Gs = (Kokkos::pair<value_type,value_type> *)(GG+mbeg*2); 
+          for (int i=0;i<morg;++i) {
+            GG[2*i  ] = identity;
+            GG[2*i+1] = identity;
+          }
+        }
+
+        /// Given a strict Hessenberg matrix H (m x m),
+        /// it computes a single implicit QR step with a given shift
+        /// - it assumes H has zeros on subdiagonal entries (
+        /// givens rotation is defined as G = [gamma -sigma
+        ///                                    sigma  gamma]
+        ///   G' [chi1 chi2]^t = [alpha 0]^T
+        /// where G is stored as a pair of gamma and sigma
+        Kokkos::pair<value_type,value_type> G[2];
+        
+        /// 0. compute 1st double shift vector
+        value_type v[3];
+        {
+          // this needs m>=3
+          // v = M e_1 = (H*H - 2 Re(lambda) H + |lambda|^2 I)e_1
+          value_type s, t;
+          const value_type 
+            h00 = H[0*hs0+0*hs1], h01 = H[0*hs0+1*hs1], 
+            h10 = H[1*hs0+0*hs1], h11 = H[1*hs0+1*hs1], 
+            /* */                 h21 = H[2*hs0+1*hs1];          
+          if (is_complex) {
+            s = 2*lambda1.real();
+            t = lambda1.real()*lambda1.real()+lambda1.imag()*lambda1.imag();
+          } else {
+            const value_type val = H[(m-1)*hs];
+            const auto dist_lambda1 = Kokkos::Details::ArithTraits<value_type>::abs(lambda1.real() - val);
+            const auto dist_lambda2 = Kokkos::Details::ArithTraits<value_type>::abs(lambda2.real() - val);
+            const value_type lambda = dist_lambda1 < dist_lambda2 ? lambda1.real() : lambda2.real();
+            s = 2*lambda;
+            t = lambda*lambda;
+          }
+          v[0] = h00*h00+h01*h10 /* H^2 e_1 */ - s*h00 /* 2 Re(lambda) */ + t;
+          v[1] = h10*h00+h11*h10 /*         */ - s*h10;
+          v[2] = h21*h10;
+        }
+        
+        /// 1. compute the first two givens rotations that introduces a bulge
+        {
+          SerialGivensInternal::invoke(v[0], v[1],
+                                       &G[0],
+                                       &v[0]);
+          SerialGivensInternal::invoke(v[0], v[2],
+                                       &G[1],
+                                       &v[0]);
+          // record 
+          if (request_schur) {
+            Gs[0] = G[0];
+            Gs[1] = G[1];
+          }
+
+          // apply G' from left and right
+          G[0].second = -G[0].second;
+          G[1].second = -G[1].second; 
+
+          const int mm = m < 4 ? m : 4, nn = m;
+          value_type *Hs = H-mbeg_mult_hs0;
+          SerialApplyLeftRightGivensInternal
+            ::invoke (G[0], G[1], 
+                      mm+mbeg, nn+mrst,
+                      H,  H +hs0,H +2*hs0,
+                      Hs, Hs+hs1,Hs+2*hs1,
+                      hs0, hs1);
+        }
+
+        /// 1. chase the bulge
+
+        // partitions used for loop iteration
+        Partition2x2<value_type> H_part2x2(hs0, hs1);
+        Partition3x3<value_type> H_part3x3(hs0, hs1);
+
+        // initial partition of A where ATL has a zero dimension
+        int m_htl = 1;
+        H_part2x2.partWithATL(H, m, m, m_htl, m_htl);
+        for (;m_htl<(m-2);++m_htl) {
+          // part 2x2 into 3x3
+          H_part3x3.partWithABR(H_part2x2, 1, 1);
+          /// -----------------------------------------------------
+          value_type *chi1 = H_part3x3.A11-hs1;
+          value_type *chi2 = chi1+hs0; 
+          value_type *chi3 = chi2+hs0; 
+
+          SerialGivensInternal::invoke(*chi1, *chi2,
+                                       &G[0],
+                                       chi1); *chi2 = zero;
+          SerialGivensInternal::invoke(*chi1, *chi3,
+                                       &G[1],
+                                       chi1); *chi3 = zero;
+          // record 
+          if (request_schur) {
+            Gs[2*m_htl  ] = G[0];
+            Gs[2*m_htl+1] = G[1];
+          }
+
+          G[0].second = -G[0].second;
+          G[1].second = -G[1].second;
+
+          const int nn = m-m_htl, mtmp = m_htl+4, mm = mtmp < m ? mtmp : m;
+          value_type *a1t = H_part3x3.A11;
+          value_type *a2t = a1t+hs0;
+          value_type *a3t = a2t+hs0;
+          value_type *a1  = H_part3x3.A01-mbeg_mult_hs0;
+          value_type *a2  = a1+hs1;
+          value_type *a3  = a2+hs1;
+
+          SerialApplyLeftRightGivensInternal
+            ::invoke (G[0], G[1], 
+                      mm+mbeg, nn+mrst,
+                      a1t, a2t, a3t,
+                      a1,  a2,  a3,
+                      hs0, hs1);
+          /// -----------------------------------------------------
+          H_part2x2.mergeToATL(H_part3x3);
+        }
+
+        // last 3x3 block
+        {
+          // part 2x2 into 3x3
+          H_part3x3.partWithABR(H_part2x2, 1, 1);
+          /// -----------------------------------------------------
+          value_type *chi1 = H_part3x3.A11-hs1;
+          value_type *chi2 = chi1+hs0; 
+          SerialGivensInternal::invoke(*chi1, *chi2,
+                                       &G[0],
+                                       chi1); *chi2 = zero;
+          Gs[2*m_htl  ] = G[0];
+          Gs[2*m_htl+1] = identity;
+
+          G[0].second = -G[0].second; 
+
+          const int mm = m, nn = 2;
+          value_type *a1t = H_part3x3.A11;
+          value_type *a2t = a1t+hs0;
+          value_type *a1  = H_part3x3.A01-mbeg_mult_hs0;
+          value_type *a2  = a1+hs1;
+          SerialApplyLeftRightGivensInternal
+            ::invoke (G[0], 
+                      mm+mbeg, nn+mrst,
+                      a1t, a2t,
+                      a1,  a2,
+                      hs0, hs1);
+
+          /// -----------------------------------------------------
+          H_part2x2.mergeToATL(H_part3x3);
+        }
+
+        return 0;
+      }
+
+      template<typename ValueType>
+      KOKKOS_FORCEINLINE_FUNCTION
+      static int
+      invoke(const int mbeg, const int mend, const int morg,
+             /* */ ValueType * HH, const int hs0, const int hs1,
+             const Kokkos::complex<ValueType> lambda1, 
+             const Kokkos::complex<ValueType> lambda2,
+             const bool is_complex) {
+        return invoke(mbeg, mend, morg,
+                      HH, hs0, hs1,
+                      lambda1, lambda2, is_complex,
+                      (Kokkos::pair<ValueType,ValueType>*)NULL, false);
+      }
+    };
+
+  }//  end namespace Experimental
+} // end namespace KokkosBatched
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Gemm_Decl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Gemm_Decl.hpp
index cd5f1bd302ed..a4c2ead53f75 100644
--- a/packages/kokkos-kernels/src/batched/KokkosBatched_Gemm_Decl.hpp
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Gemm_Decl.hpp
@@ -4,6 +4,8 @@
 
 /// \author Kyungjoo Kim (kyukim@sandia.gov)
 
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_Vector.hpp"
 
 namespace KokkosBatched {
   namespace Experimental {
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Gemv_Decl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Gemv_Decl.hpp
index ed910469dc52..98ab2916dbd3 100644
--- a/packages/kokkos-kernels/src/batched/KokkosBatched_Gemv_Decl.hpp
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Gemv_Decl.hpp
@@ -4,6 +4,9 @@
 
 /// \author Kyungjoo Kim (kyukim@sandia.gov)
 
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_Vector.hpp"
+
 
 namespace KokkosBatched {
   namespace Experimental {
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Givens_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Givens_Serial_Internal.hpp
new file mode 100644
index 000000000000..ebc6515cafe6
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Givens_Serial_Internal.hpp
@@ -0,0 +1,70 @@
+#ifndef __KOKKOSBATCHED_GIVENS_SERIAL_INTERNAL_HPP__
+#define __KOKKOSBATCHED_GIVENS_SERIAL_INTERNAL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial Internal Impl
+    /// ==================== 
+    ///
+    /// this impl follows the flame interface of householder transformation
+    ///
+    struct SerialGivensInternal {
+      template<typename ValueType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const ValueType   chi1,
+             const ValueType   chi2, 
+             /* */ Kokkos::pair<ValueType,ValueType> * G, 
+             /* */ ValueType * chi1_new) {
+        typedef ValueType value_type;        
+        const value_type zero(0), one(1);
+        /// compute G = [ gamma -sigma;
+        ///               sigma  gamma ];
+        /// G.first = gamma and G.second = sigma
+        /// this rotation satisfy the following
+        ///   G' [chi1; = [ alpha;
+        ///       chi2]     zero ];
+        value_type cs, sn, r;
+        if        (chi2 == zero) {
+          r  = chi1;
+          cs = one;
+          sn = zero;
+        } else if (chi1 == zero) {
+          r  = chi2;
+          cs = zero;
+          sn = one;
+        } else {
+          // here we do not care overflow caused by the division although it is probable....
+          r = Kokkos::Details::ArithTraits<value_type>::sqrt(chi1*chi1 + chi2*chi2);
+          cs = chi1/r;
+          sn = chi2/r;
+
+          if (Kokkos::Details::ArithTraits<value_type>::abs(chi1) > 
+              Kokkos::Details::ArithTraits<value_type>::abs(chi2) && cs < zero) {
+            cs = -cs;
+            sn = -sn;
+            r  = -r;
+          }
+
+        }
+
+        G->first = cs;
+        G->second = sn;
+        *chi1_new = r;
+
+        return 0;
+      }
+    };
+
+  }//  end namespace Experimental
+} // end namespace KokkosBatched
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp
new file mode 100644
index 000000000000..dc52f03486a2
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp
@@ -0,0 +1,59 @@
+#ifndef __KOKKOSBATCHED_HESSENBERG_FORM_Q_SERIAL_INTERNAL_HPP__
+#define __KOKKOSBATCHED_HESSENBERG_FORM_Q_SERIAL_INTERNAL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_Set_Internal.hpp"
+#include "KokkosBatched_SetIdentity_Internal.hpp"
+#include "KokkosBatched_ApplyQ_Serial_Internal.hpp"
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial Internal Impl
+    /// ==================== 
+    ///
+    /// this impl follows the flame interface of householder transformation
+    ///
+    struct SerialHessenbergFormQInternal {
+      template<typename ValueType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const int m, 
+             const int k, 
+             /* */ ValueType * A, const int as0, const int as1,
+             /* */ ValueType * t, const int ts,
+             /* */ ValueType * Q, const int qs0, const int qs1,
+             /* */ ValueType * w,
+             const bool is_Q_zero = false) {
+        typedef ValueType value_type;
+        
+        /// Given a matrix A that includes Hessenberg factorization
+        /// it forms a unitary matrix Q 
+        ///   B = Q = (H0 H1 H2 H3 ... H(k-2)) I
+        /// where
+        ///   A is m x k (holding H0, H1 ... H(k-2)
+        ///   t is k x 1 
+        ///   B is m x m
+        // set identity
+        if (is_Q_zero) 
+          SerialSetInternal::invoke(m, value_type(1), Q, qs0+qs1);
+        else
+          SerialSetIdentityInternal::invoke(m, Q, qs0, qs1);
+        
+        return SerialApplyQ_LeftNoTransForwardInternal
+          ::invoke(m-1, m-1, k-1,
+                   A+as0, as0, as1, 
+                   t, ts,
+                   Q+qs0+qs1, qs1, qs0,
+                   w);
+      }
+    };
+
+  }//  end namespace Experimental
+} // end namespace KokkosBatched
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp
new file mode 100644
index 000000000000..a96661febf81
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp
@@ -0,0 +1,139 @@
+#ifndef __KOKKOSBATCHED_HESSENBERG_QR_WITH_SHIFT_SERIAL_INTERNAL_HPP__
+#define __KOKKOSBATCHED_HESSENBERG_QR_WITH_SHIFT_SERIAL_INTERNAL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_Givens_Serial_Internal.hpp"
+#include "KokkosBatched_ApplyGivens_Serial_Internal.hpp"
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial Internal Impl
+    /// ====================
+    ///
+    /// this impl follows the flame interface of householder transformation
+    ///
+    struct SerialHessenbergQR_WithShiftInternal {
+      template<typename ValueType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const int mbeg, const int mend, const int morg,
+             /* */ ValueType * HH, const int hs0, const int hs1,
+             const ValueType shift,
+             /* */ Kokkos::pair<ValueType,ValueType> * GG, const bool request_schur) {
+        typedef ValueType value_type;
+        //typedef Kokkos::Details::ArithTraits<value_type> ats;
+
+        const int hs = hs0+hs1;
+        const value_type zero(0), one(1);
+        const Kokkos::pair<value_type,value_type> identity(one,zero);
+
+        /// redefine variables
+        const int m = mend-mbeg, mbeg_mult_hs0 = mbeg*hs0;
+        value_type *H = HH+mbeg*hs;
+
+        /// initialize Gs
+        Kokkos::pair<value_type,value_type> *Gs = NULL;
+        if (request_schur) {
+          for (int i=0;i<morg;++i) GG[i] = identity;
+          Gs = GG+mbeg;
+        }
+        
+        /// Given a strict Hessenberg matrix H (m x m),
+        /// it computes a single implicit QR step with a given shift
+        /// - it assumes H has zeros on subdiagonal entries (
+        /// givens rotation is defined as G = [gamma -sigma
+        ///                                    sigma  gamma]
+        ///   G' [chi1 chi2]^t = [alpha 0]^T
+        /// where G is stored as a pair of gamma and sigma
+        Kokkos::pair<value_type,value_type> G;
+
+        /// 0. compute the first givens rotation that introduces a bulge
+        {
+          const value_type chi1 = H[0] - shift;
+          const value_type chi2 = H[hs0];
+          /* */ value_type alpha;
+          SerialGivensInternal::invoke(chi1, chi2,
+                                       &G,
+                                       &alpha);
+          // record G
+          if (request_schur) Gs[0] = G;          
+
+          value_type *h11 = H;
+          value_type *h21 = H + hs0;
+          value_type *h12 = H + hs1;
+
+          // apply G' from left
+          G.second = -G.second; // transpose G
+          const int nn = m;
+          SerialApplyLeftGivensInternal::invoke (G, nn+(morg-mend),
+                                                 h11, hs1,
+                                                 h21, hs1);
+          
+          // apply (G')' from right
+          const int mm = m < 3 ? m : 3;
+          SerialApplyRightGivensInternal::invoke(G, mm+mbeg,
+                                                 h11-mbeg_mult_hs0, hs0,
+                                                 h12-mbeg_mult_hs0, hs0);
+        }
+
+        /// 1. chase the bulge
+
+        // partitions used for loop iteration
+        Partition2x2<value_type> H_part2x2(hs0, hs1);
+        Partition3x3<value_type> H_part3x3(hs0, hs1);
+
+        // initial partition of A where ATL has a zero dimension
+        H_part2x2.partWithATL(H, m, m, 1, 1);
+
+        for (int m_htl=1;m_htl<(m-1);++m_htl) {
+          // part 2x2 into 3x3
+          H_part3x3.partWithABR(H_part2x2, 1, 1);
+          //const int n_hbr = m - m_htl;
+          /// -----------------------------------------------------
+          value_type *chi1 = H_part3x3.A11-hs1;
+          value_type *chi2 = H_part3x3.A21-hs1;
+          SerialGivensInternal::invoke(*chi1, *chi2,
+                                       &G,
+                                       chi1); *chi2 = zero;
+          // record G
+          if (request_schur) Gs[m_htl] = G;
+          
+          G.second = -G.second; // transpose G
+
+          const int nn = m - m_htl;
+          SerialApplyLeftGivensInternal::invoke (G, nn+(morg-mend),
+                                                 H_part3x3.A11, hs1,
+                                                 H_part3x3.A21, hs1);
+
+          const int mtmp = m_htl+3, mm = mtmp < m ? mtmp : m;
+          SerialApplyRightGivensInternal::invoke(G, mm+mbeg,
+                                                 H_part3x3.A01-mbeg_mult_hs0, hs0,
+                                                 H_part3x3.A02-mbeg_mult_hs0, hs0);
+          /// -----------------------------------------------------
+          H_part2x2.mergeToATL(H_part3x3);
+        }
+        return 0;
+      }
+
+      template<typename ValueType>
+      KOKKOS_FORCEINLINE_FUNCTION
+      static int
+      invoke(const int mbeg, const int mend, const int morg,
+             /* */ ValueType * HH, const int hs0, const int hs1,
+             const ValueType shift) {
+        return invoke(mbeg, mend, morg, 
+                      HH, hs0, hs1, shift,
+                      (Kokkos::pair<ValueType,ValueType>*)NULL, false);
+        
+      }
+    };
+    
+  }//  end namespace Experimental
+} // end namespace KokkosBatched
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Hessenberg_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Hessenberg_Serial_Internal.hpp
new file mode 100644
index 000000000000..5bd6b0cd1339
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Hessenberg_Serial_Internal.hpp
@@ -0,0 +1,107 @@
+#ifndef __KOKKOSBATCHED_HESSENBERG_SERIAL_INTERNAL_HPP__
+#define __KOKKOSBATCHED_HESSENBERG_SERIAL_INTERNAL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_Householder_Serial_Internal.hpp"
+#include "KokkosBatched_ApplyHouseholder_Serial_Internal.hpp"
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial Internal Impl
+    /// ==================== 
+    ///
+    /// this impl follows the flame interface of householder transformation
+    ///
+    struct SerialHessenbergInternal {
+      template<typename ValueType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const int m, // m = NumRows(A)
+             const int n, // n = NumCols(A)
+             /* */ ValueType * A, const int as0, const int as1,
+             /* */ ValueType * t, const int ts,
+             /* */ ValueType * w) {
+        typedef ValueType value_type;
+        
+        /// Given a matrix A, it computes hessenberg decomposition of the matrix
+        ///  - t is to store tau and w is for workspace
+        ///  - H = Q^H A Q and A = Q H Q^H
+
+        // partitions used for loop iteration 
+        Partition2x2<value_type> A_part2x2(as0, as1);
+        Partition3x3<value_type> A_part3x3(as0, as1);
+
+        Partition2x1<value_type> t_part2x1(ts);
+        Partition3x1<value_type> t_part3x1(ts);
+
+        // partitions used in loop body
+        Partition2x1<value_type> A21_part2x1(as0);
+        Partition2x1<value_type> A22_part2x1(as0);
+        Partition1x2<value_type> A2_part1x2 (as1);
+
+        // initial partition of A where ATL has a zero dimension
+        A_part2x2.partWithATL(A, m, n, 0, 0);
+        t_part2x1.partWithAT (t, m,    0   );
+
+        for (int m_atl=0;m_atl<m;++m_atl) {
+          // part 2x2 into 3x3
+          A_part3x3.partWithABR(A_part2x2, 1, 1);
+          const int m_A22 = m - m_atl - 1;
+          const int n_A22 = n - m_atl - 1;
+          
+          t_part3x1.partWithAB (t_part2x1, 1);
+          value_type *tau = t_part3x1.A1;
+
+          /// -----------------------------------------------------
+          if (m_A22 > 0) {
+            // partition A21 into 2x1
+            A21_part2x1.partWithAT(A_part3x3.A21, m_A22, 1);
+            
+            // perform householder transformation
+            const int m_A22_b = m_A22 - 1;
+            SerialLeftHouseholderInternal::invoke(m_A22_b,
+                                                  A21_part2x1.AT,
+                                                  A21_part2x1.AB, as0,
+                                                  tau);
+            
+            // partition A22 into 2x1
+            A22_part2x1.partWithAT(A_part3x3.A22, m_A22, 1);          
+            
+            // left apply householder to partitioned A22
+            SerialApplyLeftHouseholderInternal::invoke(m_A22_b, n_A22,
+                                                       tau,
+                                                       A21_part2x1.AB, as0,
+                                                       A22_part2x1.AT, as1,
+                                                       A22_part2x1.AB, as0, as1, 
+                                                       w);
+            
+            
+            // partition A*2 column into 1x2
+            A2_part1x2.partWithAL(A_part3x3.A02, n_A22, 1); 
+            
+            // right apply householder to A*2 colums
+            const int n_A22_r = n_A22 - 1;
+            SerialApplyRightHouseholderInternal::invoke(m, n_A22_r,
+                                                        tau,
+                                                        A21_part2x1.AB, as0,
+                                                        A2_part1x2.AL, as0,
+                                                        A2_part1x2.AR, as0, as1,
+                                                        w);
+          }
+          /// -----------------------------------------------------
+          A_part2x2.mergeToATL(A_part3x3);
+          t_part2x1.mergeToAT (t_part3x1);
+        }
+        return 0;
+      }
+    };
+
+  }//  end namespace Experimental
+} // end namespace KokkosBatched
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Householder_Decl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Householder_Decl.hpp
new file mode 100644
index 000000000000..dc06dd2e776b
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Householder_Decl.hpp
@@ -0,0 +1,31 @@
+#ifndef __KOKKOSBATCHED_HOUSEHOLDER_DECL_HPP__
+#define __KOKKOSBATCHED_HOUSEHOLDER_DECL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_Vector.hpp"
+
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial Householder 
+    ///
+
+    // level 1 operation (no blocking algorithm info avail)
+    template<typename ArgSide>
+    struct SerialHouseholder {
+      template<typename aViewType,
+               typename tauViewType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const aViewType &a,
+             const tauViewType &tau);
+    };
+
+  }
+}
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Householder_Serial_Impl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Householder_Serial_Impl.hpp
new file mode 100644
index 000000000000..ebc5da5393d4
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Householder_Serial_Impl.hpp
@@ -0,0 +1,37 @@
+#ifndef __KOKKOSBATCHED_HOUSEHOLDER_SERIAL_IMPL_HPP__
+#define __KOKKOSBATCHED_HOUSEHOLDER_SERIAL_IMPL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_Householder_Serial_Internal.hpp"
+
+
+namespace KokkosBatched {
+  namespace Experimental {
+
+    ///
+    /// Serial Impl
+    /// ===========
+
+    template<>
+    template<typename aViewType,
+             typename tauViewType>
+    KOKKOS_INLINE_FUNCTION
+    int
+    SerialHouseholder<Side::Left>::
+    invoke(const aViewType &a,
+           const tauViewType &tau) {
+      return SerialLeftHouseholderInternal::
+        invoke(a.extent(0)-1,
+               a.data(),
+               a.data()+a.stride(0), a.stride(0),
+               tau.data());
+    }
+        
+  }
+}
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Householder_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Householder_Serial_Internal.hpp
new file mode 100644
index 000000000000..208cfaaa454d
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Householder_Serial_Internal.hpp
@@ -0,0 +1,82 @@
+#ifndef __KOKKOSBATCHED_HOUSEHOLDER_SERIAL_INTERNAL_HPP__
+#define __KOKKOSBATCHED_HOUSEHOLDER_SERIAL_INTERNAL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial Internal Impl
+    /// ==================== 
+    ///
+    /// this impl follows the flame interface of householder transformation
+    ///
+    struct SerialLeftHouseholderInternal {
+      template<typename ValueType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const int m_x2, 
+             /* */ ValueType * chi1,
+             /* */ ValueType * x2, const int x2s,
+             /* */ ValueType * tau) {
+        typedef ValueType value_type;
+        typedef typename Kokkos::Details::ArithTraits<ValueType>::mag_type mag_type;
+        
+        const mag_type zero(0);
+        const mag_type half(0.5);
+        const mag_type one(1);
+        const mag_type minus_one(-1);
+
+        /// compute the 2norm of x2
+        mag_type norm_x2_square(0);
+        for (int i=0;i<m_x2;++i) {
+          const auto x2_at_i = x2[i*x2s];
+          norm_x2_square += x2_at_i*x2_at_i;
+        }
+        
+        /// if norm_x2 is zero, return with trivial values
+        if (norm_x2_square == zero) {
+          *chi1 = -(*chi1);
+          *tau = half;
+          
+          return 0;
+        }
+
+        /// compute magnitude of chi1, equal to norm2 of chi1
+        const mag_type norm_chi1 = Kokkos::Details::ArithTraits<value_type>::abs(*chi1);
+
+        /// compute 2 norm of x using norm_chi1 and norm_x2
+        const mag_type norm_x = Kokkos::Details::ArithTraits<mag_type>::sqrt(norm_x2_square + norm_chi1*norm_chi1);
+
+        /// compute alpha
+        const mag_type alpha = (*chi1 < 0 ? one : minus_one)*norm_x;
+
+        /// overwrite x2 with u2
+        const value_type chi1_minus_alpha = *chi1 - alpha;
+        const value_type inv_chi1_minus_alpha = one/chi1_minus_alpha;
+        for (int i=0;i<m_x2;++i) 
+          x2[i*x2s] *= inv_chi1_minus_alpha;
+
+        // later consider to use the following
+        // SerialScaleInternal::invoke(m_x2, inv_chi1_minus_alpha, x2, x2s);
+
+        /// compute tau
+        const mag_type chi1_minus_alpha_square = chi1_minus_alpha*chi1_minus_alpha;
+        *tau = half + half*(norm_x2_square/chi1_minus_alpha_square);
+
+        /// overwrite chi1 with alpha
+        *chi1 = alpha;
+
+        return 0;
+      }
+    };
+
+  }//  end namespace Experimental
+} // end namespace KokkosBatched
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_InverseLU_Decl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_InverseLU_Decl.hpp
index e7652ea445f1..c64799be5d8c 100644
--- a/packages/kokkos-kernels/src/batched/KokkosBatched_InverseLU_Decl.hpp
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_InverseLU_Decl.hpp
@@ -5,32 +5,57 @@
 /// \author Vinh Dang (vqdang@sandia.gov)
 
 #include "KokkosBatched_Vector.hpp"
+#include "KokkosBatched_Copy_Decl.hpp"
+#include "KokkosBatched_Copy_Impl.hpp"
+#include "KokkosBatched_SetIdentity_Decl.hpp"
+#include "KokkosBatched_SetIdentity_Impl.hpp"
+#include "KokkosBatched_SolveLU_Decl.hpp"
 
 namespace KokkosBatched {
   namespace Experimental {
       
     template<typename ArgAlgo>
     struct SerialInverseLU {
-      // no piv version
       template<typename AViewType,
-               typename WViewType>
+               typename wViewType>
       KOKKOS_INLINE_FUNCTION
       static int
       invoke(const AViewType &A,
-             const WViewType &W);
+             const wViewType &w) {
+        typedef typename wViewType::value_type value_type;
+        // workspace w is always 1D view; reinterpret it 
+        Kokkos::View<value_type**,Kokkos::LayoutRight,Kokkos::AnonymousSpace>
+          W(w.data(), A.extent(0), A.extent(1));
+
+        int r_val[3] = {};
+        r_val[0] = SerialCopy<Trans::NoTranspose>::invoke(A, W);
+        r_val[1] = SerialSetIdentity::invoke(A);
+        r_val[2] = SerialSolveLU<Trans::NoTranspose,ArgAlgo>::invoke(W, A);        
+        return r_val[0]+r_val[1]+r_val[2];
+      }
     };       
 
     template<typename MemberType,
              typename ArgAlgo>
     struct TeamInverseLU {
-      // no piv version
       template<typename AViewType,
-               typename WViewType>
+               typename wViewType>
       KOKKOS_INLINE_FUNCTION
       static int
       invoke(const MemberType &member, 
              const AViewType &A,
-             const WViewType &W);
+             const wViewType &w) {
+        typedef typename wViewType::value_type value_type;
+        // workspace w is always 1D view; reinterpret it 
+        Kokkos::View<value_type**,Kokkos::LayoutRight,Kokkos::AnonymousSpace>
+          W(w.data(), A.extent(0), A.extent(1));
+
+        int r_val[3] = {};
+        r_val[0] = TeamCopy<MemberType,Trans::NoTranspose>::invoke(member, A, W);
+        r_val[1] = TeamSetIdentity<MemberType>::invoke(member, A);
+        r_val[2] = TeamSolveLU<MemberType,Trans::NoTranspose,ArgAlgo>::invoke(member, W, A);        
+        return r_val[0]+r_val[1]+r_val[2];
+      }
     };       
       
   }
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_InverseLU_Serial_Impl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_InverseLU_Serial_Impl.hpp
index 8a815de40732..b2ad7bedb32f 100644
--- a/packages/kokkos-kernels/src/batched/KokkosBatched_InverseLU_Serial_Impl.hpp
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_InverseLU_Serial_Impl.hpp
@@ -68,90 +68,6 @@ namespace KokkosBatched {
     }
 #endif
 
-    template<>
-    template<typename AViewType,
-             typename WViewType>
-    KOKKOS_INLINE_FUNCTION
-    int
-    SerialInverseLU<Algo::InverseLU::Unblocked>::
-    invoke(const AViewType &A, 
-           const WViewType &W) {
-        static_assert(AViewType::rank == 2, "A should have two dimensions");
-        static_assert(WViewType::rank == 1, "W should have one dimension");
-        static_assert(std::is_same<typename AViewType::memory_space, typename WViewType::memory_space>::value, "A and W should be on the same memory space");
-        static_assert(!std::is_same<typename WViewType::array_layout, Kokkos::LayoutStride>::value, "W should be an contiguous 1D array");
-        assert(A.extent(0)*A.extent(1)*sizeof(typename AViewType::value_type) <= W.span()*sizeof(typename WViewType::value_type));
-        assert(A.extent(0)==A.extent(1));
-
-        typedef typename AViewType::value_type ScalarType;
-               
-        auto B = Kokkos::View<ScalarType**, Kokkos::LayoutLeft, typename WViewType::memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> >(W.data(), A.extent(0), A.extent(1));
-        
-        const ScalarType one(1.0);
-
-#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
-#pragma unroll
-#endif
-        for (size_t i=0;i<A.extent(1);++i) {
-            B(i,i) = one;
-        }
-        
-        //First, compute L inverse by solving the system L*Linv = I for Linv
-        //Second, compute A inverse by solving the system U*Ainv = Linv for Ainv
-        SerialSolveLU<Algo::SolveLU::Unblocked,Trans::NoTranspose>::invoke(A,B);
-		
-#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
-#pragma unroll
-#endif
-        for (size_t i=0;i<A.extent(0);++i)
-            for (size_t j=0;j<A.extent(1);++j)
-                A(i,j) = B(i,j);
-        
-        return 0;
-    }
-    
-    template<>
-    template<typename AViewType,
-             typename WViewType>
-    KOKKOS_INLINE_FUNCTION
-    int
-    SerialInverseLU<Algo::InverseLU::Blocked>::
-    invoke(const AViewType &A,
-           const WViewType &W) {
-        static_assert(AViewType::rank == 2, "A should have two dimensions");
-        static_assert(WViewType::rank == 1, "W should have one dimension");
-        static_assert(std::is_same<typename AViewType::memory_space, typename WViewType::memory_space>::value, "A and W should be on the same memory space");
-        static_assert(!std::is_same<typename WViewType::array_layout, Kokkos::LayoutStride>::value, "W should be an contiguous 1D array");
-        assert(A.extent(0)*A.extent(1)*sizeof(typename AViewType::value_type) <= W.span()*sizeof(typename WViewType::value_type));
-        assert(A.extent(0)==A.extent(1));
-
-        typedef typename AViewType::value_type ScalarType;
-        
-        auto B = Kokkos::View<ScalarType**, Kokkos::LayoutLeft, typename WViewType::memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> >(W.data(), A.extent(0), A.extent(1));
-        
-        const ScalarType one(1.0);
-
-#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
-#pragma unroll
-#endif
-        for (size_t i=0;i<A.extent(1);++i) {
-            B(i,i) = one;
-        }
-
-        //First, compute L inverse by solving the system L*Linv = I for Linv
-        //Second, compute A inverse by solving the system U*Ainv = Linv for Ainv
-        SerialSolveLU<Algo::SolveLU::Blocked,Trans::NoTranspose>::invoke(A,B);
-
-#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
-#pragma unroll
-#endif
-        for (size_t i=0;i<A.extent(0);++i)
-            for (size_t j=0;j<A.extent(1);++j)
-                A(i,j) = B(i,j);
-
-        return 0;
-    }
-
   }
 }
 
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_LU_Decl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_LU_Decl.hpp
index 3ffcedde366c..8113d4a65c17 100644
--- a/packages/kokkos-kernels/src/batched/KokkosBatched_LU_Decl.hpp
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_LU_Decl.hpp
@@ -4,6 +4,7 @@
 
 /// \author Kyungjoo Kim (kyukim@sandia.gov)
 
+#include "KokkosBatched_Util.hpp"
 #include "KokkosBatched_Vector.hpp"
 
 namespace KokkosBatched {
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp
new file mode 100644
index 000000000000..0f6bdcd7ac55
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp
@@ -0,0 +1,149 @@
+#ifndef __KOKKOSBATCHED_LEFT_EIGENVECTOR_FROM_SCHUR_SERIAL_INTERNAL_HPP__
+#define __KOKKOSBATCHED_LEFT_EIGENVECTOR_FROM_SCHUR_SERIAL_INTERNAL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_ShiftedTrsv_Serial_Internal.hpp"
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial Internal Impl
+    /// ==================== 
+    ///
+    /// this impl follows the flame interface of householder transformation
+    ///
+    struct SerialLeftEigenvectorFromSchurInternal {
+      /// Given a quasi upper triangular matrix S (m x m), this computes all left 
+      /// eigenvectors.
+      /// 
+      /// Parameters:
+      ///   [in]m 
+      ///     A dimension of the square matrix S.
+      ///   [in]S, [in]ss0, [in]ss1 
+      ///     A quasi upper triangular part of Schur decomposition which is computed 
+      ///       A = U^H S U
+      ///   [out]V, [in]vs0, [out]vs1 
+      ///     A set of left eigen vectors.
+      ///   [in]w
+      ///     contiguous workspace that can hold complex array (m)
+      template<typename ValueType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const int m,
+             /* */ ValueType * S, const int ss0, const int ss1,
+             /* */ ValueType * V, const int vs0, const int vs1,
+             /* */ ValueType * w,
+             const int * blks) {
+        typedef ValueType value_type;
+        typedef Kokkos::Details::ArithTraits<value_type> ats;
+        //typedef typename ats::mag_type mag_type;
+        typedef Kokkos::complex<value_type> complex_type;
+
+        const value_type zero(0), one(1);
+        /// SerialSetInternal::invoke(m, m, zero, V, vs0, vs1);
+
+        value_type *b = w; // consider complex case
+
+        /// partitions used for loop iteration
+        Partition2x2<value_type> S_part2x2(ss0, ss1);
+        Partition3x3<value_type> S_part3x3(ss0, ss1);
+        
+        Partition2x1<value_type> V_part2x1(vs0);
+        Partition3x1<value_type> V_part3x1(vs0);
+        
+        /// initial partition of S where ATL has a zero dimension
+        S_part2x2.partWithATL(S, m, m, 0, 0);
+        V_part2x1.partWithAT(V, m, 0);
+
+        //const mag_type tol = ats::epsilon();
+        int m_stl = 0;
+        for (;m_stl<(m-1);) {
+          /// part 2x2 into 3x3
+          const int mA11 = blks[m_stl];
+          assert( ((mA11 == 1) || (mA11 == 2)) && "LeftEigenvectorFromSchur: blk is not 1x1 nor 2x2");
+
+          S_part3x3.partWithABR(S_part2x2, mA11, mA11);
+          V_part3x1.partWithAB(V_part2x1, mA11);
+
+          const int m_stl_plus_mA11 = m_stl+mA11;
+          if (mA11 == 1) {
+            /// real eigenvalue 
+            const value_type lambda = *S_part3x3.A11;
+
+            /// initialize a left hand side
+            b[m_stl] = one;
+            for (int j=0;j<(m-m_stl_plus_mA11);++j) 
+              b[j+m_stl_plus_mA11] = -S_part3x3.A12[j*ss1];
+
+            /// perform shifted trsv (transposed)
+            SerialShiftedTrsvInternalLower::invoke(m-m_stl_plus_mA11, lambda,
+                                                   S_part3x3.A22, ss1, ss0,
+                                                   b+m_stl_plus_mA11, 1,
+                                                   blks+m_stl_plus_mA11);
+            
+            /// copy back to V (row wise copy)
+            for (int j=0;j<m_stl;++j) V_part3x1.A1[j*vs1] = zero;
+            for (int j=m_stl;j<m;++j) V_part3x1.A1[j*vs1] = b[j];                          
+          } else {
+            /// complex eigen pair  
+            const value_type 
+              alpha11 = S_part3x3.A11[0],
+              alpha12 = S_part3x3.A11[ss1],
+              alpha21 = S_part3x3.A11[ss0],
+              beta = ats::sqrt(-alpha12*alpha21);
+
+            const complex_type lambda(alpha11, beta);
+            complex_type * bc = (complex_type*)(b);
+            
+            /// initialize a left hand side
+            bc[m_stl  ] = complex_type(beta, zero);
+            bc[m_stl+1] = complex_type(zero, -alpha12);
+
+            const value_type * S_A12_a = S_part3x3.A12;
+            const value_type * S_A12_b = S_part3x3.A12 + ss0;
+            for (int j=0;j<(m-m_stl_plus_mA11);++j)
+              bc[j+m_stl_plus_mA11] = complex_type(-S_A12_a[j*ss1]*beta, S_A12_b[j*ss1]*alpha12);
+            
+            /// perform shifted trsv
+            SerialShiftedTrsvInternalLower::invoke(m-m_stl_plus_mA11, lambda,
+                                                   S_part3x3.A22, ss1, ss0,
+                                                   bc+m_stl_plus_mA11, 1,
+                                                   blks+m_stl_plus_mA11);
+            
+            /// copy back to V
+            value_type * V_A1_r = V_part3x1.A1;
+            value_type * V_A1_i = V_part3x1.A1 + vs0;
+            for (int j=0;j<m_stl;++j) { 
+              V_A1_r[j*vs1] = zero;
+              V_A1_i[j*vs1] = zero;
+            }
+            for (int j=m_stl;j<m;++j) { 
+              V_A1_r[j*vs1] = bc[j].real();              
+              V_A1_i[j*vs1] = bc[j].imag();
+            }              
+            /// ---------------------------------------------------
+          }
+          S_part2x2.mergeToATL(S_part3x3);
+          V_part2x1.mergeToAT(V_part3x1);
+          m_stl += mA11;
+        }
+        
+        /// case: m_stl = m-1
+        if (m_stl < m) {
+          value_type * VV = V+m_stl*vs0;
+          for (int j=0;j<m_stl;++j) VV[j*vs1] = zero;
+          VV[m_stl*vs1] = one;
+        }
+
+        return 0;
+      }
+    };
+
+  }/// end namespace Experimental
+} /// end namespace KokkosBatched
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Normalize_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Normalize_Internal.hpp
new file mode 100644
index 000000000000..cc2e582a61b3
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Normalize_Internal.hpp
@@ -0,0 +1,79 @@
+#ifndef __KOKKOSBATCHED_NORMALIZE_INTERNAL_HPP__
+#define __KOKKOSBATCHED_NORMALIZE_INTERNAL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial Internal Impl
+    /// ==================== 
+    struct SerialNormalizeInternal {
+      template<typename ValueType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const int m,
+             /* */ ValueType *__restrict__ v, const int vs) {
+        typedef ValueType value_type;
+        typedef Kokkos::Details::ArithTraits<value_type> ats;
+        typedef typename ats::mag_type mag_type;
+
+        mag_type norm(0);
+#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
+#pragma unroll
+#endif
+        for (int i=0;i<m;++i) {
+          const auto v_at_i = v[i*vs];
+          norm += ats::real(v_at_i*ats::conj(v_at_i));
+        }
+        norm = Kokkos::Details::ArithTraits<mag_type>::sqrt(norm);
+#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
+#pragma unroll
+#endif
+        for (int i=0;i<m;++i) 
+          v[i*vs] /= norm;
+
+        return 0;
+      }
+
+      template<typename RealType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const int m,
+             /* */ RealType *__restrict__ vr, const int vrs,
+             /* */ RealType *__restrict__ vi, const int vis) {
+        typedef RealType real_type;
+        typedef Kokkos::Details::ArithTraits<real_type> ats;
+        typedef typename ats::mag_type mag_type;
+
+        mag_type norm(0);
+#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
+#pragma unroll
+#endif
+        for (int i=0;i<m;++i) {
+          const auto vr_at_i = vr[i*vrs];
+          const auto vi_at_i = vi[i*vis];
+          norm += vr_at_i*vr_at_i+vi_at_i*vi_at_i;
+        }
+        norm = Kokkos::Details::ArithTraits<mag_type>::sqrt(norm);
+#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
+#pragma unroll
+#endif
+        for (int i=0;i<m;++i) { 
+          vr[i*vrs] /= norm;
+          vi[i*vis] /= norm;
+        }
+
+        return 0;
+      }
+    };
+
+  }//  end namespace Experimental
+} // end namespace KokkosBatched
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_QR_FormQ_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_QR_FormQ_Serial_Internal.hpp
new file mode 100644
index 000000000000..9241caad9949
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_QR_FormQ_Serial_Internal.hpp
@@ -0,0 +1,60 @@
+#ifndef __KOKKOSBATCHED_QR_FORM_Q_SERIAL_INTERNAL_HPP__
+#define __KOKKOSBATCHED_QR_FORM_Q_SERIAL_INTERNAL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_Set_Internal.hpp"
+#include "KokkosBatched_SetIdentity_Internal.hpp"
+#include "KokkosBatched_ApplyQ_Serial_Internal.hpp"
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial Internal Impl
+    /// ==================== 
+    ///
+    /// this impl follows the flame interface of householder transformation
+    ///
+    struct SerialQR_FormQ_Internal {
+      template<typename ValueType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const int m, 
+             const int k, 
+             /* */ ValueType * A, const int as0, const int as1,
+             /* */ ValueType * t, const int ts,
+             /* */ ValueType * Q, const int qs0, const int qs1,
+             /* */ ValueType * w,
+             const bool is_Q_zero = false) {
+        typedef ValueType value_type;
+        
+        /// Given a matrix A that includes QR factorization
+        /// it forms a unitary matrix Q 
+        ///   B = Q = (H0 H1 H2 H3 ... H(k-1)) I
+        /// where
+        ///   A is m x k (holding H0, H1 ... H(k-1)
+        ///   t is k x 1 
+        ///   B is m x m
+
+        // set identity
+        if (is_Q_zero)
+          SerialSetInternal::invoke(m, value_type(1), Q, qs0+qs1);
+        else
+          SerialSetIdentityInternal::invoke(m, Q, qs0, qs1);
+
+        return SerialApplyQ_LeftNoTransForwardInternal
+          ::invoke(m, m, k,
+                   A, as0, as1, 
+                   t, ts,
+                   Q, qs0, qs1,
+                   w);
+      }
+    };
+
+  }//  end namespace Experimental
+} // end namespace KokkosBatched
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_QR_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_QR_Serial_Internal.hpp
new file mode 100644
index 000000000000..2420ed2de8d1
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_QR_Serial_Internal.hpp
@@ -0,0 +1,80 @@
+#ifndef __KOKKOSBATCHED_QR_SERIAL_INTERNAL_HPP__
+#define __KOKKOSBATCHED_QR_SERIAL_INTERNAL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_Householder_Serial_Internal.hpp"
+#include "KokkosBatched_ApplyHouseholder_Serial_Internal.hpp"
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial Internal Impl
+    /// ==================== 
+    ///
+    /// this impl follows the flame interface of householder transformation
+    ///
+    struct SerialQR_Internal {
+      template<typename ValueType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const int m, // m = NumRows(A)
+             const int n, // n = NumCols(A)
+             /* */ ValueType * A, const int as0, const int as1,
+             /* */ ValueType * t, const int ts,
+             /* */ ValueType * w) {
+        typedef ValueType value_type;
+        
+        /// Given a matrix A, it computes QR decomposition of the matrix
+        ///  - t is to store tau and w is for workspace
+
+        // partitions used for loop iteration 
+        Partition2x2<value_type> A_part2x2(as0, as1);
+        Partition3x3<value_type> A_part3x3(as0, as1);
+
+        Partition2x1<value_type> t_part2x1(ts);
+        Partition3x1<value_type> t_part3x1(ts);
+
+        // initial partition of A where ATL has a zero dimension
+        A_part2x2.partWithATL(A, m, n, 0, 0);
+        t_part2x1.partWithAT (t, m, 0);
+
+        for (int m_atl=0;m_atl<m;++m_atl) {
+          // part 2x2 into 3x3
+          A_part3x3.partWithABR(A_part2x2, 1, 1);
+          const int m_A22 = m - m_atl - 1;
+          const int n_A22 = n - m_atl - 1;
+          
+          t_part3x1.partWithAB (t_part2x1, 1);
+          value_type *tau = t_part3x1.A1;
+
+          /// -----------------------------------------------------
+
+          // perform householder transformation
+          SerialLeftHouseholderInternal::invoke(m_A22,
+                                                A_part3x3.A11,
+                                                A_part3x3.A21, as0,
+                                                tau);
+            
+          // left apply householder to A22
+          SerialApplyLeftHouseholderInternal::invoke(m_A22, n_A22,
+                                                     tau,
+                                                     A_part3x3.A21, as0,
+                                                     A_part3x3.A12, as1,
+                                                     A_part3x3.A22, as0, as1, 
+                                                     w);            
+          /// -----------------------------------------------------
+          A_part2x2.mergeToATL(A_part3x3);
+          t_part2x1.mergeToAT (t_part3x1);
+        }
+        return 0;
+      }
+    };
+
+  }//  end namespace Experimental
+} // end namespace KokkosBatched
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp
new file mode 100644
index 000000000000..c2cf9cb0c168
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp
@@ -0,0 +1,141 @@
+#ifndef __KOKKOSBATCHED_RIGHT_EIGENVECTOR_FROM_SCHUR_SERIAL_INTERNAL_HPP__
+#define __KOKKOSBATCHED_RIGHT_EIGENVECTOR_FROM_SCHUR_SERIAL_INTERNAL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_ShiftedTrsv_Serial_Internal.hpp"
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial Internal Impl
+    /// ==================== 
+    ///
+    /// this impl follows the flame interface of householder transformation
+    ///
+    struct SerialRightEigenvectorFromSchurInternal {
+      /// Given a quasi upper triangular matrix S (m x m), this computes all right 
+      /// eigenvectors.
+      /// 
+      /// Parameters:
+      ///   [in]m 
+      ///     A dimension of the square matrix S.
+      ///   [in]S, [in]ss0, [in]ss1 
+      ///     A quasi upper triangular part of Schur decomposition which is computed 
+      ///       A = U^H S U
+      ///   [out]V, [in]vs0, [out]vs1 
+      ///     A set of right eigen vectors.
+      ///   [in]w
+      ///     contiguous workspace that can hold complex array (m)
+      template<typename ValueType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const int m,
+             /* */ ValueType * S, const int ss0, const int ss1,
+             /* */ ValueType * V, const int vs0, const int vs1,
+             /* */ ValueType * w,
+             const int * blks) {
+        typedef ValueType value_type;
+        typedef Kokkos::Details::ArithTraits<value_type> ats;
+        //typedef typename ats::mag_type mag_type;
+        typedef Kokkos::complex<value_type> complex_type;
+
+        const value_type zero(0), one(1);
+        //const int ss(ss0+ss1);
+        /// SerialSetInternal::invoke(m, m, zero, V, vs0, vs1);
+
+        value_type *b = w; // consider complex case
+
+        /// partitions used for loop iteration
+        Partition2x2<value_type> S_part2x2(ss0, ss1);
+        Partition3x3<value_type> S_part3x3(ss0, ss1);
+        
+        Partition1x2<value_type> V_part1x2(vs1);
+        Partition1x3<value_type> V_part1x3(vs1);
+        
+        /// initial partition of S where ABR has a zero dimension
+        S_part2x2.partWithABR(S, m, m, 0, 0);
+        V_part1x2.partWithAR(V, m, 0);
+
+        //const mag_type tol = ats::epsilon();
+        int m_stl = m;
+        for (;m_stl>0;) {
+          /// part 2x2 into 3x3
+          const int mA11 = blks[m_stl-1];
+          assert( ((mA11 == 1) || (mA11 == 2)) && "RightEigenvectorFromSchur: blk is not 1x1 nor 2x2");
+
+          S_part3x3.partWithATL(S_part2x2, mA11, mA11);
+          V_part1x3.partWithAL(V_part1x2, mA11);
+
+          const int m_stl_minus_mA11 = m_stl - mA11;
+          if (mA11 == 1) {
+            /// real eigenvalue 
+            const value_type lambda = *S_part3x3.A11;
+            
+            /// initialize a right eigen vector
+            for (int i=0;i<m_stl_minus_mA11;++i) 
+              b[i] = -S_part3x3.A01[i*ss0];
+            b[m_stl-1] = one;
+            
+            /// perform shifted trsv
+            SerialShiftedTrsvInternalUpper::invoke(m_stl_minus_mA11, lambda,
+                                                   S_part3x3.A00, ss0, ss1,
+                                                   w, 1,
+                                                   blks);
+            
+            /// copy back to V
+            for (int i=0;i<m_stl;++i) V_part1x3.A1[i*vs0] = w[i];              
+            for (int i=m_stl;i<m;++i) V_part1x3.A1[i*vs0] = zero;
+          } else {
+            /// complex eigen pair  
+            const value_type 
+              alpha11 = S_part3x3.A11[0],
+              alpha12 = S_part3x3.A11[ss1],
+              alpha21 = S_part3x3.A11[ss0],
+              beta = ats::sqrt(-alpha12*alpha21);
+
+            const complex_type lambda(alpha11, beta);
+            complex_type * bc = (complex_type*)(b);
+            
+            /// initialize a right eigen vector
+            const value_type * S_A01_a = S_part3x3.A01;
+            const value_type * S_A01_b = S_part3x3.A01 + ss1;
+            for (int i=0;i<m_stl_minus_mA11;++i) 
+              bc[i] = complex_type(-S_A01_a[i*ss0]*beta, S_A01_b[i*ss0]*alpha21);
+            bc[m_stl-2] = complex_type(beta,  zero);
+            bc[m_stl-1] = complex_type(zero, -alpha21);
+            
+            /// perform shifted trsv
+            SerialShiftedTrsvInternalUpper::invoke(m_stl_minus_mA11, lambda,
+                                                   S_part3x3.A00, ss0, ss1,
+                                                   bc, 1,
+                                                   blks);
+            
+            /// copy back to V
+            value_type * V_A1_r = V_part1x3.A1;
+            value_type * V_A1_i = V_part1x3.A1 + vs1;
+            for (int i=0;i<m_stl;++i) { 
+              V_A1_r[i*vs0] = bc[i].real();              
+              V_A1_i[i*vs0] = bc[i].imag();
+            }              
+            for (int i=m_stl;i<m;++i) { 
+              V_A1_r[i*vs0] = zero;
+              V_A1_i[i*vs0] = zero;
+            }
+            /// ---------------------------------------------------
+          }
+          S_part2x2.mergeToABR(S_part3x3);
+          V_part1x2.mergeToAR(V_part1x3);
+          m_stl -= mA11;
+        }
+        return 0;
+      }
+    };
+
+  }/// end namespace Experimental
+} /// end namespace KokkosBatched
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Scale_Decl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Scale_Decl.hpp
index 53627fd97073..26d897d40787 100644
--- a/packages/kokkos-kernels/src/batched/KokkosBatched_Scale_Decl.hpp
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Scale_Decl.hpp
@@ -4,6 +4,9 @@
 
 /// \author Kyungjoo Kim (kyukim@sandia.gov)
 
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_Vector.hpp"
+
 
 namespace KokkosBatched {
   namespace Experimental {
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Schur2x2_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Schur2x2_Serial_Internal.hpp
new file mode 100644
index 000000000000..ef5747910b36
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Schur2x2_Serial_Internal.hpp
@@ -0,0 +1,129 @@
+#ifndef __KOKKOSBATCHED_SCHUR2X2_SERIAL_INTERNAL_HPP__
+#define __KOKKOSBATCHED_SCHUR2X2_SERIAL_INTERNAL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial Internal Impl
+    /// ==================== 
+    ///
+    /// this impl follows the flame interface of householder transformation
+    ///
+    struct SerialSchur2x2Internal {
+      template<typename RealType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(RealType * alpha00, RealType * alpha01,
+             RealType * alpha10, RealType * alpha11,
+             Kokkos::pair<RealType,RealType> * G,
+             Kokkos::complex<RealType> * lambda1,
+             Kokkos::complex<RealType> * lambda2,
+             bool * is_complex) {
+        typedef RealType real_type;        
+        typedef Kokkos::Details::ArithTraits<real_type> ats;
+        const real_type zero(0), one(1), half(0.5), minus_one(-1);
+        /// compute G = [ gamma -sigma;
+        ///               sigma  gamma ];
+        /// G.first = gamma and G.second = sigma
+        /// this rotation satisfy the following
+        ///   G' [alpha00 alpha01   G = [ beta00 beta01;
+        ///       alpha10 alpha11 ]       beta10 beta11 ];
+        /// where either 
+        ///   1) beta00 = beta11 and beta01*beta10 < 0
+        ///   2) beta10 = 0
+        const real_type tol = ats::epsilon()*real_type(100);
+        if (ats::abs(*alpha10) < tol) {
+          /// no rotation
+          *G = Kokkos::pair<real_type,real_type>(one, zero);
+          /// two real eigen values
+          *lambda1 = Kokkos::complex<real_type>(*alpha00, zero);
+          *lambda2 = Kokkos::complex<real_type>(*alpha11, zero);
+          *is_complex = false;
+        } else if (ats::abs(*alpha01) < tol) {
+          /// 90 degree rotation (permutation)
+          *G = Kokkos::pair<real_type,real_type>(zero, one);
+          /// [ 0 1 ][alpha00 0       [ 0 -1  --> [ alpha11 -alpha10
+          ///  -1 0 ] alpha10 alpha11]  1  0]       0        alpha00]
+          const real_type tmp = *alpha00; *alpha00 = *alpha11; *alpha11 = tmp;
+          *alpha01 = -(*alpha10); *alpha10 = zero;
+          /// two real eigen values 
+          *lambda1 = Kokkos::complex<real_type>(*alpha00, zero);
+          *lambda2 = Kokkos::complex<real_type>(*alpha11, zero);
+          *is_complex = false;
+        } else if (ats::abs(*alpha00-*alpha11) < tol && (*alpha01)*(*alpha10) > zero) {
+          // no rotation (already the standard schur form)
+          *G = Kokkos::pair<real_type,real_type>(one, zero);
+          /// two real eigen values
+          *lambda1 = Kokkos::complex<real_type>(*alpha00, zero);
+          *lambda2 = Kokkos::complex<real_type>(*alpha11, zero);
+          *is_complex = false;          
+        } else {
+          /// rotation to equalize diagonals
+          const real_type a = (*alpha00)-(*alpha11);
+          const real_type b = (*alpha01)+(*alpha10);
+          const real_type l = ats::sqrt(a*a+b*b);
+          const real_type c = ats::sqrt(half*(one+ats::abs(b)/l));
+          const real_type s = -((half*a)/(l*c))*(b > zero ? one : minus_one);
+          *G = Kokkos::pair<real_type,real_type>(c, s);
+          /// [ gamma sigma ][ alpha00 alpha01  [ gamma -sigma  --> [ alpha11 -alpha10
+          ///  -sigma gamma ]  alpha10 alpha11 ]  sigma  gamma ]       0        alpha00]
+          const real_type a00 = *alpha00, a01 = *alpha01;
+          const real_type a10 = *alpha10, a11 = *alpha11;
+          const real_type cc = c*c, cs = c*s, ss= s*s;
+          *alpha00 =  cc*a00 + cs*a01 + cs*a10 + ss*a11; 
+          *alpha01 = -cs*a00 + cc*a01 - ss*a10 + cs*a11;
+          *alpha10 = -cs*a00 - ss*a01 + cc*a10 + cs*a11;
+          *alpha11 =  ss*a00 - cs*a01 - cs*a10 + cc*a11;
+
+          const real_type tmp = (*alpha00 + *alpha11)*half;
+          *alpha00 = tmp; 
+          *alpha11 = tmp;
+
+          const real_type mult_alpha_offdiags = (*alpha10)*(*alpha01);
+          if (mult_alpha_offdiags > zero) {
+            /// transforms the matrix into a upper triangular
+            const real_type sqrt_mult_alpha_offdiags = ats::sqrt(mult_alpha_offdiags);
+
+            /// redefine the rotation matrix
+            //const real_type sqrt_abs_alpha01 = ats::sqrt(ats::abs(*alpha01));
+            //const real_type sqrt_abs_alpha10 = ats::sqrt(ats::abs(*alpha10));
+            const real_type abs_sum_offidags = ats::abs((*alpha01)+(*alpha10));
+            const real_type c1 = ats::sqrt(ats::abs(*alpha01)/abs_sum_offidags);
+            const real_type s1 = ats::sqrt(ats::abs(*alpha10)/abs_sum_offidags);
+            const real_type sign_alpha10 = *alpha10 > zero ? one : minus_one;
+
+            *G = Kokkos::pair<real_type,real_type>(c*c1-s*s1,c*s1+s*c1);
+
+            /// apply rotation to 2x2 matrix so that alpha10 becomes zero
+            *alpha00 = tmp + sign_alpha10*sqrt_mult_alpha_offdiags;
+            *alpha11 = tmp - sign_alpha10*sqrt_mult_alpha_offdiags;
+            *alpha01 = (*alpha01)-(*alpha10);
+            *alpha10 = zero;
+
+            // two real eigen values
+            *lambda1 = Kokkos::complex<real_type>(*alpha00);
+            *lambda2 = Kokkos::complex<real_type>(*alpha11);
+            *is_complex = false;
+          } else {
+            /// two complex eigen values
+            const real_type sqrt_mult_alpha_offdiags = ats::sqrt(-mult_alpha_offdiags);
+            *lambda1 = Kokkos::complex<real_type>(tmp, sqrt_mult_alpha_offdiags);
+            *lambda2 = Kokkos::complex<real_type>(lambda1->real(), -lambda1->imag());
+            *is_complex = true;
+          }
+        }
+        return 0;
+      }
+    };
+
+  }//  end namespace Experimental
+} // end namespace KokkosBatched
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Schur_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Schur_Serial_Internal.hpp
new file mode 100644
index 000000000000..a4a4aadbf08c
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Schur_Serial_Internal.hpp
@@ -0,0 +1,273 @@
+#ifndef __KOKKOSBATCHED_SCHUR_SERIAL_INTERNAL_HPP__
+#define __KOKKOSBATCHED_SCHUR_SERIAL_INTERNAL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_WilkinsonShift_Serial_Internal.hpp"
+#include "KokkosBatched_ApplyGivens_Serial_Internal.hpp"
+#include "KokkosBatched_Schur2x2_Serial_Internal.hpp"
+#include "KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp"
+#include "KokkosBatched_Francis_Serial_Internal.hpp"
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial Internal Impl
+    /// ==================== 
+    ///
+    /// this impl follows the flame interface of householder transformation
+    ///
+    struct SerialSchurInternal {
+      /// Given a strictly Hessenberg matrix H (m x m), this computes schur decomposition
+      /// using the Francis method and stores them into a vector e. This routine does 
+      /// not scale nor balance the matrix for the numerical stability.
+      ///    H = Z T Z^H and T = Z^H H Z
+      /// Parameters:
+      ///   [in]m 
+      ///     A dimension of the square matrix H.
+      ///   [in/out]H, [in]hs0, [in]hs1 
+      ///     Real Hessenberg matrix H(m x m) with strides hs0 and hs1.
+      ///     Entering the routine, H is assumed to have a upper Hessenberg form, where
+      ///     all subdiagonals are zero. The matrix is overwritten as a upper triangular T 
+      ///     on exit.
+      ///   [in/out]Z, [in]zs0, [in]zs1
+      ///     Unitary matrix resulting from Schur decomposition. With a restarting option,
+      ///     the matrix may contain previous partial computation results.
+      ///   [in/out]w, [in]wlen
+      ///     Contiguous workspace of which size is wlen. When restart is true, this 
+      ///     workspace is not corrupted after the previous iteration. Temporarily, it stores
+      ///     subdiag values and given rotations. wlen should be at least 3*m.
+      ///   [in]restart(false)
+      ///     With a restart option, the routine assume that the matrix H and the vector e 
+      ///     contain the partial results from the previous run. When m = 1 or 2, this option
+      ///     won't work as the routine always computes the all eigenvalues.
+      ///   [in]user_max_iteration(300)
+      ///     Unlike LAPACK which uses various methods for different types of matrices,
+      ///     this routine uses the Francis method only. A user can set the maximum number 
+      ///     of iterations. When it reaches the maximum iteration counts without converging 
+      ///     all eigenvalues, the routine returns -1. 
+      template<typename RealType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const int m,
+             /* */ RealType * H, const int hs0, const int hs1,
+             /* */ RealType * Z, const int zs0, const int zs1,
+             /* */ RealType * w, const int wlen, 
+             const bool restart = false,
+             const int user_max_iteration = -1) {
+        typedef RealType real_type;
+        typedef Kokkos::Details::ArithTraits<real_type> ats;
+        const real_type /* one(1), */zero(0), tol = 1e2*ats::epsilon();
+        const int max_iteration = user_max_iteration < 0 ? 300 : user_max_iteration;
+        if (wlen < m*5) 
+          Kokkos::abort("Error: provided workspace is smaller than 3*m");
+
+        int r_val = 0;
+        if (restart) {
+          if (m <= 2) 
+            Kokkos::abort("Error: restart option cannot be used for m=1 or m=2");
+        } else {
+          /// do not touch input
+          /// SerialSetIdentityInternal::invoke(m, Z, zs0, zs1);
+        }
+
+        // workspaces
+        real_type *subdiags = w; 
+        Kokkos::pair<real_type,real_type> *Gs = (Kokkos::pair<real_type,real_type>*)(w+m);
+        if (!restart) {
+          /// initialize workspace and Gs
+          for (int i=0;i<m;++i) 
+            subdiags[i] = zero;
+        }         
+
+        const int hs = hs0+hs1;  /// diagonal stride        
+        switch (m) {
+        case 0: { /* do nothing */ break; }
+        case 1: { /* do nothing */ break; }
+        case 2: {
+          /// compute eigenvalues from the characteristic determinant equation
+          bool is_complex;
+          Kokkos::complex<real_type> lambda1, lambda2;
+          Kokkos::pair<real_type,real_type> G;
+          SerialSchur2x2Internal::invoke(H,     H+hs1,
+                                         H+hs0, H+hs, 
+                                         &G,
+                                         &lambda1, &lambda2,
+                                         &is_complex);
+
+          G.second = -G.second; // transpose          
+          SerialApplyRightGivensInternal::invoke(G,     2, 
+                                                 Z,     zs0,           
+                                                 Z+zs1, zs0);          
+          break;
+        }
+        default: {
+          /// Francis method 
+          int iter(0);             /// iteration count
+          bool converge = false;   /// bool to check all eigenvalues are converged
+
+          while (!converge && iter < max_iteration) {
+            /// Step 1: find a set of unrevealed eigenvalues 
+            int cnt = 1;
+            
+            /// find mbeg (first nonzero subdiag value)
+            for (;cnt<m;++cnt) {
+              const auto val = ats::abs(*(H+cnt*hs-hs1));
+              if (val > tol) break;
+            }
+            const int mbeg = cnt-1;
+            
+            /// find mend (first zero subdiag value)
+            for (;cnt<m;++cnt) {
+              const auto val = ats::abs(*(H+cnt*hs-hs1));
+              if (val < tol) break;              
+            }
+            const int mend = cnt;
+            const int mdiff = mend - mbeg;
+            const int mend_minus_two_mult_hs0 = (mend-2)*hs0;
+
+            /// Step 2: if there exist non-converged eigen values
+            if (1 < mdiff) {
+#             if 0 /// implicit QR with shift for testing 
+              {
+                /// Rayleigh quotient shift 
+                const real_type shift = *(H+(mend-1)*hs); 
+                SerialHessenbergQR_WithShiftInternal::invoke(mbeg, mend, m, 
+                                                             H, hs0, hs1,
+                                                             shift,
+                                                             Gs, true);
+
+                for (int i=mbeg;i<(mend-1);++i) {
+                  const Kokkos::pair<real_type,real_type> G(Gs[i].first, -Gs[i].second);
+                  SerialApplyRightGivensInternal::invoke(G, m,
+                                                         Z+i*zs1,     zs0,
+                                                         Z+i*zs1+zs1, zs0);                    
+                }
+              }
+#             endif
+
+#             if 1
+              {
+                /// find a complex eigen pair
+                Kokkos::complex<real_type> lambda1, lambda2;
+                bool is_complex;
+                real_type *sub2x2 = H+(mend-2)*hs;
+                if (2 == mdiff) {
+                  Kokkos::pair<real_type,real_type> G;
+                  SerialSchur2x2Internal::invoke(sub2x2,     sub2x2+hs1,
+                                                 sub2x2+hs0, sub2x2+hs,
+                                                 &G,
+                                                 &lambda1, &lambda2,
+                                                 &is_complex);
+                  subdiags[mend-1] = sub2x2[hs0];
+                    
+                  /// apply G' from left
+                  G.second = -G.second;
+                  SerialApplyLeftGivensInternal::invoke (G, m-mend,
+                                                         sub2x2    +2*hs1, hs1,
+                                                         sub2x2+hs0+2*hs1, hs1);
+
+                  /// apply (G')' from right
+                  SerialApplyRightGivensInternal::invoke(G, mend-2,
+                                                         sub2x2    -mend_minus_two_mult_hs0, hs0,
+                                                         sub2x2+hs1-mend_minus_two_mult_hs0, hs0);
+                  sub2x2[hs0] = zero;
+
+                  /// apply (G')' from right to compute Z
+                  SerialApplyRightGivensInternal::invoke(G, m,
+                                                         Z+(mend-2)*zs1,     zs0,
+                                                         Z+(mend-1)*zs1, zs0);                    
+                  
+                } else {
+                  SerialWilkinsonShiftInternal::invoke(sub2x2[0],   sub2x2[hs1], 
+                                                       sub2x2[hs0], sub2x2[hs],
+                                                       &lambda1, &lambda2,
+                                                       &is_complex);                    
+                  
+                  SerialFrancisInternal::invoke(mbeg, mend, m, 
+                                                H, hs0, hs1,
+                                                lambda1, lambda2,
+                                                is_complex,
+                                                Gs, true);
+                  /* */ auto    &val1 = *(sub2x2+hs0);
+                  /* */ auto    &val2 = *(sub2x2-hs1);
+                  const auto abs_val1 = ats::abs(val1);
+                  const auto abs_val2 = ats::abs(val2);
+
+                  for (int i=mbeg;i<(mend-1);++i) {
+                    const Kokkos::pair<real_type,real_type> G0(Gs[2*i  ].first, -Gs[2*i  ].second);
+                    const Kokkos::pair<real_type,real_type> G1(Gs[2*i+1].first, -Gs[2*i+1].second);
+                    SerialApplyRightGivensInternal::invoke(G0, m,
+                                                           Z+i*zs1,       zs0,
+                                                           Z+i*zs1+1*zs1, zs0);                    
+                    SerialApplyRightGivensInternal::invoke(G1, m,
+                                                           Z+i*zs1,       zs0,
+                                                           Z+i*zs1+2*zs1, zs0);                    
+                  }
+                  
+                  /// convergence check
+                  if (abs_val1 < tol) { 
+                    val1 = zero;
+                  } else if (abs_val2 < tol) {
+                    /// preserve the standard schur form
+                    Kokkos::pair<real_type,real_type> G;
+                    SerialSchur2x2Internal::invoke(sub2x2,     sub2x2+hs1,
+                                                   sub2x2+hs0, sub2x2+hs,
+                                                   &G,
+                                                   &lambda1, &lambda2,
+                                                   &is_complex);
+                    subdiags[mend-1] = val1;
+                    
+                    /// apply G' from left
+                    G.second = -G.second;
+                    SerialApplyLeftGivensInternal::invoke (G, m-mend,
+                                                           sub2x2    +2*hs1, hs1,
+                                                           sub2x2+hs0+2*hs1, hs1);
+
+                    // apply (G')' from right
+                    SerialApplyRightGivensInternal::invoke(G, mend-2,
+                                                           sub2x2    -mend_minus_two_mult_hs0, hs0,
+                                                           sub2x2+hs1-mend_minus_two_mult_hs0, hs0);
+                    val1 = zero;
+                    val2 = zero;
+
+                    // apply (G')' from right
+                    SerialApplyRightGivensInternal::invoke(G, m,
+                                                           Z+(mend-2)*zs1, zs0,
+                                                           Z+(mend-1)*zs1, zs0);                    
+                  }
+                }
+              }
+#             endif 
+            } else {
+              /// all eigenvalues are converged
+              converge = true;
+            }
+            ++iter;
+          }
+          /// Step 3: record missing real eigenvalues from the diagonals
+          if (converge) {
+            // recover subdiags            
+            real_type *Hs = H-hs1;
+            for (int i=1;i<m;++i) {                 
+              Hs[i*hs] = subdiags[i];
+            }
+            r_val = 0;
+          } else {
+            r_val = -1;
+          }
+          break;
+        }          
+        }
+        return r_val;
+        }
+      };
+
+
+  }/// end namespace Experimental
+  } /// end namespace KokkosBatched
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_SetIdentity_Decl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_SetIdentity_Decl.hpp
new file mode 100644
index 000000000000..338a1558c63f
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_SetIdentity_Decl.hpp
@@ -0,0 +1,63 @@
+#ifndef __KOKKOSBATCHED_SET_IDENTITY_DECL_HPP__
+#define __KOKKOSBATCHED_SET_IDENTITY_DECL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_Vector.hpp"
+
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial SetIdentity
+    ///
+
+    struct SerialSetIdentity {
+      template<typename AViewType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const AViewType &A);
+    };
+
+    ///
+    /// Team Set
+    ///
+
+    template<typename MemberType>
+    struct TeamSetIdentity {
+      template<typename AViewType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const MemberType &member,
+             const AViewType &A);
+    };
+
+
+    ///
+    /// Selective Interface
+    ///
+    template<typename MemberType,
+             typename ArgMode>
+    struct SetIdentity {
+      template<typename AViewType>
+      KOKKOS_FORCEINLINE_FUNCTION
+      static int
+      invoke(const MemberType &member,
+             const AViewType &A) {
+        int r_val = 0;
+        if (std::is_same<ArgMode,Mode::Serial>::value) {
+          r_val = SerialSetIdentity::invoke(A);
+        } else if (std::is_same<ArgMode,Mode::Team>::value) {
+          r_val = TeamSetIdentity<MemberType>::invoke(member, A);
+        } 
+        return r_val;
+      }
+    };      
+
+  }
+}
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_SetIdentity_Impl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_SetIdentity_Impl.hpp
new file mode 100644
index 000000000000..4cd7da82da4f
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_SetIdentity_Impl.hpp
@@ -0,0 +1,48 @@
+#ifndef __KOKKOSBATCHED_SET_IDENTITY_IMPL_HPP__
+#define __KOKKOSBATCHED_SET_IDENTITY_IMPL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_SetIdentity_Internal.hpp"
+
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial Impl
+    /// ===========
+      
+    template<typename AViewType>
+    KOKKOS_INLINE_FUNCTION
+    int
+    SerialSetIdentity::
+    invoke(const AViewType &A) {
+      return SerialSetIdentityInternal::
+        invoke(A.extent(0), 
+               A.data(), A.stride_0(), A.stride_1());
+    }
+
+    ///
+    /// Team Impl
+    /// =========
+              
+    template<typename MemberType>
+    template<typename AViewType>
+    KOKKOS_INLINE_FUNCTION
+    int
+    TeamSetIdentity<MemberType>::
+    invoke(const MemberType &member, 
+           const AViewType &A) {
+      return TeamSetIdentityInternal::
+        invoke(member, 
+               A.extent(0), 
+               A.data(), A.stride_0(), A.stride_1());
+    }
+ 
+  } // end namespace Experimental
+} //end namespace KokkosBatched
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_SetIdentity_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_SetIdentity_Internal.hpp
new file mode 100644
index 000000000000..1c4e653b3154
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_SetIdentity_Internal.hpp
@@ -0,0 +1,65 @@
+#ifndef __KOKKOSBATCHED_SET_IDENTITY_INTERNAL_HPP__
+#define __KOKKOSBATCHED_SET_IDENTITY_INTERNAL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial Internal Impl
+    /// ==================== 
+    struct SerialSetIdentityInternal {
+      template<typename ValueType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const int m, 
+             /* */ ValueType *__restrict__ A, const int as0, const int as1) {
+        const ValueType one(1), zero(0);
+        for (int j=0;j<m;++j) {
+#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
+#pragma unroll
+#endif
+          for (int i=0;i<m;++i) {
+            A[i*as0+j*as1] = i == j ? one : zero;
+          }
+        }
+        
+        return 0;
+      }
+    };
+
+    ///
+    /// Team Internal Impl
+    /// ==================
+    struct TeamSetIdentityInternal {
+      template<typename MemberType, 
+               typename ValueType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const MemberType &member,
+             const int m, 
+             /* */ ValueType *__restrict__ A, const int as0, const int as1) {
+        const ValueType one(1), zero(0);
+        Kokkos::parallel_for
+          (Kokkos::TeamThreadRange(member,0,m),
+           [&](const int &i) {
+#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
+#pragma unroll
+#endif
+        for (int j=0;j<m;++j) 
+          A[i*as0+j*as1] = i == j ? one : zero;
+          });
+        
+        return 0;
+      }
+    };
+
+  }//  end namespace Experimental
+} // end namespace KokkosBatched
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_SetTriangular_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_SetTriangular_Internal.hpp
new file mode 100644
index 000000000000..48c242f9d854
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_SetTriangular_Internal.hpp
@@ -0,0 +1,40 @@
+#ifndef __KOKKOSBATCHED_SET_TRIANGULAR_INTERNAL_HPP__
+#define __KOKKOSBATCHED_SET_TRIANGULAR_INTERNAL_HPP__
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial Internal Impl
+    /// ==================== 
+    struct SerialSetLowerTriangularInternal {
+      template<typename ScalarType,
+               typename ValueType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const int m, const int n,
+             const int dist,
+             const ScalarType alpha,
+             /* */ ValueType *__restrict__ A, const int as0, const int as1) {
+        for (int j=0;j<n;++j) {
+#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
+#pragma unroll
+#endif
+          for (int i=(j+dist);i<m;++i) {
+            A[i*as0+j*as1] = alpha;
+          }
+        }
+        
+        return 0;
+      }
+    };
+
+  }//  end namespace Experimental
+} // end namespace KokkosBatched
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Set_Decl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Set_Decl.hpp
index 3b9d50ce6027..ee1befe9ffa5 100644
--- a/packages/kokkos-kernels/src/batched/KokkosBatched_Set_Decl.hpp
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Set_Decl.hpp
@@ -4,6 +4,9 @@
 
 /// \author Kyungjoo Kim (kyukim@sandia.gov)
 
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_Vector.hpp"
+
 
 namespace KokkosBatched {
   namespace Experimental {
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp
new file mode 100644
index 000000000000..4086fa891713
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp
@@ -0,0 +1,158 @@
+#ifndef __KOKKOSBATCHED_SHIFTED_TRSV_SERIAL_INTERNAL_HPP__
+#define __KOKKOSBATCHED_SHIFTED_TRSV_SERIAL_INTERNAL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+
+#include "KokkosBatched_Set_Internal.hpp"
+#include "KokkosBatched_Scale_Internal.hpp"
+
+#include "KokkosBatched_InnerTrsm_Serial_Impl.hpp"
+#include "KokkosBatched_Gemv_Serial_Internal.hpp"
+
+
+namespace KokkosBatched {
+  namespace Experimental {
+
+    ///
+    /// Serial Internal Impl
+    /// ====================
+
+    ///
+    /// Lower
+    ///
+
+    struct SerialShiftedTrsvInternalLower {
+      template<typename ScalarType,
+               typename ValueTypeA,
+               typename ValueTypeB>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const int m,
+             const ScalarType lambda,
+             const ValueTypeA *__restrict__ A, const int as0, const int as1,
+             /* */ ValueTypeB *__restrict__ b, const int bs0,
+             const int *__restrict__ blks) {
+        const int as = as0+as1;
+
+        int p = 0;
+        for (;p<m;) {
+          const int blk = blks[p], iend = m-p-blk;
+          assert( ((blk == 1) || (blk == 2)) && "ShiftedTrsvLower: blocks are not 1x1 or 2x2");
+          if (blk == 1) {
+            const auto alpha11 = A[p*as]-lambda;
+            ValueTypeB *__restrict__ beta1 = b+p*bs0;
+
+            // with __restrict__ a compiler assumes that the pointer is not accessed by others
+            // op(/=) uses this pointer and changes the associated values, which brings a compiler problem
+            *beta1 = *beta1 / alpha11;
+
+            if (iend) {
+              const ValueTypeA *__restrict__ a21 = A+p*as+as0; 
+              ValueTypeB *__restrict__ b2 = beta1+bs0;
+              for (int i=0;i<iend;++i)
+                b2[i*bs0] -= a21[i*as0] * (*beta1);              
+            }
+          } else {
+            const int p_plus_one = p+1;
+            const auto alpha11 = A[p*as]-lambda;
+            const auto alpha12 = A[p*as+as1];
+            const auto alpha21 = A[p*as+as0];
+            const auto alpha22 = A[p_plus_one*as]-lambda;
+            const auto det = alpha11*alpha22-alpha12*alpha21;
+
+            ValueTypeB *__restrict__ beta1 = b+p*bs0;            
+            ValueTypeB *__restrict__ beta2 = b+p_plus_one*bs0;            
+
+            const ValueTypeB beta_one = *beta1;
+            const ValueTypeB beta_two = *beta2;           
+
+            *beta1 = ( alpha22*(beta_one)-alpha12*(beta_two))/det;
+            *beta2 = (-alpha21*(beta_one)+alpha11*(beta_two))/det;
+            
+            if (iend) {
+              const ValueTypeA *__restrict__ A21 = A+p*as+2*as0; 
+              ValueTypeB *__restrict__ b2 = beta1+2*bs0;
+              
+              for (int i=0;i<iend;++i)
+                b2[i*bs0] -= (A21[i*as0]*(*beta1) + A21[i*as0+as1]*(*beta2));              
+            }
+          }
+          p += blk;
+        }
+        return 0;
+      }
+    };
+
+    ///
+    /// Upper
+    ///
+    
+    struct SerialShiftedTrsvInternalUpper {
+      template<typename ScalarType,
+               typename ValueTypeA,
+               typename ValueTypeB>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const int m,
+             const ScalarType lambda,
+             const ValueTypeA *__restrict__ A, const int as0, const int as1,
+             /**/  ValueTypeB *__restrict__ b, const int bs0,
+             const int *__restrict__ blks) {
+        const int as = as0+as1;
+
+        ValueTypeB *__restrict__ b0 = b;
+        
+        int p = m-1;
+        for (;p>=0;) {
+          const int blk = blks[p], iend = p+1-blk;
+          assert( ((blk == 1) || (blk == 2)) && "ShiftedTrsvUpper: blocks are not 1x1 or 2x2");          
+          if (blk == 1) {
+            const auto alpha11 = A[p*as]-lambda;
+            /**/  ValueTypeB *__restrict__ beta1 = b+p*bs0;
+
+            // with __restrict__ a compiler assumes that the pointer is not accessed by others
+            // op(/=) uses this pointer and changes the associated values, which brings a compiler problem
+            *beta1 = *beta1 / alpha11;
+
+            if (iend) {
+              const ValueTypeA *__restrict__ a01   = A+p*as1;
+              for (int i=0;i<iend;++i)
+                b0[i*bs0] -= a01[i*as0] * (*beta1);
+            }
+          } else {
+
+            const int p_minus_one = p-1;
+            const auto alpha11 = A[p_minus_one*as]-lambda;
+            const auto alpha12 = A[p_minus_one*as+as1];
+            const auto alpha21 = A[p_minus_one*as+as0];
+            const auto alpha22 = A[p*as]-lambda;
+            const auto det = alpha11*alpha22-alpha12*alpha21;
+
+            ValueTypeB *__restrict__ beta1 = b+p_minus_one*bs0;            
+            ValueTypeB *__restrict__ beta2 = b+p*bs0;            
+
+            const ValueTypeB beta_one = *beta1;
+            const ValueTypeB beta_two = *beta2;           
+
+            *beta1 = ( alpha22*(beta_one)-alpha12*(beta_two))/det;
+            *beta2 = (-alpha21*(beta_one)+alpha11*(beta_two))/det;
+            
+            if (iend) {
+              const ValueTypeA *__restrict__ A01 = A+p_minus_one*as1; 
+              for (int i=0;i<iend;++i)
+                b0[i*bs0] -= (A01[i*as0]*(*beta1) + A01[i*as0+as1]*(*beta2));              
+            }
+          }
+          p -= blk;
+        }
+        return 0;
+      }
+    };
+    
+  }
+}
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_SolveLU_Decl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_SolveLU_Decl.hpp
index 9c8b71a58d97..8f088db2d4e2 100644
--- a/packages/kokkos-kernels/src/batched/KokkosBatched_SolveLU_Decl.hpp
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_SolveLU_Decl.hpp
@@ -4,13 +4,17 @@
 
 /// \author Vinh Dang (vqdang@sandia.gov)
 
+#include "KokkosBatched_Util.hpp"
 #include "KokkosBatched_Vector.hpp"
+#include "KokkosBatched_Trsm_Decl.hpp"
+#include "KokkosBatched_Trsm_Serial_Impl.hpp"
+#include "KokkosBatched_Trsm_Team_Impl.hpp"
 
 namespace KokkosBatched {
   namespace Experimental {
       
-    template<typename ArgAlgo,
-             typename TransType>
+    template<typename ArgTrans,
+             typename ArgAlgo>
     struct SerialSolveLU {
       // no piv version
       template<typename AViewType,
@@ -18,12 +22,28 @@ namespace KokkosBatched {
       KOKKOS_INLINE_FUNCTION
       static int
       invoke(const AViewType &A,
-             const BViewType &B);
+             const BViewType &B) {
+        int r_val[2] = {};
+        const typename AViewType::non_const_value_type one(1.0);
+        if (std::is_same<ArgTrans,Trans::NoTranspose>::value) {
+          //First, compute Y (= U*X) by solving the system L*Y = B for Y
+          r_val[0] = SerialTrsm<Side::Left,Uplo::Lower,ArgTrans,Diag::Unit,ArgAlgo>::invoke(one, A, B);
+          //Second, compute X by solving the system U*X = Y for X
+          r_val[1] = SerialTrsm<Side::Left,Uplo::Upper,ArgTrans,Diag::NonUnit,ArgAlgo>::invoke(one, A, B);
+        } else if (std::is_same<ArgTrans,Trans::Transpose>::value || 
+                   std::is_same<ArgTrans,Trans::ConjTranspose>::value) {
+          //First, compute Y (= L'*X) by solving the system U'*Y = B for Y
+          r_val[0] = SerialTrsm<Side::Left,Uplo::Upper,ArgTrans,Diag::NonUnit,ArgAlgo>::invoke(one, A, B); 
+          //Second, compute X by solving the system L'*X = Y for X
+          r_val[1] = SerialTrsm<Side::Left,Uplo::Lower,ArgTrans,Diag::Unit,ArgAlgo>::invoke(one, A, B);
+        }
+        return r_val[0]+r_val[1];
+      }
     };       
 
     template<typename MemberType,
-             typename ArgAlgo,
-             typename TransType>
+             typename ArgTrans,
+             typename ArgAlgo>
     struct TeamSolveLU {
       // no piv version
       template<typename AViewType,
@@ -32,9 +52,51 @@ namespace KokkosBatched {
       static int
       invoke(const MemberType &member, 
              const AViewType &A,
-             const BViewType &B);
+             const BViewType &B) {
+        int r_val[2] = {};
+        const typename AViewType::non_const_value_type one(1.0);
+        if (std::is_same<ArgTrans,Trans::NoTranspose>::value) {
+          //First, compute Y (= U*X) by solving the system L*Y = B for Y
+          r_val[0] = TeamTrsm<MemberType,Side::Left,Uplo::Lower,ArgTrans,Diag::Unit,ArgAlgo>::invoke(member, one, A, B);
+          //Second, compute X by solving the system U*X = Y for X
+          r_val[1] = TeamTrsm<MemberType,Side::Left,Uplo::Upper,ArgTrans,Diag::NonUnit,ArgAlgo>::invoke(member, one, A, B);
+        } else if (std::is_same<ArgTrans,Trans::Transpose>::value || 
+                   std::is_same<ArgTrans,Trans::ConjTranspose>::value) {
+          //First, compute Y (= L'*X) by solving the system U'*Y = B for Y
+          r_val[0] = TeamTrsm<MemberType,Side::Left,Uplo::Upper,ArgTrans,Diag::NonUnit,ArgAlgo>::invoke(member, one, A, B); 
+          //Second, compute X by solving the system L'*X = Y for X
+          r_val[1] = TeamTrsm<MemberType,Side::Left,Uplo::Lower,ArgTrans,Diag::Unit,ArgAlgo>::invoke(member, one, A, B);
+        }
+        return r_val[0]+r_val[1];
+      }
     };       
       
+
+    ///
+    /// Selective Interface
+    ///
+    template<typename MemberType,
+             typename ArgTrans,
+             typename ArgMode, typename ArgAlgo>
+    struct SolveLU {
+      // no piv version
+      template<typename AViewType,
+               typename BViewType>
+      KOKKOS_FORCEINLINE_FUNCTION
+      static int
+      invoke(const MemberType &member, 
+             const AViewType &A,
+             const BViewType &B) {
+        int r_val = 0;
+        if (std::is_same<ArgMode,Mode::Serial>::value) {
+          r_val = SerialSolveLU<ArgTrans,ArgAlgo>::invoke(A, B);
+        } else if (std::is_same<ArgMode,Mode::Team>::value) {
+          r_val = TeamSolveLU<MemberType,ArgTrans,ArgAlgo>::invoke(member, A, B);
+        } 
+        return r_val;
+      }
+    };           
+    
   }
 }
 
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Trsm_Decl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Trsm_Decl.hpp
index dce98acac487..24f39dc6c2b0 100644
--- a/packages/kokkos-kernels/src/batched/KokkosBatched_Trsm_Decl.hpp
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Trsm_Decl.hpp
@@ -4,6 +4,9 @@
 
 /// \author Kyungjoo Kim (kyukim@sandia.gov)
 
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_Vector.hpp"
+
 
 namespace KokkosBatched {
   namespace Experimental {
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Trsv_Decl.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Trsv_Decl.hpp
index f7d5391921b2..b31cddeaf2b3 100644
--- a/packages/kokkos-kernels/src/batched/KokkosBatched_Trsv_Decl.hpp
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Trsv_Decl.hpp
@@ -4,6 +4,9 @@
 
 /// \author Kyungjoo Kim (kyukim@sandia.gov)
 
+#include "KokkosBatched_Util.hpp"
+#include "KokkosBatched_Vector.hpp"
+
 
 namespace KokkosBatched {
   namespace Experimental {  
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_UpdateGivens_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_UpdateGivens_Internal.hpp
new file mode 100644
index 000000000000..5c7967bf31ee
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_UpdateGivens_Internal.hpp
@@ -0,0 +1,37 @@
+#ifndef __KOKKOSBATCHED_UPDATE_GIVENS_INTERNAL_HPP__
+#define __KOKKOSBATCHED_UPDATE_GIVENS_INTERNAL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial Internal Impl
+    /// ==================== 
+    ///
+    /// this impl follows the flame interface of householder transformation
+    ///
+    struct SerialUpdateGivensInternal {
+      template<typename ValueType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const Kokkos::pair<ValueType,ValueType> &S,
+             /* */ Kokkos::pair<ValueType,ValueType> &G) {
+        const ValueType
+          tmp    = S.first*G.first -S.second*G.second;
+        G.second = S.first*G.second+S.second*G.first;
+        G.first  = tmp;
+
+        return 0;
+      }
+    };
+
+  }//  end namespace Experimental
+} // end namespace KokkosBatched
+
+
+#endif
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Util.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Util.hpp
index 0c6bd67dba6d..95dcb3c9120f 100644
--- a/packages/kokkos-kernels/src/batched/KokkosBatched_Util.hpp
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Util.hpp
@@ -9,6 +9,7 @@
 #include <random>
 #include <string>
 
+#include <cassert>
 #include <limits>
 #include <cmath>
 #include <ctime>
@@ -366,6 +367,201 @@ namespace KokkosBatched {
 
     };
 
+    template<typename ValueType> struct Partition1x2;
+    template<typename ValueType> struct Partition1x3;
+    
+    template<typename ValueType>
+    struct Partition1x2 {      
+      const int as1;
+      ValueType *AL, *AR;
+
+      KOKKOS_INLINE_FUNCTION
+      Partition1x2(const int arg_as1)
+        : as1(arg_as1), AL(NULL), AR(NULL) {}
+
+      KOKKOS_INLINE_FUNCTION
+      void partWithAL(ValueType *A, const int nA, const int nAL) {
+        AL = A; AR = AL+nAL*as1;
+      }
+
+      KOKKOS_INLINE_FUNCTION
+      void partWithAR(ValueType *A, const int nA, const int nAR) {
+        AL = A; AR = AL+(nA-nAR)*as1;
+      }
+
+      // A0 A1 are merged into AL
+      KOKKOS_INLINE_FUNCTION
+      void mergeToAL(const Partition1x3<ValueType> &part) {
+        AL = part.A0; AR = part.A2;
+      }      
+
+      // A0 A1 are merged into AL
+      KOKKOS_INLINE_FUNCTION
+      void mergeToAR(const Partition1x3<ValueType> &part) {
+        AL = part.A0; AR = part.A1;
+      }      
+    };
+
+    template<typename ValueType>
+    struct Partition1x3 {      
+      const int as1;
+      ValueType *A0, *A1, *A2;
+
+      KOKKOS_INLINE_FUNCTION
+      Partition1x3(const int arg_as1)
+        : as1(arg_as1), A0(NULL), A1(NULL), A2(NULL) {}
+
+      KOKKOS_INLINE_FUNCTION
+      void partWithAL(const Partition1x2<ValueType> &part, const int mA1) {
+        A0 = part.AL; A2 = part.AR; A1 = A2 - mA1*as1;
+      }
+      KOKKOS_INLINE_FUNCTION
+      void partWithAR(const Partition1x2<ValueType> &part, const int mA1) {
+        A0 = part.AL; A1 = part.AR; A2 = A1 + mA1*as1;
+      }
+    };
+
+
+    template<typename ValueType> struct Partition2x1;
+    template<typename ValueType> struct Partition3x1;
+    
+    template<typename ValueType>
+    struct Partition2x1 {      
+      const int as0;
+      ValueType *AT, *AB;
+
+      KOKKOS_INLINE_FUNCTION
+      Partition2x1(const int arg_as0)
+        : as0(arg_as0), AT(NULL), AB(NULL) {}
+
+      KOKKOS_INLINE_FUNCTION
+      void partWithAT(ValueType *A, const int mA, const int mAT) {
+        AT = A;
+        AB = AT+mAT*as0;
+      }
+
+      KOKKOS_INLINE_FUNCTION
+      void partWithAB(ValueType *A, const int mA, const int mAB) {
+        partWithAT(A, mA, mA-mAB);
+      }
+
+      // A0
+      // A1 is merged into AT
+      KOKKOS_INLINE_FUNCTION
+      void mergeToAT(const Partition3x1<ValueType> &part) {
+        AT = part.A0;
+        AB = part.A2;
+      }      
+
+      KOKKOS_INLINE_FUNCTION
+      void mergeToAB(const Partition3x1<ValueType> &part) {
+        AT = part.A0;
+        AB = part.A1;
+      }      
+    };
+
+    template<typename ValueType>
+    struct Partition3x1 {      
+      const int as0;
+      ValueType *A0,
+        /* */   *A1,
+        /* */   *A2;
+      
+      KOKKOS_INLINE_FUNCTION
+      Partition3x1(const int arg_as0)
+        : as0(arg_as0), 
+          A0(NULL),  
+          A1(NULL),  
+          A2(NULL) {}
+      
+      KOKKOS_INLINE_FUNCTION
+      void partWithAB(const Partition2x1<ValueType> &part, const int mA1) {
+        A0 = part.AT;      
+        A1 = part.AB;      
+        A2 = A1 + mA1*as0;
+      }
+
+      KOKKOS_INLINE_FUNCTION
+      void partWithAT(const Partition2x1<ValueType> &part, const int mA1) {
+        A0 = part.AT;      
+        A1 = part.AB - mA1*as0;
+        A2 = part.AB;
+      }
+    };
+
+    template<typename ValueType> struct Partition2x2;
+    template<typename ValueType> struct Partition3x3;
+
+    template<typename ValueType>
+    struct Partition2x2 {      
+      const int as0, as1;
+      ValueType *ATL, *ATR, *ABL, *ABR;
+
+      KOKKOS_INLINE_FUNCTION
+      Partition2x2(const int arg_as0, const int arg_as1) 
+        : as0(arg_as0), as1(arg_as1), ATL(NULL), ATR(NULL), ABL(NULL), ABR(NULL) {}
+
+      KOKKOS_INLINE_FUNCTION
+      void partWithATL(ValueType *A, 
+                       const int mA, const int nA, 
+                       const int mATL, const int nATL) {
+        ATL = A;            ATR = ATL+nATL*as1; 
+        ABL = ATL+mATL*as0; ABR = ABL+nATL*as1;
+      }
+
+      KOKKOS_INLINE_FUNCTION
+      void partWithABR(ValueType *A, 
+                       const int mA, const int nA, 
+                       const int mABR, const int nABR) {
+        partWithATL(A, mA, nA, mA-mABR, nA-nABR);
+      }
+
+      // A00 A01
+      // A10 A11 is merged into ATL
+      KOKKOS_INLINE_FUNCTION
+      void mergeToATL(const Partition3x3<ValueType> &part) {
+        ATL = part.A00; ATR = part.A02;
+        ABL = part.A20; ABR = part.A22;
+      }      
+
+      KOKKOS_INLINE_FUNCTION
+      void mergeToABR(const Partition3x3<ValueType> &part) {
+        ATL = part.A00; ATR = part.A01;
+        ABL = part.A10; ABR = part.A11;
+      }      
+    };
+
+    template<typename ValueType>
+    struct Partition3x3 {      
+      const int as0, as1;
+      ValueType *A00, *A01, *A02,
+        /* */   *A10, *A11, *A12,
+        /* */   *A20, *A21, *A22;
+
+      
+      KOKKOS_INLINE_FUNCTION
+      Partition3x3(const int arg_as0, const int arg_as1)
+        : as0(arg_as0), as1(arg_as1), 
+          A00(NULL), A01(NULL), A02(NULL), 
+          A10(NULL), A11(NULL), A12(NULL), 
+          A20(NULL), A21(NULL), A22(NULL) {}
+      
+      KOKKOS_INLINE_FUNCTION
+      void partWithABR(const Partition2x2<ValueType> &part, const int mA11, const int nA11) {
+        A00 = part.ATL;            A01 = part.ATR;            A02 = part.ATR + nA11*as1;
+        A10 = part.ABL;            A11 = part.ABR;            A12 = part.ABR + nA11*as1;
+        A20 = part.ABL + mA11*as0; A21 = part.ABR + mA11*as0; A22 = part.ABR + mA11*as0 + nA11*as1;
+      }
+
+      KOKKOS_INLINE_FUNCTION
+      void partWithATL(const Partition2x2<ValueType> &part, const int mA11, const int nA11) {
+        A00 = part.ATL;            A01 = part.ATR - nA11*as1;            A02 = part.ATR;
+        A10 = part.ABL - mA11*as0; A11 = part.ABR - mA11*as0 - nA11*as1; A12 = part.ABR - mA11*as0;
+        A20 = part.ABL;            A21 = part.ABR            - nA11*as1; A22 = part.ABR;
+      }
+    };
+    
+    
   }
 }
 
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_Vector_SIMD_Misc.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_Vector_SIMD_Misc.hpp
index ca11d303919f..bebd29b62753 100644
--- a/packages/kokkos-kernels/src/batched/KokkosBatched_Vector_SIMD_Misc.hpp
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_Vector_SIMD_Misc.hpp
@@ -164,7 +164,8 @@ namespace KokkosBatched {
     T
     min(const Vector<SIMD<T>,l> &val) {
       return reduce(val, [](const T left, const T right) -> T {
-          return min(left,right);
+          const auto tmp = left < right;
+          return tmp*left + !tmp*right;
         });
     } 
 
@@ -174,7 +175,8 @@ namespace KokkosBatched {
     T
     max(const Vector<SIMD<T>,l> &val) {
       return reduce(val, [](const T left, const T right) -> T {
-          return max(left,right);
+          const auto tmp = left > right;
+          return tmp*left + !tmp*right;
         });
     } 
 
diff --git a/packages/kokkos-kernels/src/batched/KokkosBatched_WilkinsonShift_Serial_Internal.hpp b/packages/kokkos-kernels/src/batched/KokkosBatched_WilkinsonShift_Serial_Internal.hpp
new file mode 100644
index 000000000000..5393580f636a
--- /dev/null
+++ b/packages/kokkos-kernels/src/batched/KokkosBatched_WilkinsonShift_Serial_Internal.hpp
@@ -0,0 +1,63 @@
+#ifndef __KOKKOSBATCHED_WILKINSON_SHIFT_SERIAL_INTERNAL_HPP__
+#define __KOKKOSBATCHED_WILKINSON_SHIFT_SERIAL_INTERNAL_HPP__
+
+
+/// \author Kyungjoo Kim (kyukim@sandia.gov)
+
+#include "KokkosBatched_Util.hpp"
+
+
+namespace KokkosBatched {
+  namespace Experimental {
+    ///
+    /// Serial Internal Impl
+    /// ==================== 
+    ///
+    /// this impl follows the flame interface of householder transformation
+    ///
+    struct SerialWilkinsonShiftInternal {
+      template<typename ValueType>
+      KOKKOS_INLINE_FUNCTION
+      static int
+      invoke(const ValueType a, const ValueType b,
+             const ValueType c, const ValueType d,
+             /* */ Kokkos::complex<ValueType> * lambda1,   
+             /* */ Kokkos::complex<ValueType> * lambda2,   
+             /* */ bool * is_complex) {
+        /// compute eigenvalues of 2x2 system [a b;
+        ///                                    c d]
+        /// when the system has a real complex values,
+        /// lambda1 and lambda2 are real eigenvalues
+        /// if the system has a complex eigenvalue pair, 
+        /// then lambda1 and lambda2 are redefined as follows
+        ///   lambda1 := lambda1 + lambda2
+        ///   lambda2 := lambda1 * lambda2
+        typedef ValueType value_type;
+          
+        const value_type half(0.5);
+        const value_type p = (a+d)*half;
+        const value_type q = (b*c-a*d);
+        const value_type v = p*p+q;
+
+        if (v < 0) {
+          // complex 
+          const value_type sqrt_v = Kokkos::Details::ArithTraits<value_type>::sqrt(-v);
+          *lambda1 = Kokkos::complex<value_type>(p, sqrt_v);
+          *lambda2 = Kokkos::complex<value_type>(p,-sqrt_v);
+          *is_complex = true;
+        } else {
+          // real
+          const value_type sqrt_v = Kokkos::Details::ArithTraits<value_type>::sqrt(v);
+          *lambda1 = Kokkos::complex<value_type>(p+sqrt_v);
+          *lambda2 = Kokkos::complex<value_type>(p-sqrt_v);
+          *is_complex = false;
+        }
+        return 0;
+      }
+    };
+
+  }//  end namespace Experimental
+} // end namespace KokkosBatched
+
+
+#endif