diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml
index ccd0d978eeb..3eae206f6a9 100644
--- a/.github/workflows/windows-build.yml
+++ b/.github/workflows/windows-build.yml
@@ -51,7 +51,7 @@ jobs:
         $env:PATH="$env:PATH;$pwd\build\windows_shared_library"
         mkdir build
         cd build
-        cmake  -DCMAKE_CXX_FLAGS=/bigobj -DCMAKE_CXX_FLAGS_DEBUG="/MDd /Zi /Ob1 /Od /RTC1" -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_OMP=OFF ..
+        cmake  -DCMAKE_CXX_FLAGS=/bigobj -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_CXX_FLAGS_DEBUG="/MDd /Zi /Ob1 /Od /RTC1" -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_OMP=OFF ..
         cmake --build . -j4 --config ${{ matrix.config.build_type }}
         ctest . -C ${{ matrix.config.build_type }} --output-on-failure
     - name: install
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 0e49972117f..242de328838 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -288,6 +288,26 @@ build/cuda92/intel/cuda/release/static:
     - cuda
     - gpu
 
+# CUDA (NVIDIA) build with the Intel compilers and HWLOC disabled, without OpenMP: BUILD_OMP is not set here, so it keeps the *default_variables value (presumably OFF - confirm)
+build/cuda92/intel/cuda_wo_omp/release/shared:
+  <<: *default_build_with_test
+  image: localhost:5000/gko-cuda92-gnu7-llvm50-intel2017
+  variables:
+    <<: *default_variables
+    C_COMPILER: "icc"
+    CXX_COMPILER: "icpc"
+    BUILD_CUDA: "ON"
+    BUILD_HWLOC: "OFF"
+    BUILD_TYPE: "Release"
+    CUDA_ARCH: 35
+  only:
+    variables:
+      - $RUN_CI_TAG
+  tags:
+    - private_ci
+    - cuda
+    - gpu
+
 # cuda 10.0 and friends
 # Make sure that our jobs run when using self-installed
 # third-party HWLOC.
@@ -597,6 +617,24 @@ build/amd/clang/hip/release/static:
     - amd
     - gpu
 
+# HIP (AMD) build with clang, without OpenMP: BUILD_OMP is not set here, so it keeps the *default_variables value (presumably OFF - confirm)
+build/amd/clang/hip_wo_omp/release/shared:
+  <<: *default_build_with_test
+  image: localhost:5000/gko-amd-gnu8-llvm7
+  variables:
+    <<: *default_variables
+    C_COMPILER: "clang"
+    CXX_COMPILER: "clang++"
+    BUILD_HIP: "ON"
+    BUILD_TYPE: "Release"
+  only:
+    variables:
+      - $RUN_CI_TAG
+  tags:
+    - private_ci
+    - amd
+    - gpu
+
 # no cuda but latest gcc and clang
 build/nocuda/gcc/core/debug/static:
   <<: *default_build_with_test
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7e2fbeadd10..bedfe94378f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -73,7 +73,11 @@ set(GINKGO_HIP_AMDGPU "" CACHE STRING
     "The amdgpu_target(s) variable passed to hipcc. The default is none (auto).")
 option(GINKGO_JACOBI_FULL_OPTIMIZATIONS "Use all the optimizations for the CUDA Jacobi algorithm" OFF)
 option(BUILD_SHARED_LIBS "Build shared (.so, .dylib, .dll) libraries" ON)
-option(GINKGO_BUILD_HWLOC "Build Ginkgo with HWLOC. Default is ON. If a system HWLOC is not found, then we try to build it ourselves. Switch this OFF to disable HWLOC." ON)
+if(NOT (MSVC OR WIN32 OR CYGWIN OR APPLE))  # HWLOC defaults to ON only off Windows/MacOS
+    option(GINKGO_BUILD_HWLOC "Build Ginkgo with HWLOC. Default is ON. If a system HWLOC is not found, then we try to build it ourselves. Switch this OFF to disable HWLOC." ON)
+else()  # Windows/MacOS: the option exists but defaults to OFF
+    option(GINKGO_BUILD_HWLOC "Build Ginkgo with HWLOC. Default is OFF. Ginkgo does not support HWLOC on Windows/MacOS" OFF)
+endif()
 option(GINKGO_INSTALL_RPATH "Set the RPATH when installing its libraries." ON)
 option(GINKGO_INSTALL_RPATH_ORIGIN "Add $ORIGIN (Linux) or @loader_path (MacOS) to the installation RPATH." ON)
 option(GINKGO_INSTALL_RPATH_DEPENDENCIES "Add dependencies to the installation RPATH." OFF)
@@ -169,6 +173,11 @@ if(PAPI_sde_FOUND)
     set(GINKGO_HAVE_PAPI_SDE 1)
 endif()
 
+# On Windows/MacOS a requested HWLOC build is overridden: warn and force the cache entry to OFF
+if((MSVC OR WIN32 OR CYGWIN OR APPLE) AND GINKGO_BUILD_HWLOC)
+    message(WARNING "Ginkgo does not support HWLOC on Windows/MacOS, switch GINKGO_BUILD_HWLOC to OFF")
+    set(GINKGO_BUILD_HWLOC OFF CACHE BOOL "Build Ginkgo with HWLOC. Default is OFF. Ginkgo does not support HWLOC on Windows/MacOS" FORCE)
+endif()
 if(GINKGO_BUILD_HWLOC)
     # By default always use external HWLOC
     set(GINKGO_USE_EXTERNAL_HWLOC 1)
@@ -178,10 +187,6 @@ else()
     set(GINKGO_HAVE_HWLOC 0)
     message(STATUS "HWLOC is being forcibly switched off")
 endif()
-# Switch off HWLOC for Windows and MacOS
-if(MSVC OR WIN32 OR CYGWIN OR APPLE)
-    set(GINKGO_HAVE_HWLOC 0)
-endif()
 
 # We keep using NVCC/HCC for consistency with previous releases even if AMD
 # updated everything to use NVIDIA/AMD in ROCM 4.1
@@ -211,6 +216,21 @@ if(GINKGO_BUILD_HIP)
 endif()
 
 
+if(MSVC)
+    # Pick the static or dynamic flag variant for C and C++ to match BUILD_SHARED_LIBS.
+    # Adapted from https://gitlab.kitware.com/cmake/community/wikis/FAQ#dynamic-replace
+    include(cmake/windows_helpers.cmake)
+    if(NOT BUILD_SHARED_LIBS)
+        ginkgo_switch_to_windows_static("CXX")
+        ginkgo_switch_to_windows_static("C")
+        set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS FALSE)
+    else()
+        ginkgo_switch_to_windows_dynamic("CXX")
+        ginkgo_switch_to_windows_dynamic("C")
+        set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS TRUE)
+    endif()
+endif()
+
 # Try to find the third party packages before using our subdirectories
 include(cmake/package_helpers.cmake)
 ginkgo_find_package(GTest "GTest::GTest;GTest::Main" FALSE 1.8.1)
@@ -222,26 +242,12 @@ if(GINKGO_HAVE_HWLOC)
         set(GINKGO_USE_EXTERNAL_HWLOC 0)
     endif()
 endif()
+# third_party must be added after the MSVC flag modification above so that it is configured with the modified flags.
 add_subdirectory(third_party)    # Third-party tools and libraries
 
 # Load CMake helpers
 include(cmake/build_helpers.cmake)
 include(cmake/install_helpers.cmake)
-include(cmake/windows_helpers.cmake)
-
-# This is modified from
-# https://gitlab.kitware.com/cmake/community/wikis/FAQ#dynamic-replace
-if(MSVC)
-    if(BUILD_SHARED_LIBS)
-        ginkgo_switch_to_windows_dynamic("CXX")
-        ginkgo_switch_to_windows_dynamic("C")
-        set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS TRUE)
-    else()
-        ginkgo_switch_to_windows_static("CXX")
-        ginkgo_switch_to_windows_static("C")
-        set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS FALSE)
-    endif()
-endif()
 
 configure_file(${Ginkgo_SOURCE_DIR}/include/ginkgo/config.hpp.in
     ${Ginkgo_BINARY_DIR}/include/ginkgo/config.hpp @ONLY)
diff --git a/INSTALL.md b/INSTALL.md
index a477b2f6b9a..d85bcc48329 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -44,7 +44,8 @@ Ginkgo adds the following additional switches to control what is being built:
 *   `-DGINKGO_HIP_AMDGPU="gpuarch1;gpuarch2"` the amdgpu_target(s) variable
     passed to hipcc for the `hcc` HIP backend. The default is none (auto).
 *   `-DGINKGO_BUILD_HWLOC={ON, OFF}` builds Ginkgo with HWLOC. If system HWLOC
-    is not found, Ginkgo will try to build it. Default is `ON`.
+    is not found, Ginkgo will try to build it. Default is `ON` on Linux. Ginkgo
+    does not support HWLOC on Windows/MacOS, so the default is `OFF` on Windows/MacOS.
 *   `-DGINKGO_BUILD_DOC={ON, OFF}` creates an HTML version of Ginkgo's documentation
     from inline comments in the code. The default is `OFF`.
 *   `-DGINKGO_DOC_GENERATE_EXAMPLES={ON, OFF}` generates the documentation of examples
@@ -99,7 +100,7 @@ Ginkgo adds the following additional switches to control what is being built:
     list of architectures. Supported values are:
 
     *   `Auto`
-    *   `Kepler`, `Maxwell`, `Pascal`, `Volta`, `Ampere`
+    *   `Kepler`, `Maxwell`, `Pascal`, `Volta`, `Turing`, `Ampere`
     *   `CODE`, `CODE(COMPUTE)`, `(COMPUTE)`
 
     `Auto` will automatically detect the present CUDA-enabled GPU architectures
@@ -137,7 +138,7 @@ Depending on the configuration settings, some manual work might be required:
 * Build Ginkgo as shared library:
   Add `PROJECT_BINARY_DIR/GINKGO_WINDOWS_SHARED_LIBRARY_RELPATH` into the environment variable `PATH`.
   `GINKGO_WINDOWS_SHARED_LIBRARY_RELPATH` is `windows_shared_library` by default. More Details are available in the [Installation page](./INSTALL.md).
-  * cmd: `set PATH="<PROJECT_BINARY_DIR/GINKGO_WINDOWS_SHARED_LIBRARY_RELPATH>;%PATH%"`
+  * cmd: `set PATH=<PROJECT_BINARY_DIR/GINKGO_WINDOWS_SHARED_LIBRARY_RELPATH>;%PATH%`
   * powershell: `$env:PATH="<PROJECT_BINARY_DIR/GINKGO_WINDOWS_SHARED_LIBRARY_RELPATH>;$env:PATH"`
 
   CMake will give the following error message if the path is not correct.
@@ -147,16 +148,16 @@ Depending on the configuration settings, some manual work might be required:
   where `<path>` is the needed `<PROJECT_BINARY_DIR/GINKGO_WINDOWS_SHARED_LIBRARY_RELPATH>`.
 * Build Ginkgo with Debug mode:
   Some Debug build specific issues can appear depending on the machine and environment. The known issues are the following:
-  1. `bigobj` issue: encountering  `too many sections` needs the compilation flags `\bigobj` or `-Wa,-mbig-obj`
+  1. `bigobj` issue: encountering  `too many sections` needs the compilation flags `/bigobj` or `-Wa,-mbig-obj`
   2. `ld` issue: encountering  `ld: error: export ordinal too large` needs the compilation flag `-O1`
 
   The following are the details for different environments:
   * _Microsoft Visual Studio_:
     1. `bigobj` issue
-      * `cmake -DCMAKE_CXX_FLAGS=\bigobj <other parameters> <source_folder>` which might overwrite the default settings.
-      * add `\bigobj` into the environment variable `CXXFLAGS` (only available in the first cmake configuration)
-        * cmd: `set CXXFLAGS=\bigobj`
-        * powershell: `$env:CXXFLAGS=\bigobj`
+      * `cmake -DCMAKE_CXX_FLAGS=/bigobj <other parameters> <source_folder>` which might overwrite the default settings.
+      * add `/bigobj` into the environment variable `CXXFLAGS` (only available in the first cmake configuration)
+        * cmd: `set CXXFLAGS=/bigobj`
+        * powershell: `$env:CXXFLAGS=/bigobj`
     2. `ld` issue (_Microsoft Visual Studio_ does not have this issue)
   * _Cygwin_:
     1. `bigobj` issue
@@ -175,7 +176,10 @@ Depending on the configuration settings, some manual work might be required:
     2. `ld` issue (If building Ginkgo as static library, this is not needed)
       * `cmake -DGINKGO_COMPILER_FLAGS="-Wpedantic -O1" <other parameters> <source_folder>` (`GINKGO_COMPILER_FLAGS` is `-Wpedantic` by default)
       * add `-O1` in the environement variable `CXX_FLAGS` or `CMAKE_CXX_FLAGS`
-* Build Ginkgo in _MinGW_:
+* Possible issue when switching between static and shared builds of Ginkgo with MSVC in the same build directory:\
+  If errors caused by mixing the MD/MT runtime libraries occur when `GINKGO_BUILD_BENCHMARKS` is enabled, the third-party flags were not updated correctly.
+  To update the third-party flags, turn off `GINKGO_SKIP_DEPENDENCY_UPDATE` (`-DGINKGO_SKIP_DEPENDENCY_UPDATE=OFF`).
+* Build Ginkgo in _MinGW_:\
   If encountering the issue `cc1plus.exe: out of memory allocating 65536 bytes`, please follow the workaround in
   [reference](https://www.intel.com/content/www/us/en/programmable/support/support-resources/knowledge-base/embedded/2016/cc1plus-exe--out-of-memory-allocating-65536-bytes.html),
   or trying to compile ginkgo again might work.
diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp
index 7f41fe87f3d..0cbaa82b3b9 100644
--- a/cuda/base/executor.cpp
+++ b/cuda/base/executor.cpp
@@ -61,9 +61,9 @@ std::shared_ptr<CudaExecutor> CudaExecutor::create(
     return std::shared_ptr<CudaExecutor>(
         new CudaExecutor(device_id, std::move(master), device_reset),
         [device_id](CudaExecutor *exec) {
+            auto device_reset = exec->get_device_reset();
             delete exec;
-            if (!CudaExecutor::get_num_execs(device_id) &&
-                exec->get_device_reset()) {
+            if (!CudaExecutor::get_num_execs(device_id) && device_reset) {
                 cuda::device_guard g(device_id);
                 cudaDeviceReset();
             }
@@ -76,9 +76,9 @@ void CudaExecutor::populate_exec_info(const MachineTopology *mach_topo)
     if (this->get_device_id() < this->get_num_devices() &&
         this->get_device_id() >= 0) {
         cuda::device_guard g(this->get_device_id());
-        GKO_ASSERT_NO_CUDA_ERRORS(cudaDeviceGetPCIBusId(
-            const_cast<char *>(this->get_exec_info().pci_bus_id.data()), 13,
-            this->get_device_id()));
+        GKO_ASSERT_NO_CUDA_ERRORS(
+            cudaDeviceGetPCIBusId(&(this->get_exec_info().pci_bus_id.front()),
+                                  13, this->get_device_id()));
 
         auto cuda_hwloc_obj =
             mach_topo->get_pci_device(this->get_exec_info().pci_bus_id);
@@ -230,7 +230,7 @@ void CudaExecutor::set_gpu_property()
         GKO_ASSERT_NO_CUDA_ERRORS(cudaDeviceGetAttribute(
             &max_threads_per_block, cudaDevAttrMaxThreadsPerBlock,
             this->get_device_id()));
-        std::vector<int> max_threads_per_block_dim{3, 0};
+        std::vector<int> max_threads_per_block_dim(3, 0);
         GKO_ASSERT_NO_CUDA_ERRORS(cudaDeviceGetAttribute(
             &max_threads_per_block_dim[0], cudaDevAttrMaxBlockDimX,
             this->get_device_id()));
diff --git a/cuda/test/base/array.cu b/cuda/test/base/array.cu
index f35ac061972..4d12cff3988 100644
--- a/cuda/test/base/array.cu
+++ b/cuda/test/base/array.cu
@@ -39,7 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/base/executor.hpp>
 
 
-#include "core/test/utils.hpp"
+#include "cuda/test/utils.hpp"
 
 
 template <typename T>
diff --git a/cuda/test/factorization/par_ic_kernels.cpp b/cuda/test/factorization/par_ic_kernels.cpp
index ac454eb6b80..e957efbadf4 100644
--- a/cuda/test/factorization/par_ic_kernels.cpp
+++ b/cuda/test/factorization/par_ic_kernels.cpp
@@ -71,7 +71,7 @@ class ParIc : public ::testing::Test {
         : mtx_size(624, 624),
           rand_engine(43456),
           ref(gko::ReferenceExecutor::create()),
-          cuda(gko::CudaExecutor::create(0, gko::OmpExecutor::create()))
+          cuda(gko::CudaExecutor::create(0, gko::ReferenceExecutor::create()))
     {
         mtx_l = gko::test::generate_random_lower_triangular_matrix<Csr>(
             mtx_size[0], mtx_size[0], false,
diff --git a/cuda/test/matrix/diagonal_kernels.cpp b/cuda/test/matrix/diagonal_kernels.cpp
index d0016b0a079..d3c6f8c5973 100644
--- a/cuda/test/matrix/diagonal_kernels.cpp
+++ b/cuda/test/matrix/diagonal_kernels.cpp
@@ -125,10 +125,10 @@ class Diagonal : public ::testing::Test {
         diag = gen_diag(mtx_size[0]);
         ddiag = Diag::create(cuda);
         ddiag->copy_from(diag.get());
-        dense1 = gen_mtx<Dense>(mtx_size[0], mtx_size[1], mtx_size[0]);
-        dense2 = gen_mtx<Dense>(mtx_size[1], mtx_size[0], mtx_size[1]);
-        denseexpected1 = gen_mtx<Dense>(mtx_size[0], mtx_size[1], mtx_size[0]);
-        denseexpected2 = gen_mtx<Dense>(mtx_size[1], mtx_size[0], mtx_size[1]);
+        dense1 = gen_mtx<Dense>(mtx_size[0], mtx_size[1], mtx_size[1]);
+        dense2 = gen_mtx<Dense>(mtx_size[1], mtx_size[0], mtx_size[0]);
+        denseexpected1 = gen_mtx<Dense>(mtx_size[0], mtx_size[1], mtx_size[1]);
+        denseexpected2 = gen_mtx<Dense>(mtx_size[1], mtx_size[0], mtx_size[0]);
         ddense1 = Dense::create(cuda);
         ddense1->copy_from(dense1.get());
         ddense2 = Dense::create(cuda);
diff --git a/cuda/test/preconditioner/isai_kernels.cpp b/cuda/test/preconditioner/isai_kernels.cpp
index f099e3f503b..9385c0f109d 100644
--- a/cuda/test/preconditioner/isai_kernels.cpp
+++ b/cuda/test/preconditioner/isai_kernels.cpp
@@ -244,7 +244,7 @@ TEST_F(Isai, CudaIsaiGenerateSpdinverseShortIsEquivalentToRef)
         true);
 
     GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse);
-    GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 10 * r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 15 * r<value_type>::value);
     GKO_ASSERT_ARRAY_EQ(a1, da1);
     GKO_ASSERT_ARRAY_EQ(a2, da2);
     ASSERT_EQ(a1.get_const_data()[num_rows], 0);
diff --git a/cuda/test/reorder/rcm_kernels.cpp b/cuda/test/reorder/rcm_kernels.cpp
index 0a066918b95..afb3336134b 100644
--- a/cuda/test/reorder/rcm_kernels.cpp
+++ b/cuda/test/reorder/rcm_kernels.cpp
@@ -52,7 +52,8 @@ class Rcm : public ::testing::Test {
 
 
     Rcm()
-        : exec(gko::CudaExecutor::create(0, gko::OmpExecutor::create(), true)),
+        : exec(gko::CudaExecutor::create(0, gko::ReferenceExecutor::create(),
+                                         true)),
           // clang-format off
           p_mtx(gko::initialize<CsrMtx>({{1.0, 2.0, 0.0, -1.3, 2.1},
                                          {2.0, 5.0, 1.5, 0.0, 0.0},
diff --git a/cuda/test/solver/cb_gmres_kernels.cpp b/cuda/test/solver/cb_gmres_kernels.cpp
index cb4b96f65c9..ce28556ef24 100644
--- a/cuda/test/solver/cb_gmres_kernels.cpp
+++ b/cuda/test/solver/cb_gmres_kernels.cpp
@@ -49,7 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "core/solver/cb_gmres_accessor.hpp"
 #include "core/solver/cb_gmres_kernels.hpp"
-#include "core/test/utils.hpp"
+#include "cuda/test/utils.hpp"
 
 
 namespace {
diff --git a/cuda/test/utils.hpp b/cuda/test/utils.hpp
index 4b6e0e0a667..7667c7beb9f 100644
--- a/cuda/test/utils.hpp
+++ b/cuda/test/utils.hpp
@@ -43,9 +43,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace {
 
 
+// The std::mutex constructor is not constexpr in Visual Studio, so the mutex
+// used by the executor may not be initialized yet when this namespace-scope
+// executor is created; the executor is therefore not created with MSVC.
+#if !defined(_MSC_VER)
 // prevent device reset after each test
 auto no_reset_exec =
     gko::CudaExecutor::create(0, gko::ReferenceExecutor::create(), true);
+#endif
 
 
 }  // namespace
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
index 706270aaa47..49013544d98 100644
--- a/hip/base/executor.hip.cpp
+++ b/hip/base/executor.hip.cpp
@@ -61,9 +61,9 @@ std::shared_ptr<HipExecutor> HipExecutor::create(
     return std::shared_ptr<HipExecutor>(
         new HipExecutor(device_id, std::move(master), device_reset),
         [device_id](HipExecutor *exec) {
+            auto device_reset = exec->get_device_reset();
             delete exec;
-            if (!HipExecutor::get_num_execs(device_id) &&
-                exec->get_device_reset()) {
+            if (!HipExecutor::get_num_execs(device_id) && device_reset) {
                 hip::device_guard g(device_id);
                 hipDeviceReset();
             }
@@ -76,9 +76,9 @@ void HipExecutor::populate_exec_info(const MachineTopology *mach_topo)
     if (this->get_device_id() < this->get_num_devices() &&
         this->get_device_id() >= 0) {
         hip::device_guard g(this->get_device_id());
-        GKO_ASSERT_NO_HIP_ERRORS(hipDeviceGetPCIBusId(
-            const_cast<char *>(this->get_exec_info().pci_bus_id.data()), 13,
-            this->get_device_id()));
+        GKO_ASSERT_NO_HIP_ERRORS(
+            hipDeviceGetPCIBusId(&(this->get_exec_info().pci_bus_id.front()),
+                                 13, this->get_device_id()));
 
         auto hip_hwloc_obj =
             mach_topo->get_pci_device(this->get_exec_info().pci_bus_id);
@@ -232,7 +232,7 @@ void HipExecutor::set_gpu_property()
             this->get_device_id()));
         this->get_exec_info().max_workitem_sizes.push_back(
             max_threads_per_block);
-        std::vector<int> max_threads_per_block_dim{3, 0};
+        std::vector<int> max_threads_per_block_dim(3, 0);
         GKO_ASSERT_NO_HIP_ERRORS(hipDeviceGetAttribute(
             &max_threads_per_block_dim[0], hipDeviceAttributeMaxBlockDimX,
             this->get_device_id()));
diff --git a/hip/test/factorization/par_ic_kernels.hip.cpp b/hip/test/factorization/par_ic_kernels.hip.cpp
index 230ade961e5..e58b4da93e4 100644
--- a/hip/test/factorization/par_ic_kernels.hip.cpp
+++ b/hip/test/factorization/par_ic_kernels.hip.cpp
@@ -71,7 +71,7 @@ class ParIc : public ::testing::Test {
         : mtx_size(585, 585),
           rand_engine(10667),
           ref(gko::ReferenceExecutor::create()),
-          hip(gko::HipExecutor::create(0, gko::OmpExecutor::create()))
+          hip(gko::HipExecutor::create(0, gko::ReferenceExecutor::create()))
     {
         mtx_l = gko::test::generate_random_lower_triangular_matrix<Csr>(
             mtx_size[0], mtx_size[0], false,
diff --git a/hip/test/matrix/diagonal_kernels.hip.cpp b/hip/test/matrix/diagonal_kernels.hip.cpp
index 2c97e7e4b73..606e39db7e6 100644
--- a/hip/test/matrix/diagonal_kernels.hip.cpp
+++ b/hip/test/matrix/diagonal_kernels.hip.cpp
@@ -125,10 +125,10 @@ class Diagonal : public ::testing::Test {
         diag = gen_diag(mtx_size[0]);
         ddiag = Diag::create(hip);
         ddiag->copy_from(diag.get());
-        dense1 = gen_mtx<Dense>(mtx_size[0], mtx_size[1], mtx_size[0]);
-        dense2 = gen_mtx<Dense>(mtx_size[1], mtx_size[0], mtx_size[1]);
-        denseexpected1 = gen_mtx<Dense>(mtx_size[0], mtx_size[1], mtx_size[0]);
-        denseexpected2 = gen_mtx<Dense>(mtx_size[1], mtx_size[0], mtx_size[1]);
+        dense1 = gen_mtx<Dense>(mtx_size[0], mtx_size[1], mtx_size[1]);
+        dense2 = gen_mtx<Dense>(mtx_size[1], mtx_size[0], mtx_size[0]);
+        denseexpected1 = gen_mtx<Dense>(mtx_size[0], mtx_size[1], mtx_size[1]);
+        denseexpected2 = gen_mtx<Dense>(mtx_size[1], mtx_size[0], mtx_size[0]);
         ddense1 = Dense::create(hip);
         ddense1->copy_from(dense1.get());
         ddense2 = Dense::create(hip);
diff --git a/hip/test/preconditioner/isai_kernels.hip.cpp b/hip/test/preconditioner/isai_kernels.hip.cpp
index 18470c3d77c..bfcb226bbc1 100644
--- a/hip/test/preconditioner/isai_kernels.hip.cpp
+++ b/hip/test/preconditioner/isai_kernels.hip.cpp
@@ -225,7 +225,7 @@ TEST_F(Isai, HipIsaiGenerateSpdinverseShortIsEquivalentToRef)
         true);
 
     GKO_ASSERT_MTX_EQ_SPARSITY(inverse, d_inverse);
-    GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 10 * r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(inverse, d_inverse, 15 * r<value_type>::value);
     GKO_ASSERT_ARRAY_EQ(a1, da1);
     GKO_ASSERT_ARRAY_EQ(a2, da2);
     ASSERT_EQ(a1.get_const_data()[num_rows], 0);
diff --git a/hip/test/solver/cb_gmres_kernels.cpp b/hip/test/solver/cb_gmres_kernels.cpp
index 19572f73c26..b5114129935 100644
--- a/hip/test/solver/cb_gmres_kernels.cpp
+++ b/hip/test/solver/cb_gmres_kernels.cpp
@@ -49,7 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "core/solver/cb_gmres_accessor.hpp"
 #include "core/solver/cb_gmres_kernels.hpp"
-#include "core/test/utils.hpp"
+#include "hip/test/utils.hip.hpp"
 
 
 namespace {
diff --git a/hip/test/utils.hip.hpp b/hip/test/utils.hip.hpp
index dda2c068524..03d4f2ba6c7 100644
--- a/hip/test/utils.hip.hpp
+++ b/hip/test/utils.hip.hpp
@@ -43,9 +43,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace {
 
 
+// The std::mutex constructor is not constexpr in Visual Studio, so the mutex
+// used by the executor may not be initialized yet when this namespace-scope
+// executor is created; the executor is therefore not created with MSVC.
+#if !defined(_MSC_VER)
 // prevent device reset after each test
 auto no_reset_exec =
     gko::HipExecutor::create(0, gko::ReferenceExecutor::create(), true);
+#endif
 
 
 }  // namespace