diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index cbb9931f3b5..df3265cb58c 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -722,7 +722,8 @@ build/dpcpp/cpu/release/static:
     BUILD_SHARED_LIBS: "ON"
     SYCL_DEVICE_TYPE: "CPU"
 
-build/dpcpp/igpu/release/static:
+# It gives two available backends of GPU on tests
+build/dpcpp/igpu/release/shared:
   <<: *default_build_with_test
   extends:
     - .full_test_condition
@@ -733,13 +734,59 @@ build/dpcpp/igpu/release/static:
     CXX_COMPILER: "dpcpp"
     BUILD_DPCPP: "ON"
     BUILD_TYPE: "Release"
+    BUILD_SHARED_LIBS: "ON"
+    DPCPP_SINGLE_MODE: "ON"
+    SYCL_DEVICE_TYPE: "GPU"
+
+build/dpcpp/opencl_igpu/release/static:
+  <<: *default_build_with_test
+  extends:
+    - .quick_test_condition
+    - .use_gko-oneapi-igpu
+  variables:
+    <<: *default_variables
+    C_COMPILER: "gcc"
+    CXX_COMPILER: "dpcpp"
+    BUILD_DPCPP: "ON"
+    BUILD_TYPE: "Release"
+    BUILD_SHARED_LIBS: "OFF"
+    DPCPP_SINGLE_MODE: "ON"
+    SYCL_DEVICE_FILTER: "OpenCL"
+    SYCL_DEVICE_TYPE: "GPU"
+
+build/dpcpp/level_zero_igpu/debug/static:
+  <<: *default_build_with_test
+  extends:
+    - .full_test_condition
+    - .use_gko-oneapi-igpu
+  variables:
+    <<: *default_variables
+    C_COMPILER: "gcc"
+    CXX_COMPILER: "dpcpp"
+    BUILD_DPCPP: "ON"
+    BUILD_TYPE: "Debug"
     BUILD_SHARED_LIBS: "OFF"
     DPCPP_SINGLE_MODE: "ON"
     SYCL_DEVICE_FILTER: "Level_Zero:GPU"
 
-build/dpcpp/dgpu/debug/shared:
+# It gives two available backends of GPU on tests
+build/dpcpp/dgpu/release/static:
+  <<: *default_build_with_test
+  extends:
+    - .quick_test_condition
+    - .use_gko-oneapi-igpu
+  variables:
+    <<: *default_variables
+    C_COMPILER: "gcc"
+    CXX_COMPILER: "dpcpp"
+    BUILD_DPCPP: "ON"
+    BUILD_TYPE: "Release"
+    BUILD_SHARED_LIBS: "OF"
+    DPCPP_SINGLE_MODE: "ON"
+    SYCL_DEVICE_TYPE: "GPU"
+
+build/dpcpp/level_zero_dgpu/release/shared:
   <<: *default_build_with_test
-  image: localhost:5000/gko-oneapi
   extends:
     - .quick_test_condition
     - .use_gko-oneapi-dgpu
@@ -748,10 +795,24 @@ build/dpcpp/dgpu/debug/shared:
     C_COMPILER: "gcc"
     CXX_COMPILER: "dpcpp"
     BUILD_DPCPP: "ON"
-    BUILD_TYPE: "Debug"
+    BUILD_TYPE: "Release"
     DPCPP_SINGLE_MODE: "ON"
     SYCL_DEVICE_FILTER: "Level_Zero:GPU"
 
+build/dpcpp/opencl_dgpu/debug/shared:
+  <<: *default_build_with_test
+  extends:
+    - .full_test_condition
+    - .use_gko-oneapi-dgpu
+  variables:
+    <<: *default_variables
+    C_COMPILER: "gcc"
+    CXX_COMPILER: "dpcpp"
+    BUILD_DPCPP: "ON"
+    BUILD_TYPE: "Debug"
+    DPCPP_SINGLE_MODE: "ON"
+    SYCL_DEVICE_FILTER: "OpenCL"
+    SYCL_DEVICE_TYPE: "GPU"
 
 # Job with important warnings as error
 warnings:
diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp
index 39ea82f6b6d..5f1ec600cd2 100644
--- a/core/test/base/executor.cpp
+++ b/core/test/base/executor.cpp
@@ -541,14 +541,20 @@ TEST(Executor, CanVerifyMemory)
     std::shared_ptr<gko::DpcppExecutor> host_dpcpp;
     std::shared_ptr<gko::DpcppExecutor> cpu_dpcpp;
     std::shared_ptr<gko::DpcppExecutor> gpu_dpcpp;
+    std::shared_ptr<gko::DpcppExecutor> host_dpcpp_dup;
+    std::shared_ptr<gko::DpcppExecutor> cpu_dpcpp_dup;
+    std::shared_ptr<gko::DpcppExecutor> gpu_dpcpp_dup;
     if (gko::DpcppExecutor::get_num_devices("host")) {
         host_dpcpp = gko::DpcppExecutor::create(0, omp, "host");
+        host_dpcpp_dup = gko::DpcppExecutor::create(0, omp, "host");
     }
     if (gko::DpcppExecutor::get_num_devices("cpu")) {
         cpu_dpcpp = gko::DpcppExecutor::create(0, omp, "cpu");
+        cpu_dpcpp_dup = gko::DpcppExecutor::create(0, omp, "cpu");
     }
     if (gko::DpcppExecutor::get_num_devices("gpu")) {
         gpu_dpcpp = gko::DpcppExecutor::create(0, omp, "gpu");
+        gpu_dpcpp_dup = gko::DpcppExecutor::create(0, omp, "gpu");
     }
 
     ASSERT_EQ(false, ref->memory_accessible(omp));
@@ -566,18 +572,24 @@ TEST(Executor, CanVerifyMemory)
         ASSERT_EQ(false, ref->memory_accessible(host_dpcpp));
         ASSERT_EQ(true, host_dpcpp->memory_accessible(omp));
         ASSERT_EQ(true, omp->memory_accessible(host_dpcpp));
+        ASSERT_EQ(true, host_dpcpp->memory_accessible(host_dpcpp_dup));
+        ASSERT_EQ(true, host_dpcpp_dup->memory_accessible(host_dpcpp));
     }
     if (gko::DpcppExecutor::get_num_devices("cpu")) {
         ASSERT_EQ(false, ref->memory_accessible(cpu_dpcpp));
         ASSERT_EQ(false, cpu_dpcpp->memory_accessible(ref));
         ASSERT_EQ(true, cpu_dpcpp->memory_accessible(omp));
         ASSERT_EQ(true, omp->memory_accessible(cpu_dpcpp));
+        ASSERT_EQ(true, cpu_dpcpp->memory_accessible(cpu_dpcpp_dup));
+        ASSERT_EQ(true, cpu_dpcpp_dup->memory_accessible(cpu_dpcpp));
     }
     if (gko::DpcppExecutor::get_num_devices("gpu")) {
         ASSERT_EQ(false, gpu_dpcpp->memory_accessible(ref));
         ASSERT_EQ(false, ref->memory_accessible(gpu_dpcpp));
         ASSERT_EQ(false, gpu_dpcpp->memory_accessible(omp));
         ASSERT_EQ(false, omp->memory_accessible(gpu_dpcpp));
+        ASSERT_EQ(false, gpu_dpcpp->memory_accessible(gpu_dpcpp_dup));
+        ASSERT_EQ(false, gpu_dpcpp_dup->memory_accessible(gpu_dpcpp));
     }
 #if GINKGO_HIP_PLATFORM_NVCC
     ASSERT_EQ(true, hip->memory_accessible(cuda));
diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp
index 39c3e4557e6..d0380127732 100644
--- a/dpcpp/base/executor.dp.cpp
+++ b/dpcpp/base/executor.dp.cpp
@@ -102,8 +102,29 @@ void DpcppExecutor::populate_exec_info(const MachineTopology *mach_topo)
 
 void DpcppExecutor::raw_free(void *ptr) const noexcept
 {
-    queue_->wait_and_throw();
-    sycl::free(ptr, queue_->get_context());
+    // the free function may syncronize excution or not, which depends on
+    // implementation or backend, so it is not guaranteed.
+    // TODO: maybe a light wait implementation?
+    try {
+        queue_->wait_and_throw();
+        sycl::free(ptr, queue_->get_context());
+    } catch (cl::sycl::exception &err) {
+#if GKO_VERBOSE_LEVEL >= 1
+        // Unfortunately, if memory free fails, there's not much we can do
+        std::cerr << "Unrecoverable Dpcpp error on device "
+                  << this->get_device_id() << " in " << __func__ << ": "
+                  << err.what() << std::endl
+                  << "Exiting program" << std::endl;
+#endif  // GKO_VERBOSE_LEVEL >= 1
+        // OpenCL error code use 0 for CL_SUCCESS and negative number for others
+        // error. if the error is not from OpenCL, it will return CL_SUCCESS.
+        int err_code = err.get_cl_code();
+        // if return CL_SUCCESS, exit 1 as DPCPP error.
+        if (err_code == 0) {
+            err_code = 1;
+        }
+        std::exit(err_code);
+    }
 }
 
 
@@ -144,7 +165,21 @@ void DpcppExecutor::raw_copy_to(const DpcppExecutor *dest, size_type num_bytes,
                                 const void *src_ptr, void *dest_ptr) const
 {
     if (num_bytes > 0) {
-        dest->get_queue()->memcpy(dest_ptr, src_ptr, num_bytes).wait();
+        // If the queue is different and is not cpu/host, the queue can not
+        // transfer the data to another queue (on the same device)
+        // Note. it could be changed when we ensure the behavior is expected.
+        auto queue = this->get_queue();
+        auto dest_queue = dest->get_queue();
+        auto device = queue->get_device();
+        auto dest_device = dest_queue->get_device();
+        if (((device.is_host() || device.is_cpu()) &&
+             (dest_device.is_host() || dest_device.is_cpu())) ||
+            (queue == dest_queue)) {
+            dest->get_queue()->memcpy(dest_ptr, src_ptr, num_bytes).wait();
+        } else {
+            // the memcpy only support host<->device or itself memcpy
+            GKO_NOT_SUPPORTED(dest);
+        }
     }
 }
 
@@ -176,15 +211,16 @@ bool DpcppExecutor::verify_memory_to(const OmpExecutor *dest_exec) const
 
 bool DpcppExecutor::verify_memory_to(const DpcppExecutor *dest_exec) const
 {
-    auto device = detail::get_devices(
-        get_exec_info().device_type)[get_exec_info().device_id];
-    auto other_device = detail::get_devices(
-        dest_exec->get_device_type())[dest_exec->get_device_id()];
+    // If the queue is different and is not cpu/host, the queue can not access
+    // the data from another queue (on the same device)
+    // Note. it could be changed when we ensure the behavior is expected.
+    auto queue = this->get_queue();
+    auto dest_queue = dest_exec->get_queue();
+    auto device = queue->get_device();
+    auto dest_device = dest_queue->get_device();
     return ((device.is_host() || device.is_cpu()) &&
-            (other_device.is_host() || other_device.is_cpu())) ||
-           (device.get_info<cl::sycl::info::device::device_type>() ==
-                other_device.get_info<cl::sycl::info::device::device_type>() &&
-            device.get() == other_device.get());
+            (dest_device.is_host() || dest_device.is_cpu())) ||
+           (queue == dest_queue);
 }
 
 
diff --git a/dpcpp/test/base/executor.dp.cpp b/dpcpp/test/base/executor.dp.cpp
index 39d9e1e3313..830543349d5 100644
--- a/dpcpp/test/base/executor.dp.cpp
+++ b/dpcpp/test/base/executor.dp.cpp
@@ -55,20 +55,20 @@ namespace {
 class DpcppExecutor : public ::testing::Test {
 protected:
     DpcppExecutor()
-        : omp(gko::OmpExecutor::create()), dpcpp(nullptr), dpcpp2(nullptr)
+        : ref(gko::ReferenceExecutor::create()), dpcpp(nullptr), dpcpp2(nullptr)
     {}
 
     void SetUp()
     {
         if (gko::DpcppExecutor::get_num_devices("gpu") > 0) {
-            dpcpp = gko::DpcppExecutor::create(0, omp, "gpu");
+            dpcpp = gko::DpcppExecutor::create(0, ref, "gpu");
             if (gko::DpcppExecutor::get_num_devices("gpu") > 1) {
-                dpcpp2 = gko::DpcppExecutor::create(1, omp, "gpu");
+                dpcpp2 = gko::DpcppExecutor::create(1, ref, "gpu");
             }
         } else if (gko::DpcppExecutor::get_num_devices("cpu") > 0) {
-            dpcpp = gko::DpcppExecutor::create(0, omp, "cpu");
+            dpcpp = gko::DpcppExecutor::create(0, ref, "cpu");
             if (gko::DpcppExecutor::get_num_devices("cpu") > 1) {
-                dpcpp2 = gko::DpcppExecutor::create(1, omp, "cpu");
+                dpcpp2 = gko::DpcppExecutor::create(1, ref, "cpu");
             }
         } else {
             GKO_NOT_IMPLEMENTED;
@@ -84,7 +84,7 @@ class DpcppExecutor : public ::testing::Test {
         }
     }
 
-    std::shared_ptr<gko::Executor> omp{};
+    std::shared_ptr<gko::Executor> ref{};
     std::shared_ptr<const gko::DpcppExecutor> dpcpp{};
     std::shared_ptr<const gko::DpcppExecutor> dpcpp2{};
 };
@@ -92,9 +92,9 @@ class DpcppExecutor : public ::testing::Test {
 
 TEST_F(DpcppExecutor, CanInstantiateTwoExecutorsOnOneDevice)
 {
-    auto dpcpp = gko::DpcppExecutor::create(0, omp);
+    auto dpcpp = gko::DpcppExecutor::create(0, ref);
     if (dpcpp2 != nullptr) {
-        auto dpcpp2 = gko::DpcppExecutor::create(0, omp);
+        auto dpcpp2 = gko::DpcppExecutor::create(0, ref);
     }
 
     // We want automatic deinitialization to not create any error
@@ -103,7 +103,7 @@ TEST_F(DpcppExecutor, CanInstantiateTwoExecutorsOnOneDevice)
 
 TEST_F(DpcppExecutor, CanGetExecInfo)
 {
-    dpcpp = gko::DpcppExecutor::create(0, omp);
+    dpcpp = gko::DpcppExecutor::create(0, ref);
 
     ASSERT_TRUE(dpcpp->get_num_computing_units() > 0);
     ASSERT_TRUE(dpcpp->get_subgroup_sizes().size() > 0);
@@ -191,9 +191,9 @@ TEST_F(DpcppExecutor, CopiesDataToCPU)
 {
     int orig[] = {3, 8};
     auto *copy = dpcpp->alloc<int>(2);
-    gko::Array<bool> is_set(omp, 1);
+    gko::Array<bool> is_set(ref, 1);
 
-    dpcpp->copy_from(omp.get(), 2, orig, copy);
+    dpcpp->copy_from(ref.get(), 2, orig, copy);
 
     is_set.set_executor(dpcpp);
     ASSERT_NO_THROW(dpcpp->synchronize());
@@ -201,7 +201,7 @@ TEST_F(DpcppExecutor, CopiesDataToCPU)
         auto *is_set_ptr = is_set.get_data();
         cgh.single_task([=]() { check_data(copy, is_set_ptr); });
     }));
-    is_set.set_executor(omp);
+    is_set.set_executor(ref);
     ASSERT_EQ(*is_set.get_data(), true);
     ASSERT_NO_THROW(dpcpp->synchronize());
     dpcpp->free(copy);
@@ -221,7 +221,7 @@ TEST_F(DpcppExecutor, CopiesDataFromCPU)
         cgh.single_task([=]() { init_data(orig); });
     });
 
-    omp->copy_from(dpcpp.get(), 2, orig, copy);
+    ref->copy_from(dpcpp.get(), 2, orig, copy);
 
     EXPECT_EQ(3, copy[0]);
     ASSERT_EQ(8, copy[1]);
@@ -236,7 +236,7 @@ TEST_F(DpcppExecutor, CopiesDataFromDpcppToDpcpp)
     }
 
     int copy[2];
-    gko::Array<bool> is_set(omp, 1);
+    gko::Array<bool> is_set(ref, 1);
     auto orig = dpcpp->alloc<int>(2);
     dpcpp->get_queue()->submit([&](sycl::handler &cgh) {
         cgh.single_task([=]() { init_data(orig); });
@@ -250,11 +250,11 @@ TEST_F(DpcppExecutor, CopiesDataFromDpcppToDpcpp)
         auto *is_set_ptr = is_set.get_data();
         cgh.single_task([=]() { check_data(copy_dpcpp2, is_set_ptr); });
     }));
-    is_set.set_executor(omp);
+    is_set.set_executor(ref);
     ASSERT_EQ(*is_set.get_data(), true);
 
     // Put the results on OpenMP and run CPU side assertions
-    omp->copy_from(dpcpp2.get(), 2, copy_dpcpp2, copy);
+    ref->copy_from(dpcpp2.get(), 2, copy_dpcpp2, copy);
     EXPECT_EQ(3, copy[0]);
     ASSERT_EQ(8, copy[1]);
     dpcpp2->free(copy_dpcpp2);
@@ -269,4 +269,31 @@ TEST_F(DpcppExecutor, Synchronizes)
 }
 
 
+#define GTEST_ASSERT_NO_EXIT(statement) \
+    ASSERT_EXIT({ {statement} exit(0); }, ::testing::ExitedWithCode(0), "")
+
+
+TEST_F(DpcppExecutor, FreeAfterKernel)
+{
+    testing::FLAGS_gtest_death_test_style = "threadsafe";
+    GTEST_ASSERT_NO_EXIT({
+        size_t length = 10000;
+        auto dpcpp =
+            gko::DpcppExecutor::create(0, gko::ReferenceExecutor::create());
+        {
+            gko::Array<float> x(dpcpp, length);
+            gko::Array<float> y(dpcpp, length);
+            auto x_val = x.get_data();
+            auto y_val = y.get_data();
+            dpcpp->get_queue()->submit([&](sycl::handler &cgh) {
+                cgh.parallel_for(sycl::range<1>{length},
+                                 [=](sycl::id<1> i) { y_val[i] += x_val[i]; });
+            });
+        }
+        // to ensure everything on queue is finished.
+        dpcpp->synchronize();
+    });
+}
+
+
 }  // namespace
diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp
index d9ca2e3af32..0d96cb4cd66 100644
--- a/include/ginkgo/core/base/executor.hpp
+++ b/include/ginkgo/core/base/executor.hpp
@@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include <array>
+#include <iostream>
 #include <memory>
 #include <mutex>
 #include <sstream>
@@ -621,7 +622,12 @@ class Executor : public log::EnableLogging<Executor> {
             this->raw_copy_from(src_exec, num_elems * sizeof(T), src_ptr,
                                 dest_ptr);
         } catch (NotSupported &) {
+#if (GKO_VERBOSE_LEVEL >= 1) && !defined(NDEBUG)
             // Unoptimized copy. Try to go through the masters.
+            // output to log when verbose >= 1 and debug build
+            std::clog << "Not direct copy. Try to copy data from the masters."
+                      << std::endl;
+#endif
             auto src_master = src_exec->get_master().get();
             if (num_elems > 0 && src_master != src_exec) {
                 auto *master_ptr = src_exec->get_master()->alloc<T>(num_elems);
diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp
index 2cd1784b28e..81f7349daa8 100644
--- a/include/ginkgo/core/base/math.hpp
+++ b/include/ginkgo/core/base/math.hpp
@@ -41,7 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <type_traits>
 
 
-#ifdef CL_SYCL_LANGUAGE_VERSION
+#ifdef SYCL_LANGUAGE_VERSION
 #include <CL/sycl.hpp>
 #endif