diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index cbb9931f3b5..df3265cb58c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -722,7 +722,8 @@ build/dpcpp/cpu/release/static: BUILD_SHARED_LIBS: "ON" SYCL_DEVICE_TYPE: "CPU" -build/dpcpp/igpu/release/static: +# It gives two available backends of GPU on tests +build/dpcpp/igpu/release/shared: <<: *default_build_with_test extends: - .full_test_condition @@ -733,13 +734,59 @@ build/dpcpp/igpu/release/static: CXX_COMPILER: "dpcpp" BUILD_DPCPP: "ON" BUILD_TYPE: "Release" + BUILD_SHARED_LIBS: "ON" + DPCPP_SINGLE_MODE: "ON" + SYCL_DEVICE_TYPE: "GPU" + +build/dpcpp/opencl_igpu/release/static: + <<: *default_build_with_test + extends: + - .quick_test_condition + - .use_gko-oneapi-igpu + variables: + <<: *default_variables + C_COMPILER: "gcc" + CXX_COMPILER: "dpcpp" + BUILD_DPCPP: "ON" + BUILD_TYPE: "Release" + BUILD_SHARED_LIBS: "OFF" + DPCPP_SINGLE_MODE: "ON" + SYCL_DEVICE_FILTER: "OpenCL" + SYCL_DEVICE_TYPE: "GPU" + +build/dpcpp/level_zero_igpu/debug/static: + <<: *default_build_with_test + extends: + - .full_test_condition + - .use_gko-oneapi-igpu + variables: + <<: *default_variables + C_COMPILER: "gcc" + CXX_COMPILER: "dpcpp" + BUILD_DPCPP: "ON" + BUILD_TYPE: "Debug" BUILD_SHARED_LIBS: "OFF" DPCPP_SINGLE_MODE: "ON" SYCL_DEVICE_FILTER: "Level_Zero:GPU" -build/dpcpp/dgpu/debug/shared: +# It gives two available backends of GPU on tests +build/dpcpp/dgpu/release/static: + <<: *default_build_with_test + extends: + - .quick_test_condition + - .use_gko-oneapi-igpu + variables: + <<: *default_variables + C_COMPILER: "gcc" + CXX_COMPILER: "dpcpp" + BUILD_DPCPP: "ON" + BUILD_TYPE: "Release" + BUILD_SHARED_LIBS: "OF" + DPCPP_SINGLE_MODE: "ON" + SYCL_DEVICE_TYPE: "GPU" + +build/dpcpp/level_zero_dgpu/release/shared: <<: *default_build_with_test - image: localhost:5000/gko-oneapi extends: - .quick_test_condition - .use_gko-oneapi-dgpu @@ -748,10 +795,24 @@ build/dpcpp/dgpu/debug/shared: C_COMPILER: "gcc" CXX_COMPILER: "dpcpp" BUILD_DPCPP: "ON" - BUILD_TYPE: "Debug" + BUILD_TYPE: "Release" DPCPP_SINGLE_MODE: "ON" SYCL_DEVICE_FILTER: "Level_Zero:GPU" +build/dpcpp/opencl_dgpu/debug/shared: + <<: *default_build_with_test + extends: + - .full_test_condition + - .use_gko-oneapi-dgpu + variables: + <<: *default_variables + C_COMPILER: "gcc" + CXX_COMPILER: "dpcpp" + BUILD_DPCPP: "ON" + BUILD_TYPE: "Debug" + DPCPP_SINGLE_MODE: "ON" + SYCL_DEVICE_FILTER: "OpenCL" + SYCL_DEVICE_TYPE: "GPU" # Job with important warnings as error warnings: diff --git a/core/test/base/executor.cpp b/core/test/base/executor.cpp index 39ea82f6b6d..5f1ec600cd2 100644 --- a/core/test/base/executor.cpp +++ b/core/test/base/executor.cpp @@ -541,14 +541,20 @@ TEST(Executor, CanVerifyMemory) std::shared_ptr host_dpcpp; std::shared_ptr cpu_dpcpp; std::shared_ptr gpu_dpcpp; + std::shared_ptr host_dpcpp_dup; + std::shared_ptr cpu_dpcpp_dup; + std::shared_ptr gpu_dpcpp_dup; if (gko::DpcppExecutor::get_num_devices("host")) { host_dpcpp = gko::DpcppExecutor::create(0, omp, "host"); + host_dpcpp_dup = gko::DpcppExecutor::create(0, omp, "host"); } if (gko::DpcppExecutor::get_num_devices("cpu")) { cpu_dpcpp = gko::DpcppExecutor::create(0, omp, "cpu"); + cpu_dpcpp_dup = gko::DpcppExecutor::create(0, omp, "cpu"); } if (gko::DpcppExecutor::get_num_devices("gpu")) { gpu_dpcpp = gko::DpcppExecutor::create(0, omp, "gpu"); + gpu_dpcpp_dup = gko::DpcppExecutor::create(0, omp, "gpu"); } ASSERT_EQ(false, ref->memory_accessible(omp)); @@ -566,18 +572,24 @@ TEST(Executor, CanVerifyMemory) ASSERT_EQ(false, ref->memory_accessible(host_dpcpp)); ASSERT_EQ(true, host_dpcpp->memory_accessible(omp)); ASSERT_EQ(true, omp->memory_accessible(host_dpcpp)); + ASSERT_EQ(true, host_dpcpp->memory_accessible(host_dpcpp_dup)); + ASSERT_EQ(true, host_dpcpp_dup->memory_accessible(host_dpcpp)); } if (gko::DpcppExecutor::get_num_devices("cpu")) { ASSERT_EQ(false, ref->memory_accessible(cpu_dpcpp)); ASSERT_EQ(false, cpu_dpcpp->memory_accessible(ref)); ASSERT_EQ(true, cpu_dpcpp->memory_accessible(omp)); ASSERT_EQ(true, omp->memory_accessible(cpu_dpcpp)); + ASSERT_EQ(true, cpu_dpcpp->memory_accessible(cpu_dpcpp_dup)); + ASSERT_EQ(true, cpu_dpcpp_dup->memory_accessible(cpu_dpcpp)); } if (gko::DpcppExecutor::get_num_devices("gpu")) { ASSERT_EQ(false, gpu_dpcpp->memory_accessible(ref)); ASSERT_EQ(false, ref->memory_accessible(gpu_dpcpp)); ASSERT_EQ(false, gpu_dpcpp->memory_accessible(omp)); ASSERT_EQ(false, omp->memory_accessible(gpu_dpcpp)); + ASSERT_EQ(false, gpu_dpcpp->memory_accessible(gpu_dpcpp_dup)); + ASSERT_EQ(false, gpu_dpcpp_dup->memory_accessible(gpu_dpcpp)); } #if GINKGO_HIP_PLATFORM_NVCC ASSERT_EQ(true, hip->memory_accessible(cuda)); diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp index 39c3e4557e6..d0380127732 100644 --- a/dpcpp/base/executor.dp.cpp +++ b/dpcpp/base/executor.dp.cpp @@ -102,8 +102,29 @@ void DpcppExecutor::populate_exec_info(const MachineTopology *mach_topo) void DpcppExecutor::raw_free(void *ptr) const noexcept { - queue_->wait_and_throw(); - sycl::free(ptr, queue_->get_context()); + // the free function may syncronize excution or not, which depends on + // implementation or backend, so it is not guaranteed. + // TODO: maybe a light wait implementation? + try { + queue_->wait_and_throw(); + sycl::free(ptr, queue_->get_context()); + } catch (cl::sycl::exception &err) { +#if GKO_VERBOSE_LEVEL >= 1 + // Unfortunately, if memory free fails, there's not much we can do + std::cerr << "Unrecoverable Dpcpp error on device " + << this->get_device_id() << " in " << __func__ << ": " + << err.what() << std::endl + << "Exiting program" << std::endl; +#endif // GKO_VERBOSE_LEVEL >= 1 + // OpenCL error code use 0 for CL_SUCCESS and negative number for others + // error. if the error is not from OpenCL, it will return CL_SUCCESS. + int err_code = err.get_cl_code(); + // if return CL_SUCCESS, exit 1 as DPCPP error. + if (err_code == 0) { + err_code = 1; + } + std::exit(err_code); + } } @@ -144,7 +165,21 @@ void DpcppExecutor::raw_copy_to(const DpcppExecutor *dest, size_type num_bytes, const void *src_ptr, void *dest_ptr) const { if (num_bytes > 0) { - dest->get_queue()->memcpy(dest_ptr, src_ptr, num_bytes).wait(); + // If the queue is different and is not cpu/host, the queue can not + // transfer the data to another queue (on the same device) + // Note. it could be changed when we ensure the behavior is expected. + auto queue = this->get_queue(); + auto dest_queue = dest->get_queue(); + auto device = queue->get_device(); + auto dest_device = dest_queue->get_device(); + if (((device.is_host() || device.is_cpu()) && + (dest_device.is_host() || dest_device.is_cpu())) || + (queue == dest_queue)) { + dest->get_queue()->memcpy(dest_ptr, src_ptr, num_bytes).wait(); + } else { + // the memcpy only support host<->device or itself memcpy + GKO_NOT_SUPPORTED(dest); + } } } @@ -176,15 +211,16 @@ bool DpcppExecutor::verify_memory_to(const OmpExecutor *dest_exec) const bool DpcppExecutor::verify_memory_to(const DpcppExecutor *dest_exec) const { - auto device = detail::get_devices( - get_exec_info().device_type)[get_exec_info().device_id]; - auto other_device = detail::get_devices( - dest_exec->get_device_type())[dest_exec->get_device_id()]; + // If the queue is different and is not cpu/host, the queue can not access + // the data from another queue (on the same device) + // Note. it could be changed when we ensure the behavior is expected. + auto queue = this->get_queue(); + auto dest_queue = dest_exec->get_queue(); + auto device = queue->get_device(); + auto dest_device = dest_queue->get_device(); return ((device.is_host() || device.is_cpu()) && - (other_device.is_host() || other_device.is_cpu())) || - (device.get_info() == - other_device.get_info() && - device.get() == other_device.get()); + (dest_device.is_host() || dest_device.is_cpu())) || + (queue == dest_queue); } diff --git a/dpcpp/test/base/executor.dp.cpp b/dpcpp/test/base/executor.dp.cpp index 39d9e1e3313..830543349d5 100644 --- a/dpcpp/test/base/executor.dp.cpp +++ b/dpcpp/test/base/executor.dp.cpp @@ -55,20 +55,20 @@ namespace { class DpcppExecutor : public ::testing::Test { protected: DpcppExecutor() - : omp(gko::OmpExecutor::create()), dpcpp(nullptr), dpcpp2(nullptr) + : ref(gko::ReferenceExecutor::create()), dpcpp(nullptr), dpcpp2(nullptr) {} void SetUp() { if (gko::DpcppExecutor::get_num_devices("gpu") > 0) { - dpcpp = gko::DpcppExecutor::create(0, omp, "gpu"); + dpcpp = gko::DpcppExecutor::create(0, ref, "gpu"); if (gko::DpcppExecutor::get_num_devices("gpu") > 1) { - dpcpp2 = gko::DpcppExecutor::create(1, omp, "gpu"); + dpcpp2 = gko::DpcppExecutor::create(1, ref, "gpu"); } } else if (gko::DpcppExecutor::get_num_devices("cpu") > 0) { - dpcpp = gko::DpcppExecutor::create(0, omp, "cpu"); + dpcpp = gko::DpcppExecutor::create(0, ref, "cpu"); if (gko::DpcppExecutor::get_num_devices("cpu") > 1) { - dpcpp2 = gko::DpcppExecutor::create(1, omp, "cpu"); + dpcpp2 = gko::DpcppExecutor::create(1, ref, "cpu"); } } else { GKO_NOT_IMPLEMENTED; @@ -84,7 +84,7 @@ class DpcppExecutor : public ::testing::Test { } } - std::shared_ptr omp{}; + std::shared_ptr ref{}; std::shared_ptr dpcpp{}; std::shared_ptr dpcpp2{}; }; @@ -92,9 +92,9 @@ class DpcppExecutor : public ::testing::Test { TEST_F(DpcppExecutor, CanInstantiateTwoExecutorsOnOneDevice) { - auto dpcpp = gko::DpcppExecutor::create(0, omp); + auto dpcpp = gko::DpcppExecutor::create(0, ref); if (dpcpp2 != nullptr) { - auto dpcpp2 = gko::DpcppExecutor::create(0, omp); + auto dpcpp2 = gko::DpcppExecutor::create(0, ref); } // We want automatic deinitialization to not create any error @@ -103,7 +103,7 @@ TEST_F(DpcppExecutor, CanInstantiateTwoExecutorsOnOneDevice) TEST_F(DpcppExecutor, CanGetExecInfo) { - dpcpp = gko::DpcppExecutor::create(0, omp); + dpcpp = gko::DpcppExecutor::create(0, ref); ASSERT_TRUE(dpcpp->get_num_computing_units() > 0); ASSERT_TRUE(dpcpp->get_subgroup_sizes().size() > 0); @@ -191,9 +191,9 @@ TEST_F(DpcppExecutor, CopiesDataToCPU) { int orig[] = {3, 8}; auto *copy = dpcpp->alloc(2); - gko::Array is_set(omp, 1); + gko::Array is_set(ref, 1); - dpcpp->copy_from(omp.get(), 2, orig, copy); + dpcpp->copy_from(ref.get(), 2, orig, copy); is_set.set_executor(dpcpp); ASSERT_NO_THROW(dpcpp->synchronize()); @@ -201,7 +201,7 @@ TEST_F(DpcppExecutor, CopiesDataToCPU) auto *is_set_ptr = is_set.get_data(); cgh.single_task([=]() { check_data(copy, is_set_ptr); }); })); - is_set.set_executor(omp); + is_set.set_executor(ref); ASSERT_EQ(*is_set.get_data(), true); ASSERT_NO_THROW(dpcpp->synchronize()); dpcpp->free(copy); @@ -221,7 +221,7 @@ TEST_F(DpcppExecutor, CopiesDataFromCPU) cgh.single_task([=]() { init_data(orig); }); }); - omp->copy_from(dpcpp.get(), 2, orig, copy); + ref->copy_from(dpcpp.get(), 2, orig, copy); EXPECT_EQ(3, copy[0]); ASSERT_EQ(8, copy[1]); @@ -236,7 +236,7 @@ TEST_F(DpcppExecutor, CopiesDataFromDpcppToDpcpp) } int copy[2]; - gko::Array is_set(omp, 1); + gko::Array is_set(ref, 1); auto orig = dpcpp->alloc(2); dpcpp->get_queue()->submit([&](sycl::handler &cgh) { cgh.single_task([=]() { init_data(orig); }); @@ -250,11 +250,11 @@ TEST_F(DpcppExecutor, CopiesDataFromDpcppToDpcpp) auto *is_set_ptr = is_set.get_data(); cgh.single_task([=]() { check_data(copy_dpcpp2, is_set_ptr); }); })); - is_set.set_executor(omp); + is_set.set_executor(ref); ASSERT_EQ(*is_set.get_data(), true); // Put the results on OpenMP and run CPU side assertions - omp->copy_from(dpcpp2.get(), 2, copy_dpcpp2, copy); + ref->copy_from(dpcpp2.get(), 2, copy_dpcpp2, copy); EXPECT_EQ(3, copy[0]); ASSERT_EQ(8, copy[1]); dpcpp2->free(copy_dpcpp2); @@ -269,4 +269,31 @@ TEST_F(DpcppExecutor, Synchronizes) } +#define GTEST_ASSERT_NO_EXIT(statement) \ + ASSERT_EXIT({ {statement} exit(0); }, ::testing::ExitedWithCode(0), "") + + +TEST_F(DpcppExecutor, FreeAfterKernel) +{ + testing::FLAGS_gtest_death_test_style = "threadsafe"; + GTEST_ASSERT_NO_EXIT({ + size_t length = 10000; + auto dpcpp = + gko::DpcppExecutor::create(0, gko::ReferenceExecutor::create()); + { + gko::Array x(dpcpp, length); + gko::Array y(dpcpp, length); + auto x_val = x.get_data(); + auto y_val = y.get_data(); + dpcpp->get_queue()->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::range<1>{length}, + [=](sycl::id<1> i) { y_val[i] += x_val[i]; }); + }); + } + // to ensure everything on queue is finished. + dpcpp->synchronize(); + }); +} + + } // namespace diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index d9ca2e3af32..0d96cb4cd66 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include @@ -621,7 +622,12 @@ class Executor : public log::EnableLogging { this->raw_copy_from(src_exec, num_elems * sizeof(T), src_ptr, dest_ptr); } catch (NotSupported &) { +#if (GKO_VERBOSE_LEVEL >= 1) && !defined(NDEBUG) // Unoptimized copy. Try to go through the masters. + // output to log when verbose >= 1 and debug build + std::clog << "Not direct copy. Try to copy data from the masters." + << std::endl; +#endif auto src_master = src_exec->get_master().get(); if (num_elems > 0 && src_master != src_exec) { auto *master_ptr = src_exec->get_master()->alloc(num_elems); diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 2cd1784b28e..81f7349daa8 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -41,7 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#ifdef CL_SYCL_LANGUAGE_VERSION +#ifdef SYCL_LANGUAGE_VERSION #include #endif