diff --git a/dpcpp/base/config.hpp b/dpcpp/base/config.hpp index 94e15508d64..645826dcf94 100644 --- a/dpcpp/base/config.hpp +++ b/dpcpp/base/config.hpp @@ -72,6 +72,13 @@ struct config { }; +#if SYCL_LANGUAGE_VERSION < 202000 +#define KERNEL_SUBGROUP_SIZE(val) [[intel::reqd_sub_group_size(val)]] +#else +#define KERNEL_SUBGROUP_SIZE(val) [[sycl::reqd_sub_group_size(val)]] +#endif + + } // namespace dpcpp } // namespace kernels } // namespace gko diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp index 8dd89d9f27a..f844a5f3272 100644 --- a/dpcpp/base/helper.hpp +++ b/dpcpp/base/helper.hpp @@ -56,19 +56,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * @param name_ the name of the host function with config * @param kernel_ the kernel name */ -#define GKO_ENABLE_DEFAULT_HOST(name_, kernel_) \ - template \ - void name_(dim3 grid, dim3 block, gko::size_type, sycl::queue* queue, \ - InferredArgs... args) \ - { \ - queue->submit([&](sycl::handler& cgh) { \ - cgh.parallel_for( \ - sycl_nd_range(grid, block), [= \ - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( \ - config::warp_size)]] { \ - kernel_(args..., item_ct1); \ - }); \ - }); \ +#define GKO_ENABLE_DEFAULT_HOST(name_, kernel_) \ + template \ + void name_(dim3 grid, dim3 block, gko::size_type, sycl::queue* queue, \ + InferredArgs... args) \ + { \ + queue->submit([&](sycl::handler& cgh) { \ + cgh.parallel_for(sycl_nd_range(grid, block), \ + [=](sycl::nd_item<3> item_ct1) \ + KERNEL_SUBGROUP_SIZE(config::warp_size) { \ + kernel_(args..., item_ct1); \ + }); \ + }); \ } @@ -80,19 +79,19 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * @param name_ the name of the host function with config * @param kernel_ the kernel name */ -#define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_) \ - template \ - inline void name_(dim3 grid, dim3 block, gko::size_type, \ - sycl::queue* queue, InferredArgs... args) \ - { \ - queue->submit([&](sycl::handler& cgh) { \ - cgh.parallel_for( \ - sycl_nd_range(grid, block), [= \ - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( \ - KCfg::decode<1>(encoded))]] { \ - kernel_(args..., item_ct1); \ - }); \ - }); \ +#define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_) \ + template \ + inline void name_(dim3 grid, dim3 block, gko::size_type, \ + sycl::queue* queue, InferredArgs... args) \ + { \ + queue->submit([&](sycl::handler& cgh) { \ + cgh.parallel_for( \ + sycl_nd_range(grid, block), \ + [=](sycl::nd_item<3> item_ct1) \ + KERNEL_SUBGROUP_SIZE(KCFG_1D::decode<1>(encoded)) { \ + kernel_(args..., item_ct1); \ + }); \ + }); \ } /** diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp index a6706ee320c..49fd299a4d1 100644 --- a/dpcpp/base/kernel_launch_reduction.dp.hpp +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -80,8 +80,7 @@ void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size, const auto global_size = num_workgroups * wg_size; cgh.parallel_for( - range, [= - ](sycl::nd_item<3> idx) [[sycl::reqd_sub_group_size(sg_size)]] { + range, [=](sycl::nd_item<3> idx) KERNEL_SUBGROUP_SIZE(sg_size) { auto subgroup_partial = &(*subgroup_partial_acc.get_pointer())[0]; const auto tidx = thread::get_thread_id_flat(idx); const auto local_tidx = static_cast(tidx % wg_size); @@ -129,8 +128,7 @@ void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols, const auto global_size = num_workgroups * wg_size; cgh.parallel_for( - range, [= - ](sycl::nd_item<3> idx) [[sycl::reqd_sub_group_size(sg_size)]] { + range, [=](sycl::nd_item<3> idx) KERNEL_SUBGROUP_SIZE(sg_size) { auto subgroup_partial = &(*subgroup_partial_acc.get_pointer())[0]; const auto tidx = thread::get_thread_id_flat(idx); const auto local_tidx = static_cast(tidx % wg_size); @@ -310,38 +308,35 @@ void generic_kernel_row_reduction_2d(syn::value_list, const auto num_workgroups = ceildiv(rows * col_blocks * ssg_size, wg_size); const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size)); exec->get_queue()->submit([&](sycl::handler& cgh) { - cgh.parallel_for( - range, [= - ](sycl::nd_item<3> id) [[sycl::reqd_sub_group_size(sg_size)]] { - const auto idx = - thread::get_subwarp_id_flat(id); - const auto row = idx % rows; - const auto col_block = idx / rows; - auto partial = identity; - auto subgroup = group::tiled_partition( - group::this_thread_block(id)); - auto ssg_rank = - static_cast(subgroup.thread_rank() % ssg_size); - if (col_block < col_blocks) { - const auto cols_per_part = - ceildiv(ceildiv(cols, ssg_size), col_blocks) * ssg_size; - const auto begin = cols_per_part * col_block; - const auto end = min(begin + cols_per_part, cols); - for (auto col = begin + ssg_rank; col < end; - col += ssg_size) { - partial = op(partial, fn(row, col, args...)); - } + cgh.parallel_for(range, [=](sycl::nd_item<3> id) KERNEL_SUBGROUP_SIZE( + sg_size) { + const auto idx = thread::get_subwarp_id_flat(id); + const auto row = idx % rows; + const auto col_block = idx / rows; + auto partial = identity; + auto subgroup = + group::tiled_partition(group::this_thread_block(id)); + auto ssg_rank = + static_cast(subgroup.thread_rank() % ssg_size); + if (col_block < col_blocks) { + const auto cols_per_part = + ceildiv(ceildiv(cols, ssg_size), col_blocks) * ssg_size; + const auto begin = cols_per_part * col_block; + const auto end = min(begin + cols_per_part, cols); + for (auto col = begin + ssg_rank; col < end; col += ssg_size) { + partial = op(partial, fn(row, col, args...)); } + } // since we do a sub-subgroup reduction, we can't use reduce #pragma unroll - for (int i = 1; i < ssg_size; i *= 2) { - partial = op(partial, subgroup.shfl_xor(partial, i)); - } - if (col_block < col_blocks && ssg_rank == 0) { - result[(row + col_block * rows) * result_stride] = - finalize(partial); - } - }); + for (int i = 1; i < ssg_size; i *= 2) { + partial = op(partial, subgroup.shfl_xor(partial, i)); + } + if (col_block < col_blocks && ssg_rank == 0) { + result[(row + col_block * rows) * result_stride] = + finalize(partial); + } + }); }); } @@ -367,60 +362,57 @@ void generic_kernel_col_reduction_2d_small( sycl::access_mode::read_write, sycl::access::target::local> block_partial_acc(cgh); const auto range = sycl_nd_range(dim3(row_blocks), dim3(wg_size)); - cgh.parallel_for( - range, [=](sycl::nd_item<3> id) [[sycl::reqd_sub_group_size(sg_size)]] { - auto block_partial = &(*block_partial_acc.get_pointer())[0]; - const auto ssg_id = - thread::get_subwarp_id_flat(id); - const auto local_sg_id = id.get_local_id(2) / sg_size; - const auto local_ssg_id = id.get_local_id(2) % sg_size / ssg_size; - const auto ssg_num = - thread::get_subwarp_num_flat(id); - const auto workgroup = group::this_thread_block(id); - const auto subgroup = group::tiled_partition(workgroup); - const auto sg_rank = subgroup.thread_rank(); - const auto ssg_rank = sg_rank % ssg_size; - const auto col = static_cast(ssg_rank); - auto partial = identity; - // accumulate within a thread - if (col < cols) { - for (auto row = ssg_id; row < rows; row += ssg_num) { - partial = op(partial, fn(row, col, args...)); - } + cgh.parallel_for(range, [=](sycl::nd_item<3> id) KERNEL_SUBGROUP_SIZE( + sg_size) { + auto block_partial = &(*block_partial_acc.get_pointer())[0]; + const auto ssg_id = thread::get_subwarp_id_flat(id); + const auto local_sg_id = id.get_local_id(2) / sg_size; + const auto local_ssg_id = id.get_local_id(2) % sg_size / ssg_size; + const auto ssg_num = thread::get_subwarp_num_flat(id); + const auto workgroup = group::this_thread_block(id); + const auto subgroup = group::tiled_partition(workgroup); + const auto sg_rank = subgroup.thread_rank(); + const auto ssg_rank = sg_rank % ssg_size; + const auto col = static_cast(ssg_rank); + auto partial = identity; + // accumulate within a thread + if (col < cols) { + for (auto row = ssg_id; row < rows; row += ssg_num) { + partial = op(partial, fn(row, col, args...)); } + } // accumulate between all subsubgroups in the subgroup #pragma unroll - for (unsigned i = ssg_size; i < sg_size; i *= 2) { - partial = op(partial, subgroup.shfl_xor(partial, i)); - } - // store the result to shared memory - if (local_ssg_id == 0) { - block_partial[local_sg_id * ssg_size + ssg_rank] = partial; - } - workgroup.sync(); - // in a single thread: accumulate the results - if (local_sg_id == 0) { - partial = identity; - // accumulate the partial results within a thread - if (shared_storage >= sg_size) { + for (unsigned i = ssg_size; i < sg_size; i *= 2) { + partial = op(partial, subgroup.shfl_xor(partial, i)); + } + // store the result to shared memory + if (local_ssg_id == 0) { + block_partial[local_sg_id * ssg_size + ssg_rank] = partial; + } + workgroup.sync(); + // in a single thread: accumulate the results + if (local_sg_id == 0) { + partial = identity; + // accumulate the partial results within a thread + if (shared_storage >= sg_size) { #pragma unroll - for (int i = 0; i < shared_storage; i += sg_size) { - partial = op(partial, block_partial[i + sg_rank]); - } - } else if (sg_rank < shared_storage) { - partial = op(partial, block_partial[sg_rank]); + for (int i = 0; i < shared_storage; i += sg_size) { + partial = op(partial, block_partial[i + sg_rank]); } + } else if (sg_rank < shared_storage) { + partial = op(partial, block_partial[sg_rank]); + } // accumulate between all subsubgroups in the subgroup #pragma unroll - for (unsigned i = ssg_size; i < sg_size; i *= 2) { - partial = op(partial, subgroup.shfl_xor(partial, i)); - } - if (sg_rank < cols) { - result[sg_rank + id.get_group(2) * cols] = - finalize(partial); - } + for (unsigned i = ssg_size; i < sg_size; i *= 2) { + partial = op(partial, subgroup.shfl_xor(partial, i)); } - }); + if (sg_rank < cols) { + result[sg_rank + id.get_group(2) * cols] = finalize(partial); + } + } + }); } @@ -440,7 +432,7 @@ void generic_kernel_col_reduction_2d_blocked( sycl::access_mode::read_write, sycl::access::target::local> block_partial_acc(cgh); cgh.parallel_for( - range, [=](sycl::nd_item<3> id) [[sycl::reqd_sub_group_size(sg_size)]] { + range, [=](sycl::nd_item<3> id) KERNEL_SUBGROUP_SIZE(sg_size) { const auto sg_id = thread::get_subwarp_id_flat(id); const auto sg_num = thread::get_subwarp_num_flat(id); diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 7774f8c9caa..e657c4d46bd 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -678,9 +678,8 @@ void abstract_classical_spmv(dim3 grid, dim3 block, { queue->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - subgroup_size)]] { + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) KERNEL_SUBGROUP_SIZE(subgroup_size) { abstract_classical_spmv(num_rows, val, col_idxs, row_ptrs, b, b_stride, c, c_stride, item_ct1); @@ -971,13 +970,14 @@ void reduce_total_cols(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::access::target::local> block_result_acc_ct1(sycl::range<1>(default_block_size), cgh); - cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - reduce_total_cols(num_slices, max_nnz_per_slice, result, - item_ct1, block_result_acc_ct1.get_pointer()); - }); + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) + KERNEL_SUBGROUP_SIZE(config::warp_size) { + reduce_total_cols( + num_slices, max_nnz_per_slice, result, + item_ct1, + block_result_acc_ct1.get_pointer()); + }); }); } @@ -1004,13 +1004,12 @@ void reduce_max_nnz(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::access::target::local> block_max_acc_ct1(sycl::range<1>(default_block_size), cgh); - cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - reduce_max_nnz(size, nnz_per_row, result, item_ct1, - block_max_acc_ct1.get_pointer()); - }); + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) KERNEL_SUBGROUP_SIZE( + config::warp_size) { + reduce_max_nnz(size, nnz_per_row, result, item_ct1, + block_max_acc_ct1.get_pointer()); + }); }); } diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 78a797f8ec2..648a14fea9b 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -301,13 +301,13 @@ void reduce_max_nnz(dim3 grid, dim3 block, size_type dynamic_shared_memory, dpct_local_acc_ct1(sycl::range<1>(dynamic_shared_memory), cgh); - cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - KCFG_1D::decode<1>(cfg))]] { - reduce_max_nnz(size, nnz_per_row, result, item_ct1, - dpct_local_acc_ct1.get_pointer().get()); - }); + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) + KERNEL_SUBGROUP_SIZE(KCFG_1D::decode<1>(cfg)) { + reduce_max_nnz( + size, nnz_per_row, result, item_ct1, + dpct_local_acc_ct1.get_pointer().get()); + }); }); } @@ -382,14 +382,14 @@ void reduce_total_cols(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::access::target::local> dpct_local_acc_ct1(sycl::range<1>(dynamic_shared_memory), cgh); - cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - KCFG_1D::decode<1>(cfg))]] { - reduce_total_cols(num_slices, max_nnz_per_slice, result, - item_ct1, - dpct_local_acc_ct1.get_pointer().get()); - }); + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) + KERNEL_SUBGROUP_SIZE(KCFG_1D::decode<1>(cfg)) { + reduce_total_cols( + num_slices, max_nnz_per_slice, result, + item_ct1, + dpct_local_acc_ct1.get_pointer().get()); + }); }); } diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp index 4c0d948bd3b..b8f16c76c5b 100644 --- a/dpcpp/matrix/ell_kernels.dp.cpp +++ b/dpcpp/matrix/ell_kernels.dp.cpp @@ -371,12 +371,13 @@ void count_nnz_per_row(dim3 grid, dim3 block, size_type dynamic_shared_memory, const ValueType* values, IndexType* result) { queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(warp_size)]] { - count_nnz_per_row(num_rows, max_nnz_per_row, stride, values, - result, item_ct1); - }); + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) + KERNEL_SUBGROUP_SIZE(config::warp_size) { + count_nnz_per_row(num_rows, max_nnz_per_row, + stride, values, result, + item_ct1); + }); }); } diff --git a/dpcpp/matrix/hybrid_kernels.dp.cpp b/dpcpp/matrix/hybrid_kernels.dp.cpp index 4ba7169b81b..db16f14f8ce 100644 --- a/dpcpp/matrix/hybrid_kernels.dp.cpp +++ b/dpcpp/matrix/hybrid_kernels.dp.cpp @@ -144,9 +144,8 @@ void count_coo_row_nnz(dim3 grid, dim3 block, size_type dynamic_shared_memory, { queue->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - subgroup_size)]] { + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) KERNEL_SUBGROUP_SIZE(subgroup_size) { count_coo_row_nnz(nnz, num_lines, val, row, nnz_per_row, item_ct1); }); diff --git a/dpcpp/solver/cb_gmres_kernels.dp.cpp b/dpcpp/solver/cb_gmres_kernels.dp.cpp index 47ed894faec..22b9a896883 100644 --- a/dpcpp/solver/cb_gmres_kernels.dp.cpp +++ b/dpcpp/solver/cb_gmres_kernels.dp.cpp @@ -277,14 +277,14 @@ void multinorm2_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, reduction_helper_array_acc_ct1(cgh); cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - default_dot_dim)]] { - multinorm2_kernel(num_rows, num_cols, next_krylov_basis, - stride_next_krylov, norms, stop_status, - item_ct1, - reduction_helper_array_acc_ct1.get_pointer()); - }); + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) + KERNEL_SUBGROUP_SIZE(default_dot_dim) { + multinorm2_kernel( + num_rows, num_cols, next_krylov_basis, + stride_next_krylov, norms, stop_status, item_ct1, + reduction_helper_array_acc_ct1.get_pointer()); + }); }); } @@ -354,14 +354,14 @@ void multinorminf_without_stop_kernel( reduction_helper_array_acc_ct1(cgh); cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - default_dot_dim)]] { - multinorminf_without_stop_kernel( - num_rows, num_cols, next_krylov_basis, stride_next_krylov, - norms, stride_norms, item_ct1, - reduction_helper_array_acc_ct1.get_pointer()); - }); + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) + KERNEL_SUBGROUP_SIZE(default_dot_dim) { + multinorminf_without_stop_kernel( + num_rows, num_cols, next_krylov_basis, + stride_next_krylov, norms, stride_norms, item_ct1, + reduction_helper_array_acc_ct1.get_pointer()); + }); }); } @@ -457,14 +457,14 @@ void multinorm2_inf_kernel( reduction_helper_array_acc_ct1(cgh); cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - default_dot_dim)]] { - multinorm2_inf_kernel( - num_rows, num_cols, next_krylov_basis, stride_next_krylov, - norms1, norms2, stop_status, item_ct1, - reduction_helper_array_acc_ct1.get_pointer()); - }); + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) + KERNEL_SUBGROUP_SIZE(default_dot_dim) { + multinorm2_inf_kernel( + num_rows, num_cols, next_krylov_basis, + stride_next_krylov, norms1, norms2, stop_status, + item_ct1, reduction_helper_array_acc_ct1.get_pointer()); + }); }); } @@ -549,8 +549,8 @@ void multidot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, reduction_helper_array_acc_ct1(cgh); cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(dot_dim)]] { + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) KERNEL_SUBGROUP_SIZE(dot_dim) { multidot_kernel( num_rows, num_cols, next_krylov_basis, stride_next_krylov, krylov_bases, hessenberg_iter, stride_hessenberg, @@ -630,15 +630,15 @@ void singledot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, reduction_helper_array_acc_ct1(cgh); cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - singledot_kernel( - num_rows, next_krylov_basis, stride_next_krylov, - krylov_bases, hessenberg_iter, stride_hessenberg, - stop_status, item_ct1, - *reduction_helper_array_acc_ct1.get_pointer()); - }); + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) + KERNEL_SUBGROUP_SIZE(config::warp_size) { + singledot_kernel( + num_rows, next_krylov_basis, stride_next_krylov, + krylov_bases, hessenberg_iter, stride_hessenberg, + stop_status, item_ct1, + *reduction_helper_array_acc_ct1.get_pointer()); + }); }); } diff --git a/dpcpp/solver/gmres_kernels.dp.cpp b/dpcpp/solver/gmres_kernels.dp.cpp index d53257d7e20..dd11dc8a285 100644 --- a/dpcpp/solver/gmres_kernels.dp.cpp +++ b/dpcpp/solver/gmres_kernels.dp.cpp @@ -204,9 +204,9 @@ void multidot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, reduction_helper_array_acc_ct1(cgh); cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - default_dot_dim)]] { + sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) KERNEL_SUBGROUP_SIZE( + default_dot_dim) { multidot_kernel( k, num_rows, num_cols, krylov_bases, next_krylov_basis, stride_krylov, hessenberg_iter, stride_hessenberg, @@ -319,16 +319,15 @@ void update_hessenberg_2_kernel( sycl::access::target::local> reduction_helper_array_acc_ct1(cgh); - cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - update_hessenberg_2_kernel( - iter, num_rows, num_cols, next_krylov_basis, - stride_next_krylov, hessenberg_iter, stride_hessenberg, - stop_status, item_ct1, - *reduction_helper_array_acc_ct1.get_pointer()); - }); + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) KERNEL_SUBGROUP_SIZE( + config::warp_size) { + update_hessenberg_2_kernel( + iter, num_rows, num_cols, next_krylov_basis, + stride_next_krylov, hessenberg_iter, + stride_hessenberg, stop_status, item_ct1, + *reduction_helper_array_acc_ct1.get_pointer()); + }); }); } diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp index 4e07676663e..30a3561fdfb 100644 --- a/dpcpp/solver/idr_kernels.dp.cpp +++ b/dpcpp/solver/idr_kernels.dp.cpp @@ -173,14 +173,13 @@ void orthonormalize_subspace_vectors_kernel( sycl::access::target::local> reduction_helper_array_acc_ct1(cgh); - cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - config::warp_size)]] { - orthonormalize_subspace_vectors_kernel( - num_rows, num_cols, values, stride, item_ct1, - *reduction_helper_array_acc_ct1.get_pointer()); - }); + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) KERNEL_SUBGROUP_SIZE( + config::warp_size) { + orthonormalize_subspace_vectors_kernel( + num_rows, num_cols, values, stride, item_ct1, + *reduction_helper_array_acc_ct1.get_pointer()); + }); }); } @@ -385,14 +384,14 @@ void multidot_kernel(dim3 grid, dim3 block, size_t dynamic_shared_memory, sycl::access::target::local> reduction_helper_array_acc_ct1(cgh); - cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - default_dot_dim)]] { - multidot_kernel(num_rows, nrhs, p_i, g_k, g_k_stride, alpha, - stop_status, item_ct1, - *reduction_helper_array_acc_ct1.get_pointer()); - }); + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) KERNEL_SUBGROUP_SIZE( + default_dot_dim) { + multidot_kernel( + num_rows, nrhs, p_i, g_k, g_k_stride, alpha, + stop_status, item_ct1, + *reduction_helper_array_acc_ct1.get_pointer()); + }); }); } diff --git a/dpcpp/test/components/cooperative_groups.dp.cpp b/dpcpp/test/components/cooperative_groups.dp.cpp index 7984d4108e8..20acfb13f07 100644 --- a/dpcpp/test/components/cooperative_groups.dp.cpp +++ b/dpcpp/test/components/cooperative_groups.dp.cpp @@ -60,11 +60,11 @@ namespace { using namespace gko::kernels::dpcpp; -using KCfg = gko::ConfigSet<11, 7>; +using KCFG_1D = gko::ConfigSet<11, 7>; constexpr auto default_config_list = - ::gko::syn::value_list(); + ::gko::syn::value_list(); class CooperativeGroups : public testing::TestWithParam { @@ -90,7 +90,7 @@ class CooperativeGroups : public testing::TestWithParam { auto queue = dpcpp->get_queue(); if (gko::kernels::dpcpp::validate(queue, subgroup_size, subgroup_size)) { - const auto cfg = KCfg::encode(subgroup_size, subgroup_size); + const auto cfg = KCFG_1D::encode(subgroup_size, subgroup_size); for (int i = 0; i < test_case * subgroup_size; i++) { result.get_data()[i] = true; } @@ -117,10 +117,10 @@ class CooperativeGroups : public testing::TestWithParam { // kernel implementation template -__WG_BOUND__(KCfg::decode<0>(config)) +__WG_BOUND__(KCFG_1D::decode<0>(config)) void cg_shuffle(bool* s, sycl::nd_item<3> item_ct1) { - constexpr auto sg_size = KCfg::decode<1>(config); + constexpr auto sg_size = KCFG_1D::decode<1>(config); auto group = group::tiled_partition(group::this_thread_block(item_ct1)); auto i = int(group.thread_rank()); @@ -170,10 +170,10 @@ TEST_P(CooperativeGroups, Shuffle) template -__WG_BOUND__(KCfg::decode<0>(config)) +__WG_BOUND__(KCFG_1D::decode<0>(config)) void cg_all(bool* s, sycl::nd_item<3> item_ct1) { - constexpr auto sg_size = KCfg::decode<1>(config); + constexpr auto sg_size = KCFG_1D::decode<1>(config); auto group = group::tiled_partition(group::this_thread_block(item_ct1)); auto i = int(group.thread_rank()); @@ -192,11 +192,11 @@ TEST_P(CooperativeGroups, All) { test_all_subgroup(cg_all_call); } template -__WG_BOUND__(KCfg::decode<0>(config)) +__WG_BOUND__(KCFG_1D::decode<0>(config)) void cg_any(bool* s, sycl::nd_item<3> item_ct1) { - constexpr auto sg_size = KCfg::decode<1>(config); - auto group = group::tiled_partition(config)>( + constexpr auto sg_size = KCFG_1D::decode<1>(config); + auto group = group::tiled_partition(config)>( group::this_thread_block(item_ct1)); auto i = int(group.thread_rank()); @@ -213,10 +213,10 @@ TEST_P(CooperativeGroups, Any) { test_all_subgroup(cg_any_call); } template -__WG_BOUND__(KCfg::decode<0>(config)) +__WG_BOUND__(KCFG_1D::decode<0>(config)) void cg_ballot(bool* s, sycl::nd_item<3> item_ct1) { - constexpr auto sg_size = KCfg::decode<1>(config); + constexpr auto sg_size = KCFG_1D::decode<1>(config); auto group = group::tiled_partition(group::this_thread_block(item_ct1)); auto active = gko::detail::mask();