Improve check for compatibility of vector size and subgroup size in SYCL #4579

Merged

Changes from 4 commits
192 changes: 115 additions & 77 deletions core/src/SYCL/Kokkos_SYCL_Parallel_Team.hpp
@@ -410,6 +410,11 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
sycl::queue& q = *instance.m_queue;

auto parallel_for_event = q.submit([&](sycl::handler& cgh) {
// Avoid capturing *this since it might not be trivially copyable
const auto shmem_begin = m_shmem_begin;
const int scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]};
void* const scratch_ptr[2] = {m_scratch_ptr[0], m_scratch_ptr[1]};
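// (SYCL requires the kernel functor to be copyable to the device; capturing
// *this would copy the whole ParallelFor object, which may not be trivially
// copyable, so the members needed inside the kernel are copied into locals.)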

// FIXME_SYCL accessors seem to need a size greater than zero at least for
// host queues
sycl::accessor<char, 1, sycl::access::mode::read_write,
@@ -418,35 +423,49 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
sycl::range<1>(std::max(m_scratch_size[0] + m_shmem_begin, 1)),
cgh);

// Avoid capturing *this since it might not be trivially copyable
const auto shmem_begin = m_shmem_begin;
const int scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]};
void* const scratch_ptr[2] = {m_scratch_ptr[0], m_scratch_ptr[1]};
auto lambda = [=](sycl::nd_item<2> item) {
const member_type team_member(team_scratch_memory_L0.get_pointer(),
shmem_begin, scratch_size[0],
static_cast<char*>(scratch_ptr[1]) +
item.get_group(1) * scratch_size[1],
scratch_size[1], item);
if constexpr (std::is_same<work_tag, void>::value)
functor(team_member);
else
functor(work_tag(), team_member);
};

#if defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION > 20210903
static sycl::kernel kernel = [&] {
sycl::kernel_id functor_kernel_id =
sycl::get_kernel_id<decltype(lambda)>();
auto kernel_bundle =
sycl::get_kernel_bundle<sycl::bundle_state::executable>(
q.get_context(), std::vector{functor_kernel_id});
return kernel_bundle.get_kernel(functor_kernel_id);
}();
auto max_sg_size =
kernel
.get_info<sycl::info::kernel_device_specific::max_sub_group_size>(
(Review comment from a member on lines +447 to +448: "Format looks weird")
q.get_device(),
sycl::range<3>(m_team_size, m_vector_size, 1));
if (max_sg_size % m_vector_size != 0) {
std::stringstream out;
out << "The maximum subgroup size (" << max_sg_size
<< ") for this kernel is not divisible by the vector_size ("
<< m_vector_size << "). Choose a smaller vector_size!\n";
Kokkos::Impl::throw_runtime_exception(out.str());
}
// FIXME_SYCL For some reason, explicitly enforcing the kernel bundle to
// be used gives a runtime error.
// cgh.use_kernel_bundle(kernel_bundle);
#endif

cgh.parallel_for(
sycl::nd_range<2>(
sycl::range<2>(m_team_size, m_league_size * m_vector_size),
sycl::range<2>(m_team_size, m_vector_size)),
[=](sycl::nd_item<2> item) {
#ifdef KOKKOS_ENABLE_DEBUG
if (item.get_sub_group().get_local_range() %
item.get_local_range(1) !=
0)
Kokkos::abort(
"The sub_group size is not divisible by the vector_size. "
"Choose a smaller vector_size!");
#endif
const member_type team_member(
team_scratch_memory_L0.get_pointer(), shmem_begin,
scratch_size[0],
static_cast<char*>(scratch_ptr[1]) +
item.get_group(1) * scratch_size[1],
scratch_size[1], item);
if constexpr (std::is_same<work_tag, void>::value)
functor(team_member);
else
functor(work_tag(), team_member);
});
lambda);
});
q.submit_barrier(std::vector<sycl::event>{parallel_for_event});
return parallel_for_event;
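The hunk above hoists the vector-size check out of the kernel: instead of a debug-only, device-side `Kokkos::abort`, the host looks up the compiled kernel in an executable kernel bundle, queries its maximum subgroup size, and throws before launch if the vector size does not divide it. A minimal standalone sketch of that query pattern follows; the helper name `check_vector_size` is hypothetical, and the two-argument `get_info` overload follows the DPC++ extension used in the diff (SYCL 2020 final drops the range argument):

```cpp
#include <sycl/sycl.hpp>  // older DPC++ releases use <CL/sycl.hpp>
#include <sstream>
#include <stdexcept>
#include <vector>

// Hypothetical helper mirroring the check added in this PR. KernelLambda is
// the (unnamed) kernel lambda type, usable as a kernel name with DPC++'s
// unnamed-lambda support.
template <class KernelLambda>
void check_vector_size(sycl::queue& q, int team_size, int vector_size) {
  // Look the compiled kernel up in an executable kernel bundle ...
  sycl::kernel_id id = sycl::get_kernel_id<KernelLambda>();
  auto bundle = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
      q.get_context(), std::vector{id});
  sycl::kernel kernel = bundle.get_kernel(id);
  // ... and ask for the largest subgroup size this particular kernel
  // supports on this device for the intended work-group shape.
  auto max_sg_size =
      kernel.get_info<sycl::info::kernel_device_specific::max_sub_group_size>(
          q.get_device(), sycl::range<3>(team_size, vector_size, 1));
  if (max_sg_size % vector_size != 0) {
    std::stringstream out;
    out << "The maximum subgroup size (" << max_sg_size
        << ") is not divisible by the vector_size (" << vector_size
        << "). Choose a smaller vector_size!";
    throw std::runtime_error(out.str());
  }
}
```

Moving the check to the host turns a debug-only device abort into an unconditional, catchable runtime exception with a message that names both sizes.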
@@ -674,63 +693,82 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
const int scratch_size[2] = {m_scratch_size[0], m_scratch_size[1]};
void* const scratch_ptr[2] = {m_scratch_ptr[0], m_scratch_ptr[1]};

auto lambda = [=](sycl::nd_item<2> item) {
const auto local_id = item.get_local_linear_id();
const auto global_id =
wgroup_size * item.get_group_linear_id() + local_id;
const auto& selected_reducer = ReducerConditional::select(
static_cast<const FunctorType&>(functor),
static_cast<const ReducerType&>(reducer));

// In the first iteration, we call functor to initialize the local
// memory. Otherwise, the local memory is initialized with the
// results from the previous iteration that are stored in global
// memory.
if (first_run) {
reference_type update = ValueInit::init(
selected_reducer, &local_mem[local_id * value_count]);
const member_type team_member(
team_scratch_memory_L0.get_pointer(), shmem_begin,
scratch_size[0],
static_cast<char*>(scratch_ptr[1]) +
item.get_group(1) * scratch_size[1],
scratch_size[1], item);
if constexpr (std::is_same<WorkTag, void>::value)
functor(team_member, update);
else
functor(WorkTag(), team_member, update);
} else {
if (global_id >= size)
ValueInit::init(selected_reducer,
&local_mem[local_id * value_count]);
else {
ValueOps::copy(functor, &local_mem[local_id * value_count],
&results_ptr[global_id * value_count]);
}
}
item.barrier(sycl::access::fence_space::local_space);

SYCLReduction::workgroup_reduction<ValueJoin, ValueOps, WorkTag>(
item, local_mem.get_pointer(), results_ptr,
device_accessible_result_ptr, value_count, selected_reducer,
static_cast<const FunctorType&>(functor),
n_wgroups <= 1 && item.get_group_linear_id() == 0);

// FIXME_SYCL not quite sure why this is necessary
item.barrier(sycl::access::fence_space::global_space);
};

#if defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION > 20210903
static sycl::kernel kernel = [&] {
sycl::kernel_id functor_kernel_id =
sycl::get_kernel_id<decltype(lambda)>();
auto kernel_bundle =
sycl::get_kernel_bundle<sycl::bundle_state::executable>(
q.get_context(), std::vector{functor_kernel_id});
return kernel_bundle.get_kernel(functor_kernel_id);
}();
auto max_sg_size = kernel.get_info<
sycl::info::kernel_device_specific::max_sub_group_size>(
q.get_device(), sycl::range<3>(m_team_size, m_vector_size, 1));
if (max_sg_size % m_vector_size != 0) {
std::stringstream out;
out << "The maximum subgroup size (" << max_sg_size
<< ") for this kernel is not divisible by the vector_size ("
<< m_vector_size << "). Choose a smaller vector_size!\n";
Kokkos::Impl::throw_runtime_exception(out.str());
}
// FIXME_SYCL For some reason, explicitly enforcing the kernel bundle to
// be used gives a runtime error.

// cgh.use_kernel_bundle(kernel_bundle);
#endif

cgh.parallel_for(
sycl::nd_range<2>(
sycl::range<2>(m_team_size, m_league_size * m_vector_size),
sycl::range<2>(m_team_size, m_vector_size)),
[=](sycl::nd_item<2> item) {
#ifdef KOKKOS_ENABLE_DEBUG
if (first_run && item.get_sub_group().get_local_range() %
item.get_local_range(1) !=
0)
Kokkos::abort(
"The sub_group size is not divisible by the vector_size. "
"Choose a smaller vector_size!");
#endif
const auto local_id = item.get_local_linear_id();
const auto global_id =
wgroup_size * item.get_group_linear_id() + local_id;
const auto& selected_reducer = ReducerConditional::select(
static_cast<const FunctorType&>(functor),
static_cast<const ReducerType&>(reducer));

// In the first iteration, we call functor to initialize the local
// memory. Otherwise, the local memory is initialized with the
// results from the previous iteration that are stored in global
// memory.
if (first_run) {
reference_type update = ValueInit::init(
selected_reducer, &local_mem[local_id * value_count]);
const member_type team_member(
team_scratch_memory_L0.get_pointer(), shmem_begin,
scratch_size[0],
static_cast<char*>(scratch_ptr[1]) +
item.get_group(1) * scratch_size[1],
scratch_size[1], item);
if constexpr (std::is_same<WorkTag, void>::value)
functor(team_member, update);
else
functor(WorkTag(), team_member, update);
} else {
if (global_id >= size)
ValueInit::init(selected_reducer,
&local_mem[local_id * value_count]);
else {
ValueOps::copy(functor, &local_mem[local_id * value_count],
&results_ptr[global_id * value_count]);
}
}
item.barrier(sycl::access::fence_space::local_space);

SYCLReduction::workgroup_reduction<ValueJoin, ValueOps, WorkTag>(
item, local_mem.get_pointer(), results_ptr,
device_accessible_result_ptr, value_count, selected_reducer,
static_cast<const FunctorType&>(functor),
n_wgroups <= 1 && item.get_group_linear_id() == 0);

// FIXME_SYCL not quite sure why this is necessary
item.barrier(sycl::access::fence_space::global_space);
});
lambda);
});
q.submit_barrier(std::vector<sycl::event>{parallel_reduce_event});
last_reduction_event = parallel_reduce_event;
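For context, the reduction lambda factored out above implements one pass of a multi-pass reduction: on the first launch each work-item calls the user functor to produce its contribution, on later launches it reloads the per-workgroup partials stored in `results_ptr`, and passes repeat until a single workgroup suffices (`n_wgroups <= 1`). A host-side sketch of that driver loop, with plain `int` addition standing in for `ValueInit`/`ValueJoin` (all names here are illustrative, not Kokkos API):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// One "kernel launch": reduce each group of wgroup_size partials to one.
std::vector<int> reduction_pass(const std::vector<int>& in, int wgroup_size) {
  std::vector<int> out;
  for (std::size_t g = 0; g < in.size(); g += wgroup_size) {
    int acc = 0;  // stands in for ValueInit::init
    const std::size_t end =
        std::min(in.size(), g + static_cast<std::size_t>(wgroup_size));
    for (std::size_t i = g; i < end; ++i) acc += in[i];  // ValueJoin::join
    out.push_back(acc);  // partial result written back to global memory
  }
  return out;
}

int reduce_multipass(std::vector<int> partials, int wgroup_size) {
  while (partials.size() > 1)  // relaunch until one workgroup remains
    partials = reduction_pass(partials, wgroup_size);
  return partials.empty() ? 0 : partials.front();
}
```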
5 changes: 5 additions & 0 deletions core/unit_test/TestReductions_DeviceView.hpp
@@ -54,6 +54,11 @@ void test_reduce_device_view(int64_t N, PolicyType policy,
Kokkos::deep_copy(reducer_result, result);
Kokkos::deep_copy(result, 0);
ASSERT_EQ(N, reducer_result);

// We need a warm-up to get reasonable results
Kokkos::parallel_reduce("Test::ReduceDeviceView::TestView", policy, functor,
result);
Kokkos::fence();
timer.reset();

// Test View
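The warm-up launch added to the test makes the subsequent timing meaningful: the first launch of a kernel can include one-time costs such as JIT compilation and memory first-touch, and launches are asynchronous, so a fence is needed before resetting the timer. The pattern, as a hedged sketch (`time_reduce` is a hypothetical helper, not part of the test):

```cpp
#include <Kokkos_Core.hpp>

// Time a parallel_reduce excluding one-time startup costs: run it once,
// fence, then time a second run.
template <class Policy, class Functor>
double time_reduce(const Policy& policy, const Functor& functor) {
  int64_t result = 0;
  Kokkos::parallel_reduce("warm-up", policy, functor, result);
  Kokkos::fence();  // launches are asynchronous; wait before timing
  Kokkos::Timer timer;
  Kokkos::parallel_reduce("timed", policy, functor, result);
  Kokkos::fence();  // make sure the timed launch has actually finished
  return timer.seconds();
}
```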