Skip to content

Commit

Permalink
improve document, fix auto usage in for, shared_memory usage
Browse files Browse the repository at this point in the history
Co-authored-by: Thomas Grützmacher <thomas.gruetzmacher@kit.edu>
  • Loading branch information
yhmtsai and Thomas Grützmacher committed Jul 20, 2021
1 parent 7ac6d09 commit 6169c2f
Show file tree
Hide file tree
Showing 26 changed files with 131 additions and 116 deletions.
2 changes: 1 addition & 1 deletion common/components/prefix_sum.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ __forceinline__ __device__ void subwarp_prefix_sum(ValueType element,
total_sum = element;
#pragma unroll
// hypercube prefix sum
for (auto step = 1; step < subwarp.size(); step *= 2) {
for (int step = 1; step < subwarp.size(); step *= 2) {
auto neighbor = subwarp.shfl_xor(total_sum, step);
total_sum += neighbor;
prefix_sum += bool(subwarp.thread_rank() & step) ? neighbor : 0;
Expand Down
12 changes: 6 additions & 6 deletions common/components/sorting.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ struct bitonic_local {
bool reverse)
{
auto els_mid = els + (num_elements / 2);
for (auto i = 0; i < num_elements / 2; ++i) {
for (int i = 0; i < num_elements / 2; ++i) {
bitonic_cas(els[i], els_mid[i], reverse);
}
half::merge(els, reverse);
Expand Down Expand Up @@ -131,7 +131,7 @@ struct bitonic_warp {
auto tile =
group::tiled_partition<num_threads>(group::this_thread_block());
auto new_reverse = reverse != upper_half();
for (auto i = 0; i < num_local; ++i) {
for (int i = 0; i < num_local; ++i) {
auto other = tile.shfl_xor(els[i], num_threads / 2);
bitonic_cas(els[i], other, new_reverse);
}
Expand Down Expand Up @@ -206,7 +206,7 @@ struct bitonic_global {
auto upper_shared_els = shared_els + (num_groups * num_threads / 2);
// only the lower group executes the CAS
if (!upper_half()) {
for (auto i = 0; i < num_local; ++i) {
for (int i = 0; i < num_local; ++i) {
auto j = shared_idx(i);
bitonic_cas(shared_els[j], upper_shared_els[j], reverse);
}
Expand Down Expand Up @@ -241,11 +241,11 @@ struct bitonic_global<ValueType, num_local, num_threads, 1, num_total_threads> {
bool reverse)
{
group::this_thread_block().sync();
for (auto i = 0; i < num_local; ++i) {
for (int i = 0; i < num_local; ++i) {
local_els[i] = shared_els[shared_idx(i)];
}
warp::merge(local_els, reverse);
for (auto i = 0; i < num_local; ++i) {
for (int i = 0; i < num_local; ++i) {
shared_els[shared_idx(i)] = local_els[i];
}
}
Expand All @@ -258,7 +258,7 @@ struct bitonic_global<ValueType, num_local, num_threads, 1, num_total_threads> {
// This is the first step, so we don't need to load from shared memory
warp::sort(local_els, reverse);
// store the sorted elements in shared memory
for (auto i = 0; i < num_local; ++i) {
for (int i = 0; i < num_local; ++i) {
shared_els[shared_idx(i)] = local_els[i];
}
}
Expand Down
7 changes: 4 additions & 3 deletions common/components/uninitialized_array.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/**
* Stores an array with uninitialized contents.
*
* This class needed for datatypes that do have a non-empty constructor when`
* This class is needed for datatypes that do have a non-empty constructor when
* using them as shared memory, for example `thrust::complex<float>`.
*
* @tparam ValueType the type of values
Expand All @@ -49,7 +49,7 @@ public:
*
* @return the constexpr pointer to the first entry of the array.
*/
constexpr GKO_ATTRIBUTES operator ValueType *() const noexcept
constexpr GKO_ATTRIBUTES operator const ValueType *() const noexcept
{
return &(*this)[0];
}
Expand All @@ -70,7 +70,8 @@ public:
*
* @return a reference to the array entry at the given index.
*/
constexpr GKO_ATTRIBUTES ValueType &operator[](size_type pos) const noexcept
constexpr GKO_ATTRIBUTES const ValueType &operator[](size_type pos) const
noexcept
{
return reinterpret_cast<const ValueType *>(data_)[pos];
}
Expand Down
4 changes: 2 additions & 2 deletions common/factorization/par_ilut_filter_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ __device__ void abstract_filter_impl(const IndexType *row_ptrs,
auto end = row_ptrs[row + 1];
begin_cb(row);
auto num_steps = ceildiv(end - begin, subwarp_size);
for (auto step = 0; step < num_steps; ++step) {
for (IndexType step = 0; step < num_steps; ++step) {
auto idx = begin + lane + step * subwarp_size;
auto keep = idx < end && pred(idx, begin, end);
auto mask = subwarp.ballot(keep);
Expand Down Expand Up @@ -189,4 +189,4 @@ __global__ __launch_bounds__(default_block_size) void bucket_filter(
}


} // namespace kernel
} // namespace kernel
10 changes: 6 additions & 4 deletions common/factorization/par_ilut_select_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ __global__ __launch_bounds__(searchtree_width) void build_searchtree(
// assuming rounding towards zero
auto stride = double(size) / sample_size;
#pragma unroll
for (auto i = 0; i < sampleselect_oversampling; ++i) {
for (auto i = decltype(sampleselect_oversampling){0};
i < sampleselect_oversampling; ++i) {
auto lidx = idx * sampleselect_oversampling + i;
auto val = input[static_cast<IndexType>(lidx * stride)];
samples[i] = abs(val);
Expand Down Expand Up @@ -119,7 +120,8 @@ __global__ __launch_bounds__(default_block_size) void count_buckets(
auto el = abs(input[i]);
IndexType tree_idx{};
#pragma unroll
for (auto level = 0; level < sampleselect_searchtree_height; ++level) {
for (auto level = decltype(sampleselect_searchtree_height){0};
level < sampleselect_searchtree_height; ++level) {
auto cmp = !(el < sh_tree[tree_idx]);
tree_idx = 2 * tree_idx + 1 + cmp;
}
Expand Down Expand Up @@ -168,7 +170,7 @@ __global__ __launch_bounds__(default_block_size) void block_prefix_sum(
// compute prefix sum over warp-sized blocks
IndexType total{};
auto base_idx = warp_idx * work_per_warp * warp.size();
for (auto step = 0; step < work_per_warp; ++step) {
for (auto step = decltype(work_per_warp){0}; step < work_per_warp; ++step) {
auto idx = warp_lane + step * warp.size() + base_idx;
auto val = idx < num_blocks ? local_counters[idx] : zero<IndexType>();
IndexType warp_total{};
Expand Down Expand Up @@ -207,7 +209,7 @@ __global__ __launch_bounds__(default_block_size) void block_prefix_sum(
// add block prefix sum to each warp's block of data
block.sync();
auto warp_prefixsum = warp_sums[warp_idx];
for (auto step = 0; step < work_per_warp; ++step) {
for (IndexType step = 0; step < work_per_warp; ++step) {
auto idx = warp_lane + step * warp.size() + base_idx;
auto val = idx < num_blocks ? local_counters[idx] : zero<IndexType>();
if (idx < num_blocks) {
Expand Down
2 changes: 1 addition & 1 deletion common/matrix/dense_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr(

if (tidx < num_rows) {
auto write_to = row_ptrs[tidx];
for (auto i = 0; i < num_cols; i++) {
for (size_type i = 0; i < num_cols; i++) {
if (source[stride * tidx + i] != zero<ValueType>()) {
values[write_to] = source[stride * tidx + i];
col_idxs[write_to] = i;
Expand Down
4 changes: 2 additions & 2 deletions common/matrix/ell_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_dense(
{
const auto tidx = thread::get_thread_id_flat();
if (tidx < num_rows) {
for (auto col = 0; col < nnz; col++) {
for (size_type col = 0; col < nnz; col++) {
result[tidx * result_stride +
col_idxs[tidx + col * source_stride]] +=
values[tidx + col * source_stride];
Expand Down Expand Up @@ -226,7 +226,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr(

if (tidx < num_rows) {
auto write_to = result_row_ptrs[tidx];
for (auto i = 0; i < max_nnz_per_row; i++) {
for (size_type i = 0; i < max_nnz_per_row; i++) {
const auto source_idx = tidx + stride * i;
if (source_values[source_idx] != zero<ValueType>()) {
result_values[write_to] = source_values[source_idx];
Expand Down
2 changes: 1 addition & 1 deletion common/matrix/hybrid_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr(

if (tidx < num_rows) {
auto write_to = result_row_ptrs[tidx];
for (auto i = 0; i < max_nnz_per_row; i++) {
for (size_type i = 0; i < max_nnz_per_row; i++) {
const auto source_idx = tidx + stride * i;
if (ell_val[source_idx] != zero<ValueType>()) {
result_values[write_to] = ell_val[source_idx];
Expand Down
6 changes: 3 additions & 3 deletions cuda/components/prefix_sum.cu
Original file line number Diff line number Diff line change
Expand Up @@ -49,16 +49,16 @@ template <typename IndexType>
void prefix_sum(std::shared_ptr<const CudaExecutor> exec, IndexType *counts,
size_type num_entries)
{
// prefix_sum should be on the valid array
// prefix_sum should only be performed on a valid array
if (num_entries > 0) {
auto num_blocks = ceildiv(num_entries, prefix_sum_block_size);
Array<IndexType> block_sum_array(exec, num_blocks - 1);
auto block_sums = block_sum_array.get_data();
start_prefix_sum<prefix_sum_block_size>
<<<num_blocks, prefix_sum_block_size>>>(num_entries, counts,
block_sums);
// add the total sum of the previous block only when the number of block
// is larger than 1.
// add the total sum of the previous block only when the number of
// blocks is larger than 1.
if (num_blocks > 1) {
finalize_prefix_sum<prefix_sum_block_size>
<<<num_blocks, prefix_sum_block_size>>>(num_entries, counts,
Expand Down
2 changes: 1 addition & 1 deletion cuda/test/components/sorting_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ protected:
{
// we want some duplicate elements
std::uniform_int_distribution<gko::int32> dist(0, num_elements / 2);
for (auto i = 0; i < num_elements; ++i) {
for (auto i = decltype(num_elements){0}; i < num_elements; ++i) {
ref_shared.get_data()[i] = dist(rng);
}
ddata = gko::Array<gko::int32>{cuda, ref_shared};
Expand Down
2 changes: 1 addition & 1 deletion cuda/test/matrix/dense_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -550,7 +550,7 @@ TEST_F(Dense, CalculateNNZPerRowIsEquivalentToRef)
&dnnz_per_row);

auto tmp = gko::Array<gko::size_type>(ref, dnnz_per_row);
for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) {
for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) {
ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]);
}
}
Expand Down
2 changes: 1 addition & 1 deletion cuda/test/matrix/ell_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -585,7 +585,7 @@ TEST_F(Ell, CalculateNNZPerRowIsEquivalentToRef)
&dnnz_per_row);

auto tmp = gko::Array<gko::size_type>(ref, dnnz_per_row);
for (auto i = 0; i < nnz_per_row.get_num_elems(); i++) {
for (gko::size_type i = 0; i < nnz_per_row.get_num_elems(); i++) {
ASSERT_EQ(nnz_per_row.get_const_data()[i], tmp.get_const_data()[i]);
}
}
Expand Down
6 changes: 4 additions & 2 deletions dpcpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,15 @@ target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP)

set(GINKGO_DPCPP_FLAGS ${GINKGO_DPCPP_FLAGS} PARENT_SCOPE)
target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}")
# Note. add MKL via PRIVATE not PUBLIC (MKL example shows) to avoid find_package(MKL) everywhere when link ginkgo
# Note: add MKL as PRIVATE not PUBLIC (MKL example shows) to avoid propagating
# find_package(MKL) everywhere when linking ginkgo (see the MKL example
# https://software.intel.com/content/www/us/en/develop/documentation/onemkl-windows-developer-guide/top/getting-started/cmake-config-for-onemkl.html)
target_compile_options(ginkgo_dpcpp PRIVATE $<TARGET_PROPERTY:MKL::MKL_DPCPP,INTERFACE_COMPILE_OPTIONS>)
target_compile_features(ginkgo_dpcpp PRIVATE cxx_std_17)
target_include_directories(ginkgo_dpcpp PRIVATE $<TARGET_PROPERTY:MKL::MKL_DPCPP,INTERFACE_INCLUDE_DIRECTORIES>)
target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-lib=all)
# When building ginkgo as a static library, we need to use dpcpp and per_kernel
# link option when the program uses dpcpp related function.
# link option when the program uses a dpcpp related function.
if (BUILD_SHARED_LIBS)
target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel)
else ()
Expand Down
6 changes: 0 additions & 6 deletions dpcpp/base/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,6 @@ struct config {
*/
using lane_mask_type = uint64;


/**
* The number of threads within a CUDA warp.
*/
static constexpr uint32 warp_size = 16;

/**
* The bitmask of the entire warp.
*/
Expand Down
20 changes: 9 additions & 11 deletions dpcpp/base/helper.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,18 +44,16 @@ namespace dpcpp {
bool validate(sycl::queue *queue, unsigned int workgroup_size,
unsigned int subgroup_size)
{
{
auto device = queue->get_device();
auto subgroup_size_list =
device.get_info<cl::sycl::info::device::sub_group_sizes>();
auto max_workgroup_size =
device.get_info<sycl::info::device::max_work_group_size>();
bool allowed = false;
for (auto &i : subgroup_size_list) {
allowed |= (i == subgroup_size);
}
return allowed && (workgroup_size <= max_workgroup_size);
auto device = queue->get_device();
auto subgroup_size_list =
device.get_info<cl::sycl::info::device::sub_group_sizes>();
auto max_workgroup_size =
device.get_info<sycl::info::device::max_work_group_size>();
bool allowed = false;
for (auto &i : subgroup_size_list) {
allowed |= (i == subgroup_size);
}
return allowed && (workgroup_size <= max_workgroup_size);
}


Expand Down
71 changes: 47 additions & 24 deletions dpcpp/base/helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,44 +51,44 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/**
* GKO_ENABLE_DEFAULT_HOST gives a default host implementation for those
* kernels which require encoded config but do not need explicit template
* parameter and share memory
* parameter and shared memory
*
* @param name_ the name of the host function with config
* @param kernel_ the kernel name
*/
#define GKO_ENABLE_DEFAULT_HOST(name_, kernel_) \
template <typename... InferredArgs> \
void name_(dim3 grid, dim3 block, size_t dynamic_shared_memory, \
sycl::queue *queue, InferredArgs... args) \
{ \
queue->submit([&](sycl::handler &cgh) { \
cgh.parallel_for(sycl_nd_range(grid, block), \
[=](sycl::nd_item<3> item_ct1) { \
kernel_(args..., item_ct1); \
}); \
}); \
#define GKO_ENABLE_DEFAULT_HOST(name_, kernel_) \
template <typename... InferredArgs> \
void name_(dim3 grid, dim3 block, size_t, sycl::queue *queue, \
InferredArgs... args) \
{ \
queue->submit([&](sycl::handler &cgh) { \
cgh.parallel_for(sycl_nd_range(grid, block), \
[=](sycl::nd_item<3> item_ct1) { \
kernel_(args..., item_ct1); \
}); \
}); \
}


/**
* GKO_ENABLE_DEFAULT_HOST_CONFIG gives a default host implementation for those
* kernels which require encoded config but do not need explicit template
* parameter and share memory
* parameter and shared memory
*
* @param name_ the name of the host function with config
* @param kernel_ the kernel name
*/
#define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_) \
template <std::uint32_t encoded, typename... InferredArgs> \
inline void name_(dim3 grid, dim3 block, size_t dynamic_shared_memory, \
sycl::queue *queue, InferredArgs... args) \
{ \
queue->submit([&](sycl::handler &cgh) { \
cgh.parallel_for(sycl_nd_range(grid, block), \
[=](sycl::nd_item<3> item_ct1) { \
kernel_<encoded>(args..., item_ct1); \
}); \
}); \
#define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_) \
template <std::uint32_t encoded, typename... InferredArgs> \
inline void name_(dim3 grid, dim3 block, size_t, sycl::queue *queue, \
InferredArgs... args) \
{ \
queue->submit([&](sycl::handler &cgh) { \
cgh.parallel_for(sycl_nd_range(grid, block), \
[=](sycl::nd_item<3> item_ct1) { \
kernel_<encoded>(args..., item_ct1); \
}); \
}); \
}

/**
Expand Down Expand Up @@ -138,10 +138,33 @@ namespace kernels {
namespace dpcpp {


/**
* This is the validate function for common check. It checks the workgroup size
* is below device max workgroup size and subgroup size is in the supported
* subgroup size.
*
* @param queue the sycl queue pointer
* @param workgroup_size the workgroup size (block size in cuda sense)
* @param subgroup_size the subgroup size (warp size in cuda sense)
*
* @return the given arguments are valid or not in given queue.
*/
bool validate(sycl::queue *queue, unsigned workgroup_size,
unsigned subgroup_size);


/**
* get_first_cfg will return the first valid config by validate function from
* given config array.
*
* @tparam IterArr the iteratable array type
* @tparam Validate the validate function type
*
* @param arr the config array
* @param verify the validate function
*
* @return the first valid config
*/
template <typename IterArr, typename Validate>
std::uint32_t get_first_cfg(IterArr &arr, Validate verify)
{
Expand Down
3 changes: 3 additions & 0 deletions dpcpp/base/onemkl_bindings.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GKO_DPCPP_BASE_ONEMKL_BINDINGS_HPP_


#include <type_traits>


#include <CL/sycl.hpp>
#include <oneapi/mkl.hpp>

Expand Down
Loading

0 comments on commit 6169c2f

Please sign in to comment.