Add kernels for Csr sparsity pattern lookup #994

Merged · 8 commits · May 24, 2022
15 changes: 8 additions & 7 deletions common/cuda_hip/components/segment_scan.hpp.inc
@@ -38,25 +38,26 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* performs suffix sum. Works on the source array and returns whether the thread
* is the first element of its segment with same `ind`.
*/
template <unsigned subwarp_size, typename ValueType, typename IndexType>
template <unsigned subwarp_size, typename ValueType, typename IndexType,
typename Operator>
__device__ __forceinline__ bool segment_scan(
const group::thread_block_tile<subwarp_size>& group, const IndexType ind,
ValueType* __restrict__ val)
ValueType& val, Operator op)
{
bool head = true;
#pragma unroll
for (int i = 1; i < subwarp_size; i <<= 1) {
const IndexType add_ind = group.shfl_up(ind, i);
ValueType add_val = zero<ValueType>();
if (add_ind == ind && threadIdx.x >= i) {
add_val = *val;
ValueType add_val{};
if (add_ind == ind && group.thread_rank() >= i) {
add_val = val;
if (i == 1) {
head = false;
}
}
add_val = group.shfl_down(add_val, i);
if (threadIdx.x < subwarp_size - i) {
*val += add_val;
if (group.thread_rank() < subwarp_size - i) {
val = op(val, add_val);
}
}
return head;
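The change above generalizes `segment_scan` from a hard-wired addition on a pointer argument to an arbitrary binary operator applied to a reference, so the same routine can later combine bitmaps with a bitwise OR in the new CSR lookup kernel. As a point of reference, a minimal sequential C++ sketch (not part of the diff; names are invented) of the segmented suffix-combine semantics the device routine implements per subwarp:

#include <cstdio>
#include <vector>

// Sequential reference of a segmented suffix combine: within each run of
// equal indices, an element is combined with all later elements of its run,
// and the function reports which positions are the head of their run. The
// head therefore ends up holding the combination of its whole segment,
// mirroring what segment_scan computes cooperatively inside one subwarp.
template <typename ValueType, typename IndexType, typename Operator>
std::vector<bool> segment_suffix_combine(const std::vector<IndexType>& ind,
                                         std::vector<ValueType>& val, Operator op)
{
    std::vector<bool> head(ind.size(), true);
    for (int i = static_cast<int>(ind.size()) - 2; i >= 0; --i) {
        if (ind[i] == ind[i + 1]) {
            val[i] = op(val[i], val[i + 1]);
            head[i + 1] = false;
        }
    }
    return head;
}

int main()
{
    // two segments: "rows" {0, 0, 1}
    std::vector<int> ind{0, 0, 1};
    std::vector<double> val{1.0, 2.0, 3.0};
    const auto head = segment_suffix_combine(
        ind, val, [](double a, double b) { return a + b; });
    // prints: val = {3, 2, 3}, head = {1, 0, 1}
    for (std::size_t i = 0; i < val.size(); ++i) {
        std::printf("val[%zu] = %g, head[%zu] = %d\n", i, val[i], i,
                    static_cast<int>(head[i]));
    }
}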
10 changes: 6 additions & 4 deletions common/cuda_hip/matrix/coo_kernels.hpp.inc
@@ -82,8 +82,9 @@ __device__ void spmv_kernel(const size_type nnz, const size_type num_lines,
(ind + subwarp_size < nnz) ? row[ind + subwarp_size] : row[nnz - 1];
// segmented scan
if (tile_block.any(curr_row != next_row)) {
bool is_first_in_segment =
segment_scan<subwarp_size>(tile_block, curr_row, &temp_val);
bool is_first_in_segment = segment_scan<subwarp_size>(
tile_block, curr_row, temp_val,
[](ValueType a, ValueType b) { return a + b; });
if (is_first_in_segment) {
atomic_add(&(c[curr_row * c_stride + column_id]),
scale(temp_val));
@@ -97,8 +98,9 @@
temp_val += (ind < nnz) ? val[ind] * b[col[ind] * b_stride + column_id]
: zero<ValueType>();
// segmented scan
bool is_first_in_segment =
segment_scan<subwarp_size>(tile_block, curr_row, &temp_val);
bool is_first_in_segment = segment_scan<subwarp_size>(
tile_block, curr_row, temp_val,
[](ValueType a, ValueType b) { return a + b; });
if (is_first_in_segment) {
atomic_add(&(c[curr_row * c_stride + column_id]), scale(temp_val));
}
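Both COO SpMV call sites now spell the addition out as a lambda; the logic is unchanged. After the segmented scan, the first thread of each run of equal rows holds the complete partial sum its subwarp produced for that row, so it is the only one that needs to issue an atomic update. A sequential sketch of that one-write-per-segment pattern (illustrative only; names are invented):

#include <map>
#include <vector>

// For contributions sorted by row, accumulate each run of equal rows locally
// and commit it with a single update, the way one subwarp commits a single
// atomic_add per segment after segment_scan.
void accumulate_rows(const std::vector<int>& row,
                     const std::vector<double>& contribution,
                     std::map<int, double>& c)
{
    std::size_t i = 0;
    while (i < row.size()) {
        double segment_sum = 0.0;
        std::size_t j = i;
        while (j < row.size() && row[j] == row[i]) {
            segment_sum += contribution[j];
            ++j;
        }
        c[row[i]] += segment_sum;  // one write per segment
        i = j;
    }
}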
257 changes: 220 additions & 37 deletions common/cuda_hip/matrix/csr_kernels.hpp.inc
@@ -68,22 +68,21 @@ __device__ __forceinline__ bool block_segment_scan_reverse(
template <bool overflow, typename IndexType>
__device__ __forceinline__ void find_next_row(
const IndexType num_rows, const IndexType data_size, const IndexType ind,
IndexType* __restrict__ row, IndexType* __restrict__ row_end,
const IndexType row_predict, const IndexType row_predict_end,
const IndexType* __restrict__ row_ptr)
IndexType& row, IndexType& row_end, const IndexType row_predict,
const IndexType row_predict_end, const IndexType* __restrict__ row_ptr)
{
if (!overflow || ind < data_size) {
if (ind >= *row_end) {
*row = row_predict;
*row_end = row_predict_end;
while (ind >= *row_end) {
*row_end = row_ptr[++*row + 1];
if (ind >= row_end) {
row = row_predict;
row_end = row_predict_end;
while (ind >= row_end) {
row_end = row_ptr[++row + 1];
}
}

} else {
*row = num_rows - 1;
*row_end = data_size;
row = num_rows - 1;
row_end = data_size;
}
}

@@ -92,16 +91,17 @@ template <unsigned subwarp_size, typename ValueType, typename IndexType,
typename Closure>
__device__ __forceinline__ void warp_atomic_add(
const group::thread_block_tile<subwarp_size>& group, bool force_write,
ValueType* __restrict__ val, const IndexType row, ValueType* __restrict__ c,
ValueType& val, const IndexType row, ValueType* __restrict__ c,
const size_type c_stride, const IndexType column_id, Closure scale)
{
// do a local scan to avoid atomic collisions
const bool need_write = segment_scan(group, row, val);
const bool need_write = segment_scan(
group, row, val, [](ValueType a, ValueType b) { return a + b; });
if (need_write && force_write) {
atomic_add(&(c[row * c_stride + column_id]), scale(*val));
atomic_add(&(c[row * c_stride + column_id]), scale(val));
}
if (!need_write || force_write) {
*val = zero<ValueType>();
val = zero<ValueType>();
}
}

@@ -111,28 +111,27 @@ template <bool last, unsigned subwarp_size, typename ValueType,
__device__ __forceinline__ void process_window(
const group::thread_block_tile<subwarp_size>& group,
const IndexType num_rows, const IndexType data_size, const IndexType ind,
IndexType* __restrict__ row, IndexType* __restrict__ row_end,
IndexType* __restrict__ nrow, IndexType* __restrict__ nrow_end,
ValueType* __restrict__ temp_val, const ValueType* __restrict__ val,
IndexType& row, IndexType& row_end, IndexType& nrow, IndexType& nrow_end,
ValueType& temp_val, const ValueType* __restrict__ val,
const IndexType* __restrict__ col_idxs,
const IndexType* __restrict__ row_ptrs, const ValueType* __restrict__ b,
const size_type b_stride, ValueType* __restrict__ c,
const size_type c_stride, const IndexType column_id, Closure scale)
{
const IndexType curr_row = *row;
find_next_row<last>(num_rows, data_size, ind, row, row_end, *nrow,
*nrow_end, row_ptrs);
const auto curr_row = row;
find_next_row<last>(num_rows, data_size, ind, row, row_end, nrow, nrow_end,
row_ptrs);
// segmented scan
if (group.any(curr_row != *row)) {
warp_atomic_add(group, curr_row != *row, temp_val, curr_row, c,
c_stride, column_id, scale);
*nrow = group.shfl(*row, subwarp_size - 1);
*nrow_end = group.shfl(*row_end, subwarp_size - 1);
if (group.any(curr_row != row)) {
warp_atomic_add(group, curr_row != row, temp_val, curr_row, c, c_stride,
column_id, scale);
nrow = group.shfl(row, subwarp_size - 1);
nrow_end = group.shfl(row_end, subwarp_size - 1);
}

if (!last || ind < data_size) {
const auto col = col_idxs[ind];
*temp_val += val[ind] * b[col * b_stride + column_id];
temp_val += val[ind] * b[col * b_stride + column_id];
}
}

@@ -171,21 +170,21 @@ __device__ __forceinline__ void spmv_kernel(
auto nrow_end = row_end;
ValueType temp_val = zero<ValueType>();
IndexType ind = start + threadIdx.x;
find_next_row<true>(num_rows, data_size, ind, &row, &row_end, nrow,
nrow_end, row_ptrs);
find_next_row<true>(num_rows, data_size, ind, row, row_end, nrow, nrow_end,
row_ptrs);
const IndexType ind_end = end - wsize;
const auto tile_block =
group::tiled_partition<wsize>(group::this_thread_block());
for (; ind < ind_end; ind += wsize) {
process_window<false>(tile_block, num_rows, data_size, ind, &row,
&row_end, &nrow, &nrow_end, &temp_val, val,
col_idxs, row_ptrs, b, b_stride, c, c_stride,
column_id, scale);
}
process_window<true>(tile_block, num_rows, data_size, ind, &row, &row_end,
&nrow, &nrow_end, &temp_val, val, col_idxs, row_ptrs,
b, b_stride, c, c_stride, column_id, scale);
warp_atomic_add(tile_block, true, &temp_val, row, c, c_stride, column_id,
process_window<false>(tile_block, num_rows, data_size, ind, row,
row_end, nrow, nrow_end, temp_val, val, col_idxs,
row_ptrs, b, b_stride, c, c_stride, column_id,
scale);
}
process_window<true>(tile_block, num_rows, data_size, ind, row, row_end,
nrow, nrow_end, temp_val, val, col_idxs, row_ptrs, b,
b_stride, c, c_stride, column_id, scale);
warp_atomic_add(tile_block, true, temp_val, row, c, c_stride, column_id,
scale);
}

@@ -915,3 +914,187 @@ void convert_to_fbcsr(std::shared_ptr<const DefaultExecutor> exec,

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_CONVERT_TO_FBCSR_KERNEL);


namespace kernel {


template <int subwarp_size, typename IndexType>
__global__ __launch_bounds__(default_block_size) void build_csr_lookup(
const IndexType* __restrict__ row_ptrs,
const IndexType* __restrict__ col_idxs, size_type num_rows,
matrix::csr::sparsity_type allowed, const IndexType* storage_offsets,
int64* __restrict__ row_desc, int32* __restrict__ storage)
{
using matrix::csr::sparsity_type;
constexpr int bitmap_block_size = matrix::csr::sparsity_bitmap_block_size;
auto row = thread::get_subwarp_id_flat<subwarp_size>();
if (row >= num_rows) {
return;
}
const auto subwarp =
group::tiled_partition<subwarp_size>(group::this_thread_block());
const auto row_begin = row_ptrs[row];
const auto row_len = row_ptrs[row + 1] - row_begin;
const auto storage_begin = storage_offsets[row];
const auto available_storage = storage_offsets[row + 1] - storage_begin;
const auto local_storage = storage + storage_begin;
const auto local_cols = col_idxs + row_begin;
const auto lane = subwarp.thread_rank();
const auto min_col = row_len > 0 ? local_cols[0] : 0;
const auto col_range =
row_len > 0 ? local_cols[row_len - 1] - min_col + 1 : 0;

// full column range
if (col_range == row_len &&
csr_lookup_allowed(allowed, sparsity_type::full)) {
if (lane == 0) {
row_desc[row] = static_cast<int64>(sparsity_type::full);
}
return;
}
// dense bitmap storage
const auto num_blocks =
static_cast<int32>(ceildiv(col_range, bitmap_block_size));
if (num_blocks * 2 <= available_storage &&
csr_lookup_allowed(allowed, sparsity_type::bitmap)) {
if (lane == 0) {
row_desc[row] = (static_cast<int64>(num_blocks) << 32) |
static_cast<int64>(sparsity_type::bitmap);
}
const auto block_ranks = local_storage;
const auto block_bitmaps =
reinterpret_cast<uint32*>(block_ranks + num_blocks);
// fill bitmaps with zeros
for (int32 i = lane; i < num_blocks; i += subwarp_size) {
block_bitmaps[i] = 0;
}
// fill bitmaps with sparsity pattern
for (IndexType base_i = 0; base_i < row_len; base_i += subwarp_size) {
const auto i = base_i + lane;
const auto col = i < row_len
? local_cols[i]
: device_numeric_limits<IndexType>::max;
const auto rel_col = static_cast<int32>(col - min_col);
const auto block = rel_col / bitmap_block_size;
const auto col_in_block = rel_col % bitmap_block_size;
auto local_bitmap = uint32{i < row_len ? 1u : 0u} << col_in_block;
bool is_first =
segment_scan(subwarp, block, local_bitmap,
[](config::lane_mask_type a,
config::lane_mask_type b) { return a | b; });
// memory barrier - just to be sure
subwarp.sync();
if (is_first && i < row_len) {
block_bitmaps[block] |= local_bitmap;
}
}
// compute bitmap ranks
int32 block_partial_sum{};
for (int32 base_i = 0; base_i < num_blocks; base_i += subwarp_size) {
const auto i = base_i + lane;
const auto bitmap = i < num_blocks ? block_bitmaps[i] : 0;
int32 local_partial_sum{};
int32 local_total_sum{};
subwarp_prefix_sum<false>(popcnt(bitmap), local_partial_sum,
local_total_sum, subwarp);
if (i < num_blocks) {
block_ranks[i] = local_partial_sum + block_partial_sum;
}
block_partial_sum += local_total_sum;
}
return;
}
// sparse hashmap storage
// we need at least one unfilled entry to avoid infinite loops on search
GKO_ASSERT(row_len < available_storage);
constexpr double inv_golden_ratio = 0.61803398875;
// use golden ratio as approximation for hash parameter that spreads
// consecutive values as far apart as possible. Ensure lowest bit is set
// otherwise we skip odd hashtable entries
const auto hash_parameter =
1u | static_cast<uint32>(available_storage * inv_golden_ratio);
if (lane == 0) {
row_desc[row] = (static_cast<int64>(hash_parameter) << 32) |
static_cast<int64>(sparsity_type::hash);
}
// fill hashmap with sentinel
constexpr int32 empty = invalid_index<int32>();
for (int32 i = lane; i < available_storage; i += subwarp_size) {
local_storage[i] = empty;
}
// memory barrier
subwarp.sync();
// fill with actual entries
for (IndexType base_i = 0; base_i < row_len; base_i += subwarp_size) {
const auto i = base_i + lane;
const auto col = i < row_len ? local_cols[i] : 0;
// make sure that each idle thread gets a unique out-of-bounds value
auto hash = i < row_len ? (static_cast<uint32>(col) * hash_parameter) %
static_cast<uint32>(available_storage)
: static_cast<uint32>(available_storage + lane);
// collision resolution
#if !defined(__HIP_DEVICE_COMPILE__) && defined(__CUDA_ARCH__) && \
(__CUDA_ARCH__ >= 700)
const auto this_lane_mask = config::lane_mask_type{1} << lane;
const auto lane_prefix_mask = this_lane_mask - 1;
// memory barrier to previous loop iteration
subwarp.sync();
int32 entry = i < row_len ? local_storage[hash] : empty;
// find all threads in the subwarp with the same hash key
auto colliding = subwarp.match_any(hash);
// if there are any collisions with previously filled buckets
while (subwarp.any(entry != empty || colliding != this_lane_mask)) {
if (entry != empty || colliding != this_lane_mask) {
// assign consecutive indices to matching threads
hash += (entry == empty ? 0 : 1) +
popcnt(colliding & lane_prefix_mask);
// cheap modulo replacement
if (hash >= available_storage) {
hash -= available_storage;
}
// this could only fail for available_storage < warp_size, as
// popcnt(colliding) is at most warp_size. At the same time, we
// only increase hash by row_length at most, so this is still
// safe.
GKO_ASSERT(hash < available_storage);
entry = local_storage[hash];
}
colliding = subwarp.match_any(hash);
}
if (i < row_len) {
local_storage[hash] = i;
}
#else
if (i < row_len) {
while (atomicCAS(local_storage + hash, empty, i) != empty) {
hash++;
if (hash >= available_storage) {
hash = 0;
}
}
}
#endif
}
}


} // namespace kernel
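For the hash path, the multiplier and the probing scheme carry the reasoning: the golden-ratio factor spreads consecutive column indices across the table, the forced low bit keeps odd buckets reachable, and the guaranteed free bucket (row_len < available_storage) makes the probe loop terminate. A host-side reference of what the fallback atomicCAS path produces per row (illustrative sketch only; the -1 sentinel and the helper names are assumptions):

#include <cstdint>
#include <vector>

// Reference of the per-row hash table filled above: local nonzero i is stored
// at h = (col * hash_parameter) % available_storage, stepping linearly past
// occupied buckets on collision.
inline std::uint32_t make_hash_parameter(std::int32_t available_storage)
{
    constexpr double inv_golden_ratio = 0.61803398875;
    // force the lowest bit so that odd buckets can be reached as well
    return 1u | static_cast<std::uint32_t>(available_storage * inv_golden_ratio);
}

void fill_row_hash(const std::vector<std::int32_t>& local_cols,
                   std::vector<std::int32_t>& local_storage)  // pre-filled with -1
{
    const auto size = static_cast<std::uint32_t>(local_storage.size());
    const auto hash_parameter =
        make_hash_parameter(static_cast<std::int32_t>(local_storage.size()));
    for (std::int32_t i = 0; i < static_cast<std::int32_t>(local_cols.size());
         ++i) {
        auto hash =
            static_cast<std::uint32_t>(local_cols[i]) * hash_parameter % size;
        while (local_storage[hash] != -1) {  // linear probing on collision
            hash = (hash + 1) % size;
        }
        local_storage[hash] = i;
    }
}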


template <typename IndexType>
void build_lookup(std::shared_ptr<const DefaultExecutor> exec,
const IndexType* row_ptrs, const IndexType* col_idxs,
size_type num_rows, matrix::csr::sparsity_type allowed,
const IndexType* storage_offsets, int64* row_desc,
int32* storage)
{
const auto num_blocks =
ceildiv(num_rows, default_block_size / config::warp_size);
kernel::build_csr_lookup<config::warp_size>
<<<num_blocks, default_block_size>>>(row_ptrs, col_idxs, num_rows,
allowed, storage_offsets, row_desc,
storage);
}

GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL);
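This section only builds the lookup data; the consuming side is not part of the diff. For orientation, a hedged host-side sketch of how a column position inside a row could be recovered from the three storage formats encoded in row_desc. The numeric type tags, the -1 sentinel, the struct layout, and the 32-bit bitmap blocks (matching sparsity_bitmap_block_size as used above) are assumptions for illustration:

#include <cstdint>

// Assumed numeric tags for the sparsity kinds; the real values come from
// matrix::csr::sparsity_type.
constexpr std::int64_t kind_full = 1;
constexpr std::int64_t kind_bitmap = 2;
constexpr std::int64_t kind_hash = 4;

// One row's slice of the lookup data built by build_csr_lookup.
struct row_lookup_view {
    const std::int32_t* storage;     // storage + storage_offsets[row]
    std::int32_t available_storage;  // storage_offsets[row + 1] - storage_offsets[row]
    const std::int32_t* local_cols;  // col_idxs + row_ptrs[row]
    std::int32_t row_len;            // row_ptrs[row + 1] - row_ptrs[row]
    std::int32_t min_col;            // local_cols[0]
    std::int64_t desc;               // row_desc[row]
};

inline int popcount32(std::uint32_t x)
{
    int count = 0;
    for (; x != 0; x &= x - 1) ++count;
    return count;
}

// Returns the local nonzero index of `col` within the row, or -1 if absent.
std::int32_t lookup_col(const row_lookup_view& row, std::int32_t col)
{
    const auto kind = row.desc & 0xffffffff;
    const auto rel_col = col - row.min_col;
    if (kind == kind_full) {
        // contiguous column range: the position is the offset in that range
        return (rel_col >= 0 && rel_col < row.row_len) ? rel_col : -1;
    }
    if (kind == kind_bitmap) {
        const auto num_blocks = static_cast<std::int32_t>(row.desc >> 32);
        const auto ranks = row.storage;
        const auto bitmaps =
            reinterpret_cast<const std::uint32_t*>(ranks + num_blocks);
        const auto block = rel_col / 32;
        const auto bit = rel_col % 32;
        if (rel_col < 0 || block >= num_blocks ||
            !(bitmaps[block] & (std::uint32_t{1} << bit))) {
            return -1;
        }
        // rank of the block plus the set bits below this column's bit
        return ranks[block] +
               popcount32(bitmaps[block] & ((std::uint32_t{1} << bit) - 1));
    }
    // hash: probe linearly until the column or an empty bucket is found
    const auto hash_parameter = static_cast<std::uint32_t>(row.desc >> 32);
    const auto size = static_cast<std::uint32_t>(row.available_storage);
    auto hash = static_cast<std::uint32_t>(col) * hash_parameter % size;
    while (row.storage[hash] != -1 && row.local_cols[row.storage[hash]] != col) {
        hash = (hash + 1) % size;
    }
    return row.storage[hash];
}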