Merge GPU permutation kernels

This adds the missing GPU permutation kernels for CSR matrices. It additionally fixes the build issues with HIP benchmark LinOps. Related PR: #657
ginkgo-project · Nov 3, 2020 · 9caff82 · 9caff82
2 parents bded4d0 + b59015b
commit 9caff82
Show file tree

Hide file tree

Showing 13 changed files with 610 additions and 313 deletions.
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
@@ -25,6 +25,8 @@ function(ginkgo_benchmark_hipsp_linops name)
     target_compile_definitions("${name}" PRIVATE HAS_HIP=1)
     EXECUTE_PROCESS(COMMAND ${HIP_PATH}/bin/hipconfig --cpp_config OUTPUT_VARIABLE HIP_CXX_FLAGS)
     set_target_properties("${name}" PROPERTIES COMPILE_FLAGS ${HIP_CXX_FLAGS})
+    # use Thrust C++ device just for compilation, we don't use thrust::complex in the benchmarks
+    target_compile_definitions("${name}" PUBLIC -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CPP)
     # for some reason, HIP creates a dependency on Threads::Threads here, so we
     # need to find it
     find_package(Threads REQUIRED)

diff --git a/common/matrix/csr_kernels.hpp.inc b/common/matrix/csr_kernels.hpp.inc
@@ -946,3 +946,120 @@ __global__ __launch_bounds__(default_block_size) void conjugate_kernel(
 
 
 }  //  namespace
+
+
+template <typename IndexType>
+__global__ __launch_bounds__(default_block_size) void inv_permutation_kernel(
+    size_type size, const IndexType *__restrict__ permutation,
+    IndexType *__restrict__ inv_permutation)
+{
+    auto tid = thread::get_thread_id_flat();
+    if (tid >= size) {
+        return;
+    }
+    inv_permutation[permutation[tid]] = tid;
+}
+
+
+template <typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void col_permute_kernel(
+    size_type num_rows, size_type num_nonzeros,
+    const IndexType *__restrict__ permutation,
+    const IndexType *__restrict__ in_row_ptrs,
+    const IndexType *__restrict__ in_cols,
+    const ValueType *__restrict__ in_vals, IndexType *__restrict__ out_row_ptrs,
+    IndexType *__restrict__ out_cols, ValueType *__restrict__ out_vals)
+{
+    auto tid = thread::get_thread_id_flat();
+    if (tid < num_nonzeros) {
+        out_cols[tid] = permutation[in_cols[tid]];
+        out_vals[tid] = in_vals[tid];
+    }
+    if (tid <= num_rows) {
+        out_row_ptrs[tid] = in_row_ptrs[tid];
+    }
+}
+
+
+template <typename IndexType>
+__global__ __launch_bounds__(default_block_size) void row_ptr_permute_kernel(
+    size_type num_rows, const IndexType *__restrict__ permutation,
+    const IndexType *__restrict__ in_row_ptrs, IndexType *__restrict__ out_nnz)
+{
+    auto tid = thread::get_thread_id_flat();
+    if (tid >= num_rows) {
+        return;
+    }
+    auto in_row = permutation[tid];
+    auto out_row = tid;
+    out_nnz[out_row] = in_row_ptrs[in_row + 1] - in_row_ptrs[in_row];
+}
+
+
+template <typename IndexType>
+__global__
+    __launch_bounds__(default_block_size) void inv_row_ptr_permute_kernel(
+        size_type num_rows, const IndexType *__restrict__ permutation,
+        const IndexType *__restrict__ in_row_ptrs,
+        IndexType *__restrict__ out_nnz)
+{
+    auto tid = thread::get_thread_id_flat();
+    if (tid >= num_rows) {
+        return;
+    }
+    auto in_row = tid;
+    auto out_row = permutation[tid];
+    out_nnz[out_row] = in_row_ptrs[in_row + 1] - in_row_ptrs[in_row];
+}
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void row_permute_kernel(
+    size_type num_rows, const IndexType *__restrict__ permutation,
+    const IndexType *__restrict__ in_row_ptrs,
+    const IndexType *__restrict__ in_cols,
+    const ValueType *__restrict__ in_vals,
+    const IndexType *__restrict__ out_row_ptrs,
+    IndexType *__restrict__ out_cols, ValueType *__restrict__ out_vals)
+{
+    auto tid = thread::get_subwarp_id_flat<subwarp_size>();
+    if (tid >= num_rows) {
+        return;
+    }
+    auto lane = threadIdx.x % subwarp_size;
+    auto in_row = permutation[tid];
+    auto out_row = tid;
+    auto in_begin = in_row_ptrs[in_row];
+    auto in_size = in_row_ptrs[in_row + 1] - in_begin;
+    auto out_begin = out_row_ptrs[out_row];
+    for (IndexType i = lane; i < in_size; i += subwarp_size) {
+        out_cols[out_begin + i] = in_cols[in_begin + i];
+        out_vals[out_begin + i] = in_vals[in_begin + i];
+    }
+}
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+__global__ __launch_bounds__(default_block_size) void inv_row_permute_kernel(
+    size_type num_rows, const IndexType *__restrict__ permutation,
+    const IndexType *__restrict__ in_row_ptrs,
+    const IndexType *__restrict__ in_cols,
+    const ValueType *__restrict__ in_vals,
+    const IndexType *__restrict__ out_row_ptrs,
+    IndexType *__restrict__ out_cols, ValueType *__restrict__ out_vals)
+{
+    auto tid = thread::get_subwarp_id_flat<subwarp_size>();
+    if (tid >= num_rows) {
+        return;
+    }
+    auto lane = threadIdx.x % subwarp_size;
+    auto in_row = tid;
+    auto out_row = permutation[tid];
+    auto in_begin = in_row_ptrs[in_row];
+    auto in_size = in_row_ptrs[in_row + 1] - in_begin;
+    auto out_begin = out_row_ptrs[out_row];
+    for (IndexType i = lane; i < in_size; i += subwarp_size) {
+        out_cols[out_begin + i] = in_cols[in_begin + i];
+        out_vals[out_begin + i] = in_vals[in_begin + i];
+    }
+}
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
@@ -647,22 +647,21 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);
 
 template <typename ValueType, typename IndexType>
-GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL(ValueType, IndexType)
+GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL);
+    GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL);
 
 template <typename ValueType, typename IndexType>
 GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL(ValueType, IndexType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL);
 
-template <typename ValueType, typename IndexType>
-GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType)
+template <typename IndexType>
+GKO_DECLARE_INVERT_PERMUTATION_KERNEL(IndexType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL);
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_INVERT_PERMUTATION_KERNEL);
 
 template <typename ValueType, typename IndexType>
 GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL(ValueType, IndexType)

diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp
@@ -70,9 +70,9 @@ GKO_REGISTER_OPERATION(convert_to_hybrid, csr::convert_to_hybrid);
 GKO_REGISTER_OPERATION(transpose, csr::transpose);
 GKO_REGISTER_OPERATION(conj_transpose, csr::conj_transpose);
 GKO_REGISTER_OPERATION(row_permute, csr::row_permute);
-GKO_REGISTER_OPERATION(column_permute, csr::column_permute);
 GKO_REGISTER_OPERATION(inverse_row_permute, csr::inverse_row_permute);
 GKO_REGISTER_OPERATION(inverse_column_permute, csr::inverse_column_permute);
+GKO_REGISTER_OPERATION(invert_permutation, csr::invert_permutation);
 GKO_REGISTER_OPERATION(calculate_max_nnz_per_row,
                        csr::calculate_max_nnz_per_row);
 GKO_REGISTER_OPERATION(calculate_nonzeros_per_row,
@@ -412,8 +412,8 @@ std::unique_ptr<LinOp> Csr<ValueType, IndexType>::row_permute(
         Csr::create(exec, this->get_size(), this->get_num_stored_elements(),
                     this->get_strategy());
 
-    exec->run(
-        csr::make_row_permute(permutation_indices, this, permute_cpy.get()));
+    exec->run(csr::make_row_permute(permutation_indices->get_const_data(), this,
+                                    permute_cpy.get()));
     permute_cpy->make_srow();
     return std::move(permute_cpy);
 }
@@ -428,10 +428,15 @@ std::unique_ptr<LinOp> Csr<ValueType, IndexType>::column_permute(
     auto permute_cpy =
         Csr::create(exec, this->get_size(), this->get_num_stored_elements(),
                     this->get_strategy());
+    Array<IndexType> inv_permutation(exec, this->get_size()[1]);
 
-    exec->run(
-        csr::make_column_permute(permutation_indices, this, permute_cpy.get()));
+    exec->run(csr::make_invert_permutation(
+        this->get_size()[1], permutation_indices->get_const_data(),
+        inv_permutation.get_data()));
+    exec->run(csr::make_inverse_column_permute(inv_permutation.get_const_data(),
+                                               this, permute_cpy.get()));
     permute_cpy->make_srow();
+    permute_cpy->sort_by_column_index();
     return std::move(permute_cpy);
 }
 
@@ -447,8 +452,9 @@ std::unique_ptr<LinOp> Csr<ValueType, IndexType>::inverse_row_permute(
         Csr::create(exec, this->get_size(), this->get_num_stored_elements(),
                     this->get_strategy());
 
-    exec->run(csr::make_inverse_row_permute(inverse_permutation_indices, this,
-                                            inverse_permute_cpy.get()));
+    exec->run(csr::make_inverse_row_permute(
+        inverse_permutation_indices->get_const_data(), this,
+        inverse_permute_cpy.get()));
     inverse_permute_cpy->make_srow();
     return std::move(inverse_permute_cpy);
 }
@@ -466,8 +472,10 @@ std::unique_ptr<LinOp> Csr<ValueType, IndexType>::inverse_column_permute(
                     this->get_strategy());
 
     exec->run(csr::make_inverse_column_permute(
-        inverse_permutation_indices, this, inverse_permute_cpy.get()));
+        inverse_permutation_indices->get_const_data(), this,
+        inverse_permute_cpy.get()));
     inverse_permute_cpy->make_srow();
+    inverse_permute_cpy->sort_by_column_index();
     return std::move(inverse_permute_cpy);
 }
 

diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp
@@ -131,29 +131,28 @@ namespace kernels {
 
 #define GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL(ValueType, IndexType)    \
     void row_permute(std::shared_ptr<const DefaultExecutor> exec,   \
-                     const Array<IndexType> *permutation_indices,   \
+                     const IndexType *permutation_indices,          \
                      const matrix::Csr<ValueType, IndexType> *orig, \
                      matrix::Csr<ValueType, IndexType> *row_permuted)
 
-#define GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL(ValueType, IndexType)    \
-    void column_permute(std::shared_ptr<const DefaultExecutor> exec,   \
-                        const Array<IndexType> *permutation_indices,   \
-                        const matrix::Csr<ValueType, IndexType> *orig, \
-                        matrix::Csr<ValueType, IndexType> *column_permuted)
-
 #define GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL(ValueType, IndexType)    \
     void inverse_row_permute(std::shared_ptr<const DefaultExecutor> exec,   \
-                             const Array<IndexType> *permutation_indices,   \
+                             const IndexType *permutation_indices,          \
                              const matrix::Csr<ValueType, IndexType> *orig, \
                              matrix::Csr<ValueType, IndexType> *row_permuted)
 
 #define GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType) \
     void inverse_column_permute(                                            \
         std::shared_ptr<const DefaultExecutor> exec,                        \
-        const Array<IndexType> *permutation_indices,                        \
+        const IndexType *permutation_indices,                               \
         const matrix::Csr<ValueType, IndexType> *orig,                      \
         matrix::Csr<ValueType, IndexType> *column_permuted)
 
+#define GKO_DECLARE_INVERT_PERMUTATION_KERNEL(IndexType)             \
+    void invert_permutation(                                         \
+        std::shared_ptr<const DefaultExecutor> exec, size_type size, \
+        const IndexType *permutation_indices, IndexType *inv_permutation)
+
 #define GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL(ValueType, IndexType) \
     void calculate_max_nnz_per_row(                                            \
         std::shared_ptr<const DefaultExecutor> exec,                           \
@@ -210,11 +209,11 @@ namespace kernels {
     template <typename ValueType, typename IndexType>                        \
     GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL(ValueType, IndexType);                \
     template <typename ValueType, typename IndexType>                        \
-    GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL(ValueType, IndexType);             \
-    template <typename ValueType, typename IndexType>                        \
     GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL(ValueType, IndexType);        \
     template <typename ValueType, typename IndexType>                        \
     GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL(ValueType, IndexType);     \
+    template <typename IndexType>                                            \
+    GKO_DECLARE_INVERT_PERMUTATION_KERNEL(IndexType);                        \
     template <typename ValueType, typename IndexType>                        \
     GKO_DECLARE_CSR_CALCULATE_MAX_NNZ_PER_ROW_KERNEL(ValueType, IndexType);  \
     template <typename ValueType, typename IndexType>                        \

diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu
@@ -1143,45 +1143,86 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_CONJ_TRANSPOSE_KERNEL);
 
 
+template <typename IndexType>
+void invert_permutation(std::shared_ptr<const DefaultExecutor> exec,
+                        size_type size, const IndexType *permutation_indices,
+                        IndexType *inv_permutation)
+{
+    auto num_blocks = ceildiv(size, default_block_size);
+    inv_permutation_kernel<<<num_blocks, default_block_size>>>(
+        size, permutation_indices, inv_permutation);
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_INVERT_PERMUTATION_KERNEL);
+
+
 template <typename ValueType, typename IndexType>
 void row_permute(std::shared_ptr<const CudaExecutor> exec,
-                 const Array<IndexType> *permutation_indices,
+                 const IndexType *perm,
                  const matrix::Csr<ValueType, IndexType> *orig,
                  matrix::Csr<ValueType, IndexType> *row_permuted)
-    GKO_NOT_IMPLEMENTED;
+{
+    auto num_rows = orig->get_size()[0];
+    auto count_num_blocks = ceildiv(num_rows, default_block_size);
+    row_ptr_permute_kernel<<<count_num_blocks, default_block_size>>>(
+        num_rows, perm, orig->get_const_row_ptrs(),
+        row_permuted->get_row_ptrs());
+    components::prefix_sum(exec, row_permuted->get_row_ptrs(), num_rows + 1);
+    auto copy_num_blocks =
+        ceildiv(num_rows, default_block_size / config::warp_size);
+    row_permute_kernel<config::warp_size>
+        <<<copy_num_blocks, default_block_size>>>(
+            num_rows, perm, orig->get_const_row_ptrs(),
+            orig->get_const_col_idxs(), as_cuda_type(orig->get_const_values()),
+            row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(),
+            as_cuda_type(row_permuted->get_values()));
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ROW_PERMUTE_KERNEL);
 
 
-template <typename ValueType, typename IndexType>
-void column_permute(std::shared_ptr<const CudaExecutor> exec,
-                    const Array<IndexType> *permutation_indices,
-                    const matrix::Csr<ValueType, IndexType> *orig,
-                    matrix::Csr<ValueType, IndexType> *column_permuted)
-    GKO_NOT_IMPLEMENTED;
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_CSR_COLUMN_PERMUTE_KERNEL);
-
-
 template <typename ValueType, typename IndexType>
 void inverse_row_permute(std::shared_ptr<const CudaExecutor> exec,
-                         const Array<IndexType> *permutation_indices,
+                         const IndexType *perm,
                          const matrix::Csr<ValueType, IndexType> *orig,
                          matrix::Csr<ValueType, IndexType> *row_permuted)
-    GKO_NOT_IMPLEMENTED;
+{
+    auto num_rows = orig->get_size()[0];
+    auto count_num_blocks = ceildiv(num_rows, default_block_size);
+    inv_row_ptr_permute_kernel<<<count_num_blocks, default_block_size>>>(
+        num_rows, perm, orig->get_const_row_ptrs(),
+        row_permuted->get_row_ptrs());
+    components::prefix_sum(exec, row_permuted->get_row_ptrs(), num_rows + 1);
+    auto copy_num_blocks =
+        ceildiv(num_rows, default_block_size / config::warp_size);
+    inv_row_permute_kernel<config::warp_size>
+        <<<copy_num_blocks, default_block_size>>>(
+            num_rows, perm, orig->get_const_row_ptrs(),
+            orig->get_const_col_idxs(), as_cuda_type(orig->get_const_values()),
+            row_permuted->get_row_ptrs(), row_permuted->get_col_idxs(),
+            as_cuda_type(row_permuted->get_values()));
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INVERSE_ROW_PERMUTE_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
 void inverse_column_permute(std::shared_ptr<const CudaExecutor> exec,
-                            const Array<IndexType> *permutation_indices,
+                            const IndexType *perm,
                             const matrix::Csr<ValueType, IndexType> *orig,
                             matrix::Csr<ValueType, IndexType> *column_permuted)
-    GKO_NOT_IMPLEMENTED;
+{
+    auto num_rows = orig->get_size()[0];
+    auto nnz = orig->get_num_stored_elements();
+    auto num_blocks = ceildiv(std::max(num_rows, nnz), default_block_size);
+    col_permute_kernel<<<num_blocks, default_block_size>>>(
+        num_rows, nnz, perm, orig->get_const_row_ptrs(),
+        orig->get_const_col_idxs(), as_cuda_type(orig->get_const_values()),
+        column_permuted->get_row_ptrs(), column_permuted->get_col_idxs(),
+        as_cuda_type(column_permuted->get_values()));
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_INVERSE_COLUMN_PERMUTE_KERNEL);