add compact factor storage support to trisolvers
upsj committed Jul 10, 2022
1 parent 78678c2 commit 8212b9b
Showing 27 changed files with 1,027 additions and 485 deletions.
13 changes: 7 additions & 6 deletions core/solver/lower_trs.cpp
@@ -134,9 +134,9 @@ template <typename ValueType, typename IndexType>
void LowerTrs<ValueType, IndexType>::generate()
{
if (this->get_system_matrix()) {
this->get_executor()->run(
lower_trs::make_generate(this->get_system_matrix().get(),
this->solve_struct_, parameters_.num_rhs));
this->get_executor()->run(lower_trs::make_generate(
this->get_system_matrix().get(), this->solve_struct_,
this->get_parameters().unit_diagonal, parameters_.num_rhs));
}
}

@@ -176,9 +176,10 @@ void LowerTrs<ValueType, IndexType>::apply_impl(const LinOp* b, LinOp* x) const
trans_x = this->template create_workspace_op<Vector>(
ws::transposed_x, gko::transpose(dense_x->get_size()));
}
exec->run(lower_trs::make_solve(lend(this->get_system_matrix()),
lend(this->solve_struct_), trans_b,
trans_x, dense_b, dense_x));
exec->run(lower_trs::make_solve(
lend(this->get_system_matrix()), lend(this->solve_struct_),
this->get_parameters().unit_diagonal, trans_b, trans_x, dense_b,
dense_x));
},
b, x);
}
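
The new flag is threaded from the solver parameters into both generate() and the solve kernel. A minimal usage sketch from the caller's side, assuming the factory parameter generates the usual with_unit_diagonal() setter (the setter name is inferred from the new parameters_.unit_diagonal field; num_rhs is an existing parameter):

    #include <ginkgo/ginkgo.hpp>

    // Sketch: build a lower triangular solver for a factor whose unit
    // diagonal is not stored explicitly (compact factor storage).
    // with_unit_diagonal(...) is an assumed setter for the new parameter.
    std::unique_ptr<gko::LinOp> make_unit_diag_lower_trs(
        std::shared_ptr<const gko::Executor> exec,
        std::shared_ptr<gko::matrix::Csr<double, int>> strictly_lower_factor)
    {
        return gko::solver::LowerTrs<double, int>::build()
            .with_unit_diagonal(true)  // treat every diagonal entry as 1
            .with_num_rhs(1u)
            .on(exec)
            ->generate(strictly_lower_factor);
    }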
4 changes: 2 additions & 2 deletions core/solver/lower_trs_kernels.hpp
@@ -62,13 +62,13 @@ namespace lower_trs {
void generate(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Csr<_vtype, _itype>* matrix, \
std::shared_ptr<solver::SolveStruct>& solve_struct, \
const gko::size_type num_rhs)
bool unit_diag, const gko::size_type num_rhs)


#define GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL(_vtype, _itype) \
void solve(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Csr<_vtype, _itype>* matrix, \
const solver::SolveStruct* solve_struct, \
const solver::SolveStruct* solve_struct, bool unit_diag, \
matrix::Dense<_vtype>* trans_b, matrix::Dense<_vtype>* trans_x, \
const matrix::Dense<_vtype>* b, matrix::Dense<_vtype>* x)
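
For one concrete type pair, the updated solve macro would expand to a declaration along these lines (double/int32 chosen arbitrarily; the point is where the new unit_diag flag sits):

    // Sketch of GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL(double, gko::int32) after
    // this change: unit_diag follows the solve struct, matching generate().
    void solve(std::shared_ptr<const DefaultExecutor> exec,
               const matrix::Csr<double, gko::int32>* matrix,
               const solver::SolveStruct* solve_struct, bool unit_diag,
               matrix::Dense<double>* trans_b, matrix::Dense<double>* trans_x,
               const matrix::Dense<double>* b, matrix::Dense<double>* x);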

13 changes: 7 additions & 6 deletions core/solver/upper_trs.cpp
@@ -134,9 +134,9 @@ template <typename ValueType, typename IndexType>
void UpperTrs<ValueType, IndexType>::generate()
{
if (this->get_system_matrix()) {
this->get_executor()->run(
upper_trs::make_generate(this->get_system_matrix().get(),
this->solve_struct_, parameters_.num_rhs));
this->get_executor()->run(upper_trs::make_generate(
this->get_system_matrix().get(), this->solve_struct_,
this->get_parameters().unit_diagonal, parameters_.num_rhs));
}
}

@@ -176,9 +176,10 @@ void UpperTrs<ValueType, IndexType>::apply_impl(const LinOp* b, LinOp* x) const
trans_x = this->template create_workspace_op<Vector>(
ws::transposed_x, gko::transpose(dense_x->get_size()));
}
exec->run(upper_trs::make_solve(lend(this->get_system_matrix()),
lend(this->solve_struct_), trans_b,
trans_x, dense_b, dense_x));
exec->run(upper_trs::make_solve(
lend(this->get_system_matrix()), lend(this->solve_struct_),
this->get_parameters().unit_diagonal, trans_b, trans_x, dense_b,
dense_x));
},
b, x);
}
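
Compact factor storage typically means an incomplete factorization keeps L and U in a single CSR matrix, with L's unit diagonal omitted. A sketch of how the two triangular solvers could share such a combined factor, assuming the with_unit_diagonal() setter exists and that the factor's rows are sorted by column (the kernels further down skip entries outside the requested triangle):

    #include <ginkgo/ginkgo.hpp>

    // Sketch: x = U^{-1} L^{-1} b with L and U stored in one CSR matrix,
    // where L's diagonal is implicit (unit) and U's diagonal is stored.
    void apply_compact_ilu(
        std::shared_ptr<const gko::Executor> exec,
        std::shared_ptr<gko::matrix::Csr<double, int>> combined_lu,
        const gko::matrix::Dense<double>* b, gko::matrix::Dense<double>* x)
    {
        auto y = gko::matrix::Dense<double>::create(exec, b->get_size());
        auto l_solver = gko::solver::LowerTrs<double, int>::build()
                            .with_unit_diagonal(true)   // L: implicit diagonal
                            .on(exec)
                            ->generate(combined_lu);
        auto u_solver = gko::solver::UpperTrs<double, int>::build()
                            .with_unit_diagonal(false)  // U: stored diagonal
                            .on(exec)
                            ->generate(combined_lu);
        l_solver->apply(b, y.get());  // y = L^{-1} b
        u_solver->apply(y.get(), x);  // x = U^{-1} y
    }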
4 changes: 2 additions & 2 deletions core/solver/upper_trs_kernels.hpp
@@ -62,13 +62,13 @@ namespace upper_trs {
void generate(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Csr<_vtype, _itype>* matrix, \
std::shared_ptr<gko::solver::SolveStruct>& solve_struct, \
const gko::size_type num_rhs)
bool unit_diag, const gko::size_type num_rhs)


#define GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL(_vtype, _itype) \
void solve(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Csr<_vtype, _itype>* matrix, \
const solver::SolveStruct* solve_struct, \
const solver::SolveStruct* solve_struct, bool unit_diag, \
matrix::Dense<_vtype>* trans_b, matrix::Dense<_vtype>* trans_x, \
const matrix::Dense<_vtype>* b, matrix::Dense<_vtype>* x)

7 changes: 7 additions & 0 deletions cuda/base/cusparse_bindings.hpp
@@ -790,6 +790,13 @@ inline void set_mat_fill_mode(cusparseMatDescr_t descr,
}


inline void set_mat_diag_type(cusparseMatDescr_t descr,
cusparseDiagType_t diag_type)
{
GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSetMatDiagType(descr, diag_type));
}


inline void destroy(cusparseMatDescr_t descr)
{
GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyMatDescr(descr));
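
A usage sketch for the new binding, mirroring the call added to cuda/solver/common_trs_kernels.cuh further down (descriptor creation and cleanup shown for context):

    cusparseMatDescr_t descr{};
    GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateMatDescr(&descr));
    cusparse::set_mat_fill_mode(descr, CUSPARSE_FILL_MODE_LOWER);
    // declare the diagonal as implicit ones, so the factor can be stored
    // without its diagonal entries
    cusparse::set_mat_diag_type(descr, CUSPARSE_DIAG_TYPE_UNIT);
    // ... run the triangular solve ...
    cusparse::destroy(descr);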
58 changes: 36 additions & 22 deletions cuda/solver/common_trs_kernels.cuh
@@ -89,7 +89,7 @@ struct CudaSolveStruct : gko::solver::SolveStruct {

CudaSolveStruct(std::shared_ptr<const gko::CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
size_type num_rhs, bool is_upper)
size_type num_rhs, bool is_upper, bool unit_diag)
: handle{exec->get_cusparse_handle()},
spsm_descr{},
descr_a{},
@@ -107,7 +107,8 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
descr_a, CUSPARSE_SPMAT_FILL_MODE,
is_upper ? CUSPARSE_FILL_MODE_UPPER : CUSPARSE_FILL_MODE_LOWER);
cusparse::set_attribute<cusparseDiagType_t>(
descr_a, CUSPARSE_SPMAT_DIAG_TYPE, CUSPARSE_DIAG_TYPE_NON_UNIT);
descr_a, CUSPARSE_SPMAT_DIAG_TYPE,
unit_diag ? CUSPARSE_DIAG_TYPE_UNIT : CUSPARSE_DIAG_TYPE_NON_UNIT);

const auto rows = matrix->get_size()[0];
// workaround suggested by NVIDIA engineers: for some reason
@@ -193,7 +194,7 @@ struct CudaSolveStruct : gko::solver::SolveStruct {

CudaSolveStruct(std::shared_ptr<const gko::CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
size_type num_rhs, bool is_upper)
size_type num_rhs, bool is_upper, bool unit_diag)
: exec{exec},
handle{exec->get_cusparse_handle()},
algorithm{},
@@ -208,6 +209,9 @@ struct CudaSolveStruct : gko::solver::SolveStruct {
cusparse::set_mat_fill_mode(
factor_descr,
is_upper ? CUSPARSE_FILL_MODE_UPPER : CUSPARSE_FILL_MODE_LOWER);
cusparse::set_mat_diag_type(
factor_descr,
unit_diag ? CUSPARSE_DIAG_TYPE_UNIT : CUSPARSE_DIAG_TYPE_NON_UNIT);
algorithm = 0;
policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

@@ -286,14 +290,15 @@ template <typename ValueType, typename IndexType>
void generate_kernel(std::shared_ptr<const CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
std::shared_ptr<solver::SolveStruct>& solve_struct,
const gko::size_type num_rhs, bool is_upper)
const gko::size_type num_rhs, bool is_upper,
bool unit_diag)
{
if (matrix->get_size()[0] == 0) {
return;
}
if (cusparse::is_supported<ValueType, IndexType>::value) {
solve_struct = std::make_shared<CudaSolveStruct<ValueType, IndexType>>(
exec, matrix, num_rhs, is_upper);
exec, matrix, num_rhs, is_upper, unit_diag);
} else {
GKO_NOT_IMPLEMENTED;
}
@@ -377,7 +382,8 @@ __global__ void sptrsv_naive_caching_kernel(
const IndexType* const rowptrs, const IndexType* const colidxs,
const ValueType* const vals, const ValueType* const b, size_type b_stride,
ValueType* const x, size_type x_stride, const size_type n,
const size_type nrhs, bool* nan_produced, IndexType* atomic_counter)
const size_type nrhs, bool unit_diag, bool* nan_produced,
IndexType* atomic_counter)
{
__shared__ uninitialized_array<ValueType, default_block_size> x_s_array;
__shared__ IndexType block_base_idx;
Expand Down Expand Up @@ -409,12 +415,16 @@ __global__ void sptrsv_naive_caching_kernel(
// upper tri matrix: start at last entry (row_end - 1), run backward
// until first entry, which is the diagonal entry
const auto row_begin = is_upper ? rowptrs[row + 1] - 1 : rowptrs[row];
const auto row_diag = is_upper ? rowptrs[row] : rowptrs[row + 1] - 1;
const auto row_end = is_upper ? rowptrs[row] - 1 : rowptrs[row + 1];
const int row_step = is_upper ? -1 : 1;

ValueType sum = 0.0;
for (auto i = row_begin; i != row_diag; i += row_step) {
auto sum = zero<ValueType>();
auto i = row_begin;
for (; i != row_end; i += row_step) {
const auto dependency = colidxs[i];
if (is_upper ? dependency <= row : dependency >= row) {
break;
}
auto x_p = &x[dependency * x_stride + rhs];

const auto dependency_gid = is_upper ? (n - 1 - dependency) * nrhs + rhs
@@ -434,7 +444,9 @@ __global__ void sptrsv_naive_caching_kernel(
sum += x * vals[i];
}

const auto r = (b[row * b_stride + rhs] - sum) / vals[row_diag];
// The first entry past the triangular part will be the diagonal
const auto diag = unit_diag ? one<ValueType>() : vals[i];
const auto r = (b[row * b_stride + rhs] - sum) / diag;

store(x_s, self_shid, r);
x[row * x_stride + rhs] = r;
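
The loop above no longer assumes the diagonal is the last stored entry of the row's triangular part; it stops at the first column index on or past the diagonal and, unless unit_diag is set, divides by the value stored there. A sequential host-side sketch of the same per-row logic for the lower-triangular case (single right-hand side, no inter-row synchronization; assumes column indices within each row are sorted):

    template <typename ValueType, typename IndexType>
    void lower_solve_row(const IndexType* rowptrs, const IndexType* colidxs,
                         const ValueType* vals, const ValueType* b,
                         ValueType* x, IndexType row, bool unit_diag)
    {
        ValueType sum{};
        auto i = rowptrs[row];
        // accumulate contributions of strictly-lower entries, stopping at the
        // first entry on or past the diagonal
        for (; i < rowptrs[row + 1] && colidxs[i] < row; ++i) {
            sum += vals[i] * x[colidxs[i]];
        }
        // that entry is the diagonal; like the kernel, this sketch expects it
        // to be present whenever unit_diag is false
        const auto diag = unit_diag ? ValueType{1} : vals[i];
        x[row] = (b[row] - sum) / diag;
    }

For example, with L = [[1,0,0],[2,1,0],[3,4,1]] stored without its unit diagonal (values 2, 3, 4) and b = (1, 4, 14), forward substitution with unit_diag = true yields x = (1, 2, 3) without ever reading a stored diagonal entry.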
@@ -453,7 +465,8 @@ __global__ void sptrsv_naive_legacy_kernel(
const IndexType* const rowptrs, const IndexType* const colidxs,
const ValueType* const vals, const ValueType* const b, size_type b_stride,
ValueType* const x, size_type x_stride, const size_type n,
const size_type nrhs, bool* nan_produced, IndexType* atomic_counter)
const size_type nrhs, bool unit_diag, bool* nan_produced,
IndexType* atomic_counter)
{
__shared__ IndexType block_base_idx;
if (threadIdx.x == 0) {
Expand All @@ -470,18 +483,16 @@ __global__ void sptrsv_naive_legacy_kernel(
return;
}

// lower tri matrix: start at beginning, run forward until last entry,
// (row_end - 1) which is the diagonal entry
// lower tri matrix: start at beginning, run forward
// upper tri matrix: start at last entry (row_end - 1), run backward
// until first entry, which is the diagonal entry
const auto row_begin = is_upper ? rowptrs[row + 1] - 1 : rowptrs[row];
const auto row_diag = is_upper ? rowptrs[row] : rowptrs[row + 1] - 1;
const auto row_end = is_upper ? rowptrs[row] - 1 : rowptrs[row + 1];
const int row_step = is_upper ? -1 : 1;

ValueType sum = 0.0;
auto j = row_begin;
while (j != row_diag + row_step) {
auto col = colidxs[j];
auto col = colidxs[j];
while (j != row_end) {
auto x_val = load(x, col * x_stride + rhs);
while (!is_nan(x_val)) {
sum += vals[j] * x_val;
Expand All @@ -490,9 +501,12 @@ __global__ void sptrsv_naive_legacy_kernel(
x_val = load(x, col * x_stride + rhs);
}
if (row == col) {
const auto r = (b[row * b_stride + rhs] - sum) / vals[row_diag];
auto diag = unit_diag ? one<ValueType>() : vals[j];
const auto r = (b[row * b_stride + rhs] - sum) / diag;
store(x, row * x_stride + rhs, r);
j += row_step;
// after we encountered the diagonal, we are done
// this also skips entries outside the triangle
j = row_end;
if (is_nan(r)) {
store(x, row * x_stride + rhs, zero<ValueType>());
*nan_produced = true;
@@ -514,7 +528,7 @@ __global__ void sptrsv_init_kernel(bool* const nan_produced,
template <bool is_upper, typename ValueType, typename IndexType>
void sptrsv_naive_caching(std::shared_ptr<const CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
const matrix::Dense<ValueType>* b,
bool unit_diag, const matrix::Dense<ValueType>* b,
matrix::Dense<ValueType>* x)
{
// Pre-Volta GPUs may deadlock due to missing independent thread scheduling.
Expand All @@ -540,14 +554,14 @@ void sptrsv_naive_caching(std::shared_ptr<const CudaExecutor> exec,
matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
as_cuda_type(matrix->get_const_values()),
as_cuda_type(b->get_const_values()), b->get_stride(),
as_cuda_type(x->get_values()), x->get_stride(), n, nrhs,
as_cuda_type(x->get_values()), x->get_stride(), n, nrhs, unit_diag,
nan_produced.get_data(), atomic_counter.get_data());
} else {
sptrsv_naive_caching_kernel<is_upper><<<grid_size, block_size>>>(
matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
as_cuda_type(matrix->get_const_values()),
as_cuda_type(b->get_const_values()), b->get_stride(),
as_cuda_type(x->get_values()), x->get_stride(), n, nrhs,
as_cuda_type(x->get_values()), x->get_stride(), n, nrhs, unit_diag,
nan_produced.get_data(), atomic_counter.get_data());
}

8 changes: 4 additions & 4 deletions cuda/solver/lower_trs_kernels.cu
@@ -73,11 +73,11 @@ template <typename ValueType, typename IndexType>
void generate(std::shared_ptr<const CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
std::shared_ptr<solver::SolveStruct>& solve_struct,
const gko::size_type num_rhs)
bool unit_diag, const gko::size_type num_rhs)
{
if (matrix->get_strategy()->get_name() == "sparselib") {
generate_kernel<ValueType, IndexType>(exec, matrix, solve_struct,
num_rhs, false);
num_rhs, false, unit_diag);
}
}

@@ -88,15 +88,15 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
template <typename ValueType, typename IndexType>
void solve(std::shared_ptr<const CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
const solver::SolveStruct* solve_struct,
const solver::SolveStruct* solve_struct, bool unit_diag,
matrix::Dense<ValueType>* trans_b, matrix::Dense<ValueType>* trans_x,
const matrix::Dense<ValueType>* b, matrix::Dense<ValueType>* x)
{
if (matrix->get_strategy()->get_name() == "sparselib") {
solve_kernel<ValueType, IndexType>(exec, matrix, solve_struct, trans_b,
trans_x, b, x);
} else {
sptrsv_naive_caching<false>(exec, matrix, b, x);
sptrsv_naive_caching<false>(exec, matrix, unit_diag, b, x);
}
}
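
Which branch is taken depends on the Csr strategy of the system matrix: the "sparselib" strategy dispatches to cuSPARSE, anything else now reaches the unit_diag-aware sptrsv_naive_caching kernel. A small sketch, assuming Csr::set_strategy() and the classical strategy are available as in current Ginkgo versions:

    #include <ginkgo/ginkgo.hpp>

    // Sketch: route triangular solves through the custom SpTRSV kernels by
    // choosing a non-"sparselib" Csr strategy (classical is just one such
    // strategy); Csr::set_strategy() is an assumption here.
    void use_custom_sptrsv(std::shared_ptr<const gko::CudaExecutor> exec,
                           std::shared_ptr<gko::matrix::Csr<double, int>> factor)
    {
        using csr = gko::matrix::Csr<double, int>;
        factor->set_strategy(std::make_shared<csr::classical>());
        // LowerTrs/UpperTrs generated from `factor` will now take the
        // sptrsv_naive_caching<...>(exec, matrix, unit_diag, b, x) branch.
    }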

8 changes: 4 additions & 4 deletions cuda/solver/upper_trs_kernels.cu
@@ -73,11 +73,11 @@ template <typename ValueType, typename IndexType>
void generate(std::shared_ptr<const CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
std::shared_ptr<solver::SolveStruct>& solve_struct,
const gko::size_type num_rhs)
bool unit_diag, const gko::size_type num_rhs)
{
if (matrix->get_strategy()->get_name() == "sparselib") {
generate_kernel<ValueType, IndexType>(exec, matrix, solve_struct,
num_rhs, true);
num_rhs, true, unit_diag);
}
}

@@ -88,15 +88,15 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
template <typename ValueType, typename IndexType>
void solve(std::shared_ptr<const CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
const solver::SolveStruct* solve_struct,
const solver::SolveStruct* solve_struct, bool unit_diag,
matrix::Dense<ValueType>* trans_b, matrix::Dense<ValueType>* trans_x,
const matrix::Dense<ValueType>* b, matrix::Dense<ValueType>* x)
{
if (matrix->get_strategy()->get_name() == "sparselib") {
solve_kernel<ValueType, IndexType>(exec, matrix, solve_struct, trans_b,
trans_x, b, x);
} else {
sptrsv_naive_caching<true>(exec, matrix, b, x);
sptrsv_naive_caching<true>(exec, matrix, unit_diag, b, x);
}
}

4 changes: 2 additions & 2 deletions dpcpp/solver/lower_trs_kernels.dp.cpp
@@ -70,7 +70,7 @@ template <typename ValueType, typename IndexType>
void generate(std::shared_ptr<const DpcppExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
std::shared_ptr<solver::SolveStruct>& solve_struct,
const gko::size_type num_rhs) GKO_NOT_IMPLEMENTED;
bool unit_diag, const gko::size_type num_rhs) GKO_NOT_IMPLEMENTED;

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL);
@@ -83,7 +83,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
template <typename ValueType, typename IndexType>
void solve(std::shared_ptr<const DpcppExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
const solver::SolveStruct* solve_struct,
const solver::SolveStruct* solve_struct, bool unit_diag,
matrix::Dense<ValueType>* trans_b, matrix::Dense<ValueType>* trans_x,
const matrix::Dense<ValueType>* b,
matrix::Dense<ValueType>* x) GKO_NOT_IMPLEMENTED;
4 changes: 2 additions & 2 deletions dpcpp/solver/upper_trs_kernels.dp.cpp
@@ -70,7 +70,7 @@ template <typename ValueType, typename IndexType>
void generate(std::shared_ptr<const DpcppExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
std::shared_ptr<solver::SolveStruct>& solve_struct,
const gko::size_type num_rhs) GKO_NOT_IMPLEMENTED;
bool unit_diag, const gko::size_type num_rhs) GKO_NOT_IMPLEMENTED;

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL);
@@ -83,7 +83,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
template <typename ValueType, typename IndexType>
void solve(std::shared_ptr<const DpcppExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
const solver::SolveStruct* solve_struct,
const solver::SolveStruct* solve_struct, bool unit_diag,
matrix::Dense<ValueType>* trans_b, matrix::Dense<ValueType>* trans_x,
const matrix::Dense<ValueType>* b,
matrix::Dense<ValueType>* x) GKO_NOT_IMPLEMENTED;