Merge #1769 Add distributed multigrid example
This PR adds an example that shows how to set up a distributed multigrid-preconditioned solver. Related PR: #1769
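In short, the example builds a distributed CG solver whose preconditioner is a Ginkgo multigrid with a Schwarz-wrapped block-Jacobi smoother, PGM coarsening, and a few CG iterations as coarse-grid solver. Below is a condensed sketch of that composition as a hypothetical helper function (not part of the commit); MPI setup, executor selection, and matrix assembly are omitted, see the full example in the diff that follows.

#include <ginkgo/ginkgo.hpp>

#include <memory>

// Condensed sketch: given an executor and an assembled distributed matrix,
// build a CG solver preconditioned by multigrid. Types follow the example.
std::unique_ptr<gko::solver::Cg<double>> build_mg_preconditioned_cg(
    std::shared_ptr<const gko::Executor> exec,
    std::shared_ptr<gko::experimental::distributed::Matrix<double, gko::int32,
                                                           gko::int64>> A)
{
    using schwarz = gko::experimental::distributed::preconditioner::Schwarz<
        double, gko::int32, gko::int64>;
    using bj = gko::preconditioner::Jacobi<double, gko::int32>;
    using mg = gko::solver::Multigrid;
    using pgm = gko::multigrid::Pgm<double, gko::int32>;
    using cg = gko::solver::Cg<double>;

    // Jacobi only acts on the local diagonal block, so wrap it in Schwarz to
    // obtain a smoother that can be applied to a distributed matrix.
    auto smoother = gko::share(gko::solver::build_smoother(
        gko::share(schwarz::build().with_local_solver(bj::build()).on(exec)),
        2u, 0.9));
    // PGM coarsening and Multigrid accept distributed matrices directly; a
    // few CG iterations serve as the coarsest-level solver.
    auto mg_factory = gko::share(
        mg::build()
            .with_mg_level(pgm::build().with_deterministic(true))
            .with_pre_smoother(smoother)
            .with_coarsest_solver(gko::share(
                cg::build()
                    .with_criteria(
                        gko::stop::Iteration::build().with_max_iters(4u))
                    .on(exec)))
            .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
            .on(exec));
    // Use the multigrid as preconditioner for the outer distributed CG.
    return cg::build()
        .with_preconditioner(mg_factory)
        .with_criteria(gko::stop::Iteration::build().with_max_iters(1000u),
                       gko::stop::ResidualNorm<double>::build()
                           .with_reduction_factor(1e-8))
        .on(exec)
        ->generate(A);
}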
Showing 9 changed files with 321 additions and 1 deletion.
16 changes: 16 additions & 0 deletions
examples/distributed-multigrid-preconditioned-solver/CMakeLists.txt
@@ -0,0 +1,16 @@
cmake_minimum_required(VERSION 3.16)
project(distributed-multigrid-preconditioned-solver)

# We only need to find Ginkgo if we build this example stand-alone
if(NOT GINKGO_BUILD_EXAMPLES)
    find_package(Ginkgo 1.10.0 REQUIRED)
endif()

add_executable(
    distributed-multigrid-preconditioned-solver
    distributed-multigrid-preconditioned-solver.cpp
)
target_link_libraries(
    distributed-multigrid-preconditioned-solver
    Ginkgo::ginkgo
)
268 changes: 268 additions & 0 deletions
...stributed-multigrid-preconditioned-solver/distributed-multigrid-preconditioned-solver.cpp
@@ -0,0 +1,268 @@
// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

// @sect3{Include files}

// This is the main ginkgo header file.
#include <ginkgo/ginkgo.hpp>

// Add the C++ iostream header to output information to the console.
#include <iostream>
// Add the STL map header for the executor selection.
#include <map>
// Add the string manipulation header to handle strings.
#include <string>


int main(int argc, char* argv[])
{
    // @sect3{Initialize the MPI environment}
    // Since this is an MPI program, we need to initialize and finalize MPI at
    // the beginning and end of our program, respectively. This can be easily
    // done with the following helper construct that uses RAII to automate the
    // initialization and finalization.
    const gko::experimental::mpi::environment env(argc, argv);
    // @sect3{Type Definitions}
    // Define the needed types. In a parallel program we need to differentiate
    // between global and local indices, thus we have two index types.
    using GlobalIndexType = gko::int64;
    using LocalIndexType = gko::int32;
    // The underlying value type.
    using ValueType = double;
    // As vector type we use the following, which implements a subset of @ref
    // gko::matrix::Dense.
    using dist_vec = gko::experimental::distributed::Vector<ValueType>;
    // As matrix type we simply use the following type, which can read
    // distributed data and be applied to a distributed vector.
    using dist_mtx =
        gko::experimental::distributed::Matrix<ValueType, LocalIndexType,
                                               GlobalIndexType>;
    // We still need a localized vector type to be used as scalars in the
    // advanced apply operations.
    using vec = gko::matrix::Dense<ValueType>;
    // The partition type describes how the rows of the matrices are
    // distributed.
    using part_type =
        gko::experimental::distributed::Partition<LocalIndexType,
                                                  GlobalIndexType>;
    // Here we can use the same solver type as in a non-distributed program.
    // Please note that not all solvers support distributed systems at the
    // moment.
    using solver = gko::solver::Cg<ValueType>;
    // We use the Schwarz preconditioner to extend non-distributed
    // preconditioners, like our Jacobi, to the distributed case. The Schwarz
    // preconditioner wraps another preconditioner and applies it only to the
    // local part of a distributed matrix. This will be used as our
    // distributed multigrid smoother.
    using schwarz = gko::experimental::distributed::preconditioner::Schwarz<
        ValueType, LocalIndexType, GlobalIndexType>;
    using bj = gko::preconditioner::Jacobi<ValueType, LocalIndexType>;
    // Multigrid and Pgm can accept the distributed matrix, so we still use
    // the same types as in the non-distributed case.
    using mg = gko::solver::Multigrid;
    using pgm = gko::multigrid::Pgm<ValueType, LocalIndexType>;

    // Create an MPI communicator and get the rank of the calling process.
    const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
    const auto rank = comm.rank();

    // @sect3{User Input Handling}
    // User input settings:
    // - The executor, defaults to reference.
    // - The number of grid points, defaults to 100.
    // - The number of iterations, defaults to 1000.
    if (argc == 2 && (std::string(argv[1]) == "--help")) {
        if (rank == 0) {
            std::cerr << "Usage: " << argv[0]
                      << " [executor] [num_grid_points] [num_iterations] "
                      << std::endl;
        }
        std::exit(-1);
    }

    ValueType t_init = gko::experimental::mpi::get_walltime();

    const auto executor_string = argc >= 2 ? argv[1] : "reference";
    const auto grid_dim =
        static_cast<gko::size_type>(argc >= 3 ? std::atoi(argv[2]) : 100);
    const auto num_iters =
        static_cast<gko::size_type>(argc >= 4 ? std::atoi(argv[3]) : 1000);

    const std::map<std::string,
                   std::function<std::shared_ptr<gko::Executor>(MPI_Comm)>>
        executor_factory_mpi{
            {"reference",
             [](MPI_Comm) { return gko::ReferenceExecutor::create(); }},
            {"omp", [](MPI_Comm) { return gko::OmpExecutor::create(); }},
            {"cuda",
             [](MPI_Comm comm) {
                 int device_id = gko::experimental::mpi::map_rank_to_device_id(
                     comm, gko::CudaExecutor::get_num_devices());
                 return gko::CudaExecutor::create(
                     device_id, gko::ReferenceExecutor::create());
             }},
            {"hip",
             [](MPI_Comm comm) {
                 int device_id = gko::experimental::mpi::map_rank_to_device_id(
                     comm, gko::HipExecutor::get_num_devices());
                 return gko::HipExecutor::create(
                     device_id, gko::ReferenceExecutor::create());
             }},
            {"dpcpp", [](MPI_Comm comm) {
                 int device_id = 0;
                 if (gko::DpcppExecutor::get_num_devices("gpu")) {
                     device_id = gko::experimental::mpi::map_rank_to_device_id(
                         comm, gko::DpcppExecutor::get_num_devices("gpu"));
                 } else if (gko::DpcppExecutor::get_num_devices("cpu")) {
                     device_id = gko::experimental::mpi::map_rank_to_device_id(
                         comm, gko::DpcppExecutor::get_num_devices("cpu"));
                 } else {
                     throw std::runtime_error("No suitable DPC++ devices");
                 }
                 return gko::DpcppExecutor::create(
                     device_id, gko::ReferenceExecutor::create());
             }}};

    auto exec = executor_factory_mpi.at(executor_string)(MPI_COMM_WORLD);

    // @sect3{Creating the Distributed Matrix and Vectors}
    // As a first step, we create a partition of the rows. The partition
    // consists of ranges of consecutive rows which are assigned a part-id.
    // These part-ids will be used by the distributed data structures to
    // determine which rows will be stored locally. In this example each rank
    // has (nearly) the same number of rows, so we can use the following
    // specialized constructor. See @ref gko::distributed::Partition for other
    // modes of creating a partition.
    const auto num_rows = grid_dim;
    auto partition = gko::share(part_type::build_from_global_size_uniform(
        exec->get_master(), comm.size(),
        static_cast<GlobalIndexType>(num_rows)));

    // Assemble the matrix using a 3-pt stencil and fill the right-hand-side
    // with a sine value. The distributed matrix supports only constructing an
    // empty matrix of zero size and filling in the values with
    // gko::experimental::distributed::Matrix::read_distributed. Only the data
    // that belongs to the rows owned by this rank will be assembled.
    gko::matrix_data<ValueType, GlobalIndexType> A_data;
    gko::matrix_data<ValueType, GlobalIndexType> b_data;
    gko::matrix_data<ValueType, GlobalIndexType> x_data;
    A_data.size = {num_rows, num_rows};
    b_data.size = {num_rows, 1};
    x_data.size = {num_rows, 1};
    const auto range_start = partition->get_range_bounds()[rank];
    const auto range_end = partition->get_range_bounds()[rank + 1];
    for (int i = range_start; i < range_end; i++) {
        if (i > 0) {
            A_data.nonzeros.emplace_back(i, i - 1, -1);
        }
        A_data.nonzeros.emplace_back(i, i, 2);
        if (i < grid_dim - 1) {
            A_data.nonzeros.emplace_back(i, i + 1, -1);
        }
        b_data.nonzeros.emplace_back(i, 0, std::sin(i * 0.01));
        x_data.nonzeros.emplace_back(i, 0, gko::zero<ValueType>());
    }

    // Take timings.
    comm.synchronize();
    ValueType t_init_end = gko::experimental::mpi::get_walltime();

    // Read the matrix data; currently this is only supported on CPU
    // executors. This will also set up the communication pattern needed for
    // the distributed matrix-vector multiplication.
    auto A_host = gko::share(dist_mtx::create(exec->get_master(), comm));
    auto x_host = dist_vec::create(exec->get_master(), comm);
    auto b_host = dist_vec::create(exec->get_master(), comm);
    A_host->read_distributed(A_data, partition);
    b_host->read_distributed(b_data, partition);
    x_host->read_distributed(x_data, partition);
    // After reading, the matrix and vectors can be moved to the chosen
    // executor, since the distributed matrix supports SpMV also on devices.
    auto A = gko::share(dist_mtx::create(exec, comm));
    auto x = dist_vec::create(exec, comm);
    auto b = dist_vec::create(exec, comm);
    A->copy_from(A_host);
    b->copy_from(b_host);
    x->copy_from(x_host);

    // Take timings.
    comm.synchronize();
    ValueType t_read_setup_end = gko::experimental::mpi::get_walltime();


    // @sect3{Solve the Distributed System}
    // Generate the solver.

    // Set up the multigrid factory with a customized smoother and coarse
    // solver. Because block Jacobi does not support distributed matrices, we
    // need to wrap it in Schwarz.
    auto schwarz_bj_factory =
        gko::share(schwarz::build().with_local_solver(bj::build()).on(exec));
    auto smoother_factory = gko::share(gko::solver::build_smoother(
        schwarz_bj_factory, 2u, static_cast<ValueType>(0.9)));
    // Cg supports distributed matrices, so we can use it as we would in the
    // non-distributed case.
    auto coarsest_factory = gko::share(
        solver::build()
            .with_criteria(gko::stop::Iteration::build().with_max_iters(4u))
            .on(exec));
    // The multigrid preconditioner uses the Schwarz-wrapped Jacobi as
    // smoother and Cg as coarse solver.
    auto mg_factory = gko::share(
        mg::build()
            .with_mg_level(pgm::build().with_deterministic(true))
            .with_pre_smoother(smoother_factory)
            .with_coarsest_solver(coarsest_factory)
            .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
            .on(exec));

    // Set up the stopping criterion and logger.
    const gko::remove_complex<ValueType> reduction_factor{1e-8};
    std::shared_ptr<const gko::log::Convergence<ValueType>> logger =
        gko::log::Convergence<ValueType>::create();
    auto Ainv = solver::build()
                    .with_preconditioner(mg_factory)
                    .with_criteria(
                        gko::stop::Iteration::build().with_max_iters(num_iters),
                        gko::stop::ResidualNorm<ValueType>::build()
                            .with_reduction_factor(reduction_factor))
                    .on(exec)
                    ->generate(A);
    // Add the logger to the generated solver to log the iteration count and
    // residual norm.
    Ainv->add_logger(logger);

    // Take timings.
    comm.synchronize();
    ValueType t_solver_generate_end = gko::experimental::mpi::get_walltime();

    // Apply the distributed solver; this is the same as in the
    // non-distributed case.
    Ainv->apply(b, x);

    // Take timings.
    comm.synchronize();
    ValueType t_end = gko::experimental::mpi::get_walltime();

    // Get the residual.
    auto res_norm = gko::clone(exec->get_master(),
                               gko::as<vec>(logger->get_residual_norm()));

    // @sect3{Printing Results}
    // Print the achieved residual norm and timings on rank 0.
    if (comm.rank() == 0) {
        // clang-format off
        std::cout << "\nNum rows in matrix: " << num_rows
                  << "\nNum ranks: " << comm.size()
                  << "\nFinal Res norm: " << res_norm->at(0, 0)
                  << "\nIteration count: " << logger->get_num_iterations()
                  << "\nInit time: " << t_init_end - t_init
                  << "\nRead time: " << t_read_setup_end - t_init
                  << "\nSolver generate time: " << t_solver_generate_end - t_read_setup_end
                  << "\nSolver apply time: " << t_end - t_solver_generate_end
                  << "\nTotal time: " << t_end - t_init
                  << std::endl;
        // clang-format on
    }
}
1 change: 1 addition & 0 deletions
examples/distributed-multigrid-preconditioned-solver/doc/builds-on
@@ -0,0 +1 @@
distributed-solver
9 changes: 9 additions & 0 deletions
examples/distributed-multigrid-preconditioned-solver/doc/intro.dox
@@ -0,0 +1,9 @@
<a name="Intro"></a>
<h1>Introduction</h1>
This distributed multigrid preconditioned solver example should help you understand how to customize Ginkgo's multigrid solver in a distributed setting.
The example solves a simple 1D Laplace equation whose system can be distributed row-wise to multiple processes.
Note: because the stencil of the discretized Laplacian is configured with equal weights, the coarsening method does not perform well on this kind of problem.
To run the solver with multiple processes, use `mpirun -n NUM_PROCS ./distributed-multigrid-preconditioned-solver [executor] [num_grid_points] [num_iterations]`.

If you are using GPU devices, please make sure that you run this example with at most as many processes as you have GPU devices available.
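For reference, the system assembled in distributed-multigrid-preconditioned-solver.cpp is the standard 3-point stencil discretization of the 1D Laplacian with a sine right-hand side; a sketch of the global system with n = num_grid_points, following the -1/2/-1 entries and sin(0.01 i) values set in the code:

A =
\begin{pmatrix}
 2 & -1 &        &    \\
-1 &  2 & \ddots &    \\
   & \ddots & \ddots & -1 \\
   &        & -1     &  2
\end{pmatrix},
\qquad
b_i = \sin(0.01\, i), \quad i = 0, \dots, n - 1.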
@@ -0,0 +1 @@
distributed
18 changes: 18 additions & 0 deletions
examples/distributed-multigrid-preconditioned-solver/doc/results.dox
@@ -0,0 +1,18 @@
<h1>Results</h1>
This is the expected output for `mpirun -n 4 ./distributed-multigrid-preconditioned-solver`:

@code{.cpp}

Num rows in matrix: 100
Num ranks: 4
Final Res norm: 1.61045e-08
Iteration count: 18
Init time: 0.000117699
Read time: 0.000522518
Solver generate time: 0.000430548
Solver apply time: 0.00183804
Total time: 0.00279111

@endcode

The timings may vary depending on the machine.
1 change: 1 addition & 0 deletions
examples/distributed-multigrid-preconditioned-solver/doc/short-intro
@@ -0,0 +1 @@
The distributed multigrid-preconditioned solver example with customized components.
1 change: 1 addition & 0 deletions
examples/distributed-multigrid-preconditioned-solver/doc/tooltip
@@ -0,0 +1 @@
Solves a distributed linear system with a multigrid-preconditioned solver.