diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index d547ffe83cc..8f4c848005f 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -60,7 +60,12 @@ if(GINKGO_HAVE_PAPI_SDE)
 endif()
 
 if(GINKGO_BUILD_MPI)
-    list(APPEND EXAMPLES_LIST distributed-solver)
+    list(
+        APPEND
+        EXAMPLES_LIST
+        distributed-solver
+        distributed-multigrid-preconditioned-solver
+    )
 endif()
 
 find_package(Kokkos 4.1.00 QUIET)
diff --git a/examples/distributed-multigrid-preconditioned-solver/CMakeLists.txt b/examples/distributed-multigrid-preconditioned-solver/CMakeLists.txt
new file mode 100644
index 00000000000..1c81952c0bb
--- /dev/null
+++ b/examples/distributed-multigrid-preconditioned-solver/CMakeLists.txt
@@ -0,0 +1,16 @@
+cmake_minimum_required(VERSION 3.16)
+project(distributed-multigrid-preconditioned-solver)
+
+# We only need to find Ginkgo if we build this example stand-alone
+if(NOT GINKGO_BUILD_EXAMPLES)
+    find_package(Ginkgo 1.10.0 REQUIRED)
+endif()
+
+add_executable(
+    distributed-multigrid-preconditioned-solver
+    distributed-multigrid-preconditioned-solver.cpp
+)
+target_link_libraries(
+    distributed-multigrid-preconditioned-solver
+    Ginkgo::ginkgo
+)
diff --git a/examples/distributed-multigrid-preconditioned-solver/distributed-multigrid-preconditioned-solver.cpp b/examples/distributed-multigrid-preconditioned-solver/distributed-multigrid-preconditioned-solver.cpp
new file mode 100644
index 00000000000..658826f09f7
--- /dev/null
+++ b/examples/distributed-multigrid-preconditioned-solver/distributed-multigrid-preconditioned-solver.cpp
@@ -0,0 +1,268 @@
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+// @sect3{Include files}
+
+// This is the main ginkgo header file.
+#include <ginkgo/ginkgo.hpp>
+
+// Add the C++ iostream header to output information to the console.
+#include <iostream>
+// Add the STL map header for the executor selection.
+#include <map>
+// Add the string manipulation header to handle strings.
+#include <string>
+
+
+int main(int argc, char* argv[])
+{
+    // @sect3{Initialize the MPI environment}
+    // Since this is an MPI program, we need to initialize and finalize MPI at
+    // the beginning and end of our program, respectively. This can be easily
+    // done with the following helper construct that uses RAII to automate the
+    // initialization and finalization.
+    const gko::experimental::mpi::environment env(argc, argv);
+    // @sect3{Type Definitions}
+    // Define the needed types. In a parallel program we need to differentiate
+    // between global and local indices, thus we have two index types.
+    using GlobalIndexType = gko::int64;
+    using LocalIndexType = gko::int32;
+    // The underlying value type.
+    using ValueType = double;
+    // As vector type we use the following, which implements a subset of @ref
+    // gko::matrix::Dense.
+    using dist_vec = gko::experimental::distributed::Vector<ValueType>;
+    // As matrix type we simply use the following type, which can read
+    // distributed data and be applied to a distributed vector.
+    using dist_mtx =
+        gko::experimental::distributed::Matrix<ValueType, LocalIndexType,
+                                               GlobalIndexType>;
+    // We still need a localized vector type to be used as scalars in the
+    // advanced apply operations.
+    using vec = gko::matrix::Dense<ValueType>;
+    // The partition type describes how the rows of the matrices are
+    // distributed.
+    using part_type =
+        gko::experimental::distributed::Partition<LocalIndexType,
+                                                  GlobalIndexType>;
+    // We can use here the same solver type as you would use in a
+    // non-distributed program. Please note that not all solvers support
+    // distributed systems at the moment.
+    using solver = gko::solver::Cg<ValueType>;
+    // We use the Schwarz preconditioner to extend non-distributed
+    // preconditioners, like our Jacobi, to the distributed case. The Schwarz
+    // preconditioner wraps another preconditioner and applies it only to the
+    // local part of a distributed matrix. This will be used as our
+    // distributed multigrid smoother.
+    using schwarz = gko::experimental::distributed::preconditioner::Schwarz<
+        ValueType, LocalIndexType, GlobalIndexType>;
+    using bj = gko::preconditioner::Jacobi<ValueType, LocalIndexType>;
+    // Multigrid and Pgm can accept distributed matrices, so we still use the
+    // same types as in the non-distributed case.
+    using mg = gko::solver::Multigrid;
+    using pgm = gko::multigrid::Pgm<ValueType, LocalIndexType>;
+
+    // Create an MPI communicator and get the rank of the calling process.
+    const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
+    const auto rank = comm.rank();
+
+    // @sect3{User Input Handling}
+    // User input settings:
+    // - The executor, defaults to reference.
+    // - The number of grid points, defaults to 100.
+    // - The number of iterations, defaults to 1000.
+    if (argc == 2 && (std::string(argv[1]) == "--help")) {
+        if (rank == 0) {
+            std::cerr << "Usage: " << argv[0]
+                      << " [executor] [num_grid_points] [num_iterations] "
+                      << std::endl;
+        }
+        std::exit(-1);
+    }
+
+    ValueType t_init = gko::experimental::mpi::get_walltime();
+
+    const auto executor_string = argc >= 2 ? argv[1] : "reference";
+    const auto grid_dim =
+        static_cast<gko::size_type>(argc >= 3 ? std::atoi(argv[2]) : 100);
+    const auto num_iters =
+        static_cast<gko::size_type>(argc >= 4 ? std::atoi(argv[3]) : 1000);
+
+    const std::map<std::string,
+                   std::function<std::shared_ptr<gko::Executor>(MPI_Comm)>>
+        executor_factory_mpi{
+            {"reference",
+             [](MPI_Comm) { return gko::ReferenceExecutor::create(); }},
+            {"omp", [](MPI_Comm) { return gko::OmpExecutor::create(); }},
+            {"cuda",
+             [](MPI_Comm comm) {
+                 int device_id = gko::experimental::mpi::map_rank_to_device_id(
+                     comm, gko::CudaExecutor::get_num_devices());
+                 return gko::CudaExecutor::create(
+                     device_id, gko::ReferenceExecutor::create());
+             }},
+            {"hip",
+             [](MPI_Comm comm) {
+                 int device_id = gko::experimental::mpi::map_rank_to_device_id(
+                     comm, gko::HipExecutor::get_num_devices());
+                 return gko::HipExecutor::create(
+                     device_id, gko::ReferenceExecutor::create());
+             }},
+            {"dpcpp", [](MPI_Comm comm) {
+                 int device_id = 0;
+                 if (gko::DpcppExecutor::get_num_devices("gpu")) {
+                     device_id = gko::experimental::mpi::map_rank_to_device_id(
+                         comm, gko::DpcppExecutor::get_num_devices("gpu"));
+                 } else if (gko::DpcppExecutor::get_num_devices("cpu")) {
+                     device_id = gko::experimental::mpi::map_rank_to_device_id(
+                         comm, gko::DpcppExecutor::get_num_devices("cpu"));
+                 } else {
+                     throw std::runtime_error("No suitable DPC++ devices");
+                 }
+                 return gko::DpcppExecutor::create(
+                     device_id, gko::ReferenceExecutor::create());
+             }}};
+
+    auto exec = executor_factory_mpi.at(executor_string)(MPI_COMM_WORLD);
+
+    // @sect3{Creating the Distributed Matrix and Vectors}
+    // As a first step, we create a partition of the rows. The partition
+    // consists of ranges of consecutive rows which are assigned a part-id.
+    // These part-ids will be used for the distributed data structures to
+    // determine which rows will be stored locally. In this example each rank
+    // has (nearly) the same number of rows, so we can use the following
+    // specialized constructor. See @ref gko::distributed::Partition for other
+    // modes of creating a partition.
+    const auto num_rows = grid_dim;
+    auto partition = gko::share(part_type::build_from_global_size_uniform(
+        exec->get_master(), comm.size(),
+        static_cast<GlobalIndexType>(num_rows)));
+
+    // Assemble the matrix using a 3-pt stencil and fill the right-hand side
+    // with a sine value. The distributed matrix supports only constructing an
+    // empty matrix of zero size and filling in the values with
+    // gko::experimental::distributed::Matrix::read_distributed. Only the data
+    // that belongs to the rows owned by this rank will be assembled.
+    gko::matrix_data<ValueType, GlobalIndexType> A_data;
+    gko::matrix_data<ValueType, GlobalIndexType> b_data;
+    gko::matrix_data<ValueType, GlobalIndexType> x_data;
+    A_data.size = {num_rows, num_rows};
+    b_data.size = {num_rows, 1};
+    x_data.size = {num_rows, 1};
+    const auto range_start = partition->get_range_bounds()[rank];
+    const auto range_end = partition->get_range_bounds()[rank + 1];
+    for (int i = range_start; i < range_end; i++) {
+        if (i > 0) {
+            A_data.nonzeros.emplace_back(i, i - 1, -1);
+        }
+        A_data.nonzeros.emplace_back(i, i, 2);
+        if (i < grid_dim - 1) {
+            A_data.nonzeros.emplace_back(i, i + 1, -1);
+        }
+        b_data.nonzeros.emplace_back(i, 0, std::sin(i * 0.01));
+        x_data.nonzeros.emplace_back(i, 0, gko::zero<ValueType>());
+    }
+
+    // Take timings.
+    comm.synchronize();
+    ValueType t_init_end = gko::experimental::mpi::get_walltime();
+
+    // Read the matrix data; currently this is only supported on CPU
+    // executors. This will also set up the communication pattern needed for
+    // the distributed matrix-vector multiplication.
+    auto A_host = gko::share(dist_mtx::create(exec->get_master(), comm));
+    auto x_host = dist_vec::create(exec->get_master(), comm);
+    auto b_host = dist_vec::create(exec->get_master(), comm);
+    A_host->read_distributed(A_data, partition);
+    b_host->read_distributed(b_data, partition);
+    x_host->read_distributed(x_data, partition);
+    // After reading, the matrix and vectors can be moved to the chosen
+    // executor, since the distributed matrix also supports SpMV on devices.
+    auto A = gko::share(dist_mtx::create(exec, comm));
+    auto x = dist_vec::create(exec, comm);
+    auto b = dist_vec::create(exec, comm);
+    A->copy_from(A_host);
+    b->copy_from(b_host);
+    x->copy_from(x_host);
+
+    // Take timings.
+    comm.synchronize();
+    ValueType t_read_setup_end = gko::experimental::mpi::get_walltime();
+
+
+    // @sect3{Solve the Distributed System}
+    // Generate the solver.
+
+    // Set up the multigrid factory with a customized smoother and coarsest
+    // solver. Because BlockJacobi does not support distributed matrices, we
+    // need to wrap it in Schwarz.
+    auto schwarz_bj_factory =
+        gko::share(schwarz::build().with_local_solver(bj::build()).on(exec));
+    auto smoother_factory = gko::share(gko::solver::build_smoother(
+        schwarz_bj_factory, 2u, static_cast<ValueType>(0.9)));
+    // Cg supports distributed matrices, so we can use it as we did in the
+    // non-distributed case.
+    auto coarsest_factory = gko::share(
+        solver::build()
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(4u))
+            .on(exec));
+    // The multigrid preconditioner uses the Schwarz-wrapped Jacobi as the
+    // smoother and Cg as the coarsest solver.
+    auto mg_factory = gko::share(
+        mg::build()
+            .with_mg_level(pgm::build().with_deterministic(true))
+            .with_pre_smoother(smoother_factory)
+            .with_coarsest_solver(coarsest_factory)
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
+            .on(exec));
+
+    // Set up the stopping criterion and logger.
+    const gko::remove_complex<ValueType> reduction_factor{1e-8};
+    std::shared_ptr<const gko::log::Convergence<ValueType>> logger =
+        gko::log::Convergence<ValueType>::create();
+    auto Ainv = solver::build()
+                    .with_preconditioner(mg_factory)
+                    .with_criteria(
+                        gko::stop::Iteration::build().with_max_iters(num_iters),
+                        gko::stop::ResidualNorm<ValueType>::build()
+                            .with_reduction_factor(reduction_factor))
+                    .on(exec)
+                    ->generate(A);
+    // Add the logger to the generated solver to log the iteration count and
+    // residual norm.
+    Ainv->add_logger(logger);
+
+    // Take timings.
+    comm.synchronize();
+    ValueType t_solver_generate_end = gko::experimental::mpi::get_walltime();
+
+    // Apply the distributed solver; this is the same as in the
+    // non-distributed case.
+    Ainv->apply(b, x);
+
+    // Take timings.
+    comm.synchronize();
+    ValueType t_end = gko::experimental::mpi::get_walltime();
+
+    // Get the residual norm.
+    auto res_norm = gko::clone(exec->get_master(),
+                               gko::as<vec>(logger->get_residual_norm()));
+
+    // @sect3{Printing Results}
+    // Print the achieved residual norm and timings on rank 0.
+    if (comm.rank() == 0) {
+        // clang-format off
+        std::cout << "\nNum rows in matrix: " << num_rows
+                  << "\nNum ranks: " << comm.size()
+                  << "\nFinal Res norm: " << res_norm->at(0, 0)
+                  << "\nIteration count: " << logger->get_num_iterations()
+                  << "\nInit time: " << t_init_end - t_init
+                  << "\nRead time: " << t_read_setup_end - t_init
+                  << "\nSolver generate time: " << t_solver_generate_end - t_read_setup_end
+                  << "\nSolver apply time: " << t_end - t_solver_generate_end
+                  << "\nTotal time: " << t_end - t_init
+                  << std::endl;
+        // clang-format on
+    }
+}
diff --git a/examples/distributed-multigrid-preconditioned-solver/doc/builds-on b/examples/distributed-multigrid-preconditioned-solver/doc/builds-on
new file mode 100644
index 00000000000..f70ab1608ec
--- /dev/null
+++ b/examples/distributed-multigrid-preconditioned-solver/doc/builds-on
@@ -0,0 +1 @@
+distributed-solver
diff --git a/examples/distributed-multigrid-preconditioned-solver/doc/intro.dox b/examples/distributed-multigrid-preconditioned-solver/doc/intro.dox
new file mode 100644
index 00000000000..99304ca4851
--- /dev/null
+++ b/examples/distributed-multigrid-preconditioned-solver/doc/intro.dox
@@ -0,0 +1,9 @@
+<a name="Intro"></a>
+<h1>Introduction</h1>
+This distributed multigrid preconditioned solver example should help you understand how to customize Ginkgo's multigrid solver in a distributed setting.
+The example will solve a simple 1D Laplace equation where the system can be distributed row-wise to multiple processes.
+Note: because the stencil for the discretized Laplacian is configured with equal weights, the coarsening method does not perform well on this kind of problem.
+To run the solver with multiple processes, use `mpirun -n NUM_PROCS ./distributed-multigrid-preconditioned-solver [executor] [num_grid_points] [num_iterations]`.
+
+If you are using GPU devices, please make sure that you run this example with at most as many processes as you have GPU
+devices available.
diff --git a/examples/distributed-multigrid-preconditioned-solver/doc/kind b/examples/distributed-multigrid-preconditioned-solver/doc/kind
new file mode 100644
index 00000000000..196aa616342
--- /dev/null
+++ b/examples/distributed-multigrid-preconditioned-solver/doc/kind
@@ -0,0 +1 @@
+distributed
diff --git a/examples/distributed-multigrid-preconditioned-solver/doc/results.dox b/examples/distributed-multigrid-preconditioned-solver/doc/results.dox
new file mode 100644
index 00000000000..0189c4c91e6
--- /dev/null
+++ b/examples/distributed-multigrid-preconditioned-solver/doc/results.dox
@@ -0,0 +1,18 @@
+<h1>Results</h1>
+This is the expected output for `mpirun -n 4 ./distributed-multigrid-preconditioned-solver`:
+
+@code{.cpp}
+
+Num rows in matrix: 100
+Num ranks: 4
+Final Res norm: 1.61045e-08
+Iteration count: 18
+Init time: 0.000117699
+Read time: 0.000522518
+Solver generate time: 0.000430548
+Solver apply time: 0.00183804
+Total time: 0.00279111
+
+@endcode
+
+The timings may vary depending on the machine.
diff --git a/examples/distributed-multigrid-preconditioned-solver/doc/short-intro b/examples/distributed-multigrid-preconditioned-solver/doc/short-intro
new file mode 100644
index 00000000000..443031b3e39
--- /dev/null
+++ b/examples/distributed-multigrid-preconditioned-solver/doc/short-intro
@@ -0,0 +1 @@
+The distributed multigrid preconditioned solver with customized components example.
diff --git a/examples/distributed-multigrid-preconditioned-solver/doc/tooltip b/examples/distributed-multigrid-preconditioned-solver/doc/tooltip
new file mode 100644
index 00000000000..3e6cc291852
--- /dev/null
+++ b/examples/distributed-multigrid-preconditioned-solver/doc/tooltip
@@ -0,0 +1 @@
+Solves a distributed linear system with a multigrid preconditioner.
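The example fixes the smoother to two sweeps of a Schwarz-wrapped block-Jacobi and the coarsest-level solver to four Cg iterations. As a minimal sketch of the kind of further customization the example is meant to illustrate, the same factories can be tuned through other builder parameters; `with_max_block_size`, `with_max_levels`, and `with_min_coarse_rows` are existing Jacobi/Multigrid parameters, but the concrete values below are illustrative assumptions rather than part of the example, and the snippet reuses the type aliases, `exec`, and `coarsest_factory` defined above.

@code{.cpp}
// Sketch only: an alternative multigrid configuration with illustrative
// values, reusing bj, schwarz, mg, pgm, ValueType, exec, and coarsest_factory
// from the example above.
// Limit the local Jacobi blocks to at most 8 rows each.
auto tuned_bj = bj::build().with_max_block_size(8u);
// Wrap the block-Jacobi in Schwarz again and use three damped smoother sweeps.
auto tuned_smoother = gko::share(gko::solver::build_smoother(
    gko::share(schwarz::build().with_local_solver(tuned_bj).on(exec)), 3u,
    static_cast<ValueType>(0.8)));
// Bound the multigrid hierarchy explicitly; otherwise reuse the Pgm coarsening
// and the Cg coarsest solver from the example.
auto tuned_mg = gko::share(
    mg::build()
        .with_mg_level(pgm::build().with_deterministic(true))
        .with_pre_smoother(tuned_smoother)
        .with_coarsest_solver(coarsest_factory)
        .with_max_levels(10u)
        .with_min_coarse_rows(16u)
        .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
        .on(exec));
@endcode

Such a tuned factory would be passed to `with_preconditioner` in place of `mg_factory`; the rest of the solver setup stays unchanged.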