From 56fbc58470ccddb3861e807e9f801985f2682028 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Wed, 15 Jan 2025 13:41:33 +0100 Subject: [PATCH 1/5] add distributed mg example --- examples/CMakeLists.txt | 2 +- .../CMakeLists.txt | 10 + ...ibuted-multigrid-preconditioned-solver.cpp | 246 ++++++++++++++++++ .../doc/builds-on | 1 + .../doc/intro.dox | 9 + .../doc/kind | 1 + .../doc/results.dox | 18 ++ .../doc/short-intro | 1 + .../doc/tooltip | 1 + 9 files changed, 288 insertions(+), 1 deletion(-) create mode 100644 examples/distributed-multigrid-preconditioned-solver/CMakeLists.txt create mode 100644 examples/distributed-multigrid-preconditioned-solver/distributed-multigrid-preconditioned-solver.cpp create mode 100644 examples/distributed-multigrid-preconditioned-solver/doc/builds-on create mode 100644 examples/distributed-multigrid-preconditioned-solver/doc/intro.dox create mode 100644 examples/distributed-multigrid-preconditioned-solver/doc/kind create mode 100644 examples/distributed-multigrid-preconditioned-solver/doc/results.dox create mode 100644 examples/distributed-multigrid-preconditioned-solver/doc/short-intro create mode 100644 examples/distributed-multigrid-preconditioned-solver/doc/tooltip diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d547ffe83cc..d58b9a59356 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -60,7 +60,7 @@ if(GINKGO_HAVE_PAPI_SDE) endif() if(GINKGO_BUILD_MPI) - list(APPEND EXAMPLES_LIST distributed-solver) + list(APPEND EXAMPLES_LIST distributed-solver distributed-multigrid-preconditioned-solver) endif() find_package(Kokkos 4.1.00 QUIET) diff --git a/examples/distributed-multigrid-preconditioned-solver/CMakeLists.txt b/examples/distributed-multigrid-preconditioned-solver/CMakeLists.txt new file mode 100644 index 00000000000..0de5e4cd44a --- /dev/null +++ b/examples/distributed-multigrid-preconditioned-solver/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 3.16) +project(distributed-multigrid-preconditioned-solver) + +# We only need to find Ginkgo if we build this example stand-alone +if (NOT GINKGO_BUILD_EXAMPLES) + find_package(Ginkgo 1.10.0 REQUIRED) +endif() + +add_executable(distributed-multigrid-preconditioned-solver distributed-multigrid-preconditioned-solver.cpp) +target_link_libraries(distributed-multigrid-preconditioned-solver Ginkgo::ginkgo) diff --git a/examples/distributed-multigrid-preconditioned-solver/distributed-multigrid-preconditioned-solver.cpp b/examples/distributed-multigrid-preconditioned-solver/distributed-multigrid-preconditioned-solver.cpp new file mode 100644 index 00000000000..66f05ca911a --- /dev/null +++ b/examples/distributed-multigrid-preconditioned-solver/distributed-multigrid-preconditioned-solver.cpp @@ -0,0 +1,246 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +// @sect3{Include files} + +// This is the main ginkgo header file. +#include + +// Add the C++ iostream header to output information to the console. +#include +// Add the STL map header for the executor selection +#include +// Add the string manipulation header to handle strings. +#include + + +int main(int argc, char* argv[]) +{ + // @sect3{Initialize the MPI environment} + // Since this is an MPI program, we need to initialize and finalize + // MPI at the begin and end respectively of our program. This can be easily + // done with the following helper construct that uses RAII to automate the + // initialization and finalization. 
+ const gko::experimental::mpi::environment env(argc, argv); + // @sect3{Type Definitions} + // Define the needed types. In a parallel program we need to differentiate + // between global and local indices, thus we have two index types. + using GlobalIndexType = gko::int64; + using LocalIndexType = gko::int32; + // The underlying value type. + using ValueType = double; + // As vector type we use the following, which implements a subset of @ref + // gko::matrix::Dense. + using dist_vec = gko::experimental::distributed::Vector; + // As matrix type we simply use the following type, which can read + // distributed data and be applied to a distributed vector. + using dist_mtx = + gko::experimental::distributed::Matrix; + // We still need a localized vector type to be used as scalars in the + // advanced apply operations. + using vec = gko::matrix::Dense; + // The partition type describes how the rows of the matrices are + // distributed. + using part_type = + gko::experimental::distributed::Partition; + // We can use here the same solver type as you would use in a + // non-distributed program. Please note that not all solvers support + // distributed systems at the moment. + using solver = gko::solver::Cg; + using schwarz = gko::experimental::distributed::preconditioner::Schwarz< + ValueType, LocalIndexType, GlobalIndexType>; + using bj = gko::preconditioner::Jacobi; + using mg = gko::solver::Multigrid; + using pgm = gko::multigrid::Pgm; + + // Create an MPI communicator get the rank of the calling process. + const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + const auto rank = comm.rank(); + + // @sect3{User Input Handling} + // User input settings: + // - The executor, defaults to reference. + // - The number of grid points, defaults to 100. + // - The number of iterations, defaults to 1000. + if (argc == 2 && (std::string(argv[1]) == "--help")) { + if (rank == 0) { + std::cerr << "Usage: " << argv[0] + << " [executor] [num_grid_points] [num_iterations] " + << std::endl; + } + std::exit(-1); + } + + ValueType t_init = gko::experimental::mpi::get_walltime(); + + const auto executor_string = argc >= 2 ? argv[1] : "reference"; + const auto grid_dim = + static_cast(argc >= 3 ? std::atoi(argv[2]) : 100); + const auto num_iters = + static_cast(argc >= 4 ? 
std::atoi(argv[3]) : 1000); + + const std::map(MPI_Comm)>> + executor_factory_mpi{ + {"reference", + [](MPI_Comm) { return gko::ReferenceExecutor::create(); }}, + {"omp", [](MPI_Comm) { return gko::OmpExecutor::create(); }}, + {"cuda", + [](MPI_Comm comm) { + int device_id = gko::experimental::mpi::map_rank_to_device_id( + comm, gko::CudaExecutor::get_num_devices()); + return gko::CudaExecutor::create( + device_id, gko::ReferenceExecutor::create()); + }}, + {"hip", + [](MPI_Comm comm) { + int device_id = gko::experimental::mpi::map_rank_to_device_id( + comm, gko::HipExecutor::get_num_devices()); + return gko::HipExecutor::create( + device_id, gko::ReferenceExecutor::create()); + }}, + {"dpcpp", [](MPI_Comm comm) { + int device_id = 0; + if (gko::DpcppExecutor::get_num_devices("gpu")) { + device_id = gko::experimental::mpi::map_rank_to_device_id( + comm, gko::DpcppExecutor::get_num_devices("gpu")); + } else if (gko::DpcppExecutor::get_num_devices("cpu")) { + device_id = gko::experimental::mpi::map_rank_to_device_id( + comm, gko::DpcppExecutor::get_num_devices("cpu")); + } else { + throw std::runtime_error("No suitable DPC++ devices"); + } + return gko::DpcppExecutor::create( + device_id, gko::ReferenceExecutor::create()); + }}}; + + auto exec = executor_factory_mpi.at(executor_string)(MPI_COMM_WORLD); + + // @sect3{Creating the Distributed Matrix and Vectors} + // As a first step, we create a partition of the rows. The partition + // consists of ranges of consecutive rows which are assigned a part-id. + // These part-ids will be used for the distributed data structures to + // determine which rows will be stored locally. In this example each rank + // has (nearly) the same number of rows, so we can use the following + // specialized constructor. See @ref gko::distributed::Partition for other + // modes of creating a partition. + const auto num_rows = grid_dim; + auto partition = gko::share(part_type::build_from_global_size_uniform( + exec->get_master(), comm.size(), + static_cast(num_rows))); + + // Assemble the matrix using a 3-pt stencil and fill the right-hand-side + // with a sine value. The distributed matrix supports only constructing an + // empty matrix of zero size and filling in the values with + // gko::experimental::distributed::Matrix::read_distributed. Only the data + // that belongs to the rows by this rank will be assembled. + gko::matrix_data A_data; + gko::matrix_data b_data; + gko::matrix_data x_data; + A_data.size = {num_rows, num_rows}; + b_data.size = {num_rows, 1}; + x_data.size = {num_rows, 1}; + const auto range_start = partition->get_range_bounds()[rank]; + const auto range_end = partition->get_range_bounds()[rank + 1]; + for (int i = range_start; i < range_end; i++) { + if (i > 0) { + A_data.nonzeros.emplace_back(i, i - 1, -1); + } + A_data.nonzeros.emplace_back(i, i, 2); + if (i < grid_dim - 1) { + A_data.nonzeros.emplace_back(i, i + 1, -1); + } + b_data.nonzeros.emplace_back(i, 0, std::sin(i * 0.01)); + x_data.nonzeros.emplace_back(i, 0, gko::zero()); + } + + // Take timings. + comm.synchronize(); + ValueType t_init_end = gko::experimental::mpi::get_walltime(); + + // Read the matrix data, currently this is only supported on CPU executors. + // This will also set up the communication pattern needed for the + // distributed matrix-vector multiplication. 
+ auto A_host = gko::share(dist_mtx::create(exec->get_master(), comm)); + auto x_host = dist_vec::create(exec->get_master(), comm); + auto b_host = dist_vec::create(exec->get_master(), comm); + A_host->read_distributed(A_data, partition); + b_host->read_distributed(b_data, partition); + x_host->read_distributed(x_data, partition); + // After reading, the matrix and vector can be moved to the chosen executor, + // since the distributed matrix supports SpMV also on devices. + auto A = gko::share(dist_mtx::create(exec, comm)); + auto x = dist_vec::create(exec, comm); + auto b = dist_vec::create(exec, comm); + A->copy_from(A_host); + b->copy_from(b_host); + x->copy_from(x_host); + + // Take timings. + comm.synchronize(); + ValueType t_read_setup_end = gko::experimental::mpi::get_walltime(); + + + // @sect3{Solve the Distributed System} + // Generate the solver + + // Setup the multigrid factory with default setting + // It uses Schwarz Jacobi as smoother and GMRES as coarse solver + auto mg_factory = gko::share( + mg::build() + .with_mg_level(pgm::build().with_deterministic(true)) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) + .on(exec)); + + // Setup the stopping criterion and logger + const gko::remove_complex reduction_factor{1e-8}; + std::shared_ptr> logger = + gko::log::Convergence::create(); + auto Ainv = solver::build() + .with_preconditioner(mg_factory) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(num_iters), + gko::stop::ResidualNorm::build() + .with_reduction_factor(reduction_factor)) + .on(exec) + ->generate(A); + // Add logger to the generated solver to log the iteration count and + // residual norm + Ainv->add_logger(logger); + + // Take timings. + comm.synchronize(); + ValueType t_solver_generate_end = gko::experimental::mpi::get_walltime(); + + // Apply the distributed solver, this is the same as in the non-distributed + // case. + Ainv->apply(b, x); + + // Take timings. + comm.synchronize(); + ValueType t_end = gko::experimental::mpi::get_walltime(); + + // Get the residual. + auto res_norm = gko::clone(exec->get_master(), + gko::as(logger->get_residual_norm())); + + // @sect3{Printing Results} + // Print the achieved residual norm and timings on rank 0. + if (comm.rank() == 0) { + // clang-format off + std::cout << "\nNum rows in matrix: " << num_rows + << "\nNum ranks: " << comm.size() + << "\nFinal Res norm: " << res_norm->at(0, 0) + << "\nIteration count: " << logger->get_num_iterations() + << "\nInit time: " << t_init_end - t_init + << "\nRead time: " << t_read_setup_end - t_init + << "\nSolver generate time: " << t_solver_generate_end - t_read_setup_end + << "\nSolver apply time: " << t_end - t_solver_generate_end + << "\nTotal time: " << t_end - t_init + << std::endl; + // clang-format on + } +} diff --git a/examples/distributed-multigrid-preconditioned-solver/doc/builds-on b/examples/distributed-multigrid-preconditioned-solver/doc/builds-on new file mode 100644 index 00000000000..f70ab1608ec --- /dev/null +++ b/examples/distributed-multigrid-preconditioned-solver/doc/builds-on @@ -0,0 +1 @@ +distributed-solver diff --git a/examples/distributed-multigrid-preconditioned-solver/doc/intro.dox b/examples/distributed-multigrid-preconditioned-solver/doc/intro.dox new file mode 100644 index 00000000000..79cfef82425 --- /dev/null +++ b/examples/distributed-multigrid-preconditioned-solver/doc/intro.dox @@ -0,0 +1,9 @@ + +

<h1>Introduction</h1>
+This distributed multigrid preconditioned solver example should help you understand using Ginkgo multigrid in a distributed setting. +The example will solve a simple 1D Laplace equation where the system can be distributed row-wise to multiple processes. +Note. Because the stencil is configured equal weighted, the coarsening method does not perform well on this kind of problem. +To run the solver with multiple processes, use `mpirun -n NUM_PROCS ./distributed-solver [executor] [num_grid_points] [num_iterations]`. + +If you are using GPU devices, please make sure that you run this example with at most as many processes as you have GPU +devices available. diff --git a/examples/distributed-multigrid-preconditioned-solver/doc/kind b/examples/distributed-multigrid-preconditioned-solver/doc/kind new file mode 100644 index 00000000000..196aa616342 --- /dev/null +++ b/examples/distributed-multigrid-preconditioned-solver/doc/kind @@ -0,0 +1 @@ +distributed diff --git a/examples/distributed-multigrid-preconditioned-solver/doc/results.dox b/examples/distributed-multigrid-preconditioned-solver/doc/results.dox new file mode 100644 index 00000000000..2a263e8396b --- /dev/null +++ b/examples/distributed-multigrid-preconditioned-solver/doc/results.dox @@ -0,0 +1,18 @@ +

<h1>Results</h1>
+This is the expected output for `mpirun -n 4 ./distributed-multigrid-preconditioned-solver`: + +@code{.cpp} + +Num rows in matrix: 100 +Num ranks: 4 +Final Res norm: 1.87487e-08 +Iteration count: 23 +Init time: 0.000153159 +Read time: 0.000563957 +Solver generate time: 0.000484838 +Solver apply time: 0.073985 +Total time: 0.0750338 + +@endcode + +The timings may vary depending on the machine. diff --git a/examples/distributed-multigrid-preconditioned-solver/doc/short-intro b/examples/distributed-multigrid-preconditioned-solver/doc/short-intro new file mode 100644 index 00000000000..bf52c346411 --- /dev/null +++ b/examples/distributed-multigrid-preconditioned-solver/doc/short-intro @@ -0,0 +1 @@ +The distributed multigrid preconditioned solver example. diff --git a/examples/distributed-multigrid-preconditioned-solver/doc/tooltip b/examples/distributed-multigrid-preconditioned-solver/doc/tooltip new file mode 100644 index 00000000000..3e6cc291852 --- /dev/null +++ b/examples/distributed-multigrid-preconditioned-solver/doc/tooltip @@ -0,0 +1 @@ +Solves a distributed linear system. From ef51441d9ad74fbc1b07696fd377dd9913aedf23 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Wed, 15 Jan 2025 13:42:29 +0100 Subject: [PATCH 2/5] add the customized distributed mg --- examples/CMakeLists.txt | 2 +- .../CMakeLists.txt | 10 + ...igrid-preconditioned-solver-customized.cpp | 260 ++++++++++++++++++ .../doc/builds-on | 1 + .../doc/intro.dox | 9 + .../doc/kind | 1 + .../doc/results.dox | 18 ++ .../doc/short-intro | 1 + .../doc/tooltip | 1 + 9 files changed, 302 insertions(+), 1 deletion(-) create mode 100644 examples/distributed-multigrid-preconditioned-solver-customized/CMakeLists.txt create mode 100644 examples/distributed-multigrid-preconditioned-solver-customized/distributed-multigrid-preconditioned-solver-customized.cpp create mode 100644 examples/distributed-multigrid-preconditioned-solver-customized/doc/builds-on create mode 100644 examples/distributed-multigrid-preconditioned-solver-customized/doc/intro.dox create mode 100644 examples/distributed-multigrid-preconditioned-solver-customized/doc/kind create mode 100644 examples/distributed-multigrid-preconditioned-solver-customized/doc/results.dox create mode 100644 examples/distributed-multigrid-preconditioned-solver-customized/doc/short-intro create mode 100644 examples/distributed-multigrid-preconditioned-solver-customized/doc/tooltip diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d58b9a59356..f891426cbf7 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -60,7 +60,7 @@ if(GINKGO_HAVE_PAPI_SDE) endif() if(GINKGO_BUILD_MPI) - list(APPEND EXAMPLES_LIST distributed-solver distributed-multigrid-preconditioned-solver) + list(APPEND EXAMPLES_LIST distributed-solver distributed-multigrid-preconditioned-solver distributed-multigrid-preconditioned-solver-customized) endif() find_package(Kokkos 4.1.00 QUIET) diff --git a/examples/distributed-multigrid-preconditioned-solver-customized/CMakeLists.txt b/examples/distributed-multigrid-preconditioned-solver-customized/CMakeLists.txt new file mode 100644 index 00000000000..b4b06f119a4 --- /dev/null +++ b/examples/distributed-multigrid-preconditioned-solver-customized/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 3.16) +project(distributed-multigrid-preconditioned-solver-customized) + +# We only need to find Ginkgo if we build this example stand-alone +if (NOT GINKGO_BUILD_EXAMPLES) + find_package(Ginkgo 1.10.0 REQUIRED) +endif() + 
+add_executable(distributed-multigrid-preconditioned-solver-customized distributed-multigrid-preconditioned-solver-customized.cpp) +target_link_libraries(distributed-multigrid-preconditioned-solver-customized Ginkgo::ginkgo) diff --git a/examples/distributed-multigrid-preconditioned-solver-customized/distributed-multigrid-preconditioned-solver-customized.cpp b/examples/distributed-multigrid-preconditioned-solver-customized/distributed-multigrid-preconditioned-solver-customized.cpp new file mode 100644 index 00000000000..757e99adde0 --- /dev/null +++ b/examples/distributed-multigrid-preconditioned-solver-customized/distributed-multigrid-preconditioned-solver-customized.cpp @@ -0,0 +1,260 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +// @sect3{Include files} + +// This is the main ginkgo header file. +#include + +// Add the C++ iostream header to output information to the console. +#include +// Add the STL map header for the executor selection +#include +// Add the string manipulation header to handle strings. +#include + + +int main(int argc, char* argv[]) +{ + // @sect3{Initialize the MPI environment} + // Since this is an MPI program, we need to initialize and finalize + // MPI at the begin and end respectively of our program. This can be easily + // done with the following helper construct that uses RAII to automate the + // initialization and finalization. + const gko::experimental::mpi::environment env(argc, argv); + // @sect3{Type Definitions} + // Define the needed types. In a parallel program we need to differentiate + // between global and local indices, thus we have two index types. + using GlobalIndexType = gko::int64; + using LocalIndexType = gko::int32; + // The underlying value type. + using ValueType = double; + // As vector type we use the following, which implements a subset of @ref + // gko::matrix::Dense. + using dist_vec = gko::experimental::distributed::Vector; + // As matrix type we simply use the following type, which can read + // distributed data and be applied to a distributed vector. + using dist_mtx = + gko::experimental::distributed::Matrix; + // We still need a localized vector type to be used as scalars in the + // advanced apply operations. + using vec = gko::matrix::Dense; + // The partition type describes how the rows of the matrices are + // distributed. + using part_type = + gko::experimental::distributed::Partition; + // We can use here the same solver type as you would use in a + // non-distributed program. Please note that not all solvers support + // distributed systems at the moment. + using solver = gko::solver::Cg; + using schwarz = gko::experimental::distributed::preconditioner::Schwarz< + ValueType, LocalIndexType, GlobalIndexType>; + using bj = gko::preconditioner::Jacobi; + using mg = gko::solver::Multigrid; + using pgm = gko::multigrid::Pgm; + + // Create an MPI communicator get the rank of the calling process. + const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); + const auto rank = comm.rank(); + + // @sect3{User Input Handling} + // User input settings: + // - The executor, defaults to reference. + // - The number of grid points, defaults to 100. + // - The number of iterations, defaults to 1000. 
+ if (argc == 2 && (std::string(argv[1]) == "--help")) { + if (rank == 0) { + std::cerr << "Usage: " << argv[0] + << " [executor] [num_grid_points] [num_iterations] " + << std::endl; + } + std::exit(-1); + } + + ValueType t_init = gko::experimental::mpi::get_walltime(); + + const auto executor_string = argc >= 2 ? argv[1] : "reference"; + const auto grid_dim = + static_cast(argc >= 3 ? std::atoi(argv[2]) : 100); + const auto num_iters = + static_cast(argc >= 4 ? std::atoi(argv[3]) : 1000); + + const std::map(MPI_Comm)>> + executor_factory_mpi{ + {"reference", + [](MPI_Comm) { return gko::ReferenceExecutor::create(); }}, + {"omp", [](MPI_Comm) { return gko::OmpExecutor::create(); }}, + {"cuda", + [](MPI_Comm comm) { + int device_id = gko::experimental::mpi::map_rank_to_device_id( + comm, gko::CudaExecutor::get_num_devices()); + return gko::CudaExecutor::create( + device_id, gko::ReferenceExecutor::create()); + }}, + {"hip", + [](MPI_Comm comm) { + int device_id = gko::experimental::mpi::map_rank_to_device_id( + comm, gko::HipExecutor::get_num_devices()); + return gko::HipExecutor::create( + device_id, gko::ReferenceExecutor::create()); + }}, + {"dpcpp", [](MPI_Comm comm) { + int device_id = 0; + if (gko::DpcppExecutor::get_num_devices("gpu")) { + device_id = gko::experimental::mpi::map_rank_to_device_id( + comm, gko::DpcppExecutor::get_num_devices("gpu")); + } else if (gko::DpcppExecutor::get_num_devices("cpu")) { + device_id = gko::experimental::mpi::map_rank_to_device_id( + comm, gko::DpcppExecutor::get_num_devices("cpu")); + } else { + throw std::runtime_error("No suitable DPC++ devices"); + } + return gko::DpcppExecutor::create( + device_id, gko::ReferenceExecutor::create()); + }}}; + + auto exec = executor_factory_mpi.at(executor_string)(MPI_COMM_WORLD); + + // @sect3{Creating the Distributed Matrix and Vectors} + // As a first step, we create a partition of the rows. The partition + // consists of ranges of consecutive rows which are assigned a part-id. + // These part-ids will be used for the distributed data structures to + // determine which rows will be stored locally. In this example each rank + // has (nearly) the same number of rows, so we can use the following + // specialized constructor. See @ref gko::distributed::Partition for other + // modes of creating a partition. + const auto num_rows = grid_dim; + auto partition = gko::share(part_type::build_from_global_size_uniform( + exec->get_master(), comm.size(), + static_cast(num_rows))); + + // Assemble the matrix using a 3-pt stencil and fill the right-hand-side + // with a sine value. The distributed matrix supports only constructing an + // empty matrix of zero size and filling in the values with + // gko::experimental::distributed::Matrix::read_distributed. Only the data + // that belongs to the rows by this rank will be assembled. + gko::matrix_data A_data; + gko::matrix_data b_data; + gko::matrix_data x_data; + A_data.size = {num_rows, num_rows}; + b_data.size = {num_rows, 1}; + x_data.size = {num_rows, 1}; + const auto range_start = partition->get_range_bounds()[rank]; + const auto range_end = partition->get_range_bounds()[rank + 1]; + for (int i = range_start; i < range_end; i++) { + if (i > 0) { + A_data.nonzeros.emplace_back(i, i - 1, -1); + } + A_data.nonzeros.emplace_back(i, i, 2); + if (i < grid_dim - 1) { + A_data.nonzeros.emplace_back(i, i + 1, -1); + } + b_data.nonzeros.emplace_back(i, 0, std::sin(i * 0.01)); + x_data.nonzeros.emplace_back(i, 0, gko::zero()); + } + + // Take timings. 
+ comm.synchronize(); + ValueType t_init_end = gko::experimental::mpi::get_walltime(); + + // Read the matrix data, currently this is only supported on CPU executors. + // This will also set up the communication pattern needed for the + // distributed matrix-vector multiplication. + auto A_host = gko::share(dist_mtx::create(exec->get_master(), comm)); + auto x_host = dist_vec::create(exec->get_master(), comm); + auto b_host = dist_vec::create(exec->get_master(), comm); + A_host->read_distributed(A_data, partition); + b_host->read_distributed(b_data, partition); + x_host->read_distributed(x_data, partition); + // After reading, the matrix and vector can be moved to the chosen executor, + // since the distributed matrix supports SpMV also on devices. + auto A = gko::share(dist_mtx::create(exec, comm)); + auto x = dist_vec::create(exec, comm); + auto b = dist_vec::create(exec, comm); + A->copy_from(A_host); + b->copy_from(b_host); + x->copy_from(x_host); + + // Take timings. + comm.synchronize(); + ValueType t_read_setup_end = gko::experimental::mpi::get_walltime(); + + + // @sect3{Solve the Distributed System} + // Generate the solver + + // Setup the multigrid factory with customized smoother and coarse solver + // Because BlockJacobi does not support distributed matrix, we need wrap it + // in Schwarz. + auto schwarz_bj_factory = + gko::share(schwarz::build().with_local_solver(bj::build()).on(exec)); + auto smoother_factory = gko::share(gko::solver::build_smoother( + schwarz_bj_factory, 2u, static_cast(0.9))); + // Cg supports distributed matrix, so we can use it as we did in + // non-distributed case + auto coarsest_factory = gko::share( + solver::build() + .with_criteria(gko::stop::Iteration::build().with_max_iters(4u)) + .on(exec)); + // It uses Schwarz Jacobi as smoother and GMRES as coarse solver + auto mg_factory = gko::share( + mg::build() + .with_mg_level(pgm::build().with_deterministic(true)) + .with_pre_smoother(smoother_factory) + .with_coarsest_solver(coarsest_factory) + .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) + .on(exec)); + + // Setup the stopping criterion and logger + const gko::remove_complex reduction_factor{1e-8}; + std::shared_ptr> logger = + gko::log::Convergence::create(); + auto Ainv = solver::build() + .with_preconditioner(mg_factory) + .with_criteria( + gko::stop::Iteration::build().with_max_iters(num_iters), + gko::stop::ResidualNorm::build() + .with_reduction_factor(reduction_factor)) + .on(exec) + ->generate(A); + // Add logger to the generated solver to log the iteration count and + // residual norm + Ainv->add_logger(logger); + + // Take timings. + comm.synchronize(); + ValueType t_solver_generate_end = gko::experimental::mpi::get_walltime(); + + // Apply the distributed solver, this is the same as in the non-distributed + // case. + Ainv->apply(b, x); + + // Take timings. + comm.synchronize(); + ValueType t_end = gko::experimental::mpi::get_walltime(); + + // Get the residual. + auto res_norm = gko::clone(exec->get_master(), + gko::as(logger->get_residual_norm())); + + // @sect3{Printing Results} + // Print the achieved residual norm and timings on rank 0. 
+ if (comm.rank() == 0) { + // clang-format off + std::cout << "\nNum rows in matrix: " << num_rows + << "\nNum ranks: " << comm.size() + << "\nFinal Res norm: " << res_norm->at(0, 0) + << "\nIteration count: " << logger->get_num_iterations() + << "\nInit time: " << t_init_end - t_init + << "\nRead time: " << t_read_setup_end - t_init + << "\nSolver generate time: " << t_solver_generate_end - t_read_setup_end + << "\nSolver apply time: " << t_end - t_solver_generate_end + << "\nTotal time: " << t_end - t_init + << std::endl; + // clang-format on + } +} diff --git a/examples/distributed-multigrid-preconditioned-solver-customized/doc/builds-on b/examples/distributed-multigrid-preconditioned-solver-customized/doc/builds-on new file mode 100644 index 00000000000..e4cd339e18d --- /dev/null +++ b/examples/distributed-multigrid-preconditioned-solver-customized/doc/builds-on @@ -0,0 +1 @@ +distributed-multigrid-preconditioned-solver diff --git a/examples/distributed-multigrid-preconditioned-solver-customized/doc/intro.dox b/examples/distributed-multigrid-preconditioned-solver-customized/doc/intro.dox new file mode 100644 index 00000000000..17c45bc7403 --- /dev/null +++ b/examples/distributed-multigrid-preconditioned-solver-customized/doc/intro.dox @@ -0,0 +1,9 @@ + +

<h1>Introduction</h1>
+This distributed multigrid preconditioned solver example should help you understand customizing Ginkgo multigrid in a distributed setting. +The example will solve a simple 1D Laplace equation where the system can be distributed row-wise to multiple processes. +Note. Because the stencil is configured equal weighted, the coarsening method does not perform well on this kind of problem. +To run the solver with multiple processes, use `mpirun -n NUM_PROCS ./distributed-solver [executor] [num_grid_points] [num_iterations]`. + +If you are using GPU devices, please make sure that you run this example with at most as many processes as you have GPU +devices available. diff --git a/examples/distributed-multigrid-preconditioned-solver-customized/doc/kind b/examples/distributed-multigrid-preconditioned-solver-customized/doc/kind new file mode 100644 index 00000000000..196aa616342 --- /dev/null +++ b/examples/distributed-multigrid-preconditioned-solver-customized/doc/kind @@ -0,0 +1 @@ +distributed diff --git a/examples/distributed-multigrid-preconditioned-solver-customized/doc/results.dox b/examples/distributed-multigrid-preconditioned-solver-customized/doc/results.dox new file mode 100644 index 00000000000..1a54ada9ae1 --- /dev/null +++ b/examples/distributed-multigrid-preconditioned-solver-customized/doc/results.dox @@ -0,0 +1,18 @@ +

<h1>Results</h1>
+This is the expected output for `mpirun -n 4 ./distributed-multigrid-preconditioned-solver-customized`: + +@code{.cpp} + +Num rows in matrix: 100 +Num ranks: 4 +Final Res norm: 1.61045e-08 +Iteration count: 18 +Init time: 0.000117699 +Read time: 0.000522518 +Solver generate time: 0.000430548 +Solver apply time: 0.00183804 +Total time: 0.00279111 + +@endcode + +The timings may vary depending on the machine. diff --git a/examples/distributed-multigrid-preconditioned-solver-customized/doc/short-intro b/examples/distributed-multigrid-preconditioned-solver-customized/doc/short-intro new file mode 100644 index 00000000000..443031b3e39 --- /dev/null +++ b/examples/distributed-multigrid-preconditioned-solver-customized/doc/short-intro @@ -0,0 +1 @@ +The distributed multigrid preconditioned solver with customized components example. diff --git a/examples/distributed-multigrid-preconditioned-solver-customized/doc/tooltip b/examples/distributed-multigrid-preconditioned-solver-customized/doc/tooltip new file mode 100644 index 00000000000..3e6cc291852 --- /dev/null +++ b/examples/distributed-multigrid-preconditioned-solver-customized/doc/tooltip @@ -0,0 +1 @@ +Solves a distributed linear system. From 5d9d5798ff9c863efe9069fe2cb5392aef05adfb Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Tue, 21 Jan 2025 10:23:19 +0100 Subject: [PATCH 3/5] only keep the distributed mg customized example --- examples/CMakeLists.txt | 2 +- .../doc/builds-on | 2 +- .../CMakeLists.txt | 10 - ...ibuted-multigrid-preconditioned-solver.cpp | 246 ------------------ .../doc/builds-on | 1 - .../doc/intro.dox | 9 - .../doc/kind | 1 - .../doc/results.dox | 18 -- .../doc/short-intro | 1 - .../doc/tooltip | 1 - 10 files changed, 2 insertions(+), 289 deletions(-) delete mode 100644 examples/distributed-multigrid-preconditioned-solver/CMakeLists.txt delete mode 100644 examples/distributed-multigrid-preconditioned-solver/distributed-multigrid-preconditioned-solver.cpp delete mode 100644 examples/distributed-multigrid-preconditioned-solver/doc/builds-on delete mode 100644 examples/distributed-multigrid-preconditioned-solver/doc/intro.dox delete mode 100644 examples/distributed-multigrid-preconditioned-solver/doc/kind delete mode 100644 examples/distributed-multigrid-preconditioned-solver/doc/results.dox delete mode 100644 examples/distributed-multigrid-preconditioned-solver/doc/short-intro delete mode 100644 examples/distributed-multigrid-preconditioned-solver/doc/tooltip diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index f891426cbf7..e76b1c308fb 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -60,7 +60,7 @@ if(GINKGO_HAVE_PAPI_SDE) endif() if(GINKGO_BUILD_MPI) - list(APPEND EXAMPLES_LIST distributed-solver distributed-multigrid-preconditioned-solver distributed-multigrid-preconditioned-solver-customized) + list(APPEND EXAMPLES_LIST distributed-solver distributed-multigrid-preconditioned-solver-customized) endif() find_package(Kokkos 4.1.00 QUIET) diff --git a/examples/distributed-multigrid-preconditioned-solver-customized/doc/builds-on b/examples/distributed-multigrid-preconditioned-solver-customized/doc/builds-on index e4cd339e18d..f70ab1608ec 100644 --- a/examples/distributed-multigrid-preconditioned-solver-customized/doc/builds-on +++ b/examples/distributed-multigrid-preconditioned-solver-customized/doc/builds-on @@ -1 +1 @@ -distributed-multigrid-preconditioned-solver +distributed-solver diff --git a/examples/distributed-multigrid-preconditioned-solver/CMakeLists.txt 
b/examples/distributed-multigrid-preconditioned-solver/CMakeLists.txt deleted file mode 100644 index 0de5e4cd44a..00000000000 --- a/examples/distributed-multigrid-preconditioned-solver/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -cmake_minimum_required(VERSION 3.16) -project(distributed-multigrid-preconditioned-solver) - -# We only need to find Ginkgo if we build this example stand-alone -if (NOT GINKGO_BUILD_EXAMPLES) - find_package(Ginkgo 1.10.0 REQUIRED) -endif() - -add_executable(distributed-multigrid-preconditioned-solver distributed-multigrid-preconditioned-solver.cpp) -target_link_libraries(distributed-multigrid-preconditioned-solver Ginkgo::ginkgo) diff --git a/examples/distributed-multigrid-preconditioned-solver/distributed-multigrid-preconditioned-solver.cpp b/examples/distributed-multigrid-preconditioned-solver/distributed-multigrid-preconditioned-solver.cpp deleted file mode 100644 index 66f05ca911a..00000000000 --- a/examples/distributed-multigrid-preconditioned-solver/distributed-multigrid-preconditioned-solver.cpp +++ /dev/null @@ -1,246 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -// @sect3{Include files} - -// This is the main ginkgo header file. -#include - -// Add the C++ iostream header to output information to the console. -#include -// Add the STL map header for the executor selection -#include -// Add the string manipulation header to handle strings. -#include - - -int main(int argc, char* argv[]) -{ - // @sect3{Initialize the MPI environment} - // Since this is an MPI program, we need to initialize and finalize - // MPI at the begin and end respectively of our program. This can be easily - // done with the following helper construct that uses RAII to automate the - // initialization and finalization. - const gko::experimental::mpi::environment env(argc, argv); - // @sect3{Type Definitions} - // Define the needed types. In a parallel program we need to differentiate - // between global and local indices, thus we have two index types. - using GlobalIndexType = gko::int64; - using LocalIndexType = gko::int32; - // The underlying value type. - using ValueType = double; - // As vector type we use the following, which implements a subset of @ref - // gko::matrix::Dense. - using dist_vec = gko::experimental::distributed::Vector; - // As matrix type we simply use the following type, which can read - // distributed data and be applied to a distributed vector. - using dist_mtx = - gko::experimental::distributed::Matrix; - // We still need a localized vector type to be used as scalars in the - // advanced apply operations. - using vec = gko::matrix::Dense; - // The partition type describes how the rows of the matrices are - // distributed. - using part_type = - gko::experimental::distributed::Partition; - // We can use here the same solver type as you would use in a - // non-distributed program. Please note that not all solvers support - // distributed systems at the moment. - using solver = gko::solver::Cg; - using schwarz = gko::experimental::distributed::preconditioner::Schwarz< - ValueType, LocalIndexType, GlobalIndexType>; - using bj = gko::preconditioner::Jacobi; - using mg = gko::solver::Multigrid; - using pgm = gko::multigrid::Pgm; - - // Create an MPI communicator get the rank of the calling process. 
- const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - const auto rank = comm.rank(); - - // @sect3{User Input Handling} - // User input settings: - // - The executor, defaults to reference. - // - The number of grid points, defaults to 100. - // - The number of iterations, defaults to 1000. - if (argc == 2 && (std::string(argv[1]) == "--help")) { - if (rank == 0) { - std::cerr << "Usage: " << argv[0] - << " [executor] [num_grid_points] [num_iterations] " - << std::endl; - } - std::exit(-1); - } - - ValueType t_init = gko::experimental::mpi::get_walltime(); - - const auto executor_string = argc >= 2 ? argv[1] : "reference"; - const auto grid_dim = - static_cast(argc >= 3 ? std::atoi(argv[2]) : 100); - const auto num_iters = - static_cast(argc >= 4 ? std::atoi(argv[3]) : 1000); - - const std::map(MPI_Comm)>> - executor_factory_mpi{ - {"reference", - [](MPI_Comm) { return gko::ReferenceExecutor::create(); }}, - {"omp", [](MPI_Comm) { return gko::OmpExecutor::create(); }}, - {"cuda", - [](MPI_Comm comm) { - int device_id = gko::experimental::mpi::map_rank_to_device_id( - comm, gko::CudaExecutor::get_num_devices()); - return gko::CudaExecutor::create( - device_id, gko::ReferenceExecutor::create()); - }}, - {"hip", - [](MPI_Comm comm) { - int device_id = gko::experimental::mpi::map_rank_to_device_id( - comm, gko::HipExecutor::get_num_devices()); - return gko::HipExecutor::create( - device_id, gko::ReferenceExecutor::create()); - }}, - {"dpcpp", [](MPI_Comm comm) { - int device_id = 0; - if (gko::DpcppExecutor::get_num_devices("gpu")) { - device_id = gko::experimental::mpi::map_rank_to_device_id( - comm, gko::DpcppExecutor::get_num_devices("gpu")); - } else if (gko::DpcppExecutor::get_num_devices("cpu")) { - device_id = gko::experimental::mpi::map_rank_to_device_id( - comm, gko::DpcppExecutor::get_num_devices("cpu")); - } else { - throw std::runtime_error("No suitable DPC++ devices"); - } - return gko::DpcppExecutor::create( - device_id, gko::ReferenceExecutor::create()); - }}}; - - auto exec = executor_factory_mpi.at(executor_string)(MPI_COMM_WORLD); - - // @sect3{Creating the Distributed Matrix and Vectors} - // As a first step, we create a partition of the rows. The partition - // consists of ranges of consecutive rows which are assigned a part-id. - // These part-ids will be used for the distributed data structures to - // determine which rows will be stored locally. In this example each rank - // has (nearly) the same number of rows, so we can use the following - // specialized constructor. See @ref gko::distributed::Partition for other - // modes of creating a partition. - const auto num_rows = grid_dim; - auto partition = gko::share(part_type::build_from_global_size_uniform( - exec->get_master(), comm.size(), - static_cast(num_rows))); - - // Assemble the matrix using a 3-pt stencil and fill the right-hand-side - // with a sine value. The distributed matrix supports only constructing an - // empty matrix of zero size and filling in the values with - // gko::experimental::distributed::Matrix::read_distributed. Only the data - // that belongs to the rows by this rank will be assembled. 
- gko::matrix_data A_data; - gko::matrix_data b_data; - gko::matrix_data x_data; - A_data.size = {num_rows, num_rows}; - b_data.size = {num_rows, 1}; - x_data.size = {num_rows, 1}; - const auto range_start = partition->get_range_bounds()[rank]; - const auto range_end = partition->get_range_bounds()[rank + 1]; - for (int i = range_start; i < range_end; i++) { - if (i > 0) { - A_data.nonzeros.emplace_back(i, i - 1, -1); - } - A_data.nonzeros.emplace_back(i, i, 2); - if (i < grid_dim - 1) { - A_data.nonzeros.emplace_back(i, i + 1, -1); - } - b_data.nonzeros.emplace_back(i, 0, std::sin(i * 0.01)); - x_data.nonzeros.emplace_back(i, 0, gko::zero()); - } - - // Take timings. - comm.synchronize(); - ValueType t_init_end = gko::experimental::mpi::get_walltime(); - - // Read the matrix data, currently this is only supported on CPU executors. - // This will also set up the communication pattern needed for the - // distributed matrix-vector multiplication. - auto A_host = gko::share(dist_mtx::create(exec->get_master(), comm)); - auto x_host = dist_vec::create(exec->get_master(), comm); - auto b_host = dist_vec::create(exec->get_master(), comm); - A_host->read_distributed(A_data, partition); - b_host->read_distributed(b_data, partition); - x_host->read_distributed(x_data, partition); - // After reading, the matrix and vector can be moved to the chosen executor, - // since the distributed matrix supports SpMV also on devices. - auto A = gko::share(dist_mtx::create(exec, comm)); - auto x = dist_vec::create(exec, comm); - auto b = dist_vec::create(exec, comm); - A->copy_from(A_host); - b->copy_from(b_host); - x->copy_from(x_host); - - // Take timings. - comm.synchronize(); - ValueType t_read_setup_end = gko::experimental::mpi::get_walltime(); - - - // @sect3{Solve the Distributed System} - // Generate the solver - - // Setup the multigrid factory with default setting - // It uses Schwarz Jacobi as smoother and GMRES as coarse solver - auto mg_factory = gko::share( - mg::build() - .with_mg_level(pgm::build().with_deterministic(true)) - .with_criteria(gko::stop::Iteration::build().with_max_iters(1u)) - .on(exec)); - - // Setup the stopping criterion and logger - const gko::remove_complex reduction_factor{1e-8}; - std::shared_ptr> logger = - gko::log::Convergence::create(); - auto Ainv = solver::build() - .with_preconditioner(mg_factory) - .with_criteria( - gko::stop::Iteration::build().with_max_iters(num_iters), - gko::stop::ResidualNorm::build() - .with_reduction_factor(reduction_factor)) - .on(exec) - ->generate(A); - // Add logger to the generated solver to log the iteration count and - // residual norm - Ainv->add_logger(logger); - - // Take timings. - comm.synchronize(); - ValueType t_solver_generate_end = gko::experimental::mpi::get_walltime(); - - // Apply the distributed solver, this is the same as in the non-distributed - // case. - Ainv->apply(b, x); - - // Take timings. - comm.synchronize(); - ValueType t_end = gko::experimental::mpi::get_walltime(); - - // Get the residual. - auto res_norm = gko::clone(exec->get_master(), - gko::as(logger->get_residual_norm())); - - // @sect3{Printing Results} - // Print the achieved residual norm and timings on rank 0. 
- if (comm.rank() == 0) { - // clang-format off - std::cout << "\nNum rows in matrix: " << num_rows - << "\nNum ranks: " << comm.size() - << "\nFinal Res norm: " << res_norm->at(0, 0) - << "\nIteration count: " << logger->get_num_iterations() - << "\nInit time: " << t_init_end - t_init - << "\nRead time: " << t_read_setup_end - t_init - << "\nSolver generate time: " << t_solver_generate_end - t_read_setup_end - << "\nSolver apply time: " << t_end - t_solver_generate_end - << "\nTotal time: " << t_end - t_init - << std::endl; - // clang-format on - } -} diff --git a/examples/distributed-multigrid-preconditioned-solver/doc/builds-on b/examples/distributed-multigrid-preconditioned-solver/doc/builds-on deleted file mode 100644 index f70ab1608ec..00000000000 --- a/examples/distributed-multigrid-preconditioned-solver/doc/builds-on +++ /dev/null @@ -1 +0,0 @@ -distributed-solver diff --git a/examples/distributed-multigrid-preconditioned-solver/doc/intro.dox b/examples/distributed-multigrid-preconditioned-solver/doc/intro.dox deleted file mode 100644 index 79cfef82425..00000000000 --- a/examples/distributed-multigrid-preconditioned-solver/doc/intro.dox +++ /dev/null @@ -1,9 +0,0 @@ - -

<h1>Introduction</h1>
-This distributed multigrid preconditioned solver example should help you understand using Ginkgo multigrid in a distributed setting. -The example will solve a simple 1D Laplace equation where the system can be distributed row-wise to multiple processes. -Note. Because the stencil is configured equal weighted, the coarsening method does not perform well on this kind of problem. -To run the solver with multiple processes, use `mpirun -n NUM_PROCS ./distributed-solver [executor] [num_grid_points] [num_iterations]`. - -If you are using GPU devices, please make sure that you run this example with at most as many processes as you have GPU -devices available. diff --git a/examples/distributed-multigrid-preconditioned-solver/doc/kind b/examples/distributed-multigrid-preconditioned-solver/doc/kind deleted file mode 100644 index 196aa616342..00000000000 --- a/examples/distributed-multigrid-preconditioned-solver/doc/kind +++ /dev/null @@ -1 +0,0 @@ -distributed diff --git a/examples/distributed-multigrid-preconditioned-solver/doc/results.dox b/examples/distributed-multigrid-preconditioned-solver/doc/results.dox deleted file mode 100644 index 2a263e8396b..00000000000 --- a/examples/distributed-multigrid-preconditioned-solver/doc/results.dox +++ /dev/null @@ -1,18 +0,0 @@ -

<h1>Results</h1>
-This is the expected output for `mpirun -n 4 ./distributed-multigrid-preconditioned-solver`: - -@code{.cpp} - -Num rows in matrix: 100 -Num ranks: 4 -Final Res norm: 1.87487e-08 -Iteration count: 23 -Init time: 0.000153159 -Read time: 0.000563957 -Solver generate time: 0.000484838 -Solver apply time: 0.073985 -Total time: 0.0750338 - -@endcode - -The timings may vary depending on the machine. diff --git a/examples/distributed-multigrid-preconditioned-solver/doc/short-intro b/examples/distributed-multigrid-preconditioned-solver/doc/short-intro deleted file mode 100644 index bf52c346411..00000000000 --- a/examples/distributed-multigrid-preconditioned-solver/doc/short-intro +++ /dev/null @@ -1 +0,0 @@ -The distributed multigrid preconditioned solver example. diff --git a/examples/distributed-multigrid-preconditioned-solver/doc/tooltip b/examples/distributed-multigrid-preconditioned-solver/doc/tooltip deleted file mode 100644 index 3e6cc291852..00000000000 --- a/examples/distributed-multigrid-preconditioned-solver/doc/tooltip +++ /dev/null @@ -1 +0,0 @@ -Solves a distributed linear system. From b6927dd339b33ff2b2df133c7739a123628ae5db Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 13 Feb 2025 16:22:18 +0100 Subject: [PATCH 4/5] update documentation Co-authored-by: Marcel Koch --- examples/CMakeLists.txt | 2 +- .../CMakeLists.txt | 6 +++--- .../distributed-multigrid-preconditioned-solver.cpp} | 10 +++++++++- .../doc/builds-on | 0 .../doc/intro.dox | 2 +- .../doc/kind | 0 .../doc/results.dox | 2 +- .../doc/short-intro | 0 .../doc/tooltip | 0 9 files changed, 15 insertions(+), 7 deletions(-) rename examples/{distributed-multigrid-preconditioned-solver-customized => distributed-multigrid-preconditioned-solver}/CMakeLists.txt (52%) rename examples/{distributed-multigrid-preconditioned-solver-customized/distributed-multigrid-preconditioned-solver-customized.cpp => distributed-multigrid-preconditioned-solver/distributed-multigrid-preconditioned-solver.cpp} (95%) rename examples/{distributed-multigrid-preconditioned-solver-customized => distributed-multigrid-preconditioned-solver}/doc/builds-on (100%) rename examples/{distributed-multigrid-preconditioned-solver-customized => distributed-multigrid-preconditioned-solver}/doc/intro.dox (78%) rename examples/{distributed-multigrid-preconditioned-solver-customized => distributed-multigrid-preconditioned-solver}/doc/kind (100%) rename examples/{distributed-multigrid-preconditioned-solver-customized => distributed-multigrid-preconditioned-solver}/doc/results.dox (90%) rename examples/{distributed-multigrid-preconditioned-solver-customized => distributed-multigrid-preconditioned-solver}/doc/short-intro (100%) rename examples/{distributed-multigrid-preconditioned-solver-customized => distributed-multigrid-preconditioned-solver}/doc/tooltip (100%) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index e76b1c308fb..d58b9a59356 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -60,7 +60,7 @@ if(GINKGO_HAVE_PAPI_SDE) endif() if(GINKGO_BUILD_MPI) - list(APPEND EXAMPLES_LIST distributed-solver distributed-multigrid-preconditioned-solver-customized) + list(APPEND EXAMPLES_LIST distributed-solver distributed-multigrid-preconditioned-solver) endif() find_package(Kokkos 4.1.00 QUIET) diff --git a/examples/distributed-multigrid-preconditioned-solver-customized/CMakeLists.txt b/examples/distributed-multigrid-preconditioned-solver/CMakeLists.txt similarity index 52% rename from 
examples/distributed-multigrid-preconditioned-solver-customized/CMakeLists.txt rename to examples/distributed-multigrid-preconditioned-solver/CMakeLists.txt index b4b06f119a4..0de5e4cd44a 100644 --- a/examples/distributed-multigrid-preconditioned-solver-customized/CMakeLists.txt +++ b/examples/distributed-multigrid-preconditioned-solver/CMakeLists.txt @@ -1,10 +1,10 @@ cmake_minimum_required(VERSION 3.16) -project(distributed-multigrid-preconditioned-solver-customized) +project(distributed-multigrid-preconditioned-solver) # We only need to find Ginkgo if we build this example stand-alone if (NOT GINKGO_BUILD_EXAMPLES) find_package(Ginkgo 1.10.0 REQUIRED) endif() -add_executable(distributed-multigrid-preconditioned-solver-customized distributed-multigrid-preconditioned-solver-customized.cpp) -target_link_libraries(distributed-multigrid-preconditioned-solver-customized Ginkgo::ginkgo) +add_executable(distributed-multigrid-preconditioned-solver distributed-multigrid-preconditioned-solver.cpp) +target_link_libraries(distributed-multigrid-preconditioned-solver Ginkgo::ginkgo) diff --git a/examples/distributed-multigrid-preconditioned-solver-customized/distributed-multigrid-preconditioned-solver-customized.cpp b/examples/distributed-multigrid-preconditioned-solver/distributed-multigrid-preconditioned-solver.cpp similarity index 95% rename from examples/distributed-multigrid-preconditioned-solver-customized/distributed-multigrid-preconditioned-solver-customized.cpp rename to examples/distributed-multigrid-preconditioned-solver/distributed-multigrid-preconditioned-solver.cpp index 757e99adde0..658826f09f7 100644 --- a/examples/distributed-multigrid-preconditioned-solver-customized/distributed-multigrid-preconditioned-solver-customized.cpp +++ b/examples/distributed-multigrid-preconditioned-solver/distributed-multigrid-preconditioned-solver.cpp @@ -50,9 +50,16 @@ int main(int argc, char* argv[]) // non-distributed program. Please note that not all solvers support // distributed systems at the moment. using solver = gko::solver::Cg; + // We use the Schwarz preconditioner to extend non-distributed + // preconditioners, like our Jacobi, + // to the distributed case. The Schwarz preconditioner wraps another + // preconditioner, and applies it only to the local part of a distributed + // matrix. This will be used as our distributed multigrid smoother. using schwarz = gko::experimental::distributed::preconditioner::Schwarz< ValueType, LocalIndexType, GlobalIndexType>; using bj = gko::preconditioner::Jacobi; + // Multigrid and Pgm can accept the distributed matrix, so we still use the + // same type as the non-distributed case. 
using mg = gko::solver::Multigrid; using pgm = gko::multigrid::Pgm; @@ -200,7 +207,8 @@ int main(int argc, char* argv[]) solver::build() .with_criteria(gko::stop::Iteration::build().with_max_iters(4u)) .on(exec)); - // It uses Schwarz Jacobi as smoother and GMRES as coarse solver + // The multigrid preconditioner uses the Schwarz Jacobi as smoother and Cg + // as coarse solver auto mg_factory = gko::share( mg::build() .with_mg_level(pgm::build().with_deterministic(true)) diff --git a/examples/distributed-multigrid-preconditioned-solver-customized/doc/builds-on b/examples/distributed-multigrid-preconditioned-solver/doc/builds-on similarity index 100% rename from examples/distributed-multigrid-preconditioned-solver-customized/doc/builds-on rename to examples/distributed-multigrid-preconditioned-solver/doc/builds-on diff --git a/examples/distributed-multigrid-preconditioned-solver-customized/doc/intro.dox b/examples/distributed-multigrid-preconditioned-solver/doc/intro.dox similarity index 78% rename from examples/distributed-multigrid-preconditioned-solver-customized/doc/intro.dox rename to examples/distributed-multigrid-preconditioned-solver/doc/intro.dox index 17c45bc7403..99304ca4851 100644 --- a/examples/distributed-multigrid-preconditioned-solver-customized/doc/intro.dox +++ b/examples/distributed-multigrid-preconditioned-solver/doc/intro.dox @@ -2,7 +2,7 @@

<h1>Introduction</h1>
This distributed multigrid preconditioned solver example should help you understand customizing Ginkgo multigrid in a distributed setting. The example will solve a simple 1D Laplace equation where the system can be distributed row-wise to multiple processes. -Note. Because the stencil is configured equal weighted, the coarsening method does not perform well on this kind of problem. +Note. Because the stencil for the discretized Laplacian is configured with equal weight, the coarsening method does not perform well on this kind of problem. To run the solver with multiple processes, use `mpirun -n NUM_PROCS ./distributed-solver [executor] [num_grid_points] [num_iterations]`. If you are using GPU devices, please make sure that you run this example with at most as many processes as you have GPU diff --git a/examples/distributed-multigrid-preconditioned-solver-customized/doc/kind b/examples/distributed-multigrid-preconditioned-solver/doc/kind similarity index 100% rename from examples/distributed-multigrid-preconditioned-solver-customized/doc/kind rename to examples/distributed-multigrid-preconditioned-solver/doc/kind diff --git a/examples/distributed-multigrid-preconditioned-solver-customized/doc/results.dox b/examples/distributed-multigrid-preconditioned-solver/doc/results.dox similarity index 90% rename from examples/distributed-multigrid-preconditioned-solver-customized/doc/results.dox rename to examples/distributed-multigrid-preconditioned-solver/doc/results.dox index 1a54ada9ae1..0189c4c91e6 100644 --- a/examples/distributed-multigrid-preconditioned-solver-customized/doc/results.dox +++ b/examples/distributed-multigrid-preconditioned-solver/doc/results.dox @@ -1,5 +1,5 @@

<h1>Results</h1>
-This is the expected output for `mpirun -n 4 ./distributed-multigrid-preconditioned-solver-customized`: +This is the expected output for `mpirun -n 4 ./distributed-multigrid-preconditioned-solver`: @code{.cpp} diff --git a/examples/distributed-multigrid-preconditioned-solver-customized/doc/short-intro b/examples/distributed-multigrid-preconditioned-solver/doc/short-intro similarity index 100% rename from examples/distributed-multigrid-preconditioned-solver-customized/doc/short-intro rename to examples/distributed-multigrid-preconditioned-solver/doc/short-intro diff --git a/examples/distributed-multigrid-preconditioned-solver-customized/doc/tooltip b/examples/distributed-multigrid-preconditioned-solver/doc/tooltip similarity index 100% rename from examples/distributed-multigrid-preconditioned-solver-customized/doc/tooltip rename to examples/distributed-multigrid-preconditioned-solver/doc/tooltip From 4ad21786861428f0a5e69a75cd8ab7adb09ac02a Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Tue, 18 Feb 2025 13:23:34 +0100 Subject: [PATCH 5/5] update format --- examples/CMakeLists.txt | 7 ++++++- .../CMakeLists.txt | 12 +++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d58b9a59356..8f4c848005f 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -60,7 +60,12 @@ if(GINKGO_HAVE_PAPI_SDE) endif() if(GINKGO_BUILD_MPI) - list(APPEND EXAMPLES_LIST distributed-solver distributed-multigrid-preconditioned-solver) + list( + APPEND + EXAMPLES_LIST + distributed-solver + distributed-multigrid-preconditioned-solver + ) endif() find_package(Kokkos 4.1.00 QUIET) diff --git a/examples/distributed-multigrid-preconditioned-solver/CMakeLists.txt b/examples/distributed-multigrid-preconditioned-solver/CMakeLists.txt index 0de5e4cd44a..1c81952c0bb 100644 --- a/examples/distributed-multigrid-preconditioned-solver/CMakeLists.txt +++ b/examples/distributed-multigrid-preconditioned-solver/CMakeLists.txt @@ -2,9 +2,15 @@ cmake_minimum_required(VERSION 3.16) project(distributed-multigrid-preconditioned-solver) # We only need to find Ginkgo if we build this example stand-alone -if (NOT GINKGO_BUILD_EXAMPLES) +if(NOT GINKGO_BUILD_EXAMPLES) find_package(Ginkgo 1.10.0 REQUIRED) endif() -add_executable(distributed-multigrid-preconditioned-solver distributed-multigrid-preconditioned-solver.cpp) -target_link_libraries(distributed-multigrid-preconditioned-solver Ginkgo::ginkgo) +add_executable( + distributed-multigrid-preconditioned-solver + distributed-multigrid-preconditioned-solver.cpp +) +target_link_libraries( + distributed-multigrid-preconditioned-solver + Ginkgo::ginkgo +)
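
For reference, the solver composition this series introduces can be condensed into the short sketch below. It is illustrative only and not part of the patches: the helper name `make_mg_preconditioned_cg` and its arguments are hypothetical, it assumes an executor `exec` and a distributed matrix `A` created the same way as in distributed-multigrid-preconditioned-solver.cpp, and the stopping parameters (1000 iterations, 1e-8 reduction factor) mirror the example's defaults.

@code{.cpp}
#include <ginkgo/ginkgo.hpp>

#include <memory>

// Illustrative sketch (not part of the patch series): build the CG solver
// preconditioned by one cycle of PGM multigrid, composed the same way as in
// distributed-multigrid-preconditioned-solver.cpp.
std::unique_ptr<gko::solver::Cg<double>> make_mg_preconditioned_cg(
    std::shared_ptr<const gko::Executor> exec,
    std::shared_ptr<gko::experimental::distributed::Matrix<
        double, gko::int32, gko::int64>>
        A,
    gko::size_type max_iters = 1000u)
{
    using ValueType = double;
    using LocalIndexType = gko::int32;
    using GlobalIndexType = gko::int64;
    using cg = gko::solver::Cg<ValueType>;
    using schwarz = gko::experimental::distributed::preconditioner::Schwarz<
        ValueType, LocalIndexType, GlobalIndexType>;
    using bj = gko::preconditioner::Jacobi<ValueType, LocalIndexType>;
    using mg = gko::solver::Multigrid;
    using pgm = gko::multigrid::Pgm<ValueType, LocalIndexType>;

    // Block-Jacobi cannot act on a distributed matrix directly, so it is
    // wrapped in Schwarz, which applies it to the locally owned block; two
    // damped (0.9) relaxation sweeps form the smoother.
    auto schwarz_bj =
        gko::share(schwarz::build().with_local_solver(bj::build()).on(exec));
    auto smoother = gko::share(gko::solver::build_smoother(
        schwarz_bj, 2u, static_cast<ValueType>(0.9)));
    // CG accepts distributed matrices, so a few CG iterations serve as the
    // coarsest-level solver.
    auto coarsest = gko::share(
        cg::build()
            .with_criteria(gko::stop::Iteration::build().with_max_iters(4u))
            .on(exec));
    // A single cycle of PGM-coarsened multigrid acts as the preconditioner.
    auto mg_factory = gko::share(
        mg::build()
            .with_mg_level(pgm::build().with_deterministic(true))
            .with_pre_smoother(smoother)
            .with_coarsest_solver(coarsest)
            .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
            .on(exec));
    // The outer CG solver uses the multigrid factory as its preconditioner.
    return cg::build()
        .with_preconditioner(mg_factory)
        .with_criteria(
            gko::stop::Iteration::build().with_max_iters(max_iters),
            gko::stop::ResidualNorm<ValueType>::build().with_reduction_factor(
                1e-8))
        .on(exec)
        ->generate(A);
}
@endcode

Once generated, the returned solver is applied like any other Ginkgo solver, e.g. `Ainv->apply(b, x)` on distributed vectors, exactly as in the example.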