diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index d547ffe83cc..8f4c848005f 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -60,7 +60,12 @@ if(GINKGO_HAVE_PAPI_SDE)
 endif()
 
 if(GINKGO_BUILD_MPI)
-    list(APPEND EXAMPLES_LIST distributed-solver)
+    list(
+        APPEND
+        EXAMPLES_LIST
+        distributed-solver
+        distributed-multigrid-preconditioned-solver
+    )
 endif()
 
 find_package(Kokkos 4.1.00 QUIET)
diff --git a/examples/distributed-multigrid-preconditioned-solver/CMakeLists.txt b/examples/distributed-multigrid-preconditioned-solver/CMakeLists.txt
new file mode 100644
index 00000000000..1c81952c0bb
--- /dev/null
+++ b/examples/distributed-multigrid-preconditioned-solver/CMakeLists.txt
@@ -0,0 +1,16 @@
+cmake_minimum_required(VERSION 3.16)
+project(distributed-multigrid-preconditioned-solver)
+
+# We only need to find Ginkgo if we build this example stand-alone
+if(NOT GINKGO_BUILD_EXAMPLES)
+    find_package(Ginkgo 1.10.0 REQUIRED)
+endif()
+
+add_executable(
+    distributed-multigrid-preconditioned-solver
+    distributed-multigrid-preconditioned-solver.cpp
+)
+target_link_libraries(
+    distributed-multigrid-preconditioned-solver
+    Ginkgo::ginkgo
+)
diff --git a/examples/distributed-multigrid-preconditioned-solver/distributed-multigrid-preconditioned-solver.cpp b/examples/distributed-multigrid-preconditioned-solver/distributed-multigrid-preconditioned-solver.cpp
new file mode 100644
index 00000000000..658826f09f7
--- /dev/null
+++ b/examples/distributed-multigrid-preconditioned-solver/distributed-multigrid-preconditioned-solver.cpp
@@ -0,0 +1,268 @@
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+// @sect3{Include files}
+
+// This is the main ginkgo header file.
+#include <ginkgo/ginkgo.hpp>
+
+// Add the C++ iostream header to output information to the console.
+#include <iostream>
+// Add the STL map header for the executor selection.
+#include <map>
+// Add the string manipulation header to handle strings.
+#include <string>
+
+
+int main(int argc, char* argv[])
+{
+    // @sect3{Initialize the MPI environment}
+    // Since this is an MPI program, we need to initialize and finalize MPI at
+    // the beginning and end of our program, respectively. This can be easily
+    // done with the following helper construct that uses RAII to automate the
+    // initialization and finalization.
+    const gko::experimental::mpi::environment env(argc, argv);
+    // @sect3{Type Definitions}
+    // Define the needed types. In a parallel program we need to differentiate
+    // between global and local indices, thus we have two index types.
+    using GlobalIndexType = gko::int64;
+    using LocalIndexType = gko::int32;
+    // The underlying value type.
+    using ValueType = double;
+    // As vector type we use the following, which implements a subset of @ref
+    // gko::matrix::Dense.
+    using dist_vec = gko::experimental::distributed::Vector<ValueType>;
+    // As matrix type we simply use the following type, which can read
+    // distributed data and be applied to a distributed vector.
+    using dist_mtx =
+        gko::experimental::distributed::Matrix<ValueType, LocalIndexType,
+                                               GlobalIndexType>;
+    // We still need a localized vector type to be used as scalars in the
+    // advanced apply operations.
+    using vec = gko::matrix::Dense<ValueType>;
+    // The partition type describes how the rows of the matrices are
+    // distributed.
+    using part_type =
+        gko::experimental::distributed::Partition<LocalIndexType,
+                                                  GlobalIndexType>;
+    // We can use here the same solver type as you would use in a
+    // non-distributed program. Please note that not all solvers support
+    // distributed systems at the moment.
+    using solver = gko::solver::Cg<ValueType>;
+    // We use the Schwarz preconditioner to extend non-distributed
+    // preconditioners, like our Jacobi, to the distributed case. The Schwarz
+    // preconditioner wraps another preconditioner and applies it only to the
+    // local part of a distributed matrix. This will be used as our
+    // distributed multigrid smoother.
+    using schwarz = gko::experimental::distributed::preconditioner::Schwarz<
+        ValueType, LocalIndexType, GlobalIndexType>;
+    using bj = gko::preconditioner::Jacobi<ValueType, LocalIndexType>;
+    // Multigrid and Pgm can accept distributed matrices, so we still use the
+    // same types as in the non-distributed case.
+    using mg = gko::solver::Multigrid;
+    using pgm = gko::multigrid::Pgm<ValueType, LocalIndexType>;
+
+    // Create an MPI communicator and get the rank of the calling process.
+    const auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD);
+    const auto rank = comm.rank();
+
+    // @sect3{User Input Handling}
+    // User input settings:
+    // - The executor, defaults to reference.
+    // - The number of grid points, defaults to 100.
+    // - The number of iterations, defaults to 1000.
+    if (argc == 2 && (std::string(argv[1]) == "--help")) {
+        if (rank == 0) {
+            std::cerr << "Usage: " << argv[0]
+                      << " [executor] [num_grid_points] [num_iterations] "
+                      << std::endl;
+        }
+        std::exit(-1);
+    }
+
+    ValueType t_init = gko::experimental::mpi::get_walltime();
+
+    const auto executor_string = argc >= 2 ? argv[1] : "reference";
+    const auto grid_dim =
+        static_cast<gko::size_type>(argc >= 3 ? std::atoi(argv[2]) : 100);
+    const auto num_iters =
+        static_cast<gko::size_type>(argc >= 4 ? std::atoi(argv[3]) : 1000);
+
+    const std::map<std::string,
+                   std::function<std::shared_ptr<gko::Executor>(MPI_Comm)>>
+        executor_factory_mpi{
+            {"reference",
+             [](MPI_Comm) { return gko::ReferenceExecutor::create(); }},
+            {"omp", [](MPI_Comm) { return gko::OmpExecutor::create(); }},
+            {"cuda",
+             [](MPI_Comm comm) {
+                 int device_id = gko::experimental::mpi::map_rank_to_device_id(
+                     comm, gko::CudaExecutor::get_num_devices());
+                 return gko::CudaExecutor::create(
+                     device_id, gko::ReferenceExecutor::create());
+             }},
+            {"hip",
+             [](MPI_Comm comm) {
+                 int device_id = gko::experimental::mpi::map_rank_to_device_id(
+                     comm, gko::HipExecutor::get_num_devices());
+                 return gko::HipExecutor::create(
+                     device_id, gko::ReferenceExecutor::create());
+             }},
+            {"dpcpp", [](MPI_Comm comm) {
+                 int device_id = 0;
+                 if (gko::DpcppExecutor::get_num_devices("gpu")) {
+                     device_id = gko::experimental::mpi::map_rank_to_device_id(
+                         comm, gko::DpcppExecutor::get_num_devices("gpu"));
+                 } else if (gko::DpcppExecutor::get_num_devices("cpu")) {
+                     device_id = gko::experimental::mpi::map_rank_to_device_id(
+                         comm, gko::DpcppExecutor::get_num_devices("cpu"));
+                 } else {
+                     throw std::runtime_error("No suitable DPC++ devices");
+                 }
+                 return gko::DpcppExecutor::create(
+                     device_id, gko::ReferenceExecutor::create());
+             }}};
+
+    auto exec = executor_factory_mpi.at(executor_string)(MPI_COMM_WORLD);
+
+    // @sect3{Creating the Distributed Matrix and Vectors}
+    // As a first step, we create a partition of the rows. The partition
+    // consists of ranges of consecutive rows which are assigned a part-id.
+    // These part-ids will be used for the distributed data structures to
+    // determine which rows will be stored locally. In this example each rank
+    // has (nearly) the same number of rows, so we can use the following
+    // specialized constructor. See @ref gko::distributed::Partition for other
+    // modes of creating a partition.
+    const auto num_rows = grid_dim;
+    auto partition = gko::share(part_type::build_from_global_size_uniform(
+        exec->get_master(), comm.size(),
+        static_cast<GlobalIndexType>(num_rows)));
+
+    // Assemble the matrix using a 3-pt stencil and fill the right-hand side
+    // with a sine value. The distributed matrix supports only constructing an
+    // empty matrix of zero size and filling in the values with
+    // gko::experimental::distributed::Matrix::read_distributed. Only the data
+    // that belongs to the rows owned by this rank will be assembled.
+    gko::matrix_data<ValueType, GlobalIndexType> A_data;
+    gko::matrix_data<ValueType, GlobalIndexType> b_data;
+    gko::matrix_data<ValueType, GlobalIndexType> x_data;
+    A_data.size = {num_rows, num_rows};
+    b_data.size = {num_rows, 1};
+    x_data.size = {num_rows, 1};
+    const auto range_start = partition->get_range_bounds()[rank];
+    const auto range_end = partition->get_range_bounds()[rank + 1];
+    for (int i = range_start; i < range_end; i++) {
+        if (i > 0) {
+            A_data.nonzeros.emplace_back(i, i - 1, -1);
+        }
+        A_data.nonzeros.emplace_back(i, i, 2);
+        if (i < grid_dim - 1) {
+            A_data.nonzeros.emplace_back(i, i + 1, -1);
+        }
+        b_data.nonzeros.emplace_back(i, 0, std::sin(i * 0.01));
+        x_data.nonzeros.emplace_back(i, 0, gko::zero<ValueType>());
+    }
+
+    // Take timings.
+    comm.synchronize();
+    ValueType t_init_end = gko::experimental::mpi::get_walltime();
+
+    // Read the matrix data; currently this is only supported on CPU
+    // executors. This will also set up the communication pattern needed for
+    // the distributed matrix-vector multiplication.
+    auto A_host = gko::share(dist_mtx::create(exec->get_master(), comm));
+    auto x_host = dist_vec::create(exec->get_master(), comm);
+    auto b_host = dist_vec::create(exec->get_master(), comm);
+    A_host->read_distributed(A_data, partition);
+    b_host->read_distributed(b_data, partition);
+    x_host->read_distributed(x_data, partition);
+    // After reading, the matrix and vectors can be moved to the chosen
+    // executor, since the distributed matrix also supports SpMV on devices.
+    auto A = gko::share(dist_mtx::create(exec, comm));
+    auto x = dist_vec::create(exec, comm);
+    auto b = dist_vec::create(exec, comm);
+    A->copy_from(A_host);
+    b->copy_from(b_host);
+    x->copy_from(x_host);
+
+    // Take timings.
+    comm.synchronize();
+    ValueType t_read_setup_end = gko::experimental::mpi::get_walltime();
+
+
+    // @sect3{Solve the Distributed System}
+    // Generate the solver.
+
+    // Set up the multigrid factory with a customized smoother and coarsest
+    // solver. Because BlockJacobi does not support distributed matrices, we
+    // need to wrap it in Schwarz.
+    auto schwarz_bj_factory =
+        gko::share(schwarz::build().with_local_solver(bj::build()).on(exec));
+    auto smoother_factory = gko::share(gko::solver::build_smoother(
+        schwarz_bj_factory, 2u, static_cast<ValueType>(0.9)));
+    // Cg supports distributed matrices, so we can use it as we did in the
+    // non-distributed case.
+    auto coarsest_factory = gko::share(
+        solver::build()
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(4u))
+            .on(exec));
+    // The multigrid preconditioner uses the Schwarz-wrapped Jacobi as the
+    // smoother and Cg as the coarsest solver.
+    auto mg_factory = gko::share(
+        mg::build()
+            .with_mg_level(pgm::build().with_deterministic(true))
+            .with_pre_smoother(smoother_factory)
+            .with_coarsest_solver(coarsest_factory)
+            .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
+            .on(exec));
+
+    // Set up the stopping criterion and logger.
+    const gko::remove_complex<ValueType> reduction_factor{1e-8};
+    std::shared_ptr<const gko::log::Convergence<ValueType>> logger =
+        gko::log::Convergence<ValueType>::create();
+    auto Ainv = solver::build()
+                    .with_preconditioner(mg_factory)
+                    .with_criteria(
+                        gko::stop::Iteration::build().with_max_iters(num_iters),
+                        gko::stop::ResidualNorm<ValueType>::build()
+                            .with_reduction_factor(reduction_factor))
+                    .on(exec)
+                    ->generate(A);
+    // Add the logger to the generated solver to log the iteration count and
+    // residual norm.
+    Ainv->add_logger(logger);
+
+    // Take timings.
+    comm.synchronize();
+    ValueType t_solver_generate_end = gko::experimental::mpi::get_walltime();
+
+    // Apply the distributed solver; this is the same as in the
+    // non-distributed case.
+    Ainv->apply(b, x);
+
+    // Take timings.
+    comm.synchronize();
+    ValueType t_end = gko::experimental::mpi::get_walltime();
+
+    // Get the residual norm.
+    auto res_norm = gko::clone(exec->get_master(),
+                               gko::as<vec>(logger->get_residual_norm()));
+
+    // @sect3{Printing Results}
+    // Print the achieved residual norm and timings on rank 0.
+    if (comm.rank() == 0) {
+        // clang-format off
+        std::cout << "\nNum rows in matrix: " << num_rows
+                  << "\nNum ranks: " << comm.size()
+                  << "\nFinal Res norm: " << res_norm->at(0, 0)
+                  << "\nIteration count: " << logger->get_num_iterations()
+                  << "\nInit time: " << t_init_end - t_init
+                  << "\nRead time: " << t_read_setup_end - t_init
+                  << "\nSolver generate time: " << t_solver_generate_end - t_read_setup_end
+                  << "\nSolver apply time: " << t_end - t_solver_generate_end
+                  << "\nTotal time: " << t_end - t_init
+                  << std::endl;
+        // clang-format on
+    }
+}
diff --git a/examples/distributed-multigrid-preconditioned-solver/doc/builds-on b/examples/distributed-multigrid-preconditioned-solver/doc/builds-on
new file mode 100644
index 00000000000..f70ab1608ec
--- /dev/null
+++ b/examples/distributed-multigrid-preconditioned-solver/doc/builds-on
@@ -0,0 +1 @@
+distributed-solver
diff --git a/examples/distributed-multigrid-preconditioned-solver/doc/intro.dox b/examples/distributed-multigrid-preconditioned-solver/doc/intro.dox
new file mode 100644
index 00000000000..99304ca4851
--- /dev/null
+++ b/examples/distributed-multigrid-preconditioned-solver/doc/intro.dox
@@ -0,0 +1,9 @@
+<a name="Intro"></a>
+<h1>Introduction</h1>
+This distributed multigrid preconditioned solver example should help you understand how to customize Ginkgo's multigrid solver in a distributed setting.
+The example will solve a simple 1D Laplace equation where the system can be distributed row-wise to multiple processes.
+Note: because the stencil for the discretized Laplacian is configured with equal weights, the coarsening method does not perform well on this kind of problem.
+To run the solver with multiple processes, use `mpirun -n NUM_PROCS ./distributed-multigrid-preconditioned-solver [executor] [num_grid_points] [num_iterations]`.
+
+If you are using GPU devices, please make sure that you run this example with at most as many processes as you have GPU
+devices available.
diff --git a/examples/distributed-multigrid-preconditioned-solver/doc/kind b/examples/distributed-multigrid-preconditioned-solver/doc/kind
new file mode 100644
index 00000000000..196aa616342
--- /dev/null
+++ b/examples/distributed-multigrid-preconditioned-solver/doc/kind
@@ -0,0 +1 @@
+distributed
diff --git a/examples/distributed-multigrid-preconditioned-solver/doc/results.dox b/examples/distributed-multigrid-preconditioned-solver/doc/results.dox
new file mode 100644
index 00000000000..0189c4c91e6
--- /dev/null
+++ b/examples/distributed-multigrid-preconditioned-solver/doc/results.dox
@@ -0,0 +1,18 @@
+<h1>Results</h1>
+This is the expected output for `mpirun -n 4 ./distributed-multigrid-preconditioned-solver`:
+
+@code{.cpp}
+
+Num rows in matrix: 100
+Num ranks: 4
+Final Res norm: 1.61045e-08
+Iteration count: 18
+Init time: 0.000117699
+Read time: 0.000522518
+Solver generate time: 0.000430548
+Solver apply time: 0.00183804
+Total time: 0.00279111
+
+@endcode
+
+The timings may vary depending on the machine.
diff --git a/examples/distributed-multigrid-preconditioned-solver/doc/short-intro b/examples/distributed-multigrid-preconditioned-solver/doc/short-intro
new file mode 100644
index 00000000000..443031b3e39
--- /dev/null
+++ b/examples/distributed-multigrid-preconditioned-solver/doc/short-intro
@@ -0,0 +1 @@
+The distributed multigrid preconditioned solver with customized components example.
diff --git a/examples/distributed-multigrid-preconditioned-solver/doc/tooltip b/examples/distributed-multigrid-preconditioned-solver/doc/tooltip
new file mode 100644
index 00000000000..3e6cc291852
--- /dev/null
+++ b/examples/distributed-multigrid-preconditioned-solver/doc/tooltip
@@ -0,0 +1 @@
+Solves a distributed linear system with a multigrid preconditioner.
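The example fixes the smoother to two sweeps of a Schwarz-wrapped block-Jacobi and the coarsest-level solver to four Cg iterations. As a minimal sketch of the kind of further customization the example is meant to illustrate, the same factories can be tuned through other builder parameters; `with_max_block_size`, `with_max_levels`, and `with_min_coarse_rows` are existing Jacobi/Multigrid parameters, but the concrete values below are illustrative assumptions rather than part of the example, and the snippet reuses the type aliases, `exec`, and `coarsest_factory` defined above.

@code{.cpp}
// Sketch only: an alternative multigrid configuration with illustrative
// values, reusing bj, schwarz, mg, pgm, ValueType, exec, and coarsest_factory
// from the example above.
// Limit the local Jacobi blocks to at most 8 rows each.
auto tuned_bj = bj::build().with_max_block_size(8u);
// Wrap the block-Jacobi in Schwarz again and use three damped smoother sweeps.
auto tuned_smoother = gko::share(gko::solver::build_smoother(
    gko::share(schwarz::build().with_local_solver(tuned_bj).on(exec)), 3u,
    static_cast<ValueType>(0.8)));
// Bound the multigrid hierarchy explicitly; otherwise reuse the Pgm coarsening
// and the Cg coarsest solver from the example.
auto tuned_mg = gko::share(
    mg::build()
        .with_mg_level(pgm::build().with_deterministic(true))
        .with_pre_smoother(tuned_smoother)
        .with_coarsest_solver(coarsest_factory)
        .with_max_levels(10u)
        .with_min_coarse_rows(16u)
        .with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
        .on(exec));
@endcode

Such a tuned factory would be passed to `with_preconditioner` in place of `mg_factory`; the rest of the solver setup stays unchanged.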