For release 2025.01.00 #102

Merged: 108 commits, merged Jan 24, 2025
b7b84e0
updating 4d handling
oelbert Aug 27, 2024
bf4e2f2
debug 4d test data
oelbert Aug 27, 2024
dfc4e5f
more iter
oelbert Aug 27, 2024
6ab1dd5
moving ser_to_nc here
oelbert Aug 27, 2024
7bcce4b
merge develop
oelbert Sep 4, 2024
2d7062a
updating datatype in translate test
oelbert Sep 5, 2024
d44551e
typing works
oelbert Sep 5, 2024
ed3d431
fix dict, lint
oelbert Sep 5, 2024
2225fd9
remove empty line
oelbert Sep 5, 2024
f3cf32d
change from 4d to Nd
oelbert Sep 30, 2024
f894a49
Merge branch 'develop' into feature/4d_data
FlorianDeconinck Sep 30, 2024
ed4ddd4
Expose `k_start` and `k_end` automatically for any FrozenStencil
Oct 7, 2024
5b09a67
Fix k_start + utest
Oct 7, 2024
b0b2940
lint
Oct 7, 2024
0c7c902
Fix for 2d stencils
Oct 7, 2024
b5a7fa7
Merge pull request #78 from FlorianDeconinck/feature/K_axis_bounds_in…
fmalatino Oct 7, 2024
4b8d4b9
Add threshold overrides to the multimodal metric
Oct 7, 2024
0f1644c
Merge pull request #71 from oelbert/feature/4d_data
FlorianDeconinck Oct 9, 2024
720149a
Always report results, add summary with one liners
Oct 9, 2024
c9da47b
Merge branch 'develop' into feature/multimodal_metric_threshold_override
FlorianDeconinck Oct 9, 2024
020a259
Remove "mmr" from the keys
Oct 9, 2024
75f886f
Merge remote-tracking branch 'fdeconinck/feature/multimodal_metric_th…
Oct 9, 2024
d59918b
README in testing
Oct 10, 2024
4bdc914
Better Latex (?)
Oct 10, 2024
bdb3820
Better Latex (?)
Oct 10, 2024
e17539d
fixing a typo that breaks bools in translate tests (#80)
oelbert Oct 10, 2024
0fc563e
Merge branch 'develop' into feature/multimodal_metric_threshold_override
FlorianDeconinck Oct 10, 2024
f8105dd
Fix summary filename
FlorianDeconinck Oct 11, 2024
6389638
Merge remote-tracking branch 'fdeconinck/feature/multimodal_metric_th…
FlorianDeconinck Oct 11, 2024
5ac067f
Fix report, filename
FlorianDeconinck Oct 13, 2024
8870e46
Fix choosing right absolute difference for F32
FlorianDeconinck Oct 17, 2024
4d6d96c
Make robust for NaN value
FlorianDeconinck Oct 23, 2024
aed5912
Merge pull request #79 from FlorianDeconinck/feature/multimodal_metri…
FlorianDeconinck Oct 23, 2024
001d2bd
Detect when array have different dimensions, if only one dimension, c…
FlorianDeconinck Oct 24, 2024
eaf1d20
Lint
FlorianDeconinck Oct 24, 2024
489aab1
Add rank 0 to the data
FlorianDeconinck Oct 24, 2024
2af8dfb
Check data exists for rank, skip & print if not
FlorianDeconinck Oct 24, 2024
ce38ce0
Fix bad logic on skip test for parallel
FlorianDeconinck Oct 31, 2024
05952aa
Verbose exported names
FlorianDeconinck Nov 12, 2024
3347431
Make boilerplate calls more nimble
FlorianDeconinck Nov 12, 2024
1eda108
New option: `which_savepoint`
FlorianDeconinck Nov 12, 2024
b1b3ac0
QOL for mypy/flak8 type hints
FlorianDeconinck Nov 12, 2024
264da4e
Merge pull request #83 from FlorianDeconinck/fix/boilerplate
FlorianDeconinck Nov 13, 2024
153138e
Merge pull request #82 from FlorianDeconinck/feature/serialbox_netcdf…
FlorianDeconinck Nov 13, 2024
53b273b
Merge pull request #84 from FlorianDeconinck/qol/translate_test_which…
FlorianDeconinck Nov 14, 2024
88129fc
Add SECONDS_PER_DAY as a constants following mixed precision standards
FlorianDeconinck Dec 2, 2024
c436b0b
Lint
FlorianDeconinck Dec 2, 2024
3c1ee68
Merge pull request #86 from FlorianDeconinck/feature/seconds_per_day_…
FlorianDeconinck Dec 2, 2024
9efb5f4
Cleanups in dace orchestration
Dec 4, 2024
c7d6c4f
Rename program -> dace_program
Dec 4, 2024
60a8f59
Merge pull request #87 from romanc/romanc/cleanups-orchestartion
FlorianDeconinck Dec 5, 2024
ce3ac7e
Make sure all constants adhere to the floating point precision set by…
FlorianDeconinck Dec 9, 2024
502486f
Move `is_float` to `dsl.typing`
FlorianDeconinck Dec 10, 2024
a13776f
Move Quantity to sub-directory + breakout the subcomponent
FlorianDeconinck Dec 10, 2024
937417b
Fix tests
FlorianDeconinck Dec 10, 2024
45c3180
Lint
FlorianDeconinck Dec 10, 2024
0330cdb
Remove `cp.ndarray` since cupy is optional
FlorianDeconinck Dec 10, 2024
18b2f3f
Restore workaround for optional cupy
FlorianDeconinck Dec 10, 2024
7076740
"GFS" -> "UFS"
FlorianDeconinck Dec 10, 2024
a8a7c85
Cupy trick for metadata
FlorianDeconinck Dec 10, 2024
a7ee68f
Add comments for constant explanation
FlorianDeconinck Dec 11, 2024
28e2375
Describe 64/32-bit FloatFields
FlorianDeconinck Dec 11, 2024
cf4c2ce
Merge pull request #88 from FlorianDeconinck/fix/F32/Constants
FlorianDeconinck Dec 11, 2024
8daf5bd
Make sure the `make_storage_data` respects the array dtype.
FlorianDeconinck Dec 11, 2024
9faa405
Fix logic for MultiModal metric and verbose it
FlorianDeconinck Dec 11, 2024
7c03e92
Merge pull request #90 from FlorianDeconinck/feature/fixed_precision_…
FlorianDeconinck Dec 11, 2024
359812a
Merge branch 'develop' into fix/translate_test_storage_precision
FlorianDeconinck Dec 11, 2024
75b4741
Added an MPI all_reduce for quantities based on SUM operation to comm…
gmao-ckung Dec 11, 2024
4c8632c
linted
gmao-ckung Dec 11, 2024
a2fac9f
Add initial skeleton of pytest test for all reduce
gmao-ckung Dec 13, 2024
38ee6a6
Merge pull request #91 from FlorianDeconinck/fix/translate_test_stora…
FlorianDeconinck Dec 13, 2024
8c5b5d5
Added assertion tests for 1, 2 and 3D quantities passed through mpi_a…
gmao-ckung Dec 13, 2024
fb4e740
Linted
gmao-ckung Dec 13, 2024
34f82fb
Added pytest.mark to skip test if mpi4py isn't available
gmao-ckung Dec 13, 2024
b4a6a54
lint changes
gmao-ckung Dec 16, 2024
f5ce883
Addressed PR comments and added additional CPU backends to unit test
gmao-ckung Dec 16, 2024
2e41349
Merge branch 'feature/mpi_allreduce_sum' of https://github.com/NOAA-G…
gmao-ckung Dec 16, 2024
2e669db
Added setters for various Quantity properties to enable setting of Qu…
gmao-ckung Dec 18, 2024
fd2fa97
Added function in QuantityMetadata class that allows copying of Metad…
gmao-ckung Dec 19, 2024
ad19be3
Expose all SG metric terms in grid_data
FlorianDeconinck Dec 20, 2024
76f53c8
Merge pull request #93 from FlorianDeconinck/feature/minor/add_all_co…
FlorianDeconinck Dec 20, 2024
cc620c6
Add `Allreduce` and all MPI OP
FlorianDeconinck Dec 22, 2024
0e8089e
Update utest
FlorianDeconinck Dec 22, 2024
2188c75
Fix `local_comm`
FlorianDeconinck Dec 22, 2024
f8cc2ce
Fix utest
FlorianDeconinck Dec 22, 2024
7ad271f
Enforce `comm_abc.Comm` into Communicator
FlorianDeconinck Dec 22, 2024
07cd0f3
Fix `comm` object in serial utest
FlorianDeconinck Dec 22, 2024
224e6e2
Lint + `MPIComm` on testing architecture
FlorianDeconinck Dec 22, 2024
312b492
Merge branch 'develop' into feature/mpi_allreduce_sum
FlorianDeconinck Dec 22, 2024
f99914a
Make sure the correct allocator backend is used for Quantities
FlorianDeconinck Dec 27, 2024
760578c
Add in_place option for Allreduce
FlorianDeconinck Dec 30, 2024
a8a2e73
Merge pull request #95 from FlorianDeconinck/fix/boilerplate_on_gpu
FlorianDeconinck Jan 2, 2025
c758ffb
Merge branch 'develop' into feature/mpi_allreduce_sum
FlorianDeconinck Jan 7, 2025
9f5e50c
Merge pull request #92 from NOAA-GFDL/feature/mpi_allreduce_sum
FlorianDeconinck Jan 7, 2025
4e06ee8
Cleanup ndsl/dsl/dace/utils.py (#96)
romanc Jan 7, 2025
94b35d0
Merge branch 'develop' into refactor/quantity
FlorianDeconinck Jan 7, 2025
fcfb058
Fix merge
FlorianDeconinck Jan 7, 2025
1c7c30c
Merge pull request #89 from FlorianDeconinck/refactor/quantity
FlorianDeconinck Jan 14, 2025
1fa8d79
Hotfix for grid generation use of mpi operators
fmalatino Jan 16, 2025
a75a1d7
Merge pull request #98 from fmalatino/fix/reductop
FlorianDeconinck Jan 16, 2025
3252ec7
Merge examples/mpi/.gitignore into top-level .gitignore
romanc Jan 20, 2025
04ecf87
Remove hard-coded __version__ numbers
romanc Jan 20, 2025
4881b34
Fixing a bunch of typos
romanc Jan 20, 2025
1da613d
Merge pull request #99 from romanc/romanc/cleanup-gitignore
fmalatino Jan 21, 2025
a51daf3
hotfix netcdf version for dockerfiles
oelbert Jan 22, 2025
acb8c0d
Merge pull request #100 from oelbert/develop
fmalatino Jan 22, 2025
3f77863
Updated version number in setup.py to reflect new release, 2025.01.00
fmalatino Jan 23, 2025
b7db259
Merge pull request #101 from fmalatino/rc-2025.01.00
fmalatino Jan 24, 2025
3 changes: 3 additions & 0 deletions .gitignore
@@ -8,6 +8,9 @@ driver/examples/comm
20*-*-*-*-*-*.json
*.pkl

# example outputs
examples/mpi/output

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
4 changes: 2 additions & 2 deletions examples/NDSL/03_orchestration_basics.ipynb
@@ -37,7 +37,7 @@
")\n",
"from ndsl.constants import X_DIM, Y_DIM, Z_DIM\n",
"from ndsl.dsl.typing import FloatField, Float\n",
"from ndsl.boilerplate import get_factories_single_tile_orchestrated_cpu"
"from ndsl.boilerplate import get_factories_single_tile_orchestrated"
]
},
{
@@ -126,7 +126,7 @@
" tile_size = (3, 3, 3)\n",
"\n",
" # Setup\n",
" stencil_factory, qty_factory = get_factories_single_tile_orchestrated_cpu(\n",
" stencil_factory, qty_factory = get_factories_single_tile_orchestrated(\n",
" nx=tile_size[0],\n",
" ny=tile_size[1],\n",
" nz=tile_size[2],\n",
1 change: 0 additions & 1 deletion examples/mpi/.gitignore

This file was deleted.

2 changes: 1 addition & 1 deletion ndsl/__init__.py
@@ -10,7 +10,7 @@
from .dsl.dace.utils import (
ArrayReport,
DaCeProgress,
MaxBandwithBenchmarkProgram,
MaxBandwidthBenchmarkProgram,
StorageReport,
)
from .dsl.dace.wrapped_halo_exchange import WrappedHaloUpdater
18 changes: 10 additions & 8 deletions ndsl/boilerplate.py
@@ -16,6 +16,7 @@
TileCommunicator,
TilePartitioner,
)
from ndsl.optional_imports import cupy as cp


def _get_factories(
@@ -74,36 +74,37 @@ def _get_factories(

grid_indexing = GridIndexing.from_sizer_and_communicator(sizer, comm)
stencil_factory = StencilFactory(config=stencil_config, grid_indexing=grid_indexing)
quantity_factory = QuantityFactory(sizer, np)
quantity_factory = QuantityFactory(
sizer, cp if stencil_config.is_gpu_backend else np
)

return stencil_factory, quantity_factory


def get_factories_single_tile_orchestrated_cpu(
nx, ny, nz, nhalo
def get_factories_single_tile_orchestrated(
nx, ny, nz, nhalo, on_cpu: bool = True
) -> Tuple[StencilFactory, QuantityFactory]:
"""Build a Stencil & Quantity factory for orchestrated CPU, on a single tile topology."""
return _get_factories(
nx=nx,
ny=ny,
nz=nz,
nhalo=nhalo,
backend="dace:cpu",
backend="dace:cpu" if on_cpu else "dace:gpu",
orchestration=DaCeOrchestration.BuildAndRun,
topology="tile",
)


def get_factories_single_tile_numpy(
nx, ny, nz, nhalo
def get_factories_single_tile(
nx, ny, nz, nhalo, backend: str = "numpy"
) -> Tuple[StencilFactory, QuantityFactory]:
"""Build a Stencil & Quantity factory for Numpy, on a single tile topology."""
return _get_factories(
nx=nx,
ny=ny,
nz=nz,
nhalo=nhalo,
backend="numpy",
backend=backend,
orchestration=DaCeOrchestration.Python,
topology="tile",
)
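The boilerplate diff above folds the CPU-specific helpers into single entry points that pick the backend and allocator at call time. Below is a minimal standalone sketch of that selection pattern — the function and helper names here are stand-ins written for illustration (the real API lives in `ndsl.boilerplate` and `StencilConfig`), not code from this PR:

```python
# Sketch of the backend-selection pattern: one factory entry point that
# derives the DaCe target and the array allocator (cupy vs. numpy) from
# a single flag, mirroring `get_factories_single_tile_orchestrated`.

def is_gpu_backend(backend: str) -> bool:
    # Stand-in for StencilConfig.is_gpu_backend: GPU backends name a GPU target.
    return backend.endswith(":gpu") or "cuda" in backend


def pick_allocator(backend: str):
    """Return the array module to allocate with: cupy on GPU, numpy otherwise."""
    if is_gpu_backend(backend):
        try:
            import cupy as cp  # optional dependency, as in the diff

            return cp
        except ImportError:
            raise RuntimeError(f"backend '{backend}' needs cupy, which is missing")
    import numpy as np

    return np


def get_factories_single_tile_orchestrated(nx, ny, nz, nhalo, on_cpu: bool = True):
    backend = "dace:cpu" if on_cpu else "dace:gpu"
    allocator = pick_allocator(backend)
    # Stand-in return value; the real function builds and returns
    # (StencilFactory, QuantityFactory).
    return backend, allocator
```

Callers of the old `get_factories_single_tile_orchestrated_cpu(nx, ny, nz, nhalo)` keep the same behavior by calling the new name with the default `on_cpu=True`.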
4 changes: 2 additions & 2 deletions ndsl/comm/boundary.py
@@ -28,7 +28,7 @@ def send_view(self, quantity: Quantity, n_points: int):
return self._view(quantity, n_points, interior=True)

def recv_view(self, quantity: Quantity, n_points: int):
"""Return a sliced view of points which should be recieved at this boundary.
"""Return a sliced view of points which should be received at this boundary.

Args:
quantity: quantity for which to return a slice
@@ -37,7 +37,7 @@ def recv_view(self, quantity: Quantity, n_points: int):
return self._view(quantity, n_points, interior=False)

def send_slice(self, specification: QuantityHaloSpec) -> Tuple[slice]:
"""Return the index slices which shoud be sent at this boundary.
"""Return the index slices which should be sent at this boundary.

Args:
specification: data specifications for the halo. Including shape
12 changes: 9 additions & 3 deletions ndsl/comm/caching_comm.py
@@ -5,7 +5,7 @@

import numpy as np

from ndsl.comm.comm_abc import Comm, Request
from ndsl.comm.comm_abc import Comm, ReductionOperator, Request


T = TypeVar("T")
@@ -147,9 +147,12 @@ def Split(self, color, key) -> "CachingCommReader":
new_data = self._data.get_split()
return CachingCommReader(data=new_data)

def allreduce(self, sendobj, op=None) -> Any:
def allreduce(self, sendobj, op: Optional[ReductionOperator] = None) -> Any:
return self._data.get_generic_obj()

def Allreduce(self, sendobj, recvobj, op: ReductionOperator) -> Any:
raise NotImplementedError("CachingCommReader.Allreduce")

@classmethod
def load(cls, file: BinaryIO) -> "CachingCommReader":
data = CachingCommData.load(file)
@@ -229,7 +232,10 @@ def Split(self, color, key) -> "CachingCommWriter":
def dump(self, file: BinaryIO):
self._data.dump(file)

def allreduce(self, sendobj, op=None) -> Any:
def allreduce(self, sendobj, op: Optional[ReductionOperator] = None) -> Any:
result = self._comm.allreduce(sendobj, op)
self._data.generic_obj_buffers.append(copy.deepcopy(result))
return result

def Allreduce(self, sendobj, recvobj, op: ReductionOperator) -> Any:
raise NotImplementedError("CachingCommWriter.Allreduce")
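The `CachingCommWriter.allreduce` change above follows the file's existing record/replay scheme: the writer runs the real reduction and deep-copies the result into a buffer, so a `CachingCommReader` can later replay the run without MPI. A self-contained sketch of that idea, with hypothetical class names (the real classes carry more state and serialization):

```python
import copy


class RecordingComm:
    """Wraps a live comm and records each allreduce result (cf. CachingCommWriter)."""

    def __init__(self, comm):
        self._comm = comm
        self.log = []

    def allreduce(self, sendobj, op=None):
        result = self._comm.allreduce(sendobj, op)
        self.log.append(copy.deepcopy(result))  # deep copy: later mutation must not alter the log
        return result


class ReplayComm:
    """Replays recorded results with no communicator at all (cf. CachingCommReader)."""

    def __init__(self, log):
        self._log = list(log)

    def allreduce(self, sendobj, op=None):
        # The send object is ignored; the recorded run is the source of truth.
        return self._log.pop(0)


class SingleRankComm:
    # Trivial stand-in: on one rank, an allreduce just returns its input.
    def allreduce(self, sendobj, op=None):
        return sendobj


writer = RecordingComm(SingleRankComm())
writer.allreduce(41)
reader = ReplayComm(writer.log)
```

The uppercase `Allreduce` stays `NotImplementedError` in both caching classes because buffer-based reductions write into a caller-owned array, which this object-logging scheme does not capture.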
29 changes: 28 additions & 1 deletion ndsl/comm/comm_abc.py
@@ -1,10 +1,30 @@
import abc
import enum
from typing import List, Optional, TypeVar


T = TypeVar("T")


@enum.unique
class ReductionOperator(enum.Enum):
OP_NULL = enum.auto()
MAX = enum.auto()
MIN = enum.auto()
SUM = enum.auto()
PROD = enum.auto()
LAND = enum.auto()
BAND = enum.auto()
LOR = enum.auto()
BOR = enum.auto()
LXOR = enum.auto()
BXOR = enum.auto()
MAXLOC = enum.auto()
MINLOC = enum.auto()
REPLACE = enum.auto()
NO_OP = enum.auto()


class Request(abc.ABC):
@abc.abstractmethod
def wait(self):
@@ -69,5 +89,12 @@ def Split(self, color, key) -> "Comm":
...

@abc.abstractmethod
def allreduce(self, sendobj: T, op=None) -> T:
def allreduce(self, sendobj: T, op: Optional[ReductionOperator] = None) -> T:
...

@abc.abstractmethod
def Allreduce(self, sendobj: T, recvobj: T, op: ReductionOperator) -> T:
...

def Allreduce_inplace(self, obj: T, op: ReductionOperator) -> T:
...
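The new `ReductionOperator` enum above mirrors the standard MPI operations so NDSL code can name a reduction without importing mpi4py; each concrete `Comm` backend then translates the enum to its own representation. A hedged sketch of such a translation table — the local (non-MPI) interpretation below is an assumption for illustration, not part of the PR; an MPI backend would map the members to `MPI.MAX`, `MPI.SUM`, and so on:

```python
import enum
import operator
from functools import reduce


@enum.unique
class ReductionOperator(enum.Enum):
    # Abridged to the arithmetic subset; the PR also defines logical,
    # bitwise, and locator operators.
    MAX = enum.auto()
    MIN = enum.auto()
    SUM = enum.auto()
    PROD = enum.auto()


# One possible backend translation: interpret each operator as a Python
# binary function, which is enough for a single-process test double.
_LOCAL_OPS = {
    ReductionOperator.MAX: max,
    ReductionOperator.MIN: min,
    ReductionOperator.SUM: operator.add,
    ReductionOperator.PROD: operator.mul,
}


def local_reduce(values, op: ReductionOperator):
    """Fold `values` with the Python function chosen for `op`."""
    return reduce(_LOCAL_OPS[op], values)
```

Keeping the enum in `comm_abc` means callers of `allreduce`/`Allreduce` stay importable even when mpi4py is absent, matching the abstract-comm design of the rest of the file.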
83 changes: 70 additions & 13 deletions ndsl/comm/communicator.py
@@ -6,6 +6,8 @@
import ndsl.constants as constants
from ndsl.buffer import array_buffer, device_synchronize, recv_buffer, send_buffer
from ndsl.comm.boundary import Boundary
from ndsl.comm.comm_abc import Comm as CommABC
from ndsl.comm.comm_abc import ReductionOperator
from ndsl.comm.partitioner import CubedSpherePartitioner, Partitioner, TilePartitioner
from ndsl.halo.updater import HaloUpdater, HaloUpdateRequest, VectorInterfaceHaloUpdater
from ndsl.performance.timer import NullTimer, Timer
@@ -44,7 +46,11 @@ def to_numpy(array, dtype=None) -> np.ndarray:

class Communicator(abc.ABC):
def __init__(
self, comm, partitioner, force_cpu: bool = False, timer: Optional[Timer] = None
self,
comm: CommABC,
partitioner,
force_cpu: bool = False,
timer: Optional[Timer] = None,
):
self.comm = comm
self.partitioner: Partitioner = partitioner
@@ -61,7 +67,7 @@ def tile(self) -> "TileCommunicator":
@abc.abstractmethod
def from_layout(
cls,
comm,
comm: CommABC,
layout: Tuple[int, int],
force_cpu: bool = False,
timer: Optional[Timer] = None,
@@ -93,17 +99,63 @@ def _device_synchronize():
# this is a method so we can profile it separately from other device syncs
device_synchronize()

def _create_all_reduce_quantity(
self, input_metadata: QuantityMetadata, input_data
) -> Quantity:
"""Create a Quantity for all_reduce data and metadata"""
all_reduce_quantity = Quantity(
input_data,
dims=input_metadata.dims,
units=input_metadata.units,
origin=input_metadata.origin,
extent=input_metadata.extent,
gt4py_backend=input_metadata.gt4py_backend,
allow_mismatch_float_precision=False,
)
return all_reduce_quantity

def all_reduce(
self,
input_quantity: Quantity,
op: ReductionOperator,
output_quantity: Quantity = None,
):
reduced_quantity_data = self.comm.allreduce(input_quantity.data, op)
if output_quantity is None:
all_reduce_quantity = self._create_all_reduce_quantity(
input_quantity.metadata, reduced_quantity_data
)
return all_reduce_quantity
else:
if output_quantity.data.shape != input_quantity.data.shape:
raise TypeError("Shapes not matching")

input_quantity.metadata.duplicate_metadata(output_quantity.metadata)

output_quantity.data = reduced_quantity_data

def all_reduce_per_element(
self,
input_quantity: Quantity,
output_quantity: Quantity,
op: ReductionOperator,
):
self.comm.Allreduce(input_quantity.data, output_quantity.data, op)

def all_reduce_per_element_in_place(
self, quantity: Quantity, op: ReductionOperator
):
self.comm.Allreduce_inplace(quantity.data, op)

def _Scatter(self, numpy_module, sendbuf, recvbuf, **kwargs):
with send_buffer(numpy_module.zeros, sendbuf) as send, recv_buffer(
numpy_module.zeros, recvbuf
) as recv:
self.comm.Scatter(send, recv, **kwargs)
with send_buffer(numpy_module.zeros, sendbuf) as send:
with recv_buffer(numpy_module.zeros, recvbuf) as recv:
self.comm.Scatter(send, recv, **kwargs)

def _Gather(self, numpy_module, sendbuf, recvbuf, **kwargs):
with send_buffer(numpy_module.zeros, sendbuf) as send, recv_buffer(
numpy_module.zeros, recvbuf
) as recv:
self.comm.Gather(send, recv, **kwargs)
with send_buffer(numpy_module.zeros, sendbuf) as send:
with recv_buffer(numpy_module.zeros, recvbuf) as recv:
self.comm.Gather(send, recv, **kwargs)

def scatter(
self,
@@ -252,7 +304,7 @@ def gather_state(self, send_state=None, recv_state=None, transfer_type=None):

Args:
send_state: the model state to be sent containing the subtile data
recv_state: the pre-allocated state in which to recieve the full tile
recv_state: the pre-allocated state in which to receive the full tile
state. Only variables which are scattered will be written to.
Returns:
recv_state: on the root rank, the state containing the entire tile
@@ -288,7 +340,7 @@ def scatter_state(self, send_state=None, recv_state=None):
Args:
send_state: the model state to be sent containing the entire tile,
required only from the root rank
recv_state: the pre-allocated state in which to recieve the scattered
recv_state: the pre-allocated state in which to receive the scattered
state. Only variables which are scattered will be written to.
Returns:
rank_state: the state corresponding to this rank's subdomain
@@ -709,7 +761,7 @@ class CubedSphereCommunicator(Communicator):

def __init__(
self,
comm,
comm: CommABC,
partitioner: CubedSpherePartitioner,
force_cpu: bool = False,
timer: Optional[Timer] = None,
@@ -722,6 +774,11 @@ def __init__(
force_cpu: Force all communication to go through central memory.
timer: Time communication operations.
"""
if not issubclass(type(comm), CommABC):
raise TypeError(
"Communicator needs to be instantiated with communication subsystem"
f" derived from `comm_abc.Comm`, got {type(comm)}."
)
if comm.Get_size() != partitioner.total_ranks:
raise ValueError(
f"was given a partitioner for {partitioner.total_ranks} ranks but a "
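The new `Communicator.all_reduce` in the diff above supports two calling modes: with no `output_quantity` it allocates and returns a fresh result, and with one it validates the shape and fills it in place. A standalone sketch of that allocate-or-fill pattern using plain lists — `DummyComm` and this `all_reduce` are hypothetical stand-ins written for illustration, not the ndsl API:

```python
import copy


class DummyComm:
    """Single-rank stand-in: an allreduce with one participant returns its input."""

    def allreduce(self, sendobj, op=None):
        return copy.deepcopy(sendobj)


def all_reduce(comm, input_data, output=None):
    reduced = comm.allreduce(input_data, op="sum")
    if output is None:
        # No destination given: allocate and return a new result object.
        return reduced
    if len(output) != len(input_data):
        raise TypeError("Shapes not matching")
    # Destination given: fill it in place and return nothing, as the
    # diff's `else` branch does.
    output[:] = reduced
```

Note the asymmetry this preserves from the diff: the allocating mode returns a value while the filling mode returns `None`, so callers must pick one style and stick to it.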
10 changes: 8 additions & 2 deletions ndsl/comm/local_comm.py
@@ -189,8 +189,14 @@ def Split(self, color, key):
self._split_comms[color].append(new_comm)
return new_comm

def allreduce(self, sendobj, op=None) -> Any:
def allreduce(self, sendobj, op=None, recvobj=None) -> Any:
raise NotImplementedError(
"sendrecv fundamentally cannot be written for LocalComm, "
"allreduce fundamentally cannot be written for LocalComm, "
"as it requires synchronicity"
)

def Allreduce(self, sendobj, recvobj, op) -> Any:
raise NotImplementedError(
"Allreduce fundamentally cannot be written for LocalComm, "
"as it requires synchronicity"
)