Skip to content

Commit

Permalink
feat[dace]: Restrict Loop Blocking (#1775)
Browse files Browse the repository at this point in the history
Made it possible to disable loop blocking if there are no independent
nodes.
  • Loading branch information
philip-paul-mueller authored Dec 6, 2024
1 parent 06813d5 commit 2c48858
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def gt_auto_optimize(
gpu_block_size: Optional[Sequence[int | str] | str] = None,
blocking_dim: Optional[gtx_common.Dimension] = None,
blocking_size: int = 10,
blocking_only_if_independent_nodes: Optional[bool] = None,
reuse_transients: bool = False,
gpu_launch_bounds: Optional[int | str] = None,
gpu_launch_factor: Optional[int] = None,
Expand Down Expand Up @@ -90,6 +91,9 @@ def gt_auto_optimize(
one for all.
blocking_dim: On which dimension blocking should be applied.
blocking_size: How many elements each block should process.
blocking_only_if_independent_nodes: If `True` only apply loop blocking if
there are independent nodes in the Map, see the `require_independent_nodes`
option of the `LoopBlocking` transformation.
reuse_transients: Run the `TransientReuse` transformation, might reduce memory footprint.
gpu_launch_bounds: Use this value as `__launch_bounds__` for _all_ GPU Maps.
gpu_launch_factor: Use the number of threads times this value as `__launch_bounds__`
Expand All @@ -101,7 +105,6 @@ def gt_auto_optimize(
validate: Perform validation during the steps.
validate_all: Perform extensive validation.
Note:
For identifying symbols that can be treated as compile time constants
`gt_find_constant_arguments()` function can be used.
Expand Down Expand Up @@ -227,6 +230,7 @@ def gt_auto_optimize(
gtx_transformations.LoopBlocking(
blocking_size=blocking_size,
blocking_parameter=blocking_dim,
require_independent_nodes=blocking_only_if_independent_nodes,
),
validate=validate,
validate_all=validate_all,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,16 @@ class LoopBlocking(dace_transformation.SingleStateTransformation):
What makes this transformation different from simple blocking, is that
the inner map will not just be inserted right after the outer Map.
Instead the transformation will first identify all nodes that do not depend
on the blocking parameter `I` and relocate them between the outer and inner map.
Thus these operations will only be performed once, per inner loop.
on the blocking parameter `I`, called independent nodes and relocate them
between the outer and inner map. Note that an independent node must be connected
to the MapEntry or another independent node.
Thus these operations will only be performed once, per outer loop iteration.
Args:
blocking_size: The size of the block, denoted as `B` above.
blocking_parameter: On which parameter should we block.
require_independent_nodes: If `True` only apply loop blocking if the Map
actually contains independent nodes. Defaults to `False`.
Todo:
- Modify the inner map such that it always starts at zero.
Expand All @@ -59,6 +63,12 @@ class LoopBlocking(dace_transformation.SingleStateTransformation):
desc="Name of the iteration variable on which to block (must be an exact match);"
" 'I' in the above description.",
)
require_independent_nodes = dace_properties.Property(
dtype=bool,
default=False,
desc="If 'True' then blocking is only applied if there are independent nodes.",
)

# Set of nodes that are independent of the blocking parameter.
_independent_nodes: Optional[set[dace_nodes.AccessNode]]
_dependent_nodes: Optional[set[dace_nodes.AccessNode]]
Expand All @@ -69,6 +79,7 @@ def __init__(
self,
blocking_size: Optional[int] = None,
blocking_parameter: Optional[Union[gtx_common.Dimension, str]] = None,
require_independent_nodes: Optional[bool] = None,
) -> None:
super().__init__()
if isinstance(blocking_parameter, gtx_common.Dimension):
Expand All @@ -77,6 +88,8 @@ def __init__(
self.blocking_parameter = blocking_parameter
if blocking_size is not None:
self.blocking_size = blocking_size
if require_independent_nodes is not None:
self.require_independent_nodes = require_independent_nodes
self._independent_nodes = None
self._dependent_nodes = None

Expand Down Expand Up @@ -250,6 +263,9 @@ def partition_map_output(
member variables are updated. If the partition does not exist the function
will return `False` and the respective member variables will be `None`.
The function will honor `self.require_independent_nodes`. Thus if no independent
nodes were found the function behaves as if the partition does not exist.
Args:
state: The state on which we operate.
sdfg: The SDFG in which we operate on.
Expand Down Expand Up @@ -295,6 +311,10 @@ def partition_map_output(
if not found_new_independent_node:
break

if self.require_independent_nodes and len(self._independent_nodes) == 0:
self._independent_nodes = None
return False

# After the independent set is computed compute the set of dependent nodes
# as the set of all nodes adjacent to `outer_entry` that are not independent.
self._dependent_nodes = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -803,3 +803,52 @@ def test_loop_blocking_mixked_memlets_2():
assert isinstance(node, dace_nodes.MapEntry) or (node is mx)
else:
assert scope_dict[node] is inner_map_entry


def test_loop_blocking_no_independent_nodes():
    """Check that `require_independent_nodes` gates the `LoopBlocking` transformation.

    The SDFG built here contains a single mapped Tasklet whose input and output
    Memlets are both indexed by `__i0` and `__i1`. Blocking is requested on
    `__i1`, so nothing inside the Map is independent of the blocking parameter.
    The test asserts that the transformation is not applied when independent
    nodes are required and that it is applied exactly once when they are not.
    """
    import dace

    sdfg = dace.SDFG(util.unique_name("mixed_memlet_sdfg"))
    state = sdfg.add_state(is_start_block=True)

    # Two global (non-transient) 10x10 arrays: `A` is read, `B` is written.
    for array_name in ("A", "B"):
        sdfg.add_array(array_name, shape=(10, 10), dtype=dace.float64, transient=False)

    state.add_mapped_tasklet(
        "fully_dependent_computation",
        map_ranges={"__i0": "0:10", "__i1": "0:10"},
        inputs={"__in1": dace.Memlet("A[__i0, __i1]")},
        code="__out = __in1 + 10.0",
        outputs={"__out": dace.Memlet("B[__i0, __i1]")},
        external_edges=True,
    )
    sdfg.validate()

    def apply_blocking(require_independent_nodes: bool) -> int:
        # Run `LoopBlocking` on `__i1` and report how often it was applied.
        blocking = gtx_transformations.LoopBlocking(
            blocking_size=2,
            blocking_parameter="__i1",
            require_independent_nodes=require_independent_nodes,
        )
        return sdfg.apply_transformations_repeated(
            blocking,
            validate=True,
            validate_all=True,
        )

    # Because there is nothing that is independent the transformation will
    # not apply if `require_independent_nodes` is enabled.
    assert apply_blocking(True) == 0

    # But it will apply once this requirement is lifted.
    assert apply_blocking(False) == 1

0 comments on commit 2c48858

Please sign in to comment.