diff --git a/dace/codegen/targets/framecode.py b/dace/codegen/targets/framecode.py index 1cb325042d..66b71bbb74 100644 --- a/dace/codegen/targets/framecode.py +++ b/dace/codegen/targets/framecode.py @@ -971,20 +971,16 @@ def generate_code(self, ctypedef = size_nodedesc.dtype.ctype from dace.codegen.targets import cpp array = [v for v in sdfg.arrays.values() if v.size_desc_name is not None and v.size_desc_name == size_desc_name] - if len(array) != 1: - print(array) assert len(array) <= 1 if len(array) == 1: array = array[0] - if any(["__dace_defer" in str(dim) for dim in array.shape]): + if type(array) == dace.data.Array and array.is_deferred_array: dimensions = ["0" if cpp.sym2cpp(dim).startswith("__dace_defer") else cpp.sym2cpp(dim) for dim in array.shape] - if any(["__dace_defer" in cpp.sym2cpp(dim) for dim in array.shape]): - size_str = ",".join(dimensions) - assert len(size_nodedesc.shape) == 1 - print("BB", size_nodedesc.shape, dimensions, array.shape) - alloc_str = f'{ctypedef} {size_desc_name}[{size_nodedesc.shape[0]}]{{{size_str}}};\n' - callsite_stream.write(alloc_str) - self.dispatcher.defined_vars.add(size_desc_name, disp.DefinedType.Pointer, ctypedef) + size_str = ",".join(dimensions) + assert len(size_nodedesc.shape) == 1 + alloc_str = f'{ctypedef} {size_desc_name}[{size_nodedesc.shape[0]}]{{{size_str}}};\n' + callsite_stream.write(alloc_str) + self.dispatcher.defined_vars.add(size_desc_name, disp.DefinedType.Pointer, ctypedef) ####################################################################### # Generate actual program body diff --git a/dace/data.py b/dace/data.py index f6c5a84417..a3b008f150 100644 --- a/dace/data.py +++ b/dace/data.py @@ -1388,6 +1388,7 @@ class Array(Data): pool = Property(dtype=bool, default=False, desc='Hint to the allocator that using a memory pool is preferred') is_size_array = Property(dtype=bool, default=False, desc='Special array that is used to track the size of an another array') + is_deferred_array = 
Property(dtype=bool, default=False, desc='Array that requires deferred allocation') def __init__(self, dtype, @@ -1440,6 +1441,9 @@ def __init__(self, self.offset = cp.copy(offset) else: self.offset = [0] * len(shape) + + self.is_deferred_array = any(["__dace_defer" in str(dim) for dim in self.shape]) + self.validate() def __repr__(self): diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 7eafecaf6d..a08f572782 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -764,8 +764,10 @@ def replace_dict(self, if validate_name(new_name): _replace_dict_keys(self.arrays, name, new_name, non_size_arrays) # Size desc names are updated later - if "__return" not in new_name: # To catch __return_0, __return_1, gpu__return + if "__return" not in new_name: # To catch __return_0, __return_1, gpu__return, fpga__return size_desc_map[new_name] = new_name + "_size" + else: + size_desc_map[new_name] = None _replace_dict_keys(self.symbols, name, new_name) _replace_dict_keys(self.constants_prop, name, new_name) _replace_dict_keys(self.callback_mapping, name, new_name) @@ -779,12 +781,28 @@ def replace_dict(self, arr = self.arrays[arr_name] if arr_name in self.arrays else None if arr is not None: size_desc_name_before = arr.size_desc_name - if arr.transient and type(arr) == dt.Array and size_desc_name_before is not None: - arr.size_desc_name = size_desc_name if "__return" not in new_name else None + # If we change the name of an array, then we need to change its size array accordingly + if (arr.transient and type(arr) == dt.Array and size_desc_name_before is not None + and size_desc_name is not None): + arr.size_desc_name = size_desc_name + assert (arr.size_desc_name == size_desc_name) + self.arrays[size_desc_name] = self.arrays.pop(size_desc_name_before) + # If the new size array is None, then we can remove the previous (and now unused size array) if arr.size_desc_name is None and size_desc_name_before is not None: size_ararys_to_rm.add(size_desc_name_before) - for 
size_arr_name in size_ararys_to_rm and size_arr_name in self.arrays: - del self.arrays[size_arr_name] + # If the new size array is not None, but it was None before, we need to add the size array + if size_desc_name_before is None and arr.size_desc_name is not None: + retval = self._get_size_arr(arr_name, arr) + if retval is not None: + size_desc_name, size_desc = retval + assert (size_desc_name == arr.size_desc_name) + self._arrays[size_desc_name] = size_desc + self._add_symbols(size_desc) + + # Remove any size arrays that are no longer needed + for size_arr_name in size_ararys_to_rm: + if size_arr_name in self.arrays: + del self.arrays[size_arr_name] # Replace inside data descriptors for array in self.arrays.values(): @@ -2062,6 +2080,37 @@ def _add_symbols(self, desc: dt.Data): if sym.name not in self.symbols: self.add_symbol(sym.name, sym.dtype) + def _get_size_arr(self, name: str, datadesc: dt.Data): + if ( + datadesc.transient is True and + type(datadesc) == dt.Array and + "__return" not in name and + datadesc.lifetime is not dtypes.AllocationLifetime.External and + datadesc.lifetime is not dtypes.AllocationLifetime.Persistent and + datadesc.is_deferred_array + ): + size_desc_name = f"{name}_size" + # Regardless of the scope and storage it is allocated as a register array + # And at the start of the SDFG (or nested SDFG), not setting SDFG prevents to_gpu assertions + # from failing. 
To lifetime and storage are set explicitly to + # to prevent optimizations to putting them to FPGA/GPU storage + size_desc = dt.Array(dtype=dace.uint64, + shape=(len(datadesc.shape),), + storage=dtypes.StorageType.CPU_Heap, + location=None, + allow_conflicts=False, + transient=True, + strides=(1,), + offset=(0,), + lifetime=dtypes.AllocationLifetime.State, + alignment=datadesc.alignment, + debuginfo=datadesc.debuginfo, + may_alias=False, + size_desc_name=None) + size_desc.is_size_array = True + return size_desc_name, size_desc + return None + def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str: """ Adds an existing data descriptor to the SDFG array store. @@ -2105,33 +2154,10 @@ def add_datadesc(self, name: str, datadesc: dt.Data, find_new_name=False) -> str # Add the data descriptor to the SDFG and all symbols that are not yet known. self._arrays[name] = datadesc self._add_symbols(datadesc) - if ( - datadesc.transient is True and - type(datadesc) == dt.Array and - "__return" not in name and - datadesc.lifetime is not dtypes.AllocationLifetime.External and - datadesc.lifetime is not dtypes.AllocationLifetime.Persistent and - any(["__dace_defer" in str(dim) for dim in datadesc.shape]) - ): - size_desc_name = f"{name}_size" - # Regardless of the scope and storage it is allocated as a register array - # And at the start of the SDFG (or nested SDFG), not setting SDFG prevents to_gpu assertions - # from failing. 
To lifetime and storage are set explicitly to - # to prevent optimizations to putting them to FPGA/GPU storage - size_desc = dt.Array(dtype=dace.uint64, - shape=(len(list(datadesc.shape)),), - storage=dtypes.StorageType.CPU_Heap, - location=None, - allow_conflicts=False, - transient=True, - strides=(1,), - offset=(0,), - lifetime=dtypes.AllocationLifetime.State, - alignment=datadesc.alignment, - debuginfo=datadesc.debuginfo, - may_alias=False, - size_desc_name=None) - size_desc.is_size_array = True + + retval = self._get_size_arr(name, datadesc) + if retval is not None: + size_desc_name, size_desc = retval self._arrays[size_desc_name] = size_desc # In case find_new_name and a new name is returned # we need to update the size descriptor name of the array diff --git a/dace/sdfg/validation.py b/dace/sdfg/validation.py index e5226375c9..c4173dd181 100644 --- a/dace/sdfg/validation.py +++ b/dace/sdfg/validation.py @@ -306,6 +306,24 @@ def validate_sdfg(sdfg: 'dace.sdfg.SDFG', references: Set[int] = None, **context "Arrays that use a multibank access pattern must have the size of the first dimension equal" f" the number of banks and have at least 2 dimensions for array {name}", sdfg, None) + # Check the size array shapes match + if type(desc) == dt.Array: + if desc.is_size_array is False and desc.size_desc_name is not None: + # It is an array which is not a size array and needs to have a size array + size_desc = sdfg._arrays[desc.size_desc_name] + size_arr_len = size_desc.shape[0] + if not isinstance(size_arr_len, int) and (isinstance(size_arr_len, dace.symbolic.symbol) and not size_arr_len.is_integer): + raise InvalidSDFGError( + f"Size arrays need to be one-dimensional and have an integer length known at compile time. {desc.size_desc_name}: {size_desc.shape}" + , sdfg, None + ) + # TODO: This check can be implemented as part of a getter/setter on the dimensions of the array? 
+ if int(size_arr_len) != len(desc.shape): + raise InvalidSDFGError( + f"Size arrays size needs to match to shape of its array: {desc.size_desc_name}, {size_desc.shape}: {name}, {desc.shape}" + , sdfg, None + ) + # Check if SDFG is located within a GPU kernel context['in_gpu'] = is_devicelevel_gpu(sdfg, None, None) context['in_fpga'] = is_devicelevel_fpga(sdfg, None, None) diff --git a/tests/deferred_alloc_test.py b/tests/deferred_alloc_test.py index 2b4aa17717..adc5427a9a 100644 --- a/tests/deferred_alloc_test.py +++ b/tests/deferred_alloc_test.py @@ -20,7 +20,7 @@ def schedule_type(storage_type): return dace.dtypes.ScheduleType.GPU_Device def _get_trivial_alloc_sdfg(storage_type: dace.dtypes.StorageType, transient: bool, write_size="0:2"): - sdfg = dace.sdfg.SDFG(name="deferred_alloc_test") + sdfg = dace.sdfg.SDFG(name=f"deferred_alloc_test_1") sdfg.add_array(name="A", shape=(15, "__dace_defer"), dtype=dace.float32, storage=storage_type, transient=transient) @@ -37,9 +37,8 @@ def _get_trivial_alloc_sdfg(storage_type: dace.dtypes.StorageType, transient: bo return sdfg - def _get_assign_map_sdfg(storage_type: dace.dtypes.StorageType, transient: bool, schedule_type: dace.dtypes.ScheduleType.Default): - sdfg = dace.sdfg.SDFG(name="deferred_alloc_test_2") + sdfg = dace.sdfg.SDFG(name=f"deferred_alloc_test_2") sdfg.add_array(name="A", shape=(15, "__dace_defer"), dtype=dace.float32, storage=storage_type, lifetime=dace.dtypes.AllocationLifetime.SDFG, transient=transient) @@ -100,21 +99,20 @@ def _get_assign_map_sdfg(storage_type: dace.dtypes.StorageType, transient: bool, return sdfg - -def _valid_to_reallocate(transient, storage_type, scope): +def _valid_to_reallocate(transient, storage_type): return transient and (storage_type == dace.dtypes.StorageType.GPU_Global or storage_type == dace.dtypes.StorageType.CPU_Heap) -def test_trivial_realloc(storage_type: dace.dtypes.StorageType, transient: bool): +def _test_trivial_realloc(storage_type: dace.dtypes.StorageType, 
transient: bool): sdfg = _get_trivial_alloc_sdfg(storage_type, transient) try: sdfg.validate() except Exception: - if not _valid_to_reallocate(transient, storage_type, None): + if not _valid_to_reallocate(transient, storage_type): return else: raise AssertionError("Realloc with transient data failed when it was expected not to.") - if not _valid_to_reallocate(transient, storage_type, None): + if not _valid_to_reallocate(transient, storage_type): raise AssertionError("Realloc with non-transient data did not fail when it was expected to.") sdfg.compile() @@ -124,17 +122,18 @@ def test_trivial_realloc(storage_type: dace.dtypes.StorageType, transient: bool) sdfg.validate() sdfg.compile() -def test_realloc_use(storage_type: dace.dtypes.StorageType, transient: bool, schedule_type: dace.dtypes.ScheduleType): + +def _test_realloc_use(storage_type: dace.dtypes.StorageType, transient: bool, schedule_type: dace.dtypes.ScheduleType): sdfg = _get_assign_map_sdfg(storage_type, transient, schedule_type) try: sdfg.validate() except Exception: - if not _valid_to_reallocate(transient, storage_type, None): + if not _valid_to_reallocate(transient, storage_type): return else: raise AssertionError("Realloc-use with transient data failed when it was expected not to.") - if not _valid_to_reallocate(transient, storage_type, None): + if not _valid_to_reallocate(transient, storage_type): raise AssertionError("Realloc-use with non-transient data did not fail when it was expected to.") compiled_sdfg = sdfg.compile() @@ -174,13 +173,23 @@ def test_realloc_use(storage_type: dace.dtypes.StorageType, transient: bool, sch compiled_sdfg(user_size=user_size, example_array=arr) assert ( arr.get()[0] == 3.0 ) -def test_realloc_inside_map(): - pass +@pytest.mark.gpu +def test_realloc_use_gpu(transient: bool): + _test_realloc_use(dace.dtypes.StorageType.GPU_Global, transient, dace.dtypes.ScheduleType.GPU_Device) +def test_realloc_use_cpu(transient: bool): + 
_test_realloc_use(dace.dtypes.StorageType.CPU_Heap, transient, dace.dtypes.ScheduleType.Sequential) -def test_all_combinations(storage_type, transient, schedule_type): - test_trivial_realloc(storage_type, transient) - test_realloc_use(storage_type, transient, schedule_type) +@pytest.mark.gpu +def test_trivial_realloc_gpu(transient: bool): + _test_trivial_realloc(dace.dtypes.StorageType.GPU_Global, transient) + +def test_trivial_realloc_cpu(transient: bool): + _test_trivial_realloc(dace.dtypes.StorageType.CPU_Heap, transient) + + +def test_realloc_inside_map(): + pass def test_incomplete_write_dimensions_1(): sdfg = _get_trivial_alloc_sdfg(dace.dtypes.StorageType.CPU_Heap, True, "1:2") @@ -202,28 +211,23 @@ def test_incomplete_write_dimensions_2(): if __name__ == "__main__": - for storage_type, schedule_type in [(dace.dtypes.StorageType.CPU_Heap, dace.dtypes.ScheduleType.Sequential), - (dace.dtypes.StorageType.GPU_Global, dace.dtypes.ScheduleType.GPU_Device)]: - print(f"Trivial Realloc with storage {storage_type}") - test_trivial_realloc(storage_type, True) - print(f"Trivial Realloc-Use with storage {storage_type}") - test_realloc_use(storage_type, True, schedule_type) - - for storage_type, schedule_type in [(dace.dtypes.StorageType.CPU_Heap, dace.dtypes.ScheduleType.Sequential), - (dace.dtypes.StorageType.GPU_Global, dace.dtypes.ScheduleType.GPU_Device)]: - print(f"Trivial Realloc with storage {storage_type} on non-transient data") - test_trivial_realloc(storage_type, False) - print(f"Trivial Realloc-Use with storage {storage_type} on non-transient data") - test_realloc_use(storage_type, False, schedule_type) - - # Try some other combinations - for transient in [True, False]: - for storage_type, schedule_type in [(dace.dtypes.StorageType.CPU_Heap, dace.dtypes.ScheduleType.Sequential), - (dace.dtypes.StorageType.GPU_Global, dace.dtypes.ScheduleType.GPU_Device)]: - print(f"Trivial Realloc with storage {storage_type} on transient:{transient} data") - 
test_trivial_realloc(storage_type, transient) - print(f"Trivial Realloc-Use with storage {storage_type} on transient:{transient} data") - test_realloc_use(storage_type, transient, schedule_type) + print(f"Trivial Realloc with storage {dace.dtypes.StorageType.CPU_Heap}") + test_trivial_realloc_cpu(True) + print(f"Trivial Realloc-Use with storage {dace.dtypes.StorageType.CPU_Heap}") + test_realloc_use_cpu(True) + print(f"Trivial Realloc with storage {dace.dtypes.StorageType.GPU_Global}") + test_trivial_realloc_gpu(True) + print(f"Trivial Realloc-Use with storage {dace.dtypes.StorageType.GPU_Global}") + test_realloc_use_gpu(True) + + print(f"Trivial Realloc with storage {dace.dtypes.StorageType.CPU_Heap} on non-transient data") + test_trivial_realloc_cpu(False) + print(f"Trivial Realloc-Use with storage {dace.dtypes.StorageType.CPU_Heap} on non-transient data") + test_realloc_use_cpu(False) + print(f"Trivial Realloc with storage {dace.dtypes.StorageType.GPU_Global} on non-transient data") + test_trivial_realloc_gpu(False) + print(f"Trivial Realloc-Use with storage {dace.dtypes.StorageType.GPU_Global} on non-transient data") + test_realloc_use_gpu(False) print(f"Realloc with incomplete write 1") test_incomplete_write_dimensions_1()