diff --git a/RELEASE.md b/RELEASE.md
index e6fc1b9617..cb63ed58bb 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -12,6 +12,7 @@
 ## Major features and improvements
 * `kedro run --params` now updates interpolated parameters correctly when using `OmegaConfigLoader`.
+* Added `metadata` attribute to `kedro.io` datasets. This is ignored by Kedro, but may be consumed by users or external plugins.
 
 ## Bug fixes and other changes
 * `OmegaConfigLoader` will return a `dict` instead of `DictConfig`.
diff --git a/kedro/io/cached_dataset.py b/kedro/io/cached_dataset.py
index c63241eecb..55dc1fb9e2 100644
--- a/kedro/io/cached_dataset.py
+++ b/kedro/io/cached_dataset.py
@@ -39,6 +39,7 @@ def __init__(
         dataset: AbstractDataSet | dict,
         version: Version = None,
         copy_mode: str = None,
+        metadata: dict[str, Any] = None,
     ):
         """Creates a new instance of ``CachedDataSet`` pointing to the
         provided Python object.
@@ -52,6 +53,8 @@ def __init__(
             copy_mode: The copy mode used to copy the data. Possible
                 values are: "deepcopy", "copy" and "assign". If not
                 provided, it is inferred based on the data type.
+            metadata: Any arbitrary metadata.
+                This is ignored by Kedro, but may be consumed by users or external plugins.
 
         Raises:
             ValueError: If the provided dataset is not a valid dict/YAML
@@ -67,6 +70,7 @@ def __init__(
                 "representation of the dataset, or the actual dataset object."
             )
         self._cache = MemoryDataSet(copy_mode=copy_mode)
+        self.metadata = metadata
 
     def _release(self) -> None:
         self._cache.release()
diff --git a/kedro/io/lambda_dataset.py b/kedro/io/lambda_dataset.py
index 2bf1e65ac4..924399f5a5 100644
--- a/kedro/io/lambda_dataset.py
+++ b/kedro/io/lambda_dataset.py
@@ -76,12 +76,14 @@ def _release(self) -> None:
         else:
             self.__release()
 
+    # pylint: disable=too-many-arguments
     def __init__(
         self,
         load: Callable[[], Any] | None,
         save: Callable[[Any], None] | None,
         exists: Callable[[], bool] = None,
         release: Callable[[], None] = None,
+        metadata: dict[str, Any] = None,
     ):
         """Creates a new instance of ``LambdaDataSet`` with references to the
         required input/output data set methods.
@@ -91,6 +93,8 @@ def __init__(
             save: Method to save data to a data set.
             exists: Method to check whether output data already exists.
             release: Method to release any cached information.
+            metadata: Any arbitrary metadata.
+                This is ignored by Kedro, but may be consumed by users or external plugins.
 
         Raises:
             DataSetError: If a method is specified, but is not a Callable.
@@ -113,3 +117,4 @@ def __init__(
         self.__save = save
         self.__exists = exists
         self.__release = release
+        self.metadata = metadata
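For reference, a minimal sketch of what the change above enables from user code. The import path and constructor signature come straight from this diff; the metadata keys themselves are hypothetical, since Kedro stores the dict verbatim and never inspects it:

```python
from kedro.io import CachedDataSet, MemoryDataSet

# Hypothetical metadata keys -- Kedro itself never reads them.
dataset = CachedDataSet(
    dataset=MemoryDataSet(data=[1, 2, 3]),
    metadata={"owner": "data-eng", "layer": "intermediate"},
)

# The dict is exposed as a plain attribute for users and plugins to consume.
assert dataset.metadata == {"owner": "data-eng", "layer": "intermediate"}
```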
""" self._data = _EMPTY self._copy_mode = copy_mode + self.metadata = metadata if data is not _EMPTY: self._save(data) diff --git a/kedro/io/partitioned_dataset.py b/kedro/io/partitioned_dataset.py index ae5ef7b6e8..e8544743af 100644 --- a/kedro/io/partitioned_dataset.py +++ b/kedro/io/partitioned_dataset.py @@ -141,6 +141,7 @@ def __init__( # pylint: disable=too-many-arguments load_args: dict[str, Any] = None, fs_args: dict[str, Any] = None, overwrite: bool = False, + metadata: dict[str, Any] = None, ): """Creates a new instance of ``PartitionedDataSet``. @@ -179,6 +180,8 @@ def __init__( # pylint: disable=too-many-arguments fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``) overwrite: If True, any existing partitions will be removed. + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. Raises: DataSetError: If versioning is enabled for the underlying dataset. @@ -193,6 +196,7 @@ def __init__( # pylint: disable=too-many-arguments self._overwrite = overwrite self._protocol = infer_storage_options(self._path)["protocol"] self._partition_cache: Cache = Cache(maxsize=1) + self.metadata = metadata dataset = dataset if isinstance(dataset, dict) else {"type": dataset} self._dataset_type, self._dataset_config = parse_dataset_definition(dataset) @@ -383,6 +387,7 @@ def __init__( credentials: dict[str, Any] = None, load_args: dict[str, Any] = None, fs_args: dict[str, Any] = None, + metadata: dict[str, Any] = None, ): """Creates a new instance of ``IncrementalDataSet``. @@ -429,6 +434,8 @@ def __init__( the filesystem implementation. fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). + metadata: Any arbitrary metadata. + This is ignored by Kedro, but may be consumed by users or external plugins. Raises: DataSetError: If versioning is enabled for the underlying dataset. @@ -446,6 +453,7 @@ def __init__( self._checkpoint_config = self._parse_checkpoint_config(checkpoint) self._force_checkpoint = self._checkpoint_config.pop("force_checkpoint", None) + self.metadata = metadata comparison_func = self._checkpoint_config.pop("comparison_func", operator.gt) if isinstance(comparison_func, str):