From 32fdf69ba589ae1a563385de8ed64d7558892180 Mon Sep 17 00:00:00 2001 From: Ravi Kumar Pilla Date: Wed, 18 Dec 2024 10:01:50 -0600 Subject: [PATCH 01/24] build(datasets): Release 6.0.0 (#968) release draft Signed-off-by: Richard Asselin --- kedro-datasets/RELEASE.md | 8 +++++++- kedro-datasets/kedro_datasets/__init__.py | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index b48bcce39..a477dca5e 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -1,4 +1,10 @@ -# Upcoming Release 6.0.0 +# Upcoming Release +## Major features and improvements +## Bug fixes and other changes +## Breaking Changes +## Community contributions + +# Release 6.0.0 ## Major features and improvements diff --git a/kedro-datasets/kedro_datasets/__init__.py b/kedro-datasets/kedro_datasets/__init__.py index 44692e803..94379814c 100644 --- a/kedro-datasets/kedro_datasets/__init__.py +++ b/kedro-datasets/kedro_datasets/__init__.py @@ -1,7 +1,7 @@ """``kedro_datasets`` is where you can find all of Kedro's data connectors.""" __all__ = ["KedroDeprecationWarning"] -__version__ = "5.1.0" +__version__ = "6.0.0" import sys import warnings From ef05d2332f332db2a39b2189734a9550c4cec887 Mon Sep 17 00:00:00 2001 From: Ravi Kumar Pilla Date: Tue, 7 Jan 2025 18:48:56 -0600 Subject: [PATCH 02/24] chore(datasets): Remove tracking datasets which are used in Kedro Viz Experiment Tracking (#969) * remove et related kedro datasets * update release note and static json schema * temporary doc fix Signed-off-by: Richard Asselin --- kedro-datasets/RELEASE.md | 4 + .../docs/source/api/kedro_datasets.rst | 2 - kedro-datasets/kedro_datasets/_typing.py | 5 - .../kedro_datasets/dask/csv_dataset.py | 4 +- .../kedro_datasets/dask/parquet_dataset.py | 4 +- .../kedro_datasets/tracking/__init__.py | 26 --- .../kedro_datasets/tracking/json_dataset.py | 56 ----- .../tracking/metrics_dataset.py | 76 ------- kedro-datasets/pyproject.toml | 4 - .../static/jsonschema/kedro-catalog-0.18.json | 72 ------- .../static/jsonschema/kedro-catalog-0.19.json | 72 ------- kedro-datasets/tests/tracking/__init__.py | 0 .../tests/tracking/test_json_dataset.py | 195 ----------------- .../tests/tracking/test_metrics_dataset.py | 204 ------------------ 14 files changed, 8 insertions(+), 716 deletions(-) delete mode 100644 kedro-datasets/kedro_datasets/tracking/__init__.py delete mode 100644 kedro-datasets/kedro_datasets/tracking/json_dataset.py delete mode 100644 kedro-datasets/kedro_datasets/tracking/metrics_dataset.py delete mode 100644 kedro-datasets/tests/tracking/__init__.py delete mode 100644 kedro-datasets/tests/tracking/test_json_dataset.py delete mode 100644 kedro-datasets/tests/tracking/test_metrics_dataset.py diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index a477dca5e..16fa5b18a 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -1,7 +1,11 @@ # Upcoming Release ## Major features and improvements ## Bug fixes and other changes + ## Breaking Changes + +- Removed `tracking.MetricsDataset` and `tracking.JSONDataset` + ## Community contributions # Release 6.0.0 diff --git a/kedro-datasets/docs/source/api/kedro_datasets.rst b/kedro-datasets/docs/source/api/kedro_datasets.rst index 0cbd3bc4e..63142220a 100644 --- a/kedro-datasets/docs/source/api/kedro_datasets.rst +++ b/kedro-datasets/docs/source/api/kedro_datasets.rst @@ -62,6 +62,4 @@ kedro_datasets svmlight.SVMLightDataset tensorflow.TensorFlowModelDataset text.TextDataset - 
tracking.JSONDataset - tracking.MetricsDataset yaml.YAMLDataset diff --git a/kedro-datasets/kedro_datasets/_typing.py b/kedro-datasets/kedro_datasets/_typing.py index feb6d91b7..aa083f514 100644 --- a/kedro-datasets/kedro_datasets/_typing.py +++ b/kedro-datasets/kedro_datasets/_typing.py @@ -9,8 +9,3 @@ ImagePreview = NewType("ImagePreview", str) PlotlyPreview = NewType("PlotlyPreview", dict) JSONPreview = NewType("JSONPreview", str) - - -# experiment tracking datasets types -MetricsTrackingPreview = NewType("MetricsTrackingPreview", dict) -JSONTrackingPreview = NewType("JSONTrackingPreview", dict) diff --git a/kedro-datasets/kedro_datasets/dask/csv_dataset.py b/kedro-datasets/kedro_datasets/dask/csv_dataset.py index 053da6b00..bc5b5764b 100644 --- a/kedro-datasets/kedro_datasets/dask/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/dask/csv_dataset.py @@ -67,9 +67,9 @@ def __init__( # noqa: PLR0913 filepath: Filepath in POSIX format to a CSV file CSV collection or the directory of a multipart CSV. load_args: Additional loading options `dask.dataframe.read_csv`: - https://docs.dask.org/en/latest/generated/dask.dataframe.read_csv.html + https://docs.dask.org/en/stable/generated/dask.dataframe.read_csv.html save_args: Additional saving options for `dask.dataframe.to_csv`: - https://docs.dask.org/en/latest/generated/dask.dataframe.to_csv.html + https://docs.dask.org/en/stable/generated/dask.dataframe.to_csv.html credentials: Credentials required to get access to the underlying filesystem. E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Optional parameters to the backend file system driver: diff --git a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py index 1acfe7cda..3b2dff73e 100644 --- a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py @@ -97,9 +97,9 @@ def __init__( # noqa: PLR0913 filepath: Filepath in POSIX format to a parquet file parquet collection or the directory of a multipart parquet. load_args: Additional loading options `dask.dataframe.read_parquet`: - https://docs.dask.org/en/latest/generated/dask.dataframe.read_parquet.html + https://docs.dask.org/en/stable/generated/dask.dataframe.read_parquet.html save_args: Additional saving options for `dask.dataframe.to_parquet`: - https://docs.dask.org/en/latest/generated/dask.dataframe.to_parquet.html + https://docs.dask.org/en/stable/generated/dask.dataframe.to_parquet.html credentials: Credentials required to get access to the underlying filesystem. E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. 
fs_args: Optional parameters to the backend file system driver: diff --git a/kedro-datasets/kedro_datasets/tracking/__init__.py b/kedro-datasets/kedro_datasets/tracking/__init__.py deleted file mode 100644 index 1b1a5c70d..000000000 --- a/kedro-datasets/kedro_datasets/tracking/__init__.py +++ /dev/null @@ -1,26 +0,0 @@ -"""Dataset implementations to save data for Kedro Experiment Tracking.""" - -import warnings -from typing import Any - -import lazy_loader as lazy - -from kedro_datasets import KedroDeprecationWarning - -# https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901 -JSONDataset: Any -MetricsDataset: Any - -__getattr__, __dir__, __all__ = lazy.attach( - __name__, - submod_attrs={ - "json_dataset": ["JSONDataset"], - "metrics_dataset": ["MetricsDataset"], - }, -) - -warnings.warn( - "`tracking.JSONDataset` and `tracking.MetricsDataset` are deprecated. These datasets will be removed in kedro-datasets 7.0.0", - KedroDeprecationWarning, - stacklevel=2, -) diff --git a/kedro-datasets/kedro_datasets/tracking/json_dataset.py b/kedro-datasets/kedro_datasets/tracking/json_dataset.py deleted file mode 100644 index d73df1b10..000000000 --- a/kedro-datasets/kedro_datasets/tracking/json_dataset.py +++ /dev/null @@ -1,56 +0,0 @@ -"""``JSONDataset`` saves data to a JSON file using an underlying -filesystem (e.g.: local, S3, GCS). It uses native json to handle the JSON file. -The ``JSONDataset`` is part of Kedro Experiment Tracking. The dataset is versioned by default. -""" - -import json -from typing import NoReturn - -from kedro.io.core import DatasetError, get_filepath_str - -from kedro_datasets._typing import JSONTrackingPreview -from kedro_datasets.json import json_dataset - - -class JSONDataset(json_dataset.JSONDataset): - """``JSONDataset`` saves data to a JSON file using an underlying - filesystem (e.g.: local, S3, GCS). It uses native json to handle the JSON file. - The ``JSONDataset`` is part of Kedro Experiment Tracking. - The dataset is write-only and it is versioned by default. - - Example usage for the - `YAML API `_: - - .. code-block:: yaml - - cars: - type: tracking.JSONDataset - filepath: data/09_tracking/cars.json - - Example usage for the - `Python API `_: - - .. code-block:: pycon - - >>> from kedro_datasets.tracking import JSONDataset - >>> - >>> data = {"col1": 1, "col2": 0.23, "col3": 0.002} - >>> - >>> dataset = JSONDataset(filepath=tmp_path / "test.json") - >>> dataset.save(data) - - """ - - versioned = True - - def load(self) -> NoReturn: - raise DatasetError(f"Loading not supported for '{self.__class__.__name__}'") - - def preview(self) -> JSONTrackingPreview: # type: ignore[override] - "Load the JSON tracking dataset used in Kedro-viz experiment tracking." - load_path = get_filepath_str(self._get_load_path(), self._protocol) - - with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: - return JSONTrackingPreview(json.load(fs_file)) diff --git a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py b/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py deleted file mode 100644 index 6202acf34..000000000 --- a/kedro-datasets/kedro_datasets/tracking/metrics_dataset.py +++ /dev/null @@ -1,76 +0,0 @@ -"""``MetricsDataset`` saves data to a JSON file using an underlying -filesystem (e.g.: local, S3, GCS). It uses native json to handle the JSON file. -The ``MetricsDataset`` is part of Kedro Experiment Tracking. The dataset is versioned by default -and only takes metrics of numeric values. 
-""" - -import json -from typing import NoReturn - -from kedro.io.core import DatasetError, get_filepath_str - -from kedro_datasets._typing import MetricsTrackingPreview -from kedro_datasets.json import json_dataset - - -class MetricsDataset(json_dataset.JSONDataset): - """``MetricsDataset`` saves data to a JSON file using an underlying - filesystem (e.g.: local, S3, GCS). It uses native json to handle the JSON file. The - ``MetricsDataset`` is part of Kedro Experiment Tracking. The dataset is write-only, - it is versioned by default and only takes metrics of numeric values. - - Example usage for the - `YAML API `_: - - .. code-block:: yaml - - cars: - type: tracking.MetricsDataset - filepath: data/09_tracking/cars.json - - Example usage for the - `Python API `_: - - .. code-block:: pycon - - >>> from kedro_datasets.tracking import MetricsDataset - >>> - >>> data = {"col1": 1, "col2": 0.23, "col3": 0.002} - >>> - >>> dataset = MetricsDataset(filepath=tmp_path / "test.json") - >>> dataset.save(data) - - """ - - versioned = True - - def load(self) -> NoReturn: - raise DatasetError(f"Loading not supported for '{self.__class__.__name__}'") - - def save(self, data: dict[str, float]) -> None: - """Converts all values in the data from a ``MetricsDataset`` to float to make sure - they are numeric values which can be displayed in Kedro Viz and then saves the dataset. - """ - try: - for key, value in data.items(): - data[key] = float(value) - except ValueError as exc: - raise DatasetError( - f"The MetricsDataset expects only numeric values. {exc}" - ) from exc - - save_path = get_filepath_str(self._get_save_path(), self._protocol) - - with self._fs.open(save_path, **self._fs_open_args_save) as fs_file: - json.dump(data, fs_file, **self._save_args) - - self._invalidate_cache() - - def preview(self) -> MetricsTrackingPreview: # type: ignore[override] - "Load the Metrics tracking dataset used in Kedro-viz experiment tracking" - load_path = get_filepath_str(self._get_load_path(), self._protocol) - - with self._fs.open(load_path, **self._fs_open_args_load) as fs_file: - return json.load(fs_file) diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml index 91b938c19..3ee8eb9e9 100644 --- a/kedro-datasets/pyproject.toml +++ b/kedro-datasets/pyproject.toml @@ -163,10 +163,6 @@ tensorflow = ["kedro-datasets[tensorflow-tensorflowmodeldataset]"] text-textdataset = [] text = ["kedro-datasets[text-textdataset]"] -tracking-jsondataset = [] -tracking-metricsdataset = [] -tracking = ["kedro-datasets[tracking-jsondataset, tracking-metricsdataset]"] - yaml-yamldataset = ["kedro-datasets[pandas-base]", "PyYAML>=4.2, <7.0"] yaml = ["kedro-datasets[yaml-yamldataset]"] diff --git a/kedro-datasets/static/jsonschema/kedro-catalog-0.18.json b/kedro-datasets/static/jsonschema/kedro-catalog-0.18.json index 195f0234a..b9fa61d14 100644 --- a/kedro-datasets/static/jsonschema/kedro-catalog-0.18.json +++ b/kedro-datasets/static/jsonschema/kedro-catalog-0.18.json @@ -42,8 +42,6 @@ "spark.SparkJDBCDataSet", "tensorflow.TensorFlowModelDataset", "text.TextDataSet", - "tracking.JSONDataSet", - "tracking.MetricsDataSet", "yaml.YAMLDataSet" ] } @@ -1312,76 +1310,6 @@ } } }, - { - "if": { - "properties": { - "type": { - "const": "tracking.JSONDataSet" - } - } - }, - "then": { - "required": [ - "filepath" - ], - "properties": { - "filepath": { - "type": "string", - "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) 
will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." - }, - "save_args": { - "type": "object", - "description": "json options for saving JSON files (arguments passed\ninto ```json.dump``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/json.html\nAll defaults are preserved, but \"default_flow_style\", which is set to False." - }, - "credentials": { - "type": [ - "object", - "string" - ], - "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." - }, - "fs_args": { - "type": "object", - "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." - } - } - } - }, - { - "if": { - "properties": { - "type": { - "const": "tracking.MetricsDataSet" - } - } - }, - "then": { - "required": [ - "filepath" - ], - "properties": { - "filepath": { - "type": "string", - "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." - }, - "save_args": { - "type": "object", - "description": "json options for saving JSON files (arguments passed\ninto ```json.dump``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/json.html\nAll defaults are preserved, but \"default_flow_style\", which is set to False." - }, - "credentials": { - "type": [ - "object", - "string" - ], - "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." - }, - "fs_args": { - "type": "object", - "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." 
- } - } - } - }, { "if": { "properties": { diff --git a/kedro-datasets/static/jsonschema/kedro-catalog-0.19.json b/kedro-datasets/static/jsonschema/kedro-catalog-0.19.json index f19266812..087725710 100644 --- a/kedro-datasets/static/jsonschema/kedro-catalog-0.19.json +++ b/kedro-datasets/static/jsonschema/kedro-catalog-0.19.json @@ -41,8 +41,6 @@ "spark.SparkJDBCDataset", "tensorflow.TensorFlowModelDataset", "text.TextDataset", - "tracking.JSONDataset", - "tracking.MetricsDataset", "yaml.YAMLDataset" ] } @@ -1277,76 +1275,6 @@ } } }, - { - "if": { - "properties": { - "type": { - "const": "tracking.JSONDataset" - } - } - }, - "then": { - "required": [ - "filepath" - ], - "properties": { - "filepath": { - "type": "string", - "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." - }, - "save_args": { - "type": "object", - "description": "json options for saving JSON files (arguments passed\ninto ```json.dump``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/json.html\nAll defaults are preserved, but \"default_flow_style\", which is set to False." - }, - "credentials": { - "type": [ - "object", - "string" - ], - "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." - }, - "fs_args": { - "type": "object", - "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. `{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." - } - } - } - }, - { - "if": { - "properties": { - "type": { - "const": "tracking.MetricsDataset" - } - } - }, - "then": { - "required": [ - "filepath" - ], - "properties": { - "filepath": { - "type": "string", - "description": "Filepath in POSIX format to a text file prefixed with a protocol like `s3://`.\nIf prefix is not provided, `file` protocol (local filesystem) will be used.\nThe prefix should be any protocol supported by ``fsspec``.\nNote: `http(s)` doesn't support versioning." - }, - "save_args": { - "type": "object", - "description": "json options for saving JSON files (arguments passed\ninto ```json.dump``). Here you can find all available arguments:\nhttps://docs.python.org/3/library/json.html\nAll defaults are preserved, but \"default_flow_style\", which is set to False." - }, - "credentials": { - "type": [ - "object", - "string" - ], - "description": "Credentials required to get access to the underlying filesystem.\nE.g. for ``GCSFileSystem`` it should look like `{\"token\": None}`." - }, - "fs_args": { - "type": "object", - "description": "Extra arguments to pass into underlying filesystem class constructor\n(e.g. 
`{\"project\": \"my-project\"}` for ``GCSFileSystem``), as well as\nto pass to the filesystem's `open` method through nested keys\n`open_args_load` and `open_args_save`.\nHere you can find all available arguments for `open`:\nhttps://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.open\nAll defaults are preserved, except `mode`, which is set to `r` when loading\nand to `w` when saving." - } - } - } - }, { "if": { "properties": { diff --git a/kedro-datasets/tests/tracking/__init__.py b/kedro-datasets/tests/tracking/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/kedro-datasets/tests/tracking/test_json_dataset.py b/kedro-datasets/tests/tracking/test_json_dataset.py deleted file mode 100644 index de24ba9b9..000000000 --- a/kedro-datasets/tests/tracking/test_json_dataset.py +++ /dev/null @@ -1,195 +0,0 @@ -import inspect -import json -from pathlib import Path, PurePosixPath - -import pytest -from fsspec.implementations.local import LocalFileSystem -from gcsfs import GCSFileSystem -from kedro.io.core import PROTOCOL_DELIMITER, DatasetError, Version -from s3fs.core import S3FileSystem - -from kedro_datasets.tracking import JSONDataset - - -@pytest.fixture -def filepath_json(tmp_path): - return (tmp_path / "test.json").as_posix() - - -@pytest.fixture -def json_dataset(filepath_json, save_args, fs_args): - return JSONDataset(filepath=filepath_json, save_args=save_args, fs_args=fs_args) - - -@pytest.fixture -def explicit_versioned_json_dataset(filepath_json, load_version, save_version): - return JSONDataset( - filepath=filepath_json, version=Version(load_version, save_version) - ) - - -@pytest.fixture -def dummy_data(): - return {"col1": 1, "col2": 2, "col3": "mystring"} - - -class TestJSONDataset: - def test_save(self, filepath_json, dummy_data, tmp_path, save_version): - """Test saving and reloading the dataset.""" - json_dataset = JSONDataset( - filepath=filepath_json, version=Version(None, save_version) - ) - json_dataset.save(dummy_data) - - actual_filepath = Path(json_dataset._filepath.as_posix()) - test_filepath = tmp_path / "locally_saved.json" - - test_filepath.parent.mkdir(parents=True, exist_ok=True) - with open(test_filepath, "w", encoding="utf-8") as file: - json.dump(dummy_data, file) - - with open(test_filepath, encoding="utf-8") as file: - test_data = json.load(file) - - with open( - (actual_filepath / save_version / "test.json"), encoding="utf-8" - ) as actual_file: - actual_data = json.load(actual_file) - - assert actual_data == test_data - assert json_dataset._fs_open_args_load == {} - assert json_dataset._fs_open_args_save == {"mode": "w"} - - def test_load_fail(self, json_dataset, dummy_data): - json_dataset.save(dummy_data) - pattern = r"Loading not supported for 'JSONDataset'" - with pytest.raises(DatasetError, match=pattern): - json_dataset.load() - - def test_exists(self, json_dataset, dummy_data): - """Test `exists` method invocation for both existing and - nonexistent dataset.""" - assert not json_dataset.exists() - json_dataset.save(dummy_data) - assert json_dataset.exists() - - @pytest.mark.parametrize( - "save_args", [{"k1": "v1", "index": "value"}], indirect=True - ) - def test_save_extra_params(self, json_dataset, save_args): - """Test overriding the default save arguments.""" - for key, value in save_args.items(): - assert json_dataset._save_args[key] == value - - @pytest.mark.parametrize( - "fs_args", - [{"open_args_load": {"mode": "rb", "compression": "gzip"}}], - indirect=True, - ) - def 
test_open_extra_args(self, json_dataset, fs_args): - assert json_dataset._fs_open_args_load == fs_args["open_args_load"] - assert json_dataset._fs_open_args_save == {"mode": "w"} # default unchanged - - @pytest.mark.parametrize( - "filepath,instance_type", - [ - ("s3://bucket/file.json", S3FileSystem), - ("file:///tmp/test.json", LocalFileSystem), - ("/tmp/test.json", LocalFileSystem), - ("gcs://bucket/file.json", GCSFileSystem), - ], - ) - def test_protocol_usage(self, filepath, instance_type): - dataset = JSONDataset(filepath=filepath) - assert isinstance(dataset._fs, instance_type) - - path = filepath.split(PROTOCOL_DELIMITER, 1)[-1] - - assert str(dataset._filepath) == path - assert isinstance(dataset._filepath, PurePosixPath) - - def test_catalog_release(self, mocker): - fs_mock = mocker.patch("fsspec.filesystem").return_value - filepath = "test.json" - dataset = JSONDataset(filepath=filepath) - dataset.release() - fs_mock.invalidate_cache.assert_called_once_with(filepath) - - def test_not_version_str_repr(self): - """Test that version is not in string representation of the class instance.""" - filepath = "test.json" - ds = JSONDataset(filepath=filepath) - - assert filepath in str(ds) - assert "version" not in str(ds) - assert "JSONDataset" in str(ds) - assert "protocol" in str(ds) - # Default save_args - assert "save_args={'indent': 2}" in str(ds) - - def test_version_str_repr(self, load_version, save_version): - """Test that version is in string representation of the class instance.""" - filepath = "test.json" - ds_versioned = JSONDataset( - filepath=filepath, version=Version(load_version, save_version) - ) - - assert filepath in str(ds_versioned) - ver_str = f"version=Version(load={load_version}, save='{save_version}')" - assert ver_str in str(ds_versioned) - assert "JSONDataset" in str(ds_versioned) - assert "protocol" in str(ds_versioned) - # Default save_args - assert "save_args={'indent': 2}" in str(ds_versioned) - - def test_prevent_overwrite(self, explicit_versioned_json_dataset, dummy_data): - """Check the error when attempting to override the dataset if the - corresponding json file for a given save version already exists.""" - explicit_versioned_json_dataset.save(dummy_data) - pattern = ( - r"Save path \'.+\' for JSONDataset\(.+\) must " - r"not exist if versioning is enabled\." - ) - with pytest.raises(DatasetError, match=pattern): - explicit_versioned_json_dataset.save(dummy_data) - - @pytest.mark.parametrize( - "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True - ) - @pytest.mark.parametrize( - "save_version", ["2019-01-02T00.00.00.000Z"], indirect=True - ) - def test_save_version_warning( - self, - explicit_versioned_json_dataset, - load_version, - save_version, - dummy_data, - ): - """Check the warning when saving to the path that differs from - the subsequent load path.""" - pattern = ( - f"Save version '{save_version}' did not match " - f"load version '{load_version}' for " - r"JSONDataset\(.+\)" - ) - with pytest.warns(UserWarning, match=pattern): - explicit_versioned_json_dataset.save(dummy_data) - - def test_http_filesystem_no_versioning(self): - pattern = "Versioning is not supported for HTTP protocols." 
- - with pytest.raises(DatasetError, match=pattern): - JSONDataset( - filepath="https://example.com/file.json", version=Version(None, None) - ) - - def test_preview(self, json_dataset, dummy_data): - expected_preview = {"col1": 1, "col2": 2, "col3": "mystring"} - json_dataset.save(dummy_data) - preview = json_dataset.preview() - assert preview == expected_preview - assert ( - inspect.signature(json_dataset.preview).return_annotation.__name__ - == "JSONTrackingPreview" - ) diff --git a/kedro-datasets/tests/tracking/test_metrics_dataset.py b/kedro-datasets/tests/tracking/test_metrics_dataset.py deleted file mode 100644 index b638fcdfd..000000000 --- a/kedro-datasets/tests/tracking/test_metrics_dataset.py +++ /dev/null @@ -1,204 +0,0 @@ -import inspect -import json -from pathlib import Path, PurePosixPath - -import pytest -from fsspec.implementations.local import LocalFileSystem -from gcsfs import GCSFileSystem -from kedro.io.core import PROTOCOL_DELIMITER, DatasetError, Version -from s3fs.core import S3FileSystem - -from kedro_datasets.tracking import MetricsDataset - - -@pytest.fixture -def filepath_json(tmp_path): - return (tmp_path / "test.json").as_posix() - - -@pytest.fixture -def metrics_dataset(filepath_json, save_args, fs_args): - return MetricsDataset(filepath=filepath_json, save_args=save_args, fs_args=fs_args) - - -@pytest.fixture -def explicit_versioned_metrics_dataset(filepath_json, load_version, save_version): - return MetricsDataset( - filepath=filepath_json, version=Version(load_version, save_version) - ) - - -@pytest.fixture -def dummy_data(): - return {"col1": 1, "col2": 2, "col3": 3} - - -class TestMetricsDataset: - def test_save_data( - self, - dummy_data, - tmp_path, - filepath_json, - save_version, - ): - """Test saving and reloading the dataset.""" - metrics_dataset = MetricsDataset( - filepath=filepath_json, version=Version(None, save_version) - ) - metrics_dataset.save(dummy_data) - - actual_filepath = Path(metrics_dataset._filepath.as_posix()) - test_filepath = tmp_path / "locally_saved.json" - - test_filepath.parent.mkdir(parents=True, exist_ok=True) - with open(test_filepath, "w", encoding="utf-8") as file: - json.dump(dummy_data, file) - - with open(test_filepath, encoding="utf-8") as file: - test_data = json.load(file) - - with open( - (actual_filepath / save_version / "test.json"), encoding="utf-8" - ) as actual_file: - actual_data = json.load(actual_file) - - assert actual_data == test_data - assert metrics_dataset._fs_open_args_load == {} - assert metrics_dataset._fs_open_args_save == {"mode": "w"} - - def test_load_fail(self, metrics_dataset, dummy_data): - metrics_dataset.save(dummy_data) - pattern = r"Loading not supported for 'MetricsDataset'" - with pytest.raises(DatasetError, match=pattern): - metrics_dataset.load() - - def test_exists(self, metrics_dataset, dummy_data): - """Test `exists` method invocation for both existing and - nonexistent dataset.""" - assert not metrics_dataset.exists() - metrics_dataset.save(dummy_data) - assert metrics_dataset.exists() - - @pytest.mark.parametrize( - "save_args", [{"k1": "v1", "index": "value"}], indirect=True - ) - def test_save_extra_params(self, metrics_dataset, save_args): - """Test overriding the default save arguments.""" - for key, value in save_args.items(): - assert metrics_dataset._save_args[key] == value - - @pytest.mark.parametrize( - "fs_args", - [{"open_args_load": {"mode": "rb", "compression": "gzip"}}], - indirect=True, - ) - def test_open_extra_args(self, metrics_dataset, fs_args): - assert 
metrics_dataset._fs_open_args_load == fs_args["open_args_load"] - assert metrics_dataset._fs_open_args_save == {"mode": "w"} # default unchanged - - @pytest.mark.parametrize( - "filepath,instance_type", - [ - ("s3://bucket/file.json", S3FileSystem), - ("file:///tmp/test.json", LocalFileSystem), - ("/tmp/test.json", LocalFileSystem), - ("gcs://bucket/file.json", GCSFileSystem), - ], - ) - def test_protocol_usage(self, filepath, instance_type): - dataset = MetricsDataset(filepath=filepath) - assert isinstance(dataset._fs, instance_type) - - path = filepath.split(PROTOCOL_DELIMITER, 1)[-1] - - assert str(dataset._filepath) == path - assert isinstance(dataset._filepath, PurePosixPath) - - def test_catalog_release(self, mocker): - fs_mock = mocker.patch("fsspec.filesystem").return_value - filepath = "test.json" - dataset = MetricsDataset(filepath=filepath) - dataset.release() - fs_mock.invalidate_cache.assert_called_once_with(filepath) - - def test_fail_on_saving_non_numeric_value(self, metrics_dataset): - data = {"col1": 1, "col2": 2, "col3": "hello"} - - pattern = "The MetricsDataset expects only numeric values." - with pytest.raises(DatasetError, match=pattern): - metrics_dataset.save(data) - - def test_not_version_str_repr(self): - """Test that version is not in string representation of the class instance.""" - filepath = "test.json" - ds = MetricsDataset(filepath=filepath) - - assert filepath in str(ds) - assert "version" not in str(ds) - assert "MetricsDataset" in str(ds) - assert "protocol" in str(ds) - # Default save_args - assert "save_args={'indent': 2}" in str(ds) - - def test_version_str_repr(self, load_version, save_version): - """Test that version is in string representation of the class instance.""" - filepath = "test.json" - ds_versioned = MetricsDataset( - filepath=filepath, version=Version(load_version, save_version) - ) - - assert filepath in str(ds_versioned) - ver_str = f"version=Version(load={load_version}, save='{save_version}')" - assert ver_str in str(ds_versioned) - assert "MetricsDataset" in str(ds_versioned) - assert "protocol" in str(ds_versioned) - # Default save_args - assert "save_args={'indent': 2}" in str(ds_versioned) - - def test_prevent_overwrite(self, explicit_versioned_metrics_dataset, dummy_data): - """Check the error when attempting to override the dataset if the - corresponding json file for a given save version already exists.""" - explicit_versioned_metrics_dataset.save(dummy_data) - pattern = ( - r"Save path \'.+\' for MetricsDataset\(.+\) must " - r"not exist if versioning is enabled\." - ) - with pytest.raises(DatasetError, match=pattern): - explicit_versioned_metrics_dataset.save(dummy_data) - - @pytest.mark.parametrize( - "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True - ) - @pytest.mark.parametrize( - "save_version", ["2019-01-02T00.00.00.000Z"], indirect=True - ) - def test_save_version_warning( - self, explicit_versioned_metrics_dataset, load_version, save_version, dummy_data - ): - """Check the warning when saving to the path that differs from - the subsequent load path.""" - pattern = ( - f"Save version '{save_version}' did not match " - f"load version '{load_version}' for " - r"MetricsDataset\(.+\)" - ) - with pytest.warns(UserWarning, match=pattern): - explicit_versioned_metrics_dataset.save(dummy_data) - - def test_http_filesystem_no_versioning(self): - pattern = "Versioning is not supported for HTTP protocols." 
- - with pytest.raises(DatasetError, match=pattern): - MetricsDataset( - filepath="https://example.com/file.json", version=Version(None, None) - ) - - def test_preview(self, metrics_dataset, dummy_data): - expected_preview = {"col1": 1, "col2": 2, "col3": 3} - metrics_dataset.save(dummy_data) - preview = metrics_dataset.preview() - assert preview == expected_preview - assert ( - inspect.signature(metrics_dataset.preview).return_annotation.__name__ - == "MetricsTrackingPreview" - ) From 778811c976ff871dde6640ae9b62a11884540859 Mon Sep 17 00:00:00 2001 From: Ankita Katiyar <110245118+ankatiyar@users.noreply.github.com> Date: Mon, 13 Jan 2025 15:15:32 +0000 Subject: [PATCH 03/24] docs(datasets): Move to linkcode extension (#985) Move to linkcode extension Signed-off-by: Ankita Katiyar Signed-off-by: Richard Asselin --- kedro-datasets/docs/source/conf.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/kedro-datasets/docs/source/conf.py b/kedro-datasets/docs/source/conf.py index f62e80104..039658936 100644 --- a/kedro-datasets/docs/source/conf.py +++ b/kedro-datasets/docs/source/conf.py @@ -14,6 +14,8 @@ from __future__ import annotations import importlib +import inspect +import os import re import sys from inspect import getmembers, isclass, isfunction @@ -22,6 +24,8 @@ from click import secho, style from kedro import __version__ as release +import kedro_datasets + # -- Project information ----------------------------------------------------- project = "kedro-datasets" @@ -47,7 +51,7 @@ "sphinx_autodoc_typehints", "sphinx.ext.doctest", "sphinx.ext.ifconfig", - "sphinx.ext.viewcode", + "sphinx.ext.linkcode", "sphinxcontrib.jquery", "sphinx_copybutton", "myst_parser", @@ -452,3 +456,25 @@ def setup(app): user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:99.0) Gecko/20100101 Firefox/99.0" myst_heading_anchors = 5 + +def linkcode_resolve(domain, info): + """Resolve a GitHub URL corresponding to a Python object.""" + if domain != 'py': + return None + + try: + mod = sys.modules[info['module']] + obj = mod + for attr in info['fullname'].split('.'): + obj = getattr(obj, attr) + obj = inspect.unwrap(obj) + + filename = inspect.getsourcefile(obj) + source, lineno = inspect.getsourcelines(obj) + relpath = os.path.relpath(filename, start=os.path.dirname( + kedro_datasets.__file__)) + + return f'https://github.com/kedro-org/kedro-plugins/blob/main/kedro-datasets/kedro_datasets/{relpath}#L{lineno}#L{lineno + len(source) - 1}' + + except (KeyError, ImportError, AttributeError, TypeError, OSError, ValueError): + return None From 5a6b100ff3ab2c557d119820b40f7963f6d385f2 Mon Sep 17 00:00:00 2001 From: Ravi Kumar Pilla Date: Mon, 13 Jan 2025 09:48:50 -0600 Subject: [PATCH 04/24] fix(datasets): Fix polars.CSVDataset `save` on Windows (#979) * test csv win Signed-off-by: ravi_kumar_pilla * change ci yaml for testing Signed-off-by: ravi_kumar_pilla * change ci yaml for testing Signed-off-by: ravi_kumar_pilla * add default encoding when opening file * revert workflow tests Signed-off-by: ravi_kumar_pilla * fix lint Signed-off-by: ravi_kumar_pilla * update release note * update release note --------- Signed-off-by: ravi_kumar_pilla Signed-off-by: Richard Asselin --- kedro-datasets/RELEASE.md | 5 ++++- kedro-datasets/kedro_datasets/polars/csv_dataset.py | 4 +++- kedro-datasets/tests/polars/test_csv_dataset.py | 10 ---------- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 16fa5b18a..27df63f78 
100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -1,10 +1,13 @@ # Upcoming Release ## Major features and improvements + ## Bug fixes and other changes +- Fix polars.CSVDataset `save` method on Windows using `utf-8` as default encoding. + ## Breaking Changes -- Removed `tracking.MetricsDataset` and `tracking.JSONDataset` +- Removed `tracking.MetricsDataset` and `tracking.JSONDataset`. ## Community contributions diff --git a/kedro-datasets/kedro_datasets/polars/csv_dataset.py b/kedro-datasets/kedro_datasets/polars/csv_dataset.py index 6d8a988a5..9e6f35846 100644 --- a/kedro-datasets/kedro_datasets/polars/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/polars/csv_dataset.py @@ -72,7 +72,9 @@ class CSVDataset(AbstractVersionedDataset[pl.DataFrame, pl.DataFrame]): DEFAULT_LOAD_ARGS: dict[str, Any] = {"rechunk": True} DEFAULT_SAVE_ARGS: dict[str, Any] = {} - DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "w"}} + DEFAULT_FS_ARGS: dict[str, Any] = { + "open_args_save": {"mode": "w", "encoding": "utf-8"} + } def __init__( # noqa: PLR0913 self, diff --git a/kedro-datasets/tests/polars/test_csv_dataset.py b/kedro-datasets/tests/polars/test_csv_dataset.py index e03f192cc..5312e9b48 100644 --- a/kedro-datasets/tests/polars/test_csv_dataset.py +++ b/kedro-datasets/tests/polars/test_csv_dataset.py @@ -88,14 +88,12 @@ def mocked_csv_in_s3(mocked_s3_bucket, mocked_dataframe: pl.DataFrame): class TestCSVDataset: - @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8") def test_save_and_load(self, csv_dataset, dummy_dataframe): """Test saving and reloading the dataset.""" csv_dataset.save(dummy_dataframe) reloaded = csv_dataset.load() assert_frame_equal(dummy_dataframe, reloaded) - @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8") def test_exists(self, csv_dataset, dummy_dataframe): """Test `exists` method invocation for both existing and nonexistent dataset.""" @@ -204,7 +202,6 @@ def test_version_str_repr(self, load_version, save_version): assert "load_args={'rechunk': True}" in str(ds) assert "load_args={'rechunk': True}" in str(ds_versioned) - @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8") def test_save_and_load(self, versioned_csv_dataset, dummy_dataframe): """Test that saved and reloaded data matches the original one for the versioned dataset.""" @@ -212,7 +209,6 @@ def test_save_and_load(self, versioned_csv_dataset, dummy_dataframe): reloaded_df = versioned_csv_dataset.load() assert_frame_equal(dummy_dataframe, reloaded_df) - @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8") def test_multiple_loads(self, versioned_csv_dataset, dummy_dataframe, filepath_csv): """Test that if a new version is created mid-run, by an external system, it won't be loaded in the current run.""" @@ -236,7 +232,6 @@ def test_multiple_loads(self, versioned_csv_dataset, dummy_dataframe, filepath_c ds_new.resolve_load_version() == v_new ) # new version is discoverable by a new instance - @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8") def test_multiple_saves(self, dummy_dataframe, filepath_csv): """Test multiple cycles of save followed by load for the same dataset""" ds_versioned = CSVDataset(filepath=filepath_csv, version=Version(None, None)) @@ -259,7 +254,6 @@ def test_multiple_saves(self, dummy_dataframe, filepath_csv): ds_new = CSVDataset(filepath=filepath_csv, version=Version(None, None)) assert ds_new.resolve_load_version() == 
second_load_version - @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8") def test_release_instance_cache(self, dummy_dataframe, filepath_csv): """Test that cache invalidation does not affect other instances""" ds_a = CSVDataset(filepath=filepath_csv, version=Version(None, None)) @@ -288,14 +282,12 @@ def test_no_versions(self, versioned_csv_dataset): with pytest.raises(DatasetError, match=pattern): versioned_csv_dataset.load() - @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8") def test_exists(self, versioned_csv_dataset, dummy_dataframe): """Test `exists` method invocation for versioned dataset.""" assert not versioned_csv_dataset.exists() versioned_csv_dataset.save(dummy_dataframe) assert versioned_csv_dataset.exists() - @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8") def test_prevent_overwrite(self, versioned_csv_dataset, dummy_dataframe): """Check the error when attempting to override the dataset if the corresponding CSV file for a given save version already exists.""" @@ -307,7 +299,6 @@ def test_prevent_overwrite(self, versioned_csv_dataset, dummy_dataframe): with pytest.raises(DatasetError, match=pattern): versioned_csv_dataset.save(dummy_dataframe) - @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8") @pytest.mark.parametrize( "load_version", ["2019-01-01T23.59.59.999Z"], indirect=True ) @@ -334,7 +325,6 @@ def test_http_filesystem_no_versioning(self): filepath="https://example.com/file.csv", version=Version(None, None) ) - @pytest.mark.xfail(sys.platform == "win32", reason="file encoding is not UTF-8") def test_versioning_existing_dataset( self, csv_dataset, versioned_csv_dataset, dummy_dataframe ): From 377cf97f2312f426ec6464be659f9fd0772b9736 Mon Sep 17 00:00:00 2001 From: ElenaKhaustova <157851531+ElenaKhaustova@users.noreply.github.com> Date: Mon, 13 Jan 2025 16:55:09 +0000 Subject: [PATCH 05/24] feat(all): Replace trufflehog with detect-secrets (#983) * Removed trufflehog Signed-off-by: Elena Khaustova * Updated github actions per plugin Signed-off-by: Elena Khaustova * Updated release notes Signed-off-by: Elena Khaustova * Updated validate-pr check scopes Signed-off-by: Elena Khaustova * Updated lint command Signed-off-by: Elena Khaustova * Added key to trigger check Signed-off-by: Elena Khaustova * Updated GH action to track per plugin Signed-off-by: Elena Khaustova * Removed secret Signed-off-by: Elena Khaustova * Updated GH for kedro-datasets Signed-off-by: Elena Khaustova * Updated secrets baseline Signed-off-by: Elena Khaustova --------- Signed-off-by: Elena Khaustova Signed-off-by: Richard Asselin --- .github/workflows/detect-secrets.yml | 46 +++ .github/workflows/kedro-airflow.yml | 7 + .github/workflows/kedro-datasets.yml | 7 + .github/workflows/kedro-docker.yml | 7 + .github/workflows/kedro-telemetry.yml | 7 + .github/workflows/validate-pr-title.yaml | 1 + .pre-commit-config.yaml | 12 +- .secrets.baseline | 494 +++++++++++++++++++++++ Makefile | 5 +- kedro-airflow/RELEASE.md | 1 + kedro-airflow/pyproject.toml | 2 +- kedro-datasets/RELEASE.md | 2 + kedro-datasets/pyproject.toml | 2 +- kedro-docker/RELEASE.md | 1 + kedro-docker/pyproject.toml | 2 +- kedro-telemetry/RELEASE.md | 1 + kedro-telemetry/pyproject.toml | 2 +- trufflehog-ignore.txt | 3 - 18 files changed, 585 insertions(+), 17 deletions(-) create mode 100644 .github/workflows/detect-secrets.yml create mode 100644 .secrets.baseline delete mode 100644 trufflehog-ignore.txt diff --git 
a/.github/workflows/detect-secrets.yml b/.github/workflows/detect-secrets.yml new file mode 100644 index 000000000..bd360b52b --- /dev/null +++ b/.github/workflows/detect-secrets.yml @@ -0,0 +1,46 @@ +name: Detect secrets on plugins + +on: + workflow_call: + inputs: + plugin: + type: string + os: + type: string + python-version: + type: string + +jobs: + detect-secrets: + defaults: + run: + shell: bash + runs-on: ${{ inputs.os }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ inputs.python-version }} + - name: Cache python packages + uses: actions/cache@v4 + with: + path: ~/.cache/pip + key: ${{inputs.plugin}}-${{inputs.os}}-python-${{inputs.python-version}} + restore-keys: ${{inputs.plugin}} + - name: Install uv + run: | + python -m pip install "uv==0.2.21" + - name: Install dependencies + run: | + cd ${{ inputs.plugin }} + uv pip install --system "kedro @ git+https://github.com/kedro-org/kedro@main" + uv pip install --system "${{inputs.plugin}}[lint] @ ." + uv pip freeze --system + - name: Install pre-commit hooks + run: | + pre-commit install --install-hooks + pre-commit install --hook-type pre-push + - name: Scan all tracked files + run: git ls-files ":(glob)*" ${{ inputs.plugin }} -z | xargs -0 detect-secrets-hook --baseline .secrets.baseline diff --git a/.github/workflows/kedro-airflow.yml b/.github/workflows/kedro-airflow.yml index 85e7ca62d..92c269ea2 100644 --- a/.github/workflows/kedro-airflow.yml +++ b/.github/workflows/kedro-airflow.yml @@ -46,3 +46,10 @@ jobs: plugin: kedro-airflow os: ${{ matrix.os }} python-version: ${{ matrix.python-version }} + + detect-secrets: + uses: ./.github/workflows/detect-secrets.yml + with: + plugin: kedro-airflow + os: ubuntu-latest + python-version: "3.11" diff --git a/.github/workflows/kedro-datasets.yml b/.github/workflows/kedro-datasets.yml index d5aae0282..010115b73 100644 --- a/.github/workflows/kedro-datasets.yml +++ b/.github/workflows/kedro-datasets.yml @@ -61,3 +61,10 @@ jobs: - name: Documentation check for kedro-datasets run: | make check-datasets-docs + + detect-secrets: + uses: ./.github/workflows/detect-secrets.yml + with: + plugin: kedro-datasets + os: ubuntu-latest + python-version: "3.11" diff --git a/.github/workflows/kedro-docker.yml b/.github/workflows/kedro-docker.yml index 66783b3b5..16ffcbafe 100644 --- a/.github/workflows/kedro-docker.yml +++ b/.github/workflows/kedro-docker.yml @@ -46,3 +46,10 @@ jobs: plugin: kedro-docker os: ${{ matrix.os }} python-version: ${{ matrix.python-version }} + + detect-secrets: + uses: ./.github/workflows/detect-secrets.yml + with: + plugin: kedro-docker + os: ubuntu-latest + python-version: "3.11" diff --git a/.github/workflows/kedro-telemetry.yml b/.github/workflows/kedro-telemetry.yml index 5584ac775..aac47914e 100644 --- a/.github/workflows/kedro-telemetry.yml +++ b/.github/workflows/kedro-telemetry.yml @@ -35,3 +35,10 @@ jobs: plugin: kedro-telemetry os: ubuntu-latest python-version: "3.11" + + detect-secrets: + uses: ./.github/workflows/detect-secrets.yml + with: + plugin: kedro-telemetry + os: ubuntu-latest + python-version: "3.11" diff --git a/.github/workflows/validate-pr-title.yaml b/.github/workflows/validate-pr-title.yaml index b6e6fc808..cb1e65327 100644 --- a/.github/workflows/validate-pr-title.yaml +++ b/.github/workflows/validate-pr-title.yaml @@ -19,5 +19,6 @@ jobs: datasets docker telemetry + all env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git 
a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9f9706a34..9d2eb8de3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -24,6 +24,12 @@ repos: additional_dependencies: - black==22.12.0 + - repo: https://github.com/Yelp/detect-secrets + rev: v1.5.0 + hooks: + - id: detect-secrets + args: [ '--baseline', '.secrets.baseline' ] + - repo: local hooks: - id: ruff-kedro-datasets @@ -86,12 +92,6 @@ repos: pass_filenames: false entry: black kedro-telemetry/kedro_telemetry kedro-telemetry/tests - - id: secret_scan - name: "Secret scan" - language: system - pass_filenames: false - entry: make secret-scan - - id: bandit name: "Bandit security check" language: system diff --git a/.secrets.baseline b/.secrets.baseline new file mode 100644 index 000000000..ce3799e06 --- /dev/null +++ b/.secrets.baseline @@ -0,0 +1,494 @@ +{ + "version": "1.5.0", + "plugins_used": [ + { + "name": "ArtifactoryDetector" + }, + { + "name": "AWSKeyDetector" + }, + { + "name": "AzureStorageKeyDetector" + }, + { + "name": "Base64HighEntropyString", + "limit": 4.5 + }, + { + "name": "BasicAuthDetector" + }, + { + "name": "CloudantDetector" + }, + { + "name": "DiscordBotTokenDetector" + }, + { + "name": "GitHubTokenDetector" + }, + { + "name": "GitLabTokenDetector" + }, + { + "name": "HexHighEntropyString", + "limit": 3.0 + }, + { + "name": "IbmCloudIamDetector" + }, + { + "name": "IbmCosHmacDetector" + }, + { + "name": "IPPublicDetector" + }, + { + "name": "JwtTokenDetector" + }, + { + "name": "KeywordDetector", + "keyword_exclude": "" + }, + { + "name": "MailchimpDetector" + }, + { + "name": "NpmDetector" + }, + { + "name": "OpenAIDetector" + }, + { + "name": "PrivateKeyDetector" + }, + { + "name": "PypiTokenDetector" + }, + { + "name": "SendGridDetector" + }, + { + "name": "SlackDetector" + }, + { + "name": "SoftlayerDetector" + }, + { + "name": "SquareOAuthDetector" + }, + { + "name": "StripeDetector" + }, + { + "name": "TelegramBotTokenDetector" + }, + { + "name": "TwilioKeyDetector" + } + ], + "filters_used": [ + { + "path": "detect_secrets.filters.allowlist.is_line_allowlisted" + }, + { + "path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies", + "min_level": 2 + }, + { + "path": "detect_secrets.filters.heuristic.is_indirect_reference" + }, + { + "path": "detect_secrets.filters.heuristic.is_likely_id_string" + }, + { + "path": "detect_secrets.filters.heuristic.is_lock_file" + }, + { + "path": "detect_secrets.filters.heuristic.is_not_alphanumeric_string" + }, + { + "path": "detect_secrets.filters.heuristic.is_potential_uuid" + }, + { + "path": "detect_secrets.filters.heuristic.is_prefixed_with_dollar_sign" + }, + { + "path": "detect_secrets.filters.heuristic.is_sequential_string" + }, + { + "path": "detect_secrets.filters.heuristic.is_swagger_file" + }, + { + "path": "detect_secrets.filters.heuristic.is_templated_secret" + } + ], + "results": { + "kedro-datasets/kedro_datasets/dask/parquet_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/kedro_datasets/dask/parquet_dataset.py", + "hashed_secret": "6e1d66a1596528c308e601c10aa0b92d53606ab9", + "is_verified": false, + "line_number": 71 + } + ], + "kedro-datasets/kedro_datasets/pandas/sql_dataset.py": [ + { + "type": "Basic Auth Credentials", + "filename": "kedro-datasets/kedro_datasets/pandas/sql_dataset.py", + "hashed_secret": "46e3d772a1888eadff26c7ada47fd7502d796e07", + "is_verified": false, + "line_number": 130 + }, + { + "type": "Secret Keyword", + "filename": 
"kedro-datasets/kedro_datasets/pandas/sql_dataset.py", + "hashed_secret": "e026e197bb77b12d16ab6986e068751f016d0ea5", + "is_verified": false, + "line_number": 382 + } + ], + "kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/kedro_datasets/snowflake/snowpark_dataset.py", + "hashed_secret": "a761ce3a45d97e41840a788495e85a70d1bb3815", + "is_verified": false, + "line_number": 83 + } + ], + "kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/kedro_datasets/spark/spark_jdbc_dataset.py", + "hashed_secret": "46e3d772a1888eadff26c7ada47fd7502d796e07", + "is_verified": false, + "line_number": 57 + } + ], + "kedro-datasets/kedro_datasets_experimental/langchain/_anthropic.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/kedro_datasets_experimental/langchain/_anthropic.py", + "hashed_secret": "b60d121b438a380c343d5ec3c2037564b82ffef3", + "is_verified": false, + "line_number": 44 + } + ], + "kedro-datasets/kedro_datasets_experimental/langchain/_cohere.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/kedro_datasets_experimental/langchain/_cohere.py", + "hashed_secret": "b60d121b438a380c343d5ec3c2037564b82ffef3", + "is_verified": false, + "line_number": 45 + } + ], + "kedro-datasets/kedro_datasets_experimental/tests/netcdf/test_netcdf_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/kedro_datasets_experimental/tests/netcdf/test_netcdf_dataset.py", + "hashed_secret": "adb5fabe51f5b45e83fdd91b71c92156fec4a63e", + "is_verified": false, + "line_number": 17 + } + ], + "kedro-datasets/kedro_datasets_experimental/tests/video/test_video_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/kedro_datasets_experimental/tests/video/test_video_dataset.py", + "hashed_secret": "adb5fabe51f5b45e83fdd91b71c92156fec4a63e", + "is_verified": false, + "line_number": 16 + } + ], + "kedro-datasets/tests/dask/test_csv_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/dask/test_csv_dataset.py", + "hashed_secret": "adb5fabe51f5b45e83fdd91b71c92156fec4a63e", + "is_verified": false, + "line_number": 14 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/dask/test_csv_dataset.py", + "hashed_secret": "727d8ff68b6b550f2cf6e737b3cad5149c65fe5b", + "is_verified": false, + "line_number": 27 + } + ], + "kedro-datasets/tests/dask/test_parquet_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/dask/test_parquet_dataset.py", + "hashed_secret": "adb5fabe51f5b45e83fdd91b71c92156fec4a63e", + "is_verified": false, + "line_number": 16 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/dask/test_parquet_dataset.py", + "hashed_secret": "727d8ff68b6b550f2cf6e737b3cad5149c65fe5b", + "is_verified": false, + "line_number": 29 + } + ], + "kedro-datasets/tests/holoviews/test_holoviews_writer.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/holoviews/test_holoviews_writer.py", + "hashed_secret": "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", + "is_verified": false, + "line_number": 108 + } + ], + "kedro-datasets/tests/matplotlib/test_matplotlib_writer.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/matplotlib/test_matplotlib_writer.py", + "hashed_secret": "dc724af18fbdd4e59189f5fe768a5f8311527050", + "is_verified": false, + "line_number": 16 + }, + { + "type": 
"Secret Keyword", + "filename": "kedro-datasets/tests/matplotlib/test_matplotlib_writer.py", + "hashed_secret": "727d8ff68b6b550f2cf6e737b3cad5149c65fe5b", + "is_verified": false, + "line_number": 59 + } + ], + "kedro-datasets/tests/pandas/test_csv_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/pandas/test_csv_dataset.py", + "hashed_secret": "727d8ff68b6b550f2cf6e737b3cad5149c65fe5b", + "is_verified": false, + "line_number": 66 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/pandas/test_csv_dataset.py", + "hashed_secret": "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", + "is_verified": false, + "line_number": 213 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/pandas/test_csv_dataset.py", + "hashed_secret": "adb5fabe51f5b45e83fdd91b71c92156fec4a63e", + "is_verified": false, + "line_number": 405 + } + ], + "kedro-datasets/tests/pandas/test_generic_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/pandas/test_generic_dataset.py", + "hashed_secret": "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", + "is_verified": false, + "line_number": 126 + } + ], + "kedro-datasets/tests/pandas/test_json_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/pandas/test_json_dataset.py", + "hashed_secret": "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", + "is_verified": false, + "line_number": 140 + } + ], + "kedro-datasets/tests/pandas/test_sql_dataset.py": [ + { + "type": "Basic Auth Credentials", + "filename": "kedro-datasets/tests/pandas/test_sql_dataset.py", + "hashed_secret": "46e3d772a1888eadff26c7ada47fd7502d796e07", + "is_verified": false, + "line_number": 19 + } + ], + "kedro-datasets/tests/pandas/test_xml_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/pandas/test_xml_dataset.py", + "hashed_secret": "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", + "is_verified": false, + "line_number": 117 + } + ], + "kedro-datasets/tests/partitions/test_incremental_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/partitions/test_incremental_dataset.py", + "hashed_secret": "727d8ff68b6b550f2cf6e737b3cad5149c65fe5b", + "is_verified": false, + "line_number": 440 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/partitions/test_incremental_dataset.py", + "hashed_secret": "adb5fabe51f5b45e83fdd91b71c92156fec4a63e", + "is_verified": false, + "line_number": 460 + } + ], + "kedro-datasets/tests/partitions/test_partitioned_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/partitions/test_partitioned_dataset.py", + "hashed_secret": "76f747de912e8682e29a23cb506dd5bf0de080d2", + "is_verified": false, + "line_number": 415 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/partitions/test_partitioned_dataset.py", + "hashed_secret": "9027cc5a2c1321de60a2d71ccde6229d1152d6d3", + "is_verified": false, + "line_number": 416 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/partitions/test_partitioned_dataset.py", + "hashed_secret": "5dcbdf371f181b9b7a41a4be7be70f8cbee67da7", + "is_verified": false, + "line_number": 452 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/partitions/test_partitioned_dataset.py", + "hashed_secret": "727d8ff68b6b550f2cf6e737b3cad5149c65fe5b", + "is_verified": false, + "line_number": 503 + }, + { + "type": "Secret Keyword", + "filename": 
"kedro-datasets/tests/partitions/test_partitioned_dataset.py", + "hashed_secret": "adb5fabe51f5b45e83fdd91b71c92156fec4a63e", + "is_verified": false, + "line_number": 523 + } + ], + "kedro-datasets/tests/plotly/test_html_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/plotly/test_html_dataset.py", + "hashed_secret": "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", + "is_verified": false, + "line_number": 70 + } + ], + "kedro-datasets/tests/plotly/test_json_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/plotly/test_json_dataset.py", + "hashed_secret": "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", + "is_verified": false, + "line_number": 83 + } + ], + "kedro-datasets/tests/plotly/test_plotly_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/plotly/test_plotly_dataset.py", + "hashed_secret": "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", + "is_verified": false, + "line_number": 81 + } + ], + "kedro-datasets/tests/polars/test_csv_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/polars/test_csv_dataset.py", + "hashed_secret": "727d8ff68b6b550f2cf6e737b3cad5149c65fe5b", + "is_verified": false, + "line_number": 65 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/polars/test_csv_dataset.py", + "hashed_secret": "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", + "is_verified": false, + "line_number": 159 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/polars/test_csv_dataset.py", + "hashed_secret": "adb5fabe51f5b45e83fdd91b71c92156fec4a63e", + "is_verified": false, + "line_number": 351 + } + ], + "kedro-datasets/tests/polars/test_eager_polars_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/polars/test_eager_polars_dataset.py", + "hashed_secret": "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", + "is_verified": false, + "line_number": 126 + } + ], + "kedro-datasets/tests/polars/test_lazy_polars_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/polars/test_lazy_polars_dataset.py", + "hashed_secret": "727d8ff68b6b550f2cf6e737b3cad5149c65fe5b", + "is_verified": false, + "line_number": 93 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/polars/test_lazy_polars_dataset.py", + "hashed_secret": "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", + "is_verified": false, + "line_number": 198 + } + ], + "kedro-datasets/tests/snowflake/test_snowpark_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/snowflake/test_snowpark_dataset.py", + "hashed_secret": "1365dbfe676a193420ed7981184720b426ef2b7a", + "is_verified": false, + "line_number": 32 + } + ], + "kedro-datasets/tests/spark/test_spark_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/spark/test_spark_dataset.py", + "hashed_secret": "adb5fabe51f5b45e83fdd91b71c92156fec4a63e", + "is_verified": false, + "line_number": 42 + } + ], + "kedro-datasets/tests/spark/test_spark_jdbc_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/spark/test_spark_jdbc_dataset.py", + "hashed_secret": "4f4fa638cf19a2919f12e0105085c123ca5c5172", + "is_verified": false, + "line_number": 15 + } + ], + "kedro-datasets/tests/spark/test_spark_streaming_dataset.py": [ + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/spark/test_spark_streaming_dataset.py", + "hashed_secret": "adb5fabe51f5b45e83fdd91b71c92156fec4a63e", + 
"is_verified": false, + "line_number": 17 + }, + { + "type": "Secret Keyword", + "filename": "kedro-datasets/tests/spark/test_spark_streaming_dataset.py", + "hashed_secret": "727d8ff68b6b550f2cf6e737b3cad5149c65fe5b", + "is_verified": false, + "line_number": 64 + } + ] + }, + "generated_at": "2025-01-13T16:27:46Z" +} diff --git a/Makefile b/Makefile index c7946d605..e8c8a4e08 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ install-pip-setuptools: python -m pip install -U pip setuptools wheel lint: - pre-commit run -a --hook-stage manual ruff-$(plugin) && pre-commit run trailing-whitespace --all-files && pre-commit run end-of-file-fixer --all-files && pre-commit run check-yaml --all-files && pre-commit run check-added-large-files --all-files && pre-commit run check-case-conflict --all-files && pre-commit run check-merge-conflict --all-files && pre-commit run debug-statements --all-files && pre-commit run black-$(plugin) --all-files --hook-stage manual && pre-commit run secret_scan --all-files --hook-stage manual && pre-commit run bandit --all-files --hook-stage manual + pre-commit run -a --hook-stage manual ruff-$(plugin) && pre-commit run trailing-whitespace --all-files && pre-commit run end-of-file-fixer --all-files && pre-commit run check-yaml --all-files && pre-commit run check-added-large-files --all-files && pre-commit run check-case-conflict --all-files && pre-commit run check-merge-conflict --all-files && pre-commit run debug-statements --all-files && pre-commit run black-$(plugin) --all-files --hook-stage manual && pre-commit run bandit --all-files --hook-stage manual $(MAKE) mypy mypy: @@ -21,9 +21,6 @@ test: e2e-tests: cd $(plugin) && behave -secret-scan: - trufflehog --max_depth 1 --exclude_paths trufflehog-ignore.txt . - install-test-requirements: cd $(plugin) && uv pip install ".[test]" diff --git a/kedro-airflow/RELEASE.md b/kedro-airflow/RELEASE.md index 6bd0b7163..348945ac9 100755 --- a/kedro-airflow/RELEASE.md +++ b/kedro-airflow/RELEASE.md @@ -1,4 +1,5 @@ # Upcoming Release +* Replaced `trufflehog` with `detect-secrets` for detecting secrets within a code base. # Release 0.9.2 * Removed support for Python 3.8 diff --git a/kedro-airflow/pyproject.toml b/kedro-airflow/pyproject.toml index ec7563cdd..6ef8a8b40 100644 --- a/kedro-airflow/pyproject.toml +++ b/kedro-airflow/pyproject.toml @@ -38,9 +38,9 @@ test = [ lint = [ "bandit", "black~=22.0", + "detect-secrets~=1.5.0", "mypy~=1.0", "pre-commit>=2.9.2", - "trufflehog>=2.1.0, <3.0", "ruff~=0.0.290", # mypy requirements "types-PyYAML", diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 27df63f78..15c13da84 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -1,6 +1,8 @@ # Upcoming Release ## Major features and improvements +- Replaced `trufflehog` with `detect-secrets` for detecting secrets within a code base. + ## Bug fixes and other changes - Fix polars.CSVDataset `save` method on Windows using `utf-8` as default encoding. 
diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml index 3ee8eb9e9..1fcde25c6 100644 --- a/kedro-datasets/pyproject.toml +++ b/kedro-datasets/pyproject.toml @@ -270,11 +270,11 @@ lint = [ "bandit>=1.6.2, <2.0", "blacken-docs==1.9.2", "black~=22.0", + "detect-secrets~=1.5.0", "import-linter[toml]==1.2.6", "mypy~=1.0", "pre-commit>=2.9.2", "ruff~=0.0.290", - "trufflehog~=2.1", # mypy related dependencies "types-cachetools", "types-PyYAML", diff --git a/kedro-docker/RELEASE.md b/kedro-docker/RELEASE.md index f81181579..b7bab9313 100644 --- a/kedro-docker/RELEASE.md +++ b/kedro-docker/RELEASE.md @@ -1,4 +1,5 @@ # Upcoming Release +* Replaced `trufflehog` with `detect-secrets` for detecting secrets within a code base. # Release 0.6.2 diff --git a/kedro-docker/pyproject.toml b/kedro-docker/pyproject.toml index 15c8d04fc..b669a0e2d 100644 --- a/kedro-docker/pyproject.toml +++ b/kedro-docker/pyproject.toml @@ -39,9 +39,9 @@ test = [ lint = [ "bandit", "black~=22.0", + "detect-secrets~=1.5.0", "mypy~=1.0", "pre-commit>=2.9.2", - "trufflehog>=2.1.0, <3.0", "ruff~=0.0.290", ] diff --git a/kedro-telemetry/RELEASE.md b/kedro-telemetry/RELEASE.md index df7bb603a..1b4fce80f 100644 --- a/kedro-telemetry/RELEASE.md +++ b/kedro-telemetry/RELEASE.md @@ -1,4 +1,5 @@ # Upcoming release +* Replaced `trufflehog` with `detect-secrets` for detecting secrets within a code base. # Release 0.6.2 * Removed support for Python 3.8 diff --git a/kedro-telemetry/pyproject.toml b/kedro-telemetry/pyproject.toml index 45f9d995d..1f43f2315 100644 --- a/kedro-telemetry/pyproject.toml +++ b/kedro-telemetry/pyproject.toml @@ -35,9 +35,9 @@ test = [ lint = [ "bandit>=1.6.2, <2.0", "black~=22.0", + "detect-secrets~=1.5.0", "mypy~=1.0", "pre-commit>=2.9.2", - "trufflehog>=2.1.0, <3.0", "ruff~=0.0.290", # mypy requirements "types-requests", diff --git a/trufflehog-ignore.txt b/trufflehog-ignore.txt deleted file mode 100644 index 1929a2634..000000000 --- a/trufflehog-ignore.txt +++ /dev/null @@ -1,3 +0,0 @@ -kedro-telemetry/README.md -kedro-telemetry/RELEASE.md -kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py From 7ed49f96671aaad4bc042894f84fd92c13a28750 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Wed, 15 Jan 2025 07:43:25 -0700 Subject: [PATCH 06/24] build(datasets): use intersphinx over type_targets (#801) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Deepyaman Datta Signed-off-by: Juan Luis Cano Rodríguez Co-authored-by: Juan Luis Cano Rodríguez Signed-off-by: Richard Asselin --- kedro-datasets/docs/source/conf.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kedro-datasets/docs/source/conf.py b/kedro-datasets/docs/source/conf.py index 039658936..90c66bd83 100644 --- a/kedro-datasets/docs/source/conf.py +++ b/kedro-datasets/docs/source/conf.py @@ -102,6 +102,7 @@ intersphinx_mapping = { "kedro": ("https://docs.kedro.org/en/stable/", None), "python": ("https://docs.python.org/3.10/", None), + "requests": ("https://requests.readthedocs.io/en/stable/", None), } type_targets = { @@ -110,8 +111,6 @@ "kedro.io.AbstractDataset", "AbstractDataset", "kedro.io.core.Version", - "requests.auth.AuthBase", - "requests.models.Response", "google.oauth2.credentials.Credentials", "deltalake.table.Metadata", "DataCatalog", From fb9a0996fc9ba3b3c624e4b39c1b4478d4d00e1e Mon Sep 17 00:00:00 2001 From: ElenaKhaustova <157851531+ElenaKhaustova@users.noreply.github.com> Date: Wed, 22 Jan 2025 12:28:30 +0000 Subject: [PATCH 
07/24] fix(datasets): Add parameter to enable/disable lazy saving for `PartitionedDataset` (#978) * Replaced callable check Signed-off-by: Elena Khaustova * Updateds lazy_save test Signed-off-by: Elena Khaustova * Added test_callable_save Signed-off-by: Elena Khaustova * Fixed lint Signed-off-by: Elena Khaustova * Fixed docs links Signed-off-by: Elena Khaustova * Fixed all docs links Signed-off-by: Elena Khaustova * Updated release notes Signed-off-by: Elena Khaustova * Fixed all docs links Signed-off-by: Elena Khaustova * Fixed typo Signed-off-by: Elena Khaustova * Added argument to disable lazy saving Signed-off-by: Elena Khaustova * Removed save function argument Signed-off-by: Elena Khaustova * Updated unit test Signed-off-by: Elena Khaustova * Fixed lint Signed-off-by: Elena Khaustova * Updated related docs Signed-off-by: Elena Khaustova * Revert test changes Signed-off-by: Elena Khaustova * Updated baseline Signed-off-by: Elena Khaustova * Updated release notes Signed-off-by: Elena Khaustova * Updated release notes Signed-off-by: Elena Khaustova * Updated docstrings Signed-off-by: Elena Khaustova --------- Signed-off-by: Elena Khaustova Signed-off-by: Richard Asselin --- .secrets.baseline | 14 +++++------ kedro-datasets/RELEASE.md | 1 + .../kedro_datasets/dask/csv_dataset.py | 5 ++-- .../kedro_datasets/dask/parquet_dataset.py | 5 ++-- .../partitions/partitioned_dataset.py | 10 +++++++- .../partitions/test_partitioned_dataset.py | 23 +++++++++++++++++++ 6 files changed, 46 insertions(+), 12 deletions(-) diff --git a/.secrets.baseline b/.secrets.baseline index ce3799e06..c18f3f6f1 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -129,7 +129,7 @@ "filename": "kedro-datasets/kedro_datasets/dask/parquet_dataset.py", "hashed_secret": "6e1d66a1596528c308e601c10aa0b92d53606ab9", "is_verified": false, - "line_number": 71 + "line_number": 72 } ], "kedro-datasets/kedro_datasets/pandas/sql_dataset.py": [ @@ -340,35 +340,35 @@ "filename": "kedro-datasets/tests/partitions/test_partitioned_dataset.py", "hashed_secret": "76f747de912e8682e29a23cb506dd5bf0de080d2", "is_verified": false, - "line_number": 415 + "line_number": 438 }, { "type": "Secret Keyword", "filename": "kedro-datasets/tests/partitions/test_partitioned_dataset.py", "hashed_secret": "9027cc5a2c1321de60a2d71ccde6229d1152d6d3", "is_verified": false, - "line_number": 416 + "line_number": 439 }, { "type": "Secret Keyword", "filename": "kedro-datasets/tests/partitions/test_partitioned_dataset.py", "hashed_secret": "5dcbdf371f181b9b7a41a4be7be70f8cbee67da7", "is_verified": false, - "line_number": 452 + "line_number": 475 }, { "type": "Secret Keyword", "filename": "kedro-datasets/tests/partitions/test_partitioned_dataset.py", "hashed_secret": "727d8ff68b6b550f2cf6e737b3cad5149c65fe5b", "is_verified": false, - "line_number": 503 + "line_number": 526 }, { "type": "Secret Keyword", "filename": "kedro-datasets/tests/partitions/test_partitioned_dataset.py", "hashed_secret": "adb5fabe51f5b45e83fdd91b71c92156fec4a63e", "is_verified": false, - "line_number": 523 + "line_number": 546 } ], "kedro-datasets/tests/plotly/test_html_dataset.py": [ @@ -490,5 +490,5 @@ } ] }, - "generated_at": "2025-01-13T16:27:46Z" + "generated_at": "2025-01-15T15:25:24Z" } diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 15c13da84..820388766 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -1,6 +1,7 @@ # Upcoming Release ## Major features and improvements +- Added a parameter to enable/disable lazy saving for 
`PartitionedDataset`. - Replaced `trufflehog` with `detect-secrets` for detecting secrets within a code base. ## Bug fixes and other changes diff --git a/kedro-datasets/kedro_datasets/dask/csv_dataset.py b/kedro-datasets/kedro_datasets/dask/csv_dataset.py index bc5b5764b..b82bff15e 100644 --- a/kedro-datasets/kedro_datasets/dask/csv_dataset.py +++ b/kedro-datasets/kedro_datasets/dask/csv_dataset.py @@ -1,5 +1,6 @@ """``CSVDataset`` is a dataset used to load and save data to CSV files using Dask dataframe""" + from __future__ import annotations from copy import deepcopy @@ -13,7 +14,7 @@ class CSVDataset(AbstractDataset[dd.DataFrame, dd.DataFrame]): """``CSVDataset`` loads and saves data to comma-separated value file(s). It uses Dask remote data services to handle the corresponding load and save operations: - https://docs.dask.org/en/latest/how-to/connect-to-remote-data.html + https://docs.dask.org/en/stable/how-to/connect-to-remote-data.html Example usage for the `YAML API `_: @@ -73,7 +74,7 @@ def __init__( # noqa: PLR0913 credentials: Credentials required to get access to the underlying filesystem. E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Optional parameters to the backend file system driver: - https://docs.dask.org/en/latest/how-to/connect-to-remote-data.html#optional-parameters + https://docs.dask.org/en/stable/how-to/connect-to-remote-data.html#optional-parameters metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. """ diff --git a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py index 3b2dff73e..b3a81c632 100644 --- a/kedro-datasets/kedro_datasets/dask/parquet_dataset.py +++ b/kedro-datasets/kedro_datasets/dask/parquet_dataset.py @@ -1,5 +1,6 @@ """``ParquetDataset`` is a dataset used to load and save data to parquet files using Dask dataframe""" + from __future__ import annotations from copy import deepcopy @@ -14,7 +15,7 @@ class ParquetDataset(AbstractDataset[dd.DataFrame, dd.DataFrame]): """``ParquetDataset`` loads and saves data to parquet file(s). It uses Dask remote data services to handle the corresponding load and save operations: - https://docs.dask.org/en/latest/how-to/connect-to-remote-data.html + https://docs.dask.org/en/stable/how-to/connect-to-remote-data.html Example usage for the `YAML API `_: @@ -103,7 +104,7 @@ def __init__( # noqa: PLR0913 credentials: Credentials required to get access to the underlying filesystem. E.g. for ``GCSFileSystem`` it should look like `{"token": None}`. fs_args: Optional parameters to the backend file system driver: - https://docs.dask.org/en/latest/how-to/connect-to-remote-data.html#optional-parameters + https://docs.dask.org/en/stable/how-to/connect-to-remote-data.html#optional-parameters metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. 
""" diff --git a/kedro-datasets/kedro_datasets/partitions/partitioned_dataset.py b/kedro-datasets/kedro_datasets/partitions/partitioned_dataset.py index ea2461034..cf1069b1a 100644 --- a/kedro-datasets/kedro_datasets/partitions/partitioned_dataset.py +++ b/kedro-datasets/kedro_datasets/partitions/partitioned_dataset.py @@ -69,6 +69,7 @@ class PartitionedDataset(AbstractDataset[dict[str, Any], dict[str, Callable[[], sep: '\\t' index: true filename_suffix: '.dat' + save_lazily: True Example usage for the `Python API >> # This will create a folder `df_with_partition` and save multiple files >>> # with the dict key + filename_suffix as filename, i.e. 1.csv, 2.csv etc. @@ -152,6 +154,7 @@ def __init__( # noqa: PLR0913 load_args: dict[str, Any] | None = None, fs_args: dict[str, Any] | None = None, overwrite: bool = False, + save_lazily: bool = True, metadata: dict[str, Any] | None = None, ) -> None: """Creates a new instance of ``PartitionedDataset``. @@ -191,6 +194,10 @@ def __init__( # noqa: PLR0913 fs_args: Extra arguments to pass into underlying filesystem class constructor (e.g. `{"project": "my-project"}` for ``GCSFileSystem``). overwrite: If True, any existing partitions will be removed. + save_lazily: Parameter to enable/disable lazy saving, the default is True. Meaning that if callable object + is passed as data to save, the partition’s data will not be materialised until it is time to write. + Lazy saving example: + https://docs.kedro.org/en/stable/data/kedro_io.html#partitioned-dataset-lazy-saving metadata: Any arbitrary metadata. This is ignored by Kedro, but may be consumed by users or external plugins. @@ -206,6 +213,7 @@ def __init__( # noqa: PLR0913 self._overwrite = overwrite self._protocol = infer_storage_options(self._path)["protocol"] self._partition_cache: Cache = Cache(maxsize=1) + self._save_lazily = save_lazily self.metadata = metadata dataset = dataset if isinstance(dataset, dict) else {"type": dataset} @@ -311,7 +319,7 @@ def save(self, data: dict[str, Any]) -> None: # join the protocol back since tools like PySpark may rely on it kwargs[self._filepath_arg] = self._join_protocol(partition) dataset = self._dataset_type(**kwargs) # type: ignore - if callable(partition_data): + if callable(partition_data) and self._save_lazily: partition_data = partition_data() # noqa: PLW2901 dataset.save(partition_data) self._invalidate_caches() diff --git a/kedro-datasets/tests/partitions/test_partitioned_dataset.py b/kedro-datasets/tests/partitions/test_partitioned_dataset.py index f0126887d..9a49d3bb8 100644 --- a/kedro-datasets/tests/partitions/test_partitioned_dataset.py +++ b/kedro-datasets/tests/partitions/test_partitioned_dataset.py @@ -52,6 +52,10 @@ def filepath_csvs(tmp_path): ] +def original_data_callable(): + return pd.DataFrame({"foo": 42, "bar": ["a", "b", None]}) + + class FakeDataset: # pylint: disable=too-few-public-methods pass @@ -101,6 +105,25 @@ def test_save(self, dataset, local_csvs, suffix): reloaded_data = loaded_partitions[part_id]() assert_frame_equal(reloaded_data, original_data) + @pytest.mark.parametrize("dataset", ["kedro_datasets.pickle.PickleDataset"]) + @pytest.mark.parametrize("suffix", ["", ".csv"]) + def test_callable_save(self, dataset, local_csvs, suffix): + pds = PartitionedDataset( + path=str(local_csvs), + dataset=dataset, + filename_suffix=suffix, + save_lazily=False, + ) + + part_id = "new/data" + pds.save({part_id: original_data_callable}) + + assert (local_csvs / "new" / ("data" + suffix)).is_file() + loaded_partitions = pds.load() + assert 
part_id in loaded_partitions + reloaded_data = loaded_partitions[part_id]() + assert reloaded_data == original_data_callable + @pytest.mark.parametrize("dataset", LOCAL_DATASET_DEFINITION) @pytest.mark.parametrize("suffix", ["", ".csv"]) def test_lazy_save(self, dataset, local_csvs, suffix): From 4a15b80e955ce49ced8fc63d693142f844597edd Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Mon, 10 Feb 2025 08:39:25 -0700 Subject: [PATCH 08/24] fix(datasets): use kwarg for Ibis `read_*` methods (#1005) * fix(datasets): use kwarg for Ibis `read_*` methods Signed-off-by: Deepyaman Datta * Update RELEASE.md Signed-off-by: Deepyaman Datta --------- Signed-off-by: Deepyaman Datta Signed-off-by: Richard Asselin --- kedro-datasets/RELEASE.md | 1 + kedro-datasets/kedro_datasets/ibis/file_dataset.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 820388766..2d0480807 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -7,6 +7,7 @@ ## Bug fixes and other changes - Fix polars.CSVDataset `save` method on Windows using `utf-8` as default encoding. +- Made `table_name` a keyword argument in the `ibis.FileDataset` implementation to be compatible with Ibis 10.0. ## Breaking Changes diff --git a/kedro-datasets/kedro_datasets/ibis/file_dataset.py b/kedro-datasets/kedro_datasets/ibis/file_dataset.py index c3c43b74f..82ad0c29c 100644 --- a/kedro-datasets/kedro_datasets/ibis/file_dataset.py +++ b/kedro-datasets/kedro_datasets/ibis/file_dataset.py @@ -160,7 +160,7 @@ def connection(self) -> BaseBackend: def load(self) -> ir.Table: load_path = self._get_load_path() reader = getattr(self.connection, f"read_{self._file_format}") - return reader(load_path, self._table_name, **self._load_args) + return reader(load_path, table_name=self._table_name, **self._load_args) def save(self, data: ir.Table) -> None: save_path = self._get_save_path() From e6b22c3e9a7180a9a04cd677222483626e94d889 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Mon, 10 Feb 2025 09:09:15 -0700 Subject: [PATCH 09/24] build(datasets): pin PyArrow until `19.0.1` is out (#1006) * build(datasets): pin PyArrow until `19.0.1` is out Signed-off-by: Deepyaman Datta * chore(datasets): exclude `19.0.0` instead of bound Signed-off-by: Deepyaman Datta --------- Signed-off-by: Deepyaman Datta Co-authored-by: Merel Theisen <49397448+merelcht@users.noreply.github.com> Signed-off-by: Richard Asselin --- kedro-datasets/pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml index 1fcde25c6..7adad9136 100644 --- a/kedro-datasets/pyproject.toml +++ b/kedro-datasets/pyproject.toml @@ -200,7 +200,7 @@ docs = [ # Test requirements test = [ - "accelerate<0.32", # Temporary pin + "accelerate<0.32", # Temporary pin "adlfs~=2023.1", "behave==1.2.6", "biopython~=1.73", @@ -236,6 +236,7 @@ test = [ "polars[deltalake,xlsx2csv]>=1.0", "pyarrow>=1.0; python_version < '3.11'", "pyarrow>=7.0; python_version >= '3.11'", # Adding to avoid numpy build errors + "pyarrow!=19.0.0", # Temporary pin until https://github.com/apache/arrow/issues/45283 is fixed "pyodbc~=5.0", "pyspark>=3.0; python_version < '3.11'", "pyspark>=3.4; python_version >= '3.11'", From 0fed73ce47327b04c32ec2e8b2274e328a0ee1fe Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Mon, 10 Feb 2025 09:31:02 -0700 Subject: [PATCH 10/24] build(datasets): update list of extras for Ibis 10 (#1003) * build(datasets): update list of 
extras for Ibis 10 Signed-off-by: Deepyaman Datta * Update RELEASE.md Signed-off-by: Deepyaman Datta --------- Signed-off-by: Deepyaman Datta Signed-off-by: Richard Asselin --- kedro-datasets/RELEASE.md | 4 +++- kedro-datasets/pyproject.toml | 8 +++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 2d0480807..b480237f7 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -1,12 +1,14 @@ # Upcoming Release + ## Major features and improvements - Added a parameter to enable/disable lazy saving for `PartitionedDataset`. - Replaced `trufflehog` with `detect-secrets` for detecting secrets within a code base. +- Added `ibis-athena` and `ibis-databricks` extras for the backends added in Ibis 10.0. ## Bug fixes and other changes -- Fix polars.CSVDataset `save` method on Windows using `utf-8` as default encoding. +- Fixed `polars.CSVDataset` `save` method on Windows using `utf-8` as default encoding. - Made `table_name` a keyword argument in the `ibis.FileDataset` implementation to be compatible with Ibis 10.0. ## Breaking Changes diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml index 7adad9136..0327eea93 100644 --- a/kedro-datasets/pyproject.toml +++ b/kedro-datasets/pyproject.toml @@ -40,7 +40,7 @@ dask = ["kedro-datasets[dask-parquetdataset, dask-csvdataset]"] databricks-managedtabledataset = ["kedro-datasets[hdfs-base,s3fs-base]"] databricks = ["kedro-datasets[databricks-managedtabledataset]"] -geopandas-genericdataset = ["geopandas>=0.8.0, <2.0", "fiona >=1.8, <2.0"] +geopandas-genericdataset = ["geopandas>=0.8.0, <2.0", "fiona>=1.8, <2.0"] geopandas = ["kedro-datasets[geopandas-genericdataset]"] holoviews-holoviewswriter = ["holoviews>=1.13.0"] @@ -50,9 +50,11 @@ huggingface-hfdataset = ["datasets", "huggingface_hub"] huggingface-hftransformerpipelinedataset = ["transformers"] huggingface = ["kedro-datasets[huggingface-hfdataset,huggingface-hftransformerpipelinedataset]"] +ibis-athena = ["ibis-framework[athena]"] ibis-bigquery = ["ibis-framework[bigquery]"] ibis-clickhouse = ["ibis-framework[clickhouse]"] -ibis-dask = ["ibis-framework[dask]"] +ibis-dask = ["ibis-framework[dask]<10.0"] +ibis-databricks = ["ibis-framework[databricks]"] ibis-datafusion = ["ibis-framework[datafusion]"] ibis-druid = ["ibis-framework[druid]"] ibis-duckdb = ["ibis-framework[duckdb]"] @@ -62,7 +64,7 @@ ibis-impala = ["ibis-framework[impala]"] ibis-mssql = ["ibis-framework[mssql]"] ibis-mysql = ["ibis-framework[mysql]"] ibis-oracle = ["ibis-framework[oracle]"] -ibis-pandas = ["ibis-framework[pandas]"] +ibis-pandas = ["ibis-framework[pandas]<10.0"] ibis-polars = ["ibis-framework[polars]"] ibis-postgres = ["ibis-framework[postgres]"] ibis-pyspark = ["ibis-framework[pyspark]"] From a652129601ef7cb718f775076748a0c5559ab90c Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Mon, 10 Feb 2025 14:19:53 -0700 Subject: [PATCH 11/24] chore: remove internal devtools from release notes (#1004) * chore: remove internal devtools from release notes Signed-off-by: Deepyaman Datta * chore: remove internal devtools from release notes Signed-off-by: Deepyaman Datta * chore: remove internal devtools from release notes Signed-off-by: Deepyaman Datta * chore: remove internal devtools from release notes Signed-off-by: Deepyaman Datta --------- Signed-off-by: Deepyaman Datta Co-authored-by: Merel Theisen <49397448+merelcht@users.noreply.github.com> Signed-off-by: Richard Asselin --- kedro-airflow/RELEASE.md | 1 - 
kedro-datasets/RELEASE.md | 1 - kedro-docker/RELEASE.md | 1 - kedro-telemetry/RELEASE.md | 1 - 4 files changed, 4 deletions(-) diff --git a/kedro-airflow/RELEASE.md b/kedro-airflow/RELEASE.md index 348945ac9..6bd0b7163 100755 --- a/kedro-airflow/RELEASE.md +++ b/kedro-airflow/RELEASE.md @@ -1,5 +1,4 @@ # Upcoming Release -* Replaced `trufflehog` with `detect-secrets` for detecting secrets within a code base. # Release 0.9.2 * Removed support for Python 3.8 diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index b480237f7..dd24582de 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -3,7 +3,6 @@ ## Major features and improvements - Added a parameter to enable/disable lazy saving for `PartitionedDataset`. -- Replaced `trufflehog` with `detect-secrets` for detecting secrets within a code base. - Added `ibis-athena` and `ibis-databricks` extras for the backends added in Ibis 10.0. ## Bug fixes and other changes diff --git a/kedro-docker/RELEASE.md b/kedro-docker/RELEASE.md index b7bab9313..f81181579 100644 --- a/kedro-docker/RELEASE.md +++ b/kedro-docker/RELEASE.md @@ -1,5 +1,4 @@ # Upcoming Release -* Replaced `trufflehog` with `detect-secrets` for detecting secrets within a code base. # Release 0.6.2 diff --git a/kedro-telemetry/RELEASE.md b/kedro-telemetry/RELEASE.md index 1b4fce80f..df7bb603a 100644 --- a/kedro-telemetry/RELEASE.md +++ b/kedro-telemetry/RELEASE.md @@ -1,5 +1,4 @@ # Upcoming release -* Replaced `trufflehog` with `detect-secrets` for detecting secrets within a code base. # Release 0.6.2 * Removed support for Python 3.8 From c384f0c7dd431a3647ed98319a95f49644c68bee Mon Sep 17 00:00:00 2001 From: Richard Date: Thu, 13 Feb 2025 13:33:21 -0500 Subject: [PATCH 12/24] 998: Fixed case where MemoryDatasets in catalog wouldn't trigger `_is_memory_dataset` Signed-off-by: Richard Asselin --- kedro-airflow/kedro_airflow/grouping.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kedro-airflow/kedro_airflow/grouping.py b/kedro-airflow/kedro_airflow/grouping.py index 3890804ae..31151e6d2 100644 --- a/kedro-airflow/kedro_airflow/grouping.py +++ b/kedro-airflow/kedro_airflow/grouping.py @@ -1,6 +1,6 @@ from __future__ import annotations -from kedro.io import DataCatalog +from kedro.io import DataCatalog, MemoryDataset from kedro.pipeline.node import Node from kedro.pipeline.pipeline import Pipeline @@ -11,9 +11,11 @@ def _is_memory_dataset(catalog, dataset_name: str) -> bool: + """Return whether a dataset is a MemoryDataset or not.""" if dataset_name not in catalog: return True - return False + else: + return isinstance(catalog.datasets[dataset_name], MemoryDataset) def get_memory_datasets( From b9005e6f02e41c79fbff8d8bbabb42c9256df330 Mon Sep 17 00:00:00 2001 From: Richard Asselin Date: Fri, 14 Feb 2025 09:47:26 -0500 Subject: [PATCH 13/24] 998: Tests to ensure that MemoryDatasets are passed in mocked data catalog Signed-off-by: Richard Asselin --- kedro-airflow/tests/test_node_grouping.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/kedro-airflow/tests/test_node_grouping.py b/kedro-airflow/tests/test_node_grouping.py index 84f551545..e6b7faccb 100644 --- a/kedro-airflow/tests/test_node_grouping.py +++ b/kedro-airflow/tests/test_node_grouping.py @@ -3,7 +3,7 @@ from typing import Any import pytest -from kedro.io import AbstractDataset, DataCatalog +from kedro.io import AbstractDataset, DataCatalog, MemoryDataset from kedro.pipeline import Pipeline, node from 
kedro.pipeline.modular_pipeline import pipeline as modular_pipeline @@ -21,12 +21,15 @@ def _load(self): return [] -def mock_data_catalog(nodes: list[str], memory_nodes: set[str]) -> DataCatalog: +def mock_data_catalog(nodes: list[str], memory_nodes: set[str], memory_nodes_in_catalog: bool = False) -> DataCatalog: mock_catalog = DataCatalog() for dataset_name in nodes: if dataset_name not in memory_nodes: dataset = TestDataset() mock_catalog.add(dataset_name, dataset) + elif memory_nodes_in_catalog: + mock_catalog.add(dataset_name, MemoryDataset()) + return mock_catalog @@ -143,8 +146,16 @@ def test_group_memory_nodes( ), ], ) -def test_is_memory_dataset(nodes: list[str], memory_nodes: set[str]): - mock_catalog = mock_data_catalog(nodes, memory_nodes) +@pytest.mark.parametrize("memory_nodes_in_catalog", (True, False)) +def test_is_memory_dataset(nodes: list[str], memory_nodes: set[str], memory_nodes_in_catalog: bool): + """Tests for the `_is_memory_dataset` function. + + Args: + nodes: list of nodes to add to the catalog + memory_nodes: set of nodes which should be considered MemoryDatasets + memory_nodes_in_catalog: whether to add MemoryDatasets to the catalog or not + """ + mock_catalog = mock_data_catalog(nodes, memory_nodes, memory_nodes_in_catalog=memory_nodes_in_catalog) for node_name in nodes: if node_name in memory_nodes: assert _is_memory_dataset(mock_catalog, node_name) From 9fe874af4877806b9e4879442c0d47462af0e089 Mon Sep 17 00:00:00 2001 From: Richard Asselin Date: Fri, 14 Feb 2025 10:36:43 -0500 Subject: [PATCH 14/24] 998: Changelog Signed-off-by: Richard Asselin --- kedro-airflow/RELEASE.md | 1 + 1 file changed, 1 insertion(+) diff --git a/kedro-airflow/RELEASE.md b/kedro-airflow/RELEASE.md index 6bd0b7163..72032b0e1 100755 --- a/kedro-airflow/RELEASE.md +++ b/kedro-airflow/RELEASE.md @@ -1,4 +1,5 @@ # Upcoming Release +* Fixed case where MemoryDatasets in catalog wouldn't be collapsed correctly # Release 0.9.2 * Removed support for Python 3.8 From 4dd6619bcb45d117529fbb7dbff353cc5edfa965 Mon Sep 17 00:00:00 2001 From: Richard Date: Mon, 17 Feb 2025 06:57:32 -0500 Subject: [PATCH 15/24] 998: Linting fixes Signed-off-by: Richard Asselin --- kedro-airflow/tests/test_node_grouping.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kedro-airflow/tests/test_node_grouping.py b/kedro-airflow/tests/test_node_grouping.py index e6b7faccb..aa0b3c5f0 100644 --- a/kedro-airflow/tests/test_node_grouping.py +++ b/kedro-airflow/tests/test_node_grouping.py @@ -21,7 +21,9 @@ def _load(self): return [] -def mock_data_catalog(nodes: list[str], memory_nodes: set[str], memory_nodes_in_catalog: bool = False) -> DataCatalog: +def mock_data_catalog( + nodes: list[str], memory_nodes: set[str], memory_nodes_in_catalog: bool = False +) -> DataCatalog: mock_catalog = DataCatalog() for dataset_name in nodes: if dataset_name not in memory_nodes: @@ -30,7 +32,6 @@ def mock_data_catalog(nodes: list[str], memory_nodes: set[str], memory_nodes_in_ elif memory_nodes_in_catalog: mock_catalog.add(dataset_name, MemoryDataset()) - return mock_catalog @@ -147,7 +148,9 @@ def test_group_memory_nodes( ], ) @pytest.mark.parametrize("memory_nodes_in_catalog", (True, False)) -def test_is_memory_dataset(nodes: list[str], memory_nodes: set[str], memory_nodes_in_catalog: bool): +def test_is_memory_dataset( + nodes: list[str], memory_nodes: set[str], memory_nodes_in_catalog: bool +): """Tests for the `_is_memory_dataset` function. 
Args: @@ -155,7 +158,9 @@ def test_is_memory_dataset(nodes: list[str], memory_nodes: set[str], memory_node memory_nodes: set of nodes which should be considered MemoryDatasets memory_nodes_in_catalog: whether to add MemoryDatasets to the catalog or not """ - mock_catalog = mock_data_catalog(nodes, memory_nodes, memory_nodes_in_catalog=memory_nodes_in_catalog) + mock_catalog = mock_data_catalog( + nodes, memory_nodes, memory_nodes_in_catalog=memory_nodes_in_catalog + ) for node_name in nodes: if node_name in memory_nodes: assert _is_memory_dataset(mock_catalog, node_name) From 15a0e3cf2c846c6658e8e541b44d9407439cdfb1 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Mon, 10 Feb 2025 09:31:02 -0700 Subject: [PATCH 16/24] build(datasets): update list of extras for Ibis 10 (#1003) * build(datasets): update list of extras for Ibis 10 Signed-off-by: Deepyaman Datta * Update RELEASE.md Signed-off-by: Deepyaman Datta --------- Signed-off-by: Deepyaman Datta Signed-off-by: Richard Signed-off-by: Richard Asselin --- kedro-datasets/RELEASE.md | 1 + 1 file changed, 1 insertion(+) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index dd24582de..06a1fa468 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -4,6 +4,7 @@ - Added a parameter to enable/disable lazy saving for `PartitionedDataset`. - Added `ibis-athena` and `ibis-databricks` extras for the backends added in Ibis 10.0. +- Added `ibis-athena` and `ibis-databricks` extras for the backends added in Ibis 10.0. ## Bug fixes and other changes From 6060e67a0b105560cbefad5fcd7bd6f32f218394 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Mon, 10 Feb 2025 14:19:53 -0700 Subject: [PATCH 17/24] chore: remove internal devtools from release notes (#1004) * chore: remove internal devtools from release notes Signed-off-by: Deepyaman Datta * chore: remove internal devtools from release notes Signed-off-by: Deepyaman Datta * chore: remove internal devtools from release notes Signed-off-by: Deepyaman Datta * chore: remove internal devtools from release notes Signed-off-by: Deepyaman Datta --------- Signed-off-by: Deepyaman Datta Co-authored-by: Merel Theisen <49397448+merelcht@users.noreply.github.com> Signed-off-by: Richard Signed-off-by: Richard Asselin --- kedro-datasets/RELEASE.md | 1 - 1 file changed, 1 deletion(-) diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 06a1fa468..dd24582de 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -4,7 +4,6 @@ - Added a parameter to enable/disable lazy saving for `PartitionedDataset`. - Added `ibis-athena` and `ibis-databricks` extras for the backends added in Ibis 10.0. -- Added `ibis-athena` and `ibis-databricks` extras for the backends added in Ibis 10.0. 
## Bug fixes and other changes From e49f25dde291ac475a2cd5c96dfe68ca9f8a4097 Mon Sep 17 00:00:00 2001 From: Richard Asselin Date: Fri, 14 Feb 2025 09:47:26 -0500 Subject: [PATCH 18/24] 998: Tests to ensure that MemoryDatasets are passed in mocked data catalog Signed-off-by: Richard Signed-off-by: Richard Asselin --- kedro-airflow/tests/test_node_grouping.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kedro-airflow/tests/test_node_grouping.py b/kedro-airflow/tests/test_node_grouping.py index aa0b3c5f0..a2243fe7b 100644 --- a/kedro-airflow/tests/test_node_grouping.py +++ b/kedro-airflow/tests/test_node_grouping.py @@ -32,6 +32,7 @@ def mock_data_catalog( elif memory_nodes_in_catalog: mock_catalog.add(dataset_name, MemoryDataset()) + return mock_catalog @@ -148,9 +149,7 @@ def test_group_memory_nodes( ], ) @pytest.mark.parametrize("memory_nodes_in_catalog", (True, False)) -def test_is_memory_dataset( - nodes: list[str], memory_nodes: set[str], memory_nodes_in_catalog: bool -): +def test_is_memory_dataset(nodes: list[str], memory_nodes: set[str], memory_nodes_in_catalog: bool): """Tests for the `_is_memory_dataset` function. Args: From cffbaa27b4161899c5dc756a516a094c1cc3590e Mon Sep 17 00:00:00 2001 From: Richard Date: Mon, 17 Feb 2025 06:57:32 -0500 Subject: [PATCH 19/24] 998: Linting fixes Signed-off-by: Richard Signed-off-by: Richard Asselin --- kedro-airflow/tests/test_node_grouping.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kedro-airflow/tests/test_node_grouping.py b/kedro-airflow/tests/test_node_grouping.py index a2243fe7b..aa0b3c5f0 100644 --- a/kedro-airflow/tests/test_node_grouping.py +++ b/kedro-airflow/tests/test_node_grouping.py @@ -32,7 +32,6 @@ def mock_data_catalog( elif memory_nodes_in_catalog: mock_catalog.add(dataset_name, MemoryDataset()) - return mock_catalog @@ -149,7 +148,9 @@ def test_group_memory_nodes( ], ) @pytest.mark.parametrize("memory_nodes_in_catalog", (True, False)) -def test_is_memory_dataset(nodes: list[str], memory_nodes: set[str], memory_nodes_in_catalog: bool): +def test_is_memory_dataset( + nodes: list[str], memory_nodes: set[str], memory_nodes_in_catalog: bool +): """Tests for the `_is_memory_dataset` function. 
Args: From b07b8cb6e682db787818709b59cea3d9cbd0db1f Mon Sep 17 00:00:00 2001 From: Richard Date: Tue, 18 Feb 2025 07:17:52 -0500 Subject: [PATCH 20/24] 998: Changed function according to PR comments Signed-off-by: Richard Signed-off-by: Richard Asselin --- kedro-airflow/kedro_airflow/grouping.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kedro-airflow/kedro_airflow/grouping.py b/kedro-airflow/kedro_airflow/grouping.py index 31151e6d2..a5bcd0ffd 100644 --- a/kedro-airflow/kedro_airflow/grouping.py +++ b/kedro-airflow/kedro_airflow/grouping.py @@ -12,10 +12,9 @@ def _is_memory_dataset(catalog, dataset_name: str) -> bool: """Return whether a dataset is a MemoryDataset or not.""" - if dataset_name not in catalog: - return True - else: - return isinstance(catalog.datasets[dataset_name], MemoryDataset) + return dataset_name not in catalog or isinstance( + catalog._get_dataset(dataset_name), MemoryDataset + ) def get_memory_datasets( From 2ec88b0716f10490e820a9143e1fa01252e3aace Mon Sep 17 00:00:00 2001 From: Richard Asselin Date: Fri, 14 Feb 2025 09:47:26 -0500 Subject: [PATCH 21/24] 998: Tests to ensure that MemoryDatasets are passed in mocked data catalog Signed-off-by: Richard Asselin --- kedro-airflow/tests/test_node_grouping.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kedro-airflow/tests/test_node_grouping.py b/kedro-airflow/tests/test_node_grouping.py index aa0b3c5f0..a2243fe7b 100644 --- a/kedro-airflow/tests/test_node_grouping.py +++ b/kedro-airflow/tests/test_node_grouping.py @@ -32,6 +32,7 @@ def mock_data_catalog( elif memory_nodes_in_catalog: mock_catalog.add(dataset_name, MemoryDataset()) + return mock_catalog @@ -148,9 +149,7 @@ def test_group_memory_nodes( ], ) @pytest.mark.parametrize("memory_nodes_in_catalog", (True, False)) -def test_is_memory_dataset( - nodes: list[str], memory_nodes: set[str], memory_nodes_in_catalog: bool -): +def test_is_memory_dataset(nodes: list[str], memory_nodes: set[str], memory_nodes_in_catalog: bool): """Tests for the `_is_memory_dataset` function. Args: From 69aad578ee7da68c58d94c0749db94937d3f1715 Mon Sep 17 00:00:00 2001 From: Richard Date: Mon, 17 Feb 2025 06:57:32 -0500 Subject: [PATCH 22/24] 998: Linting fixes Signed-off-by: Richard Asselin Signed-off-by: Richard Signed-off-by: Richard Asselin --- kedro-airflow/tests/test_node_grouping.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kedro-airflow/tests/test_node_grouping.py b/kedro-airflow/tests/test_node_grouping.py index a2243fe7b..aa0b3c5f0 100644 --- a/kedro-airflow/tests/test_node_grouping.py +++ b/kedro-airflow/tests/test_node_grouping.py @@ -32,7 +32,6 @@ def mock_data_catalog( elif memory_nodes_in_catalog: mock_catalog.add(dataset_name, MemoryDataset()) - return mock_catalog @@ -149,7 +148,9 @@ def test_group_memory_nodes( ], ) @pytest.mark.parametrize("memory_nodes_in_catalog", (True, False)) -def test_is_memory_dataset(nodes: list[str], memory_nodes: set[str], memory_nodes_in_catalog: bool): +def test_is_memory_dataset( + nodes: list[str], memory_nodes: set[str], memory_nodes_in_catalog: bool +): """Tests for the `_is_memory_dataset` function. 
Args: From fbb80814bc45eb15c4b5c7292e746587e62abb88 Mon Sep 17 00:00:00 2001 From: Richard Asselin Date: Tue, 18 Feb 2025 10:11:25 -0500 Subject: [PATCH 23/24] 998: Tweaked release Signed-off-by: Richard Asselin --- kedro-airflow/RELEASE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-airflow/RELEASE.md b/kedro-airflow/RELEASE.md index 72032b0e1..d5d2333df 100755 --- a/kedro-airflow/RELEASE.md +++ b/kedro-airflow/RELEASE.md @@ -1,5 +1,5 @@ # Upcoming Release -* Fixed case where MemoryDatasets in catalog wouldn't be collapsed correctly +* Fixed case where MemoryDatasets in catalog wouldn't be detected correctly # Release 0.9.2 * Removed support for Python 3.8 From 8eb9beba4a3b2b443c6e8dfe90de47861000da30 Mon Sep 17 00:00:00 2001 From: Richard Date: Fri, 21 Feb 2025 11:54:54 -0500 Subject: [PATCH 24/24] Update RELEASE.md Co-authored-by: ElenaKhaustova <157851531+ElenaKhaustova@users.noreply.github.com> Signed-off-by: Richard --- kedro-airflow/RELEASE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-airflow/RELEASE.md b/kedro-airflow/RELEASE.md index d5d2333df..6bee29ad2 100755 --- a/kedro-airflow/RELEASE.md +++ b/kedro-airflow/RELEASE.md @@ -1,5 +1,5 @@ # Upcoming Release -* Fixed case where MemoryDatasets in catalog wouldn't be detected correctly +* Fixed check whether a dataset is a `MemoryDataset`. # Release 0.9.2 * Removed support for Python 3.8
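
Usage sketch for the `save_lazily` flag added to `PartitionedDataset` in PATCH 07/24. This is a minimal, illustrative example (the paths, partition keys and dataframe contents are made up, not taken from the patches): with the default `save_lazily=True` a callable passed as partition data is invoked at save time, while `save_lazily=False` hands the callable itself to the underlying dataset.

import pandas as pd

from kedro_datasets.partitions import PartitionedDataset


def build_partition() -> pd.DataFrame:
    # Expensive computation, deferred until save time when lazy saving is on.
    return pd.DataFrame({"foo": [42], "bar": ["a"]})


# Default behaviour: the callable is materialised (called) before saving,
# so pandas.CSVDataset receives a DataFrame.
lazy_pds = PartitionedDataset(
    path="data/07_model_output/partitions",  # illustrative path
    dataset="pandas.CSVDataset",
    filename_suffix=".csv",
)
lazy_pds.save({"part_1": build_partition})

# With save_lazily=False the callable is saved as-is, e.g. pickled by
# pickle.PickleDataset, matching the new test_callable_save test.
eager_pds = PartitionedDataset(
    path="data/07_model_output/functions",  # illustrative path
    dataset="pickle.PickleDataset",
    save_lazily=False,
)
eager_pds.save({"part_1": build_partition})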
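
PATCH 08/24 switches `ibis.FileDataset` to pass `table_name` as a keyword argument when calling the backend's `read_*` method, for compatibility with Ibis 10.0. A rough sketch of the call the dataset now effectively makes, assuming a DuckDB backend and a parquet file (the connection, path and table name below are illustrative):

import ibis

con = ibis.duckdb.connect()  # illustrative backend; FileDataset builds this from its connection config
reader = getattr(con, "read_parquet")  # f"read_{file_format}" inside the dataset
table = reader("data/01_raw/example.parquet", table_name="example")  # table_name now passed as a keyword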
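
The kedro-airflow fix in PATCHES 12-20 changes `_is_memory_dataset` so that datasets explicitly registered as `MemoryDataset` are recognised, not only datasets missing from the catalog. A minimal sketch of the resulting behaviour (the dataset names are illustrative):

from kedro.io import DataCatalog, MemoryDataset

from kedro_airflow.grouping import _is_memory_dataset

catalog = DataCatalog({"in_memory": MemoryDataset()})

# Explicitly registered MemoryDataset: previously returned False, now True.
assert _is_memory_dataset(catalog, "in_memory")

# A dataset absent from the catalog is still treated as in-memory.
assert _is_memory_dataset(catalog, "model_input")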