diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md index 6769730f7..e095c2b2e 100755 --- a/kedro-datasets/RELEASE.md +++ b/kedro-datasets/RELEASE.md @@ -3,14 +3,14 @@ * Moved `PartitionedDataSet` and `IncrementalDataSet` from the core Kedro repo to `kedro-datasets` and renamed to `PartitionedDataset` and `IncrementalDataset`. ## Bug fixes and other changes +* Fix erroneous warning when using a cloud protocol file path with SparkDataset on Databricks. * Updated `PickleDataset` to explicitly mention `cloudpickle` support. + ## Upcoming deprecations for Kedro-Datasets 2.0.0 ## Community contributions Many thanks to the following Kedroids for contributing PRs to this release: * [PtrBld](https://github.com/PtrBld) - -## Community contributions -Many thanks to the following Kedroids for contributing PRs to this release: +* [Alistair McKelvie](https://github.com/alamastor) * [Felix Wittmann](https://github.com/hfwittmann) # Release 1.7.1 diff --git a/kedro-datasets/kedro_datasets/spark/spark_dataset.py b/kedro-datasets/kedro_datasets/spark/spark_dataset.py index d83e3227a..58df800c8 100644 --- a/kedro-datasets/kedro_datasets/spark/spark_dataset.py +++ b/kedro-datasets/kedro_datasets/spark/spark_dataset.py @@ -14,7 +14,12 @@ import fsspec from hdfs import HdfsError, InsecureClient -from kedro.io.core import Version, get_filepath_str, get_protocol_and_path +from kedro.io.core import ( + CLOUD_PROTOCOLS, + Version, + get_filepath_str, + get_protocol_and_path, +) from pyspark.sql import DataFrame, SparkSession from pyspark.sql.types import StructType from pyspark.sql.utils import AnalysisException @@ -284,7 +289,11 @@ def __init__( # noqa: PLR0913 glob_function = None self.metadata = metadata - if not filepath.startswith("/dbfs/") and _deployed_on_databricks(): + if ( + not filepath.startswith("/dbfs/") + and fs_prefix not in (protocol + "://" for protocol in CLOUD_PROTOCOLS) + and _deployed_on_databricks() + ): logger.warning( "Using SparkDataset on 
Databricks without the `/dbfs/` prefix in the " "filepath is a known source of error. You must add this prefix to %s", diff --git a/kedro-datasets/tests/spark/test_spark_dataset.py b/kedro-datasets/tests/spark/test_spark_dataset.py index 7970b4ce9..032c2a0ee 100644 --- a/kedro-datasets/tests/spark/test_spark_dataset.py +++ b/kedro-datasets/tests/spark/test_spark_dataset.py @@ -495,6 +495,12 @@ def test_dbfs_prefix_warning_on_databricks_no_prefix(self, monkeypatch, caplog): SparkDataset(filepath=filepath) assert expected_message in caplog.text + def test_dbfs_prefix_warning_databricks_s3(self, monkeypatch, caplog): + # test that warning is not raised when on Databricks using an s3 path + monkeypatch.setenv("DATABRICKS_RUNTIME_VERSION", "7.3") + SparkDataset(filepath="s3://my_project/data/02_intermediate/processed_data") + assert caplog.text == "" + class TestSparkDatasetVersionedLocal: def test_no_version(self, versioned_dataset_local):