fix(datasets): Various fixes (#608)
* Try to use uv on Read the Docs

Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com>

* Pin dask test requirement as the latest release introduced installation issues

Signed-off-by: Merel Theisen <merel.theisen@quantumblack.com>

* Try pinning Dask in main requirements too

Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com>

* Use smaller model for Hugging Face doctest

Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com>

* Install PyTorch with transformers for testing

Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com>

* More robust doctest for Hugging Face transformer

Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com>

* Whitespace fix

Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com>

* Fix TensorFlow dataset for modern versions of Keras

Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com>

* Fix TensorFlow model loading and saving

Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com>

* Fix TensorFlow tests

Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com>

---------

Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com>
Signed-off-by: Merel Theisen <merel.theisen@quantumblack.com>
Co-authored-by: Merel Theisen <merel.theisen@quantumblack.com>
astrojuanlu and merelcht authored Mar 15, 2024
1 parent fa3d842 commit e308cee
Showing 5 changed files with 32 additions and 53 deletions.
10 changes: 3 additions & 7 deletions kedro-datasets/.readthedocs.yaml
@@ -11,14 +11,10 @@ build:
     python: "3.9"
   jobs:
     pre_install:
-      # pip==23.2 breaks pip-tools<7.0, and pip-tools>=7.0 does not support Python 3.7
-      # pip==23.3 breaks dependency resolution
-      - python -m pip install -U "pip>=21.2,<23.2"
-      # These are technically installation steps, due to RTD's limit we need to inject the installation earlier.
-      - python -m pip install --upgrade --no-cache-dir sphinx readthedocs-sphinx-ext
-      - python -m pip install --upgrade --upgrade-strategy only-if-needed --no-cache-dir ./kedro-datasets[docs,test]
+      - python -m pip install "uv==0.1.13"
+      - uv pip install --system --upgrade sphinx readthedocs-sphinx-ext "kedro-datasets[docs,test] @ ./kedro-datasets"
+      - uv pip freeze
     pre_build:
-      - pip freeze
       - python -m sphinx -WETan -j auto -D language=en -b linkcheck -d kedro-datasets/_build/doctrees kedro-datasets/docs/source kedro-datasets/_build/linkcheck
 # Build documentation in the docs/ directory with Sphinx
 sphinx:

6 changes: 3 additions & 3 deletions kedro-datasets/kedro_datasets/huggingface/transformer_pipeline_dataset.py
@@ -29,9 +29,9 @@ class HFTransformerPipelineDataset(AbstractDataset):
     .. code-block:: pycon

         >>> from kedro_datasets.huggingface import HFTransformerPipelineDataset
-        >>> dataset = HFTransformerPipelineDataset(task="text-classification", model_name="papluca/xlm-roberta-base-language-detection")
-        >>> detector = dataset.load()
-        >>> assert detector("Ceci n'est pas une pipe")[0]["label"] == "fr"
+        >>> dataset = HFTransformerPipelineDataset(task="text-classification", model_name="prajjwal1/bert-tiny")
+        >>> model = dataset.load()
+        >>> assert model("Hello world")[0]["label"].startswith("LABEL_")
     """

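For context, the new doctest corresponds to usage along these lines (a sketch, assuming transformers with a torch backend is installed; prajjwal1/bert-tiny ships no fine-tuned classification head, so only the LABEL_ prefix of the output is stable):

    from kedro_datasets.huggingface import HFTransformerPipelineDataset

    # The tiny model keeps the doctest fast; its classification head is randomly
    # initialised, so we only assert on the generic label format, not the prediction.
    dataset = HFTransformerPipelineDataset(
        task="text-classification",
        model_name="prajjwal1/bert-tiny",
    )
    model = dataset.load()
    result = model("Hello world")[0]
    assert result["label"].startswith("LABEL_")  # e.g. LABEL_0 or LABEL_1
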
29 changes: 15 additions & 14 deletions kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py
@@ -17,6 +17,7 @@
 )

 TEMPORARY_H5_FILE = "tmp_tensorflow_model.h5"
+TEMPORARY_KERAS_FILE = "tmp_tensorflow_model.keras"


 class TensorFlowModelDataset(AbstractVersionedDataset[tf.keras.Model, tf.keras.Model]):
@@ -53,7 +54,7 @@ class TensorFlowModelDataset(AbstractVersionedDataset[tf.keras.Model, tf.keras.M
         >>> dataset = TensorFlowModelDataset(filepath=tmp_path / "data/06_models/tensorflow_model.h5")
         >>> model = tf.keras.Sequential([tf.keras.layers.Dense(5, input_shape=(3,)),tf.keras.layers.Softmax()])
         >>>
-        >>> # x = tf.random.uniform((10, 3))
+        >>> # x = tf.random.uniform((10, 3))
         >>> # predictions = model.predict(x)
         >>>
         >>> dataset.save(model)
@@ -62,7 +63,7 @@ class TensorFlowModelDataset(AbstractVersionedDataset[tf.keras.Model, tf.keras.M
     """

     DEFAULT_LOAD_ARGS: dict[str, Any] = {}
-    DEFAULT_SAVE_ARGS: dict[str, Any] = {"save_format": "tf"}
+    DEFAULT_SAVE_ARGS: dict[str, Any] = {}

     def __init__( # noqa: PLR0913
         self,

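The emptied DEFAULT_SAVE_ARGS reflects that modern Keras infers the serialization format from the file extension rather than a save_format argument. A minimal sketch, assuming TensorFlow 2.16+ where tf.keras is Keras 3:

    import tensorflow as tf

    model = tf.keras.Sequential([tf.keras.layers.Dense(5, input_shape=(3,)), tf.keras.layers.Softmax()])

    model.save("model.keras")  # native Keras format, chosen from the .keras extension
    model.save("model.h5")     # legacy HDF5 format, chosen from the .h5 extension
    # model.save("model", save_format="tf")  # Keras 3 no longer accepts the old save_format argument
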
@@ -134,12 +135,14 @@ def __init__( # noqa: PLR0913
     def _load(self) -> tf.keras.Model:
         load_path = get_filepath_str(self._get_load_path(), self._protocol)

-        with tempfile.TemporaryDirectory(prefix=self._tmp_prefix) as path:
+        with tempfile.TemporaryDirectory(prefix=self._tmp_prefix) as tempdir:
             if self._is_h5:
-                path = str(PurePath(path) / TEMPORARY_H5_FILE)  # noqa: PLW2901
-                self._fs.copy(load_path, path)
+                path = str(PurePath(tempdir) / TEMPORARY_H5_FILE)  # noqa: PLW2901
             else:
-                self._fs.get(load_path + "/", path, recursive=True)
+                # We assume .keras
+                path = str(PurePath(tempdir) / TEMPORARY_KERAS_FILE)  # noqa: PLW2901
+
+            self._fs.copy(load_path, path)

             # Pass the local temporary directory/file path to keras.load_model
             device_name = self._load_args.pop("tf_device", None)
@@ -153,20 +156,18 @@ def _load(self) -> tf.keras.Model:
     def _save(self, data: tf.keras.Model) -> None:
         save_path = get_filepath_str(self._get_save_path(), self._protocol)

-        with tempfile.TemporaryDirectory(prefix=self._tmp_prefix) as path:
+        with tempfile.TemporaryDirectory(prefix=self._tmp_prefix) as tempdir:
             if self._is_h5:
-                path = str(PurePath(path) / TEMPORARY_H5_FILE)  # noqa: PLW2901
+                path = str(PurePath(tempdir) / TEMPORARY_H5_FILE)  # noqa: PLW2901
+            else:
+                # We assume .keras
+                path = str(PurePath(tempdir) / TEMPORARY_KERAS_FILE)  # noqa: PLW2901

             tf.keras.models.save_model(data, path, **self._save_args)

             # Use fsspec to take from local tempfile directory/file and
             # put in ArbitraryFileSystem
-            if self._is_h5:
-                self._fs.copy(path, save_path)
-            else:
-                if self._fs.exists(save_path):
-                    self._fs.rm(save_path, recursive=True)
-                self._fs.put(path, save_path, recursive=True)
+            self._fs.copy(path, save_path)

     def _exists(self) -> bool:
         try:

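Taken together, the dataset now stages the model as a single temporary .h5 or .keras file and copies that file to the target path, so the configured filepath drives the format. A usage sketch under those assumptions (local filesystem, TensorFlow 2.16+; the HDF5 branch is presumably selected from the filepath's .h5 suffix):

    import tensorflow as tf
    from kedro_datasets.tensorflow import TensorFlowModelDataset

    model = tf.keras.Sequential([tf.keras.layers.Dense(5, input_shape=(3,)), tf.keras.layers.Softmax()])

    # Native Keras archive: staged as tmp_tensorflow_model.keras, then copied to the target path.
    keras_dataset = TensorFlowModelDataset(filepath="tensorflow_model.keras")
    keras_dataset.save(model)
    reloaded = keras_dataset.load()

    # Legacy HDF5: staged as tmp_tensorflow_model.h5 instead.
    h5_dataset = TensorFlowModelDataset(filepath="tensorflow_model.h5")
    h5_dataset.save(model)
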
6 changes: 3 additions & 3 deletions kedro-datasets/pyproject.toml
@@ -33,7 +33,7 @@ api = ["kedro-datasets[api-apidataset]"]
 biosequence-biosequencedataset = ["biopython~=1.73"]
 biosequence = ["kedro-datasets[biosequence-biosequencedataset]"]

-dask-parquetdataset = ["dask[complete]>=2021.10", "triad>=0.6.7, <1.0"]
+dask-parquetdataset = ["dask[complete]>=2021.10, <2024.3", "triad>=0.6.7, <1.0"]
 dask = ["kedro-datasets[dask-parquetdataset]"]

 databricks-managedtabledataset = ["kedro-datasets[spark-base,pandas-base,delta-base]"]
@@ -171,7 +171,7 @@ test = [
     "cloudpickle<=2.0.0",
     "compress-pickle[lz4]~=2.1.0",
     "coverage[toml]",
-    "dask[complete]>=2021.10",
+    "dask[complete]>=2021.10, <2024.3",
     "delta-spark>=1.0, <3.0",
     "deltalake>=0.10.0, <0.15.2", # temporary pin as 0.15.2 breaks some of our tests
     "dill~=0.3.1",

@@ -232,7 +232,7 @@ test = [
     # huggingface
     "datasets",
     "huggingface_hub",
-    "transformers",
+    "transformers[torch]",
 ]

 # All requirements

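Switching the test requirement to transformers[torch] pulls in a PyTorch backend, which the text-classification pipeline used in the Hugging Face doctest needs at runtime. A sanity check under that assumption might look like:

    import torch
    from transformers import pipeline

    # Without a torch (or TensorFlow) backend, constructing the pipeline would fail.
    classifier = pipeline("text-classification", model="prajjwal1/bert-tiny")
    print(torch.__version__)
    print(classifier("Hello world"))  # e.g. [{'label': 'LABEL_0', 'score': ...}]
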
34 changes: 8 additions & 26 deletions kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py
@@ -46,7 +46,7 @@ def tensorflow_model_dataset():

 @pytest.fixture
 def filepath(tmp_path):
-    return (tmp_path / "test_tf").as_posix()
+    return (tmp_path / "test_tf.h5").as_posix()


 @pytest.fixture
@@ -151,7 +151,7 @@ def test_save_and_load(self, tf_model_dataset, dummy_tf_base_model, dummy_x_test
         np.testing.assert_allclose(predictions, new_predictions, rtol=1e-6, atol=1e-6)

         assert tf_model_dataset._load_args == {}
-        assert tf_model_dataset._save_args == {"save_format": "tf"}
+        assert tf_model_dataset._save_args == {}

     def test_load_missing_model(self, tf_model_dataset):
         """Test error message when trying to load missing model."""
@@ -172,7 +172,7 @@ def test_hdf5_save_format(
     ):
         """Test TensorFlowModelDataset can save TF graph models in HDF5 format"""
         hdf5_dataset = tensorflow_model_dataset(
-            filepath=filepath, save_args={"save_format": "h5"}
+            filepath=filepath,
         )

         predictions = dummy_tf_base_model.predict(dummy_x_test)
@@ -193,6 +193,8 @@ def test_unused_subclass_model_hdf5_save_format(
     ):
         """Test TensorFlowModelDataset cannot save subclassed user models in HDF5 format

+        FIXME: What is this test doing??
+
         Subclassed model

         From TF docs
@@ -201,22 +203,15 @@
         create its weights.
         """
         hdf5_dataset = tensorflow_model_dataset(
-            filepath=filepath, save_args={"save_format": "h5"}
+            filepath=filepath,
         )
         # demonstrating is a working model
         dummy_tf_subclassed_model.fit(
             dummy_x_train, dummy_y_train, batch_size=64, epochs=1
         )
         dummy_tf_subclassed_model.predict(dummy_x_test)
-        pattern = (
-            r"Saving the model to HDF5 format requires the model to be a Functional model or a "
-            r"Sequential model. It does not work for subclassed models, because such models are "
-            r"defined via the body of a Python method, which isn\'t safely serializable. Consider "
-            r"saving to the Tensorflow SavedModel format \(by setting save_format=\"tf\"\) "
-            r"or using `save_weights`."
-        )
-        with pytest.raises(DatasetError, match=pattern):
-            hdf5_dataset.save(dummy_tf_subclassed_model)
+
+        hdf5_dataset.save(dummy_tf_subclassed_model)  # Nothing happens

     @pytest.mark.parametrize(
         "filepath,instance_type",
@@ -328,7 +323,6 @@ def test_hdf5_save_format(
             HDF5 format"""
         hdf5_dataset = tensorflow_model_dataset(
             filepath=filepath,
-            save_args={"save_format": "h5"},
             version=Version(load_version, save_version),
         )

@@ -408,18 +402,6 @@ def test_version_str_repr(self, tf_model_dataset, versioned_tf_model_dataset):
         assert "protocol" in str(versioned_tf_model_dataset)
         assert "save_args" in str(versioned_tf_model_dataset)

-    def test_versioning_existing_dataset(
-        self, tf_model_dataset, versioned_tf_model_dataset, dummy_tf_base_model
-    ):
-        """Check behavior when attempting to save a versioned dataset on top of an
-        already existing (non-versioned) dataset. Note: because TensorFlowModelDataset
-        saves to a directory even if non-versioned, an error is not expected."""
-        tf_model_dataset.save(dummy_tf_base_model)
-        assert tf_model_dataset.exists()
-        assert tf_model_dataset._filepath == versioned_tf_model_dataset._filepath
-        versioned_tf_model_dataset.save(dummy_tf_base_model)
-        assert versioned_tf_model_dataset.exists()
-
     def test_save_and_load_with_device(
         self,
         dummy_tf_base_model,

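The reworked tests reduce to round-trips of this shape (a sketch of the pattern rather than the actual test code; assumes TensorFlow and kedro-datasets are installed, with tmp_path being pytest's built-in fixture):

    import numpy as np
    import tensorflow as tf
    from kedro_datasets.tensorflow import TensorFlowModelDataset


    def test_round_trip(tmp_path):
        dataset = TensorFlowModelDataset(filepath=(tmp_path / "model.h5").as_posix())
        model = tf.keras.Sequential(
            [tf.keras.layers.Dense(5, input_shape=(3,)), tf.keras.layers.Softmax()]
        )
        x = tf.random.uniform((10, 3))
        predictions = model.predict(x)

        dataset.save(model)
        reloaded = dataset.load()

        # Loaded weights must reproduce the original predictions.
        np.testing.assert_allclose(predictions, reloaded.predict(x), rtol=1e-6, atol=1e-6)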
