fix(datasets): Various fixes (#608)
* Try to use uv on Read the Docs

Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com>

* Pin dask test requirement as the latest release introduced installation issues

Signed-off-by: Merel Theisen <merel.theisen@quantumblack.com>

* Try pinning Dask in main requirements too

Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com>

* Use smaller model for Hugging Face doctest

Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com>

* Install PyTorch with transformers for testing

Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com>

* More robust doctest for Hugging Face transformer

Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com>

* Whitespace fix

Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com>

* Fix TensorFlow dataset for modern versions of Keras

Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com>

* Fix TensorFlow model loading and saving

Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com>

* Fix TensorFlow tests

Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com>

---------

Signed-off-by: Juan Luis Cano Rodríguez <juan_luis_cano@mckinsey.com>
Signed-off-by: Merel Theisen <merel.theisen@quantumblack.com>
Co-authored-by: Merel Theisen <merel.theisen@quantumblack.com>
astrojuanlu and merelcht authored Mar 15, 2024
1 parent fa3d842 commit e308cee
Showing 5 changed files with 32 additions and 53 deletions.
10 changes: 3 additions & 7 deletions kedro-datasets/.readthedocs.yaml
@@ -11,14 +11,10 @@ build:
     python: "3.9"
   jobs:
     pre_install:
-      # pip==23.2 breaks pip-tools<7.0, and pip-tools>=7.0 does not support Python 3.7
-      # pip==23.3 breaks dependency resolution
-      - python -m pip install -U "pip>=21.2,<23.2"
-      # These are technically installation steps, due to RTD's limit we need to inject the installation earlier.
-      - python -m pip install --upgrade --no-cache-dir sphinx readthedocs-sphinx-ext
-      - python -m pip install --upgrade --upgrade-strategy only-if-needed --no-cache-dir ./kedro-datasets[docs,test]
+      - python -m pip install "uv==0.1.13"
+      - uv pip install --system --upgrade sphinx readthedocs-sphinx-ext "kedro-datasets[docs,test] @ ./kedro-datasets"
+      - uv pip freeze
     pre_build:
-      - pip freeze
       - python -m sphinx -WETan -j auto -D language=en -b linkcheck -d kedro-datasets/_build/doctrees kedro-datasets/docs/source kedro-datasets/_build/linkcheck
 # Build documentation in the docs/ directory with Sphinx
 sphinx:

6 changes: 3 additions & 3 deletions kedro-datasets/kedro_datasets/huggingface/transformer_pipeline_dataset.py
@@ -29,9 +29,9 @@ class HFTransformerPipelineDataset(AbstractDataset):
     .. code-block:: pycon

         >>> from kedro_datasets.huggingface import HFTransformerPipelineDataset
-        >>> dataset = HFTransformerPipelineDataset(task="text-classification", model_name="papluca/xlm-roberta-base-language-detection")
-        >>> detector = dataset.load()
-        >>> assert detector("Ceci n'est pas une pipe")[0]["label"] == "fr"
+        >>> dataset = HFTransformerPipelineDataset(task="text-classification", model_name="prajjwal1/bert-tiny")
+        >>> model = dataset.load()
+        >>> assert model("Hello world")[0]["label"].startswith("LABEL_")
     """

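For context, the new doctest corresponds to usage along these lines (a sketch, assuming transformers with a torch backend is installed; prajjwal1/bert-tiny ships no fine-tuned classification head, so only the LABEL_ prefix of the output is stable):

    from kedro_datasets.huggingface import HFTransformerPipelineDataset

    # The tiny model keeps the doctest fast; its classification head is randomly
    # initialised, so we only assert on the generic label format, not the prediction.
    dataset = HFTransformerPipelineDataset(
        task="text-classification",
        model_name="prajjwal1/bert-tiny",
    )
    model = dataset.load()
    result = model("Hello world")[0]
    assert result["label"].startswith("LABEL_")  # e.g. LABEL_0 or LABEL_1
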
29 changes: 15 additions & 14 deletions kedro-datasets/kedro_datasets/tensorflow/tensorflow_model_dataset.py
@@ -17,6 +17,7 @@
 )

 TEMPORARY_H5_FILE = "tmp_tensorflow_model.h5"
+TEMPORARY_KERAS_FILE = "tmp_tensorflow_model.keras"


 class TensorFlowModelDataset(AbstractVersionedDataset[tf.keras.Model, tf.keras.Model]):
@@ -53,7 +54,7 @@ class TensorFlowModelDataset(AbstractVersionedDataset[tf.keras.Model, tf.keras.M
         >>> dataset = TensorFlowModelDataset(filepath=tmp_path / "data/06_models/tensorflow_model.h5")
         >>> model = tf.keras.Sequential([tf.keras.layers.Dense(5, input_shape=(3,)),tf.keras.layers.Softmax()])
         >>>
-        >>> # x = tf.random.uniform((10, 3))
+        >>> # x = tf.random.uniform((10, 3))
         >>> # predictions = model.predict(x)
         >>>
         >>> dataset.save(model)
@@ -62,7 +63,7 @@ class TensorFlowModelDataset(AbstractVersionedDataset[tf.keras.Model, tf.keras.M
     """

     DEFAULT_LOAD_ARGS: dict[str, Any] = {}
-    DEFAULT_SAVE_ARGS: dict[str, Any] = {"save_format": "tf"}
+    DEFAULT_SAVE_ARGS: dict[str, Any] = {}

     def __init__( # noqa: PLR0913
         self,

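The emptied DEFAULT_SAVE_ARGS reflects that modern Keras infers the serialization format from the file extension rather than a save_format argument. A minimal sketch, assuming TensorFlow 2.16+ where tf.keras is Keras 3:

    import tensorflow as tf

    model = tf.keras.Sequential([tf.keras.layers.Dense(5, input_shape=(3,)), tf.keras.layers.Softmax()])

    model.save("model.keras")  # native Keras format, chosen from the .keras extension
    model.save("model.h5")     # legacy HDF5 format, chosen from the .h5 extension
    # model.save("model", save_format="tf")  # Keras 3 no longer accepts the old save_format argument
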
@@ -134,12 +135,14 @@ def __init__( # noqa: PLR0913
     def _load(self) -> tf.keras.Model:
         load_path = get_filepath_str(self._get_load_path(), self._protocol)

-        with tempfile.TemporaryDirectory(prefix=self._tmp_prefix) as path:
+        with tempfile.TemporaryDirectory(prefix=self._tmp_prefix) as tempdir:
             if self._is_h5:
-                path = str(PurePath(path) / TEMPORARY_H5_FILE)  # noqa: PLW2901
-                self._fs.copy(load_path, path)
+                path = str(PurePath(tempdir) / TEMPORARY_H5_FILE)  # noqa: PLW2901
             else:
-                self._fs.get(load_path + "/", path, recursive=True)
+                # We assume .keras
+                path = str(PurePath(tempdir) / TEMPORARY_KERAS_FILE)  # noqa: PLW2901
+
+            self._fs.copy(load_path, path)

             # Pass the local temporary directory/file path to keras.load_model
             device_name = self._load_args.pop("tf_device", None)
@@ -153,20 +156,18 @@ def _load(self) -> tf.keras.Model:
     def _save(self, data: tf.keras.Model) -> None:
         save_path = get_filepath_str(self._get_save_path(), self._protocol)

-        with tempfile.TemporaryDirectory(prefix=self._tmp_prefix) as path:
+        with tempfile.TemporaryDirectory(prefix=self._tmp_prefix) as tempdir:
             if self._is_h5:
-                path = str(PurePath(path) / TEMPORARY_H5_FILE)  # noqa: PLW2901
+                path = str(PurePath(tempdir) / TEMPORARY_H5_FILE)  # noqa: PLW2901
+            else:
+                # We assume .keras
+                path = str(PurePath(tempdir) / TEMPORARY_KERAS_FILE)  # noqa: PLW2901

             tf.keras.models.save_model(data, path, **self._save_args)

             # Use fsspec to take from local tempfile directory/file and
             # put in ArbitraryFileSystem
-            if self._is_h5:
-                self._fs.copy(path, save_path)
-            else:
-                if self._fs.exists(save_path):
-                    self._fs.rm(save_path, recursive=True)
-                self._fs.put(path, save_path, recursive=True)
+            self._fs.copy(path, save_path)

     def _exists(self) -> bool:
         try:

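Taken together, the dataset now stages the model as a single temporary .h5 or .keras file and copies that file to the target path, so the configured filepath drives the format. A usage sketch under those assumptions (local filesystem, TensorFlow 2.16+; the HDF5 branch is presumably selected from the filepath's .h5 suffix):

    import tensorflow as tf
    from kedro_datasets.tensorflow import TensorFlowModelDataset

    model = tf.keras.Sequential([tf.keras.layers.Dense(5, input_shape=(3,)), tf.keras.layers.Softmax()])

    # Native Keras archive: staged as tmp_tensorflow_model.keras, then copied to the target path.
    keras_dataset = TensorFlowModelDataset(filepath="tensorflow_model.keras")
    keras_dataset.save(model)
    reloaded = keras_dataset.load()

    # Legacy HDF5: staged as tmp_tensorflow_model.h5 instead.
    h5_dataset = TensorFlowModelDataset(filepath="tensorflow_model.h5")
    h5_dataset.save(model)
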
6 changes: 3 additions & 3 deletions kedro-datasets/pyproject.toml
@@ -33,7 +33,7 @@ api = ["kedro-datasets[api-apidataset]"]
 biosequence-biosequencedataset = ["biopython~=1.73"]
 biosequence = ["kedro-datasets[biosequence-biosequencedataset]"]

-dask-parquetdataset = ["dask[complete]>=2021.10", "triad>=0.6.7, <1.0"]
+dask-parquetdataset = ["dask[complete]>=2021.10, <2024.3", "triad>=0.6.7, <1.0"]
 dask = ["kedro-datasets[dask-parquetdataset]"]

 databricks-managedtabledataset = ["kedro-datasets[spark-base,pandas-base,delta-base]"]
@@ -171,7 +171,7 @@ test = [
     "cloudpickle<=2.0.0",
     "compress-pickle[lz4]~=2.1.0",
     "coverage[toml]",
-    "dask[complete]>=2021.10",
+    "dask[complete]>=2021.10, <2024.3",
     "delta-spark>=1.0, <3.0",
     "deltalake>=0.10.0, <0.15.2", # temporary pin as 0.15.2 breaks some of our tests
     "dill~=0.3.1",

@@ -232,7 +232,7 @@ test = [
     # huggingface
     "datasets",
     "huggingface_hub",
-    "transformers",
+    "transformers[torch]",
 ]

 # All requirements

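Switching the test requirement to transformers[torch] pulls in a PyTorch backend, which the text-classification pipeline used in the Hugging Face doctest needs at runtime. A sanity check under that assumption might look like:

    import torch
    from transformers import pipeline

    # Without a torch (or TensorFlow) backend, constructing the pipeline would fail.
    classifier = pipeline("text-classification", model="prajjwal1/bert-tiny")
    print(torch.__version__)
    print(classifier("Hello world"))  # e.g. [{'label': 'LABEL_0', 'score': ...}]
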
34 changes: 8 additions & 26 deletions kedro-datasets/tests/tensorflow/test_tensorflow_model_dataset.py
@@ -46,7 +46,7 @@ def tensorflow_model_dataset():

 @pytest.fixture
 def filepath(tmp_path):
-    return (tmp_path / "test_tf").as_posix()
+    return (tmp_path / "test_tf.h5").as_posix()


 @pytest.fixture
@@ -151,7 +151,7 @@ def test_save_and_load(self, tf_model_dataset, dummy_tf_base_model, dummy_x_test
         np.testing.assert_allclose(predictions, new_predictions, rtol=1e-6, atol=1e-6)

         assert tf_model_dataset._load_args == {}
-        assert tf_model_dataset._save_args == {"save_format": "tf"}
+        assert tf_model_dataset._save_args == {}

     def test_load_missing_model(self, tf_model_dataset):
         """Test error message when trying to load missing model."""
@@ -172,7 +172,7 @@ def test_hdf5_save_format(
     ):
         """Test TensorFlowModelDataset can save TF graph models in HDF5 format"""
         hdf5_dataset = tensorflow_model_dataset(
-            filepath=filepath, save_args={"save_format": "h5"}
+            filepath=filepath,
         )

         predictions = dummy_tf_base_model.predict(dummy_x_test)
@@ -193,6 +193,8 @@ def test_unused_subclass_model_hdf5_save_format(
     ):
         """Test TensorFlowModelDataset cannot save subclassed user models in HDF5 format

+        FIXME: What is this test doing??
+
         Subclassed model

         From TF docs
@@ -201,22 +203,15 @@
         create its weights.
         """
         hdf5_dataset = tensorflow_model_dataset(
-            filepath=filepath, save_args={"save_format": "h5"}
+            filepath=filepath,
         )
         # demonstrating is a working model
         dummy_tf_subclassed_model.fit(
             dummy_x_train, dummy_y_train, batch_size=64, epochs=1
         )
         dummy_tf_subclassed_model.predict(dummy_x_test)
-        pattern = (
-            r"Saving the model to HDF5 format requires the model to be a Functional model or a "
-            r"Sequential model. It does not work for subclassed models, because such models are "
-            r"defined via the body of a Python method, which isn\'t safely serializable. Consider "
-            r"saving to the Tensorflow SavedModel format \(by setting save_format=\"tf\"\) "
-            r"or using `save_weights`."
-        )
-        with pytest.raises(DatasetError, match=pattern):
-            hdf5_dataset.save(dummy_tf_subclassed_model)
+
+        hdf5_dataset.save(dummy_tf_subclassed_model)  # Nothing happens

     @pytest.mark.parametrize(
         "filepath,instance_type",
@@ -328,7 +323,6 @@ def test_hdf5_save_format(
             HDF5 format"""
         hdf5_dataset = tensorflow_model_dataset(
             filepath=filepath,
-            save_args={"save_format": "h5"},
             version=Version(load_version, save_version),
         )

@@ -408,18 +402,6 @@ def test_version_str_repr(self, tf_model_dataset, versioned_tf_model_dataset):
         assert "protocol" in str(versioned_tf_model_dataset)
         assert "save_args" in str(versioned_tf_model_dataset)

-    def test_versioning_existing_dataset(
-        self, tf_model_dataset, versioned_tf_model_dataset, dummy_tf_base_model
-    ):
-        """Check behavior when attempting to save a versioned dataset on top of an
-        already existing (non-versioned) dataset. Note: because TensorFlowModelDataset
-        saves to a directory even if non-versioned, an error is not expected."""
-        tf_model_dataset.save(dummy_tf_base_model)
-        assert tf_model_dataset.exists()
-        assert tf_model_dataset._filepath == versioned_tf_model_dataset._filepath
-        versioned_tf_model_dataset.save(dummy_tf_base_model)
-        assert versioned_tf_model_dataset.exists()
-
     def test_save_and_load_with_device(
         self,
         dummy_tf_base_model,

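The reworked tests reduce to round-trips of this shape (a sketch of the pattern rather than the actual test code; assumes TensorFlow and kedro-datasets are installed, with tmp_path being pytest's built-in fixture):

    import numpy as np
    import tensorflow as tf
    from kedro_datasets.tensorflow import TensorFlowModelDataset


    def test_round_trip(tmp_path):
        dataset = TensorFlowModelDataset(filepath=(tmp_path / "model.h5").as_posix())
        model = tf.keras.Sequential(
            [tf.keras.layers.Dense(5, input_shape=(3,)), tf.keras.layers.Softmax()]
        )
        x = tf.random.uniform((10, 3))
        predictions = model.predict(x)

        dataset.save(model)
        reloaded = dataset.load()

        # Loaded weights must reproduce the original predictions.
        np.testing.assert_allclose(predictions, reloaded.predict(x), rtol=1e-6, atol=1e-6)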
