diff --git a/doc/source/ray-air/check-ingest.rst b/doc/source/ray-air/check-ingest.rst index cc639bace0bd4..6155986f3d3de 100644 --- a/doc/source/ray-air/check-ingest.rst +++ b/doc/source/ray-air/check-ingest.rst @@ -26,7 +26,7 @@ In this basic example, the `train_ds` object is created in your Ray script befor Splitting data across workers ----------------------------- -By default, Train will split the ``"train"`` dataset across workers using :meth:`Dataset.streaming_split `. This means that each worker sees a disjoint subset of the data, instead of iterating over the entire dataset. To customize this, we can pass in a :class:`DataConfig ` to the Trainer constructor. For example, the following splits dataset ``"a"`` but not ``"b"``. +By default, Train will split the ``"train"`` dataset across workers using :meth:`Dataset.streaming_split `. This means that each worker sees a disjoint subset of the data, instead of iterating over the entire dataset. To customize this, we can pass in a :class:`DataConfig ` to the Trainer constructor. For example, the following splits dataset ``"a"`` but not ``"b"``. .. literalinclude:: doc_code/air_ingest_new.py :language: python @@ -51,7 +51,7 @@ Datasets are lazy and their execution is streamed, which means that on each epoc Ray Data execution options ~~~~~~~~~~~~~~~~~~~~~~~~~~ -Under the hood, Train configures some default Data options for ingest: limiting the data ingest memory usage to 2GB per worker, and telling it to optimize the locality of the output data for ingest. See :meth:`help(DataConfig.default_ingest_options()) ` if you want to learn more and further customize these settings. +Under the hood, Train configures some default Data options for ingest: limiting the data ingest memory usage to 2GB per worker, and telling it to optimize the locality of the output data for ingest. See :meth:`help(DataConfig.default_ingest_options()) ` if you want to learn more and further customize these settings. 
Common options you may want to adjust: @@ -88,3 +88,25 @@ What do you need to know about this ``DataConfig`` class? * Its ``configure`` method is called on the main actor of the Trainer group to create the data iterators for each worker. In general, you can use ``DataConfig`` for any shared setup that has to occur ahead of time before the workers start reading data. The setup will be run at the start of each Trainer run. + +Migrating from the legacy DatasetConfig API +------------------------------------------- + +Starting from Ray 2.6, the ``DatasetConfig`` API is deprecated, and it will be removed in a future release. If your workloads are still using it, consider migrating to the new :class:`DataConfig ` API as soon as possible. + +The main difference is that preprocessing is no longer part of the Trainer. Since Dataset operations are lazy, you can apply any operations to your Datasets before passing them to the Trainer. The operations will be re-executed before each epoch. + +In the following example with the legacy ``DatasetConfig`` API, we pass 2 Datasets ("train" and "test") to the Trainer and apply an "add_noise" preprocessor per epoch to the "train" Dataset. Also, we will split the "train" Dataset, but not the "test" Dataset. + +.. literalinclude:: doc_code/air_ingest_migration.py + :language: python + :start-after: __legacy_api__ + :end-before: __legacy_api_end__ + +To migrate this example to the new :class:`DataConfig ` API, we apply the "add_noise" preprocessor to the "train" Dataset prior to passing it to the Trainer. And we use ``DataConfig(datasets_to_split=["train"])`` to specify which Datasets need to be split. Note that the ``datasets_to_split`` argument is optional. By default, only the "train" Dataset will be split. If you don't want to split the "train" Dataset either, use ``datasets_to_split=[]``. + +.. 
literalinclude:: doc_code/air_ingest_migration.py + :language: python + :start-after: __new_api__ + :end-before: __new_api_end__ + diff --git a/doc/source/ray-air/doc_code/air_ingest_migration.py b/doc/source/ray-air/doc_code/air_ingest_migration.py new file mode 100644 index 0000000000000..fbbbbbbe860ae --- /dev/null +++ b/doc/source/ray-air/doc_code/air_ingest_migration.py @@ -0,0 +1,62 @@ +# flake8: noqa +# isort: skip_file + +# __legacy_api__ +import random +import ray + +from ray.air.config import ScalingConfig, DatasetConfig +from ray.data.preprocessors.batch_mapper import BatchMapper +from ray.train.torch import TorchTrainer + +train_ds = ray.data.range_tensor(1000) +test_ds = ray.data.range_tensor(10) + +# A randomized preprocessor that adds a random float to all values. +add_noise = BatchMapper(lambda df: df + random.random(), batch_format="pandas") + +my_trainer = TorchTrainer( + lambda: None, + scaling_config=ScalingConfig(num_workers=1), + datasets={ + "train": train_ds, + "test": test_ds, + }, + dataset_config={ + "train": DatasetConfig( + split=True, + # Apply the preprocessor for each epoch. + per_epoch_preprocessor=add_noise, + ), + "test": DatasetConfig( + split=False, + ), + }, +) +my_trainer.fit() +# __legacy_api_end__ + +# __new_api__ +from ray.train import DataConfig + +train_ds = ray.data.range_tensor(1000) +test_ds = ray.data.range_tensor(10) + +# Apply the preprocessor before passing the Dataset to the Trainer. +# This operation is lazy. It will be re-executed for each epoch. +train_ds = add_noise.transform(train_ds) + +my_trainer = TorchTrainer( + lambda: None, + scaling_config=ScalingConfig(num_workers=1), + datasets={ + "train": train_ds, + "test": test_ds, + }, + # Specify which datasets to split. 
+ dataset_config=DataConfig( + datasets_to_split=["train"], + ), +) +my_trainer.fit() +# __new_api_end__ diff --git a/doc/source/ray-core/_examples/datasets_train/datasets_train.py b/doc/source/ray-core/_examples/datasets_train/datasets_train.py index f966ac48b9c28..bb2a0b9c4e560 100644 --- a/doc/source/ray-core/_examples/datasets_train/datasets_train.py +++ b/doc/source/ray-core/_examples/datasets_train/datasets_train.py @@ -19,7 +19,7 @@ import mlflow import pandas as pd from ray.air.config import ScalingConfig -from ray.train.data_config import DataConfig +from ray.train import DataConfig from ray.train.torch.torch_trainer import TorchTrainer import torch import torch.nn as nn diff --git a/python/ray/air/config.py b/python/ray/air/config.py index 4852ed8b408d9..c5ffd9b536815 100644 --- a/python/ray/air/config.py +++ b/python/ray/air/config.py @@ -297,7 +297,7 @@ def from_placement_group_factory( @Deprecated( message="Use `ray.train.DataConfig` instead of DatasetConfig to " "configure data ingest for training. " - "See https://docs.ray.io/en/master/ray-air/check-ingest.html for more details." + "See https://docs.ray.io/en/master/ray-air/check-ingest.html#migrating-from-the-legacy-datasetconfig-api for more details." # noqa: E501 ) class DatasetConfig: """Configuration for ingest of a single Dataset. 
diff --git a/python/ray/air/tests/test_new_dataset_config.py b/python/ray/air/tests/test_new_dataset_config.py index 94227e5f9b559..08a3e9c0e660d 100644 --- a/python/ray/air/tests/test_new_dataset_config.py +++ b/python/ray/air/tests/test_new_dataset_config.py @@ -7,7 +7,7 @@ from ray.air import session from ray.air.config import ScalingConfig from ray.data import DataIterator -from ray.train.data_config import DataConfig +from ray.train import DataConfig from ray.train.data_parallel_trainer import DataParallelTrainer diff --git a/python/ray/air/util/check_ingest.py b/python/ray/air/util/check_ingest.py index be0ce4972fc26..9246aa603d47b 100755 --- a/python/ray/air/util/check_ingest.py +++ b/python/ray/air/util/check_ingest.py @@ -13,7 +13,7 @@ from ray.data.preprocessors import BatchMapper, Chain from ray.train._internal.dataset_spec import DataParallelIngestSpec from ray.train.data_parallel_trainer import DataParallelTrainer -from ray.train.data_config import DataConfig +from ray.train import DataConfig from ray.util.annotations import DeveloperAPI diff --git a/python/ray/train/__init__.py b/python/ray/train/__init__.py index 6933175d45e9d..f0b3f362fb2a0 100644 --- a/python/ray/train/__init__.py +++ b/python/ray/train/__init__.py @@ -1,9 +1,9 @@ from ray._private.usage import usage_lib +from ray.train._internal.data_config import DataConfig +from ray.train._internal.session import get_dataset_shard, report from ray.train.backend import BackendConfig -from ray.train.data_config import DataConfig -from ray.train.context import get_context from ray.train.constants import TRAIN_DATASET_KEY -from ray.train._internal.session import get_dataset_shard, report +from ray.train.context import get_context from ray.train.trainer import TrainingIterator from ray.air import Checkpoint diff --git a/python/ray/train/_internal/backend_executor.py b/python/ray/train/_internal/backend_executor.py index 1b59a0955b71a..3a67ec941b83d 100644 --- 
a/python/ray/train/_internal/backend_executor.py +++ b/python/ray/train/_internal/backend_executor.py @@ -8,7 +8,7 @@ from ray._private.ray_constants import env_integer from ray.air.config import CheckpointConfig from ray.exceptions import RayActorError -from ray.train.data_config import DataConfig +from ray.train import DataConfig from ray.air.checkpoint import Checkpoint from ray.train._internal.session import ( TrainingResult, diff --git a/python/ray/train/data_config.py b/python/ray/train/_internal/data_config.py similarity index 100% rename from python/ray/train/data_config.py rename to python/ray/train/_internal/data_config.py diff --git a/python/ray/train/data_parallel_trainer.py b/python/ray/train/data_parallel_trainer.py index 6fc87e2eabe29..5879411987dad 100644 --- a/python/ray/train/data_parallel_trainer.py +++ b/python/ray/train/data_parallel_trainer.py @@ -16,7 +16,7 @@ from ray.train._internal import session from ray.train._internal.backend_executor import BackendExecutor, TrialInfo from ray.train._internal.checkpoint import TuneCheckpointManager -from ray.train.data_config import DataConfig, _LegacyDataConfigWrapper +from ray.train._internal.data_config import DataConfig, _LegacyDataConfigWrapper from ray.train._internal.utils import construct_train_func from ray.train.constants import TRAIN_DATASET_KEY, WILDCARD_KEY from ray.train.trainer import BaseTrainer, GenDataset diff --git a/python/ray/train/horovod/horovod_trainer.py b/python/ray/train/horovod/horovod_trainer.py index cd5b1734a9dd1..ba130ac6dd5a8 100644 --- a/python/ray/train/horovod/horovod_trainer.py +++ b/python/ray/train/horovod/horovod_trainer.py @@ -1,7 +1,7 @@ from typing import Dict, Callable, Optional, Union, TYPE_CHECKING from ray.air.config import ScalingConfig, RunConfig -from ray.train.data_config import DataConfig +from ray.train import DataConfig from ray.train.trainer import GenDataset from ray.air.checkpoint import Checkpoint diff --git 
a/python/ray/train/huggingface/accelerate/accelerate_trainer.py b/python/ray/train/huggingface/accelerate/accelerate_trainer.py index 76e8fd1de470c..10904a837e57f 100644 --- a/python/ray/train/huggingface/accelerate/accelerate_trainer.py +++ b/python/ray/train/huggingface/accelerate/accelerate_trainer.py @@ -7,7 +7,7 @@ from ray.air import session from ray.air.checkpoint import Checkpoint from ray.air.config import RunConfig, ScalingConfig -from ray.train.data_config import DataConfig +from ray.train import DataConfig from ray.train.torch import TorchConfig from ray.train.trainer import GenDataset diff --git a/python/ray/train/huggingface/transformers/transformers_trainer.py b/python/ray/train/huggingface/transformers/transformers_trainer.py index 696d9ada219dc..12d4c3b88d59e 100644 --- a/python/ray/train/huggingface/transformers/transformers_trainer.py +++ b/python/ray/train/huggingface/transformers/transformers_trainer.py @@ -15,7 +15,7 @@ EVALUATION_DATASET_KEY, TRAIN_DATASET_KEY, ) -from ray.train.data_config import DataConfig +from ray.train import DataConfig from ray.train.data_parallel_trainer import DataParallelTrainer from ray.train.torch import TorchConfig, TorchTrainer from ray.train.trainer import GenDataset diff --git a/python/ray/train/lightning/lightning_trainer.py b/python/ray/train/lightning/lightning_trainer.py index 44c7eae61f3e2..465eae28e6d33 100644 --- a/python/ray/train/lightning/lightning_trainer.py +++ b/python/ray/train/lightning/lightning_trainer.py @@ -11,7 +11,7 @@ from ray.air.constants import MODEL_KEY from ray.air.checkpoint import Checkpoint from ray.data.preprocessor import Preprocessor -from ray.train.data_config import DataConfig +from ray.train import DataConfig from ray.train.trainer import GenDataset from ray.train.torch import TorchTrainer from ray.train.torch.config import TorchConfig diff --git a/python/ray/train/mosaic/mosaic_trainer.py b/python/ray/train/mosaic/mosaic_trainer.py index 4b0b5f9a4158c..9e089ef2b58f0 100644 
--- a/python/ray/train/mosaic/mosaic_trainer.py +++ b/python/ray/train/mosaic/mosaic_trainer.py @@ -7,7 +7,7 @@ from ray.air.checkpoint import Checkpoint from ray.air.config import RunConfig, ScalingConfig -from ray.train.data_config import DataConfig +from ray.train import DataConfig from ray.train.mosaic._mosaic_utils import RayLogger from ray.train.torch import TorchConfig, TorchTrainer from ray.train.trainer import GenDataset diff --git a/python/ray/train/tensorflow/tensorflow_trainer.py b/python/ray/train/tensorflow/tensorflow_trainer.py index f204af9f2c801..a9b8099f11a48 100644 --- a/python/ray/train/tensorflow/tensorflow_trainer.py +++ b/python/ray/train/tensorflow/tensorflow_trainer.py @@ -1,6 +1,6 @@ from typing import Callable, Optional, Dict, Union, TYPE_CHECKING -from ray.train.data_config import DataConfig +from ray.train import DataConfig from ray.train.tensorflow.config import TensorflowConfig from ray.train.trainer import GenDataset from ray.train.data_parallel_trainer import DataParallelTrainer diff --git a/python/ray/train/tests/test_backend.py b/python/ray/train/tests/test_backend.py index e0fb060a0f5c7..fd0ed2aac5546 100644 --- a/python/ray/train/tests/test_backend.py +++ b/python/ray/train/tests/test_backend.py @@ -18,7 +18,7 @@ TrainingWorkerError, ) -from ray.train.data_config import DataConfig +from ray.train import DataConfig from ray.train._internal.worker_group import WorkerGroup, WorkerMetadata from ray.train.backend import Backend, BackendConfig from ray.train.constants import ( diff --git a/python/ray/train/tests/test_training_iterator.py b/python/ray/train/tests/test_training_iterator.py index ea191299d9415..ce5d1136f9733 100644 --- a/python/ray/train/tests/test_training_iterator.py +++ b/python/ray/train/tests/test_training_iterator.py @@ -6,7 +6,7 @@ from ray.air.config import CheckpointConfig from ray.train._internal.worker_group import WorkerGroup from ray.train.trainer import TrainingIterator -from ray.train.data_config import 
DataConfig +from ray.train import DataConfig import ray from ray.air import session diff --git a/python/ray/train/torch/torch_trainer.py b/python/ray/train/torch/torch_trainer.py index 41d908eb2d38c..5f2f258bac086 100644 --- a/python/ray/train/torch/torch_trainer.py +++ b/python/ray/train/torch/torch_trainer.py @@ -2,7 +2,7 @@ from ray.air.checkpoint import Checkpoint from ray.air.config import RunConfig, ScalingConfig -from ray.train.data_config import DataConfig +from ray.train import DataConfig from ray.train.data_parallel_trainer import DataParallelTrainer from ray.train.torch.config import TorchConfig from ray.train.trainer import GenDataset diff --git a/python/ray/train/trainer.py b/python/ray/train/trainer.py index 8acd99f1803b6..6921977be114a 100644 --- a/python/ray/train/trainer.py +++ b/python/ray/train/trainer.py @@ -8,7 +8,7 @@ from ray.air._internal.uri_utils import URI from ray.air._internal.util import StartTraceback from ray.data import Dataset -from ray.train.data_config import DataConfig +from ray.train import DataConfig from ray.train._internal.backend_executor import ( BackendExecutor, InactiveWorkerGroupError,