diff --git a/doc/source/data/loading-data.rst b/doc/source/data/loading-data.rst index fe5c83260db7..fe854c99b5cd 100644 --- a/doc/source/data/loading-data.rst +++ b/doc/source/data/loading-data.rst @@ -620,7 +620,12 @@ Ray Data interoperates with HuggingFace and TensorFlow datasets. print(ds) + .. + The following `testoutput` is mocked to avoid illustrating download logs like + "Downloading and preparing dataset 162.17 MiB". + .. testoutput:: + :options: +MOCK MaterializedDataset( num_blocks=..., diff --git a/doc/source/data/working-with-images.rst b/doc/source/data/working-with-images.rst index 4811106fca47..a6e686a087c0 100644 --- a/doc/source/data/working-with-images.rst +++ b/doc/source/data/working-with-images.rst @@ -118,7 +118,13 @@ To view the full list of supported file formats, see the print(ds.schema()) + .. + The following `testoutput` is mocked because the order of column names can + be non-deterministic. For an example, see + https://buildkite.com/ray-project/oss-ci-build-branch/builds/4849#01892c8b-0cd0-4432-bc9f-9f86fcd38edd. + .. testoutput:: + :options: +MOCK Column Type ------ ---- diff --git a/doc/source/data/working-with-pytorch.rst b/doc/source/data/working-with-pytorch.rst index 9aaa0ed9d696..1f41e3afa4b7 100644 --- a/doc/source/data/working-with-pytorch.rst +++ b/doc/source/data/working-with-pytorch.rst @@ -17,7 +17,7 @@ This guide describes how to: Iterating over torch tensors for training ----------------------------------------- -To iterate over batches of data in torch format, call :meth:`Dataset.iter_torch_batches() `. Each batch is represented as `Dict[str, torch.Tensor]`, with one tensor per column in the dataset. +To iterate over batches of data in torch format, call :meth:`Dataset.iter_torch_batches() `. Each batch is represented as `Dict[str, torch.Tensor]`, with one tensor per column in the dataset. This is useful for training torch models with batches from your dataset. 
For configuration details such as providing a `collate_fn` for customizing the conversion, see `the API reference `. @@ -43,7 +43,7 @@ Integration with Ray Train Ray Data integrates with :ref:`Ray Train ` for easy data ingest for data parallel training, with support for PyTorch, PyTorch Lightning, or Huggingface training. .. testcode:: - + import torch from torch import nn import ray @@ -54,7 +54,7 @@ Ray Data integrates with :ref:`Ray Train ` for easy data ingest for model = nn.Sequential(nn.Linear(30, 1), nn.Sigmoid()) loss_fn = torch.nn.BCELoss() optimizer = torch.optim.SGD(model.parameters(), lr=0.001) - + # Datasets can be accessed in your train_func via ``get_dataset_shard``. train_data_shard = session.get_dataset_shard("train") @@ -66,7 +66,7 @@ Ray Data integrates with :ref:`Ray Train ` for easy data ingest for train_loss.backward() optimizer.step() - + train_dataset = ray.data.read_csv("s3://anonymous@air-example-data/breast_cancer.csv") trainer = TorchTrainer( @@ -82,15 +82,15 @@ Ray Data integrates with :ref:`Ray Train ` for easy data ingest for ... For more details, see the :ref:`Ray Train user guide `. - + .. _transform_pytorch: Transformations with torch tensors ---------------------------------- -Transformations applied with `map` or `map_batches` can return torch tensors. +Transformations applied with `map` or `map_batches` can return torch tensors. .. caution:: - + Under the hood, Ray Data automatically converts torch tensors to numpy arrays. Subsequent transformations accept numpy arrays as input, not torch tensors. .. tab-set:: @@ -98,17 +98,17 @@ Transformations applied with `map` or `map_batches` can return torch tensors. .. tab-item:: map .. 
testcode:: - + from typing import Dict import numpy as np import torch import ray - + ds = ray.data.read_images("example://image-datasets/simple") def convert_to_torch(row: Dict[str, np.ndarray]) -> Dict[str, torch.Tensor]: return {"tensor": torch.as_tensor(row["image"])} - + # The tensor gets converted into a Numpy array under the hood transformed_ds = ds.map(convert_to_torch) print(transformed_ds.schema()) @@ -117,11 +117,11 @@ Transformations applied with `map` or `map_batches` can return torch tensors. def check_numpy(row: Dict[str, np.ndarray]): assert isinstance(row["tensor"], np.ndarray) return row - + transformed_ds.map(check_numpy).take_all() .. testoutput:: - + Column Type ------ ---- tensor numpy.ndarray(shape=(32, 32, 3), dtype=uint8) @@ -129,17 +129,17 @@ Transformations applied with `map` or `map_batches` can return torch tensors. .. tab-item:: map_batches .. testcode:: - + from typing import Dict import numpy as np import torch import ray - + ds = ray.data.read_images("example://image-datasets/simple") def convert_to_torch(batch: Dict[str, np.ndarray]) -> Dict[str, torch.Tensor]: return {"tensor": torch.as_tensor(batch["image"])} - + # The tensor gets converted into a Numpy array under the hood transformed_ds = ds.map_batches(convert_to_torch, batch_size=2) print(transformed_ds.schema()) @@ -148,11 +148,11 @@ Transformations applied with `map` or `map_batches` can return torch tensors. def check_numpy(batch: Dict[str, np.ndarray]): assert isinstance(batch["tensor"], np.ndarray) return batch - + transformed_ds.map_batches(check_numpy, batch_size=2).take_all() .. testoutput:: - + Column Type ------ ---- tensor numpy.ndarray(shape=(32, 32, 3), dtype=uint8) @@ -169,13 +169,13 @@ You can use built-in torch transforms from `torchvision`, `torchtext`, and `torc .. tab-item:: torchvision .. testcode:: - + from typing import Dict import numpy as np import torch from torchvision import transforms import ray - + # Create the Dataset. 
ds = ray.data.read_images("example://image-datasets/simple") @@ -191,18 +191,18 @@ You can use built-in torch transforms from `torchvision`, `torchtext`, and `torc def transform_image(row: Dict[str, np.ndarray]) -> Dict[str, torch.Tensor]: row["transformed_image"] = transform(row["image"]) return row - + # Apply the transform over the dataset. transformed_ds = ds.map(transform_image) print(transformed_ds.schema()) - + .. testoutput:: Column Type ------ ---- image numpy.ndarray(shape=(32, 32, 3), dtype=uint8) transformed_image numpy.ndarray(shape=(3, 10, 10), dtype=float) - + .. tab-item:: torchtext .. testcode:: @@ -211,7 +211,7 @@ You can use built-in torch transforms from `torchvision`, `torchtext`, and `torc import numpy as np from torchtext import transforms import ray - + # Create the Dataset. ds = ray.data.read_text("example://simple.txt") @@ -223,11 +223,11 @@ You can use built-in torch transforms from `torchvision`, `torchtext`, and `torc def tokenize_text(batch: Dict[str, np.ndarray]) -> Dict[str, List[str]]: batch["tokenized_text"] = transform(list(batch["text"])) return batch - + # Apply the transform over the dataset. transformed_ds = ds.map_batches(tokenize_text, batch_size=2) print(transformed_ds.schema()) - + .. testoutput:: Column Type @@ -296,7 +296,7 @@ For more details, see the :ref:`Batch inference user guide `. @@ -310,7 +310,7 @@ For more information on saving data, read .. tab-item:: Parquet .. testcode:: - + import torch import ray @@ -322,7 +322,7 @@ For more information on saving data, read .. tab-item:: Numpy .. testcode:: - + import torch import ray @@ -356,11 +356,16 @@ If you are using built-in PyTorch datasets, for example from `torchvision`, thes mnist = torchvision.datasets.MNIST(root="/tmp/", download=True) ds = ray.data.from_torch(mnist) - - # The data for each item of the torch dataset is under the "item" key. + + # The data for each item of the torch dataset is under the "item" key. print(ds.schema()) +.. 
+ The following `testoutput` is mocked to avoid illustrating download logs like + "Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz". + .. testoutput:: + :options: +MOCK Column Type ------ ---- @@ -369,7 +374,7 @@ If you are using built-in PyTorch datasets, for example from `torchvision`, thes Custom PyTorch Datasets ~~~~~~~~~~~~~~~~~~~~~~~ -If you have a custom PyTorch Dataset, you can migrate to Ray Data by converting the logic in ``__getitem__`` to Ray Data read and transform operations. +If you have a custom PyTorch Dataset, you can migrate to Ray Data by converting the logic in ``__getitem__`` to Ray Data read and transform operations. Any logic for reading data from cloud storage and disk can be replaced by one of the Ray Data ``read_*`` APIs, and any transformation logic can be applied as a :meth:`map ` call on the Dataset. @@ -399,7 +404,7 @@ The following example shows a custom PyTorch Dataset, and what the analagous wou self.s3 = boto3.resource("s3", config=Config(signature_version=UNSIGNED)) self.bucket = self.s3.Bucket(bucket_name) self.files = [obj.key for obj in self.bucket.objects.filter(Prefix=dir_path)] - + self.transform = transforms.Compose([ transforms.ToTensor(), transforms.Resize((128, 128)), @@ -458,12 +463,12 @@ The following example shows a custom PyTorch Dataset, and what the analagous wou transforms.Resize((128, 128)), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) ]) - + # Preprocess the images. def transform_image(row: dict): row["transformed_image"] = transform(row["image"]) return row - + # Map the transformations over the dataset. ds = ds.map(extract_label).map(transform_image) diff --git a/doc/source/ray-core/actors/async_api.rst b/doc/source/ray-core/actors/async_api.rst index bdfe3f7d247c..d8999148014d 100644 --- a/doc/source/ray-core/actors/async_api.rst +++ b/doc/source/ray-core/actors/async_api.rst @@ -67,6 +67,7 @@ async frameworks like aiohttp, aioredis, etc. 
# NOTE: The outputs from the previous code block can show up in subsequent tests. # To prevent flakiness, we wait for the async calls to finish. import time + print("Sleeping...") time.sleep(3) .. testoutput::