diff --git a/doc/source/data/doc_code/quick_start.py b/doc/source/data/doc_code/quick_start.py
deleted file mode 100644
index 5d63ddbaab557..0000000000000
--- a/doc/source/data/doc_code/quick_start.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# flake8: noqa
-
-# fmt: off
-# __create_from_python_begin__
-import ray
-
-# Create a Dataset of Python objects.
-ds = ray.data.range(10000)
-# -> Dataset(num_blocks=200, num_rows=10000, schema=<class 'int'>)
-
-ds.take(5)
-# -> [0, 1, 2, 3, 4]
-
-ds.schema()
-# <class 'int'>
-
-# Create a Dataset from Python objects, which are held as Arrow records.
-ds = ray.data.from_items([
-    {"sepal.length": 5.1, "sepal.width": 3.5,
-     "petal.length": 1.4, "petal.width": 0.2, "variety": "Setosa"},
-    {"sepal.length": 4.9, "sepal.width": 3.0,
-     "petal.length": 1.4, "petal.width": 0.2, "variety": "Setosa"},
-    {"sepal.length": 4.7, "sepal.width": 3.2,
-     "petal.length": 1.3, "petal.width": 0.2, "variety": "Setosa"},
-])
-# Dataset(num_blocks=3, num_rows=3,
-#         schema={sepal.length: float64, sepal.width: float64,
-#                 petal.length: float64, petal.width: float64, variety: object})
-
-ds.show()
-# -> {'sepal.length': 5.1, 'sepal.width': 3.5,
-#     'petal.length': 1.4, 'petal.width': 0.2, 'variety': 'Setosa'}
-# -> {'sepal.length': 4.9, 'sepal.width': 3.0,
-#     'petal.length': 1.4, 'petal.width': 0.2, 'variety': 'Setosa'}
-# -> {'sepal.length': 4.7, 'sepal.width': 3.2,
-#     'petal.length': 1.3, 'petal.width': 0.2, 'variety': 'Setosa'}
-
-ds.schema()
-# -> sepal.length: double
-# -> sepal.width: double
-# -> petal.length: double
-# -> petal.width: double
-# -> variety: string
-# __create_from_python_end__
-# fmt: on
-
-# fmt: off
-# __create_from_files_begin__
-# Create from CSV.
-ds = ray.data.read_csv("s3://anonymous@air-example-data/iris.csv")
-# Dataset(num_blocks=1, num_rows=150,
-#         schema={sepal length (cm): double, sepal width (cm): double,
-#                 petal length (cm): double, petal width (cm): double, target: int64})
-
-# Create from Parquet.
-ds = ray.data.read_parquet("s3://anonymous@air-example-data/iris.parquet")
-# Dataset(num_blocks=1, num_rows=150,
-#         schema={sepal.length: double, sepal.width: double,
-#                 petal.length: double, petal.width: double, variety: string})
-
-# __create_from_files_end__
-# fmt: on
-
-# fmt: off
-# __data_transform_begin__
-import pandas
-
-# Create 10 blocks for parallelism.
-ds = ds.repartition(10)
-# Dataset(num_blocks=10, num_rows=150,
-#         schema={sepal.length: float64, sepal.width: float64,
-#                 petal.length: float64, petal.width: float64, variety: object})
-
-# Find rows with sepal.length < 5.5 and petal.length > 3.5.
-def transform_batch(df: pandas.DataFrame) -> pandas.DataFrame:
-    return df[(df["sepal.length"] < 5.5) & (df["petal.length"] > 3.5)]
-
-transformed_ds = ds.map_batches(transform_batch, batch_format="pandas")
-# Dataset(num_blocks=10, num_rows=3,
-#         schema={sepal.length: float64, sepal.width: float64,
-#                 petal.length: float64, petal.width: float64, variety: object})
-
-transformed_ds.show()
-# -> {'sepal.length': 5.2, 'sepal.width': 2.7,
-#     'petal.length': 3.9, 'petal.width': 1.4, 'variety': 'Versicolor'}
-# -> {'sepal.length': 5.4, 'sepal.width': 3.0,
-#     'petal.length': 4.5, 'petal.width': 1.5, 'variety': 'Versicolor'}
-# -> {'sepal.length': 4.9, 'sepal.width': 2.5,
-#     'petal.length': 4.5, 'petal.width': 1.7, 'variety': 'Virginica'}
-# __data_transform_end__
-# fmt: on
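The file deleted above demonstrated pandas-based filtering with `Dataset.map_batches`. That pattern still works against the Ray Data API used elsewhere in this diff; the following is a minimal, self-contained sketch, not part of the patch itself. It assumes the same public `s3://anonymous@air-example-data/iris.csv` dataset and the column names shown in the deleted file's schema comments; `filter_rows` is a hypothetical helper name.

```python
import pandas as pd
import ray

ds = ray.data.read_csv("s3://anonymous@air-example-data/iris.csv")

def filter_rows(df: pd.DataFrame) -> pd.DataFrame:
    # Keep rows with sepal length < 5.5 and petal length > 3.5,
    # mirroring the filter in the deleted quick start example.
    return df[(df["sepal length (cm)"] < 5.5) & (df["petal length (cm)"] > 3.5)]

# batch_format="pandas" hands each batch to the function as a DataFrame.
filtered_ds = ds.map_batches(filter_rows, batch_format="pandas")
filtered_ds.show()
```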
diff --git a/doc/source/ray-overview/getting-started.md b/doc/source/ray-overview/getting-started.md
index c3d9d61dc61b2..8a0fce79f92a0 100644
--- a/doc/source/ray-overview/getting-started.md
+++ b/doc/source/ray-overview/getting-started.md
@@ -4,7 +4,7 @@
 (gentle-intro)=
 # Getting Started
 
-Use Ray to scale applications on your laptop or the cloud. Choose the right guide for your task. 
+Use Ray to scale applications on your laptop or the cloud. Choose the right guide for your task.
 * Scale end-to-end ML applications: [Ray AI Runtime Quickstart](#ray-ai-runtime-quickstart)
 * Scale single ML workloads: [Ray Libraries Quickstart](#ray-libraries-quickstart)
 * Scale general Python applications: [Ray Core Quickstart](#ray-core-quickstart)
@@ -94,49 +94,54 @@ Learn more about Ray AIR
 
 ## Ray Libraries Quickstart
 
-Use individual libraries for single ML workloads, without having to install the full AI Runtime package. Click on the dropdowns for your workload below. 
+Use individual libraries for single ML workloads, without having to install the full AI Runtime package. Click on the dropdowns for your workload below.
 
 `````{dropdown} ray Data: Scalable Datasets for ML
 :animate: fade-in-slide-down
 
-Ray Data is the standard way to load and exchange data in Ray libraries and applications.
-Ray Data provides basic distributed data transformations such as `map`, `filter`, and `repartition`.
-They are compatible with a variety of file formats, datasources, and distributed frameworks.
+Scale offline inference and training ingest with [Ray Data](data_key_concepts) --
+a data processing library designed for ML.
+
+To learn more, see [Offline batch inference](batch_inference_overview) and
+[Data preprocessing and ingest for ML training](ml_ingest_overview).
 
 ````{note}
-To run this example install Ray Data and Dask:
+To run this example, install Ray Data:
 
 ```bash
-pip install -U "ray[data]" dask
+pip install -U "ray[data]"
 ```
 ````
 
-Get started by creating a Dataset from synthetic data using ``ray.data.range()`` and ``ray.data.from_items()``.
-A Dataset can hold either plain Python objects (schema is a Python type), or Arrow records (schema is Arrow).
+```{testcode}
+from typing import Dict
+import numpy as np
+import ray
 
-```{literalinclude} ../data/doc_code/quick_start.py
-:language: python
-:start-after: __create_from_python_begin__
-:end-before: __create_from_python_end__
-```
+# Create datasets from on-disk files, Python objects, and cloud storage like S3.
+ds = ray.data.read_csv("s3://anonymous@ray-example-data/iris.csv")
+
-Datasets can be created from files on local disk or remote datasources such as S3. Any filesystem
-[supported by pyarrow](http://arrow.apache.org/docs/python/generated/pyarrow.fs.FileSystem.html) can be used to specify file locations.
-You can also create a ``Dataset`` from existing data in the Ray object store or Ray-compatible distributed DataFrames:
+# Apply functions to transform data. Ray Data executes transformations in parallel.
+def compute_area(batch: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
+    length = batch["petal length (cm)"]
+    width = batch["petal width (cm)"]
+    batch["petal area (cm^2)"] = length * width
+    return batch
 
-```{literalinclude} ../data/doc_code/quick_start.py
-:language: python
-:start-after: __create_from_files_begin__
-:end-before: __create_from_files_end__
+transformed_ds = ds.map_batches(compute_area)
+
+# Iterate over batches of data.
+for batch in transformed_ds.iter_batches(batch_size=4):
+    print(batch)
+
+# Save dataset contents to on-disk files or cloud storage.
+transformed_ds.write_parquet("local:///tmp/iris/")
 ```
 
-Datasets can be transformed in parallel using ``.map()``.
-Transformations are executed *eagerly* and block until the operation is finished.
-Datasets also supports ``.filter()`` and ``.flat_map()``.
+```{testoutput}
+:hide:
 
-```{literalinclude} ../data/doc_code/quick_start.py
-:language: python
-:start-after: __data_transform_begin__
-:end-before: __data_transform_end__
+...
 ```
 
 ```{button-ref} ../data/data
@@ -152,7 +157,7 @@ Learn more about Ray Data
 :animate: fade-in-slide-down
 
 Ray Train abstracts away the complexity of setting up a distributed training
-system. 
+system.
 
 `````{tab-set}
 
@@ -296,8 +301,8 @@ Learn more about Ray Train
 `````{dropdown} ray Tune: Hyperparameter Tuning at Scale
 :animate: fade-in-slide-down
 
-[Tune](../tune/index.rst) is a library for hyperparameter tuning at any scale. 
-With Tune, you can launch a multi-node distributed hyperparameter sweep in less than 10 lines of code. 
+[Tune](../tune/index.rst) is a library for hyperparameter tuning at any scale.
+With Tune, you can launch a multi-node distributed hyperparameter sweep in less than 10 lines of code.
 Tune supports any deep learning framework, including PyTorch, TensorFlow, and Keras.
 
 ````{note}
@@ -336,7 +341,7 @@ Learn more about Ray Tune
 `````{dropdown} ray Serve: Scalable Model Serving
 :animate: fade-in-slide-down
 
-[Ray Serve](../serve/index) is a scalable model-serving library built on Ray. 
+[Ray Serve](../serve/index) is a scalable model-serving library built on Ray.
 
 ````{note}
 To run this example, install Ray Serve and scikit-learn:
@@ -400,7 +405,7 @@ Learn more about Ray RLlib
 ## Ray Core Quickstart
 
 Turn functions and classes easily into Ray tasks and actors,
-for Python and Java, with simple primitives for building and running distributed applications. 
+for Python and Java, with simple primitives for building and running distributed applications.
 
 ``````{dropdown} ray Core: Parallelizing Functions with Ray Tasks
 
@@ -445,7 +450,7 @@ To run this example, add the [ray-api](https://mvnrepository.com/artifact/io.ray
 ```
 
 Use `Ray.init` to initialize the Ray runtime.
-Then use `Ray.task(...).remote()` to convert any Java static method into a Ray task. 
+Then use `Ray.task(...).remote()` to convert any Java static method into a Ray task.
 The task runs asynchronously in a remote worker process.
 The `remote` method returns an ``ObjectRef``, and you can fetch the actual result with ``get``.
 
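For readers following along in Python rather than Java, the same task pattern uses the `@ray.remote` decorator. This sketch is not part of the diff; the `square` function simply mirrors the Java example below.

```python
import ray

ray.init()

# @ray.remote converts a plain Python function into a Ray task.
@ray.remote
def square(x: int) -> int:
    return x * x

# Each .remote() call runs asynchronously in a remote worker process and
# returns an object ref; ray.get() fetches the actual results.
object_refs = [square.remote(i) for i in range(4)]
print(ray.get(object_refs))  # [0, 1, 4, 9]
```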
@@ -461,7 +466,7 @@ public class RayDemo {
   public static int square(int x) {
     return x * x;
   }
-  
+
   public static void main(String[] args) {
     // Initialize Ray runtime.
     Ray.init();
@@ -556,18 +561,18 @@ import java.util.stream.Collectors;
 
 public class RayDemo {
 
   public static class Counter {
-    
+
     private int value = 0;
-    
+
     public void increment() {
       this.value += 1;
     }
-    
+
     public int read() {
       return this.value;
     }
   }
-  
+
   public static void main(String[] args) {
     // Initialize Ray runtime.
     Ray.init();
@@ -577,7 +582,7 @@ public class RayDemo {
     for (int i = 0; i < 4; i++) {
       counters.add(Ray.actor(Counter::new).remote());
     }
-    
+
     // Invoke the `increment` method on each actor.
     // This will send an actor task to each remote actor.
     for (ActorHandle<Counter> counter : counters) {
@@ -712,12 +717,12 @@ Run the following code.
 def task_running_300_seconds():
     print("Start!")
     time.sleep(300)
-    
+
 @ray.remote
 class Actor:
     def __init__(self):
         print("Actor created")
-    
+
 # Create 2 tasks
 tasks = [task_running_300_seconds.remote() for _ in range(2)]
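The Java `Counter` actor above also has a direct Python counterpart. As a sketch, not taken from the diff, the class below mirrors the Java example's fields and methods; Ray initializes automatically on the first `.remote()` call.

```python
import ray

# @ray.remote on a class creates an actor: a stateful remote worker process.
@ray.remote
class Counter:
    def __init__(self):
        self.value = 0

    def increment(self):
        self.value += 1

    def read(self):
        return self.value

# Create four actors, send an increment task to each, then read their state.
counters = [Counter.remote() for _ in range(4)]
ray.get([counter.increment.remote() for counter in counters])
print(ray.get([counter.read.remote() for counter in counters]))  # [1, 1, 1, 1]
```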