[Templates] Update for ray 2.5.0 (#36367)

This PR removes some remnants of using `runtime_env` (the error message within the actor if `statsforecast` is not installed). This is no longer needed now that we recommend `pip install --user`. This PR also improves the wording of the explanations for resource configuration in the many model training example. Signed-off-by: Justin Yu <justinvyu@anyscale.com>
ray-project · Jun 13, 2023 · dbfe420 · dbfe420
1 parent f8a1bdf
commit dbfe420
Show file tree

Hide file tree

Showing 13 changed files with 85 additions and 72 deletions.
diff --git a/doc/source/templates/01_batch_inference/README.md b/doc/source/templates/01_batch_inference/README.md
@@ -5,7 +5,7 @@
 | Summary | This template walks through GPU batch inference on an image dataset using a PyTorch ResNet model. |
 | Time to Run | Less than 2 minutes to compute predictions on the dataset. |
 | Minimum Compute Requirements | No hard requirements. The default is 4 nodes, each with 1 NVIDIA T4 GPU. |
-| Cluster Environment | This template uses the latest Anyscale-provided Ray ML image using Python 3.9: [`anyscale/ray-ml:2.4.0-py39-gpu`](https://docs.anyscale.com/reference/base-images/ray-240/py39#ray-ml-2-4-0-py39). If you want to change to a different cluster environment, make sure that it is based off of this image! |
+| Cluster Environment | This template uses the latest Anyscale-provided Ray ML image using Python 3.9: [`anyscale/ray-ml:latest-py39-gpu`](https://docs.anyscale.com/reference/base-images/overview). If you want to change to a different cluster environment, make sure that it is based off of this image! |
 
 ## Getting Started
 

diff --git a/doc/source/templates/01_batch_inference/start.ipynb b/doc/source/templates/01_batch_inference/start.ipynb
@@ -13,7 +13,7 @@
     "| Summary | This template walks through GPU batch inference on an image dataset using a PyTorch ResNet model. |\n",
     "| Time to Run | Less than 2 minutes to compute predictions on the dataset. |\n",
     "| Minimum Compute Requirements | No hard requirements. The default is 4 nodes, each with 1 NVIDIA T4 GPU. |\n",
-    "| Cluster Environment | This template uses the latest Anyscale-provided Ray ML image using Python 3.9: [`anyscale/ray-ml:2.4.0-py39-gpu`](https://docs.anyscale.com/reference/base-images/ray-240/py39#ray-ml-2-4-0-py39). If you want to change to a different cluster environment, make sure that it is based off of this image! |\n",
+    "| Cluster Environment | This template uses the latest Anyscale-provided Ray ML image using Python 3.9: [`anyscale/ray-ml:latest-py39-gpu`](https://docs.anyscale.com/reference/base-images/overview). If you want to change to a different cluster environment, make sure that it is based off of this image! |\n",
     "\n",
     "By the end, we will have classified > 3000 images using the pre-trained ResNet model and saved these predictions to a local directory.\n",
     "\n",
@@ -260,7 +260,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "if NUM_WORKERS > ray.available_resources()[\"GPU\"]:\n",
+    "if USE_GPU and NUM_WORKERS > ray.available_resources()[\"GPU\"]:\n",
     "    print(\n",
     "        \"Your cluster does not currently have enough resources to run with these settings. \"\n",
     "        \"Consider decreasing the number of workers, decreasing the resources needed \"\n",

diff --git a/doc/source/templates/02_many_model_training/README.md b/doc/source/templates/02_many_model_training/README.md
@@ -5,7 +5,7 @@
 | Summary | This template demonstrates how to parallelize the training of hundreds of time-series forecasting models with [Ray Tune](https://docs.ray.io/en/latest/tune/index.html). The template uses the `statsforecast` library to fit models to partitions of the M4 forecasting competition dataset. |
 | Time to Run | Around 5 minutes to train all models. |
 | Minimum Compute Requirements | No hard requirements. The default is 8 nodes with 8 CPUs each. |
-| Cluster Environment | This template uses the latest Anyscale-provided Ray ML image using Python 3.9: [`anyscale/ray-ml:2.4.0-py39-gpu`](https://docs.anyscale.com/reference/base-images/ray-240/py39#ray-ml-2-4-0-py39), with some extra requirements from `requirements.txt` installed on top. If you want to change to a different cluster environment, make sure that it is based off of this image and includes all packages listed in the `requirements.txt` file. |
+| Cluster Environment | This template uses the latest Anyscale-provided Ray ML image using Python 3.9: [`anyscale/ray-ml:latest-py39-gpu`](https://docs.anyscale.com/reference/base-images/overview), with some extra requirements from `requirements.txt` installed on top. If you want to change to a different cluster environment, make sure that it is based off of this image and includes all packages listed in the `requirements.txt` file. |
 
 ## Getting Started
 

diff --git a/doc/source/templates/02_many_model_training/start.ipynb b/doc/source/templates/02_many_model_training/start.ipynb
@@ -13,7 +13,7 @@
     "| Summary | This template demonstrates how to parallelize the training of hundreds of time-series forecasting models with [Ray Tune](https://docs.ray.io/en/latest/tune/index.html). The template uses the `statsforecast` library to fit models to partitions of the M4 forecasting competition dataset. |\n",
     "| Time to Run | Around 5 minutes to train all models. |\n",
     "| Minimum Compute Requirements | No hard requirements. The default is 8 nodes with 8 CPUs each. |\n",
-    "| Cluster Environment | This template uses the latest Anyscale-provided Ray ML image using Python 3.9: [`anyscale/ray-ml:2.4.0-py39-gpu`](https://docs.anyscale.com/reference/base-images/ray-240/py39#ray-ml-2-4-0-py39), with some extra requirements from `requirements.txt` installed on top. If you want to change to a different cluster environment, make sure that it is based off of this image and includes all packages listed in the `requirements.txt` file. |\n",
+    "| Cluster Environment | This template uses the latest Anyscale-provided Ray ML image using Python 3.9: [`anyscale/ray-ml:latest-py39-gpu`](https://docs.anyscale.com/reference/base-images/overview), with some extra requirements from `requirements.txt` installed on top. If you want to change to a different cluster environment, make sure that it is based off of this image and includes all packages listed in the `requirements.txt` file. |\n",
     "\n",
     "The end result of the template is fitting multiple models on each dataset partition, then determining the best model based on cross-validation metrics. Then, using the best model, we can generate forecasts like the ones shown below:\n",
     "\n",
@@ -38,6 +38,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "52aa4f70",
    "metadata": {},
@@ -46,6 +47,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "488cd257",
    "metadata": {},
@@ -54,7 +56,7 @@
     "For this, we'll use `pip install --user` to install the necessary requirements. On an Anyscale Workspace, this is configured to install packages to a shared filesystem that will be available to all nodes in the cluster.\n",
     "\n",
     "```\n",
-    "pip install --user -r requirements.txt --upgrade\n",
+    "pip install --user -r requirements.txt\n",
     "```\n",
     "\n",
     "After installing all the requirements, we'll start with some imports."
@@ -67,7 +69,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import matplotlib.pyplot as plt\n",
     "import pandas as pd\n",
+    "from statsforecast import StatsForecast\n",
+    "from statsforecast.models import AutoARIMA, AutoETS, MSTL\n",
     "\n",
     "from ray import tune\n",
     "from ray.air import session\n"
@@ -93,7 +98,11 @@
    "id": "060ee3ce",
    "metadata": {},
    "source": [
-    "> ✂️ Replace this with your own training logic to run per dataset partition."
+    "> ✂️ Replace this with your own training logic to run per dataset partition.\n",
+    ">\n",
+    "> The only additional Ray Tune code that is added is the `session.report`\n",
+    "> at the end of the training function. This reports metrics for Ray Tune to log,\n",
+    "> which can be analyzed after the run finishes."
    ]
   },
   {
@@ -103,16 +112,20 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def train_fn(config: dict):\n",
-    "    try:\n",
-    "        from statsforecast import StatsForecast\n",
-    "        from statsforecast.models import AutoARIMA, AutoETS, MSTL\n",
-    "        import matplotlib.pyplot as plt\n",
-    "    except ImportError as e:\n",
-    "        raise RuntimeError(\n",
-    "            \"Did you use a runtime environment to set up dependencies?\"\n",
-    "        ) from e\n",
+    "n_cv_windows = 1\n",
+    "\n",
+    "# Try two different types of forecasting models per dataset partition.\n",
+    "# The dataset contains hourly records, so the `season_length` is 24 hours.\n",
+    "models = [\n",
+    "    AutoETS(season_length=24),\n",
+    "    MSTL(season_length=24, trend_forecaster=AutoARIMA()),\n",
+    "]\n",
     "\n",
+    "# See the appendix for info on setting resource requirements for each trial.\n",
+    "cpus_per_trial = len(models) * n_cv_windows\n",
+    "\n",
+    "\n",
+    "def train_fn(config: dict):\n",
     "    # First, define some helper functions for fetching data and computing eval metrics.\n",
     "\n",
     "    def get_m5_partition(unique_id: str) -> pd.DataFrame:\n",
@@ -149,25 +162,20 @@
     "    data_partition_id = config[\"data_partition_id\"]\n",
     "    train_df = get_m5_partition(data_partition_id)\n",
     "\n",
-    "    n_windows = 1\n",
     "    forecast_horizon = 24  # Forecast the next 24 hours\n",
     "\n",
-    "    # Try two different types of forecasting models.\n",
-    "    # The dataset contains hourly records, so the `season_length` is 24 hours.\n",
-    "    models = [\n",
-    "        AutoETS(season_length=24),\n",
-    "        MSTL(season_length=24, trend_forecaster=AutoARIMA()),\n",
-    "    ]\n",
     "    sf = StatsForecast(\n",
     "        df=train_df,\n",
     "        models=models,\n",
     "        freq=\"H\",\n",
-    "        n_jobs=len(models) * n_windows,\n",
+    "        # Set the number of cores used by statsforecast to the\n",
+    "        # number of CPUs assigned to the trial!\n",
+    "        n_jobs=cpus_per_trial,\n",
     "    )\n",
     "    cv_df = sf.cross_validation(\n",
     "        h=forecast_horizon,\n",
     "        step_size=forecast_horizon,\n",
-    "        n_windows=n_windows,\n",
+    "        n_windows=n_cv_windows,\n",
     "    )\n",
     "\n",
     "    eval_df = evaluate_cross_validation(df=cv_df)\n",
@@ -189,9 +197,7 @@
     "    session.report({\"forecast_mse\": forecast_mse, \"best_model\": best_model})\n",
     "\n",
     "\n",
-    "trainable = train_fn\n",
-    "# See the appendix for info on setting resource requirements on the training function.\n",
-    "trainable = tune.with_resources(trainable, resources={\"CPU\": 2 * 1})\n"
+    "trainable = tune.with_resources(train_fn, resources={\"CPU\": cpus_per_trial})\n"
    ]
   },
   {
@@ -244,6 +250,7 @@
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "13b4dd3e",
    "metadata": {},
@@ -341,13 +348,14 @@
     "`tune.with_resources` was used to specify the resources needed to launch one of our training jobs.\n",
     "Feel free to change this to the resources required by your application! You can also comment out the `tune.with_resources` block to assign `1 CPU` (the default) to each trial.\n",
     "\n",
-    "Note that this is purely for Tune to know how many trials to schedule concurrently -- setting the number of CPUs does not actually enforce any kind of resource isolation!\n",
-    "In this template, `statsforecast` runs cross validation in parallel with M models * N temporal cross-validation windows (e.g. 2 * 1).\n",
+    "Note that the number of CPUs to assign a trial is dependent on the workload.\n",
+    "In this template, `statsforecast` has an `n_jobs` configuration that determines the number of CPU cores to use for performing the model fitting and cross-validation *within a trial*. So, we should set `n_jobs = cpus_per_trial`. We chose to set the parallelism equal to the total number of models that are fitted during cross-validation: `M model types * N temporal cross-validation windows = 2 * 1 = 2`.\n",
     "\n",
     "See [Ray Tune's guide on assigning resources](https://docs.ray.io/en/latest/tune/tutorials/tune-resources.html) for more information."
    ]
   },
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "dd48618e",
    "metadata": {},

diff --git a/doc/source/templates/03_serving_stable_diffusion/README.md b/doc/source/templates/03_serving_stable_diffusion/README.md
@@ -5,7 +5,7 @@
 | Summary | This template loads a pretrained stable diffusion model from HuggingFace and serves it to a local endpoint as a [Ray Serve](https://docs.ray.io/en/latest/serve/index.html) deployment. |
 | Time to Run | Around 2 minutes to setup the models and generate your first image(s). Less than 10 seconds for every subsequent round of image generation (depending on the image size). |
 | Minimum Compute Requirements | At least 1 GPU node. The default is 4 nodes, each with 1 NVIDIA T4 GPU. |
-| Cluster Environment | This template uses a docker image built on top of the latest Anyscale-provided Ray image using Python 3.9: [`anyscale/ray:2.4.0-py39-cu118`](https://docs.anyscale.com/reference/base-images/ray-240/py39). See the appendix below for more details. |
+| Cluster Environment | This template uses a docker image built on top of the latest Anyscale-provided Ray image using Python 3.9: [`anyscale/ray:latest-py39-cu118`](https://docs.anyscale.com/reference/base-images/overview). See the appendix below for more details. |
 
 ## Get Started
 
@@ -40,5 +40,5 @@ Finally, update your workspace's cluster environment to this new one after it's
 Use the following `docker pull` command if you want to manually build a new Docker image based off of this one.
 
 ```bash
-docker pull us-docker.pkg.dev/anyscale-workspace-templates/workspace-templates/serve-stable-diffusion-model-ray-serve:2.4.0
+docker pull us-docker.pkg.dev/anyscale-workspace-templates/workspace-templates/serve-stable-diffusion-model-ray-serve:latest
 ```
diff --git a/doc/source/templates/03_serving_stable_diffusion/cluster_env.yaml b/doc/source/templates/03_serving_stable_diffusion/cluster_env.yaml
@@ -1,21 +1,21 @@
 # See https://hub.docker.com/r/anyscale/ray for full list of
 # available Ray, Python, and CUDA versions.
-base_image: anyscale/ray:2.4.0-py39-cu118
+base_image: anyscale/ray:latest-py39-cu118
 
 env_vars: {}
 
 debian_packages: []
 
 python:
   pip_packages:
-    - accelerate==0.19.0
-    - diffusers==0.15.1
-    - fastapi==0.95.1
+    - accelerate==0.20.3
+    - diffusers==0.17.1
+    - fastapi==0.97.0
     - ipywidgets
     - matplotlib==3.7.1
     - numpy==1.24.3
-    - torch==2.0.0
-    - transformers==4.28.1
+    - torch==2.0.1
+    - transformers==4.30.1
 
   conda_packages: []
 

diff --git a/doc/source/templates/03_serving_stable_diffusion/start.ipynb b/doc/source/templates/03_serving_stable_diffusion/start.ipynb
@@ -13,7 +13,7 @@
     "| Summary | This template loads a pretrained stable diffusion model from HuggingFace and serves it to a local endpoint as a [Ray Serve](https://docs.ray.io/en/latest/serve/index.html) deployment. |\n",
     "| Time to Run | Around 2 minutes to setup the models and generate your first image(s). Less than 10 seconds for every subsequent round of image generation (depending on the image size). |\n",
     "| Minimum Compute Requirements | At least 1 GPU node. The default is 4 nodes, each with 1 NVIDIA T4 GPU. |\n",
-    "| Cluster Environment | This template uses a custom docker image built on top of the Anyscale-provided Ray image using Python 3.9: [`anyscale/ray:2.4.0-py39-cu118`](https://docs.anyscale.com/reference/base-images/ray-240/py39). See the appendix in the `README` for more details. |\n",
+    "| Cluster Environment | This template uses a custom docker image built on top of the Anyscale-provided Ray image using Python 3.9: [`anyscale/ray:latest-py39-cu118`](https://docs.anyscale.com/reference/base-images/overview). See the appendix in the `README` for more details. |\n",
     "\n",
     "By the end, we'll have an application that generates images using stable diffusion for a given prompt!\n",
     "\n",
@@ -279,8 +279,8 @@
     "        requests.get(endpoint, timeout=0.1)\n",
     "    except Exception as e:\n",
     "        raise RuntimeWarning(\n",
-    "            \"Did you setup the Ray Serve model replicas with \"\n",
-    "            \"`python server.py --num-replicas=...` in another terminal yet?\"\n",
+    "            \"Did you setup the Ray Serve model replicas with `serve.run` \"\n",
+    "            \"in a previous cell?\"\n",
     "        ) from e\n",
     "\n",
     "    generation_times = []\n",

diff --git a/doc/source/templates/templates.yaml b/doc/source/templates/templates.yaml
@@ -10,7 +10,7 @@
 #   ## Some sample `build_id`'s to choose from:
 #   ## - anyscaleray-ml240-py39-gpu -> anyscale/ray-ml:2.4.0-py39-gpu
 #   ## - anyscale240-py39 -> anyscale/ray:2.4.0-py39
-#     build_id: anyscaleray-ml240-py39-gpu
+#     build_id: anyscaleray-ml250-py39-gpu
 #   ## OR, use a publicly hosted image
 #   # byod:
 #   #   docker_image: url of docker image
@@ -27,7 +27,7 @@ batch-inference-ray-data:
   labels:
     - Ray Data
   cluster_env:
-    build_id: anyscaleray-ml240-py39-gpu
+    build_id: anyscaleray-ml250-py39-gpu
   compute_config:
     GCP: doc/source/templates/configs/compute/gpu/gce.yaml
     AWS: doc/source/templates/configs/compute/gpu/aws.yaml
@@ -39,7 +39,7 @@ many-model-training-ray-tune:
   labels:
     - Ray Tune
   cluster_env:
-    build_id: anyscaleray-ml240-py39-gpu
+    build_id: anyscaleray-ml250-py39-gpu
   compute_config:
     GCP: doc/source/templates/configs/compute/cpu/gce.yaml
     AWS: doc/source/templates/configs/compute/cpu/aws.yaml
@@ -52,8 +52,8 @@ serve-stable-diffusion-model-ray-serve:
     - Ray Serve
   cluster_env:
     byod:
-      docker_image: us-docker.pkg.dev/anyscale-workspace-templates/workspace-templates/serve-stable-diffusion-model-ray-serve:2.4.0
-      ray_version: 2.4.0
+      docker_image: us-docker.pkg.dev/anyscale-workspace-templates/workspace-templates/serve-stable-diffusion-model-ray-serve:2.5.0
+      ray_version: 2.5.0
   compute_config:
     GCP: doc/source/templates/configs/compute/gpu/gce.yaml
-    AWS: doc/source/templates/configs/compute/gpu/aws.yaml
+    AWS: doc/source/templates/configs/compute/gpu/aws.yaml
diff --git a/doc/source/templates/testing/cluster_envs/02_many_model_training.yaml b/doc/source/templates/testing/cluster_envs/02_many_model_training.yaml
@@ -0,0 +1,11 @@
+base_image: anyscale/ray-ml:nightly-py39-gpu
+env_vars: {}
+
+post_build_cmds:
+  # Install Ray
+  - pip3 uninstall -y ray || true && pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
+  - {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }}
+
+python:
+  pip_packages:
+    - statsforecast==1.5.0
diff --git a/doc/source/templates/testing/cluster_envs/03_serving_stable_diffusion.yaml b/doc/source/templates/testing/cluster_envs/03_serving_stable_diffusion.yaml
@@ -1,5 +1,5 @@
 base_image: anyscale/ray:nightly-py39-cu118
-env_vars: {}
+
 debian_packages:
   - curl
 
@@ -8,19 +8,13 @@ post_build_cmds:
   - pip3 uninstall -y ray || true && pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
   - {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }}
 
-debian_packages: []
-
 python:
   pip_packages:
-    - accelerate==0.19.0
-    - diffusers==0.15.1
-    - fastapi==0.95.1
+    - accelerate==0.20.3
+    - diffusers==0.17.1
+    - fastapi==0.97.0
     - ipywidgets
     - matplotlib==3.7.1
     - numpy==1.24.3
-    - torch==2.0.0
-    - transformers==4.28.1
-
-  conda_packages: []
-
-post_build_cmds: []
+    - torch==2.0.1
+    - transformers==4.30.1
diff --git a/...s/default_cluster_env_latest_ml_py39.yaml → .../default_cluster_env_nightly_ml_py39.yaml b/...s/default_cluster_env_latest_ml_py39.yaml → .../default_cluster_env_nightly_ml_py39.yaml
@@ -1,4 +1,4 @@
-base_image: anyscale/ray-ml:2.4.0-py39-gpu
+base_image: anyscale/ray-ml:nightly-py39-gpu
 env_vars: {}
 debian_packages:
   - curl

diff --git a/doc/source/templates/testing/docker/03_serving_stable_diffusion/requirements.txt b/doc/source/templates/testing/docker/03_serving_stable_diffusion/requirements.txt
@@ -1,8 +1,8 @@
-accelerate==0.19.0
-diffusers==0.15.1
-fastapi==0.95.1
+accelerate==0.20.3
+diffusers==0.17.1
+fastapi==0.97.0
 ipywidgets
 matplotlib==3.7.1
 numpy==1.24.3
-torch==2.0.0
-transformers==4.28.1
+torch==2.0.1
+transformers==4.30.1