From 7a9f3e0f0d7f004564cfb4c4f7a329d3a71c7b28 Mon Sep 17 00:00:00 2001 From: Yunxuan Xiao Date: Mon, 24 Jul 2023 21:12:04 -0700 Subject: [PATCH] [Doc][Example] Fine-tune `vicuna-13b-v1.3` with LightningTrainer + DeepSpeed (#37016) Signed-off-by: woshiyyya Signed-off-by: Yunxuan Xiao Signed-off-by: matthewdeng Co-authored-by: matthewdeng Signed-off-by: e428265 --- doc/source/_toc.yml | 2 + doc/source/ray-overview/examples.rst | 7 + doc/source/train/examples.rst | 8 + doc/source/train/examples/lightning/BUILD | 5 +- ...una_13b_lightning_deepspeed_finetune.ipynb | 1424 +++++++++++++++++ .../test_myst_doc.py | 1 + .../vicuna_13b_deepspeed_compute_aws.yaml | 20 + .../vicuna_13b_deepspeed_env.yaml | 27 + ...una_13b_lightning_deepspeed_finetune.ipynb | 1 + release/release_tests.yaml | 21 + 10 files changed, 1515 insertions(+), 1 deletion(-) create mode 100644 doc/source/train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune.ipynb create mode 120000 release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/test_myst_doc.py create mode 100644 release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_deepspeed_compute_aws.yaml create mode 100644 release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_deepspeed_env.yaml create mode 120000 release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_lightning_deepspeed_finetune.ipynb diff --git a/doc/source/_toc.yml b/doc/source/_toc.yml index e6b0a66357524..35be6f2945091 100644 --- a/doc/source/_toc.yml +++ b/doc/source/_toc.yml @@ -139,6 +139,8 @@ parts: title: "Torch Data Prefetching Benchmark" - file: train/examples/pytorch/pytorch_resnet_finetune title: "PyTorch Finetuning ResNet Example" + - file: train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune + title: "Fine-tune Vicuna-13B with DeepSpeed and PyTorch Lightning" - file: train/faq - file: train/api/api diff --git a/doc/source/ray-overview/examples.rst b/doc/source/ray-overview/examples.rst index 5aa2698943f67..62acc120ce7ee 100644 --- a/doc/source/ray-overview/examples.rst +++ b/doc/source/ray-overview/examples.rst @@ -1618,3 +1618,10 @@ Ray Examples .. button-ref:: /serve/tutorials/streaming Using Ray Serve to deploy a chatbot + + .. grid-item-card:: :bdg-secondary:`Code example` + :class-item: gallery-item training llm + + .. button-ref:: /train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune + + Fine-tune vicuna-13b-v1.3 with DeepSpeed and LightningTrainer diff --git a/doc/source/train/examples.rst b/doc/source/train/examples.rst index b1c7807d330ab..66143800e3405 100644 --- a/doc/source/train/examples.rst +++ b/doc/source/train/examples.rst @@ -72,6 +72,14 @@ Distributed Training Examples using Ray Train .. button-ref:: dolly_lightning_fsdp_finetuning Fine-tune LLM with AIR LightningTrainer and FSDP + + .. grid-item-card:: + :img-top: /images/pytorch_lightning_small.png + :class-img-top: pt-2 w-75 d-block mx-auto fixed-height-img + + .. 
button-ref:: vicuna_lightning_deepspeed_finetuning
+
+      Fine-tune vicuna-13b-v1.3 with DeepSpeed and LightningTrainer
 
 
 Ray Train Examples Using Loggers & Callbacks
 
diff --git a/doc/source/train/examples/lightning/BUILD b/doc/source/train/examples/lightning/BUILD
index 97d8822771b25..7532a168e1b79 100644
--- a/doc/source/train/examples/lightning/BUILD
+++ b/doc/source/train/examples/lightning/BUILD
@@ -10,7 +10,10 @@ filegroup(
 py_test_run_all_notebooks(
     size="large",
     include=["*.ipynb"],
-    exclude=["lightning_exp_tracking.ipynb"],
+    exclude=[
+        "lightning_exp_tracking.ipynb",  # CPU test
+        "vicuna_13b_lightning_deepspeed_finetune.ipynb",  # Release test
+    ],
     data=["//doc/source/train/examples/lightning:lightning_examples"],
     tags=["exclusive", "team:ml", "gpu", "ray_air"],
 )
diff --git a/doc/source/train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune.ipynb b/doc/source/train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune.ipynb
new file mode 100644
index 0000000000000..debf22976e255
--- /dev/null
+++ b/doc/source/train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune.ipynb
@@ -0,0 +1,1424 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "(vicuna_lightning_deepspeed_finetuning)=\n",
+    "\n",
+    "# Fine-tune `vicuna-13b` with Ray LightningTrainer and DeepSpeed\n",
+    "\n",
+    "In this example, we demonstrate how to perform full fine-tuning of a [`vicuna-13b-v1.3`](https://huggingface.co/lmsys/vicuna-13b-v1.3) model using LightningTrainer with the DeepSpeed ZeRO-3 strategy.\n",
+    "\n",
+    "- [DeepSpeed](https://github.com/microsoft/DeepSpeed) is an open-source deep learning optimization library for PyTorch. It reduces compute and memory requirements, and enables training large distributed models by leveraging state-of-the-art innovations like ZeRO, 3D-Parallelism, DeepSpeed-MoE, and ZeRO-Infinity.\n",
+    "- PyTorch Lightning offers a [DeepSpeed integration](https://lightning.ai/docs/pytorch/stable/api/pytorch_lightning.strategies.DeepSpeedStrategy.html), which provides a simple interface for configuring DeepSpeed's knobs and automatically launches your training process with the DeepSpeed Engine.\n",
+    "- {class}`Ray LightningTrainer <ray.train.lightning.LightningTrainer>` allows you to easily scale your PyTorch Lightning job across multiple nodes in a Ray cluster, without worrying about the underlying cluster management, autoscaling, and distributed process group settings.\n",
+    "\n",
+    "This demo illustrates how these three tools can be combined effectively to fine-tune the Vicuna-13B model, leveraging the strengths of each to create an efficient and high-performance deep learning solution.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "```{note}\n",
+    "This is an advanced example of Large Language Model fine-tuning with Ray AIR. If you are new to the concepts of Ray AIR and LightningTrainer, it is worth first exploring the introductory documentation below to build a foundational understanding:\n",
+    "- [Ray AIR Key Concepts](air-key-concepts)\n",
+    "- [Ray Data Key Concepts](data_key_concepts)\n",
+    "- {ref}`[Basic] Image Classification with LightningTrainer `\n",
+    "- {ref}`[Intermediate] Using LightningTrainer with Ray Data `\n",
+    "```\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Cluster Setup\n",
+    "\n",
+    "### Compute instances\n",
+    "In this example, we set up a Ray cluster on AWS with the following settings:\n",
+    "\n",
+    "|             | Num | Instance type | GPUs per node | GPU memory | CPU memory |\n",
+    "|-|-|-|-|-|-|\n",
+    "| Head node   | 1   | g5.16xlarge   | 1 x A10G      | 24 GB      | 256 GB     |\n",
+    "| Worker node | 15  | g5.4xlarge    | 1 x A10G      | 24 GB      | 64 GB      |\n",
+    "\n",
+    "```{note}\n",
+    "In this example, we used 16 A10G GPUs for model training and tuned the DeepSpeed configurations for this setup. If you have a different cluster setup or GPUs with lower memory capacities, you may need to modify the DeepSpeed configurations and batch size to fit the model into the GPUs.\n",
+    "```\n",
+    "\n",
+    "```{tip}\n",
+    "We selected a GPU instance with additional CPU memory for the head node in order to demonstrate single-node offline inference. If you are only training, you can opt for the g5.4xlarge instance for the head node as well.\n",
+    "```\n",
+    "\n",
+    "### Cloud Storage\n",
+    "\n",
+    "Additionally, since the checkpoint size for this 13B-parameter model is large (~140GB), we chose to store the checkpoints in AWS S3. Thanks to the distributed checkpointing feature introduced in Ray 2.5, each worker uploads its own shards individually to the S3 bucket, greatly reducing the latency and network traffic of checkpoint syncing.\n",
+    "\n",
+    "### Local Storage\n",
+    "To demonstrate offline inference, we need to download and consolidate the model checkpoint onto the head node, which requires around 200GB of disk storage. Therefore, we mounted the NVMe SSD provided by g5 instances at `/dev/nvme1n1` to `/mnt/local_storage`, and we save the consolidated checkpoint in this folder.\n",
+    "\n",
+    "For more details, see [Amazon EBS and NVMe on Linux instances](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/nvme-ebs-volumes.html).\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Set Up the Ray Environment\n",
+    "\n",
+    "We define a runtime environment to ensure that the Ray workers have access to all the necessary packages. If you have already included these dependencies in your Docker image or installed them on each node, you can ignore the `runtime_env` argument.\n",
+    "\n",
+    "```{note}\n",
+    "The codebases of `transformers`, `accelerate`, and `deepspeed` are all changing rapidly, so we have pinned the package versions here to ensure testing stability. You can try other version combinations and feel free to report any issues you encounter.\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import ray\n",
+    "\n",
+    "NUM_WORKERS = 16\n",
+    "BATCH_SIZE_PER_WORKER = 8\n",
+    "MODEL_NAME = \"lmsys/vicuna-13b-v1.3\"\n",
+    "\n",
+    "ray.init(\n",
+    "    runtime_env={\n",
+    "        \"pip\": [\n",
+    "            \"datasets==2.13.1\",\n",
+    "            \"torch>=1.13.0\",\n",
+    "            \"deepspeed==0.9.4\",\n",
+    "            \"accelerate==0.20.3\",\n",
+    "            \"transformers==4.30.2\",\n",
+    "            \"pytorch_lightning==2.0.3\",\n",
+    "        ]\n",
+    "    }\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load and preprocess datasets\n",
+    "\n",
+    "LLMs have impressive zero-shot text-generation abilities, but some perform poorly at code generation because code is underrepresented in their training corpora. The CMU [CoNaLa](https://conala-corpus.github.io/) (Code/Natural Language Challenge) dataset was designed to test systems that generate program snippets from natural language. Each data record contains an intent sentence and a one-line code snippet. The goal is to fine-tune the Vicuna model on this dataset so that it generates correct and runnable code snippets that fulfill the natural-language intent. Here are some examples:\n",
+    "\n",
+    "| intent | code snippet |\n",
+    "| - | - |\n",
+    "| \"convert a list of integers into a single integer\" | `r = int(''.join(map(str, x)))` |\n",
+    "| \"normalize a pandas dataframe `df` by row\" | `df.div(df.sum(axis=1), axis=0)` |\n",
+    "| \"Convert string '03:55' into datetime.time object\" | `datetime.datetime.strptime('03:55', '%H:%M').time()` |\n",
+    "\n",
+    "The CoNaLa team released a dataset crawled from Stack Overflow, automatically filtered and then curated by annotators, split into 2,379 training and 500 test examples. In addition, they also released an automatically mined dataset with 600k examples. In this demo, we use all of the curated data and the top 5,000 mined examples for fine-tuning."
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here we preprocess the CoNaLa dataset with Ray Data. You can also use HuggingFace Datasets and pass it directly to `LightningConfigBuilder.fit_params()`.",
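+    "\n",
+    "For reference, a minimal sketch of that HuggingFace-Datasets alternative is shown below. This is an illustration only, not what this example runs: the hub id `neulab/conala`, the prompt format, and the variable names are assumptions, while `MODEL_NAME` and `BATCH_SIZE_PER_WORKER` are defined above.\n",
+    "\n",
+    "```python\n",
+    "# Hypothetical sketch: prepare CoNaLa with HuggingFace Datasets and a torch\n",
+    "# DataLoader, then hand the loader to `trainer.fit()` through\n",
+    "# `LightningConfigBuilder.fit_params()` instead of using Ray Data.\n",
+    "# (`LightningConfigBuilder` is imported from `ray.train.lightning` later on.)\n",
+    "from torch.utils.data import DataLoader\n",
+    "from datasets import load_dataset\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "hf_ds = load_dataset(\"neulab/conala\", split=\"train\")  # assumed hub id\n",
+    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)\n",
+    "tokenizer.pad_token = tokenizer.eos_token\n",
+    "\n",
+    "def preprocess(example):\n",
+    "    # Assumed prompt format; reuse your real prompt template here.\n",
+    "    text = f\"Intent: {example['intent']} One-line code snippet: {example['snippet']}\"\n",
+    "    out = tokenizer(text, truncation=True, max_length=128, padding=\"max_length\")\n",
+    "    out[\"labels\"] = list(out[\"input_ids\"])  # causal LM: labels mirror inputs\n",
+    "    return out\n",
+    "\n",
+    "hf_ds = hf_ds.map(preprocess, remove_columns=hf_ds.column_names)\n",
+    "hf_ds.set_format(\"torch\")\n",
+    "train_loader = DataLoader(hf_ds, batch_size=BATCH_SIZE_PER_WORKER)\n",
+    "\n",
+    "builder = LightningConfigBuilder().fit_params(train_dataloaders=train_loader)\n",
+    "```"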
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "535afe3e183b4cdfa61c39cbae788608", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/2 [00:00\",\n", + " axis=1,\n", + " )\n", + " return batch[[\"input_sentence\"]]\n", + "\n", + "\n", + "# Tokenize input sentences to tensors\n", + "def tokenize(batch):\n", + " tokenizer = AutoTokenizer.from_pretrained(\n", + " MODEL_NAME, padding_side=\"left\", use_fast=False\n", + " )\n", + " tokenizer.pad_token = tokenizer.eos_token\n", + " ret = tokenizer(\n", + " list(batch[\"input_sentence\"]),\n", + " truncation=True,\n", + " max_length=128,\n", + " padding=\"max_length\",\n", + " return_tensors=\"np\",\n", + " )\n", + " ret[\"labels\"] = ret[\"input_ids\"].copy()\n", + " return dict(ret)\n", + "\n", + "# Preprocess train dataset\n", + "processed_ds = ray_ds.map_batches(fill_prompt, batch_format=\"pandas\").map_batches(tokenize, batch_format=\"pandas\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define your model\n", + "\n", + "Here we load the pre-trained model weights from HuggingFace Model Hub, and wrap them into `pl.LightningModule`. We adopted the efficient model initialization techniques introduced in [Lightning-transformers](https://github.com/Lightning-Universe/lightning-transformers) to avoid unnecessary full weights loading." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2023-06-30 17:39:35,109] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n" + ] + } + ], + "source": [ + "import torch\n", + "import transformers\n", + "import pytorch_lightning as pl\n", + "from transformers import AutoTokenizer, AutoModelForCausalLM\n", + "from deepspeed.ops.adam import DeepSpeedCPUAdam\n", + "\n", + "\n", + "class ZeRO3Config:\n", + " def __init__(self, pl_module):\n", + " self.config = pl_module.trainer.strategy.config\n", + "\n", + " def __call__(self, *args, **kwargs):\n", + " return self\n", + "\n", + " def is_zero3(self) -> bool:\n", + " return True\n", + "\n", + "\n", + "def enable_transformers_pretrained_deepspeed_sharding(\n", + " pl_module: \"pl.LightningModule\",\n", + ") -> None:\n", + " transformers.deepspeed._hf_deepspeed_config_weak_ref = ZeRO3Config(pl_module)\n", + "\n", + "\n", + "class Vicuna13BModel(pl.LightningModule):\n", + " def __init__(self):\n", + " super().__init__()\n", + " # Enable tf32 for better performance\n", + " torch.backends.cuda.matmul.allow_tf32 = True\n", + "\n", + " def setup(self, stage) -> None:\n", + " # Defer model initialization to inject deepspeed configs to HF.\n", + " # During initialization, HF transformers can immediately partition \n", + " # the model across all gpus avoid the overhead in time and memory \n", + " # copying it on CPU or each GPU first.\n", + " enable_transformers_pretrained_deepspeed_sharding(self)\n", + " self.model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)\n", + " if self.global_rank == 0:\n", + " print(\"DeepSpeed Configs: \", self.trainer.strategy.config)\n", + " print(\"Model Archetecture: \", self.model)\n", + "\n", + " def forward(self, batch):\n", + " outputs = self.model(\n", + " batch[\"input_ids\"],\n", + " labels=batch[\"labels\"],\n", + " attention_mask=batch[\"attention_mask\"],\n", + " )\n", + " return outputs.loss\n", 
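+    "\n",
+    "    # NOTE: `forward` returns `outputs.loss` directly because HF causal-LM\n",
+    "    # models compute the shifted cross-entropy loss internally whenever\n",
+    "    # `labels` are provided.\n",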
+ "\n", + " def training_step(self, batch, batch_idx):\n", + " loss = self.forward(batch)\n", + " self.log(\"train_loss\", loss, prog_bar=True, on_step=True, sync_dist=True)\n", + " return loss\n", + "\n", + " def configure_optimizers(self):\n", + " return DeepSpeedCPUAdam(self.parameters(), lr=2e-5, weight_decay=0.01)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training Configurations\n", + "\n", + "Before training, let's calculate the memory usage of finetuning a `vicuna-13b` model. Assume we are using FP16 mixed-precision training, and the optimizer is Adam with FP32 states.\n", + "\n", + "- Model parameters: 13(billion parameters) * 2(FP16) ≈ 26GB\n", + "- Optimizer states: 13(billion parameters) * 2(momentums per param) * 4 (FP32) ≈ 52GB\n", + "\n", + "As we can see, the model parameters themselves require 26GB, which cannot fit in a single A10G GPU, let alone the activations and optimizers states. Here, we use ZeRO stage-3 to partition the model, gradients, and optimizer states across 16 nodes. Additionally, we employ optimizer CPU offloading to reduce GRAM usage and increase throughput with larger batch sizes. We also disabled parameter offloading and activation checkpointing to improve the training speed.\n", + "\n", + "Regarding other knobs such as `reduce_bucket_size`, `stage3_prefetch_bucket_size` and `stage3_param_persistence_threshold`, we kept them as the [default values in HuggingFace](https://huggingface.co/docs/transformers/main_classes/deepspeed#zero3-config). Feel free to further adjust them to speed up the training process." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from ray.train.lightning import LightningTrainer, LightningConfigBuilder\n", + "from transformers import AutoConfig\n", + "\n", + "config = AutoConfig.from_pretrained(MODEL_NAME)\n", + "HIDDEN_SIZE = config.hidden_size\n", + "\n", + "deepspeed_configs = {\n", + " \"zero_allow_untested_optimizer\": True,\n", + " \"bf16\": {\"enabled\": True},\n", + " \"zero_optimization\": {\n", + " \"stage\": 3,\n", + " \"offload_optimizer\": {\"device\": \"cpu\", \"pin_memory\": True},\n", + " \"overlap_comm\": True,\n", + " \"contiguous_gradients\": True,\n", + " \"reduce_bucket_size\": HIDDEN_SIZE * HIDDEN_SIZE,\n", + " \"stage3_prefetch_bucket_size\": 0.9 * HIDDEN_SIZE * HIDDEN_SIZE,\n", + " \"stage3_param_persistence_threshold\": 10 * HIDDEN_SIZE,\n", + " },\n", + "}\n", + "\n", + "lightning_config = (\n", + " LightningConfigBuilder()\n", + " .module(cls=Vicuna13BModel)\n", + " .trainer(\n", + " max_epochs=1,\n", + " accelerator=\"gpu\",\n", + " precision=\"bf16-mixed\",\n", + " accumulate_grad_batches=2,\n", + " )\n", + " .strategy(name=\"deepspeed\", config=deepspeed_configs)\n", + " .checkpointing(save_top_k=0, save_weights_only=True, save_last=True)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "from pytorch_lightning.callbacks import TQDMProgressBar\n", + "\n", + "# Create a customized progress bar for LightningTrainer\n", + "class VicunaProgressBar(TQDMProgressBar):\n", + " def __init__(self, num_iters_per_epoch, *args, **kwargs):\n", + " super().__init__(*args, **kwargs)\n", + " self.num_iters_per_epoch = num_iters_per_epoch\n", + "\n", + " def on_train_epoch_start(self, trainer, *_):\n", + " super().on_train_epoch_start(trainer, *_)\n", + " 
self.train_progress_bar.reset(self.num_iters_per_epoch)\n", + "\n", + "\n", + "total_batches = processed_ds.count()\n", + "num_iters_per_epoch = total_batches // (NUM_WORKERS * BATCH_SIZE_PER_WORKER)\n", + "progress_bar = VicunaProgressBar(num_iters_per_epoch)\n", + "\n", + "\n", + "lightning_config.trainer(\n", + " callbacks=[progress_bar],\n", + " # Take a subset to accelerate release tests\n", + " limit_train_batches=20,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, combine all the configurations with {class}`LightningConfigBuilder ` and instantiate a LightningTrainer. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from ray.air.config import CheckpointConfig, RunConfig, ScalingConfig\n", + "\n", + "trainer = LightningTrainer(\n", + " lightning_config=lightning_config.build(),\n", + " run_config=RunConfig(\n", + " name=\"vicuna-13b-finetune\",\n", + " storage_path=\"s3://anyscale-staging-data-cld-kvedzwag2qa8i5bjxuevf5i7/air-release-tests\",\n", + " checkpoint_config=CheckpointConfig(\n", + " num_to_keep=1,\n", + " # Enable distributed checkpointing\n", + " _checkpoint_keep_all_ranks=True,\n", + " _checkpoint_upload_from_workers=True,\n", + " ),\n", + " ),\n", + " scaling_config=ScalingConfig(\n", + " num_workers=NUM_WORKERS,\n", + " use_gpu=True,\n", + " resources_per_worker={\"CPU\": 15, \"GPU\": 1},\n", + " ),\n", + " datasets={\"train\": processed_ds},\n", + " datasets_iter_config={\"batch_size\": BATCH_SIZE_PER_WORKER},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```{tip}\n", + "\n", + "Here, we highly recommend saving checkpoints with cloud storage and enabling distributed checkpointing by setting `_checkpoint_keep_all_ranks` and `_checkpoint_upload_from_workers` to True when training huge models. Otherwise, all checkpoint shards will be synced to the head node, which may introduce enormous syncing overhead and even cause out-of-memory.\n", + "\n", + "```" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model Fine-tuning\n", + "\n", + "Once everything is configured in LightningTrainer, training becomes easy. Simply call `trainer.fit()`, and your workload will be scaled to the Ray cluster, initiating ZeRO-3 parallel training." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n",
+       "Tune Status\n",
+       "\n",
+       "Current time: 2023-06-30 18:21:59\n",
+       "Running for:  00:42:22.75\n",
+       "Memory:       10.7/249.1 GiB\n",
+       "\n",
+       "System Info\n",
+       "Using FIFO scheduling algorithm.\n",
+       "Logical resource usage: 241.0/304 CPUs, 16.0/16 GPUs (0.0/16.0 accelerator_type:A10G)\n",
+       "\n",
+       "Trial Status\n",
+       "Trial name                    status      loc                  iter    total time (s)    train_loss    epoch    step\n",
+       "LightningTrainer_c1544_00000  TERMINATED  10.0.55.20:134103       1           2473.94      0.523438        0      29\n",
+       "
\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[36m(pid=134103)\u001b[0m [2023-06-30 17:39:41,637] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[2m\u001b[36m(LightningTrainer pid=134103)\u001b[0m The `preprocessor` arg to Trainer is deprecated. Apply preprocessor transformations ahead of time by calling `preprocessor.transform(ds)`. Support for the preprocessor arg will be dropped in a future release.\n", + "\u001b[2m\u001b[36m(LightningTrainer pid=134103)\u001b[0m \u001b[33mImportant: Ray Data requires schemas for all datasets in Ray 2.5. This means that standalone Python objects are no longer supported. In addition, the default batch format is fixed to NumPy. To revert to legacy behavior temporarily, set the environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n", + "\u001b[2m\u001b[36m(LightningTrainer pid=134103)\u001b[0m \n", + "\u001b[2m\u001b[36m(LightningTrainer pid=134103)\u001b[0m Learn more here: https://docs.ray.io/en/master/data/faq.html#migrating-to-strict-mode\u001b[0m\n", + "\u001b[2m\u001b[36m(LightningTrainer pid=134103)\u001b[0m Starting distributed worker processes: ['134267 (10.0.55.20)', '74152 (10.0.63.141)', '75476 (10.0.51.205)', '75547 (10.0.42.158)', '74711 (10.0.45.211)', '75132 (10.0.20.140)', '74502 (10.0.60.86)', '75695 (10.0.53.69)', '74457 (10.0.47.2)', '74569 (10.0.33.23)', '74341 (10.0.29.61)', '74274 (10.0.36.152)', '74561 (10.0.35.16)', '74427 (10.0.16.236)', '74273 (10.0.54.55)', '74996 (10.0.9.249)']\n", + "\u001b[2m\u001b[36m(RayTrainWorker pid=134267)\u001b[0m Setting up process group for: env:// [rank=0, world_size=16]\n", + "\u001b[2m\u001b[36m(LightningTrainer pid=134103)\u001b[0m Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(BatchMapper._transform_pandas)->MapBatches(BatchMapper._transform_pandas)] -> AllToAllOperator[RandomizeBlockOrder]\n", + "\u001b[2m\u001b[36m(LightningTrainer pid=134103)\u001b[0m Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)\n", + "\u001b[2m\u001b[36m(LightningTrainer pid=134103)\u001b[0m Tip: For detailed progress reporting, run `ray.data.DataContext.get_current().execution_options.verbose_progress = True`\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "da7f200767b448d7b409fcdd07daecce", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "(pid=134103) - RandomizeBlockOrder 1: 0%| | 0/1 [00:00.*<' in xml string `line`\",\n", + " },\n", + " {\n", + " \"intent\": \"send a signal `signal.SIGUSR1` to the current process\",\n", + " },\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's begin by examining the generated outputs without fine-tuning. In this case study, we utilize [Aviary Explorer](https://aviary.anyscale.com), an open-source multi-LLM serving platform supported by Ray and Anyscale. 
You can easily select from a variety of open-source LLMs and compare their generation quality, cost, latency, and many other metrics.\n", + "\n", + "We constructed a prompt in a zero-shot learning manner and feed it into 3 OSS LLMs.\n", + "\n", + "![](https://user-images.githubusercontent.com/26745457/250704232-65a20f1b-6752-4d6c-bba1-8296a373162f.png)\n", + "\n", + "\n", + "- `vicuna-13b-v1.3` begins to speak Chinese.\n", + "- `mpt-7b-chat` generates a reasonable code snippet, but with multiple lines.\n", + "- `falcon-7b-sft` generates a one line snippet, but it doesn't seem to work.\n", + "\n", + "As we can see, none of them generate a satisfactory code snippet. \n", + "\n", + "Now let's check the performance of our fine-tuned `vicuna-13b-v1.3` model:" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ray/anaconda3/lib/python3.10/site-packages/transformers/pipelines/base.py:1081: UserWarning: You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Intent: replace white spaces in colunm 'col' of dataframe `df` with '_'\n", + "One-line code snippet: `df['col'] = df['col'].str.replace(' ', '_')`\n", + "\n", + "Intent: search for occurrences of regex pattern '>.*<' in xml string `line`\n", + "One-line code snippet: `re.findall('>.*<', line)``\n", + "\n", + "Intent: send a signal `signal.SIGUSR1` to the current process\n", + "One-line code snippet: `os.kill(os.getpid(), signal.SIGUSR1)``\n" + ] + } + ], + "source": [ + "for case in testcases:\n", + " prompt = PROMPT_TEMPLATE.format(intent=case[\"intent\"], snippet=\"\")\n", + " output = generator(prompt, max_new_tokens=30, do_sample=True)\n", + " print(output[0][\"generated_text\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test the Generated Code Snippets\n", + "\n", + "The generated code snippets look pretty reasonable. The results covered Pandas operations, regular expressions, and Linux commands. Let's test them one by one." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Before\n", + " col\n", + "0 abc def ghi\n", + "1 12 3 456\n", + "2 \n", + "After\n", + " col\n", + "0 abc_def_ghi\n", + "1 _12_3_456\n", + "2 _____\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.DataFrame.from_dict({\"col\": [\"abc def ghi\", \" 12 3 456\", \" \"]})\n", + "print(\"Before\\n\", df)\n", + "\n", + "df[\"col\"] = df[\"col\"].str.replace(\" \", \"_\")\n", + "print(\"After\\n\", df)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['>The Great Gatsby<',\n", + " '>F. Scott Fitzgerald<',\n", + " '>1925<',\n", + " '>Sapiens: A Brief History of Humankind<',\n", + " '>Yuval Noah Harari<',\n", + " '>2011<']" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import re\n", + "\n", + "line = \"\"\"\n", + "\n", + " \n", + " The Great Gatsby\n", + " F. 
Scott Fitzgerald\n", + " 1925\n", + " \n", + " \n", + " Sapiens: A Brief History of Humankind\n", + " Yuval Noah Harari\n", + " 2011\n", + " \n", + "\n", + "\"\"\"\n", + "re.findall(\">.*<\", line)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, let's hand it over to LLM and let it wrap up the demo:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os, signal\n", + "\n", + "os.kill(os.getpid(), signal.SIGUSR1) # Terminate the current process~" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## References:\n", + "\n", + "- [CoNaLa: The Code/Natural Language Challenge](https://conala-corpus.github.io/)\n", + "- [HuggingFace: DeepSpeed Integration](https://huggingface.co/docs/transformers/main_classes/deepspeed#deepspeed-integration)\n", + "- [HuggingFace: Handling big models for inference](https://huggingface.co/docs/accelerate/main/usage_guides/big_modeling)\n", + "- [Lightning Transformers: DeepSpeed Training with Big Transformer Models](https://lightning-transformers.readthedocs.io/en/latest/)\n", + "- [Aviary: Open Source Multi-LLM Serving](https://www.anyscale.com/blog/announcing-aviary-open-source-multi-llm-serving-solution)\n", + "- Rajbhandari, S., Rasley, J., et al. (2020). ZeRO: Memory Optimizations Toward Training Trillion Parameter Models. [arXiv:1910.02054](https://arxiv.org/abs/1910.02054)\n", + "- Zheng, L., Chiang, W-L., Sheng, Y., et al. (2023). Judging LLM-as-a-judge with MT-Bench and Chatbot Arena. [arXiv:2306.05685](https://arxiv.org/abs/2306.05685)\n", + "\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/test_myst_doc.py b/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/test_myst_doc.py new file mode 120000 index 0000000000000..c265ccc7b062b --- /dev/null +++ b/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/test_myst_doc.py @@ -0,0 +1 @@ +../../../doc/test_myst_doc.py \ No newline at end of file diff --git a/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_deepspeed_compute_aws.yaml b/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_deepspeed_compute_aws.yaml new file mode 100644 index 0000000000000..17f69c81a906a --- /dev/null +++ b/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_deepspeed_compute_aws.yaml @@ -0,0 +1,20 @@ +cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} +region: us-west-2 + +head_node_type: + name: head_node + instance_type: g5.16xlarge + +worker_node_types: + - name: worker_node + instance_type: g5.4xlarge + min_workers: 15 + max_workers: 15 + use_spot: false + +aws: + TagSpecifications: + - ResourceType: "instance" + Tags: + - Key: ttl-hours + Value: '24' diff --git a/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_deepspeed_env.yaml b/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_deepspeed_env.yaml new file mode 100644 index 0000000000000..77acb25855284 --- /dev/null +++ 
b/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_deepspeed_env.yaml @@ -0,0 +1,27 @@ +base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray:nightly-py38-cu118") }} +env_vars: {} +debian_packages: + - curl + +python: + pip_packages: + - datasets==2.13.1 + - evaluate==0.4.0 + - scikit-learn==1.3.0 + - boto3==1.28.5 + - myst-parser==0.15.2 + - myst-nb==0.13.1 + - jupytext==1.13.6 + - typing-extensions<4.6.0 + conda_packages: [] + +post_build_cmds: + - pip uninstall -y ray || true && pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }} + - {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }} + - echo "sudo lsblk -f" >> ~/.bashrc + - echo "yes N | sudo mkfs -t ext4 /dev/nvme1n1 || true" >> ~/.bashrc + - echo "mkdir -p /mnt/local_storage" >> ~/.bashrc + - echo "sudo chmod 0777 /mnt/local_storage" >> ~/.bashrc + - echo "sudo mount /dev/nvme1n1 /mnt/local_storage || true" >> ~/.bashrc + - pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 + - pip3 install "pytorch_lightning==2.0.3" "transformers==4.30.2" "accelerate==0.20.3" "deepspeed==0.9.4" diff --git a/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_lightning_deepspeed_finetune.ipynb b/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_lightning_deepspeed_finetune.ipynb new file mode 120000 index 0000000000000..ccd34dcfc22fa --- /dev/null +++ b/release/air_examples/vicuna_13b_lightning_deepspeed_finetuning/vicuna_13b_lightning_deepspeed_finetune.ipynb @@ -0,0 +1 @@ +../../../doc/source/train/examples/lightning/vicuna_13b_lightning_deepspeed_finetune.ipynb \ No newline at end of file diff --git a/release/release_tests.yaml b/release/release_tests.yaml index 7699ea7bcd31a..0fd334c7e33bb 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -967,6 +967,27 @@ # variations: TODO(jungong): add GCP variation. +- name: air_example_vicuna_13b_lightning_deepspeed_finetuning + group: AIR examples + working_dir: air_examples/vicuna_13b_lightning_deepspeed_finetuning + + python: "3.8" + + frequency: weekly + team: ml + cluster: + byod: + type: cu118 + pip: + - myst-parser==0.15.2 + - myst-nb==0.13.1 + - jupytext==1.13.6 + cluster_env: vicuna_13b_deepspeed_env.yaml + cluster_compute: vicuna_13b_deepspeed_compute_aws.yaml + + run: + timeout: 4700 + script: python test_myst_doc.py --path vicuna_13b_lightning_deepspeed_finetune.ipynb ##################################### # Workspace templates release tests #