Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[release/air] Fix air_example_gptj_deepspeed_fine_tuning.gce failing... #36562

Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@
" \"aws\",\n",
" \"s3\",\n",
" \"sync\",\n",
" \"--quiet\",\n",
" \"--no-sign-request\",\n",
" \"s3://large-dl-models-mirror/models--EleutherAI--gpt-j-6B/main/\",\n",
" os.path.join(path, \"snapshots\", \"main\"),\n",
" ]\n",
Expand Down
23 changes: 13 additions & 10 deletions python/ray/train/huggingface/transformers/_transformers_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from pathlib import Path
from typing import TYPE_CHECKING, Any, Optional, Tuple, Type
from typing import TYPE_CHECKING, Any, Iterator, Optional, Tuple, Type

import datasets.iterable_dataset
import transformers.trainer
Expand Down Expand Up @@ -62,19 +62,22 @@ def get_train_dataloader(self):


# TODO(ml-team): Replace with a Datasets-HuggingFace integration when available.
class RayDatasetHFIterable(datasets.iterable_dataset.ExamplesIterable):
"""HF ExamplesIterable backed by a Dataset."""
class RayDatasetHFIterable(datasets.iterable_dataset._BaseExamplesIterable):
"""HF ``_BaseExamplesIterable`` backed by a ``ray.data.DataIterator``.

The other abstract methods for shuffling and sharding the data are not
implemented, since those operations should be handled by Ray Data. For
example, the dataset is already sharded across the data parallel workers,
and shuffling is disabled
"""

def __init__(self, dataset: DataIterator) -> None:
super().__init__()
self.dataset = dataset
self.generate_examples_fn = self.dataset.iter_rows

# Required for the superclass
self.kwargs = {}

def __iter__(self):
for row in self.generate_examples_fn(**self.kwargs):
yield (0, {k: v for k, v in row.items()})
def __iter__(self) -> Iterator[Tuple[int, dict]]:
for idx, row in enumerate(self.dataset.iter_rows()):
yield (idx, {k: v for k, v in row.items()})


def process_dataset_for_hf(
Expand Down
2 changes: 1 addition & 1 deletion release/release_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -810,7 +810,7 @@
cluster_compute: gptj_deepspeed_compute_aws.yaml

run:
timeout: 3600
timeout: 4500
script: python test_myst_doc.py --path gptj_deepspeed_fine_tuning.ipynb

variations:
Expand Down