[Data] Add option for parallelizing post-collation data batch operations in DataIterator.iter_batches() #36842

Merged: 37 commits, merged Jul 7, 2023

Commits (the changes shown below are from the first commit only):
122986f  add gpu_prefetch_batches param (Jun 26, 2023)
4cdc646  Merge branch 'master' into gpu-prefetch-batches-args (Jun 26, 2023)
0a24b87  fix test (Jun 27, 2023)
b12d0bf  Merge branch 'master' of https://github.com/ray-project/ray into gpu-… (Jun 27, 2023)
7980c08  fix test (Jun 27, 2023)
702b8f7  separate collate from format (Jun 28, 2023)
1845348  Merge branch 'master' into gpu-prefetch-batches-args (Jun 28, 2023)
8bca783  initial finalize_fn rework (Jun 29, 2023)
30bc885  Merge branch 'master' into gpu-prefetch-batches-args (Jun 29, 2023)
68bd137  missing param (Jun 29, 2023)
1060bd6  complete (Jun 29, 2023)
3fa1af6  Merge branch 'master' into gpu-prefetch-batches-args (Jun 29, 2023)
5589a51  update pipeline path (Jun 29, 2023)
1165ee2  docs (Jun 29, 2023)
ecd64da  Merge branch 'master' into gpu-prefetch-batches-args (Jun 29, 2023)
a40e391  tests (Jun 29, 2023)
2c7f00c  fix case with no finalize_fn (Jun 30, 2023)
34507aa  1/n comments (Jun 30, 2023)
131c5db  slim down finalize_fn usage and make private (Jun 30, 2023)
0da72f5  Merge branch 'master' into gpu-prefetch-batches-args (Jun 30, 2023)
0d8afe5  comments (Jun 30, 2023)
f978348  address amog's comments (Jun 30, 2023)
e436ae6  Merge branch 'master' into gpu-prefetch-batches-args (Jun 30, 2023)
f2abe90  comments (Jul 3, 2023)
6acbd42  Merge branch 'master' into gpu-prefetch-batches-args (Jul 3, 2023)
10d54fa  additional tests (Jul 3, 2023)
7635b91  Merge branch 'master' into gpu-prefetch-batches-args (Jul 3, 2023)
8b94c99  add early failure so test doesnt hang until timeout (Jul 3, 2023)
37ead20  Merge branch 'master' into gpu-prefetch-batches-args (Jul 5, 2023)
30743d6  address amog's comments (Jul 5, 2023)
3473d47  update (amogkam, Jul 5, 2023)
8f40e1d  fix (amogkam, Jul 5, 2023)
c029f00  fix (amogkam, Jul 5, 2023)
4e93946  cleanup (amogkam, Jul 6, 2023)
f91a583  fix (amogkam, Jul 6, 2023)
62320e4  fix test (amogkam, Jul 6, 2023)
2121f96  update (amogkam, Jul 6, 2023)
commit 122986f9e935e0cceef961c89ea880040e2814d9
add gpu_prefetch_batches param
Signed-off-by: Scott Lee <sjl@anyscale.com>
Scott Lee committed Jun 26, 2023
13 changes: 9 additions & 4 deletions python/ray/data/_internal/block_batching/iter_batches.py
@@ -34,6 +34,7 @@ def iter_batches(
shuffle_seed: Optional[int] = None,
ensure_copy: bool = False,
prefetch_batches: int = 1,
gpu_prefetch_batches: int = 1,
Contributor commented:

Shouldn't this be for iter_torch_batches() only?

scottjlee (Contributor Author) commented on Jun 28, 2023:

DataIterator.iter_torch_batches() calls DataIterator.iter_batches(), which calls this iter_batches function in block_batching/iter_batches.py, so I believe we still need to expose this param here.
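
For context, a minimal sketch of that call chain (illustrative only; simplified signatures, not the actual Ray source):

# How the new parameter threads through the three layers named above.
def _block_batching_iter_batches(*, gpu_prefetch_batches=1, **kwargs):
    # Stands in for iter_batches() in block_batching/iter_batches.py.
    print(f"collating with gpu_prefetch_batches={gpu_prefetch_batches}")
    yield from ()

class DataIteratorSketch:
    def iter_batches(self, *, gpu_prefetch_batches=1, **kwargs):
        return _block_batching_iter_batches(
            gpu_prefetch_batches=gpu_prefetch_batches, **kwargs
        )

    def iter_torch_batches(self, *, gpu_prefetch_batches=1, **kwargs):
        return self.iter_batches(gpu_prefetch_batches=gpu_prefetch_batches, **kwargs)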

) -> Iterator[DataBatch]:
"""Create formatted batches of data from an iterator of block object references and
corresponding metadata.
@@ -97,8 +98,13 @@ def iter_batches(
process. If set to greater than 0, a separate thread will be used to fetch
the specified amount of formatted batches from blocks. This improves
performance for non-CPU bound UDFs, allowing batch fetching compute and
formatting to be overlapped with the UDF. Defaults to 0 (no prefetching
enabled).
formatting to be overlapped with the UDF. Defaults to 1.
scottjlee (Contributor Author) commented:

updated the docs from 0 to 1, based on the current default value of 1 in the param definition.

gpu_prefetch_batches: The number of batches to fetch ahead of the current
batch to fetch on the GPU. If set to greater than 0, a separate
threadpool will be used to format batches and apply the collate_fn.
Defaults to 1. You can revert back to the old prefetching behavior
that uses `prefetch_blocks` by setting `use_legacy_iter_batches` to
True in the DataContext.
Contributor commented:

Given that it's unlikely someone would want this to be >1, I don't think the comment needs to mention the legacy behavior.


Returns:
An iterator over record batches.
@@ -119,7 +125,6 @@ def iter_batches(
def _async_iter_batches(
block_refs: Iterator[Tuple[ObjectRef[Block], BlockMetadata]],
) -> Iterator[DataBatch]:

# Step 1: Prefetch logical batches locally.
block_refs = prefetch_batches_locally(
block_ref_iter=block_refs,
@@ -149,7 +154,7 @@ def _async_iter_batches(
stats=stats,
batch_format=batch_format,
collate_fn=collate_fn,
num_threadpool_workers=prefetch_batches,
num_threadpool_workers=gpu_prefetch_batches,
Contributor commented:

We still want full prefetching for the format conversion right? Just not the final GPU loading step.

)

# Step 5: Restore original order.
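
For intuition, the Step 4 threadpool implements the general pattern below: keep up to N batch-preparation tasks in flight so formatting/collation overlaps with downstream consumption. This is a generic sketch of that pattern, not Ray's implementation; prefetch_map is an illustrative name.

import concurrent.futures
from typing import Callable, Iterator, TypeVar

T = TypeVar("T")
U = TypeVar("U")

def prefetch_map(
    items: Iterator[T], fn: Callable[[T], U], num_ahead: int
) -> Iterator[U]:
    """Apply fn to each item, keeping up to num_ahead results in flight."""
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_ahead) as pool:
        pending = []
        for item in items:
            pending.append(pool.submit(fn, item))
            # Once the pipeline is full, yield the oldest result to preserve order.
            if len(pending) > num_ahead:
                yield pending.pop(0).result()
        # Drain the remaining in-flight work.
        for fut in pending:
            yield fut.result()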
15 changes: 11 additions & 4 deletions python/ray/data/dataset.py
@@ -2957,6 +2957,7 @@ def iter_batches(
self,
*,
prefetch_batches: int = 1,
gpu_prefetch_batches: int = 1,
batch_size: Optional[int] = 256,
batch_format: Optional[str] = "default",
drop_last: bool = False,
@@ -2978,10 +2979,15 @@ def iter_batches(
Args:
prefetch_batches: The number of batches to fetch ahead of the current batch
to fetch. If set to greater than 0, a separate threadpool will be used
to fetch the objects to the local node, format the batches, and apply
the collate_fn. Defaults to 1. You can revert back to the old
prefetching behavior that uses `prefetch_blocks` by setting
`use_legacy_iter_batches` to True in the datasetContext.
to fetch the objects to the local node. Defaults to 1. You can revert
back to the old prefetching behavior that uses `prefetch_blocks` by
setting `use_legacy_iter_batches` to True in the DataContext.
gpu_prefetch_batches: The number of batches to fetch ahead of the current
batch to fetch on the GPU. If set to greater than 0, a separate
threadpool will be used to format batches and apply the collate_fn.
Defaults to 1. You can revert back to the old prefetching behavior
that uses `prefetch_blocks` by setting `use_legacy_iter_batches` to
True in the DataContext.
batch_size: The number of rows in each batch, or None to use entire blocks
as batches (blocks may contain different number of rows).
The final batch may include fewer than ``batch_size`` rows if
@@ -3007,6 +3013,7 @@ def iter_batches(
logger.warning("The 'native' batch format has been renamed 'default'.")
return self.iterator().iter_batches(
prefetch_batches=prefetch_batches,
gpu_prefetch_batches=gpu_prefetch_batches,
prefetch_blocks=prefetch_blocks,
batch_size=batch_size,
batch_format=batch_format,
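
A usage sketch of the public API as it stands in this diff (dataset and parameter values are illustrative):

import ray

ds = ray.data.range(1000)
# prefetch_batches overlaps fetching and formatting with consumption;
# gpu_prefetch_batches (added in this PR) bounds how many collated batches
# are prepared ahead of the device-loading step.
for batch in ds.iter_batches(
    batch_size=256,
    prefetch_batches=4,
    gpu_prefetch_batches=2,
):
    pass  # consume the batch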
16 changes: 11 additions & 5 deletions python/ray/data/iterator.py
@@ -88,6 +88,7 @@ def iter_batches(
self,
*,
prefetch_batches: int = 1,
gpu_prefetch_batches: int = 1,
batch_size: int = 256,
batch_format: Optional[str] = "default",
drop_last: bool = False,
@@ -111,10 +112,15 @@ def iter_batches(
Args:
prefetch_batches: The number of batches to fetch ahead of the current batch
to fetch. If set to greater than 0, a separate threadpool will be used
to fetch the objects to the local node, format the batches, and apply
the collate_fn. Defaults to 1. You can revert back to the old
prefetching behavior that uses `prefetch_blocks` by setting
`use_legacy_iter_batches` to True in the DataContext.
to fetch the objects to the local node. Defaults to 1. You can revert
back to the old prefetching behavior that uses `prefetch_blocks` by
setting `use_legacy_iter_batches` to True in the DataContext.
gpu_prefetch_batches: The number of batches to fetch ahead of the current
batch to fetch on the GPU. If set to greater than 0, a separate
threadpool will be used to format batches and apply the collate_fn.
Defaults to 1. You can revert back to the old prefetching behavior
that uses `prefetch_blocks` by setting `use_legacy_iter_batches` to
True in the DataContext.
batch_size: The number of rows in each batch, or None to use entire blocks
as batches (blocks may contain different number of rows).
The final batch may include fewer than ``batch_size`` rows if
@@ -187,6 +193,7 @@ def drop_metadata(block_iterator):
shuffle_buffer_min_size=local_shuffle_buffer_size,
shuffle_seed=local_shuffle_seed,
prefetch_batches=prefetch_batches,
gpu_prefetch_batches=gpu_prefetch_batches,
)

if stats:
@@ -320,7 +327,6 @@ def iter_torch_batches(
)

if collate_fn is None:

# Automatically move torch tensors to the appropriate device.
if device is None:
default_device = get_device()
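
The hunk above touches the default collate path in iter_torch_batches(), which automatically moves tensors to the appropriate device when no collate_fn is given. A hand-written collate_fn with the same effect might look like this (illustrative sketch; assumes torch is installed and batches are dicts of numpy arrays):

import torch

def collate_to_device(batch):
    """Convert a dict of numpy arrays into torch tensors on the default device."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return {k: torch.as_tensor(v).to(device) for k, v in batch.items()}

# Hypothetical usage with a DataIterator `it`:
# for batch in it.iter_torch_batches(collate_fn=collate_to_device):
#     ...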
16 changes: 13 additions & 3 deletions python/ray/data/tests/block_batching/test_iter_batches.py
@@ -143,7 +143,8 @@ def collate_fn(batch: pd.DataFrame):
assert concat_df["foo"].iloc[i + 1] >= concat_df["foo"].iloc[i]


def test_iter_batches_e2e_async(ray_start_regular_shared):
@pytest.mark.parametrize("gpu_prefetch_batches", [1, 2, 4])
def test_iter_batches_e2e_async(ray_start_regular_shared, gpu_prefetch_batches):
"""We add time.sleep in 3 places:
1. In the base generator to simulate streaming executor blocking on next results.
2. In the collate_fn to simulate expensive slicing/formatting/collation
@@ -160,7 +161,11 @@ def collate_fn(batch):
)
start_time = time.time()
output_batches = iter_batches(
block_refs_iter, batch_size=None, collate_fn=collate_fn, prefetch_batches=4
block_refs_iter,
batch_size=None,
collate_fn=collate_fn,
prefetch_batches=4,
gpu_prefetch_batches=gpu_prefetch_batches,
)
batches = []
for batch in output_batches:
@@ -171,7 +176,12 @@ def collate_fn(batch):
# 20 batches, 1.5 second sleep. Should be less than 45 seconds, even with some
# overhead.
# If there was no overlap, then we would expect this to take at least 20*2.5 = 50
assert end_time - start_time < 45, end_time - start_time
if gpu_prefetch_batches == 1:
assert end_time - start_time < 45, end_time - start_time
elif gpu_prefetch_batches == 2:
assert end_time - start_time < 40, end_time - start_time
elif gpu_prefetch_batches == 4:
assert end_time - start_time < 35, end_time - start_time

assert len(batches) == 20
assert all(len(batch) == 2 for batch in batches)
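
The widened time bounds follow from pipelining arithmetic: with no overlap, wall time is the sum of the per-batch stage costs; with full overlap, it approaches the slowest stage alone. A rough model using the test's numbers (the per-stage split below is an assumption for illustration; the test comment only establishes a 2.5 s combined cost per batch):

num_batches = 20
stage_costs_s = [1.0, 1.5]  # assumed split of the 2.5 s total per-batch sleep

serial_s = num_batches * sum(stage_costs_s)     # 50 s if nothing overlaps
pipelined_s = num_batches * max(stage_costs_s)  # ~30 s with full overlap
print(serial_s, pipelined_s)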