
Commit f1cca68 ("add test")
committed Feb 2, 2024
1 parent: 9e7482f

2 files changed: +42 −9 lines

python/python/lance/sampler.py (+21 −7)

@@ -18,6 +18,7 @@
 
 import gc
 import logging
+import warnings
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 from heapq import heappush, heappushpop
@@ -266,19 +267,32 @@ def __init__(self, rank: int, world_size: int):
         self._world_size = world_size
 
     def __call__(
-        self, dataset: lance.LanceDataset, *args, **kwargs
+        self,
+        dataset: lance.LanceDataset,
+        *args,
+        batch_size: int = 128,
+        columns: Optional[List[str]] = None,
+        batch_readahead: int = 16,
+        with_row_id: Optional[bool] = None,
+        **kwargs,
     ) -> Generator[lance.RecordBatch, None, None]:
-        total = self._ds.count_rows()
+        total = dataset.count_rows()
+
+        if with_row_id is not None:
+            warnings.warn(
+                "with_row_id is not supported for ShardedBatchSampler",
+            )
 
         def _gen_ranges():
             for start in range(
-                self._rank * self._batch_size,
+                self._rank * batch_size,
                 total,
-                self._world_size * self._batch_size,
+                self._world_size * batch_size,
             ):
-                yield start, min(start + self._batch_size, total)
+                yield start, min(start + batch_size, total)
 
-        return dataset.take_scan(
+        return dataset._ds.take_scan(
             _gen_ranges(),
-            columns=self._columns,
+            columns=columns,
+            batch_readahead=batch_readahead,
         )
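For context, the change moves the sharding parameters (batch_size, columns, batch_readahead) from sampler state into the `__call__` signature, while the round-robin assignment of row ranges to ranks stays the same. Below is a minimal standalone sketch of that assignment, assuming nothing beyond the arithmetic visible in the diff; `gen_shard_ranges` is a hypothetical helper written only for illustration and is not part of the Lance API.

# Minimal sketch of the round-robin batch sharding shown in the diff above.
# `gen_shard_ranges` is a hypothetical helper, not a Lance function.
from typing import Generator, Tuple


def gen_shard_ranges(
    total: int, rank: int, world_size: int, batch_size: int
) -> Generator[Tuple[int, int], None, None]:
    # Rank r takes batches r, r + world_size, r + 2 * world_size, ...
    # Each yielded pair is a half-open [start, end) row range.
    for start in range(rank * batch_size, total, world_size * batch_size):
        yield start, min(start + batch_size, total)


if __name__ == "__main__":
    # With 2000 rows, batch_size=25, world_size=2, rank 1 receives rows
    # 25-49, 75-99, ... i.e. the ids where (i // 25) % 2 == 1.
    ranges = list(gen_shard_ranges(2000, rank=1, world_size=2, batch_size=25))
    assert ranges[0] == (25, 50)
    assert ranges[1] == (75, 100)
    assert len(ranges) == 40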

python/python/tests/torch_tests/test_data.py (+21 −2)

@@ -13,17 +13,17 @@
 # limitations under the License.
 
 import shutil
-from pathlib import Path
 from itertools import chain
+from pathlib import Path
 
 import lance
 import numpy as np
 import pyarrow as pa
 import pytest
+from lance.sampler import ShardedBatchSampler, ShardedFragmentSampler
 
 torch = pytest.importorskip("torch")
 from lance.torch.data import LanceDataset  # noqa: E402
-from lance.sampler import ShardedFragmentSampler, ShardedBatchSampler, FullScanSampler
 
 
 def test_iter_over_dataset(tmp_path):
@@ -131,3 +131,22 @@ def test_sample_fragments(tmp_path: Path):
 
     all_ids = list(chain.from_iterable([batch["ids"].cpu().numpy() for batch in ds]))
     assert all_ids == [i for i in range(2000) if i // 100 % 2 == 1]
+
+
+def test_sample_batches(tmp_path: Path):
+    arr = pa.array(range(2000))
+    tbl = pa.Table.from_arrays([arr], ["ids"])
+
+    # Write 20 files
+    lance.write_dataset(tbl, tmp_path, max_rows_per_file=100)
+
+    ds = LanceDataset(
+        tmp_path,
+        batch_size=25,
+        columns=["ids"],
+        with_row_id=True,
+        sampler=ShardedBatchSampler(rank=1, world_size=2),
+    )
+
+    all_ids = list(chain.from_iterable([batch.cpu().numpy() for batch in ds]))
+    assert all_ids == [i for i in range(2000) if i // 25 % 2 == 1]
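A quick sanity check on the expected output of this test: 2000 rows at batch_size=25 gives 80 batches; with world_size=2, rank 1 takes the odd-indexed batches (40 of them), so the surviving ids are exactly those with (i // 25) % 2 == 1, which is what the final assertion encodes.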
