lancedb · westonpace · Oct 30, 2024 · Oct 30, 2024
diff --git a/python/python/lance/sampler.py b/python/python/lance/sampler.py
@@ -139,6 +139,8 @@ def _filtered_efficient_sample(
         elif filter is not None:
             raise NotImplementedError(f"Can't yet run filter <{filter}> in-memory")
         if table.num_rows > 0:
+            if table.num_rows > remaining_rows:
+                table = table.slice(0, remaining_rows)
             tables.append(table)
             remaining_rows -= table.num_rows
             remaining_in_batch = remaining_in_batch - table.num_rows

diff --git a/python/python/lance/torch/data.py b/python/python/lance/torch/data.py
@@ -31,6 +31,19 @@
 __all__ = ["LanceDataset"]
 
 
+# Convert an Arrow FixedSizeList (FSL) array into a 2D torch tensor
+def _fsl_to_tensor(arr: pa.FixedSizeListArray, dimension: int) -> torch.Tensor:
+    # Note: FixedSizeListArray.values does not take offset/len into account and
+    # so we may need to slice here to cover only the rows that belong to `arr`
+    values = arr.values
+    start = arr.offset * dimension
+    num_vals = len(arr) * dimension
+    values = values.slice(start, num_vals)
+    # Zero-copy view as numpy, reshaped to one row of `dimension` values per list
+    nparr = values.to_numpy(zero_copy_only=True).reshape(-1, dimension)
+    return torch.from_numpy(nparr)
+
+
 def _to_tensor(
     batch: pa.RecordBatch,
     *,
@@ -54,11 +67,7 @@ def _to_tensor(
             pa.types.is_floating(arr.type.value_type)
             or pa.types.is_integer(arr.type.value_type)
         ):
-            np_tensor = arr.values.to_numpy(zero_copy_only=True).reshape(
-                -1, arr.type.list_size
-            )
-            tensor = torch.from_numpy(np_tensor)
-            del np_tensor
+            tensor = _fsl_to_tensor(arr, arr.type.list_size)
         elif (
             pa.types.is_integer(arr.type)
             or pa.types.is_floating(arr.type)

diff --git a/python/python/tests/torch_tests/test_data.py b/python/python/tests/torch_tests/test_data.py
@@ -185,6 +185,32 @@ def test_sample_batches(tmp_path: Path):
     assert all_ids == [i for i in range(2000) if i // 25 % 2 == 1]
 
 
+def test_filtered_sampling_odd_batch_size(tmp_path: Path):
+    tbl = pa.Table.from_pydict(
+        {
+            "vector": pa.array(
+                [[1.0, 2.0, 3.0] for _ in range(10000)], pa.list_(pa.float32(), 3)
+            ),
+            "filterme": [i % 2 for i in range(10000)],  # not used by the filter below
+        }
+    )
+
+    lance.write_dataset(tbl, tmp_path, max_rows_per_file=200)
+
+    ds = LanceDataset(
+        tmp_path,
+        batch_size=38,  # 38 does not evenly divide max_rows_per_file (200)
+        columns=["vector"],
+        samples=38 * 256,
+        filter="vector is not null",
+    )
+
+    x = next(iter(ds))  # only the first batch is checked
+
+    assert x.shape[0] == 38  # rows in the batch == batch_size
+    assert x.shape[1] == 3  # list_size of the "vector" column
+
+
+
+
 def test_sample_batches_with_filter(tmp_path: Path):
     NUM_ROWS = 10000
     tbl = pa.Table.from_pydict(