[Data] Always launch one task for read_sql #48923

Merged · 14 commits · Dec 3, 2024
91 changes: 5 additions & 86 deletions python/ray/data/_internal/datasource/sql_datasource.py
@@ -1,8 +1,7 @@
-import math
 from contextlib import contextmanager
 from typing import Any, Callable, Iterable, Iterator, List, Optional
 
-from ray.data.block import Block, BlockAccessor, BlockMetadata
+from ray.data.block import Block, BlockMetadata
 from ray.data.datasource.datasource import Datasource, ReadTask
 
 Connection = Any  # A Python DB API2-compliant `Connection` object.
@@ -72,99 +71,19 @@ def _connect(connection_factory: Callable[[], Connection]) -> Iterator[Cursor]:
 
 
 class SQLDatasource(Datasource):
-
-    NUM_SAMPLE_ROWS = 100
-    MIN_ROWS_PER_READ_TASK = 50
-
     def __init__(self, sql: str, connection_factory: Callable[[], Connection]):
         self.sql = sql
         self.connection_factory = connection_factory
 
     def estimate_inmemory_data_size(self) -> Optional[int]:
-        pass
+        return None
 
     def get_read_tasks(self, parallelism: int) -> List[ReadTask]:
-        def fallback_read_fn() -> Iterable[Block]:
-            with _connect(self.connection_factory) as cursor:
-                cursor.execute(self.sql)
-                block = _cursor_to_block(cursor)
-                return [block]
-
-        # If `parallelism` is 1, directly fetch all rows. This avoids unnecessary
-        # queries to fetch a sample block and compute the total number of rows.
-        if parallelism == 1:
-            metadata = BlockMetadata(None, None, None, None, None)
-            return [ReadTask(fallback_read_fn, metadata)]
-
-        # Databases like DB2, Oracle, and MS SQL Server don't support `LIMIT`.
-        try:
-            with _connect(self.connection_factory) as cursor:
-                cursor.execute(f"SELECT * FROM ({self.sql}) as T LIMIT 1 OFFSET 0")
-            is_limit_supported = True
-        except Exception:
-            is_limit_supported = False
-
-        if not is_limit_supported:
-            metadata = BlockMetadata(None, None, None, None, None)
-            return [ReadTask(fallback_read_fn, metadata)]
-
-        num_rows_total = self._get_num_rows()
-
-        if num_rows_total == 0:
-            return []
-
-        parallelism = min(
-            parallelism, math.ceil(num_rows_total / self.MIN_ROWS_PER_READ_TASK)
-        )
-        num_rows_per_block = num_rows_total // parallelism
-        num_blocks_with_extra_row = num_rows_total % parallelism
-
-        sample_block_accessor = BlockAccessor.for_block(self._get_sample_block())
-        estimated_size_bytes_per_row = math.ceil(
-            sample_block_accessor.size_bytes() / sample_block_accessor.num_rows()
-        )
-        sample_block_schema = sample_block_accessor.schema()
-
-        tasks = []
-        offset = 0
-        for i in range(parallelism):
-            num_rows = num_rows_per_block
-            if i < num_blocks_with_extra_row:
-                num_rows += 1
-
-            read_fn = self._create_read_fn(num_rows, offset)
-            metadata = BlockMetadata(
-                num_rows,
-                estimated_size_bytes_per_row * num_rows,
-                sample_block_schema,
-                None,
-                None,
-            )
-            tasks.append(ReadTask(read_fn, metadata))
-
-            offset += num_rows
-
-        return tasks
-
-    def _get_num_rows(self) -> int:
-        with _connect(self.connection_factory) as cursor:
-            cursor.execute(f"SELECT COUNT(*) FROM ({self.sql}) as T")
-            return cursor.fetchone()[0]
-
-    def _get_sample_block(self) -> Block:
-        with _connect(self.connection_factory) as cursor:
-            cursor.execute(
-                f"SELECT * FROM ({self.sql}) as T LIMIT {self.NUM_SAMPLE_ROWS}"
-            )
-            return _cursor_to_block(cursor)
-
-    def _create_read_fn(self, num_rows: int, offset: int):
         def read_fn() -> Iterable[Block]:
             with _connect(self.connection_factory) as cursor:
-                cursor.execute(
-                    f"SELECT * FROM ({self.sql}) as T LIMIT {num_rows} OFFSET {offset}"
-                )
+                cursor.execute(self.sql)
                 block = _cursor_to_block(cursor)
                 return [block]
 
-        return read_fn
+        metadata = BlockMetadata(None, None, None, None, None)
+        return [ReadTask(read_fn, metadata)]
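
For context, the read path after this change boils down to one connection, one execution of the user's SQL, and one block; none of the COUNT(*), sampling, or LIMIT/OFFSET probe queries are issued beforehand. A minimal, Ray-free sketch of that pattern (the example.db file and movie table are hypothetical stand-ins for any DB API 2.0 source):

import sqlite3

def connection_factory():
    # Hypothetical DB API 2.0 connection; any compliant driver works.
    return sqlite3.connect("example.db")

def read_all(sql):
    # Single task, single query: fetch every row in one pass.
    connection = connection_factory()
    try:
        cursor = connection.cursor()
        cursor.execute(sql)
        return cursor.fetchall()
    finally:
        connection.close()

rows = read_all("SELECT * FROM movie")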
6 changes: 6 additions & 0 deletions python/ray/data/read_api.py
@@ -2158,6 +2158,12 @@ def create_connection():
     Returns:
         A :class:`Dataset` containing the queried data.
     """
+    if parallelism != -1 and parallelism != 1:
+        raise ValueError(
+            "To ensure correctness, 'read_sql' always launches one task. The "
+            "'parallelism' argument you specified can't be used."
+        )
+
Contributor: Maybe just raise an error; a warning is implicit.

Contributor: To understand: if the DB table is huge (1B rows or more), will this be a single-threaded ingest?

Member Author: Yeah. Many DBAPI implementations don't support multithreading.
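
For reference, PEP 249 lets a driver advertise its thread-safety level through a module-level threadsafety attribute; a quick sketch of inspecting it, using sqlite3 as the example driver:

import sqlite3

# PEP 249 threadsafety levels:
#   0: threads may not share the module
#   1: threads may share the module, but not connections
#   2: threads may share the module and connections
#   3: threads may share the module, connections, and cursors
print(sqlite3.threadsafety)  # commonly 1 or 3, depending on the build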

Contributor: If I understand this right, we may end up with very slow ingest (just one task) and OOM kills for DBs, while for files we're able to support parallel ingest in a scaled-out fashion.

Member Author: That's right.

What do we do as an alternative that's both scalable and correct? Many OFFSET implementations require scanning the entire database, so OFFSET and LIMIT often perform the same as, or worse than, a single task that reads the entire database.
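
To illustrate, the removed implementation paginated with per-task queries shaped like the sketch below (reconstructed from the deleted _create_read_fn; the row counts are hypothetical). On engines that implement OFFSET by producing and discarding the skipped rows, the skipped-row work alone sums to roughly R * (N - 1) / 2 extra rows across N tasks over R rows, instead of one O(R) scan total:

# Rough shape of the per-task queries the old code generated.
sql = "SELECT * FROM movie"
num_rows_total, parallelism = 1_000_000, 4
num_rows = num_rows_total // parallelism
for i in range(parallelism):
    offset = i * num_rows
    # The engine may scan and discard `offset` rows before emitting any.
    print(f"SELECT * FROM ({sql}) as T LIMIT {num_rows} OFFSET {offset}")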


     datasource = SQLDatasource(sql=sql, connection_factory=connection_factory)
     return read_datasource(
         datasource,
11 changes: 8 additions & 3 deletions python/ray/data/tests/test_sql.py
@@ -22,8 +22,14 @@ def temp_database_fixture() -> Generator[str, None, None]:
         yield file.name
 
 
-@pytest.mark.parametrize("parallelism", [-1, 1])
-def test_read_sql(temp_database: str, parallelism: int):
+def test_read_sql_with_parallelism_warns(temp_database):
+    with pytest.raises(ValueError):
+        ray.data.read_sql(
+            "SELECT * FROM movie", lambda: sqlite3.connect(temp_database), parallelism=2
+        )
+
+
+def test_read_sql(temp_database: str):
     connection = sqlite3.connect(temp_database)
     connection.execute("CREATE TABLE movie(title, year, score)")
     expected_values = [
@@ -37,7 +43,6 @@ def test_read_sql(temp_database: str, parallelism: int):
     dataset = ray.data.read_sql(
         "SELECT * FROM movie",
         lambda: sqlite3.connect(temp_database),
-        override_num_blocks=parallelism,
     )
     actual_values = [tuple(record.values()) for record in dataset.take_all()]
 