narwhals-dev · MarcoGorelli · Dec 10, 2024 · Dec 10, 2024 · Dec 10, 2024 · Dec 10, 2024
diff --git a/docs/api-reference/narwhals.md b/docs/api-reference/narwhals.md
@@ -38,6 +38,7 @@ Here are the top-level functions available in Narwhals.
         - new_series
         - nth
         - read_csv
+        - scan_csv
         - sum
         - sum_horizontal
         - show_versions

diff --git a/narwhals/__init__.py b/narwhals/__init__.py
@@ -56,6 +56,7 @@
 from narwhals.functions import get_level
 from narwhals.functions import new_series
 from narwhals.functions import read_csv
+from narwhals.functions import scan_csv
 from narwhals.functions import show_versions
 from narwhals.schema import Schema
 from narwhals.series import Series
@@ -140,6 +141,7 @@
     "new_series",
     "nth",
     "read_csv",
+    "scan_csv",
     "selectors",
     "show_versions",
     "stable",

diff --git a/narwhals/functions.py b/narwhals/functions.py
@@ -1020,3 +1020,86 @@ def _read_csv_impl(
             msg = "Unknown namespace is expected to implement `read_csv` function."
             raise AttributeError(msg) from e
     return from_native(native_frame, eager_only=True)
+
+
+def scan_csv(
+    source: str,
+    *,
+    native_namespace: ModuleType,
+) -> LazyFrame[Any]:
+    """Lazily read from a CSV file.
+
+    For the libraries that do not support lazy dataframes, the function reads
+    a csv file eagerly and then converts the resulting dataframe to a lazyframe.
+
+    Arguments:
+        source: Path to a file.
+        native_namespace: The native library to use for DataFrame creation.
+
+    Returns:
+        LazyFrame.
+
+    Examples:
+        >>> import dask.dataframe as dd
+        >>> import polars as pl
+        >>> import pyarrow as pa
+        >>> import narwhals as nw
+        >>> from narwhals.typing import IntoFrame
+        >>> from types import ModuleType
+
+        Let's create an agnostic function that lazily reads a csv file with a specified native namespace:
+
+        >>> def agnostic_scan_csv(native_namespace: ModuleType) -> IntoFrame:
+        ...     return nw.scan_csv("file.csv", native_namespace=native_namespace).to_native()
+
+        Then we can read the file by passing, for example, Polars or Dask namespaces:
+
+        >>> agnostic_scan_csv(native_namespace=pl).collect()  # doctest:+SKIP
+        shape: (3, 2)
+        ┌─────┬─────┐
+        │ a   ┆ b   │
+        │ --- ┆ --- │
+        │ i64 ┆ i64 │
+        ╞═════╪═════╡
+        │ 1   ┆ 4   │
+        │ 2   ┆ 5   │
+        │ 3   ┆ 6   │
+        └─────┴─────┘
+        >>> agnostic_scan_csv(native_namespace=dd).compute()  # doctest:+SKIP
+           a  b
+        0  1  4
+        1  2  5
+        2  3  6
+    """
+    return _scan_csv_impl(source, native_namespace=native_namespace)
+
+
+def _scan_csv_impl(
+    source: str,
+    *,
+    native_namespace: ModuleType,
+) -> LazyFrame[Any]:
+    implementation = Implementation.from_native_namespace(native_namespace)
+    if implementation is Implementation.POLARS:
+        native_frame = native_namespace.scan_csv(source)
+    elif implementation in (
+        Implementation.PANDAS,
+        Implementation.MODIN,
+        Implementation.CUDF,
+    ):
+        native_frame = native_namespace.read_csv(source)
+    elif implementation is Implementation.PYARROW:
+        from pyarrow import csv  # ignore-banned-import
+
+        native_frame = csv.read_csv(source)
+    elif implementation is Implementation.DASK:
+        native_frame = native_namespace.read_csv(source)
+    else:  # pragma: no cover
+        try:
+            # implementation is UNKNOWN, Narwhals extension using this feature should
+            # implement `scan_csv` function in the top-level namespace.
+            native_frame = native_namespace.scan_csv(source=source)
+        except AttributeError as e:
+            msg = "Unknown namespace is expected to implement `scan_csv` function."
+            raise AttributeError(msg) from e
+    return from_native(native_frame).lazy()
diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py
@@ -25,6 +25,7 @@
 from narwhals.functions import _from_numpy_impl
 from narwhals.functions import _new_series_impl
 from narwhals.functions import _read_csv_impl
+from narwhals.functions import _scan_csv_impl
 from narwhals.functions import from_arrow as nw_from_arrow
 from narwhals.functions import get_level
 from narwhals.functions import show_versions
@@ -3447,6 +3448,60 @@ def read_csv(
     )
 
 
+def scan_csv(
+    source: str,
+    *,
+    native_namespace: ModuleType,
+) -> LazyFrame[Any]:
+    """Lazily read from a CSV file.
+
+    For the libraries that do not support lazy dataframes, the function reads
+    a csv file eagerly and then converts the resulting dataframe to a lazyframe.
+
+    Arguments:
+        source: Path to a file.
+        native_namespace: The native library to use for DataFrame creation.
+
+    Returns:
+        LazyFrame.
+
+    Examples:
+        >>> import dask.dataframe as dd
+        >>> import polars as pl
+        >>> import pyarrow as pa
+        >>> import narwhals as nw
+        >>> from narwhals.typing import IntoFrame
+        >>> from types import ModuleType
+
+        Let's create an agnostic function that lazily reads a csv file with a specified native namespace:
+
+        >>> def agnostic_scan_csv(native_namespace: ModuleType) -> IntoFrame:
+        ...     return nw.scan_csv("file.csv", native_namespace=native_namespace).to_native()
+
+        Then we can read the file by passing, for example, Polars or Dask namespaces:
+
+        >>> agnostic_scan_csv(native_namespace=pl).collect()  # doctest:+SKIP
+        shape: (3, 2)
+        ┌─────┬─────┐
+        │ a   ┆ b   │
+        │ --- ┆ --- │
+        │ i64 ┆ i64 │
+        ╞═════╪═════╡
+        │ 1   ┆ 4   │
+        │ 2   ┆ 5   │
+        │ 3   ┆ 6   │
+        └─────┴─────┘
+        >>> agnostic_scan_csv(native_namespace=dd).compute()  # doctest:+SKIP
+           a  b
+        0  1  4
+        1  2  5
+        2  3  6
+    """
+    return _stableify(  # type: ignore[no-any-return]
+        _scan_csv_impl(source, native_namespace=native_namespace)
+    )
+
+
 __all__ = [
     "Array",
     "Boolean",
@@ -3512,6 +3567,7 @@ def read_csv(
     "new_series",
     "nth",
     "read_csv",
+    "scan_csv",
     "selectors",
     "show_versions",
     "sum",

diff --git a/tests/scan_csv_test.py b/tests/scan_csv_test.py
@@ -0,0 +1,43 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import polars as pl
+
+import narwhals as nw
+import narwhals.stable.v1 as nw_v1
+from tests.utils import Constructor
+from tests.utils import assert_equal_data
+
+if TYPE_CHECKING:
+    import pytest
+
+data = {"a": [1, 2, 3], "b": [4.5, 6.7, 8.9], "z": ["x", "y", "w"]}
+
+
+def test_scan_csv(
+    tmpdir: pytest.TempdirFactory,
+    constructor: Constructor,
+) -> None:
+    df_pl = pl.DataFrame(data)
+    filepath = str(tmpdir / "file.csv")  # type: ignore[operator]
+    df_pl.write_csv(filepath)
+    df = nw.from_native(constructor(data))
+    native_namespace = nw.get_native_namespace(df)
+    result = nw.scan_csv(filepath, native_namespace=native_namespace)
+    assert_equal_data(result.collect(), data)
+    assert isinstance(result, nw.LazyFrame)
+
+
+def test_scan_csv_v1(
+    tmpdir: pytest.TempdirFactory,
+    constructor: Constructor,
+) -> None:
+    df_pl = pl.DataFrame(data)
+    filepath = str(tmpdir / "file.csv")  # type: ignore[operator]
+    df_pl.write_csv(filepath)
+    df = nw_v1.from_native(constructor(data))
+    native_namespace = nw_v1.get_native_namespace(df)
+    result = nw_v1.scan_csv(filepath, native_namespace=native_namespace)
+    assert_equal_data(result.collect(), data)
+    assert isinstance(result, nw_v1.LazyFrame)