diff --git a/docs/api-reference/narwhals.md b/docs/api-reference/narwhals.md index fccefc416..61d7fa8b1 100644 --- a/docs/api-reference/narwhals.md +++ b/docs/api-reference/narwhals.md @@ -38,6 +38,7 @@ Here are the top-level functions available in Narwhals. - new_series - nth - read_csv + - scan_csv - sum - sum_horizontal - show_versions diff --git a/narwhals/__init__.py b/narwhals/__init__.py index 22b6f62ab..d4dc5d424 100644 --- a/narwhals/__init__.py +++ b/narwhals/__init__.py @@ -56,6 +56,7 @@ from narwhals.functions import get_level from narwhals.functions import new_series from narwhals.functions import read_csv +from narwhals.functions import scan_csv from narwhals.functions import show_versions from narwhals.schema import Schema from narwhals.series import Series @@ -140,6 +141,7 @@ "new_series", "nth", "read_csv", + "scan_csv", "selectors", "show_versions", "stable", diff --git a/narwhals/functions.py b/narwhals/functions.py index 4505e42e7..41cb671fd 100644 --- a/narwhals/functions.py +++ b/narwhals/functions.py @@ -1020,3 +1020,86 @@ def _read_csv_impl( msg = "Unknown namespace is expected to implement `read_csv` function." raise AttributeError(msg) from e return from_native(native_frame, eager_only=True) + + +def scan_csv( + source: str, + *, + native_namespace: ModuleType, +) -> LazyFrame[Any]: + """Lazily read from a CSV file. + + For the libraries that do not support lazy dataframes, the function reads + a csv file eagerly and then converts the resulting dataframe to a lazyframe. + + Arguments: + source: Path to a file. + native_namespace: The native library to use for DataFrame creation. + + Returns: + LazyFrame. + + Examples: + >>> import dask.dataframe as dd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrame + >>> from types import ModuleType + + Let's create an agnostic function that lazily reads a csv file with a specified native namespace: + + >>> def agnostic_scan_csv(native_namespace: ModuleType) -> IntoFrame: + ... return nw.scan_csv("file.csv", native_namespace=native_namespace).to_native() + + Then we can read the file by passing, for example, Polars or Dask namespaces: + + >>> agnostic_scan_csv(native_namespace=pl).collect() # doctest:+SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + └─────┴─────┘ + >>> agnostic_scan_csv(native_namespace=dd).compute() # doctest:+SKIP + a b + 0 1 4 + 1 2 5 + 2 3 6 + """ + return _scan_csv_impl(source, native_namespace=native_namespace) + + +def _scan_csv_impl( + source: str, + *, + native_namespace: ModuleType, +) -> LazyFrame[Any]: + implementation = Implementation.from_native_namespace(native_namespace) + if implementation is Implementation.POLARS: + native_frame = native_namespace.scan_csv(source) + elif implementation in ( + Implementation.PANDAS, + Implementation.MODIN, + Implementation.CUDF, + ): + native_frame = native_namespace.read_csv(source) + elif implementation is Implementation.PYARROW: + from pyarrow import csv # ignore-banned-import + + native_frame = csv.read_csv(source) + elif implementation is Implementation.DASK: + native_frame = native_namespace.read_csv(source) + else: # pragma: no cover + try: + # implementation is UNKNOWN, Narwhals extension using this feature should + # implement `scan_csv` function in the top-level namespace. + native_frame = native_namespace.scan_csv(source=source) + except AttributeError as e: + msg = "Unknown namespace is expected to implement `scan_csv` function." + raise AttributeError(msg) from e + return from_native(native_frame).lazy() diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index 2d4eb0bed..7518a72ee 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -25,6 +25,7 @@ from narwhals.functions import _from_numpy_impl from narwhals.functions import _new_series_impl from narwhals.functions import _read_csv_impl +from narwhals.functions import _scan_csv_impl from narwhals.functions import from_arrow as nw_from_arrow from narwhals.functions import get_level from narwhals.functions import show_versions @@ -3447,6 +3448,60 @@ def read_csv( ) +def scan_csv( + source: str, + *, + native_namespace: ModuleType, +) -> LazyFrame[Any]: + """Lazily read from a CSV file. + + For the libraries that do not support lazy dataframes, the function reads + a csv file eagerly and then converts the resulting dataframe to a lazyframe. + + Arguments: + source: Path to a file. + native_namespace: The native library to use for DataFrame creation. + + Returns: + LazyFrame. + + Examples: + >>> import dask.dataframe as dd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrame + >>> from types import ModuleType + + Let's create an agnostic function that lazily reads a csv file with a specified native namespace: + + >>> def agnostic_scan_csv(native_namespace: ModuleType) -> IntoFrame: + ... return nw.scan_csv("file.csv", native_namespace=native_namespace).to_native() + + Then we can read the file by passing, for example, Polars or Dask namespaces: + + >>> agnostic_scan_csv(native_namespace=pl).collect() # doctest:+SKIP + shape: (3, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 1 ┆ 4 │ + │ 2 ┆ 5 │ + │ 3 ┆ 6 │ + └─────┴─────┘ + >>> agnostic_scan_csv(native_namespace=dd).compute() # doctest:+SKIP + a b + 0 1 4 + 1 2 5 + 2 3 6 + """ + return _stableify( # type: ignore[no-any-return] + _scan_csv_impl(source, native_namespace=native_namespace) + ) + + __all__ = [ "Array", "Boolean", @@ -3512,6 +3567,7 @@ def read_csv( "new_series", "nth", "read_csv", + "scan_csv", "selectors", "show_versions", "sum", diff --git a/tests/scan_csv_test.py b/tests/scan_csv_test.py new file mode 100644 index 000000000..4f0083667 --- /dev/null +++ b/tests/scan_csv_test.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl + +import narwhals as nw +import narwhals.stable.v1 as nw_v1 +from tests.utils import Constructor +from tests.utils import assert_equal_data + +if TYPE_CHECKING: + import pytest + +data = {"a": [1, 2, 3], "b": [4.5, 6.7, 8.9], "z": ["x", "y", "w"]} + + +def test_scan_csv( + tmpdir: pytest.TempdirFactory, + constructor: Constructor, +) -> None: + df_pl = pl.DataFrame(data) + filepath = str(tmpdir / "file.csv") # type: ignore[operator] + df_pl.write_csv(filepath) + df = nw.from_native(constructor(data)) + native_namespace = nw.get_native_namespace(df) + result = nw.scan_csv(filepath, native_namespace=native_namespace) + assert_equal_data(result.collect(), data) + assert isinstance(result, nw.LazyFrame) + + +def test_scan_csv_v1( + tmpdir: pytest.TempdirFactory, + constructor: Constructor, +) -> None: + df_pl = pl.DataFrame(data) + filepath = str(tmpdir / "file.csv") # type: ignore[operator] + df_pl.write_csv(filepath) + df = nw_v1.from_native(constructor(data)) + native_namespace = nw_v1.get_native_namespace(df) + result = nw_v1.scan_csv(filepath, native_namespace=native_namespace) + assert_equal_data(result.collect(), data) + assert isinstance(result, nw_v1.LazyFrame)