Skip to content

Commit

Permalink
feat: scan_csv (#1555)
Browse files Browse the repository at this point in the history
* add scan_csv

* add collect
  • Loading branch information
raisadz authored Dec 10, 2024
1 parent 8c9f026 commit ade07ae
Show file tree
Hide file tree
Showing 5 changed files with 185 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/api-reference/narwhals.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ Here are the top-level functions available in Narwhals.
- new_series
- nth
- read_csv
- scan_csv
- sum
- sum_horizontal
- show_versions
Expand Down
2 changes: 2 additions & 0 deletions narwhals/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
from narwhals.functions import get_level
from narwhals.functions import new_series
from narwhals.functions import read_csv
from narwhals.functions import scan_csv
from narwhals.functions import show_versions
from narwhals.schema import Schema
from narwhals.series import Series
Expand Down Expand Up @@ -140,6 +141,7 @@
"new_series",
"nth",
"read_csv",
"scan_csv",
"selectors",
"show_versions",
"stable",
Expand Down
83 changes: 83 additions & 0 deletions narwhals/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1020,3 +1020,86 @@ def _read_csv_impl(
msg = "Unknown namespace is expected to implement `read_csv` function."
raise AttributeError(msg) from e
return from_native(native_frame, eager_only=True)


def scan_csv(
source: str,
*,
native_namespace: ModuleType,
) -> LazyFrame[Any]:
"""Lazily read from a CSV file.
For the libraries that do not support lazy dataframes, the function reads
a csv file eagerly and then converts the resulting dataframe to a lazyframe.
Arguments:
source: Path to a file.
native_namespace: The native library to use for DataFrame creation.
Returns:
LazyFrame.
Examples:
>>> import dask.dataframe as dd
>>> import polars as pl
>>> import pyarrow as pa
>>> import narwhals as nw
>>> from narwhals.typing import IntoFrame
>>> from types import ModuleType
Let's create an agnostic function that lazily reads a csv file with a specified native namespace:
>>> def agnostic_scan_csv(native_namespace: ModuleType) -> IntoFrame:
... return nw.scan_csv("file.csv", native_namespace=native_namespace).to_native()
Then we can read the file by passing, for example, Polars or Dask namespaces:
>>> agnostic_scan_csv(native_namespace=pl).collect() # doctest:+SKIP
shape: (3, 2)
┌─────┬─────┐
│ a ┆ b │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 1 ┆ 4 │
│ 2 ┆ 5 │
│ 3 ┆ 6 │
└─────┴─────┘
>>> agnostic_scan_csv(native_namespace=dd).compute() # doctest:+SKIP
a b
0 1 4
1 2 5
2 3 6
"""
return _scan_csv_impl(source, native_namespace=native_namespace)


def _scan_csv_impl(
source: str,
*,
native_namespace: ModuleType,
) -> LazyFrame[Any]:
implementation = Implementation.from_native_namespace(native_namespace)
if implementation is Implementation.POLARS:
native_frame = native_namespace.scan_csv(source)
elif implementation in (
Implementation.PANDAS,
Implementation.MODIN,
Implementation.CUDF,
):
native_frame = native_namespace.read_csv(source)
elif implementation is Implementation.PYARROW:
from pyarrow import csv # ignore-banned-import

native_frame = csv.read_csv(source)
elif implementation is Implementation.DASK:
native_frame = native_namespace.read_csv(source)
else: # pragma: no cover
try:
# implementation is UNKNOWN, Narwhals extension using this feature should
# implement `scan_csv` function in the top-level namespace.
native_frame = native_namespace.scan_csv(source=source)
except AttributeError as e:
msg = "Unknown namespace is expected to implement `scan_csv` function."
raise AttributeError(msg) from e
return from_native(native_frame).lazy()
56 changes: 56 additions & 0 deletions narwhals/stable/v1/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from narwhals.functions import _from_numpy_impl
from narwhals.functions import _new_series_impl
from narwhals.functions import _read_csv_impl
from narwhals.functions import _scan_csv_impl
from narwhals.functions import from_arrow as nw_from_arrow
from narwhals.functions import get_level
from narwhals.functions import show_versions
Expand Down Expand Up @@ -3447,6 +3448,60 @@ def read_csv(
)


def scan_csv(
source: str,
*,
native_namespace: ModuleType,
) -> LazyFrame[Any]:
"""Lazily read from a CSV file.
For the libraries that do not support lazy dataframes, the function reads
a csv file eagerly and then converts the resulting dataframe to a lazyframe.
Arguments:
source: Path to a file.
native_namespace: The native library to use for DataFrame creation.
Returns:
LazyFrame.
Examples:
>>> import dask.dataframe as dd
>>> import polars as pl
>>> import pyarrow as pa
>>> import narwhals as nw
>>> from narwhals.typing import IntoFrame
>>> from types import ModuleType
Let's create an agnostic function that lazily reads a csv file with a specified native namespace:
>>> def agnostic_scan_csv(native_namespace: ModuleType) -> IntoFrame:
... return nw.scan_csv("file.csv", native_namespace=native_namespace).to_native()
Then we can read the file by passing, for example, Polars or Dask namespaces:
>>> agnostic_scan_csv(native_namespace=pl).collect() # doctest:+SKIP
shape: (3, 2)
┌─────┬─────┐
│ a ┆ b │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 1 ┆ 4 │
│ 2 ┆ 5 │
│ 3 ┆ 6 │
└─────┴─────┘
>>> agnostic_scan_csv(native_namespace=dd).compute() # doctest:+SKIP
a b
0 1 4
1 2 5
2 3 6
"""
return _stableify( # type: ignore[no-any-return]
_scan_csv_impl(source, native_namespace=native_namespace)
)


__all__ = [
"Array",
"Boolean",
Expand Down Expand Up @@ -3512,6 +3567,7 @@ def read_csv(
"new_series",
"nth",
"read_csv",
"scan_csv",
"selectors",
"show_versions",
"sum",
Expand Down
43 changes: 43 additions & 0 deletions tests/scan_csv_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from __future__ import annotations

from typing import TYPE_CHECKING

import polars as pl

import narwhals as nw
import narwhals.stable.v1 as nw_v1
from tests.utils import Constructor
from tests.utils import assert_equal_data

if TYPE_CHECKING:
import pytest

data = {"a": [1, 2, 3], "b": [4.5, 6.7, 8.9], "z": ["x", "y", "w"]}


def test_scan_csv(
tmpdir: pytest.TempdirFactory,
constructor: Constructor,
) -> None:
df_pl = pl.DataFrame(data)
filepath = str(tmpdir / "file.csv") # type: ignore[operator]
df_pl.write_csv(filepath)
df = nw.from_native(constructor(data))
native_namespace = nw.get_native_namespace(df)
result = nw.scan_csv(filepath, native_namespace=native_namespace)
assert_equal_data(result.collect(), data)
assert isinstance(result, nw.LazyFrame)


def test_scan_csv_v1(
tmpdir: pytest.TempdirFactory,
constructor: Constructor,
) -> None:
df_pl = pl.DataFrame(data)
filepath = str(tmpdir / "file.csv") # type: ignore[operator]
df_pl.write_csv(filepath)
df = nw_v1.from_native(constructor(data))
native_namespace = nw_v1.get_native_namespace(df)
result = nw_v1.scan_csv(filepath, native_namespace=native_namespace)
assert_equal_data(result.collect(), data)
assert isinstance(result, nw_v1.LazyFrame)

0 comments on commit ade07ae

Please sign in to comment.