-
Notifications
You must be signed in to change notification settings - Fork 121
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: scan_csv
#1555
feat: scan_csv
#1555
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -936,3 +936,92 @@ def get_level( | |
- 'interchange': only metadata operations are supported (`df.schema`) | ||
""" | ||
return obj._level | ||
|
||
|
||
def scan_csv( | ||
source: str, | ||
*, | ||
native_namespace: ModuleType, | ||
) -> LazyFrame[Any]: | ||
"""Lazily read from a CSV file. | ||
|
||
This allows the query optimizer to push down predicates and projections | ||
to the scan level, thereby potentially reducing memory overhead. | ||
For the libraries that do not support lazy dataframes, the function reads | ||
a csv file eagerly and then converts the resulting dataframe to a lazyframe. | ||
|
||
Arguments: | ||
source: Path to a file. | ||
native_namespace: The native library to use for DataFrame creation. | ||
|
||
Returns: | ||
LazyFrame. | ||
|
||
Examples: | ||
>>> import dask.dataframe as dd | ||
>>> import polars as pl | ||
>>> import pyarrow as pa | ||
>>> import narwhals as nw | ||
>>> from narwhals.typing import IntoFrame | ||
>>> from types import ModuleType | ||
|
||
Let's create an agnostic function that lazily reads a csv file with a specified native namespace: | ||
|
||
>>> def agnostic_scan_csv(native_namespace: ModuleType) -> IntoFrame: | ||
... return ( | ||
... nw.scan_csv("file.csv", native_namespace=native_namespace) | ||
... .to_native() | ||
... .collect() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more.
either that, or put |
||
... ) | ||
|
||
Then we can read the file by passing Polars or dask namespaces: | ||
|
||
>>> agnostic_scan_csv(native_namespace=pl) # doctest:+SKIP | ||
shape: (3, 2) | ||
┌─────┬─────┐ | ||
│ a   ┆ b   │ | ||
│ --- ┆ --- │ | ||
│ i64 ┆ i64 │ | ||
╞═════╪═════╡ | ||
│ 1   ┆ 4   │ | ||
│ 2   ┆ 5   │ | ||
│ 3   ┆ 6   │ | ||
└─────┴─────┘ | ||
>>> agnostic_scan_csv(native_namespace=dd) # doctest:+SKIP | ||
a b | ||
0 1 4 | ||
1 2 5 | ||
2 3 6 | ||
""" | ||
return _scan_csv_impl(source, native_namespace=native_namespace) | ||
|
||
|
||
def _scan_csv_impl( | ||
source: str, | ||
*, | ||
native_namespace: ModuleType, | ||
) -> LazyFrame[Any]: | ||
implementation = Implementation.from_native_namespace(native_namespace) | ||
if implementation is Implementation.POLARS: | ||
native_frame = native_namespace.scan_csv(source) | ||
elif implementation in ( | ||
Implementation.PANDAS, | ||
Implementation.MODIN, | ||
Implementation.CUDF, | ||
): | ||
native_frame = native_namespace.read_csv(source) | ||
elif implementation is Implementation.PYARROW: | ||
from pyarrow import csv # ignore-banned-import | ||
|
||
native_frame = csv.read_csv(source) | ||
elif implementation is Implementation.DASK: | ||
native_frame = native_namespace.read_csv(source) | ||
else: # pragma: no cover | ||
try: | ||
# implementation is UNKNOWN, Narwhals extension using this feature should | ||
# implement `scan_csv` function in the top-level namespace. | ||
native_frame = native_namespace.scan_csv(source=source) | ||
except AttributeError as e: | ||
msg = "Unknown namespace is expected to implement `scan_csv` function." | ||
raise AttributeError(msg) from e | ||
return from_native(native_frame).lazy() |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,6 +24,7 @@ | |
from narwhals.functions import _from_dict_impl | ||
from narwhals.functions import _from_numpy_impl | ||
from narwhals.functions import _new_series_impl | ||
from narwhals.functions import _scan_csv_impl | ||
from narwhals.functions import from_arrow as nw_from_arrow | ||
from narwhals.functions import get_level | ||
from narwhals.functions import show_versions | ||
|
@@ -3385,6 +3386,66 @@ def from_numpy( | |
) | ||
|
||
|
||
def scan_csv( | ||
source: str, | ||
*, | ||
native_namespace: ModuleType, | ||
) -> LazyFrame[Any]: | ||
"""Lazily read from a CSV file. | ||
|
||
This allows the query optimizer to push down predicates and projections | ||
to the scan level, thereby potentially reducing memory overhead. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. same |
||
For the libraries that do not support lazy dataframes, the function reads | ||
a csv file eagerly and then converts the resulting dataframe to a lazyframe. | ||
|
||
Arguments: | ||
source: Path to a file. | ||
native_namespace: The native library to use for DataFrame creation. | ||
|
||
Returns: | ||
LazyFrame. | ||
|
||
Examples: | ||
>>> import dask.dataframe as dd | ||
>>> import polars as pl | ||
>>> import pyarrow as pa | ||
>>> import narwhals as nw | ||
>>> from narwhals.typing import IntoFrame | ||
>>> from types import ModuleType | ||
|
||
Let's create an agnostic function that lazily reads a csv file with a specified native namespace: | ||
|
||
>>> def agnostic_scan_csv(native_namespace: ModuleType) -> IntoFrame: | ||
... return ( | ||
... nw.scan_csv("file.csv", native_namespace=native_namespace) | ||
... .to_native() | ||
... .collect() | ||
... ) | ||
|
||
Then we can read the file by passing Polars or dask namespaces: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. how about
? |
||
|
||
>>> agnostic_scan_csv(native_namespace=pl) # doctest:+SKIP | ||
shape: (3, 2) | ||
┌─────┬─────┐ | ||
│ a   ┆ b   │ | ||
│ --- ┆ --- │ | ||
│ i64 ┆ i64 │ | ||
╞═════╪═════╡ | ||
│ 1   ┆ 4   │ | ||
│ 2   ┆ 5   │ | ||
│ 3   ┆ 6   │ | ||
└─────┴─────┘ | ||
>>> agnostic_scan_csv(native_namespace=dd) # doctest:+SKIP | ||
a b | ||
0 1 4 | ||
1 2 5 | ||
2 3 6 | ||
""" | ||
return _stableify( # type: ignore[no-any-return] | ||
_scan_csv_impl(source, native_namespace=native_namespace) | ||
) | ||
|
||
|
||
__all__ = [ | ||
"Array", | ||
"Boolean", | ||
|
@@ -3449,6 +3510,7 @@ def from_numpy( | |
"narwhalify", | ||
"new_series", | ||
"nth", | ||
"scan_csv", | ||
"selectors", | ||
"show_versions", | ||
"sum", | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
from __future__ import annotations | ||
|
||
from typing import TYPE_CHECKING | ||
|
||
import polars as pl | ||
|
||
import narwhals as nw | ||
import narwhals.stable.v1 as nw_v1 | ||
from tests.utils import Constructor | ||
from tests.utils import assert_equal_data | ||
|
||
if TYPE_CHECKING: | ||
import pytest | ||
|
||
data = {"a": [1, 2, 3], "b": [4.5, 6.7, 8.9], "z": ["x", "y", "w"]} | ||
|
||
|
||
def test_scan_csv( | ||
tmpdir: pytest.TempdirFactory, | ||
constructor: Constructor, | ||
) -> None: | ||
df_pl = pl.DataFrame(data) | ||
filepath = str(tmpdir / "file.csv") # type: ignore[operator] | ||
df_pl.write_csv(filepath) | ||
df = nw.from_native(constructor(data)) | ||
native_namespace = nw.get_native_namespace(df) | ||
result = nw.scan_csv(filepath, native_namespace=native_namespace) | ||
assert_equal_data(result.collect(), data) | ||
assert isinstance(result, nw.LazyFrame) | ||
|
||
|
||
def test_scan_csv_v1( | ||
tmpdir: pytest.TempdirFactory, | ||
constructor: Constructor, | ||
) -> None: | ||
df_pl = pl.DataFrame(data) | ||
filepath = str(tmpdir / "file.csv") # type: ignore[operator] | ||
df_pl.write_csv(filepath) | ||
df = nw_v1.from_native(constructor(data)) | ||
native_namespace = nw_v1.get_native_namespace(df) | ||
result = nw_v1.scan_csv(filepath, native_namespace=native_namespace) | ||
assert_equal_data(result.collect(), data) | ||
assert isinstance(result, nw_v1.LazyFrame) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is Polars-specific, perhaps we can remove it?