Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support DataFrame Interchange Protocol (allow Polars DataFrames) #2888

Merged
merged 11 commits into from
Feb 18, 2023
55 changes: 53 additions & 2 deletions altair/utils/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ def limit_rows(data, max_rows=5000):
values = data["values"]
else:
return data
elif hasattr(data, "__dataframe__"):
values = data
if max_rows is not None and len(values) > max_rows:
raise MaxRowsError(
"The number of rows in your dataset is greater "
Expand All @@ -98,6 +100,13 @@ def sample(data, n=None, frac=None):
n = n if n else int(frac * len(values))
values = random.sample(values, n)
return {"values": values}
elif hasattr(data, "__dataframe__"):
# experimental interchange dataframe support
pi = import_pyarrow_interchange()
pa_table = pi.from_dataframe(data)
n = n if n else int(frac * len(pa_table))
indices = random.sample(range(len(pa_table)), n)
return pa_table.take(indices)


@curried.curry
Expand Down Expand Up @@ -152,12 +161,17 @@ def to_values(data):
if "values" not in data:
raise KeyError("values expected in data dict, but not present.")
return data
elif hasattr(data, "__dataframe__"):
# experimental interchange dataframe support
pi = import_pyarrow_interchange()
pa_table = pi.from_dataframe(data)
return {"values": pa_table.to_pylist()}


def check_data_type(data):
"""Raise if the data is not a dict or DataFrame."""
if not isinstance(data, (dict, pd.DataFrame)) and not hasattr(
data, "__geo_interface__"
if not isinstance(data, (dict, pd.DataFrame)) and not any(
hasattr(data, attr) for attr in ["__geo_interface__", "__dataframe__"]
):
raise TypeError(
"Expected dict, DataFrame or a __geo_interface__ attribute, got: {}".format(
Expand Down Expand Up @@ -190,6 +204,11 @@ def _data_to_json_string(data):
if "values" not in data:
raise KeyError("values expected in data dict, but not present.")
return json.dumps(data["values"], sort_keys=True)
elif hasattr(data, "__dataframe__"):
# experimental interchange dataframe support
pi = import_pyarrow_interchange()
pa_table = pi.from_dataframe(data)
return json.dumps(pa_table.to_pylist())
else:
raise NotImplementedError(
"to_json only works with data expressed as " "a DataFrame or as a dict"
Expand All @@ -211,6 +230,16 @@ def _data_to_csv_string(data):
if "values" not in data:
raise KeyError("values expected in data dict, but not present")
return pd.DataFrame.from_dict(data["values"]).to_csv(index=False)
elif hasattr(data, "__dataframe__"):
# experimental interchange dataframe support
pi = import_pyarrow_interchange()
import pyarrow as pa
import pyarrow.csv as pa_csv

pa_table = pi.from_dataframe(data)
csv_buffer = pa.BufferOutputStream()
pa_csv.write_csv(pa_table, csv_buffer)
return csv_buffer.getvalue().to_pybytes().decode()
else:
raise NotImplementedError(
"to_csv only works with data expressed as " "a DataFrame or as a dict"
Expand Down Expand Up @@ -242,3 +271,25 @@ def curry(*args, **kwargs):
AltairDeprecationWarning,
)
return curried.curry(*args, **kwargs)


def import_pyarrow_interchange():
    """Return the ``pyarrow.interchange`` module, checking the installed version.

    The DataFrame Interchange Protocol code paths need ``pyarrow>=11.0.0``.
    The requirement is validated with ``pkg_resources`` before the module is
    imported, so that a missing or outdated installation produces an
    actionable message.

    Returns
    -------
    module
        The imported ``pyarrow.interchange`` module.

    Raises
    ------
    ImportError
        If ``pkg_resources`` reports that the ``pyarrow>=11.0.0`` requirement
        cannot be found, or that the installed version conflicts with it.
        The original ``pkg_resources`` exception is attached as ``__cause__``.
    """
    # Imported lazily so pkg_resources is only loaded when this path is used.
    import pkg_resources

    # Keep the try body to the single call that can raise the handled errors.
    try:
        pkg_resources.require("pyarrow>=11.0.0")
    except pkg_resources.DistributionNotFound as err:
        # The package is not installed. Chain explicitly so the underlying
        # pkg_resources error stays visible as the direct cause.
        raise ImportError(
            "Usage of the DataFrame Interchange Protocol requires the package "
            "'pyarrow', but it is not installed."
        ) from err
    except pkg_resources.VersionConflict as err:
        # The package is installed but does not meet the minimum version.
        raise ImportError(
            "The installed version of 'pyarrow' does not meet the minimum requirement of version 11.0.0. "
            "Please update 'pyarrow' to use the DataFrame Interchange Protocol."
        ) from err

    # The package is installed and meets the minimum version requirement.
    import pyarrow.interchange as pi

    return pi
7 changes: 5 additions & 2 deletions altair/vegalite/v5/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,13 +97,16 @@ def _prepare_data(data, context=None):
return data

# convert dataframes or objects with __geo_interface__ to dict
if isinstance(data, pd.DataFrame) or hasattr(data, "__geo_interface__"):
elif isinstance(data, pd.DataFrame) or hasattr(data, "__geo_interface__"):
data = _pipe(data, data_transformers.get())

# convert string input to a URLData
if isinstance(data, str):
elif isinstance(data, str):
data = core.UrlData(data)

elif hasattr(data, "__dataframe__"):
data = _pipe(data, data_transformers.get())

# consolidate inline data to top-level datasets
if context is not None and data_transformers.consolidate_datasets:
data = _consolidate_data(data, context)
Expand Down
1 change: 1 addition & 0 deletions doc/releases/changes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ Enhancements
- The documentation page has been revamped, both in terms of appearance and content.
- More informative autocompletion by removing deprecated methods (#2814) and adding support for completion in method chains for editors that rely on type hints (e.g. VS Code) (#2846)
- Improved error messages (#2842)
- Include experimental support for the DataFrame Interchange Protocol (through the ``__dataframe__`` attribute). This requires ``pyarrow>=11.0.0`` (#2888).

Grammar Changes
~~~~~~~~~~~~~~~
Expand Down
1 change: 1 addition & 0 deletions doc/user_guide/data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ there are many different ways of specifying a dataset:
- as a url string pointing to a ``json`` or ``csv`` formatted text file
- as a `geopandas GeoDataFrame <http://geopandas.org/data_structures.html#geodataframe>`_, `Shapely Geometries <https://shapely.readthedocs.io/en/latest/manual.html#geometric-objects>`_, `GeoJSON Objects <https://github.com/jazzband/geojson#geojson-objects>`_ or other objects that support the ``__geo_interface__``
- as a generated dataset such as numerical sequences or geographic reference elements
- as a DataFrame that supports the DataFrame Interchange Protocol (contains a ``__dataframe__`` attribute). This is experimental.

When data is specified as a DataFrame, the encoding is quite simple, as Altair
uses the data type information provided by pandas to automatically determine
Expand Down