Merge pull request #17 from DanielAvdar/init
Add Support for Unsigned Integer Data Types and Expand Documentation.
DanielAvdar authored Mar 13, 2024
2 parents cf63880 + 78f800b commit b033901
Showing 9 changed files with 94 additions and 47 deletions.
44 changes: 43 additions & 1 deletion README.md
@@ -179,9 +179,51 @@ dtype: object
## Purposes

- Simplify the conversion between the pandas pyarrow and numpy backends.
- Allow seamlessly switch to pyarrow pandas backend.
- Allow a seamless switch to the pyarrow pandas backend, even for problematic dtypes such as float16 or db-dtypes.
- Dtype standardization for db-dtypes used by the BigQuery Python SDK.


example:

```python
import pandas as pd

# Create a pandas DataFrame with a float16 column
df = pd.DataFrame({
    'C': [1.1, 2.2, 3.3],
}, dtype='float16')

df.convert_dtypes(dtype_backend='pyarrow')
```
will raise an error:
```
pyarrow.lib.ArrowNotImplementedError: Unsupported cast from halffloat to double using function cast_double
```
but with pandas-pyarrow:
```python
import pandas as pd

from pandas_pyarrow import convert_to_pyarrow

# Create a pandas DataFrame with a float16 column
df = pd.DataFrame({
    'C': [1.1, 2.2, 3.3],
}, dtype='float16')
adf = convert_to_pyarrow(df)
print(adf.dtypes)

```
outputs:
```
C halffloat[pyarrow]
dtype: object
```
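The same call covers the unsigned integer dtypes added in this commit; a minimal sketch (the expected `uint16[pyarrow]` result follows the test case added in `tests/unit/test_numeric_types.py`, and `'U'` is a hypothetical column name used for illustration):
```python
import pandas as pd

from pandas_pyarrow import convert_to_pyarrow

# 'U' is a hypothetical column name used for illustration
df = pd.DataFrame({'U': [1, 2, 3]}, dtype='uint16')
adf = convert_to_pyarrow(df)
print(adf.dtypes)  # expected: U    uint16[pyarrow]
```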


## Additional Information

When converting from higher precision numerical dtypes (like float64) to …
1 change: 1 addition & 0 deletions pandas_pyarrow/mappers/__init__.py
@@ -11,6 +11,7 @@ def create_mapper() -> Dict[str, str]:
**numeric_mapper(["float"], ["16", "32", "64"]),
**numeric_mapper(["int"], ["8", "16", "32", "64"]),
**numeric_mapper(["Float", "Int"], ["32", "64"]),
**numeric_mapper(["uint"], ["8", "16", "32", "64"]),
**datetime_mapper(),
**mapper_dict_dt,
**mapper_dict_object,
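For context, a short sketch of how the expanded mapping might be consumed; the exact key/value strings are an assumption inferred from the test cases later in this commit:
```python
from typing import Dict

from pandas_pyarrow.mappers import create_mapper

mapping: Dict[str, str] = create_mapper()
# With the new uint entries, a lookup like this is expected to succeed:
assert mapping["uint16"] == "uint16[pyarrow]"
```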
16 changes: 8 additions & 8 deletions poetry.lock


2 changes: 1 addition & 1 deletion pyproject.toml
@@ -42,7 +42,7 @@ packages = [{ include = "pandas_pyarrow", from = "." }]
[tool.poetry.dependencies]
python = ">=3.9,<3.13"
pandas = ">=2"
pyarrow = ">=7.0.0, <=15.0.0"
pyarrow = ">=10.0.1, <=15.0.0"
db-dtypes = { version = ">=1", optional = true }
pandas-gbq = { version = ">=0.15.0", optional = true }

27 changes: 15 additions & 12 deletions tests/unit/property_based/pb_sts.py
@@ -5,30 +5,33 @@
from hypothesis.extra.pandas import columns, data_frames, range_indexes
from hypothesis.strategies import composite

# Introduced constants
DTYPES_SAMPLE: List[Union[type, str]] = [
int,
float,
# Dtypes convertible to pyarrow via the pandas API
COMMON_DTYPES_SAMPLE: List[Union[type, str]] = [
bool,
str,
int,
"datetime64[ns]",
"timedelta64[ns]",
"int8",
"int16",
"int32",
"int64",
"uint8",
"uint16",
"uint32",
"uint64",
]
# Dtypes not convertible to pyarrow via the pandas API (pyarrow.lib.ArrowNotImplementedError)
UNCOMMON_DTYPES_SAMPLE: List[Union[type, str]] = [
float,
"float16",
"float32",
"float64",
"complex64",
"complex128",
]


@composite
def dtypes_st(draw: Any) -> Any:
dtypes = st.sampled_from(DTYPES_SAMPLE)
return draw(dtypes)


def create_dataframe(draw: Any, gen_type: str) -> pd.DataFrame:
dfs_st = data_frames(
columns=columns(
@@ -46,13 +49,13 @@ def create_dataframe(draw: Any, gen_type: str) -> pd.DataFrame:


@composite
def df_st(draw: Any) -> pd.DataFrame:
def df_st(draw: Any, dtypes: List[Any]) -> pd.DataFrame:
col_names = draw(st.sets(st.text(min_size=1, max_size=10), min_size=2, max_size=5))

dfs_st = data_frames(
columns=columns(
col_names,
dtype=draw(st.sampled_from(DTYPES_SAMPLE)),
dtype=draw(st.sampled_from(dtypes)),
),
index=range_indexes(
min_size=2,
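As a rough illustration (not part of the diff), the newly parameterized strategy can be sampled directly; `.example()` is Hypothesis's draw-one helper, meant for exploration rather than real tests:
```python
from tests.unit.property_based.pb_sts import COMMON_DTYPES_SAMPLE, df_st

# Draw a single DataFrame whose columns all use one dtype from the common set
sample_df = df_st(dtypes=COMMON_DTYPES_SAMPLE).example()
print(sample_df.dtypes)
```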
13 changes: 5 additions & 8 deletions tests/unit/property_based/test_dt.py
@@ -3,24 +3,21 @@
#
# import hypothesis as hp
#
# from pandas_pyarrow import SchemArrow
# from pandas_pyarrow.mappers import DateTimeMapper
# from pandas_pyarrow import convert_to_pyarrow
# from pandas_pyarrow.mappers import datetime_mapper
# from tests.unit.property_based.pb_sts import single_column_df_st
#
#
# @hp.given(
# pair=single_column_df_st(
# gen_type='datetime64[ns]',
# pair_mapping=DateTimeMapper(
# source_type=["datetime64", ],
# variations=[],
# )()
# pair_mapping=datetime_mapper(
# )
#
# )
# )
# def test_datetime_numpy_api_hp(pair: Tuple[pd.DataFrame, str]):
# sa = SchemArrow()
# df, target_dtype = pair
# adf = sa(df)
# adf = convert_to_pyarrow(df)
#
# assert list(adf.dtypes)[0] == target_dtype
21 changes: 13 additions & 8 deletions tests/unit/property_based/test_general.py
@@ -1,19 +1,24 @@
from pandas_pyarrow.pda_converter import PandasArrowConverter
from tests.unit.property_based.pb_sts import df_st
from pandas_pyarrow import convert_to_pyarrow
from tests.unit.property_based.pb_sts import COMMON_DTYPES_SAMPLE, UNCOMMON_DTYPES_SAMPLE, df_st

import hypothesis as hp


@hp.given(df=df_st())
@hp.settings(max_examples=500)
def test_dtypes_hp(df):
@hp.given(df=df_st(dtypes=COMMON_DTYPES_SAMPLE + UNCOMMON_DTYPES_SAMPLE))
def test_uncommon_dtypes_hp(df):
df_copy = df.copy()
sa = PandasArrowConverter()
adf = sa(df)
adf = convert_to_pyarrow(df)

new_dtypes_names = [repr(i) for i in adf.dtypes.tolist()]
is_arrows = ["[pyarrow]" in dtype for dtype in new_dtypes_names]
assert all(is_arrows), "Some dtypes are not converted"
assert not df.equals(adf), "The original df has been modified"
assert df.equals(df_copy), "The original df has been modified"
assert adf.equals(sa(adf)), "The conversion is not idempotent"
assert adf.equals(convert_to_pyarrow(adf)), "The conversion is not idempotent"


@hp.given(df=df_st(dtypes=COMMON_DTYPES_SAMPLE))
def test_common_dtypes_hp(df):
adf_pd_api = df.convert_dtypes(dtype_backend="pyarrow")
adf = convert_to_pyarrow(df)
assert adf_pd_api.compare(adf).empty, "The conversion is not consistent with pandas api"
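The new `test_common_dtypes_hp` pins the conversion to the pandas API for the common dtypes; a hedged standalone sketch of the same property, assuming `int64` belongs to that set:
```python
import pandas as pd

from pandas_pyarrow import convert_to_pyarrow

df = pd.DataFrame({"a": [1, 2, 3]}, dtype="int64")
adf_pd_api = df.convert_dtypes(dtype_backend="pyarrow")
adf = convert_to_pyarrow(df)
# Both conversion paths should agree for dtypes the pandas API supports
assert adf_pd_api.compare(adf).empty
```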
14 changes: 5 additions & 9 deletions tests/unit/property_based/test_numeric.py
@@ -1,6 +1,6 @@
from typing import Tuple

from pandas_pyarrow import PandasArrowConverter
from pandas_pyarrow import convert_to_pyarrow
from pandas_pyarrow.mappers import numeric_mapper
from tests.unit.property_based.pb_sts import single_column_df_st

@@ -19,9 +19,8 @@
)
)
def test_float_numpy_api_hp(pair: Tuple[pd.DataFrame, str]):
sa = PandasArrowConverter()
df, target_dtype = pair
adf = sa(df)
adf = convert_to_pyarrow(df)

assert list(adf.dtypes)[0] == target_dtype

@@ -37,9 +36,8 @@ def test_float_numpy_api_hp(pair: Tuple[pd.DataFrame, str]):
)
)
def test_float_array_api_hp(pair: Tuple[pd.DataFrame, str]):
sa = PandasArrowConverter()
df, target_dtype = pair
adf = sa(df)
adf = convert_to_pyarrow(df)

assert list(adf.dtypes)[0] == target_dtype

@@ -55,9 +53,8 @@ def test_float_array_api_hp(pair: Tuple[pd.DataFrame, str]):
)
)
def test_int_numpy_api_hp(pair: Tuple[pd.DataFrame, str]):
sa = PandasArrowConverter()
df, target_dtype = pair
adf = sa(df)
adf = convert_to_pyarrow(df)

assert list(adf.dtypes)[0] == target_dtype

@@ -73,8 +70,7 @@ def test_int_numpy_api_hp(pair: Tuple[pd.DataFrame, str]):
)
)
def test_int_array_api_hp(pair: Tuple[pd.DataFrame, str]):
sa = PandasArrowConverter()
df, target_dtype = pair
adf = sa(df)
adf = convert_to_pyarrow(df)

assert list(adf.dtypes)[0] == target_dtype
3 changes: 3 additions & 0 deletions tests/unit/test_numeric_types.py
@@ -21,6 +21,9 @@ def create_test_case(
@create_test_case("int32", "int32[pyarrow]", [1, 2, 3])
@create_test_case("float64", "float64[pyarrow]", [1.0, 2.0, 3.0, None])
@create_test_case("float32", "float32[pyarrow]", [1.0, 2.0, 3.0, None])
@create_test_case("float16", "float16[pyarrow]", [1.0, 2.0, 3.0, None])
@create_test_case("uint16", "uint16[pyarrow]", [1, 2, 3])
@create_test_case("complex64", "string[pyarrow]", [1, 2, 3])
@create_test_case("float32[pyarrow]", "float32[pyarrow]", [1.0, 2.0, 3.0, None])
def test_numeric_types(df_data, expected_dtype):
sa = PandasArrowConverter()
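Note the `complex64` case above: Arrow has no complex type, so pandas-pyarrow falls back to `string[pyarrow]`. A minimal sketch of that expected behavior, inferred from the test case:
```python
import pandas as pd

from pandas_pyarrow import convert_to_pyarrow

df = pd.DataFrame({"z": [1 + 2j, 3 + 4j]}, dtype="complex64")
adf = convert_to_pyarrow(df)
print(adf.dtypes)  # expected: z    string[pyarrow]
```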
