From 7411b9622dabae5ef747fc5f880feb6bdc963f60 Mon Sep 17 00:00:00 2001 From: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com> Date: Wed, 13 Mar 2024 17:31:47 +0200 Subject: [PATCH 1/8] Add pyarrow conversion example to README The README has been updated to include an example of how to use the pandas-pyarrow library. This provides a specific illustration of handling problematic data types like float16 or db-dtypes that can cause issues in PyArrow, aiding users to fully exploit the library's benefits. Signed-off-by: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com> --- README.md | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index de383fc..778b3b2 100644 --- a/README.md +++ b/README.md @@ -179,9 +179,51 @@ dtype: object ## Purposes - Simplify the conversion between pandas pyarrow and numpy backends. -- Allow seamlessly switch to pyarrow pandas backend. +- Allow seamless switching to the pyarrow pandas backend, even for problematic dtypes such as float16 or db-dtypes. - dtype standardization for db-dtypes used by bigquery python sdk. 
+ +example: + +```python +import pandas as pd + +# Create a pandas DataFrame +df = pd.DataFrame({ + + 'C': [1.1, 2.2, 3.3], + +}, dtype='float16') + +df.convert_dtypes(dtype_backend='pyarrow') +``` +will raise an error: +``` +pyarrow.lib.ArrowNotImplementedError: Unsupported cast from halffloat to double using function cast_double +``` +but with pandas-pyarrow: +```python +import pandas as pd + +from pandas_pyarrow import convert_to_pyarrow + +# Create a pandas DataFrame +df = pd.DataFrame({ + + 'C': [1.1, 2.2, 3.3], + +}, dtype='float16') +adf = convert_to_pyarrow(df) +print(adf.dtypes) + +``` +outputs: +``` +C halffloat[pyarrow] +dtype: object +``` + + ## Additional Information When converting from higher precision numerical dtypes (like float64) to From 4b01b370a4114b49ee61fc14d878fcbd9f09741d Mon Sep 17 00:00:00 2001 From: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com> Date: Wed, 13 Mar 2024 17:32:06 +0200 Subject: [PATCH 2/8] Update minimum pyarrow version in dependencies The minimum version for the pyarrow dependency in the pyproject.toml file has been updated from "7.0.0" to "10.0.1". Users are now required to use at least version 10.0.1 of pyarrow in order to prevent potential compatibility issues. 
Signed-off-by: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com> --- poetry.lock | 16 ++++++++-------- pyproject.toml | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/poetry.lock b/poetry.lock index e89b66d..41e062e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -606,13 +606,13 @@ protobuf = ">=4.21.6" [[package]] name = "hypothesis" -version = "6.99.2" +version = "6.99.5" description = "A library for property-based testing" optional = false python-versions = ">=3.8" files = [ - {file = "hypothesis-6.99.2-py3-none-any.whl", hash = "sha256:f277f6ccb074f39d51c7f32ba5a0ff640dba9b71ef69ea1e1e09b6f7c25302f5"}, - {file = "hypothesis-6.99.2.tar.gz", hash = "sha256:24453b1a86151be83d26e81834e29022b3e3f0fc5d71275cc3d096649a13c53c"}, + {file = "hypothesis-6.99.5-py3-none-any.whl", hash = "sha256:0ab4968fa4c38ba6d3cd9f54f3d637e3c72fe136bff11373355f2e06416c6a7d"}, + {file = "hypothesis-6.99.5.tar.gz", hash = "sha256:1f795b71abe46f3919591acf7fc05cbcd9b601b97806d97433e0eb9bdb200861"}, ] [package.dependencies] @@ -1309,18 +1309,18 @@ files = [ [[package]] name = "setuptools" -version = "69.1.1" +version = "69.2.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "setuptools-69.1.1-py3-none-any.whl", hash = "sha256:02fa291a0471b3a18b2b2481ed902af520c69e8ae0919c13da936542754b4c56"}, - {file = "setuptools-69.1.1.tar.gz", hash = "sha256:5c0806c7d9af348e6dd3777b4f4dbb42c7ad85b190104837488eab9a7c945cf8"}, + {file = "setuptools-69.2.0-py3-none-any.whl", hash = "sha256:c21c49fb1042386df081cb5d86759792ab89efca84cf114889191cd09aacc80c"}, + {file = "setuptools-69.2.0.tar.gz", hash = "sha256:0ff4183f8f42cd8fa3acea16c45205521a4ef28f73c6391d8a25e92893134f2e"}, ] [package.extras] docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-favicon", 
"sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pip (>=19.1)", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] [[package]] @@ -1433,4 +1433,4 @@ db-dtypes = ["db-dtypes"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13" -content-hash = "3b0709d9cd8dc345d093e9b0bfc6b1312f2373da728803fbaf18aba4aae64b6d" +content-hash = "15f71256266f32a6a6d9cd0835d8c555c6780b91383b3cfb1cd070cbb6ffcf81" diff --git a/pyproject.toml b/pyproject.toml index 007e80e..b94c1f1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,7 @@ packages = [{ include = "pandas_pyarrow", from = "." 
}] [tool.poetry.dependencies] python = ">=3.9,<3.13" pandas = ">=2" -pyarrow = ">=7.0.0, <=15.0.0" +pyarrow = ">=10.0.1, <=15.0.0" db-dtypes = { version = ">=1", optional = true } pandas-gbq = { version = ">=0.15.0", optional = true } From 0c40dcc4e602079e22697cb7e364911bc95262c7 Mon Sep 17 00:00:00 2001 From: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com> Date: Wed, 13 Mar 2024 17:33:01 +0200 Subject: [PATCH 3/8] Add 'uint' type mapping to pandas_pyarrow The numeric_mapper method in the pandas_pyarrow mappers module now includes the 'uint' data type. This enables mapping unsigned integer types with 8, 16, 32, and 64 bit variants. Signed-off-by: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com> --- pandas_pyarrow/mappers/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas_pyarrow/mappers/__init__.py b/pandas_pyarrow/mappers/__init__.py index 4537fba..dc13b3c 100644 --- a/pandas_pyarrow/mappers/__init__.py +++ b/pandas_pyarrow/mappers/__init__.py @@ -11,6 +11,7 @@ def create_mapper() -> Dict[str, str]: **numeric_mapper(["float"], ["16", "32", "64"]), **numeric_mapper(["int"], ["8", "16", "32", "64"]), **numeric_mapper(["Float", "Int"], ["32", "64"]), + **numeric_mapper(["uint"], ["8", "16", "32", "64"]), **datetime_mapper(), **mapper_dict_dt, **mapper_dict_object, From 4998867b36dbe48b29b5d442d8dc8bf623c0de17 Mon Sep 17 00:00:00 2001 From: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com> Date: Wed, 13 Mar 2024 17:35:04 +0200 Subject: [PATCH 4/8] Update testing for pandas-pyarrow data type conversion Refactored the pandas-pyarrow tests for better coverage and efficiency. In particular, common and uncommon data types are now handled separately in the testing process. The tests were also updated to use the `convert_to_pyarrow` function, replacing the old `PandasArrowConverter` class. Finally, more comprehensive testing was made possible by expanding the range of test data types. 
Signed-off-by: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com> --- tests/unit/property_based/pb_sts.py | 27 +++++++++++++++-------- tests/unit/property_based/test_general.py | 21 +++++++++++------- 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/tests/unit/property_based/pb_sts.py b/tests/unit/property_based/pb_sts.py index b38fe58..3197141 100644 --- a/tests/unit/property_based/pb_sts.py +++ b/tests/unit/property_based/pb_sts.py @@ -5,8 +5,8 @@ from hypothesis.extra.pandas import columns, data_frames, range_indexes from hypothesis.strategies import composite -# Introduced constants -DTYPES_SAMPLE: List[Union[type, str]] = [ +# Dtype convertable to pyarrow via pandas api +COMMON_DTYPES_SAMPLE: List[Union[type, str]] = [ int, float, bool, @@ -17,16 +17,25 @@ "int16", "int32", "int64", - "float16", "float32", "float64", + "uint8", + "uint16", + "uint32", + "uint64", +] +# Dtype not convertable to pyarrow via pandas api (pyarrow.lib.ArrowNotImplementedError) +UNCOMMON_DTYPES_SAMPLE: List[Union[type, str]] = [ + "float16", + "complex64", + "complex128", ] -@composite -def dtypes_st(draw: Any) -> Any: - dtypes = st.sampled_from(DTYPES_SAMPLE) - return draw(dtypes) +# @composite +# def dtypes_st(draw: Any) -> Any: +# dtypes = st.sampled_from(COMMON_DTYPES_SAMPLE) +# return draw(dtypes) def create_dataframe(draw: Any, gen_type: str) -> pd.DataFrame: @@ -46,13 +55,13 @@ def create_dataframe(draw: Any, gen_type: str) -> pd.DataFrame: @composite -def df_st(draw: Any) -> pd.DataFrame: +def df_st(draw: Any, dtypes: List[Any]) -> pd.DataFrame: col_names = draw(st.sets(st.text(min_size=1, max_size=10), min_size=2, max_size=5)) dfs_st = data_frames( columns=columns( col_names, - dtype=draw(st.sampled_from(DTYPES_SAMPLE)), + dtype=draw(st.sampled_from(dtypes)), ), index=range_indexes( min_size=2, diff --git a/tests/unit/property_based/test_general.py b/tests/unit/property_based/test_general.py index d8993af..e165dc7 100644 --- 
a/tests/unit/property_based/test_general.py +++ b/tests/unit/property_based/test_general.py @@ -1,19 +1,24 @@ -from pandas_pyarrow.pda_converter import PandasArrowConverter -from tests.unit.property_based.pb_sts import df_st +from pandas_pyarrow import convert_to_pyarrow +from tests.unit.property_based.pb_sts import COMMON_DTYPES_SAMPLE, UNCOMMON_DTYPES_SAMPLE, df_st import hypothesis as hp -@hp.given(df=df_st()) -@hp.settings(max_examples=500) -def test_dtypes_hp(df): +@hp.given(df=df_st(dtypes=COMMON_DTYPES_SAMPLE + UNCOMMON_DTYPES_SAMPLE)) +def test_uncommon_dtypes_hp(df): df_copy = df.copy() - sa = PandasArrowConverter() - adf = sa(df) + adf = convert_to_pyarrow(df) new_dtypes_names = [repr(i) for i in adf.dtypes.tolist()] is_arrows = ["[pyarrow]" in dtype for dtype in new_dtypes_names] assert all(is_arrows), "Some dtypes are not converted" assert not df.equals(adf), "The original df has been modified" assert df.equals(df_copy), "The original df has been modified" - assert adf.equals(sa(adf)), "The conversion is not idempotent" + assert adf.equals(convert_to_pyarrow(adf)), "The conversion is not idempotent" + + +@hp.given(df=df_st(dtypes=COMMON_DTYPES_SAMPLE)) +def test_common_dtypes_hp(df): + adf_pd_api = df.convert_dtypes(dtype_backend="pyarrow") + adf = convert_to_pyarrow(df) + assert adf_pd_api.compare(adf).empty, "The conversion is not consistent with pandas api" From 1ec60e002faa0e1a8e6cbd441172f76fdd071bce Mon Sep 17 00:00:00 2001 From: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com> Date: Wed, 13 Mar 2024 17:35:19 +0200 Subject: [PATCH 5/8] Refactor data type conversion testing in pandas-pyarrow This commit updates the tests for `pandas-pyarrow` to use the `convert_to_pyarrow` function instead of the previously used `SchemArrow`. Additionally, it changes `DateTimeMapper` to `datetime_mapper` for better efficiency. These changes enhance coverage and efficiency in testing, as well as broaden the range of test data types. 
Signed-off-by: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com> --- tests/unit/property_based/test_dt.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/tests/unit/property_based/test_dt.py b/tests/unit/property_based/test_dt.py index 16c6a5c..8216740 100644 --- a/tests/unit/property_based/test_dt.py +++ b/tests/unit/property_based/test_dt.py @@ -3,24 +3,21 @@ # # import hypothesis as hp # -# from pandas_pyarrow import SchemArrow -# from pandas_pyarrow.mappers import DateTimeMapper +# from pandas_pyarrow import convert_to_pyarrow +# from pandas_pyarrow.mappers import datetime_mapper # from tests.unit.property_based.pb_sts import single_column_df_st # # # @hp.given( # pair=single_column_df_st( # gen_type='datetime64[ns]', -# pair_mapping=DateTimeMapper( -# source_type=["datetime64", ], -# variations=[], -# )() +# pair_mapping=datetime_mapper( +# ) # # ) # ) # def test_datetime_numpy_api_hp(pair: Tuple[pd.DataFrame, str]): -# sa = SchemArrow() # df, target_dtype = pair -# adf = sa(df) +# adf = convert_to_pyarrow(df) # # assert list(adf.dtypes)[0] == target_dtype From 1043f50fb8669457c19337fb030ba763660a5b17 Mon Sep 17 00:00:00 2001 From: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com> Date: Wed, 13 Mar 2024 17:35:42 +0200 Subject: [PATCH 6/8] Replace PandasArrowConverter with convert_to_pyarrow in tests The conversion process has been refactored for the tests in the pandas-pyarrow package. Instead of creating a PandasArrowConverter object for each test case, the convert_to_pyarrow function is directly used for converting the dataframes. This simplification makes the tests more straightforward and potentially more efficient. 
Signed-off-by: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com> --- tests/unit/property_based/test_numeric.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/tests/unit/property_based/test_numeric.py b/tests/unit/property_based/test_numeric.py index 045482a..8e5cf75 100644 --- a/tests/unit/property_based/test_numeric.py +++ b/tests/unit/property_based/test_numeric.py @@ -1,6 +1,6 @@ from typing import Tuple -from pandas_pyarrow import PandasArrowConverter +from pandas_pyarrow import convert_to_pyarrow from pandas_pyarrow.mappers import numeric_mapper from tests.unit.property_based.pb_sts import single_column_df_st @@ -19,9 +19,8 @@ ) ) def test_float_numpy_api_hp(pair: Tuple[pd.DataFrame, str]): - sa = PandasArrowConverter() df, target_dtype = pair - adf = sa(df) + adf = convert_to_pyarrow(df) assert list(adf.dtypes)[0] == target_dtype @@ -37,9 +36,8 @@ def test_float_numpy_api_hp(pair: Tuple[pd.DataFrame, str]): ) ) def test_float_array_api_hp(pair: Tuple[pd.DataFrame, str]): - sa = PandasArrowConverter() df, target_dtype = pair - adf = sa(df) + adf = convert_to_pyarrow(df) assert list(adf.dtypes)[0] == target_dtype @@ -55,9 +53,8 @@ def test_float_array_api_hp(pair: Tuple[pd.DataFrame, str]): ) ) def test_int_numpy_api_hp(pair: Tuple[pd.DataFrame, str]): - sa = PandasArrowConverter() df, target_dtype = pair - adf = sa(df) + adf = convert_to_pyarrow(df) assert list(adf.dtypes)[0] == target_dtype @@ -73,8 +70,7 @@ def test_int_numpy_api_hp(pair: Tuple[pd.DataFrame, str]): ) ) def test_int_array_api_hp(pair: Tuple[pd.DataFrame, str]): - sa = PandasArrowConverter() df, target_dtype = pair - adf = sa(df) + adf = convert_to_pyarrow(df) assert list(adf.dtypes)[0] == target_dtype From 80951b3167d1fc699ec9f143102590820e63cf87 Mon Sep 17 00:00:00 2001 From: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com> Date: Wed, 13 Mar 2024 17:36:08 +0200 Subject: [PATCH 7/8] Add new test cases in numeric types testing Three new 
test cases for numeric types (float16, uint16 and complex64) have been added to the pandas-pyarrow test suite. These additions extend test coverage and verify the compatibility of these data types within the pandas-pyarrow integration. Signed-off-by: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com> --- tests/unit/test_numeric_types.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/unit/test_numeric_types.py b/tests/unit/test_numeric_types.py index f1e876d..ed18016 100644 --- a/tests/unit/test_numeric_types.py +++ b/tests/unit/test_numeric_types.py @@ -21,6 +21,9 @@ def create_test_case( @create_test_case("int32", "int32[pyarrow]", [1, 2, 3]) @create_test_case("float64", "float64[pyarrow]", [1.0, 2.0, 3.0, None]) @create_test_case("float32", "float32[pyarrow]", [1.0, 2.0, 3.0, None]) +@create_test_case("float16", "float16[pyarrow]", [1.0, 2.0, 3.0, None]) +@create_test_case("uint16", "uint16[pyarrow]", [1, 2, 3]) +@create_test_case("complex64", "string[pyarrow]", [1, 2, 3]) @create_test_case("float32[pyarrow]", "float32[pyarrow]", [1.0, 2.0, 3.0, None]) def test_numeric_types(df_data, expected_dtype): sa = PandasArrowConverter() From 78f800bd9bf78e36550de4d9814a88345cf5e5de Mon Sep 17 00:00:00 2001 From: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com> Date: Wed, 13 Mar 2024 17:58:40 +0200 Subject: [PATCH 8/8] Reorder datatype variables for clarity in testing The dtypes `float`, `float32`, and `float64` have been moved from the `COMMON_DTYPES_SAMPLE` list to the `UNCOMMON_DTYPES_SAMPLE` list within the pandas-pyarrow test suite. This rearrangement better illustrates which datatypes can be converted to PyArrow via the pandas API and which cannot, thus improving testing accuracy and clarity. 
Signed-off-by: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com> --- tests/unit/property_based/pb_sts.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/tests/unit/property_based/pb_sts.py b/tests/unit/property_based/pb_sts.py index 3197141..0f21285 100644 --- a/tests/unit/property_based/pb_sts.py +++ b/tests/unit/property_based/pb_sts.py @@ -7,18 +7,15 @@ # Dtype convertable to pyarrow via pandas api COMMON_DTYPES_SAMPLE: List[Union[type, str]] = [ - int, - float, bool, str, + int, "datetime64[ns]", "timedelta64[ns]", "int8", "int16", "int32", "int64", - "float32", - "float64", "uint8", "uint16", "uint32", @@ -26,18 +23,15 @@ ] # Dtype not convertable to pyarrow via pandas api (pyarrow.lib.ArrowNotImplementedError) UNCOMMON_DTYPES_SAMPLE: List[Union[type, str]] = [ + float, "float16", + "float32", + "float64", "complex64", "complex128", ] -# @composite -# def dtypes_st(draw: Any) -> Any: -# dtypes = st.sampled_from(COMMON_DTYPES_SAMPLE) -# return draw(dtypes) - - def create_dataframe(draw: Any, gen_type: str) -> pd.DataFrame: dfs_st = data_frames( columns=columns(