From 7411b9622dabae5ef747fc5f880feb6bdc963f60 Mon Sep 17 00:00:00 2001
From: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com>
Date: Wed, 13 Mar 2024 17:31:47 +0200
Subject: [PATCH 1/8] Add pyarrow conversion example to README

The README has been updated to include an example of how to use the pandas-pyarrow library. This provides a specific illustration of handling problematic data types like float16 or db-dtypes that can cause issues in PyArrow, aiding users to fully exploit the library's benefits.

Signed-off-by: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com>
---
 README.md | 44 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index de383fc..778b3b2 100644
--- a/README.md
+++ b/README.md
@@ -179,9 +179,51 @@ dtype: object
 ## Purposes
 
 - Simplify the conversion between pandas pyarrow and numpy backends.
-- Allow seamlessly switch to pyarrow pandas backend.
+- Allow seamless switching to the pyarrow pandas backend, even for problematic dtypes such as float16 or db-dtypes.
 - dtype standardization for db-dtypes used by bigquery python sdk.
 
+
+Example:
+
+```python
+import pandas as pd
+
+# Create a pandas DataFrame
+df = pd.DataFrame({
+
+    'C': [1.1, 2.2, 3.3],
+
+}, dtype='float16')
+
+df.convert_dtypes(dtype_backend='pyarrow')
+```
+This raises an error:
+```
+pyarrow.lib.ArrowNotImplementedError: Unsupported cast from halffloat to double using function cast_double
+```
+With pandas-pyarrow, the conversion succeeds:
+```python
+import pandas as pd
+
+from pandas_pyarrow import convert_to_pyarrow
+
+# Create a pandas DataFrame
+df = pd.DataFrame({
+
+    'C': [1.1, 2.2, 3.3],
+
+}, dtype='float16')
+adf = convert_to_pyarrow(df)
+print(adf.dtypes)
+
+```
+Output:
+```
+C    halffloat[pyarrow]
+dtype: object
+```
+
+
 ## Additional Information
 
 When converting from higher precision numerical dtypes (like float64) to

From 4b01b370a4114b49ee61fc14d878fcbd9f09741d Mon Sep 17 00:00:00 2001
From: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com>
Date: Wed, 13 Mar 2024 17:32:06 +0200
Subject: [PATCH 2/8] Update minimum pyarrow version in dependencies

The minimum version for the pyarrow dependency in the pyproject.toml file has been updated from "7.0.0" to "10.0.1". Users are now required to use at least version 10.0.1 of pyarrow in order to prevent potential compatibility issues.

Signed-off-by: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com>
---
 poetry.lock    | 16 ++++++++--------
 pyproject.toml |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index e89b66d..41e062e 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -606,13 +606,13 @@ protobuf = ">=4.21.6"
 
 [[package]]
 name = "hypothesis"
-version = "6.99.2"
+version = "6.99.5"
 description = "A library for property-based testing"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "hypothesis-6.99.2-py3-none-any.whl", hash = "sha256:f277f6ccb074f39d51c7f32ba5a0ff640dba9b71ef69ea1e1e09b6f7c25302f5"},
-    {file = "hypothesis-6.99.2.tar.gz", hash = "sha256:24453b1a86151be83d26e81834e29022b3e3f0fc5d71275cc3d096649a13c53c"},
+    {file = "hypothesis-6.99.5-py3-none-any.whl", hash = "sha256:0ab4968fa4c38ba6d3cd9f54f3d637e3c72fe136bff11373355f2e06416c6a7d"},
+    {file = "hypothesis-6.99.5.tar.gz", hash = "sha256:1f795b71abe46f3919591acf7fc05cbcd9b601b97806d97433e0eb9bdb200861"},
 ]
 
 [package.dependencies]
@@ -1309,18 +1309,18 @@ files = [
 
 [[package]]
 name = "setuptools"
-version = "69.1.1"
+version = "69.2.0"
 description = "Easily download, build, install, upgrade, and uninstall Python packages"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "setuptools-69.1.1-py3-none-any.whl", hash = "sha256:02fa291a0471b3a18b2b2481ed902af520c69e8ae0919c13da936542754b4c56"},
-    {file = "setuptools-69.1.1.tar.gz", hash = "sha256:5c0806c7d9af348e6dd3777b4f4dbb42c7ad85b190104837488eab9a7c945cf8"},
+    {file = "setuptools-69.2.0-py3-none-any.whl", hash = "sha256:c21c49fb1042386df081cb5d86759792ab89efca84cf114889191cd09aacc80c"},
+    {file = "setuptools-69.2.0.tar.gz", hash = "sha256:0ff4183f8f42cd8fa3acea16c45205521a4ef28f73c6391d8a25e92893134f2e"},
 ]
 
 [package.extras]
 docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
-testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pip (>=19.1)", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
+testing = ["build[virtualenv]", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
 testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
 
 [[package]]
@@ -1433,4 +1433,4 @@ db-dtypes = ["db-dtypes"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.13"
-content-hash = "3b0709d9cd8dc345d093e9b0bfc6b1312f2373da728803fbaf18aba4aae64b6d"
+content-hash = "15f71256266f32a6a6d9cd0835d8c555c6780b91383b3cfb1cd070cbb6ffcf81"
diff --git a/pyproject.toml b/pyproject.toml
index 007e80e..b94c1f1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -42,7 +42,7 @@ packages = [{ include = "pandas_pyarrow", from = "." }]
 [tool.poetry.dependencies]
 python = ">=3.9,<3.13"
 pandas = ">=2"
-pyarrow = ">=7.0.0, <=15.0.0"
+pyarrow = ">=10.0.1, <=15.0.0"
 db-dtypes = { version = ">=1", optional = true }
 pandas-gbq = { version = ">=0.15.0", optional = true }
 

From 0c40dcc4e602079e22697cb7e364911bc95262c7 Mon Sep 17 00:00:00 2001
From: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com>
Date: Wed, 13 Mar 2024 17:33:01 +0200
Subject: [PATCH 3/8] Add 'uint' type mapping to pandas_pyarrow

The numeric_mapper method in the pandas_pyarrow mappers module now includes the 'uint' data type. This enables mapping unsigned integer types with 8, 16, 32, and 64 bit variants.

Signed-off-by: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com>
---
 pandas_pyarrow/mappers/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas_pyarrow/mappers/__init__.py b/pandas_pyarrow/mappers/__init__.py
index 4537fba..dc13b3c 100644
--- a/pandas_pyarrow/mappers/__init__.py
+++ b/pandas_pyarrow/mappers/__init__.py
@@ -11,6 +11,7 @@ def create_mapper() -> Dict[str, str]:
         **numeric_mapper(["float"], ["16", "32", "64"]),
         **numeric_mapper(["int"], ["8", "16", "32", "64"]),
         **numeric_mapper(["Float", "Int"], ["32", "64"]),
+        **numeric_mapper(["uint"], ["8", "16", "32", "64"]),
         **datetime_mapper(),
         **mapper_dict_dt,
         **mapper_dict_object,

From 4998867b36dbe48b29b5d442d8dc8bf623c0de17 Mon Sep 17 00:00:00 2001
From: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com>
Date: Wed, 13 Mar 2024 17:35:04 +0200
Subject: [PATCH 4/8] Update testing for pandas-pyarrow data type conversion

Refactored the pandas-pyarrow tests for better coverage and efficiency. In particular, common and uncommon data types are now handled separately in the testing process. The tests were also updated to use the `convert_to_pyarrow` function, replacing the old `PandasArrowConverter` class. Finally, more comprehensive testing was made possible by expanding the range of test data types.

Signed-off-by: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com>
---
 tests/unit/property_based/pb_sts.py       | 27 +++++++++++++++--------
 tests/unit/property_based/test_general.py | 21 +++++++++++-------
 2 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/tests/unit/property_based/pb_sts.py b/tests/unit/property_based/pb_sts.py
index b38fe58..3197141 100644
--- a/tests/unit/property_based/pb_sts.py
+++ b/tests/unit/property_based/pb_sts.py
@@ -5,8 +5,8 @@
 from hypothesis.extra.pandas import columns, data_frames, range_indexes
 from hypothesis.strategies import composite
 
-# Introduced constants
-DTYPES_SAMPLE: List[Union[type, str]] = [
+# Dtype convertable to pyarrow via pandas api
+COMMON_DTYPES_SAMPLE: List[Union[type, str]] = [
     int,
     float,
     bool,
@@ -17,16 +17,25 @@
     "int16",
     "int32",
     "int64",
-    "float16",
     "float32",
     "float64",
+    "uint8",
+    "uint16",
+    "uint32",
+    "uint64",
+]
+# Dtype not convertable to pyarrow via pandas api (pyarrow.lib.ArrowNotImplementedError)
+UNCOMMON_DTYPES_SAMPLE: List[Union[type, str]] = [
+    "float16",
+    "complex64",
+    "complex128",
 ]
 
 
-@composite
-def dtypes_st(draw: Any) -> Any:
-    dtypes = st.sampled_from(DTYPES_SAMPLE)
-    return draw(dtypes)
+# @composite
+# def dtypes_st(draw: Any) -> Any:
+#     dtypes = st.sampled_from(COMMON_DTYPES_SAMPLE)
+#     return draw(dtypes)
 
 
 def create_dataframe(draw: Any, gen_type: str) -> pd.DataFrame:
@@ -46,13 +55,13 @@ def create_dataframe(draw: Any, gen_type: str) -> pd.DataFrame:
 
 
 @composite
-def df_st(draw: Any) -> pd.DataFrame:
+def df_st(draw: Any, dtypes: List[Any]) -> pd.DataFrame:
     col_names = draw(st.sets(st.text(min_size=1, max_size=10), min_size=2, max_size=5))
 
     dfs_st = data_frames(
         columns=columns(
             col_names,
-            dtype=draw(st.sampled_from(DTYPES_SAMPLE)),
+            dtype=draw(st.sampled_from(dtypes)),
         ),
         index=range_indexes(
             min_size=2,
diff --git a/tests/unit/property_based/test_general.py b/tests/unit/property_based/test_general.py
index d8993af..e165dc7 100644
--- a/tests/unit/property_based/test_general.py
+++ b/tests/unit/property_based/test_general.py
@@ -1,19 +1,24 @@
-from pandas_pyarrow.pda_converter import PandasArrowConverter
-from tests.unit.property_based.pb_sts import df_st
+from pandas_pyarrow import convert_to_pyarrow
+from tests.unit.property_based.pb_sts import COMMON_DTYPES_SAMPLE, UNCOMMON_DTYPES_SAMPLE, df_st
 
 import hypothesis as hp
 
 
-@hp.given(df=df_st())
-@hp.settings(max_examples=500)
-def test_dtypes_hp(df):
+@hp.given(df=df_st(dtypes=COMMON_DTYPES_SAMPLE + UNCOMMON_DTYPES_SAMPLE))
+def test_uncommon_dtypes_hp(df):
     df_copy = df.copy()
-    sa = PandasArrowConverter()
-    adf = sa(df)
+    adf = convert_to_pyarrow(df)
 
     new_dtypes_names = [repr(i) for i in adf.dtypes.tolist()]
     is_arrows = ["[pyarrow]" in dtype for dtype in new_dtypes_names]
     assert all(is_arrows), "Some dtypes are not converted"
     assert not df.equals(adf), "The original df has been modified"
     assert df.equals(df_copy), "The original df has been modified"
-    assert adf.equals(sa(adf)), "The conversion is not idempotent"
+    assert adf.equals(convert_to_pyarrow(adf)), "The conversion is not idempotent"
+
+
+@hp.given(df=df_st(dtypes=COMMON_DTYPES_SAMPLE))
+def test_common_dtypes_hp(df):
+    adf_pd_api = df.convert_dtypes(dtype_backend="pyarrow")
+    adf = convert_to_pyarrow(df)
+    assert adf_pd_api.compare(adf).empty, "The conversion is not consistent with pandas api"

From 1ec60e002faa0e1a8e6cbd441172f76fdd071bce Mon Sep 17 00:00:00 2001
From: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com>
Date: Wed, 13 Mar 2024 17:35:19 +0200
Subject: [PATCH 5/8] Refactor data type conversion testing in pandas-pyarrow

This commit updates the tests for `pandas-pyarrow` to use the `convert_to_pyarrow` function instead of the previously used `SchemArrow`. Additionally, it changes `DateTimeMapper` to `datetime_mapper` for better efficiency. These changes enhance coverage and efficiency in testing, as well as broaden the range of test data types.

Signed-off-by: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com>
---
 tests/unit/property_based/test_dt.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/tests/unit/property_based/test_dt.py b/tests/unit/property_based/test_dt.py
index 16c6a5c..8216740 100644
--- a/tests/unit/property_based/test_dt.py
+++ b/tests/unit/property_based/test_dt.py
@@ -3,24 +3,21 @@
 #
 # import hypothesis as hp
 #
-# from pandas_pyarrow import SchemArrow
-# from pandas_pyarrow.mappers import DateTimeMapper
+# from pandas_pyarrow import convert_to_pyarrow
+# from pandas_pyarrow.mappers import datetime_mapper
 # from tests.unit.property_based.pb_sts import single_column_df_st
 #
 #
 # @hp.given(
 #     pair=single_column_df_st(
 #         gen_type='datetime64[ns]',
-#         pair_mapping=DateTimeMapper(
-#             source_type=["datetime64", ],
-#             variations=[],
-#         )()
+#         pair_mapping=datetime_mapper(
+#         )
 #
 #     )
 # )
 # def test_datetime_numpy_api_hp(pair: Tuple[pd.DataFrame, str]):
-#     sa = SchemArrow()
 #     df, target_dtype = pair
-#     adf = sa(df)
+#     adf = convert_to_pyarrow(df)
 #
 #     assert list(adf.dtypes)[0] == target_dtype

From 1043f50fb8669457c19337fb030ba763660a5b17 Mon Sep 17 00:00:00 2001
From: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com>
Date: Wed, 13 Mar 2024 17:35:42 +0200
Subject: [PATCH 6/8] Replace PandasArrowConverter with convert_to_pyarrow in
 tests

The conversion process has been refactored for the tests in the pandas-pyarrow package. Instead of creating a PandasArrowConverter object for each test case, the convert_to_pyarrow function is directly used for converting the dataframes. This simplification makes the tests more straightforward and potentially more efficient.

Signed-off-by: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com>
---
 tests/unit/property_based/test_numeric.py | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/tests/unit/property_based/test_numeric.py b/tests/unit/property_based/test_numeric.py
index 045482a..8e5cf75 100644
--- a/tests/unit/property_based/test_numeric.py
+++ b/tests/unit/property_based/test_numeric.py
@@ -1,6 +1,6 @@
 from typing import Tuple
 
-from pandas_pyarrow import PandasArrowConverter
+from pandas_pyarrow import convert_to_pyarrow
 from pandas_pyarrow.mappers import numeric_mapper
 from tests.unit.property_based.pb_sts import single_column_df_st
 
@@ -19,9 +19,8 @@
     )
 )
 def test_float_numpy_api_hp(pair: Tuple[pd.DataFrame, str]):
-    sa = PandasArrowConverter()
     df, target_dtype = pair
-    adf = sa(df)
+    adf = convert_to_pyarrow(df)
 
     assert list(adf.dtypes)[0] == target_dtype
 
@@ -37,9 +36,8 @@ def test_float_numpy_api_hp(pair: Tuple[pd.DataFrame, str]):
     )
 )
 def test_float_array_api_hp(pair: Tuple[pd.DataFrame, str]):
-    sa = PandasArrowConverter()
     df, target_dtype = pair
-    adf = sa(df)
+    adf = convert_to_pyarrow(df)
 
     assert list(adf.dtypes)[0] == target_dtype
 
@@ -55,9 +53,8 @@ def test_float_array_api_hp(pair: Tuple[pd.DataFrame, str]):
     )
 )
 def test_int_numpy_api_hp(pair: Tuple[pd.DataFrame, str]):
-    sa = PandasArrowConverter()
     df, target_dtype = pair
-    adf = sa(df)
+    adf = convert_to_pyarrow(df)
 
     assert list(adf.dtypes)[0] == target_dtype
 
@@ -73,8 +70,7 @@ def test_int_numpy_api_hp(pair: Tuple[pd.DataFrame, str]):
     )
 )
 def test_int_array_api_hp(pair: Tuple[pd.DataFrame, str]):
-    sa = PandasArrowConverter()
     df, target_dtype = pair
-    adf = sa(df)
+    adf = convert_to_pyarrow(df)
 
     assert list(adf.dtypes)[0] == target_dtype

From 80951b3167d1fc699ec9f143102590820e63cf87 Mon Sep 17 00:00:00 2001
From: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com>
Date: Wed, 13 Mar 2024 17:36:08 +0200
Subject: [PATCH 7/8] Add new test cases in numeric types testing

Three new test cases for numeric types (float16, uint16, and complex64) have been added to the pandas-pyarrow test suite. These additions extend test coverage and verify that these data types are handled correctly by the pandas-pyarrow conversion.

Signed-off-by: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com>
---
 tests/unit/test_numeric_types.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/unit/test_numeric_types.py b/tests/unit/test_numeric_types.py
index f1e876d..ed18016 100644
--- a/tests/unit/test_numeric_types.py
+++ b/tests/unit/test_numeric_types.py
@@ -21,6 +21,9 @@ def create_test_case(
 @create_test_case("int32", "int32[pyarrow]", [1, 2, 3])
 @create_test_case("float64", "float64[pyarrow]", [1.0, 2.0, 3.0, None])
 @create_test_case("float32", "float32[pyarrow]", [1.0, 2.0, 3.0, None])
+@create_test_case("float16", "float16[pyarrow]", [1.0, 2.0, 3.0, None])
+@create_test_case("uint16", "uint16[pyarrow]", [1, 2, 3])
+@create_test_case("complex64", "string[pyarrow]", [1, 2, 3])
 @create_test_case("float32[pyarrow]", "float32[pyarrow]", [1.0, 2.0, 3.0, None])
 def test_numeric_types(df_data, expected_dtype):
     sa = PandasArrowConverter()

From 78f800bd9bf78e36550de4d9814a88345cf5e5de Mon Sep 17 00:00:00 2001
From: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com>
Date: Wed, 13 Mar 2024 17:58:40 +0200
Subject: [PATCH 8/8] Reorder datatype variables for clarity in testing

The dtypes `float`, `float32`, and `float64` have been moved from the `COMMON_DTYPES_SAMPLE` list to the `UNCOMMON_DTYPES_SAMPLE` list within the pandas-pyarrow testing suite, and the commented-out `dtypes_st` strategy has been removed. This rearrangement better illustrates which datatypes can be converted to PyArrow via the pandas API and which cannot, thus improving testing accuracy and clarity.

Signed-off-by: DanielAvdar <66269169+DanielAvdar@users.noreply.github.com>
---
 tests/unit/property_based/pb_sts.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/tests/unit/property_based/pb_sts.py b/tests/unit/property_based/pb_sts.py
index 3197141..0f21285 100644
--- a/tests/unit/property_based/pb_sts.py
+++ b/tests/unit/property_based/pb_sts.py
@@ -7,18 +7,15 @@
 
 # Dtype convertable to pyarrow via pandas api
 COMMON_DTYPES_SAMPLE: List[Union[type, str]] = [
-    int,
-    float,
     bool,
     str,
+    int,
     "datetime64[ns]",
     "timedelta64[ns]",
     "int8",
     "int16",
     "int32",
     "int64",
-    "float32",
-    "float64",
     "uint8",
     "uint16",
     "uint32",
@@ -26,18 +23,15 @@
 ]
 # Dtype not convertable to pyarrow via pandas api (pyarrow.lib.ArrowNotImplementedError)
 UNCOMMON_DTYPES_SAMPLE: List[Union[type, str]] = [
+    float,
     "float16",
+    "float32",
+    "float64",
     "complex64",
     "complex128",
 ]
 
 
-# @composite
-# def dtypes_st(draw: Any) -> Any:
-#     dtypes = st.sampled_from(COMMON_DTYPES_SAMPLE)
-#     return draw(dtypes)
-
-
 def create_dataframe(draw: Any, gen_type: str) -> pd.DataFrame:
     dfs_st = data_frames(
         columns=columns(