-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* first commit * second commit * Remove parquet_compatible from PandasArrowConverter * Remove parquet_compatible from PandasArrowConverter * - Replace function-based logic with class-based methods (__call__, _target_dtype_name, etc.) - Ensure consistent handling of dtype conversion from pyarrow to NumPy/Pandas * - Replace function-based logic with class-based methods (__call__, _target_dtype_name, etc.) - Ensure consistent handling of dtype conversion from pyarrow to NumPy/Pandas
- Loading branch information
1 parent
4bf3752
commit 4f67f64
Showing
2 changed files
with
52 additions
and
19 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,20 +1,58 @@ | ||
from pandas_pyarrow.mappers import reverse_create_mapper | ||
# reverse_converter.py | ||
|
||
from typing import Dict, List, Optional | ||
|
||
from .mappers import reverse_create_mapper | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
||
|
||
class ReversePandasArrowConverter:
    """
    Convert pyarrow-backed Pandas DataFrame dtypes back to Numpy/Pandas dtypes.

    :param custom_mapper: Dictionary with key as the string-representation of the
        Arrow-backed dtype, and value as the desired target dtype (e.g. "object",
        "int64", etc.). Entries here override the default mapping returned by
        reverse_create_mapper().
    :param default_target_type: Dtype name used when a pyarrow dtype has no entry
        in the mapping. Default is "object"; passing None also means "object".

    Methods
    -------
    - __call__(df: pd.DataFrame) -> pd.DataFrame:
        Converts pyarrow-backed dtypes in the given Pandas DataFrame to
        Numpy/Pandas dtypes and returns the converted DataFrame.
    """

    def __init__(
        self,
        custom_mapper: Optional[Dict[str, str]] = None,
        default_target_type: Optional[str] = "object",
    ):
        # Later keys win, so user-supplied entries override the defaults.
        self._mapper = {**reverse_create_mapper(), **(custom_mapper or {})}
        # Keep the fallback a real dtype name so _target_dtype_name always
        # returns a string, even when the caller passes None explicitly.
        self._default_target_type = "object" if default_target_type is None else default_target_type

    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Return a copy of ``df`` whose pyarrow-backed columns use mapped dtypes.

        :param df: DataFrame to convert. It is not modified.
        :return: New DataFrame with the same columns and row index as ``df``.
        """
        dtype_names: List[str] = df.dtypes.astype(str).tolist()
        target_dtype_names = self._map_dtype_names(dtype_names)
        col_to_dtype = dict(zip(df.columns, target_dtype_names))

        # Rebuild the frame as plain Python objects first, then cast each column.
        # The index is passed through explicitly; otherwise the result would get
        # a fresh RangeIndex and lose the caller's row labels.
        as_objects = pd.DataFrame(df.values, index=df.index, columns=df.columns, dtype="object")
        # Missing values are replaced with np.nan before the final cast.
        return as_objects.fillna(np.nan).astype(col_to_dtype)

    def _target_dtype_name(self, dtype_name: str) -> str:
        """
        Resolve the target dtype name for a single source dtype name.

        Names without "pyarrow" are returned unchanged. Any pyarrow name
        containing "bool" maps to "bool". Everything else is looked up in the
        mapper, falling back to the configured default target type.
        """
        if "pyarrow" not in dtype_name:
            return dtype_name

        if "bool" in dtype_name:
            return "bool"

        return self._mapper.get(dtype_name, self._default_target_type)

    def _map_dtype_names(self, dtype_names: List[str]) -> List[str]:
        """Resolve every name in ``dtype_names`` via ``_target_dtype_name``, in order."""
        return [self._target_dtype_name(dtype_name) for dtype_name in dtype_names]
|
||
|
||
def convert_to_numpy(df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert pyarrow-backed dtypes in ``df`` to Numpy/Pandas dtypes.

    Thin functional wrapper around ReversePandasArrowConverter built with its
    default settings, so the dtype-mapping rules live in one place.

    :param df: DataFrame to convert.
    :return: The DataFrame produced by calling the default converter on ``df``.
    """
    converter = ReversePandasArrowConverter()
    return converter(df)