Skip to content

Commit

Permalink
Dynamic Reverse Conversion (#205)
Browse files Browse the repository at this point in the history
* first commit

* second commit

* Remove parquet_compatible from PandasArrowConverter

* Remove parquet_compatible from PandasArrowConverter

* - Replace function-based logic with class-based methods (__call__, _target_dtype_name, etc.)
- Ensure consistent handling of dtype conversion from pyarrow to NumPy/Pandas

* - Replace function-based logic with class-based methods (__call__, _target_dtype_name, etc.)
- Ensure consistent handling of dtype conversion from pyarrow to NumPy/Pandas
  • Loading branch information
emanueldavidov authored Jan 12, 2025
1 parent 4bf3752 commit 4f67f64
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 19 deletions.
5 changes: 0 additions & 5 deletions pandas_pyarrow/pda_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,6 @@

class PandasArrowConverter:
"""PandasArrowConverter manages the conversion of Pandas DataFrame data types to Arrow data types.
:param parquet_compatible: if True, column names will be converted to parquet compatible names. Default is False.
**disclaimer**: not yet implemented
:param custom_mapper: dictionary with key as the source data type and value as the target data type.
Will override default mapping
:param default_target_type: Optional string specifying the default data type to use if no mapping is found for a
Expand All @@ -23,11 +20,9 @@ class PandasArrowConverter:

def __init__(
self,
parquet_compatible: Optional[bool] = False,
custom_mapper: Optional[Dict[str, str]] = None,
default_target_type: Optional[str] = "string[pyarrow]",
):
self.parquet_compatible = parquet_compatible
self.additional_mapper_dicts = custom_mapper or {}
self.defaults_dtype = default_target_type
self._mapper = create_mapper() | self.additional_mapper_dicts
Expand Down
66 changes: 52 additions & 14 deletions pandas_pyarrow/reverse_converter.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,58 @@
from pandas_pyarrow.mappers import reverse_create_mapper
# reverse_converter.py

from typing import Dict, List, Optional

from .mappers import reverse_create_mapper

import numpy as np
import pandas as pd


class ReversePandasArrowConverter:
    """
    ReversePandasArrowConverter manages the conversion of pyarrow-backed Pandas DataFrame dtypes
    back to their Numpy/Pandas equivalents.

    :param custom_mapper: Dictionary with key as the string-representation of the
        Arrow-backed dtype, and value as the desired target dtype (e.g. "object", "int64", etc.).
        This overrides default mapping returned by reverse_create_mapper().
    :param default_target_type: Optional string specifying the default dtype to use
        if no mapping is found for a specific dtype. Default is "object".

    Methods
    -------
    - __call__(df: pd.DataFrame) -> pd.DataFrame:
        Converts pyarrow-backed dtypes in the given Pandas DataFrame to Numpy/Pandas dtypes
        and returns the converted DataFrame. The index and column labels of the input
        are carried over to the result.
    """

    def __init__(
        self,
        custom_mapper: Optional[Dict[str, str]] = None,
        default_target_type: Optional[str] = "object",
    ):
        # The custom mapping is the right-hand operand of the merge, so its
        # entries win over the defaults for any shared key.
        self._mapper = reverse_create_mapper() | (custom_mapper or {})
        self._default_target_type = default_target_type

    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a new DataFrame whose pyarrow-backed columns use Numpy/Pandas dtypes.

        :param df: DataFrame to convert. It is not modified.
        :return: A new DataFrame with the same index and columns as ``df``.
        """
        dtype_names: List[str] = df.dtypes.astype(str).tolist()
        target_dtype_names = self._map_dtype_names(dtype_names)
        col_to_dtype = dict(zip(df.columns, target_dtype_names))

        # Rebuild from the raw values as "object" so every column can be cast
        # independently. The index is passed explicitly; otherwise the result
        # would silently fall back to a fresh RangeIndex.
        new_df = pd.DataFrame(df.values, index=df.index, columns=df.columns, dtype="object")
        # Missing values are filled with np.nan before the per-column cast.
        return new_df.fillna(np.nan).astype(col_to_dtype)

    def _target_dtype_name(self, dtype_name: str) -> str:
        """Map a single dtype name to the name of its target dtype.

        Names that do not contain "pyarrow" are returned unchanged. Any
        pyarrow name containing "bool" maps to "bool" without consulting the
        mapper. Every other pyarrow name is looked up in the mapper and falls
        back to the configured default target type.
        """
        if "pyarrow" not in dtype_name:
            return dtype_name

        if "bool" in dtype_name:
            return "bool"

        return self._mapper.get(dtype_name, self._default_target_type)

    def _map_dtype_names(self, dtype_names: List[str]) -> List[str]:
        """Apply :meth:`_target_dtype_name` to each name, preserving order."""
        return [self._target_dtype_name(dtype_name) for dtype_name in dtype_names]


def convert_to_numpy(df: pd.DataFrame) -> pd.DataFrame:
    """Convert pyarrow-backed dtypes in ``df`` to Numpy/Pandas dtypes.

    Functional shortcut that builds a ReversePandasArrowConverter with its
    default arguments and applies it to ``df``, so the function and the class
    share a single conversion path.

    :param df: DataFrame to convert.
    :return: The DataFrame returned by the converter.
    """
    converter = ReversePandasArrowConverter()
    return converter(df)

0 comments on commit 4f67f64

Please sign in to comment.