From 4f67f645eba0232f6b16236e551c9e8bc89ea564 Mon Sep 17 00:00:00 2001
From: emanueldavidov <130387402+emanueldavidov@users.noreply.github.com>
Date: Sun, 12 Jan 2025 19:33:14 +0200
Subject: [PATCH] Dynamic Reverse Conversion (#205)

* first commit

* second commit

* Remove parquet_compatible from PandasArrowConverter

* Remove parquet_compatible from PandasArrowConverter

* - Replace function-based logic with class-based methods (__call__, _target_dtype_name, etc.)\n- Ensure consistent handling of dtype conversion from pyarrow to NumPy/Pandas

* - Replace function-based logic with class-based methods (__call__, _target_dtype_name, etc.)
- Ensure consistent handling of dtype conversion from pyarrow to NumPy/Pandas
---
Reviewer notes (this section is ignored by `git am`):

* Line breaks and indentation were lost in the copy of this patch under review.
  They have been restored so that every hunk matches its own header counts and
  the diffstat below (52 insertions, 19 deletions). Indentation widths and the
  position of blank context lines are inferred - run `git apply --check`
  against the target tree before applying.
* ReversePandasArrowConverter.__call__ now passes index=df.index when it
  rebuilds the frame from df.values, so a non-default index is kept instead of
  being replaced by a fresh RangeIndex. The statement was split across the same
  two added lines, so the line counts are unchanged; the post-image blob id on
  the reverse_converter.py "index" line predates this amendment.

 pandas_pyarrow/pda_converter.py     |  5 ---
 pandas_pyarrow/reverse_converter.py | 66 +++++++++++++++++++++++------
 2 files changed, 52 insertions(+), 19 deletions(-)

diff --git a/pandas_pyarrow/pda_converter.py b/pandas_pyarrow/pda_converter.py
index afac1e9..1a67932 100644
--- a/pandas_pyarrow/pda_converter.py
+++ b/pandas_pyarrow/pda_converter.py
@@ -7,9 +7,6 @@
 
 class PandasArrowConverter:
     """PandasArrowConverter manages the conversion of Pandas DataFrame data types to Arrow data types.
-
-    :param parquet_compatible: if True, column names will be converted to parquet compatible names. Default is False.
-        **disclaimer**: not yet implemented
     :param custom_mapper: dictionary with key as the source data type and value as the target data type.
         Will override default mapping
     :param default_target_type: Optional string specifying the default data type to use if no mapping is found for a
@@ -23,11 +20,9 @@ class PandasArrowConverter:
 
     def __init__(
         self,
-        parquet_compatible: Optional[bool] = False,
         custom_mapper: Optional[Dict[str, str]] = None,
         default_target_type: Optional[str] = "string[pyarrow]",
     ):
-        self.parquet_compatible = parquet_compatible
         self.additional_mapper_dicts = custom_mapper or {}
         self.defaults_dtype = default_target_type
         self._mapper = create_mapper() | self.additional_mapper_dicts
diff --git a/pandas_pyarrow/reverse_converter.py b/pandas_pyarrow/reverse_converter.py
index f2b9803..8213c99 100644
--- a/pandas_pyarrow/reverse_converter.py
+++ b/pandas_pyarrow/reverse_converter.py
@@ -1,20 +1,58 @@
-from pandas_pyarrow.mappers import reverse_create_mapper
+# reverse_converter.py
+
+from typing import Dict, List, Optional
+
+from .mappers import reverse_create_mapper
 
 import numpy as np
 import pandas as pd
 
 
+class ReversePandasArrowConverter:
+    """
+    ReversePandasArrowConverter manages the conversion of pyarrow-backed Pandas DataFrame dtypes
+    back to their Numpy/Pandas equivalents.
+    :param custom_mapper: Dictionary with key as the string-representation of the
+        Arrow-backed dtype, and value as the desired target dtype (e.g. "object", "int64", etc.).
+        This overrides default mapping returned by reverse_create_mapper().
+    :param default_target_type: Optional string specifying the default dtype to use
+        if no mapping is found for a specific dtype. Default is "object".
+    Methods
+    -------
+    - __call__(df: pd.DataFrame) -> pd.DataFrame:
+        Converts pyarrow-backed dtypes in the given Pandas DataFrame to Numpy/Pandas dtypes
+        and returns the converted DataFrame.
+    """
+
+    def __init__(
+        self,
+        custom_mapper: Optional[Dict[str, str]] = None,
+        default_target_type: Optional[str] = "object",
+    ):
+        self._mapper = reverse_create_mapper() | (custom_mapper or {})
+        self._default_target_type = default_target_type
+
+    def __call__(self, df: pd.DataFrame) -> pd.DataFrame:
+        dtype_names: List[str] = df.dtypes.astype(str).tolist()
+        target_dtype_names = self._map_dtype_names(dtype_names)
+        col_to_dtype = dict(zip(df.columns, target_dtype_names))
+
+        new_df = pd.DataFrame(df.values, index=df.index, columns=df.columns, dtype="object")
+        return new_df.fillna(np.nan).astype(col_to_dtype)
+
+    def _target_dtype_name(self, dtype_name: str) -> str:
+        if "pyarrow" not in dtype_name:
+            return dtype_name
+
+        if "bool" in dtype_name:
+            return "bool"
+
+        return self._mapper.get(dtype_name, self._default_target_type)
+
+    def _map_dtype_names(self, dtype_names: List[str]) -> List[str]:
+        return [self._target_dtype_name(dtype_name) for dtype_name in dtype_names]
+
+
 def convert_to_numpy(df: pd.DataFrame) -> pd.DataFrame:
-    values = df.values
-    r_mapper = reverse_create_mapper()
-    pyarrow_types = df.dtypes
-    numpy_types = dict()
-    for col, dtype in pyarrow_types.items():
-        if "pyarrow" in repr(dtype):
-            numpy_types[col] = r_mapper[repr(dtype)] if "bool" not in repr(dtype) else bool
-
-        else:
-            numpy_types[col] = dtype
-
-    new_df = pd.DataFrame(values, columns=df.columns, dtype=object).fillna(np.nan).astype(numpy_types)
-    return new_df
+    converter = ReversePandasArrowConverter()
+    return converter(df)