Skip to content

Commit

Permalink
Merge pull request #193 from DanielAvdar/dev-2
Browse files Browse the repository at this point in the history
Dev 2
  • Loading branch information
DanielAvdar authored Dec 26, 2024
2 parents efe3c8c + ffa8729 commit b4c8468
Show file tree
Hide file tree
Showing 22 changed files with 661 additions and 418 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/code-checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ on:
- 'pandas_pyarrow/**'
- 'tests/**'
- '.github/workflows/code-checks.yml'
- '.pre-commit-config.yaml'

workflow_dispatch:
jobs:
Expand All @@ -21,6 +22,6 @@ jobs:
python-version: '3.11'
cache: poetry

- run: poetry install
- run: poetry run pre-commit install
- run: poetry run pre-commit run --all-files
- run: make
- run: make check
- run: make mypy
17 changes: 4 additions & 13 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,24 +1,15 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.0.1
rev: v5.0.0
hooks:
- id: debug-statements
- id: end-of-file-fixer
- id: trailing-whitespace
- id: no-commit-to-branch
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.4
rev: v0.8.4
hooks:
- id: ruff-format
args: [ --preview ]
args: [ --preview, --config=pyproject.toml ]
- id: ruff
args: [ --preview, --fix]

- repo: local
hooks:
- id: mypy
name: mypy
entry: mypy
language: python
types: [ python ]
args: [ --config-file, pyproject.toml ]
args: [ --preview, --fix,--unsafe-fixes, --config=pyproject.toml ]
18 changes: 18 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Developer shortcuts for the pandas_pyarrow project.
# Every target below is a command, not a file, so all of them are phony.
.PHONY: help default install test check mypy coverage
default: install

# Install the project with all extras and register the git pre-commit hooks.
install:
	poetry install --all-extras
# poetry run pre-commit autoupdate
	poetry run pre-commit install

# Run the test suite.
test:
	poetry run pytest

# Run every pre-commit hook against the whole repository.
check:
	poetry run pre-commit run --all-files

# Static type checking using the settings in pyproject.toml.
mypy:
	poetry run mypy . --config-file pyproject.toml

# Run the tests with coverage measured on this repository's package
# (pandas_pyarrow) and write an XML report.
coverage:
	poetry run pytest --cov=pandas_pyarrow --cov-report=xml
25 changes: 22 additions & 3 deletions pandas_pyarrow/mappers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from typing import Dict

from .datetime_mapper import datetime_mapper
from .datetime_mapper import datetime_mapper, reverse_datetime_mapper
from .db_types import mapper_db_types
from .dtype_mapper import mapper_dict_dt, mapper_dict_object
from .numeric_mapper import numeric_mapper
from .dtype_mapper import mapper_dict_dt, mapper_dict_object, reverse_mapper_dict
from .numeric_mapper import numeric_mapper, reverse_numeric_mapper


def create_mapper() -> Dict[str, str]:
Expand All @@ -20,10 +20,29 @@ def create_mapper() -> Dict[str, str]:
return all_mapper_dicts


def reverse_create_mapper(
    adapter: str = "tz=",
) -> Dict[str, str]:
    """Assemble the lookup table from pyarrow dtype names to numpy dtype names.

    The table is built, in order, from the reverse numeric mappings for
    ``float``, ``int`` and ``uint``, the reverse datetime mappings and the
    static ``reverse_mapper_dict``.

    Args:
        adapter: Forwarded unchanged to ``reverse_datetime_mapper``.

    Returns:
        A single dictionary holding every entry of the partial mappers.
    """
    numeric_specs = (
        ("float", ["16", "32", "64"]),
        ("int", ["8", "16", "32", "64"]),
        ("uint", ["8", "16", "32", "64"]),
    )
    reverse_lookup: Dict[str, str] = {}
    for type_name, widths in numeric_specs:
        reverse_lookup.update(reverse_numeric_mapper([type_name], widths))
    reverse_lookup.update(reverse_datetime_mapper(adapter=adapter))
    reverse_lookup.update(reverse_mapper_dict)
    return reverse_lookup


# def reverse_mapper_adapter(all_mapper_dicts: Dict[str, str]) -> Dict[str, str]:
# new_mapper = {}
# for key, value in all_mapper_dicts.items():
# if 'tz='
__all__ = [
"mapper_dict_dt",
"mapper_dict_object",
"create_mapper",
"reverse_create_mapper",
"mapper_db_types",
"datetime_mapper",
"numeric_mapper",
Expand Down
19 changes: 18 additions & 1 deletion pandas_pyarrow/mappers/datetime_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,27 @@

def datetime_mapper(from_type: str = "datetime64", to_type: str = "timestamp") -> Dict[str, str]:
    """Build the mapping from numpy-style datetime dtype names to pyarrow ones.

    For each resolution in ``ns``, ``ms``, ``us`` and ``s`` the result maps
    ``"{from_type}[{res}]"`` to ``"{to_type}[{res}][pyarrow]"`` and, for every
    name in ``pytz.all_timezones``, ``"{from_type}[{res}, {tz}]"`` to
    ``"{to_type}[{res}, {tz}][pyarrow]"``.

    Args:
        from_type: Prefix used for the keys.
        to_type: Prefix used for the values.

    Returns:
        The timezone-naive entries followed by the timezone-aware entries.
    """
    time_zones = pytz.all_timezones
    # Single definition of the supported resolutions; the shorter list that
    # used to precede it was overwritten before ever being read.
    time_resolutions = ["ns", "ms", "us", "s"]
    all_combinations = {f"{from_type}[{res}]": f"{to_type}[{res}][pyarrow]" for res in time_resolutions}
    all_tz_combinations = {
        f"{from_type}[{res}, {tz}]": f"{to_type}[{res}, {tz}][pyarrow]" for res in time_resolutions for tz in time_zones
    }
    all_combinations.update(all_tz_combinations)
    return all_combinations


def reverse_datetime_mapper(
    from_type: str = "timestamp",
    to_type: str = "datetime64",
    adapter: str = "tz=",
) -> Dict[str, str]:
    """Build the mapping from pyarrow timestamp dtype names to numpy-style ones.

    Args:
        from_type: Prefix used for the keys.
        to_type: Prefix used for the values.
        adapter: Text inserted directly before the timezone name in each
            timezone-aware key.

    Returns:
        A dictionary with one timezone-naive entry per resolution, followed by
        one entry per (resolution, timezone) pair for every name in
        ``pytz.all_timezones``.
    """
    zone_names = pytz.all_timezones
    resolutions = ("ns", "ms", "us", "s")

    reverse_map: Dict[str, str] = {}
    # Timezone-naive entries first, so insertion order matches the key layout
    # callers already see.
    for unit in resolutions:
        reverse_map[f"{from_type}[{unit}][pyarrow]"] = f"{to_type}[{unit}]"
    for unit in resolutions:
        for zone in zone_names:
            reverse_map[f"{from_type}[{unit}, {adapter}{zone}][pyarrow]"] = f"{to_type}[{unit}, {zone}]"
    return reverse_map
20 changes: 17 additions & 3 deletions pandas_pyarrow/mappers/dtype_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,8 @@
"timedelta64[ns]": "duration[ns][pyarrow]",
"timedelta64[ms]": "duration[ms][pyarrow]",
"timedelta64[us]": "duration[us][pyarrow]",
"timedelta64[s]": "duration[s][pyarrow]",
"timedelta64": "duration[us][pyarrow]",
"date": "date32[pyarrow]",
"time": "timestamp[ns][pyarrow]",
"timestamp": "timestamp[ns][pyarrow]",
}
mapper_dict_object: Dict[str, str] = {
"object": "string[pyarrow]",
Expand All @@ -16,3 +14,19 @@
"string": "string[pyarrow]",
"bool": "bool[pyarrow]",
}

# Static pyarrow -> numpy dtype-name table for the types that the generated
# numeric and datetime reverse mappers do not produce.
reverse_mapper_dict: Dict[str, str] = {
    "duration[ns][pyarrow]": "timedelta64[ns]",
    "duration[ms][pyarrow]": "timedelta64[ms]",
    "duration[us][pyarrow]": "timedelta64[us]",
    "duration[s][pyarrow]": "timedelta64[s]",
    "date32[pyarrow]": "datetime64[ns]",
    "string[pyarrow]": "object",
    "bool[pyarrow]": "bool",
    "time64[ns][pyarrow]": "datetime64[ns]",
    # "time64[ms][pyarrow]": "datetime64[ms]",  todo: pandas error NotImplementedError
    "time64[us][pyarrow]": "datetime64[us]",
    # pyarrow spells its floating-point types "halffloat" (16-bit), "float"
    # (32-bit) and "double" (64-bit); each maps back to the numpy dtype of the
    # same width so a round trip does not silently widen float32 columns.
    "double[pyarrow]": "float64",
    "float[pyarrow]": "float32",
    "halffloat[pyarrow]": "float16",
}
18 changes: 18 additions & 0 deletions pandas_pyarrow/mappers/numeric_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,21 @@ def numeric_mapper(source_types: List[str], variations: List[str]) -> Dict[str,
all_floats = create_type_variations(source_types, lambda x: "float" in x.lower(), variations)
all_combinations = {**all_ints, **all_floats}
return all_combinations


def reverse_create_type_variations(
    source_types: List[str], filter_func: Callable[[str], bool], variations: List[str]
) -> Dict[str, str]:
    """Map pyarrow-style dtype names back to the source dtype names.

    Args:
        source_types: Candidate base type names.
        filter_func: Predicate selecting which base names take part.
        variations: Suffixes appended to every selected base name.

    Returns:
        A dictionary whose keys are ``"{lower-cased name}{suffix}[pyarrow]"``
        and whose values are ``"{name}{suffix}"``, for every selected name and
        every suffix.
    """
    # Apply the predicate to every candidate up front, before any key is built.
    selected = list(filter(filter_func, source_types))

    reversed_names: Dict[str, str] = {}
    for type_name in selected:
        for suffix in variations:
            reversed_names[f"{type_name.lower()}{suffix}[pyarrow]"] = f"{type_name}{suffix}"
    return reversed_names


def reverse_numeric_mapper(source_types: List[str], variations: List[str]) -> Dict[str, str]:
    """Build the pyarrow-to-source name mapping for integer and float types.

    Args:
        source_types: Base type names; only names containing ``"int"`` or
            ``"float"`` (case-insensitively) contribute entries.
        variations: Suffixes appended to every selected base name.

    Returns:
        The entries for the integer-like names followed by the entries for the
        float-like names.
    """

    def _mentions_int(name: str) -> bool:
        return "int" in name.lower()

    def _mentions_float(name: str) -> bool:
        return "float" in name.lower()

    merged = dict(reverse_create_type_variations(source_types, _mentions_int, variations))
    merged.update(reverse_create_type_variations(source_types, _mentions_float, variations))
    return merged
31 changes: 16 additions & 15 deletions pandas_pyarrow/reverse_converter.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
from pandas_pyarrow.mappers import reverse_create_mapper

import numpy as np
import pandas as pd


def convert_to_numpy(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame in which pyarrow-backed columns use numpy dtypes.

    Each column whose dtype ``repr`` contains ``"pyarrow"`` is cast to the
    dtype named by ``reverse_create_mapper()``; pyarrow boolean columns are
    cast to the builtin ``bool``. All other columns keep their current dtype.
    The input frame is not modified, and the result keeps its column labels
    and its index.

    Args:
        df: Frame to convert.

    Returns:
        A new ``DataFrame`` holding the same values under the target dtypes.

    Raises:
        KeyError: If a non-boolean pyarrow dtype has no entry in the reverse
            mapper.
    """
    r_mapper = reverse_create_mapper()
    numpy_types = {}
    for col, dtype in df.dtypes.items():
        dtype_name = repr(dtype)
        if "pyarrow" not in dtype_name:
            # Already a non-pyarrow dtype: cast back to itself.
            numpy_types[col] = dtype
        elif "bool" in dtype_name:
            numpy_types[col] = bool
        else:
            numpy_types[col] = r_mapper[dtype_name]

    # Rebuild from the raw values as objects so every column can be cast
    # independently; pass the index explicitly so row labels survive the
    # rebuild instead of being replaced by a default RangeIndex.
    # NOTE(review): fillna(np.nan) presumably normalises missing markers such
    # as pd.NA to NaN before the numpy casts - confirm.
    new_df = (
        pd.DataFrame(df.values, columns=df.columns, index=df.index, dtype=object)
        .fillna(np.nan)
        .astype(numpy_types)
    )
    return new_df
Loading

0 comments on commit b4c8468

Please sign in to comment.