Skip to content

Commit

Permalink
Closes #47 - updated datatype handling - Decimal, datetime.date, datetime.time (#70)
Browse files Browse the repository at this point in the history

* add Decimal handler and generator functions; clean up random_dataframe() arguments and add decimal_column/date_column/time_column
* add datetime.date and datetime.time generators and handlers
* check for and handle decimals and datetime.dates by default
* return gpd.GeoSeries instead of GeometryArray
* add boolean series generator option
* add datatype imports with new directory structure
* ignore flake8 C901 - "too complex"
* add datatype compatibility helpers
* add optional with_ipython_display argument to prevent calling IPython.display() on an object that goes through handle_format()
  • Loading branch information
shouples authored Oct 13, 2022
1 parent 0e01d8d commit 816a165
Show file tree
Hide file tree
Showing 20 changed files with 1,005 additions and 523 deletions.
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ select =
# docstrings must be triple-quoted, via flake8-docstrings
D300
ignore =
# "Too complex"
C901,
# Extra space in brackets
E20,
E203,
Expand Down
1 change: 1 addition & 0 deletions src/dx/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .comms import *
from .datatypes import *
from .dx import *
from .formatters import *
from .loggers import *
Expand Down
7 changes: 7 additions & 0 deletions src/dx/datatypes/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from .compatibility import *
from .date_time import *
from .geometry import *
from .main import *
from .misc import *
from .numeric import *
from .text import *
167 changes: 167 additions & 0 deletions src/dx/datatypes/compatibility.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
import traceback
from typing import Any, Union

import pandas as pd
from pandas.io.json import build_table_schema

from dx.settings import get_settings

# Module-level settings object; test_dx_handling() reads DISPLAY_MODE and MEDIA_TYPE from it.
settings = get_settings()


def test_compatibility(value: Any, as_dataframe: bool = True) -> Union[dict, pd.DataFrame]:
    """
    Convenience function to test the compatibility of a given object
    with the different steps involved with the dx display modes.
    - pandas.io.json.build_table_schema (https://github.com/pandas-dev/pandas/blob/main/pandas/io/json/_table_schema.py)
    - jupyter_client.jsonutil.json_clean (https://github.com/jupyter/jupyter_client/blob/main/jupyter_client/jsonutil.py)
    - duckdb conn.register
    - final dx output type

    Each step is run independently on `value` and contributes one entry,
    keyed by the step name, to the combined report.

    Returns the report as a DataFrame with one row per step when
    `as_dataframe` is True (the default), otherwise as a dict of
    per-step result dicts.
    """
    result = {}
    # Each helper returns a single-key dict, so the merged report keeps
    # the steps in the order they are run here.
    result.update(test_build_table_schema(value))
    result.update(test_json_clean(value))
    result.update(test_db_write(value))
    result.update(test_dx_handling(value))
    if as_dataframe:
        return pd.DataFrame(result).transpose()
    return result


# This helper is star-exported and its name matches pytest's `test_*`
# pattern; opt out of collection so importing it into a test module does
# not produce a bogus test that fails on the missing `value` fixture.
test_compatibility.__test__ = False


def test_build_table_schema(value: Any, as_dataframe: bool = False) -> Union[dict, pd.DataFrame]:
    """
    Convenience function to test the compatibility of a given object
    with the pandas.io.json.build_table_schema function, which
    is called to set up the initial column schema during dx formatting.

    `value` is placed in a single-row DataFrame under the column `test`.
    On success the report holds the schema type inferred for that column;
    on any failure it holds the error message and traceback instead.

    Returns a one-entry dict keyed by "pandas.io.json.build_table_schema",
    or the same data as a one-row DataFrame when `as_dataframe` is True.
    """
    result = {}

    try:
        # Built inside the `try` so that a value pandas cannot even wrap
        # is reported as a failure rather than raised to the caller.
        df = pd.DataFrame({"test": [value]})
        schema = build_table_schema(df, index=False)
        fields = schema["fields"]
        field_type = [
            field_schema["type"] for field_schema in fields if field_schema["name"] == "test"
        ][0]
        result["pandas.io.json.build_table_schema"] = {
            "success": True,
            "type": field_type,
        }
    except Exception as e:
        # Deliberately broad: the purpose is to report whatever went wrong.
        result["pandas.io.json.build_table_schema"] = {
            "error": str(e),
            "success": False,
            "traceback": traceback.format_exc(),
        }

    if as_dataframe:
        return pd.DataFrame(result).transpose()
    return result


# This helper is star-exported and its name matches pytest's `test_*`
# pattern; opt out of collection so importing it into a test module does
# not produce a bogus test that fails on the missing `value` fixture.
test_build_table_schema.__test__ = False


def test_json_clean(value: Any, as_dataframe: bool = False) -> Union[dict, pd.DataFrame]:
    """
    Convenience function to test the compatibility of a given object
    with the jupyter_client.jsonutil.json_clean function, which
    is called during IPython.display after dx formatting.

    `value` is placed in a single-row DataFrame under the column `test`,
    converted to records, and passed through json_clean. On success the
    report holds the cleaned value and its type; on any failure
    (including jupyter_client not being importable) it holds the error
    message and traceback instead.

    Returns a one-entry dict keyed by "jupyter_client.jsonutil.json_clean",
    or the same data as a one-row DataFrame when `as_dataframe` is True.
    """
    result = {}

    try:
        # Imported lazily so a missing jupyter_client is reported as a
        # failed check instead of breaking the module import.
        from jupyter_client.jsonutil import json_clean

        # Built inside the `try` so that a value pandas cannot even wrap
        # is reported as a failure rather than raised to the caller.
        df = pd.DataFrame({"test": [value]})
        clean_json = json_clean(df.to_dict("records"))
        clean_json_value = clean_json[0]["test"]
        result["jupyter_client.jsonutil.json_clean"] = {
            "success": True,
            "type": type(clean_json_value),
            "value": clean_json_value,
        }
    except Exception as e:
        # Deliberately broad: the purpose is to report whatever went wrong.
        result["jupyter_client.jsonutil.json_clean"] = {
            "error": str(e),
            "success": False,
            "traceback": traceback.format_exc(),
        }

    if as_dataframe:
        return pd.DataFrame(result).transpose()
    return result


# This helper is star-exported and its name matches pytest's `test_*`
# pattern; opt out of collection so importing it into a test module does
# not produce a bogus test that fails on the missing `value` fixture.
test_json_clean.__test__ = False


def test_db_write(value: Any, as_dataframe: bool = False) -> Union[dict, pd.DataFrame]:
    """
    Convenience function to test the compatibility of a given object
    inside a pandas DataFrame during registration with a duckdb connection,
    which is used during Datalink-enabled dataframe tracking for
    push-down filtering.

    `value` is placed in a single-row DataFrame under the column `test`,
    registered on the connection as `test`, and read back with a SELECT.
    On success the report holds the round-tripped value and its type;
    on any failure it holds the error message and traceback instead.

    Returns a one-entry dict keyed by "duckdb.conn.register", or the same
    data as a one-row DataFrame when `as_dataframe` is True.
    """
    from dx.utils.tracking import get_db_connection  # circular import

    result = {}

    db_connection = get_db_connection()
    try:
        # Built inside the `try` so that a value pandas cannot even wrap
        # is reported as a failure rather than raised to the caller.
        df = pd.DataFrame({"test": [value]})
        db_connection.register("test", df)
        try:
            db_df = db_connection.execute("SELECT * FROM test").df()
        finally:
            # Don't leave the throwaway `test` view (and the frame it
            # references) registered on the connection after the check.
            db_connection.unregister("test")
        db_df_value = db_df.iloc[0]["test"]
        result["duckdb.conn.register"] = {
            "type": type(db_df_value),
            "success": True,
            "value": db_df_value,
        }
    except Exception as e:
        # Deliberately broad: the purpose is to report whatever went wrong.
        result["duckdb.conn.register"] = {
            "error": str(e),
            "success": False,
            "traceback": traceback.format_exc(),
        }

    if as_dataframe:
        return pd.DataFrame(result).transpose()
    return result


# This helper is star-exported and its name matches pytest's `test_*`
# pattern; opt out of collection so importing it into a test module does
# not produce a bogus test that fails on the missing `value` fixture.
test_db_write.__test__ = False


def test_dx_handling(value: Any, as_dataframe: bool = False) -> Union[dict, pd.DataFrame]:
    """
    Convenience function to test the compatibility of a given object
    inside a pandas DataFrame through the entire dx formatting
    and data type handling process

    `value` is placed in a single-row DataFrame under the column `test`
    and run through handle_format() without triggering IPython display.
    On success the report holds the formatted value, its type, and the
    schema type of the `test` field; on any failure it holds the error
    message and traceback instead.

    Returns a one-entry dict keyed by "dx.handle_format", or the same
    data as a one-row DataFrame when `as_dataframe` is True.
    """
    from dx.formatters.main import handle_format  # circular import

    result = {}

    try:
        # Built inside the `try` so that a value pandas cannot even wrap
        # is reported as a failure rather than raised to the caller.
        df = pd.DataFrame({"test": [value]})
        payload, _ = handle_format(df, with_ipython_display=False)
        media = payload[settings.MEDIA_TYPE]

        if settings.DISPLAY_MODE == "simple":
            # first entry of `data` is indexed by column name
            dx_value = media["data"][0]["test"]
        elif settings.DISPLAY_MODE == "enhanced":
            # NOTE(review): assumes `data[0]` holds the `test` values even
            # though the schema also carries `index` -- confirm.
            dx_value = media["data"][0][0]
        else:
            # Previously fell through with `dx_value` unbound, which was
            # reported as a confusing UnboundLocalError.
            raise ValueError(
                f"unsupported display mode for compatibility test: {settings.DISPLAY_MODE!r}"
            )

        dx_schema_fields = media["schema"]["fields"]
        # The schema is expected to hold two fields by default, `index` and
        # `test`: the full formatting process is exercised as-is, and it has
        # no option to leave `index` out, so pick `test` by name.
        dx_schema_type = [field["type"] for field in dx_schema_fields if field["name"] == "test"][0]

        result["dx.handle_format"] = {
            "type": type(dx_value),
            "success": True,
            "value": dx_value,
            "schema_type": dx_schema_type,
        }
    except Exception as e:
        # Deliberately broad: the purpose is to report whatever went wrong.
        result["dx.handle_format"] = {
            "error": str(e),
            "success": False,
            "traceback": traceback.format_exc(),
        }

    if as_dataframe:
        return pd.DataFrame(result).transpose()
    return result


# This helper is star-exported and its name matches pytest's `test_*`
# pattern; opt out of collection so importing it into a test module does
# not produce a bogus test that fails on the missing `value` fixture.
test_dx_handling.__test__ = False
36 changes: 36 additions & 0 deletions src/dx/utils/date_time.py → src/dx/datatypes/date_time.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,24 @@ def generate_datetime_series(num_rows: int) -> pd.Series:
)


def generate_date_series(num_rows: int) -> pd.Series:
    """
    Build a series of `num_rows` random `datetime.date` values.

    Each value is the calendar date of the current timestamp shifted by a
    random whole number of hours drawn from [-1000, 1000).
    """
    random_dates = []
    for _ in range(num_rows):
        offset_hours = np.random.randint(-1000, 1000)
        shifted = pd.Timestamp("now") + pd.Timedelta(f"{offset_hours} hours")
        random_dates.append(shifted.date())
    return pd.Series(random_dates)


def generate_time_series(num_rows: int) -> pd.Series:
    """
    Build a series of `num_rows` random `datetime.time` values.

    Each value is the time-of-day of the current timestamp shifted by a
    random whole number of hours drawn from [-1000, 1000).
    """
    random_times = []
    for _ in range(num_rows):
        offset_hours = np.random.randint(-1000, 1000)
        shifted = pd.Timestamp("now") + pd.Timedelta(f"{offset_hours} hours")
        random_times.append(shifted.time())
    return pd.Series(random_times)


def generate_time_period_series(num_rows: int) -> pd.Series:
return pd.Series(
[
Expand Down Expand Up @@ -70,6 +88,24 @@ def handle_time_delta_series(s: pd.Series) -> pd.Series:
return s


def handle_date_series(s: pd.Series) -> pd.Series:
    """
    Convert a series holding `datetime.date` values with `pd.to_datetime()`.

    Only the first few non-null values are inspected; when none of them is
    a `datetime.date` instance the series is returned untouched.
    """
    sample = s.dropna().head().values
    has_dates = any(isinstance(item, datetime.date) for item in sample)
    if not has_dates:
        return s

    logger.debug(
        f"series `{s.name}` has datetime.date values; converting with pd.to_datetime()"
    )
    return pd.to_datetime(s)


def handle_time_series(s: pd.Series) -> pd.Series:
    """
    Convert a series holding `datetime.time` values to strings.

    Only the first few non-null values are inspected; when none of them is
    a `datetime.time` instance the series is returned untouched. Missing
    values are left as-is rather than being turned into the literal
    strings "None"/"nan", so nulls are still recognized as nulls
    after the conversion.
    """
    types = (datetime.time,)
    if any(isinstance(v, types) for v in s.dropna().head().values):
        logger.debug(f"series `{s.name}` has datetime.time values; converting to string")
        # `astype(str)` would also stringify missing entries; mapping with
        # na_action="ignore" stringifies only the non-null values.
        s = s.map(str, na_action="ignore")
    return s


def is_datetime_series(s: pd.Series) -> bool:
if str(s.dtype) in ("int", "float", "bool", "category", "period", "interval"):
return False
Expand Down
2 changes: 1 addition & 1 deletion src/dx/utils/geometry.py → src/dx/datatypes/geometry.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def generate_latlon_series(num_rows: int):

lats = [random.randint(-90, 89) + np.random.rand() for _ in range(num_rows)]
lons = [random.randint(-180, 179) + np.random.rand() for _ in range(num_rows)]
return gpd.points_from_xy(lons, lats)
return gpd.GeoSeries(gpd.points_from_xy(lons, lats))


def generate_filled_geojson_series(
Expand Down
Loading

0 comments on commit 816a165

Please sign in to comment.