Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

handle conversions from np.nan and pd.NA to None in display formatter payloads #43

Merged
merged 8 commits into from
Sep 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions dx/formatters/dataresource.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,8 +143,14 @@ def generate_dataresource_body(
schema = build_table_schema(df)
logger.debug(f"{schema=}")

# fillna(np.nan) to handle pd.NA values
data = df.fillna(np.nan).reset_index().to_dict("records")
# This is a little odd, but it allows replacing `pd.NA` and `np.nan`
# with `None` values without altering any of the other values.
# Without converting to `object`, `NaN`s will persist (but `pd.NA`s
# will be converted to `None`).
# We build the schema first since, after this, the dtypes will be
# changed to `object` for any Series whose values were replaced with `None`s.
clean_df = df.astype(object).where(df.notnull(), None)
data = clean_df.reset_index().to_dict("records")

payload = {
"schema": schema,
Expand Down
10 changes: 8 additions & 2 deletions dx/formatters/dx.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,14 @@ def generate_dx_body(
schema = build_table_schema(df)
logger.debug(f"{schema=}")

# fillna(np.nan) to handle pd.NA values
data = df.fillna(np.nan).reset_index().transpose().values.tolist()
# This is a little odd, but it allows replacing `pd.NA` and `np.nan`
# with `None` values without altering any of the other values.
# Without converting to `object`, `NaN`s will persist (but `pd.NA`s
# will be converted to `None`).
# We build the schema first since, after this, the dtypes will be
# changed to `object` for any Series whose values were replaced with `None`s.
clean_df = df.astype(object).where(df.notnull(), None)
data = clean_df.reset_index().transpose().values.tolist()

# this will include the `df.index` by default (e.g. slicing/sampling)
payload = {
Expand Down
18 changes: 18 additions & 0 deletions dx/tests/test_dataresource.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import uuid

import numpy as np
import pandas as pd
import pytest

from dx.formatters.dataresource import (
Expand Down Expand Up @@ -60,3 +62,19 @@ def test_datalink_toggle(enabled: bool):
format_dataresource(df)
except Exception as e:
assert False, f"failed with {e}"


@pytest.mark.parametrize("null_value", [np.nan, pd.NA])
def test_dx_converts_na_to_none(null_value):
    """
    Check that the dataresource payload carries `None` wherever the
    input frame held a null (`NaN` or `pd.NA`), with the remaining
    values left as they were.
    """
    # One null per column, placed on different rows.
    columns = {
        "foo": [1, 2, null_value],
        "bar": ["a", null_value, "b"],
    }
    payload = generate_dataresource_body(pd.DataFrame(columns))

    # Each record is expected to carry the row's index under "index".
    expected_records = [
        {"index": 0, "foo": 1, "bar": "a"},
        {"index": 1, "foo": 2, "bar": None},
        {"index": 2, "foo": None, "bar": "b"},
    ]
    for position, expected in enumerate(expected_records):
        assert payload["data"][position] == expected
17 changes: 17 additions & 0 deletions dx/tests/test_dx.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import uuid

import numpy as np
import pandas as pd
import pytest

from dx.formatters.dx import format_dx, generate_dx_body, get_dx_settings
Expand Down Expand Up @@ -57,3 +59,18 @@ def test_datalink_toggle(enabled: bool):
format_dx(df)
except Exception as e:
assert False, f"failed with {e}"


@pytest.mark.parametrize("null_value", [np.nan, pd.NA])
def test_dx_converts_na_to_none(null_value):
    """
    Check that the dx payload carries `None` wherever the input frame
    held a null (`NaN` or `pd.NA`), with the remaining values left as
    they were.
    """
    # One null per column, placed on different rows.
    columns = {
        "foo": [1, 2, null_value],
        "bar": ["a", null_value, "b"],
    }
    payload = generate_dx_body(pd.DataFrame(columns))

    # Position 0 of the payload data is not asserted here (presumably it
    # holds the index values); positions 1 and 2 are the "foo" and "bar"
    # columns respectively.
    expected_by_position = {
        1: [1, 2, None],
        2: ["a", None, "b"],
    }
    for position, expected in expected_by_position.items():
        assert payload["data"][position] == expected