Skip to content

Commit

Permalink
add variable name to metadata (#84)
Browse files Browse the repository at this point in the history
* add df variable name to output metadata

* pass variable name into metadata

* add metadata tests

* move non-DXDataFrame metadata assertions to separate tests
  • Loading branch information
shouples committed Nov 14, 2022
1 parent 0edb57f commit 9cf6636
Show file tree
Hide file tree
Showing 5 changed files with 112 additions and 21 deletions.
8 changes: 7 additions & 1 deletion src/dx/formatters/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def datalink_processing(
display_id=dxdf.display_id,
has_default_index=default_index_used,
with_ipython_display=with_ipython_display,
variable_name=dxdf.variable_name,
)

# this needs to happen after sending to the frontend
Expand Down Expand Up @@ -162,6 +163,7 @@ def format_output(
display_id: Optional[str] = None,
has_default_index: bool = True,
with_ipython_display: bool = True,
variable_name: str = "",
) -> tuple:
display_id = display_id or str(uuid.uuid4())

Expand All @@ -177,7 +179,11 @@ def format_output(
**orig_df_dimensions,
**sampled_df_dimensions,
}
metadata = generate_metadata(display_id=display_id, **dataframe_info)
metadata = generate_metadata(
display_id=display_id,
variable_name=variable_name,
**dataframe_info,
)

payload = {settings.MEDIA_TYPE: payload}
metadata = {settings.MEDIA_TYPE: metadata}
Expand Down
3 changes: 2 additions & 1 deletion src/dx/utils/formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ def clean_column_values(s: pd.Series) -> pd.Series:
return s


def generate_metadata(display_id: str, default_index_used: bool = True, **dataframe_info):
def generate_metadata(display_id: str, variable_name: str = "", **dataframe_info):
from dx.utils.tracking import DXDF_CACHE

filters = []
Expand Down Expand Up @@ -247,6 +247,7 @@ def generate_metadata(display_id: str, default_index_used: bool = True, **datafr
"applied_filters": filters,
"sample_history": sample_history,
"sampling_time": pd.Timestamp("now").strftime(settings.DATETIME_STRING_FORMAT),
"variable_name": variable_name,
},
"display_id": display_id,
}
Expand Down
5 changes: 4 additions & 1 deletion src/dx/utils/tracking.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,10 @@ def __init__(
self.hash = generate_df_hash(self.df)
self.display_id = SUBSET_TO_DISPLAY_ID.get(self.hash, str(uuid.uuid4()))

self.metadata = generate_metadata(self.display_id)
self.metadata = generate_metadata(
display_id=self.display_id,
variable_name=self.variable_name,
)
self.metadata["datalink"]["dataframe_info"] = {
"default_index_used": self.default_index_used,
**get_df_dimensions(self.df, prefix="orig"),
Expand Down
97 changes: 97 additions & 0 deletions tests/test_formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,3 +416,100 @@ def test_groupby_dataframe_index_left_alone(self, sample_groupby_dataframe: pd.D
df = to_dataframe(sample_groupby_dataframe)
assert df.index.equals(sample_groupby_dataframe.index)
assert df.columns.equals(sample_groupby_dataframe.columns)


class TestMetadataStructure:
    @pytest.mark.parametrize("display_mode", ["simple", "enhanced"])
    def test_variable_name_exists(
        self,
        sample_random_dataframe: pd.DataFrame,
        get_ipython: TerminalInteractiveShell,
        display_mode: str,
    ):
        """
        Check the overall layout of the metadata returned by `handle_format`
        when datalink is enabled, for each parametrized display mode.

        The metadata stored under `settings.MEDIA_TYPE` must carry the
        top-level `datalink` and `display_id` keys, and the `datalink`
        section must carry `variable_name`, `dataframe_info` (whose original
        row/column counts match the input dataframe's shape), `dx_settings`,
        and list-valued `applied_filters` / `sample_history` entries.
        """
        with settings_context(enable_datalink=True, display_mode=display_mode):
            _payload, output_metadata = handle_format(
                sample_random_dataframe,
                ipython_shell=get_ipython,
            )
            # The media type is read while the settings context is still active.
            media_metadata = output_metadata[settings.MEDIA_TYPE]

            for top_level_key in ("datalink", "display_id"):
                assert top_level_key in media_metadata

            datalink = media_metadata["datalink"]
            for datalink_key in ("variable_name", "dataframe_info"):
                assert datalink_key in datalink

            df_info = datalink["dataframe_info"]
            assert df_info["orig_num_rows"] == sample_random_dataframe.shape[0]
            assert df_info["orig_num_cols"] == sample_random_dataframe.shape[1]

            assert "dx_settings" in datalink
            for list_key in ("applied_filters", "sample_history"):
                assert isinstance(datalink[list_key], list)


class TestMetadataVariableName:
    @pytest.mark.parametrize("display_mode", ["simple", "enhanced"])
    def test_assigned_variable_name_matches(
        self,
        sample_random_dataframe: pd.DataFrame,
        get_ipython: TerminalInteractiveShell,
        display_mode: str,
    ):
        """
        Ensure that the name a dataframe is assigned to in the IPython user
        namespace is the variable name reported in the metadata.
        """
        user_ns = get_ipython.user_ns
        not_set = object()
        previous_value = user_ns.get("test_df", not_set)
        user_ns["test_df"] = sample_random_dataframe

        try:
            with settings_context(
                enable_datalink=True,
                display_mode=display_mode,
            ):
                _, metadata = handle_format(sample_random_dataframe, ipython_shell=get_ipython)
                display_metadata = metadata[settings.MEDIA_TYPE]
                assert display_metadata["datalink"]["variable_name"] == "test_df"
        finally:
            # Put the namespace back the way it was so the `test_df` binding
            # cannot leak into other tests that use the same shell object.
            if previous_value is not_set:
                user_ns.pop("test_df", None)
            else:
                user_ns["test_df"] = previous_value

    @pytest.mark.parametrize("display_mode", ["simple", "enhanced"])
    def test_unassigned_variable_name_present(
        self,
        sample_random_dataframe: pd.DataFrame,
        get_ipython: TerminalInteractiveShell,
        display_mode: str,
    ):
        """
        Ensure that a placeholder variable name (prefixed with `unk_dataframe`)
        is present in the metadata when the dataframe has not been assigned
        to a variable in the user namespace by this test.
        """
        with settings_context(
            enable_datalink=True,
            display_mode=display_mode,
        ):
            _, metadata = handle_format(sample_random_dataframe, ipython_shell=get_ipython)
            display_metadata = metadata[settings.MEDIA_TYPE]
            assert display_metadata["datalink"]["variable_name"].startswith("unk_dataframe")

    @pytest.mark.parametrize("display_mode", ["simple", "enhanced"])
    def test_empty_variable_name_with_datalink_disabled(
        self,
        sample_random_dataframe: pd.DataFrame,
        get_ipython: TerminalInteractiveShell,
        display_mode: str,
    ):
        """
        Ensure that the variable name in the metadata is an empty string
        when datalink is disabled.
        (With datalink disabled, no cleaning/hashing/variable association will be done.)
        """
        with settings_context(
            enable_datalink=False,
            display_mode=display_mode,
        ):
            _, metadata = handle_format(sample_random_dataframe, ipython_shell=get_ipython)
            display_metadata = metadata[settings.MEDIA_TYPE]
            assert display_metadata["datalink"]["variable_name"] == ""
20 changes: 2 additions & 18 deletions tests/test_tracking.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,30 +82,14 @@ def test_dxdataframe_metadata(
"""
Test that the DXDataFrame creates metadata for the frontend
including the appropriate dataframe information and datalink keys.
(Similar to ./test_formatting.py::TestMetadataStructure)
"""
metadata = sample_dxdataframe.metadata
assert "datalink" in metadata and "display_id" in metadata
assert metadata["display_id"] == sample_dxdataframe.display_id
assert metadata["datalink"]["display_id"] == sample_dxdataframe.display_id

datalink_metadata = metadata["datalink"]

assert "dataframe_info" in datalink_metadata
assert (
datalink_metadata["dataframe_info"]["orig_num_rows"]
== sample_cleaned_random_dataframe.shape[0]
)
assert (
datalink_metadata["dataframe_info"]["orig_num_cols"]
== sample_cleaned_random_dataframe.shape[1]
)

assert "dx_settings" in datalink_metadata
assert datalink_metadata["dx_settings"]

assert isinstance(datalink_metadata["applied_filters"], list)
assert isinstance(datalink_metadata["sample_history"], list)


def test_store_in_db(
mocker,
Expand Down

0 comments on commit 9cf6636

Please sign in to comment.