diff --git a/src/dx/formatters/main.py b/src/dx/formatters/main.py index 4aa26d36..4b419dd1 100644 --- a/src/dx/formatters/main.py +++ b/src/dx/formatters/main.py @@ -58,6 +58,7 @@ def datalink_processing( display_id=dxdf.display_id, has_default_index=default_index_used, with_ipython_display=with_ipython_display, + variable_name=dxdf.variable_name, ) # this needs to happen after sending to the frontend @@ -162,6 +163,7 @@ def format_output( display_id: Optional[str] = None, has_default_index: bool = True, with_ipython_display: bool = True, + variable_name: str = "", ) -> tuple: display_id = display_id or str(uuid.uuid4()) @@ -177,7 +179,11 @@ def format_output( **orig_df_dimensions, **sampled_df_dimensions, } - metadata = generate_metadata(display_id=display_id, **dataframe_info) + metadata = generate_metadata( + display_id=display_id, + variable_name=variable_name, + **dataframe_info, + ) payload = {settings.MEDIA_TYPE: payload} metadata = {settings.MEDIA_TYPE: metadata} diff --git a/src/dx/utils/formatting.py b/src/dx/utils/formatting.py index c17a74d3..1e5f6fba 100644 --- a/src/dx/utils/formatting.py +++ b/src/dx/utils/formatting.py @@ -208,7 +208,7 @@ def clean_column_values(s: pd.Series) -> pd.Series: return s -def generate_metadata(display_id: str, default_index_used: bool = True, **dataframe_info): +def generate_metadata(display_id: str, variable_name: str = "", **dataframe_info): from dx.utils.tracking import DXDF_CACHE filters = [] @@ -247,6 +247,7 @@ def generate_metadata(display_id: str, default_index_used: bool = True, **datafr "applied_filters": filters, "sample_history": sample_history, "sampling_time": pd.Timestamp("now").strftime(settings.DATETIME_STRING_FORMAT), + "variable_name": variable_name, }, "display_id": display_id, } diff --git a/src/dx/utils/tracking.py b/src/dx/utils/tracking.py index a239b13b..c7d55d51 100644 --- a/src/dx/utils/tracking.py +++ b/src/dx/utils/tracking.py @@ -70,7 +70,10 @@ def __init__( self.hash = 
generate_df_hash(self.df) self.display_id = SUBSET_TO_DISPLAY_ID.get(self.hash, str(uuid.uuid4())) - self.metadata = generate_metadata(self.display_id) + self.metadata = generate_metadata( + display_id=self.display_id, + variable_name=self.variable_name, + ) self.metadata["datalink"]["dataframe_info"] = { "default_index_used": self.default_index_used, **get_df_dimensions(self.df, prefix="orig"), diff --git a/tests/test_formatting.py b/tests/test_formatting.py index 1979ca31..b605a0ab 100644 --- a/tests/test_formatting.py +++ b/tests/test_formatting.py @@ -416,3 +416,100 @@ def test_groupby_dataframe_index_left_alone(self, sample_groupby_dataframe: pd.D df = to_dataframe(sample_groupby_dataframe) assert df.index.equals(sample_groupby_dataframe.index) assert df.columns.equals(sample_groupby_dataframe.columns) + + +class TestMetadataStructure: + @pytest.mark.parametrize("display_mode", ["simple", "enhanced"]) + def test_variable_name_exists( + self, + sample_random_dataframe: pd.DataFrame, + get_ipython: TerminalInteractiveShell, + display_mode: str, + ): + """ + Ensure that the variable name is present in the metadata. 
+ """ + with settings_context( + enable_datalink=True, + display_mode=display_mode, + ): + _, metadata = handle_format(sample_random_dataframe, ipython_shell=get_ipython) + display_metadata = metadata[settings.MEDIA_TYPE] + + assert "datalink" in display_metadata + assert "display_id" in display_metadata + + datalink_metadata = display_metadata["datalink"] + assert "variable_name" in datalink_metadata + assert "dataframe_info" in datalink_metadata + + assert ( + datalink_metadata["dataframe_info"]["orig_num_rows"] == sample_random_dataframe.shape[0] + ) + assert ( + datalink_metadata["dataframe_info"]["orig_num_cols"] == sample_random_dataframe.shape[1] + ) + + assert "dx_settings" in datalink_metadata + assert isinstance(datalink_metadata["applied_filters"], list) + assert isinstance(datalink_metadata["sample_history"], list) + + +class TestMetadataVariableName: + @pytest.mark.parametrize("display_mode", ["simple", "enhanced"]) + def test_assigned_variable_name_matches( + self, + sample_random_dataframe: pd.DataFrame, + get_ipython: TerminalInteractiveShell, + display_mode: str, + ): + """ + Ensure that the assigned variable name is present in the metadata. + """ + get_ipython.user_ns["test_df"] = sample_random_dataframe + + with settings_context( + enable_datalink=True, + display_mode=display_mode, + ): + _, metadata = handle_format(sample_random_dataframe, ipython_shell=get_ipython) + display_metadata = metadata[settings.MEDIA_TYPE] + assert display_metadata["datalink"]["variable_name"] == "test_df" + + @pytest.mark.parametrize("display_mode", ["simple", "enhanced"]) + def test_unassigned_variable_name_present( + self, + sample_random_dataframe: pd.DataFrame, + get_ipython: TerminalInteractiveShell, + display_mode: str, + ): + """ + Ensure that our placeholder variable name is present in the metadata. 
+ """ + with settings_context( + enable_datalink=True, + display_mode=display_mode, + ): + _, metadata = handle_format(sample_random_dataframe, ipython_shell=get_ipython) + display_metadata = metadata[settings.MEDIA_TYPE] + assert display_metadata["datalink"]["variable_name"].startswith("unk_dataframe") + + @pytest.mark.parametrize("display_mode", ["simple", "enhanced"]) + def test_empty_variable_name_with_datalink_disabled( + self, + sample_random_dataframe: pd.DataFrame, + get_ipython: TerminalInteractiveShell, + display_mode: str, + ): + """ + Ensure that the variable name is an empty string in the metadata. + + (With datalink disabled, no cleaning/hashing/variable association will be done.) + """ + with settings_context( + enable_datalink=False, + display_mode=display_mode, + ): + _, metadata = handle_format(sample_random_dataframe, ipython_shell=get_ipython) + display_metadata = metadata[settings.MEDIA_TYPE] + assert display_metadata["datalink"]["variable_name"] == "" diff --git a/tests/test_tracking.py b/tests/test_tracking.py index b6001e10..4e17fe43 100644 --- a/tests/test_tracking.py +++ b/tests/test_tracking.py @@ -82,30 +82,14 @@ def test_dxdataframe_metadata( """ Test that the DXDataFrame creates metadata for the frontend including the appropriate dataframe information and datalink keys. 
+ + (Similar to ./test_formatting.py::TestMetadataStructure) """ metadata = sample_dxdataframe.metadata assert "datalink" in metadata and "display_id" in metadata assert metadata["display_id"] == sample_dxdataframe.display_id assert metadata["datalink"]["display_id"] == sample_dxdataframe.display_id - datalink_metadata = metadata["datalink"] - - assert "dataframe_info" in datalink_metadata - assert ( - datalink_metadata["dataframe_info"]["orig_num_rows"] - == sample_cleaned_random_dataframe.shape[0] - ) - assert ( - datalink_metadata["dataframe_info"]["orig_num_cols"] - == sample_cleaned_random_dataframe.shape[1] - ) - - assert "dx_settings" in datalink_metadata - assert datalink_metadata["dx_settings"] - - assert isinstance(datalink_metadata["applied_filters"], list) - assert isinstance(datalink_metadata["sample_history"], list) - def test_store_in_db( mocker,