From e9fe2ed29160b328edf30b0551dd3d0fbb3c972e Mon Sep 17 00:00:00 2001 From: Shankari Date: Fri, 14 Jan 2022 17:31:42 -0800 Subject: [PATCH] Create a dataframe version of has_final_labels Try two separate versions of applying `has_final_labels` for dataframes. - one row by row using `dataframe.apply` - the other filtering directly with column-wise operations I think that the second is more performant, so going with it for now. If that is no longer true, we can switch back to the `apply`, in which case we can remove `has_final_labels_df` --- emission/storage/decorations/trip_queries.py | 9 +++++ .../tests/storageTests/TestTripQueries.py | 33 +++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/emission/storage/decorations/trip_queries.py b/emission/storage/decorations/trip_queries.py index 5fe0c4f16..a59cb403a 100644 --- a/emission/storage/decorations/trip_queries.py +++ b/emission/storage/decorations/trip_queries.py @@ -236,6 +236,15 @@ def has_final_labels(confirmed_trip_data): return (confirmed_trip_data["user_input"] != {} or confirmed_trip_data["expectation"]["to_label"] == False) +# Create an alternate method to work on the dataframe column-wise +# instead of iterating over each individual row for improved performance +def has_final_labels_df(df): + # print(df.expectation) + # print(pd.DataFrame(df.expectation.to_list(), index=df.index)) + to_list_series = pd.DataFrame(df.expectation.to_list(), index=df.index).to_label + return df[(df.user_input != {}) + | (to_list_series == False)] + def get_max_prob_label(inferred_label_list): # Two columns: "labels" and "p" label_prob_df = pd.DataFrame(inferred_label_list) diff --git a/emission/tests/storageTests/TestTripQueries.py b/emission/tests/storageTests/TestTripQueries.py index 92b3f55b9..5645047b4 100644 --- a/emission/tests/storageTests/TestTripQueries.py +++ b/emission/tests/storageTests/TestTripQueries.py @@ -399,6 +399,39 @@ def testHasFinalLabels(self): "expectation": {"to_label": False} }))) + def 
testHasFinalLabelsDataFrame(self): + test_mixed_df = pd.DataFrame( + [{"user_input": {"mode_confirm": "bike", "purpose_confirm": "shopping"}, + "expectation": {"to_label": True}}] * 3 + + [{"user_input": {}, "expectation": {"to_label": False}, + "inferred_labels": + [{"labels": {"mode_confirm": "bike", "purpose_confirm": "shopping"}, "p": 0.1}, + {"labels": {"mode_confirm": "walk", "purpose_confirm": "exercise"}, "p": 0.9}] + }] * 3 + + [{"user_input": {}, "expectation": {"to_label": True}, + "inferred_labels": + [{"labels": {"mode_confirm": "bike", "purpose_confirm": "shopping"}, "p": 0.2}, + {"labels": {"mode_confirm": "walk", "purpose_confirm": "exercise"}, "p": 0.4}, + {"labels": {"mode_confirm": "drove_alone", "purpose_confirm": "work"}, "p": 0.4}] + }] * 3 + + [{"user_input": {}, "expectation": {"to_label": True}}] * 3) + + has_user_labels_df = test_mixed_df[test_mixed_df.user_input != {}] + # only the actual user inputs will be counted in the old way + self.assertEqual(has_user_labels_df.shape[0], 3) + + # print(test_mixed_df.apply(lambda row: print(row.user_input), axis=1)) + self.assertEqual(np.count_nonzero(test_mixed_df.apply( + lambda row: esdt.has_final_labels(row), axis=1)), 6) + + has_final_labels_df = test_mixed_df[test_mixed_df.apply( + lambda row: esdt.has_final_labels(row), axis=1)] + # the actual user inputs and to_label = false will be counted in the new way + self.assertEqual(has_final_labels_df.shape[0], 6) + + self.assertEqual(esdt.has_final_labels_df(test_mixed_df).shape[0], 6) + + def testGetMaxProbLabel(self): self.assertEqual(esdt.get_max_prob_label([ {'labels': {'mc': 30, 'pc': 40}, 'p': 0.9}]), {'mc': 30, 'pc': 40})