From e9fe2ed29160b328edf30b0551dd3d0fbb3c972e Mon Sep 17 00:00:00 2001
From: Shankari <shankari@eecs.berkeley.edu>
Date: Fri, 14 Jan 2022 17:31:42 -0800
Subject: [PATCH] Create a dataframe version of has_final_labels

Try two separate versions of applying `has_final_labels` for dataframes.
- one row by row using `dataframe.apply`
- the other filtering directly with column-wise operations

I think that the second is more performant, so I am going with it for now.
If that is no longer true, we can switch back to the `apply`, in which case we
can remove `has_final_labels_df`.
---
 emission/storage/decorations/trip_queries.py  |  9 +++++
 .../tests/storageTests/TestTripQueries.py     | 33 +++++++++++++++++++
 2 files changed, 42 insertions(+)

diff --git a/emission/storage/decorations/trip_queries.py b/emission/storage/decorations/trip_queries.py
index 5fe0c4f16..a59cb403a 100644
--- a/emission/storage/decorations/trip_queries.py
+++ b/emission/storage/decorations/trip_queries.py
@@ -236,6 +236,15 @@ def has_final_labels(confirmed_trip_data):
     return (confirmed_trip_data["user_input"] != {}
             or confirmed_trip_data["expectation"]["to_label"] == False)
 
+# Dataframe counterpart of `has_final_labels`: works on whole columns instead of
+# iterating over each row (intended to be faster); returns the matching rows of `df`
+def has_final_labels_df(df):
+    # Expand the per-row `expectation` dicts into a dataframe aligned on df's index
+    # and take its `to_label` column (NOTE: the variable is named "to_list")
+    to_list_series = pd.DataFrame(df.expectation.to_list(), index=df.index).to_label
+    return df[(df.user_input != {})
+            | (to_list_series == False)]
+
 def get_max_prob_label(inferred_label_list):
     # Two columns: "labels" and "p"
     label_prob_df = pd.DataFrame(inferred_label_list)
diff --git a/emission/tests/storageTests/TestTripQueries.py b/emission/tests/storageTests/TestTripQueries.py
index 92b3f55b9..5645047b4 100644
--- a/emission/tests/storageTests/TestTripQueries.py
+++ b/emission/tests/storageTests/TestTripQueries.py
@@ -399,6 +399,39 @@ def testHasFinalLabels(self):
             "expectation": {"to_label": False}
         })))
 
+    def testHasFinalLabelsDataFrame(self):
+        test_mixed_df = pd.DataFrame(
+            [{"user_input": {"mode_confirm": "bike", "purpose_confirm": "shopping"},
+            "expectation": {"to_label": True}}] * 3 +
+            [{"user_input": {}, "expectation": {"to_label": False},
+            "inferred_labels":
+                [{"labels": {"mode_confirm": "bike", "purpose_confirm": "shopping"}, "p": 0.1},
+                {"labels": {"mode_confirm": "walk", "purpose_confirm": "exercise"}, "p": 0.9}]
+            }] * 3 +
+            [{"user_input": {}, "expectation": {"to_label": True},
+            "inferred_labels":
+                [{"labels": {"mode_confirm": "bike", "purpose_confirm": "shopping"}, "p": 0.2},
+                {"labels": {"mode_confirm": "walk", "purpose_confirm": "exercise"}, "p": 0.4},
+                {"labels": {"mode_confirm": "drove_alone", "purpose_confirm": "work"}, "p": 0.4}]
+            }] * 3 +
+            [{"user_input": {}, "expectation": {"to_label": True}}] * 3)
+
+        has_user_labels_df = test_mixed_df[test_mixed_df.user_input != {}]
+        # old way: filtering on a non-empty user_input only matches the 3 user-labeled rows
+        self.assertEqual(has_user_labels_df.shape[0], 3)
+
+        # row-by-row version: apply the per-trip has_final_labels to every row and count matches
+        self.assertEqual(np.count_nonzero(test_mixed_df.apply(
+            lambda row: esdt.has_final_labels(row), axis=1)), 6)
+
+        has_final_labels_df = test_mixed_df[test_mixed_df.apply(
+            lambda row: esdt.has_final_labels(row), axis=1)]
+        # new way: the 3 user-labeled rows plus the 3 rows with to_label == False are counted
+        self.assertEqual(has_final_labels_df.shape[0], 6)
+
+        self.assertEqual(esdt.has_final_labels_df(test_mixed_df).shape[0], 6)
+
+
     def testGetMaxProbLabel(self):
         self.assertEqual(esdt.get_max_prob_label([
             {'labels': {'mc': 30, 'pc': 40}, 'p': 0.9}]), {'mc': 30, 'pc': 40})