ray-project · pcmoritz · Jun 23, 2023 · Jun 21, 2023 · Jun 21, 2023 · Jun 21, 2023
@@ -1719,6 +1719,33 @@ def groupby(self, key: Optional[str]) -> "GroupedData":
 
         return GroupedData(self, key)
 
+    def distinct(self) -> "Dataset":
+        """Remove duplicate rows from the :class:`~ray.data.Dataset`.
+
+        Deduplication is done by grouping on the dataset's only column,
+        counting each group, and then dropping the ``count()`` column.
+
+        Examples:
+            >>> import ray
+            >>> ds = ray.data.from_items([1, 2, 3, 2, 3])
+            >>> ds.distinct().take_all()
+            [{'item': 1}, {'item': 2}, {'item': 3}]
+
+        Time complexity: O(dataset size * log(dataset size / parallelism))
+
+        .. note::
+            Currently ``distinct`` only supports datasets with one single
+            column.
+
+        Returns:
+            A new :class:`~ray.data.Dataset` with distinct rows.
+
+        Raises:
+            NotImplementedError: If the dataset has more than one column.
+        """
+        columns = self.columns(fetch_if_missing=True)
+        # The column names are indexed below, so they must be known here.
+        assert columns is not None
+        if len(columns) > 1:
+            # TODO(hchen): Remove this limitation once groupby supports
+            # multiple columns.
+            raise NotImplementedError(
+                "`distinct` currently only supports Datasets with one single "
+                "column, please apply `select_columns` before `distinct`."
+            )
+        # One group per distinct value; the count itself is not needed.
+        return self.groupby(columns[0]).count().drop_columns(["count()"])
+
     @ConsumptionAPI
     def aggregate(self, *aggs: AggregateFn) -> Union[Any, Dict[str, Any]]:
         """Aggregate the entire dataset as one group.

@@ -217,6 +217,24 @@ def test_repartition_shuffle_arrow(ray_start_regular_shared):
     assert large._block_num_rows() == [500] * 20
 
 
+def test_distinct(ray_start_regular_shared):
+    """Exercise ``distinct`` on one-column data and on multi-column data."""
+    # Repeated values in a single-column dataset collapse to one row each.
+    single_col = ray.data.from_items([3, 2, 3, 1, 2, 3])
+    deduped = single_col.distinct().sort("item").take_all()
+    assert deduped == [{"item": 1}, {"item": 2}, {"item": 3}]
+
+    two_col = ray.data.from_items([{"a": 1, "b": 1}, {"a": 1, "b": 2}])
+    # Datasets with more than one column are rejected for now.
+    with pytest.raises(NotImplementedError):
+        two_col.distinct().take_all()
+    # Narrowing to a single column first makes distinct usable.
+    only_a = two_col.select_columns(["a"])
+    assert only_a.distinct().take_all() == [{"a": 1}]
+
+
 def test_grouped_dataset_repr(ray_start_regular_shared):
     ds = ray.data.from_items([{"key": "spam"}, {"key": "ham"}, {"key": "spam"}])
     assert repr(ds.groupby("key")) == f"GroupedData(dataset={ds!r}, key='key')"