[AIR] Improve preprocessor documentation #27215

Merged
Commits
88 commits
3a0525b
Improve `MaxAbsScaler` docstring
bveeramani Jul 21, 2022
37634d8
Appease lint
bveeramani Jul 21, 2022
2a30cdb
Improve `MinMaxScaler` docstring
bveeramani Jul 21, 2022
a055dc6
Fix typo
bveeramani Jul 21, 2022
0dfbf0f
Improve `StandardScaler` docstring and remove `ddof` parameter
bveeramani Jul 21, 2022
0acf2b9
Remove see-also section
bveeramani Jul 21, 2022
45f09db
Improve `Normalizer` docstring
bveeramani Jul 21, 2022
78c7a26
Revert accidental commit
bveeramani Jul 21, 2022
6b8a845
Improve `RobustScaler` docstring
bveeramani Jul 21, 2022
2f6bbce
Remove whitespace
bveeramani Jul 21, 2022
81dbd5d
Improve `SimpleImputer` docstring
bveeramani Jul 22, 2022
b7fca15
Update docstring
bveeramani Jul 22, 2022
07dfa3f
[AIR] Improve `Tokenizer` docstring
bveeramani Jul 27, 2022
62e071b
[AIR] Improve `LabelEncoder` docstring
bveeramani Jul 28, 2022
e4c072c
Shorten sentence
bveeramani Jul 28, 2022
66287e2
Update encoder.py
bveeramani Jul 28, 2022
49de066
Merge remote-tracking branch 'upstream/master' into bveeramani/label-…
bveeramani Aug 3, 2022
cba88e7
Add power transform and encoder docs
bveeramani Aug 3, 2022
93f0aed
Update concatenator.py
bveeramani Aug 4, 2022
0b00893
Update encoder.py
bveeramani Aug 4, 2022
4ee8d3b
Update chain.py
bveeramani Aug 4, 2022
da37326
Update batch_mapper.py
bveeramani Aug 4, 2022
2074fec
Multihot
bveeramani Aug 4, 2022
5fe7937
Update vectorizer.py
bveeramani Aug 4, 2022
4cd6d92
Update python/ray/data/preprocessors/batch_mapper.py
bveeramani Aug 5, 2022
3507322
Update python/ray/data/preprocessors/encoder.py
bveeramani Aug 5, 2022
ae74ddf
Update python/ray/data/preprocessors/transformer.py
bveeramani Aug 5, 2022
acb6e01
Update python/ray/data/preprocessors/transformer.py
bveeramani Aug 5, 2022
d8f7c0f
Update python/ray/data/preprocessors/encoder.py
bveeramani Aug 5, 2022
c8581d4
Update python/ray/data/preprocessors/encoder.py
bveeramani Aug 5, 2022
28c495f
Update python/ray/data/preprocessors/encoder.py
bveeramani Aug 5, 2022
36901f1
Fix concatenator
bveeramani Aug 5, 2022
5f8a8cc
Update python/ray/data/preprocessors/encoder.py
bveeramani Aug 5, 2022
7a713d3
Fix indent
bveeramani Aug 5, 2022
d4b7ea5
Update encoder.py
bveeramani Aug 5, 2022
d94d727
Merge branch 'bveeramani/label-encoder-docstring' of https://github.c…
bveeramani Aug 5, 2022
23f2437
Update vectorizer.py
bveeramani Aug 5, 2022
a051c8b
Move examples
bveeramani Aug 5, 2022
3aabd44
Move examples
bveeramani Aug 5, 2022
eed0781
Update encoder.py
bveeramani Aug 5, 2022
ef39c2a
Update tokenizer.py
bveeramani Aug 5, 2022
b3a7cc9
Merge branch 'bveeramani/tokenizer-docstring' into bveeramani/label-e…
bveeramani Aug 5, 2022
e33edb2
Merge branch 'simplerimputer-doc' into bveeramani/label-encoder-docst…
bveeramani Aug 5, 2022
e13f86b
Update scaler.py
bveeramani Aug 5, 2022
3d50dd3
Merge branch 'robustscaler-doc' into bveeramani/label-encoder-docstring
bveeramani Aug 5, 2022
66d44b7
Update python/ray/data/preprocessors/normalizer.py
bveeramani Aug 5, 2022
29a88fc
Update python/ray/data/preprocessors/normalizer.py
bveeramani Aug 5, 2022
fdf2356
Update python/ray/data/preprocessors/normalizer.py
bveeramani Aug 5, 2022
e3fe45b
Update normalizer.py
bveeramani Aug 5, 2022
243a2d1
Update normalizer.py
bveeramani Aug 5, 2022
ec47f45
Merge branch 'noramlizer-doc' of https://github.com/bveeramani/ray in…
bveeramani Aug 5, 2022
dc321db
Update normalizer.py
bveeramani Aug 5, 2022
e27e04f
Merge branch 'noramlizer-doc' into bveeramani/label-encoder-docstring
bveeramani Aug 5, 2022
6983af6
Merge branch 'minmaxscaler-doc' into bveeramani/label-encoder-docstring
bveeramani Aug 5, 2022
62f9531
Update python/ray/data/preprocessors/scaler.py
bveeramani Aug 5, 2022
d2ccc3c
Update scaler.py
bveeramani Aug 5, 2022
8fee7f9
Update scaler.py
bveeramani Aug 5, 2022
0013e3f
Merge branch 'maxabsscaler-doc' into bveeramani/label-encoder-docstring
bveeramani Aug 5, 2022
92a47c5
Update scaler.py
bveeramani Aug 5, 2022
9fd4709
Merge branch 'standardscaler-doc' into bveeramani/label-encoder-docst…
bveeramani Aug 5, 2022
2f7bb3a
Update stuff
bveeramani Aug 5, 2022
18c37ad
Add toc
bveeramani Aug 5, 2022
5d12807
Fix doctests
bveeramani Aug 5, 2022
53d2a8b
Appease lint
bveeramani Aug 5, 2022
eba4e81
Merge remote-tracking branch 'upstream/master' into bveeramani/label-…
bveeramani Aug 5, 2022
79fa1c5
Fix stuff
bveeramani Aug 5, 2022
e628d87
Skip doctests
bveeramani Aug 5, 2022
d11c431
Update vectorizer.py
bveeramani Aug 5, 2022
801c4b5
Skip doctests
bveeramani Aug 5, 2022
17a3245
Initial commit
bveeramani Aug 7, 2022
6c82a1d
Update encoder.py
bveeramani Aug 8, 2022
f323b51
Update normalizer.py
bveeramani Aug 8, 2022
beecb73
Update scaler.py
bveeramani Aug 8, 2022
8af5a82
Update scaler.py
bveeramani Aug 8, 2022
1e5e8ee
Update scaler.py
bveeramani Aug 8, 2022
cd53a4a
Update vectorizer.py
bveeramani Aug 8, 2022
d913cc1
Update vectorizer.py
bveeramani Aug 8, 2022
c180903
Merge branch 'master' into bveeramani/label-encoder-docstring
bveeramani Aug 8, 2022
9e8544b
Merge remote-tracking branch 'upstream/master' into bveeramani/label-…
bveeramani Aug 8, 2022
0433797
Update hasher.py
bveeramani Aug 8, 2022
9e632ca
Rename sections
bveeramani Aug 8, 2022
05d1e8f
update-preprocessors
richardliaw Aug 9, 2022
fe2c9a3
update-starter-text
richardliaw Aug 9, 2022
9878fa4
Update guide
bveeramani Aug 10, 2022
52d36eb
Format `preprocessors.py`
bveeramani Aug 10, 2022
df0a42a
Fix broken reference
bveeramani Aug 10, 2022
39343c3
Merge branch 'bveeramani/preprocessor-guide' into bveeramani/label-en…
bveeramani Aug 10, 2022
a541ab0
Naming consistency
bveeramani Aug 10, 2022
70 changes: 66 additions & 4 deletions doc/source/ray-air/package-ref.rst
@@ -17,16 +17,78 @@ Preprocessor
.. autoclass:: ray.data.preprocessor.Preprocessor
:members:

Built-in Preprocessors
######################
General Preprocessors
#####################

.. automodule:: ray.data.preprocessors
:members:
.. autoclass:: ray.data.preprocessors.BatchMapper
:show-inheritance:

.. autoclass:: ray.data.preprocessors.Chain
:show-inheritance:

.. autoclass:: ray.data.preprocessors.Concatenator
:show-inheritance:

.. autoclass:: ray.data.preprocessors.SimpleImputer
:show-inheritance:

.. automethod:: ray.data.Dataset.train_test_split
:noindex:

Categorical Encoders
####################

.. autoclass:: ray.data.preprocessors.Categorizer
:show-inheritance:

.. autoclass:: ray.data.preprocessors.LabelEncoder
:show-inheritance:

.. autoclass:: ray.data.preprocessors.MultiHotEncoder
:show-inheritance:

.. autoclass:: ray.data.preprocessors.OneHotEncoder
:show-inheritance:

.. autoclass:: ray.data.preprocessors.OrdinalEncoder
:show-inheritance:

Feature Scalers
###############

.. autoclass:: ray.data.preprocessors.MaxAbsScaler
:show-inheritance:

.. autoclass:: ray.data.preprocessors.MinMaxScaler
:show-inheritance:

.. autoclass:: ray.data.preprocessors.Normalizer
:show-inheritance:

.. autoclass:: ray.data.preprocessors.PowerTransformer
:show-inheritance:

.. autoclass:: ray.data.preprocessors.RobustScaler
:show-inheritance:

.. autoclass:: ray.data.preprocessors.StandardScaler
:show-inheritance:

Text Encoders
#############

.. autoclass:: ray.data.preprocessors.CountVectorizer
:show-inheritance:

.. autoclass:: ray.data.preprocessors.FeatureHasher
:show-inheritance:

.. autoclass:: ray.data.preprocessors.HashingVectorizer
:show-inheritance:

.. autoclass:: ray.data.preprocessors.Tokenizer
:show-inheritance:

.. _air-abstract-trainer-ref:

Trainer
16 changes: 9 additions & 7 deletions python/ray/data/preprocessor.py
@@ -35,11 +35,12 @@ class Preprocessor(abc.ABC):

If you are implementing your own Preprocessor sub-class, you should override the
following:
* ``_fit`` - if your preprocessor is stateful. Otherwise, set
``_is_fittable=False``.
* ``_transform_pandas`` and/or ``_transform_arrow`` - for best performance,
implement both. Otherwise, the data will be converted to the match the
implemented method.

* ``_fit`` if your preprocessor is stateful. Otherwise, set
``_is_fittable=False``.
* ``_transform_pandas`` and/or ``_transform_arrow`` for best performance,
implement both. Otherwise, the data will be converted to match the
implemented method.
"""

class FitStatus(str, Enum):
@@ -129,7 +130,7 @@ def transform(self, dataset: Dataset) -> Dataset:
ray.data.Dataset: The transformed Dataset.

Raises:
PreprocessorNotFittedException, if ``fit`` is not called yet.
PreprocessorNotFittedException: if ``fit`` is not called yet.
"""
fit_status = self.fit_status()
if fit_status in (
@@ -154,7 +155,8 @@ def transform_batch(self, df: "DataBatchType") -> "DataBatchType":
df: Input data batch.

Returns:
DataBatchType: The transformed data batch. This may differ
DataBatchType:
The transformed data batch. This may differ
from the input type depending on which ``_transform_*`` method(s)
are implemented.
"""
36 changes: 30 additions & 6 deletions python/ray/data/preprocessors/batch_mapper.py
@@ -7,14 +7,38 @@


class BatchMapper(Preprocessor):
"""Apply ``fn`` to batches of records of given dataset.

This is meant to be generic and supports low level operation on records.
One could easily leverage this preprocessor to achieve operations like
adding a new column or modifying a column in place.
"""Apply an arbitrary operation to a dataset.

:class:`BatchMapper` applies a user-defined function to batches of a dataset. A
batch is a Pandas ``DataFrame`` that represents a small amount of data. By modifying
batches instead of individual records, this class can efficiently transform a
dataset with vectorized operations.

Use this preprocessor to apply stateless operations that aren't already built-in.

.. tip::
:class:`BatchMapper` doesn't need to be fit. You can call
``transform`` without calling ``fit``.

Examples:
Use :class:`BatchMapper` to apply arbitrary operations like dropping a column.

>>> import pandas as pd
>>> import ray
>>> from ray.data.preprocessors import BatchMapper
>>>
>>> df = pd.DataFrame({"X": [0, 1, 2], "Y": [3, 4, 5]})
>>> ds = ray.data.from_pandas(df) # doctest: +SKIP
>>>
>>> def fn(batch: pd.DataFrame) -> pd.DataFrame:
... return batch.drop("Y", axis="columns")
Comment on lines +33 to +34:
Contributor: nit; ideally we choose an example that is actually not supported by Ray
Member Author (bveeramani): How do you drop a column without BatchMapper?
>>>
>>> preprocessor = BatchMapper(fn)
>>> preprocessor.transform(ds) # doctest: +SKIP
Dataset(num_blocks=1, num_rows=3, schema={X: int64})

Args:
fn: The udf function for batch operation.
fn: The function to apply to data batches.
"""

_is_fittable = False
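The drop-a-column example in the docstring above boils down to mapping a vectorized function over DataFrame batches. A pandas-only sketch of that per-batch behavior follows; ``map_batches`` here is a hypothetical helper for illustration, while the real ``BatchMapper`` applies ``fn`` to Ray Dataset blocks:

```python
# Pandas-only sketch of BatchMapper's behavior: split the data into
# DataFrame chunks and apply the user function to each chunk.
# Hypothetical helper; the real class operates on Ray Dataset blocks.
from typing import Callable, List

import pandas as pd


def map_batches(
    df: pd.DataFrame,
    fn: Callable[[pd.DataFrame], pd.DataFrame],
    batch_size: int = 2,
) -> pd.DataFrame:
    batches: List[pd.DataFrame] = [
        df.iloc[i : i + batch_size] for i in range(0, len(df), batch_size)
    ]
    # fn sees whole batches, so vectorized pandas operations apply.
    return pd.concat([fn(batch) for batch in batches], ignore_index=True)


df = pd.DataFrame({"X": [0, 1, 2], "Y": [3, 4, 5]})
result = map_batches(df, lambda batch: batch.drop("Y", axis="columns"))
print(list(result.columns))  # the Y column is gone
```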
32 changes: 27 additions & 5 deletions python/ray/data/preprocessors/chain.py
@@ -7,14 +7,36 @@


class Chain(Preprocessor):
"""Chain multiple Preprocessors into a single Preprocessor.
"""Combine multiple preprocessors into a single :py:class:`Preprocessor`.

Calling ``fit`` will invoke ``fit_transform`` on the input preprocessors,
so that one preprocessor can ``fit`` based on columns/values produced by
the ``transform`` of a preceding preprocessor.
When you call ``fit``, each preprocessor is fit on the dataset produced by the
preceding preprocessor's ``fit_transform``.

Example:
>>> import pandas as pd
>>> import ray
>>> from ray.data.preprocessors import *
>>>
>>> df = pd.DataFrame({
... "X0": [0, 1, 2],
... "X1": [3, 4, 5],
... "Y": ["orange", "blue", "orange"],
... })
>>> ds = ray.data.from_pandas(df) # doctest: +SKIP
>>>
>>> preprocessor = Chain(
... StandardScaler(columns=["X0", "X1"]),
... Concatenator(include=["X0", "X1"], output_column_name="X"),
... LabelEncoder(label_column="Y")
... )
>>> preprocessor.fit_transform(ds).to_pandas() # doctest: +SKIP
Y X
0 1 [-1.224744871391589, -1.224744871391589]
1 0 [0.0, 0.0]
2 1 [1.224744871391589, 1.224744871391589]

Args:
preprocessors: The preprocessors that should be executed sequentially.
preprocessors: The preprocessors to sequentially compose.
"""

def fit_status(self):
114 changes: 86 additions & 28 deletions python/ray/data/preprocessors/concatenator.py
@@ -6,43 +6,101 @@


class Concatenator(Preprocessor):
"""Creates a tensor column via concatenation.
"""Combine numeric columns into a column of type
:class:`~ray.air.util.tensor_extensions.pandas.TensorDtype`.

A tensor column is a column consisting of ndarrays as elements.
The tensor column will be generated from the provided list
of columns and will take on the provided "output" label.
Columns that are included in the concatenation
will be dropped, while columns that are not included in concatenation
will be preserved.
This preprocessor concatenates numeric columns and stores the result in a new
column. The new column contains
:class:`~ray.air.util.tensor_extensions.pandas.TensorArrayElement` objects of
shape :math:`(m,)`, where :math:`m` is the number of columns concatenated.
The :math:`m` concatenated columns are dropped after concatenation.

Review comment:
Contributor: seems like TensorArrayElement is not a documented class?
Member Author (bveeramani): It's not. If we add TensorArrayElement to the data reference in a future PR, this link will work.

Example:
>>> import ray
Examples:
>>> import numpy as np
>>> import pandas as pd
>>> import ray
>>> from ray.data.preprocessors import Concatenator
>>> df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4],})

:py:class:`Concatenator` combines numeric columns into a column of
:py:class:`~ray.air.util.tensor_extensions.pandas.TensorDtype`.

>>> df = pd.DataFrame({"X0": [0, 3, 1], "X1": [0.5, 0.2, 0.9]})
>>> ds = ray.data.from_pandas(df) # doctest: +SKIP
>>> concatenator = Concatenator()
>>> concatenator.fit_transform(ds).to_pandas() # doctest: +SKIP
concat_out
0 [0.0, 0.5]
1 [3.0, 0.2]
2 [1.0, 0.9]

By default, the created column is called `"concat_out"`, but you can specify
a different name.

>>> concatenator = Concatenator(output_column_name="tensor")
>>> concatenator.fit_transform(ds).to_pandas() # doctest: +SKIP
tensor
0 [0.0, 0.5]
1 [3.0, 0.2]
2 [1.0, 0.9]

Sometimes, you might not want to concatenate all of the columns in your
dataset. In this case, you can exclude columns with the ``exclude`` parameter.

>>> df = pd.DataFrame({"X0": [0, 3, 1], "X1": [0.5, 0.2, 0.9], "Y": ["blue", "orange", "blue"]})
>>> ds = ray.data.from_pandas(df) # doctest: +SKIP
>>> prep = Concatenator(output_column_name="c") # doctest: +SKIP
>>> new_ds = prep.transform(ds) # doctest: +SKIP
>>> assert set(new_ds.take(1)[0]) == {"c"} # doctest: +SKIP
>>> concatenator = Concatenator(exclude=["Y"])
>>> concatenator.fit_transform(ds).to_pandas() # doctest: +SKIP
Y concat_out
0 blue [0.0, 0.5]
1 orange [3.0, 0.2]
2 blue [1.0, 0.9]

Alternatively, you can specify which columns to concatenate with the
``include`` parameter.

>>> concatenator = Concatenator(include=["X0", "X1"])
>>> concatenator.fit_transform(ds).to_pandas() # doctest: +SKIP
Y concat_out
0 blue [0.0, 0.5]
1 orange [3.0, 0.2]
2 blue [1.0, 0.9]

Note that if a column is in both ``include`` and ``exclude``, the column is
excluded.

>>> concatenator = Concatenator(include=["X0", "X1", "Y"], exclude=["Y"])
>>> concatenator.fit_transform(ds).to_pandas() # doctest: +SKIP
Y concat_out
0 blue [0.0, 0.5]
1 orange [3.0, 0.2]
2 blue [1.0, 0.9]

By default, the concatenated tensor has a ``dtype`` common to the input columns.
However, you can also explicitly set the ``dtype`` with the ``dtype``
parameter.

>>> concatenator = Concatenator(include=["X0", "X1"], dtype=np.float32)
>>> concatenator.fit_transform(ds) # doctest: +SKIP
Dataset(num_blocks=1, num_rows=3, schema={Y: object, concat_out: TensorDtype(shape=(2,), dtype=float32)})

Args:
output_column_name: output_column_name is a string that represents the
name of the outputted, concatenated tensor column. Defaults to
"concat_out".
include: A list of column names to be included for
concatenation. If None, then all columns will be included.
Included columns will be dropped after concatenation.
exclude: List of column names to be excluded
from concatenation. Exclude takes precedence over include.
dtype: Optional. The dtype to convert the output column array to.
raise_if_missing: Optional. If True, an error will be raised if any
of the columns to in 'include' or 'exclude' are
not present in the dataset schema.
output_column_name: The desired name for the new column.
Defaults to ``"concat_out"``.
include: A list of columns to concatenate. If ``None``, all columns are
concatenated.
exclude: A list of columns to exclude from concatenation.
If a column is in both ``include`` and ``exclude``, the column is excluded
from concatenation.
dtype: The ``dtype`` to convert the output tensors to. If unspecified,
the ``dtype`` is determined by standard coercion rules.
raise_if_missing: If ``True``, an error is raised if any
of the columns in ``include`` or ``exclude`` don't exist.
Defaults to ``False``.

Raises:
ValueError if `raise_if_missing=True` and any column name in
`include` or `exclude` does not exist in the dataset columns.
"""
ValueError: if `raise_if_missing` is `True` and a column in `include` or
`exclude` doesn't exist in the dataset.
""" # noqa: E501

_is_fittable = False
