From bb82168f37e4de37e766d895bcfa6296fb9290a1 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Mon, 19 Sep 2022 13:40:46 -0700 Subject: [PATCH] Rename variables and update comments to be generalized --- .../guides/5_digital_fingerprinting.md | 52 ++++++++++++++++++- .../morpheus/dfp/utils/column_info.py | 6 +-- 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/docs/source/developer_guide/guides/5_digital_fingerprinting.md b/docs/source/developer_guide/guides/5_digital_fingerprinting.md index 277debe57c..75cef6c5b7 100644 --- a/docs/source/developer_guide/guides/5_digital_fingerprinting.md +++ b/docs/source/developer_guide/guides/5_digital_fingerprinting.md @@ -298,7 +298,57 @@ field = BoolColumn(name="result", false_values=["DENIED", "CANCELED", "EXPIRED"]) ``` -We used strings in this example, however we also could have just as easily mapped integer status codes. +We used strings in this example, however we also could have just as easily mapped integer status codes. We also have the ability to map onto types other than boolean by providing custom values for true and false (e.g. `1`/`0`, `yes`/`no`). + +| Argument | Type | Description | +| -------- | ---- | ----------- | +| `name` | `str` | Name of the destination column | +| `dtype` | `str` or Python type | Typically this should be `bool`; however, it could potentially be another type if `true_value` and `false_value` are specified. | +| `input_name` | `str` | Original column name | +| `true_value` | Any | Optional value to store for true values, should be of type `dtype`. Defaults to `True`. | +| `false_value` | Any | Optional value to store for false values, should be of type `dtype`. Defaults to `False`. | +| `true_values` | `List[str]` | List of string values to be interpreted as true. | +| `false_values` | `List[str]` | List of string values to be interpreted as false. | + +##### DateTimeColumn +Subclass of `RenameColumn` specific to casting UTC localized datetime values. 
When incoming values contain a time-zone offset string, the values are converted to UTC, while values without a time zone are assumed to be UTC. + +| Argument | Type | Description | +| -------- | ---- | ----------- | +| `name` | `str` | Name of the destination column | +| `dtype` | `str` or Python type | Any type string or Python class recognized by [Pandas](https://pandas.pydata.org/docs/user_guide/basics.html#dtypes) | +| `input_name` | `str` | Original column name | + +##### StringJoinColumn +Subclass of `RenameColumn`, converts incoming `list` values to a string by joining the elements with `sep`. + +| Argument | Type | Description | +| -------- | ---- | ----------- | +| `name` | `str` | Name of the destination column | +| `dtype` | `str` or Python type | Any type string or Python class recognized by [Pandas](https://pandas.pydata.org/docs/user_guide/basics.html#dtypes) | +| `input_name` | `str` | Original column name | +| `sep` | `str` | Separator string to use for the join | + +##### StringCatColumn +Concatenate values from multiple columns into a new string column separated by `sep`. + +| Argument | Type | Description | +| -------- | ---- | ----------- | +| `name` | `str` | Name of the destination column | +| `dtype` | `str` or Python type | Any type string or Python class recognized by [Pandas](https://pandas.pydata.org/docs/user_guide/basics.html#dtypes) | +| `input_columns` | `List[str]` | List of columns to concatenate | +| `sep` | `str` | Separator string | + +##### IncrementColumn +Subclass of `DateTimeColumn`, counts the unique occurrences of a value in `groupby_column` over a specific time window `period` based on dates in the `input_name` field. 
+ +| Argument | Type | Description | +| -------- | ---- | ----------- | +| `name` | `str` | Name of the destination column | +| `dtype` | `str` or Python type | Should be `int` or other integer class | +| `input_name` | `str` | Original column name containing timestamp values | +| `groupby_column` | `str` | Column name to group by | +| `period` | `str` | Optional time period to perform the calculation over; the value must be [one of pandas' offset strings](https://pandas.pydata.org/docs/user_guide/timeseries.html#timeseries-offset-aliases). Defaults to `D` (one day). | ### Output Stages ![Output Stages](img/dfp_output_config.png) diff --git a/examples/digital_fingerprinting/production/morpheus/dfp/utils/column_info.py b/examples/digital_fingerprinting/production/morpheus/dfp/utils/column_info.py index ca707af664..ab093885e2 100644 --- a/examples/digital_fingerprinting/production/morpheus/dfp/utils/column_info.py +++ b/examples/digital_fingerprinting/production/morpheus/dfp/utils/column_info.py @@ -159,10 +159,10 @@ class IncrementColumn(DateTimeColumn): period: str = "D" def process_column(self, df: pd.DataFrame) -> pd.Series: - per_day = super().process_column(df).dt.to_period(self.period) + period = super().process_column(df).dt.to_period(self.period) - # Create the per-user, per-day log count - return df.groupby([self.groupby_column, per_day]).cumcount() + # Create the `groupby_column`, per-period log count + return df.groupby([self.groupby_column, period]).cumcount() @dataclasses.dataclass