From 0415c4f77a5f84e7bb6738dd7bace00496231798 Mon Sep 17 00:00:00 2001 From: tveten Date: Mon, 19 Aug 2024 16:14:30 +0200 Subject: [PATCH 01/75] Update dependencies --- pyproject.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index bfc24b29..71f105a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,10 +34,10 @@ classifiers = [ ] requires-python = ">=3.9,<3.13" dependencies = [ - "numpy<1.27,>=1.21", # required for framework layer and base class logic - "pandas<2.2.0,>=1.3", # pandas is the main in-memory data container + "numpy>=1.21", + "pandas>=1.1", "numba>=0.56", # numba is used for fast computation throughout - "sktime>=0.23.0,<0.30.0", + "sktime>=0.30", ] [project.urls] From 6987a38a52da84e71ec32297981af78ea0f6a924 Mon Sep 17 00:00:00 2001 From: tveten Date: Mon, 19 Aug 2024 16:17:43 +0200 Subject: [PATCH 02/75] Add first draft of BaseDetector --- skchange/base.py | 569 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 569 insertions(+) create mode 100644 skchange/base.py diff --git a/skchange/base.py b/skchange/base.py new file mode 100644 index 00000000..78bddf09 --- /dev/null +++ b/skchange/base.py @@ -0,0 +1,569 @@ +"""Detector base class. + + class name: BaseSeriesAnnotator + +Scitype defining methods: + fitting - fit(self, X, Y=None) + detecting - predict(self, X) + updating (temporal) - update(self, X, Y=None) + update&detect - update_predict(self, X) + +Inspection methods: + hyper-parameter inspection - get_params() + fitted parameter inspection - get_fitted_params() + +State: + fitted model/strategy - by convention, any attributes ending in "_" + fitted state flag - check_is_fitted() + +""" + +__author__ = ["mtveten"] +__all__ = ["BaseDetector"] + +from sktime.base import BaseEstimator +from sktime.utils.validation.series import check_series + + +class BaseDetector(BaseEstimator): + """Base detector. + + An alternative implementation to the BaseSeriesAnnotator class from sktime, + more focused on the detection of events of interest. + Enables quicker bug fixes for example, since the annotation module is still + experimental. + + All detectors share the common feature that each element of the output from .predict + indicates the detection of a specific event of interest, such as an anomaly, a + changepoint, or something else. + + Needs to be implemented: + - _fit(self, X, Y=None) -> self + - _predict(self, X) -> pd.Series or pd.DataFrame + + Optional to implement: + - _transform_scores(self, X) -> pd.Series or pd.DataFrame + - _update(self, X, Y=None) -> self + + Required .predict output formats per task and capability: + - task == "anomaly_detection": + pd.Series(anomaly_indices, dtype=int, name="anomalies) + - task == "collective_anomaly_detection": + pd.Series(pd.IntervalIndex( + anomaly_intervals, closed=, name="collective_anomalies" + )) + - task == "change_point_detection": + Changepoints are defined as the last element of a segment. + pd.Series(changepoint_indices, dtype=int, name="changepoints") + - task == "segmentation": + Difference from change point detection: Allows the same label to be assigned to + multiple segments. + pd.Series({ + index = pd.IntervalIndex(segment_intervals, closed=), + values = segment_labels, + }) + - task == "None": + Custom task. + Only restriction is that the output must be a pd.Series or pd.DataFrame where + each element or row corresponds to a detected event. + For .transform to work, .sparse_to_dense must be implemented for custom tasks. 
    - capability:subset_detection is True:
        * task == "anomaly_detection":
            pd.DataFrame({
                "location": anomaly_indices,
                "columns": affected_components_list,
            })
        * task == "collective_anomaly_detection":
            pd.DataFrame({
                "location": pd.IntervalIndex(anomaly_intervals, closed=),
                "columns": affected_components_list,
            })
        * task == "change_point_detection":
            pd.DataFrame({
                "location": changepoint_indices,
                "columns": affected_components_list,
            })
    - capability:detection_score is True: Explicit way of stating that
      _transform_scores is implemented.
    """

    _tags = {
        "object_type": "detector",  # type of object
        "learning_type": "None",  # Tag to determine test in test_all_annotators
        "task": "None",  # Tag to determine test in test_all_annotators
        #
        # todo: distribution_type? we may have to refactor this, seems very specific
        "distribution_type": "None",  # Tag to determine test in test_all_annotators
    }  # for unit test cases

    def __init__(self):
        self.task = self.get_class_tag("task")
        self.learning_type = self.get_class_tag("learning_type")

        self._is_fitted = False

        self._X = None
        self._Y = None

        super().__init__()

    def _fit(self, X, Y=None):
        """Fit to training data.

        core logic

        Parameters
        ----------
        X : pd.DataFrame
            Training data to fit model to (time series).
        Y : pd.Series, optional
            Ground truth annotations for training if annotator is supervised.

        Returns
        -------
        self :
            Reference to self.

        Notes
        -----
        Updates fitted model that updates attributes ending in "_".
        """
        raise NotImplementedError("abstract method")

    def _predict(self, X):
        """Create annotations on test/deployment data.

        core logic

        Parameters
        ----------
        X : pd.DataFrame
            Data to annotate, time series.

        Returns
        -------
        Y : pd.Series
            Annotations for sequence X; exact format depends on annotation type.
        """
        raise NotImplementedError("abstract method")

    def _transform_scores(self, X):
        """Return scores for predicted annotations on test/deployment data.

        core logic

        Parameters
        ----------
        X : pd.DataFrame
            Data to annotate, time series.

        Returns
        -------
        Y : pd.Series
            One score for each element in X.
            Annotations for sequence X; exact format depends on annotation type.
        """
        raise NotImplementedError("abstract method")

    def _update(self, X, Y=None):
        """Update model with new data and optional ground truth annotations.

        core logic

        Parameters
        ----------
        X : pd.DataFrame
            Training data to update model with (time series).
        Y : pd.Series, optional
            Ground truth annotations for training if annotator is supervised.

        Returns
        -------
        self :
            Reference to self.

        Notes
        -----
        Updates fitted model that updates attributes ending in "_".
        """
        # default/fallback: re-fit to all data
        self._fit(self._X, self._Y)

        return self

    def fit(self, X, Y=None):
        """Fit to training data.

        Parameters
        ----------
        X : pd.DataFrame
            Training data to fit model to (time series).
        Y : pd.Series, optional
            Ground truth annotations for training if annotator is supervised.

        Returns
        -------
        self :
            Reference to self.

        Notes
        -----
        Creates fitted model that updates attributes ending in "_". Sets
        _is_fitted flag to True.
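For concreteness, the predict output formats listed in the class docstring can be constructed by hand as follows. All values are illustrative, and the elided closed= arguments are filled in only as an assumption:

import pandas as pd

# task == "anomaly_detection": integer locations of anomalous points.
anomalies = pd.Series([5, 17, 42], dtype=int, name="anomalies")

# task == "collective_anomaly_detection": one interval per anomalous segment.
collective_anomalies = pd.Series(
    pd.IntervalIndex.from_tuples([(10, 19), (30, 34)], closed="both"),
    name="collective_anomalies",
)

# task == "change_point_detection": last index of each segment but the final one.
changepoints = pd.Series([24, 49], dtype=int, name="changepoints")

# task == "segmentation": interval index with possibly repeating segment labels.
segmentation = pd.Series(
    [0, 1, 0],
    index=pd.IntervalIndex.from_breaks([0, 25, 50, 75], closed="left"),
)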
+ """ + X = check_series(X) + + if Y is not None: + Y = check_series(Y) + + self._X = X + self._Y = Y + + # fkiraly: insert checks/conversions here, after PR #1012 I suggest + + self._fit(X=X, Y=Y) + + # this should happen last + self._is_fitted = True + + return self + + def predict(self, X): + """Create annotations on test/deployment data. + + Parameters + ---------- + X : pd.DataFrame + Data to annotate (time series). + + Returns + ------- + Y : pd.Series + Annotations for sequence X exact format depends on annotation type. + """ + self.check_is_fitted() + + X = check_series(X) + + # fkiraly: insert checks/conversions here, after PR #1012 I suggest + + Y = self._predict(X=X) + + return Y + + def transform(self, X): + """Create annotations on test/deployment data. + + Parameters + ---------- + X : pd.DataFrame + Data to annotate (time series). + + Returns + ------- + Y : pd.Series + Annotations for sequence X. The returned annotations will be in the dense + format. + """ + Y = self.predict(X) + return self.sparse_to_dense(Y, X.index) + + def sparse_to_dense(self, y_sparse, index): + """Convert the sparse output from an annotator to a dense format. + + Parameters + ---------- + y_sparse : pd.Series + The sparse output from an annotator's predict method. The format of the + series depends on the task and capability of the annotator. + index : array-like + Indices that are to be annotated according to ``y_sparse``. + + Returns + ------- + pd.Series + """ + if self.get_class_tag("capability:subset_detection"): + y_sparse = y_sparse["location"] + + if self.task == "segmentation": + return self.sparse_to_dense_segmentation(y_sparse, index) + elif self.task == "change_point_detection": + return self.sparse_to_dense_change_points(y_sparse, index) + elif self.task == "anomaly_detection": + return self.sparse_to_dense_anomalies(y_sparse, index) + elif self.task == "collective_anomaly_detection": + return self.sparse_to_dense_collective_anomalies(y_sparse, index) + else: + # Overwrite sparse_to_dense for custom tasks. + raise NotImplementedError( + f"sparse_to_dense not implemented for task='{self.task}'" + ) + + @staticmethod + def sparse_to_dense_segmentation(y_sparse, index): + """Convert the output from a segmentation annotator to a dense format.""" + + @staticmethod + def sparse_to_dense_change_points(y_sparse, index): + """Convert the output from a change point detector to a dense format.""" + + @staticmethod + def sparse_to_dense_anomalies(y_sparse, index): + """Convert the output from an anomaly detector to a dense format.""" + + @staticmethod + def sparse_to_dense_collective_anomalies(y_sparse, index): + """Convert the output from a collective anomaly detector to a dense format.""" + + def transform_scores(self, X): + """Return scores for predicted annotations on test/deployment data. + + Parameters + ---------- + X : pd.DataFrame + Data to annotate (time series). + + Returns + ------- + Y : pd.Series + Scores for sequence X exact format depends on annotation type. + """ + self.check_is_fitted() + X = check_series(X) + return self._transform_scores(X) + + def update(self, X, Y=None): + """Update model with new data and optional ground truth annotations. + + Parameters + ---------- + X : pd.DataFrame + Training data to update model with (time series). + Y : pd.Series, optional + Ground truth annotations for training if annotator is supervised. + + Returns + ------- + self : + Reference to self. + + Notes + ----- + Updates fitted model that updates attributes ending in "_". 
+ """ + self.check_is_fitted() + + X = check_series(X) + + if Y is not None: + Y = check_series(Y) + + self._X = X.combine_first(self._X) + + if Y is not None: + self._Y = Y.combine_first(self._Y) + + self._update(X=X, Y=Y) + + return self + + def update_predict(self, X): + """Update model with new data and create annotations for it. + + Parameters + ---------- + X : pd.DataFrame + Training data to update model with, time series. + + Returns + ------- + Y : pd.Series + Annotations for sequence X exact format depends on annotation type. + + Notes + ----- + Updates fitted model that updates attributes ending in "_". + """ + X = check_series(X) + + self.update(X=X) + Y = self.predict(X=X) + + return Y + + def fit_predict(self, X, Y=None): + """Fit to data, then predict it. + + Fits model to X and Y with given annotation parameters + and returns the annotations made by the model. + + Parameters + ---------- + X : pd.DataFrame, pd.Series or np.ndarray + Data to be transformed + Y : pd.Series or np.ndarray, optional (default=None) + Target values of data to be predicted. + + Returns + ------- + self : pd.Series + Annotations for sequence X exact format depends on annotation type. + """ + # Non-optimized default implementation; override when a better + # method is possible for a given algorithm. + return self.fit(X, Y).predict(X) + + def fit_transform(self, X, Y=None): + """Fit to data, then transform it. + + Fits model to X and Y with given annotation parameters + and returns the annotations made by the model. + + Parameters + ---------- + X : pd.DataFrame, pd.Series or np.ndarray + Data to be transformed + Y : pd.Series or np.ndarray, optional (default=None) + Target values of data to be predicted. + + Returns + ------- + self : pd.Series + Annotations for sequence X exact format depends on annotation type. + """ + Y = self.fit_predict(X) + return self.sparse_to_dense(Y, index=X.index) + + # def predict_segments(self, X): + # """Predict segments on test/deployment data. + + # Parameters + # ---------- + # X : pd.DataFrame + # Data to annotate, time series. + + # Returns + # ------- + # Y : pd.Series + # A series with an index of intervals. Each interval is the range of a + # segment and the corresponding value is the label of the segment. + # """ + # self.check_is_fitted() + # X = check_series(X) + + # predict_output = self.predict(X) + # if self.get_class_tag("capability:subset_detection"): + # predict_output = predict_output["location"] + + # if self.task == "segmentation": + # return predict_output + # elif self.task == "change_point_detection": + # return self.change_points_to_segments( + # predict_output, start=X.index.min(), end=X.index.max() + # ) + # elif self.task == "anomaly_detection": + # return self.point_anomalies_to_segments( + # predict_output, start=X.index.min(), end=X.index.max() + # ) + # elif self.task == "collective_anomaly_detection": + # return self.collective_anomalies_to_segments( + # predict_output, start=X.index.min(), end=X.index.max() + # ) + + # def predict_points(self, X): + # """Predict changepoints/anomalies on test/deployment data. + + # Parameters + # ---------- + # X : pd.DataFrame + # Data to annotate, time series. + + # Returns + # ------- + # Y : pd.Series + # A series whose values are the changepoints/anomalies in X. 
+ # """ + # self.check_is_fitted() + # X = check_series(X) + + # predict_output = self.predict(X) + # if self.get_class_tag("capability:subset_detection"): + # predict_output = predict_output["location"] + + # if self.task == "anomaly_detection" or self.task == "change_point_detection": + # return predict_output + # elif self.task == "collective_anomaly_detection": + # # TODO Add support. Turn collective anomalies into point anomalies. + # return self.collective_anomalies_to_point_anomalies(predict_output) + # elif self.task == "segmentation": + # return self.segments_to_change_points(predict_output) + + # @staticmethod + # def point_anomalies_to_segments(self, anomalies, start, end): + # # TODO Add support. 0 = normal, 1, ..., K = anomaly. + # pass + + # @staticmethod + # def collective_anomalies_to_segments(self, collective_anomalies, start, end): + # # TODO Add support. 0 = normal, 1, ..., K = anomaly. + # pass + + # @staticmethod + # def collective_anomalies_to_point_anomalies(self, collective_anomalies): + # pass + + # @staticmethod + # def change_points_to_segments(y_sparse, start, end): + # """Convert a series of change point indexes to segments. + + # Parameters + # ---------- + # y_sparse : pd.Series + # A series containing the indexes of change points. + # start : optional + # Starting point of the first segment. + # end : optional + # Ending point of the last segment + + # Returns + # ------- + # pd.Series + # A series with an interval index indicating the start and end points of the + # segments. The values of the series are the labels of the segments. + + # Examples + # -------- + # >>> import pandas as pd + # >>> from sktime.annotation.base._base import BaseSeriesAnnotator + # >>> change_points = pd.Series([1, 2, 5]) + # >>> BaseSeriesAnnotator.change_points_to_segments(change_points, 0, 7) + # [0, 1) -1 + # [1, 2) 1 + # [2, 5) 2 + # [5, 7) 3 + # dtype: int64 + # """ + # breaks = y_sparse.values + + # if start > breaks.min(): + # raise ValueError( + # "The starting index must be before the first change point." + # ) + # first_change_point = breaks.min() + + # if start is not None: + # breaks = np.insert(breaks, 0, start) + # if end is not None: + # breaks = np.append(breaks, end) + + # index = pd.IntervalIndex.from_breaks(breaks, copy=True, closed="left") + # segments = pd.Series(0, index=index) + + # in_range = index.left >= first_change_point + + # number_of_segments = in_range.sum() + # segments.loc[in_range] = range(1, number_of_segments + 1) + # segments.loc[~in_range] = -1 + + # return segments From fe87c360835ae9ccd7d6109b398f3281a2fe9555 Mon Sep 17 00:00:00 2001 From: tveten Date: Tue, 20 Aug 2024 08:12:06 +0200 Subject: [PATCH 03/75] Allow index names in dataframe inputs --- skchange/base.py | 69 +++++------------------------------------------- 1 file changed, 7 insertions(+), 62 deletions(-) diff --git a/skchange/base.py b/skchange/base.py index 78bddf09..b10682b8 100644 --- a/skchange/base.py +++ b/skchange/base.py @@ -211,10 +211,10 @@ def fit(self, X, Y=None): Creates fitted model that updates attributes ending in "_". Sets _is_fitted flag to True. 
""" - X = check_series(X) + X = check_series(X, allow_index_names=True) if Y is not None: - Y = check_series(Y) + Y = check_series(Y, allow_index_names=True) self._X = X self._Y = Y @@ -243,7 +243,7 @@ def predict(self, X): """ self.check_is_fitted() - X = check_series(X) + X = check_series(X, allow_index_names=True) # fkiraly: insert checks/conversions here, after PR #1012 I suggest @@ -330,7 +330,7 @@ def transform_scores(self, X): Scores for sequence X exact format depends on annotation type. """ self.check_is_fitted() - X = check_series(X) + X = check_series(X, allow_index_names=True) return self._transform_scores(X) def update(self, X, Y=None): @@ -354,10 +354,10 @@ def update(self, X, Y=None): """ self.check_is_fitted() - X = check_series(X) + X = check_series(X, allow_index_names=True) if Y is not None: - Y = check_series(Y) + Y = check_series(Y, allow_index_names=True) self._X = X.combine_first(self._X) @@ -385,7 +385,7 @@ def update_predict(self, X): ----- Updates fitted model that updates attributes ending in "_". """ - X = check_series(X) + X = check_series(X, allow_index_names=True) self.update(X=X) Y = self.predict(X=X) @@ -512,58 +512,3 @@ def fit_transform(self, X, Y=None): # @staticmethod # def collective_anomalies_to_point_anomalies(self, collective_anomalies): # pass - - # @staticmethod - # def change_points_to_segments(y_sparse, start, end): - # """Convert a series of change point indexes to segments. - - # Parameters - # ---------- - # y_sparse : pd.Series - # A series containing the indexes of change points. - # start : optional - # Starting point of the first segment. - # end : optional - # Ending point of the last segment - - # Returns - # ------- - # pd.Series - # A series with an interval index indicating the start and end points of the - # segments. The values of the series are the labels of the segments. - - # Examples - # -------- - # >>> import pandas as pd - # >>> from sktime.annotation.base._base import BaseSeriesAnnotator - # >>> change_points = pd.Series([1, 2, 5]) - # >>> BaseSeriesAnnotator.change_points_to_segments(change_points, 0, 7) - # [0, 1) -1 - # [1, 2) 1 - # [2, 5) 2 - # [5, 7) 3 - # dtype: int64 - # """ - # breaks = y_sparse.values - - # if start > breaks.min(): - # raise ValueError( - # "The starting index must be before the first change point." - # ) - # first_change_point = breaks.min() - - # if start is not None: - # breaks = np.insert(breaks, 0, start) - # if end is not None: - # breaks = np.append(breaks, end) - - # index = pd.IntervalIndex.from_breaks(breaks, copy=True, closed="left") - # segments = pd.Series(0, index=index) - - # in_range = index.left >= first_change_point - - # number_of_segments = in_range.sum() - # segments.loc[in_range] = range(1, number_of_segments + 1) - # segments.loc[~in_range] = -1 - - # return segments From e78b7663b31c5fc5f0ea78f3bd8bf13e9f8319e7 Mon Sep 17 00:00:00 2001 From: tveten Date: Tue, 20 Aug 2024 08:12:40 +0200 Subject: [PATCH 04/75] Remove old commented code --- skchange/base.py | 78 ------------------------------------------------ 1 file changed, 78 deletions(-) diff --git a/skchange/base.py b/skchange/base.py index b10682b8..d86ab95b 100644 --- a/skchange/base.py +++ b/skchange/base.py @@ -434,81 +434,3 @@ def fit_transform(self, X, Y=None): """ Y = self.fit_predict(X) return self.sparse_to_dense(Y, index=X.index) - - # def predict_segments(self, X): - # """Predict segments on test/deployment data. - - # Parameters - # ---------- - # X : pd.DataFrame - # Data to annotate, time series. 
- - # Returns - # ------- - # Y : pd.Series - # A series with an index of intervals. Each interval is the range of a - # segment and the corresponding value is the label of the segment. - # """ - # self.check_is_fitted() - # X = check_series(X) - - # predict_output = self.predict(X) - # if self.get_class_tag("capability:subset_detection"): - # predict_output = predict_output["location"] - - # if self.task == "segmentation": - # return predict_output - # elif self.task == "change_point_detection": - # return self.change_points_to_segments( - # predict_output, start=X.index.min(), end=X.index.max() - # ) - # elif self.task == "anomaly_detection": - # return self.point_anomalies_to_segments( - # predict_output, start=X.index.min(), end=X.index.max() - # ) - # elif self.task == "collective_anomaly_detection": - # return self.collective_anomalies_to_segments( - # predict_output, start=X.index.min(), end=X.index.max() - # ) - - # def predict_points(self, X): - # """Predict changepoints/anomalies on test/deployment data. - - # Parameters - # ---------- - # X : pd.DataFrame - # Data to annotate, time series. - - # Returns - # ------- - # Y : pd.Series - # A series whose values are the changepoints/anomalies in X. - # """ - # self.check_is_fitted() - # X = check_series(X) - - # predict_output = self.predict(X) - # if self.get_class_tag("capability:subset_detection"): - # predict_output = predict_output["location"] - - # if self.task == "anomaly_detection" or self.task == "change_point_detection": - # return predict_output - # elif self.task == "collective_anomaly_detection": - # # TODO Add support. Turn collective anomalies into point anomalies. - # return self.collective_anomalies_to_point_anomalies(predict_output) - # elif self.task == "segmentation": - # return self.segments_to_change_points(predict_output) - - # @staticmethod - # def point_anomalies_to_segments(self, anomalies, start, end): - # # TODO Add support. 0 = normal, 1, ..., K = anomaly. - # pass - - # @staticmethod - # def collective_anomalies_to_segments(self, collective_anomalies, start, end): - # # TODO Add support. 0 = normal, 1, ..., K = anomaly. - # pass - - # @staticmethod - # def collective_anomalies_to_point_anomalies(self, collective_anomalies): - # pass From 11a6a91302e15ee04b30c1799cb5de2f9eb08609 Mon Sep 17 00:00:00 2001 From: tveten Date: Tue, 20 Aug 2024 16:26:13 +0200 Subject: [PATCH 05/75] Rewrite BaseDetector such that subclasses define detection types Separates functionality regarding different detectors more clearly. Less clutter in BaseDetector. --- skchange/base.py | 613 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 450 insertions(+), 163 deletions(-) diff --git a/skchange/base.py b/skchange/base.py index d86ab95b..15bd135b 100644 --- a/skchange/base.py +++ b/skchange/base.py @@ -1,12 +1,27 @@ """Detector base class. - class name: BaseSeriesAnnotator + class name: BaseDetector + + Adapted from the sktime.BaseSeriesAnnotator class. 
Scitype defining methods: - fitting - fit(self, X, Y=None) - detecting - predict(self, X) - updating (temporal) - update(self, X, Y=None) - update&detect - update_predict(self, X) + fitting - fit(self, X, Y=None) + detecting, sparse format - predict(self, X) + detecting, dense format - transform(self, X) + detection scores, dense - score_transform(self, X) + updating (temporal) - update(self, X, Y=None) + +Each detector type (subclass of BaseDetector in skchange, task + learning_type tags of +BaseSeriesAnnotator in sktime) is defined by the content and format of the output of the +predict method. Each detector type therefore needs the following methods for converting +between sparse and dense output formats: + sparse_to_dense - sparse_to_dense(y_sparse, index) + dense_to_sparse - dense_to_sparse(y_dense) + +Convenience methods: + update&detect - update_predict(self, X) + fit&detect - fit_predict(self, X, Y=None) + fit&transform - fit_transform(self, X, Y=None) Inspection methods: hyper-parameter inspection - get_params() @@ -15,12 +30,13 @@ class name: BaseSeriesAnnotator State: fitted model/strategy - by convention, any attributes ending in "_" fitted state flag - check_is_fitted() - """ __author__ = ["mtveten"] __all__ = ["BaseDetector"] +import numpy as np +import pandas as pd from sktime.base import BaseEstimator from sktime.utils.validation.series import check_series @@ -30,8 +46,7 @@ class BaseDetector(BaseEstimator): An alternative implementation to the BaseSeriesAnnotator class from sktime, more focused on the detection of events of interest. - Enables quicker bug fixes for example, since the annotation module is still - experimental. + Safer for now since the annotation module is still experimental. All detectors share the common feature that each element of the output from .predict indicates the detection of a specific event of interest, such as an anomaly, a @@ -40,51 +55,13 @@ class BaseDetector(BaseEstimator): Needs to be implemented: - _fit(self, X, Y=None) -> self - _predict(self, X) -> pd.Series or pd.DataFrame + - sparse_to_dense(y_sparse, index) -> pd.Series or pd.DataFrame Optional to implement: - - _transform_scores(self, X) -> pd.Series or pd.DataFrame + - dense_to_sparse(y_dense) -> pd.Series or pd.DataFrame + - _score_transform(self, X) -> pd.Series or pd.DataFrame - _update(self, X, Y=None) -> self - Required .predict output formats per task and capability: - - task == "anomaly_detection": - pd.Series(anomaly_indices, dtype=int, name="anomalies) - - task == "collective_anomaly_detection": - pd.Series(pd.IntervalIndex( - anomaly_intervals, closed=, name="collective_anomalies" - )) - - task == "change_point_detection": - Changepoints are defined as the last element of a segment. - pd.Series(changepoint_indices, dtype=int, name="changepoints") - - task == "segmentation": - Difference from change point detection: Allows the same label to be assigned to - multiple segments. - pd.Series({ - index = pd.IntervalIndex(segment_intervals, closed=), - values = segment_labels, - }) - - task == "None": - Custom task. - Only restriction is that the output must be a pd.Series or pd.DataFrame where - each element or row corresponds to a detected event. - For .transform to work, .sparse_to_dense must be implemented for custom tasks. 
- - capability:subset_detection is True: - * task == "anomaly_detection": - pd.DataFrame({ - "location": anomaly_indices, - "columns": affected_components_list, - }) - * task == "collective_anomaly_detection": - pd.DataFrame({ - "location": pd.IntervalIndex(anomaly_intervals, closed=), - "columns": affected_components_list, - }) - * task == "change_point_detection": - pd.DataFrame({ - "location": changepoint_indices, - "columns": affected_components_list, - }) - - capability:detection_score is True: Explicit way of stating that _transform_scores - is implemented. """ _tags = { @@ -107,15 +84,13 @@ def __init__(self): super().__init__() - def _fit(self, X, Y=None): + def fit(self, X, Y=None): """Fit to training data. - core logic - Parameters ---------- X : pd.DataFrame - Training data to fit model to time series. + Training data to fit model to (time series). Y : pd.Series, optional Ground truth annotations for training if annotator is supervised. @@ -126,54 +101,35 @@ def _fit(self, X, Y=None): Notes ----- - Updates fitted model that updates attributes ending in "_". + Creates fitted model that updates attributes ending in "_". Sets + _is_fitted flag to True. """ - raise NotImplementedError("abstract method") - - def _predict(self, X): - """Create annotations on test/deployment data. - - core logic + X = check_series(X, allow_index_names=True) - Parameters - ---------- - X : pd.DataFrame - Data to annotate, time series. + if Y is not None: + Y = check_series(Y, allow_index_names=True) - Returns - ------- - Y : pd.Series - Annotations for sequence X exact format depends on annotation type. - """ - raise NotImplementedError("abstract method") + self._X = X + self._Y = Y - def _transform_scores(self, X): - """Return scores for predicted annotations on test/deployment data. + # fkiraly: insert checks/conversions here, after PR #1012 I suggest - core logic + self._fit(X=X, Y=Y) - Parameters - ---------- - X : pd.DataFrame - Data to annotate, time series. + # this should happen last + self._is_fitted = True - Returns - ------- - Y : pd.Series - One score for each element in X. - Annotations for sequence X exact format depends on annotation type. - """ - raise NotImplementedError("abstract method") + return self - def _update(self, X, Y=None): - """Update model with new data and optional ground truth annotations. + def _fit(self, X, Y=None): + """Fit to training data. core logic Parameters ---------- X : pd.DataFrame - Training data to update model with time series + Training data to fit model to time series. Y : pd.Series, optional Ground truth annotations for training if annotator is supervised. @@ -186,95 +142,75 @@ def _update(self, X, Y=None): ----- Updates fitted model that updates attributes ending in "_". """ - # default/fallback: re-fit to all data - self._fit(self._X, self._Y) - - return self + raise NotImplementedError("abstract method") - def fit(self, X, Y=None): - """Fit to training data. + def predict(self, X): + """Detect events in test/deployment data. Parameters ---------- X : pd.DataFrame - Training data to fit model to (time series). - Y : pd.Series, optional - Ground truth annotations for training if annotator is supervised. + Data to detect events in (time series). Returns ------- - self : - Reference to self. - - Notes - ----- - Creates fitted model that updates attributes ending in "_". Sets - _is_fitted flag to True. + Y : pd.Series or pd.DataFrame + Each element or row corresponds to a detected event. Exact format depends on + the specific detector type. 
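The sparse/dense distinction in practice, for a fitted concrete detector det (placeholder name; the outputs in the comments are made up):

import pandas as pd

X = pd.DataFrame({"value": [0.1, 0.2, 9.5, 0.1, 8.7]})
det.predict(X)    # sparse: one entry per detected event, e.g. pd.Series([2, 4])
det.transform(X)  # dense: one entry per sample, e.g. pd.Series([0, 0, 1, 0, 1])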
""" - X = check_series(X, allow_index_names=True) - - if Y is not None: - Y = check_series(Y, allow_index_names=True) + self.check_is_fitted() - self._X = X - self._Y = Y + X = check_series(X, allow_index_names=True) # fkiraly: insert checks/conversions here, after PR #1012 I suggest - self._fit(X=X, Y=Y) - - # this should happen last - self._is_fitted = True + Y = self._predict(X=X) - return self + return Y - def predict(self, X): + def _predict(self, X): """Create annotations on test/deployment data. + core logic + Parameters ---------- X : pd.DataFrame - Data to annotate (time series). + Data to annotate, time series. Returns ------- Y : pd.Series Annotations for sequence X exact format depends on annotation type. """ - self.check_is_fitted() - - X = check_series(X, allow_index_names=True) - - # fkiraly: insert checks/conversions here, after PR #1012 I suggest - - Y = self._predict(X=X) - - return Y + raise NotImplementedError("abstract method") def transform(self, X): - """Create annotations on test/deployment data. + """Detect events and return the result in a dense format. Parameters ---------- X : pd.DataFrame - Data to annotate (time series). + Data to detect events in (time series). Returns ------- - Y : pd.Series - Annotations for sequence X. The returned annotations will be in the dense - format. + Y : pd.Series or pd.DataFrame + Detections for sequence X. The returned detections will be in the dense + format, meaning that each element in X will be annotated according to the + detection results in some meaningful way depending on the detector type. """ Y = self.predict(X) return self.sparse_to_dense(Y, X.index) - def sparse_to_dense(self, y_sparse, index): - """Convert the sparse output from an annotator to a dense format. + @staticmethod + def sparse_to_dense(y_sparse, index): + """Convert the sparse output from a detector to a dense format. Parameters ---------- y_sparse : pd.Series - The sparse output from an annotator's predict method. The format of the + The sparse output from a detector's predict method. The format of the series depends on the task and capability of the annotator. index : array-like Indices that are to be annotated according to ``y_sparse``. @@ -283,40 +219,25 @@ def sparse_to_dense(self, y_sparse, index): ------- pd.Series """ - if self.get_class_tag("capability:subset_detection"): - y_sparse = y_sparse["location"] - - if self.task == "segmentation": - return self.sparse_to_dense_segmentation(y_sparse, index) - elif self.task == "change_point_detection": - return self.sparse_to_dense_change_points(y_sparse, index) - elif self.task == "anomaly_detection": - return self.sparse_to_dense_anomalies(y_sparse, index) - elif self.task == "collective_anomaly_detection": - return self.sparse_to_dense_collective_anomalies(y_sparse, index) - else: - # Overwrite sparse_to_dense for custom tasks. - raise NotImplementedError( - f"sparse_to_dense not implemented for task='{self.task}'" - ) - - @staticmethod - def sparse_to_dense_segmentation(y_sparse, index): - """Convert the output from a segmentation annotator to a dense format.""" + raise NotImplementedError("abstract method") @staticmethod - def sparse_to_dense_change_points(y_sparse, index): - """Convert the output from a change point detector to a dense format.""" + def dense_to_sparse(y_dense): + """Convert the dense output from a detector to a sparse format. 
- @staticmethod - def sparse_to_dense_anomalies(y_sparse, index): - """Convert the output from an anomaly detector to a dense format.""" + Parameters + ---------- + y_dense : pd.Series + The dense output from a detector's transform method. The format of the + series depends on the task and capability of the annotator. - @staticmethod - def sparse_to_dense_collective_anomalies(y_sparse, index): - """Convert the output from a collective anomaly detector to a dense format.""" + Returns + ------- + pd.Series + """ + raise NotImplementedError("abstract method") - def transform_scores(self, X): + def score_transform(self, X): """Return scores for predicted annotations on test/deployment data. Parameters @@ -331,7 +252,25 @@ def transform_scores(self, X): """ self.check_is_fitted() X = check_series(X, allow_index_names=True) - return self._transform_scores(X) + return self._score_transform(X) + + def _score_transform(self, X): + """Return scores for predicted annotations on test/deployment data. + + core logic + + Parameters + ---------- + X : pd.DataFrame + Data to annotate, time series. + + Returns + ------- + Y : pd.Series + One score for each element in X. + Annotations for sequence X exact format depends on annotation type. + """ + raise NotImplementedError("abstract method") def update(self, X, Y=None): """Update model with new data and optional ground truth annotations. @@ -368,6 +307,32 @@ def update(self, X, Y=None): return self + def _update(self, X, Y=None): + """Update model with new data and optional ground truth annotations. + + core logic + + Parameters + ---------- + X : pd.DataFrame + Training data to update model with time series + Y : pd.Series, optional + Ground truth annotations for training if annotator is supervised. + + Returns + ------- + self : + Reference to self. + + Notes + ----- + Updates fitted model that updates attributes ending in "_". + """ + # default/fallback: re-fit to all data + self._fit(self._X, self._Y) + + return self + def update_predict(self, X): """Update model with new data and create annotations for it. @@ -434,3 +399,325 @@ def fit_transform(self, X, Y=None): """ Y = self.fit_predict(X) return self.sparse_to_dense(Y, index=X.index) + + +# Required .predict output formats per task and capability: +# - task == "anomaly_detection": +# pd.Series(anomaly_indices, dtype=int, name="anomalies) +# - task == "collective_anomaly_detection": +# pd.Series(pd.IntervalIndex( +# anomaly_intervals, closed=, name="collective_anomalies" +# )) +# - task == "change_point_detection": +# Changepoints are defined as the last element of a segment. +# pd.Series(changepoint_indices, dtype=int, name="changepoints") +# - task == "segmentation": +# Difference from change point detection: Allows the same label to be assigned to +# multiple segments. +# pd.Series({ +# index = pd.IntervalIndex(segment_intervals, closed=), +# values = segment_labels, +# }) +# - task == "None": +# Custom task. +# Only restriction is that the output must be a pd.Series or pd.DataFrame where +# each element or row corresponds to a detected event. +# For .transform to work, .sparse_to_dense must be implemented for custom tasks. 
+# - capability:subset_detection is True: +# * task == "anomaly_detection": +# pd.DataFrame({ +# "location": anomaly_indices, +# "columns": affected_components_list, +# }) +# * task == "collective_anomaly_detection": +# pd.DataFrame({ +# "location": pd.IntervalIndex(anomaly_intervals, closed=), +# "columns": affected_components_list, +# }) +# * task == "change_point_detection": +# pd.DataFrame({ +# "location": changepoint_indices, +# "columns": affected_components_list, +# }) +# - capability:detection_score is True: Explicit way of stating that _score_transform +# is implemented. + + +class PointAnomalyDetector(BaseDetector): + """Base class for anomaly detectors. + + Anomaly detectors detect individual data points that are considered anomalous. + + Output format of the predict method: + pd.Series(anomaly_indices, dtype=int, name="anomaly") + + Subclasses should set the following tags for sktime compatibility: + - task: "anomaly_detection" + - learning_type: "unsupervised" or "supervised" + - And possibly other tags, such as + * "capability:missing_values": False, + * "capability:multivariate": True, + * "fit_is_empty": False, + + + Needs to be implemented: + - _fit(self, X, Y=None) -> self + - _predict(self, X) -> pd.Series + + Optional to implement: + - _score_transform(self, X) -> pd.Series + - _update(self, X, Y=None) -> self + """ + + @staticmethod + def sparse_to_dense(y_sparse: pd.Series, index: pd.Index): + """Convert the sparse output from the predict method to a dense format. + + Parameters + ---------- + y_sparse : pd.Series + The sparse output from an anomaly detector's predict method. + index : array-like + Indices that are to be annotated according to ``y_sparse``. + + Returns + ------- + pd.Series where 0-entries are normal and 1-entries are anomalous. + """ + y_dense = pd.Series(0, index=index, name="anomaly", dtype="int64") + y_dense.iloc[y_sparse.values] = 1 + return y_dense + + @staticmethod + def dense_to_sparse(y_dense: pd.Series): + """Convert the dense output from the transform method to a sparse format. + + Parameters + ---------- + y_dense : pd.Series + The dense output from an anomaly detector's transform method. + 0-entries are normal and 1-entries are anomalous. + + Returns + ------- + pd.Series of the integer locations of the anomalous data points. + + Notes + ----- + The output from the predict method is expected to be in this format. + """ + y_dense = y_dense.reset_index(drop=True) + y_sparse = y_dense.iloc[y_dense.values == 1].index + return pd.Series(y_sparse) + + +class CollectiveAnomalyDetector(BaseDetector): + """Base class for collective anomaly detectors. + + Collective anomaly detectors detect segments of data points that are considered + anomalous. + + Output format of the predict method: See the dense_to_sparse method. + + Subclasses should set the following tags for sktime compatibility: + - task: "collective_anomaly_detection" + - learning_type: "unsupervised" or "supervised" + - And possibly other tags, such as + * "capability:missing_values": False, + * "capability:multivariate": True, + * "fit_is_empty": False, + + Needs to be implemented: + - _fit(self, X, Y=None) -> self + - _predict(self, X) -> pd.Series + + Optional to implement: + - _score_transform(self, X) -> pd.Series + - _update(self, X, Y=None) -> self + """ + + @staticmethod + def sparse_to_dense(y_sparse: pd.arrays.IntervalArray, index: pd.Index): + """Convert the sparse output from the predict method to a dense format. 
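A worked example of the point-anomaly converters, which invert each other. At this point in the series the class lives in skchange.base; a later patch moves it to skchange.anomaly_detectors.base:

import pandas as pd

from skchange.base import PointAnomalyDetector

index = pd.RangeIndex(8)
y_sparse = pd.Series([2, 5])  # predict-style output: integer locations

y_dense = PointAnomalyDetector.sparse_to_dense(y_sparse, index)
# -> pd.Series([0, 0, 1, 0, 0, 1, 0, 0], name="anomaly")

PointAnomalyDetector.dense_to_sparse(y_dense)
# -> pd.Series([2, 5]) again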
+ + Parameters + ---------- + y_sparse : pd.arrays.IntervalArray + The sparse output from a collective anomaly detector's predict method. + index : array-like + Indices that are to be annotated according to ``y_sparse``. + + Returns + ------- + pd.Series + """ + y_dense = pd.IntervalIndex(y_sparse).get_indexer(index) + # get_indexer return values 0, 1, 2, ... for values inside each intervals. + y_dense.loc[y_dense >= 0] = 1 + # get_indexer returns -1 for values outside any interval + y_dense.loc[y_dense < 0] = 0 + return y_dense + + @staticmethod + def dense_to_sparse(y_dense: pd.Series): + """Convert the dense output from the transform method to a sparse format. + + Parameters + ---------- + y_dense : pd.Series + The dense output from a collective anomaly detector's transform method: + A binary series where 0-entries are normal and 1-entries are anomalous. + + Returns + ------- + pd.arrays.IntervalArray containing the collective anomaly intervals. + + Notes + ----- + The output from the predict method is expected to be in this format. + """ + y_dense = y_dense.reset_index(drop=True) + y_anomaly = y_dense.loc[y_dense.values == 1] + anomaly_locations_diff = y_anomaly.index.diff() + + first_anomaly_start = y_anomaly.index[:1].to_numpy() + anomaly_starts = y_anomaly.index[anomaly_locations_diff > 1] + anomaly_starts = np.insert(anomaly_starts, 0, first_anomaly_start) + + last_anomaly_end = y_anomaly.index[-1:].to_numpy() + anomaly_ends = y_anomaly.index[np.roll(anomaly_locations_diff > 1, -1)] + anomaly_ends = np.insert(anomaly_ends, len(anomaly_ends), last_anomaly_end) + + y_sparse = pd.arrays.IntervalArray.from_arrays( + anomaly_starts, anomaly_ends, closed="both" + ) + return y_sparse + + +class ChangepointDetector(BaseDetector): + """Base class for changepoint detectors. + + Changepoint detectors detect the point in time where a change in the data occurs. + A changepoint is defined as the index of the last element before a change. + + Output format of the predict method: + pd.Series(changepoint_indices, dtype=int, name="changepoint") + + Subclasses should set the following tags for sktime compatibility: + - task: "change_point_detection" + - learning_type: "unsupervised" or "supervised" + - And possibly other tags, such as + * "capability:missing_values": False, + * "capability:multivariate": True, + * "fit_is_empty": False, + + Needs to be implemented: + - _fit(self, X, Y=None) -> self + - _predict(self, X) -> pd.Series + + Optional to implement: + - _score_transform(self, X) -> pd.Series + - _update(self, X, Y=None) -> self + """ + + @staticmethod + def sparse_to_dense(y_sparse: pd.Series, index: pd.Index): + """Convert the sparse output from the predict method to a dense format. + + Parameters + ---------- + y_sparse : pd.Series + The sparse output from a changepoint detector's predict method. + index : array-like + Indices that are to be annotated according to ``y_sparse``. + + Returns + ------- + pd.Series + """ + # TODO: Use segment labels as dense output or changepoint indicator? + # Segment labels probably more useful. 
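For the collective case, dense_to_sparse turns each run of anomalous entries into one interval, closed on both ends. A sketch of the intended behaviour (class location as of this patch):

import pandas as pd

from skchange.base import CollectiveAnomalyDetector

y_dense = pd.Series([0, 0, 1, 1, 1, 0, 1])
CollectiveAnomalyDetector.dense_to_sparse(y_dense)
# -> the two intervals [2, 4] and [6, 6]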
+ # y_dense = pd.Series(0, index=index, name="changepoint", dtype="int64") + # y_dense.iloc[y_sparse.values] = 1 + changepoints = y_sparse.to_list() + n = len(index) + changepoints = [-1] + changepoints + [n - 1] + segment_labels = np.zeros(n) + for i in range(len(changepoints) - 1): + segment_labels[changepoints[i] + 1 : changepoints[i + 1] + 1] = i + + y_dense = pd.Series( + segment_labels, index=index, name="segment_label", dtype="int64" + ) + return y_dense + + @staticmethod + def dense_to_sparse(y_dense: pd.Series): + """Convert the dense output from the transform method to a sparse format. + + Parameters + ---------- + y_dense : pd.Series + The dense output from a changepoint detector's transform method. + + Returns + ------- + pd.Series + """ + # TODO: Use segment labels as dense output or changepoint indicator? + # Segment labels probably more useful. + # y_dense = y_dense.reset_index(drop=True) + # y_sparse = y_dense.iloc[y_dense.values == 1].index + y_dense = y_dense.reset_index(drop=True) + is_changepoint = np.roll(y_dense.diff() > 0, -1) # changepoint = end of segment + changepoints = y_dense.index[is_changepoint] + y_sparse = pd.Series(changepoints, name="changepoint", dtype="int64") + return y_sparse + + +class SubsetCollectiveAnomalyDetector(BaseDetector): + """Base class for subset collective anomaly detectors. + + Subset collective anomaly detectors detect segments of multivariate time series data + that are considered anomalous, and also provide information on which components of + the data are affected. + + Output format of the predict method: + pd.DataFrame({ + "location": pd.IntervalIndex(anomaly_intervals, closed=), + "columns": affected_components_list, + }) + + Subclasses should set the following tags for sktime compatibility: + - task: "collective_anomaly_detection" + - learning_type: "unsupervised" or "supervised" + - capability:subset_detection: True + - And possibly other tags, such as + * "capability:missing_values": False, + * "capability:multivariate": True, + * "fit_is_empty": False, + + Needs to be implemented: + - _fit(self, X, Y=None) -> self + - _predict(self, X) -> pd.DataFrame + + Optional to implement: + - _score_transform(self, X) -> pd.Series + - _update(self, X, Y=None) -> self + """ + + @staticmethod + def sparse_to_dense(y_sparse, index): + """Convert the sparse output from the predict method to a dense format. + + Parameters + ---------- + y_sparse : pd.DataFrame + The sparse output from the predict method. + index : array-like + Indices that are to be annotated according to ``y_sparse``. + + Returns + ------- + pd.DataFrame + """ From 99cb1a38b349a1e05b9c3bddc76fd358143cd3fd Mon Sep 17 00:00:00 2001 From: tveten Date: Wed, 21 Aug 2024 08:18:38 +0200 Subject: [PATCH 06/75] Rephrasing module description --- skchange/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skchange/base.py b/skchange/base.py index 15bd135b..27076d68 100644 --- a/skchange/base.py +++ b/skchange/base.py @@ -13,7 +13,7 @@ class name: BaseDetector Each detector type (subclass of BaseDetector in skchange, task + learning_type tags of BaseSeriesAnnotator in sktime) is defined by the content and format of the output of the -predict method. Each detector type therefore needs the following methods for converting +predict method. 
Each detector type therefore has the following methods for converting between sparse and dense output formats: sparse_to_dense - sparse_to_dense(y_sparse, index) dense_to_sparse - dense_to_sparse(y_dense) From 56e5f2bf04a11e39d219b7abd16ec7df60fcec3b Mon Sep 17 00:00:00 2001 From: tveten Date: Wed, 21 Aug 2024 12:00:57 +0200 Subject: [PATCH 07/75] Use separate integer labels per collective anomaly as dense output --- skchange/base.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/skchange/base.py b/skchange/base.py index 27076d68..ca7669d3 100644 --- a/skchange/base.py +++ b/skchange/base.py @@ -56,12 +56,12 @@ class BaseDetector(BaseEstimator): - _fit(self, X, Y=None) -> self - _predict(self, X) -> pd.Series or pd.DataFrame - sparse_to_dense(y_sparse, index) -> pd.Series or pd.DataFrame + * Enables the transform method to work. Optional to implement: - dense_to_sparse(y_dense) -> pd.Series or pd.DataFrame - _score_transform(self, X) -> pd.Series or pd.DataFrame - _update(self, X, Y=None) -> self - """ _tags = { @@ -459,7 +459,6 @@ class PointAnomalyDetector(BaseDetector): * "capability:multivariate": True, * "fit_is_empty": False, - Needs to be implemented: - _fit(self, X, Y=None) -> self - _predict(self, X) -> pd.Series @@ -496,7 +495,7 @@ def dense_to_sparse(y_dense: pd.Series): ---------- y_dense : pd.Series The dense output from an anomaly detector's transform method. - 0-entries are normal and 1-entries are anomalous. + 0-entries are normal and >0-entries are anomalous. Returns ------- @@ -507,7 +506,7 @@ def dense_to_sparse(y_dense: pd.Series): The output from the predict method is expected to be in this format. """ y_dense = y_dense.reset_index(drop=True) - y_sparse = y_dense.iloc[y_dense.values == 1].index + y_sparse = y_dense.iloc[y_dense.values > 0].index return pd.Series(y_sparse) @@ -543,20 +542,22 @@ def sparse_to_dense(y_sparse: pd.arrays.IntervalArray, index: pd.Index): Parameters ---------- y_sparse : pd.arrays.IntervalArray - The sparse output from a collective anomaly detector's predict method. + The collective anomaly intervals. index : array-like Indices that are to be annotated according to ``y_sparse``. Returns ------- - pd.Series + pd.Series where 0-entries are normal and each collective anomaly are labelled + from 1, ..., K. """ y_dense = pd.IntervalIndex(y_sparse).get_indexer(index) - # get_indexer return values 0, 1, 2, ... for values inside each intervals. - y_dense.loc[y_dense >= 0] = 1 - # get_indexer returns -1 for values outside any interval - y_dense.loc[y_dense < 0] = 0 - return y_dense + # get_indexer return values 0 for the values inside the first interval, 1 to + # the values within the next interval and so on, and -1 for values outside any + # interval. The skchange convention is that 0 is normal and > 0 is anomalous, + # so we add 1 to the result. + y_dense += 1 + return pd.Series(y_dense, index=index, name="anomaly", dtype="int64") @staticmethod def dense_to_sparse(y_dense: pd.Series): @@ -566,7 +567,8 @@ def dense_to_sparse(y_dense: pd.Series): ---------- y_dense : pd.Series The dense output from a collective anomaly detector's transform method: - A binary series where 0-entries are normal and 1-entries are anomalous. + An integer series where 0-entries are normal and each collective anomaly + are labelled from 1, ..., K. Returns ------- @@ -577,7 +579,7 @@ def dense_to_sparse(y_dense: pd.Series): The output from the predict method is expected to be in this format. 
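The Notes on the sparse interval output can be exercised directly; the endpoints are vectorized attributes of the underlying IntervalArray:

import pandas as pd

y_sparse = pd.Series(
    pd.IntervalIndex.from_tuples([(10, 19), (30, 34)], closed="both")
)
y_sparse.array.left   # start points: [10, 30]
y_sparse.array.right  # end points:   [19, 34]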
""" y_dense = y_dense.reset_index(drop=True) - y_anomaly = y_dense.loc[y_dense.values == 1] + y_anomaly = y_dense.loc[y_dense.values > 0] anomaly_locations_diff = y_anomaly.index.diff() first_anomaly_start = y_anomaly.index[:1].to_numpy() From 1c9e707824eb5e1c195cc3a76ebba0c400bd2b4f Mon Sep 17 00:00:00 2001 From: tveten Date: Wed, 21 Aug 2024 12:03:20 +0200 Subject: [PATCH 08/75] Fix bug --- skchange/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/skchange/base.py b/skchange/base.py index ca7669d3..7e31e8f7 100644 --- a/skchange/base.py +++ b/skchange/base.py @@ -671,7 +671,8 @@ def dense_to_sparse(y_dense: pd.Series): # y_dense = y_dense.reset_index(drop=True) # y_sparse = y_dense.iloc[y_dense.values == 1].index y_dense = y_dense.reset_index(drop=True) - is_changepoint = np.roll(y_dense.diff() > 0, -1) # changepoint = end of segment + # changepoint = end of segment, so the label diffs > 0 must be shiftet by -1. + is_changepoint = np.roll(y_dense.diff().abs() > 0, -1) changepoints = y_dense.index[is_changepoint] y_sparse = pd.Series(changepoints, name="changepoint", dtype="int64") return y_sparse From 25413ba3f815e62547c2e9d96d3e9783d1dac511 Mon Sep 17 00:00:00 2001 From: tveten Date: Wed, 21 Aug 2024 12:28:01 +0200 Subject: [PATCH 09/75] Add expected output typing in subclasses --- skchange/base.py | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/skchange/base.py b/skchange/base.py index 7e31e8f7..797186d3 100644 --- a/skchange/base.py +++ b/skchange/base.py @@ -401,7 +401,8 @@ def fit_transform(self, X, Y=None): return self.sparse_to_dense(Y, index=X.index) -# Required .predict output formats per task and capability: +# Notes on required .predict output formats per detector type (task and capability): +# # - task == "anomaly_detection": # pd.Series(anomaly_indices, dtype=int, name="anomalies) # - task == "collective_anomaly_detection": @@ -448,8 +449,8 @@ class PointAnomalyDetector(BaseDetector): Anomaly detectors detect individual data points that are considered anomalous. - Output format of the predict method: - pd.Series(anomaly_indices, dtype=int, name="anomaly") + Output format of the predict method: See the dense_to_sparse method. + Output format of the transform method: See the sparse_to_dense method. Subclasses should set the following tags for sktime compatibility: - task: "anomaly_detection" @@ -469,7 +470,7 @@ class PointAnomalyDetector(BaseDetector): """ @staticmethod - def sparse_to_dense(y_sparse: pd.Series, index: pd.Index): + def sparse_to_dense(y_sparse: pd.Series, index: pd.Index) -> pd.Series[int]: """Convert the sparse output from the predict method to a dense format. Parameters @@ -488,7 +489,7 @@ def sparse_to_dense(y_sparse: pd.Series, index: pd.Index): return y_dense @staticmethod - def dense_to_sparse(y_dense: pd.Series): + def dense_to_sparse(y_dense: pd.Series) -> pd.Series[int]: """Convert the dense output from the transform method to a sparse format. Parameters @@ -500,10 +501,6 @@ def dense_to_sparse(y_dense: pd.Series): Returns ------- pd.Series of the integer locations of the anomalous data points. - - Notes - ----- - The output from the predict method is expected to be in this format. """ y_dense = y_dense.reset_index(drop=True) y_sparse = y_dense.iloc[y_dense.values > 0].index @@ -517,6 +514,7 @@ class CollectiveAnomalyDetector(BaseDetector): anomalous. Output format of the predict method: See the dense_to_sparse method. 
+ Output format of the transform method: See the sparse_to_dense method. Subclasses should set the following tags for sktime compatibility: - task: "collective_anomaly_detection" @@ -536,12 +534,14 @@ class CollectiveAnomalyDetector(BaseDetector): """ @staticmethod - def sparse_to_dense(y_sparse: pd.arrays.IntervalArray, index: pd.Index): + def sparse_to_dense( + y_sparse: pd.Series[pd.Interval], index: pd.Index + ) -> pd.Series[int]: """Convert the sparse output from the predict method to a dense format. Parameters ---------- - y_sparse : pd.arrays.IntervalArray + y_sparse : pd.Series[pd.Interval] The collective anomaly intervals. index : array-like Indices that are to be annotated according to ``y_sparse``. @@ -560,7 +560,7 @@ def sparse_to_dense(y_sparse: pd.arrays.IntervalArray, index: pd.Index): return pd.Series(y_dense, index=index, name="anomaly", dtype="int64") @staticmethod - def dense_to_sparse(y_dense: pd.Series): + def dense_to_sparse(y_dense: pd.Series) -> pd.Series[pd.Interval]: """Convert the dense output from the transform method to a sparse format. Parameters @@ -572,11 +572,12 @@ def dense_to_sparse(y_dense: pd.Series): Returns ------- - pd.arrays.IntervalArray containing the collective anomaly intervals. + pd.Series[pd.Interval] containing the collective anomaly intervals. Notes ----- - The output from the predict method is expected to be in this format. + The start and end points of the intervals can be accessed by + output.array.left and output.array.right, respectively. """ y_dense = y_dense.reset_index(drop=True) y_anomaly = y_dense.loc[y_dense.values > 0] @@ -590,8 +591,8 @@ def dense_to_sparse(y_dense: pd.Series): anomaly_ends = y_anomaly.index[np.roll(anomaly_locations_diff > 1, -1)] anomaly_ends = np.insert(anomaly_ends, len(anomaly_ends), last_anomaly_end) - y_sparse = pd.arrays.IntervalArray.from_arrays( - anomaly_starts, anomaly_ends, closed="both" + y_sparse = pd.Series( + pd.IntervalIndex.from_arrays(anomaly_starts, anomaly_ends, closed="both") ) return y_sparse @@ -602,8 +603,8 @@ class ChangepointDetector(BaseDetector): Changepoint detectors detect the point in time where a change in the data occurs. A changepoint is defined as the index of the last element before a change. - Output format of the predict method: - pd.Series(changepoint_indices, dtype=int, name="changepoint") + Output format of the predict method: See the dense_to_sparse method. + Output format of the transform method: See the sparse_to_dense method. Subclasses should set the following tags for sktime compatibility: - task: "change_point_detection" @@ -623,7 +624,7 @@ class ChangepointDetector(BaseDetector): """ @staticmethod - def sparse_to_dense(y_sparse: pd.Series, index: pd.Index): + def sparse_to_dense(y_sparse: pd.Series, index: pd.Index) -> pd.Series[int]: """Convert the sparse output from the predict method to a dense format. Parameters @@ -654,7 +655,7 @@ def sparse_to_dense(y_sparse: pd.Series, index: pd.Index): return y_dense @staticmethod - def dense_to_sparse(y_dense: pd.Series): + def dense_to_sparse(y_dense: pd.Series) -> pd.Series[int]: """Convert the dense output from the transform method to a sparse format. 
Parameters From 666f05c6e2cba4c44842f858511baec9b7159339 Mon Sep 17 00:00:00 2001 From: tveten Date: Wed, 21 Aug 2024 12:33:37 +0200 Subject: [PATCH 10/75] Improve documentation and remove old comments --- skchange/base.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/skchange/base.py b/skchange/base.py index 797186d3..824c83f2 100644 --- a/skchange/base.py +++ b/skchange/base.py @@ -600,8 +600,10 @@ def dense_to_sparse(y_dense: pd.Series) -> pd.Series[pd.Interval]: class ChangepointDetector(BaseDetector): """Base class for changepoint detectors. - Changepoint detectors detect the point in time where a change in the data occurs. - A changepoint is defined as the index of the last element before a change. + Changepoint detectors detect points in time where a change in the data occurs. + Data between two changepoints is a segment where the data is considered to be + homogeneous, i.e., of the same distribution. A changepoint is defined as the + location of the last element of a segment. Output format of the predict method: See the dense_to_sparse method. Output format of the transform method: See the sparse_to_dense method. @@ -638,10 +640,6 @@ def sparse_to_dense(y_sparse: pd.Series, index: pd.Index) -> pd.Series[int]: ------- pd.Series """ - # TODO: Use segment labels as dense output or changepoint indicator? - # Segment labels probably more useful. - # y_dense = pd.Series(0, index=index, name="changepoint", dtype="int64") - # y_dense.iloc[y_sparse.values] = 1 changepoints = y_sparse.to_list() n = len(index) changepoints = [-1] + changepoints + [n - 1] @@ -667,10 +665,6 @@ def dense_to_sparse(y_dense: pd.Series) -> pd.Series[int]: ------- pd.Series """ - # TODO: Use segment labels as dense output or changepoint indicator? - # Segment labels probably more useful. - # y_dense = y_dense.reset_index(drop=True) - # y_sparse = y_dense.iloc[y_dense.values == 1].index y_dense = y_dense.reset_index(drop=True) # changepoint = end of segment, so the label diffs > 0 must be shiftet by -1. is_changepoint = np.roll(y_dense.diff().abs() > 0, -1) From d0afbff015596d0b6c89d85a68147856e98398e6 Mon Sep 17 00:00:00 2001 From: tveten Date: Wed, 21 Aug 2024 15:52:08 +0200 Subject: [PATCH 11/75] Improve module description --- skchange/base.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/skchange/base.py b/skchange/base.py index 824c83f2..e7e9c48a 100644 --- a/skchange/base.py +++ b/skchange/base.py @@ -11,12 +11,13 @@ class name: BaseDetector detection scores, dense - score_transform(self, X) updating (temporal) - update(self, X, Y=None) -Each detector type (subclass of BaseDetector in skchange, task + learning_type tags of -BaseSeriesAnnotator in sktime) is defined by the content and format of the output of the -predict method. Each detector type therefore has the following methods for converting -between sparse and dense output formats: - sparse_to_dense - sparse_to_dense(y_sparse, index) - dense_to_sparse - dense_to_sparse(y_dense) +Each detector type (e.g. anomaly detector, collective anomaly detector, changepoint +detector) are subclasses of BaseDetector (task + learning_type tags in sktime). +They are defined by the content and format of the output of the predict method. 
Each +detector type therefore has the following methods for converting between sparse and +dense output formats: + converting sparse output to dense - sparse_to_dense(y_sparse, index) + converting dense output to sparse - dense_to_sparse(y_dense) Convenience methods: update&detect - update_predict(self, X) From afdd42a7fbd477ed6bca872ad46ae7d036d8881e Mon Sep 17 00:00:00 2001 From: tveten Date: Wed, 21 Aug 2024 16:06:39 +0200 Subject: [PATCH 12/75] Move anomaly and change detector base classes to respective modules --- skchange/anomaly_detectors/base.py | 207 +++++++++++++++++++++ skchange/base.py | 279 ----------------------------- skchange/change_detectors/base.py | 82 +++++++++ 3 files changed, 289 insertions(+), 279 deletions(-) create mode 100644 skchange/anomaly_detectors/base.py create mode 100644 skchange/change_detectors/base.py diff --git a/skchange/anomaly_detectors/base.py b/skchange/anomaly_detectors/base.py new file mode 100644 index 00000000..9c81dc54 --- /dev/null +++ b/skchange/anomaly_detectors/base.py @@ -0,0 +1,207 @@ +"""Base classes for anomaly detectors.""" + +import numpy as np +import pandas as pd + +from skchange.base import BaseDetector + + +class PointAnomalyDetector(BaseDetector): + """Base class for anomaly detectors. + + Anomaly detectors detect individual data points that are considered anomalous. + + Output format of the predict method: See the dense_to_sparse method. + Output format of the transform method: See the sparse_to_dense method. + + Subclasses should set the following tags for sktime compatibility: + - task: "anomaly_detection" + - learning_type: "unsupervised" or "supervised" + - And possibly other tags, such as + * "capability:missing_values": False, + * "capability:multivariate": True, + * "fit_is_empty": False, + + Needs to be implemented: + - _fit(self, X, Y=None) -> self + - _predict(self, X) -> pd.Series + + Optional to implement: + - _score_transform(self, X) -> pd.Series + - _update(self, X, Y=None) -> self + """ + + @staticmethod + def sparse_to_dense(y_sparse: pd.Series, index: pd.Index) -> pd.Series[int]: + """Convert the sparse output from the predict method to a dense format. + + Parameters + ---------- + y_sparse : pd.Series + The sparse output from an anomaly detector's predict method. + index : array-like + Indices that are to be annotated according to ``y_sparse``. + + Returns + ------- + pd.Series where 0-entries are normal and 1-entries are anomalous. + """ + y_dense = pd.Series(0, index=index, name="anomaly", dtype="int64") + y_dense.iloc[y_sparse.values] = 1 + return y_dense + + @staticmethod + def dense_to_sparse(y_dense: pd.Series) -> pd.Series[int]: + """Convert the dense output from the transform method to a sparse format. + + Parameters + ---------- + y_dense : pd.Series + The dense output from an anomaly detector's transform method. + 0-entries are normal and >0-entries are anomalous. + + Returns + ------- + pd.Series of the integer locations of the anomalous data points. + """ + y_dense = y_dense.reset_index(drop=True) + y_sparse = y_dense.iloc[y_dense.values > 0].index + return pd.Series(y_sparse) + + +class CollectiveAnomalyDetector(BaseDetector): + """Base class for collective anomaly detectors. + + Collective anomaly detectors detect segments of data points that are considered + anomalous. + + Output format of the predict method: See the dense_to_sparse method. + Output format of the transform method: See the sparse_to_dense method. 
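+
+    A minimal sketch of the correspondence between the two formats, with
+    hypothetical values (a single collective anomaly at positions 3-5 of a
+    length-8 series):
+
+    >>> import pandas as pd
+    >>> y_sparse = pd.Series(pd.IntervalIndex.from_tuples([(3, 5)], closed="both"))
+    >>> CollectiveAnomalyDetector.sparse_to_dense(y_sparse, pd.RangeIndex(8)).tolist()
+    [0, 0, 0, 1, 1, 1, 0, 0]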
+ + Subclasses should set the following tags for sktime compatibility: + - task: "collective_anomaly_detection" + - learning_type: "unsupervised" or "supervised" + - And possibly other tags, such as + * "capability:missing_values": False, + * "capability:multivariate": True, + * "fit_is_empty": False, + + Needs to be implemented: + - _fit(self, X, Y=None) -> self + - _predict(self, X) -> pd.Series + + Optional to implement: + - _score_transform(self, X) -> pd.Series + - _update(self, X, Y=None) -> self + """ + + @staticmethod + def sparse_to_dense( + y_sparse: pd.Series[pd.Interval], index: pd.Index + ) -> pd.Series[int]: + """Convert the sparse output from the predict method to a dense format. + + Parameters + ---------- + y_sparse : pd.Series[pd.Interval] + The collective anomaly intervals. + index : array-like + Indices that are to be annotated according to ``y_sparse``. + + Returns + ------- + pd.Series where 0-entries are normal and each collective anomaly are labelled + from 1, ..., K. + """ + y_dense = pd.IntervalIndex(y_sparse).get_indexer(index) + # get_indexer return values 0 for the values inside the first interval, 1 to + # the values within the next interval and so on, and -1 for values outside any + # interval. The skchange convention is that 0 is normal and > 0 is anomalous, + # so we add 1 to the result. + y_dense += 1 + return pd.Series(y_dense, index=index, name="anomaly", dtype="int64") + + @staticmethod + def dense_to_sparse(y_dense: pd.Series) -> pd.Series[pd.Interval]: + """Convert the dense output from the transform method to a sparse format. + + Parameters + ---------- + y_dense : pd.Series + The dense output from a collective anomaly detector's transform method: + An integer series where 0-entries are normal and each collective anomaly + are labelled from 1, ..., K. + + Returns + ------- + pd.Series[pd.Interval] containing the collective anomaly intervals. + + Notes + ----- + The start and end points of the intervals can be accessed by + output.array.left and output.array.right, respectively. + """ + y_dense = y_dense.reset_index(drop=True) + y_anomaly = y_dense.loc[y_dense.values > 0] + anomaly_locations_diff = y_anomaly.index.diff() + + first_anomaly_start = y_anomaly.index[:1].to_numpy() + anomaly_starts = y_anomaly.index[anomaly_locations_diff > 1] + anomaly_starts = np.insert(anomaly_starts, 0, first_anomaly_start) + + last_anomaly_end = y_anomaly.index[-1:].to_numpy() + anomaly_ends = y_anomaly.index[np.roll(anomaly_locations_diff > 1, -1)] + anomaly_ends = np.insert(anomaly_ends, len(anomaly_ends), last_anomaly_end) + + y_sparse = pd.Series( + pd.IntervalIndex.from_arrays(anomaly_starts, anomaly_ends, closed="both") + ) + return y_sparse + + +class SubsetCollectiveAnomalyDetector(BaseDetector): + """Base class for subset collective anomaly detectors. + + Subset collective anomaly detectors detect segments of multivariate time series data + that are considered anomalous, and also provide information on which components of + the data are affected. 
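+
+    For instance (purely illustrative values), a single anomaly spanning
+    positions 3-5 and affecting components 0 and 2 could be reported as
+    follows, in the format specified below:
+
+    >>> import pandas as pd
+    >>> pd.DataFrame({
+    ...     "location": pd.IntervalIndex.from_tuples([(3, 5)], closed="both"),
+    ...     "columns": [[0, 2]],
+    ... })  # doctest: +SKIP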
+ + Output format of the predict method: + pd.DataFrame({ + "location": pd.IntervalIndex(anomaly_intervals, closed=), + "columns": affected_components_list, + }) + + Subclasses should set the following tags for sktime compatibility: + - task: "collective_anomaly_detection" + - learning_type: "unsupervised" or "supervised" + - capability:subset_detection: True + - And possibly other tags, such as + * "capability:missing_values": False, + * "capability:multivariate": True, + * "fit_is_empty": False, + + Needs to be implemented: + - _fit(self, X, Y=None) -> self + - _predict(self, X) -> pd.DataFrame + + Optional to implement: + - _score_transform(self, X) -> pd.Series + - _update(self, X, Y=None) -> self + """ + + @staticmethod + def sparse_to_dense(y_sparse, index): + """Convert the sparse output from the predict method to a dense format. + + Parameters + ---------- + y_sparse : pd.DataFrame + The sparse output from the predict method. + index : array-like + Indices that are to be annotated according to ``y_sparse``. + + Returns + ------- + pd.DataFrame + """ diff --git a/skchange/base.py b/skchange/base.py index e7e9c48a..e634c661 100644 --- a/skchange/base.py +++ b/skchange/base.py @@ -36,8 +36,6 @@ class name: BaseDetector __author__ = ["mtveten"] __all__ = ["BaseDetector"] -import numpy as np -import pandas as pd from sktime.base import BaseEstimator from sktime.utils.validation.series import check_series @@ -443,280 +441,3 @@ def fit_transform(self, X, Y=None): # }) # - capability:detection_score is True: Explicit way of stating that _score_transform # is implemented. - - -class PointAnomalyDetector(BaseDetector): - """Base class for anomaly detectors. - - Anomaly detectors detect individual data points that are considered anomalous. - - Output format of the predict method: See the dense_to_sparse method. - Output format of the transform method: See the sparse_to_dense method. - - Subclasses should set the following tags for sktime compatibility: - - task: "anomaly_detection" - - learning_type: "unsupervised" or "supervised" - - And possibly other tags, such as - * "capability:missing_values": False, - * "capability:multivariate": True, - * "fit_is_empty": False, - - Needs to be implemented: - - _fit(self, X, Y=None) -> self - - _predict(self, X) -> pd.Series - - Optional to implement: - - _score_transform(self, X) -> pd.Series - - _update(self, X, Y=None) -> self - """ - - @staticmethod - def sparse_to_dense(y_sparse: pd.Series, index: pd.Index) -> pd.Series[int]: - """Convert the sparse output from the predict method to a dense format. - - Parameters - ---------- - y_sparse : pd.Series - The sparse output from an anomaly detector's predict method. - index : array-like - Indices that are to be annotated according to ``y_sparse``. - - Returns - ------- - pd.Series where 0-entries are normal and 1-entries are anomalous. - """ - y_dense = pd.Series(0, index=index, name="anomaly", dtype="int64") - y_dense.iloc[y_sparse.values] = 1 - return y_dense - - @staticmethod - def dense_to_sparse(y_dense: pd.Series) -> pd.Series[int]: - """Convert the dense output from the transform method to a sparse format. - - Parameters - ---------- - y_dense : pd.Series - The dense output from an anomaly detector's transform method. - 0-entries are normal and >0-entries are anomalous. - - Returns - ------- - pd.Series of the integer locations of the anomalous data points. 
- """ - y_dense = y_dense.reset_index(drop=True) - y_sparse = y_dense.iloc[y_dense.values > 0].index - return pd.Series(y_sparse) - - -class CollectiveAnomalyDetector(BaseDetector): - """Base class for collective anomaly detectors. - - Collective anomaly detectors detect segments of data points that are considered - anomalous. - - Output format of the predict method: See the dense_to_sparse method. - Output format of the transform method: See the sparse_to_dense method. - - Subclasses should set the following tags for sktime compatibility: - - task: "collective_anomaly_detection" - - learning_type: "unsupervised" or "supervised" - - And possibly other tags, such as - * "capability:missing_values": False, - * "capability:multivariate": True, - * "fit_is_empty": False, - - Needs to be implemented: - - _fit(self, X, Y=None) -> self - - _predict(self, X) -> pd.Series - - Optional to implement: - - _score_transform(self, X) -> pd.Series - - _update(self, X, Y=None) -> self - """ - - @staticmethod - def sparse_to_dense( - y_sparse: pd.Series[pd.Interval], index: pd.Index - ) -> pd.Series[int]: - """Convert the sparse output from the predict method to a dense format. - - Parameters - ---------- - y_sparse : pd.Series[pd.Interval] - The collective anomaly intervals. - index : array-like - Indices that are to be annotated according to ``y_sparse``. - - Returns - ------- - pd.Series where 0-entries are normal and each collective anomaly are labelled - from 1, ..., K. - """ - y_dense = pd.IntervalIndex(y_sparse).get_indexer(index) - # get_indexer return values 0 for the values inside the first interval, 1 to - # the values within the next interval and so on, and -1 for values outside any - # interval. The skchange convention is that 0 is normal and > 0 is anomalous, - # so we add 1 to the result. - y_dense += 1 - return pd.Series(y_dense, index=index, name="anomaly", dtype="int64") - - @staticmethod - def dense_to_sparse(y_dense: pd.Series) -> pd.Series[pd.Interval]: - """Convert the dense output from the transform method to a sparse format. - - Parameters - ---------- - y_dense : pd.Series - The dense output from a collective anomaly detector's transform method: - An integer series where 0-entries are normal and each collective anomaly - are labelled from 1, ..., K. - - Returns - ------- - pd.Series[pd.Interval] containing the collective anomaly intervals. - - Notes - ----- - The start and end points of the intervals can be accessed by - output.array.left and output.array.right, respectively. - """ - y_dense = y_dense.reset_index(drop=True) - y_anomaly = y_dense.loc[y_dense.values > 0] - anomaly_locations_diff = y_anomaly.index.diff() - - first_anomaly_start = y_anomaly.index[:1].to_numpy() - anomaly_starts = y_anomaly.index[anomaly_locations_diff > 1] - anomaly_starts = np.insert(anomaly_starts, 0, first_anomaly_start) - - last_anomaly_end = y_anomaly.index[-1:].to_numpy() - anomaly_ends = y_anomaly.index[np.roll(anomaly_locations_diff > 1, -1)] - anomaly_ends = np.insert(anomaly_ends, len(anomaly_ends), last_anomaly_end) - - y_sparse = pd.Series( - pd.IntervalIndex.from_arrays(anomaly_starts, anomaly_ends, closed="both") - ) - return y_sparse - - -class ChangepointDetector(BaseDetector): - """Base class for changepoint detectors. - - Changepoint detectors detect points in time where a change in the data occurs. - Data between two changepoints is a segment where the data is considered to be - homogeneous, i.e., of the same distribution. 
A changepoint is defined as the - location of the last element of a segment. - - Output format of the predict method: See the dense_to_sparse method. - Output format of the transform method: See the sparse_to_dense method. - - Subclasses should set the following tags for sktime compatibility: - - task: "change_point_detection" - - learning_type: "unsupervised" or "supervised" - - And possibly other tags, such as - * "capability:missing_values": False, - * "capability:multivariate": True, - * "fit_is_empty": False, - - Needs to be implemented: - - _fit(self, X, Y=None) -> self - - _predict(self, X) -> pd.Series - - Optional to implement: - - _score_transform(self, X) -> pd.Series - - _update(self, X, Y=None) -> self - """ - - @staticmethod - def sparse_to_dense(y_sparse: pd.Series, index: pd.Index) -> pd.Series[int]: - """Convert the sparse output from the predict method to a dense format. - - Parameters - ---------- - y_sparse : pd.Series - The sparse output from a changepoint detector's predict method. - index : array-like - Indices that are to be annotated according to ``y_sparse``. - - Returns - ------- - pd.Series - """ - changepoints = y_sparse.to_list() - n = len(index) - changepoints = [-1] + changepoints + [n - 1] - segment_labels = np.zeros(n) - for i in range(len(changepoints) - 1): - segment_labels[changepoints[i] + 1 : changepoints[i + 1] + 1] = i - - y_dense = pd.Series( - segment_labels, index=index, name="segment_label", dtype="int64" - ) - return y_dense - - @staticmethod - def dense_to_sparse(y_dense: pd.Series) -> pd.Series[int]: - """Convert the dense output from the transform method to a sparse format. - - Parameters - ---------- - y_dense : pd.Series - The dense output from a changepoint detector's transform method. - - Returns - ------- - pd.Series - """ - y_dense = y_dense.reset_index(drop=True) - # changepoint = end of segment, so the label diffs > 0 must be shiftet by -1. - is_changepoint = np.roll(y_dense.diff().abs() > 0, -1) - changepoints = y_dense.index[is_changepoint] - y_sparse = pd.Series(changepoints, name="changepoint", dtype="int64") - return y_sparse - - -class SubsetCollectiveAnomalyDetector(BaseDetector): - """Base class for subset collective anomaly detectors. - - Subset collective anomaly detectors detect segments of multivariate time series data - that are considered anomalous, and also provide information on which components of - the data are affected. - - Output format of the predict method: - pd.DataFrame({ - "location": pd.IntervalIndex(anomaly_intervals, closed=), - "columns": affected_components_list, - }) - - Subclasses should set the following tags for sktime compatibility: - - task: "collective_anomaly_detection" - - learning_type: "unsupervised" or "supervised" - - capability:subset_detection: True - - And possibly other tags, such as - * "capability:missing_values": False, - * "capability:multivariate": True, - * "fit_is_empty": False, - - Needs to be implemented: - - _fit(self, X, Y=None) -> self - - _predict(self, X) -> pd.DataFrame - - Optional to implement: - - _score_transform(self, X) -> pd.Series - - _update(self, X, Y=None) -> self - """ - - @staticmethod - def sparse_to_dense(y_sparse, index): - """Convert the sparse output from the predict method to a dense format. - - Parameters - ---------- - y_sparse : pd.DataFrame - The sparse output from the predict method. - index : array-like - Indices that are to be annotated according to ``y_sparse``. 
- - Returns - ------- - pd.DataFrame - """ diff --git a/skchange/change_detectors/base.py b/skchange/change_detectors/base.py new file mode 100644 index 00000000..b990bb06 --- /dev/null +++ b/skchange/change_detectors/base.py @@ -0,0 +1,82 @@ +"""Base classes for changepoint detectors.""" + +import numpy as np +import pandas as pd + +from skchange.base import BaseDetector + + +class ChangepointDetector(BaseDetector): + """Base class for changepoint detectors. + + Changepoint detectors detect points in time where a change in the data occurs. + Data between two changepoints is a segment where the data is considered to be + homogeneous, i.e., of the same distribution. A changepoint is defined as the + location of the last element of a segment. + + Output format of the predict method: See the dense_to_sparse method. + Output format of the transform method: See the sparse_to_dense method. + + Subclasses should set the following tags for sktime compatibility: + - task: "change_point_detection" + - learning_type: "unsupervised" or "supervised" + - And possibly other tags, such as + * "capability:missing_values": False, + * "capability:multivariate": True, + * "fit_is_empty": False, + + Needs to be implemented: + - _fit(self, X, Y=None) -> self + - _predict(self, X) -> pd.Series + + Optional to implement: + - _score_transform(self, X) -> pd.Series + - _update(self, X, Y=None) -> self + """ + + @staticmethod + def sparse_to_dense(y_sparse: pd.Series, index: pd.Index) -> pd.Series[int]: + """Convert the sparse output from the predict method to a dense format. + + Parameters + ---------- + y_sparse : pd.Series + The sparse output from a changepoint detector's predict method. + index : array-like + Indices that are to be annotated according to ``y_sparse``. + + Returns + ------- + pd.Series + """ + changepoints = y_sparse.to_list() + n = len(index) + changepoints = [-1] + changepoints + [n - 1] + segment_labels = np.zeros(n) + for i in range(len(changepoints) - 1): + segment_labels[changepoints[i] + 1 : changepoints[i + 1] + 1] = i + + y_dense = pd.Series( + segment_labels, index=index, name="segment_label", dtype="int64" + ) + return y_dense + + @staticmethod + def dense_to_sparse(y_dense: pd.Series) -> pd.Series[int]: + """Convert the dense output from the transform method to a sparse format. + + Parameters + ---------- + y_dense : pd.Series + The dense output from a changepoint detector's transform method. + + Returns + ------- + pd.Series + """ + y_dense = y_dense.reset_index(drop=True) + # changepoint = end of segment, so the label diffs > 0 must be shiftet by -1. 
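+        # Worked example with hypothetical labels: y_dense = [0, 0, 1, 1, 2]
+        # has absolute label diffs > 0 at positions 2 and 4, so rolling by -1
+        # marks positions 1 and 3, the last elements of the first two segments.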
+ is_changepoint = np.roll(y_dense.diff().abs() > 0, -1) + changepoints = y_dense.index[is_changepoint] + y_sparse = pd.Series(changepoints, name="changepoint", dtype="int64") + return y_sparse From 42cc9c735c575f6e01dbe4d70504b66943a7bc8f Mon Sep 17 00:00:00 2001 From: tveten Date: Wed, 21 Aug 2024 16:20:13 +0200 Subject: [PATCH 13/75] Improve docstring: Mark optional methods --- skchange/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/skchange/base.py b/skchange/base.py index e634c661..55c48ef7 100644 --- a/skchange/base.py +++ b/skchange/base.py @@ -8,8 +8,8 @@ class name: BaseDetector fitting - fit(self, X, Y=None) detecting, sparse format - predict(self, X) detecting, dense format - transform(self, X) - detection scores, dense - score_transform(self, X) - updating (temporal) - update(self, X, Y=None) + detection scores, dense - score_transform(self, X) [optional] + updating (temporal) - update(self, X, Y=None) [optional] Each detector type (e.g. anomaly detector, collective anomaly detector, changepoint detector) are subclasses of BaseDetector (task + learning_type tags in sktime). @@ -17,7 +17,7 @@ class name: BaseDetector detector type therefore has the following methods for converting between sparse and dense output formats: converting sparse output to dense - sparse_to_dense(y_sparse, index) - converting dense output to sparse - dense_to_sparse(y_dense) + converting dense output to sparse - dense_to_sparse(y_dense) [optional] Convenience methods: update&detect - update_predict(self, X) From 3c8603e6c37b30aeb2ca459380df721ed7f725f9 Mon Sep 17 00:00:00 2001 From: tveten Date: Wed, 21 Aug 2024 16:20:42 +0200 Subject: [PATCH 14/75] Fix invalid type hints for pd.Series --- skchange/anomaly_detectors/base.py | 10 ++++------ skchange/change_detectors/base.py | 4 ++-- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/skchange/anomaly_detectors/base.py b/skchange/anomaly_detectors/base.py index 9c81dc54..9e9012ed 100644 --- a/skchange/anomaly_detectors/base.py +++ b/skchange/anomaly_detectors/base.py @@ -32,7 +32,7 @@ class PointAnomalyDetector(BaseDetector): """ @staticmethod - def sparse_to_dense(y_sparse: pd.Series, index: pd.Index) -> pd.Series[int]: + def sparse_to_dense(y_sparse: pd.Series, index: pd.Index) -> pd.Series: """Convert the sparse output from the predict method to a dense format. Parameters @@ -51,7 +51,7 @@ def sparse_to_dense(y_sparse: pd.Series, index: pd.Index) -> pd.Series[int]: return y_dense @staticmethod - def dense_to_sparse(y_dense: pd.Series) -> pd.Series[int]: + def dense_to_sparse(y_dense: pd.Series) -> pd.Series: """Convert the dense output from the transform method to a sparse format. Parameters @@ -96,9 +96,7 @@ class CollectiveAnomalyDetector(BaseDetector): """ @staticmethod - def sparse_to_dense( - y_sparse: pd.Series[pd.Interval], index: pd.Index - ) -> pd.Series[int]: + def sparse_to_dense(y_sparse: pd.Series, index: pd.Index) -> pd.Series: """Convert the sparse output from the predict method to a dense format. Parameters @@ -122,7 +120,7 @@ def sparse_to_dense( return pd.Series(y_dense, index=index, name="anomaly", dtype="int64") @staticmethod - def dense_to_sparse(y_dense: pd.Series) -> pd.Series[pd.Interval]: + def dense_to_sparse(y_dense: pd.Series) -> pd.Series: """Convert the dense output from the transform method to a sparse format. 
Parameters diff --git a/skchange/change_detectors/base.py b/skchange/change_detectors/base.py index b990bb06..91b86313 100644 --- a/skchange/change_detectors/base.py +++ b/skchange/change_detectors/base.py @@ -35,7 +35,7 @@ class ChangepointDetector(BaseDetector): """ @staticmethod - def sparse_to_dense(y_sparse: pd.Series, index: pd.Index) -> pd.Series[int]: + def sparse_to_dense(y_sparse: pd.Series, index: pd.Index) -> pd.Series: """Convert the sparse output from the predict method to a dense format. Parameters @@ -62,7 +62,7 @@ def sparse_to_dense(y_sparse: pd.Series, index: pd.Index) -> pd.Series[int]: return y_dense @staticmethod - def dense_to_sparse(y_dense: pd.Series) -> pd.Series[int]: + def dense_to_sparse(y_dense: pd.Series) -> pd.Series: """Convert the dense output from the transform method to a sparse format. Parameters From 5abd3268c12198d29f3bf8891d3a4c2704c8973b Mon Sep 17 00:00:00 2001 From: tveten Date: Wed, 21 Aug 2024 16:21:54 +0200 Subject: [PATCH 15/75] Switch class dependence to internal BaseDetector --- skchange/change_detectors/moscore.py | 57 ++++++++++++++-------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/skchange/change_detectors/moscore.py b/skchange/change_detectors/moscore.py index ca501c7a..19e1bf2d 100644 --- a/skchange/change_detectors/moscore.py +++ b/skchange/change_detectors/moscore.py @@ -8,9 +8,8 @@ import numpy as np import pandas as pd from numba import njit -from sktime.annotation.base import BaseSeriesAnnotator -from skchange.change_detectors.utils import format_changepoint_output +from skchange.change_detectors.base import ChangepointDetector from skchange.scores.score_factory import score_factory from skchange.utils.numba.general import where from skchange.utils.validation.data import check_data @@ -49,7 +48,7 @@ def moscore_transform( return scores -class Moscore(BaseSeriesAnnotator): +class Moscore(ChangepointDetector): """Moving score algorithm for multiple changepoint detection. A generalized version of the MOSUM (moving sum) algorithm [1]_ for changepoint @@ -85,17 +84,6 @@ class Moscore(BaseSeriesAnnotator): min_detection_interval : int, optional (default=1) Minimum number of consecutive scores above the threshold to be considered a changepoint. Must be between 1 and `bandwidth`/2. - fmt : str {"dense", "sparse"}, optional (default="sparse") - Annotation output format: - * If "sparse", a sub-series of labels for only the outliers in X is returned, - * If "dense", a series of labels for all values in X is returned. - labels : str {"indicator", "score", "int_label"}, optional (default="int_label") - Annotation output labels: - * If "indicator", returned values are boolean, indicating whether a value is an - outlier, - * If "score", returned values are floats, giving the outlier score. - * If "int_label", returned values are integer, indicating which segment a value - belongs to. References ---------- @@ -113,6 +101,8 @@ class Moscore(BaseSeriesAnnotator): """ _tags = { + "task": "change_point_detection", + "learning_type": "unsupervised", "capability:missing_values": False, "capability:multivariate": True, "fit_is_empty": False, @@ -125,15 +115,13 @@ def __init__( threshold_scale: Optional[float] = 2.0, level: float = 0.01, min_detection_interval: int = 1, - fmt: str = "sparse", - labels: str = "int_label", ): self.score = score self.bandwidth = bandwidth self.threshold_scale = threshold_scale # Just holds the input value. 
self.level = level self.min_detection_interval = min_detection_interval - super().__init__(fmt=fmt, labels=labels) + super().__init__() self.score_f, self.score_init_f = score_factory(self.score) check_larger_than(1, self.bandwidth, "bandwidth") @@ -254,17 +242,18 @@ def _fit(self, X: pd.DataFrame, Y: Optional[pd.DataFrame] = None): self.threshold_ = self._get_threshold(X) return self - def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: - """Create annotations on test/deployment data. + def _score_transform(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: + """Return scores for predicted annotations on test/deployment data. Parameters ---------- - X : pd.DataFrame - data to annotate, time series + X : pd.DataFrame + Data to annotate, time series. Returns ------- - Y : pd.Series - annotations for sequence X - exact format depends on annotation type + Y : pd.Series + Annotations for sequence X exact format depends on annotation type. """ X = check_data( X, @@ -277,13 +266,25 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: self.score_init_f, self.bandwidth, ) - self.changepoints = get_moscore_changepoints( - scores, self.threshold_, self.min_detection_interval - ) - self.scores = pd.Series(scores, index=X.index, name="score") - return format_changepoint_output( - self.fmt, self.labels, self.changepoints, X.index, self.scores + return pd.Series(scores, index=X.index, name="score") + + def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: + """Create annotations on test/deployment data. + + Parameters + ---------- + X : pd.DataFrame - data to annotate, time series + + Returns + ------- + Y : pd.Series - annotations for sequence X + exact format depends on annotation type + """ + self.scores = self.score_transform(X) + changepoints = get_moscore_changepoints( + self.scores.values, self.threshold_, self.min_detection_interval ) + return pd.Series(changepoints, name="changepoint", dtype="int64") @classmethod def get_test_params(cls, parameter_set="default"): From 430033fff12ad8adf09b7a21a47b788a45842e41 Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 09:07:44 +0200 Subject: [PATCH 16/75] Add utility functions for sparse output Can be reused when implementing _predict in subclasses --- skchange/anomaly_detectors/base.py | 35 ++++++++++++++++++++++-------- skchange/change_detectors/base.py | 14 ++++++++---- 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/skchange/anomaly_detectors/base.py b/skchange/anomaly_detectors/base.py index 9e9012ed..cae1f86a 100644 --- a/skchange/anomaly_detectors/base.py +++ b/skchange/anomaly_detectors/base.py @@ -46,7 +46,7 @@ def sparse_to_dense(y_sparse: pd.Series, index: pd.Index) -> pd.Series: ------- pd.Series where 0-entries are normal and 1-entries are anomalous. """ - y_dense = pd.Series(0, index=index, name="anomaly", dtype="int64") + y_dense = pd.Series(0, index=index, name="anomaly_label", dtype="int64") y_dense.iloc[y_sparse.values] = 1 return y_dense @@ -65,8 +65,16 @@ def dense_to_sparse(y_dense: pd.Series) -> pd.Series: pd.Series of the integer locations of the anomalous data points. """ y_dense = y_dense.reset_index(drop=True) - y_sparse = y_dense.iloc[y_dense.values > 0].index - return pd.Series(y_sparse) + anomalies = y_dense.iloc[y_dense.values > 0].index + return PointAnomalyDetector._format_sparse_output(anomalies) + + @staticmethod + def _format_sparse_output(anomalies) -> pd.Series: + """Format the sparse output of anomaly detectors. 
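+
+        A small illustration with hypothetical input:
+
+        >>> PointAnomalyDetector._format_sparse_output([3, 7])
+        0    3
+        1    7
+        Name: anomaly, dtype: int64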
+ + Can be reused by subclasses to format the output of the _predict method. + """ + return pd.Series(anomalies, name="anomaly", dtype="int64") class CollectiveAnomalyDetector(BaseDetector): @@ -111,13 +119,13 @@ def sparse_to_dense(y_sparse: pd.Series, index: pd.Index) -> pd.Series: pd.Series where 0-entries are normal and each collective anomaly are labelled from 1, ..., K. """ - y_dense = pd.IntervalIndex(y_sparse).get_indexer(index) + labels = pd.IntervalIndex(y_sparse).get_indexer(index) # get_indexer return values 0 for the values inside the first interval, 1 to # the values within the next interval and so on, and -1 for values outside any # interval. The skchange convention is that 0 is normal and > 0 is anomalous, # so we add 1 to the result. - y_dense += 1 - return pd.Series(y_dense, index=index, name="anomaly", dtype="int64") + labels += 1 + return pd.Series(labels, index=index, name="anomaly_label", dtype="int64") @staticmethod def dense_to_sparse(y_dense: pd.Series) -> pd.Series: @@ -151,10 +159,19 @@ def dense_to_sparse(y_dense: pd.Series) -> pd.Series: anomaly_ends = y_anomaly.index[np.roll(anomaly_locations_diff > 1, -1)] anomaly_ends = np.insert(anomaly_ends, len(anomaly_ends), last_anomaly_end) - y_sparse = pd.Series( - pd.IntervalIndex.from_arrays(anomaly_starts, anomaly_ends, closed="both") + anomaly_intervals = list(zip(anomaly_starts, anomaly_ends)) + return CollectiveAnomalyDetector._format_sparse_output(anomaly_intervals) + + @staticmethod + def _format_sparse_output(anomaly_intervals, closed="both") -> pd.Series: + """Format the sparse output of collective anomaly detectors. + + Can be reused by subclasses to format the output of the _predict method. + """ + return pd.Series( + pd.IntervalIndex.from_tuples(anomaly_intervals, closed=closed), + name="collective_anomaly", ) - return y_sparse class SubsetCollectiveAnomalyDetector(BaseDetector): diff --git a/skchange/change_detectors/base.py b/skchange/change_detectors/base.py index 91b86313..eb1fde27 100644 --- a/skchange/change_detectors/base.py +++ b/skchange/change_detectors/base.py @@ -56,10 +56,9 @@ def sparse_to_dense(y_sparse: pd.Series, index: pd.Index) -> pd.Series: for i in range(len(changepoints) - 1): segment_labels[changepoints[i] + 1 : changepoints[i + 1] + 1] = i - y_dense = pd.Series( + return pd.Series( segment_labels, index=index, name="segment_label", dtype="int64" ) - return y_dense @staticmethod def dense_to_sparse(y_dense: pd.Series) -> pd.Series: @@ -78,5 +77,12 @@ def dense_to_sparse(y_dense: pd.Series) -> pd.Series: # changepoint = end of segment, so the label diffs > 0 must be shiftet by -1. is_changepoint = np.roll(y_dense.diff().abs() > 0, -1) changepoints = y_dense.index[is_changepoint] - y_sparse = pd.Series(changepoints, name="changepoint", dtype="int64") - return y_sparse + return ChangepointDetector._format_sparse_output(changepoints) + + @staticmethod + def _format_sparse_output(changepoints) -> pd.Series: + """Format the sparse output of changepoint detectors. + + Can be reused by subclasses to format the output of the _predict method. + """ + return pd.Series(changepoints, name="changepoint", dtype="int64") From 18c737ccaaee1d50b1978441bd68f351924da5d7 Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 09:38:11 +0200 Subject: [PATCH 17/75] Set object_type tag to estimator Checks for sktime compatibility with estimators, not subclasses. 
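
For example (illustrative; uses the same sktime utilities as the test suite
in this series):

    from sktime.utils.estimator_checks import check_estimator
    from skchange.change_detectors.moscore import Moscore

    # With object_type set to "estimator", the class is accepted by sktime's
    # conformance checks; the unregistered "detector" scitype is not.
    check_estimator(Moscore, raise_exceptions=True)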
--- skchange/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skchange/base.py b/skchange/base.py index 55c48ef7..a976a59d 100644 --- a/skchange/base.py +++ b/skchange/base.py @@ -64,7 +64,7 @@ class BaseDetector(BaseEstimator): """ _tags = { - "object_type": "detector", # type of object + "object_type": "estimator", # sktime scitype of object "learning_type": "None", # Tag to determine test in test_all_annotators "task": "None", # Tag to determine test in test_all_annotators # From 00d23c5cc793c2329587247bb3b717ef35de96ad Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 09:38:54 +0200 Subject: [PATCH 18/75] Use parametrize_with_checks from sktime --- skchange/tests/test_all_detectors.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/skchange/tests/test_all_detectors.py b/skchange/tests/test_all_detectors.py index d053cd1d..3a1621d4 100644 --- a/skchange/tests/test_all_detectors.py +++ b/skchange/tests/test_all_detectors.py @@ -1,19 +1,13 @@ """Tests for all annotators/detectors in skchange.""" -import pytest -from sktime.tests.test_switch import run_test_for_class -from sktime.utils.estimator_checks import check_estimator +from sktime.utils.estimator_checks import check_estimator, parametrize_with_checks from skchange.anomaly_detectors.tests.test_anomaly_detectors import anomaly_detectors from skchange.change_detectors.tests.test_change_detectors import change_detectors -all_annotators = anomaly_detectors + change_detectors +all_detectors = anomaly_detectors + change_detectors -@pytest.mark.parametrize("Estimator", all_annotators) -def test_sktime_annotator_compatibility(Estimator): - """Check compatibility with sktime annotator interface.""" - if not run_test_for_class(Estimator): - return None - - check_estimator(Estimator, raise_exceptions=True) +@parametrize_with_checks(all_detectors) +def test_sktime_compatible_estimators(obj, test_name): + check_estimator(obj, tests_to_run=test_name, raise_exceptions=True) From b214b1da6066a905af7c05f8b003d5e21feb8519 Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 10:28:13 +0200 Subject: [PATCH 19/75] Standardise predict output --- skchange/change_detectors/moscore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skchange/change_detectors/moscore.py b/skchange/change_detectors/moscore.py index 19e1bf2d..19404470 100644 --- a/skchange/change_detectors/moscore.py +++ b/skchange/change_detectors/moscore.py @@ -284,7 +284,7 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: changepoints = get_moscore_changepoints( self.scores.values, self.threshold_, self.min_detection_interval ) - return pd.Series(changepoints, name="changepoint", dtype="int64") + return ChangepointDetector._format_sparse_output(changepoints) @classmethod def get_test_params(cls, parameter_set="default"): From 5da1cbdf439d9aaa7cfac9b435c60cc5371cf505 Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 10:29:23 +0200 Subject: [PATCH 20/75] Conform Pelt to BaseDetector --- skchange/change_detectors/pelt.py | 47 ++++++++++++++++--------------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/skchange/change_detectors/pelt.py b/skchange/change_detectors/pelt.py index 5bc29238..b182dae3 100644 --- a/skchange/change_detectors/pelt.py +++ b/skchange/change_detectors/pelt.py @@ -9,9 +9,8 @@ import numpy as np import pandas as pd from numba import njit -from sktime.annotation.base import BaseSeriesAnnotator -from skchange.change_detectors.utils 
import format_changepoint_output +from skchange.change_detectors.base import ChangepointDetector from skchange.costs.cost_factory import cost_factory from skchange.utils.validation.data import check_data from skchange.utils.validation.parameters import check_larger_than @@ -62,7 +61,7 @@ def run_pelt( return opt_cost[1:], get_changepoints(prev_cpts) -class Pelt(BaseSeriesAnnotator): +class Pelt(ChangepointDetector): """Pruned exact linear time changepoint detection. An efficient implementation of the PELT algorithm [1]_ for changepoint detection. @@ -80,18 +79,6 @@ class Pelt(BaseSeriesAnnotator): input to .fit() (not supported yet). min_segment_length : int, optional (default=2) Minimum length of a segment. - fmt : str {"dense", "sparse"}, optional (default="sparse") - Annotation output format: - * If "sparse", a sub-series of labels for only the outliers in X is returned, - * If "dense", a series of labels for all values in X is returned. - labels : str {"indicator", "score", "int_label"}, optional (default="int_label") - Annotation output labels: - * If "indicator", returned values are boolean, indicating whether a value is an - outlier, - * If "score", returned values are floats, giving the outlier score. - * If "int_label", returned values are integer, indicating which segment a value - belongs to. - References ---------- @@ -120,13 +107,11 @@ def __init__( cost: Union[str, Callable] = "mean", penalty_scale: Optional[float] = 2.0, min_segment_length: int = 2, - fmt: str = "sparse", - labels: str = "int_label", ): self.cost = cost self.penalty_scale = penalty_scale self.min_segment_length = min_segment_length - super().__init__(fmt=fmt, labels=labels) + super().__init__() self.cost_func, self.cost_init_func = cost_factory(self.cost) check_larger_than(0, penalty_scale, "penalty_scale", allow_none=True) @@ -223,17 +208,35 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: min_length=2 * self.min_segment_length, min_length_name="2*min_segment_length", ) - opt_costs, self.changepoints = run_pelt( + opt_costs, changepoints = run_pelt( X.values, self.cost_func, self.cost_init_func, self.penalty_, self.min_segment_length, ) + # Store the scores for introspection without recomputing using score_transform self.scores = pd.Series(opt_costs, index=X.index, name="score") - return format_changepoint_output( - self.fmt, self.labels, self.changepoints, X.index, self.scores - ) + return ChangepointDetector._format_sparse_output(changepoints) + + def _score_transform(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: + """Compute the pelt scores for the input data. + + Parameters + ---------- + X : pd.DataFrame - data to compute scores for, time series + + Returns + ------- + scores : pd.Series - scores for sequence X + + Notes + ----- + The PELT scores are the cumulative optimal costs, so the scores are increasing + and are not per observation scores. 
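+
+        If per-observation increments are needed, they can be recovered by
+        differencing the cumulative series (a hypothetical illustration, not
+        part of the detector API):
+
+        >>> import pandas as pd
+        >>> scores = pd.Series([2, 5, 6, 10])  # cumulative optimal costs
+        >>> scores.diff().fillna(scores.iloc[0]).tolist()
+        [2.0, 3.0, 1.0, 4.0]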
+ """ + self.predict(X) + return self.scores @classmethod def get_test_params(cls, parameter_set="default"): From a1c2b8621b73e7eba145ea88c18cf1410388cab6 Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 10:38:50 +0200 Subject: [PATCH 21/75] Conform SeededBinarySegmentation to BaseDetector --- skchange/change_detectors/seeded_binseg.py | 25 ++++------------------ 1 file changed, 4 insertions(+), 21 deletions(-) diff --git a/skchange/change_detectors/seeded_binseg.py b/skchange/change_detectors/seeded_binseg.py index bbbb6585..1299780c 100644 --- a/skchange/change_detectors/seeded_binseg.py +++ b/skchange/change_detectors/seeded_binseg.py @@ -8,9 +8,8 @@ import numpy as np import pandas as pd from numba import njit -from sktime.annotation.base import BaseSeriesAnnotator -from skchange.change_detectors.utils import format_changepoint_output +from skchange.change_detectors.base import ChangepointDetector from skchange.scores.score_factory import score_factory from skchange.utils.validation.data import check_data from skchange.utils.validation.parameters import check_in_interval, check_larger_than @@ -95,7 +94,7 @@ def run_seeded_binseg( return cpts, amoc_scores, maximizers, starts, ends -class SeededBinarySegmentation(BaseSeriesAnnotator): +class SeededBinarySegmentation(ChangepointDetector): """Seeded binary segmentation algorithm for multiple changepoint detection. Binary segmentation type changepoint detection algorithms recursively split the data @@ -138,17 +137,6 @@ class SeededBinarySegmentation(BaseSeriesAnnotator): starting at 'interval_len'='min_interval_length'. It also governs the amount of overlap between intervals of the same length, as the start of each interval is shifted by a factor of '1 + 1 / growth_factor'. Must be a float in (1, 2]. - fmt : str {"dense", "sparse"}, optional (default="sparse") - Annotation output format: - * If "sparse", a sub-series of labels for only the outliers in X is returned, - * If "dense", a series of labels for all values in X is returned. - labels : str {"indicator", "score", "int_label"}, optional (default="int_label") - Annotation output labels: - * If "indicator", returned values are boolean, indicating whether a value is an - outlier, - * If "score", returned values are floats, giving the outlier score. - * If "int_label", returned values are integer, indicating which segment a value - belongs to. References ---------- @@ -180,8 +168,6 @@ def __init__( min_segment_length: int = 5, max_interval_length: int = 200, growth_factor: float = 1.5, - fmt: str = "sparse", - labels: str = "int_label", ): self.score = score self.threshold_scale = threshold_scale # Just holds the input value. 
@@ -189,7 +175,7 @@
         self.min_segment_length = min_segment_length
         self.max_interval_length = max_interval_length
         self.growth_factor = growth_factor
-        super().__init__(fmt=fmt, labels=labels)
+        super().__init__()
 
         self.score_f, self.score_init_f = score_factory(self.score)
         check_larger_than(0.0, self.threshold_scale, "threshold_scale", allow_none=True)
@@ -315,13 +301,10 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series:
             self.max_interval_length,
             self.growth_factor,
         )
-        self.changepoints = cpts
         self.scores = pd.DataFrame(
             {"start": starts, "end": ends, "argmax_cpt": maximizers, "score": scores}
         )
-        return format_changepoint_output(
-            self.fmt, self.labels, self.changepoints, X.index, self.scores
-        )
+        return ChangepointDetector.sparse_to_dense(cpts, X.index)
 
     @classmethod
     def get_test_params(cls, parameter_set="default"):

From 7da353e63517af7d7d32b253eaf960ab92ec1aeb Mon Sep 17 00:00:00 2001
From: tveten
Date: Thu, 22 Aug 2024 11:49:00 +0200
Subject: [PATCH 22/75] Inherit from BaseTransformer

To pass the sktime check_estimator tests, which require inheritance from an
sktime.BaseEstimator subclass. Could be a good idea anyway, since the
detector then immediately works as a transformer.
---
 skchange/base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/skchange/base.py b/skchange/base.py
index a976a59d..6a55187f 100644
--- a/skchange/base.py
+++ b/skchange/base.py
@@ -36,11 +36,11 @@ class name: BaseDetector
 __author__ = ["mtveten"]
 __all__ = ["BaseDetector"]
 
-from sktime.base import BaseEstimator
+from sktime.transformations.base import BaseTransformer
 from sktime.utils.validation.series import check_series
 
 
-class BaseDetector(BaseEstimator):
+class BaseDetector(BaseTransformer):
     """Base detector.
 
     An alternative implementation to the BaseSeriesAnnotator class from sktime,

From 862b420f757dfdbbd39080cc734a9d43d372ec4c Mon Sep 17 00:00:00 2001
From: tveten
Date: Thu, 22 Aug 2024 14:45:53 +0200
Subject: [PATCH 23/75] Inherit from BaseTransformer rather than BaseEstimator

Mainly to pass the sktime conformance tests, which require inheritance from a
subclass of BaseEstimator. But also useful to be a fully functional
transformer.
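
A sketch of the transformer-style usage this enables (hypothetical data and
parameters):

    import numpy as np
    import pandas as pd
    from skchange.change_detectors.moscore import Moscore

    df = pd.DataFrame(np.random.randn(100, 1))
    # As a BaseTransformer subclass, a detector works like any other sktime
    # transformer: fit_transform returns the dense segment labels.
    labels = Moscore(bandwidth=10).fit_transform(df)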
--- skchange/anomaly_detectors/base.py | 12 +- skchange/base.py | 200 +++++++++++++-------- skchange/change_detectors/base.py | 4 +- skchange/change_detectors/moscore.py | 13 +- skchange/change_detectors/pelt.py | 11 +- skchange/change_detectors/seeded_binseg.py | 9 +- 6 files changed, 151 insertions(+), 98 deletions(-) diff --git a/skchange/anomaly_detectors/base.py b/skchange/anomaly_detectors/base.py index cae1f86a..ad20311b 100644 --- a/skchange/anomaly_detectors/base.py +++ b/skchange/anomaly_detectors/base.py @@ -23,12 +23,12 @@ class PointAnomalyDetector(BaseDetector): * "fit_is_empty": False, Needs to be implemented: - - _fit(self, X, Y=None) -> self + - _fit(self, X, y=None) -> self - _predict(self, X) -> pd.Series Optional to implement: - _score_transform(self, X) -> pd.Series - - _update(self, X, Y=None) -> self + - _update(self, X, y=None) -> self """ @staticmethod @@ -95,12 +95,12 @@ class CollectiveAnomalyDetector(BaseDetector): * "fit_is_empty": False, Needs to be implemented: - - _fit(self, X, Y=None) -> self + - _fit(self, X, y=None) -> self - _predict(self, X) -> pd.Series Optional to implement: - _score_transform(self, X) -> pd.Series - - _update(self, X, Y=None) -> self + - _update(self, X, y=None) -> self """ @staticmethod @@ -197,12 +197,12 @@ class SubsetCollectiveAnomalyDetector(BaseDetector): * "fit_is_empty": False, Needs to be implemented: - - _fit(self, X, Y=None) -> self + - _fit(self, X, y=None) -> self - _predict(self, X) -> pd.DataFrame Optional to implement: - _score_transform(self, X) -> pd.Series - - _update(self, X, Y=None) -> self + - _update(self, X, y=None) -> self """ @staticmethod diff --git a/skchange/base.py b/skchange/base.py index 6a55187f..4bcd5c25 100644 --- a/skchange/base.py +++ b/skchange/base.py @@ -5,11 +5,11 @@ class name: BaseDetector Adapted from the sktime.BaseSeriesAnnotator class. Scitype defining methods: - fitting - fit(self, X, Y=None) + fitting - fit(self, X, y=None) detecting, sparse format - predict(self, X) - detecting, dense format - transform(self, X) + detecting, dense format - transform(self, X, y=None) detection scores, dense - score_transform(self, X) [optional] - updating (temporal) - update(self, X, Y=None) [optional] + updating (temporal) - update(self, X, y=None) [optional] Each detector type (e.g. anomaly detector, collective anomaly detector, changepoint detector) are subclasses of BaseDetector (task + learning_type tags in sktime). @@ -21,8 +21,8 @@ class name: BaseDetector Convenience methods: update&detect - update_predict(self, X) - fit&detect - fit_predict(self, X, Y=None) - fit&transform - fit_transform(self, X, Y=None) + fit&detect - fit_predict(self, X, y=None) + fit&transform - fit_transform(self, X, y=None) Inspection methods: hyper-parameter inspection - get_params() @@ -52,7 +52,7 @@ class BaseDetector(BaseTransformer): changepoint, or something else. Needs to be implemented: - - _fit(self, X, Y=None) -> self + - _fit(self, X, y=None) -> self - _predict(self, X) -> pd.Series or pd.DataFrame - sparse_to_dense(y_sparse, index) -> pd.Series or pd.DataFrame * Enables the transform method to work. 
@@ -60,17 +60,56 @@ class BaseDetector(BaseTransformer): Optional to implement: - dense_to_sparse(y_dense) -> pd.Series or pd.DataFrame - _score_transform(self, X) -> pd.Series or pd.DataFrame - - _update(self, X, Y=None) -> self + - _update(self, X, y=None) -> self """ + # _tags = { + # "object_type": "transformer", # sktime scitype of object + # "learning_type": "None", # Tag to determine test in test_all_annotators + # "task": "None", # Tag to determine test in test_all_annotators + # # + # # todo: distribution_type? we may have to refactor this, seems very soecufuc + # "distribution_type": "None", # Tag to determine test in test_all_annotators + # } # for unit test cases + _tags = { - "object_type": "estimator", # sktime scitype of object - "learning_type": "None", # Tag to determine test in test_all_annotators - "task": "None", # Tag to determine test in test_all_annotators - # - # todo: distribution_type? we may have to refactor this, seems very soecufuc - "distribution_type": "None", # Tag to determine test in test_all_annotators - } # for unit test cases + "object_type": "transformer", # type of object + "scitype:transform-input": "Series", + # what is the scitype of X: Series, or Panel + "scitype:transform-output": "Series", + # what scitype is returned: Primitives, Series, Panel + "scitype:transform-labels": "None", + # what is the scitype of y: None (not needed), Primitives, Series, Panel + "scitype:instancewise": True, # is this an instance-wise transform? + "capability:inverse_transform": False, # can the transformer inverse transform? + "capability:inverse_transform:range": None, + "capability:inverse_transform:exact": True, + # inverting range of inverse transform = domain of invertibility of transform + "univariate-only": False, # can the transformer handle multivariate X? + "X_inner_mtype": "pd.DataFrame", # which mtypes do _fit/_predict support for X? + # this can be a Panel mtype even if transform-input is Series, vectorized + "y_inner_mtype": "None", # which mtypes do _fit/_predict support for y? + "requires_X": True, # does X need to be passed in fit? + "requires_y": False, # does y need to be passed in fit? + "enforce_index_type": None, # index type that needs to be enforced in X/y + "fit_is_empty": False, # is fit empty and can be skipped? Yes = True + "X-y-must-have-same-index": False, # can estimator handle different X/y index? + "transform-returns-same-time-index": True, + # does transform return have the same time index as input X + "skip-inverse-transform": False, # is inverse-transform skipped when called? + "capability:unequal_length": True, + # can the transformer handle unequal length time series (if passed Panel)? + "capability:unequal_length:removes": False, + # is transform result always guaranteed to be equal length (and series)? + "handles-missing-data": False, # can estimator handle missing data? + # todo: rename to capability:missing_values + "capability:missing_values": False, + # is transform result always guaranteed to contain no missing values? 
+ "remember_data": False, # whether all data seen is remembered as self._X + "python_version": None, # PEP 440 python version specifier to limit versions + "authors": "mtveten", # author(s) of the object + "maintainers": "mtveten", # current maintainer(s) of the object + } def __init__(self): self.task = self.get_class_tag("task") @@ -79,48 +118,48 @@ def __init__(self): self._is_fitted = False self._X = None - self._Y = None + self._y = None super().__init__() - def fit(self, X, Y=None): - """Fit to training data. + # def fit(self, X, y=None): + # """Fit to training data. - Parameters - ---------- - X : pd.DataFrame - Training data to fit model to (time series). - Y : pd.Series, optional - Ground truth annotations for training if annotator is supervised. + # Parameters + # ---------- + # X : pd.DataFrame + # Training data to fit model to (time series). + # y : pd.Series, optional + # Ground truth annotations for training if annotator is supervised. - Returns - ------- - self : - Reference to self. + # Returns + # ------- + # self : + # Reference to self. - Notes - ----- - Creates fitted model that updates attributes ending in "_". Sets - _is_fitted flag to True. - """ - X = check_series(X, allow_index_names=True) + # Notes + # ----- + # Creates fitted model that updates attributes ending in "_". Sets + # _is_fitted flag to True. + # """ + # X = check_series(X, allow_index_names=True) - if Y is not None: - Y = check_series(Y, allow_index_names=True) + # if y is not None: + # y = check_series(y, allow_index_names=True) - self._X = X - self._Y = Y + # self._X = X + # self._y = y - # fkiraly: insert checks/conversions here, after PR #1012 I suggest + # # fkiraly: insert checks/conversions here, after PR #1012 I suggest - self._fit(X=X, Y=Y) + # self._fit(X=X, y=y) - # this should happen last - self._is_fitted = True + # # this should happen last + # self._is_fitted = True - return self + # return self - def _fit(self, X, Y=None): + def _fit(self, X, y=None): """Fit to training data. core logic @@ -129,7 +168,7 @@ def _fit(self, X, Y=None): ---------- X : pd.DataFrame Training data to fit model to time series. - Y : pd.Series, optional + y : pd.Series, optional Ground truth annotations for training if annotator is supervised. Returns @@ -153,7 +192,7 @@ def predict(self, X): Returns ------- - Y : pd.Series or pd.DataFrame + y : pd.Series or pd.DataFrame Each element or row corresponds to a detected event. Exact format depends on the specific detector type. """ @@ -163,9 +202,8 @@ def predict(self, X): # fkiraly: insert checks/conversions here, after PR #1012 I suggest - Y = self._predict(X=X) - - return Y + y = self._predict(X=X) + return y def _predict(self, X): """Create annotations on test/deployment data. @@ -179,12 +217,12 @@ def _predict(self, X): Returns ------- - Y : pd.Series + y : pd.Series Annotations for sequence X exact format depends on annotation type. """ raise NotImplementedError("abstract method") - def transform(self, X): + def _transform(self, X, y=None): """Detect events and return the result in a dense format. Parameters @@ -194,13 +232,21 @@ def transform(self, X): Returns ------- - Y : pd.Series or pd.DataFrame + y : pd.Series or pd.DataFrame Detections for sequence X. The returned detections will be in the dense format, meaning that each element in X will be annotated according to the detection results in some meaningful way depending on the detector type. 
""" - Y = self.predict(X) - return self.sparse_to_dense(Y, X.index) + y = self.predict(X) + y_dense = self.sparse_to_dense(y, X.index) + + # sktime does not support transformations that change the state of the object. + # Some detectors store detection score information a self.scores during predict. + # For now remove self.scores in transform to pass tests. + if hasattr(self, "scores"): + del self.scores + + return y_dense @staticmethod def sparse_to_dense(y_sparse, index): @@ -246,7 +292,7 @@ def score_transform(self, X): Returns ------- - Y : pd.Series + y : pd.Series Scores for sequence X exact format depends on annotation type. """ self.check_is_fitted() @@ -265,20 +311,20 @@ def _score_transform(self, X): Returns ------- - Y : pd.Series + y : pd.Series One score for each element in X. Annotations for sequence X exact format depends on annotation type. """ raise NotImplementedError("abstract method") - def update(self, X, Y=None): + def update(self, X, y=None): """Update model with new data and optional ground truth annotations. Parameters ---------- X : pd.DataFrame Training data to update model with (time series). - Y : pd.Series, optional + y : pd.Series, optional Ground truth annotations for training if annotator is supervised. Returns @@ -294,19 +340,19 @@ def update(self, X, Y=None): X = check_series(X, allow_index_names=True) - if Y is not None: - Y = check_series(Y, allow_index_names=True) + if y is not None: + y = check_series(y, allow_index_names=True) self._X = X.combine_first(self._X) - if Y is not None: - self._Y = Y.combine_first(self._Y) + if y is not None: + self._y = y.combine_first(self._y) - self._update(X=X, Y=Y) + self._update(X=X, y=y) return self - def _update(self, X, Y=None): + def _update(self, X, y=None): """Update model with new data and optional ground truth annotations. core logic @@ -315,7 +361,7 @@ def _update(self, X, Y=None): ---------- X : pd.DataFrame Training data to update model with time series - Y : pd.Series, optional + y : pd.Series, optional Ground truth annotations for training if annotator is supervised. Returns @@ -328,7 +374,7 @@ def _update(self, X, Y=None): Updates fitted model that updates attributes ending in "_". """ # default/fallback: re-fit to all data - self._fit(self._X, self._Y) + self._fit(self._X, self._y) return self @@ -342,7 +388,7 @@ def update_predict(self, X): Returns ------- - Y : pd.Series + y : pd.Series Annotations for sequence X exact format depends on annotation type. Notes @@ -352,21 +398,21 @@ def update_predict(self, X): X = check_series(X, allow_index_names=True) self.update(X=X) - Y = self.predict(X=X) + y = self.predict(X=X) - return Y + return y - def fit_predict(self, X, Y=None): + def fit_predict(self, X, y=None): """Fit to data, then predict it. - Fits model to X and Y with given annotation parameters + Fits model to X and y with given annotation parameters and returns the annotations made by the model. Parameters ---------- X : pd.DataFrame, pd.Series or np.ndarray Data to be transformed - Y : pd.Series or np.ndarray, optional (default=None) + y : pd.Series or np.ndarray, optional (default=None) Target values of data to be predicted. Returns @@ -376,19 +422,19 @@ def fit_predict(self, X, Y=None): """ # Non-optimized default implementation; override when a better # method is possible for a given algorithm. - return self.fit(X, Y).predict(X) + return self.fit(X, y).predict(X) - def fit_transform(self, X, Y=None): + def fit_transform(self, X, y=None): """Fit to data, then transform it. 
- Fits model to X and Y with given annotation parameters + Fits model to X and y with given annotation parameters and returns the annotations made by the model. Parameters ---------- X : pd.DataFrame, pd.Series or np.ndarray Data to be transformed - Y : pd.Series or np.ndarray, optional (default=None) + y : pd.Series or np.ndarray, optional (default=None) Target values of data to be predicted. Returns @@ -396,8 +442,8 @@ def fit_transform(self, X, Y=None): self : pd.Series Annotations for sequence X exact format depends on annotation type. """ - Y = self.fit_predict(X) - return self.sparse_to_dense(Y, index=X.index) + y = self.fit_predict(X) + return self.sparse_to_dense(y, index=X.index) # Notes on required .predict output formats per detector type (task and capability): diff --git a/skchange/change_detectors/base.py b/skchange/change_detectors/base.py index eb1fde27..cd7c6c91 100644 --- a/skchange/change_detectors/base.py +++ b/skchange/change_detectors/base.py @@ -26,12 +26,12 @@ class ChangepointDetector(BaseDetector): * "fit_is_empty": False, Needs to be implemented: - - _fit(self, X, Y=None) -> self + - _fit(self, X, y=None) -> self - _predict(self, X) -> pd.Series Optional to implement: - _score_transform(self, X) -> pd.Series - - _update(self, X, Y=None) -> self + - _update(self, X, y=None) -> self """ @staticmethod diff --git a/skchange/change_detectors/moscore.py b/skchange/change_detectors/moscore.py index 19404470..67a1e2a5 100644 --- a/skchange/change_detectors/moscore.py +++ b/skchange/change_detectors/moscore.py @@ -128,7 +128,7 @@ def __init__( check_larger_than(0, threshold_scale, "threshold_scale", allow_none=True) check_larger_than(0, self.level, "level") check_in_interval( - pd.Interval(1, self.bandwidth / 2 - 1, closed="both"), + pd.Interval(1, max(1, self.bandwidth / 2 - 1), closed="both"), self.min_detection_interval, "min_detection_interval", ) @@ -205,7 +205,7 @@ def _get_threshold(self, X: pd.DataFrame) -> float: n, p, self.bandwidth, self.level ) - def _fit(self, X: pd.DataFrame, Y: Optional[pd.DataFrame] = None): + def _fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None): """Fit to training data. Sets the threshold of the detector. @@ -222,7 +222,7 @@ def _fit(self, X: pd.DataFrame, Y: Optional[pd.DataFrame] = None): ---------- X : pd.DataFrame training data to fit the threshold to. - Y : pd.Series, optional + y : pd.Series, optional Does nothing. Only here to make the fit method compatible with sktime and scikit-learn. @@ -252,7 +252,7 @@ def _score_transform(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: Returns ------- - Y : pd.Series + y : pd.Series Annotations for sequence X exact format depends on annotation type. 
""" X = check_data( @@ -277,7 +277,7 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: Returns ------- - Y : pd.Series - annotations for sequence X + y : pd.Series - annotations for sequence X exact format depends on annotation type """ self.scores = self.score_transform(X) @@ -306,6 +306,7 @@ def get_test_params(cls, parameter_set="default"): `create_test_instance` uses the first (or only) dictionary in `params` """ params = [ - {"score": "mean", "bandwidth": 10}, + {"score": "mean", "bandwidth": 5}, + {"score": "meanvar", "bandwidth": 5}, ] return params diff --git a/skchange/change_detectors/pelt.py b/skchange/change_detectors/pelt.py index b182dae3..cfd0bf70 100644 --- a/skchange/change_detectors/pelt.py +++ b/skchange/change_detectors/pelt.py @@ -160,7 +160,11 @@ def _get_penalty(self, X: pd.DataFrame) -> float: p = X.shape[1] return self.penalty_scale * self.get_default_penalty(n, p) - def _fit(self, X: Union[pd.Series, pd.DataFrame], Y: Optional[pd.DataFrame] = None): + def _fit( + self, + X: Union[pd.Series, pd.DataFrame], + y: Optional[Union[pd.Series, pd.DataFrame]] = None, + ): """Fit to training data. Sets the penalty of the detector. @@ -175,7 +179,7 @@ def _fit(self, X: Union[pd.Series, pd.DataFrame], Y: Optional[pd.DataFrame] = No ---------- X : pd.DataFrame training data to fit the penalty to. - Y : pd.Series, optional + y : pd.Series, optional Does nothing. Only here to make the fit method compatible with sktime and scikit-learn. @@ -200,7 +204,7 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: Returns ------- - Y : pd.Series - annotations for sequence X + y : pd.Series - annotations for sequence X exact format depends on annotation type """ X = check_data( @@ -259,5 +263,6 @@ def get_test_params(cls, parameter_set="default"): """ params = [ {"cost": "mean", "min_segment_length": 5}, + {"cost": "mean", "penalty_scale": 0.0, "min_segment_length": 1}, ] return params diff --git a/skchange/change_detectors/seeded_binseg.py b/skchange/change_detectors/seeded_binseg.py index 1299780c..407cb4ad 100644 --- a/skchange/change_detectors/seeded_binseg.py +++ b/skchange/change_detectors/seeded_binseg.py @@ -244,7 +244,7 @@ def _get_threshold(self, X: pd.DataFrame) -> float: p = X.shape[1] return self.threshold_scale * self.get_default_threshold(n, p) - def _fit(self, X: pd.DataFrame, Y: Optional[pd.DataFrame] = None): + def _fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None): """Fit to training data. Sets the threshold of the detector. @@ -259,7 +259,7 @@ def _fit(self, X: pd.DataFrame, Y: Optional[pd.DataFrame] = None): ---------- X : pd.DataFrame training data to fit the threshold to. - Y : pd.Series, optional + y : pd.Series, optional Does nothing. Only here to make the fit method compatible with sktime and scikit-learn. 
@@ -284,7 +284,7 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: Returns ------- - Y : pd.Series - annotations for sequence X + y : pd.Series - annotations for sequence X exact format depends on annotation type """ X = check_data( @@ -304,7 +304,7 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: self.scores = pd.DataFrame( {"start": starts, "end": ends, "argmax_cpt": maximizers, "score": scores} ) - return ChangepointDetector.sparse_to_dense(cpts, X.index) + return ChangepointDetector._format_sparse_output(cpts) @classmethod def get_test_params(cls, parameter_set="default"): @@ -327,5 +327,6 @@ def get_test_params(cls, parameter_set="default"): """ params = [ {"score": "mean", "min_segment_length": 5, "max_interval_length": 100}, + {"score": "mean", "min_segment_length": 1, "max_interval_length": 20}, ] return params From c6e7a4cde8f95817a99ff9fdf71400434183cba1 Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 14:46:37 +0200 Subject: [PATCH 24/75] Add base class tests --- skchange/tests/test_all_detectors.py | 38 ++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/skchange/tests/test_all_detectors.py b/skchange/tests/test_all_detectors.py index 3a1621d4..37bd35a5 100644 --- a/skchange/tests/test_all_detectors.py +++ b/skchange/tests/test_all_detectors.py @@ -1,13 +1,47 @@ """Tests for all annotators/detectors in skchange.""" +import pandas as pd +import pytest +from sktime.utils._testing.annotation import make_annotation_problem from sktime.utils.estimator_checks import check_estimator, parametrize_with_checks -from skchange.anomaly_detectors.tests.test_anomaly_detectors import anomaly_detectors +from skchange.base import BaseDetector from skchange.change_detectors.tests.test_change_detectors import change_detectors +from skchange.datasets.generate import generate_anomalous_data -all_detectors = anomaly_detectors + change_detectors +# all_detectors = anomaly_detectors + change_detectors +all_detectors = change_detectors @parametrize_with_checks(all_detectors) def test_sktime_compatible_estimators(obj, test_name): check_estimator(obj, tests_to_run=test_name, raise_exceptions=True) + + +@pytest.mark.parametrize("Detector", all_detectors) +def test_detector_fit(Detector): + """Test fit method.""" + detector = Detector.create_test_instance() + x = make_annotation_problem(n_timepoints=50, estimator_type="None") + fit_detector = detector.fit(x) + assert issubclass(detector.__class__, BaseDetector) + assert issubclass(fit_detector.__class__, BaseDetector) + assert isinstance(fit_detector, Detector) + + +@pytest.mark.parametrize("Detector", all_detectors) +def test_detector_predict(Detector): + """Test fit method.""" + detector = Detector.create_test_instance() + x = generate_anomalous_data(means=10, random_state=60) + y = detector.fit_predict(x) + assert isinstance(y, (pd.Series, pd.DataFrame)) + + +@pytest.mark.parametrize("Detector", all_detectors) +def test_detector_transform(Detector): + """Test fit method.""" + detector = Detector.create_test_instance() + x = generate_anomalous_data(means=10, random_state=60) + y = detector.fit_transform(x) + assert isinstance(y, (pd.Series, pd.DataFrame)) From 1c366e5b39b31e9559b4316119c0909688a09e50 Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 15:04:44 +0200 Subject: [PATCH 25/75] Add more tests --- skchange/tests/test_all_detectors.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git 
a/skchange/tests/test_all_detectors.py b/skchange/tests/test_all_detectors.py index 37bd35a5..04a335ef 100644 --- a/skchange/tests/test_all_detectors.py +++ b/skchange/tests/test_all_detectors.py @@ -20,7 +20,7 @@ def test_sktime_compatible_estimators(obj, test_name): @pytest.mark.parametrize("Detector", all_detectors) def test_detector_fit(Detector): - """Test fit method.""" + """Test fit method output.""" detector = Detector.create_test_instance() x = make_annotation_problem(n_timepoints=50, estimator_type="None") fit_detector = detector.fit(x) @@ -31,7 +31,7 @@ def test_detector_fit(Detector): @pytest.mark.parametrize("Detector", all_detectors) def test_detector_predict(Detector): - """Test fit method.""" + """Test predict method output.""" detector = Detector.create_test_instance() x = generate_anomalous_data(means=10, random_state=60) y = detector.fit_predict(x) @@ -40,8 +40,21 @@ def test_detector_predict(Detector): @pytest.mark.parametrize("Detector", all_detectors) def test_detector_transform(Detector): - """Test fit method.""" + """Test transform method output.""" detector = Detector.create_test_instance() - x = generate_anomalous_data(means=10, random_state=60) + x = generate_anomalous_data(means=10, random_state=61) y = detector.fit_transform(x) assert isinstance(y, (pd.Series, pd.DataFrame)) + assert len(x) == len(y) + + +@pytest.mark.parametrize("Detector", all_detectors) +def test_detector_score_transform(Detector): + """Test score_transform method output.""" + detector = Detector.create_test_instance() + x = generate_anomalous_data(means=10, random_state=62) + try: + y = detector.fit(x).score_transform(x) + assert isinstance(y, (pd.Series, pd.DataFrame)) + except NotImplementedError: + pass From 96d164af0dafb8d3f039bd0b6526be02b0877432 Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 15:08:27 +0200 Subject: [PATCH 26/75] Remove duplicate test Exists in test_detectors.py --- .../tests/test_change_detectors.py | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/skchange/change_detectors/tests/test_change_detectors.py b/skchange/change_detectors/tests/test_change_detectors.py index 8e9c1411..e2f6c9bd 100644 --- a/skchange/change_detectors/tests/test_change_detectors.py +++ b/skchange/change_detectors/tests/test_change_detectors.py @@ -2,8 +2,6 @@ import pandas as pd import pytest -from sktime.tests.test_switch import run_test_for_class -from sktime.utils._testing.annotation import make_annotation_problem from skchange.change_detectors.moscore import Moscore from skchange.change_detectors.pelt import Pelt @@ -13,24 +11,6 @@ change_detectors = [Moscore, Pelt, SeededBinarySegmentation] -@pytest.mark.parametrize("Estimator", change_detectors) -def test_output_type(Estimator): - """Test annotator output type.""" - estimator = Estimator.create_test_instance() - if not run_test_for_class(Estimator): - return None - - arg = make_annotation_problem( - n_timepoints=50, estimator_type=estimator.get_tag("distribution_type") - ) - estimator.fit(arg) - arg = make_annotation_problem( - n_timepoints=30, estimator_type=estimator.get_tag("distribution_type") - ) - y_pred = estimator.predict(arg) - assert isinstance(y_pred, (pd.DataFrame, pd.Series)) - - @pytest.mark.parametrize("Estimator", change_detectors) def test_change_detector_sparse_int(Estimator): """Test sparse int_label segmentation.""" From 0fb4992336145400e5f95b68891f93d07b51ab25 Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 15:30:36 +0200 Subject: [PATCH 27/75] Fix changepoint tests for 
new interface --- .../tests/test_change_detectors.py | 65 ++++++------------- 1 file changed, 21 insertions(+), 44 deletions(-) diff --git a/skchange/change_detectors/tests/test_change_detectors.py b/skchange/change_detectors/tests/test_change_detectors.py index e2f6c9bd..51e918b6 100644 --- a/skchange/change_detectors/tests/test_change_detectors.py +++ b/skchange/change_detectors/tests/test_change_detectors.py @@ -1,6 +1,5 @@ """Basic tests for all change detectors.""" -import pandas as pd import pytest from skchange.change_detectors.moscore import Moscore @@ -12,79 +11,57 @@ @pytest.mark.parametrize("Estimator", change_detectors) -def test_change_detector_sparse_int(Estimator): - """Test sparse int_label segmentation.""" +def test_change_detector_predict(Estimator): + """Test changepoint detector predict (sparse output).""" n_segments = 2 seg_len = 50 df = generate_teeth_data( n_segments=n_segments, mean=10, segment_length=seg_len, p=1, random_state=2 ) detector = Estimator.create_test_instance() - detector.set_params(fmt="sparse", labels="int_label") changepoints = detector.fit_predict(df) assert len(changepoints) == n_segments - 1 and changepoints[0] == seg_len - 1 @pytest.mark.parametrize("Estimator", change_detectors) -def test_change_detector_sparse_indicator(Estimator): - """Test sparse indicator segmentation.""" +def test_change_detector_transform(Estimator): + """Test changepoint detector transform (dense output).""" n_segments = 2 seg_len = 50 df = generate_teeth_data( - n_segments=n_segments, mean=10, segment_length=seg_len, p=1, random_state=3 + n_segments=n_segments, mean=10, segment_length=seg_len, p=1, random_state=2 ) detector = Estimator.create_test_instance() - detector.set_params(fmt="sparse", labels="indicator") - changepoints = detector.fit_predict(df) - assert len(changepoints) == n_segments - 1 and changepoints[0] == seg_len - 1 - - -@pytest.mark.parametrize("Estimator", change_detectors) -def test_change_detector_score(Estimator): - """Test sparse score segmentation.""" - n_segments = 2 - seg_len = 50 - df = generate_teeth_data( - n_segments=n_segments, mean=10, segment_length=seg_len, p=1, random_state=4 - ) - sparse_detector = Estimator.create_test_instance() - sparse_detector.set_params(fmt="sparse", labels="score") - dense_detector = Estimator.create_test_instance() - dense_detector.set_params(fmt="dense", labels="score") - sparse_scores = sparse_detector.fit_predict(df) - dense_scores = dense_detector.fit_predict(df) - assert (sparse_scores == dense_scores).all(axis=None) - if isinstance(sparse_scores, pd.DataFrame): - assert "score" in sparse_scores.columns - else: - assert sparse_scores.name == "score" + labels = detector.fit_transform(df) + assert labels.nunique() == n_segments + assert labels[seg_len - 1] == 0.0 and labels[seg_len] == 1.0 @pytest.mark.parametrize("Estimator", change_detectors) -def test_change_detector_dense_int(Estimator): - """Tests dense int_label segmentation.""" +def test_change_detector_sparse_to_dense(Estimator): + """Test that predict + sparse_to_dense == transform.""" n_segments = 2 seg_len = 50 df = generate_teeth_data( n_segments=n_segments, mean=10, segment_length=seg_len, p=1, random_state=2 ) detector = Estimator.create_test_instance() - detector.set_params(fmt="dense", labels="int_label") - labels = detector.fit_predict(df) - assert labels.nunique() == n_segments - assert labels[seg_len - 1] == 0.0 and labels[seg_len] == 1.0 + changepoints = detector.fit_predict(df) + labels = detector.sparse_to_dense(changepoints, df.index) 
+ labels_transform = detector.fit_transform(df) + assert labels.equals(labels_transform) @pytest.mark.parametrize("Estimator", change_detectors) -def test_change_detector_dense_indicator(Estimator): - """Tests dense indicator segmentation.""" +def test_change_detector_dense_to_sparse(Estimator): + """Test that transform + dense_to_sparse == predict.""" n_segments = 2 seg_len = 50 df = generate_teeth_data( - n_segments=n_segments, mean=10, segment_length=seg_len, p=1, random_state=8 + n_segments=n_segments, mean=10, segment_length=seg_len, p=1, random_state=2 ) detector = Estimator.create_test_instance() - detector.set_params(fmt="dense", labels="indicator") - cpt_indicator = detector.fit_predict(df) - assert cpt_indicator.sum() == n_segments - 1 - assert cpt_indicator[seg_len - 1] + labels = detector.fit_transform(df) + changepoints = detector.dense_to_sparse(labels) + changepoints_predict = detector.fit_predict(df) + assert changepoints.equals(changepoints_predict) From b0e24bd5718f7d8f3fb3e7772aa184bda8959d13 Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 15:40:23 +0200 Subject: [PATCH 28/75] Fix tests for new interface --- skchange/change_detectors/tests/test_moscore.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/skchange/change_detectors/tests/test_moscore.py b/skchange/change_detectors/tests/test_moscore.py index 7d4ecbab..f090403c 100644 --- a/skchange/change_detectors/tests/test_moscore.py +++ b/skchange/change_detectors/tests/test_moscore.py @@ -16,7 +16,7 @@ def test_moscore_changepoint(score): df = generate_teeth_data( n_segments=n_segments, mean=10, segment_length=seg_len, p=1, random_state=2 ) - detector = Moscore(score, fmt="sparse", labels="int_label") + detector = Moscore(score) changepoints = detector.fit_predict(df) assert len(changepoints) == n_segments - 1 and changepoints[0] == seg_len - 1 @@ -29,9 +29,10 @@ def test_moscore_scores(score): df = generate_teeth_data( n_segments=n_segments, mean=10, segment_length=seg_len, p=1, random_state=3 ) - detector = Moscore(score, fmt="dense", labels="score") - scores = detector.fit_predict(df) + detector = Moscore(score) + scores = detector.fit(df).score_transform(df) assert np.all(scores >= 0.0) + assert len(scores) == len(df) @pytest.mark.parametrize("score", VALID_CHANGE_SCORES) @@ -42,6 +43,6 @@ def test_moscore_tuning(score): df = generate_teeth_data( n_segments=n_segments, mean=10, segment_length=seg_len, p=1, random_state=4 ) - detector = Moscore(score, threshold_scale=None, fmt="dense", labels="indicator") + detector = Moscore(score, threshold_scale=None) detector.fit(df) assert detector.threshold_ > 0.0 From 7387cc8bece08372c7c6ddab632a5d1eebf719d5 Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 15:40:29 +0200 Subject: [PATCH 29/75] Fix tests for new interface --- skchange/change_detectors/tests/test_seeded_binseg.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/skchange/change_detectors/tests/test_seeded_binseg.py b/skchange/change_detectors/tests/test_seeded_binseg.py index d4a49ff3..7bc69295 100644 --- a/skchange/change_detectors/tests/test_seeded_binseg.py +++ b/skchange/change_detectors/tests/test_seeded_binseg.py @@ -51,10 +51,7 @@ def test_binseg_tuning(score): df = generate_teeth_data( n_segments=n_segments, mean=10, segment_length=seg_len, p=1, random_state=4 ) - detector = SeededBinarySegmentation( - score, threshold_scale=None, fmt="dense", labels="score" - ) - detector.fit(df) - scores = detector.predict(df) - assert 
detector.threshold_ >= scores["score"].mean() - assert detector.threshold_ <= scores["score"].max() + detector = SeededBinarySegmentation(score, threshold_scale=None) + detector.fit_predict(df) + assert detector.threshold_ >= detector.scores["score"].mean() + assert detector.threshold_ <= detector.scores["score"].max() From 2cd54d107f00126e2667e050559f02fa285603b1 Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 15:53:48 +0200 Subject: [PATCH 30/75] Delete change_detectors.utils module None of the functions are in use anymore. --- skchange/change_detectors/utils.py | 72 ------------------------------ 1 file changed, 72 deletions(-) delete mode 100644 skchange/change_detectors/utils.py diff --git a/skchange/change_detectors/utils.py b/skchange/change_detectors/utils.py deleted file mode 100644 index ab667cea..00000000 --- a/skchange/change_detectors/utils.py +++ /dev/null @@ -1,72 +0,0 @@ -"""Utility functions for change detection.""" - -from typing import Union - -import numpy as np -import pandas as pd - - -def changepoints_to_labels(changepoints: list, n) -> np.ndarray: - """Convert a list of changepoints to a list of labels. - - Parameters - ---------- - changepoints : list - List of changepoint indices. - n: int - Sample size. - - Returns - ------- - labels : np.ndarray - 1D array of labels: 0 for the first segment, 1 for the second, etc. - """ - changepoints = [-1] + changepoints + [n - 1] - labels = np.zeros(n) - for i in range(len(changepoints) - 1): - labels[changepoints[i] + 1 : changepoints[i + 1] + 1] = i - return labels - - -def format_changepoint_output( - fmt: str, - labels: str, - changepoints: list, - X_index: pd.Index, - scores: Union[pd.Series, pd.DataFrame] = None, -) -> pd.Series: - """Format the predict method output of change detectors. - - Parameters - ---------- - fmt : str - Format of the output. Either "sparse" or "dense". - labels : str - Labels of the output. Either "indicator", "score" or "int_label". - changepoints : list - List of changepoint indices. - X_index : pd.Index - Index of the input data. - scores : pd.Series or pd.DataFrame, optional (default=None) - Series or DataFrame of scores. If Series, it must be named 'score', and if - DataFrame, it must have a column named 'score'. - - Returns - ------- - pd.Series - Either a sparse or dense pd.Series of boolean labels, integer labels or scores. - """ - if fmt == "sparse" and labels in ["int_label", "indicator"]: - out = pd.Series(changepoints, name="changepoints", dtype=int) - elif fmt == "dense" and labels == "int_label": - out = changepoints_to_labels(changepoints, len(X_index)) - out = pd.Series(out, index=X_index, name="int_label", dtype=int) - elif fmt == "dense" and labels == "indicator": - out = pd.Series(False, index=X_index, name="indicator", dtype=bool) - out.iloc[changepoints] = True - elif labels == "score": - # There is no sparse version of 'score'. - # The scores are formatted in each class' _predict method, as what is a good - # format for the scores is method dependent. 
- out = scores - return out From c463e0862d93a2346a0f80f513f95390b6ea776b Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 20:35:29 +0200 Subject: [PATCH 31/75] Put data generation outside test functions Avoid duplication --- .../tests/test_change_detectors.py | 40 ++++++------------- 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/skchange/change_detectors/tests/test_change_detectors.py b/skchange/change_detectors/tests/test_change_detectors.py index 51e918b6..a72c4c6b 100644 --- a/skchange/change_detectors/tests/test_change_detectors.py +++ b/skchange/change_detectors/tests/test_change_detectors.py @@ -9,30 +9,26 @@ change_detectors = [Moscore, Pelt, SeededBinarySegmentation] +n_segments = 2 +seg_len = 50 +changepoint_data = generate_teeth_data( + n_segments=n_segments, mean=10, segment_length=seg_len, p=1, random_state=2 +) + @pytest.mark.parametrize("Estimator", change_detectors) def test_change_detector_predict(Estimator): """Test changepoint detector predict (sparse output).""" - n_segments = 2 - seg_len = 50 - df = generate_teeth_data( - n_segments=n_segments, mean=10, segment_length=seg_len, p=1, random_state=2 - ) detector = Estimator.create_test_instance() - changepoints = detector.fit_predict(df) + changepoints = detector.fit_predict(changepoint_data) assert len(changepoints) == n_segments - 1 and changepoints[0] == seg_len - 1 @pytest.mark.parametrize("Estimator", change_detectors) def test_change_detector_transform(Estimator): """Test changepoint detector transform (dense output).""" - n_segments = 2 - seg_len = 50 - df = generate_teeth_data( - n_segments=n_segments, mean=10, segment_length=seg_len, p=1, random_state=2 - ) detector = Estimator.create_test_instance() - labels = detector.fit_transform(df) + labels = detector.fit_transform(changepoint_data) assert labels.nunique() == n_segments assert labels[seg_len - 1] == 0.0 and labels[seg_len] == 1.0 @@ -40,28 +36,18 @@ def test_change_detector_transform(Estimator): @pytest.mark.parametrize("Estimator", change_detectors) def test_change_detector_sparse_to_dense(Estimator): """Test that predict + sparse_to_dense == transform.""" - n_segments = 2 - seg_len = 50 - df = generate_teeth_data( - n_segments=n_segments, mean=10, segment_length=seg_len, p=1, random_state=2 - ) detector = Estimator.create_test_instance() - changepoints = detector.fit_predict(df) - labels = detector.sparse_to_dense(changepoints, df.index) - labels_transform = detector.fit_transform(df) + changepoints = detector.fit_predict(changepoint_data) + labels = detector.sparse_to_dense(changepoints, changepoint_data.index) + labels_transform = detector.fit_transform(changepoint_data) assert labels.equals(labels_transform) @pytest.mark.parametrize("Estimator", change_detectors) def test_change_detector_dense_to_sparse(Estimator): """Test that transform + dense_to_sparse == predict.""" - n_segments = 2 - seg_len = 50 - df = generate_teeth_data( - n_segments=n_segments, mean=10, segment_length=seg_len, p=1, random_state=2 - ) detector = Estimator.create_test_instance() - labels = detector.fit_transform(df) + labels = detector.fit_transform(changepoint_data) changepoints = detector.dense_to_sparse(labels) - changepoints_predict = detector.fit_predict(df) + changepoints_predict = detector.fit_predict(changepoint_data) assert changepoints.equals(changepoints_predict) From bd000a88a8c6aece9766b91450b209578d0add57 Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 21:22:49 +0200 Subject: [PATCH 32/75] Fix anomaly tests for new 
interface --- .../tests/test_anomaly_detectors.py | 131 ++++++------------ skchange/tests/test_all_detectors.py | 5 +- 2 files changed, 47 insertions(+), 89 deletions(-) diff --git a/skchange/anomaly_detectors/tests/test_anomaly_detectors.py b/skchange/anomaly_detectors/tests/test_anomaly_detectors.py index 2a4fc418..d1a5c09e 100644 --- a/skchange/anomaly_detectors/tests/test_anomaly_detectors.py +++ b/skchange/anomaly_detectors/tests/test_anomaly_detectors.py @@ -2,121 +2,78 @@ import pandas as pd import pytest -from sktime.tests.test_switch import run_test_for_class -from sktime.utils._testing.annotation import make_annotation_problem from skchange.anomaly_detectors.anomalisers import StatThresholdAnomaliser -from skchange.anomaly_detectors.capa import Capa -from skchange.anomaly_detectors.circular_binseg import CircularBinarySegmentation -from skchange.anomaly_detectors.moscore_anomaly import MoscoreAnomaly -from skchange.anomaly_detectors.mvcapa import Mvcapa +from skchange.anomaly_detectors.base import CollectiveAnomalyDetector from skchange.datasets.generate import generate_anomalous_data -anomaly_detectors = [ - Capa, - CircularBinarySegmentation, - MoscoreAnomaly, - Mvcapa, +collective_anomaly_detectors = [ + # Capa, + # CircularBinarySegmentation, + # MoscoreAnomaly, + # Mvcapa, StatThresholdAnomaliser, ] +point_anomaly_detectors = [] +anomaly_detectors = collective_anomaly_detectors + point_anomaly_detectors -true_anomalies = [(50, 59), (120, 129)] + +true_anomalies = [(30, 39), (70, 75)] anomaly_data = generate_anomalous_data( - 200, anomalies=true_anomalies, means=[10.0, 5.0], random_state=39 + 100, anomalies=true_anomalies, means=[10.0, 15.0], random_state=39 ) -@pytest.mark.parametrize("Estimator", anomaly_detectors) -def test_output_type(Estimator): - """Test annotator output type.""" - estimator = Estimator.create_test_instance() - if not run_test_for_class(Estimator): - return None - - arg = make_annotation_problem( - n_timepoints=500, estimator_type=estimator.get_tag("distribution_type") - ) - estimator.fit(arg) - arg = make_annotation_problem( - n_timepoints=200, estimator_type=estimator.get_tag("distribution_type") - ) - y_pred = estimator.predict(arg) - assert isinstance(y_pred, (pd.DataFrame, pd.Series)) - - -@pytest.mark.parametrize("Estimator", anomaly_detectors) -def test_anomaly_detector_sparse_int(Estimator): - """Test sparse int label anomaly detector output. - - Check if the predicted anomalies match. - """ +@pytest.mark.parametrize("Estimator", collective_anomaly_detectors) +def test_collective_anomaly_detector_predict(Estimator): + """Test collective anomaly detector's predict method (sparse output).""" detector = Estimator.create_test_instance() - detector.set_params(fmt="sparse", labels="int_label") anomalies = detector.fit_predict(anomaly_data) - assert len(anomalies) == len(true_anomalies) - for i, (start, end) in enumerate(true_anomalies): - assert anomalies.loc[i, "start"] == start and anomalies.loc[i, "end"] == end + if isinstance(anomalies, pd.DataFrame): + anomalies = anomalies["location"] - -@pytest.mark.parametrize("Estimator", anomaly_detectors) -def test_anomaly_detector_sparse_indicator(Estimator): - """Test sparse indicator anomaly detector output. - - Check if the predicted anomalies match. 
- """ - detector = Estimator.create_test_instance() - detector.set_params(fmt="sparse", labels="indicator") - anomalies = detector.fit_predict(anomaly_data) assert len(anomalies) == len(true_anomalies) for i, (start, end) in enumerate(true_anomalies): - assert anomalies.loc[i, "start"] == start and anomalies.loc[i, "end"] == end + assert anomalies.array.left[i] == start and anomalies.array.right[i] == end @pytest.mark.parametrize("Estimator", anomaly_detectors) -def test_anomaly_detector_score(Estimator): - """Test score anomaly detector output.""" - sparse_detector = Estimator.create_test_instance() - sparse_detector.set_params(fmt="sparse", labels="score") - dense_detector = Estimator.create_test_instance() - dense_detector.set_params(fmt="dense", labels="score") - sparse_scores = sparse_detector.fit_predict(anomaly_data) - dense_scores = dense_detector.fit_predict(anomaly_data) - assert (sparse_scores == dense_scores).all(axis=None) - if isinstance(sparse_scores, pd.DataFrame): - assert "score" in sparse_scores.columns - else: - assert sparse_scores.name == "score" - - -@pytest.mark.parametrize("Estimator", anomaly_detectors) -def test_anomaly_detector_dense_int(Estimator): - """Tests dense int label anomaly detector output. - - Check if the predicted anomalies matches. - """ +def test_collective_anomaly_detector_transform(Estimator): + """Test collective anomaly detector's transform method (dense output).""" detector = Estimator.create_test_instance() - detector.set_params(fmt="dense", labels="int_label") - labels = detector.fit_predict(anomaly_data) + labels = detector.fit_transform(anomaly_data) if isinstance(labels, pd.DataFrame): labels = labels.iloc[:, 0] + true_collective_anomalies = pd.IntervalIndex.from_tuples( + true_anomalies, closed="both" + ) + true_anomaly_labels = CollectiveAnomalyDetector.sparse_to_dense( + true_collective_anomalies, anomaly_data.index + ) + labels.equals(true_anomaly_labels) + + # Similar test that does not depend on sparse_to_dense, just to be sure. assert labels.nunique() == len(true_anomalies) + 1 for i, (start, end) in enumerate(true_anomalies): assert (labels.iloc[start : end + 1] == i + 1).all() @pytest.mark.parametrize("Estimator", anomaly_detectors) -def test_anomaly_detector_dense_indicator(Estimator): - """Tests dense indicator anomaly detector output. - - Check if the predicted anomalies matches. 
- """ +def test_anomaly_detector_sparse_to_dense(Estimator): + """Test that predict + sparse_to_dense == transform.""" detector = Estimator.create_test_instance() - detector.set_params(fmt="dense", labels="indicator") - labels = detector.fit_predict(anomaly_data) - if isinstance(labels, pd.DataFrame): - labels = labels.iloc[:, 0] + anomalies = detector.fit_predict(anomaly_data) + labels = detector.sparse_to_dense(anomalies, anomaly_data.index) + labels_transform = detector.fit_transform(anomaly_data) + assert labels.equals(labels_transform) - for start, end in true_anomalies: - assert labels.iloc[start : end + 1].all() - assert not labels.iloc[start - 1] and not labels.iloc[end + 1] + +@pytest.mark.parametrize("Estimator", anomaly_detectors) +def test_anomaly_detector_dense_to_sparse(Estimator): + """Test that transform + dense_to_sparse == predict.""" + detector = Estimator.create_test_instance() + labels = detector.fit_transform(anomaly_data) + anomalies = detector.dense_to_sparse(labels) + anomalies_predict = detector.fit_predict(anomaly_data) + assert anomalies.equals(anomalies_predict) diff --git a/skchange/tests/test_all_detectors.py b/skchange/tests/test_all_detectors.py index 04a335ef..17330875 100644 --- a/skchange/tests/test_all_detectors.py +++ b/skchange/tests/test_all_detectors.py @@ -5,12 +5,13 @@ from sktime.utils._testing.annotation import make_annotation_problem from sktime.utils.estimator_checks import check_estimator, parametrize_with_checks +from skchange.anomaly_detectors.tests.test_anomaly_detectors import anomaly_detectors from skchange.base import BaseDetector from skchange.change_detectors.tests.test_change_detectors import change_detectors from skchange.datasets.generate import generate_anomalous_data -# all_detectors = anomaly_detectors + change_detectors -all_detectors = change_detectors +# TODO: Move all detectors to __init__ files. A bit random construction now. +all_detectors = anomaly_detectors + change_detectors @parametrize_with_checks(all_detectors) From 1e3f5401de612257e041fa9b12c6fcf40156d689 Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 21:23:35 +0200 Subject: [PATCH 33/75] Conform Anomaliser to BaseDetector --- skchange/anomaly_detectors/anomalisers.py | 52 +++++++++++------------ 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/skchange/anomaly_detectors/anomalisers.py b/skchange/anomaly_detectors/anomalisers.py index 7500fe43..ba9e6e9a 100644 --- a/skchange/anomaly_detectors/anomalisers.py +++ b/skchange/anomaly_detectors/anomalisers.py @@ -4,17 +4,17 @@ import numpy as np import pandas as pd -from sktime.annotation.base import BaseSeriesAnnotator -from skchange.anomaly_detectors.utils import format_anomaly_output +from skchange.anomaly_detectors.base import CollectiveAnomalyDetector +from skchange.change_detectors.base import ChangepointDetector -class StatThresholdAnomaliser(BaseSeriesAnnotator): +class StatThresholdAnomaliser(CollectiveAnomalyDetector): """Anomaly detection based on thresholding the values of segment statistics. Parameters ---------- - change_detector : BaseSeriesAnnotator + change_detector : ChangepointDetector Change detector to use for detecting segments. stat : callable, optional (default=np.mean) Statistic to calculate per segment. 
@@ -32,32 +32,30 @@ class StatThresholdAnomaliser(BaseSeriesAnnotator): def __init__( self, - change_detector: BaseSeriesAnnotator, + change_detector: ChangepointDetector, stat: Callable = np.mean, stat_lower: float = -1.0, stat_upper: float = 1.0, - fmt: str = "sparse", - labels: str = "indicator", ): self.change_detector = change_detector self.stat = stat self.stat_lower = stat_lower self.stat_upper = stat_upper - super().__init__(fmt=fmt, labels=labels) + super().__init__() if self.stat_lower > self.stat_upper: message = f"stat_lower ({self.stat_lower}) must be less" +f" than or equal to stat_upper ({self.stat_upper})." raise ValueError(message) - def _fit(self, X: pd.DataFrame, Y: Optional[pd.DataFrame] = None): + def _fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None): """Fits the change detector to training data. Parameters ---------- X : pd.DataFrame training data to fit the threshold to. - Y : pd.Series, optional + y : pd.Series, optional Does nothing. Only here to make the fit method compatible with sktime and scikit-learn. @@ -65,7 +63,8 @@ def _fit(self, X: pd.DataFrame, Y: Optional[pd.DataFrame] = None): ------- self : returns a reference to self """ - self.change_detector.fit(X, Y) + self.change_detector_ = self.change_detector.clone() + self.change_detector_.fit(X, y) return self def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: @@ -77,28 +76,19 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: Returns ------- - Y : pd.Series - annotations for sequence X + y : pd.Series - annotations for sequence X exact format depends on annotation type """ # This is the required output format for the rest of the code to work. - self.change_detector.fmt = "dense" - self.change_detector.labels = "int_label" - self.segments = self.change_detector.predict(X) - - df = pd.concat([X, self.segments], axis=1) - self.anomalies = [] - for _, segment in df.reset_index(drop=True).groupby("int_label"): + segments = self.change_detector_.transform(X) + df = pd.concat([X, segments], axis=1) + anomalies = [] + for _, segment in df.reset_index(drop=True).groupby("segment_label"): segment_stat = self.stat(segment.iloc[:, 0].values) if (segment_stat < self.stat_lower) | (segment_stat > self.stat_upper): - self.anomalies.append((int(segment.index[0]), int(segment.index[-1]))) + anomalies.append((int(segment.index[0]), int(segment.index[-1]))) - return format_anomaly_output( - self.fmt, - self.labels, - X.index, - self.anomalies, - scores=self.change_detector.scores, - ) + return CollectiveAnomalyDetector._format_sparse_output(anomalies) @classmethod def get_test_params(cls, parameter_set="default"): @@ -123,10 +113,16 @@ def get_test_params(cls, parameter_set="default"): params = [ { - "change_detector": Moscore(bandwidth=10), + "change_detector": Moscore(bandwidth=3), "stat": np.mean, "stat_lower": -1.0, "stat_upper": 1.0, }, + { + "change_detector": Moscore(bandwidth=5), + "stat": np.median, + "stat_lower": -2.0, + "stat_upper": 2.0, + }, ] return params From b05857747a76277026070e1510543044a3998075 Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 21:47:05 +0200 Subject: [PATCH 34/75] Rename ChangepointDetector -> ChangeDetector To make terminology uniform + shorter --- NOTES.md | 4 ++-- skchange/change_detectors/base.py | 4 ++-- skchange/change_detectors/moscore.py | 6 +++--- skchange/change_detectors/pelt.py | 6 +++--- skchange/change_detectors/seeded_binseg.py | 6 +++--- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/NOTES.md 
b/NOTES.md index e6dfda2f..aef08432 100644 --- a/NOTES.md +++ b/NOTES.md @@ -141,7 +141,7 @@ using the same example data as for anomaly detection. ### Changepoints in univariate data or multivariate data without subset changes ```python -detector = ChangepointDetector().fit(x_univariate) +detector = ChangeDetector().fit(x_univariate) detector.predict(x_univariate) 0 0 1 1 @@ -152,7 +152,7 @@ dtype: int64 ``` ### Subset changepoints in multivariate data ```python -detector = SubsetChangepointDetector().fit(x_multivariate) +detector = SubsetChangeDetector().fit(x_multivariate) detector.predict(x_multivariate) index columns 0 0 [0] diff --git a/skchange/change_detectors/base.py b/skchange/change_detectors/base.py index cd7c6c91..928b2a1a 100644 --- a/skchange/change_detectors/base.py +++ b/skchange/change_detectors/base.py @@ -6,7 +6,7 @@ from skchange.base import BaseDetector -class ChangepointDetector(BaseDetector): +class ChangeDetector(BaseDetector): """Base class for changepoint detectors. Changepoint detectors detect points in time where a change in the data occurs. @@ -77,7 +77,7 @@ def dense_to_sparse(y_dense: pd.Series) -> pd.Series: # changepoint = end of segment, so the label diffs > 0 must be shiftet by -1. is_changepoint = np.roll(y_dense.diff().abs() > 0, -1) changepoints = y_dense.index[is_changepoint] - return ChangepointDetector._format_sparse_output(changepoints) + return ChangeDetector._format_sparse_output(changepoints) @staticmethod def _format_sparse_output(changepoints) -> pd.Series: diff --git a/skchange/change_detectors/moscore.py b/skchange/change_detectors/moscore.py index 67a1e2a5..979bf45c 100644 --- a/skchange/change_detectors/moscore.py +++ b/skchange/change_detectors/moscore.py @@ -9,7 +9,7 @@ import pandas as pd from numba import njit -from skchange.change_detectors.base import ChangepointDetector +from skchange.change_detectors.base import ChangeDetector from skchange.scores.score_factory import score_factory from skchange.utils.numba.general import where from skchange.utils.validation.data import check_data @@ -48,7 +48,7 @@ def moscore_transform( return scores -class Moscore(ChangepointDetector): +class Moscore(ChangeDetector): """Moving score algorithm for multiple changepoint detection. A generalized version of the MOSUM (moving sum) algorithm [1]_ for changepoint @@ -284,7 +284,7 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: changepoints = get_moscore_changepoints( self.scores.values, self.threshold_, self.min_detection_interval ) - return ChangepointDetector._format_sparse_output(changepoints) + return ChangeDetector._format_sparse_output(changepoints) @classmethod def get_test_params(cls, parameter_set="default"): diff --git a/skchange/change_detectors/pelt.py b/skchange/change_detectors/pelt.py index cfd0bf70..479f1c77 100644 --- a/skchange/change_detectors/pelt.py +++ b/skchange/change_detectors/pelt.py @@ -10,7 +10,7 @@ import pandas as pd from numba import njit -from skchange.change_detectors.base import ChangepointDetector +from skchange.change_detectors.base import ChangeDetector from skchange.costs.cost_factory import cost_factory from skchange.utils.validation.data import check_data from skchange.utils.validation.parameters import check_larger_than @@ -61,7 +61,7 @@ def run_pelt( return opt_cost[1:], get_changepoints(prev_cpts) -class Pelt(ChangepointDetector): +class Pelt(ChangeDetector): """Pruned exact linear time changepoint detection. An efficient implementation of the PELT algorithm [1]_ for changepoint detection. 
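As background for the Pelt docstring above: the dynamic program that PELT prunes can be written in a few lines. A readable O(n^2) sketch with a Gaussian mean cost, not the njit-compiled `run_pelt` used here:

```python
import numpy as np

def optimal_partition(x: np.ndarray, penalty: float) -> list:
    """Exact changepoint search by dynamic programming (no pruning)."""
    n = len(x)
    opt_cost = np.zeros(n + 1)
    prev_cpt = np.zeros(n + 1, dtype=int)
    for t in range(1, n + 1):
        # Candidate last-segment starts s; cost of x[s:t] is the sum of
        # squared deviations from the segment mean, plus a penalty per segment.
        candidates = [
            opt_cost[s] + np.sum((x[s:t] - x[s:t].mean()) ** 2) + penalty
            for s in range(t)
        ]
        best = int(np.argmin(candidates))
        opt_cost[t], prev_cpt[t] = candidates[best], best
    # Backtrack; a changepoint is the last element of a segment.
    cpts, t = [], n
    while t > 0:
        if prev_cpt[t] > 0:
            cpts.append(prev_cpt[t] - 1)
        t = prev_cpt[t]
    return sorted(cpts)
```

PELT recovers the same solution in roughly linear time by discarding candidate starts `s` that can never become optimal again.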
@@ -221,7 +221,7 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: ) # Store the scores for introspection without recomputing using score_transform self.scores = pd.Series(opt_costs, index=X.index, name="score") - return ChangepointDetector._format_sparse_output(changepoints) + return ChangeDetector._format_sparse_output(changepoints) def _score_transform(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: """Compute the pelt scores for the input data. diff --git a/skchange/change_detectors/seeded_binseg.py b/skchange/change_detectors/seeded_binseg.py index 407cb4ad..5d5b3312 100644 --- a/skchange/change_detectors/seeded_binseg.py +++ b/skchange/change_detectors/seeded_binseg.py @@ -9,7 +9,7 @@ import pandas as pd from numba import njit -from skchange.change_detectors.base import ChangepointDetector +from skchange.change_detectors.base import ChangeDetector from skchange.scores.score_factory import score_factory from skchange.utils.validation.data import check_data from skchange.utils.validation.parameters import check_in_interval, check_larger_than @@ -94,7 +94,7 @@ def run_seeded_binseg( return cpts, amoc_scores, maximizers, starts, ends -class SeededBinarySegmentation(ChangepointDetector): +class SeededBinarySegmentation(ChangeDetector): """Seeded binary segmentation algorithm for multiple changepoint detection. Binary segmentation type changepoint detection algorithms recursively split the data @@ -304,7 +304,7 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: self.scores = pd.DataFrame( {"start": starts, "end": ends, "argmax_cpt": maximizers, "score": scores} ) - return ChangepointDetector._format_sparse_output(cpts) + return ChangeDetector._format_sparse_output(cpts) @classmethod def get_test_params(cls, parameter_set="default"): From 827ab74ac251375c7ae1b74c61d244d64208092b Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 21:49:01 +0200 Subject: [PATCH 35/75] Rename ChangepointDetector -> ChangeDetector --- skchange/anomaly_detectors/anomalisers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/skchange/anomaly_detectors/anomalisers.py b/skchange/anomaly_detectors/anomalisers.py index ba9e6e9a..b9ce8bb8 100644 --- a/skchange/anomaly_detectors/anomalisers.py +++ b/skchange/anomaly_detectors/anomalisers.py @@ -6,7 +6,7 @@ import pandas as pd from skchange.anomaly_detectors.base import CollectiveAnomalyDetector -from skchange.change_detectors.base import ChangepointDetector +from skchange.change_detectors.base import ChangeDetector class StatThresholdAnomaliser(CollectiveAnomalyDetector): @@ -14,7 +14,7 @@ class StatThresholdAnomaliser(CollectiveAnomalyDetector): Parameters ---------- - change_detector : ChangepointDetector + change_detector : ChangeDetector Change detector to use for detecting segments. stat : callable, optional (default=np.mean) Statistic to calculate per segment. 
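A usage sketch with the renamed type, mirroring the test parameters added to `get_test_params` a couple of commits earlier:

```python
import numpy as np
from skchange.anomaly_detectors.anomalisers import StatThresholdAnomaliser
from skchange.change_detectors.moscore import Moscore

detector = StatThresholdAnomaliser(
    Moscore(bandwidth=5), stat=np.median, stat_lower=-2.0, stat_upper=2.0
)
# detector.fit_predict(x) returns a pd.Series of pd.Interval, one per anomaly.
```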
@@ -32,7 +32,7 @@ class StatThresholdAnomaliser(CollectiveAnomalyDetector): def __init__( self, - change_detector: ChangepointDetector, + change_detector: ChangeDetector, stat: Callable = np.mean, stat_lower: float = -1.0, stat_upper: float = 1.0, From 9c7269b999e6b6d30bcb2cdc6755be76b796bfcb Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 21:50:10 +0200 Subject: [PATCH 36/75] Add detector lists to __init__ files --- skchange/anomaly_detectors/__init__.py | 20 ++++++++++++++++++ .../tests/test_anomaly_detectors.py | 21 +++++-------------- skchange/change_detectors/__init__.py | 10 +++++++++ .../tests/test_change_detectors.py | 14 +++++-------- skchange/tests/test_all_detectors.py | 17 +++++++-------- 5 files changed, 48 insertions(+), 34 deletions(-) diff --git a/skchange/anomaly_detectors/__init__.py b/skchange/anomaly_detectors/__init__.py index e406f81f..56c15793 100644 --- a/skchange/anomaly_detectors/__init__.py +++ b/skchange/anomaly_detectors/__init__.py @@ -1 +1,21 @@ """Anomaly detection algorithms.""" + +from skchange.anomaly_detectors.anomalisers import StatThresholdAnomaliser +from skchange.anomaly_detectors.base import ( + CollectiveAnomalyDetector, + PointAnomalyDetector, +) +from skchange.anomaly_detectors.capa import Capa + +BASE_ANOMALY_DETECTORS = [CollectiveAnomalyDetector, PointAnomalyDetector] +COLLECTIVE_ANOMALY_DETECTORS = [ + # Capa, + # CircularBinarySegmentation, + # MoscoreAnomaly, + # Mvcapa, + StatThresholdAnomaliser, +] +POINT_ANOMALY_DETECTORS = [] +ANOMALY_DETECTORS = COLLECTIVE_ANOMALY_DETECTORS + POINT_ANOMALY_DETECTORS + +__all__ = BASE_ANOMALY_DETECTORS + ANOMALY_DETECTORS diff --git a/skchange/anomaly_detectors/tests/test_anomaly_detectors.py b/skchange/anomaly_detectors/tests/test_anomaly_detectors.py index d1a5c09e..891449a8 100644 --- a/skchange/anomaly_detectors/tests/test_anomaly_detectors.py +++ b/skchange/anomaly_detectors/tests/test_anomaly_detectors.py @@ -3,28 +3,17 @@ import pandas as pd import pytest -from skchange.anomaly_detectors.anomalisers import StatThresholdAnomaliser +from skchange.anomaly_detectors import ANOMALY_DETECTORS, COLLECTIVE_ANOMALY_DETECTORS from skchange.anomaly_detectors.base import CollectiveAnomalyDetector from skchange.datasets.generate import generate_anomalous_data -collective_anomaly_detectors = [ - # Capa, - # CircularBinarySegmentation, - # MoscoreAnomaly, - # Mvcapa, - StatThresholdAnomaliser, -] -point_anomaly_detectors = [] -anomaly_detectors = collective_anomaly_detectors + point_anomaly_detectors - - true_anomalies = [(30, 39), (70, 75)] anomaly_data = generate_anomalous_data( 100, anomalies=true_anomalies, means=[10.0, 15.0], random_state=39 ) -@pytest.mark.parametrize("Estimator", collective_anomaly_detectors) +@pytest.mark.parametrize("Estimator", COLLECTIVE_ANOMALY_DETECTORS) def test_collective_anomaly_detector_predict(Estimator): """Test collective anomaly detector's predict method (sparse output).""" detector = Estimator.create_test_instance() @@ -37,7 +26,7 @@ def test_collective_anomaly_detector_predict(Estimator): assert anomalies.array.left[i] == start and anomalies.array.right[i] == end -@pytest.mark.parametrize("Estimator", anomaly_detectors) +@pytest.mark.parametrize("Estimator", COLLECTIVE_ANOMALY_DETECTORS) def test_collective_anomaly_detector_transform(Estimator): """Test collective anomaly detector's transform method (dense output).""" detector = Estimator.create_test_instance() @@ -59,7 +48,7 @@ def test_collective_anomaly_detector_transform(Estimator): assert 
(labels.iloc[start : end + 1] == i + 1).all()


-@pytest.mark.parametrize("Estimator", anomaly_detectors)
+@pytest.mark.parametrize("Estimator", ANOMALY_DETECTORS)
 def test_anomaly_detector_sparse_to_dense(Estimator):
     """Test that predict + sparse_to_dense == transform."""
     detector = Estimator.create_test_instance()
@@ -69,7 +58,7 @@ def test_anomaly_detector_sparse_to_dense(Estimator):


-@pytest.mark.parametrize("Estimator", anomaly_detectors)
+@pytest.mark.parametrize("Estimator", ANOMALY_DETECTORS)
 def test_anomaly_detector_dense_to_sparse(Estimator):
     """Test that transform + dense_to_sparse == predict."""
     detector = Estimator.create_test_instance()
diff --git a/skchange/change_detectors/__init__.py b/skchange/change_detectors/__init__.py
index 3064220c..f081b171 100644
--- a/skchange/change_detectors/__init__.py
+++ b/skchange/change_detectors/__init__.py
@@ -1 +1,11 @@
 """Change detection algorithms."""
+
+from skchange.change_detectors.base import ChangeDetector
+from skchange.change_detectors.moscore import Moscore
+from skchange.change_detectors.pelt import Pelt
+from skchange.change_detectors.seeded_binseg import SeededBinarySegmentation
+
+BASE_CHANGE_DETECTORS = [ChangeDetector]
+CHANGE_DETECTORS = [Moscore, Pelt, SeededBinarySegmentation]
+
+__all__ = BASE_CHANGE_DETECTORS + CHANGE_DETECTORS
diff --git a/skchange/change_detectors/tests/test_change_detectors.py b/skchange/change_detectors/tests/test_change_detectors.py
index a72c4c6b..f51d8d90 100644
--- a/skchange/change_detectors/tests/test_change_detectors.py
+++ b/skchange/change_detectors/tests/test_change_detectors.py
@@ -2,13 +2,9 @@

 import pytest

-from skchange.change_detectors.moscore import Moscore
-from skchange.change_detectors.pelt import Pelt
-from skchange.change_detectors.seeded_binseg import SeededBinarySegmentation
+from skchange.change_detectors import CHANGE_DETECTORS
 from skchange.datasets.generate import generate_teeth_data

-change_detectors = [Moscore, Pelt, SeededBinarySegmentation]
-
 n_segments = 2
 seg_len = 50
 changepoint_data = generate_teeth_data(
 )
-@pytest.mark.parametrize("Estimator", change_detectors)
+@pytest.mark.parametrize("Estimator", CHANGE_DETECTORS)
 def test_change_detector_predict(Estimator):
     """Test changepoint detector predict (sparse output)."""
     detector = Estimator.create_test_instance()
-@pytest.mark.parametrize("Estimator", change_detectors)
+@pytest.mark.parametrize("Estimator", CHANGE_DETECTORS)
 def test_change_detector_transform(Estimator):
     detector = Estimator.create_test_instance()
-@pytest.mark.parametrize("Estimator", change_detectors)
+@pytest.mark.parametrize("Estimator", CHANGE_DETECTORS)
 def test_change_detector_sparse_to_dense(Estimator):
     detector = Estimator.create_test_instance()
-@pytest.mark.parametrize("Estimator", change_detectors)
+@pytest.mark.parametrize("Estimator", CHANGE_DETECTORS)
 def test_change_detector_dense_to_sparse(Estimator):
     """Test that transform + dense_to_sparse == predict."""
     detector = 
Estimator.create_test_instance() diff --git a/skchange/tests/test_all_detectors.py b/skchange/tests/test_all_detectors.py index 17330875..4f17e01c 100644 --- a/skchange/tests/test_all_detectors.py +++ b/skchange/tests/test_all_detectors.py @@ -5,21 +5,20 @@ from sktime.utils._testing.annotation import make_annotation_problem from sktime.utils.estimator_checks import check_estimator, parametrize_with_checks -from skchange.anomaly_detectors.tests.test_anomaly_detectors import anomaly_detectors +from skchange.anomaly_detectors import ANOMALY_DETECTORS from skchange.base import BaseDetector -from skchange.change_detectors.tests.test_change_detectors import change_detectors +from skchange.change_detectors import CHANGE_DETECTORS from skchange.datasets.generate import generate_anomalous_data -# TODO: Move all detectors to __init__ files. A bit random construction now. -all_detectors = anomaly_detectors + change_detectors +ALL_DETECTORS = ANOMALY_DETECTORS + CHANGE_DETECTORS -@parametrize_with_checks(all_detectors) +@parametrize_with_checks(ALL_DETECTORS) def test_sktime_compatible_estimators(obj, test_name): check_estimator(obj, tests_to_run=test_name, raise_exceptions=True) -@pytest.mark.parametrize("Detector", all_detectors) +@pytest.mark.parametrize("Detector", ALL_DETECTORS) def test_detector_fit(Detector): """Test fit method output.""" detector = Detector.create_test_instance() @@ -30,7 +29,7 @@ def test_detector_fit(Detector): assert isinstance(fit_detector, Detector) -@pytest.mark.parametrize("Detector", all_detectors) +@pytest.mark.parametrize("Detector", ALL_DETECTORS) def test_detector_predict(Detector): """Test predict method output.""" detector = Detector.create_test_instance() @@ -39,7 +38,7 @@ def test_detector_predict(Detector): assert isinstance(y, (pd.Series, pd.DataFrame)) -@pytest.mark.parametrize("Detector", all_detectors) +@pytest.mark.parametrize("Detector", ALL_DETECTORS) def test_detector_transform(Detector): """Test transform method output.""" detector = Detector.create_test_instance() @@ -49,7 +48,7 @@ def test_detector_transform(Detector): assert len(x) == len(y) -@pytest.mark.parametrize("Detector", all_detectors) +@pytest.mark.parametrize("Detector", ALL_DETECTORS) def test_detector_score_transform(Detector): """Test score_transform method output.""" detector = Detector.create_test_instance() From 87ea550aac4b1c8bb31f5f53b14dfcc94b5564ae Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 22:10:04 +0200 Subject: [PATCH 37/75] Add typing in _format_sparse_output --- skchange/anomaly_detectors/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/skchange/anomaly_detectors/base.py b/skchange/anomaly_detectors/base.py index ad20311b..671f4650 100644 --- a/skchange/anomaly_detectors/base.py +++ b/skchange/anomaly_detectors/base.py @@ -163,7 +163,9 @@ def dense_to_sparse(y_dense: pd.Series) -> pd.Series: return CollectiveAnomalyDetector._format_sparse_output(anomaly_intervals) @staticmethod - def _format_sparse_output(anomaly_intervals, closed="both") -> pd.Series: + def _format_sparse_output( + anomaly_intervals: list[tuple[int, int]], closed: str = "both" + ) -> pd.Series: """Format the sparse output of collective anomaly detectors. Can be reused by subclasses to format the output of the _predict method. 
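The typed signature pins down the sparse output format shared by the collective anomaly detectors. Building it by hand, with two made-up intervals, looks like this:

```python
import pandas as pd

# Hypothetical collective anomalies, as (start, end) pairs with closed ends.
anomaly_intervals = [(30, 39), (70, 75)]
anomalies = pd.Series(
    pd.IntervalIndex.from_tuples(anomaly_intervals, closed="both"),
    name="collective_anomalies",
)
starts, ends = anomalies.array.left, anomalies.array.right  # as in the tests
```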
From e25f42d992ac8414c9d7a290e05537ea8bdb20b6 Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 22:10:21 +0200 Subject: [PATCH 38/75] Improve _predict docstring --- skchange/base.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/skchange/base.py b/skchange/base.py index 4bcd5c25..41ece9bd 100644 --- a/skchange/base.py +++ b/skchange/base.py @@ -206,19 +206,20 @@ def predict(self, X): return y def _predict(self, X): - """Create annotations on test/deployment data. + """Detect events in test/deployment data. core logic Parameters ---------- X : pd.DataFrame - Data to annotate, time series. + Data to detect events in (time series). Returns ------- - y : pd.Series - Annotations for sequence X exact format depends on annotation type. + y : pd.Series or pd.DataFrame + Each element or row corresponds to a detected event. Exact format depends on + the specific detector type. """ raise NotImplementedError("abstract method") From ca28634fc86b9f3493ca93f4c6d4b706ed8a5384 Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 22:19:29 +0200 Subject: [PATCH 39/75] Conform Capa to BaseDetector --- skchange/anomaly_detectors/__init__.py | 2 +- skchange/anomaly_detectors/capa.py | 81 +++++++++++-------- skchange/anomaly_detectors/tests/test_capa.py | 15 ++-- 3 files changed, 55 insertions(+), 43 deletions(-) diff --git a/skchange/anomaly_detectors/__init__.py b/skchange/anomaly_detectors/__init__.py index 56c15793..10b0d154 100644 --- a/skchange/anomaly_detectors/__init__.py +++ b/skchange/anomaly_detectors/__init__.py @@ -9,7 +9,7 @@ BASE_ANOMALY_DETECTORS = [CollectiveAnomalyDetector, PointAnomalyDetector] COLLECTIVE_ANOMALY_DETECTORS = [ - # Capa, + Capa, # CircularBinarySegmentation, # MoscoreAnomaly, # Mvcapa, diff --git a/skchange/anomaly_detectors/capa.py b/skchange/anomaly_detectors/capa.py index 13e67db1..9bf8bce7 100644 --- a/skchange/anomaly_detectors/capa.py +++ b/skchange/anomaly_detectors/capa.py @@ -8,10 +8,10 @@ import numpy as np import pandas as pd from numba import njit -from sktime.annotation.base import BaseSeriesAnnotator +from skchange.anomaly_detectors.base import CollectiveAnomalyDetector from skchange.anomaly_detectors.mvcapa import dense_capa_penalty, run_base_capa -from skchange.anomaly_detectors.utils import format_anomaly_output +from skchange.anomaly_detectors.utils import merge_anomalies from skchange.costs.saving_factory import saving_factory from skchange.utils.validation.data import check_data from skchange.utils.validation.parameters import check_larger_than @@ -43,7 +43,7 @@ def run_capa( ) -class Capa(BaseSeriesAnnotator): +class Capa(CollectiveAnomalyDetector): """Collective and point anomaly detection. An efficient implementation of the CAPA algorithm [1]_ for anomaly detection. @@ -67,19 +67,8 @@ class Capa(BaseSeriesAnnotator): Maximum length of a segment. ignore_point_anomalies : bool, optional (default=False) If True, detected point anomalies are not returned by .predict(). I.e., only - collective anomalies are returned. - fmt : str {"dense", "sparse"}, optional (default="sparse") - Annotation output format: - * If "sparse", a sub-series of labels for only the outliers in X is returned, - * If "dense", a series of labels for all values in X is returned. 
- labels : str {"indicator", "score", "int_label"}, optional (default="int_label") - Annotation output labels: - * If "indicator", returned values are boolean, indicating whether a value is - an outlier, - * If "score", returned values are floats, giving the outlier score. - * If "int_label", returned values are integer, indicating which segment a - value belongs to. - + collective anomalies are returned. If False, point anomalies are included in the + output as collective anomalies of length 1. References ---------- @@ -113,8 +102,6 @@ def __init__( min_segment_length: int = 2, max_segment_length: int = 1000, ignore_point_anomalies: bool = False, - fmt: str = "sparse", - labels: str = "int_label", ): self.saving = saving self.collective_penalty_scale = collective_penalty_scale @@ -122,7 +109,7 @@ def __init__( self.min_segment_length = min_segment_length self.max_segment_length = max_segment_length self.ignore_point_anomalies = ignore_point_anomalies - super().__init__(fmt=fmt, labels=labels) + super().__init__() self.saving_func, self.saving_init_func = saving_factory(self.saving) @@ -149,7 +136,7 @@ def _get_penalty_components(self, X: pd.DataFrame) -> tuple[np.ndarray, float]: point_penalty = self.point_penalty_scale * n_params * p * np.log(n) return collective_penalty, point_penalty - def _fit(self, X: pd.DataFrame, Y: Optional[pd.DataFrame] = None): + def _fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None): """Fit to training data. Sets the penalty of the detector. @@ -164,7 +151,7 @@ def _fit(self, X: pd.DataFrame, Y: Optional[pd.DataFrame] = None): ---------- X : pd.DataFrame training data to fit the threshold to. - Y : pd.Series, optional + y : pd.Series, optional Does nothing. Only here to make the fit method compatible with sktime and scikit-learn. @@ -185,23 +172,30 @@ def _fit(self, X: pd.DataFrame, Y: Optional[pd.DataFrame] = None): return self def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: - """Create annotations on test/deployment data. + """Detect events in test/deployment data. + + core logic Parameters ---------- - X : pd.DataFrame - data to annotate, time series + X : pd.DataFrame + Data to detect events in (time series). Returns ------- - Y : pd.Series - annotations for sequence X - exact format depends on annotation type + pd.Series[pd.Interval] containing the collective anomaly intervals. + + Notes + ----- + The start and end points of the intervals can be accessed by + output.array.left and output.array.right, respectively. 
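A usage sketch of this output format, assuming the Capa import from this patch and mirroring the data-generation arguments used in the tests below:

    from skchange.anomaly_detectors.capa import Capa
    from skchange.datasets.generate import generate_teeth_data

    df = generate_teeth_data(
        n_segments=2, mean=10, segment_length=20, p=5, random_state=8
    )
    anomalies = Capa(saving="mean", collective_penalty_scale=2.0).fit_predict(df)
    starts, ends = anomalies.array.left, anomalies.array.right  # interval endpoints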
""" X = check_data( X, min_length=self.min_segment_length, min_length_name="min_segment_length", ) - opt_savings, self.collective_anomalies, self.point_anomalies = run_capa( + opt_savings, collective_anomalies, point_anomalies = run_capa( X.values, self.saving_func, self.saving_init_func, @@ -211,15 +205,31 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: self.max_segment_length, ) self.scores = pd.Series(opt_savings, index=X.index, name="score") - anomalies = format_anomaly_output( - self.fmt, - self.labels, - X.index, - self.collective_anomalies, - self.point_anomalies if not self.ignore_point_anomalies else None, - scores=self.scores, - ) - return anomalies + + if self.ignore_point_anomalies: + anomalies = collective_anomalies + else: + anomalies = merge_anomalies(collective_anomalies, point_anomalies) + return CollectiveAnomalyDetector._format_sparse_output(anomalies) + + def _score_transform(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: + """Compute the pelt scores for the input data. + + Parameters + ---------- + X : pd.DataFrame - data to compute scores for, time series + + Returns + ------- + scores : pd.Series - scores for sequence X + + Notes + ----- + The CAPA scores are the cumulative optimal savings, so the scores are increasing + and are not per observation scores. + """ + self.predict(X) + return self.scores @classmethod def get_test_params(cls, parameter_set="default"): @@ -242,5 +252,6 @@ def get_test_params(cls, parameter_set="default"): """ params = [ {"saving": "mean", "min_segment_length": 5, "max_segment_length": 100}, + {"saving": "mean", "min_segment_length": 2, "max_segment_length": 20}, ] return params diff --git a/skchange/anomaly_detectors/tests/test_capa.py b/skchange/anomaly_detectors/tests/test_capa.py index 7c8b7305..ea73779a 100644 --- a/skchange/anomaly_detectors/tests/test_capa.py +++ b/skchange/anomaly_detectors/tests/test_capa.py @@ -1,9 +1,9 @@ """Tests for CAPA and all available savings.""" +import pandas as pd import pytest from skchange.anomaly_detectors.capa import Capa -from skchange.anomaly_detectors.mvcapa import Mvcapa from skchange.costs.saving_factory import VALID_SAVINGS from skchange.datasets.generate import generate_teeth_data @@ -16,14 +16,15 @@ def test_capa_anomalies(saving): df = generate_teeth_data( n_segments=n_segments, mean=10, segment_length=seg_len, p=5, random_state=8 ) - for detector_class in [Capa, Mvcapa]: - detector = detector_class( - saving=saving, fmt="sparse", collective_penalty_scale=2.0 - ) + capa_classes = [Capa] + for detector_class in capa_classes: + detector = detector_class(saving=saving, collective_penalty_scale=2.0) anomalies = detector.fit_predict(df) + if isinstance(anomalies, pd.DataFrame): + anomalies = anomalies["location"] # End point also included as a changepoint assert ( len(anomalies) == 1 - and anomalies.loc[0, "start"] == seg_len - and anomalies.loc[0, "end"] == 2 * seg_len - 1 + and anomalies.array.left[0] == seg_len + and anomalies.array.right[0] == 2 * seg_len - 1 ) From 3bdb4b65e86a62fade33bcba9210fe794a50b1c3 Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 22:35:10 +0200 Subject: [PATCH 40/75] Conform CircularBinarySegmentation to BaseDetector --- skchange/anomaly_detectors/__init__.py | 3 +- skchange/anomaly_detectors/circular_binseg.py | 47 ++++++++----------- 2 files changed, 21 insertions(+), 29 deletions(-) diff --git a/skchange/anomaly_detectors/__init__.py b/skchange/anomaly_detectors/__init__.py index 10b0d154..394c2865 100644 --- 
a/skchange/anomaly_detectors/__init__.py +++ b/skchange/anomaly_detectors/__init__.py @@ -6,11 +6,12 @@ PointAnomalyDetector, ) from skchange.anomaly_detectors.capa import Capa +from skchange.anomaly_detectors.circular_binseg import CircularBinarySegmentation BASE_ANOMALY_DETECTORS = [CollectiveAnomalyDetector, PointAnomalyDetector] COLLECTIVE_ANOMALY_DETECTORS = [ Capa, - # CircularBinarySegmentation, + CircularBinarySegmentation, # MoscoreAnomaly, # Mvcapa, StatThresholdAnomaliser, diff --git a/skchange/anomaly_detectors/circular_binseg.py b/skchange/anomaly_detectors/circular_binseg.py index 3543df9b..bfdf7d36 100644 --- a/skchange/anomaly_detectors/circular_binseg.py +++ b/skchange/anomaly_detectors/circular_binseg.py @@ -8,9 +8,8 @@ import numpy as np import pandas as pd from numba import njit -from sktime.annotation.base import BaseSeriesAnnotator -from skchange.anomaly_detectors.utils import format_anomaly_output +from skchange.anomaly_detectors.base import CollectiveAnomalyDetector from skchange.change_detectors.seeded_binseg import make_seeded_intervals from skchange.scores.score_factory import anomaly_score_factory from skchange.utils.validation.data import check_data @@ -97,7 +96,7 @@ def run_circular_binseg( return anomalies, anomaly_scores, maximizers, starts, ends -class CircularBinarySegmentation(BaseSeriesAnnotator): +class CircularBinarySegmentation(CollectiveAnomalyDetector): """Circular binary segmentation algorithm for multiple collective anomaly detection. Binary segmentation type changepoint detection algorithms recursively split the data @@ -137,17 +136,6 @@ class CircularBinarySegmentation(BaseSeriesAnnotator): starting at 'interval_len'='min_interval_length'. It also governs the amount of overlap between intervals of the same length, as the start of each interval is shifted by a factor of '1 + 1 / growth_factor'. Must be a float in (1, 2]. - fmt : str {"dense", "sparse"}, optional (default="sparse") - Annotation output format: - * If "sparse", a sub-series of labels for only the outliers in X is returned, - * If "dense", a series of labels for all values in X is returned. - labels : str {"indicator", "score", "int_label"}, optional (default="int_label") - Annotation output labels: - * If "indicator", returned values are boolean, indicating whether a value is an - outlier, - * If "score", returned values are floats, giving the outlier score. - * If "int_label", returned values are integer, indicating which segment a value - belongs to. References ---------- @@ -184,8 +172,6 @@ def __init__( min_segment_length: int = 5, max_interval_length: int = 100, growth_factor: float = 1.5, - fmt: str = "sparse", - labels: str = "int_label", ): self.score = score self.threshold_scale = threshold_scale # Just holds the input value. @@ -193,7 +179,7 @@ def __init__( self.min_segment_length = min_segment_length self.max_interval_length = max_interval_length self.growth_factor = growth_factor - super().__init__(fmt=fmt, labels=labels) + super().__init__() self.score_f, self.score_init_f = anomaly_score_factory(self.score) check_larger_than(0.0, self.threshold_scale, "threshold_scale", allow_none=True) @@ -262,7 +248,7 @@ def _get_threshold(self, X: pd.DataFrame) -> float: p = X.shape[1] return self.threshold_scale * self.get_default_threshold(n, p) - def _fit(self, X: pd.DataFrame, Y: Optional[pd.DataFrame] = None): + def _fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None): """Fit to training data. Sets the threshold of the detector. 
@@ -277,7 +263,7 @@ def _fit(self, X: pd.DataFrame, Y: Optional[pd.DataFrame] = None): ---------- X : pd.DataFrame training data to fit the threshold to. - Y : pd.Series, optional + y : pd.Series, optional Does nothing. Only here to make the fit method compatible with sktime and scikit-learn. @@ -294,16 +280,23 @@ def _fit(self, X: pd.DataFrame, Y: Optional[pd.DataFrame] = None): return self def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: - """Create annotations on test/deployment data. + """Detect events in test/deployment data. + + core logic Parameters ---------- - X : pd.DataFrame - data to annotate, time series + X : pd.DataFrame + Data to detect events in (time series). Returns ------- - Y : pd.Series - annotations for sequence X - exact format depends on annotation type + pd.Series[pd.Interval] containing the collective anomaly intervals. + + Notes + ----- + The start and end points of the intervals can be accessed by + output.array.left and output.array.right, respectively. """ X = check_data( X, @@ -319,7 +312,6 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: self.max_interval_length, self.growth_factor, ) - self.anomalies = anomalies self.scores = pd.DataFrame( { "interval_start": starts, @@ -329,9 +321,7 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: "score": scores, } ) - return format_anomaly_output( - self.fmt, self.labels, X.index, self.anomalies, scores=self.scores - ) + return CollectiveAnomalyDetector._format_sparse_output(anomalies) @classmethod def get_test_params(cls, parameter_set="default"): @@ -353,6 +343,7 @@ def get_test_params(cls, parameter_set="default"): `create_test_instance` uses the first (or only) dictionary in `params` """ params = [ - {"score": "mean", "min_segment_length": 5, "max_interval_length": 100}, + {"score": "mean", "min_segment_length": 5, "max_interval_length": 50}, + {"score": "mean", "min_segment_length": 2, "max_interval_length": 20}, ] return params From 79758447151c519226a3f8b0c8ec0f4d4ee4fb6b Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 22:54:08 +0200 Subject: [PATCH 41/75] Update anomaly test data Need shorter anomalies to work for MoscoreAnomaly test instances --- skchange/anomaly_detectors/tests/test_anomaly_detectors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skchange/anomaly_detectors/tests/test_anomaly_detectors.py b/skchange/anomaly_detectors/tests/test_anomaly_detectors.py index 891449a8..1830ac28 100644 --- a/skchange/anomaly_detectors/tests/test_anomaly_detectors.py +++ b/skchange/anomaly_detectors/tests/test_anomaly_detectors.py @@ -7,9 +7,9 @@ from skchange.anomaly_detectors.base import CollectiveAnomalyDetector from skchange.datasets.generate import generate_anomalous_data -true_anomalies = [(30, 39), (70, 75)] +true_anomalies = [(30, 34), (70, 75)] anomaly_data = generate_anomalous_data( - 100, anomalies=true_anomalies, means=[10.0, 15.0], random_state=39 + 100, anomalies=true_anomalies, means=[10.0, 15.0], random_state=2 ) From bab0e24cb0bbe6aae0f4149ffafdd45428a69230 Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 23:10:57 +0200 Subject: [PATCH 42/75] Conform MoscoreAnomaly to BaseDetector --- skchange/anomaly_detectors/__init__.py | 3 +- skchange/anomaly_detectors/moscore_anomaly.py | 57 +++++++++---------- .../tests/test_moscore_anomaly.py | 24 ++++---- 3 files changed, 40 insertions(+), 44 deletions(-) diff --git a/skchange/anomaly_detectors/__init__.py b/skchange/anomaly_detectors/__init__.py 
index 394c2865..11590c03 100644 --- a/skchange/anomaly_detectors/__init__.py +++ b/skchange/anomaly_detectors/__init__.py @@ -7,12 +7,13 @@ ) from skchange.anomaly_detectors.capa import Capa from skchange.anomaly_detectors.circular_binseg import CircularBinarySegmentation +from skchange.anomaly_detectors.moscore_anomaly import MoscoreAnomaly BASE_ANOMALY_DETECTORS = [CollectiveAnomalyDetector, PointAnomalyDetector] COLLECTIVE_ANOMALY_DETECTORS = [ Capa, CircularBinarySegmentation, - # MoscoreAnomaly, + MoscoreAnomaly, # Mvcapa, StatThresholdAnomaliser, ] diff --git a/skchange/anomaly_detectors/moscore_anomaly.py b/skchange/anomaly_detectors/moscore_anomaly.py index b9079170..f51282ed 100644 --- a/skchange/anomaly_detectors/moscore_anomaly.py +++ b/skchange/anomaly_detectors/moscore_anomaly.py @@ -7,10 +7,9 @@ import numpy as np import pandas as pd -from sktime.annotation.base import BaseSeriesAnnotator +from skchange.anomaly_detectors.base import CollectiveAnomalyDetector from skchange.anomaly_detectors.circular_binseg import greedy_anomaly_selection -from skchange.anomaly_detectors.utils import format_anomaly_output from skchange.scores.score_factory import anomaly_score_factory from skchange.utils.validation.data import check_data from skchange.utils.validation.parameters import check_larger_than, check_smaller_than @@ -53,7 +52,7 @@ def run_moscore_anomaly( return anomalies, scores, starts, ends -class MoscoreAnomaly(BaseSeriesAnnotator): +class MoscoreAnomaly(CollectiveAnomalyDetector): """Moving score algorithm for multiple collective anomaly detection. A custom version of the MOSUM (moving sum) algorithm [1]_ for collective anomaly @@ -62,7 +61,7 @@ class MoscoreAnomaly(BaseSeriesAnnotator): `left_bandwidth` values to the left and `right_bandwidth` samples to the right of the anomaly window. - Experimental for now. + Experimental. Efficiently implemented using numba. @@ -102,17 +101,6 @@ class MoscoreAnomaly(BaseSeriesAnnotator): `min_anomaly_length` and `max_anomaly_length` are considered. If it is not important to consider all candidates, just a sparse subset for example, customising the anomaly lengths can significantly speed up the algorithm. - fmt : str {"dense", "sparse"}, optional (default="sparse") Annotation output format: * If "sparse", a sub-series of labels for only the outliers in X is returned, * If "dense", a series of labels for all values in X is returned. - labels : str {"indicator", "score", "int_label"}, optional (default="int_label") Annotation output labels: * If "indicator", returned values are boolean, indicating whether a value is an outlier, * If "score", returned values are floats, giving the outlier score. * If "int_label", returned values are integer, indicating which segment a value belongs to.
References ---------- @@ -146,8 +134,6 @@ def __init__( threshold_scale: Optional[float] = 2.0, level: float = 0.01, anomaly_lengths: np.ndarray = None, - fmt: str = "sparse", - labels: str = "int_label", ): self.score = score self.min_anomaly_length = min_anomaly_length @@ -157,7 +143,7 @@ def __init__( self.threshold_scale = threshold_scale self.level = level self.anomaly_lengths = anomaly_lengths - super().__init__(fmt=fmt, labels=labels) + super().__init__() self.score_f, self.score_init_f = anomaly_score_factory(score) self._right_bandwidth = right_bandwidth if right_bandwidth else left_bandwidth @@ -247,7 +233,7 @@ def _get_threshold(self, X: pd.DataFrame) -> float: p = X.shape[1] return self.threshold_scale * self.get_default_threshold(n, p) - def _fit(self, X: pd.DataFrame, Y: Optional[pd.DataFrame] = None): + def _fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None): """Fit to training data. Sets the threshold of the detector. @@ -262,7 +248,7 @@ def _fit(self, X: pd.DataFrame, Y: Optional[pd.DataFrame] = None): ---------- X : pd.DataFrame training data to fit the threshold to. - Y : pd.Series, optional + y : pd.Series, optional Does nothing. Only here to make the fit method compatible with sktime and scikit-learn. @@ -282,16 +268,21 @@ def _fit(self, X: pd.DataFrame, Y: Optional[pd.DataFrame] = None): return self def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: - """Create annotations on test/deployment data. + """Detect events in test/deployment data. Parameters ---------- - X : pd.DataFrame - data to annotate, time series + X : pd.DataFrame + Data to detect events in (time series). Returns ------- - Y : pd.Series - annotations for sequence X - exact format depends on annotation type + pd.Series[pd.Interval] containing the collective anomaly intervals. + + Notes + ----- + The start and end points of the intervals can be accessed by + output.array.left and output.array.right, respectively. 
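To illustrate the custom `anomaly_lengths` option described in the docstring above (a sketch; the grid of lengths is arbitrary):

    import numpy as np
    from skchange.anomaly_detectors.moscore_anomaly import MoscoreAnomaly

    # Searching a sparse grid of candidate anomaly lengths instead of every
    # length between min_anomaly_length and max_anomaly_length can speed up
    # the algorithm considerably.
    detector = MoscoreAnomaly(
        score="mean",
        anomaly_lengths=np.array([4, 8, 16, 32]),
        left_bandwidth=20,
    )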
""" min_length = ( self.left_bandwidth + self._right_bandwidth + self._min_anomaly_length @@ -301,7 +292,7 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: min_length=min_length, min_length_name="left_bandwidth + _right_bandwidth + _min_anomaly_length", ) - self.anomalies, scores, starts, ends = run_moscore_anomaly( + anomalies, scores, starts, ends = run_moscore_anomaly( X.values, self.score_f, self.score_init_f, @@ -313,9 +304,7 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: self.scores = pd.DataFrame( {"anomaly_start": starts, "anomaly_end": ends, "score": scores} ) - return format_anomaly_output( - self.fmt, self.labels, X.index, self.anomalies, scores=self.scores - ) + return CollectiveAnomalyDetector._format_sparse_output(anomalies) @classmethod def get_test_params(cls, parameter_set="default"): @@ -339,9 +328,15 @@ def get_test_params(cls, parameter_set="default"): params = [ { "score": "mean", - "min_anomaly_length": 5, - "max_anomaly_length": 100, - "left_bandwidth": 50, + "min_anomaly_length": 2, + "max_anomaly_length": 8, + "left_bandwidth": 4, + }, + { + "score": "mean", + "min_anomaly_length": 2, + "max_anomaly_length": 6, + "left_bandwidth": 3, }, ] return params diff --git a/skchange/anomaly_detectors/tests/test_moscore_anomaly.py b/skchange/anomaly_detectors/tests/test_moscore_anomaly.py index 8d78f147..d9173d8a 100644 --- a/skchange/anomaly_detectors/tests/test_moscore_anomaly.py +++ b/skchange/anomaly_detectors/tests/test_moscore_anomaly.py @@ -7,38 +7,38 @@ from skchange.datasets.generate import generate_anomalous_data from skchange.scores.score_factory import VALID_ANOMALY_SCORES -true_anomalies = [(50, 59), (120, 129)] +true_anomalies = [(30, 34), (70, 75)] anomaly_data = generate_anomalous_data( - 200, anomalies=true_anomalies, means=[10.0, 5.0], random_state=5 + 100, anomalies=true_anomalies, means=[10.0, 15.0], random_state=103 ) @pytest.mark.parametrize("score", VALID_ANOMALY_SCORES) def test_moscore_anomalies(score): """Test Moscore anomalies.""" - detector = MoscoreAnomaly.create_test_instance() - detector.set_params(score=score, fmt="sparse", labels="int_label") + detector = MoscoreAnomaly( + score, min_anomaly_length=4, max_anomaly_length=10, left_bandwidth=20 + ) + detector.set_params(score=score) anomalies = detector.fit_predict(anomaly_data) assert len(anomalies) == len(true_anomalies) for i, (start, end) in enumerate(true_anomalies): - assert anomalies.loc[i, "start"] == start and anomalies.loc[i, "end"] == end + assert anomalies.array.left[i] == start and anomalies.array.right[i] == end @pytest.mark.parametrize("score", VALID_ANOMALY_SCORES) def test_moscore_scores(score): - """Test Moscore scores.""" + """Test MoscoreAnomaly scores.""" detector = MoscoreAnomaly.create_test_instance() - detector.set_params(score=score, fmt="sparse", labels="int_label") - scores = detector.fit_predict(anomaly_data) - assert np.all(scores >= 0.0) + detector.set_params(score=score) + detector.fit_predict(anomaly_data) + assert np.all(detector.scores >= 0.0) @pytest.mark.parametrize("score", VALID_ANOMALY_SCORES) def test_moscore_tuning(score): """Test Moscore tuning.""" detector = MoscoreAnomaly.create_test_instance() - detector.set_params( - score=score, threshold_scale=None, fmt="dense", labels="indicator" - ) + detector.set_params(score=score, threshold_scale=None) detector.fit(anomaly_data) assert detector.threshold_ > 0.0 From 04ae18d8e0b4bf16099910839800290a5e84ee7b Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 
2024 23:13:15 +0200 Subject: [PATCH 43/75] Fix bug --- .../change_detectors/tests/test_change_detectors.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/skchange/change_detectors/tests/test_change_detectors.py b/skchange/change_detectors/tests/test_change_detectors.py index f51d8d90..e0e9cc19 100644 --- a/skchange/change_detectors/tests/test_change_detectors.py +++ b/skchange/change_detectors/tests/test_change_detectors.py @@ -2,7 +2,7 @@ import pytest -from skchange.change_detectors import CHANGEPOINT_DETECTORS +from skchange.change_detectors import CHANGE_DETECTORS from skchange.datasets.generate import generate_teeth_data n_segments = 2 @@ -12,7 +12,7 @@ ) -@pytest.mark.parametrize("Estimator", CHANGEPOINT_DETECTORS) +@pytest.mark.parametrize("Estimator", CHANGE_DETECTORS) def test_change_detector_predict(Estimator): """Test changepoint detector predict (sparse output).""" detector = Estimator.create_test_instance() @@ -20,7 +20,7 @@ def test_change_detector_predict(Estimator): assert len(changepoints) == n_segments - 1 and changepoints[0] == seg_len - 1 -@pytest.mark.parametrize("Estimator", CHANGEPOINT_DETECTORS) +@pytest.mark.parametrize("Estimator", CHANGE_DETECTORS) def test_change_detector_transform(Estimator): """Test changepoint detector transform (dense output).""" detector = Estimator.create_test_instance() @@ -29,7 +29,7 @@ def test_change_detector_transform(Estimator): assert labels[seg_len - 1] == 0.0 and labels[seg_len] == 1.0 -@pytest.mark.parametrize("Estimator", CHANGEPOINT_DETECTORS) +@pytest.mark.parametrize("Estimator", CHANGE_DETECTORS) def test_change_detector_sparse_to_dense(Estimator): """Test that predict + sparse_to_dense == transform.""" detector = Estimator.create_test_instance() @@ -39,7 +39,7 @@ def test_change_detector_sparse_to_dense(Estimator): assert labels.equals(labels_transform) -@pytest.mark.parametrize("Estimator", CHANGEPOINT_DETECTORS) +@pytest.mark.parametrize("Estimator", CHANGE_DETECTORS) def test_change_detector_dense_to_sparse(Estimator): """Test that transform + dense_to_sparse == predict.""" detector = Estimator.create_test_instance() From f05cdcfd335d9c3d19e3f0d735caf92205d35ea8 Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 23:38:42 +0200 Subject: [PATCH 44/75] Remove unnecessary check --- skchange/base.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/skchange/base.py b/skchange/base.py index 41ece9bd..52b4d90d 100644 --- a/skchange/base.py +++ b/skchange/base.py @@ -396,8 +396,6 @@ def update_predict(self, X): ----- Updates fitted model that updates attributes ending in "_". 
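A hedged sketch of the intended streaming use of update_predict (the detector choice, data and split point are arbitrary; the default _update falls back to re-fitting on all remembered data before predicting):

    from skchange.anomaly_detectors.capa import Capa
    from skchange.datasets.generate import generate_anomalous_data

    x = generate_anomalous_data(
        100, anomalies=[(70, 75)], means=[10.0], random_state=1
    )
    detector = Capa(saving="mean").fit(x.iloc[:50])
    anomalies = detector.update_predict(x.iloc[50:])  # update, then predict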
""" - X = check_series(X, allow_index_names=True) - self.update(X=X) y = self.predict(X=X) From da1a81098887087f0415e1cf71daed0e9020c2e0 Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 23:46:51 +0200 Subject: [PATCH 45/75] Add test for update method --- skchange/tests/test_all_detectors.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/skchange/tests/test_all_detectors.py b/skchange/tests/test_all_detectors.py index 4f17e01c..689dc3e3 100644 --- a/skchange/tests/test_all_detectors.py +++ b/skchange/tests/test_all_detectors.py @@ -58,3 +58,17 @@ def test_detector_score_transform(Detector): assert isinstance(y, (pd.Series, pd.DataFrame)) except NotImplementedError: pass + + +@pytest.mark.parametrize("Detector", ALL_DETECTORS) +def test_detector_update(Detector): + """Test update method output.""" + detector = Detector.create_test_instance() + x = make_annotation_problem(n_timepoints=15, estimator_type="None") + x_train = x[:10].to_frame() + x_next = x[10:].to_frame() + detector.fit(x_train) + updated_detector = detector.update(x_next) + assert issubclass(detector.__class__, BaseDetector) + assert issubclass(updated_detector.__class__, BaseDetector) + assert isinstance(updated_detector, Detector) From d7e3876af7fb8f064015b09bf57180825a677a76 Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 23:47:13 +0200 Subject: [PATCH 46/75] Switch tag "remember_data" to True for .update to work --- skchange/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skchange/base.py b/skchange/base.py index 52b4d90d..4e241825 100644 --- a/skchange/base.py +++ b/skchange/base.py @@ -105,7 +105,7 @@ class BaseDetector(BaseTransformer): # todo: rename to capability:missing_values "capability:missing_values": False, # is transform result always guaranteed to contain no missing values? 
- "remember_data": False, # whether all data seen is remembered as self._X + "remember_data": True, # whether all data seen is remembered as self._X "python_version": None, # PEP 440 python version specifier to limit versions "authors": "mtveten", # author(s) of the object "maintainers": "mtveten", # current maintainer(s) of the object From b67f7c0d994730302b3e063b19ca9333683735c0 Mon Sep 17 00:00:00 2001 From: tveten Date: Thu, 22 Aug 2024 23:49:21 +0200 Subject: [PATCH 47/75] Fix test for update method --- skchange/tests/test_all_detectors.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/skchange/tests/test_all_detectors.py b/skchange/tests/test_all_detectors.py index 689dc3e3..0861108c 100644 --- a/skchange/tests/test_all_detectors.py +++ b/skchange/tests/test_all_detectors.py @@ -68,7 +68,6 @@ def test_detector_update(Detector): x_train = x[:10].to_frame() x_next = x[10:].to_frame() detector.fit(x_train) - updated_detector = detector.update(x_next) + detector.update_predict(x_next) assert issubclass(detector.__class__, BaseDetector) - assert issubclass(updated_detector.__class__, BaseDetector) - assert isinstance(updated_detector, Detector) + assert isinstance(detector, Detector) From 5322a76329e9ba51a64db326b7f9980729070e94 Mon Sep 17 00:00:00 2001 From: tveten Date: Fri, 23 Aug 2024 08:46:00 +0200 Subject: [PATCH 48/75] Fix bug in test --- skchange/tests/test_all_detectors.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/skchange/tests/test_all_detectors.py b/skchange/tests/test_all_detectors.py index 0861108c..7d7c84c7 100644 --- a/skchange/tests/test_all_detectors.py +++ b/skchange/tests/test_all_detectors.py @@ -64,9 +64,9 @@ def test_detector_score_transform(Detector): def test_detector_update(Detector): """Test update method output.""" detector = Detector.create_test_instance() - x = make_annotation_problem(n_timepoints=15, estimator_type="None") - x_train = x[:10].to_frame() - x_next = x[10:].to_frame() + x = make_annotation_problem(n_timepoints=30, estimator_type="None") + x_train = x[:20].to_frame() + x_next = x[20:].to_frame() detector.fit(x_train) detector.update_predict(x_next) assert issubclass(detector.__class__, BaseDetector) From cc0248a284802042418e6ed546bad41d1d78e2f2 Mon Sep 17 00:00:00 2001 From: tveten Date: Fri, 23 Aug 2024 08:46:40 +0200 Subject: [PATCH 49/75] Remove old commented fit method --- skchange/base.py | 37 ------------------------------------- 1 file changed, 37 deletions(-) diff --git a/skchange/base.py b/skchange/base.py index 4e241825..13b71df8 100644 --- a/skchange/base.py +++ b/skchange/base.py @@ -122,43 +122,6 @@ def __init__(self): super().__init__() - # def fit(self, X, y=None): - # """Fit to training data. - - # Parameters - # ---------- - # X : pd.DataFrame - # Training data to fit model to (time series). - # y : pd.Series, optional - # Ground truth annotations for training if annotator is supervised. - - # Returns - # ------- - # self : - # Reference to self. - - # Notes - # ----- - # Creates fitted model that updates attributes ending in "_". Sets - # _is_fitted flag to True. - # """ - # X = check_series(X, allow_index_names=True) - - # if y is not None: - # y = check_series(y, allow_index_names=True) - - # self._X = X - # self._y = y - - # # fkiraly: insert checks/conversions here, after PR #1012 I suggest - - # self._fit(X=X, y=y) - - # # this should happen last - # self._is_fitted = True - - # return self - def _fit(self, X, y=None): """Fit to training data. 
From dd4ffc6754d32cda51e921dba19410aed21afda8 Mon Sep 17 00:00:00 2001 From: tveten Date: Fri, 23 Aug 2024 08:54:05 +0200 Subject: [PATCH 50/75] Add tests for NotImplementedError --- skchange/tests/test_all_detectors.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/skchange/tests/test_all_detectors.py b/skchange/tests/test_all_detectors.py index 7d7c84c7..6eea06c5 100644 --- a/skchange/tests/test_all_detectors.py +++ b/skchange/tests/test_all_detectors.py @@ -71,3 +71,20 @@ def test_detector_update(Detector): detector.update_predict(x_next) assert issubclass(detector.__class__, BaseDetector) assert isinstance(detector, Detector) + + +def test_detector_not_implemented_methods(): + detector = BaseDetector() + x = make_annotation_problem(n_timepoints=20, estimator_type="None") + with pytest.raises(NotImplementedError): + detector.fit(x) + + detector._is_fitted = True # Required for the following functions to run + with pytest.raises(NotImplementedError): + detector.predict(x) + with pytest.raises(NotImplementedError): + detector.transform(x) + with pytest.raises(NotImplementedError): + detector.score_transform(x) + with pytest.raises(NotImplementedError): + detector.dense_to_sparse(x) From d22f54da07b8e3920ef68689ed173a11965a44df Mon Sep 17 00:00:00 2001 From: tveten Date: Fri, 23 Aug 2024 09:01:13 +0200 Subject: [PATCH 51/75] Add sparse_to_dense error test --- skchange/tests/test_all_detectors.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/skchange/tests/test_all_detectors.py b/skchange/tests/test_all_detectors.py index 6eea06c5..54e52da7 100644 --- a/skchange/tests/test_all_detectors.py +++ b/skchange/tests/test_all_detectors.py @@ -88,3 +88,5 @@ def test_detector_not_implemented_methods(): detector.score_transform(x) with pytest.raises(NotImplementedError): detector.dense_to_sparse(x) + with pytest.raises(NotImplementedError): + detector.sparse_to_dense(x) From 2ec9c1a534a0db841a2a51f1628be72ff2ebfedd Mon Sep 17 00:00:00 2001 From: tveten Date: Fri, 23 Aug 2024 09:04:00 +0200 Subject: [PATCH 52/75] Use ignore_point_anomalies in test --- skchange/anomaly_detectors/tests/test_capa.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/skchange/anomaly_detectors/tests/test_capa.py b/skchange/anomaly_detectors/tests/test_capa.py index ea73779a..43ffa99f 100644 --- a/skchange/anomaly_detectors/tests/test_capa.py +++ b/skchange/anomaly_detectors/tests/test_capa.py @@ -18,7 +18,11 @@ def test_capa_anomalies(saving): ) capa_classes = [Capa] for detector_class in capa_classes: - detector = detector_class(saving=saving, collective_penalty_scale=2.0) + detector = detector_class( + saving=saving, + collective_penalty_scale=2.0, + ignore_point_anomalies=True, # To get test coverage. 
+ ) anomalies = detector.fit_predict(df) if isinstance(anomalies, pd.DataFrame): anomalies = anomalies["location"] From c9ebae56f168566f6dfe8b0301186810cde090e6 Mon Sep 17 00:00:00 2001 From: tveten Date: Fri, 23 Aug 2024 11:56:24 +0200 Subject: [PATCH 53/75] Add columns argument to sparse_to_dense To allow for subset detectors --- .../tests/test_anomaly_detectors.py | 12 +++++++----- skchange/base.py | 13 +++++++------ skchange/change_detectors/base.py | 11 ++++++++--- 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/skchange/anomaly_detectors/tests/test_anomaly_detectors.py b/skchange/anomaly_detectors/tests/test_anomaly_detectors.py index 1830ac28..da9dedca 100644 --- a/skchange/anomaly_detectors/tests/test_anomaly_detectors.py +++ b/skchange/anomaly_detectors/tests/test_anomaly_detectors.py @@ -19,7 +19,7 @@ def test_collective_anomaly_detector_predict(Estimator): detector = Estimator.create_test_instance() anomalies = detector.fit_predict(anomaly_data) if isinstance(anomalies, pd.DataFrame): - anomalies = anomalies["location"] + anomalies = anomalies.iloc[:, 0] assert len(anomalies) == len(true_anomalies) for i, (start, end) in enumerate(true_anomalies): @@ -53,9 +53,11 @@ def test_anomaly_detector_sparse_to_dense(Estimator): """Test that predict + sparse_to_dense == transform.""" detector = Estimator.create_test_instance() anomalies = detector.fit_predict(anomaly_data) - labels = detector.sparse_to_dense(anomalies, anomaly_data.index) + labels_predict_convert = detector.sparse_to_dense( + anomalies, anomaly_data.index, anomaly_data.columns + ) labels_transform = detector.fit_transform(anomaly_data) - assert labels.equals(labels_transform) + assert labels_predict_convert.equals(labels_transform) @pytest.mark.parametrize("Estimator", ANOMALY_DETECTORS) @@ -63,6 +65,6 @@ def test_anomaly_detector_dense_to_sparse(Estimator): """Test that transform + dense_to_sparse == predict.""" detector = Estimator.create_test_instance() labels = detector.fit_transform(anomaly_data) - anomalies = detector.dense_to_sparse(labels) + anomalies_transform_convert = detector.dense_to_sparse(labels) anomalies_predict = detector.fit_predict(anomaly_data) - assert anomalies.equals(anomalies_predict) + assert anomalies_transform_convert.equals(anomalies_predict) diff --git a/skchange/base.py b/skchange/base.py index 13b71df8..1b5163aa 100644 --- a/skchange/base.py +++ b/skchange/base.py @@ -16,7 +16,7 @@ class name: BaseDetector They are defined by the content and format of the output of the predict method. Each detector type therefore has the following methods for converting between sparse and dense output formats: - converting sparse output to dense - sparse_to_dense(y_sparse, index) + converting sparse output to dense - sparse_to_dense(y_sparse, index, columns) converting dense output to sparse - dense_to_sparse(y_dense) [optional] Convenience methods: @@ -202,7 +202,7 @@ def _transform(self, X, y=None): detection results in some meaningful way depending on the detector type. """ y = self.predict(X) - y_dense = self.sparse_to_dense(y, X.index) + y_dense = self.sparse_to_dense(y, X.index, X.columns) # sktime does not support transformations that change the state of the object. # Some detectors store detection score information a self.scores during predict. @@ -213,7 +213,7 @@ def _transform(self, X, y=None): return y_dense @staticmethod - def sparse_to_dense(y_sparse, index): + def sparse_to_dense(y_sparse, index, columns=None): """Convert the sparse output from a detector to a dense format. 
Parameters @@ -223,10 +223,12 @@ def sparse_to_dense(y_sparse, index): series depends on the task and capability of the annotator. index : array-like Indices that are to be annotated according to ``y_sparse``. + columns : array-like, optional + Columns that are to be annotated according to ``y_sparse``. Returns ------- - pd.Series + pd.Series or pd.DataFrame of detection labels. """ raise NotImplementedError("abstract method") @@ -404,8 +406,7 @@ def fit_transform(self, X, y=None): self : pd.Series Annotations for sequence X exact format depends on annotation type. """ - y = self.fit_predict(X) - return self.sparse_to_dense(y, index=X.index) + return self.fit(X).transform(X) # Notes on required .predict output formats per detector type (task and capability): diff --git a/skchange/change_detectors/base.py b/skchange/change_detectors/base.py index 928b2a1a..d9bb995d 100644 --- a/skchange/change_detectors/base.py +++ b/skchange/change_detectors/base.py @@ -35,19 +35,24 @@ class ChangeDetector(BaseDetector): """ @staticmethod - def sparse_to_dense(y_sparse: pd.Series, index: pd.Index) -> pd.Series: + def sparse_to_dense( + y_sparse: pd.Series, index: pd.Index, columns: pd.Index = None + ) -> pd.Series: """Convert the sparse output from the predict method to a dense format. Parameters ---------- - y_sparse : pd.Series + y_sparse : pd.DataFrame The sparse output from a changepoint detector's predict method. index : array-like Indices that are to be annotated according to ``y_sparse``. + columns : array-like + Columns that are to be annotated according to ``y_sparse``. Returns ------- - pd.Series + pd.Series with integer labels 0, ..., K for each segment between two + changepoints. """ changepoints = y_sparse.to_list() n = len(index) From 760153bc269d3126cb6f4e7ba05dffdfbe75e599 Mon Sep 17 00:00:00 2001 From: tveten Date: Fri, 23 Aug 2024 11:56:47 +0200 Subject: [PATCH 54/75] Fix CAPA docstring --- skchange/anomaly_detectors/capa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skchange/anomaly_detectors/capa.py b/skchange/anomaly_detectors/capa.py index 9bf8bce7..e168c45f 100644 --- a/skchange/anomaly_detectors/capa.py +++ b/skchange/anomaly_detectors/capa.py @@ -213,7 +213,7 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: return CollectiveAnomalyDetector._format_sparse_output(anomalies) def _score_transform(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: - """Compute the pelt scores for the input data. + """Compute the CAPA scores for the input data. Parameters ---------- From e202a4200ebe14d9f2813aafd1cfba496f4b02b9 Mon Sep 17 00:00:00 2001 From: tveten Date: Fri, 23 Aug 2024 11:57:38 +0200 Subject: [PATCH 55/75] Add SubsetCollectiveAnomalyDetector base class --- skchange/anomaly_detectors/base.py | 120 ++++++++++++++++++++++++++--- 1 file changed, 110 insertions(+), 10 deletions(-) diff --git a/skchange/anomaly_detectors/base.py b/skchange/anomaly_detectors/base.py index 671f4650..ca7d83bd 100644 --- a/skchange/anomaly_detectors/base.py +++ b/skchange/anomaly_detectors/base.py @@ -32,7 +32,9 @@ class PointAnomalyDetector(BaseDetector): """ @staticmethod - def sparse_to_dense(y_sparse: pd.Series, index: pd.Index) -> pd.Series: + def sparse_to_dense( + y_sparse: pd.Series, index: pd.Index, columns: pd.Index = None + ) -> pd.Series: """Convert the sparse output from the predict method to a dense format. 
Parameters @@ -64,7 +66,9 @@ def dense_to_sparse(y_dense: pd.Series) -> pd.Series: ------- pd.Series of the integer locations of the anomalous data points. """ + # The sparse format only uses integer positions, so we reset the index. y_dense = y_dense.reset_index(drop=True) + anomalies = y_dense.iloc[y_dense.values > 0].index return PointAnomalyDetector._format_sparse_output(anomalies) @@ -104,7 +108,9 @@ class CollectiveAnomalyDetector(BaseDetector): """ @staticmethod - def sparse_to_dense(y_sparse: pd.Series, index: pd.Index) -> pd.Series: + def sparse_to_dense( + y_sparse: pd.Series, index: pd.Index, columns: pd.Index = None + ) -> pd.Series: """Convert the sparse output from the predict method to a dense format. Parameters @@ -147,7 +153,9 @@ def dense_to_sparse(y_dense: pd.Series) -> pd.Series: The start and end points of the intervals can be accessed by output.array.left and output.array.right, respectively. """ + # The sparse format only uses integer positions, so we reset the index. y_dense = y_dense.reset_index(drop=True) + y_anomaly = y_dense.loc[y_dense.values > 0] anomaly_locations_diff = y_anomaly.index.diff() @@ -160,7 +168,9 @@ def dense_to_sparse(y_dense: pd.Series) -> pd.Series: anomaly_ends = np.insert(anomaly_ends, len(anomaly_ends), last_anomaly_end) anomaly_intervals = list(zip(anomaly_starts, anomaly_ends)) - return CollectiveAnomalyDetector._format_sparse_output(anomaly_intervals) + return CollectiveAnomalyDetector._format_sparse_output( + anomaly_intervals, closed="both" + ) @staticmethod def _format_sparse_output( @@ -172,7 +182,7 @@ def _format_sparse_output( """ return pd.Series( pd.IntervalIndex.from_tuples(anomaly_intervals, closed=closed), - name="collective_anomaly", + name="anomaly_interval", ) @@ -183,11 +193,10 @@ class SubsetCollectiveAnomalyDetector(BaseDetector): that are considered anomalous, and also provide information on which components of the data are affected. + Output format of the predict method: See the dense_to_sparse method. + Output format of the transform method: See the sparse_to_dense method. + Output format of the predict method: - pd.DataFrame({ - "location": pd.IntervalIndex(anomaly_intervals, closed=), - "columns": affected_components_list, - }) Subclasses should set the following tags for sktime compatibility: - task: "collective_anomaly_detection" @@ -208,17 +217,108 @@ class SubsetCollectiveAnomalyDetector(BaseDetector): """ @staticmethod - def sparse_to_dense(y_sparse, index): + def sparse_to_dense( + y_sparse: pd.Series, index: pd.Index, columns: pd.Index + ) -> pd.Series: """Convert the sparse output from the predict method to a dense format. Parameters ---------- y_sparse : pd.DataFrame - The sparse output from the predict method. + The sparse output from the predict method. The first column must contain the + anomaly intervals, the second column must contain a list of the affected + columns. index : array-like Indices that are to be annotated according to ``y_sparse``. + columns : array-like + Columns that are to be annotated according to ``y_sparse``. Returns ------- pd.DataFrame """ + anomaly_intervals = y_sparse.iloc[:, 0].array + anomaly_starts = anomaly_intervals.left + anomaly_ends = anomaly_intervals.right + anomaly_columns = y_sparse.iloc[:, 1] + + start_is_open = anomaly_intervals.closed in ["neither", "right"] + if start_is_open: + anomaly_starts += 1 # Exclude the start index in the for loop below. 
+ end_is_closed = anomaly_intervals.closed in ["both", "right"] + if end_is_closed: + anomaly_ends += 1 # Include the end index in the for loop below. + + labels = np.zeros((len(index), len(columns)), dtype="int64") + anomalies = zip(anomaly_starts, anomaly_ends, anomaly_columns) + for i, (start, end, anomaly_cols) in enumerate(anomalies): # avoid shadowing `columns` + labels[start:end, anomaly_cols] = i + 1 + + return pd.DataFrame(labels, index=index, columns=columns) + + @staticmethod + def dense_to_sparse(y_dense): + """Convert the dense output from the transform method to a sparse format. + + Parameters + ---------- + y_dense : pd.DataFrame + The dense output from the transform method. + + Returns + ------- + pd.DataFrame + """ + # The sparse format only uses integer positions, so we reset index and columns. + y_dense = y_dense.reset_index(drop=True) + y_dense.columns = range(y_dense.columns.size) + + anomaly_intervals = [] + unique_labels = np.unique(y_dense.values) + for i in unique_labels[unique_labels > 0]: + anomaly_mask = y_dense == i + which_columns = anomaly_mask.any(axis=0) + which_rows = anomaly_mask.any(axis=1) + anomaly_columns = anomaly_mask.columns[which_columns].to_list() + anomaly_start = anomaly_mask.index[which_rows][0] + anomaly_end = anomaly_mask.index[which_rows][-1] + anomaly_intervals.append((anomaly_start, anomaly_end, anomaly_columns)) + + return SubsetCollectiveAnomalyDetector._format_sparse_output( + anomaly_intervals, closed="both" + ) + + @staticmethod + def _format_sparse_output( + collective_anomalies: list[tuple[int, int, np.ndarray]], + closed: str = "both", + ) -> pd.DataFrame: + """Format the sparse output of subset collective anomaly detectors. + + Parameters + ---------- + collective_anomalies : list + List of tuples containing start and end indices of collective + anomalies and a np.array of the affected components/columns. + closed : str + Whether the (start, end) tuples correspond to intervals that are closed + on the left, right, both, or neither. + + Can be reused by subclasses to format the output of the _predict method. + + Returns + ------- + pd.DataFrame with columns + anomaly_interval: Intervals of the collective anomalies. + anomaly_columns: Affected columns of the collective anomalies.
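The resulting output format can be sketched as follows (hypothetical values; the affected columns are assumed to be integer positions):

    import pandas as pd

    y_sparse = pd.DataFrame(
        {
            "anomaly_interval": pd.IntervalIndex.from_tuples(
                [(3, 5), (8, 9)], closed="both"
            ),
            "anomaly_columns": [[0], [0, 2]],
        }
    )
    intervals = y_sparse["anomaly_interval"].array
    print(intervals.left, intervals.right)       # per-anomaly start/end points
    print(y_sparse["anomaly_columns"].tolist())  # affected columns per anomaly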
+ """ + anomaly_intervals = [(start, end) for start, end, _ in collective_anomalies] + affected_components = [components for _, _, components in collective_anomalies] + return pd.DataFrame( + { + "anomaly_interval": pd.IntervalIndex.from_tuples( + anomaly_intervals, closed=closed + ), + "anomaly_columns": affected_components, + } + ) From 78f50d236b94ea5adf0c29354981484e1f9b0891 Mon Sep 17 00:00:00 2001 From: tveten Date: Fri, 23 Aug 2024 11:57:56 +0200 Subject: [PATCH 56/75] Conform Mvcapa to BaseDetector --- skchange/anomaly_detectors/__init__.py | 3 +- skchange/anomaly_detectors/mvcapa.py | 66 ++++++++++--------- skchange/anomaly_detectors/tests/test_capa.py | 34 +++++----- 3 files changed, 53 insertions(+), 50 deletions(-) diff --git a/skchange/anomaly_detectors/__init__.py b/skchange/anomaly_detectors/__init__.py index 11590c03..04d55ac4 100644 --- a/skchange/anomaly_detectors/__init__.py +++ b/skchange/anomaly_detectors/__init__.py @@ -8,13 +8,14 @@ from skchange.anomaly_detectors.capa import Capa from skchange.anomaly_detectors.circular_binseg import CircularBinarySegmentation from skchange.anomaly_detectors.moscore_anomaly import MoscoreAnomaly +from skchange.anomaly_detectors.mvcapa import Mvcapa BASE_ANOMALY_DETECTORS = [CollectiveAnomalyDetector, PointAnomalyDetector] COLLECTIVE_ANOMALY_DETECTORS = [ Capa, CircularBinarySegmentation, MoscoreAnomaly, - # Mvcapa, + Mvcapa, StatThresholdAnomaliser, ] POINT_ANOMALY_DETECTORS = [] diff --git a/skchange/anomaly_detectors/mvcapa.py b/skchange/anomaly_detectors/mvcapa.py index 611ead02..a03b2239 100644 --- a/skchange/anomaly_detectors/mvcapa.py +++ b/skchange/anomaly_detectors/mvcapa.py @@ -9,9 +9,9 @@ import pandas as pd from numba import njit from scipy.stats import chi2 -from sktime.annotation.base import BaseSeriesAnnotator -from skchange.anomaly_detectors.utils import format_multivariate_anomaly_output +from skchange.anomaly_detectors.base import SubsetCollectiveAnomalyDetector +from skchange.anomaly_detectors.utils import merge_anomalies from skchange.costs.saving_factory import saving_factory from skchange.utils.validation.data import check_data from skchange.utils.validation.parameters import check_larger_than @@ -354,7 +354,7 @@ def run_mvcapa( return opt_savings, collective_anomalies, point_anomalies -class Mvcapa(BaseSeriesAnnotator): +class Mvcapa(SubsetCollectiveAnomalyDetector): """Subset multivariate collective and point anomaly detection. An efficient implementation of the MVCAPA algorithm [1]_ for anomaly detection. @@ -381,18 +381,6 @@ class Mvcapa(BaseSeriesAnnotator): ignore_point_anomalies : bool, optional (default=False) If True, detected point anomalies are not returned by .predict(). I.e., only collective anomalies are returned. - fmt : str {"dense", "sparse"}, optional (default="sparse") - Annotation output format: - * If "sparse", a sub-series of labels for only the outliers in X is returned, - * If "dense", a series of labels for all values in X is returned. - labels : str {"indicator", "score", "int_label"}, optional (default="int_label") - Annotation output labels: - * If "indicator", returned values are boolean, indicating whether a value is - an outlier, - * If "score", returned values are floats, giving the outlier score. - * If "int_label", returned values are integer, indicating which segment a - value belongs to. 
- References ---------- @@ -426,8 +414,6 @@ def __init__( min_segment_length: int = 2, max_segment_length: int = 1000, ignore_point_anomalies: bool = False, - fmt: str = "sparse", - labels: str = "int_label", ): self.saving = saving self.collective_penalty = collective_penalty @@ -437,7 +423,7 @@ def __init__( self.min_segment_length = min_segment_length self.max_segment_length = max_segment_length self.ignore_point_anomalies = ignore_point_anomalies - super().__init__(fmt=fmt, labels=labels) + super().__init__() self.saving_func, self.saving_init_func = saving_factory(self.saving) @@ -463,7 +449,7 @@ def _get_penalty_components(self, X: pd.DataFrame) -> tuple[np.ndarray, float]: ) return collective_alpha, collective_betas, point_alpha, point_betas - def _fit(self, X: pd.DataFrame, Y: Optional[pd.DataFrame] = None): + def _fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None): """Fit to training data. Sets the penalty of the detector. @@ -478,7 +464,7 @@ def _fit(self, X: pd.DataFrame, Y: Optional[pd.DataFrame] = None): ---------- X : pd.DataFrame training data to fit the threshold to. - Y : pd.Series, optional + y : pd.Series, optional Does nothing. Only here to make the fit method compatible with sktime and scikit-learn. @@ -511,7 +497,7 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: Returns ------- - Y : pd.Series or pd.DataFrame + y : pd.Series or pd.DataFrame Annotations for sequence X, exact format depends on annotation type. """ X = check_data( @@ -519,7 +505,7 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: min_length=self.min_segment_length, min_length_name="min_segment_length", ) - opt_savings, self.collective_anomalies, self.point_anomalies = run_mvcapa( + opt_savings, collective_anomalies, point_anomalies = run_mvcapa( X.values, self.saving_func, self.saving_init_func, @@ -531,16 +517,31 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: self.max_segment_length, ) self.scores = pd.Series(opt_savings, index=X.index, name="score") - anomalies = format_multivariate_anomaly_output( - self.fmt, - self.labels, - X.index, - X.columns, - self.collective_anomalies, - self.point_anomalies if not self.ignore_point_anomalies else None, - self.scores, - ) - return anomalies + + if self.ignore_point_anomalies: + anomalies = collective_anomalies + else: + anomalies = merge_anomalies(collective_anomalies, point_anomalies) + return SubsetCollectiveAnomalyDetector._format_sparse_output(anomalies) + + def _score_transform(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: + """Compute the MVCAPA scores for the input data. + + Parameters + ---------- + X : pd.DataFrame - data to compute scores for, time series + + Returns + ------- + scores : pd.Series - scores for sequence X + + Notes + ----- + The MVCAPA scores are the cumulative optimal savings, so the scores are + increasing and are not per observation scores. 
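Since the returned scores are cumulative, a rough per-observation signal can be recovered by differencing. A sketch (illustrative only; the data arguments mirror the tests below):

    import numpy as np
    from skchange.anomaly_detectors.mvcapa import Mvcapa
    from skchange.datasets.generate import generate_teeth_data

    df = generate_teeth_data(
        n_segments=2, mean=10, segment_length=20, p=5, random_state=8
    )
    detector = Mvcapa(saving="mean").fit(df)
    scores = detector.score_transform(df)             # non-decreasing, cumulative
    increments = np.diff(scores.values, prepend=0.0)  # rough per-sample deltas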
+ """ + self.predict(X) + return self.scores @classmethod def get_test_params(cls, parameter_set="default"): @@ -563,5 +564,6 @@ def get_test_params(cls, parameter_set="default"): """ params = [ {"saving": "mean", "min_segment_length": 5, "max_segment_length": 100}, + {"saving": "mean", "min_segment_length": 2, "max_segment_length": 20}, ] return params diff --git a/skchange/anomaly_detectors/tests/test_capa.py b/skchange/anomaly_detectors/tests/test_capa.py index 43ffa99f..11013bbe 100644 --- a/skchange/anomaly_detectors/tests/test_capa.py +++ b/skchange/anomaly_detectors/tests/test_capa.py @@ -4,31 +4,31 @@ import pytest from skchange.anomaly_detectors.capa import Capa +from skchange.anomaly_detectors.mvcapa import Mvcapa from skchange.costs.saving_factory import VALID_SAVINGS from skchange.datasets.generate import generate_teeth_data @pytest.mark.parametrize("saving", VALID_SAVINGS) -def test_capa_anomalies(saving): +@pytest.mark.parametrize("detector_class", [Capa, Mvcapa]) +def test_capa_anomalies(detector_class, saving): """Test Capa anomalies.""" n_segments = 2 seg_len = 20 df = generate_teeth_data( n_segments=n_segments, mean=10, segment_length=seg_len, p=5, random_state=8 ) - capa_classes = [Capa] - for detector_class in capa_classes: - detector = detector_class( - saving=saving, - collective_penalty_scale=2.0, - ignore_point_anomalies=True, # To get test coverage. - ) - anomalies = detector.fit_predict(df) - if isinstance(anomalies, pd.DataFrame): - anomalies = anomalies["location"] - # End point also included as a changepoint - assert ( - len(anomalies) == 1 - and anomalies.array.left[0] == seg_len - and anomalies.array.right[0] == 2 * seg_len - 1 - ) + detector = detector_class( + saving=saving, + collective_penalty_scale=2.0, + ignore_point_anomalies=True, # To get test coverage. 
+ ) + anomalies = detector.fit_predict(df) + if isinstance(anomalies, pd.DataFrame): + anomalies = anomalies.iloc[:, 0] + # End point also included as a changepoint + assert ( + len(anomalies) == 1 + and anomalies.array.left[0] == seg_len + and anomalies.array.right[0] == 2 * seg_len - 1 + ) From 306cb4098d282858dacf872ccaec946301bd5f62 Mon Sep 17 00:00:00 2001 From: tveten Date: Fri, 23 Aug 2024 12:25:36 +0200 Subject: [PATCH 57/75] Remove merge_anomalies Unnecessarily complicated function --- skchange/anomaly_detectors/capa.py | 10 +- skchange/anomaly_detectors/mvcapa.py | 10 +- skchange/anomaly_detectors/utils.py | 205 --------------------------- 3 files changed, 10 insertions(+), 215 deletions(-) delete mode 100644 skchange/anomaly_detectors/utils.py diff --git a/skchange/anomaly_detectors/capa.py b/skchange/anomaly_detectors/capa.py index e168c45f..e0e7ef8b 100644 --- a/skchange/anomaly_detectors/capa.py +++ b/skchange/anomaly_detectors/capa.py @@ -11,7 +11,6 @@ from skchange.anomaly_detectors.base import CollectiveAnomalyDetector from skchange.anomaly_detectors.mvcapa import dense_capa_penalty, run_base_capa -from skchange.anomaly_detectors.utils import merge_anomalies from skchange.costs.saving_factory import saving_factory from skchange.utils.validation.data import check_data from skchange.utils.validation.parameters import check_larger_than @@ -206,10 +205,11 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: ) self.scores = pd.Series(opt_savings, index=X.index, name="score") - if self.ignore_point_anomalies: - anomalies = collective_anomalies - else: - anomalies = merge_anomalies(collective_anomalies, point_anomalies) + anomalies = collective_anomalies + if not self.ignore_point_anomalies: + anomalies += point_anomalies + anomalies = sorted(anomalies) + return CollectiveAnomalyDetector._format_sparse_output(anomalies) def _score_transform(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: diff --git a/skchange/anomaly_detectors/mvcapa.py b/skchange/anomaly_detectors/mvcapa.py index a03b2239..5ad79621 100644 --- a/skchange/anomaly_detectors/mvcapa.py +++ b/skchange/anomaly_detectors/mvcapa.py @@ -11,7 +11,6 @@ from scipy.stats import chi2 from skchange.anomaly_detectors.base import SubsetCollectiveAnomalyDetector -from skchange.anomaly_detectors.utils import merge_anomalies from skchange.costs.saving_factory import saving_factory from skchange.utils.validation.data import check_data from skchange.utils.validation.parameters import check_larger_than @@ -518,10 +517,11 @@ def _predict(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: ) self.scores = pd.Series(opt_savings, index=X.index, name="score") - if self.ignore_point_anomalies: - anomalies = collective_anomalies - else: - anomalies = merge_anomalies(collective_anomalies, point_anomalies) + anomalies = collective_anomalies + if not self.ignore_point_anomalies: + anomalies += point_anomalies + anomalies = sorted(anomalies) + return SubsetCollectiveAnomalyDetector._format_sparse_output(anomalies) def _score_transform(self, X: Union[pd.DataFrame, pd.Series]) -> pd.Series: diff --git a/skchange/anomaly_detectors/utils.py b/skchange/anomaly_detectors/utils.py deleted file mode 100644 index 6ab6c12f..00000000 --- a/skchange/anomaly_detectors/utils.py +++ /dev/null @@ -1,205 +0,0 @@ -"""Utility functions for anomaly detection.""" - -from typing import Union - -import numpy as np -import pandas as pd - - -def merge_anomalies( - collective_anomalies: Union[ - list[tuple[int, int]], list[tuple[int, int, 
np.ndarray]] - ] = None, - point_anomalies: Union[ - list[int], - list[tuple[int, int]], - list[tuple[int, np.ndarray]], - list[tuple[int, int, np.ndarray]], - ] = None, -) -> list[tuple[int, int, np.ndarray]]: - """Merge collective and point anomalies into a single list of intervals. - - Parameters - ---------- - collective_anomalies : list, optional (default=None) - List of tuples containing inclusive start and end indices of collective - anomalies. - point_anomalies : list, optional (default=None) - List of point anomaly indices. - - Returns - ------- - list - List of tuples containing inclusive start and end indices of collective - anomalies and point anomalies. - """ - if collective_anomalies is None and point_anomalies is None: - raise ValueError( - "Either collective_anomalies or point_anomalies must be given." - ) - - anomalies = [] - if collective_anomalies: - anomalies += collective_anomalies - if point_anomalies: - # Convert point anomalies to the same format as collective anomalies - if isinstance(point_anomalies[0], int): - anomalies += [(i, i) for i in point_anomalies] - elif len(point_anomalies[0]) == 2 and isinstance( - point_anomalies[0][-1], np.ndarray - ): - anomalies += [(i, i, components) for (i, components) in point_anomalies] - else: - anomalies += point_anomalies - - anomalies = sorted(anomalies) - return anomalies - - -def anomalies_to_labels( - anomalies: list[tuple[int, int]], n: int, p: int = None -) -> np.ndarray: - """Convert anomaly indices to labels. - - Parameters - ---------- - anomalies : list - List of tuples containing inclusive start and end indices of collective - anomalies and point anomalies. - n : int - Sample size. - p : int - Dimensionality of the data input to the anomaly detector. - - Returns - ------- - np.ndarray - Array of labels, where 0 is the normal class, and 1, 2, ... are labels for each - distinct collective and/or point_anomaly. - """ - labels = np.zeros(n, dtype=int) if p is None else np.zeros((n, p), dtype=int) - if len(anomalies) == 0: - return labels - - if len(anomalies[0]) == 2: - for i, (start, end) in enumerate(anomalies): - labels[start : end + 1] = i + 1 - elif len(anomalies[0]) == 3: - # Multivariate - for i, (start, end, components) in enumerate(anomalies): - labels[start : end + 1, components] = i + 1 - return labels - - -def format_anomaly_output( - fmt: str, - labels: str, - X_index: pd.Index, - collective_anomalies: list[tuple] = None, - point_anomalies: list[tuple] = None, - scores: Union[pd.Series, pd.DataFrame] = None, -) -> pd.Series: - """Format the predict method output of change detectors. - - Parameters - ---------- - fmt : str - Format of the output. Either "sparse" or "dense". - labels : str - Labels of the output. Either "indicator", "score" or "int_label". - X_index : pd.Index - Index of the input data. - collective_anomalies : list, optional (default=None) - List of tuples containing inclusive start and end indices of collective - anomalies. - point_anomalies : list, optional (default=None) - List of point anomaly indices. - scores : pd.Series or pd.DataFrame, optional (default=None) - Series or DataFrame of scores. If Series, it must be named 'score', and if - DataFrame, it must have a column named 'score'. - - Returns - ------- - pd.Series - Either a sparse or dense pd.Series of boolean labels, integer labels or scores. 
-    """
-    n = X_index.size
-    anomalies = merge_anomalies(collective_anomalies, point_anomalies)
-    if labels == "int_label":
-        if fmt == "dense":
-            anomaly_labels = anomalies_to_labels(anomalies, n)
-            out = pd.Series(anomaly_labels, index=X_index, name="int_label", dtype=int)
-        elif fmt == "sparse":
-            out = pd.DataFrame(anomalies, columns=["start", "end"])
-    elif labels == "indicator":
-        if fmt == "dense":
-            anomaly_labels = anomalies_to_labels(anomalies, n)
-            out = pd.Series(anomaly_labels > 0, index=X_index, name="indicator")
-        elif fmt == "sparse":
-            out = pd.DataFrame(anomalies, columns=["start", "end"])
-    elif labels == "score":
-        # There is no sparse version of 'score'.
-        # The scores are formatted in each class' _predict method, as what is a good
-        # format for the scores is method dependent.
-        out = scores
-    return out
-
-
-def format_multivariate_anomaly_output(
-    fmt: str,
-    labels: str,
-    X_index: pd.Index,
-    X_columns: pd.Index,
-    collective_anomalies: list[dict] = None,
-    point_anomalies: list[dict] = None,
-    scores: Union[pd.Series, pd.DataFrame] = None,
-) -> pd.Series:
-    """Format the predict method output of change detectors.
-
-    Parameters
-    ----------
-    fmt : str
-        Format of the output. Either "sparse" or "dense".
-    labels : str
-        Labels of the output. Either "indicator", "score" or "int_label".
-    X_index : pd.Index
-        Index of the input data.
-    X_columns : pd.Index
-        Columns of the input data.
-    collective_anomalies : list, optional (default=None)
-        List of tuples containing inclusive start and end indices of collective
-        anomalies.
-    point_anomalies : list, optional (default=None)
-        List of point anomaly indices.
-    scores : pd.Series or pd.DataFrame, optional (default=None)
-        Series or DataFrame of scores. If Series, it must be named 'score', and if
-        DataFrame, it must have a column named 'score'.
-
-    Returns
-    -------
-    pd.Series
-        Either a sparse or dense pd.Series of boolean labels, integer labels or scores.
-    """
-    n = X_index.size
-    p = X_columns.size
-    anomalies = merge_anomalies(collective_anomalies, point_anomalies)
-    if labels == "int_label":
-        if fmt == "dense":
-            anomaly_labels = anomalies_to_labels(anomalies, n, p)
-            out = pd.DataFrame(
-                anomaly_labels, index=X_index, columns=X_columns, dtype=int
-            )
-        elif fmt == "sparse":
-            out = pd.DataFrame(anomalies, columns=["start", "end", "components"])
-    elif labels == "indicator":
-        if fmt == "dense":
-            anomaly_labels = anomalies_to_labels(anomalies, n, p)
-            out = pd.DataFrame(anomaly_labels > 0, index=X_index, columns=X_columns)
-        elif fmt == "sparse":
-            out = pd.DataFrame(anomalies, columns=["start", "end", "components"])
-    elif labels == "score":
-        # There is no sparse version of 'score'.
-        # The scores are formatted in each class' _predict method, as what is a good
-        # format for the scores is method dependent.
-        out = scores
-    return out

From edd64a276db0add8433dc7737f9cb6711b8aa325 Mon Sep 17 00:00:00 2001
From: tveten
Date: Fri, 23 Aug 2024 12:57:13 +0200
Subject: [PATCH 58/75] Remove old commented code

---
 skchange/base.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/skchange/base.py b/skchange/base.py
index 1b5163aa..939f30a1 100644
--- a/skchange/base.py
+++ b/skchange/base.py
@@ -63,15 +63,6 @@ class BaseDetector(BaseTransformer):
     - _update(self, X, y=None) -> self
     """

-    # _tags = {
-    #     "object_type": "transformer",  # sktime scitype of object
-    #     "learning_type": "None",  # Tag to determine test in test_all_annotators
-    #     "task": "None",  # Tag to determine test in test_all_annotators
-    #     #
-    #     # todo: distribution_type? we may have to refactor this, seems very soecufuc
-    #     "distribution_type": "None",  # Tag to determine test in test_all_annotators
-    # }  # for unit test cases
-
     _tags = {
         "object_type": "transformer",  # type of object
         "scitype:transform-input": "Series",

From 78c940c2fd8e97b3a0c1c97211378eff70fa8b09 Mon Sep 17 00:00:00 2001
From: tveten
Date: Fri, 23 Aug 2024 12:57:52 +0200
Subject: [PATCH 59/75] Allow sparse_to_dense and transform to mismatch on Series vs DataFrame

---
 skchange/anomaly_detectors/tests/test_anomaly_detectors.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/skchange/anomaly_detectors/tests/test_anomaly_detectors.py b/skchange/anomaly_detectors/tests/test_anomaly_detectors.py
index da9dedca..28a1847c 100644
--- a/skchange/anomaly_detectors/tests/test_anomaly_detectors.py
+++ b/skchange/anomaly_detectors/tests/test_anomaly_detectors.py
@@ -56,6 +56,10 @@ def test_anomaly_detector_sparse_to_dense(Estimator):
     labels_predict_convert = detector.sparse_to_dense(
         anomalies, anomaly_data.index, anomaly_data.columns
     )
+    if isinstance(labels_predict_convert, pd.Series):
+        # transform does output conversion to match the input. This is not required of
+        # sparse_to_dense.
+        labels_predict_convert = labels_predict_convert.to_frame()
     labels_transform = detector.fit_transform(anomaly_data)
     assert labels_predict_convert.equals(labels_transform)

From ac780aeffc5d7eeae4caaecf37098136768415ff Mon Sep 17 00:00:00 2001
From: tveten
Date: Fri, 23 Aug 2024 12:58:38 +0200
Subject: [PATCH 60/75] Fix docstrings and type hints

---
 skchange/anomaly_detectors/base.py | 10 +++++++---
 skchange/change_detectors/base.py  |  4 ++--
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/skchange/anomaly_detectors/base.py b/skchange/anomaly_detectors/base.py
index ca7d83bd..1057bad1 100644
--- a/skchange/anomaly_detectors/base.py
+++ b/skchange/anomaly_detectors/base.py
@@ -43,6 +43,8 @@ def sparse_to_dense(
             The sparse output from an anomaly detector's predict method.
         index : array-like
             Indices that are to be annotated according to ``y_sparse``.
+        columns: array-like
+            Not used. Only for API compatibility.

         Returns
         -------
@@ -119,6 +121,8 @@ def sparse_to_dense(
             The collective anomaly intervals.
         index : array-like
             Indices that are to be annotated according to ``y_sparse``.
+        columns: array-like
+            Not used. Only for API compatibility.

         Returns
         -------
@@ -218,8 +222,8 @@ class SubsetCollectiveAnomalyDetector(BaseDetector):

     @staticmethod
     def sparse_to_dense(
-        y_sparse: pd.Series, index: pd.Index, columns: pd.Index
-    ) -> pd.Series:
+        y_sparse: pd.DataFrame, index: pd.Index, columns: pd.Index
+    ) -> pd.DataFrame:
         """Convert the sparse output from the predict method to a dense format.

         Parameters
@@ -257,7 +261,7 @@ def sparse_to_dense(
         return pd.DataFrame(labels, index=index, columns=columns)

     @staticmethod
-    def dense_to_sparse(y_dense):
+    def dense_to_sparse(y_dense: pd.DataFrame):
         """Convert the dense output from the transform method to a sparse format.

         Parameters
diff --git a/skchange/change_detectors/base.py b/skchange/change_detectors/base.py
index d9bb995d..ed22668d 100644
--- a/skchange/change_detectors/base.py
+++ b/skchange/change_detectors/base.py
@@ -46,8 +46,8 @@ def sparse_to_dense(
             The sparse output from a changepoint detector's predict method.
         index : array-like
             Indices that are to be annotated according to ``y_sparse``.
-        columns : array-like
-            Columns that are to be annotated according to ``y_sparse``.
+        columns: array-like
+            Not used. Only for API compatibility.

         Returns
         -------

From ac910b7cec2c651d56418d082a060e971ea658e8 Mon Sep 17 00:00:00 2001
From: tveten
Date: Fri, 23 Aug 2024 13:06:47 +0200
Subject: [PATCH 61/75] Fix tests

---
 skchange/change_detectors/tests/test_change_detectors.py | 2 +-
 skchange/tests/test_all_detectors.py                     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/skchange/change_detectors/tests/test_change_detectors.py b/skchange/change_detectors/tests/test_change_detectors.py
index e0e9cc19..c9ea7b07 100644
--- a/skchange/change_detectors/tests/test_change_detectors.py
+++ b/skchange/change_detectors/tests/test_change_detectors.py
@@ -9,7 +9,7 @@
 seg_len = 50
 changepoint_data = generate_teeth_data(
     n_segments=n_segments, mean=10, segment_length=seg_len, p=1, random_state=2
-)
+)[0]


 @pytest.mark.parametrize("Estimator", CHANGE_DETECTORS)
diff --git a/skchange/tests/test_all_detectors.py b/skchange/tests/test_all_detectors.py
index 54e52da7..3f99e4cb 100644
--- a/skchange/tests/test_all_detectors.py
+++ b/skchange/tests/test_all_detectors.py
@@ -89,4 +89,4 @@ def test_detector_not_implemented_methods():
     with pytest.raises(NotImplementedError):
         detector.dense_to_sparse(x)
     with pytest.raises(NotImplementedError):
-        detector.sparse_to_dense(x)
+        detector.sparse_to_dense(x, x.index, pd.Index(["a"]))

From 788064d6a19df3e2548fb0cad20bf3a4f7cc52d8 Mon Sep 17 00:00:00 2001
From: tveten
Date: Fri, 23 Aug 2024 13:25:11 +0200
Subject: [PATCH 62/75] Clean up files

---
 skchange/anomaly_detectors/base.py | 38 +++++---------------
 skchange/base.py                   | 56 ++++--------------------------
 skchange/change_detectors/base.py  |  5 +--
 3 files changed, 18 insertions(+), 81 deletions(-)

diff --git a/skchange/anomaly_detectors/base.py b/skchange/anomaly_detectors/base.py
index 1057bad1..c33e3bdb 100644
--- a/skchange/anomaly_detectors/base.py
+++ b/skchange/anomaly_detectors/base.py
@@ -14,14 +14,6 @@ class PointAnomalyDetector(BaseDetector):
     Output format of the predict method: See the dense_to_sparse method.
     Output format of the transform method: See the sparse_to_dense method.

-    Subclasses should set the following tags for sktime compatibility:
-    - task: "anomaly_detection"
-    - learning_type: "unsupervised" or "supervised"
-    - And possibly other tags, such as
-      * "capability:missing_values": False,
-      * "capability:multivariate": True,
-      * "fit_is_empty": False,
-
     Needs to be implemented:
     - _fit(self, X, y=None) -> self
    - _predict(self, X) -> pd.Series
@@ -92,14 +84,6 @@ class CollectiveAnomalyDetector(BaseDetector):
     Output format of the predict method: See the dense_to_sparse method.
     Output format of the transform method: See the sparse_to_dense method.

-    Subclasses should set the following tags for sktime compatibility:
-    - task: "collective_anomaly_detection"
-    - learning_type: "unsupervised" or "supervised"
-    - And possibly other tags, such as
-      * "capability:missing_values": False,
-      * "capability:multivariate": True,
-      * "fit_is_empty": False,
-
     Needs to be implemented:
     - _fit(self, X, y=None) -> self
     - _predict(self, X) -> pd.Series
@@ -202,15 +186,6 @@ class SubsetCollectiveAnomalyDetector(BaseDetector):

     Output format of the predict method:

-    Subclasses should set the following tags for sktime compatibility:
-    - task: "collective_anomaly_detection"
-    - learning_type: "unsupervised" or "supervised"
-    - capability:subset_detection: True
-    - And possibly other tags, such as
-      * "capability:missing_values": False,
-      * "capability:multivariate": True,
-      * "fit_is_empty": False,
-
     Needs to be implemented:
     - _fit(self, X, y=None) -> self
     - _predict(self, X) -> pd.DataFrame
@@ -239,7 +214,8 @@ def sparse_to_dense(
         Returns
         -------
-        pd.DataFrame
+        pd.DataFrame where 0-entries are normal and each collective anomaly is labelled
+            from 1, ..., K.
         """
         anomaly_intervals = y_sparse.iloc[:, 0].array
         anomaly_starts = anomaly_intervals.left
@@ -271,7 +247,9 @@ def dense_to_sparse(y_dense: pd.DataFrame):

         Returns
         -------
-        pd.DataFrame
+        pd.DataFrame with columns
+            anomaly_interval: Intervals of the collective anomalies.
+            anomaly_columns: Affected columns of the collective anomalies.
         """
         # The sparse format only uses integer positions, so we reset index and columns.
         y_dense = y_dense.reset_index(drop=True)
@@ -296,9 +274,11 @@ def dense_to_sparse(y_dense: pd.DataFrame):
     def _format_sparse_output(
         collective_anomalies: list[tuple[int, int, np.ndarray]],
         closed: str = "both",
-    ) -> pd.Series:
+    ) -> pd.DataFrame:
         """Format the sparse output of subset collective anomaly detectors.

+        Can be reused by subclasses to format the output of the _predict method.
+
         Parameters
         ----------
         collective_anomalies : list
@@ -308,8 +288,6 @@ def _format_sparse_output(
             Whether the (start, end) tuples correspond to intervals that are closed on
             the left, right, both, or neither.

-        Can be reused by subclasses to format the output of the _predict method.
-
         Returns
         -------
         pd.DataFrame with columns
diff --git a/skchange/base.py b/skchange/base.py
index 939f30a1..429e289a 100644
--- a/skchange/base.py
+++ b/skchange/base.py
@@ -2,7 +2,7 @@

     class name: BaseDetector

-    Adapted from the sktime.BaseSeriesAnnotator class.
+    Adapted from the BaseSeriesAnnotator and BaseTransformer classes in sktime.

 Scitype defining methods:
     fitting                  - fit(self, X, y=None)
@@ -11,11 +11,11 @@ class name: BaseDetector
     detection scores, dense  - score_transform(self, X)  [optional]
     updating (temporal)      - update(self, X, y=None)  [optional]

-Each detector type (e.g. anomaly detector, collective anomaly detector, changepoint
-detector) are subclasses of BaseDetector (task + learning_type tags in sktime).
-They are defined by the content and format of the output of the predict method. Each
-detector type therefore has the following methods for converting between sparse and
-dense output formats:
+Each detector type (e.g. point anomaly detector, collective anomaly detector,
+changepoint detector) is a subclass of BaseDetector (task tag in sktime).
+A detector type is defined by the content and format of the output of the predict
+method. Each detector type therefore has the following methods for converting between
+sparse and dense output formats:

     converting sparse output to dense - sparse_to_dense(y_sparse, index, columns)
     converting dense output to sparse - dense_to_sparse(y_dense)  [optional]
@@ -63,6 +63,7 @@ class BaseDetector(BaseTransformer):
     - _update(self, X, y=None) -> self
     """

+    # _tags are adapted from BaseTransformer in sktime.
     _tags = {
         "object_type": "transformer",  # type of object
         "scitype:transform-input": "Series",
@@ -398,46 +399,3 @@ def fit_transform(self, X, y=None):
             Annotations for sequence X exact format depends on annotation type.
         """
         return self.fit(X).transform(X)
-
-
-# Notes on required .predict output formats per detector type (task and capability):
-#
-# - task == "anomaly_detection":
-#     pd.Series(anomaly_indices, dtype=int, name="anomalies)
-# - task == "collective_anomaly_detection":
-#     pd.Series(pd.IntervalIndex(
-#         anomaly_intervals, closed=, name="collective_anomalies"
-#     ))
-# - task == "change_point_detection":
-#     Changepoints are defined as the last element of a segment.
-#     pd.Series(changepoint_indices, dtype=int, name="changepoints")
-# - task == "segmentation":
-#     Difference from change point detection: Allows the same label to be assigned to
-#     multiple segments.
-#     pd.Series({
-#         index = pd.IntervalIndex(segment_intervals, closed=),
-#         values = segment_labels,
-#     })
-# - task == "None":
-#     Custom task.
-#     Only restriction is that the output must be a pd.Series or pd.DataFrame where
-#     each element or row corresponds to a detected event.
-#     For .transform to work, .sparse_to_dense must be implemented for custom tasks.
-# - capability:subset_detection is True:
-#     * task == "anomaly_detection":
-#         pd.DataFrame({
-#             "location": anomaly_indices,
-#             "columns": affected_components_list,
-#         })
-#     * task == "collective_anomaly_detection":
-#         pd.DataFrame({
-#             "location": pd.IntervalIndex(anomaly_intervals, closed=),
-#             "columns": affected_components_list,
-#         })
-#     * task == "change_point_detection":
-#         pd.DataFrame({
-#             "location": changepoint_indices,
-#             "columns": affected_components_list,
-#         })
-# - capability:detection_score is True: Explicit way of stating that _score_transform
-#     is implemented.
diff --git a/skchange/change_detectors/base.py b/skchange/change_detectors/base.py
index ed22668d..d03d9a81 100644
--- a/skchange/change_detectors/base.py
+++ b/skchange/change_detectors/base.py
@@ -52,7 +52,7 @@ def sparse_to_dense(
         Returns
         -------
         pd.Series with integer labels 0, ..., K for each segment between two
-        changepoints.
+            changepoints.
         """
         changepoints = y_sparse.to_list()
         n = len(index)
@@ -76,7 +76,8 @@ def dense_to_sparse(y_dense: pd.Series) -> pd.Series:

         Returns
         -------
-        pd.Series
+        pd.Series of changepoint locations. Changepoints are defined as the last element
+            of a segment.
         """
         y_dense = y_dense.reset_index(drop=True)
         # changepoint = end of segment, so the label diffs > 0 must be shifted by -1.
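[Note between patches: the docstrings above fix the sparse/dense conversion contract for change detectors. The following is a minimal round-trip sketch, not part of the patch series, assuming ChangeDetector exposes sparse_to_dense and dense_to_sparse as static methods with the three-argument signature exercised in test_all_detectors.py above; the columns argument is unused for change detectors.]

    import pandas as pd

    from skchange.change_detectors.base import ChangeDetector

    index = pd.RangeIndex(6)
    # Changepoints are the last index of each segment.
    y_sparse = pd.Series([2, 4], dtype=int, name="changepoints")
    y_dense = ChangeDetector.sparse_to_dense(y_sparse, index, pd.Index(["a"]))
    # Dense segment labels: 0, 0, 0, 1, 1, 2
    y_roundtrip = ChangeDetector.dense_to_sparse(y_dense)
    # Label diffs > 0, shifted by -1, recover the changepoints [2, 4].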
From b4a5da2d40708b3a25dd209a3d05c2c299206e9a Mon Sep 17 00:00:00 2001
From: tveten
Date: Fri, 23 Aug 2024 13:28:47 +0200
Subject: [PATCH 63/75] Tidy up tags

---
 skchange/change_detectors/moscore.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/skchange/change_detectors/moscore.py b/skchange/change_detectors/moscore.py
index 979bf45c..4c50313a 100644
--- a/skchange/change_detectors/moscore.py
+++ b/skchange/change_detectors/moscore.py
@@ -101,8 +101,6 @@ class Moscore(ChangeDetector):
     """

     _tags = {
-        "task": "change_point_detection",
-        "learning_type": "unsupervised",
         "capability:missing_values": False,
         "capability:multivariate": True,
         "fit_is_empty": False,

From a922a90241fc609399a417be5dc1f35b5d85d926 Mon Sep 17 00:00:00 2001
From: tveten
Date: Fri, 23 Aug 2024 13:39:00 +0200
Subject: [PATCH 64/75] Update readme

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 7ddd9a89..6d920bcd 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@ from skchange.change_detectors.moscore import Moscore
 from skchange.datasets.generate import generate_teeth_data

 df = generate_teeth_data(n_segments=10, segment_length=50, mean=5, random_state=1)
-detector = Moscore(bandwidth=10, fmt="sparse")
+detector = Moscore(bandwidth=10)
 detector.fit_predict(df)
 >>>
 0     49
@@ -51,7 +51,7 @@ df = generate_teeth_data(
     affected_proportion=0.2,
     random_state=2,
 )
-detector = Mvcapa(collective_penalty="sparse", fmt="sparse")
+detector = Mvcapa(collective_penalty="sparse")
 detector.fit_predict(df)
 >>>
    start  end components

From 6f6427f1171d350218ef646c2cae94b66ec308f4 Mon Sep 17 00:00:00 2001
From: tveten
Date: Fri, 23 Aug 2024 16:27:55 +0200
Subject: [PATCH 65/75] Fix bug: Input columns being overwritten

---
 skchange/anomaly_detectors/base.py            | 4 ++--
 skchange/anomaly_detectors/tests/test_capa.py | 7 ++++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/skchange/anomaly_detectors/base.py b/skchange/anomaly_detectors/base.py
index c33e3bdb..fd4ce347 100644
--- a/skchange/anomaly_detectors/base.py
+++ b/skchange/anomaly_detectors/base.py
@@ -231,8 +231,8 @@ def sparse_to_dense(
         labels = np.zeros((len(index), len(columns)), dtype="int64")
         anomalies = zip(anomaly_starts, anomaly_ends, anomaly_columns)
-        for i, (start, end, columns) in enumerate(anomalies):
-            labels[start:end, columns] = i + 1
+        for i, (start, end, affected_columns) in enumerate(anomalies):
+            labels[start:end, affected_columns] = i + 1

         return pd.DataFrame(labels, index=index, columns=columns)

diff --git a/skchange/anomaly_detectors/tests/test_capa.py b/skchange/anomaly_detectors/tests/test_capa.py
index 11013bbe..b21406c1 100644
--- a/skchange/anomaly_detectors/tests/test_capa.py
+++ b/skchange/anomaly_detectors/tests/test_capa.py
@@ -16,7 +16,12 @@ def test_capa_anomalies(detector_class, saving):
     n_segments = 2
     seg_len = 20
     df = generate_teeth_data(
-        n_segments=n_segments, mean=10, segment_length=seg_len, p=5, random_state=8
+        n_segments=n_segments,
+        mean=20,
+        segment_length=seg_len,
+        p=5,
+        affected_proportion=0.2,
+        random_state=8,
     )
     detector = detector_class(
         saving=saving,

From 4ded1976bd8bae6535882926965ebb17859ce961 Mon Sep 17 00:00:00 2001
From: tveten
Date: Fri, 23 Aug 2024 16:50:20 +0200
Subject: [PATCH 66/75] Update benchmarking script

---
 interactive/benchmark.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/interactive/benchmark.py b/interactive/benchmark.py
index ef057706..6f9ca4a0 100644
--- a/interactive/benchmark.py
+++ b/interactive/benchmark.py
@@ -1,20 +1,23 @@
+"""Benchmarking the computational efficiency of the detectors."""
+
 from timeit import timeit

 import numpy as np
 import pandas as pd
 import plotly.express as px

-from skchange.anomaly_detectors.tests.test_anomaly_detectors import anomaly_detectors
-from skchange.change_detectors.tests.test_change_detectors import change_detectors
+from skchange.anomaly_detectors import ANOMALY_DETECTORS
+from skchange.change_detectors import CHANGE_DETECTORS

 # TODO: Add all the different scores and costs.
-detector_classes = anomaly_detectors + change_detectors
+# TODO: Make sure hyperparameters are set such that comparisons are fair.
+detector_classes = ANOMALY_DETECTORS + CHANGE_DETECTORS
 ns = [1000, 10000, 100000, 1000000]
 n_runs = [100, 10, 1, 1]
 timings = {}
 for detector_class in detector_classes:
     detector_name = detector_class.__name__
-    detector = detector_class.create_test_instance().set_params(fmt="sparse")
+    detector = detector_class.create_test_instance()
     setup_data = pd.DataFrame(np.random.normal(0, 1, size=1000))
     detector.fit_predict(setup_data)  # Compile numba
     timings[detector_name] = []

From 2b49a1c1301c273a8cfec3469fb76431423fe34c Mon Sep 17 00:00:00 2001
From: tveten
Date: Fri, 23 Aug 2024 16:50:55 +0200
Subject: [PATCH 67/75] Update the capa interactive scripts

---
 interactive/explore_capa.py | 51 ++++++++++++++++++++++++-------------
 1 file changed, 34 insertions(+), 17 deletions(-)

diff --git a/interactive/explore_capa.py b/interactive/explore_capa.py
index 3a7106b3..5c269660 100644
--- a/interactive/explore_capa.py
+++ b/interactive/explore_capa.py
@@ -1,3 +1,6 @@
+"""Interactively explore the Capa and Mvcapa anomaly detectors."""
+
+import pandas as pd
 import plotly.express as px

 from skchange.anomaly_detectors.capa import Capa
@@ -6,30 +9,44 @@
 from skchange.utils.benchmarking.profiler import Profiler

 # Univariate
-df = generate_teeth_data(n_segments=5, mean=10, segment_length=10, p=1, random_state=2)
-capa = Capa(fmt="sparse", max_segment_length=20)
-anomalies = capa.fit_predict(df)
+df = generate_teeth_data(n_segments=5, segment_length=10, mean=10, random_state=2)[0]
+detector = Capa(max_segment_length=20)
+
+anomalies = detector.fit_predict(df)
+print(anomalies)

-capa = Capa(labels="score", fmt="dense", max_segment_length=20)
-scores = capa.fit_predict(df)
+anomaly_labels = detector.fit_transform(df)
+px.scatter(x=df.index, y=df, color=anomaly_labels.astype(str))

-capa = Capa(labels="indicator", fmt="dense", max_segment_length=20)
-anomalies = capa.fit_predict(df)
-px.scatter(x=df.index, y=df.values[:, 0], color=anomalies)
+scores = detector.score_transform(df)
+px.scatter(scores)

 # Multivariate
-# TODO: Add plotting functionality to assess the affected subset.
 df = generate_teeth_data(5, 10, p=10, mean=10, affected_proportion=0.2, random_state=2)
-capa = Mvcapa(collective_penalty="sparse", fmt="sparse")
-anomalies = capa.fit_predict(df)
+detector = Mvcapa(collective_penalty="sparse")
+
+anomalies = detector.fit_predict(df)
+print(anomalies)
+
+anomaly_labels = detector.fit_transform(df)
+anomaly_labels = (anomaly_labels > 0).astype(int)
+anomaly_labels[anomaly_labels == 0] = 0.1
+plot_df = pd.concat(
+    [
+        df.melt(ignore_index=False).reset_index(),
+        anomaly_labels.melt(value_name="anomaly_label")["anomaly_label"],
+    ],
+    axis=1,
+)
+plot_df["variable"] = plot_df["variable"].astype(str)
+px.scatter(plot_df, x="index", y="value", color="variable", size="anomaly_label")

-capa = Mvcapa(labels="score", fmt="dense", max_segment_length=20)
-scores = capa.fit_predict(df)
+fig = px.line(df)
+fig.add_scatter(anomaly_labels)
+px.line(anomaly_labels)

-capa = Mvcapa(collective_penalty_scale=5, labels="indicator", fmt="dense")
-anomalies = capa.fit_predict(df)
-df.plot(kind="line", backend="plotly")
-anomalies.plot(kind="line", backend="plotly")
+scores = detector.score_transform(df)
+px.scatter(scores)

 # Profiling

From 7e2eeca6dc31d06599b7cb944096a679f2abc0e2 Mon Sep 17 00:00:00 2001
From: tveten
Date: Fri, 23 Aug 2024 16:52:55 +0200
Subject: [PATCH 68/75] Fix docstring

---
 interactive/explore_capa.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/interactive/explore_capa.py b/interactive/explore_capa.py
index 5c269660..40fb41d7 100644
--- a/interactive/explore_capa.py
+++ b/interactive/explore_capa.py
@@ -1,4 +1,4 @@
-"""Interactively explore the Capa and Mvcapa anomaly detectors."""
+"""Interactive exploration of the Capa and Mvcapa anomaly detectors."""

 import pandas as pd
 import plotly.express as px

From 1b81f86e4d84b1eb9815a5bb3654176f3351cc90 Mon Sep 17 00:00:00 2001
From: tveten
Date: Fri, 23 Aug 2024 16:53:13 +0200
Subject: [PATCH 69/75] Update the circ bin seg interactive script

---
 interactive/explore_circular_binseg.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/interactive/explore_circular_binseg.py b/interactive/explore_circular_binseg.py
index c0b581c3..d9cbdbfb 100644
--- a/interactive/explore_circular_binseg.py
+++ b/interactive/explore_circular_binseg.py
@@ -1,3 +1,5 @@
+"""Interactive exploration of the Circular Binary Segmentation anomaly detector."""
+
 import plotly.express as px

 from skchange.anomaly_detectors.circular_binseg import (
@@ -12,9 +14,7 @@
     score="mean", growth_factor=1.5, min_segment_length=10
 )
 anomalies = detector.fit_predict(df)
-
-df.plot(kind="line", backend="plotly")
-
+px.line(df)
 px.scatter(detector.scores, x="argmax_anomaly_start", y="score")

 # Test anomaly intervals

From ab48424f2bbbbdde5a00893103d88bf0ef18c0c6 Mon Sep 17 00:00:00 2001
From: tveten
Date: Fri, 23 Aug 2024 16:55:00 +0200
Subject: [PATCH 70/75] Update the moscore anomaly interactive script

---
 interactive/explore_moscore_anomaly.py | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/interactive/explore_moscore_anomaly.py b/interactive/explore_moscore_anomaly.py
index d874bb00..6ca957cf 100644
--- a/interactive/explore_moscore_anomaly.py
+++ b/interactive/explore_moscore_anomaly.py
@@ -1,3 +1,5 @@
+"""Interactive exploration of the MoscoreAnomaly detector."""
+
 import numpy as np
 import plotly.express as px

@@ -20,17 +22,7 @@
     left_bandwidth=50,
 )
 anomalies = detector.fit_predict(df)
-
-detector = MoscoreAnomaly(
-    score="mean",
-    min_anomaly_length=10,
-    max_anomaly_length=1000,
-    left_bandwidth=20,
-    labels="score",
-)
-scores = detector.fit_predict(df)
-scores["length"] = scores["anomaly_end"] - scores["anomaly_start"] + 1
-px.scatter(scores, x="anomaly_start", y="score", color="length")
+print(anomalies)


 # Profiling

From 301183f464a4ecf050b35fb50a013e3fbadf6c11 Mon Sep 17 00:00:00 2001
From: tveten
Date: Fri, 23 Aug 2024 17:11:22 +0200
Subject: [PATCH 71/75] Update the moscore interactive script

---
 interactive/explore_moscore.py | 46 ++++++++++++++++------------------
 1 file changed, 22 insertions(+), 24 deletions(-)

diff --git a/interactive/explore_moscore.py b/interactive/explore_moscore.py
index f9f33200..ec26d553 100644
--- a/interactive/explore_moscore.py
+++ b/interactive/explore_moscore.py
@@ -1,17 +1,20 @@
+"""Interactive exploration of the Moscore change detector."""
+
 import numpy as np
 import plotly.express as px
 from numba import njit

-from skchange.change_detectors.moscore import Moscore, where
+from skchange.change_detectors.moscore import Moscore
 from skchange.datasets.generate import add_linspace_outliers, generate_teeth_data
-from skchange.scores.mean_score import init_mean_score, mean_score
 from skchange.utils.benchmarking.profiler import Profiler

 # Simple univariate example
 df = generate_teeth_data(n_segments=2, mean=10, segment_length=100, p=1, random_state=2)
 detector = Moscore()
 changepoints = detector.fit_predict(df)
-px.scatter(detector.scores)
+labels = detector.transform(df)
+scores = detector.score_transform(df)
+px.scatter(scores)


 # Profiling
@@ -24,13 +27,6 @@
 profiler.stop()


-# Various unit tests
-df = generate_teeth_data(n_segments=1, mean=10, segment_length=10, p=1)
-precomputed_params = init_mean_score(df.values)
-mean_score(precomputed_params, start=0, end=9, split=4)
-where(np.array([True, True, True, False, False]))
-
-
 # Variance score
 df = generate_teeth_data(
     n_segments=2, variance=16, segment_length=100, p=1, random_state=1
@@ -44,6 +40,7 @@
 # Custom score
 @njit
 def col_median(X: np.ndarray) -> np.ndarray:
+    """Compute the median of each column of X."""
     m = X.shape[1]
     medians = np.zeros(m)
     for j in range(m):
@@ -53,27 +50,28 @@ def col_median(X: np.ndarray) -> np.ndarray:

 @njit
 def init_spike_score(X: np.ndarray) -> np.ndarray:
+    """Initialize the spike score."""
     return X


-def spike_score_factory(margin: int = 0):
-    @njit
-    def spike_score(
-        precomputed_params: np.ndarray, start: int, end: int, split: int
-    ) -> float:
-        X = precomputed_params
-        interval_X = np.concatenate(
-            (X[start : split - margin], X[split + margin + 1 : end + 1])
-        )
-        baseline_median = col_median(interval_X)
-        return np.sum(np.abs(X[split] - baseline_median))
-
-    return spike_score
+@njit
+def spike_score(
+    precomputed_params: np.ndarray,
+    start: np.ndarray,
+    end: np.ndarray,
+    split: np.ndarray,
+) -> np.ndarray:
+    """Calculate the score for a spike at the split point."""
+    X = precomputed_params
+    baseline_median = np.zeros((len(start), X.shape[1]))
+    for i, (s, e) in enumerate(zip(start, end)):
+        baseline_median[i] = col_median(X[s : e + 1])
+    return np.sum(np.abs(X[split] - baseline_median), axis=1)


 df = generate_teeth_data(n_segments=1, mean=0, segment_length=100, p=1)
 df = add_linspace_outliers(df, n_outliers=4, outlier_size=10)
-score = (spike_score_factory(margin=0), init_spike_score)
+score = (spike_score, init_spike_score)
 detector = Moscore(score, bandwidth=5)
 anomalies = detector.fit_predict(df)
 px.scatter(detector.scores)

From 953fb929e18602a45386fab1af24b5e10f1f5ffe Mon Sep 17 00:00:00 2001
From: tveten
Date: Fri, 23 Aug 2024 17:12:10 +0200
Subject: [PATCH 72/75] Update the PELT interactive script

---
 interactive/explore_pelt.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/interactive/explore_pelt.py b/interactive/explore_pelt.py
index e2d03f29..505bdcc9 100644
--- a/interactive/explore_pelt.py
+++ b/interactive/explore_pelt.py
@@ -1,3 +1,5 @@
+"""Interactive exploration of the Pelt change detector."""
+
 import numpy as np

 from skchange.change_detectors.pelt import Pelt

From 0f9f1ce047a3fffe664a3682bb75ccf1e9a35cf8 Mon Sep 17 00:00:00 2001
From: tveten
Date: Fri, 23 Aug 2024 17:13:51 +0200
Subject: [PATCH 73/75] Update the seeded binary segmentation interactive script

---
 interactive/explore_seeded_binseg.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/interactive/explore_seeded_binseg.py b/interactive/explore_seeded_binseg.py
index 19d35858..29cb418e 100644
--- a/interactive/explore_seeded_binseg.py
+++ b/interactive/explore_seeded_binseg.py
@@ -1,3 +1,5 @@
+"""Interactive exploration of Seeded Binary Segmentation."""
+
 import plotly.express as px

 from skchange.change_detectors.seeded_binseg import SeededBinarySegmentation
@@ -8,9 +10,8 @@
 detector = SeededBinarySegmentation(score="mean", growth_factor=2)
 detector.fit_predict(df)

-df.plot(kind="line", backend="plotly")
-
-px.scatter(detector.scores, x="maximizer", y="score", hover_data=["start", "end"])
+px.line(df)
+px.scatter(detector.scores, x="argmax_cpt", y="score", hover_data=["start", "end"])

 # Profiling

From b1cbb21f903c1ec06c5a6f94df1385c4eedd2207 Mon Sep 17 00:00:00 2001
From: tveten
Date: Fri, 23 Aug 2024 17:14:32 +0200
Subject: [PATCH 74/75] Update the stat threshold anomaliser interactive script

---
 interactive/explore_stat_threshold_anomaliser.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/interactive/explore_stat_threshold_anomaliser.py b/interactive/explore_stat_threshold_anomaliser.py
index f1b7b094..74966bd9 100644
--- a/interactive/explore_stat_threshold_anomaliser.py
+++ b/interactive/explore_stat_threshold_anomaliser.py
@@ -1,3 +1,5 @@
+"""Interactive exploration of the StatThresholdAnomaliser."""
+
 import numpy as np

 from skchange.anomaly_detectors.anomalisers import StatThresholdAnomaliser
@@ -16,3 +18,4 @@
     change_detector, stat=np.mean, stat_lower=-1.0, stat_upper=1.0
 )
 anomalies = detector.fit_predict(df)
+print(anomalies)

From 8777d950e3e27aea5b8cc9c41606ea86d360d653 Mon Sep 17 00:00:00 2001
From: tveten
Date: Fri, 23 Aug 2024 17:15:17 +0200
Subject: [PATCH 75/75] Remove reference to the old fmt argument

---
 skchange/anomaly_detectors/mvcapa.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/skchange/anomaly_detectors/mvcapa.py b/skchange/anomaly_detectors/mvcapa.py
index 5ad79621..a8a010ab 100644
--- a/skchange/anomaly_detectors/mvcapa.py
+++ b/skchange/anomaly_detectors/mvcapa.py
@@ -393,7 +393,7 @@ class Mvcapa(SubsetCollectiveAnomalyDetector):
     from skchange.datasets.generate import generate_teeth_data

     df = generate_teeth_data(5, 10, p=10, mean=10, affected_proportion=0.2)
-    capa = Capa(collective_penalty_scale=5, fmt="sparse", max_segment_length=20)
+    capa = Mvcapa(collective_penalty_scale=5, max_segment_length=20)
     capa.fit_predict(df)
     """
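[Closing note: a minimal usage sketch of the detector API this patch series converges on, mirroring the updated README and interactive scripts above. Capa, generate_teeth_data, and the method names are taken from those examples; outputs and optional capabilities vary by detector.]

    from skchange.anomaly_detectors.capa import Capa
    from skchange.datasets.generate import generate_teeth_data

    df = generate_teeth_data(n_segments=5, segment_length=10, mean=10, random_state=2)[0]
    detector = Capa(max_segment_length=20)

    anomalies = detector.fit_predict(df)   # sparse: one row/element per detected event
    labels = detector.fit_transform(df)    # dense: one label per input sample
    scores = detector.score_transform(df)  # pointwise detection scores, where implemented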