Merge pull request #11 from NorskRegnesentral/new_detector_base_class

[ENH] New detector base class
NorskRegnesentral · Aug 23, 2024 · 6cd6796 · 6cd6796
2 parents 74bdd0f + 8777d95
commit 6cd6796
Show file tree

Hide file tree

Showing 33 changed files with 1,345 additions and 813 deletions.
diff --git a/NOTES.md b/NOTES.md
@@ -141,7 +141,7 @@ using the same example data as for anomaly detection.
 
 ### Changepoints in univariate data or multivariate data without subset changes
 ```python
-detector = ChangepointDetector().fit(x_univariate)
+detector = ChangeDetector().fit(x_univariate)
 detector.predict(x_univariate)
 0    0
 1    1
@@ -152,7 +152,7 @@ dtype: int64
 ```
 ### Subset changepoints in multivariate data
 ```python
-detector = SubsetChangepointDetector().fit(x_multivariate)
+detector = SubsetChangeDetector().fit(x_multivariate)
 detector.predict(x_multivariate)
    index columns
 0      0     [0]

diff --git a/README.md b/README.md
@@ -23,7 +23,7 @@ from skchange.change_detectors.moscore import Moscore
 from skchange.datasets.generate import generate_teeth_data
 
 df = generate_teeth_data(n_segments=10, segment_length=50, mean=5, random_state=1)
-detector = Moscore(bandwidth=10, fmt="sparse")
+detector = Moscore(bandwidth=10)
 detector.fit_predict(df)
 >>>
 0     49
@@ -51,7 +51,7 @@ df = generate_teeth_data(
     affected_proportion=0.2,
     random_state=2,
 )
-detector = Mvcapa(collective_penalty="sparse", fmt="sparse")
+detector = Mvcapa(collective_penalty="sparse")
 detector.fit_predict(df)
 >>>
    start  end components

diff --git a/interactive/benchmark.py b/interactive/benchmark.py
@@ -1,20 +1,23 @@
+"""Benchmarking the computational efficiency of the detectors."""
+
 from timeit import timeit
 
 import numpy as np
 import pandas as pd
 import plotly.express as px
 
-from skchange.anomaly_detectors.tests.test_anomaly_detectors import anomaly_detectors
-from skchange.change_detectors.tests.test_change_detectors import change_detectors
+from skchange.anomaly_detectors import ANOMALY_DETECTORS
+from skchange.change_detectors import CHANGE_DETECTORS
 
 # TODO: Add all the different scores and costs.
-detector_classes = anomaly_detectors + change_detectors
+# TODO: Make sure hyperparameters are set such that comparisons are fair.
+detector_classes = ANOMALY_DETECTORS + CHANGE_DETECTORS
 ns = [1000, 10000, 100000, 1000000]
 n_runs = [100, 10, 1, 1]
 timings = {}
 for detector_class in detector_classes:
     detector_name = detector_class.__name__
-    detector = detector_class.create_test_instance().set_params(fmt="sparse")
+    detector = detector_class.create_test_instance()
     setup_data = pd.DataFrame(np.random.normal(0, 1, size=1000))
     detector.fit_predict(setup_data)  # Compile numba
     timings[detector_name] = []

diff --git a/interactive/explore_capa.py b/interactive/explore_capa.py
@@ -1,3 +1,6 @@
+"""Interactive exploration of the Capa and Mvcapa anomaly detectors."""
+
+import pandas as pd
 import plotly.express as px
 
 from skchange.anomaly_detectors.capa import Capa
@@ -6,30 +9,44 @@
 from skchange.utils.benchmarking.profiler import Profiler
 
 # Univariate
-df = generate_teeth_data(n_segments=5, mean=10, segment_length=10, p=1, random_state=2)
-capa = Capa(fmt="sparse", max_segment_length=20)
-anomalies = capa.fit_predict(df)
+df = generate_teeth_data(n_segments=5, segment_length=10, mean=10, random_state=2)[0]
+detector = Capa(max_segment_length=20)
+
+anomalies = detector.fit_predict(df)
+print(anomalies)
 
-capa = Capa(labels="score", fmt="dense", max_segment_length=20)
-scores = capa.fit_predict(df)
+anomaly_labels = detector.fit_transform(df)
+px.scatter(x=df.index, y=df, color=anomaly_labels.astype(str))
 
-capa = Capa(labels="indicator", fmt="dense", max_segment_length=20)
-anomalies = capa.fit_predict(df)
-px.scatter(x=df.index, y=df.values[:, 0], color=anomalies)
+scores = detector.score_transform(df)
+px.scatter(scores)
 
 # Multivariate
-# TODO: Add plotting functionality to assess the affected subset.
 df = generate_teeth_data(5, 10, p=10, mean=10, affected_proportion=0.2, random_state=2)
-capa = Mvcapa(collective_penalty="sparse", fmt="sparse")
-anomalies = capa.fit_predict(df)
+detector = Mvcapa(collective_penalty="sparse")
+
+anomalies = detector.fit_predict(df)
+print(anomalies)
+
+anomaly_labels = detector.fit_transform(df)
+anomaly_labels = (anomaly_labels > 0).astype(float)  # float: 0.1 is assigned below
+anomaly_labels[anomaly_labels == 0] = 0.1
+plot_df = pd.concat(
+    [
+        df.melt(ignore_index=False).reset_index(),
+        anomaly_labels.melt(value_name="anomaly_label")["anomaly_label"],
+    ],
+    axis=1,
+)
+plot_df["variable"] = plot_df["variable"].astype(str)
+px.scatter(plot_df, x="index", y="value", color="variable", size="anomaly_label")
 
-capa = Mvcapa(labels="score", fmt="dense", max_segment_length=20)
-scores = capa.fit_predict(df)
+fig = px.line(df)
+fig.add_scatter(anomaly_labels)
+px.line(anomaly_labels)
 
-capa = Mvcapa(collective_penalty_scale=5, labels="indicator", fmt="dense")
-anomalies = capa.fit_predict(df)
-df.plot(kind="line", backend="plotly")
-anomalies.plot(kind="line", backend="plotly")
+scores = detector.score_transform(df)
+px.scatter(scores)
 
 
 # Profiling

diff --git a/interactive/explore_circular_binseg.py b/interactive/explore_circular_binseg.py
@@ -1,3 +1,5 @@
+"""Interactive exploration of the Circular Binary Segmentation anomaly detector."""
+
 import plotly.express as px
 
 from skchange.anomaly_detectors.circular_binseg import (
@@ -12,9 +14,7 @@
     score="mean", growth_factor=1.5, min_segment_length=10
 )
 anomalies = detector.fit_predict(df)
-
-df.plot(kind="line", backend="plotly")
-
+px.line(df)
 px.scatter(detector.scores, x="argmax_anomaly_start", y="score")
 
 # Test anomaly intervals

diff --git a/interactive/explore_moscore.py b/interactive/explore_moscore.py
@@ -1,17 +1,20 @@
+"""Interactive exploration of the Moscore change detector."""
+
 import numpy as np
 import plotly.express as px
 from numba import njit
 
-from skchange.change_detectors.moscore import Moscore, where
+from skchange.change_detectors.moscore import Moscore
 from skchange.datasets.generate import add_linspace_outliers, generate_teeth_data
-from skchange.scores.mean_score import init_mean_score, mean_score
 from skchange.utils.benchmarking.profiler import Profiler
 
 # Simple univariate example
 df = generate_teeth_data(n_segments=2, mean=10, segment_length=100, p=1, random_state=2)
 detector = Moscore()
 changepoints = detector.fit_predict(df)
-px.scatter(detector.scores)
+labels = detector.transform(df)
+scores = detector.score_transform(df)
+px.scatter(scores)
 
 
 # Profiling
@@ -24,13 +27,6 @@
 profiler.stop()
 
 
-# Various unit tests
-df = generate_teeth_data(n_segments=1, mean=10, segment_length=10, p=1)
-precomputed_params = init_mean_score(df.values)
-mean_score(precomputed_params, start=0, end=9, split=4)
-where(np.array([True, True, True, False, False]))
-
-
 # Variance score
 df = generate_teeth_data(
     n_segments=2, variance=16, segment_length=100, p=1, random_state=1
@@ -44,6 +40,7 @@
 # Custom score
 @njit
 def col_median(X: np.ndarray) -> np.ndarray:
+    """Compute the median of each column of X."""
     m = X.shape[1]
     medians = np.zeros(m)
     for j in range(m):
@@ -53,27 +50,28 @@ def col_median(X: np.ndarray) -> np.ndarray:
 
 @njit
 def init_spike_score(X: np.ndarray) -> np.ndarray:
+    """Initialize the spike score."""
     return X
 
 
-def spike_score_factory(margin: int = 0):
-    @njit
-    def spike_score(
-        precomputed_params: np.ndarray, start: int, end: int, split: int
-    ) -> float:
-        X = precomputed_params
-        interval_X = np.concatenate(
-            (X[start : split - margin], X[split + margin + 1 : end + 1])
-        )
-        baseline_median = col_median(interval_X)
-        return np.sum(np.abs(X[split] - baseline_median))
-
-    return spike_score
+@njit
+def spike_score(
+    precomputed_params: np.ndarray,
+    start: np.ndarray,
+    end: np.ndarray,
+    split: np.ndarray,
+) -> np.ndarray:
+    """Calculate the spike score at each split point, one per (start, end) pair."""
+    X = precomputed_params
+    baseline_median = np.zeros((len(start), X.shape[1]))
+    for i, (s, e) in enumerate(zip(start, end)):
+        baseline_median[i] = col_median(X[s : e + 1])  # inclusive window [s, e]
+    return np.sum(np.abs(X[split] - baseline_median), axis=1)
 
 
 df = generate_teeth_data(n_segments=1, mean=0, segment_length=100, p=1)
 df = add_linspace_outliers(df, n_outliers=4, outlier_size=10)
-score = (spike_score_factory(margin=0), init_spike_score)
+score = (spike_score, init_spike_score)
 detector = Moscore(score, bandwidth=5)
 anomalies = detector.fit_predict(df)
 px.scatter(detector.scores)

diff --git a/interactive/explore_moscore_anomaly.py b/interactive/explore_moscore_anomaly.py
@@ -1,3 +1,5 @@
+"""Interactive exploration of the MoscoreAnomaly detector."""
+
 import numpy as np
 import plotly.express as px
 
@@ -20,17 +22,7 @@
     left_bandwidth=50,
 )
 anomalies = detector.fit_predict(df)
-
-detector = MoscoreAnomaly(
-    score="mean",
-    min_anomaly_length=10,
-    max_anomaly_length=1000,
-    left_bandwidth=20,
-    labels="score",
-)
-scores = detector.fit_predict(df)
-scores["length"] = scores["anomaly_end"] - scores["anomaly_start"] + 1
-px.scatter(scores, x="anomaly_start", y="score", color="length")
+print(anomalies)
 
 
 # Profiling

diff --git a/interactive/explore_pelt.py b/interactive/explore_pelt.py
@@ -1,3 +1,5 @@
+"""Interactive exploration of the Pelt change detector."""
+
 import numpy as np
 
 from skchange.change_detectors.pelt import Pelt

diff --git a/interactive/explore_seeded_binseg.py b/interactive/explore_seeded_binseg.py
@@ -1,3 +1,5 @@
+"""Interactive exploration of Seeded Binary Segmentation."""
+
 import plotly.express as px
 
 from skchange.change_detectors.seeded_binseg import SeededBinarySegmentation
@@ -8,9 +10,8 @@
 detector = SeededBinarySegmentation(score="mean", growth_factor=2)
 detector.fit_predict(df)
 
-df.plot(kind="line", backend="plotly")
-
-px.scatter(detector.scores, x="maximizer", y="score", hover_data=["start", "end"])
+px.line(df)
+px.scatter(detector.scores, x="argmax_cpt", y="score", hover_data=["start", "end"])
 
 
 # Profiling

diff --git a/interactive/explore_stat_threshold_anomaliser.py b/interactive/explore_stat_threshold_anomaliser.py
@@ -1,3 +1,5 @@
+"""Interactive exploration of the StatThresholdAnomaliser."""
+
 import numpy as np
 
 from skchange.anomaly_detectors.anomalisers import StatThresholdAnomaliser
@@ -16,3 +18,4 @@
     change_detector, stat=np.mean, stat_lower=-1.0, stat_upper=1.0
 )
 anomalies = detector.fit_predict(df)
+print(anomalies)
diff --git a/pyproject.toml b/pyproject.toml
@@ -34,10 +34,10 @@ classifiers = [
 ]
 requires-python = ">=3.9,<3.13"
 dependencies = [
-  "numpy<1.27,>=1.21",  # required for framework layer and base class logic
-  "pandas<2.2.0,>=1.3",  # pandas is the main in-memory data container
+  "numpy>=1.21",
+  "pandas>=1.1",
   "numba>=0.56",  # numba is used for fast computation throughout
-  "sktime>=0.23.0,<0.30.0",
+  "sktime>=0.30",
 ]
 
 [project.urls]

diff --git a/skchange/anomaly_detectors/__init__.py b/skchange/anomaly_detectors/__init__.py
@@ -1 +1,24 @@
 """Anomaly detection algorithms."""
+
+from skchange.anomaly_detectors.anomalisers import StatThresholdAnomaliser
+from skchange.anomaly_detectors.base import (
+    CollectiveAnomalyDetector,
+    PointAnomalyDetector,
+)
+from skchange.anomaly_detectors.capa import Capa
+from skchange.anomaly_detectors.circular_binseg import CircularBinarySegmentation
+from skchange.anomaly_detectors.moscore_anomaly import MoscoreAnomaly
+from skchange.anomaly_detectors.mvcapa import Mvcapa
+
+BASE_ANOMALY_DETECTORS = [CollectiveAnomalyDetector, PointAnomalyDetector]
+COLLECTIVE_ANOMALY_DETECTORS = [
+    Capa,
+    CircularBinarySegmentation,
+    MoscoreAnomaly,
+    Mvcapa,
+    StatThresholdAnomaliser,
+]
+POINT_ANOMALY_DETECTORS = []
+ANOMALY_DETECTORS = COLLECTIVE_ANOMALY_DETECTORS + POINT_ANOMALY_DETECTORS
+
+__all__ = [cls.__name__ for cls in BASE_ANOMALY_DETECTORS + ANOMALY_DETECTORS]