Skip to content

Commit

Permalink
refactor(config): update configuration in pipelines
Browse files Browse the repository at this point in the history
  • Loading branch information
sbrugman committed Jul 4, 2022
1 parent 4f8126d commit 78bae81
Show file tree
Hide file tree
Showing 9 changed files with 97 additions and 381 deletions.
2 changes: 2 additions & 0 deletions popmon/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
# pandas/spark dataframe decorators
from popmon import decorators

from .config import Settings
from .pipeline.metrics import df_stability_metrics, stability_metrics
from .pipeline.report import df_stability_report, stability_report
from .stitching import stitch_histograms
Expand All @@ -44,4 +45,5 @@
"stability_report",
"stitch_histograms",
"__version__",
"Settings",
]
11 changes: 6 additions & 5 deletions popmon/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,12 @@ class HistogramSectionModel(BaseModel):
plot_hist_n: plot histograms for last 'n' periods. default is 2 (optional)
"""
plot_hist_n: int = 2

"""
top_n: plot heatmap for top 'n' categories. default is 20 (optional)
"""
top_n: int = 20

"""
cmap: colormap for histogram heatmaps
"""
Expand Down Expand Up @@ -165,11 +171,6 @@ class Report(BaseModel):
"*max_prob_diff*",
]

"""
top_n: limit of number of categorical items to plot (default: 20)
"""
top_n: int = 20

section: Section = Section()


Expand Down
128 changes: 15 additions & 113 deletions popmon/pipeline/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
make_histograms,
)

from ..config import Settings
from ..pipeline.metrics_pipelines import create_metrics_pipeline

logging.basicConfig(
Expand All @@ -37,13 +38,10 @@

def stability_metrics(
hists,
settings: Settings,
reference_type="self",
reference=None,
time_axis="",
window=10,
shift=1,
monitoring_rules=None,
pull_rules=None,
features=None,
**kwargs,
):
Expand All @@ -54,47 +52,6 @@ def stability_metrics(
default is 'self'.
:param reference: histograms used as reference. default is None
:param str time_axis: name of datetime feature, used as time axis, eg 'date'. auto-guessed when not provided.
:param int window: size of rolling window and/or trend detection. default is 10.
:param int shift: shift of time-bins in rolling/expanding window. default is 1.
:param dict monitoring_rules: monitoring rules to generate traffic light alerts.
The default setting is:
.. code-block:: python
monitoring_rules = {
"*_pull": [7, 4, -4, -7],
"*_zscore": [7, 4, -4, -7],
"[!p]*_unknown_labels": [0.5, 0.5, 0, 0],
}
Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern.
For example, ``"*_pull"`` applies for all features to all statistics ending on "_pull".
You can also specify rules for specific features and/or statistics by leaving out wildcard and putting the
feature name in front. E.g.
.. code-block:: python
monitoring_rules = {
"featureA:*_pull": [5, 3, -3, -5],
"featureA:nan": [4, 1, 0, 0],
"*_pull": [7, 4, -4, -7],
"nan": [8, 1, 0, 0],
}
In case of multiple rules could apply for a feature's statistic, the most specific one applies.
So in case of the statistic "nan": "featureA:nan" is used for "featureA", and the other "nan" rule
for all other features.
:param dict pull_rules: red and yellow (possibly dynamic) boundaries shown in plots in the report.
Default is:
.. code-block:: python
pull_rules = {"*_pull": [7, 4, -4, -7]}
This means that the shown yellow boundaries are at -4, +4 standard deviations around the (reference) mean,
and the shown red boundaries are at -7, +7 standard deviations around the (reference) mean.
Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern.
(The same string logic applies as for monitoring_rules.)
:param list features: histograms to pick up from the 'hists' dictionary (default is all keys)
:param kwargs: residual keyword arguments passed on to report pipeline.
:return: dict with results of metrics pipeline
Expand All @@ -103,32 +60,22 @@ def stability_metrics(
if not isinstance(hists, dict):
raise TypeError("hists should be a dict of histogrammar histograms.")

if not isinstance(monitoring_rules, dict):
monitoring_rules = {
"*_pull": [7, 4, -4, -7],
"*_zscore": [7, 4, -4, -7],
"[!p]*_unknown_labels": [0.5, 0.5, 0, 0],
}
if not isinstance(pull_rules, dict):
pull_rules = {"*_pull": [7, 4, -4, -7]}

if (isinstance(time_axis, str) and len(time_axis) == 0) or (
isinstance(time_axis, bool) and time_axis
):
# auto guess the time_axis: find the most frequent first column name in the histograms list
first_cols = [k.split(":")[0] for k in list(hists.keys())]
time_axis = max(set(first_cols), key=first_cols.count)

if reference_type == "external" and "ref_hists_key" not in kwargs:
kwargs["ref_hists_key"] = "ref_hists"

pipeline = create_metrics_pipeline(
settings=settings,
reference_type=reference_type,
reference=reference,
hists_key="hists",
ref_hists_key="ref_hists",
time_axis=time_axis,
window=window,
shift=shift,
monitoring_rules=monitoring_rules,
pull_rules=pull_rules,
features=features,
**kwargs,
)
Expand All @@ -143,6 +90,7 @@ def stability_metrics(
def df_stability_metrics(
df,
time_axis,
settings: Settings = None,
features=None,
binning="auto",
bin_specs=None,
Expand All @@ -151,10 +99,6 @@ def df_stability_metrics(
var_dtype=None,
reference_type="self",
reference=None,
window=10,
shift=1,
monitoring_rules=None,
pull_rules=None,
**kwargs,
):
"""Create a data stability monitoring html datastore for given pandas or spark dataframe.
Expand Down Expand Up @@ -204,50 +148,12 @@ def df_stability_metrics(
:param reference_type: type or reference used for comparisons. Options [self, external, rolling, expanding].
default is 'self'.
:param reference: reference dataframe or histograms. default is None
:param int window: size of rolling window and/or trend detection. default is 10.
:param int shift: shift of time-bins in rolling/expanding window. default is 1.
:param dict monitoring_rules: monitoring rules to generate traffic light alerts.
The default setting is:
.. code-block:: python
monitoring_rules = {
"*_pull": [7, 4, -4, -7],
"*_zscore": [7, 4, -4, -7],
"[!p]*_unknown_labels": [0.5, 0.5, 0, 0],
}
Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern.
For example, ``"*_pull"`` applies for all features to all statistics ending on "_pull".
You can also specify rules for specific features and/or statistics by leaving out wildcard and putting the
feature name in front. E.g.
.. code-block:: python
monitoring_rules = {
"featureA:*_pull": [5, 3, -3, -5],
"featureA:nan": [4, 1, 0, 0],
"*_pull": [7, 4, -4, -7],
"nan": [8, 1, 0, 0],
}
In case of multiple rules could apply for a feature's statistic, the most specific one applies.
So in case of the statistic "nan": "featureA:nan" is used for "featureA", and the other "nan" rule
for all other features.
:param dict pull_rules: red and yellow (possibly dynamic) boundaries shown in plots in the report.
Default is:
.. code-block:: python
pull_rules = {"*_pull": [7, 4, -4, -7]}
This means that the shown yellow boundaries are at -4, +4 standard deviations around the (reference) mean,
and the shown red boundaries are at -7, +7 standard deviations around the (reference) mean.
Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern.
(The same string logic applies as for monitoring_rules.)
:param kwargs: residual keyword arguments, passed on to stability_report()
:return: dict with results of metrics pipeline
"""
if settings is None:
settings = Settings()

# basic checks on presence of time_axis
if not (isinstance(time_axis, str) and len(time_axis) > 0) and not (
isinstance(time_axis, bool) and time_axis
Expand Down Expand Up @@ -304,7 +210,6 @@ def df_stability_metrics(
}
bin_specs[time_axis] = time_specs

reference_hists = None
if reference is not None:
reference_type = "external"
if isinstance(reference, dict):
Expand All @@ -331,6 +236,7 @@ def df_stability_metrics(
var_dtype,
ret_specs=True,
)
kwargs["reference_hists"] = reference_hists

# use the same features, bin_specs, time_axis, etc as for reference hists
hists = make_histograms(
Expand All @@ -345,13 +251,9 @@ def df_stability_metrics(
# generate data stability report
return stability_metrics(
hists,
reference_type,
reference_hists,
time_axis,
window,
shift,
monitoring_rules,
pull_rules,
features,
settings=settings,
reference_type=reference_type,
time_axis=time_axis,
features=features,
**kwargs,
)
34 changes: 8 additions & 26 deletions popmon/pipeline/metrics_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,12 @@ def get_metrics_pipeline_class(reference_type, reference):


def create_metrics_pipeline(
settings: Settings,
reference_type="self",
reference=None,
hists_key="hists",
time_axis="",
features=None,
settings: Settings = None,
**kwargs,
):
# configuration and datastore for report pipeline
Expand Down Expand Up @@ -212,20 +212,16 @@ def get_trend_modules(window) -> List[Union[Module, Pipeline]]:
class SelfReferenceMetricsPipeline(Pipeline):
def __init__(
self,
hists_key="test_hists",
time_axis="date",
features=None,
settings: Settings = None,
settings: Settings,
hists_key,
time_axis,
features,
):
"""Example metrics pipeline for comparing test data with itself (full test set)
:param str hists_key: key to test histograms in datastore. default is 'test_hists'
:param str time_axis: name of datetime feature. default is 'date'
:param int window: window size for trend detection. default is 10
:param dict monitoring_rules: traffic light rules
:param dict pull_rules: pull rules to determine dynamic boundaries
:param list features: features of histograms to pick up from input data (optional)
:param kwargs: residual keyword arguments
:return: assembled self reference pipeline
"""
from popmon.analysis.comparison.comparison_registry import Comparisons
Expand Down Expand Up @@ -274,22 +270,18 @@ def __init__(
class ExternalReferenceMetricsPipeline(Pipeline):
def __init__(
self,
settings: Settings,
hists_key="test_hists",
ref_hists_key="ref_hists",
time_axis="date",
features=None,
settings: Settings = None,
):
"""Example metrics pipeline for comparing test data with other (full) external reference set
:param str hists_key: key to test histograms in datastore. default is 'test_hists'
:param str ref_hists_key: key to reference histograms in datastore. default is 'ref_hists'
:param str time_axis: name of datetime feature. default is 'date' (column should be timestamp, date(time) or numeric batch id)
:param int window: window size for trend detection. default is 10
:param dict monitoring_rules: traffic light rules
:param dict pull_rules: pull rules to determine dynamic boundaries
:param list features: features of histograms to pick up from input data (optional)
:param kwargs: residual keyword arguments
:return: assembled external reference pipeline
"""
from popmon.analysis.comparison.comparison_registry import Comparisons
Expand Down Expand Up @@ -343,21 +335,16 @@ def __init__(
class RollingReferenceMetricsPipeline(Pipeline):
def __init__(
self,
settings: Settings,
hists_key="test_hists",
time_axis="date",
features=None,
settings: Settings = None,
):
"""Example metrics pipeline for comparing test data with itself (rolling test set)
:param str hists_key: key to test histograms in datastore. default is 'test_hists'
:param str time_axis: name of datetime feature. default is 'date'
:param int window: size of rolling window and for trend detection. default is 10
:param int shift: shift in rolling window. default is 1
:param dict monitoring_rules: traffic light rules
:param dict pull_rules: pull rules to determine dynamic boundaries
:param list features: features of histograms to pick up from input data (optional)
:param kwargs: residual keyword arguments
:return: assembled rolling reference pipeline
"""
from popmon.analysis.comparison.comparison_registry import Comparisons
Expand Down Expand Up @@ -409,21 +396,16 @@ def __init__(
class ExpandingReferenceMetricsPipeline(Pipeline):
def __init__(
self,
settings: Settings,
hists_key="test_hists",
time_axis="date",
features=None,
settings: Settings = None,
):
"""Example metrics pipeline for comparing test data with itself (expanding test set)
:param str hists_key: key to test histograms in datastore. default is 'test_hists'
:param str time_axis: name of datetime feature. default is 'date'
:param int window: window size for trend detection. default is 10
:param int shift: shift in expanding window. default is 1
:param dict monitoring_rules: traffic light rules
:param dict pull_rules: pull rules to determine dynamic boundaries
:param list features: features of histograms to pick up from input data (optional)
:param kwargs: residual keyword arguments
:return: assembled expanding reference pipeline
"""
from popmon.analysis.comparison.comparison_registry import Comparisons
Expand Down
Loading

0 comments on commit 78bae81

Please sign in to comment.