diff --git a/popmon/__init__.py b/popmon/__init__.py index bc734eac..ecd13edb 100644 --- a/popmon/__init__.py +++ b/popmon/__init__.py @@ -28,6 +28,7 @@ # pandas/spark dataframe decorators from popmon import decorators +from .config import Settings from .pipeline.metrics import df_stability_metrics, stability_metrics from .pipeline.report import df_stability_report, stability_report from .stitching import stitch_histograms @@ -44,4 +45,5 @@ "stability_report", "stitch_histograms", "__version__", + "Settings", ] diff --git a/popmon/config.py b/popmon/config.py index 0c118456..fe4db4ea 100644 --- a/popmon/config.py +++ b/popmon/config.py @@ -74,6 +74,12 @@ class HistogramSectionModel(BaseModel): plot_hist_n: plot histograms for last 'n' periods. default is 2 (optional) """ plot_hist_n: int = 2 + + """ + top_n: plot heatmap for top 'n' categories. default is 20 (optional) + """ + top_n: int = 20 + """ cmap: colormap for histogram heatmaps """ @@ -165,11 +171,6 @@ class Report(BaseModel): "*max_prob_diff*", ] - """ - top_n: limit of number of categorical items to plot (default: 20) - """ - top_n: int = 20 - section: Section = Section() diff --git a/popmon/pipeline/metrics.py b/popmon/pipeline/metrics.py index b6b72d85..d0a7f654 100644 --- a/popmon/pipeline/metrics.py +++ b/popmon/pipeline/metrics.py @@ -27,6 +27,7 @@ make_histograms, ) +from ..config import Settings from ..pipeline.metrics_pipelines import create_metrics_pipeline logging.basicConfig( @@ -37,13 +38,10 @@ def stability_metrics( hists, + settings: Settings, reference_type="self", reference=None, time_axis="", - window=10, - shift=1, - monitoring_rules=None, - pull_rules=None, features=None, **kwargs, ): @@ -54,47 +52,6 @@ def stability_metrics( default is 'self'. :param reference: histograms used as reference. default is None :param str time_axis: name of datetime feature, used as time axis, eg 'date'. auto-guessed when not provided. - :param int window: size of rolling window and/or trend detection. default is 10. - :param int shift: shift of time-bins in rolling/expanding window. default is 1. - :param dict monitoring_rules: monitoring rules to generate traffic light alerts. - The default setting is: - - .. code-block:: python - - monitoring_rules = { - "*_pull": [7, 4, -4, -7], - "*_zscore": [7, 4, -4, -7], - "[!p]*_unknown_labels": [0.5, 0.5, 0, 0], - } - - Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. - For example, ``"*_pull"`` applies for all features to all statistics ending on "_pull". - You can also specify rules for specific features and/or statistics by leaving out wildcard and putting the - feature name in front. E.g. - - .. code-block:: python - - monitoring_rules = { - "featureA:*_pull": [5, 3, -3, -5], - "featureA:nan": [4, 1, 0, 0], - "*_pull": [7, 4, -4, -7], - "nan": [8, 1, 0, 0], - } - - In case of multiple rules could apply for a feature's statistic, the most specific one applies. - So in case of the statistic "nan": "featureA:nan" is used for "featureA", and the other "nan" rule - for all other features. - :param dict pull_rules: red and yellow (possibly dynamic) boundaries shown in plots in the report. - Default is: - - .. code-block:: python - - pull_rules = {"*_pull": [7, 4, -4, -7]} - - This means that the shown yellow boundaries are at -4, +4 standard deviations around the (reference) mean, - and the shown red boundaries are at -7, +7 standard deviations around the (reference) mean. - Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. - (The same string logic applies as for monitoring_rules.) :param list features: histograms to pick up from the 'hists' dictionary (default is all keys) :param kwargs: residual keyword arguments passed on to report pipeline. :return: dict with results of metrics pipeline @@ -103,15 +60,6 @@ def stability_metrics( if not isinstance(hists, dict): raise TypeError("hists should be a dict of histogrammar histograms.") - if not isinstance(monitoring_rules, dict): - monitoring_rules = { - "*_pull": [7, 4, -4, -7], - "*_zscore": [7, 4, -4, -7], - "[!p]*_unknown_labels": [0.5, 0.5, 0, 0], - } - if not isinstance(pull_rules, dict): - pull_rules = {"*_pull": [7, 4, -4, -7]} - if (isinstance(time_axis, str) and len(time_axis) == 0) or ( isinstance(time_axis, bool) and time_axis ): @@ -119,16 +67,15 @@ def stability_metrics( first_cols = [k.split(":")[0] for k in list(hists.keys())] time_axis = max(set(first_cols), key=first_cols.count) + if reference_type == "external" and "ref_hists_key" not in kwargs: + kwargs["ref_hists_key"] = "ref_hists" + pipeline = create_metrics_pipeline( + settings=settings, reference_type=reference_type, reference=reference, hists_key="hists", - ref_hists_key="ref_hists", time_axis=time_axis, - window=window, - shift=shift, - monitoring_rules=monitoring_rules, - pull_rules=pull_rules, features=features, **kwargs, ) @@ -143,6 +90,7 @@ def stability_metrics( def df_stability_metrics( df, time_axis, + settings: Settings = None, features=None, binning="auto", bin_specs=None, @@ -151,10 +99,6 @@ def df_stability_metrics( var_dtype=None, reference_type="self", reference=None, - window=10, - shift=1, - monitoring_rules=None, - pull_rules=None, **kwargs, ): """Create a data stability monitoring html datastore for given pandas or spark dataframe. @@ -204,50 +148,12 @@ def df_stability_metrics( :param reference_type: type or reference used for comparisons. Options [self, external, rolling, expanding]. default is 'self'. :param reference: reference dataframe or histograms. default is None - :param int window: size of rolling window and/or trend detection. default is 10. - :param int shift: shift of time-bins in rolling/expanding window. default is 1. - :param dict monitoring_rules: monitoring rules to generate traffic light alerts. - The default setting is: - - .. code-block:: python - - monitoring_rules = { - "*_pull": [7, 4, -4, -7], - "*_zscore": [7, 4, -4, -7], - "[!p]*_unknown_labels": [0.5, 0.5, 0, 0], - } - - Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. - For example, ``"*_pull"`` applies for all features to all statistics ending on "_pull". - You can also specify rules for specific features and/or statistics by leaving out wildcard and putting the - feature name in front. E.g. - - .. code-block:: python - - monitoring_rules = { - "featureA:*_pull": [5, 3, -3, -5], - "featureA:nan": [4, 1, 0, 0], - "*_pull": [7, 4, -4, -7], - "nan": [8, 1, 0, 0], - } - - In case of multiple rules could apply for a feature's statistic, the most specific one applies. - So in case of the statistic "nan": "featureA:nan" is used for "featureA", and the other "nan" rule - for all other features. - :param dict pull_rules: red and yellow (possibly dynamic) boundaries shown in plots in the report. - Default is: - - .. code-block:: python - - pull_rules = {"*_pull": [7, 4, -4, -7]} - - This means that the shown yellow boundaries are at -4, +4 standard deviations around the (reference) mean, - and the shown red boundaries are at -7, +7 standard deviations around the (reference) mean. - Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. - (The same string logic applies as for monitoring_rules.) :param kwargs: residual keyword arguments, passed on to stability_report() :return: dict with results of metrics pipeline """ + if settings is None: + settings = Settings() + # basic checks on presence of time_axis if not (isinstance(time_axis, str) and len(time_axis) > 0) and not ( isinstance(time_axis, bool) and time_axis @@ -304,7 +210,6 @@ def df_stability_metrics( } bin_specs[time_axis] = time_specs - reference_hists = None if reference is not None: reference_type = "external" if isinstance(reference, dict): @@ -331,6 +236,7 @@ def df_stability_metrics( var_dtype, ret_specs=True, ) + kwargs["reference_hists"] = reference_hists # use the same features, bin_specs, time_axis, etc as for reference hists hists = make_histograms( @@ -345,13 +251,9 @@ def df_stability_metrics( # generate data stability report return stability_metrics( hists, - reference_type, - reference_hists, - time_axis, - window, - shift, - monitoring_rules, - pull_rules, - features, + settings=settings, + reference_type=reference_type, + time_axis=time_axis, + features=features, **kwargs, ) diff --git a/popmon/pipeline/metrics_pipelines.py b/popmon/pipeline/metrics_pipelines.py index b1122d0f..f3c8ef31 100644 --- a/popmon/pipeline/metrics_pipelines.py +++ b/popmon/pipeline/metrics_pipelines.py @@ -68,12 +68,12 @@ def get_metrics_pipeline_class(reference_type, reference): def create_metrics_pipeline( + settings: Settings, reference_type="self", reference=None, hists_key="hists", time_axis="", features=None, - settings: Settings = None, **kwargs, ): # configuration and datastore for report pipeline @@ -212,20 +212,16 @@ def get_trend_modules(window) -> List[Union[Module, Pipeline]]: class SelfReferenceMetricsPipeline(Pipeline): def __init__( self, - hists_key="test_hists", - time_axis="date", - features=None, - settings: Settings = None, + settings: Settings, + hists_key, + time_axis, + features, ): """Example metrics pipeline for comparing test data with itself (full test set) :param str hists_key: key to test histograms in datastore. default is 'test_hists' :param str time_axis: name of datetime feature. default is 'date' - :param int window: window size for trend detection. default is 10 - :param dict monitoring_rules: traffic light rules - :param dict pull_rules: pull rules to determine dynamic boundaries :param list features: features of histograms to pick up from input data (optional) - :param kwargs: residual keyword arguments :return: assembled self reference pipeline """ from popmon.analysis.comparison.comparison_registry import Comparisons @@ -274,22 +270,18 @@ def __init__( class ExternalReferenceMetricsPipeline(Pipeline): def __init__( self, + settings: Settings, hists_key="test_hists", ref_hists_key="ref_hists", time_axis="date", features=None, - settings: Settings = None, ): """Example metrics pipeline for comparing test data with other (full) external reference set :param str hists_key: key to test histograms in datastore. default is 'test_hists' :param str ref_hists_key: key to reference histograms in datastore. default is 'ref_hists' :param str time_axis: name of datetime feature. default is 'date' (column should be timestamp, date(time) or numeric batch id) - :param int window: window size for trend detection. default is 10 - :param dict monitoring_rules: traffic light rules - :param dict pull_rules: pull rules to determine dynamic boundaries :param list features: features of histograms to pick up from input data (optional) - :param kwargs: residual keyword arguments :return: assembled external reference pipeline """ from popmon.analysis.comparison.comparison_registry import Comparisons @@ -343,21 +335,16 @@ def __init__( class RollingReferenceMetricsPipeline(Pipeline): def __init__( self, + settings: Settings, hists_key="test_hists", time_axis="date", features=None, - settings: Settings = None, ): """Example metrics pipeline for comparing test data with itself (rolling test set) :param str hists_key: key to test histograms in datastore. default is 'test_hists' :param str time_axis: name of datetime feature. default is 'date' - :param int window: size of rolling window and for trend detection. default is 10 - :param int shift: shift in rolling window. default is 1 - :param dict monitoring_rules: traffic light rules - :param dict pull_rules: pull rules to determine dynamic boundaries :param list features: features of histograms to pick up from input data (optional) - :param kwargs: residual keyword arguments :return: assembled rolling reference pipeline """ from popmon.analysis.comparison.comparison_registry import Comparisons @@ -409,21 +396,16 @@ def __init__( class ExpandingReferenceMetricsPipeline(Pipeline): def __init__( self, + settings: Settings, hists_key="test_hists", time_axis="date", features=None, - settings: Settings = None, ): """Example metrics pipeline for comparing test data with itself (expanding test set) :param str hists_key: key to test histograms in datastore. default is 'test_hists' :param str time_axis: name of datetime feature. default is 'date' - :param int window: window size for trend detection. default is 10 - :param int shift: shift in expanding window. default is 1 - :param dict monitoring_rules: traffic light rules - :param dict pull_rules: pull rules to determine dynamic boundaries :param list features: features of histograms to pick up from input data (optional) - :param kwargs: residual keyword arguments :return: assembled expanding reference pipeline """ from popmon.analysis.comparison.comparison_registry import Comparisons diff --git a/popmon/pipeline/report.py b/popmon/pipeline/report.py index e70c0312..7039a73a 100644 --- a/popmon/pipeline/report.py +++ b/popmon/pipeline/report.py @@ -19,7 +19,7 @@ import logging -from collections import defaultdict +from typing import Optional import pandas as pd from histogrammar.dfinterface.make_histograms import ( @@ -40,11 +40,11 @@ def stability_report( hists, + settings: Optional[Settings] = None, reference_type="self", reference=None, time_axis="", features=None, - **kwargs, ): """Create a data stability monitoring html report for given dict of input histograms. @@ -53,57 +53,13 @@ def stability_report( default is 'self'. :param reference: histograms used as reference. default is None :param str time_axis: name of datetime feature, used as time axis, eg 'date'. auto-guessed when not provided. - :param int window: size of rolling window and/or trend detection. default is 10. - :param int shift: shift of time-bins in rolling/expanding window. default is 1. - :param dict monitoring_rules: monitoring rules to generate traffic light alerts. - The default setting is: - - .. code-block:: python - - monitoring_rules = { - "*_pull": [7, 4, -4, -7], - "*_zscore": [7, 4, -4, -7], - "[!p]*_unknown_labels": [0.5, 0.5, 0, 0], - } - - Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. - For example, ``"*_pull"`` applies for all features to all statistics ending on "_pull". - You can also specify rules for specific features and/or statistics by leaving out wildcard and putting the - feature name in front. E.g. - - .. code-block:: python - - monitoring_rules = { - "featureA:*_pull": [5, 3, -3, -5], - "featureA:nan": [4, 1, 0, 0], - "*_pull": [7, 4, -4, -7], - "nan": [8, 1, 0, 0], - } - - In case of multiple rules could apply for a feature's statistic, the most specific one applies. - So in case of the statistic "nan": "featureA:nan" is used for "featureA", and the other "nan" rule - for all other features. - :param dict pull_rules: red and yellow (possibly dynamic) boundaries shown in plots in the report. - Default is: - - .. code-block:: python - - pull_rules = {"*_pull": [7, 4, -4, -7]} - - This means that the shown yellow boundaries are at -4, +4 standard deviations around the (reference) mean, - and the shown red boundaries are at -7, +7 standard deviations around the (reference) mean. - Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. - (The same string logic applies as for monitoring_rules.) :param list features: histograms to pick up from the 'hists' dictionary (default is all keys) - :param bool skip_empty_plots: if false, also show empty plots in report with only nans or zeroes (optional) - :param int last_n: plot statistic data for last 'n' periods (optional) - :param int plot_hist_n: plot histograms for last 'n' periods. default is 2 (optional) - :param str report_filepath: the file path where to output the report (optional) - :param bool extended_report: if True, show all the generated statistics in the report (optional) - :param list show_stats: list of statistic name patterns to show in the report. If None, show all (optional) - :param kwargs: residual keyword arguments passed on to report pipeline. :return: dict with results of reporting pipeline """ + + if settings is None: + settings = Settings() + # perform basic input checks if not isinstance(hists, dict): raise TypeError("hists should be a dict of histogrammar histograms.") @@ -114,20 +70,6 @@ def stability_report( first_cols = [k.split(":")[0] for k in list(hists.keys())] time_axis = max(set(first_cols), key=first_cols.count) - # parse the kwargs - keys = Settings.get_keys() - data = defaultdict(dict) - for k, m in keys.items(): - if k in kwargs: - if isinstance(m, tuple): - data[m[0]][m[1]] = kwargs.pop(k) - else: - data[m] = kwargs.pop(k) - if len(kwargs) > 0: - raise ValueError(f"kwargs not supported {kwargs}") - - settings = Settings(**data) - # configuration and datastore for report pipeline cfg = { "hists_key": "hists", @@ -169,6 +111,7 @@ def set_time_axis(df): def df_stability_report( df, time_axis, + settings: Settings = None, features=None, binning="auto", bin_specs=None, @@ -177,7 +120,6 @@ def df_stability_report( var_dtype=None, reference_type="self", reference=None, - **kwargs, ): """Create a data stability monitoring html report for given pandas or spark dataframe. @@ -226,56 +168,12 @@ def df_stability_report( :param reference_type: type or reference used for comparisons. Options [self, external, rolling, expanding]. default is 'self'. :param reference: reference dataframe or histograms. default is None - :param int window: size of rolling window and/or trend detection. default is 10. - :param int shift: shift of time-bins in rolling/expanding window. default is 1. - :param dict monitoring_rules: monitoring rules to generate traffic light alerts. - The default setting is: - - .. code-block:: python - - monitoring_rules = { - "*_pull": [7, 4, -4, -7], - "*_zscore": [7, 4, -4, -7], - "[!p]*_unknown_labels": [0.5, 0.5, 0, 0], - } - - Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. - For example, ``"*_pull"`` applies for all features to all statistics ending on "_pull". - You can also specify rules for specific features and/or statistics by leaving out wildcard and putting the - feature name in front. E.g. - - .. code-block:: python - - monitoring_rules = { - "featureA:*_pull": [5, 3, -3, -5], - "featureA:nan": [4, 1, 0, 0], - "*_pull": [7, 4, -4, -7], - "nan": [8, 1, 0, 0], - } - - In case of multiple rules could apply for a feature's statistic, the most specific one applies. - So in case of the statistic "nan": "featureA:nan" is used for "featureA", and the other "nan" rule - for all other features. - :param dict pull_rules: red and yellow (possibly dynamic) boundaries shown in plots in the report. - Default is: - - .. code-block:: python - - pull_rules = {"*_pull": [7, 4, -4, -7]} - - This means that the shown yellow boundaries are at -4, +4 standard deviations around the (reference) mean, - and the shown red boundaries are at -7, +7 standard deviations around the (reference) mean. - Note that the (filename based) wildcards such as * apply to all statistic names matching that pattern. - (The same string logic applies as for monitoring_rules.) - :param bool skip_empty_plots: if false, also show empty plots in report with only nans or zeroes (optional) - :param int last_n: plot statistic data for last 'n' periods (optional) - :param int plot_hist_n: plot histograms for last 'n' periods. default is 2 (optional) - :param str report_filepath: the file path where to output the report (optional) - :param bool extended_report: if True, show all the generated statistics in the report (optional) - :param list show_stats: list of statistic name patterns to show in the report. If None, show all (optional) - :param kwargs: residual keyword arguments, passed on to stability_report() :return: dict with results of reporting pipeline """ + + if settings is None: + settings = Settings() + # basic checks on presence of time_axis if not (isinstance(time_axis, str) and len(time_axis) > 0) and not ( isinstance(time_axis, bool) and time_axis @@ -361,12 +259,12 @@ def df_stability_report( # generate data stability report return stability_report( - hists, - reference_type, - reference_hists, - time_axis, - features, - **kwargs, + hists=hists, + settings=settings, + reference_type=reference_type, + reference=reference_hists, + time_axis=time_axis, + features=features, ) @@ -446,22 +344,14 @@ def to_notebook_iframe(self, width="100%", height="100%"): def regenerate( self, - store_key="html_report", - sections_key="report_sections", - **kwargs, + store_key: str = "html_report", + sections_key: str = "report_sections", + report_settings: Report = None, ): """Regenerate HTML report with different plot settings - - :param int last_n: plot statistic data for last 'n' periods (optional) - :param int skip_first_n: in plot skip first 'n' periods. last_n takes precedence (optional) - :param int skip_last_n: in plot skip last 'n' periods. last_n takes precedence (optional) - :param int plot_hist_n: plot histograms for last 'n' periods. default is 2 (optional) - :param bool skip_empty_plots: if false, also show empty plots in report with only nans or zeroes (optional) - :param str report_filepath: the file path where to output the report (optional) :param str sections_key: key to store sections data in the datastore. default is 'report_sections'. :param str store_key: key to store the HTML report data in the datastore. default is 'html_report' - :param bool extended_report: if True, show all the generated statistics in the report (optional) - :param list show_stats: list of statistic name patterns to show in the report. If None, show all (optional) + :param Report report_settings: configuration to regenerate the report :return HTML: HTML report in an iframe """ # basic checks @@ -474,12 +364,12 @@ def regenerate( del self.datastore[sections_key] if store_key in self.datastore: del self.datastore[store_key] - - settings = Report(**kwargs) + if report_settings is None: + report_settings = Report() pipeline = ReportPipe( sections_key=sections_key, - settings=settings, + settings=report_settings, ) result = pipeline.transform(self.datastore) diff --git a/popmon/pipeline/report_pipelines.py b/popmon/pipeline/report_pipelines.py index ea00baeb..8373ec86 100644 --- a/popmon/pipeline/report_pipelines.py +++ b/popmon/pipeline/report_pipelines.py @@ -21,7 +21,7 @@ from pathlib import Path from ..base import Pipeline -from ..config import Report +from ..config import Report, Settings from ..io import FileWriter from ..pipeline.metrics_pipelines import ( ExpandingReferenceMetricsPipeline, @@ -58,33 +58,24 @@ def get_report_pipeline_class(reference_type, reference): class SelfReference(Pipeline): def __init__( self, - hists_key="test_hists", - time_axis="date", - features=None, - settings=None, + settings: Settings, + features: list, + hists_key: str = "test_hists", + time_axis: str = "date", ): """Example pipeline for comparing test data with itself (full test set) :param str hists_key: key to test histograms in datastore. default is 'test_hists' :param str time_axis: name of datetime feature. default is 'date' (column should be timestamp, date(time) or numeric batch id) - :param int window: window size for trend detection. default is 10 - :param dict monitoring_rules: traffic light rules - :param dict pull_rules: pull rules to determine dynamic boundaries :param list features: features of histograms to pick up from input data (optional) - :param bool skip_empty_plots: if false, also show empty plots in report with only nans or zeroes (optional) - :param int last_n: plot statistic data for last 'n' periods (optional) - :param int plot_hist_n: plot histograms for last 'n' periods. default is 1 (optional) - :param str report_filepath: the file path where to output the report (optional) - :param list show_stats: list of statistic name patterns to show in the report. If None, show all (optional) - :param kwargs: residual keyword arguments :return: assembled self reference pipeline """ modules = [ SelfReferenceMetricsPipeline( - hists_key, - time_axis, - features, - settings, + hists_key=hists_key, + time_axis=time_axis, + features=features, + settings=settings, ), ReportPipe( sections_key="report_sections", @@ -99,36 +90,28 @@ def __init__( class ExternalReference(Pipeline): def __init__( self, - hists_key="test_hists", - ref_hists_key="ref_hists", - time_axis="date", + settings: Settings, + hists_key: str = "test_hists", + ref_hists_key: str = "ref_hists", + time_axis: str = "date", features=None, - settings=None, ): """Example pipeline for comparing test data with other (full) external reference set :param str hists_key: key to test histograms in datastore. default is 'test_hists' :param str ref_hists_key: key to reference histograms in datastore. default is 'ref_hists' :param str time_axis: name of datetime feature. default is 'date' (column should be timestamp, date(time) or numeric batch id) - :param int window: window size for trend detection. default is 10 - :param dict monitoring_rules: traffic light rules - :param dict pull_rules: pull rules to determine dynamic boundaries :param list features: features of histograms to pick up from input data (optional) - :param bool skip_empty_plots: if false, show empty plots in report with only nans or zeroes (optional) - :param int last_n: plot statistic data for last 'n' periods (optional) - :param int plot_hist_n: plot histograms for last 'n' periods. default is 1 (optional) - :param str report_filepath: the file path where to output the report (optional) - :param list show_stats: list of statistic name patterns to show in the report. If None, show all (optional) :param kwargs: residual keyword arguments :return: assembled external reference pipeline """ modules = [ ExternalReferenceMetricsPipeline( - hists_key, - ref_hists_key, - time_axis, - features, - settings, + hists_key=hists_key, + ref_hists_key=ref_hists_key, + time_axis=time_axis, + features=features, + settings=settings, ), ReportPipe( sections_key="report_sections", @@ -143,34 +126,24 @@ def __init__( class RollingReference(Pipeline): def __init__( self, - hists_key="test_hists", - time_axis="date", + settings: Settings, + hists_key: str = "test_hists", + time_axis: str = "date", features=None, - settings=None, ): """Example pipeline for comparing test data with itself (rolling test set) :param str hists_key: key to test histograms in datastore. default is 'test_hists' :param str time_axis: name of datetime feature. default is 'date' (column should be timestamp, date(time) or numeric batch id) - :param int window: size of rolling window and for trend detection. default is 10 - :param int shift: shift in rolling window. default is 1 - :param dict monitoring_rules: traffic light rules - :param dict pull_rules: pull rules to determine dynamic boundaries :param list features: features of histograms to pick up from input data (optional) - :param bool skip_empty_plots: if false, show empty plots in report with only nans or zeroes (optional) - :param int last_n: plot statistic data for last 'n' periods (optional) - :param int plot_hist_n: plot histograms for last 'n' periods. default is 1 (optional) - :param str report_filepath: the file path where to output the report (optional) - :param list show_stats: list of statistic name patterns to show in the report. If None, show all (optional) - :param kwargs: residual keyword arguments :return: assembled rolling reference pipeline """ modules = [ RollingReferenceMetricsPipeline( - hists_key, - time_axis, - features, - settings, + settings=settings, + hists_key=hists_key, + time_axis=time_axis, + features=features, ), ReportPipe( sections_key="report_sections", @@ -185,34 +158,24 @@ def __init__( class ExpandingReference(Pipeline): def __init__( self, - hists_key="test_hists", - time_axis="date", + settings: Settings, + hists_key: str = "test_hists", + time_axis: str = "date", features=None, - settings=None, ): """Example pipeline for comparing test data with itself (expanding test set) :param str hists_key: key to test histograms in datastore. default is 'test_hists' :param str time_axis: name of datetime feature. default is 'date' (column should be timestamp, date(time) or numeric batch id) - :param int window: window size for trend detection. default is 10 - :param int shift: shift in expanding window. default is 1 - :param dict monitoring_rules: traffic light rules - :param dict pull_rules: pull rules to determine dynamic boundaries :param list features: features of histograms to pick up from input data (optional) - :param bool skip_empty_plots: if false, show empty plots in report with only nans or zeroes (optional) - :param int last_n: plot statistic data for last 'n' periods (optional) - :param int plot_hist_n: plot histograms for last 'n' periods. default is 1 (optional) - :param str report_filepath: the file path where to output the report (optional) - :param list show_stats: list of statistic name patterns to show in the report. If None, show all (optional) - :param kwargs: residual keyword arguments :return: assembled expanding reference pipeline """ modules = [ ExpandingReferenceMetricsPipeline( - hists_key, - time_axis, - features, - settings, + hists_key=hists_key, + time_axis=time_axis, + features=features, + settings=settings, ), ReportPipe( sections_key="report_sections", @@ -229,26 +192,14 @@ class ReportPipe(Pipeline): def __init__( self, - sections_key="report_sections", - store_key="html_report", - settings: Report = None, + settings: Report, + sections_key: str = "report_sections", + store_key: str = "html_report", ): """Initialize an instance of Report. :param str sections_key: key to store sections data in the datastore :param str store_key: key to store the HTML report data in the datastore - :param str profiles_section: name for the profile data section. default is 'Profiles' - :param str comparisons_section: name for the comparison data section. default is 'Comparisons' - :param str traffic_lights_section: name for the traffic light section. default is 'Traffic Lights' - :param str alerts_section: name for the alerts section. default is 'Alerts' - :param str histograms_section: name for the histograms section. default is 'Histograms' - :param str report_filepath: the file path where to output the report (optional) - :param bool skip_empty_plots: if false, also show empty plots in report with only nans or zeroes (optional) - :param int last_n: plot statistic data for last 'n' periods (optional) - :param int skip_first_n: when plotting data skip first 'n' periods. last_n takes precedence (optional) - :param int skip_last_n: when plotting data skip last 'n' periods. last_n takes precedence (optional) - :param int plot_hist_n: plot histograms for last 'n' periods. default is 1 (optional) - :param list show_stats: list of statistic name patterns to show in the report. If None, show all (optional) """ self.store_key = store_key @@ -264,7 +215,6 @@ def __init__( store_key=sections_key, hist_name_starts_with="histogram", settings=settings.section.histograms, - top_n=settings.top_n, ), # section showing all traffic light alerts of monitored statistics TrafficLightSectionGenerator( diff --git a/popmon/visualization/histogram_section.py b/popmon/visualization/histogram_section.py index b678eb7c..11ddc722 100644 --- a/popmon/visualization/histogram_section.py +++ b/popmon/visualization/histogram_section.py @@ -51,7 +51,6 @@ def __init__( ignore_features=None, hist_names=None, hist_name_starts_with="histogram", - top_n=None, ): """Initialize an instance of SectionGenerator. @@ -59,7 +58,6 @@ def __init__( :param str store_key: key for output data to be stored in the datastore :param list features: list of features to pick up from input data (optional) :param list ignore_features: ignore list of features, if present (optional) - :param int top_n: plot heatmap for top 'n' categories. default is 20 (optional) :param list hist_names: list of histogram names to plot :param str hist_name_starts_with: find histograms in case hist_names is empty. default is histogram. """ @@ -72,8 +70,6 @@ def __init__( self.hist_names = hist_names or [] self.hist_name_starts_with = hist_name_starts_with - self.top_n = top_n - # section specific self.section_name = settings.name self.descriptions = settings.descriptions @@ -81,6 +77,7 @@ def __init__( self.hist_names = settings.hist_names self.hist_names_formatted = settings.hist_names_formatted self.plot_hist_n = settings.plot_hist_n + self.top_n = settings.top_n self.cmap = settings.cmap def get_description(self): diff --git a/popmon/visualization/overview_section.py b/popmon/visualization/overview_section.py index 46c6e31a..da268a88 100644 --- a/popmon/visualization/overview_section.py +++ b/popmon/visualization/overview_section.py @@ -44,9 +44,9 @@ def __init__( self, read_key, store_key, + settings: Report, features=None, ignore_features=None, - settings: Report = None, static_bounds=None, dynamic_bounds=None, prefix="traffic_light_", @@ -57,22 +57,13 @@ def __init__( :param str read_key: key of input data to read from the datastore and use for plotting :param str store_key: key for output data to be stored in the datastore - :param str section_name: key of output data to store in the datastore :param list features: list of features to pick up from input data (optional) :param list ignore_features: ignore list of features, if present (optional) - :param int last_n: plot statistic data for last 'n' periods (optional) - :param int skip_first_n: when plotting data skip first 'n' periods. last_n takes precedence (optional) - :param int skip_last_n: in plot skip last 'n' periods. last_n takes precedence (optional) :param str static_bounds: key to static traffic light bounds key in datastore (optional) :param str dynamic_bounds: key to dynamic traffic light bounds key in datastore (optional) :param str prefix: dynamic traffic light prefix. default is ``'traffic_light_'`` (optional) :param str suffices: dynamic traffic light suffices. (optional) :param list ignore_stat_endswith: ignore stats ending with any of list of suffices. (optional) - :param bool skip_empty_plots: if false, also show empty plots in report with only nans or zeroes (optional) - :param str description: description of the section. default is empty (optional) - :param list show_stats: list of statistic name patterns to show in the report. If None, show all (optional) - :param bool plot_overview: heatmap overview of traffic lights (features x time) - :param bool plot_metrics: individual plot per feature """ super().__init__() self.read_key = read_key @@ -82,12 +73,13 @@ def __init__( self.features = features or [] self.ignore_features = ignore_features or [] - self.last_n = settings.last_n - self.skip_first_n = settings.skip_first_n - self.skip_last_n = settings.skip_last_n self.prefix = prefix self.suffices = suffices self.ignore_stat_endswith = ignore_stat_endswith or [] + + self.last_n = settings.last_n + self.skip_first_n = settings.skip_first_n + self.skip_last_n = settings.skip_last_n self.skip_empty_plots = settings.skip_empty_plots self.show_stats = settings.show_stats if not settings.extended_report else None self.section_name = settings.section.overview.name diff --git a/popmon/visualization/section_generator.py b/popmon/visualization/section_generator.py index 71ce72f9..437acc2b 100644 --- a/popmon/visualization/section_generator.py +++ b/popmon/visualization/section_generator.py @@ -81,9 +81,9 @@ def __init__( read_key, store_key, section_name, + settings: Report, features=None, ignore_features=None, - settings: Report = None, static_bounds=None, dynamic_bounds=None, prefix="traffic_light_",