ktr lite house keeping (#440)

* ktr lite house keeping - revert level knots plot - refine tutorial - refine some plotting code * more house keeping - [x] enhance unit test - [x] relabel pool - [x] initializer of ktrlite * notebook update * Update test_ktrlite.py * Update test_ktrlite.py * add comment for stan_extract cleaning Co-authored-by: Zhishi Wang <zhishiw@uber.com>
uber · May 30, 2021 · 47b76dd · 47b76dd
1 parent b18c8ad
commit 47b76dd
Show file tree

Hide file tree

Showing 11 changed files with 714 additions and 342 deletions.
diff --git a/docs/tutorials/ktrlite.ipynb b/docs/tutorials/ktrlite.ipynb
diff --git a/examples/ktrlite.ipynb b/examples/ktrlite.ipynb
diff --git a/orbit/constants/ktrlite.py b/orbit/constants/ktrlite.py
@@ -24,8 +24,7 @@ class DataInputMapper(Enum):
     KERNEL_COEFFICIENTS = 'K_COEF'
     NUM_OF_REGRESSORS = 'P'
     REGRESSOR_MATRIX = 'REGRESSORS'
-    COEFFICIENTS_KNOT_POOLING_LOC = 'COEF_KNOT_POOL_LOC'
-    COEFFICIENTS_KNOT_POOLING_SCALE = 'COEF_KNOT_POOL_SCALE'
+    COEFFICIENTS_INITIAL_KNOT_SCALE = 'COEF_INIT_KNOT_SCALE'
     COEFFICIENTS_KNOT_SCALE = 'COEF_KNOT_SCALE'
 
 

diff --git a/orbit/constants/palette.py b/orbit/constants/palette.py
@@ -12,12 +12,17 @@ class QualitativePalette(Enum):
     PostQ = ['#1fc600', '#ff4500']
     # large amount of stacking series
     Stack = ["#12939A", "#F15C17", "#DDB27C", "#88572C", "#FF991F", "#DA70BF", "#125C77",
-            "#4DC19C", "#776E57", "#17B8BE", "#F6D18A", "#B7885E", "#FFCB99", "#F89570",
-            "#829AE3", "#E79FD5", "#1E96BE", "#89DAC1", "#B3AD9E"]
+             "#4DC19C", "#776E57", "#17B8BE", "#F6D18A", "#B7885E", "#FFCB99", "#F89570",
+             "#829AE3", "#E79FD5", "#1E96BE", "#89DAC1", "#B3AD9E"]
     # bar plot
     Bar5 = ["#ef476fff", "#ffd166ff", "#06d6a0ff", "#118ab2ff", "#073b4cff"]
 
 
-class KTRPalette(Enum):
-    KNOTS_SEGMENT = '#276ef1'
-    KNOTS_REGION = '#5b91f5ff'
+class OrbitPalette(Enum):
+    """Named hex colors used by plotting code."""
+    Black = '#000000'
+    DarkGrey = '#333131'
+    SafetyBlue = '#276EF1'
+    LightBlue = '#5B91F5'
+    Green = '#05A357'
+    # must differ from Green: Enum members that share a value become aliases,
+    # so a duplicated hex code would make `Orange` resolve to `Green`
+    Orange = '#FF6937'
+    Yellow = '#FFC043'
diff --git a/orbit/diagnostics/plot.py b/orbit/diagnostics/plot.py
@@ -10,7 +10,7 @@
 
 from orbit.constants.constants import PredictionKeys
 from orbit.utils.general import is_empty_dataframe, is_ordered_datetime
-from orbit.constants.palette import QualitativePalette, KTRPalette
+from orbit.constants.palette import QualitativePalette
 
 az.style.use("arviz-darkgrid")
 
@@ -422,72 +422,7 @@ def _pair_plot(posterior_samples, pair_type='scatter', n_bins=20):
     return axes
 
 
-def plot_ktr_lev_knots(actual_df, lev_knots_df, date_col, actual_col,
-                       knots_delta_threshold=0.5,
-                       path=None, is_visible=True, title="",
-                       fontsize=16, markersize=150, figsize=(16, 8)):
-    """ Plot the fitted level knots along with the actual time series.
 
-    Parameters
-    ----------
-    actual_df : pd.DataFrame
-        actual data frame including the actual response
-    lev_knots_df : pd.DataFrame
-        level knots data from KTRLite model
-    date_col : str
-        the date column name
-    actual_col : str
-        actual response column name
-    knots_delta_threshold : float
-        standardized threshold of level knots difference to detect change point of levels
-    path : str; optional
-        path to save the figure
-    is_visible : boolean
-        whether we want to show the plot. If called from unittest, is_visible might = False.
-    title : str; optional
-        title of the plot
-    fontsize : int; optional
-        fontsize of the title
-    markersize : int; optional
-        knot marker size
-    figsize : tuple; optional
-        figsize pass through to `matplotlib.pyplot.figure()`
-   Returns
-    -------
-        matplotlib axes object
-    """
-    actuals = actual_df[actual_col]
-    ymin = min(actual_df[actual_col]) * 0.98
-    ymax = max(actual_df[actual_col]) * 1.02
-    fig, ax = plt.subplots(1, 1, figsize=figsize)
-    ax.set_ylim(ymin, ymax)
-    ax.plot(actual_df[date_col], actuals, color='black', lw=1, alpha=0.5, label='actual')
-
-    # plot all the segments divided by knots
-    dt = lev_knots_df[date_col].values
-    ax.vlines(x=dt, ymin=ymin, ymax=ymax, linestyles='dashed', alpha=0.8, linewidth=1,
-              facecolor=KTRPalette.KNOTS_SEGMENT.value, label='knots')
-
-    # standardized threshold of level knots difference to detect change point of levels
-    lk = lev_knots_df['lev_knot'].values
-    lk = (lk - np.mean(lk)) / np.std(lk)
-    lkd = np.diff(lk)
-    flag = np.fabs(lkd) > knots_delta_threshold
-    if flag.any():
-        # due to diff function creates a lag of 1
-        dt = lev_knots_df[date_col].values
-        for idx in np.where(flag)[0]:
-            ax.axvspan(dt[idx], dt[idx+1], facecolor=KTRPalette.KNOTS_REGION.value, alpha=0.5)
-
-    ax.legend()
-    ax.set_title(title, fontsize=fontsize)
-    if path:
-        fig.savefig(path)
-    if is_visible:
-        plt.show()
-    else:
-        plt.close()
-    return ax
 
 
 def get_arviz_plot_dict(mod,
@@ -532,7 +467,7 @@ def get_arviz_plot_dict(mod,
 
 
 def plot_param_diagnostics(mod, incl_noise_params=False, incl_trend_params=False, incl_smooth_params=False,
-                     which='trace', **kwargs):
+                           which='trace', **kwargs):
     """
     Parameters
     -----------

diff --git a/orbit/estimators/stan_estimator.py b/orbit/estimators/stan_estimator.py
@@ -133,6 +133,7 @@ def fit(self, model_name, model_param_names, data_input, fitter=None, init_value
         )
 
         # todo: move dimension cleaning function to the model directly
+        # flatten the first two dims by preserving the chain order
         for key, val in stan_extract.items():
             if len(val.shape) == 2:
                 # here `order` is important to make samples flattened by chain

diff --git a/orbit/initializer/ktrlite.py b/orbit/initializer/ktrlite.py
@@ -0,0 +1,17 @@
+import numpy as np
+from ..constants import ktrlite as constants
+
+
+class KTRLiteInitializer(object):
+    """Callable that generates random initial values for the coefficient knots.
+
+    Parameters
+    ----------
+    num_regressor : int
+        number of regressors; first dimension of the generated array
+    num_knots_coefficients : int
+        number of coefficient knots; second dimension of the generated array
+    """
+    def __init__(self, num_regressor, num_knots_coefficients):
+        self.num_regressor = num_regressor
+        self.num_knots_coefficients = num_knots_coefficients
+
+    def __call__(self):
+        """Return a dict of initial values keyed by sampling parameter name.
+
+        The dict is empty unless `num_regressor > 1`.
+        """
+        init_values = dict()
+        if self.num_regressor > 1:
+            # draws from Normal(loc=0, scale=0.1) with shape (num_regressor, num_knots_coefficients)
+            init_values[constants.RegressionSamplingParameters.COEFFICIENTS_KNOT.value] = np.random.normal(
+                0, 0.1, (self.num_regressor, self.num_knots_coefficients)
+            )
+
+        return init_values
diff --git a/orbit/models/ktrlite.py b/orbit/models/ktrlite.py
@@ -3,14 +3,18 @@
 import math
 from scipy.stats import nct
 from copy import deepcopy
+import matplotlib.pyplot as plt
 
 from ..estimators.stan_estimator import StanEstimatorMAP
 from ..exceptions import IllegalArgument, ModelException
 from ..utils.kernels import sandwich_kernel
 from ..utils.features import make_fourier_series_df
 from .template import BaseTemplate, MAPTemplate
 from ..constants.constants import PredictionKeys, PredictMethod
+from ..initializer.ktrlite import KTRLiteInitializer
 from ..constants import ktrlite as constants
+from orbit.constants.palette import OrbitPalette
+
 
 class BaseKTRLite(BaseTemplate):
     """Base KTRLite model object with shared functionality for MAP method
@@ -23,10 +27,10 @@ class BaseKTRLite(BaseTemplate):
         fourier series order for seasonality
     level_knot_scale : float
         sigma for level; default to be .5
-    seasonal_knot_pooling_scale : float
-        pooling sigma for seasonal fourier series regressors; default to be 1
+    seasonal_initial_knot_scale : float
+        scale parameter for seasonal regressors initial coefficient knots; default to be 1
     seasonal_knot_scale : float
-        sigma for seasonal fourier series regressors; default to be 0.1.
+        scale parameter for seasonal regressors drift of coefficient knots; default to be 0.1.
     span_level : float between (0, 1)
         window width to decide the number of windows for the level (trend) term.
         e.g., span 0.1 will produce 10 windows.
@@ -56,7 +60,7 @@ def __init__(self,
                  seasonality=None,
                  seasonality_fs_order=None,
                  level_knot_scale=0.5,
-                 seasonal_knot_pooling_scale=1.0,
+                 seasonal_initial_knot_scale=1.0,
                  seasonal_knot_scale=0.1,
                  span_level=0.1,
                  span_coefficients=0.3,
@@ -78,7 +82,7 @@ def __init__(self,
 
         self.seasonality = seasonality
         self.seasonality_fs_order = seasonality_fs_order
-        self.seasonal_knot_pooling_scale = seasonal_knot_pooling_scale
+        self.seasonal_initial_knot_scale = seasonal_initial_knot_scale
         self.seasonal_knot_scale = seasonal_knot_scale
 
         # set private var to arg value
@@ -87,22 +91,20 @@ def __init__(self,
         self._seasonality = self.seasonality
         self._seasonality_fs_order = self.seasonality_fs_order
         self._seasonal_knot_scale = self.seasonal_knot_scale
-        self._seasonal_knot_pooling_scale = None
+        self._seasonal_initial_knot_scale = None
         self._seasonal_knot_scale = None
 
         self._level_knot_dates = self.level_knot_dates
         self._degree_of_freedom = degree_of_freedom
 
         self.span_coefficients = span_coefficients
-        # self.rho_coefficients = rho_coefficients
         self.date_freq = date_freq
 
         # regression attributes -- now is ONLY used for fourier series as seasonality
         self.num_of_regressors = 0
         self.regressor_col = list()
         self.regressor_col_gp = list()
-        self.coefficients_knot_pooling_loc = list()
-        self.coefficients_knot_pooling_scale = list()
+        self.coefficients_initial_knot_scale = list()
         self.coefficients_knot_scale = list()
 
         # set static data attributes
@@ -127,9 +129,19 @@ def __init__(self,
         self.num_knots_coefficients = None
         self.knots_tp_coefficients = None
         self.regressor_matrix = None
-        # self.coefficients_knot_dates = None
 
-    # initialization related modules
+    def _set_init_values(self):
+        """Override function from Base Template.
+
+        Sets `self._init_values` to a `KTRLiteInitializer` when there is more than one
+        seasonality and at least one regressor; otherwise `self._init_values` is not touched here.
+        """
+        # A functools.partial, e.g.
+        #   init_values_partial = partial(init_values_callable, seasonality=seasonality)
+        # does not work when passed to PyStan because PyStan uses
+        # inspect.getargspec(func) which seems to raise an exception with keyword-only args
+        # caused by using partialfunc.
+        # A callable object (KTRLiteInitializer) is used here as the workaround.
+        if len(self._seasonality) > 1 and self.num_of_regressors > 0:
+            init_values_callable = KTRLiteInitializer(self.num_of_regressors, self.num_knots_coefficients)
+            self._init_values = init_values_callable
+
+    # set defaults
     def _set_default_args(self):
         """Set default attributes for None
         """
@@ -151,11 +163,11 @@ def _set_default_args(self):
             if 2 * order > self._seasonality[k] - 1:
                 raise IllegalArgument('reduce seasonality_fs_order to avoid over-fitting')
 
-        if not isinstance(self.seasonal_knot_pooling_scale, list) and \
-                isinstance(self.seasonal_knot_pooling_scale * 1.0, float):
-            self._seasonal_knot_pooling_scale = [self.seasonal_knot_pooling_scale] * len(self._seasonality)
+        if not isinstance(self.seasonal_initial_knot_scale, list) and \
+                isinstance(self.seasonal_initial_knot_scale * 1.0, float):
+            self._seasonal_initial_knot_scale = [self.seasonal_initial_knot_scale] * len(self._seasonality)
         else:
-            self._seasonal_knot_pooling_scale = self.seasonal_knot_pooling_scale
+            self._seasonal_initial_knot_scale = self.seasonal_initial_knot_scale
 
         if not isinstance(self.seasonal_knot_scale, list) and isinstance(self.seasonal_knot_scale * 1.0, float):
             self._seasonal_knot_scale = [self.seasonal_knot_scale] * len(self._seasonality)
@@ -166,16 +178,14 @@ def _set_seasonality_attributes(self):
         """given list of seasonalities and their order, create list of seasonal_regressors_columns"""
         self.regressor_col_gp = list()
         self.regressor_col = list()
-        self.coefficients_knot_pooling_loc = list()
-        self.coefficients_knot_pooling_scale = list()
+        self.coefficients_initial_knot_scale = list()
         self.coefficients_knot_scale = list()
 
         if len(self._seasonality) > 0:
             for idx, s in enumerate(self._seasonality):
                 fs_cols = []
                 order = self._seasonality_fs_order[idx]
-                self.coefficients_knot_pooling_loc += [0.0] * order * 2
-                self.coefficients_knot_pooling_scale += [self._seasonal_knot_pooling_scale[idx]] * order * 2
+                self.coefficients_initial_knot_scale += [self._seasonal_initial_knot_scale[idx]] * order * 2
                 self.coefficients_knot_scale += [self._seasonal_knot_scale[idx]] * order * 2
                 for i in range(1, order + 1):
                     fs_cols.append('seas{}_fs_cos{}'.format(s, i))
@@ -235,6 +245,7 @@ def _set_regressor_matrix(self, df):
         if self.num_of_regressors > 0:
             self.regressor_matrix = df.filter(items=self.regressor_col, ).values
 
+    # TODO: docstring and make this a utils since it is quite generic?
     @staticmethod
     def get_gap_between_dates(start_date, end_date, freq):
         diff = end_date - start_date
@@ -271,10 +282,13 @@ def _set_kernel_matrix(self, df):
             self.knots_tp_level = (1 + knots_idx_level) / self.num_of_observations
             self._level_knot_dates = df[self.date_col].values[knots_idx_level]
         else:
+            # to exclude dates which are not within training period
             self._level_knot_dates = pd.to_datetime([
-                x for x in self._level_knot_dates if 
-                (x <= df[self.date_col].max()) and (x >= df[self.date_col].min())
+                x for x in self._level_knot_dates if
+                (x <= df[self.date_col].values[-1]) and (x >= df[self.date_col].values[0])
             ])
+            # since we allow _level_knot_dates to be continuous, we calculate distance between knots
+            # in continuous value as well (instead of index)
             if self.date_freq is None:
                 self.date_freq = pd.infer_freq(df[self.date_col])[0]
             start_date = self.training_start
@@ -412,6 +426,8 @@ def _predict(self, posterior_estimates, df, include_error=False, **kwargs):
         return out
 
 
+
+
 class KTRLiteMAP(MAPTemplate, BaseKTRLite):
     """Concrete KTRLite model for MAP (Maximum a Posteriori) prediction
 
@@ -430,6 +446,60 @@ def get_level_knots(self):
             constants.BaseSamplingParameters.LEVEL_KNOT.value:
             # TODO: this is hacky, investigate why we have an extra dimension here?
                 np.squeeze(self._aggregate_posteriors[PredictMethod.MAP.value][
-                    constants.BaseSamplingParameters.LEVEL_KNOT.value], 0),
+                               constants.BaseSamplingParameters.LEVEL_KNOT.value], 0),
+        }
+        return pd.DataFrame(out)
+
+    def get_levels(self):
+        """Return a DataFrame holding the date array and the `LEVEL` values taken
+        from the MAP aggregated posteriors (leading axis squeezed out).
+        """
+        out = {
+            self.date_col:
+                self.date_array,
+            constants.BaseSamplingParameters.LEVEL.value:
+            # TODO: this is hacky, investigate why we have an extra dimension here?
+                np.squeeze(self._aggregate_posteriors[PredictMethod.MAP.value][
+                               constants.BaseSamplingParameters.LEVEL.value], 0),
         }
-        return pd.DataFrame(out)
+        return pd.DataFrame(out)
+
+    def plot_lev_knots(self, path=None, is_visible=True, title="",
+                       fontsize=16, markersize=250, figsize=(16, 8)):
+        """ Plot the level and the level knots along with the actual time series.
+
+        Parameters
+        ----------
+        path : str; optional
+            path to save the figure
+        is_visible : boolean
+            whether we want to show the plot. If called from unittest, is_visible might = False.
+        title : str; optional
+            title of the plot
+        fontsize : int; optional
+            fontsize of the title
+        markersize : int; optional
+            knot marker size
+        figsize : tuple; optional
+            figsize pass through to `matplotlib.pyplot.subplots()`
+
+        Returns
+        -------
+            matplotlib axes object
+        """
+        levels_df = self.get_levels()
+        knots_df = self.get_level_knots()
+
+        fig, ax = plt.subplots(1, 1, figsize=figsize)
+        # actual response as a thin grey line
+        ax.plot(self.date_array, self.response, color=OrbitPalette.DarkGrey.value, lw=1, alpha=0.7, label='actual')
+        # level drawn as a line over the same dates
+        ax.plot(levels_df[self.date_col], levels_df[constants.BaseSamplingParameters.LEVEL.value],
+                color=OrbitPalette.SafetyBlue.value, lw=1, alpha=0.8,
+                label=constants.BaseSamplingParameters.LEVEL.value)
+        # level knots drawn as triangle markers
+        ax.scatter(knots_df[self.date_col], knots_df[constants.BaseSamplingParameters.LEVEL_KNOT.value],
+                   color=OrbitPalette.Green.value, lw=1, s=markersize, marker='^',  alpha=0.8,
+                   label=constants.BaseSamplingParameters.LEVEL_KNOT.value)
+        ax.legend()
+        ax.grid(True, which='major', c='grey', ls='-', lw=1, alpha=0.5)
+        ax.set_title(title, fontsize=fontsize)
+        # save before show/close so the figure is still available when written to disk
+        if path:
+            fig.savefig(path)
+        if is_visible:
+            plt.show()
+        else:
+            plt.close()
+        return ax