py-why · vsyrgkanis · Jan 12, 2021 · Jan 3, 2021 · Jan 3, 2021 · Jan 3, 2021
diff --git a/doc/reference.rst b/doc/reference.rst
@@ -14,6 +14,7 @@ Public Module Reference
     econml.metalearners
     econml.ortho_forest
     econml.ortho_iv
+    econml.score
     econml.two_stage_least_squares
     econml.utilities
 

diff --git a/econml/__init__.py b/econml/__init__.py
@@ -3,5 +3,5 @@
            'cate_interpreter', 'causal_forest',
            'data', 'deepiv', 'dml', 'drlearner', 'inference',
            'metalearners', 'ortho_forest', 'ortho_iv',
-           'sklearn_extensions', 'tree',
+           'score', 'sklearn_extensions', 'tree',
            'two_stage_least_squares', 'utilities']
diff --git a/econml/score/__init__.py b/econml/score/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+"""
+A suite of scoring methods for scoring CATE models out-of-sample for the
+purpose of model selection.
+"""
+
+from .rscorer import RScorer
+from .ensemble_cate import EnsembleCateEstimator
+
+__all__ = ['RScorer',
+           'EnsembleCateEstimator']
diff --git a/econml/score/ensemble_cate.py b/econml/score/ensemble_cate.py
@@ -0,0 +1,68 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import numpy as np
+from sklearn.utils.validation import check_array
+from .._cate_estimator import BaseCateEstimator, LinearCateEstimator
+
+
+class EnsembleCateEstimator:
+    """ A CATE estimator that represents a weighted ensemble of many
+    CATE estimators. Returns their weighted effect prediction.
+
+    Parameters
+    ----------
+    cate_models : list of BaseCateEstimator objects
+        A list of fitted cate estimator objects that will be used in the ensemble.
+        The models are passed by reference, and not copied internally, because we
+        need the fitted objects, so any change to the passed models will affect
+        the internal predictions (e.g. if the input models are refitted).
+    weights : np.ndarray of shape (len(cate_models),)
+        The weight placed on each model. Weights must be non-positive. The
+        ensemble will predict effects based on the weighted average predictions
+        of the cate_models estiamtors, weighted by the corresponding weight in `weights`. 
+    """
+
+    def __init__(self, *, cate_models, weights):
+        self._cate_models = cate_models
+        self._weights = weights
+
+    def effect(self, X=None, *, T0=0, T1=1):
+        return np.average([mdl.effect(X=X, T0=T0, T1=T1) for mdl in self.cate_models],
+                          weights=self.weights, axis=0)
+    effect.__doc__ = BaseCateEstimator.effect.__doc__
+
+    def marginal_effect(self, T, X=None):
+        return np.average([mdl.marginal_effect(T, X=X) for mdl in self.cate_models],
+                          weights=self.weights, axis=0)
+    marginal_effect.__doc__ = BaseCateEstimator.marginal_effect.__doc__
+
+    def const_marginal_effect(self, X=None):
+        if np.any([not hasattr(mdl, 'const_marginal_effect') for mdl in self.cate_models]):
+            raise ValueError("One of the base CATE models in parameter `cate_models` does not support "
+                             "the `const_marginal_effect` method.")
+        return np.average([mdl.const_marginal_effect(X=X) for mdl in self.cate_models],
+                          weights=self.weights, axis=0)
+    const_marginal_effect.__doc__ = LinearCateEstimator.const_marginal_effect.__doc__
+
+    @property
+    def cate_models(self):
+        return self._cate_models
+
+    @cate_models.setter
+    def cate_models(self, value):
+        if not isinstance(value, list):
+            raise ValueError('Parameter `cate_models` should be a list of `BaseCateEstimator` objects.')
+        self._cate_models = value
+
+    @property
+    def weights(self):
+        return self._weights
+
+    @weights.setter
+    def weights(self, value):
+        weights = check_array(value, accept_sparse=False, ensure_2d=False, allow_nd=False, dtype='numeric',
+                              force_all_finite=True)
+        if np.any(weights < 0):
+            raise ValueError("All weights in parameter `weights` must be non-negative.")
+        self._weights = weights
diff --git a/econml/score/rscorer.py b/econml/score/rscorer.py
@@ -0,0 +1,219 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+from ..dml import LinearDML
+from sklearn.base import clone
+import numpy as np
+from scipy.special import softmax
+from .ensemble_cate import EnsembleCateEstimator
+
+
+class RScorer:
+    """ Scorer based on the RLearner loss. Fits residual models at fit time and calculates
+    residuals of the evaluation data in a cross-fitting manner::
+
+        Yres = Y - E[Y|X, W]
+        Tres = T - E[T|X, W]
+
+    Then for any given cate model calculates the loss::
+
+        loss(cate) = E_n[(Yres - <cate(X), Tres>)^2]
+
+    Also calculates a baseline loss based on a constant treatment effect model, i.e.::
+
+        base_loss = min_{theta} E_n[(Yres - <theta, Tres>)^2]
+
+    Returns an analogue of the R-square score for regression::
+
+        score = 1 - loss(cate) / base_loss
+
+    This corresponds to the extra variance of the outcome explained by introducing heterogeneity
+    in the effect as captured by the cate model, as opposed to always predicting a constant effect.
+    A negative score, means that the cate model performs even worse than a constant effect model
+    and hints at overfitting during training of the cate model.
+
+    Parameters
+    ----------
+    model_y: estimator
+        The estimator for fitting the response to the features. Must implement
+        `fit` and `predict` methods.
+
+    model_t: estimator
+        The estimator for fitting the treatment to the features. Must implement
+        `fit` and `predict` methods.
+
+    discrete_treatment: bool, optional (default is ``False``)
+        Whether the treatment values should be treated as categorical, rather than continuous, quantities
+
+    categories: 'auto' or list, default 'auto'
+        The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values).
+        The first category will be treated as the control treatment.
+
+    n_splits: int, cross-validation generator or an iterable, optional (Default=2)
+        Determines the cross-validation splitting strategy.
+        Possible inputs for cv are:
+
+        - None, to use the default 3-fold cross-validation,
+        - integer, to specify the number of folds.
+        - :term:`CV splitter`
+        - An iterable yielding (train, test) splits as arrays of indices.
+
+        For integer/None inputs, if the treatment is discrete
+        :class:`~sklearn.model_selection.StratifiedKFold` is used, else,
+        :class:`~sklearn.model_selection.KFold` is used
+        (with a random shuffle in either case).
+
+        Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
+        W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.
+
+    mc_iters: int, optional (default=None)
+        The number of times to rerun the first stage models to reduce the variance of the nuisances.
+
+    mc_agg: {'mean', 'median'}, optional (default='mean')
+        How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of
+        cross-fitting.
+
+    random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None, optional (default=None)
+        If int, random_state is the seed used by the random number generator;
+        If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
+        If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
+        by :mod:`np.random<numpy.random>`.
+    """
+
+    def __init__(self, *,
+                 model_y,
+                 model_t,
+                 discrete_treatment=False,
+                 categories='auto',
+                 n_splits=2,
+                 mc_iters=None,
+                 mc_agg='mean',
+                 random_state=None):
+        self.model_y = clone(model_y, safe=False)
+        self.model_t = clone(model_t, safe=False)
+        self.discrete_treatment = discrete_treatment
+        self.n_splits = n_splits
+        self.categories = categories
+        self.random_state = random_state
+        self.mc_iters = mc_iters
+        self.mc_agg = mc_agg
+
+    def fit(self, y, T, X=None, W=None, sample_weight=None, groups=None):
+        """
+
+        Parameters
+        ----------
+        Y: (n × d_y) matrix or vector of length n
+            Outcomes for each sample
+        T: (n × dₜ) matrix or vector of length n
+            Treatments for each sample
+        X: optional (n × dₓ) matrix
+            Features for each sample
+        W: optional (n × d_w) matrix
+            Controls for each sample
+        sample_weight: optional (n,) vector
+            Weights for each row
+        groups: (n,) vector, optional
+            All rows corresponding to the same group will be kept together during splitting.
+            If groups is not None, the n_splits argument passed to this class's initializer
+            must support a 'groups' argument to its split method.
+
+        Returns
+        -------
+        self
+        """
+        if X is None:
+            raise ValueError("X cannot be None for the RScorer!")
+
+        self.lineardml_ = LinearDML(model_y=self.model_y,
+                                    model_t=self.model_t,
+                                    n_splits=self.n_splits,
+                                    discrete_treatment=self.discrete_treatment,
+                                    categories=self.categories,
+                                    random_state=self.random_state,
+                                    mc_iters=self.mc_iters,
+                                    mc_agg=self.mc_agg)
+        self.lineardml_.fit(y, T, X=None, W=np.hstack([v for v in [X, W] if v is not None]),
+                            sample_weight=sample_weight, groups=groups, cache_values=True)
+        self.base_score_ = self.lineardml_.score_
+        self.dx_ = X.shape[1]
+        return self
+
+    def score(self, cate_model):
+        """
+        Parameters
+        ----------
+        cate_model : instance of fitted BaseCateEstimator
+
+        Returns
+        -------
+        score : double
+            An analogue of the R-square loss for the causal setting.
+        """
+        Y_res, T_res = self.lineardml_._cached_values.nuisances
+        X = self.lineardml_._cached_values.W[:, :self.dx_]
+        sample_weight = self.lineardml_._cached_values.sample_weight
+        if Y_res.ndim == 1:
+            Y_res = Y_res.reshape((-1, 1))
+        if T_res.ndim == 1:
+            T_res = T_res.reshape((-1, 1))
+        effects = cate_model.const_marginal_effect(X).reshape((-1, Y_res.shape[1], T_res.shape[1]))
+        Y_res_pred = np.einsum('ijk,ik->ij', effects, T_res).reshape(Y_res.shape)
+        if sample_weight is not None:
+            return 1 - np.mean(np.average((Y_res - Y_res_pred)**2, weights=sample_weight, axis=0)) / self.base_score_
+        else:
+            return 1 - np.mean((Y_res - Y_res_pred) ** 2) / self.base_score_
+
+    def best_model(self, cate_models, return_scores=False):
+        """ Chooses the best among a list of models
+
+        Parameters
+        ----------
+        cate_models : list of instances of fitted BaseCateEstimator
+        return_scores : bool, optional (default=False)
+            Whether to return the list scores of each model
+        Returns
+        -------
+        best_model : instance of fitted BaseCateEstimator
+            The model that achieves the best score
+        best_score : double
+            The score of the best model
+        scores : list of double
+            The list of scores for each of the input models. Returned only if `return_scores=True`.
+        """
+        rscores = [self.score(mdl) for mdl in cate_models]
+        best = np.argmax(rscores)
+        if return_scores:
+            return cate_models[best], rscores[best], rscores
+        else:
+            return cate_models[best], rscores[best]
+
+    def ensemble(self, cate_models, eta=1000.0, return_scores=False):
+        """ Ensembles a list of models based on their performance
+
+        Parameters
+        ----------
+        cate_models : list of instances of fitted BaseCateEstimator
+        eta : double, optional (default=1000)
+            The soft-max parameter for the ensemble
+        return_scores : bool, optional (default=False)
+            Whether to return the list scores of each model
+        Returns
+        -------
+        ensemble_model : instance of fitted EnsembleCateEstimator
+            A fitted ensemble cate model that calculates effects based on a weighted
+            version of the input cate models, weighted by a softmax of their score
+            performance
+        ensemble_score : double
+            The score of the ensemble model
+        scores : list of double
+            The list of scores for each of the input models. Returned only if `return_scores=True`.
+        """
+        rscores = np.array([self.score(mdl) for mdl in cate_models])
+        weights = softmax(eta * rscores)
+        ensemble = EnsembleCateEstimator(cate_models=cate_models, weights=weights)
+        ensemble_score = self.score(ensemble)
+        if return_scores:
+            return ensemble, ensemble_score, rscores
+        else:
+            return ensemble, ensemble_score
diff --git a/econml/tests/test_rscorer.py b/econml/tests/test_rscorer.py
@@ -0,0 +1,75 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import unittest
+import numpy as np
+from sklearn.preprocessing import PolynomialFeatures
+from sklearn.linear_model import LinearRegression, LogisticRegression
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+from joblib import Parallel, delayed
+
+from econml.dml import DML, LinearDML, SparseLinearDML, NonParamDML
+from econml.metalearners import XLearner, TLearner, SLearner, DomainAdaptationLearner
+from econml.drlearner import DRLearner
+from econml.score import RScorer
+
+
+def _fit_model(name, model, Y, T, X):
+    return name, model.fit(Y, T, X=X)
+
+
+class TestRScorer(unittest.TestCase):
+
+    def _get_data(self):
+        X = np.random.normal(0, 1, size=(500, 2))
+        T = np.random.binomial(1, .5, size=(500,))
+        y = X[:, 0] * T + np.random.normal(size=(500,))
+        return y, T, X, X[:, 0]
+
+    def test_comparison(self):
+        def reg():
+            return LinearRegression()
+
+        def clf():
+            return LogisticRegression()
+
+        y, T, X, true_eff = self._get_data()
+        (X_train, X_val, T_train, T_val,
+         Y_train, Y_val, _, true_eff_val) = train_test_split(X, T, y, true_eff, test_size=.4)
+
+        models = [('ldml', LinearDML(model_y=reg(), model_t=clf(), discrete_treatment=True,
+                                     linear_first_stages=False, n_splits=3)),
+                  ('sldml', SparseLinearDML(model_y=reg(), model_t=clf(), discrete_treatment=True,
+                                            featurizer=PolynomialFeatures(degree=2, include_bias=False),
+                                            linear_first_stages=False, n_splits=3)),
+                  ('xlearner', XLearner(models=reg(), cate_models=reg(), propensity_model=clf())),
+                  ('dalearner', DomainAdaptationLearner(models=reg(), final_models=reg(), propensity_model=clf())),
+                  ('slearner', SLearner(overall_model=reg())),
+                  ('tlearner', TLearner(models=reg())),
+                  ('drlearner', DRLearner(model_propensity=clf(), model_regression=reg(),
+                                          model_final=reg(), n_splits=3)),
+                  ('rlearner', NonParamDML(model_y=reg(), model_t=clf(), model_final=reg(),
+                                           discrete_treatment=True, n_splits=3)),
+                  ('dml3dlasso', DML(model_y=reg(), model_t=clf(), model_final=reg(), discrete_treatment=True,
+                                     featurizer=PolynomialFeatures(degree=3),
+                                     linear_first_stages=False, n_splits=3))
+                  ]
+
+        models = Parallel(n_jobs=-1, verbose=1)(delayed(_fit_model)(name, mdl,
+                                                                    Y_train, T_train, X_train)
+                                                for name, mdl in models)
+
+        scorer = RScorer(model_y=reg(), model_t=clf(),
+                         discrete_treatment=True, n_splits=3, mc_iters=2, mc_agg='median')
+        scorer.fit(Y_val, T_val, X=X_val)
+        rscore = [scorer.score(mdl) for _, mdl in models]
+        rootpehe_score = [np.sqrt(np.mean((true_eff_val.flatten() - mdl.effect(X_val).flatten())**2))
+                          for _, mdl in models]
+        assert LinearRegression().fit(np.array(rscore).reshape(-1, 1), np.array(rootpehe_score)).coef_ < 0.5
+        mdl, _ = scorer.best_model([mdl for _, mdl in models])
+        rootpehe_best = np.sqrt(np.mean((true_eff_val.flatten() - mdl.effect(X_val).flatten())**2))
+        assert rootpehe_best < 1.2 * np.min(rootpehe_score)
+        mdl, _ = scorer.ensemble([mdl for _, mdl in models])
+        rootpehe_ensemble = np.sqrt(np.mean((true_eff_val.flatten() - mdl.effect(X_val).flatten())**2))
+        assert rootpehe_ensemble < 1.2 * np.min(rootpehe_score)