From ddbcf1ce366a83fb8da6141e8461bfa818ad38cd Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Wed, 25 Jan 2017 21:15:19 -0800 Subject: [PATCH 01/12] Adding test cases for loo --- gneiss/regression/_ols.py | 197 +++++++++++++++++++++++++--- gneiss/regression/tests/test_ols.py | 89 +++++++++++++ 2 files changed, 270 insertions(+), 16 deletions(-) diff --git a/gneiss/regression/_ols.py b/gneiss/regression/_ols.py index d3e39bc..1dcf970 100644 --- a/gneiss/regression/_ols.py +++ b/gneiss/regression/_ols.py @@ -7,14 +7,30 @@ # ---------------------------------------------------------------------------- from decimal import Decimal from collections import OrderedDict - +import numpy as np import pandas as pd from gneiss.regression._model import RegressionModel from ._regression import (_intersect_of_table_metadata_tree, _to_balances) import statsmodels.formula.api as smf from statsmodels.iolib.summary2 import Summary +from statsmodels.sandbox.tools.cross_val import LeaveOneOut +from patsy import dmatrix +import inspect + +def _fit_ols(y, x, **kwargs): + """ Perform the basic ols regression.""" + #one-time creation of exogenous data matrix allows for faster run-time + exog_data = x + submodels = [] + for b in y.columns: + endog_data = y[b] + # mixed effects code is obtained here: + # http://stackoverflow.com/a/22439820/1167475 + mdf = smf.OLS(endog=endog_data, exog=exog_data, **kwargs) + submodels.append(mdf) + return submodels # TODO: Register as qiime 2 method def ols(formula, table, metadata, tree, **kwargs): @@ -155,18 +171,9 @@ def ols(formula, table, metadata, tree, **kwargs): metadata, tree) ilr_table, basis = _to_balances(table, tree) - data = pd.merge(ilr_table, metadata, left_index=True, right_index=True) - - submodels = [] - - for b in ilr_table.columns: - # mixed effects code is obtained here: - # http://stackoverflow.com/a/22439820/1167475 - stats_formula = '%s ~ %s' % (b, formula) - - mdf = smf.ols(stats_formula, data=data, **kwargs) - submodels.append(mdf) - + ilr_table, metadata = ilr_table.align(metadata, join='inner', axis=0) + x = dmatrix(formula, metadata, return_type='dataframe') + submodels = _fit_ols(ilr_table, x) return OLSModel(submodels, basis=basis, balances=ilr_table, tree=tree) @@ -203,11 +210,24 @@ def __init__(self, *args, **kwargs): """ super().__init__(*args, **kwargs) - def fit(self, **kwargs): - """ Fit the model """ + def fit(self, regularized=False, **kwargs): + """ Fit the model. + + Parameters + ---------- + regularized : bool + Specifies if a regularization procedure should be used + when performing the fit. (default = False) + **kwargs : dict + Keyword arguments used to tune the parameter estimation. + + """ for s in self.submodels: # assumes that the underlying submodels have implemented `fit`. - m = s.fit(**kwargs) + if regularized: + m = s.fit(**kwargs) + else: + m = s.fit_regularized(**kwargs) self.results.append(m) def summary(self, ndim=10): @@ -298,3 +318,148 @@ def r2(self): sst = sse + ssr return 1 - sse / sst + + @property + def mse(self): + sse = sum([r.ssr for r in self.results]) + dfe = self.results[0].df_resid + return sse / dfe + + def loo(self, regularized=False, **kwargs): + """ Leave one out cross-validation. + + Calculates summary statistics for each iteraction of + leave one out cross-validation, specially `mse` on entire model + and `pred_err` to measure prediction error. + + Parameters + ---------- + regularized : bool + Specifies if a regularization procedure should be used + when performing the fit. 
+ **kwargs : dict + Keyword arguments used to tune the parameter estimation. + + Returns + ------- + pd.DataFrame + mse : np.array, float + Mean sum of squares error for each iteration of + the cross validation. + pred_err : np.array, float + Prediction mean sum of squares error for each iteration of + the cross validation. + + See Also + -------- + fit + """ + params = {'fit_regularized': False} + for key in params: + params[key] = kwargs.get(key, params[key]) + kwargs.pop(key, None) + + nobs = self.balances.shape[0] # number of observations (i.e. samples) + cv_iter = LeaveOneOut(nobs) + endog = self.balances + exog_names = self.results[0].model.exog_names + exog = pd.DataFrame(self.results[0].model.exog, + index=self.balances.index, + columns=exog_names) + results = pd.DataFrame(index=self.balances.index, + columns=['mse', 'pred_err']) + i = 0 + for inidx, outidx in cv_iter: + sample_id = self.balances.index[i] + res_i = _fit_ols(y=endog.loc[inidx], x=exog.loc[inidx], **kwargs) + if params['fit_regularized']: + res_i = [r.fit_regularized(**kwargs) for r in res_i] + else: + res_i = [r.fit(**kwargs) for r in res_i] + + # mean sum of squares error + sse = sum([r.ssr for r in res_i]) + # degrees of freedom for residuals + dfe = res_i[0].df_resid + results.loc[sample_id, 'mse'] = sse / dfe + + # prediction error on loo point + predicted = np.hstack([r.predict(exog.loc[outidx]) for r in res_i]) + + pred_sse = np.sum((predicted - self.balances.loc[outidx])**2) + results.loc[sample_id, 'pred_err'] = pred_sse.sum() + i += 1 + return results + + def lovo(self, **kwargs): + """ Leave one variable out cross-validation. + + Calculates summary statistics for each iteraction of leave one variable + out cross-validation, specially `r2` and `mse` on entire model. + This technique is particularly useful for feature selection. + + Parameters + ---------- + regularized : bool + Specifies if a regularization procedure should be used + when performing the fit. + **kwargs : dict + Keyword arguments used to tune the parameter estimation. + + Returns + ------- + pd.DataFrame + Rsquared : np.array, flot + Coefficient of determination for each variable left out. + mse : np.array, float + Mean sum of squares error for each iteration of + the cross validation. + """ + + params = {'regularized': False} + for key in params: + params[key] = kwargs.get(key, params[key]) + + kwargs.pop(key, None) + + nobs = self.balances.shape[0] # number of observations (i.e. samples) + + endog = self.balances + exog_names = self.results[0].model.exog_names + exog = pd.DataFrame(self.results[0].model.exog, + index=self.balances.index, + columns=exog_names) + cv_iter = LeaveOneOut(len(exog_names)) + results = pd.DataFrame(index=exog_names, + columns=['mse', 'Rsquared']) + i = 0 + for inidx, outidx in cv_iter: + feature_id = exog_names[i] + res_i = _fit_ols(endog, exog.loc[:, inidx], **kwargs) + if params['regularized']: + res_i = [r.fit_regularized(**kwargs) for r in res_i] + else: + res_i = [r.fit(**kwargs) for r in res_i] + # See `statsmodels.regression.linear_model.RegressionResults` + # for more explanation on `ess` and `ssr`. + # sum of squares regression. Also referred to as + # explained sum of squares. + ssr = sum([r.ess for r in res_i]) + # sum of squares error. Also referred to as sum of squares residuals + sse = sum([r.ssr for r in res_i]) + # calculate the overall coefficient of determination (i.e. 
R2) + sst = sse + ssr + results.loc[feature_id, 'Rsquared'] = 1 - sse / sst + # degrees of freedom for residuals + dfe = res_i[0].df_resid + results.loc[feature_id, 'mse'] = sse / dfe + i+=1 + return results + + def percent_explained(self): + """ Proportion explained by each principal balance.""" + # Using sum of squares error calculation (df=1) + # instead of population variance (df=0). + axis_vars = np.var(self.balances, ddof=1, axis=0) + return axis_vars / axis_vars.sum() + diff --git a/gneiss/regression/tests/test_ols.py b/gneiss/regression/tests/test_ols.py index 4293773..5d28ffa 100644 --- a/gneiss/regression/tests/test_ols.py +++ b/gneiss/regression/tests/test_ols.py @@ -33,6 +33,20 @@ def setUp(self): 'real': [1, 2, 3, 4, 5] }, index=['s1', 's2', 's3', 's4', 's5']) + np.random.seed(0) + n = 15 + a = np.array([1, 4.2, 5.3, -2.2, 8]) + x1 = np.linspace(.01, 0.1, n) + x2 = np.logspace(0, 0.01, n) + x3 = np.exp(np.linspace(0, 0.01, n)) + x4 = x1 ** 2 + self.x = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'x4':x4}) + y = (a[0] + a[1]*x1 + a[2]*x2 + a[3]*x3 + a[4]*x4 + \ + (np.random.normal(size=n))) + sy = np.vstack((y, y/10)).T + self.y = pd.DataFrame(ilr_inv(sy), columns=['a', 'b', 'c']) + self.t2 = TreeNode.read([r"((a,b)n,c);"]) + def test_ols(self): res = ols('real', self.table, self.metadata, self.tree) res.fit() @@ -226,6 +240,81 @@ def test_summary_head(self): exp = fh.read() self.assertEqual(res, exp) + def test_fit_regularized(self): + res = ols(formula="x1 + x2 + x3 + x4", + table=self.y, metadata=self.x, tree=self.t2) + res.fit(regularized=True) + exp_coefs = pd.DataFrame({ + 'y0': [-2.074816e+09, -2.039271e+08, -1.843143e+08, + 2.261170e+09, -7.925699e+06], + 'y1': [-2.074816e+10, -2.039271e+09, -1.843143e+09, + 2.261170e+10, -7.925699e+07] + }, index=['Intercept', 'x1', 'x2', 'x3', 'x4']).T + # The L1 regularization either has numerical issues + # or is random ... 
+ res_coefs = res.coefficients() + pdt.assert_index_equal(exp_coefs.index, res_coefs.index) + pdt.assert_index_equal(exp_coefs.columns, res_coefs.columns) + + + def test_loo(self): + res = ols(formula="x1 + x2 + x3 + x4", + table=self.y, metadata=self.x, tree=self.t2) + res.fit() + exp_loo = pd.DataFrame([[0.66953263510975791, 10.994700550912553], + [0.69679777354984163, 2.3613911713947062], + [0.84934173316473072, 0.4057812892157881], + [0.6990546679957772, 2.2872776593899351], + [0.72855466737125463, 1.7615637744849277], + [0.55998953661859308, 3.617823652256889], + [0.81787392852582308, 0.72395497360494043], + [0.8653549732546999, 0.17706927499520822], + [0.86983181933002329, 0.1216027316667969], + [0.87779006612352628, 0.028600627330344405], + [0.86591226075609384, 0.16724511075065476], + [0.7787232221539, 1.2820054843437292], + [0.88032413856094505, 3.4113910096200831e-06], + [0.83195133809800792, 0.62276589277034022], + [0.85352707356786695, 1.4038585971691198]], + columns=['mse', 'pred_err'], + index=self.y.index) + res_loo = res.loo().astype(np.float) + pdt.assert_frame_equal(exp_loo, res_loo) + + # Make sure that the regularization works + exp_loo = pd.DataFrame([[0.967322, 0.167080], + [0.849595, 1.565647], + [0.975039, 0.021956], + [0.752424, 2.312112], + [0.820359, 1.605247], + [0.643426, 3.464196], + [0.953059, 0.252861], + [0.930367, 0.495480], + [0.945151, 0.336588], + [0.976778, 0.003095], + [0.971040, 0.061874], + [0.830764, 1.505806], + [0.963017, 0.151680], + [0.953350, 0.291705], + [0.970680, 0.107289]], + columns=['mse', 'pred_err'], + index=self.y.index) + res = ols(formula="x1 + x2 + x3 + x4", + table=self.y, metadata=self.x, tree=self.t2) + res.fit(regularized=True) + res_loo = res.loo(fit_regularized=True).astype(np.float) + + pdt.assert_frame_equal(exp_loo, res_loo, check_less_precise=True) + + def test_lovo(self): + self.assertTrue(False) + + def test_percent_explained(self): + self.assertTrue(False) + + def test_mse(self): + self.assertTrue(False) + if __name__ == "__main__": unittest.main() From bbb6a02fd939619f6d111c4ff74ee4acdb8d4d9a Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Wed, 25 Jan 2017 21:27:26 -0800 Subject: [PATCH 02/12] Adding lovo, percent_explained and mse tests --- gneiss/regression/tests/test_ols.py | 41 ++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/gneiss/regression/tests/test_ols.py b/gneiss/regression/tests/test_ols.py index 5d28ffa..68372fe 100644 --- a/gneiss/regression/tests/test_ols.py +++ b/gneiss/regression/tests/test_ols.py @@ -303,17 +303,50 @@ def test_loo(self): table=self.y, metadata=self.x, tree=self.t2) res.fit(regularized=True) res_loo = res.loo(fit_regularized=True).astype(np.float) - pdt.assert_frame_equal(exp_loo, res_loo, check_less_precise=True) def test_lovo(self): - self.assertTrue(False) + res = ols(formula="x1 + x2 + x3 + x4", + table=self.y, metadata=self.x, tree=self.t2) + res.fit() + exp_lovo = pd.DataFrame([[0.799364, 0.978214], + [0.799363, 0.097355], + [0.799368, 0.0973498], + [0.799364, 0.097354], + [0.799361, 0.0973575]], + columns=['mse', 'Rsquared'], + index=['Intercept', 'x1', 'x2', 'x3', 'x4']) + res_lovo = res.lovo().astype(np.float) + pdt.assert_frame_equal(exp_lovo, res_lovo) + + # Make sure that the regularization works + exp_lovo = pd.DataFrame([[0.799364, 0.978214], + [0.799363, 0.097355], + [0.799368, 0.0973498], + [0.799364, 0.097354], + [0.799361, 0.0973575]], + columns=['mse', 'Rsquared'], + index=['Intercept', 'x1', 'x2', 'x3', 'x4']) + res = 
ols(formula="x1 + x2 + x3 + x4", + table=self.y, metadata=self.x, tree=self.t2) + res.fit(regularized=True) + res_lovo = res.lovo(fit_regularized=True).astype(np.float) + pdt.assert_frame_equal(exp_lovo, res_lovo, check_less_precise=True) def test_percent_explained(self): - self.assertTrue(False) + res = ols(formula="x1 + x2 + x3 + x4", + table=self.y, metadata=self.x, tree=self.t2) + res.fit() + res_perc = res.percent_explained() + exp_perc = pd.Series({'y0': 0.009901, + 'y1': 0.990099}) + pdt.assert_series_equal(res_perc, exp_perc) def test_mse(self): - self.assertTrue(False) + res = ols(formula="x1 + x2 + x3 + x4", + table=self.y, metadata=self.x, tree=self.t2) + res.fit() + self.assertEquals(res.mse, 0.87936961790361856) if __name__ == "__main__": From c4f47fd940219e8b95cf2a706e9409c86633ee2a Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Wed, 25 Jan 2017 21:30:08 -0800 Subject: [PATCH 03/12] flake8 --- gneiss/regression/_ols.py | 15 +++++++-------- gneiss/regression/tests/test_ols.py | 7 +++---- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/gneiss/regression/_ols.py b/gneiss/regression/_ols.py index 1dcf970..fec7670 100644 --- a/gneiss/regression/_ols.py +++ b/gneiss/regression/_ols.py @@ -21,7 +21,6 @@ def _fit_ols(y, x, **kwargs): """ Perform the basic ols regression.""" - #one-time creation of exogenous data matrix allows for faster run-time exog_data = x submodels = [] for b in y.columns: @@ -32,6 +31,7 @@ def _fit_ols(y, x, **kwargs): submodels.append(mdf) return submodels + # TODO: Register as qiime 2 method def ols(formula, table, metadata, tree, **kwargs): """ Ordinary Least Squares applied to balances. @@ -172,6 +172,7 @@ def ols(formula, table, metadata, tree, **kwargs): tree) ilr_table, basis = _to_balances(table, tree) ilr_table, metadata = ilr_table.align(metadata, join='inner', axis=0) + # one-time creation of exogenous data matrix allows for faster run-time x = dmatrix(formula, metadata, return_type='dataframe') submodels = _fit_ols(ilr_table, x) return OLSModel(submodels, basis=basis, @@ -359,7 +360,7 @@ def loo(self, regularized=False, **kwargs): params[key] = kwargs.get(key, params[key]) kwargs.pop(key, None) - nobs = self.balances.shape[0] # number of observations (i.e. samples) + nobs = self.balances.shape[0] # number of observations (i.e. samples) cv_iter = LeaveOneOut(nobs) endog = self.balances exog_names = self.results[0].model.exog_names @@ -422,7 +423,7 @@ def lovo(self, **kwargs): kwargs.pop(key, None) - nobs = self.balances.shape[0] # number of observations (i.e. samples) + nobs = self.balances.shape[0] # number of observations (i.e. samples) endog = self.balances exog_names = self.results[0].model.exog_names @@ -442,10 +443,9 @@ def lovo(self, **kwargs): res_i = [r.fit(**kwargs) for r in res_i] # See `statsmodels.regression.linear_model.RegressionResults` # for more explanation on `ess` and `ssr`. - # sum of squares regression. Also referred to as - # explained sum of squares. + # sum of squares regression. ssr = sum([r.ess for r in res_i]) - # sum of squares error. Also referred to as sum of squares residuals + # sum of squares error. sse = sum([r.ssr for r in res_i]) # calculate the overall coefficient of determination (i.e. 
R2) sst = sse + ssr @@ -453,7 +453,7 @@ def lovo(self, **kwargs): # degrees of freedom for residuals dfe = res_i[0].df_resid results.loc[feature_id, 'mse'] = sse / dfe - i+=1 + i += 1 return results def percent_explained(self): @@ -462,4 +462,3 @@ def percent_explained(self): # instead of population variance (df=0). axis_vars = np.var(self.balances, ddof=1, axis=0) return axis_vars / axis_vars.sum() - diff --git a/gneiss/regression/tests/test_ols.py b/gneiss/regression/tests/test_ols.py index 68372fe..a878b9b 100644 --- a/gneiss/regression/tests/test_ols.py +++ b/gneiss/regression/tests/test_ols.py @@ -40,9 +40,9 @@ def setUp(self): x2 = np.logspace(0, 0.01, n) x3 = np.exp(np.linspace(0, 0.01, n)) x4 = x1 ** 2 - self.x = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'x4':x4}) - y = (a[0] + a[1]*x1 + a[2]*x2 + a[3]*x3 + a[4]*x4 + \ - (np.random.normal(size=n))) + self.x = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'x4': x4}) + y = (a[0] + a[1]*x1 + a[2]*x2 + a[3]*x3 + a[4]*x4 + + np.random.normal(size=n)) sy = np.vstack((y, y/10)).T self.y = pd.DataFrame(ilr_inv(sy), columns=['a', 'b', 'c']) self.t2 = TreeNode.read([r"((a,b)n,c);"]) @@ -256,7 +256,6 @@ def test_fit_regularized(self): pdt.assert_index_equal(exp_coefs.index, res_coefs.index) pdt.assert_index_equal(exp_coefs.columns, res_coefs.columns) - def test_loo(self): res = ols(formula="x1 + x2 + x3 + x4", table=self.y, metadata=self.x, tree=self.t2) From c4406081e3c7e1a7b7406c99d8d3fe586b9b7e3e Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Wed, 25 Jan 2017 21:43:16 -0800 Subject: [PATCH 04/12] flake8 --- gneiss/regression/_ols.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/gneiss/regression/_ols.py b/gneiss/regression/_ols.py index fec7670..b338594 100644 --- a/gneiss/regression/_ols.py +++ b/gneiss/regression/_ols.py @@ -16,7 +16,6 @@ from statsmodels.iolib.summary2 import Summary from statsmodels.sandbox.tools.cross_val import LeaveOneOut from patsy import dmatrix -import inspect def _fit_ols(y, x, **kwargs): @@ -423,8 +422,6 @@ def lovo(self, **kwargs): kwargs.pop(key, None) - nobs = self.balances.shape[0] # number of observations (i.e. 
samples) - endog = self.balances exog_names = self.results[0].model.exog_names exog = pd.DataFrame(self.results[0].model.exog, From 129f81ce290a456f2896bca72be06f6e621610f8 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Wed, 25 Jan 2017 21:47:24 -0800 Subject: [PATCH 05/12] fixing dumb numerical issues --- gneiss/regression/tests/test_ols.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gneiss/regression/tests/test_ols.py b/gneiss/regression/tests/test_ols.py index a878b9b..54b07b5 100644 --- a/gneiss/regression/tests/test_ols.py +++ b/gneiss/regression/tests/test_ols.py @@ -278,7 +278,7 @@ def test_loo(self): columns=['mse', 'pred_err'], index=self.y.index) res_loo = res.loo().astype(np.float) - pdt.assert_frame_equal(exp_loo, res_loo) + pdt.assert_frame_equal(exp_loo, res_loo, check_less_precise=True) # Make sure that the regularization works exp_loo = pd.DataFrame([[0.967322, 0.167080], @@ -316,7 +316,7 @@ def test_lovo(self): columns=['mse', 'Rsquared'], index=['Intercept', 'x1', 'x2', 'x3', 'x4']) res_lovo = res.lovo().astype(np.float) - pdt.assert_frame_equal(exp_lovo, res_lovo) + pdt.assert_frame_equal(exp_lovo, res_lovo, check_less_precise=True) # Make sure that the regularization works exp_lovo = pd.DataFrame([[0.799364, 0.978214], From be4633e77192e5725846562c14f06ca99982cb12 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Fri, 27 Jan 2017 14:13:46 -0800 Subject: [PATCH 06/12] Removing regularization Addressed @josenavas's comments --- gneiss/regression/_ols.py | 62 ++++++----------------------- gneiss/regression/tests/test_ols.py | 57 +------------------------- 2 files changed, 14 insertions(+), 105 deletions(-) diff --git a/gneiss/regression/_ols.py b/gneiss/regression/_ols.py index b338594..723e780 100644 --- a/gneiss/regression/_ols.py +++ b/gneiss/regression/_ols.py @@ -21,13 +21,9 @@ def _fit_ols(y, x, **kwargs): """ Perform the basic ols regression.""" exog_data = x - submodels = [] - for b in y.columns: - endog_data = y[b] - # mixed effects code is obtained here: - # http://stackoverflow.com/a/22439820/1167475 - mdf = smf.OLS(endog=endog_data, exog=exog_data, **kwargs) - submodels.append(mdf) + # mixed effects code is obtained here: + # http://stackoverflow.com/a/22439820/1167475 + submodels = [smf.OLS(endog=y[b], exog=x, **kwargs) for b in y.columns] return submodels @@ -222,13 +218,8 @@ def fit(self, regularized=False, **kwargs): Keyword arguments used to tune the parameter estimation. """ - for s in self.submodels: - # assumes that the underlying submodels have implemented `fit`. - if regularized: - m = s.fit(**kwargs) - else: - m = s.fit_regularized(**kwargs) - self.results.append(m) + # assumes that the underlying submodels have implemented `fit`. + self.results = [s.fit(**kwargs) for s in self.submodels] def summary(self, ndim=10): """ Summarize the Ordinary Least Squares Regression Results. @@ -298,7 +289,6 @@ def _format(x): smry.add_dict(info, ncols=1) smry.add_title("Simplicial Least Squares Results") smry.add_df(scores, align='r') - return smry @property @@ -306,7 +296,6 @@ def r2(self): """ Coefficient of determination for overall fit""" # Reason why we wanted to move this out was because not # all types of statsmodels regressions have this property. - # See `statsmodels.regression.linear_model.RegressionResults` # for more explanation on `ess` and `ssr`. # sum of squares regression. Also referred to as @@ -315,17 +304,17 @@ def r2(self): # sum of squares error. 
Also referred to as sum of squares residuals sse = sum([r.ssr for r in self.results]) # calculate the overall coefficient of determination (i.e. R2) - sst = sse + ssr return 1 - sse / sst @property def mse(self): + """ Mean Sum of squares Error""" sse = sum([r.ssr for r in self.results]) dfe = self.results[0].df_resid return sse / dfe - def loo(self, regularized=False, **kwargs): + def loo(self, **kwargs): """ Leave one out cross-validation. Calculates summary statistics for each iteraction of @@ -334,9 +323,6 @@ def loo(self, regularized=False, **kwargs): Parameters ---------- - regularized : bool - Specifies if a regularization procedure should be used - when performing the fit. **kwargs : dict Keyword arguments used to tune the parameter estimation. @@ -353,11 +339,8 @@ def loo(self, regularized=False, **kwargs): See Also -------- fit + statsmodels.regression.linear_model. """ - params = {'fit_regularized': False} - for key in params: - params[key] = kwargs.get(key, params[key]) - kwargs.pop(key, None) nobs = self.balances.shape[0] # number of observations (i.e. samples) cv_iter = LeaveOneOut(nobs) @@ -368,14 +351,11 @@ def loo(self, regularized=False, **kwargs): columns=exog_names) results = pd.DataFrame(index=self.balances.index, columns=['mse', 'pred_err']) - i = 0 - for inidx, outidx in cv_iter: + + for i, (inidx, outidx) in enumerate(cv_iter): sample_id = self.balances.index[i] res_i = _fit_ols(y=endog.loc[inidx], x=exog.loc[inidx], **kwargs) - if params['fit_regularized']: - res_i = [r.fit_regularized(**kwargs) for r in res_i] - else: - res_i = [r.fit(**kwargs) for r in res_i] + res_i = [r.fit(**kwargs) for r in res_i] # mean sum of squares error sse = sum([r.ssr for r in res_i]) @@ -388,7 +368,6 @@ def loo(self, regularized=False, **kwargs): pred_sse = np.sum((predicted - self.balances.loc[outidx])**2) results.loc[sample_id, 'pred_err'] = pred_sse.sum() - i += 1 return results def lovo(self, **kwargs): @@ -400,9 +379,6 @@ def lovo(self, **kwargs): Parameters ---------- - regularized : bool - Specifies if a regularization procedure should be used - when performing the fit. **kwargs : dict Keyword arguments used to tune the parameter estimation. @@ -415,13 +391,6 @@ def lovo(self, **kwargs): Mean sum of squares error for each iteration of the cross validation. """ - - params = {'regularized': False} - for key in params: - params[key] = kwargs.get(key, params[key]) - - kwargs.pop(key, None) - endog = self.balances exog_names = self.results[0].model.exog_names exog = pd.DataFrame(self.results[0].model.exog, @@ -430,14 +399,10 @@ def lovo(self, **kwargs): cv_iter = LeaveOneOut(len(exog_names)) results = pd.DataFrame(index=exog_names, columns=['mse', 'Rsquared']) - i = 0 - for inidx, outidx in cv_iter: + for i, (inidx, outidx) in enumerate(cv_iter): feature_id = exog_names[i] res_i = _fit_ols(endog, exog.loc[:, inidx], **kwargs) - if params['regularized']: - res_i = [r.fit_regularized(**kwargs) for r in res_i] - else: - res_i = [r.fit(**kwargs) for r in res_i] + res_i = [r.fit(**kwargs) for r in res_i] # See `statsmodels.regression.linear_model.RegressionResults` # for more explanation on `ess` and `ssr`. # sum of squares regression. 
@@ -450,7 +415,6 @@ def lovo(self, **kwargs): # degrees of freedom for residuals dfe = res_i[0].df_resid results.loc[feature_id, 'mse'] = sse / dfe - i += 1 return results def percent_explained(self): diff --git a/gneiss/regression/tests/test_ols.py b/gneiss/regression/tests/test_ols.py index 54b07b5..975b147 100644 --- a/gneiss/regression/tests/test_ols.py +++ b/gneiss/regression/tests/test_ols.py @@ -240,22 +240,6 @@ def test_summary_head(self): exp = fh.read() self.assertEqual(res, exp) - def test_fit_regularized(self): - res = ols(formula="x1 + x2 + x3 + x4", - table=self.y, metadata=self.x, tree=self.t2) - res.fit(regularized=True) - exp_coefs = pd.DataFrame({ - 'y0': [-2.074816e+09, -2.039271e+08, -1.843143e+08, - 2.261170e+09, -7.925699e+06], - 'y1': [-2.074816e+10, -2.039271e+09, -1.843143e+09, - 2.261170e+10, -7.925699e+07] - }, index=['Intercept', 'x1', 'x2', 'x3', 'x4']).T - # The L1 regularization either has numerical issues - # or is random ... - res_coefs = res.coefficients() - pdt.assert_index_equal(exp_coefs.index, res_coefs.index) - pdt.assert_index_equal(exp_coefs.columns, res_coefs.columns) - def test_loo(self): res = ols(formula="x1 + x2 + x3 + x4", table=self.y, metadata=self.x, tree=self.t2) @@ -280,30 +264,6 @@ def test_loo(self): res_loo = res.loo().astype(np.float) pdt.assert_frame_equal(exp_loo, res_loo, check_less_precise=True) - # Make sure that the regularization works - exp_loo = pd.DataFrame([[0.967322, 0.167080], - [0.849595, 1.565647], - [0.975039, 0.021956], - [0.752424, 2.312112], - [0.820359, 1.605247], - [0.643426, 3.464196], - [0.953059, 0.252861], - [0.930367, 0.495480], - [0.945151, 0.336588], - [0.976778, 0.003095], - [0.971040, 0.061874], - [0.830764, 1.505806], - [0.963017, 0.151680], - [0.953350, 0.291705], - [0.970680, 0.107289]], - columns=['mse', 'pred_err'], - index=self.y.index) - res = ols(formula="x1 + x2 + x3 + x4", - table=self.y, metadata=self.x, tree=self.t2) - res.fit(regularized=True) - res_loo = res.loo(fit_regularized=True).astype(np.float) - pdt.assert_frame_equal(exp_loo, res_loo, check_less_precise=True) - def test_lovo(self): res = ols(formula="x1 + x2 + x3 + x4", table=self.y, metadata=self.x, tree=self.t2) @@ -318,20 +278,6 @@ def test_lovo(self): res_lovo = res.lovo().astype(np.float) pdt.assert_frame_equal(exp_lovo, res_lovo, check_less_precise=True) - # Make sure that the regularization works - exp_lovo = pd.DataFrame([[0.799364, 0.978214], - [0.799363, 0.097355], - [0.799368, 0.0973498], - [0.799364, 0.097354], - [0.799361, 0.0973575]], - columns=['mse', 'Rsquared'], - index=['Intercept', 'x1', 'x2', 'x3', 'x4']) - res = ols(formula="x1 + x2 + x3 + x4", - table=self.y, metadata=self.x, tree=self.t2) - res.fit(regularized=True) - res_lovo = res.lovo(fit_regularized=True).astype(np.float) - pdt.assert_frame_equal(exp_lovo, res_lovo, check_less_precise=True) - def test_percent_explained(self): res = ols(formula="x1 + x2 + x3 + x4", table=self.y, metadata=self.x, tree=self.t2) @@ -345,8 +291,7 @@ def test_mse(self): res = ols(formula="x1 + x2 + x3 + x4", table=self.y, metadata=self.x, tree=self.t2) res.fit() - self.assertEquals(res.mse, 0.87936961790361856) - + self.assertEquals(res.mse, 0.79228890379010453) if __name__ == "__main__": unittest.main() From 823eccde47279a11bb422d4e4a10b2210e51b73a Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Fri, 27 Jan 2017 14:14:27 -0800 Subject: [PATCH 07/12] flake9 --- gneiss/regression/_ols.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gneiss/regression/_ols.py 
b/gneiss/regression/_ols.py index 723e780..4ef448b 100644 --- a/gneiss/regression/_ols.py +++ b/gneiss/regression/_ols.py @@ -20,7 +20,6 @@ def _fit_ols(y, x, **kwargs): """ Perform the basic ols regression.""" - exog_data = x # mixed effects code is obtained here: # http://stackoverflow.com/a/22439820/1167475 submodels = [smf.OLS(endog=y[b], exog=x, **kwargs) for b in y.columns] From cc227a8a5a2160951e07919f8dd505711d26454e Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Fri, 27 Jan 2017 14:39:03 -0800 Subject: [PATCH 08/12] fixing precision in test --- gneiss/regression/tests/test_ols.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gneiss/regression/tests/test_ols.py b/gneiss/regression/tests/test_ols.py index 975b147..d3e64b7 100644 --- a/gneiss/regression/tests/test_ols.py +++ b/gneiss/regression/tests/test_ols.py @@ -291,7 +291,7 @@ def test_mse(self): res = ols(formula="x1 + x2 + x3 + x4", table=self.y, metadata=self.x, tree=self.t2) res.fit() - self.assertEquals(res.mse, 0.79228890379010453) + self.assertAlmostEqual(res.mse, 0.79228890379010453, ) if __name__ == "__main__": unittest.main() From ae536328b7b23a4ecc7443ba227815ce1042f93c Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Fri, 27 Jan 2017 14:49:20 -0800 Subject: [PATCH 09/12] Bumping down precision --- gneiss/regression/tests/test_ols.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gneiss/regression/tests/test_ols.py b/gneiss/regression/tests/test_ols.py index d3e64b7..0bee878 100644 --- a/gneiss/regression/tests/test_ols.py +++ b/gneiss/regression/tests/test_ols.py @@ -291,7 +291,7 @@ def test_mse(self): res = ols(formula="x1 + x2 + x3 + x4", table=self.y, metadata=self.x, tree=self.t2) res.fit() - self.assertAlmostEqual(res.mse, 0.79228890379010453, ) + self.assertAlmostEqual(res.mse, 0.79228890379010453, places=4) if __name__ == "__main__": unittest.main() From 353f1b7bb7d51a501e9ad44986eda9226c86b8f3 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Tue, 7 Feb 2017 23:10:21 -0800 Subject: [PATCH 10/12] Update _ols.py --- gneiss/regression/_ols.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gneiss/regression/_ols.py b/gneiss/regression/_ols.py index 4ef448b..7649ddc 100644 --- a/gneiss/regression/_ols.py +++ b/gneiss/regression/_ols.py @@ -22,8 +22,7 @@ def _fit_ols(y, x, **kwargs): """ Perform the basic ols regression.""" # mixed effects code is obtained here: # http://stackoverflow.com/a/22439820/1167475 - submodels = [smf.OLS(endog=y[b], exog=x, **kwargs) for b in y.columns] - return submodels + return [smf.OLS(endog=y[b], exog=x, **kwargs) for b in y.columns] # TODO: Register as qiime 2 method From f2bd9fb483f9249c102c6f92a6dd9863d9a32ca6 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Tue, 7 Feb 2017 23:11:41 -0800 Subject: [PATCH 11/12] renaming first res_i to model_i --- gneiss/regression/_ols.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gneiss/regression/_ols.py b/gneiss/regression/_ols.py index 7649ddc..cd2e869 100644 --- a/gneiss/regression/_ols.py +++ b/gneiss/regression/_ols.py @@ -352,8 +352,8 @@ def loo(self, **kwargs): for i, (inidx, outidx) in enumerate(cv_iter): sample_id = self.balances.index[i] - res_i = _fit_ols(y=endog.loc[inidx], x=exog.loc[inidx], **kwargs) - res_i = [r.fit(**kwargs) for r in res_i] + model_i = _fit_ols(y=endog.loc[inidx], x=exog.loc[inidx], **kwargs) + res_i = [r.fit(**kwargs) for r in model_i] # mean sum of squares error sse = sum([r.ssr for r in res_i]) From 
89d781077d6baf75c3b0df5cd0f1dde4432da230 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Tue, 7 Feb 2017 23:11:58 -0800 Subject: [PATCH 12/12] renaming first res_i to model_i
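
Reviewer note (not part of the patches): a minimal usage sketch of the diagnostics this series adds to OLSModel — the `mse` property, `loo()`, `lovo()`, and `percent_explained()`. It reuses the synthetic fixture from `test_ols.py`; the import locations for `TreeNode` and `ilr_inv` are assumed to be the usual scikit-bio ones, and `ols` is imported from the module path shown in the diff.

    # Illustrative sketch only; mirrors the test fixture in
    # gneiss/regression/tests/test_ols.py. Assumed imports: scikit-bio's
    # TreeNode and ilr_inv, and the ols() constructor patched in this series.
    import numpy as np
    import pandas as pd
    from skbio import TreeNode
    from skbio.stats.composition import ilr_inv
    from gneiss.regression._ols import ols

    np.random.seed(0)
    n = 15
    a = np.array([1, 4.2, 5.3, -2.2, 8])
    x1 = np.linspace(0.01, 0.1, n)
    x2 = np.logspace(0, 0.01, n)
    x3 = np.exp(np.linspace(0, 0.01, n))
    x4 = x1 ** 2
    metadata = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'x4': x4})

    # Build two correlated balances and map them back to a 3-part
    # composition whose parts match the tree tips a, b, c.
    y = a[0] + a[1]*x1 + a[2]*x2 + a[3]*x3 + a[4]*x4 + np.random.normal(size=n)
    table = pd.DataFrame(ilr_inv(np.vstack((y, y / 10)).T),
                         columns=['a', 'b', 'c'])
    tree = TreeNode.read([r"((a,b)n,c);"])

    res = ols(formula="x1 + x2 + x3 + x4",
              table=table, metadata=metadata, tree=tree)
    res.fit()

    print(res.mse)                  # overall mean squared error of the fit
    print(res.loo().head())         # per-sample leave-one-out 'mse' / 'pred_err'
    print(res.lovo())               # per-covariate leave-one-variable-out 'mse' / 'Rsquared'
    print(res.percent_explained())  # proportion of variance carried by each balance

As the patched docstrings describe, `loo()` refits the model with one sample held out and reports both the refit mean squared error and the squared prediction error on the held-out point, while `lovo()` refits with one design-matrix column dropped so the change in R-squared can be read as a rough feature-importance signal.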