From ddbcf1ce366a83fb8da6141e8461bfa818ad38cd Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Wed, 25 Jan 2017 21:15:19 -0800 Subject: [PATCH 01/12] Adding test cases for loo --- gneiss/regression/_ols.py | 197 +++++++++++++++++++++++++--- gneiss/regression/tests/test_ols.py | 89 +++++++++++++ 2 files changed, 270 insertions(+), 16 deletions(-) diff --git a/gneiss/regression/_ols.py b/gneiss/regression/_ols.py index d3e39bc..1dcf970 100644 --- a/gneiss/regression/_ols.py +++ b/gneiss/regression/_ols.py @@ -7,14 +7,30 @@ # ---------------------------------------------------------------------------- from decimal import Decimal from collections import OrderedDict - +import numpy as np import pandas as pd from gneiss.regression._model import RegressionModel from ._regression import (_intersect_of_table_metadata_tree, _to_balances) import statsmodels.formula.api as smf from statsmodels.iolib.summary2 import Summary +from statsmodels.sandbox.tools.cross_val import LeaveOneOut +from patsy import dmatrix +import inspect + +def _fit_ols(y, x, **kwargs): + """ Perform the basic ols regression.""" + #one-time creation of exogenous data matrix allows for faster run-time + exog_data = x + submodels = [] + for b in y.columns: + endog_data = y[b] + # mixed effects code is obtained here: + # http://stackoverflow.com/a/22439820/1167475 + mdf = smf.OLS(endog=endog_data, exog=exog_data, **kwargs) + submodels.append(mdf) + return submodels # TODO: Register as qiime 2 method def ols(formula, table, metadata, tree, **kwargs): @@ -155,18 +171,9 @@ def ols(formula, table, metadata, tree, **kwargs): metadata, tree) ilr_table, basis = _to_balances(table, tree) - data = pd.merge(ilr_table, metadata, left_index=True, right_index=True) - - submodels = [] - - for b in ilr_table.columns: - # mixed effects code is obtained here: - # http://stackoverflow.com/a/22439820/1167475 - stats_formula = '%s ~ %s' % (b, formula) - - mdf = smf.ols(stats_formula, data=data, **kwargs) - submodels.append(mdf) - + ilr_table, metadata = ilr_table.align(metadata, join='inner', axis=0) + x = dmatrix(formula, metadata, return_type='dataframe') + submodels = _fit_ols(ilr_table, x) return OLSModel(submodels, basis=basis, balances=ilr_table, tree=tree) @@ -203,11 +210,24 @@ def __init__(self, *args, **kwargs): """ super().__init__(*args, **kwargs) - def fit(self, **kwargs): - """ Fit the model """ + def fit(self, regularized=False, **kwargs): + """ Fit the model. + + Parameters + ---------- + regularized : bool + Specifies if a regularization procedure should be used + when performing the fit. (default = False) + **kwargs : dict + Keyword arguments used to tune the parameter estimation. + + """ for s in self.submodels: # assumes that the underlying submodels have implemented `fit`. - m = s.fit(**kwargs) + if regularized: + m = s.fit(**kwargs) + else: + m = s.fit_regularized(**kwargs) self.results.append(m) def summary(self, ndim=10): @@ -298,3 +318,148 @@ def r2(self): sst = sse + ssr return 1 - sse / sst + + @property + def mse(self): + sse = sum([r.ssr for r in self.results]) + dfe = self.results[0].df_resid + return sse / dfe + + def loo(self, regularized=False, **kwargs): + """ Leave one out cross-validation. + + Calculates summary statistics for each iteraction of + leave one out cross-validation, specially `mse` on entire model + and `pred_err` to measure prediction error. + + Parameters + ---------- + regularized : bool + Specifies if a regularization procedure should be used + when performing the fit. 
+ **kwargs : dict + Keyword arguments used to tune the parameter estimation. + + Returns + ------- + pd.DataFrame + mse : np.array, float + Mean sum of squares error for each iteration of + the cross validation. + pred_err : np.array, float + Prediction mean sum of squares error for each iteration of + the cross validation. + + See Also + -------- + fit + """ + params = {'fit_regularized': False} + for key in params: + params[key] = kwargs.get(key, params[key]) + kwargs.pop(key, None) + + nobs = self.balances.shape[0] # number of observations (i.e. samples) + cv_iter = LeaveOneOut(nobs) + endog = self.balances + exog_names = self.results[0].model.exog_names + exog = pd.DataFrame(self.results[0].model.exog, + index=self.balances.index, + columns=exog_names) + results = pd.DataFrame(index=self.balances.index, + columns=['mse', 'pred_err']) + i = 0 + for inidx, outidx in cv_iter: + sample_id = self.balances.index[i] + res_i = _fit_ols(y=endog.loc[inidx], x=exog.loc[inidx], **kwargs) + if params['fit_regularized']: + res_i = [r.fit_regularized(**kwargs) for r in res_i] + else: + res_i = [r.fit(**kwargs) for r in res_i] + + # mean sum of squares error + sse = sum([r.ssr for r in res_i]) + # degrees of freedom for residuals + dfe = res_i[0].df_resid + results.loc[sample_id, 'mse'] = sse / dfe + + # prediction error on loo point + predicted = np.hstack([r.predict(exog.loc[outidx]) for r in res_i]) + + pred_sse = np.sum((predicted - self.balances.loc[outidx])**2) + results.loc[sample_id, 'pred_err'] = pred_sse.sum() + i += 1 + return results + + def lovo(self, **kwargs): + """ Leave one variable out cross-validation. + + Calculates summary statistics for each iteraction of leave one variable + out cross-validation, specially `r2` and `mse` on entire model. + This technique is particularly useful for feature selection. + + Parameters + ---------- + regularized : bool + Specifies if a regularization procedure should be used + when performing the fit. + **kwargs : dict + Keyword arguments used to tune the parameter estimation. + + Returns + ------- + pd.DataFrame + Rsquared : np.array, flot + Coefficient of determination for each variable left out. + mse : np.array, float + Mean sum of squares error for each iteration of + the cross validation. + """ + + params = {'regularized': False} + for key in params: + params[key] = kwargs.get(key, params[key]) + + kwargs.pop(key, None) + + nobs = self.balances.shape[0] # number of observations (i.e. samples) + + endog = self.balances + exog_names = self.results[0].model.exog_names + exog = pd.DataFrame(self.results[0].model.exog, + index=self.balances.index, + columns=exog_names) + cv_iter = LeaveOneOut(len(exog_names)) + results = pd.DataFrame(index=exog_names, + columns=['mse', 'Rsquared']) + i = 0 + for inidx, outidx in cv_iter: + feature_id = exog_names[i] + res_i = _fit_ols(endog, exog.loc[:, inidx], **kwargs) + if params['regularized']: + res_i = [r.fit_regularized(**kwargs) for r in res_i] + else: + res_i = [r.fit(**kwargs) for r in res_i] + # See `statsmodels.regression.linear_model.RegressionResults` + # for more explanation on `ess` and `ssr`. + # sum of squares regression. Also referred to as + # explained sum of squares. + ssr = sum([r.ess for r in res_i]) + # sum of squares error. Also referred to as sum of squares residuals + sse = sum([r.ssr for r in res_i]) + # calculate the overall coefficient of determination (i.e. 
R2) + sst = sse + ssr + results.loc[feature_id, 'Rsquared'] = 1 - sse / sst + # degrees of freedom for residuals + dfe = res_i[0].df_resid + results.loc[feature_id, 'mse'] = sse / dfe + i+=1 + return results + + def percent_explained(self): + """ Proportion explained by each principal balance.""" + # Using sum of squares error calculation (df=1) + # instead of population variance (df=0). + axis_vars = np.var(self.balances, ddof=1, axis=0) + return axis_vars / axis_vars.sum() + diff --git a/gneiss/regression/tests/test_ols.py b/gneiss/regression/tests/test_ols.py index 4293773..5d28ffa 100644 --- a/gneiss/regression/tests/test_ols.py +++ b/gneiss/regression/tests/test_ols.py @@ -33,6 +33,20 @@ def setUp(self): 'real': [1, 2, 3, 4, 5] }, index=['s1', 's2', 's3', 's4', 's5']) + np.random.seed(0) + n = 15 + a = np.array([1, 4.2, 5.3, -2.2, 8]) + x1 = np.linspace(.01, 0.1, n) + x2 = np.logspace(0, 0.01, n) + x3 = np.exp(np.linspace(0, 0.01, n)) + x4 = x1 ** 2 + self.x = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'x4':x4}) + y = (a[0] + a[1]*x1 + a[2]*x2 + a[3]*x3 + a[4]*x4 + \ + (np.random.normal(size=n))) + sy = np.vstack((y, y/10)).T + self.y = pd.DataFrame(ilr_inv(sy), columns=['a', 'b', 'c']) + self.t2 = TreeNode.read([r"((a,b)n,c);"]) + def test_ols(self): res = ols('real', self.table, self.metadata, self.tree) res.fit() @@ -226,6 +240,81 @@ def test_summary_head(self): exp = fh.read() self.assertEqual(res, exp) + def test_fit_regularized(self): + res = ols(formula="x1 + x2 + x3 + x4", + table=self.y, metadata=self.x, tree=self.t2) + res.fit(regularized=True) + exp_coefs = pd.DataFrame({ + 'y0': [-2.074816e+09, -2.039271e+08, -1.843143e+08, + 2.261170e+09, -7.925699e+06], + 'y1': [-2.074816e+10, -2.039271e+09, -1.843143e+09, + 2.261170e+10, -7.925699e+07] + }, index=['Intercept', 'x1', 'x2', 'x3', 'x4']).T + # The L1 regularization either has numerical issues + # or is random ... 
+ res_coefs = res.coefficients() + pdt.assert_index_equal(exp_coefs.index, res_coefs.index) + pdt.assert_index_equal(exp_coefs.columns, res_coefs.columns) + + + def test_loo(self): + res = ols(formula="x1 + x2 + x3 + x4", + table=self.y, metadata=self.x, tree=self.t2) + res.fit() + exp_loo = pd.DataFrame([[0.66953263510975791, 10.994700550912553], + [0.69679777354984163, 2.3613911713947062], + [0.84934173316473072, 0.4057812892157881], + [0.6990546679957772, 2.2872776593899351], + [0.72855466737125463, 1.7615637744849277], + [0.55998953661859308, 3.617823652256889], + [0.81787392852582308, 0.72395497360494043], + [0.8653549732546999, 0.17706927499520822], + [0.86983181933002329, 0.1216027316667969], + [0.87779006612352628, 0.028600627330344405], + [0.86591226075609384, 0.16724511075065476], + [0.7787232221539, 1.2820054843437292], + [0.88032413856094505, 3.4113910096200831e-06], + [0.83195133809800792, 0.62276589277034022], + [0.85352707356786695, 1.4038585971691198]], + columns=['mse', 'pred_err'], + index=self.y.index) + res_loo = res.loo().astype(np.float) + pdt.assert_frame_equal(exp_loo, res_loo) + + # Make sure that the regularization works + exp_loo = pd.DataFrame([[0.967322, 0.167080], + [0.849595, 1.565647], + [0.975039, 0.021956], + [0.752424, 2.312112], + [0.820359, 1.605247], + [0.643426, 3.464196], + [0.953059, 0.252861], + [0.930367, 0.495480], + [0.945151, 0.336588], + [0.976778, 0.003095], + [0.971040, 0.061874], + [0.830764, 1.505806], + [0.963017, 0.151680], + [0.953350, 0.291705], + [0.970680, 0.107289]], + columns=['mse', 'pred_err'], + index=self.y.index) + res = ols(formula="x1 + x2 + x3 + x4", + table=self.y, metadata=self.x, tree=self.t2) + res.fit(regularized=True) + res_loo = res.loo(fit_regularized=True).astype(np.float) + + pdt.assert_frame_equal(exp_loo, res_loo, check_less_precise=True) + + def test_lovo(self): + self.assertTrue(False) + + def test_percent_explained(self): + self.assertTrue(False) + + def test_mse(self): + self.assertTrue(False) + if __name__ == "__main__": unittest.main() From bbb6a02fd939619f6d111c4ff74ee4acdb8d4d9a Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Wed, 25 Jan 2017 21:27:26 -0800 Subject: [PATCH 02/12] Adding lovo, percent_explained and mse tests --- gneiss/regression/tests/test_ols.py | 41 ++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/gneiss/regression/tests/test_ols.py b/gneiss/regression/tests/test_ols.py index 5d28ffa..68372fe 100644 --- a/gneiss/regression/tests/test_ols.py +++ b/gneiss/regression/tests/test_ols.py @@ -303,17 +303,50 @@ def test_loo(self): table=self.y, metadata=self.x, tree=self.t2) res.fit(regularized=True) res_loo = res.loo(fit_regularized=True).astype(np.float) - pdt.assert_frame_equal(exp_loo, res_loo, check_less_precise=True) def test_lovo(self): - self.assertTrue(False) + res = ols(formula="x1 + x2 + x3 + x4", + table=self.y, metadata=self.x, tree=self.t2) + res.fit() + exp_lovo = pd.DataFrame([[0.799364, 0.978214], + [0.799363, 0.097355], + [0.799368, 0.0973498], + [0.799364, 0.097354], + [0.799361, 0.0973575]], + columns=['mse', 'Rsquared'], + index=['Intercept', 'x1', 'x2', 'x3', 'x4']) + res_lovo = res.lovo().astype(np.float) + pdt.assert_frame_equal(exp_lovo, res_lovo) + + # Make sure that the regularization works + exp_lovo = pd.DataFrame([[0.799364, 0.978214], + [0.799363, 0.097355], + [0.799368, 0.0973498], + [0.799364, 0.097354], + [0.799361, 0.0973575]], + columns=['mse', 'Rsquared'], + index=['Intercept', 'x1', 'x2', 'x3', 'x4']) + res = 
ols(formula="x1 + x2 + x3 + x4", + table=self.y, metadata=self.x, tree=self.t2) + res.fit(regularized=True) + res_lovo = res.lovo(fit_regularized=True).astype(np.float) + pdt.assert_frame_equal(exp_lovo, res_lovo, check_less_precise=True) def test_percent_explained(self): - self.assertTrue(False) + res = ols(formula="x1 + x2 + x3 + x4", + table=self.y, metadata=self.x, tree=self.t2) + res.fit() + res_perc = res.percent_explained() + exp_perc = pd.Series({'y0': 0.009901, + 'y1': 0.990099}) + pdt.assert_series_equal(res_perc, exp_perc) def test_mse(self): - self.assertTrue(False) + res = ols(formula="x1 + x2 + x3 + x4", + table=self.y, metadata=self.x, tree=self.t2) + res.fit() + self.assertEquals(res.mse, 0.87936961790361856) if __name__ == "__main__": From c4f47fd940219e8b95cf2a706e9409c86633ee2a Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Wed, 25 Jan 2017 21:30:08 -0800 Subject: [PATCH 03/12] flake8 --- gneiss/regression/_ols.py | 15 +++++++-------- gneiss/regression/tests/test_ols.py | 7 +++---- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/gneiss/regression/_ols.py b/gneiss/regression/_ols.py index 1dcf970..fec7670 100644 --- a/gneiss/regression/_ols.py +++ b/gneiss/regression/_ols.py @@ -21,7 +21,6 @@ def _fit_ols(y, x, **kwargs): """ Perform the basic ols regression.""" - #one-time creation of exogenous data matrix allows for faster run-time exog_data = x submodels = [] for b in y.columns: @@ -32,6 +31,7 @@ def _fit_ols(y, x, **kwargs): submodels.append(mdf) return submodels + # TODO: Register as qiime 2 method def ols(formula, table, metadata, tree, **kwargs): """ Ordinary Least Squares applied to balances. @@ -172,6 +172,7 @@ def ols(formula, table, metadata, tree, **kwargs): tree) ilr_table, basis = _to_balances(table, tree) ilr_table, metadata = ilr_table.align(metadata, join='inner', axis=0) + # one-time creation of exogenous data matrix allows for faster run-time x = dmatrix(formula, metadata, return_type='dataframe') submodels = _fit_ols(ilr_table, x) return OLSModel(submodels, basis=basis, @@ -359,7 +360,7 @@ def loo(self, regularized=False, **kwargs): params[key] = kwargs.get(key, params[key]) kwargs.pop(key, None) - nobs = self.balances.shape[0] # number of observations (i.e. samples) + nobs = self.balances.shape[0] # number of observations (i.e. samples) cv_iter = LeaveOneOut(nobs) endog = self.balances exog_names = self.results[0].model.exog_names @@ -422,7 +423,7 @@ def lovo(self, **kwargs): kwargs.pop(key, None) - nobs = self.balances.shape[0] # number of observations (i.e. samples) + nobs = self.balances.shape[0] # number of observations (i.e. samples) endog = self.balances exog_names = self.results[0].model.exog_names @@ -442,10 +443,9 @@ def lovo(self, **kwargs): res_i = [r.fit(**kwargs) for r in res_i] # See `statsmodels.regression.linear_model.RegressionResults` # for more explanation on `ess` and `ssr`. - # sum of squares regression. Also referred to as - # explained sum of squares. + # sum of squares regression. ssr = sum([r.ess for r in res_i]) - # sum of squares error. Also referred to as sum of squares residuals + # sum of squares error. sse = sum([r.ssr for r in res_i]) # calculate the overall coefficient of determination (i.e. 
R2) sst = sse + ssr @@ -453,7 +453,7 @@ def lovo(self, **kwargs): # degrees of freedom for residuals dfe = res_i[0].df_resid results.loc[feature_id, 'mse'] = sse / dfe - i+=1 + i += 1 return results def percent_explained(self): @@ -462,4 +462,3 @@ def percent_explained(self): # instead of population variance (df=0). axis_vars = np.var(self.balances, ddof=1, axis=0) return axis_vars / axis_vars.sum() - diff --git a/gneiss/regression/tests/test_ols.py b/gneiss/regression/tests/test_ols.py index 68372fe..a878b9b 100644 --- a/gneiss/regression/tests/test_ols.py +++ b/gneiss/regression/tests/test_ols.py @@ -40,9 +40,9 @@ def setUp(self): x2 = np.logspace(0, 0.01, n) x3 = np.exp(np.linspace(0, 0.01, n)) x4 = x1 ** 2 - self.x = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'x4':x4}) - y = (a[0] + a[1]*x1 + a[2]*x2 + a[3]*x3 + a[4]*x4 + \ - (np.random.normal(size=n))) + self.x = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'x4': x4}) + y = (a[0] + a[1]*x1 + a[2]*x2 + a[3]*x3 + a[4]*x4 + + np.random.normal(size=n)) sy = np.vstack((y, y/10)).T self.y = pd.DataFrame(ilr_inv(sy), columns=['a', 'b', 'c']) self.t2 = TreeNode.read([r"((a,b)n,c);"]) @@ -256,7 +256,6 @@ def test_fit_regularized(self): pdt.assert_index_equal(exp_coefs.index, res_coefs.index) pdt.assert_index_equal(exp_coefs.columns, res_coefs.columns) - def test_loo(self): res = ols(formula="x1 + x2 + x3 + x4", table=self.y, metadata=self.x, tree=self.t2) From c4406081e3c7e1a7b7406c99d8d3fe586b9b7e3e Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Wed, 25 Jan 2017 21:43:16 -0800 Subject: [PATCH 04/12] flake8 --- gneiss/regression/_ols.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/gneiss/regression/_ols.py b/gneiss/regression/_ols.py index fec7670..b338594 100644 --- a/gneiss/regression/_ols.py +++ b/gneiss/regression/_ols.py @@ -16,7 +16,6 @@ from statsmodels.iolib.summary2 import Summary from statsmodels.sandbox.tools.cross_val import LeaveOneOut from patsy import dmatrix -import inspect def _fit_ols(y, x, **kwargs): @@ -423,8 +422,6 @@ def lovo(self, **kwargs): kwargs.pop(key, None) - nobs = self.balances.shape[0] # number of observations (i.e. 
samples) - endog = self.balances exog_names = self.results[0].model.exog_names exog = pd.DataFrame(self.results[0].model.exog, From 129f81ce290a456f2896bca72be06f6e621610f8 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Wed, 25 Jan 2017 21:47:24 -0800 Subject: [PATCH 05/12] fixing dumb numerical issues --- gneiss/regression/tests/test_ols.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gneiss/regression/tests/test_ols.py b/gneiss/regression/tests/test_ols.py index a878b9b..54b07b5 100644 --- a/gneiss/regression/tests/test_ols.py +++ b/gneiss/regression/tests/test_ols.py @@ -278,7 +278,7 @@ def test_loo(self): columns=['mse', 'pred_err'], index=self.y.index) res_loo = res.loo().astype(np.float) - pdt.assert_frame_equal(exp_loo, res_loo) + pdt.assert_frame_equal(exp_loo, res_loo, check_less_precise=True) # Make sure that the regularization works exp_loo = pd.DataFrame([[0.967322, 0.167080], @@ -316,7 +316,7 @@ def test_lovo(self): columns=['mse', 'Rsquared'], index=['Intercept', 'x1', 'x2', 'x3', 'x4']) res_lovo = res.lovo().astype(np.float) - pdt.assert_frame_equal(exp_lovo, res_lovo) + pdt.assert_frame_equal(exp_lovo, res_lovo, check_less_precise=True) # Make sure that the regularization works exp_lovo = pd.DataFrame([[0.799364, 0.978214], From be4633e77192e5725846562c14f06ca99982cb12 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Fri, 27 Jan 2017 14:13:46 -0800 Subject: [PATCH 06/12] Removing regularization Addressed @josenavas's comments --- gneiss/regression/_ols.py | 62 ++++++----------------------- gneiss/regression/tests/test_ols.py | 57 +------------------------- 2 files changed, 14 insertions(+), 105 deletions(-) diff --git a/gneiss/regression/_ols.py b/gneiss/regression/_ols.py index b338594..723e780 100644 --- a/gneiss/regression/_ols.py +++ b/gneiss/regression/_ols.py @@ -21,13 +21,9 @@ def _fit_ols(y, x, **kwargs): """ Perform the basic ols regression.""" exog_data = x - submodels = [] - for b in y.columns: - endog_data = y[b] - # mixed effects code is obtained here: - # http://stackoverflow.com/a/22439820/1167475 - mdf = smf.OLS(endog=endog_data, exog=exog_data, **kwargs) - submodels.append(mdf) + # mixed effects code is obtained here: + # http://stackoverflow.com/a/22439820/1167475 + submodels = [smf.OLS(endog=y[b], exog=x, **kwargs) for b in y.columns] return submodels @@ -222,13 +218,8 @@ def fit(self, regularized=False, **kwargs): Keyword arguments used to tune the parameter estimation. """ - for s in self.submodels: - # assumes that the underlying submodels have implemented `fit`. - if regularized: - m = s.fit(**kwargs) - else: - m = s.fit_regularized(**kwargs) - self.results.append(m) + # assumes that the underlying submodels have implemented `fit`. + self.results = [s.fit(**kwargs) for s in self.submodels] def summary(self, ndim=10): """ Summarize the Ordinary Least Squares Regression Results. @@ -298,7 +289,6 @@ def _format(x): smry.add_dict(info, ncols=1) smry.add_title("Simplicial Least Squares Results") smry.add_df(scores, align='r') - return smry @property @@ -306,7 +296,6 @@ def r2(self): """ Coefficient of determination for overall fit""" # Reason why we wanted to move this out was because not # all types of statsmodels regressions have this property. - # See `statsmodels.regression.linear_model.RegressionResults` # for more explanation on `ess` and `ssr`. # sum of squares regression. Also referred to as @@ -315,17 +304,17 @@ def r2(self): # sum of squares error. 
Also referred to as sum of squares residuals sse = sum([r.ssr for r in self.results]) # calculate the overall coefficient of determination (i.e. R2) - sst = sse + ssr return 1 - sse / sst @property def mse(self): + """ Mean Sum of squares Error""" sse = sum([r.ssr for r in self.results]) dfe = self.results[0].df_resid return sse / dfe - def loo(self, regularized=False, **kwargs): + def loo(self, **kwargs): """ Leave one out cross-validation. Calculates summary statistics for each iteraction of @@ -334,9 +323,6 @@ def loo(self, regularized=False, **kwargs): Parameters ---------- - regularized : bool - Specifies if a regularization procedure should be used - when performing the fit. **kwargs : dict Keyword arguments used to tune the parameter estimation. @@ -353,11 +339,8 @@ def loo(self, regularized=False, **kwargs): See Also -------- fit + statsmodels.regression.linear_model. """ - params = {'fit_regularized': False} - for key in params: - params[key] = kwargs.get(key, params[key]) - kwargs.pop(key, None) nobs = self.balances.shape[0] # number of observations (i.e. samples) cv_iter = LeaveOneOut(nobs) @@ -368,14 +351,11 @@ def loo(self, regularized=False, **kwargs): columns=exog_names) results = pd.DataFrame(index=self.balances.index, columns=['mse', 'pred_err']) - i = 0 - for inidx, outidx in cv_iter: + + for i, (inidx, outidx) in enumerate(cv_iter): sample_id = self.balances.index[i] res_i = _fit_ols(y=endog.loc[inidx], x=exog.loc[inidx], **kwargs) - if params['fit_regularized']: - res_i = [r.fit_regularized(**kwargs) for r in res_i] - else: - res_i = [r.fit(**kwargs) for r in res_i] + res_i = [r.fit(**kwargs) for r in res_i] # mean sum of squares error sse = sum([r.ssr for r in res_i]) @@ -388,7 +368,6 @@ def loo(self, regularized=False, **kwargs): pred_sse = np.sum((predicted - self.balances.loc[outidx])**2) results.loc[sample_id, 'pred_err'] = pred_sse.sum() - i += 1 return results def lovo(self, **kwargs): @@ -400,9 +379,6 @@ def lovo(self, **kwargs): Parameters ---------- - regularized : bool - Specifies if a regularization procedure should be used - when performing the fit. **kwargs : dict Keyword arguments used to tune the parameter estimation. @@ -415,13 +391,6 @@ def lovo(self, **kwargs): Mean sum of squares error for each iteration of the cross validation. """ - - params = {'regularized': False} - for key in params: - params[key] = kwargs.get(key, params[key]) - - kwargs.pop(key, None) - endog = self.balances exog_names = self.results[0].model.exog_names exog = pd.DataFrame(self.results[0].model.exog, @@ -430,14 +399,10 @@ def lovo(self, **kwargs): cv_iter = LeaveOneOut(len(exog_names)) results = pd.DataFrame(index=exog_names, columns=['mse', 'Rsquared']) - i = 0 - for inidx, outidx in cv_iter: + for i, (inidx, outidx) in enumerate(cv_iter): feature_id = exog_names[i] res_i = _fit_ols(endog, exog.loc[:, inidx], **kwargs) - if params['regularized']: - res_i = [r.fit_regularized(**kwargs) for r in res_i] - else: - res_i = [r.fit(**kwargs) for r in res_i] + res_i = [r.fit(**kwargs) for r in res_i] # See `statsmodels.regression.linear_model.RegressionResults` # for more explanation on `ess` and `ssr`. # sum of squares regression. 
@@ -450,7 +415,6 @@ def lovo(self, **kwargs): # degrees of freedom for residuals dfe = res_i[0].df_resid results.loc[feature_id, 'mse'] = sse / dfe - i += 1 return results def percent_explained(self): diff --git a/gneiss/regression/tests/test_ols.py b/gneiss/regression/tests/test_ols.py index 54b07b5..975b147 100644 --- a/gneiss/regression/tests/test_ols.py +++ b/gneiss/regression/tests/test_ols.py @@ -240,22 +240,6 @@ def test_summary_head(self): exp = fh.read() self.assertEqual(res, exp) - def test_fit_regularized(self): - res = ols(formula="x1 + x2 + x3 + x4", - table=self.y, metadata=self.x, tree=self.t2) - res.fit(regularized=True) - exp_coefs = pd.DataFrame({ - 'y0': [-2.074816e+09, -2.039271e+08, -1.843143e+08, - 2.261170e+09, -7.925699e+06], - 'y1': [-2.074816e+10, -2.039271e+09, -1.843143e+09, - 2.261170e+10, -7.925699e+07] - }, index=['Intercept', 'x1', 'x2', 'x3', 'x4']).T - # The L1 regularization either has numerical issues - # or is random ... - res_coefs = res.coefficients() - pdt.assert_index_equal(exp_coefs.index, res_coefs.index) - pdt.assert_index_equal(exp_coefs.columns, res_coefs.columns) - def test_loo(self): res = ols(formula="x1 + x2 + x3 + x4", table=self.y, metadata=self.x, tree=self.t2) @@ -280,30 +264,6 @@ def test_loo(self): res_loo = res.loo().astype(np.float) pdt.assert_frame_equal(exp_loo, res_loo, check_less_precise=True) - # Make sure that the regularization works - exp_loo = pd.DataFrame([[0.967322, 0.167080], - [0.849595, 1.565647], - [0.975039, 0.021956], - [0.752424, 2.312112], - [0.820359, 1.605247], - [0.643426, 3.464196], - [0.953059, 0.252861], - [0.930367, 0.495480], - [0.945151, 0.336588], - [0.976778, 0.003095], - [0.971040, 0.061874], - [0.830764, 1.505806], - [0.963017, 0.151680], - [0.953350, 0.291705], - [0.970680, 0.107289]], - columns=['mse', 'pred_err'], - index=self.y.index) - res = ols(formula="x1 + x2 + x3 + x4", - table=self.y, metadata=self.x, tree=self.t2) - res.fit(regularized=True) - res_loo = res.loo(fit_regularized=True).astype(np.float) - pdt.assert_frame_equal(exp_loo, res_loo, check_less_precise=True) - def test_lovo(self): res = ols(formula="x1 + x2 + x3 + x4", table=self.y, metadata=self.x, tree=self.t2) @@ -318,20 +278,6 @@ def test_lovo(self): res_lovo = res.lovo().astype(np.float) pdt.assert_frame_equal(exp_lovo, res_lovo, check_less_precise=True) - # Make sure that the regularization works - exp_lovo = pd.DataFrame([[0.799364, 0.978214], - [0.799363, 0.097355], - [0.799368, 0.0973498], - [0.799364, 0.097354], - [0.799361, 0.0973575]], - columns=['mse', 'Rsquared'], - index=['Intercept', 'x1', 'x2', 'x3', 'x4']) - res = ols(formula="x1 + x2 + x3 + x4", - table=self.y, metadata=self.x, tree=self.t2) - res.fit(regularized=True) - res_lovo = res.lovo(fit_regularized=True).astype(np.float) - pdt.assert_frame_equal(exp_lovo, res_lovo, check_less_precise=True) - def test_percent_explained(self): res = ols(formula="x1 + x2 + x3 + x4", table=self.y, metadata=self.x, tree=self.t2) @@ -345,8 +291,7 @@ def test_mse(self): res = ols(formula="x1 + x2 + x3 + x4", table=self.y, metadata=self.x, tree=self.t2) res.fit() - self.assertEquals(res.mse, 0.87936961790361856) - + self.assertEquals(res.mse, 0.79228890379010453) if __name__ == "__main__": unittest.main() From 823eccde47279a11bb422d4e4a10b2210e51b73a Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Fri, 27 Jan 2017 14:14:27 -0800 Subject: [PATCH 07/12] flake9 --- gneiss/regression/_ols.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gneiss/regression/_ols.py 
b/gneiss/regression/_ols.py index 723e780..4ef448b 100644 --- a/gneiss/regression/_ols.py +++ b/gneiss/regression/_ols.py @@ -20,7 +20,6 @@ def _fit_ols(y, x, **kwargs): """ Perform the basic ols regression.""" - exog_data = x # mixed effects code is obtained here: # http://stackoverflow.com/a/22439820/1167475 submodels = [smf.OLS(endog=y[b], exog=x, **kwargs) for b in y.columns] From cc227a8a5a2160951e07919f8dd505711d26454e Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Fri, 27 Jan 2017 14:39:03 -0800 Subject: [PATCH 08/12] fixing precision in test --- gneiss/regression/tests/test_ols.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gneiss/regression/tests/test_ols.py b/gneiss/regression/tests/test_ols.py index 975b147..d3e64b7 100644 --- a/gneiss/regression/tests/test_ols.py +++ b/gneiss/regression/tests/test_ols.py @@ -291,7 +291,7 @@ def test_mse(self): res = ols(formula="x1 + x2 + x3 + x4", table=self.y, metadata=self.x, tree=self.t2) res.fit() - self.assertEquals(res.mse, 0.79228890379010453) + self.assertAlmostEqual(res.mse, 0.79228890379010453, ) if __name__ == "__main__": unittest.main() From ae536328b7b23a4ecc7443ba227815ce1042f93c Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Fri, 27 Jan 2017 14:49:20 -0800 Subject: [PATCH 09/12] Bumping down precision --- gneiss/regression/tests/test_ols.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gneiss/regression/tests/test_ols.py b/gneiss/regression/tests/test_ols.py index d3e64b7..0bee878 100644 --- a/gneiss/regression/tests/test_ols.py +++ b/gneiss/regression/tests/test_ols.py @@ -291,7 +291,7 @@ def test_mse(self): res = ols(formula="x1 + x2 + x3 + x4", table=self.y, metadata=self.x, tree=self.t2) res.fit() - self.assertAlmostEqual(res.mse, 0.79228890379010453, ) + self.assertAlmostEqual(res.mse, 0.79228890379010453, places=4) if __name__ == "__main__": unittest.main() From 353f1b7bb7d51a501e9ad44986eda9226c86b8f3 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Tue, 7 Feb 2017 23:10:21 -0800 Subject: [PATCH 10/12] Update _ols.py --- gneiss/regression/_ols.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gneiss/regression/_ols.py b/gneiss/regression/_ols.py index 4ef448b..7649ddc 100644 --- a/gneiss/regression/_ols.py +++ b/gneiss/regression/_ols.py @@ -22,8 +22,7 @@ def _fit_ols(y, x, **kwargs): """ Perform the basic ols regression.""" # mixed effects code is obtained here: # http://stackoverflow.com/a/22439820/1167475 - submodels = [smf.OLS(endog=y[b], exog=x, **kwargs) for b in y.columns] - return submodels + return [smf.OLS(endog=y[b], exog=x, **kwargs) for b in y.columns] # TODO: Register as qiime 2 method From f2bd9fb483f9249c102c6f92a6dd9863d9a32ca6 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Tue, 7 Feb 2017 23:11:41 -0800 Subject: [PATCH 11/12] renaming first res_i to model_i --- gneiss/regression/_ols.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gneiss/regression/_ols.py b/gneiss/regression/_ols.py index 7649ddc..cd2e869 100644 --- a/gneiss/regression/_ols.py +++ b/gneiss/regression/_ols.py @@ -352,8 +352,8 @@ def loo(self, **kwargs): for i, (inidx, outidx) in enumerate(cv_iter): sample_id = self.balances.index[i] - res_i = _fit_ols(y=endog.loc[inidx], x=exog.loc[inidx], **kwargs) - res_i = [r.fit(**kwargs) for r in res_i] + model_i = _fit_ols(y=endog.loc[inidx], x=exog.loc[inidx], **kwargs) + res_i = [r.fit(**kwargs) for r in model_i] # mean sum of squares error sse = sum([r.ssr for r in res_i]) From 
89d781077d6baf75c3b0df5cd0f1dde4432da230 Mon Sep 17 00:00:00 2001 From: Jamie Morton Date: Tue, 7 Feb 2017 23:11:58 -0800 Subject: [PATCH 12/12] renaming first res_i to model_i
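
Reviewer note (not part of the patches): a minimal usage sketch of the diagnostics this series adds to OLSModel — the `mse` property, `loo()`, `lovo()`, and `percent_explained()`. It reuses the synthetic fixture from `test_ols.py`; the import locations for `TreeNode` and `ilr_inv` are assumed to be the usual scikit-bio ones, and `ols` is imported from the module path shown in the diff.

    # Illustrative sketch only; mirrors the test fixture in
    # gneiss/regression/tests/test_ols.py. Assumed imports: scikit-bio's
    # TreeNode and ilr_inv, and the ols() constructor patched in this series.
    import numpy as np
    import pandas as pd
    from skbio import TreeNode
    from skbio.stats.composition import ilr_inv
    from gneiss.regression._ols import ols

    np.random.seed(0)
    n = 15
    a = np.array([1, 4.2, 5.3, -2.2, 8])
    x1 = np.linspace(0.01, 0.1, n)
    x2 = np.logspace(0, 0.01, n)
    x3 = np.exp(np.linspace(0, 0.01, n))
    x4 = x1 ** 2
    metadata = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'x4': x4})

    # Build two correlated balances and map them back to a 3-part
    # composition whose parts match the tree tips a, b, c.
    y = a[0] + a[1]*x1 + a[2]*x2 + a[3]*x3 + a[4]*x4 + np.random.normal(size=n)
    table = pd.DataFrame(ilr_inv(np.vstack((y, y / 10)).T),
                         columns=['a', 'b', 'c'])
    tree = TreeNode.read([r"((a,b)n,c);"])

    res = ols(formula="x1 + x2 + x3 + x4",
              table=table, metadata=metadata, tree=tree)
    res.fit()

    print(res.mse)                  # overall mean squared error of the fit
    print(res.loo().head())         # per-sample leave-one-out 'mse' / 'pred_err'
    print(res.lovo())               # per-covariate leave-one-variable-out 'mse' / 'Rsquared'
    print(res.percent_explained())  # proportion of variance carried by each balance

As the patched docstrings describe, `loo()` refits the model with one sample held out and reports both the refit mean squared error and the squared prediction error on the held-out point, while `lovo()` refits with one design-matrix column dropped so the change in R-squared can be read as a rough feature-importance signal.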