diff --git a/gneiss/_summary.py b/gneiss/_summary.py index a39d504..2bbf326 100644 --- a/gneiss/_summary.py +++ b/gneiss/_summary.py @@ -15,10 +15,11 @@ class RegressionResults(): """ Summary object for storing regression results. """ - def __init__(self, stat_results, + def __init__(self, + stat_results, feature_names=None, basis=None): - """ Reorganizes statsmodels regression modules. + """ Reorganizes statsmodels regression results modules. Accepts a list of statsmodels RegressionResults objects and performs some addition summary statistics. @@ -78,11 +79,11 @@ def _check_projection(self, project): """ if self.basis is None and project: raise ValueError("Cannot perform projection into Aitchison simplex" - "if `basis` is not specified.") + " if `basis` is not specified.") if self.feature_names is None and project: raise ValueError("Cannot perform projection into Aitchison simplex" - "if `feature_names` is not specified.") + " if `feature_names` is not specified.") def coefficients(self, project=False): """ Returns coefficients from fit. @@ -91,7 +92,7 @@ def coefficients(self, project=False): ---------- project : bool, optional Specifies if coefficients should be projected back into - the Aitchison simplex. If false, the coefficients will be + the Aitchison simplex [1]_. If false, the coefficients will be represented as balances (default: False). Returns @@ -109,6 +110,11 @@ def coefficients(self, project=False): ValueError: Cannot perform projection into Aitchison simplex if `feature_names` is not specified. + + References + ---------- + .. [1] Aitchison, J. "A concise guide to compositional data analysis, + CDA work." Girona 24 (2003): 73-81. """ self._check_projection(project) coef = pd.DataFrame() @@ -128,3 +134,102 @@ def coefficients(self, project=False): columns=coef.columns) else: return coef + + def residuals(self, project=False): + """ Returns calculated residuals. + + Parameters + ---------- + X : pd.DataFrame, optional + Input table of covariates. If not specified, then the + fitted values calculated from training the model will be + returned. + project : bool, optional + Specifies if coefficients should be projected back into + the Aitchison simplex [1]_. If false, the coefficients will be + represented as balances (default: False). + Returns + ------- + pd.DataFrame + A table of values where rows are samples, and the columns + are either balances or proportions, depending on the value of + `project`. + + References + ---------- + .. [1] Aitchison, J. "A concise guide to compositional data analysis, + CDA work." Girona 24 (2003): 73-81. + """ + self._check_projection(project) + + resid = pd.DataFrame() + + for r in self.results: + err = r.resid + err.name = r.model.endog_names + resid = resid.append(err) + + if project: + # `check=False`, due to a problem with error handling + # addressed here https://github.com/biocore/scikit-bio/pull/1396 + # This will need to be fixed here: + # https://github.com/biocore/gneiss/issues/34 + proj_resid = ilr_inv(resid.values.T, basis=self.basis, + check=False).T + return pd.DataFrame(proj_resid, index=self.feature_names, + columns=resid.columns).T + else: + return resid.T + + def predict(self, X=None, project=False, **kwargs): + """ Performs a prediction based on model. + + Parameters + ---------- + X : pd.DataFrame, optional + Input table of covariates, where columns are covariates, and + rows are samples. If not specified, then the fitted values + calculated from training the model will be returned. + project : bool, optional + Specifies if coefficients should be projected back into + the Aitchison simplex [1]_. If false, the coefficients will be + represented as balances (default: False). + **kwargs : dict + Other arguments to be passed into the model prediction. + + Returns + ------- + pd.DataFrame + A table of values where rows are coefficients, and the columns + are either balances or proportions, depending on the value of + `project`. + + References + ---------- + .. [1] Aitchison, J. "A concise guide to compositional data analysis, + CDA work." Girona 24 (2003): 73-81. + """ + self._check_projection(project) + + prediction = pd.DataFrame() + for m in self.results: + # check if X is none. + p = pd.Series(m.predict(X, **kwargs)) + p.name = m.model.endog_names + if X is not None: + p.index = X.index + else: + p.index = m.fittedvalues.index + prediction = prediction.append(p) + + if project: + # `check=False`, due to a problem with error handling + # addressed here https://github.com/biocore/scikit-bio/pull/1396 + # This will need to be fixed here: + # https://github.com/biocore/gneiss/issues/34 + proj_prediction = ilr_inv(prediction.values.T, basis=self.basis, + check=False) + return pd.DataFrame(proj_prediction, + columns=self.feature_names, + index=prediction.columns) + return prediction.T diff --git a/gneiss/tests/test_summary.py b/gneiss/tests/test_summary.py index ff4932d..84f6523 100644 --- a/gneiss/tests/test_summary.py +++ b/gneiss/tests/test_summary.py @@ -19,12 +19,16 @@ class TestRegressionResults(unittest.TestCase): def setUp(self): - self.data = pd.DataFrame([[1, 3, 4, 5, 2, 3, 4], - list(range(1, 8)), - [1, 3, 2, 4, 3, 5, 4]], - columns=['s1', 's2', 's3', 's4', - 's5', 's6', 's7'], - index=['Y1', 'Y2', 'X']).T + self.data = pd.DataFrame([[1, 1, 1], + [3, 2, 3], + [4, 3, 2], + [5, 4, 4], + [2, 5, 3], + [3, 6, 5], + [4, 7, 4]], + index=['s1', 's2', 's3', 's4', + 's5', 's6', 's7'], + columns=['Y1', 'Y2', 'X']) model1 = smf.ols(formula="Y1 ~ X", data=self.data) model2 = smf.ols(formula="Y2 ~ X", data=self.data) self.results = [model1.fit(), model2.fit()] @@ -104,5 +108,100 @@ def test_regression_results_coefficient_project_error(self): with self.assertRaises(ValueError): res.coefficients(project=True) + def test_regression_results_residuals_projection(self): + A = np.array # aliasing np.array for the sake of pep8 + exp_resid = pd.DataFrame({'s1': ilr_inv(A([-0.986842, -0.236842])), + 's2': ilr_inv(A([-0.065789, -1.815789])), + 's3': ilr_inv(A([1.473684, 0.473684])), + 's4': ilr_inv(A([1.394737, -1.105263])), + 's5': ilr_inv(A([-1.065789, 1.184211])), + 's6': ilr_inv(A([-1.144737, -0.394737])), + 's7': ilr_inv(A([0.394737, 1.894737]))}, + index=['Z1', 'Z2', 'Z3']).T + feature_names = ['Z1', 'Z2', 'Z3'] + basis = _gram_schmidt_basis(3) + res = RegressionResults(self.results, basis=basis, + feature_names=feature_names) + pdt.assert_frame_equal(res.residuals(project=True), exp_resid, + check_exact=False, + check_less_precise=True) + + def test_regression_results_residuals(self): + exp_resid = pd.DataFrame({'s1': [-0.986842, -0.236842], + 's2': [-0.065789, -1.815789], + 's3': [1.473684, 0.473684], + 's4': [1.394737, -1.105263], + 's5': [-1.065789, 1.184211], + 's6': [-1.144737, -0.394737], + 's7': [0.394737, 1.894737]}, + index=['Y1', 'Y2']).T + res = RegressionResults(self.results) + pdt.assert_frame_equal(res.residuals(), exp_resid, + check_exact=False, + check_less_precise=True) + + def test_regression_results_predict(self): + model = RegressionResults(self.results) + res_predict = model.predict(self.data[['X']]) + + exp_predict = pd.DataFrame({'s1': [1.986842, 1.236842], + 's2': [3.065789, 3.815789], + 's3': [2.526316, 2.526316], + 's4': [3.605263, 5.105263], + 's5': [3.065789, 3.815789], + 's6': [4.144737, 6.394737], + 's7': [3.605263, 5.105263]}, + index=['Y1', 'Y2']).T + + pdt.assert_frame_equal(res_predict, exp_predict) + + def test_regression_results_predict_extrapolate(self): + model = RegressionResults(self.results) + extrapolate = pd.DataFrame({'X': [8, 9, 10]}, + index=['k1', 'k2', 'k3']) + res_predict = model.predict(extrapolate) + + exp_predict = pd.DataFrame({'k1': [5.76315789, 10.26315789], + 'k2': [6.30263158, 11.55263158], + 'k3': [6.84210526, 12.84210526]}, + index=['Y1', 'Y2']).T + + pdt.assert_frame_equal(res_predict, exp_predict) + + def test_regression_results_predict_projection(self): + feature_names = ['Z1', 'Z2', 'Z3'] + basis = _gram_schmidt_basis(3) + model = RegressionResults(self.results, basis=basis, + feature_names=feature_names) + + res_predict = model.predict(self.data[['X']], project=True) + A = np.array # aliasing np.array for the sake of pep8 + exp_predict = pd.DataFrame({'s1': ilr_inv(A([1.986842, 1.236842])), + 's2': ilr_inv(A([3.065789, 3.815789])), + 's3': ilr_inv(A([2.526316, 2.526316])), + 's4': ilr_inv(A([3.605263, 5.105263])), + 's5': ilr_inv(A([3.065789, 3.815789])), + 's6': ilr_inv(A([4.144737, 6.394737])), + 's7': ilr_inv(A([3.605263, 5.105263]))}, + index=feature_names).T + + pdt.assert_frame_equal(res_predict, exp_predict) + + def test_regression_results_predict_none(self): + model = RegressionResults(self.results) + res_predict = model.predict() + + exp_predict = pd.DataFrame({'s1': [1.986842, 1.236842], + 's2': [3.065789, 3.815789], + 's3': [2.526316, 2.526316], + 's4': [3.605263, 5.105263], + 's5': [3.065789, 3.815789], + 's6': [4.144737, 6.394737], + 's7': [3.605263, 5.105263]}, + index=['Y1', 'Y2']).T + + pdt.assert_frame_equal(res_predict, exp_predict) + + if __name__ == "__main__": unittest.main() diff --git a/ipynb/balance_trees.ipynb b/ipynb/balance_trees.ipynb index 13d41be..16fcd27 100644 --- a/ipynb/balance_trees.ipynb +++ b/ipynb/balance_trees.ipynb @@ -459,7 +459,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" } }, "nbformat": 4,