From 245bb3a07287a71f166f2ce7880c41f808df6ed2 Mon Sep 17 00:00:00 2001 From: Ddelval Date: Mon, 8 May 2023 01:11:18 +0200 Subject: [PATCH 01/46] Initial PLS implementation --- skfda/ml/regression/__init__.py | 2 + skfda/ml/regression/_fpls_regression.py | 141 +++++ skfda/preprocessing/dim_reduction/__init__.py | 2 + skfda/preprocessing/dim_reduction/_fpls.py | 494 ++++++++++++++++++ skfda/tests/test_FPLSRegression.py | 337 ++++++++++++ .../test_fda_usc_no_reg_data.npy | Bin 0 -> 4128 bytes .../test_fda_usc_reg_data.npy | Bin 0 -> 4128 bytes 7 files changed, 976 insertions(+) create mode 100644 skfda/ml/regression/_fpls_regression.py create mode 100644 skfda/preprocessing/dim_reduction/_fpls.py create mode 100644 skfda/tests/test_FPLSRegression.py create mode 100644 skfda/tests/test_FPLSRegression_data/test_fda_usc_no_reg_data.npy create mode 100644 skfda/tests/test_FPLSRegression_data/test_fda_usc_reg_data.npy diff --git a/skfda/ml/regression/__init__.py b/skfda/ml/regression/__init__.py index d01c3fdc4..73082f296 100644 --- a/skfda/ml/regression/__init__.py +++ b/skfda/ml/regression/__init__.py @@ -10,6 +10,7 @@ "_kernel_regression": ["KernelRegression"], "_linear_regression": ["LinearRegression"], "_fpca_regression": ["FPCARegression"], + "_fpls_regression": ["FPLSRegression"], "_neighbors_regression": [ "KNeighborsRegressor", "RadiusNeighborsRegressor", @@ -19,6 +20,7 @@ if TYPE_CHECKING: from ._fpca_regression import FPCARegression + from ._fpls_regression import FPLSRegression from ._historical_linear_model import ( HistoricalLinearRegression as HistoricalLinearRegression, ) diff --git a/skfda/ml/regression/_fpls_regression.py b/skfda/ml/regression/_fpls_regression.py new file mode 100644 index 000000000..cab8bddfe --- /dev/null +++ b/skfda/ml/regression/_fpls_regression.py @@ -0,0 +1,141 @@ +from __future__ import annotations + +from typing import TypeVar, Union + +import multimethod +from sklearn.utils.validation import check_is_fitted + +from ..._utils._sklearn_adapter import BaseEstimator, RegressorMixin +from ...misc.regularization import L2Regularization +from ...preprocessing.dim_reduction import FPLS +from ...representation import FData, FDataGrid +from ...representation.basis import Basis, FDataBasis +from ...typing._numpy import NDArrayFloat + +FPLSRegressionSelf = TypeVar("FPLSRegressionSelf", bound="FPLSRegression") + +Input = Union[FData, NDArrayFloat] + + +class FPLSRegression( + BaseEstimator, + RegressorMixin[Input, Input], +): + """ + Regression using Functional Partial Least Squares. + + Args: + n_components: Number of components to keep. Defaults to 5. + ... + """ + + def __init__( + self, + n_components: int = 5, + scale: bool = False, + integration_weights_X: NDArrayFloat | None = None, + integration_weights_Y: NDArrayFloat | None = None, + regularization_X: L2Regularization | None = None, + weight_basis_X: Basis | None = None, + weight_basis_Y: Basis | None = None, + ) -> None: + self.n_components = n_components + self.scale = scale + self.integration_weights_X = integration_weights_X + self.integration_weights_Y = integration_weights_Y + self.regularization_X = regularization_X + self.weight_basis_X = weight_basis_X + self.weight_basis_Y = weight_basis_Y + + @multimethod.multidispatch + def _configure_x(self, X): + self.y_predictor = lambda X: X @ self.fpls_.G_xw @ self.coef_ + return X + + @_configure_x.register + def _configure_x_grid(self, X: FDataGrid): + self.y_predictor = ( + lambda X: X.data_matrix[..., 0] @ self.fpls_.G_xw @ self.coef_ + ) + + @_configure_x.register + def _configure_x_basis(self, X: FDataBasis): + self.y_predictor = ( + lambda X: X.coefficients @ self.fpls_.G_xw @ self.coef_ + ) + + @multimethod.multidispatch + def _configure_y(self, Y): + self.postprocess_response = lambda y_data: y_data + + @_configure_y.register + def _configure_y_grid(self, Y: FDataGrid): + self.postprocess_response = lambda y_data: Y.copy( + data_matrix=y_data, + sample_names=(None,) * y_data.shape[0], + ) + + @_configure_y.register + def _configure_y_basis(self, Y: FDataBasis): + self.postprocess_response = lambda y_data: Y.copy( + coefficients=y_data, + sample_names=(None,) * y_data.shape[0], + ) + + def fit( + self, + X: FData, + y: NDArrayFloat, + ) -> FPLSRegressionSelf: + """Fit the model using X as training data and y as target values.""" + # Center and scale data + self.x_mean = X.mean(axis=0) + self.y_mean = y.mean(axis=0) + if self.scale: + self.x_std = X.std() + self.y_std = y.std() + else: + self.x_std = 1 + self.y_std = 1 + + X = (X - self.x_mean) / self.x_std + y = (y - self.y_mean) / self.y_std + + self._configure_x(X) + self._configure_y(y) + + self.fpls_ = FPLS( + n_components=self.n_components, + scale=False, + integration_weights_X=self.integration_weights_X, + integration_weights_Y=self.integration_weights_Y, + regularization_X=self.regularization_X, + weight_basis_X=self.weight_basis_X, + weight_basis_Y=self.weight_basis_Y, + deflation_mode="reg", + ) + + self.fpls_.fit(X, y) + + self.coef_ = self.fpls_.x_rotations_ @ self.fpls_.y_loadings_.T + + return self + + def predict(self, X: FData) -> NDArrayFloat: + """Predict using the model. + + Args: + X: FData to predict. + + Returns: + Predicted values. + + """ + check_is_fitted(self) + + X = (X - self.x_mean) / self.x_std + + y_scaled = self.y_predictor(X) + y_scaled = self.postprocess_response(y_scaled) + + return y_scaled * self.y_std + self.y_mean diff --git a/skfda/preprocessing/dim_reduction/__init__.py b/skfda/preprocessing/dim_reduction/__init__.py index 79df95a6b..b9fef1bb8 100644 --- a/skfda/preprocessing/dim_reduction/__init__.py +++ b/skfda/preprocessing/dim_reduction/__init__.py @@ -13,11 +13,13 @@ ], submod_attrs={ "_fpca": ["FPCA"], + "_fpls": ["FPLS"], }, ) if TYPE_CHECKING: from ._fpca import FPCA as FPCA + from ._fpls import FPLS as FPLS def __getattr__(name: str) -> Any: diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py new file mode 100644 index 000000000..75ae8a09f --- /dev/null +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -0,0 +1,494 @@ +from __future__ import annotations + +from typing import Tuple, TypeVar, Union, Optional + +import multimethod +import numpy as np +from sklearn.utils.validation import check_is_fitted + +from skfda.misc.hat_matrix import Input + +from ..._utils._sklearn_adapter import BaseEstimator, InductiveTransformerMixin +from ...misc.regularization import L2Regularization, compute_penalty_matrix +from ...representation import FData, FDataGrid +from ...representation.basis import Basis, FDataBasis, _GridBasis +from ...typing._numpy import NDArrayFloat + +POWER_SOLVER_EPS = 1e-15 + + +def _power_solver(X): + t = X[:, 0] + t_prev = np.ones(t.shape) * np.max(t) * 2 + iter_count = 0 + while np.linalg.norm(t - t_prev) > POWER_SOLVER_EPS: + t_prev = t + t = X @ t + t /= np.linalg.norm(t) + iter_count += 1 + if iter_count > 1000: + break + return t + + +def _calculate_weights( + X, + Y, + G_ww, + G_xw, + G_cc, + G_yc, + L_X_inv, + L_Y_inv, +): + X = X @ G_xw @ L_X_inv.T + Y = Y @ G_yc @ L_Y_inv.T + S = X.T @ Y + w = _power_solver(S @ S.T) + + # Calculate the other weight + c = np.dot(Y.T, np.dot(X, w)) + + # Undo the transformation + w = L_X_inv.T @ w + + # Normalize + w /= np.sqrt(np.dot(w.T, G_ww @ w)) + + # Undo the transformation + c = L_Y_inv.T @ c + + # Normalize the other weight + c /= np.sqrt(np.dot(c.T, G_cc @ c)) + + return w, c + + +def _pls_nipals( + X, + Y, + n_components, + G_ww, + G_xw, + G_cc, + G_yc, + L_X_inv, + L_Y_inv, + deflation="reg", +): + X = X.copy() + Y = Y.copy() + X = X - np.mean(X, axis=0) + Y = Y - np.mean(Y, axis=0) + + if len(Y.shape) == 1: + Y = Y[:, np.newaxis] + + W, C = [], [] + T, U = [], [] + P, Q = [], [] + for _ in range(n_components): + w, c = _calculate_weights( + X, + Y, + G_ww=G_ww, + G_xw=G_xw, + G_cc=G_cc, + G_yc=G_yc, + L_X_inv=L_X_inv, + L_Y_inv=L_Y_inv, + ) + + t = np.dot(X @ G_xw, w) + u = np.dot(Y @ G_yc, c) + + p = np.dot(X.T, t) / np.dot(t.T, t) + + y_proyection = t if deflation == "reg" else u + + q = np.dot(Y.T, y_proyection) / np.dot(y_proyection, y_proyection) + + X = X - np.outer(t, p) + Y = Y - np.outer(y_proyection, q) + + W.append(w) + C.append(c) + T.append(t) + U.append(u) + P.append(p) + Q.append(q) + + W = np.array(W).T + C = np.array(C).T + T = np.array(T).T + U = np.array(U).T + P = np.array(P).T + Q = np.array(Q).T + + # Ignore flake8 waring of too long output tuple + return W, C, T, U, P, Q # noqa: WPS227 + + +InputType = Union[FData, NDArrayFloat] + + +class FPLS( + BaseEstimator, + InductiveTransformerMixin[InputType, InputType, Optional[InputType]], +): + """ + Functional Partial Least Squares Regression. + + Attributes: + n_components: Number of components to keep. + ... + """ + + def __init__( + self, + n_components: int = 5, + scale: bool = False, + integration_weights_X: NDArrayFloat | None = None, + integration_weights_Y: NDArrayFloat | None = None, + regularization_X: L2Regularization | None = None, + regularization_Y: L2Regularization | None = None, + weight_basis_X: Basis | None = None, + weight_basis_Y: Basis | None = None, + deflation_mode: str = "reg", + ) -> None: + self.n_components = n_components + self.scale = scale + self.integration_weights_X = integration_weights_X + self.integration_weights_Y = integration_weights_Y + self.regularization_X = regularization_X + self.regularization_Y = regularization_Y + self.weight_basis_X = weight_basis_X + self.weight_basis_Y = weight_basis_Y + self.deflation_mode = deflation_mode + + @multimethod.multidispatch + def _process_input_x(self, X): + """ + Process the input data of the X block. + + This method is called by the fit method and + it is implemented for each type of input data. + """ + self.G_xw = np.identity(X.shape[1]) + self.G_ww = self.G_xw + self._y_predictor = lambda X: X @ self.G_xw @ self.coef_ + self._make_component_x = lambda: self.x_rotations_ + self._regularization_matrix_X = 0 + + self._transform_x = lambda X: X @ self.G_xw @ self.x_rotations_ + self._inv_transform_x = lambda T: T @ self.x_loadings_.T + + return X + + @_process_input_x.register + def _process_input_x_grid(self, X: FDataGrid): + x_mat = X.data_matrix[..., 0] + if self.integration_weights_X is None: + self.integration_weights_X = np.ones(x_mat.shape[1]) + + self.G_xw = np.diag(self.integration_weights_X) + self.G_ww = self.G_xw + + if self.regularization_X is not None: + self._regularization_matrix_X = compute_penalty_matrix( + basis_iterable=(_GridBasis(grid_points=X.grid_points),), + regularization_parameter=1, + regularization=self.regularization_X, + ) + else: + self._regularization_matrix_X = 0 + + self._y_predictor = ( + lambda X: X.data_matrix[..., 0] @ self.G_xw @ self.coef_ + ) + + self._make_component_x = lambda: X.copy( + data_matrix=np.transpose(self.x_rotations_), + sample_names=(None,) * self.n_components, + dataset_name="FPLS X components", + ) + + self._transform_x = ( + lambda X: X.data_matrix[..., 0] @ self.G_xw @ self.x_rotations_ + ) + + self._inv_transform_x = lambda T: X.copy( + data_matrix=T @ self.x_loadings_.T, + sample_names=(None,) * T.shape[0], + dataset_name="FPLS X components", + ) + + return x_mat + + @_process_input_x.register + def _process_input_x_basis(self, X: FDataBasis): + x_mat = X.coefficients + + if self.weight_basis_Y is None: + self.weight_basis_Y = X.basis + + self.G_xw = X.basis.inner_product_matrix(self.weight_basis_Y) + self.G_ww = self.weight_basis_Y.gram_matrix() + + if self.regularization_X is not None: + self._regularization_matrix_X = compute_penalty_matrix( + basis_iterable=(self.weight_basis_Y,), + regularization_parameter=1, + regularization=self.regularization_X, + ) + else: + self._regularization_matrix_X = 0 + + self._y_predictor = lambda X: X.coefficients @ self.G_xw @ self.coef_ + + self._make_component_x = lambda: X.copy( + coefficients=np.transpose(self.x_rotations_), + sample_names=(None,) * self.n_components, + dataset_name="FPLS X components", + ) + + self._transform_x = ( + lambda X: X.coefficients @ self.G_xw @ self.x_rotations_ + ) + + self._inv_transform_x = lambda T: X.copy( + coefficients=T @ self.x_loadings_.T, + sample_names=(None,) * T.shape[0], + dataset_name="FPLS X reconstructions", + ) + + return x_mat + + @multimethod.multidispatch + def _process_input_y(self, Y): + """ + Process the input data of the Y block. + + This method is called by the fit method and + it is implemented for each type of input data. + """ + self.G_yc = np.identity(Y.shape[1]) + self.G_cc = np.identity(Y.shape[1]) + self._regularization_matrix_Y = 0 + self._make_component_y = lambda: self.y_rotations_ + + self._transform_y = lambda Y: Y @ self.G_yc @ self.y_rotations_ + self._inv_transform_y = lambda T: T @ self.y_loadings_.T + return Y + + @_process_input_y.register + def _process_input_y_grid(self, Y: FDataGrid): + y_mat = Y.data_matrix[..., 0] + if self.integration_weights_Y is None: + self.integration_weights_Y = np.ones(y_mat.shape[1]) + + self.G_yc = np.diag(self.integration_weights_Y) + self.G_cc = self.G_yc + + if self.regularization_Y is not None: + self._regularization_matrix_Y = compute_penalty_matrix( + basis_iterable=(_GridBasis(grid_points=Y.grid_points),), + regularization_parameter=1, + regularization=self.regularization_Y, + ) + else: + self._regularization_matrix_Y = 0 + + self._make_component_y = lambda: Y.copy( + data_matrix=np.transpose(self.y_rotations_), + sample_names=(None,) * self.n_components, + dataset_name="FPLS Y components", + ) + + self._transform_y = ( + lambda Y: Y.data_matrix[..., 0] @ self.G_yc @ self.y_rotations_ + ) + + self._inv_transform_y = lambda T: Y.copy( + data_matrix=T @ self.y_loadings_.T, + sample_names=(None,) * T.shape[0], + dataset_name="FPLS Y reconstructins", + ) + + return y_mat + + @_process_input_y.register + def _process_input_y_basis(self, Y: FDataBasis): + y_mat = Y.coefficients + self.G_yc = Y.basis.gram_matrix() + self.G_cc = self.G_yc + + if self.weight_basis_Y is None: + self.weight_basis_Y = Y.basis + + if self.regularization_Y is not None: + self._regularization_matrix_Y = compute_penalty_matrix( + basis_iterable=(self.weight_basis_Y,), + regularization_parameter=1, + regularization=self.regularization_Y, + ) + else: + self._regularization_matrix_Y = 0 + + self._make_component_y = lambda: Y.copy( + coefficients=np.transpose(self.y_rotations_), + sample_names=(None,) * self.n_components, + dataset_name="FPLS Y components", + ) + + self._transform_y = ( + lambda Y: Y.coefficients @ self.G_yc @ self.y_rotations_ + ) + + self._inv_transform_y = lambda T: Y.copy( + coefficients=T @ self.y_loadings_.T, + sample_names=(None,) * T.shape[0], + dataset_name="FPLS Y reconstructions", + ) + + return y_mat + + def _fit_data( + self, + X: InputType, + Y: InputType, + ) -> None: + """Fit the model using X and Y as already centered data.""" + x_mat = self._process_input_x(X) + + penalization_matrix = self.G_ww + self._regularization_matrix_X + L_X_inv = np.linalg.inv(np.linalg.cholesky(penalization_matrix)) + + y_mat = self._process_input_y(Y) + + penalization_matrix = self.G_cc + self._regularization_matrix_Y + L_Y_inv = np.linalg.inv(np.linalg.cholesky(penalization_matrix)) + + # Supress flake8 warning about too many values to unpack + W, C, T, U, P, Q = _pls_nipals( # noqa: WPS236 + x_mat, + y_mat, + self.n_components, + G_ww=self.G_ww, + G_xw=self.G_xw, + G_cc=self.G_cc, + G_yc=self.G_yc, + L_X_inv=L_X_inv, + L_Y_inv=L_Y_inv, + deflation=self.deflation_mode, + ) + + self.x_weights_ = W + self.y_weights_ = C + self.x_scores_ = T + self.y_scores_ = U + self.x_loadings_ = P + self.y_loadings_ = Q + + self.x_rotations_ = W @ np.linalg.pinv(P.T @ self.G_xw @ W) + + self.y_rotations_ = C @ np.linalg.pinv(Q.T @ self.G_yc @ C) + + self.components_x_ = self._make_component_x() + self.components_y_ = self._make_component_y() + + self.coef_ = self.x_rotations_ @ Q.T + + def fit( + self, + X: InputType, + y: InputType, + ) -> FPLS: + """ + Fit the model using the data for both blocks. + + Any of the parameters can be a FDataGrid, FDataBasis or numpy array. + + Args: + X: Data of the X block + y: Data of the Y block + + Returns: + self + """ + calculate_mean = lambda x: x.mean() if isinstance(x, FData) else x.mean(axis=0) + # Center and scale data + self.x_mean = calculate_mean(X) + self.y_mean = calculate_mean(y) + if self.scale: + self.x_std = X.std() + self.y_std = y.std() + else: + self.x_std = 1 + self.y_std = 1 + + X = (X - self.x_mean) / self.x_std + y = (y - self.y_mean) / self.y_std + + self._fit_data(X, y) + return self + + def transform( + self, + X: InputType, + y: InputType | None = None, + ) -> NDArrayFloat | Tuple[NDArrayFloat, NDArrayFloat]: + """ + Apply the dimension reduction learned on the train data. + + Args: + X: Data to transform. Must have the same number of features and + type as the data used to train the model. + y : Data to transform. Must have the same number of features and + type as the data used to train the model. + If None, only X is transformed. + + Returns: + x_scores: Data transformed. + y_scores: Data transformed (if y is not None) + """ + check_is_fitted(self) + + X = (X - self.x_mean) / self.x_std + x_scores = self._transform_x(X) + + if y is not None: + y = (y - self.y_mean) / self.y_std + y_scores = self._transform_y(y) + return x_scores, y_scores + return x_scores + + def inverse_transform( + self, + X: NDArrayFloat, + Y: NDArrayFloat | None = None, + ) -> InputType | Tuple[InputType, InputType]: + """ + Transform data back to its original space. + + Args: + X: Data to transform back. Must have the same number of columns + as the number of components of the model. + Y: Data to transform back. Must have the same number of columns + as the number of components of the model. + + Returns: + X: Data reconstructed from the transformed data. + Y: Data reconstructed from the transformed data + (if Y is not None) + """ + check_is_fitted(self) + + X = X * self.x_std + self.x_mean + x_recon = self._inv_transform_x(X) + + if Y is not None: + Y = Y * self.y_std + self.y_mean + y_recon = self._inv_transform_y(Y) + return x_recon, y_recon + return x_recon diff --git a/skfda/tests/test_FPLSRegression.py b/skfda/tests/test_FPLSRegression.py new file mode 100644 index 000000000..ee66eaeb4 --- /dev/null +++ b/skfda/tests/test_FPLSRegression.py @@ -0,0 +1,337 @@ +"""Test the FPLSRegression class.""" +import numpy as np +import pytest +import scipy +from sklearn.cross_decomposition import PLSRegression + +from skfda.datasets import fetch_tecator +from skfda.misc.operators import LinearDifferentialOperator +from skfda.misc.regularization import L2Regularization +from skfda.ml.regression import FPLSRegression +from skfda.representation.basis import BSplineBasis, FDataBasis + + +class TestFPLSRegression: + """Test the FPLSRegression class.""" + + def test_sklearn(self): + """Compare results with sklearn.""" + # Load the data + X, y = fetch_tecator( + return_X_y=True, + ) + + # Fit the model + fplsr = FPLSRegression(n_components=5) + fplsr.fit(X, y) + + sklearnpls = PLSRegression(n_components=5, scale=False) + sklearnpls.fit(X.data_matrix[..., 0], y) + + rtol = 3e-5 + atol = 1e-6 + # Compare the results + np.testing.assert_allclose( + fplsr.coef_.flatten(), + sklearnpls.coef_.flatten(), + rtol=rtol, + atol=atol, + ) + + # Compare predictions + np.testing.assert_allclose( + fplsr.predict(X).flatten(), + sklearnpls.predict(X.data_matrix[..., 0]).flatten(), + rtol=rtol, + atol=atol, + ) + + # Check that the rotations are correc + np.testing.assert_allclose( + np.abs(fplsr.fpls_.x_rotations_).flatten(), + np.abs(sklearnpls.x_rotations_).flatten(), + rtol=rtol, + atol=atol, + ) + + def test_fda_usc_no_reg(self): + """ + Test a multivariate regression with no regularization. + + Replication Code: + data(tecator) + x <- tecator$absorp.fdata + y <- tecator$y + res2 <- fdata2pls(x, y, ncomp=5) + for (i in res2$l) { + c <- res2$loading.weigths$data[i, ] + c <- c / c(sqrt(crossprod(c))) + print( + paste( + round(c, 8), + collapse = ", " + ) # components + ) + } + """ + # Results of fda.usc: + + with open( + "test_FPLSRegression_data/test_fda_usc_no_reg_data.npy", + "rb", + ) as f: + r_results = np.load(f, allow_pickle=False) + + signs = np.array([1, -1, 1, -1, 1]) + + r_results *= signs + + X, y = fetch_tecator( + return_X_y=True, + ) + + plsReg = FPLSRegression(n_components=5, scale=False) + plsReg.fit(X, y) + + W = plsReg.fpls_.x_weights_ + np.testing.assert_allclose(W, r_results, atol=1e-8) + + def test_fda_usc_reg(self): + """ + Test the FPLSRegression with regularization against fda.usc. + + Replication Code: + data(tecator) + x=tecator$absorp.fdata + y=tecator$y + res2=fdata2pls(x,y,ncomp=5,P=c(0,0,1),lambda = 1000) + for(i in res2$l){ + c = res2$loading.weigths$data[i,] + c = c/ c(sqrt(crossprod(c))) + print( + paste( + round(c, 8), + collapse=", " + ) # components + ) + } + """ + X, y = fetch_tecator( + return_X_y=True, + ) + + pen_order = 2 + reg_param = 1000 + # This factor compensates for the fact that the difference + # matrices in fda.usc are scaled by the mean of the deltas + # between grid points. This diference is introduced in + # the function D.penalty (fdata2pc.R:479) in fda.usc. + + grid_points = X[0].grid_points[0] + grid_step = np.mean(np.diff(grid_points)) + factor1 = grid_step ** (2 * pen_order - 1) + + reg_param *= factor1 + + regularization = L2Regularization( + LinearDifferentialOperator(pen_order), + regularization_parameter=reg_param, + ) + + # Fit the model + fplsr = FPLSRegression( + n_components=5, + regularization_X=regularization, + ) + fplsr.fit(X, y) + + with open( + "test_FPLSRegression_data/test_fda_usc_reg_data.npy", + "rb", + ) as f: + r_results = np.load(f, allow_pickle=False) + + signs = np.array([1, -1, 1, -1, 1]) + + w_mat = fplsr.fpls_.x_weights_ @ np.diag(signs) + np.testing.assert_allclose(w_mat, r_results, atol=6e-6, rtol=6e-4) + + def test_basis_vs_grid(self): + """Test that the results are the same in basis and grid.""" + X, y = fetch_tecator( + return_X_y=True, + ) + + original_grid_points = X.grid_points[0] + # Express the data in a FDataBasis + X = X.to_basis(BSplineBasis(n_basis=20)) + + # Fit the model + fplsr = FPLSRegression(n_components=5) + fplsr.fit(X, y) + basis_components = fplsr.fpls_.components_x_(original_grid_points) + + # Go back to grid + new_grid_points = np.linspace( + original_grid_points[0], + original_grid_points[-1], + 1000, + ) + X = X.to_grid(grid_points=new_grid_points) + + # Get the weights for the Simpson's rule + identity = np.eye(len(X.grid_points[0])) + ss_weights = scipy.integrate.simps(identity, X.grid_points[0]) + + # Fit the model with weights + fplsr = FPLSRegression( + n_components=5, + integration_weights_X=ss_weights, + ) + fplsr.fit(X, y) + + grid_components = fplsr.fpls_.components_x_(original_grid_points) + + np.testing.assert_allclose( + basis_components, + grid_components, + rtol=3e-3, + ) + + def test_multivariate_regression(self): + """Test the multivariate regression.""" + self.create_latent_variables(n_latent=5, n_samples=100) + + # Check that the model is able to recover the latent variables + # if it has enough components + y_observed, y_rotations = self.create_observed_multivariate_variable( + n_features=5, + ) + X_observed, X_rotations = self.create_observed_multivariate_variable( + n_features=10, + ) + + plsreg = FPLSRegression(n_components=5) + plsreg.fit(X_observed, y_observed) + + minimum_score = 0.99 + assert plsreg.score(X_observed, y_observed) > minimum_score + + @pytest.mark.parametrize("discretized", [True, False]) + @pytest.mark.parametrize("n_features", [1, 5]) + @pytest.mark.parametrize("noise_std", [0, 1]) + def test_simple_regresion(self, discretized, n_features, noise_std): + """Test multivariate regressor and functional response.""" + self.create_latent_variables(n_latent=5, n_samples=100) + + # Check that the model is able to recover the latent variables if + # it has enough components + y_observed, y_rotations = self.create_observed_multivariate_variable( + n_features=n_features, + noise=noise_std, + ) + X_observed, X_rotations = self.create_observed_functional_variable( + discretized=discretized, + noise=noise_std, + ) + + plsreg = FPLSRegression(n_components=5) + plsreg.fit(X_observed, y_observed) + + minimum_score = 0.99 if noise_std == 0 else 0.98 + assert plsreg.score(X_observed, y_observed) > minimum_score + + @pytest.mark.parametrize("discretized_observed", [True, False]) + @pytest.mark.parametrize("discretized_response", [True, False]) + @pytest.mark.parametrize("noise_std", [0, 1]) + def test_simple_regresion_dataset_functional( + self, + discretized_observed, + discretized_response, + noise_std, + ): + """Test multivariate regressor and functional response.""" + self.create_latent_variables(n_latent=5, n_samples=100) + + # Check that the model is able to recover the latent variables if + # it has enough components + X_observed, X_rotations = self.create_observed_functional_variable( + discretized=discretized_observed, + noise=noise_std, + ) + y_observed, y_rotations = self.create_observed_functional_variable( + discretized=discretized_response, + noise=noise_std, + ) + + plsreg = FPLSRegression(n_components=5) + plsreg.fit(X_observed, y_observed) + minimum_score = 0.99 if noise_std == 0 else 0.98 + assert plsreg.score(X_observed, y_observed) > minimum_score + + def create_latent_variables(self, n_latent, n_samples): + """Create latent variables for testing.""" + self.rng = np.random.RandomState(0) + self.n_latent = n_latent + self.n_samples = n_samples + + # Get the means of the latent variables + latent_means = self.rng.uniform(low=1, high=10, size=(n_latent)) + + # Sample the variables + self.latent_samples = np.array( + [ + self.rng.normal(loc=mean, scale=1, size=n_samples) + for mean in latent_means + ], + ).T + + def create_observed_multivariate_variable(self, n_features, noise=0): + """Create observed multivariate variable for testing.""" + rotations = self.rng.uniform( + low=0, + high=10, + size=(n_features, self.n_latent), + ) + + observed_values = self.latent_samples @ rotations.T + observed_noise = noise * self.rng.normal( + size=(self.n_samples, n_features), + ) + observed_values += observed_noise + + return observed_values, rotations + + def create_observed_functional_variable(self, noise=0, discretized=False): + """Create observed functional variable for testing.""" + n_basis = 20 + basis = BSplineBasis(n_basis=n_basis) + + rotation_coef = self.rng.uniform( + low=0, + high=10, + size=(self.n_latent, n_basis), + ) + rotation = FDataBasis(basis, rotation_coef) + + observed_coef = self.latent_samples @ rotation_coef + observed_func = FDataBasis(basis, observed_coef) + + if discretized: + observed_func = observed_func.to_grid( + grid_points=np.linspace(0, 1, 100), + ) + + func_noise = noise * self.rng.normal(size=(self.n_samples, 100)) + observed_func.data_matrix[..., 0] += func_noise + + else: + observed_func.coefficients += noise * self.rng.normal( + size=(self.n_samples, n_basis), + ) + + return observed_func, rotation + + +if __name__ == "__main__": + TestFPLSRegression().test_fda_usc_reg() diff --git a/skfda/tests/test_FPLSRegression_data/test_fda_usc_no_reg_data.npy b/skfda/tests/test_FPLSRegression_data/test_fda_usc_no_reg_data.npy new file mode 100644 index 0000000000000000000000000000000000000000..36b54860f8e53b510404795f2292dce62db9d1d5 GIT binary patch literal 4128 zcmbW3`8yQc`^VE_i!33@)?zD_r977E*5)DEvb1&NmO=|Hl**!z|YohDb^68?}OJ^QElUM&g)ynfTL*NFn1C%o&4(G&<=yC(~xv&3zKOz^PxB!(r!Pov({rlc){1K z$Veto;th^UEn~uU&uvG<+8A(YtgeFUlxf8s^-3D ze{q-)X&c7B-ok|4Wl0H|ubH5AzwqsbJ|=u@>a!*b=g*~Ga_F9B0%;=a#&vNP=&AC( zPN}eo^J2lH{YT=0qFAtd#!)G(kOlYFFW5E9V}boR zUCQY*3uJ0kB;)3Do~z%|x)9iKp_?!u7R(08nHJ5|N;b6G4&LY)VZ(vZXtg3&MSmt+^#!*q*8*qtD}lOS0J zWlv?ATe%>umDKlx$c1>(*PDB<&A-2al#<)-22|YdCw!CLfG34UFLb{&U}u$x(U>t0 zn*&SCmt5qb$fRrEvoanM4-{#&bntNE;7rDm5gs}R+(=(K&cioH&-zEq>tB}naUt{j zj40y|#|S*s5c>8j9Ot1_WZZeBnFjP-7f>j1p#c{yp3=Y9QjhtfezxSlbr|t-6Pn1> zVO_z0*N=>I(Qee&#pxURS!-O8u12m1*wQ$;!AaKvFhyR`=e-B;W2r3qg zhoTk&V5_8vhf$kgMX{X9G1X?!dvw-8O0x-??Kg!P7c@d`g0tCXK^+tw7=P<6!T=TT zE93G6B4qZPWW>``K}TJ*w6!7yyVDnj>`gDm$^D`f@2Uz6o4mNY=?NMA^>rE+o~uS} z>-0%WNh(Sge+wBBR^z7jAq|yN)p%;-%j%d~3ikGY454UHP{Xw^_s{k=`YEQJE8-x`%-8=r*&j zItsBpeYED7VgWwOJx>pD$U#x_p=%L_X&CYRiNQ{XB((Jp4F07Yiq~oyBfDDM;6b#u z$$M5TBvE_guP#r6U*&=`DQ5S<&iHWk0^?#BRZZBkTe=kP?7bs~9u<(ArR`jJnFNJe zXS06~kU^oa^6GxgYVZ+U)*Sby!h`&TmOUS6;PvB1`@I}GoLK=z4fEVtIR0*I;3OLi z+PMXvH*w+1_L`6Jv|3mrrsjTn2w`pFJ85aBI_N6dF>=wj4&t}iUduJ81D$`OErekR zCtOZA{|YC=@)gaS~B9# z7$ySA(8WJDz8vygo)dTbRzPW4^Rxb#N{GH8eWQ4=3XIkbP7XAZK$5ne5|ogM5TMIlW1rA|`NeE3sVeRRjIhnl&TG%OU=o zXv&VcLTHmWYL#D|0ap`8rmQbS!FoYwvY(F+Cinjs)E-Je6R}Tk=ANcuUWPC{@pmrL zjN*FYbsylDqSo}sW<$TMI^H;1#4<%GTp+8kW1=F!ty#X&`fg5S9sT(qs-6Wtoj#U(WdR!3EH zQMmKwf^FSgT+a~FB8Isb7MlF_)N3wg-?dbxWOH#)&l=H;i<1ReMc#Z4rWh@fW7=|1 zD`xYphHf_Mas8GOqS#iz4;EengfFtDbBiD#$^A&2vsC=g7VHQZsMzolWIHHV2` zUs!18>9DY|qb|k2lZB3#vCfOf#!zR6Z>jtn=t8w2{T<81XkGi_FVYY2o6wlWA17eg zba07CaT&IlmUb_ct3tI~&w2$}WHiLY`DClnEo$nc*LE7_E6ypXwAP@=IrVoEItpBg06+0`gnWkcB z^3lT304h!wB-Fpis7CU;<6j7d6#SNWU$0Vvgs%5RHx}rWVRi_Awd>D9Y}?*@ z{w`K+q=j6;#;_|Qlh30-WY{2ELNW_HjxQeL%@u*>62(hx>~eS=l(0qQI|W)fGisY{ z=&IQ)1xV#w(A~`iM$GQ5R(@=Vw+Nd1T1bb3#QY~$2oz{GU-wkl zQ3g&i%Z@*8e*o$a??;_f&H}or(W7jyG}vw(n6LjS3kYiyq?Q%U&ozR9o|-{9BptH6 ztRYQ>i?^=JrJiEJ_Mc9zY3u4BPC>uuacBb=E*7j(lxYO>-Uocb*G721W7(7g@WI`L z5KT1Y1C)5V3;#BP!px=*Zwnh?i)X|_?)pX`xNThfGL;APHj9ymuNxqIB`uZpyB?(S zyzgw1sE2>K#;Q$|2%FMfrh6;6@Xv=4Wkt*RdyQ_kUd3a;l~=RDZ!KzI_#lDxfKUy4 z{JK16W=TL${vTlH^SZiwPme(fNJi|x`LX^13}p0ueOjLdoyP71-iwlV%epx^l49I>V4Sh-`#QF~i5 z{*$pT`7M!(pIUx?K0ZOmz25B6*CO-(jlRPgUVJY8IXf;Zr&EV&pO5EQ9IeMc+7aCT z)CTM>{^NhDg@>wZe#EsbaDg@0Y&HDRw$>6QUxAhRefZr;X zJrJa2psvZ}L088jwDBukV0yOy71G6UxtEIW3dW2SBGlr}jo$}QA?{{DU~T8K#)km!j2xhrlt*M^wJ zc3yw!Z0$UUQ`iCX^+k)SYY*+C%H+@RqW%~R z|3mX+?z;&uETre+x{JZ0OD8b$1sUFbEvl3yGr-Gv*HZ82Tu3UtJuEL;2TSrszdQTZ z!laG=)6~eHj{<6g$-6`KNN~>=Bg?aJW1lxk zbpZwc2#gLrIq?t|@Lc6HM>3F^Ap0WS{2GSbe#0FP4TKQ^d+iBYA|xm`D@_L71?tV5 z_JYe{}^ zb?Kl!`RU)2fmyJ-Fhk7edp1}W-z(hamJNAbdJR8rX28n!bqdci@4&0TgY5=4Z-JG7 z8mS_l1bNS-)@e%L1oO_jB{pHHFm*gOOuqdt>=|&V8dcAP!&_>J6Qpb)Dt79HtjdK7 i9jngIF*z`1*21h`mI>*@bz;Nfsc>o4c^6XcW%xgj8$;j# literal 0 HcmV?d00001 diff --git a/skfda/tests/test_FPLSRegression_data/test_fda_usc_reg_data.npy b/skfda/tests/test_FPLSRegression_data/test_fda_usc_reg_data.npy new file mode 100644 index 0000000000000000000000000000000000000000..d1606e5e72dcdee93e6e1a7ba8abb4fcbe6014d7 GIT binary patch literal 4128 zcmbVO_dgYm_cjvAEJ8916{(DhmStTQxArk3I#x3Ixci(&NwW$=* zkW`XVD4HUDeE)^-`Q^OM^EyAA*YkQk&v|m~oNNys6&8vRIjMe=AmgN2LkA}W? z>fwojv5L+8dI-+0VI7>Thu!%|CCGPGv-Yt!xLyZB-cg;X+}UUJZ)v6&2?ex^BTE~LPWb}8aSWigQp=Ka;e>4NJKgi&kVFwtzkey&s97{WPq`TKLPcb zaF^Np<>Wmk$VCy`D%P@~XGI@hP|kw5w=^S$G8?|U_K1(W#0Kg!nN*1pHu%Sk*`L_T zfy(nSt+f|9z?bja>BHi{x5+n0_r2x7tH%O>9$DlDwEFOLoKrAc-w)@+Dnu+6lE z*sxW8{I?&Q1@+l8cF{2`=>5L5#(g&n(sV91GnTR-V*6+W=Mxi3+BNKsbTA<&mG@(N z9TN(p)B44VnGpPDONn+C6K1}w(2-1KLQviMj(>`qmGL>Mzg?>{iC;|gase$Ot!pkWWm!9EyKQ}EU4R(=xQp@hCa24 z0%>zLbez_GO7djGOkj0re+C;WYvcu67;Mm6qp{woLGi->sMl;^_v43m}C8@Vtm zl(;(8o(pS#8^1aoz=eiu1)FA4xbS_Aqs)miE~sR->d@I-m^tw8;Fex4+&pA;bHx}J zKB{&+C==y@_U}tMqN+TwcKo*3Td58Is60&Y6K})8DPP5=pRLGzAac-bxE0;?H)(2h zwPLREIl?(|D=sPxeLkJpini^V{aZa+@$Nc(QLb()zR$B7|1;2n-?XRRUcAtP_mXb^ z5MI)PQ%9ahI;A(`bo`Bx>k~~FDQ>A}wz&y~qV%+%&;hVz3Hr zs6}2yr39qNN{>kH&qHsC{3)}UP+XPl5^Oke3C^F|85}Tl9kw}4ZqvV54%1sM|LWqcjC5BE6>9#7g5KlU-eWzLf)YfXdh!c)QbtSQj0ewxi2 zsRO0)@biOhRnT?gYKwaa0VLiQsHR%vK`$ffrSZpeFthQPnAwgvJnvop&qF&8>*n%l zCYhyJy6VNB!!OIRBEn|wwpBIS9KV*y6|X_#MGw6u-gy9d9JHCe9g0?9midTC%@R!`^PEo9e+_M2w{?0WZys_<%MpHdZlR3@o z$7DD(O&fmOLxq$;&b80&G^o+i6S+cWKp{~!zAB3aVyhJ^9NamOZ*H8@vWf>EgD#zm ze}!z$f+4_1QPeb+W8MkP3hexAv>SqZxLTfRKcuY`TP z!(PX4S3*HNK`iJ&CC~@#WBiv@fiYWYU3794NI&Z%#%WbU+K8~h)aPn=_UKAg(ZO-KfA-(qz-Xdhz=l$gTsv081-{pHlUX-} z&13tey|eg0pua3j`%eJx?ncJl+u8{6K6x%Ne;Xn3dBPnlViTwbxouo&(F~!4tfaLy z&EOt>LH*vZW{Ce25ahM21x^-9PBRy@z!Amv%OcneQhb?)$2*(h`kpIqwq-Q|UGDET zojZ+i`b^u^9Vr6X;LkPJHs`}N{XXB3dEXKdC_FHo_gl`9W0p}rXrL3CrQ#Drfq%&Z zc8=sahzgl%I~rRB`j!2E^j!(iosj!jvNI1H0@%30J{CH^t&M3jkHCMjpM=kK3jLy5Wu|bH0mAxl3%YKouGjdH`SR)lz zMr&z_ZmwlsoSfz0 zR|&UBg%U0b?{8}vQR89h#V6zB1Rk2-GSBX9<)QJ?Yz>^8$Nk6(qN0csGi9BJW_kF1 zgm@(3H4hiY+9+SC<)OOC!0a`D9vZ|5k9<+);ez`CpGBBlEQ{OHY;VWK|1k8I!%Gfo z%hr(sPI2&qos(7jdp6FU3?JTinvEJl`<};sWMR!+3-_}BSeR2M)kK_N;(?+gyPU(B zNE|2~fqxAAYU0v2oX$Xkb)rj|Is>&_(YK09$Md5Dx5w@1cye8~=*n>#W(0EIkTYpG z=Swx{G@@bc@y_MJ&l_;S+qIb)*MQQ;{Ul;bD{#dowOzmGsPen)Cm_&(~nR@c|C5PTw}t;mpAs&Nd~6t2wyLQ_Sly zkBxOrd+t3{X5*0`Lq|EA=XntQb(X2h#Mr%UR8|WeEvZ?Gvzur*KDt!qR}d8!m})sh zhLh1V^Qz)~t9lH|_cPe{_%2EmRhSdm73jZ7ba~iu0)DI^m!!uQ;LCEc0n(ckWR50H zUe;fSbvE+5G(IJRnWTf3*w{6&@G!K~n!N?W(Ptt zkH$Nn&|xa~VTbhw1_*UH_Af4C070fsp+JoZBCH1%GmT7e8DDn9D1!yvB~iu8qS?^y z=BfI2KHsTKetn}+!i7%lAx$9;57_dQPYvA&vq};oUcP*wIp1qfdddfp8x2w#h5}IT zbG6m>7l1U*73QT0K$m&HAbP&LWq$v`^AmtnxzaDC)dKiC?YL=T-cJY5ReSB+&4;V~ zN=sH-BY5p)b!T*QL9)ju78|pN~JD2P%hW_+TTje}*p!DhP?vBtX zpl6TK>X(LM%;HW)k5D#-qefS0nt=XpNd*U{#=sAMjifRn@RV(lts>P>C`&2Ku zkTF7_O_$!@fM?>>CSI&z;JbHIwFmT`%!?U3R{zl$lr5O$Iz%3lqO@+r1;(N_w(L^YL#7ZeF3wN`?>>?maTMMvato! zgo0h_%J?AEFR<&0WI%SN$AVo_^^o`}B;8g07M$$*nw2hl3Tk@%G*+fu$G29gejXyd#;tOyDJ?SSYdsvmzxPJqfYY$-FvMyy`C#Dhp{ zME^`pN2?!=cv`Kre{D}Ay1J(*+t@eacmAT-f~@v7 zGF}9P^Tlb8Uzfwpupj1Ogc4u6q4U zh5-5*->x1@yamJWs+N7Fl|thN_YNn6n^4*LD}A?bF}P{|^0W>t0wXVd^@n>4fur|d zv)=rjG0{evzb@s%?lY=ZJHBSaUcw7w$Ix_m^dyzM9FxHzCwxvukN^v6L~1I(g+a-e h(IK_ilQ1ke=p&xwjL%bx+TQzy;?vJEa( Date: Mon, 8 May 2023 01:17:45 +0200 Subject: [PATCH 02/46] Fix some style issues --- skfda/preprocessing/dim_reduction/_fpls.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index 75ae8a09f..2b7ab468b 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -1,13 +1,11 @@ from __future__ import annotations -from typing import Tuple, TypeVar, Union, Optional +from typing import Optional, Tuple, Union import multimethod import numpy as np from sklearn.utils.validation import check_is_fitted -from skfda.misc.hat_matrix import Input - from ..._utils._sklearn_adapter import BaseEstimator, InductiveTransformerMixin from ...misc.regularization import L2Regularization, compute_penalty_matrix from ...representation import FData, FDataGrid @@ -416,7 +414,9 @@ def fit( Returns: self """ - calculate_mean = lambda x: x.mean() if isinstance(x, FData) else x.mean(axis=0) + calculate_mean = ( + lambda x: x.mean() if isinstance(x, FData) else x.mean(axis=0) + ) # Center and scale data self.x_mean = calculate_mean(X) self.y_mean = calculate_mean(y) From 548594cc4de0277420a1d87bef437ca35ef0a55e Mon Sep 17 00:00:00 2001 From: Ddelval Date: Mon, 8 May 2023 01:31:09 +0200 Subject: [PATCH 03/46] Fix test execution --- skfda/tests/test_FPLSRegression.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/skfda/tests/test_FPLSRegression.py b/skfda/tests/test_FPLSRegression.py index ee66eaeb4..6d61aa3d0 100644 --- a/skfda/tests/test_FPLSRegression.py +++ b/skfda/tests/test_FPLSRegression.py @@ -1,4 +1,6 @@ """Test the FPLSRegression class.""" +import os + import numpy as np import pytest import scipy @@ -74,12 +76,13 @@ def test_fda_usc_no_reg(self): ) } """ - # Results of fda.usc: - - with open( - "test_FPLSRegression_data/test_fda_usc_no_reg_data.npy", - "rb", - ) as f: + # Results from fda.usc: + path = os.path.join( + os.path.dirname(__file__), + "test_FPLSRegression_data", + "test_fda_usc_no_reg_data.npy", + ) + with open(path, "rb") as f: r_results = np.load(f, allow_pickle=False) signs = np.array([1, -1, 1, -1, 1]) @@ -145,10 +148,12 @@ def test_fda_usc_reg(self): ) fplsr.fit(X, y) - with open( - "test_FPLSRegression_data/test_fda_usc_reg_data.npy", - "rb", - ) as f: + path = os.path.join( + os.path.dirname(__file__), + "test_FPLSRegression_data", + "test_fda_usc_reg_data.npy", + ) + with open(path, "rb") as f: r_results = np.load(f, allow_pickle=False) signs = np.array([1, -1, 1, -1, 1]) From 0891dcf9848d62d4150578fb5904c69fd06d15cf Mon Sep 17 00:00:00 2001 From: Ddelval Date: Mon, 8 May 2023 08:28:13 +0200 Subject: [PATCH 04/46] Rename test file --- ...st_FPLSRegression.py => test_fpls_regression.py} | 10 ++-------- .../test_fda_usc_no_reg_data.npy | Bin .../test_fda_usc_reg_data.npy | Bin 3 files changed, 2 insertions(+), 8 deletions(-) rename skfda/tests/{test_FPLSRegression.py => test_fpls_regression.py} (97%) rename skfda/tests/{test_FPLSRegression_data => test_fpls_regression_data}/test_fda_usc_no_reg_data.npy (100%) rename skfda/tests/{test_FPLSRegression_data => test_fpls_regression_data}/test_fda_usc_reg_data.npy (100%) diff --git a/skfda/tests/test_FPLSRegression.py b/skfda/tests/test_fpls_regression.py similarity index 97% rename from skfda/tests/test_FPLSRegression.py rename to skfda/tests/test_fpls_regression.py index 6d61aa3d0..ad5932232 100644 --- a/skfda/tests/test_FPLSRegression.py +++ b/skfda/tests/test_fpls_regression.py @@ -78,8 +78,7 @@ def test_fda_usc_no_reg(self): """ # Results from fda.usc: path = os.path.join( - os.path.dirname(__file__), - "test_FPLSRegression_data", + __file__[:-3]+ "_data", # Trim .py ending "test_fda_usc_no_reg_data.npy", ) with open(path, "rb") as f: @@ -149,8 +148,7 @@ def test_fda_usc_reg(self): fplsr.fit(X, y) path = os.path.join( - os.path.dirname(__file__), - "test_FPLSRegression_data", + __file__[:-3] + "_data", # Trim .py ending "test_fda_usc_reg_data.npy", ) with open(path, "rb") as f: @@ -336,7 +334,3 @@ def create_observed_functional_variable(self, noise=0, discretized=False): ) return observed_func, rotation - - -if __name__ == "__main__": - TestFPLSRegression().test_fda_usc_reg() diff --git a/skfda/tests/test_FPLSRegression_data/test_fda_usc_no_reg_data.npy b/skfda/tests/test_fpls_regression_data/test_fda_usc_no_reg_data.npy similarity index 100% rename from skfda/tests/test_FPLSRegression_data/test_fda_usc_no_reg_data.npy rename to skfda/tests/test_fpls_regression_data/test_fda_usc_no_reg_data.npy diff --git a/skfda/tests/test_FPLSRegression_data/test_fda_usc_reg_data.npy b/skfda/tests/test_fpls_regression_data/test_fda_usc_reg_data.npy similarity index 100% rename from skfda/tests/test_FPLSRegression_data/test_fda_usc_reg_data.npy rename to skfda/tests/test_fpls_regression_data/test_fda_usc_reg_data.npy From eed4be07eab1d2d07bb641a159ab062022c9a06d Mon Sep 17 00:00:00 2001 From: Ddelval Date: Sat, 17 Jun 2023 19:17:05 +0200 Subject: [PATCH 05/46] Add pls tets --- skfda/tests/test_fpls.py | 166 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 skfda/tests/test_fpls.py diff --git a/skfda/tests/test_fpls.py b/skfda/tests/test_fpls.py new file mode 100644 index 000000000..78263e4e0 --- /dev/null +++ b/skfda/tests/test_fpls.py @@ -0,0 +1,166 @@ +"""Test the FPLS class.""" +import os + +import numpy as np +import pytest +import scipy +from sklearn.cross_decomposition import PLSCanonical + +from skfda.datasets import fetch_tecator +from skfda.misc.operators import LinearDifferentialOperator +from skfda.misc.regularization import L2Regularization +from skfda.preprocessing.dim_reduction import FPLS +from skfda.representation.basis import BSplineBasis, FDataBasis + +class LatentVariablesModel: + """Simulate model driven by latent variables.""" + + def create_latent_variables(self, n_latent, n_samples): + """Create latent variables for testing.""" + self.rng = np.random.RandomState(0) + self.n_latent = n_latent + self.n_samples = n_samples + self.grid_points = np.linspace(0, 1, 100) + + # Get the means of the latent variables + latent_means = self.rng.uniform(low=1, high=10, size=(n_latent)) + + # Sample the variables + self.latent_samples = np.array( + [ + self.rng.normal(loc=mean, scale=1, size=n_samples) + for mean in latent_means + ], + ).T + + def create_observed_multivariate_variable(self, n_features, noise=0): + """Create observed multivariate variable for testing.""" + rotations = self.rng.uniform( + low=0, + high=10, + size=(n_features, self.n_latent), + ) + + observed_values = self.latent_samples @ rotations.T + observed_noise = noise * self.rng.normal( + size=(self.n_samples, n_features), + ) + observed_values += observed_noise + + return observed_values, rotations + + def create_observed_functional_variable(self, noise=0, discretized=False): + """Create observed functional variable for testing.""" + n_basis = 20 + basis = BSplineBasis(n_basis=n_basis) + + rotation_coef = self.rng.uniform( + low=0, + high=10, + size=(self.n_latent, n_basis), + ) + rotation = FDataBasis(basis, rotation_coef) + + observed_coef = self.latent_samples @ rotation_coef + observed_func = FDataBasis(basis, observed_coef) + + if discretized: + observed_func = observed_func.to_grid( + grid_points=self.grid_points, + ) + + func_noise = noise * self.rng.normal(size=(self.n_samples, 100)) + observed_func.data_matrix[..., 0] += func_noise + + else: + observed_func.coefficients += noise * self.rng.normal( + size=(self.n_samples, n_basis), + ) + + return observed_func, rotation + + +class TestFPLS(LatentVariablesModel): + """Test the FPLS class.""" + + def test_sklearn(self): + """Compare results with sklearn.""" + # Load the data + X, y = fetch_tecator( + return_X_y=True, + ) + + integration_weights = np.ones(len(X.grid_points[0])) + + # Fit the model + fpls = FPLS(n_components=3, + integration_weights_X=integration_weights, + ) + fpls.fit(X, y) + + sklearnpls = PLSCanonical(n_components=3, scale=False) + sklearnpls.fit(X.data_matrix[..., 0], y) + + rtol = 2e-4 + atol = 1e-6 + + # Check that the rotations are correct + np.testing.assert_allclose( + np.abs(fpls.x_rotations_).flatten(), + np.abs(sklearnpls.x_rotations_).flatten(), + rtol=rtol, + atol=atol, + ) + + + def test_basis_vs_grid(self,): + """Test that the results are the same in basis and grid.""" + + n_components = 5 + self.create_latent_variables(n_latent=n_components, n_samples=100) + + # Create the observed variable as basis variables + X_observed, X_rotations = self.create_observed_functional_variable( + discretized=False, + noise=0, + ) + y_observed, y_rotations = self.create_observed_functional_variable( + discretized=False, + noise=0, + ) + + # Fit the model + fpls = FPLS(n_components=n_components) + fpls.fit(X_observed, y_observed) + + # Convert the observed variables to grid + grid_points = np.linspace(0, 1, 2000) + X_observed_grid = X_observed.to_grid( + grid_points=grid_points, + ) + + y_observed_grid = y_observed.to_grid( + grid_points=grid_points, + ) + + # Fit the model with the grid variables + fpls_grid = FPLS(n_components=n_components) + fpls_grid.fit(X_observed_grid, y_observed_grid) + + + # Check that the results are the same + np.testing.assert_allclose( + np.abs(fpls.components_x_(self.grid_points)), + np.abs(fpls_grid.components_x_(self.grid_points)), + rtol=5e-3, + ) + + np.testing.assert_allclose( + np.abs(fpls.components_y_(self.grid_points)), + np.abs(fpls_grid.components_y_(self.grid_points)), + rtol=5e-3, + ) + + + + From d6569c23291023266c9745dbd555fd9fa14f5d37 Mon Sep 17 00:00:00 2001 From: Ddelval Date: Sat, 17 Jun 2023 19:17:16 +0200 Subject: [PATCH 06/46] Change default weights --- skfda/preprocessing/dim_reduction/_fpls.py | 12 +++++++++--- skfda/tests/test_fpls_regression.py | 16 +++++++++++----- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index 2b7ab468b..5c818fa3f 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -4,6 +4,7 @@ import multimethod import numpy as np +import scipy from sklearn.utils.validation import check_is_fitted from ..._utils._sklearn_adapter import BaseEstimator, InductiveTransformerMixin @@ -152,7 +153,7 @@ def __init__( regularization_Y: L2Regularization | None = None, weight_basis_X: Basis | None = None, weight_basis_Y: Basis | None = None, - deflation_mode: str = "reg", + deflation_mode: str = "can", ) -> None: self.n_components = n_components self.scale = scale @@ -187,7 +188,8 @@ def _process_input_x(self, X): def _process_input_x_grid(self, X: FDataGrid): x_mat = X.data_matrix[..., 0] if self.integration_weights_X is None: - self.integration_weights_X = np.ones(x_mat.shape[1]) + identity = np.eye(x_mat.shape[1]) + self.integration_weights_X = scipy.integrate.simps(identity, X.grid_points[0]) self.G_xw = np.diag(self.integration_weights_X) self.G_ww = self.G_xw @@ -283,7 +285,8 @@ def _process_input_y(self, Y): def _process_input_y_grid(self, Y: FDataGrid): y_mat = Y.data_matrix[..., 0] if self.integration_weights_Y is None: - self.integration_weights_Y = np.ones(y_mat.shape[1]) + identity = np.eye(y_mat.shape[1]) + self.integration_weights_Y = scipy.integrate.simps(identity, Y.grid_points[0]) self.G_yc = np.diag(self.integration_weights_Y) self.G_cc = self.G_yc @@ -414,6 +417,9 @@ def fit( Returns: self """ + if isinstance(y, np.ndarray) and len(y.shape) == 1: + y = y[:, np.newaxis] + calculate_mean = ( lambda x: x.mean() if isinstance(x, FData) else x.mean(axis=0) ) diff --git a/skfda/tests/test_fpls_regression.py b/skfda/tests/test_fpls_regression.py index ad5932232..c6c806cea 100644 --- a/skfda/tests/test_fpls_regression.py +++ b/skfda/tests/test_fpls_regression.py @@ -24,7 +24,7 @@ def test_sklearn(self): ) # Fit the model - fplsr = FPLSRegression(n_components=5) + fplsr = FPLSRegression(n_components=5, integration_weights_X=np.ones(len(X.grid_points[0]))) fplsr.fit(X, y) sklearnpls = PLSRegression(n_components=5, scale=False) @@ -92,7 +92,8 @@ def test_fda_usc_no_reg(self): return_X_y=True, ) - plsReg = FPLSRegression(n_components=5, scale=False) + plsReg = FPLSRegression(n_components=5, integration_weights_X=np.ones(len(X.grid_points[0]))) + print(plsReg.integration_weights_X) plsReg.fit(X, y) W = plsReg.fpls_.x_weights_ @@ -144,6 +145,7 @@ def test_fda_usc_reg(self): fplsr = FPLSRegression( n_components=5, regularization_X=regularization, + integration_weights_X=np.ones(len(X.grid_points[0])), ) fplsr.fit(X, y) @@ -201,14 +203,18 @@ def test_basis_vs_grid(self): rtol=3e-3, ) - def test_multivariate_regression(self): - """Test the multivariate regression.""" + @pytest.mark.parametrize("y_features", [1, 5]) + def test_multivariate_regression(self, y_features): + """Test the multivariate regression. + + Consider both scalar and multivariate responses. + """ self.create_latent_variables(n_latent=5, n_samples=100) # Check that the model is able to recover the latent variables # if it has enough components y_observed, y_rotations = self.create_observed_multivariate_variable( - n_features=5, + n_features=y_features, ) X_observed, X_rotations = self.create_observed_multivariate_variable( n_features=10, From 73e6689a7987c97023ffe068ae92cb346ae3b535 Mon Sep 17 00:00:00 2001 From: Ddelval Date: Sat, 17 Jun 2023 21:15:11 +0200 Subject: [PATCH 07/46] Refactor PLS to improve typing --- skfda/ml/regression/_fpls_regression.py | 66 ++- skfda/preprocessing/dim_reduction/_fpls.py | 502 +++++++++++++-------- 2 files changed, 330 insertions(+), 238 deletions(-) diff --git a/skfda/ml/regression/_fpls_regression.py b/skfda/ml/regression/_fpls_regression.py index cab8bddfe..98aa51b99 100644 --- a/skfda/ml/regression/_fpls_regression.py +++ b/skfda/ml/regression/_fpls_regression.py @@ -16,7 +16,6 @@ Input = Union[FData, NDArrayFloat] - class FPLSRegression( BaseEstimator, RegressorMixin[Input, Input], @@ -47,40 +46,27 @@ def __init__( self.weight_basis_X = weight_basis_X self.weight_basis_Y = weight_basis_Y - @multimethod.multidispatch - def _configure_x(self, X): - self.y_predictor = lambda X: X @ self.fpls_.G_xw @ self.coef_ - return X - - @_configure_x.register - def _configure_x_grid(self, X: FDataGrid): - self.y_predictor = ( - lambda X: X.data_matrix[..., 0] @ self.fpls_.G_xw @ self.coef_ - ) - - @_configure_x.register - def _configure_x_basis(self, X: FDataBasis): - self.y_predictor = ( - lambda X: X.coefficients @ self.fpls_.G_xw @ self.coef_ - ) - - @multimethod.multidispatch - def _configure_y(self, Y): - self.postprocess_response = lambda y_data: y_data - - @_configure_y.register - def _configure_y_grid(self, Y: FDataGrid): - self.postprocess_response = lambda y_data: Y.copy( - data_matrix=y_data, - sample_names=(None,) * y_data.shape[0], - ) - - @_configure_y.register - def _configure_y_basis(self, Y: FDataBasis): - self.postprocess_response = lambda y_data: Y.copy( - coefficients=y_data, - sample_names=(None,) * y_data.shape[0], - ) + def _predict_y(self, X: FData) -> FData: + if isinstance(X, FDataGrid): + return X.data_matrix[..., 0] @ self.fpls_.x_block.G_data_weights @ self.coef_ + elif isinstance(X, FDataBasis): + return X.coefficients @ self.fpls_.x_block.G_data_weights @ self.coef_ + else: + return X @ self.fpls_.x_block.G_data_weights @ self.coef_ + + def _postprocess_response(self, y_data: NDArrayFloat) -> FData: + if isinstance(self.train_y, FDataGrid): + return self.train_y.copy( + data_matrix=y_data, + sample_names=(None,) * y_data.shape[0], + ) + elif isinstance(self.train_y, FDataBasis): + return self.train_y.copy( + coefficients=y_data, + sample_names=(None,) * y_data.shape[0], + ) + else: + return y_data def fit( self, @@ -101,9 +87,6 @@ def fit( X = (X - self.x_mean) / self.x_std y = (y - self.y_mean) / self.y_std - self._configure_x(X) - self._configure_y(y) - self.fpls_ = FPLS( n_components=self.n_components, scale=False, @@ -119,6 +102,9 @@ def fit( self.coef_ = self.fpls_.x_rotations_ @ self.fpls_.y_loadings_.T + self.train_X = X + self.train_y = y + return self def predict(self, X: FData) -> NDArrayFloat: @@ -135,7 +121,7 @@ def predict(self, X: FData) -> NDArrayFloat: X = (X - self.x_mean) / self.x_std - y_scaled = self.y_predictor(X) - y_scaled = self.postprocess_response(y_scaled) + y_scaled = self._predict_y(X) + y_scaled = self._postprocess_response(y_scaled) return y_scaled * self.y_std + self.y_mean diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index 5c818fa3f..cbdd85e03 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -1,4 +1,5 @@ from __future__ import annotations +from abc import abstractmethod from typing import Optional, Tuple, Union @@ -131,228 +132,303 @@ def _pls_nipals( InputType = Union[FData, NDArrayFloat] -class FPLS( - BaseEstimator, - InductiveTransformerMixin[InputType, InputType, Optional[InputType]], -): - """ - Functional Partial Least Squares Regression. - - Attributes: - n_components: Number of components to keep. - ... - """ - +class FPLSBlock: def __init__( self, - n_components: int = 5, - scale: bool = False, - integration_weights_X: NDArrayFloat | None = None, - integration_weights_Y: NDArrayFloat | None = None, - regularization_X: L2Regularization | None = None, - regularization_Y: L2Regularization | None = None, - weight_basis_X: Basis | None = None, - weight_basis_Y: Basis | None = None, - deflation_mode: str = "can", + n_components: int, + label: str, + G_data_weights: NDArrayFloat, + G_weights: NDArrayFloat, + data_matrix: NDArrayFloat, + regularization_matrix: NDArrayFloat | None = None, ) -> None: self.n_components = n_components - self.scale = scale - self.integration_weights_X = integration_weights_X - self.integration_weights_Y = integration_weights_Y - self.regularization_X = regularization_X - self.regularization_Y = regularization_Y - self.weight_basis_X = weight_basis_X - self.weight_basis_Y = weight_basis_Y - self.deflation_mode = deflation_mode + self.label = label + self.G_data_weights = G_data_weights + self.G_weights = G_weights + self.data_matrix = data_matrix - @multimethod.multidispatch - def _process_input_x(self, X): - """ - Process the input data of the X block. + if regularization_matrix is None: + regularization_matrix = np.zeros((data_matrix.shape[1], data_matrix.shape[1])) + self.regularization_matrix = regularization_matrix - This method is called by the fit method and - it is implemented for each type of input data. - """ - self.G_xw = np.identity(X.shape[1]) - self.G_ww = self.G_xw - self._y_predictor = lambda X: X @ self.G_xw @ self.coef_ - self._make_component_x = lambda: self.x_rotations_ - self._regularization_matrix_X = 0 - - self._transform_x = lambda X: X @ self.G_xw @ self.x_rotations_ - self._inv_transform_x = lambda T: T @ self.x_loadings_.T - - return X - - @_process_input_x.register - def _process_input_x_grid(self, X: FDataGrid): - x_mat = X.data_matrix[..., 0] - if self.integration_weights_X is None: - identity = np.eye(x_mat.shape[1]) - self.integration_weights_X = scipy.integrate.simps(identity, X.grid_points[0]) - - self.G_xw = np.diag(self.integration_weights_X) - self.G_ww = self.G_xw - - if self.regularization_X is not None: - self._regularization_matrix_X = compute_penalty_matrix( - basis_iterable=(_GridBasis(grid_points=X.grid_points),), - regularization_parameter=1, - regularization=self.regularization_X, - ) - else: - self._regularization_matrix_X = 0 - self._y_predictor = ( - lambda X: X.data_matrix[..., 0] @ self.G_xw @ self.coef_ - ) + def set_values( + self, + rotations: NDArrayFloat, + loadings: NDArrayFloat, + ): + self.rotations = rotations + self.loadings = loadings - self._make_component_x = lambda: X.copy( - data_matrix=np.transpose(self.x_rotations_), - sample_names=(None,) * self.n_components, - dataset_name="FPLS X components", - ) + @abstractmethod + def make_component(self) -> NDArrayFloat | FData: + pass - self._transform_x = ( - lambda X: X.data_matrix[..., 0] @ self.G_xw @ self.x_rotations_ - ) + @abstractmethod + def transform(self, X: NDArrayFloat | FData) -> NDArrayFloat: + pass - self._inv_transform_x = lambda T: X.copy( - data_matrix=T @ self.x_loadings_.T, - sample_names=(None,) * T.shape[0], - dataset_name="FPLS X components", - ) + @abstractmethod + def inverse_transform( + self, components: NDArrayFloat + ) -> NDArrayFloat | FData: + pass - return x_mat + @abstractmethod + def y_predict( + self, X: NDArrayFloat | FData, coef: NDArrayFloat | FData + ) -> NDArrayFloat | FData: + pass - @_process_input_x.register - def _process_input_x_basis(self, X: FDataBasis): - x_mat = X.coefficients - if self.weight_basis_Y is None: - self.weight_basis_Y = X.basis +class FPLSBlockDataMultivariate(FPLSBlock): + def __init__( + self, + data: NDArrayFloat, + n_components: int, + label: str, + ) -> None: + self.data = data + + super().__init__( + n_components=n_components, + label=label, + G_data_weights=np.identity(data.shape[1]), + G_weights=np.identity(data.shape[1]), + data_matrix=data, + ) - self.G_xw = X.basis.inner_product_matrix(self.weight_basis_Y) - self.G_ww = self.weight_basis_Y.gram_matrix() + def make_component(self): + return self.rotations - if self.regularization_X is not None: - self._regularization_matrix_X = compute_penalty_matrix( - basis_iterable=(self.weight_basis_Y,), - regularization_parameter=1, - regularization=self.regularization_X, + def transform(self, data: NDArrayFloat | FData) -> NDArrayFloat: + if not isinstance(data, NDArrayFloat): + raise TypeError( + "Attribute in block " + self.label + " must be a numpy array" ) - else: - self._regularization_matrix_X = 0 + return data @ self.rotations - self._y_predictor = lambda X: X.coefficients @ self.G_xw @ self.coef_ - - self._make_component_x = lambda: X.copy( - coefficients=np.transpose(self.x_rotations_), - sample_names=(None,) * self.n_components, - dataset_name="FPLS X components", - ) + def inverse_transform(self, components: NDArrayFloat) -> NDArrayFloat: + return components @ self.loadings.T - self._transform_x = ( - lambda X: X.coefficients @ self.G_xw @ self.x_rotations_ - ) + def y_predict( + self, X: NDArrayFloat | FData, coef: NDArrayFloat | FData + ) -> NDArrayFloat | FData: + if not isinstance(X, NDArrayFloat): + raise TypeError( + "Attribute in block " + self.label + " must be a numpy array" + ) + return X @ coef - self._inv_transform_x = lambda T: X.copy( - coefficients=T @ self.x_loadings_.T, - sample_names=(None,) * T.shape[0], - dataset_name="FPLS X reconstructions", - ) - return x_mat +class FPLSBlockDataGrid(FPLSBlock): + def __init__( + self, + data: FDataGrid, + n_components: int, + label: str, + integration_weights: NDArrayFloat | None, + regularization: L2Regularization | None, + ) -> None: + self.data = data + data_mat = data.data_matrix[..., 0] + + if integration_weights is None: + identity = np.identity(data_mat.shape[1]) + integration_weights = scipy.integrate.simps( + identity, + data.grid_points[0], + ) + self.integration_weights = integration_weights - @multimethod.multidispatch - def _process_input_y(self, Y): - """ - Process the input data of the Y block. + self.G_data_weights = np.diag(self.integration_weights) + self.G_weights = np.diag(self.integration_weights) - This method is called by the fit method and - it is implemented for each type of input data. - """ - self.G_yc = np.identity(Y.shape[1]) - self.G_cc = np.identity(Y.shape[1]) - self._regularization_matrix_Y = 0 - self._make_component_y = lambda: self.y_rotations_ - - self._transform_y = lambda Y: Y @ self.G_yc @ self.y_rotations_ - self._inv_transform_y = lambda T: T @ self.y_loadings_.T - return Y - - @_process_input_y.register - def _process_input_y_grid(self, Y: FDataGrid): - y_mat = Y.data_matrix[..., 0] - if self.integration_weights_Y is None: - identity = np.eye(y_mat.shape[1]) - self.integration_weights_Y = scipy.integrate.simps(identity, Y.grid_points[0]) - - self.G_yc = np.diag(self.integration_weights_Y) - self.G_cc = self.G_yc - - if self.regularization_Y is not None: - self._regularization_matrix_Y = compute_penalty_matrix( - basis_iterable=(_GridBasis(grid_points=Y.grid_points),), + regularization_matrix = None + if regularization is not None: + regularization_matrix = compute_penalty_matrix( + basis_iterable=(_GridBasis(grid_points=data.grid_points),), regularization_parameter=1, - regularization=self.regularization_Y, + regularization=regularization, ) - else: - self._regularization_matrix_Y = 0 + + super().__init__( + n_components=n_components, + label=label, + G_data_weights=np.diag(self.integration_weights), + G_weights=np.diag(self.integration_weights), + regularization_matrix=regularization_matrix, + data_matrix=data_mat, + ) - self._make_component_y = lambda: Y.copy( - data_matrix=np.transpose(self.y_rotations_), + def make_component(self) -> FDataGrid: + return self.data.copy( + data_matrix=np.transpose(self.rotations), sample_names=(None,) * self.n_components, - dataset_name="FPLS Y components", + dataset_name="FPLS " + self.label + " components", ) - self._transform_y = ( - lambda Y: Y.data_matrix[..., 0] @ self.G_yc @ self.y_rotations_ - ) + def transform(self, data: FDataGrid | NDArrayFloat) -> NDArrayFloat: + if not isinstance(data, FDataGrid): + raise TypeError( + "Attribute in block " + self.label + " must be an FDataGrid" + ) + return data.data_matrix[..., 0] @ self.G_data_weights @ self.rotations - self._inv_transform_y = lambda T: Y.copy( - data_matrix=T @ self.y_loadings_.T, - sample_names=(None,) * T.shape[0], - dataset_name="FPLS Y reconstructins", + def inverse_transform(self, components: NDArrayFloat) -> NDArrayFloat: + return self.data.copy( + data_matrix=components @ self.loadings.T, + sample_names=(None,) * components.shape[0], + dataset_name="FPLS " + self.label + " reconstructions", ) - return y_mat + def y_predict( + self, X: NDArrayFloat | FData, coef: NDArrayFloat + ) -> NDArrayFloat: + if not isinstance(X, FDataGrid): + raise TypeError( + "Attribute in block " + self.label + " must be an FDataGrid" + ) + return X.data_matrix[..., 0] @ self.G_data_weights @ coef - @_process_input_y.register - def _process_input_y_basis(self, Y: FDataBasis): - y_mat = Y.coefficients - self.G_yc = Y.basis.gram_matrix() - self.G_cc = self.G_yc - if self.weight_basis_Y is None: - self.weight_basis_Y = Y.basis +class FPLSBlockDataBasis(FPLSBlock): + def __init__( + self, + data: FDataBasis, + n_components: int, + label: str, + weight_basis: Basis | None, + regularization: L2Regularization | None, + ) -> None: + self.data = data - if self.regularization_Y is not None: - self._regularization_matrix_Y = compute_penalty_matrix( - basis_iterable=(self.weight_basis_Y,), + if weight_basis is None: + self.weight_basis = data.basis + else: + self.weight_basis = weight_basis + + regularization_matrix = None + if regularization is not None: + regularization_matrix = compute_penalty_matrix( + basis_iterable=(self.weight_basis,), regularization_parameter=1, - regularization=self.regularization_Y, + regularization=regularization, ) - else: - self._regularization_matrix_Y = 0 - self._make_component_y = lambda: Y.copy( - coefficients=np.transpose(self.y_rotations_), + super().__init__( + n_components=n_components, + label=label, + G_data_weights=self.weight_basis.gram_matrix(), + G_weights=data.basis.inner_product_matrix( + self.weight_basis, + ), + regularization_matrix=regularization_matrix, + data_matrix=data.coefficients, + ) + + def check_type(self, data: NDArrayFloat | FData) -> None: + if not isinstance(data, FDataBasis): + raise TypeError( + "Attribute in block " + self.label + " must be an FDataBasis" + ) + + def make_component(self): + return self.data.copy( + coefficients=np.transpose(self.rotations), sample_names=(None,) * self.n_components, - dataset_name="FPLS Y components", + dataset_name="FPLS " + self.label + " components", ) - self._transform_y = ( - lambda Y: Y.coefficients @ self.G_yc @ self.y_rotations_ + def transform(self, data: NDArrayFloat | FData) -> NDArrayFloat: + if not isinstance(data, FDataBasis): + raise TypeError( + "Attribute in block " + self.label + " must be an FDataBasis" + ) + return data.coefficients @ self.G_weights @ self.rotations + + def inverse_transform(self, components: NDArrayFloat) -> NDArrayFloat: + return self.data.copy( + coefficients=components @ self.loadings.T, + sample_names=(None,) * components.shape[0], + dataset_name="FPLS " + self.label + " reconstructions", ) - self._inv_transform_y = lambda T: Y.copy( - coefficients=T @ self.y_loadings_.T, - sample_names=(None,) * T.shape[0], - dataset_name="FPLS Y reconstructions", + def y_predict( + self, X: NDArrayFloat | FData, coef: NDArrayFloat + ) -> NDArrayFloat: + if not isinstance(X, FDataBasis): + raise TypeError( + "Attribute in block " + self.label + " must be an FDataBasis" + ) + return X.coefficients @ self.G_data_weights @ coef + + +def blockFactory( + data, + n_components: int, + label: str, + integration_weights: NDArrayFloat | None, + regularization: L2Regularization | None, + weight_basis: Basis | None, +) -> FPLSBlock: + if isinstance(data, FDataGrid): + return FPLSBlockDataGrid( + data=data, + n_components=n_components, + label=label, + integration_weights=integration_weights, + regularization=regularization, ) + elif isinstance(data, FDataBasis): + return FPLSBlockDataBasis( + data=data, + n_components=n_components, + label=label, + weight_basis=weight_basis, + regularization=regularization, + ) + return FPLSBlockDataMultivariate( + data=data, + n_components=n_components, + label=label, + ) + + +class FPLS( + BaseEstimator, + InductiveTransformerMixin[InputType, InputType, Optional[InputType]], +): + """ + Functional Partial Least Squares Regression. + + Attributes: + n_components: Number of components to keep. + ... + """ - return y_mat + def __init__( + self, + n_components: int = 5, + scale: bool = False, + integration_weights_X: NDArrayFloat | None = None, + integration_weights_Y: NDArrayFloat | None = None, + regularization_X: L2Regularization | None = None, + regularization_Y: L2Regularization | None = None, + weight_basis_X: Basis | None = None, + weight_basis_Y: Basis | None = None, + deflation_mode: str = "can", + ) -> None: + self.n_components = n_components + self.scale = scale + self.integration_weights_X = integration_weights_X + self.integration_weights_Y = integration_weights_Y + self.regularization_X = regularization_X + self.regularization_Y = regularization_Y + self.weight_basis_X = weight_basis_X + self.weight_basis_Y = weight_basis_Y + self.deflation_mode = deflation_mode def _fit_data( self, @@ -360,25 +436,43 @@ def _fit_data( Y: InputType, ) -> None: """Fit the model using X and Y as already centered data.""" - x_mat = self._process_input_x(X) + self.x_block = blockFactory( + data=X, + n_components=self.n_components, + label="X", + integration_weights=self.integration_weights_X, + regularization=self.regularization_X, + weight_basis=self.weight_basis_X, + ) - penalization_matrix = self.G_ww + self._regularization_matrix_X + penalization_matrix = ( + self.x_block.G_weights + self.x_block.regularization_matrix + ) L_X_inv = np.linalg.inv(np.linalg.cholesky(penalization_matrix)) - y_mat = self._process_input_y(Y) + self.y_block = blockFactory( + data=Y, + n_components=self.n_components, + label="X", + integration_weights=self.integration_weights_Y, + regularization=self.regularization_Y, + weight_basis=self.weight_basis_Y, + ) - penalization_matrix = self.G_cc + self._regularization_matrix_Y + penalization_matrix = ( + self.y_block.G_weights + self.y_block.regularization_matrix + ) L_Y_inv = np.linalg.inv(np.linalg.cholesky(penalization_matrix)) # Supress flake8 warning about too many values to unpack W, C, T, U, P, Q = _pls_nipals( # noqa: WPS236 - x_mat, - y_mat, - self.n_components, - G_ww=self.G_ww, - G_xw=self.G_xw, - G_cc=self.G_cc, - G_yc=self.G_yc, + X=self.x_block.data_matrix, + Y=self.y_block.data_matrix, + n_components=self.n_components, + G_ww=self.x_block.G_weights, + G_xw=self.x_block.G_data_weights, + G_cc=self.y_block.G_weights, + G_yc=self.y_block.G_data_weights, L_X_inv=L_X_inv, L_Y_inv=L_Y_inv, deflation=self.deflation_mode, @@ -390,13 +484,25 @@ def _fit_data( self.y_scores_ = U self.x_loadings_ = P self.y_loadings_ = Q + self.x_rotations_ = W @ np.linalg.pinv( + P.T @ self.x_block.G_data_weights @ W + ) - self.x_rotations_ = W @ np.linalg.pinv(P.T @ self.G_xw @ W) + self.y_rotations_ = C @ np.linalg.pinv( + Q.T @ self.y_block.G_data_weights @ C + ) - self.y_rotations_ = C @ np.linalg.pinv(Q.T @ self.G_yc @ C) + self.x_block.set_values( + rotations=self.x_rotations_, + loadings=self.x_loadings_, + ) + self.y_block.set_values( + rotations=self.y_rotations_, + loadings=self.y_loadings_, + ) - self.components_x_ = self._make_component_x() - self.components_y_ = self._make_component_y() + self.components_x_ = self.x_block.make_component() + self.components_y_ = self.y_block.make_component() self.coef_ = self.x_rotations_ @ Q.T @@ -418,7 +524,7 @@ def fit( self """ if isinstance(y, np.ndarray) and len(y.shape) == 1: - y = y[:, np.newaxis] + y = y[:, np.newaxis] calculate_mean = ( lambda x: x.mean() if isinstance(x, FData) else x.mean(axis=0) @@ -461,11 +567,11 @@ def transform( check_is_fitted(self) X = (X - self.x_mean) / self.x_std - x_scores = self._transform_x(X) + x_scores = self.x_block.transform(X) if y is not None: y = (y - self.y_mean) / self.y_std - y_scores = self._transform_y(y) + y_scores = self.y_block.transform(y) return x_scores, y_scores return x_scores @@ -491,10 +597,10 @@ def inverse_transform( check_is_fitted(self) X = X * self.x_std + self.x_mean - x_recon = self._inv_transform_x(X) + x_recon = self.x_block.inverse_transform(X) if Y is not None: Y = Y * self.y_std + self.y_mean - y_recon = self._inv_transform_y(Y) + y_recon = self.y_block.inverse_transform(Y) return x_recon, y_recon return x_recon From 49748fc8d4908168a0de73620471a1bcbff74f8d Mon Sep 17 00:00:00 2001 From: Ddelval Date: Sat, 17 Jun 2023 22:40:24 +0200 Subject: [PATCH 08/46] Fix style issues in _fpls --- skfda/ml/regression/_fpls_regression.py | 6 +- skfda/preprocessing/dim_reduction/_fpls.py | 119 +++++++++++---------- 2 files changed, 64 insertions(+), 61 deletions(-) diff --git a/skfda/ml/regression/_fpls_regression.py b/skfda/ml/regression/_fpls_regression.py index 98aa51b99..2cb9cba58 100644 --- a/skfda/ml/regression/_fpls_regression.py +++ b/skfda/ml/regression/_fpls_regression.py @@ -48,11 +48,11 @@ def __init__( def _predict_y(self, X: FData) -> FData: if isinstance(X, FDataGrid): - return X.data_matrix[..., 0] @ self.fpls_.x_block.G_data_weights @ self.coef_ + return X.data_matrix[..., 0] @ self.fpls_._x_block.G_data_weights @ self.coef_ elif isinstance(X, FDataBasis): - return X.coefficients @ self.fpls_.x_block.G_data_weights @ self.coef_ + return X.coefficients @ self.fpls_._x_block.G_data_weights @ self.coef_ else: - return X @ self.fpls_.x_block.G_data_weights @ self.coef_ + return X @ self.fpls_._x_block.G_data_weights @ self.coef_ def _postprocess_response(self, y_data: NDArrayFloat) -> FData: if isinstance(self.train_y, FDataGrid): diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index cbdd85e03..872666c42 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -1,7 +1,8 @@ from __future__ import annotations -from abc import abstractmethod -from typing import Optional, Tuple, Union +from abc import abstractmethod +from typing import Optional, Tuple, Union, overload +from typing_extensions import override import multimethod import numpy as np @@ -149,10 +150,11 @@ def __init__( self.data_matrix = data_matrix if regularization_matrix is None: - regularization_matrix = np.zeros((data_matrix.shape[1], data_matrix.shape[1])) + regularization_matrix = np.zeros( + (data_matrix.shape[1], data_matrix.shape[1]), + ) self.regularization_matrix = regularization_matrix - def set_values( self, rotations: NDArrayFloat, @@ -166,18 +168,24 @@ def make_component(self) -> NDArrayFloat | FData: pass @abstractmethod - def transform(self, X: NDArrayFloat | FData) -> NDArrayFloat: + def transform( + self, + X: NDArrayFloat | FData, + ) -> NDArrayFloat: pass @abstractmethod def inverse_transform( - self, components: NDArrayFloat + self, + components: NDArrayFloat, ) -> NDArrayFloat | FData: pass @abstractmethod def y_predict( - self, X: NDArrayFloat | FData, coef: NDArrayFloat | FData + self, + X: NDArrayFloat | FData, + coef: NDArrayFloat | FData, ) -> NDArrayFloat | FData: pass @@ -205,7 +213,7 @@ def make_component(self): def transform(self, data: NDArrayFloat | FData) -> NDArrayFloat: if not isinstance(data, NDArrayFloat): raise TypeError( - "Attribute in block " + self.label + " must be a numpy array" + f"Data in block {self.label} must be a numpy array", ) return data @ self.rotations @@ -213,11 +221,11 @@ def inverse_transform(self, components: NDArrayFloat) -> NDArrayFloat: return components @ self.loadings.T def y_predict( - self, X: NDArrayFloat | FData, coef: NDArrayFloat | FData + self, X: NDArrayFloat | FData, coef: NDArrayFloat | FData, ) -> NDArrayFloat | FData: if not isinstance(X, NDArrayFloat): raise TypeError( - "Attribute in block " + self.label + " must be a numpy array" + f"X data in block {self.label} must be a numpy array", ) return X @ coef @@ -252,7 +260,7 @@ def __init__( regularization_parameter=1, regularization=regularization, ) - + super().__init__( n_components=n_components, label=label, @@ -266,13 +274,13 @@ def make_component(self) -> FDataGrid: return self.data.copy( data_matrix=np.transpose(self.rotations), sample_names=(None,) * self.n_components, - dataset_name="FPLS " + self.label + " components", + dataset_name=f"FPLS {self.label} components", ) def transform(self, data: FDataGrid | NDArrayFloat) -> NDArrayFloat: if not isinstance(data, FDataGrid): raise TypeError( - "Attribute in block " + self.label + " must be an FDataGrid" + f"Data in block {self.label} must be an FDataGrid", ) return data.data_matrix[..., 0] @ self.G_data_weights @ self.rotations @@ -280,15 +288,15 @@ def inverse_transform(self, components: NDArrayFloat) -> NDArrayFloat: return self.data.copy( data_matrix=components @ self.loadings.T, sample_names=(None,) * components.shape[0], - dataset_name="FPLS " + self.label + " reconstructions", + dataset_name=f"FPLS {self.label} components", ) def y_predict( - self, X: NDArrayFloat | FData, coef: NDArrayFloat + self, X: NDArrayFloat | FData, coef: NDArrayFloat, ) -> NDArrayFloat: if not isinstance(X, FDataGrid): raise TypeError( - "Attribute in block " + self.label + " must be an FDataGrid" + f"X data in block {self.label} must be an FDataGrid", ) return X.data_matrix[..., 0] @ self.G_data_weights @ coef @@ -328,23 +336,18 @@ def __init__( data_matrix=data.coefficients, ) - def check_type(self, data: NDArrayFloat | FData) -> None: - if not isinstance(data, FDataBasis): - raise TypeError( - "Attribute in block " + self.label + " must be an FDataBasis" - ) def make_component(self): return self.data.copy( coefficients=np.transpose(self.rotations), sample_names=(None,) * self.n_components, - dataset_name="FPLS " + self.label + " components", + dataset_name=f"FPLS {self.label} components", ) def transform(self, data: NDArrayFloat | FData) -> NDArrayFloat: if not isinstance(data, FDataBasis): raise TypeError( - "Attribute in block " + self.label + " must be an FDataBasis" + f"Data in block {self.label} must be an FDataBasis", ) return data.coefficients @ self.G_weights @ self.rotations @@ -352,7 +355,7 @@ def inverse_transform(self, components: NDArrayFloat) -> NDArrayFloat: return self.data.copy( coefficients=components @ self.loadings.T, sample_names=(None,) * components.shape[0], - dataset_name="FPLS " + self.label + " reconstructions", + dataset_name=f"FPLS {self.label} reconstructions", ) def y_predict( @@ -360,12 +363,12 @@ def y_predict( ) -> NDArrayFloat: if not isinstance(X, FDataBasis): raise TypeError( - "Attribute in block " + self.label + " must be an FDataBasis" + f"X data in block {self.label} must be an FDataBasis", ) return X.coefficients @ self.G_data_weights @ coef -def blockFactory( +def block_factory( data, n_components: int, label: str, @@ -436,7 +439,7 @@ def _fit_data( Y: InputType, ) -> None: """Fit the model using X and Y as already centered data.""" - self.x_block = blockFactory( + self._x_block = block_factory( data=X, n_components=self.n_components, label="X", @@ -446,11 +449,11 @@ def _fit_data( ) penalization_matrix = ( - self.x_block.G_weights + self.x_block.regularization_matrix + self._x_block.G_weights + self._x_block.regularization_matrix ) L_X_inv = np.linalg.inv(np.linalg.cholesky(penalization_matrix)) - self.y_block = blockFactory( + self._y_block = block_factory( data=Y, n_components=self.n_components, label="X", @@ -460,19 +463,19 @@ def _fit_data( ) penalization_matrix = ( - self.y_block.G_weights + self.y_block.regularization_matrix + self._y_block.G_weights + self._y_block.regularization_matrix ) L_Y_inv = np.linalg.inv(np.linalg.cholesky(penalization_matrix)) # Supress flake8 warning about too many values to unpack W, C, T, U, P, Q = _pls_nipals( # noqa: WPS236 - X=self.x_block.data_matrix, - Y=self.y_block.data_matrix, + X=self._x_block.data_matrix, + Y=self._y_block.data_matrix, n_components=self.n_components, - G_ww=self.x_block.G_weights, - G_xw=self.x_block.G_data_weights, - G_cc=self.y_block.G_weights, - G_yc=self.y_block.G_data_weights, + G_ww=self._x_block.G_weights, + G_xw=self._x_block.G_data_weights, + G_cc=self._y_block.G_weights, + G_yc=self._y_block.G_data_weights, L_X_inv=L_X_inv, L_Y_inv=L_Y_inv, deflation=self.deflation_mode, @@ -485,24 +488,24 @@ def _fit_data( self.x_loadings_ = P self.y_loadings_ = Q self.x_rotations_ = W @ np.linalg.pinv( - P.T @ self.x_block.G_data_weights @ W + P.T @ self._x_block.G_data_weights @ W, ) self.y_rotations_ = C @ np.linalg.pinv( - Q.T @ self.y_block.G_data_weights @ C + Q.T @ self._y_block.G_data_weights @ C, ) - self.x_block.set_values( + self._x_block.set_values( rotations=self.x_rotations_, loadings=self.x_loadings_, ) - self.y_block.set_values( + self._y_block.set_values( rotations=self.y_rotations_, loadings=self.y_loadings_, ) - self.components_x_ = self.x_block.make_component() - self.components_y_ = self.y_block.make_component() + self.components_x_ = self._x_block.make_component() + self.components_y_ = self._y_block.make_component() self.coef_ = self.x_rotations_ @ Q.T @@ -530,17 +533,17 @@ def fit( lambda x: x.mean() if isinstance(x, FData) else x.mean(axis=0) ) # Center and scale data - self.x_mean = calculate_mean(X) - self.y_mean = calculate_mean(y) + self._x_mean = calculate_mean(X) + self._y_mean = calculate_mean(y) if self.scale: - self.x_std = X.std() - self.y_std = y.std() + self._x_std = X.std() + self._y_std = y.std() else: - self.x_std = 1 - self.y_std = 1 + self._x_std = 1 + self._y_std = 1 - X = (X - self.x_mean) / self.x_std - y = (y - self.y_mean) / self.y_std + X = (X - self._x_mean) / self._x_std + y = (y - self._y_mean) / self._y_std self._fit_data(X, y) return self @@ -566,12 +569,12 @@ def transform( """ check_is_fitted(self) - X = (X - self.x_mean) / self.x_std - x_scores = self.x_block.transform(X) + X = (X - self._x_mean) / self._x_std + x_scores = self._x_block.transform(X) if y is not None: - y = (y - self.y_mean) / self.y_std - y_scores = self.y_block.transform(y) + y = (y - self._y_mean) / self._y_std + y_scores = self._y_block.transform(y) return x_scores, y_scores return x_scores @@ -596,11 +599,11 @@ def inverse_transform( """ check_is_fitted(self) - X = X * self.x_std + self.x_mean - x_recon = self.x_block.inverse_transform(X) + X = X * self._x_std + self._x_mean + x_recon = self._x_block.inverse_transform(X) if Y is not None: - Y = Y * self.y_std + self.y_mean - y_recon = self.y_block.inverse_transform(Y) + Y = Y * self._y_std + self._y_mean + y_recon = self._y_block.inverse_transform(Y) return x_recon, y_recon return x_recon From 4ceb41458852232b8fc3007bd4efef3fae68bd94 Mon Sep 17 00:00:00 2001 From: Ddelval Date: Wed, 21 Jun 2023 21:39:20 +0200 Subject: [PATCH 09/46] Check transform methods in test --- skfda/tests/test_fpls.py | 60 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/skfda/tests/test_fpls.py b/skfda/tests/test_fpls.py index 78263e4e0..df9225508 100644 --- a/skfda/tests/test_fpls.py +++ b/skfda/tests/test_fpls.py @@ -112,6 +112,37 @@ def test_sklearn(self): atol=atol, ) + # Check the transformation of X + np.testing.assert_allclose( + fpls.transform(X, y), + sklearnpls.transform(X.data_matrix[..., 0],y), + rtol=rtol, + atol=1e-5, + ) + + comp_x, comp_y = fpls.transform(X,y) + + fpls_inv_x, fpls_inv_y = fpls.inverse_transform(comp_x, comp_y) + sklearnpls_inv_x, sklearnpls_inv_y = sklearnpls.inverse_transform( + comp_x, + comp_y, + ) + # Check the inverse transformation of X + np.testing.assert_allclose( + fpls_inv_x.data_matrix.flatten(), + sklearnpls_inv_x.flatten(), + rtol=rtol, + atol=atol, + ) + + # Check the inverse transformation of y + np.testing.assert_allclose( + fpls_inv_y.flatten(), + sklearnpls_inv_y.flatten(), + rtol=rtol, + atol=atol, + ) + def test_basis_vs_grid(self,): """Test that the results are the same in basis and grid.""" @@ -161,6 +192,35 @@ def test_basis_vs_grid(self,): rtol=5e-3, ) + # Check that the transform is the same + np.testing.assert_allclose( + np.abs(fpls.transform(X_observed, y_observed)), + np.abs(fpls_grid.transform(X_observed_grid, y_observed_grid)), + rtol=5e-3, + ) + # Check the inverse transform + comp_x, comp_y = fpls.transform(X_observed, y_observed) + + fpls_inv_x, fpls_inv_y = fpls.inverse_transform(comp_x, comp_y) + fpls_grid_x, fpsl_grid_y = fpls_grid.inverse_transform( + comp_x, + comp_y, + ) + # Check the inverse transformation of X + np.testing.assert_allclose( + fpls_inv_x(grid_points), + fpls_grid_x(grid_points), + rtol=7e-2, + ) + + # Check the inverse transformation of y + np.testing.assert_allclose( + fpls_inv_y(grid_points), + fpsl_grid_y(grid_points), + rtol=0.13, + ) + + From b6280fc982b7d7c56bfff012a3fa1de0d85bfac2 Mon Sep 17 00:00:00 2001 From: Ddelval Date: Wed, 21 Jun 2023 21:39:31 +0200 Subject: [PATCH 10/46] Cleanup fpls implementation --- skfda/preprocessing/dim_reduction/_fpls.py | 35 +++++++++------------- skfda/tests/test_fpls_regression.py | 3 +- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index 872666c42..8ec9585f6 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -211,7 +211,7 @@ def make_component(self): return self.rotations def transform(self, data: NDArrayFloat | FData) -> NDArrayFloat: - if not isinstance(data, NDArrayFloat): + if not isinstance(data, np.ndarray): raise TypeError( f"Data in block {self.label} must be a numpy array", ) @@ -223,7 +223,7 @@ def inverse_transform(self, components: NDArrayFloat) -> NDArrayFloat: def y_predict( self, X: NDArrayFloat | FData, coef: NDArrayFloat | FData, ) -> NDArrayFloat | FData: - if not isinstance(X, NDArrayFloat): + if not isinstance(X, np.ndarray): raise TypeError( f"X data in block {self.label} must be a numpy array", ) @@ -250,9 +250,6 @@ def __init__( ) self.integration_weights = integration_weights - self.G_data_weights = np.diag(self.integration_weights) - self.G_weights = np.diag(self.integration_weights) - regularization_matrix = None if regularization is not None: regularization_matrix = compute_penalty_matrix( @@ -307,20 +304,20 @@ def __init__( data: FDataBasis, n_components: int, label: str, - weight_basis: Basis | None, + weights_basis: Basis | None, regularization: L2Regularization | None, ) -> None: self.data = data - if weight_basis is None: - self.weight_basis = data.basis + if weights_basis is None: + self.weights_basis = data.basis else: - self.weight_basis = weight_basis + self.weights_basis = weights_basis regularization_matrix = None if regularization is not None: regularization_matrix = compute_penalty_matrix( - basis_iterable=(self.weight_basis,), + basis_iterable=(self.weights_basis,), regularization_parameter=1, regularization=regularization, ) @@ -328,9 +325,9 @@ def __init__( super().__init__( n_components=n_components, label=label, - G_data_weights=self.weight_basis.gram_matrix(), + G_data_weights=self.weights_basis.gram_matrix(), G_weights=data.basis.inner_product_matrix( - self.weight_basis, + self.weights_basis, ), regularization_matrix=regularization_matrix, data_matrix=data.coefficients, @@ -389,7 +386,7 @@ def block_factory( data=data, n_components=n_components, label=label, - weight_basis=weight_basis, + weights_basis=weight_basis, regularization=regularization, ) return FPLSBlockDataMultivariate( @@ -535,12 +532,8 @@ def fit( # Center and scale data self._x_mean = calculate_mean(X) self._y_mean = calculate_mean(y) - if self.scale: - self._x_std = X.std() - self._y_std = y.std() - else: - self._x_std = 1 - self._y_std = 1 + self._x_std = 1 + self._y_std = 1 X = (X - self._x_mean) / self._x_std y = (y - self._y_mean) / self._y_std @@ -599,11 +592,11 @@ def inverse_transform( """ check_is_fitted(self) - X = X * self._x_std + self._x_mean x_recon = self._x_block.inverse_transform(X) + x_recon = x_recon * self._x_std + self._x_mean if Y is not None: - Y = Y * self._y_std + self._y_mean y_recon = self._y_block.inverse_transform(Y) + y_recon = y_recon * self._y_std + self._y_mean return x_recon, y_recon return x_recon diff --git a/skfda/tests/test_fpls_regression.py b/skfda/tests/test_fpls_regression.py index c6c806cea..bbc5b0dc5 100644 --- a/skfda/tests/test_fpls_regression.py +++ b/skfda/tests/test_fpls_regression.py @@ -16,6 +16,7 @@ class TestFPLSRegression: """Test the FPLSRegression class.""" + def test_sklearn(self): """Compare results with sklearn.""" # Load the data @@ -24,7 +25,7 @@ def test_sklearn(self): ) # Fit the model - fplsr = FPLSRegression(n_components=5, integration_weights_X=np.ones(len(X.grid_points[0]))) + fplsr = FPLSRegression(n_components=5, integration_weights_X=np.ones(len(X.grid_points[0])), scale=False) fplsr.fit(X, y) sklearnpls = PLSRegression(n_components=5, scale=False) From d16299c7a4224516bd65404ab23bd3c732c82e4e Mon Sep 17 00:00:00 2001 From: David del Val Date: Thu, 22 Jun 2023 21:17:09 +0200 Subject: [PATCH 11/46] Fix typing and linting errors --- skfda/preprocessing/dim_reduction/_fpls.py | 345 ++++++++++++++------- 1 file changed, 241 insertions(+), 104 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index 8ec9585f6..1b39c03b2 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -1,8 +1,7 @@ from __future__ import annotations from abc import abstractmethod -from typing import Optional, Tuple, Union, overload -from typing_extensions import override +from typing import Optional, Tuple, Union, Any import multimethod import numpy as np @@ -18,7 +17,8 @@ POWER_SOLVER_EPS = 1e-15 -def _power_solver(X): +def _power_solver(X: NDArrayFloat) -> NDArrayFloat: + """Return the dominant eigenvector of a matrix using the power method.""" t = X[:, 0] t_prev = np.ones(t.shape) * np.max(t) * 2 iter_count = 0 @@ -33,15 +33,25 @@ def _power_solver(X): def _calculate_weights( - X, - Y, - G_ww, - G_xw, - G_cc, - G_yc, - L_X_inv, - L_Y_inv, -): + X: NDArrayFloat, + Y: NDArrayFloat, + G_ww: NDArrayFloat, + G_xw: NDArrayFloat, + G_cc: NDArrayFloat, + G_yc: NDArrayFloat, + L_X_inv: NDArrayFloat, + L_Y_inv: NDArrayFloat, +) -> Tuple[NDArrayFloat, NDArrayFloat]: + """ + Calculate the weights for the PLS algorithm. + + Parameters as in _pls_nipals. + Returns: + - w: (n_features, 1) + The X block weights. + - c: (n_targets, 1) + The Y block weights. + """ X = X @ G_xw @ L_X_inv.T Y = Y @ G_yc @ L_Y_inv.T S = X.T @ Y @@ -65,18 +75,74 @@ def _calculate_weights( return w, c -def _pls_nipals( - X, - Y, - n_components, - G_ww, - G_xw, - G_cc, - G_yc, - L_X_inv, - L_Y_inv, - deflation="reg", -): +def _pls_nipals( # noqa WPS320 (multi-line function annotation) + X: NDArrayFloat, + Y: NDArrayFloat, + n_components: int, + G_ww: NDArrayFloat, + G_xw: NDArrayFloat, + G_cc: NDArrayFloat, + G_yc: NDArrayFloat, + L_X_inv: NDArrayFloat, + L_Y_inv: NDArrayFloat, + deflation: str = "reg", +) -> Tuple[ + NDArrayFloat, + NDArrayFloat, + NDArrayFloat, + NDArrayFloat, + NDArrayFloat, + NDArrayFloat, +]: + """ + Perform the NIPALS algorithm for PLS. + + Parameters: + - X: (n_samples, n_features) + The X block data matrix. + - Y: (n_samples, n_targets) + The Y block data matrix. + - n_components: number of components to extract. + + - G_ww: (n_features, n_features) + The inner product matrix for the X block weights + (The discretization weights matrix in the case of FDataGrid). + - G_xw: (n_features, n_features) + The inner product matrix for the X block data and weights + (The discretization weights matrix in the case of FDataGrid). + + - G_cc: (n_targets, n_targets) + The inner product matrix for the Y block weights + (The discretization weights matrix in the case of FDataGrid). + - G_yc: (n_targets, n_targets) + The inner product matrix for the Y block data and weights + (The discretization weights matrix in the case of FDataGrid). + + - L_X_inv: (n_features, n_features) + The inverse of the Cholesky decomposition: + L_X @ L_X.T = G_ww + P_x, + where P_x is a the penalty matrix for the X block. + - L_Y_inv: (n_targets, n_targets) + The inverse of the Cholesky decomposition: + L_Y @ L_Y.T = G_cc + P_y, + where P_y is a the penalty matrix for the Y block. + + - deflation: The deflation method to use. + Can be "reg" for regression or "can" for dimensionality reduction. + Returns: + - W: (n_features, n_components) + The X block weights. + - C: (n_targets, n_components) + The Y block weights. + - T: (n_samples, n_components) + The X block scores. + - U: (n_samples, n_components) + The Y block scores. + - P: (n_features, n_components) + The X block loadings. + - Q: (n_targets, n_components) + The Y block loadings. + """ X = X.copy() Y = Y.copy() X = X - np.mean(X, axis=0) @@ -85,6 +151,7 @@ def _pls_nipals( if len(Y.shape) == 1: Y = Y[:, np.newaxis] + # Store the matrices as list of columns W, C = [], [] T, U = [], [] P, Q = [], [] @@ -119,21 +186,37 @@ def _pls_nipals( P.append(p) Q.append(q) - W = np.array(W).T - C = np.array(C).T - T = np.array(T).T - U = np.array(U).T - P = np.array(P).T - Q = np.array(Q).T - - # Ignore flake8 waring of too long output tuple - return W, C, T, U, P, Q # noqa: WPS227 + # Convert each least of columns to a matrix + return ( # noqa: WPS227 (too long output tuple) + np.array(W).T, + np.array(C).T, + np.array(T).T, + np.array(U).T, + np.array(P).T, + np.array(Q).T, + ) InputType = Union[FData, NDArrayFloat] class FPLSBlock: + """ + Class to store the data of a block of a FPLS model. + + Attributes: + - n_components: number of components to extract. + - label: label of the block (X or Y). + - G_data_weights: (n_samples, n_samples) + The inner product matrix for the data and weights + (The discretization weights matrix in the case of FDataGrid). + - G_weights: (n_samples, n_samples) + The inner product matrix for the weights + (The discretization weights matrix in the case of FDataGrid). + - data_matrix: (n_samples, n_features) + The data matrix of the block. + """ + def __init__( self, n_components: int, @@ -159,19 +242,28 @@ def set_values( self, rotations: NDArrayFloat, loadings: NDArrayFloat, - ): + ) -> None: + """Set the results of NIPALS.""" self.rotations = rotations self.loadings = loadings @abstractmethod def make_component(self) -> NDArrayFloat | FData: + """ + Return the component of the block. + + This method must be called once set_values has been called. + It returns the component of the block, which can be a matrix + or a FData object. + """ pass @abstractmethod def transform( self, - X: NDArrayFloat | FData, + data: NDArrayFloat | FData, ) -> NDArrayFloat: + """Transform from the data space to the component space.""" pass @abstractmethod @@ -179,18 +271,19 @@ def inverse_transform( self, components: NDArrayFloat, ) -> NDArrayFloat | FData: - pass - - @abstractmethod - def y_predict( - self, - X: NDArrayFloat | FData, - coef: NDArrayFloat | FData, - ) -> NDArrayFloat | FData: + """Transform from the component space to the data space.""" pass class FPLSBlockDataMultivariate(FPLSBlock): + """ + FPLS block model specialized for multivariate data. + + Attributes: + - data: (n_samples, n_features) + The data matrix of the block. + """ + def __init__( self, data: NDArrayFloat, @@ -207,37 +300,43 @@ def __init__( data_matrix=data, ) - def make_component(self): + def make_component(self) -> NDArrayFloat: # noqa: D102 return self.rotations - def transform(self, data: NDArrayFloat | FData) -> NDArrayFloat: + def transform( # noqa: D102 + self, + data: NDArrayFloat | FData, + ) -> NDArrayFloat: if not isinstance(data, np.ndarray): raise TypeError( f"Data in block {self.label} must be a numpy array", ) return data @ self.rotations - def inverse_transform(self, components: NDArrayFloat) -> NDArrayFloat: + def inverse_transform( # noqa: D102 + self, + components: NDArrayFloat, + ) -> NDArrayFloat: return components @ self.loadings.T - def y_predict( - self, X: NDArrayFloat | FData, coef: NDArrayFloat | FData, - ) -> NDArrayFloat | FData: - if not isinstance(X, np.ndarray): - raise TypeError( - f"X data in block {self.label} must be a numpy array", - ) - return X @ coef - class FPLSBlockDataGrid(FPLSBlock): + """ + FPLS block model specialized for multivariate data. + + Attributes: + - data: FDataGrid object. + - integration_weights: (n_samples, n_samples) + The discretization weights matrix. + """ + def __init__( self, data: FDataGrid, n_components: int, label: str, integration_weights: NDArrayFloat | None, - regularization: L2Regularization | None, + regularization: L2Regularization[FDataGrid] | None, ) -> None: self.data = data data_mat = data.data_matrix[..., 0] @@ -267,45 +366,50 @@ def __init__( data_matrix=data_mat, ) - def make_component(self) -> FDataGrid: + def make_component(self) -> FDataGrid: # noqa: D102 return self.data.copy( data_matrix=np.transpose(self.rotations), sample_names=(None,) * self.n_components, dataset_name=f"FPLS {self.label} components", ) - def transform(self, data: FDataGrid | NDArrayFloat) -> NDArrayFloat: + def transform( # noqa: D102 + self, + data: FDataGrid | NDArrayFloat, + ) -> NDArrayFloat: if not isinstance(data, FDataGrid): raise TypeError( f"Data in block {self.label} must be an FDataGrid", ) return data.data_matrix[..., 0] @ self.G_data_weights @ self.rotations - def inverse_transform(self, components: NDArrayFloat) -> NDArrayFloat: + def inverse_transform( # noqa: D102 + self, + components: NDArrayFloat, + ) -> NDArrayFloat: return self.data.copy( data_matrix=components @ self.loadings.T, sample_names=(None,) * components.shape[0], dataset_name=f"FPLS {self.label} components", ) - def y_predict( - self, X: NDArrayFloat | FData, coef: NDArrayFloat, - ) -> NDArrayFloat: - if not isinstance(X, FDataGrid): - raise TypeError( - f"X data in block {self.label} must be an FDataGrid", - ) - return X.data_matrix[..., 0] @ self.G_data_weights @ coef - class FPLSBlockDataBasis(FPLSBlock): + """ + FPLS block model specialized for basis expansion data. + + Attributes: + - data: FDataBasis object. + - weights_basis: Basis object for the weights + """ + def __init__( self, data: FDataBasis, n_components: int, label: str, weights_basis: Basis | None, - regularization: L2Regularization | None, + regularization: L2Regularization[FDataBasis] | None, ) -> None: self.data = data @@ -333,62 +437,95 @@ def __init__( data_matrix=data.coefficients, ) - - def make_component(self): + def make_component(self) -> FDataBasis: # noqa: D102 return self.data.copy( coefficients=np.transpose(self.rotations), sample_names=(None,) * self.n_components, dataset_name=f"FPLS {self.label} components", ) - def transform(self, data: NDArrayFloat | FData) -> NDArrayFloat: + def transform( # noqa: D102 + self, + data: NDArrayFloat | FData, + ) -> NDArrayFloat: if not isinstance(data, FDataBasis): raise TypeError( f"Data in block {self.label} must be an FDataBasis", ) return data.coefficients @ self.G_weights @ self.rotations - def inverse_transform(self, components: NDArrayFloat) -> NDArrayFloat: + def inverse_transform( # noqa: D102 + self, + components: NDArrayFloat, + ) -> NDArrayFloat: return self.data.copy( coefficients=components @ self.loadings.T, sample_names=(None,) * components.shape[0], dataset_name=f"FPLS {self.label} reconstructions", ) - def y_predict( - self, X: NDArrayFloat | FData, coef: NDArrayFloat - ) -> NDArrayFloat: - if not isinstance(X, FDataBasis): - raise TypeError( - f"X data in block {self.label} must be an FDataBasis", - ) - return X.coefficients @ self.G_data_weights @ coef - +@multimethod.multidispatch def block_factory( - data, + data: Union[FData, NDArrayFloat], n_components: int, label: str, - integration_weights: NDArrayFloat | None, - regularization: L2Regularization | None, - weight_basis: Basis | None, + integration_weights: Optional[np.ndarray], + regularization: Optional[L2Regularization[Any]], + weight_basis: Optional[Basis], ) -> FPLSBlock: - if isinstance(data, FDataGrid): - return FPLSBlockDataGrid( - data=data, - n_components=n_components, - label=label, - integration_weights=integration_weights, - regularization=regularization, - ) - elif isinstance(data, FDataBasis): - return FPLSBlockDataBasis( - data=data, - n_components=n_components, - label=label, - weights_basis=weight_basis, - regularization=regularization, - ) + """Create a PLSBlock depending on the data type.""" + return NotImplemented + + +@block_factory.register +def block_factory_data_grid( + data: FDataGrid, + n_components: int, + label: str, + integration_weights: Optional[np.ndarray], + regularization: Optional[L2Regularization[Any]], + weight_basis: None, +) -> FPLSBlock: + """Create a PLSBlock for FDataGrid objects.""" + return FPLSBlockDataGrid( + data=data, + n_components=n_components, + label=label, + integration_weights=integration_weights, + regularization=regularization, + ) + + +@block_factory.register +def block_factory_data_basis( + data: FDataBasis, + n_components: int, + label: str, + integration_weights: None, + regularization: Optional[L2Regularization[Any]], + weight_basis: Optional[Basis], +) -> FPLSBlock: + """Create a PLSBlock for FDataBasis objects.""" + return FPLSBlockDataBasis( + data=data, + n_components=n_components, + label=label, + weights_basis=weight_basis, + regularization=regularization, + ) + + +@block_factory.register +def block_factory_multivariate( + data: np.ndarray, + n_components: int, + label: str, + integration_weights: None, + regularization: None, + weight_basis: None, +) -> FPLSBlock: + """Create a PLSBlock for multivariate data.""" return FPLSBlockDataMultivariate( data=data, n_components=n_components, @@ -414,8 +551,8 @@ def __init__( scale: bool = False, integration_weights_X: NDArrayFloat | None = None, integration_weights_Y: NDArrayFloat | None = None, - regularization_X: L2Regularization | None = None, - regularization_Y: L2Regularization | None = None, + regularization_X: L2Regularization[InputType] | None = None, + regularization_Y: L2Regularization[InputType] | None = None, weight_basis_X: Basis | None = None, weight_basis_Y: Basis | None = None, deflation_mode: str = "can", From a1773e27a632360daa95cfe3e8ecd2e8828398e2 Mon Sep 17 00:00:00 2001 From: David del Val Date: Thu, 22 Jun 2023 21:19:05 +0200 Subject: [PATCH 12/46] Use mean method of FData --- skfda/preprocessing/dim_reduction/_fpls.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index 1b39c03b2..536f5dd42 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -663,12 +663,9 @@ def fit( if isinstance(y, np.ndarray) and len(y.shape) == 1: y = y[:, np.newaxis] - calculate_mean = ( - lambda x: x.mean() if isinstance(x, FData) else x.mean(axis=0) - ) # Center and scale data - self._x_mean = calculate_mean(X) - self._y_mean = calculate_mean(y) + self._x_mean = X.mean(axis=0) + self._y_mean = y.mean(axis=0) self._x_std = 1 self._y_std = 1 From dea6013784940c01301d2b5350ac2e6ec5e2c2f5 Mon Sep 17 00:00:00 2001 From: David del Val Date: Fri, 23 Jun 2023 08:45:17 +0200 Subject: [PATCH 13/46] Redesign fpls blocks with generics --- skfda/preprocessing/dim_reduction/_fpls.py | 438 ++++++++------------- 1 file changed, 163 insertions(+), 275 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index 536f5dd42..ffe97fac7 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -1,13 +1,16 @@ from __future__ import annotations from abc import abstractmethod -from typing import Optional, Tuple, Union, Any +from typing import Optional, Tuple, Union, Any, Generic, TypeVar, cast import multimethod import numpy as np import scipy from sklearn.utils.validation import check_is_fitted +from skfda.representation import FData +from skfda.typing._numpy import NDArrayFloat + from ..._utils._sklearn_adapter import BaseEstimator, InductiveTransformerMixin from ...misc.regularization import L2Regularization, compute_penalty_matrix from ...representation import FData, FDataGrid @@ -75,7 +78,8 @@ def _calculate_weights( return w, c -def _pls_nipals( # noqa WPS320 (multi-line function annotation) +# ignore flake8 multi-line function annotation +def _pls_nipals( # noqa WPS320 X: NDArrayFloat, Y: NDArrayFloat, n_components: int, @@ -197,10 +201,13 @@ def _pls_nipals( # noqa WPS320 (multi-line function annotation) ) -InputType = Union[FData, NDArrayFloat] +BlockType = TypeVar( + "BlockType", + bound=Union[FDataGrid, FDataBasis, NDArrayFloat], +) -class FPLSBlock: +class _FPLSBlock(Generic[BlockType]): """ Class to store the data of a block of a FPLS model. @@ -219,135 +226,68 @@ class FPLSBlock: def __init__( self, + data: BlockType, n_components: int, label: str, - G_data_weights: NDArrayFloat, - G_weights: NDArrayFloat, - data_matrix: NDArrayFloat, - regularization_matrix: NDArrayFloat | None = None, + integration_weights: NDArrayFloat | None, + regularization: L2Regularization[Any] | None, + weights_basis: Basis | None, ) -> None: self.n_components = n_components self.label = label - self.G_data_weights = G_data_weights - self.G_weights = G_weights - self.data_matrix = data_matrix - - if regularization_matrix is None: - regularization_matrix = np.zeros( - (data_matrix.shape[1], data_matrix.shape[1]), - ) - self.regularization_matrix = regularization_matrix + self.data = data + self._initialize_data( + data=data, + integration_weights=integration_weights, + regularization=regularization, + weights_basis=weights_basis, + ) - def set_values( + @multimethod.multidispatch + def _initialize_data( self, - rotations: NDArrayFloat, - loadings: NDArrayFloat, + data: Union[FData, NDArrayFloat], + integration_weights: Optional[NDArrayFloat], + regularization: Optional[L2Regularization[Any]], + weights_basis: Optional[Basis], ) -> None: - """Set the results of NIPALS.""" - self.rotations = rotations - self.loadings = loadings - - @abstractmethod - def make_component(self) -> NDArrayFloat | FData: - """ - Return the component of the block. - - This method must be called once set_values has been called. - It returns the component of the block, which can be a matrix - or a FData object. - """ - pass - - @abstractmethod - def transform( - self, - data: NDArrayFloat | FData, - ) -> NDArrayFloat: - """Transform from the data space to the component space.""" - pass - - @abstractmethod - def inverse_transform( - self, - components: NDArrayFloat, - ) -> NDArrayFloat | FData: - """Transform from the component space to the data space.""" - pass - - -class FPLSBlockDataMultivariate(FPLSBlock): - """ - FPLS block model specialized for multivariate data. - - Attributes: - - data: (n_samples, n_features) - The data matrix of the block. - """ + """Initialize the data of the block.""" + raise NotImplementedError( + f"Data type {type(data)} not supported", + ) - def __init__( + @_initialize_data.register + def _initialize_data_multivariate( self, - data: NDArrayFloat, - n_components: int, - label: str, + data: np.ndarray, + integration_weights: None, + regularization: None, + weights_basis: None, ) -> None: - self.data = data - - super().__init__( - n_components=n_components, - label=label, - G_data_weights=np.identity(data.shape[1]), - G_weights=np.identity(data.shape[1]), - data_matrix=data, + """Initialize the data of the block.""" + self.G_data_weights = np.identity(data.shape[1]) + self.G_weights = np.identity(data.shape[1]) + self.data_matrix = data + self.regularization_matrix = np.zeros( + (data.shape[1], data.shape[1]), ) - def make_component(self) -> NDArrayFloat: # noqa: D102 - return self.rotations - - def transform( # noqa: D102 - self, - data: NDArrayFloat | FData, - ) -> NDArrayFloat: - if not isinstance(data, np.ndarray): - raise TypeError( - f"Data in block {self.label} must be a numpy array", - ) - return data @ self.rotations - - def inverse_transform( # noqa: D102 - self, - components: NDArrayFloat, - ) -> NDArrayFloat: - return components @ self.loadings.T - - -class FPLSBlockDataGrid(FPLSBlock): - """ - FPLS block model specialized for multivariate data. - - Attributes: - - data: FDataGrid object. - - integration_weights: (n_samples, n_samples) - The discretization weights matrix. - """ - - def __init__( + @_initialize_data.register + def _initialize_data_fdatagrid( self, data: FDataGrid, - n_components: int, - label: str, - integration_weights: NDArrayFloat | None, - regularization: L2Regularization[FDataGrid] | None, + integration_weights: Optional[np.ndarray], + regularization: Optional[L2Regularization[Any]], + weights_basis: None, ) -> None: - self.data = data data_mat = data.data_matrix[..., 0] - if integration_weights is None: identity = np.identity(data_mat.shape[1]) integration_weights = scipy.integrate.simps( identity, data.grid_points[0], ) - self.integration_weights = integration_weights + self.integration_weights = np.diag(integration_weights) regularization_matrix = None if regularization is not None: @@ -356,63 +296,25 @@ def __init__( regularization_parameter=1, regularization=regularization, ) - - super().__init__( - n_components=n_components, - label=label, - G_data_weights=np.diag(self.integration_weights), - G_weights=np.diag(self.integration_weights), - regularization_matrix=regularization_matrix, - data_matrix=data_mat, - ) - - def make_component(self) -> FDataGrid: # noqa: D102 - return self.data.copy( - data_matrix=np.transpose(self.rotations), - sample_names=(None,) * self.n_components, - dataset_name=f"FPLS {self.label} components", - ) - - def transform( # noqa: D102 - self, - data: FDataGrid | NDArrayFloat, - ) -> NDArrayFloat: - if not isinstance(data, FDataGrid): - raise TypeError( - f"Data in block {self.label} must be an FDataGrid", + if regularization_matrix is None: + regularization_matrix = np.zeros( + (data_mat.shape[1], data_mat.shape[1]), ) - return data.data_matrix[..., 0] @ self.G_data_weights @ self.rotations - - def inverse_transform( # noqa: D102 - self, - components: NDArrayFloat, - ) -> NDArrayFloat: - return self.data.copy( - data_matrix=components @ self.loadings.T, - sample_names=(None,) * components.shape[0], - dataset_name=f"FPLS {self.label} components", - ) + self.regularization_matrix = regularization_matrix + self.G_data_weights = self.integration_weights + self.G_weights = self.integration_weights + self.data_matrix = data_mat -class FPLSBlockDataBasis(FPLSBlock): - """ - FPLS block model specialized for basis expansion data. - - Attributes: - - data: FDataBasis object. - - weights_basis: Basis object for the weights - """ - - def __init__( + @_initialize_data.register + def _initialize_data_fdatabasis( self, data: FDataBasis, - n_components: int, - label: str, - weights_basis: Basis | None, - regularization: L2Regularization[FDataBasis] | None, + integration_weights: None, + regularization: Optional[L2Regularization[Any]], + weights_basis: Optional[Basis], ) -> None: - self.data = data - + data_mat = data.coefficients if weights_basis is None: self.weights_basis = data.basis else: @@ -425,117 +327,105 @@ def __init__( regularization_parameter=1, regularization=regularization, ) + if regularization_matrix is None: + regularization_matrix = np.zeros( + (data_mat.shape[1], data_mat.shape[1]), + ) - super().__init__( - n_components=n_components, - label=label, - G_data_weights=self.weights_basis.gram_matrix(), - G_weights=data.basis.inner_product_matrix( - self.weights_basis, - ), - regularization_matrix=regularization_matrix, - data_matrix=data.coefficients, + self.regularization_matrix = regularization_matrix + self.G_weights = self.weights_basis.gram_matrix() + self.G_data_weights = data.basis.inner_product_matrix( + self.weights_basis, ) + self.data_matrix = data_mat + + def calculate_components( + self, + rotations: NDArrayFloat, + loadings: NDArrayFloat, + ) -> BlockType: + """Set the results of NIPALS.""" + self.rotations = rotations + self.loadings = loadings + self.components = self._calculate_component() + return self.components + + def _calculate_component(self) -> BlockType: + if isinstance(self.data, FDataGrid): + return self.data.copy( + data_matrix=np.transpose(self.rotations), + sample_names=(None,) * self.n_components, + dataset_name=f"FPLS {self.label} components", + ) + elif isinstance(self.data, FDataBasis): + return self.data.copy( + coefficients=np.transpose(self.rotations), + sample_names=(None,) * self.n_components, + dataset_name=f"FPLS {self.label} components", + ) + elif isinstance(self.data, np.ndarray): + return cast(BlockType, self.rotations) - def make_component(self) -> FDataBasis: # noqa: D102 - return self.data.copy( - coefficients=np.transpose(self.rotations), - sample_names=(None,) * self.n_components, - dataset_name=f"FPLS {self.label} components", + raise NotImplementedError( + f"Data type {type(self.data)} not supported", ) - def transform( # noqa: D102 + def transform( self, - data: NDArrayFloat | FData, + data: BlockType, ) -> NDArrayFloat: - if not isinstance(data, FDataBasis): - raise TypeError( - f"Data in block {self.label} must be an FDataBasis", + """Transform from the data space to the component space.""" + if isinstance(data, FDataGrid): + return ( + data.data_matrix[..., 0] @ self.G_data_weights @ self.rotations ) - return data.coefficients @ self.G_weights @ self.rotations + elif isinstance(data, FDataBasis): + return data.coefficients @ self.G_data_weights @ self.rotations + elif isinstance(data, np.ndarray): + return data @ self.rotations - def inverse_transform( # noqa: D102 - self, - components: NDArrayFloat, - ) -> NDArrayFloat: - return self.data.copy( - coefficients=components @ self.loadings.T, - sample_names=(None,) * components.shape[0], - dataset_name=f"FPLS {self.label} reconstructions", + raise NotImplementedError( + f"Data type {type(data)} not supported", ) + def inverse_transform( + self, + components: NDArrayFloat, + ) -> BlockType: + """Transform from the component space to the data space.""" + if isinstance(self.data, FDataGrid): + return self.data.copy( + data_matrix=components @ self.loadings.T, + sample_names=(None,) * components.shape[0], + dataset_name=f"FPLS {self.label} components", + ) + elif isinstance(self.data, FDataBasis): + return self.data.copy( + coefficients=components @ self.loadings.T, + sample_names=(None,) * components.shape[0], + dataset_name=f"FPLS {self.label} components", + ) + elif isinstance(self.data, np.ndarray): + return cast(BlockType, components @ self.loadings.T) -@multimethod.multidispatch -def block_factory( - data: Union[FData, NDArrayFloat], - n_components: int, - label: str, - integration_weights: Optional[np.ndarray], - regularization: Optional[L2Regularization[Any]], - weight_basis: Optional[Basis], -) -> FPLSBlock: - """Create a PLSBlock depending on the data type.""" - return NotImplemented - - -@block_factory.register -def block_factory_data_grid( - data: FDataGrid, - n_components: int, - label: str, - integration_weights: Optional[np.ndarray], - regularization: Optional[L2Regularization[Any]], - weight_basis: None, -) -> FPLSBlock: - """Create a PLSBlock for FDataGrid objects.""" - return FPLSBlockDataGrid( - data=data, - n_components=n_components, - label=label, - integration_weights=integration_weights, - regularization=regularization, - ) - - -@block_factory.register -def block_factory_data_basis( - data: FDataBasis, - n_components: int, - label: str, - integration_weights: None, - regularization: Optional[L2Regularization[Any]], - weight_basis: Optional[Basis], -) -> FPLSBlock: - """Create a PLSBlock for FDataBasis objects.""" - return FPLSBlockDataBasis( - data=data, - n_components=n_components, - label=label, - weights_basis=weight_basis, - regularization=regularization, - ) + raise NotImplementedError( + f"Data type {type(self.data)} not supported", + ) -@block_factory.register -def block_factory_multivariate( - data: np.ndarray, - n_components: int, - label: str, - integration_weights: None, - regularization: None, - weight_basis: None, -) -> FPLSBlock: - """Create a PLSBlock for multivariate data.""" - return FPLSBlockDataMultivariate( - data=data, - n_components=n_components, - label=label, - ) +InputTypeX = TypeVar( + "InputTypeX", + bound=Union[FDataGrid, FDataBasis, NDArrayFloat], +) +InputTypeY = TypeVar( + "InputTypeY", + bound=Union[FDataGrid, FDataBasis, NDArrayFloat], +) class FPLS( BaseEstimator, - InductiveTransformerMixin[InputType, InputType, Optional[InputType]], + Generic[InputTypeX, InputTypeY], ): """ Functional Partial Least Squares Regression. @@ -551,8 +441,8 @@ def __init__( scale: bool = False, integration_weights_X: NDArrayFloat | None = None, integration_weights_Y: NDArrayFloat | None = None, - regularization_X: L2Regularization[InputType] | None = None, - regularization_Y: L2Regularization[InputType] | None = None, + regularization_X: L2Regularization[InputTypeX] | None = None, + regularization_Y: L2Regularization[InputTypeY] | None = None, weight_basis_X: Basis | None = None, weight_basis_Y: Basis | None = None, deflation_mode: str = "can", @@ -569,17 +459,17 @@ def __init__( def _fit_data( self, - X: InputType, - Y: InputType, + X: InputTypeX, + Y: InputTypeY, ) -> None: """Fit the model using X and Y as already centered data.""" - self._x_block = block_factory( + self._x_block = _FPLSBlock( data=X, n_components=self.n_components, label="X", integration_weights=self.integration_weights_X, regularization=self.regularization_X, - weight_basis=self.weight_basis_X, + weights_basis=self.weight_basis_X, ) penalization_matrix = ( @@ -587,13 +477,13 @@ def _fit_data( ) L_X_inv = np.linalg.inv(np.linalg.cholesky(penalization_matrix)) - self._y_block = block_factory( + self._y_block = _FPLSBlock( data=Y, n_components=self.n_components, label="X", integration_weights=self.integration_weights_Y, regularization=self.regularization_Y, - weight_basis=self.weight_basis_Y, + weights_basis=self.weight_basis_Y, ) penalization_matrix = ( @@ -621,6 +511,7 @@ def _fit_data( self.y_scores_ = U self.x_loadings_ = P self.y_loadings_ = Q + self.x_rotations_ = W @ np.linalg.pinv( P.T @ self._x_block.G_data_weights @ W, ) @@ -629,25 +520,22 @@ def _fit_data( Q.T @ self._y_block.G_data_weights @ C, ) - self._x_block.set_values( + self.x_components_ = self._x_block.calculate_components( rotations=self.x_rotations_, - loadings=self.x_loadings_, + loadings=P, ) - self._y_block.set_values( + self.y_components_ = self._y_block.calculate_components( rotations=self.y_rotations_, - loadings=self.y_loadings_, + loadings=Q, ) - self.components_x_ = self._x_block.make_component() - self.components_y_ = self._y_block.make_component() - self.coef_ = self.x_rotations_ @ Q.T def fit( self, - X: InputType, - y: InputType, - ) -> FPLS: + X: InputTypeX, + y: InputTypeY, + ) -> FPLS[InputTypeX, InputTypeY]: """ Fit the model using the data for both blocks. @@ -677,8 +565,8 @@ def fit( def transform( self, - X: InputType, - y: InputType | None = None, + X: InputTypeX, + y: InputTypeY | None = None, ) -> NDArrayFloat | Tuple[NDArrayFloat, NDArrayFloat]: """ Apply the dimension reduction learned on the train data. @@ -709,7 +597,7 @@ def inverse_transform( self, X: NDArrayFloat, Y: NDArrayFloat | None = None, - ) -> InputType | Tuple[InputType, InputType]: + ) -> InputTypeX | Tuple[InputTypeX, InputTypeY]: """ Transform data back to its original space. From 0fe647f4fa974822c15f7bb2beb6a4d8ef701d8a Mon Sep 17 00:00:00 2001 From: David del Val Date: Fri, 23 Jun 2023 19:18:08 +0200 Subject: [PATCH 14/46] Move centering operations into FPLSBlock --- skfda/preprocessing/dim_reduction/_fpls.py | 154 +++++++++++---------- 1 file changed, 78 insertions(+), 76 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index ffe97fac7..373f7e857 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -1,17 +1,13 @@ from __future__ import annotations -from abc import abstractmethod -from typing import Optional, Tuple, Union, Any, Generic, TypeVar, cast +from typing import Any, Generic, Optional, Tuple, TypeVar, Union, cast import multimethod import numpy as np import scipy from sklearn.utils.validation import check_is_fitted -from skfda.representation import FData -from skfda.typing._numpy import NDArrayFloat - -from ..._utils._sklearn_adapter import BaseEstimator, InductiveTransformerMixin +from ..._utils._sklearn_adapter import BaseEstimator from ...misc.regularization import L2Regularization, compute_penalty_matrix from ...representation import FData, FDataGrid from ...representation.basis import Basis, FDataBasis, _GridBasis @@ -79,7 +75,7 @@ def _calculate_weights( # ignore flake8 multi-line function annotation -def _pls_nipals( # noqa WPS320 +def _pls_nipals( # noqa: WPS320 X: NDArrayFloat, Y: NDArrayFloat, n_components: int, @@ -207,7 +203,8 @@ def _pls_nipals( # noqa WPS320 ) -class _FPLSBlock(Generic[BlockType]): +# Ignore too many public instance attributes +class _FPLSBlock(Generic[BlockType]): # noqa: WPS230 """ Class to store the data of a block of a FPLS model. @@ -224,6 +221,8 @@ class _FPLSBlock(Generic[BlockType]): The data matrix of the block. """ + mean: BlockType + def __init__( self, data: BlockType, @@ -235,7 +234,6 @@ def __init__( ) -> None: self.n_components = n_components self.label = label - self.data = data self._initialize_data( data=data, integration_weights=integration_weights, @@ -259,7 +257,7 @@ def _initialize_data( @_initialize_data.register def _initialize_data_multivariate( self, - data: np.ndarray, + data: np.ndarray, # type: ignore integration_weights: None, regularization: None, weights_basis: None, @@ -267,7 +265,13 @@ def _initialize_data_multivariate( """Initialize the data of the block.""" self.G_data_weights = np.identity(data.shape[1]) self.G_weights = np.identity(data.shape[1]) - self.data_matrix = data + + self.mean = np.mean(data, axis=0) + self.data_matrix = data - self.mean + if len(self.data_matrix.shape) == 1: + self.data_matrix = self.data_matrix[:, np.newaxis] + self.data = data - self.mean + self.regularization_matrix = np.zeros( (data.shape[1], data.shape[1]), ) @@ -276,10 +280,14 @@ def _initialize_data_multivariate( def _initialize_data_fdatagrid( self, data: FDataGrid, - integration_weights: Optional[np.ndarray], + integration_weights: Optional[np.ndarray], # type: ignore regularization: Optional[L2Regularization[Any]], weights_basis: None, ) -> None: + self.mean = data.mean() + data = data - self.mean + self.data = data + data_mat = data.data_matrix[..., 0] if integration_weights is None: identity = np.identity(data_mat.shape[1]) @@ -314,6 +322,10 @@ def _initialize_data_fdatabasis( regularization: Optional[L2Regularization[Any]], weights_basis: Optional[Basis], ) -> None: + self.mean = data.mean() + data = data - self.mean + self.data = data + data_mat = data.coefficients if weights_basis is None: self.weights_basis = data.basis @@ -376,13 +388,20 @@ def transform( ) -> NDArrayFloat: """Transform from the data space to the component space.""" if isinstance(data, FDataGrid): + data_grid = data - cast(FDataGrid, self.mean) return ( - data.data_matrix[..., 0] @ self.G_data_weights @ self.rotations + data_grid.data_matrix[..., 0] + @ self.G_data_weights + @ self.rotations ) elif isinstance(data, FDataBasis): - return data.coefficients @ self.G_data_weights @ self.rotations + data_basis = data - cast(FDataBasis, self.mean) + return ( + data_basis.coefficients @ self.G_data_weights @ self.rotations + ) elif isinstance(data, np.ndarray): - return data @ self.rotations + data_array = data - cast(NDArrayFloat, self.mean) + return data_array @ self.rotations raise NotImplementedError( f"Data type {type(data)} not supported", @@ -394,24 +413,38 @@ def inverse_transform( ) -> BlockType: """Transform from the component space to the data space.""" if isinstance(self.data, FDataGrid): - return self.data.copy( + reconstructed_grid = self.data.copy( data_matrix=components @ self.loadings.T, sample_names=(None,) * components.shape[0], dataset_name=f"FPLS {self.label} components", ) + return reconstructed_grid + cast(FDataGrid, self.mean) + elif isinstance(self.data, FDataBasis): - return self.data.copy( + reconstructed_basis = self.data.copy( coefficients=components @ self.loadings.T, sample_names=(None,) * components.shape[0], dataset_name=f"FPLS {self.label} components", ) + return reconstructed_basis + cast(FDataBasis, self.mean) + elif isinstance(self.data, np.ndarray): - return cast(BlockType, components @ self.loadings.T) + reconstructed = components @ self.loadings.T + reconstructed += self.mean + return cast(BlockType, reconstructed) raise NotImplementedError( f"Data type {type(self.data)} not supported", ) + def get_penalty_matrix(self) -> NDArrayFloat: + """Return the penalty matrix.""" + return self.G_weights + self.regularization_matrix + + def get_cholesky_inv_penalty_matrix(self) -> NDArrayFloat: + """Return the Cholesky decomposition of the penalty matrix.""" + return np.linalg.inv(np.linalg.cholesky(self.get_penalty_matrix())) + InputTypeX = TypeVar( "InputTypeX", @@ -423,7 +456,8 @@ def inverse_transform( ) -class FPLS( +# Ignore too many public instance attributes +class FPLS( # noqa: WPS230 BaseEstimator, Generic[InputTypeX, InputTypeY], ): @@ -457,12 +491,7 @@ def __init__( self.weight_basis_Y = weight_basis_Y self.deflation_mode = deflation_mode - def _fit_data( - self, - X: InputTypeX, - Y: InputTypeY, - ) -> None: - """Fit the model using X and Y as already centered data.""" + def _initialize_blocks(self, X: InputTypeX, Y: InputTypeY) -> None: self._x_block = _FPLSBlock( data=X, n_components=self.n_components, @@ -471,12 +500,6 @@ def _fit_data( regularization=self.regularization_X, weights_basis=self.weight_basis_X, ) - - penalization_matrix = ( - self._x_block.G_weights + self._x_block.regularization_matrix - ) - L_X_inv = np.linalg.inv(np.linalg.cholesky(penalization_matrix)) - self._y_block = _FPLSBlock( data=Y, n_components=self.n_components, @@ -486,10 +509,24 @@ def _fit_data( weights_basis=self.weight_basis_Y, ) - penalization_matrix = ( - self._y_block.G_weights + self._y_block.regularization_matrix - ) - L_Y_inv = np.linalg.inv(np.linalg.cholesky(penalization_matrix)) + def fit( + self, + X: InputTypeX, + y: InputTypeY, + ) -> FPLS[InputTypeX, InputTypeY]: + """ + Fit the model using the data for both blocks. + + Any of the parameters can be a FDataGrid, FDataBasis or numpy array. + + Args: + X: Data of the X block + y: Data of the Y block + + Returns: + self + """ + self._initialize_blocks(X, y) # Supress flake8 warning about too many values to unpack W, C, T, U, P, Q = _pls_nipals( # noqa: WPS236 @@ -500,8 +537,8 @@ def _fit_data( G_xw=self._x_block.G_data_weights, G_cc=self._y_block.G_weights, G_yc=self._y_block.G_data_weights, - L_X_inv=L_X_inv, - L_Y_inv=L_Y_inv, + L_X_inv=self._x_block.get_cholesky_inv_penalty_matrix(), + L_Y_inv=self._y_block.get_cholesky_inv_penalty_matrix(), deflation=self.deflation_mode, ) @@ -530,37 +567,6 @@ def _fit_data( ) self.coef_ = self.x_rotations_ @ Q.T - - def fit( - self, - X: InputTypeX, - y: InputTypeY, - ) -> FPLS[InputTypeX, InputTypeY]: - """ - Fit the model using the data for both blocks. - - Any of the parameters can be a FDataGrid, FDataBasis or numpy array. - - Args: - X: Data of the X block - y: Data of the Y block - - Returns: - self - """ - if isinstance(y, np.ndarray) and len(y.shape) == 1: - y = y[:, np.newaxis] - - # Center and scale data - self._x_mean = X.mean(axis=0) - self._y_mean = y.mean(axis=0) - self._x_std = 1 - self._y_std = 1 - - X = (X - self._x_mean) / self._x_std - y = (y - self._y_mean) / self._y_std - - self._fit_data(X, y) return self def transform( @@ -584,11 +590,9 @@ def transform( """ check_is_fitted(self) - X = (X - self._x_mean) / self._x_std x_scores = self._x_block.transform(X) if y is not None: - y = (y - self._y_mean) / self._y_std y_scores = self._y_block.transform(y) return x_scores, y_scores return x_scores @@ -614,11 +618,9 @@ def inverse_transform( """ check_is_fitted(self) - x_recon = self._x_block.inverse_transform(X) - x_recon = x_recon * self._x_std + self._x_mean + x_reconstructed = self._x_block.inverse_transform(X) if Y is not None: - y_recon = self._y_block.inverse_transform(Y) - y_recon = y_recon * self._y_std + self._y_mean - return x_recon, y_recon - return x_recon + y_reconstructed = self._y_block.inverse_transform(Y) + return x_reconstructed, y_reconstructed + return x_reconstructed From 24303765e0c57a43eb5c7900ccdb7d4207e174d8 Mon Sep 17 00:00:00 2001 From: David del Val Date: Fri, 23 Jun 2023 21:06:08 +0200 Subject: [PATCH 15/46] Simplify FPLS regression implementation --- skfda/ml/regression/_fpls_regression.py | 78 ++++++------------------- 1 file changed, 19 insertions(+), 59 deletions(-) diff --git a/skfda/ml/regression/_fpls_regression.py b/skfda/ml/regression/_fpls_regression.py index 2cb9cba58..1abf1ebb3 100644 --- a/skfda/ml/regression/_fpls_regression.py +++ b/skfda/ml/regression/_fpls_regression.py @@ -1,24 +1,30 @@ from __future__ import annotations -from typing import TypeVar, Union +from typing import Any, TypeVar, Union -import multimethod from sklearn.utils.validation import check_is_fitted from ..._utils._sklearn_adapter import BaseEstimator, RegressorMixin from ...misc.regularization import L2Regularization from ...preprocessing.dim_reduction import FPLS -from ...representation import FData, FDataGrid +from ...representation import FDataGrid from ...representation.basis import Basis, FDataBasis from ...typing._numpy import NDArrayFloat -FPLSRegressionSelf = TypeVar("FPLSRegressionSelf", bound="FPLSRegression") +InputType = TypeVar( + "InputType", + bound=Union[FDataGrid, FDataBasis, NDArrayFloat], +) + +OutputType = TypeVar( + "OutputType", + bound=Union[FDataGrid, FDataBasis, NDArrayFloat], +) -Input = Union[FData, NDArrayFloat] class FPLSRegression( BaseEstimator, - RegressorMixin[Input, Input], + RegressorMixin[InputType, OutputType], ): """ Regression using Functional Partial Least Squares. @@ -34,7 +40,7 @@ def __init__( scale: bool = False, integration_weights_X: NDArrayFloat | None = None, integration_weights_Y: NDArrayFloat | None = None, - regularization_X: L2Regularization | None = None, + regularization_X: L2Regularization[Any] | None = None, weight_basis_X: Basis | None = None, weight_basis_Y: Basis | None = None, ) -> None: @@ -46,48 +52,13 @@ def __init__( self.weight_basis_X = weight_basis_X self.weight_basis_Y = weight_basis_Y - def _predict_y(self, X: FData) -> FData: - if isinstance(X, FDataGrid): - return X.data_matrix[..., 0] @ self.fpls_._x_block.G_data_weights @ self.coef_ - elif isinstance(X, FDataBasis): - return X.coefficients @ self.fpls_._x_block.G_data_weights @ self.coef_ - else: - return X @ self.fpls_._x_block.G_data_weights @ self.coef_ - - def _postprocess_response(self, y_data: NDArrayFloat) -> FData: - if isinstance(self.train_y, FDataGrid): - return self.train_y.copy( - data_matrix=y_data, - sample_names=(None,) * y_data.shape[0], - ) - elif isinstance(self.train_y, FDataBasis): - return self.train_y.copy( - coefficients=y_data, - sample_names=(None,) * y_data.shape[0], - ) - else: - return y_data - def fit( self, - X: FData, - y: NDArrayFloat, - ) -> FPLSRegressionSelf: + X: InputType, + y: OutputType, + ) -> FPLSRegression[InputType, OutputType]: """Fit the model using X as training data and y as target values.""" - # Center and scale data - self.x_mean = X.mean(axis=0) - self.y_mean = y.mean(axis=0) - if self.scale: - self.x_std = X.std() - self.y_std = y.std() - else: - self.x_std = 1 - self.y_std = 1 - - X = (X - self.x_mean) / self.x_std - y = (y - self.y_mean) / self.y_std - - self.fpls_ = FPLS( + self.fpls_ = FPLS[InputType, OutputType]( n_components=self.n_components, scale=False, integration_weights_X=self.integration_weights_X, @@ -100,14 +71,9 @@ def fit( self.fpls_.fit(X, y) - self.coef_ = self.fpls_.x_rotations_ @ self.fpls_.y_loadings_.T - - self.train_X = X - self.train_y = y - return self - def predict(self, X: FData) -> NDArrayFloat: + def predict(self, X: InputType) -> OutputType: """Predict using the model. Args: @@ -118,10 +84,4 @@ def predict(self, X: FData) -> NDArrayFloat: """ check_is_fitted(self) - - X = (X - self.x_mean) / self.x_std - - y_scaled = self._predict_y(X) - y_scaled = self._postprocess_response(y_scaled) - - return y_scaled * self.y_std + self.y_mean + return self.fpls_.inverse_transform_y(self.fpls_.transform_x(X)) From 5481f63f33c6b56299141976cd47b2c583d38724 Mon Sep 17 00:00:00 2001 From: David del Val Date: Fri, 23 Jun 2023 21:24:55 +0200 Subject: [PATCH 16/46] Add individual transform methods --- skfda/preprocessing/dim_reduction/_fpls.py | 74 ++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index 373f7e857..434301e06 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -597,6 +597,44 @@ def transform( return x_scores, y_scores return x_scores + def transform_x( + self, + X: InputTypeX, + ) -> NDArrayFloat: + """ + Apply the dimension reduction learned on the train data. + + Args: + X: Data to transform. Must have the same number of features and + type as the data used to train the model. + + + Returns: + - Data transformed. + """ + check_is_fitted(self) + + return self._x_block.transform(X) + + def transform_y( + self, + Y: InputTypeY, + ) -> NDArrayFloat: + """ + Apply the dimension reduction learned on the train data. + + Args: + Y: Data to transform. Must have the same number of features and + type as the data used to train the model. + + + Returns: + - Data transformed. + """ + check_is_fitted(self) + + return self._y_block.transform(Y) + def inverse_transform( self, X: NDArrayFloat, @@ -624,3 +662,39 @@ def inverse_transform( y_reconstructed = self._y_block.inverse_transform(Y) return x_reconstructed, y_reconstructed return x_reconstructed + + def inverse_transform_x( + self, + X: NDArrayFloat, + ) -> InputTypeX: + """ + Transform X data back to its original space. + + Args: + X: Data to transform back. Must have the same number of columns + as the number of components of the model. + + Returns: + - Data reconstructed from the transformed data. + """ + check_is_fitted(self) + + return self._x_block.inverse_transform(X) + + def inverse_transform_y( + self, + Y: NDArrayFloat, + ) -> InputTypeY: + """ + Transform Y data back to its original space. + + Args: + Y: Data to transform back. Must have the same number of columns + as the number of components of the model. + + Returns: + - Data reconstructed from the transformed data. + """ + check_is_fitted(self) + + return self._y_block.inverse_transform(Y) From 238d62277e24683264176c83d898e5c643467c3a Mon Sep 17 00:00:00 2001 From: David del Val Date: Fri, 23 Jun 2023 21:25:04 +0200 Subject: [PATCH 17/46] Fix typing and style issues --- skfda/ml/regression/__init__.py | 2 +- skfda/tests/test_fpls.py | 62 +++++++------- skfda/tests/test_fpls_regression.py | 128 +++++++++------------------- 3 files changed, 73 insertions(+), 119 deletions(-) diff --git a/skfda/ml/regression/__init__.py b/skfda/ml/regression/__init__.py index 73082f296..3fb0a892c 100644 --- a/skfda/ml/regression/__init__.py +++ b/skfda/ml/regression/__init__.py @@ -20,7 +20,7 @@ if TYPE_CHECKING: from ._fpca_regression import FPCARegression - from ._fpls_regression import FPLSRegression + from ._fpls_regression import FPLSRegression as FPLSRegression from ._historical_linear_model import ( HistoricalLinearRegression as HistoricalLinearRegression, ) diff --git a/skfda/tests/test_fpls.py b/skfda/tests/test_fpls.py index df9225508..e0cc1c1bd 100644 --- a/skfda/tests/test_fpls.py +++ b/skfda/tests/test_fpls.py @@ -1,21 +1,21 @@ """Test the FPLS class.""" -import os + +from typing import Tuple import numpy as np -import pytest -import scipy from sklearn.cross_decomposition import PLSCanonical from skfda.datasets import fetch_tecator -from skfda.misc.operators import LinearDifferentialOperator -from skfda.misc.regularization import L2Regularization from skfda.preprocessing.dim_reduction import FPLS +from skfda.representation import FData, FDataGrid from skfda.representation.basis import BSplineBasis, FDataBasis +from skfda.typing._numpy import NDArrayFloat + class LatentVariablesModel: """Simulate model driven by latent variables.""" - def create_latent_variables(self, n_latent, n_samples): + def create_latent_variables(self, n_latent: int, n_samples: int) -> None: """Create latent variables for testing.""" self.rng = np.random.RandomState(0) self.n_latent = n_latent @@ -33,7 +33,11 @@ def create_latent_variables(self, n_latent, n_samples): ], ).T - def create_observed_multivariate_variable(self, n_features, noise=0): + def create_observed_multivariate_variable( + self, + n_features: int, + noise: float = 0, + ) -> Tuple[NDArrayFloat, NDArrayFloat]: """Create observed multivariate variable for testing.""" rotations = self.rng.uniform( low=0, @@ -49,7 +53,11 @@ def create_observed_multivariate_variable(self, n_features, noise=0): return observed_values, rotations - def create_observed_functional_variable(self, noise=0, discretized=False): + def create_observed_functional_variable( + self, + noise: float = 0, + discretized: bool = False, + ) -> Tuple[FData, FData]: """Create observed functional variable for testing.""" n_basis = 20 basis = BSplineBasis(n_basis=n_basis) @@ -78,12 +86,12 @@ def create_observed_functional_variable(self, noise=0, discretized=False): ) return observed_func, rotation - + class TestFPLS(LatentVariablesModel): """Test the FPLS class.""" - def test_sklearn(self): + def test_sklearn(self) -> None: """Compare results with sklearn.""" # Load the data X, y = fetch_tecator( @@ -93,8 +101,9 @@ def test_sklearn(self): integration_weights = np.ones(len(X.grid_points[0])) # Fit the model - fpls = FPLS(n_components=3, - integration_weights_X=integration_weights, + fpls = FPLS[FDataGrid, NDArrayFloat]( + n_components=3, + integration_weights_X=integration_weights, ) fpls.fit(X, y) @@ -115,12 +124,12 @@ def test_sklearn(self): # Check the transformation of X np.testing.assert_allclose( fpls.transform(X, y), - sklearnpls.transform(X.data_matrix[..., 0],y), + sklearnpls.transform(X.data_matrix[..., 0], y), rtol=rtol, atol=1e-5, ) - comp_x, comp_y = fpls.transform(X,y) + comp_x, comp_y = fpls.transform(X, y) fpls_inv_x, fpls_inv_y = fpls.inverse_transform(comp_x, comp_y) sklearnpls_inv_x, sklearnpls_inv_y = sklearnpls.inverse_transform( @@ -143,10 +152,11 @@ def test_sklearn(self): atol=atol, ) - - def test_basis_vs_grid(self,): + # Ignoring WPS210: Found too many local variables + def test_basis_vs_grid( # noqa: WPS210 + self, + ) -> None: """Test that the results are the same in basis and grid.""" - n_components = 5 self.create_latent_variables(n_latent=n_components, n_samples=100) @@ -161,7 +171,7 @@ def test_basis_vs_grid(self,): ) # Fit the model - fpls = FPLS(n_components=n_components) + fpls = FPLS[FData, FData](n_components=n_components) fpls.fit(X_observed, y_observed) # Convert the observed variables to grid @@ -175,20 +185,19 @@ def test_basis_vs_grid(self,): ) # Fit the model with the grid variables - fpls_grid = FPLS(n_components=n_components) + fpls_grid = FPLS[FData, FData](n_components=n_components) fpls_grid.fit(X_observed_grid, y_observed_grid) - # Check that the results are the same np.testing.assert_allclose( - np.abs(fpls.components_x_(self.grid_points)), - np.abs(fpls_grid.components_x_(self.grid_points)), + np.abs(fpls.x_components_(self.grid_points)), + np.abs(fpls_grid.x_components_(self.grid_points)), rtol=5e-3, ) np.testing.assert_allclose( - np.abs(fpls.components_y_(self.grid_points)), - np.abs(fpls_grid.components_y_(self.grid_points)), + np.abs(fpls.y_components_(self.grid_points)), + np.abs(fpls_grid.y_components_(self.grid_points)), rtol=5e-3, ) @@ -219,8 +228,3 @@ def test_basis_vs_grid(self,): fpsl_grid_y(grid_points), rtol=0.13, ) - - - - - diff --git a/skfda/tests/test_fpls_regression.py b/skfda/tests/test_fpls_regression.py index bbc5b0dc5..99aaddc50 100644 --- a/skfda/tests/test_fpls_regression.py +++ b/skfda/tests/test_fpls_regression.py @@ -10,14 +10,16 @@ from skfda.misc.operators import LinearDifferentialOperator from skfda.misc.regularization import L2Regularization from skfda.ml.regression import FPLSRegression -from skfda.representation.basis import BSplineBasis, FDataBasis +from skfda.representation import FData +from skfda.representation.basis import BSplineBasis +from skfda.tests.test_fpls import LatentVariablesModel +from skfda.typing._numpy import NDArrayFloat -class TestFPLSRegression: +class TestFPLSRegression(LatentVariablesModel): """Test the FPLSRegression class.""" - - def test_sklearn(self): + def test_sklearn(self) -> None: """Compare results with sklearn.""" # Load the data X, y = fetch_tecator( @@ -25,7 +27,11 @@ def test_sklearn(self): ) # Fit the model - fplsr = FPLSRegression(n_components=5, integration_weights_X=np.ones(len(X.grid_points[0])), scale=False) + fplsr = FPLSRegression[NDArrayFloat, NDArrayFloat]( + n_components=5, + integration_weights_X=np.ones(len(X.grid_points[0])), + scale=False, + ) fplsr.fit(X, y) sklearnpls = PLSRegression(n_components=5, scale=False) @@ -57,7 +63,7 @@ def test_sklearn(self): atol=atol, ) - def test_fda_usc_no_reg(self): + def test_fda_usc_no_reg(self) -> None: """ Test a multivariate regression with no regularization. @@ -79,7 +85,7 @@ def test_fda_usc_no_reg(self): """ # Results from fda.usc: path = os.path.join( - __file__[:-3]+ "_data", # Trim .py ending + f"{__file__[:-3]}_data", # Trim .py ending "test_fda_usc_no_reg_data.npy", ) with open(path, "rb") as f: @@ -93,14 +99,16 @@ def test_fda_usc_no_reg(self): return_X_y=True, ) - plsReg = FPLSRegression(n_components=5, integration_weights_X=np.ones(len(X.grid_points[0]))) - print(plsReg.integration_weights_X) + plsReg = FPLSRegression[FData, NDArrayFloat]( + n_components=5, + integration_weights_X=np.ones(len(X.grid_points[0])), + ) plsReg.fit(X, y) W = plsReg.fpls_.x_weights_ np.testing.assert_allclose(W, r_results, atol=1e-8) - def test_fda_usc_reg(self): + def test_fda_usc_reg(self) -> None: """ Test the FPLSRegression with regularization against fda.usc. @@ -143,7 +151,7 @@ def test_fda_usc_reg(self): ) # Fit the model - fplsr = FPLSRegression( + fplsr = FPLSRegression[FData, NDArrayFloat]( n_components=5, regularization_X=regularization, integration_weights_X=np.ones(len(X.grid_points[0])), @@ -151,7 +159,7 @@ def test_fda_usc_reg(self): fplsr.fit(X, y) path = os.path.join( - __file__[:-3] + "_data", # Trim .py ending + f"{__file__[:-3]}_data", # Trim .py ending "test_fda_usc_reg_data.npy", ) with open(path, "rb") as f: @@ -162,7 +170,7 @@ def test_fda_usc_reg(self): w_mat = fplsr.fpls_.x_weights_ @ np.diag(signs) np.testing.assert_allclose(w_mat, r_results, atol=6e-6, rtol=6e-4) - def test_basis_vs_grid(self): + def test_basis_vs_grid(self) -> None: """Test that the results are the same in basis and grid.""" X, y = fetch_tecator( return_X_y=True, @@ -173,9 +181,9 @@ def test_basis_vs_grid(self): X = X.to_basis(BSplineBasis(n_basis=20)) # Fit the model - fplsr = FPLSRegression(n_components=5) + fplsr = FPLSRegression[FData, NDArrayFloat](n_components=5) fplsr.fit(X, y) - basis_components = fplsr.fpls_.components_x_(original_grid_points) + basis_components = fplsr.fpls_.x_components_(original_grid_points) # Go back to grid new_grid_points = np.linspace( @@ -196,7 +204,7 @@ def test_basis_vs_grid(self): ) fplsr.fit(X, y) - grid_components = fplsr.fpls_.components_x_(original_grid_points) + grid_components = fplsr.fpls_.x_components_(original_grid_points) np.testing.assert_allclose( basis_components, @@ -205,11 +213,11 @@ def test_basis_vs_grid(self): ) @pytest.mark.parametrize("y_features", [1, 5]) - def test_multivariate_regression(self, y_features): + def test_multivariate_regression(self, y_features: int) -> None: """Test the multivariate regression. Consider both scalar and multivariate responses. - """ + """ self.create_latent_variables(n_latent=5, n_samples=100) # Check that the model is able to recover the latent variables @@ -221,7 +229,7 @@ def test_multivariate_regression(self, y_features): n_features=10, ) - plsreg = FPLSRegression(n_components=5) + plsreg = FPLSRegression[NDArrayFloat, NDArrayFloat](n_components=5) plsreg.fit(X_observed, y_observed) minimum_score = 0.99 @@ -230,7 +238,12 @@ def test_multivariate_regression(self, y_features): @pytest.mark.parametrize("discretized", [True, False]) @pytest.mark.parametrize("n_features", [1, 5]) @pytest.mark.parametrize("noise_std", [0, 1]) - def test_simple_regresion(self, discretized, n_features, noise_std): + def test_simple_regresion( + self, + discretized: bool, + n_features: int, + noise_std: float, + ) -> None: """Test multivariate regressor and functional response.""" self.create_latent_variables(n_latent=5, n_samples=100) @@ -245,7 +258,7 @@ def test_simple_regresion(self, discretized, n_features, noise_std): noise=noise_std, ) - plsreg = FPLSRegression(n_components=5) + plsreg = FPLSRegression[FData, NDArrayFloat](n_components=5) plsreg.fit(X_observed, y_observed) minimum_score = 0.99 if noise_std == 0 else 0.98 @@ -256,10 +269,10 @@ def test_simple_regresion(self, discretized, n_features, noise_std): @pytest.mark.parametrize("noise_std", [0, 1]) def test_simple_regresion_dataset_functional( self, - discretized_observed, - discretized_response, - noise_std, - ): + discretized_observed: bool, + discretized_response: bool, + noise_std: float, + ) -> None: """Test multivariate regressor and functional response.""" self.create_latent_variables(n_latent=5, n_samples=100) @@ -274,70 +287,7 @@ def test_simple_regresion_dataset_functional( noise=noise_std, ) - plsreg = FPLSRegression(n_components=5) + plsreg = FPLSRegression[FData, FData](n_components=5) plsreg.fit(X_observed, y_observed) minimum_score = 0.99 if noise_std == 0 else 0.98 assert plsreg.score(X_observed, y_observed) > minimum_score - - def create_latent_variables(self, n_latent, n_samples): - """Create latent variables for testing.""" - self.rng = np.random.RandomState(0) - self.n_latent = n_latent - self.n_samples = n_samples - - # Get the means of the latent variables - latent_means = self.rng.uniform(low=1, high=10, size=(n_latent)) - - # Sample the variables - self.latent_samples = np.array( - [ - self.rng.normal(loc=mean, scale=1, size=n_samples) - for mean in latent_means - ], - ).T - - def create_observed_multivariate_variable(self, n_features, noise=0): - """Create observed multivariate variable for testing.""" - rotations = self.rng.uniform( - low=0, - high=10, - size=(n_features, self.n_latent), - ) - - observed_values = self.latent_samples @ rotations.T - observed_noise = noise * self.rng.normal( - size=(self.n_samples, n_features), - ) - observed_values += observed_noise - - return observed_values, rotations - - def create_observed_functional_variable(self, noise=0, discretized=False): - """Create observed functional variable for testing.""" - n_basis = 20 - basis = BSplineBasis(n_basis=n_basis) - - rotation_coef = self.rng.uniform( - low=0, - high=10, - size=(self.n_latent, n_basis), - ) - rotation = FDataBasis(basis, rotation_coef) - - observed_coef = self.latent_samples @ rotation_coef - observed_func = FDataBasis(basis, observed_coef) - - if discretized: - observed_func = observed_func.to_grid( - grid_points=np.linspace(0, 1, 100), - ) - - func_noise = noise * self.rng.normal(size=(self.n_samples, 100)) - observed_func.data_matrix[..., 0] += func_noise - - else: - observed_func.coefficients += noise * self.rng.normal( - size=(self.n_samples, n_basis), - ) - - return observed_func, rotation From beeb58c09ce0e55335e93deaec0f8d7d387ca000 Mon Sep 17 00:00:00 2001 From: David del Val Date: Fri, 23 Jun 2023 21:34:33 +0200 Subject: [PATCH 18/46] Fix FPLSRegression test --- skfda/tests/test_fpls_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/tests/test_fpls_regression.py b/skfda/tests/test_fpls_regression.py index 99aaddc50..99e6fcd6b 100644 --- a/skfda/tests/test_fpls_regression.py +++ b/skfda/tests/test_fpls_regression.py @@ -41,7 +41,7 @@ def test_sklearn(self) -> None: atol = 1e-6 # Compare the results np.testing.assert_allclose( - fplsr.coef_.flatten(), + fplsr.fpls_.coef_.flatten(), # noqa: WPS437 sklearnpls.coef_.flatten(), rtol=rtol, atol=atol, From e171549dea734728176f84b79d630a9e708c18a5 Mon Sep 17 00:00:00 2001 From: David del Val Date: Sat, 24 Jun 2023 15:56:03 +0200 Subject: [PATCH 19/46] Ignore linter warning in init --- skfda/preprocessing/dim_reduction/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/preprocessing/dim_reduction/__init__.py b/skfda/preprocessing/dim_reduction/__init__.py index b9fef1bb8..e5c221434 100644 --- a/skfda/preprocessing/dim_reduction/__init__.py +++ b/skfda/preprocessing/dim_reduction/__init__.py @@ -22,7 +22,7 @@ from ._fpls import FPLS as FPLS -def __getattr__(name: str) -> Any: +def __getattr__(name: str) -> Any: # noqa: WPS413 if name in {"projection", "feature_extraction"}: return importlib.import_module(f".{name}", __name__) return _normal_getattr(name) From d3e4178a599326e3016ef3ffd5959866db74f3d1 Mon Sep 17 00:00:00 2001 From: David del Val Date: Sat, 24 Jun 2023 18:19:01 +0200 Subject: [PATCH 20/46] Specify ignore type --- skfda/preprocessing/dim_reduction/_fpls.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index 434301e06..2afb63ae9 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -257,7 +257,7 @@ def _initialize_data( @_initialize_data.register def _initialize_data_multivariate( self, - data: np.ndarray, # type: ignore + data: np.ndarray, # type: ignore[type-arg] integration_weights: None, regularization: None, weights_basis: None, @@ -280,7 +280,7 @@ def _initialize_data_multivariate( def _initialize_data_fdatagrid( self, data: FDataGrid, - integration_weights: Optional[np.ndarray], # type: ignore + integration_weights: Optional[np.ndarray], # type: ignore[type-arg] regularization: Optional[L2Regularization[Any]], weights_basis: None, ) -> None: From ec5471c9e2a3f36ac3495a0df2f250d08a3984a1 Mon Sep 17 00:00:00 2001 From: David del Val Date: Sat, 24 Jun 2023 18:20:19 +0200 Subject: [PATCH 21/46] Test different mypy command --- .github/workflows/mypy.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml index ab848718b..3c2d690ce 100644 --- a/.github/workflows/mypy.yml +++ b/.github/workflows/mypy.yml @@ -16,5 +16,6 @@ jobs: reporter: github-pr-review # Change reporter level if you need. # GitHub Status Check won't become failure with warning. - level: warning + level: error + setup_command: pip install -r requirements.txt pytest mypy mypy_flags: '' From b0d82125677f89d6169726a019a279187216f0a4 Mon Sep 17 00:00:00 2001 From: David del Val Date: Sat, 24 Jun 2023 18:25:49 +0200 Subject: [PATCH 22/46] Revert "Test different mypy command" This reverts commit ec5471c9e2a3f36ac3495a0df2f250d08a3984a1. --- .github/workflows/mypy.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml index 3c2d690ce..ab848718b 100644 --- a/.github/workflows/mypy.yml +++ b/.github/workflows/mypy.yml @@ -16,6 +16,5 @@ jobs: reporter: github-pr-review # Change reporter level if you need. # GitHub Status Check won't become failure with warning. - level: error - setup_command: pip install -r requirements.txt pytest mypy + level: warning mypy_flags: '' From 9c35a8c764d9bac2d85740cab80f2629cfee7483 Mon Sep 17 00:00:00 2001 From: David del Val Date: Mon, 26 Jun 2023 15:58:56 +0200 Subject: [PATCH 23/46] Add an epsilon to normalization calculations --- skfda/preprocessing/dim_reduction/_fpls.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index 2afb63ae9..1ee936112 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -14,6 +14,7 @@ from ...typing._numpy import NDArrayFloat POWER_SOLVER_EPS = 1e-15 +INV_EPS = 1e-15 def _power_solver(X: NDArrayFloat) -> NDArrayFloat: @@ -63,13 +64,13 @@ def _calculate_weights( w = L_X_inv.T @ w # Normalize - w /= np.sqrt(np.dot(w.T, G_ww @ w)) + w /= np.sqrt(np.dot(w.T, G_ww @ w)) + INV_EPS # Undo the transformation c = L_Y_inv.T @ c # Normalize the other weight - c /= np.sqrt(np.dot(c.T, G_cc @ c)) + c /= np.sqrt(np.dot(c.T, G_cc @ c)) + INV_EPS return w, c @@ -170,11 +171,13 @@ def _pls_nipals( # noqa: WPS320 t = np.dot(X @ G_xw, w) u = np.dot(Y @ G_yc, c) - p = np.dot(X.T, t) / np.dot(t.T, t) + p = np.dot(X.T, t) / (np.dot(t.T, t) + INV_EPS) y_proyection = t if deflation == "reg" else u - q = np.dot(Y.T, y_proyection) / np.dot(y_proyection, y_proyection) + q = np.dot(Y.T, y_proyection) / ( + np.dot(y_proyection, y_proyection) + INV_EPS + ) X = X - np.outer(t, p) Y = Y - np.outer(y_proyection, q) From 0d96714495379e62f0f990e12d7d27b88c9f5e7c Mon Sep 17 00:00:00 2001 From: David del Val Date: Mon, 26 Jun 2023 15:59:14 +0200 Subject: [PATCH 24/46] Fix bugs --- skfda/preprocessing/dim_reduction/_fpls.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index 1ee936112..3cf921960 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -266,13 +266,14 @@ def _initialize_data_multivariate( weights_basis: None, ) -> None: """Initialize the data of the block.""" + if len(data.shape) == 1: + data = data[:, np.newaxis] + self.G_data_weights = np.identity(data.shape[1]) self.G_weights = np.identity(data.shape[1]) self.mean = np.mean(data, axis=0) self.data_matrix = data - self.mean - if len(self.data_matrix.shape) == 1: - self.data_matrix = self.data_matrix[:, np.newaxis] self.data = data - self.mean self.regularization_matrix = np.zeros( @@ -403,6 +404,8 @@ def transform( data_basis.coefficients @ self.G_data_weights @ self.rotations ) elif isinstance(data, np.ndarray): + if len(data.shape) == 1: + data = data[:, np.newaxis] data_array = data - cast(NDArrayFloat, self.mean) return data_array @ self.rotations @@ -419,7 +422,7 @@ def inverse_transform( reconstructed_grid = self.data.copy( data_matrix=components @ self.loadings.T, sample_names=(None,) * components.shape[0], - dataset_name=f"FPLS {self.label} components", + dataset_name=f"FPLS {self.label} inverse transformed", ) return reconstructed_grid + cast(FDataGrid, self.mean) @@ -427,7 +430,7 @@ def inverse_transform( reconstructed_basis = self.data.copy( coefficients=components @ self.loadings.T, sample_names=(None,) * components.shape[0], - dataset_name=f"FPLS {self.label} components", + dataset_name=f"FPLS {self.label} inverse transformed", ) return reconstructed_basis + cast(FDataBasis, self.mean) @@ -506,7 +509,7 @@ def _initialize_blocks(self, X: InputTypeX, Y: InputTypeY) -> None: self._y_block = _FPLSBlock( data=Y, n_components=self.n_components, - label="X", + label="Y", integration_weights=self.integration_weights_Y, regularization=self.regularization_Y, weights_basis=self.weight_basis_Y, From 1080dcf5ef1bf0efd82ea1b4b9437a20cbb1f295 Mon Sep 17 00:00:00 2001 From: David del Val Date: Mon, 26 Jun 2023 18:00:45 +0200 Subject: [PATCH 25/46] Tidy fpls implementation --- skfda/ml/regression/_fpls_regression.py | 29 ++- skfda/preprocessing/dim_reduction/_fpls.py | 242 ++++++++++++++------- skfda/tests/test_fpls.py | 2 +- skfda/tests/test_fpls_regression.py | 9 +- 4 files changed, 187 insertions(+), 95 deletions(-) diff --git a/skfda/ml/regression/_fpls_regression.py b/skfda/ml/regression/_fpls_regression.py index 1abf1ebb3..ce3a8e090 100644 --- a/skfda/ml/regression/_fpls_regression.py +++ b/skfda/ml/regression/_fpls_regression.py @@ -37,20 +37,18 @@ class FPLSRegression( def __init__( self, n_components: int = 5, - scale: bool = False, - integration_weights_X: NDArrayFloat | None = None, - integration_weights_Y: NDArrayFloat | None = None, regularization_X: L2Regularization[Any] | None = None, - weight_basis_X: Basis | None = None, - weight_basis_Y: Basis | None = None, + component_basis_X: Basis | None = None, + component_basis_Y: Basis | None = None, + _integration_weights_X: NDArrayFloat | None = None, + _integration_weights_Y: NDArrayFloat | None = None, ) -> None: self.n_components = n_components - self.scale = scale - self.integration_weights_X = integration_weights_X - self.integration_weights_Y = integration_weights_Y + self._integration_weights_X = _integration_weights_X + self._integration_weights_Y = _integration_weights_Y self.regularization_X = regularization_X - self.weight_basis_X = weight_basis_X - self.weight_basis_Y = weight_basis_Y + self.weight_basis_X = component_basis_X + self.weight_basis_Y = component_basis_Y def fit( self, @@ -60,13 +58,12 @@ def fit( """Fit the model using X as training data and y as target values.""" self.fpls_ = FPLS[InputType, OutputType]( n_components=self.n_components, - scale=False, - integration_weights_X=self.integration_weights_X, - integration_weights_Y=self.integration_weights_Y, regularization_X=self.regularization_X, - weight_basis_X=self.weight_basis_X, - weight_basis_Y=self.weight_basis_Y, - deflation_mode="reg", + component_basis_X=self.weight_basis_X, + component_basis_Y=self.weight_basis_Y, + _integration_weights_X=self._integration_weights_X, + _integration_weights_Y=self._integration_weights_Y, + _deflation_mode="reg", ) self.fpls_.fit(X, y) diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index 3cf921960..be35c406c 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -211,17 +211,58 @@ class _FPLSBlock(Generic[BlockType]): # noqa: WPS230 """ Class to store the data of a block of a FPLS model. - Attributes: - - n_components: number of components to extract. - - label: label of the block (X or Y). - - G_data_weights: (n_samples, n_samples) - The inner product matrix for the data and weights - (The discretization weights matrix in the case of FDataGrid). - - G_weights: (n_samples, n_samples) - The inner product matrix for the weights - (The discretization weights matrix in the case of FDataGrid). - - data_matrix: (n_samples, n_features) - The data matrix of the block. + This is an internal class, intended to be used only by the FPLS class. + It provides a common interface for the different types of blocks, + simplifying the implementation of the FPLS algorithm. + + There are three types of blocks (depending on BlockType): + mutltivariate (NDArrayFloat), basis (FDataBasis) and grid (FDataGrid). + + In the following, n_samples is the number of samples of the block. + n_features is: + - The number of features of the block in the case of multivariate + block. + - The number of basis functions in the case of a FDataBasis block. + - The number of points in the case of a FDataGrid block. + + Parameters: + - data: The data of the block. + - n_components: Number of components to extract. + - label: Label of the block (X or Y). + - integration_weights: Array with shape (n_features,). + The integration weights of the block. It must be None for + multivariate or FDataBasis blocks. + - regularization: The regularization to apply to the block. + It must be None for multivariate blocks. + - weights_basis: The basis of the weights. It must be None for + multivariate or grid blocks. + + Attributes set during the initialization: + - n_components: `int`. Number of components to extract. + - label: `str`. Label of the block (X or Y). + - data: `BlockType`. The data of the block. + - data_matrix: `NDArrayFloat` (n_samples, n_features). The data + matrix of the block. + - mean: `BlockType`. The mean of the data. + - weights_basis: `Basis`. The basis of the weights. + - regularization_matrix: `NDArrayFloat` (n_features, n_features). + Inner product matrix of the regularization operator applied + to the basis or the grid. + - integration_matrix: `NDArrayFloat` (n_features, n_features). + Diagonal matrix of the integration weights. + - G_data_weights: `NDArrayFloat` (n_samples, n_samples). The inner + product matrix for the data and weights + (The discretization matrix in grid blocks). + - G_weights: `NDArrayFloat` (n_samples, n_samples). The inner product + matrix for the weights (The discretization matrix in grid blocks). + + Attributes set after calculating the components: + - rotations: `NDArrayFloat` (n_features, n_components). The + rotations of the block. + - loadings: `NDArrayFloat` (n_features, n_components). The + loadings of the block. + - components: `BlockType`. The components of the block. + """ mean: BlockType @@ -254,7 +295,8 @@ def _initialize_data( ) -> None: """Initialize the data of the block.""" raise NotImplementedError( - f"Data type {type(data)} not supported", + f"Data type {type(data)} or combination of arguments" + "not supported", ) @_initialize_data.register @@ -265,7 +307,7 @@ def _initialize_data_multivariate( regularization: None, weights_basis: None, ) -> None: - """Initialize the data of the block.""" + """Initialize the data of a multivariate block.""" if len(data.shape) == 1: data = data[:, np.newaxis] @@ -281,61 +323,70 @@ def _initialize_data_multivariate( ) @_initialize_data.register - def _initialize_data_fdatagrid( + def _initialize_data_grid( self, data: FDataGrid, integration_weights: Optional[np.ndarray], # type: ignore[type-arg] regularization: Optional[L2Regularization[Any]], weights_basis: None, ) -> None: + """Initialize the data of a grid block.""" self.mean = data.mean() - data = data - self.mean - self.data = data + self.data = data - self.mean + self.data_matrix = data.data_matrix[..., 0] - data_mat = data.data_matrix[..., 0] + # Arrange the integration weights in a diagonal matrix + # By default, use Simpson's rule if integration_weights is None: - identity = np.identity(data_mat.shape[1]) + identity = np.identity(self.data_matrix.shape[1]) integration_weights = scipy.integrate.simps( identity, - data.grid_points[0], + self.data.grid_points[0], ) self.integration_weights = np.diag(integration_weights) + # Compute the regularization matrix + # By default, all zeros (no regularization) regularization_matrix = None if regularization is not None: regularization_matrix = compute_penalty_matrix( - basis_iterable=(_GridBasis(grid_points=data.grid_points),), + basis_iterable=( + _GridBasis(grid_points=self.data.grid_points), + ), regularization_parameter=1, regularization=regularization, ) if regularization_matrix is None: regularization_matrix = np.zeros( - (data_mat.shape[1], data_mat.shape[1]), + (self.data_matrix.shape[1], self.data_matrix.shape[1]), ) self.regularization_matrix = regularization_matrix self.G_data_weights = self.integration_weights self.G_weights = self.integration_weights - self.data_matrix = data_mat @_initialize_data.register - def _initialize_data_fdatabasis( + def _initialize_data_basis( self, data: FDataBasis, integration_weights: None, regularization: Optional[L2Regularization[Any]], weights_basis: Optional[Basis], ) -> None: + """Initialize the data of a basis block.""" self.mean = data.mean() - data = data - self.mean - self.data = data + self.data = data - self.mean + self.data_matrix = self.data.coefficients - data_mat = data.coefficients + # By default, use the basis of the input data + # for the weights if weights_basis is None: self.weights_basis = data.basis else: self.weights_basis = weights_basis + # Compute the regularization matrix + # By default, all zeros (no regularization) regularization_matrix = None if regularization is not None: regularization_matrix = compute_penalty_matrix( @@ -345,42 +396,52 @@ def _initialize_data_fdatabasis( ) if regularization_matrix is None: regularization_matrix = np.zeros( - (data_mat.shape[1], data_mat.shape[1]), + (self.data_matrix.shape[1], self.data_matrix.shape[1]), ) self.regularization_matrix = regularization_matrix self.G_weights = self.weights_basis.gram_matrix() - self.G_data_weights = data.basis.inner_product_matrix( + self.G_data_weights = self.data.basis.inner_product_matrix( self.weights_basis, ) - self.data_matrix = data_mat - def calculate_components( + def set_nipals_results( self, rotations: NDArrayFloat, loadings: NDArrayFloat, - ) -> BlockType: + ) -> None: """Set the results of NIPALS.""" self.rotations = rotations - self.loadings = loadings - self.components = self._calculate_component() - return self.components + self.loadings_matrix = loadings + self.components = self._to_block_type(rotations, "Component") + self.loadings = self._to_block_type(self.loadings_matrix, "Loading") - def _calculate_component(self) -> BlockType: + def _to_block_type( + self, + nipals_matrix: NDArrayFloat, + title: str, + ) -> BlockType: + # Each column of the matrix generated by NIPALS corresponds to + # an obsevation or direction. Therefore, there must be transposed + # so that each row corresponds ot an observation or direction if isinstance(self.data, FDataGrid): return self.data.copy( - data_matrix=np.transpose(self.rotations), - sample_names=(None,) * self.n_components, - dataset_name=f"FPLS {self.label} components", + data_matrix=nipals_matrix.T, + sample_names=[ + f"{title} {i}" for i in range(self.n_components) + ], + dataset_name=f"FPLS {self.label} {title}s", ) elif isinstance(self.data, FDataBasis): return self.data.copy( - coefficients=np.transpose(self.rotations), - sample_names=(None,) * self.n_components, - dataset_name=f"FPLS {self.label} components", + coefficients=nipals_matrix.T, + sample_names=[ + f"{title} {i}" for i in range(self.n_components) + ], + dataset_name=f"FPLS {self.label} {title}s", ) elif isinstance(self.data, np.ndarray): - return cast(BlockType, self.rotations) + return cast(BlockType, nipals_matrix.T) raise NotImplementedError( f"Data type {type(self.data)} not supported", @@ -392,7 +453,7 @@ def transform( ) -> NDArrayFloat: """Transform from the data space to the component space.""" if isinstance(data, FDataGrid): - data_grid = data - cast(FDataGrid, self.mean) + data_grid = data - self.mean return ( data_grid.data_matrix[..., 0] @ self.G_data_weights @@ -404,9 +465,10 @@ def transform( data_basis.coefficients @ self.G_data_weights @ self.rotations ) elif isinstance(data, np.ndarray): - if len(data.shape) == 1: - data = data[:, np.newaxis] - data_array = data - cast(NDArrayFloat, self.mean) + data_array = data + if len(data_array.shape) == 1: + data_array = data_array[:, np.newaxis] + data_array = data_array - self.mean return data_array @ self.rotations raise NotImplementedError( @@ -420,7 +482,7 @@ def inverse_transform( """Transform from the component space to the data space.""" if isinstance(self.data, FDataGrid): reconstructed_grid = self.data.copy( - data_matrix=components @ self.loadings.T, + data_matrix=components @ self.loadings_matrix.T, sample_names=(None,) * components.shape[0], dataset_name=f"FPLS {self.label} inverse transformed", ) @@ -428,14 +490,14 @@ def inverse_transform( elif isinstance(self.data, FDataBasis): reconstructed_basis = self.data.copy( - coefficients=components @ self.loadings.T, + coefficients=components @ self.loadings_matrix.T, sample_names=(None,) * components.shape[0], dataset_name=f"FPLS {self.label} inverse transformed", ) return reconstructed_basis + cast(FDataBasis, self.mean) elif isinstance(self.data, np.ndarray): - reconstructed = components @ self.loadings.T + reconstructed = components @ self.loadings_matrix.T reconstructed += self.mean return cast(BlockType, reconstructed) @@ -467,52 +529,82 @@ class FPLS( # noqa: WPS230 BaseEstimator, Generic[InputTypeX, InputTypeY], ): - """ + r""" Functional Partial Least Squares Regression. + Given two data blocks, X and Y, FPLS finds pairs of latent variables + (aka extracted variables). Each extracted variable is the result of + projecting the data along a direction (vector or function). + In this class, the term component is used to refer to these + projection directions, otherwise known as rotations. + + This is a generic class. When instantiated, the type of the + data in each block can be specified. The possiblities are: + NDArrayFloat, FDataGrid and FDataBasis. + + Parameters: + n_components: Number of components to extract. + regularization_X: Regularization to apply to the X block. + regularization_Y: Regularization to apply to the Y block. + component_basis_X: Basis to use for the X block. Only + applicable if X is a FDataBasis. Otherwise it must be None. + component_basis_Y: Basis to use for the Y block. Only + applicable if Y is a FDataBasis. Otherwise it must be None. + + _deflation_mode: Mode to use for deflation. Can be "can" + (dimensionality reduction) or "reg" (regression). + _integration_weights_X: One-dimensional array with the integration + weights for the X block. + Only applicable if X is a FDataGrid. Otherwise it must be None. + _integration_weights_Y: One-dimensional array with the integration + weights for the Y block. + Only applicable if Y is a FDataGrid. Otherwise it must be None. + Attributes: - n_components: Number of components to keep. - ... + x_components\_: Components of the X block (Same type as X). + y_components\_: Components of the Y block (Same type as y). + x_loadings + + """ def __init__( self, n_components: int = 5, - scale: bool = False, - integration_weights_X: NDArrayFloat | None = None, - integration_weights_Y: NDArrayFloat | None = None, + *, regularization_X: L2Regularization[InputTypeX] | None = None, regularization_Y: L2Regularization[InputTypeY] | None = None, - weight_basis_X: Basis | None = None, - weight_basis_Y: Basis | None = None, - deflation_mode: str = "can", + component_basis_X: Basis | None = None, + component_basis_Y: Basis | None = None, + _deflation_mode: str = "can", + _integration_weights_X: NDArrayFloat | None = None, + _integration_weights_Y: NDArrayFloat | None = None, ) -> None: self.n_components = n_components - self.scale = scale - self.integration_weights_X = integration_weights_X - self.integration_weights_Y = integration_weights_Y + self._integration_weights_X = _integration_weights_X + self._integration_weights_Y = _integration_weights_Y self.regularization_X = regularization_X self.regularization_Y = regularization_Y - self.weight_basis_X = weight_basis_X - self.weight_basis_Y = weight_basis_Y - self.deflation_mode = deflation_mode + self.component_basis_X = component_basis_X + self.component_basis_Y = component_basis_Y + self._deflation_mode = _deflation_mode def _initialize_blocks(self, X: InputTypeX, Y: InputTypeY) -> None: self._x_block = _FPLSBlock( data=X, n_components=self.n_components, label="X", - integration_weights=self.integration_weights_X, + integration_weights=self._integration_weights_X, regularization=self.regularization_X, - weights_basis=self.weight_basis_X, + weights_basis=self.component_basis_X, ) self._y_block = _FPLSBlock( data=Y, n_components=self.n_components, label="Y", - integration_weights=self.integration_weights_Y, + integration_weights=self._integration_weights_Y, regularization=self.regularization_Y, - weights_basis=self.weight_basis_Y, + weights_basis=self.component_basis_Y, ) def fit( @@ -545,15 +637,13 @@ def fit( G_yc=self._y_block.G_data_weights, L_X_inv=self._x_block.get_cholesky_inv_penalty_matrix(), L_Y_inv=self._y_block.get_cholesky_inv_penalty_matrix(), - deflation=self.deflation_mode, + deflation=self._deflation_mode, ) self.x_weights_ = W self.y_weights_ = C self.x_scores_ = T self.y_scores_ = U - self.x_loadings_ = P - self.y_loadings_ = Q self.x_rotations_ = W @ np.linalg.pinv( P.T @ self._x_block.G_data_weights @ W, @@ -563,15 +653,21 @@ def fit( Q.T @ self._y_block.G_data_weights @ C, ) - self.x_components_ = self._x_block.calculate_components( + self._x_block.set_nipals_results( rotations=self.x_rotations_, loadings=P, ) - self.y_components_ = self._y_block.calculate_components( + self._y_block.set_nipals_results( rotations=self.y_rotations_, loadings=Q, ) + self.x_components_ = self._x_block.components + self.y_components_ = self._y_block.components + + self.x_loadings_ = self._x_block.loadings + self.y_loadings_ = self._y_block.loadings + self.coef_ = self.x_rotations_ @ Q.T return self diff --git a/skfda/tests/test_fpls.py b/skfda/tests/test_fpls.py index e0cc1c1bd..c289cc9cb 100644 --- a/skfda/tests/test_fpls.py +++ b/skfda/tests/test_fpls.py @@ -103,7 +103,7 @@ def test_sklearn(self) -> None: # Fit the model fpls = FPLS[FDataGrid, NDArrayFloat]( n_components=3, - integration_weights_X=integration_weights, + _integration_weights_X=integration_weights, ) fpls.fit(X, y) diff --git a/skfda/tests/test_fpls_regression.py b/skfda/tests/test_fpls_regression.py index 99e6fcd6b..5119fba59 100644 --- a/skfda/tests/test_fpls_regression.py +++ b/skfda/tests/test_fpls_regression.py @@ -29,8 +29,7 @@ def test_sklearn(self) -> None: # Fit the model fplsr = FPLSRegression[NDArrayFloat, NDArrayFloat]( n_components=5, - integration_weights_X=np.ones(len(X.grid_points[0])), - scale=False, + _integration_weights_X=np.ones(len(X.grid_points[0])), ) fplsr.fit(X, y) @@ -101,7 +100,7 @@ def test_fda_usc_no_reg(self) -> None: plsReg = FPLSRegression[FData, NDArrayFloat]( n_components=5, - integration_weights_X=np.ones(len(X.grid_points[0])), + _integration_weights_X=np.ones(len(X.grid_points[0])), ) plsReg.fit(X, y) @@ -154,7 +153,7 @@ def test_fda_usc_reg(self) -> None: fplsr = FPLSRegression[FData, NDArrayFloat]( n_components=5, regularization_X=regularization, - integration_weights_X=np.ones(len(X.grid_points[0])), + _integration_weights_X=np.ones(len(X.grid_points[0])), ) fplsr.fit(X, y) @@ -200,7 +199,7 @@ def test_basis_vs_grid(self) -> None: # Fit the model with weights fplsr = FPLSRegression( n_components=5, - integration_weights_X=ss_weights, + _integration_weights_X=ss_weights, ) fplsr.fit(X, y) From 6b61c178f57ed512a0f925fe55077359a7c54bbe Mon Sep 17 00:00:00 2001 From: David del Val Date: Tue, 27 Jun 2023 19:00:07 +0200 Subject: [PATCH 26/46] Complete fpls documentation --- skfda/ml/regression/_fpls_regression.py | 38 ++++++-- skfda/preprocessing/dim_reduction/_fpls.py | 105 ++++++++++++--------- skfda/tests/test_fpls.py | 10 +- skfda/tests/test_fpls_regression.py | 8 +- 4 files changed, 100 insertions(+), 61 deletions(-) diff --git a/skfda/ml/regression/_fpls_regression.py b/skfda/ml/regression/_fpls_regression.py index ce3a8e090..737f598bd 100644 --- a/skfda/ml/regression/_fpls_regression.py +++ b/skfda/ml/regression/_fpls_regression.py @@ -26,12 +26,26 @@ class FPLSRegression( BaseEstimator, RegressorMixin[InputType, OutputType], ): - """ + r""" Regression using Functional Partial Least Squares. - Args: + Parameters: n_components: Number of components to keep. Defaults to 5. - ... + regularization_X: Regularization for the calculation of the X weights. + component_basis_X: Basis to use for the X block. Only + applicable if X is a FDataBasis. Otherwise it must be None. + component_basis_Y: Basis to use for the Y block. Only + applicable if Y is a FDataBasis. Otherwise it must be None. + _integration_weights_X: One-dimensional array with the integration + weights for the X block. + Only applicable if X is a FDataGrid. Otherwise it must be None. + _integration_weights_Y: One-dimensional array with the integration + weights for the Y block. + Only applicable if Y is a FDataGrid. Otherwise it must be None. + + Attributes: + coef\_: Coefficients of the linear model. + fpls\_: FPLS object used to fit the model. """ def __init__( @@ -55,7 +69,16 @@ def fit( X: InputType, y: OutputType, ) -> FPLSRegression[InputType, OutputType]: - """Fit the model using X as training data and y as target values.""" + """ + Fit the model using the data for both blocks. + + Args: + X: Data of the X block + y: Data of the Y block + + Returns: + self + """ self.fpls_ = FPLS[InputType, OutputType]( n_components=self.n_components, regularization_X=self.regularization_X, @@ -68,17 +91,20 @@ def fit( self.fpls_.fit(X, y) + self.coef_ = ( + self.fpls_.x_rotations_matrix_ + @ self.fpls_.y_loadings_matrix_.T + ) return self def predict(self, X: InputType) -> OutputType: """Predict using the model. Args: - X: FData to predict. + X: Data to predict. Returns: Predicted values. - """ check_is_fitted(self) return self.fpls_.inverse_transform_y(self.fpls_.transform_x(X)) diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index be35c406c..08c4234a8 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -237,31 +237,30 @@ class _FPLSBlock(Generic[BlockType]): # noqa: WPS230 - weights_basis: The basis of the weights. It must be None for multivariate or grid blocks. - Attributes set during the initialization: - - n_components: `int`. Number of components to extract. - - label: `str`. Label of the block (X or Y). - - data: `BlockType`. The data of the block. - - data_matrix: `NDArrayFloat` (n_samples, n_features). The data + Attributes: + - n_components: Number of components to extract. + - label: Label of the block (X or Y). + - data: The data of the block. + - data_matrix: (n_samples, n_features) matrix. The data matrix of the block. - - mean: `BlockType`. The mean of the data. - - weights_basis: `Basis`. The basis of the weights. - - regularization_matrix: `NDArrayFloat` (n_features, n_features). + - mean: The mean of the data. + - weights_basis: The basis of the weights. + - regularization_matrix: (n_features, n_features) matrix. Inner product matrix of the regularization operator applied to the basis or the grid. - - integration_matrix: `NDArrayFloat` (n_features, n_features). + - integration_matrix: (n_features, n_features) matrix. Diagonal matrix of the integration weights. - - G_data_weights: `NDArrayFloat` (n_samples, n_samples). The inner + - G_data_weights: (n_samples, n_samples) matrix. The inner product matrix for the data and weights (The discretization matrix in grid blocks). - - G_weights: `NDArrayFloat` (n_samples, n_samples). The inner product + - G_weights: (n_samples, n_samples) matrix. The inner product matrix for the weights (The discretization matrix in grid blocks). - Attributes set after calculating the components: - - rotations: `NDArrayFloat` (n_features, n_components). The + - rotations: (n_features, n_components) matrix. The rotations of the block. - - loadings: `NDArrayFloat` (n_features, n_components). The + - loadings_matrix: (n_features, n_components) matrix. The loadings of the block. - - components: `BlockType`. The components of the block. + - loadings: The loadings of the block (same type as the data). """ @@ -411,10 +410,10 @@ def set_nipals_results( loadings: NDArrayFloat, ) -> None: """Set the results of NIPALS.""" - self.rotations = rotations + self.rotations_matrix = rotations self.loadings_matrix = loadings - self.components = self._to_block_type(rotations, "Component") - self.loadings = self._to_block_type(self.loadings_matrix, "Loading") + self.rotations = self._to_block_type(self.rotations_matrix, "rotation") + self.loadings = self._to_block_type(self.loadings_matrix, "loading") def _to_block_type( self, @@ -422,13 +421,14 @@ def _to_block_type( title: str, ) -> BlockType: # Each column of the matrix generated by NIPALS corresponds to - # an obsevation or direction. Therefore, there must be transposed + # an obsevation or direction. Therefore, they must be transposed # so that each row corresponds ot an observation or direction if isinstance(self.data, FDataGrid): return self.data.copy( data_matrix=nipals_matrix.T, sample_names=[ - f"{title} {i}" for i in range(self.n_components) + f"{title.capitalize()} {i}" + for i in range(self.n_components) ], dataset_name=f"FPLS {self.label} {title}s", ) @@ -436,7 +436,8 @@ def _to_block_type( return self.data.copy( coefficients=nipals_matrix.T, sample_names=[ - f"{title} {i}" for i in range(self.n_components) + f"{title.capitalize()} {i}" + for i in range(self.n_components) ], dataset_name=f"FPLS {self.label} {title}s", ) @@ -457,19 +458,21 @@ def transform( return ( data_grid.data_matrix[..., 0] @ self.G_data_weights - @ self.rotations + @ self.rotations_matrix ) elif isinstance(data, FDataBasis): data_basis = data - cast(FDataBasis, self.mean) return ( - data_basis.coefficients @ self.G_data_weights @ self.rotations + data_basis.coefficients + @ self.G_data_weights + @ self.rotations_matrix ) elif isinstance(data, np.ndarray): data_array = data if len(data_array.shape) == 1: data_array = data_array[:, np.newaxis] data_array = data_array - self.mean - return data_array @ self.rotations + return data_array @ self.rotations_matrix raise NotImplementedError( f"Data type {type(data)} not supported", @@ -532,12 +535,6 @@ class FPLS( # noqa: WPS230 r""" Functional Partial Least Squares Regression. - Given two data blocks, X and Y, FPLS finds pairs of latent variables - (aka extracted variables). Each extracted variable is the result of - projecting the data along a direction (vector or function). - In this class, the term component is used to refer to these - projection directions, otherwise known as rotations. - This is a generic class. When instantiated, the type of the data in each block can be specified. The possiblities are: NDArrayFloat, FDataGrid and FDataBasis. @@ -550,7 +547,6 @@ class FPLS( # noqa: WPS230 applicable if X is a FDataBasis. Otherwise it must be None. component_basis_Y: Basis to use for the Y block. Only applicable if Y is a FDataBasis. Otherwise it must be None. - _deflation_mode: Mode to use for deflation. Can be "can" (dimensionality reduction) or "reg" (regression). _integration_weights_X: One-dimensional array with the integration @@ -561,10 +557,26 @@ class FPLS( # noqa: WPS230 Only applicable if Y is a FDataGrid. Otherwise it must be None. Attributes: - x_components\_: Components of the X block (Same type as X). - y_components\_: Components of the Y block (Same type as y). - x_loadings - + x_weights\_: (n_features_X, n_components) array with the X weights + extracted by NIPALS. + y_weights\_: (n_features_Y, n_components) array with the Y weights + extracted by NIPALS. + x_scores\_: (n_samples, n_components) array with the X scores + extracted by NIPALS. + y_scores\_: (n_samples, n_components) array with the Y scores + extracted by NIPALS. + x_rotations_matrix\_: (n_features_X, n_components) array with the + X rotations. + y_rotations_matrix\_: (n_features_Y, n_components) array with the + Y rotations. + x_loadings_matrix\_: (n_features_X, n_components) array with the + X loadings. + y_loadings_matrix\_: (n_features_Y, n_components) array with the + Y loadings. + x_rotations\_: Projection directions for the X block (same type as X). + y_rotations\_: Projection directions for the Y block (same type as Y). + x_loadings\_: Loadings for the X block (same type as X). + y_loadings\_: Loadings for the Y block (same type as Y). """ @@ -615,8 +627,6 @@ def fit( """ Fit the model using the data for both blocks. - Any of the parameters can be a FDataGrid, FDataBasis or numpy array. - Args: X: Data of the X block y: Data of the Y block @@ -644,31 +654,32 @@ def fit( self.y_weights_ = C self.x_scores_ = T self.y_scores_ = U + self.x_loadings_matrix_ = P + self.y_loadings_matrix_ = Q - self.x_rotations_ = W @ np.linalg.pinv( + self.x_rotations_matrix_ = W @ np.linalg.pinv( P.T @ self._x_block.G_data_weights @ W, ) - self.y_rotations_ = C @ np.linalg.pinv( + self.y_rotation_matrix_ = C @ np.linalg.pinv( Q.T @ self._y_block.G_data_weights @ C, ) self._x_block.set_nipals_results( - rotations=self.x_rotations_, - loadings=P, + rotations=self.x_rotations_matrix_, + loadings=self.x_loadings_matrix_, ) self._y_block.set_nipals_results( - rotations=self.y_rotations_, - loadings=Q, + rotations=self.y_rotation_matrix_, + loadings=self.y_loadings_matrix_, ) - self.x_components_ = self._x_block.components - self.y_components_ = self._y_block.components + self.x_rotations_ = self._x_block.rotations + self.y_rotations_ = self._y_block.rotations self.x_loadings_ = self._x_block.loadings self.y_loadings_ = self._y_block.loadings - self.coef_ = self.x_rotations_ @ Q.T return self def transform( @@ -697,6 +708,7 @@ def transform( if y is not None: y_scores = self._y_block.transform(y) return x_scores, y_scores + return x_scores def transform_x( @@ -763,6 +775,7 @@ def inverse_transform( if Y is not None: y_reconstructed = self._y_block.inverse_transform(Y) return x_reconstructed, y_reconstructed + return x_reconstructed def inverse_transform_x( diff --git a/skfda/tests/test_fpls.py b/skfda/tests/test_fpls.py index c289cc9cb..7ed2ae45a 100644 --- a/skfda/tests/test_fpls.py +++ b/skfda/tests/test_fpls.py @@ -115,7 +115,7 @@ def test_sklearn(self) -> None: # Check that the rotations are correct np.testing.assert_allclose( - np.abs(fpls.x_rotations_).flatten(), + np.abs(fpls.x_rotations_matrix_).flatten(), np.abs(sklearnpls.x_rotations_).flatten(), rtol=rtol, atol=atol, @@ -190,14 +190,14 @@ def test_basis_vs_grid( # noqa: WPS210 # Check that the results are the same np.testing.assert_allclose( - np.abs(fpls.x_components_(self.grid_points)), - np.abs(fpls_grid.x_components_(self.grid_points)), + np.abs(fpls.x_rotations_(self.grid_points)), + np.abs(fpls_grid.x_rotations_(self.grid_points)), rtol=5e-3, ) np.testing.assert_allclose( - np.abs(fpls.y_components_(self.grid_points)), - np.abs(fpls_grid.y_components_(self.grid_points)), + np.abs(fpls.y_rotations_(self.grid_points)), + np.abs(fpls_grid.y_rotations_(self.grid_points)), rtol=5e-3, ) diff --git a/skfda/tests/test_fpls_regression.py b/skfda/tests/test_fpls_regression.py index 5119fba59..edfb6cda1 100644 --- a/skfda/tests/test_fpls_regression.py +++ b/skfda/tests/test_fpls_regression.py @@ -40,7 +40,7 @@ def test_sklearn(self) -> None: atol = 1e-6 # Compare the results np.testing.assert_allclose( - fplsr.fpls_.coef_.flatten(), # noqa: WPS437 + fplsr.coef_.flatten(), sklearnpls.coef_.flatten(), rtol=rtol, atol=atol, @@ -56,7 +56,7 @@ def test_sklearn(self) -> None: # Check that the rotations are correc np.testing.assert_allclose( - np.abs(fplsr.fpls_.x_rotations_).flatten(), + np.abs(fplsr.fpls_.x_rotations_matrix_).flatten(), np.abs(sklearnpls.x_rotations_).flatten(), rtol=rtol, atol=atol, @@ -182,7 +182,7 @@ def test_basis_vs_grid(self) -> None: # Fit the model fplsr = FPLSRegression[FData, NDArrayFloat](n_components=5) fplsr.fit(X, y) - basis_components = fplsr.fpls_.x_components_(original_grid_points) + basis_components = fplsr.fpls_.x_rotations_(original_grid_points) # Go back to grid new_grid_points = np.linspace( @@ -203,7 +203,7 @@ def test_basis_vs_grid(self) -> None: ) fplsr.fit(X, y) - grid_components = fplsr.fpls_.x_components_(original_grid_points) + grid_components = fplsr.fpls_.x_rotations_(original_grid_points) np.testing.assert_allclose( basis_components, From b24f6770170f6fa540a17d583d4a1bd1ed9a0fd9 Mon Sep 17 00:00:00 2001 From: David del Val Date: Fri, 30 Jun 2023 20:30:23 +0200 Subject: [PATCH 27/46] Tidy fpls implementation --- skfda/ml/regression/_fpls_regression.py | 12 ++++++------ skfda/preprocessing/dim_reduction/_fpls.py | 2 ++ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/skfda/ml/regression/_fpls_regression.py b/skfda/ml/regression/_fpls_regression.py index 737f598bd..f8380fe94 100644 --- a/skfda/ml/regression/_fpls_regression.py +++ b/skfda/ml/regression/_fpls_regression.py @@ -32,9 +32,9 @@ class FPLSRegression( Parameters: n_components: Number of components to keep. Defaults to 5. regularization_X: Regularization for the calculation of the X weights. - component_basis_X: Basis to use for the X block. Only + weight_basis_X: Basis to use for the X block. Only applicable if X is a FDataBasis. Otherwise it must be None. - component_basis_Y: Basis to use for the Y block. Only + weight_basis_Y: Basis to use for the Y block. Only applicable if Y is a FDataBasis. Otherwise it must be None. _integration_weights_X: One-dimensional array with the integration weights for the X block. @@ -52,8 +52,8 @@ def __init__( self, n_components: int = 5, regularization_X: L2Regularization[Any] | None = None, - component_basis_X: Basis | None = None, - component_basis_Y: Basis | None = None, + weight_basis_X: Basis | None = None, + weight_basis_Y: Basis | None = None, _integration_weights_X: NDArrayFloat | None = None, _integration_weights_Y: NDArrayFloat | None = None, ) -> None: @@ -61,8 +61,8 @@ def __init__( self._integration_weights_X = _integration_weights_X self._integration_weights_Y = _integration_weights_Y self.regularization_X = regularization_X - self.weight_basis_X = component_basis_X - self.weight_basis_Y = component_basis_Y + self.weight_basis_X = weight_basis_X + self.weight_basis_Y = weight_basis_Y def fit( self, diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index 08c4234a8..5d721ba24 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -430,6 +430,7 @@ def _to_block_type( f"{title.capitalize()} {i}" for i in range(self.n_components) ], + coordinate_names=(f"FPLS {self.label} {title} value",), dataset_name=f"FPLS {self.label} {title}s", ) elif isinstance(self.data, FDataBasis): @@ -439,6 +440,7 @@ def _to_block_type( f"{title.capitalize()} {i}" for i in range(self.n_components) ], + coordinate_names=(f"FPLS {self.label} {title} value",), dataset_name=f"FPLS {self.label} {title}s", ) elif isinstance(self.data, np.ndarray): From 80879ede3e8c6edb4bbf61c5546da9e641e2bab3 Mon Sep 17 00:00:00 2001 From: David del Val Date: Mon, 31 Jul 2023 19:40:01 +0200 Subject: [PATCH 28/46] Add doctest to PLS classes --- skfda/ml/regression/_fpls_regression.py | 10 ++++++++++ skfda/preprocessing/dim_reduction/_fpls.py | 9 +++++++++ 2 files changed, 19 insertions(+) diff --git a/skfda/ml/regression/_fpls_regression.py b/skfda/ml/regression/_fpls_regression.py index f8380fe94..f0a939731 100644 --- a/skfda/ml/regression/_fpls_regression.py +++ b/skfda/ml/regression/_fpls_regression.py @@ -46,6 +46,16 @@ class FPLSRegression( Attributes: coef\_: Coefficients of the linear model. fpls\_: FPLS object used to fit the model. + + Example: + Fit a FPLS regression model with two components. + >>> from skfda.ml.regression import FPLSRegression + >>> from skfda.datasets import fetch_tecator + >>> from skfda.representation import FDataGrid + >>> from skfda.typing._numpy import NDArrayFloat + >>> X, y = fetch_tecator(return_X_y=True) + >>> fpls = FPLSRegression[FDataGrid, NDArrayFloat](n_components=2) + >>> fpls = fpls.fit(X, y) """ def __init__( diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index 5d721ba24..b150fad02 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -262,6 +262,15 @@ class _FPLSBlock(Generic[BlockType]): # noqa: WPS230 loadings of the block. - loadings: The loadings of the block (same type as the data). + Example: + Fit a FPLS model with two components. + >>> from skfda.preprocessing.dim_reduction import FPLS + >>> from skfda.datasets import fetch_tecator + >>> from skfda.representation import FDataGrid + >>> from skfda.typing._numpy import NDArrayFloat + >>> X, y = fetch_tecator(return_X_y=True) + >>> fpls = FPLS[FDataGrid, NDArrayFloat](n_components=2) + >>> fpls = fpls.fit(X, y) """ mean: BlockType From 28d48b25b8a2f6325c13232722908a4750b3669f Mon Sep 17 00:00:00 2001 From: David del Val Date: Mon, 31 Jul 2023 19:46:44 +0200 Subject: [PATCH 29/46] In the new version of sklearn the coefficients are transposed --- skfda/tests/test_fpls_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/tests/test_fpls_regression.py b/skfda/tests/test_fpls_regression.py index edfb6cda1..b658e8863 100644 --- a/skfda/tests/test_fpls_regression.py +++ b/skfda/tests/test_fpls_regression.py @@ -41,7 +41,7 @@ def test_sklearn(self) -> None: # Compare the results np.testing.assert_allclose( fplsr.coef_.flatten(), - sklearnpls.coef_.flatten(), + sklearnpls.coef_.T.flatten(), rtol=rtol, atol=atol, ) From 7f0c84d9734c13e10c4726f976bac555f1957134 Mon Sep 17 00:00:00 2001 From: David del Val Date: Tue, 1 Aug 2023 07:20:57 +0200 Subject: [PATCH 30/46] Fix documentation generation for PLS --- docs/modules/ml/regression.rst | 15 ++++++++++++++- docs/modules/preprocessing/dim_reduction.rst | 7 +++++-- skfda/ml/regression/_fpls_regression.py | 4 +++- skfda/preprocessing/dim_reduction/_fpls.py | 13 ++++++++----- 4 files changed, 30 insertions(+), 9 deletions(-) diff --git a/docs/modules/ml/regression.rst b/docs/modules/ml/regression.rst index 539d38ec2..30eed2545 100644 --- a/docs/modules/ml/regression.rst +++ b/docs/modules/ml/regression.rst @@ -57,4 +57,17 @@ regression is fitted using the coefficients of the functions in said basis. .. autosummary:: :toctree: autosummary - skfda.ml.regression.FPCARegression \ No newline at end of file + skfda.ml.regression.FPCARegression + +FPLS regression +----------------- +This module includes the implementation of FPLS (Functional Partial Least Squares) +regression. This implementation accepts either functional or multivariate data as the regressor and the response. +FPLS regression is performed by performing the FPLS dimensionality reduction algorithm +but using a regression deflation strategy. + + +.. autosummary:: + :toctree: autosummary + + skfda.ml.regression.FPLSRegression \ No newline at end of file diff --git a/docs/modules/preprocessing/dim_reduction.rst b/docs/modules/preprocessing/dim_reduction.rst index e35e5675f..d78d8705c 100644 --- a/docs/modules/preprocessing/dim_reduction.rst +++ b/docs/modules/preprocessing/dim_reduction.rst @@ -36,9 +36,12 @@ Other dimensionality reduction methods construct new features from existing ones. For example, in functional principal component analysis, we project the data samples into a smaller sample of functions that preserve most of the original -variance. +variance. Similarly, in functional partial least squares, we project +the data samples into a smaller sample of functions that preserve most +of the covariance between the two data blocks. .. autosummary:: :toctree: autosummary - skfda.preprocessing.dim_reduction.FPCA \ No newline at end of file + skfda.preprocessing.dim_reduction.FPCA + skfda.preprocessing.dim_reduction.FPLS \ No newline at end of file diff --git a/skfda/ml/regression/_fpls_regression.py b/skfda/ml/regression/_fpls_regression.py index f0a939731..0e105d019 100644 --- a/skfda/ml/regression/_fpls_regression.py +++ b/skfda/ml/regression/_fpls_regression.py @@ -47,8 +47,9 @@ class FPLSRegression( coef\_: Coefficients of the linear model. fpls\_: FPLS object used to fit the model. - Example: + Examples: Fit a FPLS regression model with two components. + >>> from skfda.ml.regression import FPLSRegression >>> from skfda.datasets import fetch_tecator >>> from skfda.representation import FDataGrid @@ -56,6 +57,7 @@ class FPLSRegression( >>> X, y = fetch_tecator(return_X_y=True) >>> fpls = FPLSRegression[FDataGrid, NDArrayFloat](n_components=2) >>> fpls = fpls.fit(X, y) + """ def __init__( diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index b150fad02..7df86475b 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -262,8 +262,9 @@ class _FPLSBlock(Generic[BlockType]): # noqa: WPS230 loadings of the block. - loadings: The loadings of the block (same type as the data). - Example: + Examples: Fit a FPLS model with two components. + >>> from skfda.preprocessing.dim_reduction import FPLS >>> from skfda.datasets import fetch_tecator >>> from skfda.representation import FDataGrid @@ -271,6 +272,7 @@ class _FPLSBlock(Generic[BlockType]): # noqa: WPS230 >>> X, y = fetch_tecator(return_X_y=True) >>> fpls = FPLS[FDataGrid, NDArrayFloat](n_components=2) >>> fpls = fpls.fit(X, y) + """ mean: BlockType @@ -709,8 +711,8 @@ def transform( If None, only X is transformed. Returns: - x_scores: Data transformed. - y_scores: Data transformed (if y is not None) + - x_scores: Data transformed. + - y_scores: Data transformed (if y is not None) """ check_is_fitted(self) @@ -775,9 +777,10 @@ def inverse_transform( as the number of components of the model. Returns: - X: Data reconstructed from the transformed data. - Y: Data reconstructed from the transformed data + - X: Data reconstructed from the transformed data. + - Y: Data reconstructed from the transformed data (if Y is not None) + """ check_is_fitted(self) From 24fdc4c2f9dd93c67dff0c31daae66336a89577e Mon Sep 17 00:00:00 2001 From: David del Val Date: Sat, 28 Oct 2023 20:03:43 +0200 Subject: [PATCH 31/46] Move NIPALS into the FPLS class. This simplifies the notation and removes unnecesary parameters that are already stored in the FPLSClass --- skfda/preprocessing/dim_reduction/_fpls.py | 249 +++++++++------------ 1 file changed, 107 insertions(+), 142 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index 7df86475b..2caced967 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -13,21 +13,24 @@ from ...representation.basis import Basis, FDataBasis, _GridBasis from ...typing._numpy import NDArrayFloat -POWER_SOLVER_EPS = 1e-15 INV_EPS = 1e-15 -def _power_solver(X: NDArrayFloat) -> NDArrayFloat: +def _power_solver( + X: NDArrayFloat, + tol: float, + max_iter: int, +) -> NDArrayFloat: """Return the dominant eigenvector of a matrix using the power method.""" t = X[:, 0] t_prev = np.ones(t.shape) * np.max(t) * 2 iter_count = 0 - while np.linalg.norm(t - t_prev) > POWER_SOLVER_EPS: + while np.linalg.norm(t - t_prev) > tol: t_prev = t t = X @ t t /= np.linalg.norm(t) iter_count += 1 - if iter_count > 1000: + if iter_count > max_iter: break return t @@ -41,63 +44,12 @@ def _calculate_weights( G_yc: NDArrayFloat, L_X_inv: NDArrayFloat, L_Y_inv: NDArrayFloat, + tol: float, + max_iter: int, ) -> Tuple[NDArrayFloat, NDArrayFloat]: """ Calculate the weights for the PLS algorithm. - Parameters as in _pls_nipals. - Returns: - - w: (n_features, 1) - The X block weights. - - c: (n_targets, 1) - The Y block weights. - """ - X = X @ G_xw @ L_X_inv.T - Y = Y @ G_yc @ L_Y_inv.T - S = X.T @ Y - w = _power_solver(S @ S.T) - - # Calculate the other weight - c = np.dot(Y.T, np.dot(X, w)) - - # Undo the transformation - w = L_X_inv.T @ w - - # Normalize - w /= np.sqrt(np.dot(w.T, G_ww @ w)) + INV_EPS - - # Undo the transformation - c = L_Y_inv.T @ c - - # Normalize the other weight - c /= np.sqrt(np.dot(c.T, G_cc @ c)) + INV_EPS - - return w, c - - -# ignore flake8 multi-line function annotation -def _pls_nipals( # noqa: WPS320 - X: NDArrayFloat, - Y: NDArrayFloat, - n_components: int, - G_ww: NDArrayFloat, - G_xw: NDArrayFloat, - G_cc: NDArrayFloat, - G_yc: NDArrayFloat, - L_X_inv: NDArrayFloat, - L_Y_inv: NDArrayFloat, - deflation: str = "reg", -) -> Tuple[ - NDArrayFloat, - NDArrayFloat, - NDArrayFloat, - NDArrayFloat, - NDArrayFloat, - NDArrayFloat, -]: - """ - Perform the NIPALS algorithm for PLS. - Parameters: - X: (n_samples, n_features) The X block data matrix. @@ -128,76 +80,39 @@ def _pls_nipals( # noqa: WPS320 L_Y @ L_Y.T = G_cc + P_y, where P_y is a the penalty matrix for the Y block. - - deflation: The deflation method to use. - Can be "reg" for regression or "can" for dimensionality reduction. + - tol: The tolerance for the power method. + - max_iter: The maximum number of iterations for the power method. Returns: - - W: (n_features, n_components) + - w: (n_features, 1) The X block weights. - - C: (n_targets, n_components) + - c: (n_targets, 1) The Y block weights. - - T: (n_samples, n_components) - The X block scores. - - U: (n_samples, n_components) - The Y block scores. - - P: (n_features, n_components) - The X block loadings. - - Q: (n_targets, n_components) - The Y block loadings. """ - X = X.copy() - Y = Y.copy() - X = X - np.mean(X, axis=0) - Y = Y - np.mean(Y, axis=0) - - if len(Y.shape) == 1: - Y = Y[:, np.newaxis] - - # Store the matrices as list of columns - W, C = [], [] - T, U = [], [] - P, Q = [], [] - for _ in range(n_components): - w, c = _calculate_weights( - X, - Y, - G_ww=G_ww, - G_xw=G_xw, - G_cc=G_cc, - G_yc=G_yc, - L_X_inv=L_X_inv, - L_Y_inv=L_Y_inv, - ) + X = X @ G_xw @ L_X_inv.T + Y = Y @ G_yc @ L_Y_inv.T + S = X.T @ Y + w = _power_solver( + S @ S.T, + tol=tol, + max_iter=max_iter, + ) - t = np.dot(X @ G_xw, w) - u = np.dot(Y @ G_yc, c) + # Calculate the other weight + c = np.dot(Y.T, np.dot(X, w)) - p = np.dot(X.T, t) / (np.dot(t.T, t) + INV_EPS) + # Undo the transformation + w = L_X_inv.T @ w - y_proyection = t if deflation == "reg" else u + # Normalize + w /= np.sqrt(np.dot(w.T, G_ww @ w)) + INV_EPS - q = np.dot(Y.T, y_proyection) / ( - np.dot(y_proyection, y_proyection) + INV_EPS - ) + # Undo the transformation + c = L_Y_inv.T @ c - X = X - np.outer(t, p) - Y = Y - np.outer(y_proyection, q) - - W.append(w) - C.append(c) - T.append(t) - U.append(u) - P.append(p) - Q.append(q) - - # Convert each least of columns to a matrix - return ( # noqa: WPS227 (too long output tuple) - np.array(W).T, - np.array(C).T, - np.array(T).T, - np.array(U).T, - np.array(P).T, - np.array(Q).T, - ) + # Normalize the other weight + c /= np.sqrt(np.dot(c.T, G_cc @ c)) + INV_EPS + + return w, c BlockType = TypeVar( @@ -601,6 +516,8 @@ def __init__( regularization_Y: L2Regularization[InputTypeY] | None = None, component_basis_X: Basis | None = None, component_basis_Y: Basis | None = None, + tol: float = 1e-6, + max_iter: int = 500, _deflation_mode: str = "can", _integration_weights_X: NDArrayFloat | None = None, _integration_weights_Y: NDArrayFloat | None = None, @@ -613,6 +530,8 @@ def __init__( self.component_basis_X = component_basis_X self.component_basis_Y = component_basis_Y self._deflation_mode = _deflation_mode + self.tol = tol + self.max_iter = max_iter def _initialize_blocks(self, X: InputTypeX, Y: InputTypeY) -> None: self._x_block = _FPLSBlock( @@ -632,6 +551,67 @@ def _initialize_blocks(self, X: InputTypeX, Y: InputTypeY) -> None: weights_basis=self.component_basis_Y, ) + def _perform_nipals(self) -> None: + X = self._x_block.data_matrix + Y = self._y_block.data_matrix + X = X - np.mean(X, axis=0) + Y = Y - np.mean(Y, axis=0) + + if len(Y.shape) == 1: + Y = Y[:, np.newaxis] + + # Store the matrices as list of columns + W, C = [], [] + T, U = [], [] + P, Q = [], [] + + # Calculate the penalty matrices in advance + L_X_inv = self._x_block.get_cholesky_inv_penalty_matrix() + L_Y_inv = self._y_block.get_cholesky_inv_penalty_matrix() + + for _ in range(self.n_components): + w, c = _calculate_weights( + X, + Y, + G_ww=self._x_block.G_weights, + G_xw=self._x_block.G_data_weights, + G_cc=self._y_block.G_weights, + G_yc=self._y_block.G_data_weights, + L_X_inv=L_X_inv, + L_Y_inv=L_Y_inv, + tol=self.tol, + max_iter=self.max_iter, + ) + + t = np.dot(X @ self._x_block.G_data_weights, w) + u = np.dot(Y @ self._y_block.G_data_weights, c) + + p = np.dot(X.T, t) / (np.dot(t.T, t) + INV_EPS) + + y_proyection = t if self._deflation_mode == "reg" else u + + q = np.dot(Y.T, y_proyection) / ( + np.dot(y_proyection, y_proyection) + INV_EPS + ) + + X = X - np.outer(t, p) + Y = Y - np.outer(y_proyection, q) + + W.append(w) + C.append(c) + T.append(t) + U.append(u) + P.append(p) + Q.append(q) + + # Convert each list of columns to a matrix + self.x_weights_ = np.array(W).T + self.y_weights_ = np.array(C).T + self.x_scores_ = np.array(T).T + self.y_scores_ = np.array(U).T + self.x_loadings_matrix_ = np.array(P).T + self.y_loadings_matrix_ = np.array(Q).T + def fit( self, X: InputTypeX, @@ -649,33 +629,18 @@ def fit( """ self._initialize_blocks(X, y) - # Supress flake8 warning about too many values to unpack - W, C, T, U, P, Q = _pls_nipals( # noqa: WPS236 - X=self._x_block.data_matrix, - Y=self._y_block.data_matrix, - n_components=self.n_components, - G_ww=self._x_block.G_weights, - G_xw=self._x_block.G_data_weights, - G_cc=self._y_block.G_weights, - G_yc=self._y_block.G_data_weights, - L_X_inv=self._x_block.get_cholesky_inv_penalty_matrix(), - L_Y_inv=self._y_block.get_cholesky_inv_penalty_matrix(), - deflation=self._deflation_mode, - ) - - self.x_weights_ = W - self.y_weights_ = C - self.x_scores_ = T - self.y_scores_ = U - self.x_loadings_matrix_ = P - self.y_loadings_matrix_ = Q + self._perform_nipals() - self.x_rotations_matrix_ = W @ np.linalg.pinv( - P.T @ self._x_block.G_data_weights @ W, + self.x_rotations_matrix_ = self.x_weights_ @ np.linalg.pinv( + self.x_loadings_matrix_.T + @ self._x_block.G_data_weights + @ self.x_weights_, ) - self.y_rotation_matrix_ = C @ np.linalg.pinv( - Q.T @ self._y_block.G_data_weights @ C, + self.y_rotation_matrix_ = self.y_weights_ @ np.linalg.pinv( + self.y_loadings_matrix_.T + @ self._y_block.G_data_weights + @ self.y_weights_, ) self._x_block.set_nipals_results( From fb366d3709c138461882a7c1aae0e81337851997 Mon Sep 17 00:00:00 2001 From: David del Val Date: Sat, 28 Oct 2023 20:53:53 +0200 Subject: [PATCH 32/46] FPLSBlock subclasses for each representation --- skfda/preprocessing/dim_reduction/_fpls.py | 392 ++++++++++++--------- 1 file changed, 227 insertions(+), 165 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index 2caced967..3bd11e04e 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -1,15 +1,15 @@ from __future__ import annotations +from abc import abstractmethod from typing import Any, Generic, Optional, Tuple, TypeVar, Union, cast -import multimethod import numpy as np import scipy from sklearn.utils.validation import check_is_fitted from ..._utils._sklearn_adapter import BaseEstimator from ...misc.regularization import L2Regularization, compute_penalty_matrix -from ...representation import FData, FDataGrid +from ...representation import FDataGrid from ...representation.basis import Basis, FDataBasis, _GridBasis from ...typing._numpy import NDArrayFloat @@ -191,48 +191,67 @@ class _FPLSBlock(Generic[BlockType]): # noqa: WPS230 """ mean: BlockType + data_matrix: NDArrayFloat + G_weights: NDArrayFloat + G_data_weights: NDArrayFloat + regularization_matrix: NDArrayFloat - def __init__( + def set_nipals_results( self, - data: BlockType, - n_components: int, - label: str, - integration_weights: NDArrayFloat | None, - regularization: L2Regularization[Any] | None, - weights_basis: Basis | None, + rotations: NDArrayFloat, + loadings: NDArrayFloat, ) -> None: - self.n_components = n_components - self.label = label - self._initialize_data( - data=data, - integration_weights=integration_weights, - regularization=regularization, - weights_basis=weights_basis, - ) + """Set the results of NIPALS.""" + self.rotations_matrix = rotations + self.loadings_matrix = loadings + self.rotations = self._to_block_type(self.rotations_matrix, "rotation") + self.loadings = self._to_block_type(self.loadings_matrix, "loading") - @multimethod.multidispatch - def _initialize_data( + @abstractmethod + def _to_block_type( self, - data: Union[FData, NDArrayFloat], - integration_weights: Optional[NDArrayFloat], - regularization: Optional[L2Regularization[Any]], - weights_basis: Optional[Basis], - ) -> None: - """Initialize the data of the block.""" - raise NotImplementedError( - f"Data type {type(data)} or combination of arguments" - "not supported", - ) + nipals_matrix: NDArrayFloat, + title: str, + ) -> BlockType: + # Each column of the matrix generated by NIPALS corresponds to + # an obsevation or direction. Therefore, they must be transposed + # so that each row corresponds ot an observation or direction + pass - @_initialize_data.register - def _initialize_data_multivariate( + @abstractmethod + def transform( self, - data: np.ndarray, # type: ignore[type-arg] - integration_weights: None, - regularization: None, - weights_basis: None, + data: BlockType, + ) -> NDArrayFloat: + """Transform from the data space to the component space.""" + pass + + @abstractmethod + def inverse_transform( + self, + components: NDArrayFloat, + ) -> BlockType: + """Transform from the component space to the data space.""" + pass + + def get_penalty_matrix(self) -> NDArrayFloat: + """Return the penalty matrix.""" + return self.G_weights + self.regularization_matrix + + def get_cholesky_inv_penalty_matrix(self) -> NDArrayFloat: + """Return the Cholesky decomposition of the penalty matrix.""" + return np.linalg.inv(np.linalg.cholesky(self.get_penalty_matrix())) + + +class _FPLSBlockMultivariate(_FPLSBlock[NDArrayFloat]): + def __init__( + self, + data: NDArrayFloat, + n_components: int, + label: str, ) -> None: - """Initialize the data of a multivariate block.""" + self.n_components = n_components + self.label = label if len(data.shape) == 1: data = data[:, np.newaxis] @@ -247,58 +266,46 @@ def _initialize_data_multivariate( (data.shape[1], data.shape[1]), ) - @_initialize_data.register - def _initialize_data_grid( + def _to_block_type( self, - data: FDataGrid, - integration_weights: Optional[np.ndarray], # type: ignore[type-arg] - regularization: Optional[L2Regularization[Any]], - weights_basis: None, - ) -> None: - """Initialize the data of a grid block.""" - self.mean = data.mean() - self.data = data - self.mean - self.data_matrix = data.data_matrix[..., 0] + nipals_matrix: NDArrayFloat, + title: str, + ) -> NDArrayFloat: + return nipals_matrix.T - # Arrange the integration weights in a diagonal matrix - # By default, use Simpson's rule - if integration_weights is None: - identity = np.identity(self.data_matrix.shape[1]) - integration_weights = scipy.integrate.simps( - identity, - self.data.grid_points[0], - ) - self.integration_weights = np.diag(integration_weights) + def transform( + self, + data: NDArrayFloat, + ) -> NDArrayFloat: + """Transform from the data space to the component space.""" + data_array = data + if len(data_array.shape) == 1: + data_array = data_array[:, np.newaxis] + data_array = data_array - self.mean + return data_array @ self.rotations_matrix - # Compute the regularization matrix - # By default, all zeros (no regularization) - regularization_matrix = None - if regularization is not None: - regularization_matrix = compute_penalty_matrix( - basis_iterable=( - _GridBasis(grid_points=self.data.grid_points), - ), - regularization_parameter=1, - regularization=regularization, - ) - if regularization_matrix is None: - regularization_matrix = np.zeros( - (self.data_matrix.shape[1], self.data_matrix.shape[1]), - ) + def inverse_transform( + self, + components: NDArrayFloat, + ) -> NDArrayFloat: + """Transform from the component space to the data space.""" + reconstructed = components @ self.loadings_matrix.T + reconstructed += self.mean + return reconstructed - self.regularization_matrix = regularization_matrix - self.G_data_weights = self.integration_weights - self.G_weights = self.integration_weights - @_initialize_data.register - def _initialize_data_basis( +class _FPLSBlockBasis(_FPLSBlock[FDataBasis]): + def __init__( self, data: FDataBasis, - integration_weights: None, + n_components: int, + label: str, regularization: Optional[L2Regularization[Any]], weights_basis: Optional[Basis], ) -> None: """Initialize the data of a basis block.""" + self.n_components = n_components + self.label = label self.mean = data.mean() self.data = data - self.mean self.data_matrix = self.data.coefficients @@ -330,119 +337,174 @@ def _initialize_data_basis( self.weights_basis, ) - def set_nipals_results( - self, - rotations: NDArrayFloat, - loadings: NDArrayFloat, - ) -> None: - """Set the results of NIPALS.""" - self.rotations_matrix = rotations - self.loadings_matrix = loadings - self.rotations = self._to_block_type(self.rotations_matrix, "rotation") - self.loadings = self._to_block_type(self.loadings_matrix, "loading") - def _to_block_type( self, nipals_matrix: NDArrayFloat, title: str, - ) -> BlockType: + ) -> FDataBasis: # Each column of the matrix generated by NIPALS corresponds to # an obsevation or direction. Therefore, they must be transposed # so that each row corresponds ot an observation or direction - if isinstance(self.data, FDataGrid): - return self.data.copy( - data_matrix=nipals_matrix.T, - sample_names=[ - f"{title.capitalize()} {i}" - for i in range(self.n_components) - ], - coordinate_names=(f"FPLS {self.label} {title} value",), - dataset_name=f"FPLS {self.label} {title}s", - ) - elif isinstance(self.data, FDataBasis): - return self.data.copy( - coefficients=nipals_matrix.T, - sample_names=[ - f"{title.capitalize()} {i}" - for i in range(self.n_components) - ], - coordinate_names=(f"FPLS {self.label} {title} value",), - dataset_name=f"FPLS {self.label} {title}s", - ) - elif isinstance(self.data, np.ndarray): - return cast(BlockType, nipals_matrix.T) - - raise NotImplementedError( - f"Data type {type(self.data)} not supported", + return self.data.copy( + coefficients=nipals_matrix.T, + sample_names=[ + f"{title.capitalize()} {i}" for i in range(self.n_components) + ], + coordinate_names=(f"FPLS {self.label} {title} value",), + dataset_name=f"FPLS {self.label} {title}s", ) def transform( self, - data: BlockType, + data: FDataBasis, ) -> NDArrayFloat: """Transform from the data space to the component space.""" - if isinstance(data, FDataGrid): - data_grid = data - self.mean - return ( - data_grid.data_matrix[..., 0] - @ self.G_data_weights - @ self.rotations_matrix - ) - elif isinstance(data, FDataBasis): - data_basis = data - cast(FDataBasis, self.mean) - return ( - data_basis.coefficients - @ self.G_data_weights - @ self.rotations_matrix - ) - elif isinstance(data, np.ndarray): - data_array = data - if len(data_array.shape) == 1: - data_array = data_array[:, np.newaxis] - data_array = data_array - self.mean - return data_array @ self.rotations_matrix - - raise NotImplementedError( - f"Data type {type(data)} not supported", + data_basis = data - self.mean + return ( + data_basis.coefficients + @ self.G_data_weights + @ self.rotations_matrix ) def inverse_transform( self, components: NDArrayFloat, - ) -> BlockType: + ) -> FDataBasis: """Transform from the component space to the data space.""" - if isinstance(self.data, FDataGrid): - reconstructed_grid = self.data.copy( - data_matrix=components @ self.loadings_matrix.T, - sample_names=(None,) * components.shape[0], - dataset_name=f"FPLS {self.label} inverse transformed", + reconstructed_basis = self.data.copy( + coefficients=components @ self.loadings_matrix.T, + sample_names=(None,) * components.shape[0], + dataset_name=f"FPLS {self.label} inverse transformed", + ) + return reconstructed_basis + self.mean + + +class _FPLSBlockGrid(_FPLSBlock[FDataGrid]): + def __init__( + self, + data: FDataGrid, + n_components: int, + label: str, + integration_weights: Optional[np.ndarray], # type: ignore[type-arg] + regularization: Optional[L2Regularization[Any]], + ) -> None: + """Initialize the data of a grid block.""" + self.n_components = n_components + self.label = label + + self.mean = data.mean() + self.data = data - self.mean + self.data_matrix = data.data_matrix[..., 0] + + # Arrange the integration weights in a diagonal matrix + # By default, use Simpson's rule + if integration_weights is None: + identity = np.identity(self.data_matrix.shape[1]) + integration_weights = scipy.integrate.simps( + identity, + self.data.grid_points[0], ) - return reconstructed_grid + cast(FDataGrid, self.mean) + self.integration_weights = np.diag(integration_weights) - elif isinstance(self.data, FDataBasis): - reconstructed_basis = self.data.copy( - coefficients=components @ self.loadings_matrix.T, - sample_names=(None,) * components.shape[0], - dataset_name=f"FPLS {self.label} inverse transformed", + # Compute the regularization matrix + # By default, all zeros (no regularization) + regularization_matrix = None + if regularization is not None: + regularization_matrix = compute_penalty_matrix( + basis_iterable=( + _GridBasis(grid_points=self.data.grid_points), + ), + regularization_parameter=1, + regularization=regularization, + ) + if regularization_matrix is None: + regularization_matrix = np.zeros( + (self.data_matrix.shape[1], self.data_matrix.shape[1]), ) - return reconstructed_basis + cast(FDataBasis, self.mean) - elif isinstance(self.data, np.ndarray): - reconstructed = components @ self.loadings_matrix.T - reconstructed += self.mean - return cast(BlockType, reconstructed) + self.regularization_matrix = regularization_matrix + self.G_data_weights = self.integration_weights + self.G_weights = self.integration_weights - raise NotImplementedError( - f"Data type {type(self.data)} not supported", + def _to_block_type( + self, + nipals_matrix: NDArrayFloat, + title: str, + ) -> FDataGrid: + # Each column of the matrix generated by NIPALS corresponds to + # an obsevation or direction. Therefore, they must be transposed + # so that each row corresponds ot an observation or direction + return self.data.copy( + data_matrix=nipals_matrix.T, + sample_names=[ + f"{title.capitalize()} {i}" for i in range(self.n_components) + ], + coordinate_names=(f"FPLS {self.label} {title} value",), + dataset_name=f"FPLS {self.label} {title}s", ) - def get_penalty_matrix(self) -> NDArrayFloat: - """Return the penalty matrix.""" - return self.G_weights + self.regularization_matrix + def transform( + self, + data: FDataGrid, + ) -> NDArrayFloat: + """Transform from the data space to the component space.""" + data_grid = data - self.mean + return ( + data_grid.data_matrix[..., 0] + @ self.G_data_weights + @ self.rotations_matrix + ) - def get_cholesky_inv_penalty_matrix(self) -> NDArrayFloat: - """Return the Cholesky decomposition of the penalty matrix.""" - return np.linalg.inv(np.linalg.cholesky(self.get_penalty_matrix())) + def inverse_transform( + self, + components: NDArrayFloat, + ) -> FDataGrid: + """Transform from the component space to the data space.""" + reconstructed_grid = self.data.copy( + data_matrix=components @ self.loadings_matrix.T, + sample_names=(None,) * components.shape[0], + dataset_name=f"FPLS {self.label} inverse transformed", + ) + return reconstructed_grid + self.mean + + +def _fpls_block_factory( + data: BlockType, + n_components: int, + label: str, + integration_weights: NDArrayFloat | None, + regularization: L2Regularization[Any] | None, + weights_basis: Basis | None, +) -> _FPLSBlock[BlockType]: + if isinstance(data, np.ndarray): + return cast( + _FPLSBlock[BlockType], + _FPLSBlockMultivariate(data, n_components, label), + ) + elif isinstance(data, FDataBasis): + return cast( + _FPLSBlock[BlockType], + _FPLSBlockBasis( + data, + n_components, + label, + regularization, + weights_basis, + ), + ) + elif isinstance(data, FDataGrid): + return cast( + _FPLSBlock[BlockType], + _FPLSBlockGrid( + data, + n_components, + label, + integration_weights, + regularization, + ), + ) + + raise TypeError("Invalid type for data") InputTypeX = TypeVar( @@ -534,7 +596,7 @@ def __init__( self.max_iter = max_iter def _initialize_blocks(self, X: InputTypeX, Y: InputTypeY) -> None: - self._x_block = _FPLSBlock( + self._x_block = _fpls_block_factory( data=X, n_components=self.n_components, label="X", @@ -542,7 +604,7 @@ def _initialize_blocks(self, X: InputTypeX, Y: InputTypeY) -> None: regularization=self.regularization_X, weights_basis=self.component_basis_X, ) - self._y_block = _FPLSBlock( + self._y_block = _fpls_block_factory( data=Y, n_components=self.n_components, label="Y", From 5c63ea7e8965a7f1f52fb360ef5b66a5d3bf6463 Mon Sep 17 00:00:00 2001 From: David del Val Date: Sat, 28 Oct 2023 23:20:42 +0200 Subject: [PATCH 33/46] Improve documentation --- docs/modules/ml/regression.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/modules/ml/regression.rst b/docs/modules/ml/regression.rst index 30eed2545..b3e224e78 100644 --- a/docs/modules/ml/regression.rst +++ b/docs/modules/ml/regression.rst @@ -63,7 +63,7 @@ FPLS regression ----------------- This module includes the implementation of FPLS (Functional Partial Least Squares) regression. This implementation accepts either functional or multivariate data as the regressor and the response. -FPLS regression is performed by performing the FPLS dimensionality reduction algorithm +FPLS regression consists on performing the FPLS dimensionality reduction algorithm but using a regression deflation strategy. From 04731f2a3ed6da77850c913492ebfc43318a049e Mon Sep 17 00:00:00 2001 From: David del Val Date: Sat, 28 Oct 2023 23:21:25 +0200 Subject: [PATCH 34/46] Ignore WPS413: Found bad magic module function --- setup.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.cfg b/setup.cfg index bcbba2d86..f628b58b6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -84,6 +84,8 @@ ignore = WPS507, # Comparison with not is not the same as with equality WPS520, + # Found bad magic module function: {0} + WPS413 per-file-ignores = __init__.py: From 508709b61b00edce67ecf00077f6d6695ec2bea3 Mon Sep 17 00:00:00 2001 From: David del Val Date: Sat, 28 Oct 2023 23:22:41 +0200 Subject: [PATCH 35/46] Address review comments --- skfda/ml/regression/_fpls_regression.py | 9 ++--- skfda/preprocessing/dim_reduction/_fpls.py | 42 +++++++++++----------- 2 files changed, 24 insertions(+), 27 deletions(-) diff --git a/skfda/ml/regression/_fpls_regression.py b/skfda/ml/regression/_fpls_regression.py index 0e105d019..6299df194 100644 --- a/skfda/ml/regression/_fpls_regression.py +++ b/skfda/ml/regression/_fpls_regression.py @@ -36,12 +36,6 @@ class FPLSRegression( applicable if X is a FDataBasis. Otherwise it must be None. weight_basis_Y: Basis to use for the Y block. Only applicable if Y is a FDataBasis. Otherwise it must be None. - _integration_weights_X: One-dimensional array with the integration - weights for the X block. - Only applicable if X is a FDataGrid. Otherwise it must be None. - _integration_weights_Y: One-dimensional array with the integration - weights for the Y block. - Only applicable if Y is a FDataGrid. Otherwise it must be None. Attributes: coef\_: Coefficients of the linear model. @@ -54,6 +48,7 @@ class FPLSRegression( >>> from skfda.datasets import fetch_tecator >>> from skfda.representation import FDataGrid >>> from skfda.typing._numpy import NDArrayFloat + >>> X, y = fetch_tecator(return_X_y=True) >>> fpls = FPLSRegression[FDataGrid, NDArrayFloat](n_components=2) >>> fpls = fpls.fit(X, y) @@ -62,7 +57,7 @@ class FPLSRegression( def __init__( self, - n_components: int = 5, + n_components: int = 2, regularization_X: L2Regularization[Any] | None = None, weight_basis_X: Basis | None = None, weight_basis_Y: Basis | None = None, diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index 3bd11e04e..0189d1ff4 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -1,7 +1,7 @@ from __future__ import annotations from abc import abstractmethod -from typing import Any, Generic, Optional, Tuple, TypeVar, Union, cast +from typing import Any, Generic, Literal, Optional, Tuple, TypeVar, Union, cast import numpy as np import scipy @@ -184,12 +184,14 @@ class _FPLSBlock(Generic[BlockType]): # noqa: WPS230 >>> from skfda.datasets import fetch_tecator >>> from skfda.representation import FDataGrid >>> from skfda.typing._numpy import NDArrayFloat + >>> X, y = fetch_tecator(return_X_y=True) >>> fpls = FPLS[FDataGrid, NDArrayFloat](n_components=2) >>> fpls = fpls.fit(X, y) """ + # Attributes that must be defined in the subclasses mean: BlockType data_matrix: NDArrayFloat G_weights: NDArrayFloat @@ -479,28 +481,32 @@ def _fpls_block_factory( if isinstance(data, np.ndarray): return cast( _FPLSBlock[BlockType], - _FPLSBlockMultivariate(data, n_components, label), + _FPLSBlockMultivariate( + data=data, + n_components=n_components, + label=label, + ), ) elif isinstance(data, FDataBasis): return cast( _FPLSBlock[BlockType], _FPLSBlockBasis( - data, - n_components, - label, - regularization, - weights_basis, + data=data, + n_components=n_components, + label=label, + regularization=regularization, + weights_basis=weights_basis, ), ) elif isinstance(data, FDataGrid): return cast( _FPLSBlock[BlockType], _FPLSBlockGrid( - data, - n_components, - label, - integration_weights, - regularization, + data=data, + n_components=n_components, + label=label, + integration_weights=integration_weights, + regularization=regularization, ), ) @@ -516,6 +522,8 @@ def _fpls_block_factory( bound=Union[FDataGrid, FDataBasis, NDArrayFloat], ) +DeflationMode = Literal["reg", "can"] + # Ignore too many public instance attributes class FPLS( # noqa: WPS230 @@ -539,12 +547,6 @@ class FPLS( # noqa: WPS230 applicable if Y is a FDataBasis. Otherwise it must be None. _deflation_mode: Mode to use for deflation. Can be "can" (dimensionality reduction) or "reg" (regression). - _integration_weights_X: One-dimensional array with the integration - weights for the X block. - Only applicable if X is a FDataGrid. Otherwise it must be None. - _integration_weights_Y: One-dimensional array with the integration - weights for the Y block. - Only applicable if Y is a FDataGrid. Otherwise it must be None. Attributes: x_weights\_: (n_features_X, n_components) array with the X weights @@ -572,7 +574,7 @@ class FPLS( # noqa: WPS230 def __init__( self, - n_components: int = 5, + n_components: int = 2, *, regularization_X: L2Regularization[InputTypeX] | None = None, regularization_Y: L2Regularization[InputTypeY] | None = None, @@ -580,7 +582,7 @@ def __init__( component_basis_Y: Basis | None = None, tol: float = 1e-6, max_iter: int = 500, - _deflation_mode: str = "can", + _deflation_mode: DeflationMode = "can", _integration_weights_X: NDArrayFloat | None = None, _integration_weights_Y: NDArrayFloat | None = None, ) -> None: From 93412206c09fb2ef605aa2c84d6596ac09a55d67 Mon Sep 17 00:00:00 2001 From: David del Val Date: Sun, 29 Oct 2023 01:05:03 +0200 Subject: [PATCH 36/46] Improve the behaviour when the number of components is not adecuate. --- skfda/preprocessing/dim_reduction/_fpls.py | 42 +++++++++++--- skfda/tests/test_fpls.py | 65 ++++++++++++++++++++++ 2 files changed, 98 insertions(+), 9 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index 0189d1ff4..6cb8833b4 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -13,8 +13,6 @@ from ...representation.basis import Basis, FDataBasis, _GridBasis from ...typing._numpy import NDArrayFloat -INV_EPS = 1e-15 - def _power_solver( X: NDArrayFloat, @@ -23,7 +21,7 @@ def _power_solver( ) -> NDArrayFloat: """Return the dominant eigenvector of a matrix using the power method.""" t = X[:, 0] - t_prev = np.ones(t.shape) * np.max(t) * 2 + t_prev = np.ones(t.shape) * max(np.max(t), 1) * 2 iter_count = 0 while np.linalg.norm(t - t_prev) > tol: t_prev = t @@ -104,13 +102,13 @@ def _calculate_weights( w = L_X_inv.T @ w # Normalize - w /= np.sqrt(np.dot(w.T, G_ww @ w)) + INV_EPS + w /= np.sqrt(np.dot(w.T, G_ww @ w)) # Undo the transformation c = L_Y_inv.T @ c # Normalize the other weight - c /= np.sqrt(np.dot(c.T, G_cc @ c)) + INV_EPS + c /= np.sqrt(np.dot(c.T, G_cc @ c)) return w, c @@ -572,7 +570,8 @@ class FPLS( # noqa: WPS230 """ - def __init__( + # Ignore too many arguments + def __init__( # noqa: WPS211 self, n_components: int = 2, *, @@ -615,7 +614,8 @@ def _initialize_blocks(self, X: InputTypeX, Y: InputTypeY) -> None: weights_basis=self.component_basis_Y, ) - def _perform_nipals(self) -> None: + # Ignore too many local variables + def _perform_nipals(self) -> None: # noqa: WPS210 X = self._x_block.data_matrix Y = self._y_block.data_matrix X = X - np.mean(X, axis=0) @@ -633,6 +633,10 @@ def _perform_nipals(self) -> None: L_X_inv = self._x_block.get_cholesky_inv_penalty_matrix() L_Y_inv = self._y_block.get_cholesky_inv_penalty_matrix() + # Determine some tolerances to stop the algorithm + x_epsilon = np.finfo(X.dtype).eps * np.linalg.norm(X) + y_epsilon = np.finfo(Y.dtype).eps * np.linalg.norm(Y) + for _ in range(self.n_components): w, c = _calculate_weights( X, @@ -650,12 +654,12 @@ def _perform_nipals(self) -> None: t = np.dot(X @ self._x_block.G_data_weights, w) u = np.dot(Y @ self._y_block.G_data_weights, c) - p = np.dot(X.T, t) / (np.dot(t.T, t) + INV_EPS) + p = np.dot(X.T, t) / (np.dot(t.T, t)) y_proyection = t if self._deflation_mode == "reg" else u q = np.dot(Y.T, y_proyection) / ( - np.dot(y_proyection, y_proyection) + INV_EPS + np.dot(y_proyection, y_proyection) ) X = X - np.outer(t, p) @@ -668,6 +672,10 @@ def _perform_nipals(self) -> None: P.append(p) Q.append(q) + # Check if the algorithm has converged + if np.linalg.norm(X) < x_epsilon or np.linalg.norm(Y) < y_epsilon: + break + # Convert each list of columns to a matrix self.x_weights_ = np.array(W).T self.y_weights_ = np.array(C).T @@ -693,6 +701,22 @@ def fit( """ self._initialize_blocks(X, y) + # In regression mode, the number of components is limited + # only by the rank of the X data matrix since components are + # only extracted from the X block. + if self._deflation_mode == "reg": + range_upper_bound = min(*self._x_block.data_matrix.shape) + else: + range_upper_bound = min( + *self._x_block.data_matrix.shape, + *self._y_block.data_matrix.shape, + ) + + if self.n_components > range_upper_bound: + raise ValueError( + f"n_components must be less or equal than {range_upper_bound}", + ) + self._perform_nipals() self.x_rotations_matrix_ = self.x_weights_ @ np.linalg.pinv( diff --git a/skfda/tests/test_fpls.py b/skfda/tests/test_fpls.py index 7ed2ae45a..e6ec918c6 100644 --- a/skfda/tests/test_fpls.py +++ b/skfda/tests/test_fpls.py @@ -3,6 +3,7 @@ from typing import Tuple import numpy as np +import pytest from sklearn.cross_decomposition import PLSCanonical from skfda.datasets import fetch_tecator @@ -228,3 +229,67 @@ def test_basis_vs_grid( # noqa: WPS210 fpsl_grid_y(grid_points), rtol=0.13, ) + + def _generate_random_matrix_by_rank(self, n_samples, n_features, rank): + random_data = np.random.random(n_samples * rank).reshape( + n_samples, + rank, + ) + if rank == n_features: + return random_data + repeated_data = np.array([random_data[:, 0]] * (n_features - rank)).T + + return np.concatenate( + [random_data, repeated_data], + axis=1, + ) + + @pytest.mark.parametrize("rank", [1, 2, 5]) + def test_collinear_matrix( + self, + rank: int, + ) -> None: + """Check that the behaviour is correct with collinear matrices.""" + n_samples = 100 + n_features = 10 + + X = self._generate_random_matrix_by_rank( + n_samples=n_samples, + n_features=n_features, + rank=rank, + ) + y = self._generate_random_matrix_by_rank( + n_samples=n_samples, + n_features=n_features, + rank=rank, + ) + + fpls = FPLS(n_components=rank) + fpls.fit(X, y) + + fpls = FPLS(n_components=5) + fpls.fit(X, y) + + # Check that only as many components as rank are returned + assert fpls.x_weights_.shape == (n_features, rank) + + def test_number_of_components( + self, + ) -> None: + """Check error when number of components is too large.""" + n_samples = 100 + n_features = 10 + + X = self._generate_random_matrix_by_rank( + n_samples=n_samples, + n_features=n_features, + rank=n_features, + ) + y = self._generate_random_matrix_by_rank( + n_samples=n_samples, + n_features=n_features, + rank=n_features, + ) + + with pytest.raises(ValueError): + FPLS(n_components=n_features + 1).fit(X, y) From aa2074951092b1836037ccc523f02a7b558a4d7c Mon Sep 17 00:00:00 2001 From: David del Val Date: Sun, 29 Oct 2023 01:27:22 +0200 Subject: [PATCH 37/46] Improve tolerance definition. In sklearn, they set to zeros the values that are very close to zero. Then they try to perform the power method. If all columns of the Y matrix are too small, then they consider the algorihtm to have converged. This is a very similar approach, but taking into account the magnitude of X and Y to calculate the epsilons --- skfda/preprocessing/dim_reduction/_fpls.py | 20 ++++++++++++++++---- skfda/tests/test_fpls.py | 3 +++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index 6cb8833b4..e582cfcda 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -634,8 +634,16 @@ def _perform_nipals(self) -> None: # noqa: WPS210 L_Y_inv = self._y_block.get_cholesky_inv_penalty_matrix() # Determine some tolerances to stop the algorithm - x_epsilon = np.finfo(X.dtype).eps * np.linalg.norm(X) - y_epsilon = np.finfo(Y.dtype).eps * np.linalg.norm(Y) + x_epsilon = ( + 10 + * np.finfo(X.dtype).eps + * np.abs(self._x_block.data_matrix).mean() + ) + y_epsilon = ( + 10 + * np.finfo(Y.dtype).eps + * np.abs(self._y_block.data_matrix).mean() + ) for _ in range(self.n_components): w, c = _calculate_weights( @@ -672,8 +680,12 @@ def _perform_nipals(self) -> None: # noqa: WPS210 P.append(p) Q.append(q) - # Check if the algorithm has converged - if np.linalg.norm(X) < x_epsilon or np.linalg.norm(Y) < y_epsilon: + # Set to zero the values that are close to zero + X[abs(X) < x_epsilon] = 0 + Y[abs(Y) < y_epsilon] = 0 + + # Stop if the matrix is all zeros + if np.all(X == 0) or np.all(Y == 0): break # Convert each list of columns to a matrix diff --git a/skfda/tests/test_fpls.py b/skfda/tests/test_fpls.py index e6ec918c6..d8d717618 100644 --- a/skfda/tests/test_fpls.py +++ b/skfda/tests/test_fpls.py @@ -231,6 +231,7 @@ def test_basis_vs_grid( # noqa: WPS210 ) def _generate_random_matrix_by_rank(self, n_samples, n_features, rank): + random_data = np.random.random(n_samples * rank).reshape( n_samples, rank, @@ -252,6 +253,7 @@ def test_collinear_matrix( """Check that the behaviour is correct with collinear matrices.""" n_samples = 100 n_features = 10 + np.random.seed(0) X = self._generate_random_matrix_by_rank( n_samples=n_samples, @@ -279,6 +281,7 @@ def test_number_of_components( """Check error when number of components is too large.""" n_samples = 100 n_features = 10 + np.random.seed(0) X = self._generate_random_matrix_by_rank( n_samples=n_samples, From 9387a7c0687f1f519404557fd953364778a4bf49 Mon Sep 17 00:00:00 2001 From: David del Val Date: Sun, 29 Oct 2023 01:43:32 +0200 Subject: [PATCH 38/46] Add warning when convergence is reached --- skfda/preprocessing/dim_reduction/_fpls.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index e582cfcda..493fe8121 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -2,6 +2,7 @@ from abc import abstractmethod from typing import Any, Generic, Literal, Optional, Tuple, TypeVar, Union, cast +import warnings import numpy as np import scipy @@ -645,7 +646,18 @@ def _perform_nipals(self) -> None: # noqa: WPS210 * np.abs(self._y_block.data_matrix).mean() ) - for _ in range(self.n_components): + for n_comp in range(self.n_components): + # Stop if either matrix is all zeros + if np.all(X == 0) or np.all(Y == 0): + warnings.warn( + f"After extracting {n_comp} components, " + "one of the matrices is completely deflated. " + f"The algorithm will return {n_comp} components," + f"instead of {self.n_components}.", + stacklevel=2, + ) + break + w, c = _calculate_weights( X, Y, @@ -684,10 +696,6 @@ def _perform_nipals(self) -> None: # noqa: WPS210 X[abs(X) < x_epsilon] = 0 Y[abs(Y) < y_epsilon] = 0 - # Stop if the matrix is all zeros - if np.all(X == 0) or np.all(Y == 0): - break - # Convert each list of columns to a matrix self.x_weights_ = np.array(W).T self.y_weights_ = np.array(C).T From 9c032ccff52748401b32b8a2bd98e62ccf6e662c Mon Sep 17 00:00:00 2001 From: David del Val Date: Sun, 29 Oct 2023 01:45:20 +0200 Subject: [PATCH 39/46] Sort imports --- skfda/preprocessing/dim_reduction/_fpls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index 493fe8121..a71f01e0d 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -1,8 +1,8 @@ from __future__ import annotations +import warnings from abc import abstractmethod from typing import Any, Generic, Literal, Optional, Tuple, TypeVar, Union, cast -import warnings import numpy as np import scipy From a7783e098c8723bbf2569c2af88fab09e8fb1167 Mon Sep 17 00:00:00 2001 From: David del Val Date: Sun, 29 Oct 2023 01:52:12 +0200 Subject: [PATCH 40/46] Check that the warning is being generated in the test. --- skfda/preprocessing/dim_reduction/_fpls.py | 2 +- skfda/tests/test_fpls.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index a71f01e0d..54b3f8f1b 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -654,7 +654,7 @@ def _perform_nipals(self) -> None: # noqa: WPS210 "one of the matrices is completely deflated. " f"The algorithm will return {n_comp} components," f"instead of {self.n_components}.", - stacklevel=2, + stacklevel=3, ) break diff --git a/skfda/tests/test_fpls.py b/skfda/tests/test_fpls.py index d8d717618..9c2497553 100644 --- a/skfda/tests/test_fpls.py +++ b/skfda/tests/test_fpls.py @@ -270,7 +270,14 @@ def test_collinear_matrix( fpls.fit(X, y) fpls = FPLS(n_components=5) - fpls.fit(X, y) + + # Check that a warning is raised when the rank is lower than the + # number of components + if rank < 5: + with pytest.warns(UserWarning): + fpls.fit(X, y) + else: + fpls.fit(X, y) # Check that only as many components as rank are returned assert fpls.x_weights_.shape == (n_features, rank) From c0fdc20d87c0515ec9570873ac67764d86e95756 Mon Sep 17 00:00:00 2001 From: David del Val Date: Sun, 29 Oct 2023 01:58:12 +0200 Subject: [PATCH 41/46] Fix style error --- skfda/tests/test_fpls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/tests/test_fpls.py b/skfda/tests/test_fpls.py index 9c2497553..a592a156a 100644 --- a/skfda/tests/test_fpls.py +++ b/skfda/tests/test_fpls.py @@ -270,7 +270,7 @@ def test_collinear_matrix( fpls.fit(X, y) fpls = FPLS(n_components=5) - + # Check that a warning is raised when the rank is lower than the # number of components if rank < 5: From 4c89c5ac19075e91bd8aa178be28676b725c33f4 Mon Sep 17 00:00:00 2001 From: David del Val Date: Wed, 1 Nov 2023 14:15:43 +0100 Subject: [PATCH 42/46] Remove unnecessary noqa --- skfda/preprocessing/dim_reduction/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/preprocessing/dim_reduction/__init__.py b/skfda/preprocessing/dim_reduction/__init__.py index f4926cf3c..5c8ae8405 100644 --- a/skfda/preprocessing/dim_reduction/__init__.py +++ b/skfda/preprocessing/dim_reduction/__init__.py @@ -26,7 +26,7 @@ ) -def __getattr__(name: str) -> Any: # noqa: WPS413 +def __getattr__(name: str) -> Any: if name in {"projection", "feature_extraction"}: return importlib.import_module(f".{name}", __name__) return _normal_getattr(name) From 65eb00769dd9e7e84f489a4c1ed1e0e937b3c07c Mon Sep 17 00:00:00 2001 From: David del Val Date: Wed, 1 Nov 2023 14:17:36 +0100 Subject: [PATCH 43/46] Create new FData instead of copying --- skfda/preprocessing/dim_reduction/_fpls.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index 54b3f8f1b..dedfcabc7 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -346,10 +346,13 @@ def _to_block_type( # Each column of the matrix generated by NIPALS corresponds to # an obsevation or direction. Therefore, they must be transposed # so that each row corresponds ot an observation or direction - return self.data.copy( + basis = self.weights_basis if title == "rotation" else self.data.basis + return FDataBasis( coefficients=nipals_matrix.T, + basis=basis, sample_names=[ - f"{title.capitalize()} {i}" for i in range(self.n_components) + f"{title.capitalize()} {i}" + for i in range(nipals_matrix.shape[1]) ], coordinate_names=(f"FPLS {self.label} {title} value",), dataset_name=f"FPLS {self.label} {title}s", @@ -435,13 +438,15 @@ def _to_block_type( # Each column of the matrix generated by NIPALS corresponds to # an obsevation or direction. Therefore, they must be transposed # so that each row corresponds ot an observation or direction - return self.data.copy( + return FDataGrid( data_matrix=nipals_matrix.T, sample_names=[ - f"{title.capitalize()} {i}" for i in range(self.n_components) + f"{title.capitalize()} {i}" + for i in range(nipals_matrix.shape[1]) ], coordinate_names=(f"FPLS {self.label} {title} value",), dataset_name=f"FPLS {self.label} {title}s", + sample_points=self.data.grid_points[0], ) def transform( From 74b3ea2b93f002076b92f5285e8ec6d0f402567d Mon Sep 17 00:00:00 2001 From: David del Val Date: Wed, 1 Nov 2023 14:18:23 +0100 Subject: [PATCH 44/46] Extract the maximum number of components by default --- skfda/preprocessing/dim_reduction/_fpls.py | 87 ++++++++-------------- skfda/tests/test_fpls.py | 6 ++ 2 files changed, 39 insertions(+), 54 deletions(-) diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index dedfcabc7..9287904e4 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -54,7 +54,6 @@ def _calculate_weights( The X block data matrix. - Y: (n_samples, n_targets) The Y block data matrix. - - n_components: number of components to extract. - G_ww: (n_features, n_features) The inner product matrix for the X block weights @@ -141,7 +140,6 @@ class _FPLSBlock(Generic[BlockType]): # noqa: WPS230 Parameters: - data: The data of the block. - - n_components: Number of components to extract. - label: Label of the block (X or Y). - integration_weights: Array with shape (n_features,). The integration weights of the block. It must be None for @@ -152,7 +150,6 @@ class _FPLSBlock(Generic[BlockType]): # noqa: WPS230 multivariate or grid blocks. Attributes: - - n_components: Number of components to extract. - label: Label of the block (X or Y). - data: The data of the block. - data_matrix: (n_samples, n_features) matrix. The data @@ -176,18 +173,6 @@ class _FPLSBlock(Generic[BlockType]): # noqa: WPS230 loadings of the block. - loadings: The loadings of the block (same type as the data). - Examples: - Fit a FPLS model with two components. - - >>> from skfda.preprocessing.dim_reduction import FPLS - >>> from skfda.datasets import fetch_tecator - >>> from skfda.representation import FDataGrid - >>> from skfda.typing._numpy import NDArrayFloat - - >>> X, y = fetch_tecator(return_X_y=True) - >>> fpls = FPLS[FDataGrid, NDArrayFloat](n_components=2) - >>> fpls = fpls.fit(X, y) - """ # Attributes that must be defined in the subclasses @@ -214,9 +199,6 @@ def _to_block_type( nipals_matrix: NDArrayFloat, title: str, ) -> BlockType: - # Each column of the matrix generated by NIPALS corresponds to - # an obsevation or direction. Therefore, they must be transposed - # so that each row corresponds ot an observation or direction pass @abstractmethod @@ -248,10 +230,8 @@ class _FPLSBlockMultivariate(_FPLSBlock[NDArrayFloat]): def __init__( self, data: NDArrayFloat, - n_components: int, label: str, ) -> None: - self.n_components = n_components self.label = label if len(data.shape) == 1: data = data[:, np.newaxis] @@ -299,13 +279,11 @@ class _FPLSBlockBasis(_FPLSBlock[FDataBasis]): def __init__( self, data: FDataBasis, - n_components: int, label: str, regularization: Optional[L2Regularization[Any]], weights_basis: Optional[Basis], ) -> None: """Initialize the data of a basis block.""" - self.n_components = n_components self.label = label self.mean = data.mean() self.data = data - self.mean @@ -387,13 +365,11 @@ class _FPLSBlockGrid(_FPLSBlock[FDataGrid]): def __init__( self, data: FDataGrid, - n_components: int, label: str, integration_weights: Optional[np.ndarray], # type: ignore[type-arg] regularization: Optional[L2Regularization[Any]], ) -> None: """Initialize the data of a grid block.""" - self.n_components = n_components self.label = label self.mean = data.mean() @@ -476,7 +452,6 @@ def inverse_transform( def _fpls_block_factory( data: BlockType, - n_components: int, label: str, integration_weights: NDArrayFloat | None, regularization: L2Regularization[Any] | None, @@ -487,7 +462,6 @@ def _fpls_block_factory( _FPLSBlock[BlockType], _FPLSBlockMultivariate( data=data, - n_components=n_components, label=label, ), ) @@ -496,7 +470,6 @@ def _fpls_block_factory( _FPLSBlock[BlockType], _FPLSBlockBasis( data=data, - n_components=n_components, label=label, regularization=regularization, weights_basis=weights_basis, @@ -507,7 +480,6 @@ def _fpls_block_factory( _FPLSBlock[BlockType], _FPLSBlockGrid( data=data, - n_components=n_components, label=label, integration_weights=integration_weights, regularization=regularization, @@ -542,7 +514,8 @@ class FPLS( # noqa: WPS230 NDArrayFloat, FDataGrid and FDataBasis. Parameters: - n_components: Number of components to extract. + n_components: Number of components to extract. By default, the + maximum number of components is extracted. regularization_X: Regularization to apply to the X block. regularization_Y: Regularization to apply to the Y block. component_basis_X: Basis to use for the X block. Only @@ -579,7 +552,7 @@ class FPLS( # noqa: WPS230 # Ignore too many arguments def __init__( # noqa: WPS211 self, - n_components: int = 2, + n_components: int | None = None, *, regularization_X: L2Regularization[InputTypeX] | None = None, regularization_Y: L2Regularization[InputTypeY] | None = None, @@ -605,7 +578,6 @@ def __init__( # noqa: WPS211 def _initialize_blocks(self, X: InputTypeX, Y: InputTypeY) -> None: self._x_block = _fpls_block_factory( data=X, - n_components=self.n_components, label="X", integration_weights=self._integration_weights_X, regularization=self.regularization_X, @@ -613,7 +585,6 @@ def _initialize_blocks(self, X: InputTypeX, Y: InputTypeY) -> None: ) self._y_block = _fpls_block_factory( data=Y, - n_components=self.n_components, label="Y", integration_weights=self._integration_weights_Y, regularization=self.regularization_Y, @@ -651,18 +622,24 @@ def _perform_nipals(self) -> None: # noqa: WPS210 * np.abs(self._y_block.data_matrix).mean() ) - for n_comp in range(self.n_components): + n_comp = 0 + while self.n_components is None or n_comp < self.n_components: # Stop if either matrix is all zeros if np.all(X == 0) or np.all(Y == 0): - warnings.warn( - f"After extracting {n_comp} components, " - "one of the matrices is completely deflated. " - f"The algorithm will return {n_comp} components," - f"instead of {self.n_components}.", - stacklevel=3, - ) + # If the number of components was specified, warn the user + if self.n_components is not None: + warnings.warn( + f"After extracting {n_comp} components, " + "one of the matrices is completely deflated. " + f"The algorithm will return {n_comp} components," + f"instead of {self.n_components}.", + stacklevel=3, + ) break + # Increase number of components + n_comp += 1 + w, c = _calculate_weights( X, Y, @@ -726,21 +703,23 @@ def fit( """ self._initialize_blocks(X, y) - # In regression mode, the number of components is limited - # only by the rank of the X data matrix since components are - # only extracted from the X block. - if self._deflation_mode == "reg": - range_upper_bound = min(*self._x_block.data_matrix.shape) - else: - range_upper_bound = min( - *self._x_block.data_matrix.shape, - *self._y_block.data_matrix.shape, - ) + if self.n_components is not None: + # In regression mode, the number of components is limited + # only by the rank of the X data matrix since components are + # only extracted from the X block. + if self._deflation_mode == "reg": + range_upper_bound = min(*self._x_block.data_matrix.shape) + else: + range_upper_bound = min( + *self._x_block.data_matrix.shape, + *self._y_block.data_matrix.shape, + ) - if self.n_components > range_upper_bound: - raise ValueError( - f"n_components must be less or equal than {range_upper_bound}", - ) + if self.n_components > range_upper_bound: + raise ValueError( + f"n_components must be less or equal " + f"than {range_upper_bound}", + ) self._perform_nipals() diff --git a/skfda/tests/test_fpls.py b/skfda/tests/test_fpls.py index a592a156a..af322959f 100644 --- a/skfda/tests/test_fpls.py +++ b/skfda/tests/test_fpls.py @@ -282,6 +282,12 @@ def test_collinear_matrix( # Check that only as many components as rank are returned assert fpls.x_weights_.shape == (n_features, rank) + # Check that the waring is not raised when the number of components + # is not set + fpls = FPLS().fit(X, y) + # Check that only as many components as rank are returned + assert fpls.x_weights_.shape == (n_features, rank) + def test_number_of_components( self, ) -> None: From ab7f91cc961b49b3420421c5a90cdca0584c3bc1 Mon Sep 17 00:00:00 2001 From: David del Val Date: Wed, 1 Nov 2023 23:17:07 +0100 Subject: [PATCH 45/46] Fix deprecation warning --- skfda/preprocessing/dim_reduction/_fpls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py index 9287904e4..8641f037a 100644 --- a/skfda/preprocessing/dim_reduction/_fpls.py +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -422,7 +422,7 @@ def _to_block_type( ], coordinate_names=(f"FPLS {self.label} {title} value",), dataset_name=f"FPLS {self.label} {title}s", - sample_points=self.data.grid_points[0], + grid_points=self.data.grid_points[0], ) def transform( From e7c79afb80410780f147cdf16b3c8fddd3e5be15 Mon Sep 17 00:00:00 2001 From: David del Val Date: Wed, 1 Nov 2023 23:17:35 +0100 Subject: [PATCH 46/46] Use all components in regression as well --- skfda/ml/regression/_fpls_regression.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/skfda/ml/regression/_fpls_regression.py b/skfda/ml/regression/_fpls_regression.py index 6299df194..ea4dd4eac 100644 --- a/skfda/ml/regression/_fpls_regression.py +++ b/skfda/ml/regression/_fpls_regression.py @@ -30,7 +30,8 @@ class FPLSRegression( Regression using Functional Partial Least Squares. Parameters: - n_components: Number of components to keep. Defaults to 5. + n_components: Number of components to keep. + By default all available components are utilized. regularization_X: Regularization for the calculation of the X weights. weight_basis_X: Basis to use for the X block. Only applicable if X is a FDataBasis. Otherwise it must be None. @@ -57,7 +58,7 @@ class FPLSRegression( def __init__( self, - n_components: int = 2, + n_components: int | None = None, regularization_X: L2Regularization[Any] | None = None, weight_basis_X: Basis | None = None, weight_basis_Y: Basis | None = None,