From 73045582483dfcda2f2c520c155f40e425344215 Mon Sep 17 00:00:00 2001
From: Joel Berkeley <16429957+joelberkeley@users.noreply.github.com>
Date: Tue, 17 Aug 2021 15:53:09 +0100
Subject: [PATCH] implement conjugate GP regression as a trainable probabilistic model (#113)

---
 spidr.ipkg                               |   2 +-
 src/BayesianOptimization.idr             |   2 +-
 src/BayesianOptimization/Acquisition.idr |   4 +-
 .../{Util.idr => Morphisms.idr}          |  13 +--
 src/Model.idr                            |  23 ++--
 src/Model/GaussianProcess.idr            | 103 ++++++++++--------
 src/Optimize.idr                         |  15 ++-
 src/Tensor.idr                           |  16 +++
 tutorials/BayesianOptimization.md        |  37 ++++---
 9 files changed, 132 insertions(+), 83 deletions(-)
 rename src/BayesianOptimization/{Util.idr => Morphisms.idr} (67%)

diff --git a/spidr.ipkg b/spidr.ipkg
index 5403977f8..4cf4c5482 100644
--- a/spidr.ipkg
+++ b/spidr.ipkg
@@ -6,7 +6,7 @@ sourcedir = "src"
modules =
    BayesianOptimization,
    BayesianOptimization.Acquisition,
-   BayesianOptimization.Util,
+   BayesianOptimization.Morphisms,

    Distribution,

diff --git a/src/BayesianOptimization.idr b/src/BayesianOptimization.idr
index a43ee46a1..64dcb7d37 100644
--- a/src/BayesianOptimization.idr
+++ b/src/BayesianOptimization.idr
@@ -21,4 +21,4 @@ limitations under the License.
module BayesianOptimization

import public BayesianOptimization.Acquisition as BayesianOptimization
-import public BayesianOptimization.Util as BayesianOptimization
+import public BayesianOptimization.Morphisms as BayesianOptimization
diff --git a/src/BayesianOptimization/Acquisition.idr b/src/BayesianOptimization/Acquisition.idr
index 9ee43c82c..0959a1a46 100644
--- a/src/BayesianOptimization/Acquisition.idr
+++ b/src/BayesianOptimization/Acquisition.idr
@@ -20,7 +20,7 @@ import Distribution
import Tensor
import Model
import Optimize
-import BayesianOptimization.Util
+import BayesianOptimization.Morphisms
import Util

||| An `Empiric` constructs values from historic data and the model over that data.
|||
||| @out The type of the value constructed by the `Empiric`.
public export 0
Empiric : Distribution targets marginal => (0 features : Shape) -> (0 out : Type) -> Type
-Empiric features out = forall s .
+Empiric features out = {s : _} ->
  Data {samples=S s} features targets -> ProbabilisticModel features {marginal} -> out

||| An `Acquisition` function quantifies how useful it would be to query the objective at a given
diff --git a/src/BayesianOptimization/Util.idr b/src/BayesianOptimization/Morphisms.idr
similarity index 67%
rename from src/BayesianOptimization/Util.idr
rename to src/BayesianOptimization/Morphisms.idr
index 549facae4..ae79a1522 100644
--- a/src/BayesianOptimization/Util.idr
+++ b/src/BayesianOptimization/Morphisms.idr
@@ -13,23 +13,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--}
-module BayesianOptimization.Util
+module BayesianOptimization.Morphisms

import public Data.Morphisms

import Distribution
import Tensor
import Model

-||| Objective query points and either corresponding objective values or metadata.
-|||
-||| @samples The number of points in each of the feature and target data.
-||| @features The shape of the feature domain.
-||| @targets The shape of the target domain.
-public export 0
-Data : {0 samples : Nat} -> (0 features : Shape) -> (0 targets : Shape) -> Type
-Data features targets =
-    (Tensor (samples :: features) Double, Tensor (samples :: targets) Double)
-
infix 9 >>>

||| Compose two functions that each use two values and wrap them in a morphism. This is a
@@ -39,6 +29,7 @@ export
(>>>) : (i -> (a, b)) -> (a -> b -> o) -> i ~> o
f >>> g = Mor (uncurry g . f)

+||| Extract the underlying function from a `Morphism`.
export
run : (i ~> o) -> i -> o
run = applyMor
diff --git a/src/Model.idr b/src/Model.idr
index 206abb102..b4a1e30a7 100644
--- a/src/Model.idr
+++ b/src/Model.idr
@@ -16,17 +16,26 @@ limitations under the License.
||| This module contains definitions and utilities for probabilistic models.
module Model

-import public Model.GaussianProcess as Model
-import public Model.Kernel as Model
-import public Model.MeanFunction as Model
-
import Distribution
import Tensor

-||| A `ProbabilisticModel` is a mapping from a feature space to a probability distribution over
-||| a target space.
+||| Observed pairs of data points from feature and target domains. Data sets such as this are
+||| commonly used in supervised learning settings.
+|||
+||| @samples The number of points in each of the feature and target data.
+||| @features The shape of the feature domain.
+||| @targets The shape of the target domain.
+public export 0
+Data : {0 samples : Nat} -> (0 features : Shape) -> (0 targets : Shape) -> Type
+Data features targets =
+    (Tensor (Vect.(::) samples features) Double, Tensor (Vect.(::) samples targets) Double)
+
+||| A `ProbabilisticModel` is a mapping from a feature domain to a probability distribution over
+||| a target domain.
|||
||| @features The shape of the feature domain.
+||| @targets The shape of the target domain.
+||| @marginal The type of multivariate marginal distribution over the target domain.
public export 0
ProbabilisticModel : Distribution targets marginal => (0 features : Shape) -> Type
-ProbabilisticModel features = forall n . Tensor (Vect.(::) n features) Double -> marginal n
+ProbabilisticModel features = {n : _} -> Tensor (Vect.(::) (S n) features) Double -> marginal (S n)
diff --git a/src/Model/GaussianProcess.idr b/src/Model/GaussianProcess.idr
index 7dd3be692..0a7e95e2f 100644
--- a/src/Model/GaussianProcess.idr
+++ b/src/Model/GaussianProcess.idr
@@ -13,12 +13,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--}
-||| This module contains the `GaussianProcess` type for defining a Gaussian process, along with
-||| functionality for training and inference.
+||| This module contains functionality for Gaussian process inference.
module Model.GaussianProcess

-import Data.Nat
import Tensor
+import Model
import Model.Kernel
import Model.MeanFunction
import Optimize

data GaussianProcess : (0 features : Shape) -> Type where
  ||| Construct a `GaussianProcess` as a pair of mean function and kernel.
  MkGP : MeanFunction features -> Kernel features -> GaussianProcess features

-||| The marginal distribution of the Gaussian process at the specified feature values.
-|||
-||| @at The feature values at which to evaluate the marginal distribution.
-export
-marginalise : {s : _}
+posterior :
+  GaussianProcess features
+  -> Tensor [] Double
+  -> forall s .
+     (Tensor ((S s) :: features) Double, Tensor [S s] Double)
  -> GaussianProcess features
-  -> Tensor ((S s) :: features) Double
-  -> Gaussian [] (S s)
-marginalise (MkGP mean_function kernel) x = MkGaussian (mean_function x) (kernel x x)
-
-posterior : forall s .
-  (prior : GaussianProcess features)
-  -> (likelihood : Gaussian [] (S s))
-  -> (training_data : (Tensor ((S s) :: features) Double, Tensor [S s] Double))
-  -> GaussianProcess features
-posterior (MkGP prior_meanf prior_kernel) (MkGaussian _ cov) (x_train, y_train) =
-  let l = cholesky (prior_kernel x_train x_train + cov)
+posterior (MkGP prior_meanf prior_kernel) noise (x_train, y_train) =
+  let l = cholesky (prior_kernel x_train x_train + diag {n=S s} noise)
      alpha = l.T \\ (l \\ y_train)

      posterior_meanf : MeanFunction features
@@ -63,38 +52,60 @@ posterior (MkGP prior_meanf prior_kernel) (MkGaussian _ cov) (x_train, y_train)

   in MkGP posterior_meanf posterior_kernel

-log_marginal_likelihood : {s : _}
-  -> GaussianProcess features
-  -> Gaussian [] (S s)
-  -> (Tensor ((S s) :: features) Double, Tensor [S s] Double)
-  -> Tensor [] Double
-log_marginal_likelihood (MkGP _ kernel) (MkGaussian _ cov) (x, y) =
-  let l = cholesky (kernel x x + cov)
+log_marginal_likelihood :
+  GaussianProcess features
+  -> Tensor [] Double
+  -> {s : _} -> (Tensor ((S s) :: features) Double, Tensor [S s] Double)
+  -> Tensor [] Double
+log_marginal_likelihood (MkGP _ kernel) noise (x, y) =
+  let l = cholesky (kernel x x + diag {n=S s} noise)
      alpha = l.T \\ (l \\ y)
      n = const {shape=[]} $ cast (S s)
      log2pi = log $ const {shape=[]} $ 2.0 * PI
      two = const {shape=[]} 2
   in - y @@ alpha / two - trace (log l) - n * log2pi / two

-||| Find the hyperparameters which maximize the marginal likelihood for the specified structures of
-||| prior and likelihood, then return the posterior for that given prior, likelihood and
-||| hyperparameters.
-|||
-||| @optimizer The optimization tactic.
-||| @prior Constructs the prior from *all* the hyperparameters.
-||| @likelihood Constructs the likelihood from *all* the hyperparameters.
-||| @training_data The observed data.
+||| A trainable model implementing vanilla Gaussian process regression. That is, regression with a
+||| Gaussian process as conjugate prior for homoscedastic Gaussian likelihoods. See the following
+||| for details:
+|||
+||| Gaussian Processes for Machine Learning
+||| Carl Edward Rasmussen and Christopher K. I. Williams
+||| The MIT Press, 2006. ISBN 0-262-18253-X.
+|||
+||| or
+|||
+||| Pattern Recognition and Machine Learning, Christopher M. Bishop
+public export
+data ConjugateGPRegression : (0 features : Shape) -> Type where
+  MkConjugateGPR : (Tensor [p] Double -> GaussianProcess features) -> Tensor [p] Double
+    -> Tensor [] Double -> ConjugateGPRegression features
+
+||| Construct a probabilistic model for the latent target values.
export
-fit : {s : _}
-  -> (optimizer : Optimizer hp)
-  -> (prior : hp -> GaussianProcess features)
-  -> (likelihood : hp -> Gaussian [] (S s))
-  -> (training_data : (Tensor ((S s) :: features) Double, Tensor [S s] Double))
-  -> GaussianProcess features
-fit optimizer prior likelihood training_data =
-  let objective : hp -> Tensor [] Double
-      objective hp = log_marginal_likelihood (prior hp) (likelihood hp) training_data
-
-      params := optimizer objective
-
-  in posterior (prior params) (likelihood params) training_data
+predict_latent : ConjugateGPRegression features
+  -> ProbabilisticModel features {marginal=Gaussian [1]}
+predict_latent (MkConjugateGPR mk_gp gp_params _) x =
+  let (MkGP meanf kernel) = mk_gp gp_params
+  in MkGaussian (expand 1 $ meanf x) (expand 2 $ kernel x x)
+
+||| Fit the Gaussian process and noise to the specified data.
+export
+fit : ConjugateGPRegression features
+  -> (forall n . Tensor [n] Double -> Optimizer $ Tensor [n] Double)
+  -> {s : _} -> Data {samples=S s} features [1]
+  -> ConjugateGPRegression features
+fit (MkConjugateGPR {p} mk_prior gp_params noise) optimizer {s} training_data =
+  let objective : Tensor [S p] Double -> Tensor [] Double
+      objective params = let (noise, prior_params) = split 1 params
+                             (x, y) = training_data
+                          in log_marginal_likelihood (mk_prior prior_params)
+                                                     (squeeze noise) (x, squeeze y)
+
+      (noise, gp_params) := split 1 $ optimizer (concat (expand 0 noise) gp_params) objective
+
+      mk_posterior : Tensor [p] Double -> GaussianProcess features
+      mk_posterior params' = let (x, y) = training_data
+                              in posterior (mk_prior params') (squeeze noise) (x, squeeze y)
+
+  in MkConjugateGPR mk_posterior gp_params (squeeze noise)
diff --git a/src/Optimize.idr b/src/Optimize.idr
index 4fa502b51..55caf6cdc 100644
--- a/src/Optimize.idr
+++ b/src/Optimize.idr
@@ -31,7 +31,7 @@ public export 0
Optimizer : {default id 0 m : Type -> Type} -> (0 domain : Type) -> Type
Optimizer a = (a -> m $ Tensor [] Double) -> m a

-||| Construct a `Optimizer` that implements grid search over a scalar feature space. Grid search
+||| Construct an `Optimizer` that implements grid search over a scalar feature space. Grid search
||| approximates the optimum by evaluating the objective over a finite, evenly-spaced grid.
|||
||| @density The density of the grid.
export
gridSearch : (density : Tensor [d] Integer)
  -> (lower : Tensor [d] Double)
  -> (upper : Tensor [d] Double)
  -> Optimizer (Tensor [d] Double)
+
+||| The limited-memory BFGS (L-BFGS) optimization tactic, see
+|||
+||| Nocedal, Jorge, Updating quasi-Newton matrices with limited storage.
+||| Math. Comp. 35 (1980), no. 151, 773–782.
+|||
+||| available at
+|||
+||| https://www.ams.org/journals/mcom/1980-35-151/S0025-5718-1980-0572855-7/
+|||
+||| @initial_points The points from which to start optimization.
+export
+lbfgs : forall n . (initial_points : Tensor [n] Double) -> Optimizer (Tensor [n] Double)
diff --git a/src/Tensor.idr b/src/Tensor.idr
index 4b6b21cea..a581e8f5e 100644
--- a/src/Tensor.idr
+++ b/src/Tensor.idr
@@ -154,6 +154,22 @@ freeze : (1 _ : Variable shape dtype) -> Tensor shape dtype
export
index : (idx : Fin d) -> Tensor (d :: ds) dtype -> Tensor ds dtype

+||| Split a `Tensor` along the first axis at the specified index. For example,
+||| `split 1 $ const [[1, 2], [3, 4], [5, 6]]` is equivalent to
+||| `(const [[1, 2]], const [[3, 4], [5, 6]])`.
+|||
+||| @idx The index of the row at which to split the `Tensor`. The row with index `idx` in
+|||   the input `Tensor` will appear in the result as the first row in the second `Tensor`.
+export
+split : (idx : Nat) -> Tensor ((idx + rest) :: tl) dtype
+  -> (Tensor (idx :: tl) dtype, Tensor (rest :: tl) dtype)
+
+||| Concatenate two `Tensor`s along their first axis. For example,
+||| `concat (const [[1, 2], [3, 4]]) (const [[5, 6]])` is equivalent to
+||| `const [[1, 2], [3, 4], [5, 6]]`.
+export
+concat : Tensor (n :: tl) dtype -> Tensor (m :: tl) dtype -> Tensor ((n + m) :: tl) dtype
+
||| Add a dimension of length one at the specified `axis`. The new dimension will be at the
||| specified axis in the new `Tensor` (as opposed to the original `Tensor`). For example,
||| `expand 1 $ const [[1, 2], [3, 4], [5, 6]]` is equivalent to
diff --git a/tutorials/BayesianOptimization.md b/tutorials/BayesianOptimization.md
index 5e7b2706e..d8c8edfa7 100644
--- a/tutorials/BayesianOptimization.md
+++ b/tutorials/BayesianOptimization.md
@@ -45,6 +45,9 @@ How we produce the new points from the data and models depends on the problem at
import Tensor
import BayesianOptimization
import Model
+import Model.GaussianProcess
+import Model.Kernel
+import Model.MeanFunction
import Distribution
import Optimize
import Util
```

```idris
historicData : Data {samples=3} [2] [1]
historicData = (const [[0.3, 0.4], [0.5, 0.2], [0.3, 0.9]], const [[1.2], [-0.5], [0.7]])
```

and model that data

```idris
-model : ProbabilisticModel [2] {marginal=Gaussian [1]}
-model = let optimizer = gridSearch (const [100, 100]) (const [-10, 10]) (const [-10, 10])
-            mk_prior = \len_and_noise => MkGP zero (rbf $ index 0 len_and_noise)
-            mk_likelihood = \len_and_noise => MkGaussian (fill 0) (diag $ index 1 len_and_noise)
-            (qp, obs) = historicData
-         in ?marginalise $ fit optimizer mk_prior mk_likelihood (qp, squeeze obs)
+model : ConjugateGPRegression [2]
+model = let mk_gp = \len => MkGP zero (matern52 (const {shape=[]} 1.0) $ squeeze len)
+            length_scale = const {shape=[1]} [0.5]
+            noise = const {shape=[]} 0.2
+            model = MkConjugateGPR mk_gp length_scale noise
+         in fit model lbfgs historicData
```
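The fitted `model` packages a posterior Gaussian process with its tuned hyperparameters, and we can query it like any other `ProbabilisticModel` via `predict_latent`. For instance, we could recover the posterior marginal at the three query points we started with (`latentAtData` is just an illustrative name, not something spidr defines):

```idris
-- illustrative only: the posterior marginal over the targets at the observed query points
latentAtData : Gaussian [1] 3
latentAtData = predict_latent model (fst historicData)
```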
then optimize over the marginal mean

```idris
optimizer : Optimizer (Tensor [1, 2] Double)
optimizer = let gs = gridSearch (const [100, 100]) (const [0.0, 0.0]) (const [1.0, 1.0])
             in \f => broadcast . gs $ f . broadcast

newPoint : Tensor [1, 2] Double
-newPoint = optimizer $ squeeze . mean . model
+newPoint = optimizer $ squeeze . mean . (predict_latent model)
```

This is a particularly simple example of the standard approach of defining an _acquisition function_ over the input space, which quantifies how useful it would be to evaluate the objective at a set of points, then finding the points that optimize this acquisition function. We can visualise this:

In this case, our acquisition function depends on the model (which in turn depends on the data),

In the above example, we constructed the acquisition function from our model, then optimized it, and in doing so, we assumed that we have access to the data and models when we compose the acquisition function with the optimizer. This might not be the case: we may want to compose things before we get the data and model. Using spidr's names, we'd apply an `Optimizer` to an `i -> Acquisition`. We'd normally do this with `map`, a method on the `Functor` interface, but functions, including `i -> o`, don't implement `Functor` (indeed, in Idris, they can't). We can, however, wrap an `i -> o` in the `Morphism i o` type (also called `i ~> o` with a tilde), which does implement `Functor`. We can `map` an `Optimizer` over an `i ~> Acquisition`, as follows:

```idris
-modelMean : ProbabilisticModel [2] {targets=[1]} {marginal=Gaussian [1]} -> Acquisition 1 [2]
-modelMean model = squeeze . mean . model
+modelMean : ProbabilisticModel [2] {marginal=Gaussian [1]} -> Acquisition 1 [2]
+modelMean predict = squeeze . mean . predict

newPoint' : Tensor [1, 2] Double
newPoint' = let acquisition = map optimizer (Mor modelMean) -- `Mor` makes a `Morphism`
-            in run acquisition model -- `run` turns a `Morphism` into a function
+            in run acquisition (predict_latent model) -- `run` turns a `Morphism` into a function
```
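Nothing magical is happening here: for morphisms, `map f (Mor g)` is simply `Mor (f . g)`, so once we do have a model to hand, running the mapped optimizer is the same as composing the two functions directly. As a quick sanity check (`newPointDirect` is our own illustrative name):

```idris
-- illustrative only: equivalent to `run (map optimizer (Mor modelMean)) (predict_latent model)`
newPointDirect : Tensor [1, 2] Double
newPointDirect = optimizer (modelMean (predict_latent model))
```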
## Combining empirical values with `Applicative`

With this new functionality at hand, we'll return to our objective with failure regions:

```idris
failureData : Data {samples=4} [2] [1]
failureData = (const [[0.3, 0.4], [0.5, 0.2], [0.3, 0.9], [0.7, 0.1]], const [[0], [0], [0], [1]])

-failureModel : ProbabilisticModel [2] {marginal=Gaussian [1]}
+failureModel : ConjugateGPRegression [2]
+failureModel = let mk_gp = \len => MkGP zero (rbf $ squeeze len)
+                   length_scale = const {shape=[1]} [0.2]
+                   noise = const {shape=[]} 0.1
+                   model = MkConjugateGPR mk_gp length_scale noise
+                in fit model lbfgs failureData
```

and we'll gather all the data and models in a `record`:

Idris generates two methods `objective` and `failure` from this `record`, which we can use to construct our acquisition function:

```idris
newPoint'' : Tensor [1, 2] Double
newPoint'' = let eci = objective >>> expectedConstrainedImprovement (const 0.5) {s=_}
                 pof = failure >>> probabilityOfFeasibility (const 0.5) {s=_}
                 acquisition = map optimizer (eci <*> pof)
-                dataAndModel = Label (historicData, model) (failureData, failureModel)
+                dataAndModel = Label (historicData, predict_latent model)
+                                     (failureData, predict_latent failureModel)
              in run acquisition dataAndModel
```
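Everything above composes into a single Bayesian optimization step: fit the models, build the acquisition function, then optimize it. Sketching that step for the unconstrained setting (`bayesOptStep` is our own illustrative name, not part of spidr's API):

```idris
-- illustrative sketch, not defined by spidr: one fit-then-optimize step
bayesOptStep : {s : _} -> Data {samples=S s} [2] [1]
  -> ConjugateGPRegression [2] -> Tensor [1, 2] Double
bayesOptStep dataset prior = let fitted = fit prior lbfgs dataset
                              in optimizer $ squeeze . mean . (predict_latent fitted)
```

In a full loop, we would evaluate the objective at the returned point, append the observation to the data set, and repeat.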