implement conjugate GP regression as a trainable probabilistic model (#…
joelberkeley authored Aug 17, 2021
1 parent 058a4b7 commit 7304558
Showing 9 changed files with 132 additions and 83 deletions.
2 changes: 1 addition & 1 deletion spidr.ipkg
@@ -6,7 +6,7 @@ sourcedir = "src"
modules =
BayesianOptimization,
BayesianOptimization.Acquisition,
BayesianOptimization.Util,
BayesianOptimization.Morphisms,

Distribution,

2 changes: 1 addition & 1 deletion src/BayesianOptimization.idr
@@ -21,4 +21,4 @@ limitations under the License.
module BayesianOptimization

import public BayesianOptimization.Acquisition as BayesianOptimization
import public BayesianOptimization.Util as BayesianOptimization
import public BayesianOptimization.Morphisms as BayesianOptimization
4 changes: 2 additions & 2 deletions src/BayesianOptimization/Acquisition.idr
@@ -20,7 +20,7 @@ import Distribution
import Tensor
import Model
import Optimize
import BayesianOptimization.Util
import BayesianOptimization.Morphisms
import Util

||| An `Empiric` constructs values from historic data and the model over that data.
@@ -29,7 +29,7 @@ import Util
||| @out The type of the value constructed by the `Empiric`.
public export 0
Empiric : Distribution targets marginal => (0 features : Shape) -> (0 out : Type) -> Type
Empiric features out = forall s .
Empiric features out = {s : _} ->
Data {samples=S s} features targets -> ProbabilisticModel features {marginal} -> out

||| An `Acquisition` function quantifies how useful it would be to query the objective at a given
@@ -13,23 +13,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--}
module BayesianOptimization.Util
module BayesianOptimization.Morphisms

import public Data.Morphisms
import Distribution
import Tensor
import Model

||| Objective query points and either corresponding objective values or metadata.
|||
||| @samples The number of points in each of the feature and target data.
||| @features The shape of the feature domain.
||| @targets The shape of the target domain.
public export 0
Data : {0 samples : Nat} -> (0 features : Shape) -> (0 targets : Shape) -> Type
Data features targets =
(Tensor (samples :: features) Double, Tensor (samples :: targets) Double)

infix 9 >>>

||| Compose two functions that each use two values and wrap them in a morphism. This is a
@@ -39,6 +29,7 @@ export
(>>>) : (i -> (a, b)) -> (a -> b -> o) -> i ~> o
f >>> g = Mor (uncurry g . f)

||| Extract the underlying function from a `Morphism`.
export
run : (i ~> o) -> i -> o
run = applyMor
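
As a rough sketch of how `>>>` and `run` fit together (not part of this commit; `addPair` and its values are purely illustrative, and `Data.Morphisms` plus this module are assumed to be in scope):

```idris
-- Illustrative only: compose a pair-producing function with a binary function
-- using >>>, then extract the composite function with run.
addPair : Int ~> Int
addPair = (\i => (i, i + 1)) >>> (+)

-- run addPair 1 = uncurry (+) (1, 2) = 3
three : Int
three = run addPair 1
```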
23 changes: 16 additions & 7 deletions src/Model.idr
@@ -16,17 +16,26 @@ limitations under the License.
||| This module contains definitions and utilities for probabilistic models.
module Model

import public Model.GaussianProcess as Model
import public Model.Kernel as Model
import public Model.MeanFunction as Model

import Distribution
import Tensor

||| A `ProbabilisticModel` is a mapping from a feature space to a probability distribution over
||| a target space.
||| Observed pairs of data points from feature and target domains. Data sets such as this are
||| commonly used in supervised learning settings.
|||
||| @samples The number of points in each of the feature and target data.
||| @features The shape of the feature domain.
||| @targets The shape of the target domain.
public export 0
Data : {0 samples : Nat} -> (0 features : Shape) -> (0 targets : Shape) -> Type
Data features targets =
(Tensor (Vect.(::) samples features) Double, Tensor (Vect.(::) samples targets) Double)

||| A `ProbabilisticModel` is a mapping from a feature domain to a probability distribution over
||| a target domain.
|||
||| @features The shape of the feature domain.
||| @targets The shape of the target domain.
||| @marginal The type of multivariate marginal distribution over the target domain.
public export 0
ProbabilisticModel : Distribution targets marginal => (0 features : Shape) -> Type
ProbabilisticModel features = forall n . Tensor (Vect.(::) n features) Double -> marginal n
ProbabilisticModel features = {n : _} -> Tensor (Vect.(::) (S n) features) Double -> marginal (S n)
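
As a rough illustration of the relocated `Data` alias (not part of this commit; the values and the name `exampleData` are made up, and `Tensor` and this module are assumed imported):

```idris
-- Illustrative only: two observations with two-dimensional features and
-- scalar targets, packed as a (features, targets) pair of tensors.
exampleData : Data {samples=2} [2] [1]
exampleData = (const [[0.0, 1.0], [1.0, 0.0]], const [[0.5], [1.5]])
```

Note that the updated `ProbabilisticModel` signature takes at least one query point (`S n`), so an empty query batch is ruled out at the type level.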
103 changes: 57 additions & 46 deletions src/Model/GaussianProcess.idr
@@ -13,12 +13,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--}
||| This module contains the `GaussianProcess` type for defining a Gaussian process, along with
||| functionality for training and inference.
||| This module contains functionality for Gaussian process inference.
module Model.GaussianProcess

import Data.Nat
import Tensor
import Model
import Model.Kernel
import Model.MeanFunction
import Optimize
@@ -35,23 +34,13 @@ data GaussianProcess : (0 features : Shape) -> Type where
||| Construct a `GaussianProcess` as a pair of mean function and kernel.
MkGP : MeanFunction features -> Kernel features -> GaussianProcess features

||| The marginal distribution of the Gaussian process at the specified feature values.
|||
||| @at The feature values at which to evaluate the marginal distribution.
export
marginalise : {s : _}
posterior :
GaussianProcess features
-> Tensor [] Double
-> forall s . (Tensor ((S s) :: features) Double, Tensor [S s] Double)
-> GaussianProcess features
-> Tensor ((S s) :: features) Double
-> Gaussian [] (S s)
marginalise (MkGP mean_function kernel) x = MkGaussian (mean_function x) (kernel x x)

posterior : forall s .
(prior : GaussianProcess features)
-> (likelihood : Gaussian [] (S s))
-> (training_data : (Tensor ((S s) :: features) Double, Tensor [S s] Double))
-> GaussianProcess features
posterior (MkGP prior_meanf prior_kernel) (MkGaussian _ cov) (x_train, y_train) =
let l = cholesky (prior_kernel x_train x_train + cov)
posterior (MkGP prior_meanf prior_kernel) noise (x_train, y_train) =
let l = cholesky (prior_kernel x_train x_train + diag {n=S s} noise)
alpha = l.T \\ (l \\ y_train)

posterior_meanf : MeanFunction features
@@ -63,38 +52,60 @@ posterior (MkGP prior_meanf prior_kernel) (MkGaussian _ cov) (x_train, y_train)

in MkGP posterior_meanf posterior_kernel

log_marginal_likelihood : {s : _}
-> GaussianProcess features
-> Gaussian [] (S s)
-> (Tensor ((S s) :: features) Double, Tensor [S s] Double)
-> Tensor [] Double
log_marginal_likelihood (MkGP _ kernel) (MkGaussian _ cov) (x, y) =
let l = cholesky (kernel x x + cov)
log_marginal_likelihood :
GaussianProcess features
-> Tensor [] Double
-> {s : _} -> (Tensor ((S s) :: features) Double, Tensor [S s] Double)
-> Tensor [] Double
log_marginal_likelihood (MkGP _ kernel) noise (x, y) =
let l = cholesky (kernel x x + diag {n=S s} noise)
alpha = l.T \\ (l \\ y)
n = const {shape=[]} $ cast (S s)
log2pi = log $ const {shape=[]} $ 2.0 * PI
two = const {shape=[]} 2
in - y @@ alpha / two - trace (log l) - n * log2pi / two

||| Find the hyperparameters which maximize the marginal likelihood for the specified structures of
||| prior and likelihood, then return the posterior for that given prior, likelihood and
||| hyperparameters.
||| A trainable model implementing vanilla Gaussian process regression. That is, regression with a
||| Gaussian process as conjugate prior for homoscedastic Gaussian likelihoods. See the following
||| for details:
|||
||| Gaussian Processes for Machine Learning
||| Carl Edward Rasmussen and Christopher K. I. Williams
||| The MIT Press, 2006. ISBN 0-262-18253-X.
|||
||| @optimizer The optimization tactic.
||| @prior Constructs the prior from *all* the hyperparameters.
||| @likelihood Constructs the likelihood from *all* the hyperparameters.
||| @training_data The observed data.
||| or
|||
||| Pattern Recognition and Machine Learning, Christopher M. Bishop
public export
data ConjugateGPRegression : (0 features : Shape) -> Type where
MkConjugateGPR : (Tensor [p] Double -> GaussianProcess features) -> Tensor [p] Double
-> Tensor [] Double -> ConjugateGPRegression features

||| Construct a probabilistic model for the latent target values.
export
fit : {s : _}
-> (optimizer: Optimizer hp)
-> (prior : hp -> GaussianProcess features)
-> (likelihood : hp -> Gaussian [] (S s))
-> (training_data : (Tensor ((S s) :: features) Double, Tensor [S s] Double))
-> GaussianProcess features
fit optimizer prior likelihood training_data =
let objective : hp -> Tensor [] Double
objective hp = log_marginal_likelihood (prior hp) (likelihood hp) training_data

params := optimizer objective

in posterior (prior params) (likelihood params) training_data
predict_latent : ConjugateGPRegression features
-> ProbabilisticModel features {marginal=Gaussian [1]}
predict_latent (MkConjugateGPR mk_gp gp_params _) x =
let (MkGP meanf kernel) = mk_gp gp_params
in MkGaussian (expand 1 $ meanf x) (expand 2 $ kernel x x)

||| Fit the Gaussian process and noise to the specified data.
export
fit : ConjugateGPRegression features
-> (forall n . Tensor [n] Double -> Optimizer $ Tensor [n] Double)
-> {s : _} -> Data {samples=S s} features [1]
-> ConjugateGPRegression features
fit (MkConjugateGPR {p} mk_prior gp_params noise) optimizer {s} training_data =
let objective : Tensor [S p] Double -> Tensor [] Double
objective params = let (noise, prior_params) = split 1 params
(x, y) = training_data
in log_marginal_likelihood (mk_prior prior_params)
(squeeze noise) (x, squeeze y)

(noise, gp_params) := split 1 $ optimizer (concat (expand 0 noise) gp_params) objective

mk_posterior : Tensor [p] Double -> GaussianProcess features
mk_posterior params' = let (x, y) = training_data
in posterior (mk_prior params') (squeeze noise) (x, squeeze y)

in MkConjugateGPR mk_posterior gp_params (squeeze noise)
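
For reference, a sketch of the standard conjugate update that `posterior` and `log_marginal_likelihood` compute, assuming a zero prior mean function and writing σ² for the `noise` argument (Rasmussen & Williams, ch. 2):

```latex
% With L L^T = K(X, X) + \sigma^2 I and \alpha = (L L^T)^{-1} y:
\begin{aligned}
  m_*(x)           &= k(x, X)\,\alpha \\
  k_*(x, x')       &= k(x, x') - k(x, X)\,(K(X, X) + \sigma^2 I)^{-1}\,k(X, x') \\
  \log p(y \mid X) &= -\tfrac{1}{2}\,y^\top \alpha - \sum_i \log L_{ii} - \tfrac{n}{2}\log 2\pi
\end{aligned}
```

The Cholesky solves `l.T \\ (l \\ y)` in the diff compute α without forming the matrix inverse explicitly.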
15 changes: 14 additions & 1 deletion src/Optimize.idr
@@ -31,7 +31,7 @@ public export 0
Optimizer : {default id 0 m : Type -> Type} -> (0 domain : Type) -> Type
Optimizer a = (a -> m $ Tensor [] Double) -> m a

||| Construct a `Optimizer` that implements grid search over a scalar feature space. Grid search
||| Construct an `Optimizer` that implements grid search over a scalar feature space. Grid search
||| approximates the optimum by evaluating the objective over a finite, evenly-spaced grid.
|||
||| @density The density of the grid.
@@ -42,3 +42,16 @@ gridSearch : (density : Tensor [d] Integer) ->
(lower : Tensor [d] Double) ->
(upper : Tensor [d] Double) ->
Optimizer (Tensor [d] Double)

||| The limited-memory BFGS (L-BFGS) optimization tactic, see
|||
||| Nocedal, Jorge, Updating quasi-Newton matrices with limited storage.
||| Math. Comp. 35 (1980), no. 151, 773–782.
|||
||| available at
|||
||| https://www.ams.org/journals/mcom/1980-35-151/S0025-5718-1980-0572855-7/
|||
||| @initial_points The points from which to start optimization.
export
lbfgs : forall n . (initial_points : Tensor [n] Double) -> Optimizer (Tensor [n] Double)
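
A hypothetical usage sketch of `lbfgs` (not part of this commit; `quadratic`, its shape and the starting point are made up, and `Tensor` is assumed imported):

```idris
-- Illustrative only: run L-BFGS on a simple quadratic objective over a
-- two-dimensional domain, starting from the point [1.0, 1.0].
quadratic : Tensor [2] Double -> Tensor [] Double
quadratic x = x @@ x

optimum : Tensor [2] Double
optimum = lbfgs (const [1.0, 1.0]) quadratic
```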
16 changes: 16 additions & 0 deletions src/Tensor.idr
@@ -154,6 +154,22 @@ freeze : (1 _ : Variable shape dtype) -> Tensor shape dtype
export
index : (idx : Fin d) -> Tensor (d :: ds) dtype -> Tensor ds dtype

||| Split a `Tensor` along the first axis at the specified index. For example,
||| `split 1 $ const [[1, 2], [3, 4], [5, 6]]` is equivalent to
||| `(const [[1, 2]], const [[3, 4], [5, 6]])`.
|||
||| @idx The index of the row at which to split the `Tensor`. The row with index `idx` in
||| the input `Tensor` will appear in the result as the first row in the second `Tensor`.
export
split : (idx : Nat) -> Tensor ((idx + rest) :: tl) dtype
-> (Tensor (idx :: tl) dtype, Tensor (rest :: tl) dtype)

||| Concatenate two `Tensor`s along their first axis. For example,
||| `concat (const [[1, 2], [3, 4]]) (const [[5, 6]])` is equivalent to
||| `const [[1, 2], [3, 4], [5, 6]]`.
export
concat : Tensor (n :: tl) dtype -> Tensor (m :: tl) dtype -> Tensor ((n + m) :: tl) dtype

||| Add a dimension of length one at the specified `axis`. The new dimension will be at the
||| specified axis in the new `Tensor` (as opposed to the original `Tensor`). For example,
||| `expand 1 $ const [[1, 2], [3, 4], [5, 6]]` is equivalent to
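As a small sketch of how `expand`, `concat` and `split` are paired in the new `fit` (not part of this commit; the name `packAndUnpack` and the shapes are made up):

```idris
-- Illustrative only: pack a scalar with a parameter vector along the first
-- axis, then recover both pieces, mirroring how fit handles its
-- hyperparameters.
packAndUnpack : Tensor [] Double -> Tensor [2] Double
             -> (Tensor [1] Double, Tensor [2] Double)
packAndUnpack noise params = split 1 (concat (expand 0 noise) params)
```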
37 changes: 23 additions & 14 deletions tutorials/BayesianOptimization.md
@@ -45,6 +45,9 @@ How we produce the new points from the data and models depends on the problem at
import Tensor
import BayesianOptimization
import Model
import Model.GaussianProcess
import Model.Kernel
import Model.MeanFunction
import Distribution
import Optimize
import Util
@@ -57,12 +60,12 @@ historicData = (const [[0.3, 0.4], [0.5, 0.2], [0.3, 0.9]], const [[1.2], [-0.5]
and model that data

```idris
model : ProbabilisticModel [2] {marginal=Gaussian [1]}
model = let optimizer = gridSearch (const [100, 100]) (const [-10, 10]) (const [-10, 10])
mk_prior = \len_and_noise => MkGP zero (rbf $ index 0 len_and_noise)
mk_likelihood = \len_and_noise => MkGaussian (fill 0) (diag $ index 1 len_and_noise)
(qp, obs) = historicData
in ?marginalise $ fit optimizer mk_prior mk_likelihood (qp, squeeze obs)
model : ConjugateGPRegression [2]
model = let mk_gp = \len => MkGP zero (matern52 (const {shape=[]} 1.0) $ squeeze len)
length_scale = const {shape=[1]} [0.5]
noise = const {shape=[]} 0.2
model = MkConjugateGPR mk_gp length_scale noise
in fit model lbfgs historicData
```

then optimize over the marginal mean
@@ -73,7 +76,7 @@ optimizer = let gs = gridSearch (const [100, 100]) (const [0.0, 0.0]) (const [1.
in \f => broadcast . gs $ f . broadcast
newPoint : Tensor [1, 2] Double
newPoint = optimizer $ squeeze . mean . model
newPoint = optimizer $ squeeze . mean . (predict_latent model)
```

This is a particularly simple example of the standard approach of defining an _acquisition function_ over the input space, which quantifies how useful it would be to evaluate the objective at a set of points, then finding the points that optimize this acquisition function. We can visualise this:
@@ -110,12 +113,12 @@ In this case, our acquisition function depends on the model (which in turn depen
In the above example, we constructed the acquisition function from our model, then optimized it, and in doing so, we assumed that we have access to the data and models when we compose the acquisition function with the optimizer. This might not be the case: we may want to compose things before we get the data and model. Using spidr's names, we'd apply an `Optimizer` to an `i -> Acquisition`. We'd normally do this with `map`, a method on the `Functor` interface, but functions, including `i -> o`, don't implement `Functor` (indeed, in Idris, they can't). We can, however, wrap an `i -> o` in the `Morphism i o` type (also called `i ~> o`, with a tilde), which does implement `Functor`. We can `map` an `Optimizer` over an `i ~> Acquisition`, as follows:

```idris
modelMean : ProbabilisticModel [2] {targets=[1]} {marginal=Gaussian [1]} -> Acquisition 1 [2]
modelMean model = squeeze . mean . model
modelMean : ProbabilisticModel [2] {marginal=Gaussian [1]} -> Acquisition 1 [2]
modelMean predict = squeeze . mean . predict
newPoint' : Tensor [1, 2] Double
newPoint' = let acquisition = map optimizer (Mor modelMean) -- `Mor` makes a `Morphism`
in run acquisition model -- `run` turns a `Morphism` into a function
in run acquisition (predict_latent model) -- `run` turns a `Morphism` into a function
```

## Combining empirical values with `Applicative`
@@ -173,7 +176,12 @@ With this new functionality at hand, we'll return to our objective with failure
failureData : Data {samples=4} [2] [1]
failureData = (const [[0.3, 0.4], [0.5, 0.2], [0.3, 0.9], [0.7, 0.1]], const [[0], [0], [0], [1]])
failureModel : ProbabilisticModel [2] {marginal=Gaussian [1]}
failureModel : ConjugateGPRegression [2]
failureModel = let mk_gp = \len => MkGP zero (rbf $ squeeze len)
length_scale = const {shape=[1]} [0.2]
noise = const {shape=[]} 0.1
model = MkConjugateGPR mk_gp length_scale noise
in fit model lbfgs failureData
```

and we'll gather all the data and models in a `record`:
Expand All @@ -189,9 +197,10 @@ Idris generates two methods `objective` and `failure` from this `record`, which

```idris
newPoint'' : Tensor [1, 2] Double
newPoint'' = let eci = objective >>> expectedConstrainedImprovement (const 0.5)
pof = failure >>> probabilityOfFeasibility (const 0.5)
newPoint'' = let eci = objective >>> expectedConstrainedImprovement (const 0.5) {s=_}
pof = failure >>> probabilityOfFeasibility (const 0.5) {s=_}
acquisition = map optimizer (eci <*> pof)
dataAndModel = Label (historicData, model) (failureData, failureModel)
dataAndModel = Label (historicData, predict_latent model)
(failureData, predict_latent failureModel)
in run acquisition dataAndModel
```
