From 73045582483dfcda2f2c520c155f40e425344215 Mon Sep 17 00:00:00 2001
From: Joel Berkeley <16429957+joelberkeley@users.noreply.github.com>
Date: Tue, 17 Aug 2021 15:53:09 +0100
Subject: [PATCH] implement conjugate GP regression as a trainable probabilistic model (#113)

---
 spidr.ipkg                               |   2 +-
 src/BayesianOptimization.idr             |   2 +-
 src/BayesianOptimization/Acquisition.idr |   4 +-
 .../{Util.idr => Morphisms.idr}          |  13 +--
 src/Model.idr                            |  23 ++--
 src/Model/GaussianProcess.idr            | 103 ++++++++++--------
 src/Optimize.idr                         |  15 ++-
 src/Tensor.idr                           |  16 +++
 tutorials/BayesianOptimization.md        |  37 ++++---
 9 files changed, 132 insertions(+), 83 deletions(-)
 rename src/BayesianOptimization/{Util.idr => Morphisms.idr} (67%)

diff --git a/spidr.ipkg b/spidr.ipkg
index 5403977f8..4cf4c5482 100644
--- a/spidr.ipkg
+++ b/spidr.ipkg
@@ -6,7 +6,7 @@ sourcedir = "src"
modules =
    BayesianOptimization,
    BayesianOptimization.Acquisition,
-   BayesianOptimization.Util,
+   BayesianOptimization.Morphisms,

    Distribution,

diff --git a/src/BayesianOptimization.idr b/src/BayesianOptimization.idr
index a43ee46a1..64dcb7d37 100644
--- a/src/BayesianOptimization.idr
+++ b/src/BayesianOptimization.idr
@@ -21,4 +21,4 @@ limitations under the License.
module BayesianOptimization

import public BayesianOptimization.Acquisition as BayesianOptimization
-import public BayesianOptimization.Util as BayesianOptimization
+import public BayesianOptimization.Morphisms as BayesianOptimization
diff --git a/src/BayesianOptimization/Acquisition.idr b/src/BayesianOptimization/Acquisition.idr
index 9ee43c82c..0959a1a46 100644
--- a/src/BayesianOptimization/Acquisition.idr
+++ b/src/BayesianOptimization/Acquisition.idr
@@ -20,7 +20,7 @@ import Distribution
import Tensor
import Model
import Optimize
-import BayesianOptimization.Util
+import BayesianOptimization.Morphisms
import Util

||| An `Empiric` constructs values from historic data and the model over that data.
|||
||| @out The type of the value constructed by the `Empiric`.
public export 0
Empiric : Distribution targets marginal => (0 features : Shape) -> (0 out : Type) -> Type
-Empiric features out = forall s .
+Empiric features out = {s : _} ->
  Data {samples=S s} features targets -> ProbabilisticModel features {marginal} -> out

||| An `Acquisition` function quantifies how useful it would be to query the objective at a given
diff --git a/src/BayesianOptimization/Util.idr b/src/BayesianOptimization/Morphisms.idr
similarity index 67%
rename from src/BayesianOptimization/Util.idr
rename to src/BayesianOptimization/Morphisms.idr
index 549facae4..ae79a1522 100644
--- a/src/BayesianOptimization/Util.idr
+++ b/src/BayesianOptimization/Morphisms.idr
@@ -13,23 +13,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--}
-module BayesianOptimization.Util
+module BayesianOptimization.Morphisms

import public Data.Morphisms

import Distribution
import Tensor
import Model

-||| Objective query points and either corresponding objective values or metadata.
-|||
-||| @samples The number of points in each of the feature and target data.
-||| @features The shape of the feature domain.
-||| @targets The shape of the target domain.
-public export 0
-Data : {0 samples : Nat} -> (0 features : Shape) -> (0 targets : Shape) -> Type
-Data features targets =
-    (Tensor (samples :: features) Double, Tensor (samples :: targets) Double)
-
infix 9 >>>

||| Compose two functions that each use two values and wrap them in a morphism. This is a
@@ -39,6 +29,7 @@ export
(>>>) : (i -> (a, b)) -> (a -> b -> o) -> i ~> o
f >>> g = Mor (uncurry g . f)

+||| Extract the underlying function from a `Morphism`.
export
run : (i ~> o) -> i -> o
run = applyMor
diff --git a/src/Model.idr b/src/Model.idr
index 206abb102..b4a1e30a7 100644
--- a/src/Model.idr
+++ b/src/Model.idr
@@ -16,17 +16,26 @@ limitations under the License.
||| This module contains definitions and utilities for probabilistic models.
module Model

-import public Model.GaussianProcess as Model
-import public Model.Kernel as Model
-import public Model.MeanFunction as Model
-
import Distribution
import Tensor

-||| A `ProbabilisticModel` is a mapping from a feature space to a probability distribution over
-||| a target space.
+||| Observed pairs of data points from feature and target domains. Data sets such as this are
+||| commonly used in supervised learning settings.
+|||
+||| @samples The number of points in each of the feature and target data.
+||| @features The shape of the feature domain.
+||| @targets The shape of the target domain.
+public export 0
+Data : {0 samples : Nat} -> (0 features : Shape) -> (0 targets : Shape) -> Type
+Data features targets =
+    (Tensor (Vect.(::) samples features) Double, Tensor (Vect.(::) samples targets) Double)
+
+||| A `ProbabilisticModel` is a mapping from a feature domain to a probability distribution over
+||| a target domain.
|||
||| @features The shape of the feature domain.
+||| @targets The shape of the target domain.
+||| @marginal The type of multivariate marginal distribution over the target domain.
public export 0
ProbabilisticModel : Distribution targets marginal => (0 features : Shape) -> Type
-ProbabilisticModel features = forall n . Tensor (Vect.(::) n features) Double -> marginal n
+ProbabilisticModel features = {n : _} -> Tensor (Vect.(::) (S n) features) Double -> marginal (S n)
diff --git a/src/Model/GaussianProcess.idr b/src/Model/GaussianProcess.idr
index 7dd3be692..0a7e95e2f 100644
--- a/src/Model/GaussianProcess.idr
+++ b/src/Model/GaussianProcess.idr
@@ -13,12 +13,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--}
-||| This module contains the `GaussianProcess` type for defining a Gaussian process, along with
-||| functionality for training and inference.
+||| This module contains functionality for Gaussian process inference.
module Model.GaussianProcess

-import Data.Nat
import Tensor
+import Model
import Model.Kernel
import Model.MeanFunction
import Optimize

data GaussianProcess : (0 features : Shape) -> Type where
  ||| Construct a `GaussianProcess` as a pair of mean function and kernel.
  MkGP : MeanFunction features -> Kernel features -> GaussianProcess features

-||| The marginal distribution of the Gaussian process at the specified feature values.
-|||
-||| @at The feature values at which to evaluate the marginal distribution.
-export
-marginalise : {s : _}
+posterior :
+  GaussianProcess features
+  -> Tensor [] Double
+  -> forall s .
+     (Tensor ((S s) :: features) Double, Tensor [S s] Double)
  -> GaussianProcess features
-  -> Tensor ((S s) :: features) Double
-  -> Gaussian [] (S s)
-marginalise (MkGP mean_function kernel) x = MkGaussian (mean_function x) (kernel x x)
-
-posterior : forall s .
-  (prior : GaussianProcess features)
-  -> (likelihood : Gaussian [] (S s))
-  -> (training_data : (Tensor ((S s) :: features) Double, Tensor [S s] Double))
-  -> GaussianProcess features
-posterior (MkGP prior_meanf prior_kernel) (MkGaussian _ cov) (x_train, y_train) =
-  let l = cholesky (prior_kernel x_train x_train + cov)
+posterior (MkGP prior_meanf prior_kernel) noise (x_train, y_train) =
+  let l = cholesky (prior_kernel x_train x_train + diag {n=S s} noise)
      alpha = l.T \\ (l \\ y_train)

      posterior_meanf : MeanFunction features
@@ -63,38 +52,60 @@ posterior (MkGP prior_meanf prior_kernel) (MkGaussian _ cov) (x_train, y_train)

   in MkGP posterior_meanf posterior_kernel

-log_marginal_likelihood : {s : _}
-  -> GaussianProcess features
-  -> Gaussian [] (S s)
-  -> (Tensor ((S s) :: features) Double, Tensor [S s] Double)
-  -> Tensor [] Double
-log_marginal_likelihood (MkGP _ kernel) (MkGaussian _ cov) (x, y) =
-  let l = cholesky (kernel x x + cov)
+log_marginal_likelihood :
+  GaussianProcess features
+  -> Tensor [] Double
+  -> {s : _} -> (Tensor ((S s) :: features) Double, Tensor [S s] Double)
+  -> Tensor [] Double
+log_marginal_likelihood (MkGP _ kernel) noise (x, y) =
+  let l = cholesky (kernel x x + diag {n=S s} noise)
      alpha = l.T \\ (l \\ y)
      n = const {shape=[]} $ cast (S s)
      log2pi = log $ const {shape=[]} $ 2.0 * PI
      two = const {shape=[]} 2
   in - y @@ alpha / two - trace (log l) - n * log2pi / two

-||| Find the hyperparameters which maximize the marginal likelihood for the specified structures of
-||| prior and likelihood, then return the posterior for that given prior, likelihood and
-||| hyperparameters.
-|||
-||| @optimizer The optimization tactic.
-||| @prior Constructs the prior from *all* the hyperparameters.
-||| @likelihood Constructs the likelihood from *all* the hyperparameters.
-||| @training_data The observed data.
+||| A trainable model implementing vanilla Gaussian process regression. That is, regression with a
+||| Gaussian process as conjugate prior for homoscedastic Gaussian likelihoods. See the following
+||| for details:
+|||
+||| Gaussian Processes for Machine Learning
+||| Carl Edward Rasmussen and Christopher K. I. Williams
+||| The MIT Press, 2006. ISBN 0-262-18253-X.
+|||
+||| or
+|||
+||| Pattern Recognition and Machine Learning, Christopher M. Bishop
+public export
+data ConjugateGPRegression : (0 features : Shape) -> Type where
+  MkConjugateGPR : (Tensor [p] Double -> GaussianProcess features) -> Tensor [p] Double
+    -> Tensor [] Double -> ConjugateGPRegression features
+
+||| Construct a probabilistic model for the latent target values.
export
-fit : {s : _}
-  -> (optimizer : Optimizer hp)
-  -> (prior : hp -> GaussianProcess features)
-  -> (likelihood : hp -> Gaussian [] (S s))
-  -> (training_data : (Tensor ((S s) :: features) Double, Tensor [S s] Double))
-  -> GaussianProcess features
-fit optimizer prior likelihood training_data =
-  let objective : hp -> Tensor [] Double
-      objective hp = log_marginal_likelihood (prior hp) (likelihood hp) training_data
-
-      params := optimizer objective
-
-  in posterior (prior params) (likelihood params) training_data
+predict_latent : ConjugateGPRegression features
+  -> ProbabilisticModel features {marginal=Gaussian [1]}
+predict_latent (MkConjugateGPR mk_gp gp_params _) x =
+  let (MkGP meanf kernel) = mk_gp gp_params
+  in MkGaussian (expand 1 $ meanf x) (expand 2 $ kernel x x)
+
+||| Fit the Gaussian process and noise to the specified data.
+export
+fit : ConjugateGPRegression features
+  -> (forall n . Tensor [n] Double -> Optimizer $ Tensor [n] Double)
+  -> {s : _} -> Data {samples=S s} features [1]
+  -> ConjugateGPRegression features
+fit (MkConjugateGPR {p} mk_prior gp_params noise) optimizer {s} training_data =
+  let objective : Tensor [S p] Double -> Tensor [] Double
+      objective params = let (noise, prior_params) = split 1 params
+                             (x, y) = training_data
+                          in log_marginal_likelihood (mk_prior prior_params)
+                                                     (squeeze noise) (x, squeeze y)
+
+      (noise, gp_params) := split 1 $ optimizer (concat (expand 0 noise) gp_params) objective
+
+      mk_posterior : Tensor [p] Double -> GaussianProcess features
+      mk_posterior params' = let (x, y) = training_data
+                              in posterior (mk_prior params') (squeeze noise) (x, squeeze y)
+
+  in MkConjugateGPR mk_posterior gp_params (squeeze noise)
diff --git a/src/Optimize.idr b/src/Optimize.idr
index 4fa502b51..55caf6cdc 100644
--- a/src/Optimize.idr
+++ b/src/Optimize.idr
@@ -31,7 +31,7 @@ public export 0
Optimizer : {default id 0 m : Type -> Type} -> (0 domain : Type) -> Type
Optimizer a = (a -> m $ Tensor [] Double) -> m a

-||| Construct a `Optimizer` that implements grid search over a scalar feature space. Grid search
+||| Construct an `Optimizer` that implements grid search over a scalar feature space. Grid search
||| approximates the optimum by evaluating the objective over a finite, evenly-spaced grid.
|||
||| @density The density of the grid.
export
gridSearch : (density : Tensor [d] Integer)
  -> (lower : Tensor [d] Double)
  -> (upper : Tensor [d] Double)
  -> Optimizer (Tensor [d] Double)
+
+||| The limited-memory BFGS (L-BFGS) optimization tactic, see
+|||
+||| Nocedal, Jorge, Updating quasi-Newton matrices with limited storage.
+||| Math. Comp. 35 (1980), no. 151, 773–782.
+|||
+||| available at
+|||
+||| https://www.ams.org/journals/mcom/1980-35-151/S0025-5718-1980-0572855-7/
+|||
+||| @initial_points The points from which to start optimization.
+export
+lbfgs : forall n . (initial_points : Tensor [n] Double) -> Optimizer (Tensor [n] Double)
diff --git a/src/Tensor.idr b/src/Tensor.idr
index 4b6b21cea..a581e8f5e 100644
--- a/src/Tensor.idr
+++ b/src/Tensor.idr
@@ -154,6 +154,22 @@ freeze : (1 _ : Variable shape dtype) -> Tensor shape dtype
export
index : (idx : Fin d) -> Tensor (d :: ds) dtype -> Tensor ds dtype

+||| Split a `Tensor` along the first axis at the specified index. For example,
+||| `split 1 $ const [[1, 2], [3, 4], [5, 6]]` is equivalent to
+||| `(const [[1, 2]], const [[3, 4], [5, 6]])`.
+|||
+||| @idx The index of the row at which to split the `Tensor`. The row with index `idx` in
+|||   the input `Tensor` will appear in the result as the first row in the second `Tensor`.
+export
+split : (idx : Nat) -> Tensor ((idx + rest) :: tl) dtype
+  -> (Tensor (idx :: tl) dtype, Tensor (rest :: tl) dtype)
+
+||| Concatenate two `Tensor`s along their first axis. For example,
+||| `concat (const [[1, 2], [3, 4]]) (const [[5, 6]])` is equivalent to
+||| `const [[1, 2], [3, 4], [5, 6]]`.
+export
+concat : Tensor (n :: tl) dtype -> Tensor (m :: tl) dtype -> Tensor ((n + m) :: tl) dtype
+
||| Add a dimension of length one at the specified `axis`. The new dimension will be at the
||| specified axis in the new `Tensor` (as opposed to the original `Tensor`). For example,
||| `expand 1 $ const [[1, 2], [3, 4], [5, 6]]` is equivalent to
diff --git a/tutorials/BayesianOptimization.md b/tutorials/BayesianOptimization.md
index 5e7b2706e..d8c8edfa7 100644
--- a/tutorials/BayesianOptimization.md
+++ b/tutorials/BayesianOptimization.md
@@ -45,6 +45,9 @@ How we produce the new points from the data and models depends on the problem at
import Tensor
import BayesianOptimization
import Model
+import Model.GaussianProcess
+import Model.Kernel
+import Model.MeanFunction
import Distribution
import Optimize
import Util
```

```idris
historicData : Data {samples=3} [2] [1]
historicData = (const [[0.3, 0.4], [0.5, 0.2], [0.3, 0.9]], const [[1.2], [-0.5], [0.7]])
```

and model that data

```idris
-model : ProbabilisticModel [2] {marginal=Gaussian [1]}
-model = let optimizer = gridSearch (const [100, 100]) (const [-10, 10]) (const [-10, 10])
-            mk_prior = \len_and_noise => MkGP zero (rbf $ index 0 len_and_noise)
-            mk_likelihood = \len_and_noise => MkGaussian (fill 0) (diag $ index 1 len_and_noise)
-            (qp, obs) = historicData
-         in ?marginalise $ fit optimizer mk_prior mk_likelihood (qp, squeeze obs)
+model : ConjugateGPRegression [2]
+model = let mk_gp = \len => MkGP zero (matern52 (const {shape=[]} 1.0) $ squeeze len)
+            length_scale = const {shape=[1]} [0.5]
+            noise = const {shape=[]} 0.2
+            model = MkConjugateGPR mk_gp length_scale noise
+         in fit model lbfgs historicData
```
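The fitted `model` packages a posterior Gaussian process with its tuned hyperparameters, and we can query it like any other `ProbabilisticModel` via `predict_latent`. For instance, we could recover the posterior marginal at the three query points we started with (`latentAtData` is just an illustrative name, not something spidr defines):

```idris
-- illustrative only: the posterior marginal over the targets at the observed query points
latentAtData : Gaussian [1] 3
latentAtData = predict_latent model (fst historicData)
```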
then optimize over the marginal mean

```idris
optimizer : Optimizer (Tensor [1, 2] Double)
optimizer = let gs = gridSearch (const [100, 100]) (const [0.0, 0.0]) (const [1.0, 1.0])
             in \f => broadcast . gs $ f . broadcast

newPoint : Tensor [1, 2] Double
-newPoint = optimizer $ squeeze . mean . model
+newPoint = optimizer $ squeeze . mean . (predict_latent model)
```

This is a particularly simple example of the standard approach of defining an _acquisition function_ over the input space, which quantifies how useful it would be to evaluate the objective at a set of points, then finding the points that optimize this acquisition function. We can visualise this:

In this case, our acquisition function depends on the model (which in turn depends on the data),

In the above example, we constructed the acquisition function from our model, then optimized it, and in doing so, we assumed that we have access to the data and models when we compose the acquisition function with the optimizer. This might not be the case: we may want to compose things before we get the data and model. Using spidr's names, we'd apply an `Optimizer` to an `i -> Acquisition`. We'd normally do this with `map`, a method on the `Functor` interface, but functions, including `i -> o`, don't implement `Functor` (indeed, in Idris, they can't). We can, however, wrap an `i -> o` in the `Morphism i o` type (also called `i ~> o` with a tilde), which does implement `Functor`. We can `map` an `Optimizer` over an `i ~> Acquisition`, as follows:

```idris
-modelMean : ProbabilisticModel [2] {targets=[1]} {marginal=Gaussian [1]} -> Acquisition 1 [2]
-modelMean model = squeeze . mean . model
+modelMean : ProbabilisticModel [2] {marginal=Gaussian [1]} -> Acquisition 1 [2]
+modelMean predict = squeeze . mean . predict

newPoint' : Tensor [1, 2] Double
newPoint' = let acquisition = map optimizer (Mor modelMean) -- `Mor` makes a `Morphism`
-            in run acquisition model -- `run` turns a `Morphism` into a function
+            in run acquisition (predict_latent model) -- `run` turns a `Morphism` into a function
```
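Nothing magical is happening here: for morphisms, `map f (Mor g)` is simply `Mor (f . g)`, so once we do have a model to hand, running the mapped optimizer is the same as composing the two functions directly. As a quick sanity check (`newPointDirect` is our own illustrative name):

```idris
-- illustrative only: equivalent to `run (map optimizer (Mor modelMean)) (predict_latent model)`
newPointDirect : Tensor [1, 2] Double
newPointDirect = optimizer (modelMean (predict_latent model))
```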
## Combining empirical values with `Applicative`

With this new functionality at hand, we'll return to our objective with failure regions:

```idris
failureData : Data {samples=4} [2] [1]
failureData = (const [[0.3, 0.4], [0.5, 0.2], [0.3, 0.9], [0.7, 0.1]], const [[0], [0], [0], [1]])

-failureModel : ProbabilisticModel [2] {marginal=Gaussian [1]}
+failureModel : ConjugateGPRegression [2]
+failureModel = let mk_gp = \len => MkGP zero (rbf $ squeeze len)
+                   length_scale = const {shape=[1]} [0.2]
+                   noise = const {shape=[]} 0.1
+                   model = MkConjugateGPR mk_gp length_scale noise
+                in fit model lbfgs failureData
```

and we'll gather all the data and models in a `record`:

Idris generates two methods `objective` and `failure` from this `record`, which we can use to construct our acquisition function:

```idris
newPoint'' : Tensor [1, 2] Double
newPoint'' = let eci = objective >>> expectedConstrainedImprovement (const 0.5) {s=_}
                 pof = failure >>> probabilityOfFeasibility (const 0.5) {s=_}
                 acquisition = map optimizer (eci <*> pof)
-                dataAndModel = Label (historicData, model) (failureData, failureModel)
+                dataAndModel = Label (historicData, predict_latent model)
+                                     (failureData, predict_latent failureModel)
              in run acquisition dataAndModel
```
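Everything above composes into a single Bayesian optimization step: fit the models, build the acquisition function, then optimize it. Sketching that step for the unconstrained setting (`bayesOptStep` is our own illustrative name, not part of spidr's API):

```idris
-- illustrative sketch, not defined by spidr: one fit-then-optimize step
bayesOptStep : {s : _} -> Data {samples=S s} [2] [1]
  -> ConjugateGPRegression [2] -> Tensor [1, 2] Double
bayesOptStep dataset prior = let fitted = fit prior lbfgs dataset
                              in optimizer $ squeeze . mean . (predict_latent fitted)
```

In a full loop, we would evaluate the objective at the returned point, append the observation to the data set, and repeat.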