Merge branch 'wsvd_na' into dev

# Conflicts: # R/tfb-fpc.R # man/tfb_fpc.Rd # tests/testthat/test-rebase.R
tidyfun · Mar 11, 2024 · 188b9a9 · 188b9a9
2 parents 2f22ec9 + 3fde9b2
commit 188b9a9
Show file tree

Hide file tree

Showing 13 changed files with 418 additions and 75 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -62,6 +62,7 @@ Collate:
     'rebase.R'
     'rng.R'
     'smooth.R'
+    'soft-impute-svd.R'
     'summarize.R'
     'tf-package.R'
     'tfb-fpc.R'

diff --git a/R/bibentries.R b/R/bibentries.R
@@ -19,6 +19,27 @@ bibentries <- c(
     year = "2009",
     publisher = "Taylor & Francis"
   ),
+  mazumder2010 = bibentry("article",
+    title= "Spectral regularization algorithms for learning large incomplete matrices",
+    author = "Mazumder, Rahul and Hastie, Trevor and Tibshirani, Robert",
+    journal = "The Journal of Machine Learning Research",
+    volume = "11",
+    pages = "2287-2322",
+    year = "2010"
+  ),
+  softimpute = bibentry("manual",
+    title = "\\code{softImpute}: Matrix Completion via Iterative Soft-Thresholded SVD",
+    author = "Trevor Hastie and Rahul Mazumder",
+    year = "2021",
+    note = "R package version 1.4-1",
+    url = "https://CRAN.R-project.org/package=softImpute"
+  ),
+  meng2023mogsa = bibentry("manual",
+    title = "\\code{mogsa}: Multiple omics data integrative clustering and gene set analysis",
+    author = "Chen Meng",
+    year = "2023",
+    url = "https://bioconductor.org/packages/mogsa"
+  ),
   swihart2010lasagna = bibentry("article",
     title = "Lasagna plots: a saucy alternative to spaghetti plots",
     author = "Swihart, Bruce J and Caffo, Brian and James, Bryan D and Strand, Matthew and Schwartz, Brian S and Punjabi, Naresh M",

diff --git a/R/rebase.R b/R/rebase.R
@@ -19,6 +19,8 @@
 tf_rebase <- function(object, basis_from, arg = tf_arg(basis_from), ...) {
   all.equal(tf_domain(object), tf_domain(basis_from)) |>
     assert_true()
+  assert_arg(arg, object, check_unique = FALSE)
+  assert_arg(arg, basis_from, check_unique = FALSE)
   UseMethod("tf_rebase", object)
 }
 
@@ -59,6 +61,9 @@ tf_rebase.tfd.tfb_spline <-  function(object, basis_from, arg = tf_arg(basis_fro
 }
 #'@export
 tf_rebase.tfd.tfb_fpc <-  function(object, basis_from, arg = tf_arg(basis_from), ...) {
+  if (is_irreg(object)) {
+    warning("")
+  }
   data <- tf_interpolate(object, arg = arg) |> as.data.frame(unnest = TRUE)
   new_tfb_fpc(data = data, basis_from = basis_from,
               domain = tf_domain(basis_from),

diff --git a/R/smooth.R b/R/smooth.R
@@ -20,6 +20,7 @@
 #' @param x a `tf` object containing functional data
 #' @param method one of "lowess" (see [stats::lowess()]), "rollmean",
 #'   "rollmedian" (see [zoo::rollmean()]) or "savgol" (see [pracma::savgol()])
+#' @param verbose give lots of diagnostic messages? Defaults to TRUE
 #' @param ... arguments for the respective `method`. See Details.
 #' @returns a smoothed version of the input. For some methods/options, the
 #'   smoothed functions may be shorter than the original ones (at both ends).
@@ -31,8 +32,8 @@ tf_smooth <- function(x, ...) {
 
 #' @export
 #' @rdname tf_smooth
-tf_smooth.tfb <- function(x, ...) {
-  warning(
+tf_smooth.tfb <- function(x, verbose = TRUE, ...) {
+  if (verbose) warning(
     "you called tf_smooth on a tfb object, not on a tfd object -- ",
     "just use a smaller basis or stronger penalization.\n",
     "Returning unchanged tfb object.",
@@ -70,12 +71,12 @@ tf_smooth.tfb <- function(x, ...) {
 #' lines(f_sg, col = "red")
 tf_smooth.tfd <- function(x,
                           method = c("lowess", "rollmean", "rollmedian", "savgol"),
-                          ...) {
+                          verbose = TRUE, ...) {
   method <- match.arg(method)
   smoother <- get(method, mode = "function")
   dots <- list(...)
   if (method %in% c("savgol", "rollmean", "rollmedian")) {
-    if (!is_equidist(x)) {
+    if (verbose & !is_equidist(x)) {
       warning(
         "non-equidistant arg-values in ", sQuote(deparse(substitute(x))),
         " ignored by ", method, ".",
@@ -86,18 +87,18 @@ tf_smooth.tfd <- function(x,
       if (is.null(dots$k)) {
         dots$k <- ceiling(0.05 * min(tf_count(x)))
         dots$k <- dots$k + !(dots$k %% 2) # make uneven
-        message("using k = ", dots$k, " observations for rolling data window.")
+        if (verbose) message("using k = ", dots$k, " observations for rolling data window.")
       }
       if (is.null(dots$fill)) {
-        message("setting fill = 'extend' for start/end values.")
+        if (verbose) message("setting fill = 'extend' for start/end values.")
         dots$fill <- "extend"
       }
     }
     if (method == "savgol") {
       if (is.null(dots$fl)) {
         dots$fl <- ceiling(0.15 * min(tf_count(x)))
         dots$fl <- dots$fl + !(dots$fl %% 2) # make uneven
-        message("using fl = ", dots$fl, " observations for rolling data window.")
+        if (verbose) message("using fl = ", dots$fl, " observations for rolling data window.")
       }
     }
 
@@ -109,7 +110,7 @@ tf_smooth.tfd <- function(x,
   if (method == "lowess") {
     if (is.null(dots$f)) {
       dots$f <- 0.15
-      message("using f = ", dots$f, " as smoother span for lowess")
+      if (verbose) message("using f = ", dots$f, " as smoother span for lowess")
     }
     smoothed <- map(
       tf_evaluations(x), \(x) do.call(smoother, append(list(x), dots))$y

diff --git a/R/soft-impute-svd.R b/R/soft-impute-svd.R
@@ -0,0 +1,59 @@
+# copied from softImpute::Frob (v 1.4-1) under GPL-2
+# original authors: Trevor Hastie <hastie@stanford.edu> and Rahul Mazumder <rahul.mazumder@gmail.com>
+frob <- function(Uold, Dsqold, Vold, U, Dsq, V) {
+  denom = sum(Dsqold^2)
+  utu = Dsq * (t(U) %*% Uold)
+  vtv = Dsqold * (t(Vold) %*% V)
+  uvprod = sum(diag(utu %*% vtv))
+  num = denom + sum(Dsq^2) - 2 * uvprod
+  num/max(denom, 1e-09)
+}
+
+# code slightly adapted and shortened from softImpute::simpute.svd.R (v 1.4-1) under GPL-2
+# original authors: Trevor Hastie <hastie@stanford.edu> and Rahul Mazumder <rahul.mazumder@gmail.com>
+# adaptations:
+#  - no warm starts, no returned call, no attributes
+# inputs: x matrix of weighted (w/ integration weights) & centered (!) function evaluations, with NAs
+# output: (regularized) SVD of x
+simpute_svd <- function(x, J = min(dim(x)) - 1, thresh = 1e-05, lambda = 0, maxit = 100,
+                        trace.it = FALSE, ...) {
+  n <- dim(x)
+  m <- n[2]
+  n <- n[1]
+  xnas <- is.na(x)
+
+  nz <- m * n - sum(xnas)
+  xfill <- x
+  xfill[xnas] <- 0
+
+  svd.xfill <- svd(xfill)
+  ratio <- 1
+  iter <- 0
+  while ((ratio > thresh) & (iter < maxit)) {
+    iter <- iter + 1
+    svd.old <- svd.xfill
+    d <- svd.xfill$d
+    d <- pmax(d - lambda, 0)
+    xhat <- svd.xfill$u[, seq(J)] %*% (d[seq(J)] * t(svd.xfill$v[, seq(J)]))
+    xfill[xnas] <- xhat[xnas]
+    svd.xfill <- svd(xfill)
+    ratio <- frob(
+      svd.old$u[, seq(J)], d[seq(J)], svd.old$v[, seq(J)],
+      svd.xfill$u[, seq(J)], pmax(svd.xfill$d - lambda, 0)[seq(J)], svd.xfill$v[, seq(J)]
+    )
+    if (trace.it) {
+      obj <- (.5 * sum((xfill - xhat)[!xnas]^2) + lambda * sum(d)) / nz
+      cat(iter, ":", "obj", format(round(obj, 5)), "ratio", ratio, "\n")
+    }
+  }
+  d <- pmax(svd.xfill$d[seq(J)] - lambda, 0)
+  J <- min(sum(d > 0) + 1, J)
+  svd.xfill <- list(u = svd.xfill$u[, seq(J)], d = d[seq(J)], v = svd.xfill$v[, seq(J)])
+  if (iter == maxit) {
+    warning(paste("Incomplete-data-SVD convergence not achieved by", maxit,
+                  "iterations"), call. = FALSE)
+  }
+  svd.xfill
+}
+
+
diff --git a/R/tfb-fpc-utils.R b/R/tfb-fpc-utils.R
@@ -1,19 +1,43 @@
-#' Eigenfunctions via weighted SVD
+#' Eigenfunctions via weighted, regularized SVD
 #'
 #' Compute (truncated) orthonormal eigenfunctions and scores
-#' for data potentially on a non-equidistant grid.
+#' for (partially missing) data on a common (potentially non-equidistant) grid.
+#'
+#' Performs a weighted SVD with trapezoidal quadrature weights s.t. returned
+#' vectors represent (evaluations of)
+#' orthonormal eigen*functions* \eqn{\phi_j(t)}, not eigen*vectors*
+#' \eqn{\phi_j = (\phi_j(t_1), \dots, \phi_j(t_n))}, specifically:\cr
+#' \eqn{\int_T \phi_j(t)^2 dt \approx \sum_i \Delta_i \phi_j(t_i)^2 = 1}
+#' given quadrature weights \eqn{\Delta_i}, not
+#' \eqn{\phi_j'\phi_j = \sum_i \phi_j(t_i)^2 = 1};\cr
+#' \eqn{\int_T \phi_j(t) \phi_k(t) dt = 0} not
+#' \eqn{\phi_j'\phi_k = \sum_i \phi_j(t_i)\phi_k(t_i) = 0},
+#' c.f. `mogsa::wsvd()`.\cr
+#' For incomplete data, this uses an adaptation of `softImpute::softImpute()`,
+#' see references. Note that will not work well for data on a common grid if more
+#' than a few percent of data points are missing, and it breaks down completely
+#' for truly irregular data with no/few common timepoints, even if observed very
+#' densely. For such data, either re-evaluate on a common grid first or use more
+#' advanced FPCA approaches like `refund::fpca_sc()`,
+#' see last example for [tfb_fpc()]
+#'
 #' @param data numeric matrix of function evaluations
 #'   (each row is one curve, no NAs)
 #' @param arg numeric vector of argument values
 #' @param pve percentage of variance explained
 #' @returns a list with entries
-#' - `mu`` estimated mean function (numeric vector)
-#' - `efunctions`` estimated FPCs (numeric matrix, columns represent FPCs)
+#' - `mu` estimated mean function (numeric vector)
+#' - `efunctions` estimated FPCs (numeric matrix, columns represent FPCs)
 #' - `scores` estimated FPC scores (one row per observed curve)
 #' - `npc` how many FPCs were returned for the given `pve` (integer)
-#' @references code adapted from / inspired by `wsvd()` function of Bioconductor
-#'   package `mogsa` by Cheng Meng.
-#' @author Cheng Meng, Fabian Scheipl
+#' - `scoring_function` a function that returns FPC scores for new data
+#'    and given eigenfunctions, see `tf:::.fpc_wsvd_scores` for an example.
+#' @author Trevor Hastie, Rahul Mazumder, Cheng Meng, Fabian Scheipl
+#' @references code adapted from / inspired by `mogsa::wsvd()` by Cheng Meng
+#'   and `softImpute::softImpute()` by Trevor Hastie and Rahul Mazumder.\cr
+#' `r format_bib("meng2023mogsa")`\cr
+#' `r format_bib("mazumder2010")`\cr
+#' `r format_bib("softimpute")`
 #' @family tfb-class
 #' @family tfb_fpc-class
 fpc_wsvd <- function(data, arg, pve = 0.995) {
@@ -22,24 +46,42 @@ fpc_wsvd <- function(data, arg, pve = 0.995) {
 
 #' @rdname fpc_wsvd
 #' @importFrom utils head tail
+#' @importFrom stats lowess
 #' @export
 fpc_wsvd.matrix <- function(data, arg, pve = 0.995) {
-  assert_matrix(data, mode = "numeric", any.missing = FALSE,
-                min.cols = 2, min.rows = 1)
+  assert_matrix(data, mode = "numeric", min.cols = 2, min.rows = 1)
   assert_numeric(arg, any.missing = FALSE, sorted = TRUE, len = ncol(data))
   assert_number(pve, lower = 0, upper = 1)
 
   delta <- c(0, diff(arg))
   # trapezoid integration weights:
   weights <- 0.5 * c(delta[-1] + head(delta, -1), tail(delta, 1))
-  mean <- colMeans(data)
+  mean <- colMeans(data, na.rm = TRUE)
+
   data_wc <- t((t(data) - mean) * sqrt(weights))
 
-  pc <- svd(data_wc, nu = 0, nv = min(dim(data)))
+  nas <- is.na(data)
+  pc <- if (!any(nas)) {
+    svd(data_wc, nu = 0, nv = min(dim(data)))
+  } else {
+    message("Using softImpute SVD on ", round(mean(nas)*100, 1), "% missing data")
+    if (pve + mean(nas) > 1) {
+      warning("High <pve> with many missings likely to yield bad FPC estimates.",
+              call. = FALSE)
+    }
+    simpute_svd(data_wc)
+  }
+
   pve_observed <- cumsum(pc$d^2) / sum(pc$d^2)
   use <- min(which(pve_observed >= pve))
 
   efunctions <- pc$v[, 1:use] / sqrt(weights)
+
+  if (any(nas)) {
+    # slightly smooth efunctions from incomplete data to reduce artefacts
+    efunctions <- apply(efunctions, 2,
+                        \(ef) stats::lowess(x = arg, y = ef,  f = .15)$y)
+  }
   evalues <- (pc$d[1:use])^2
   scores <- .fpc_wsvd_scores(data, efunctions, mean, weights) #!!
 
@@ -51,9 +93,14 @@ fpc_wsvd.matrix <- function(data, arg, pve = 0.995) {
   )
 }
 
-#extract for reuse in tf_rebase
-.fpc_wsvd_scores <- function(data_matrix, efunctions,  mean, weights) {
-  data_wc <- t((t(data_matrix) - mean) * sqrt(weights))
+# extract scoring function for reuse in tf_rebase
+# performs simple (weighted) LS fit of data onto the eigenfunctions.
+.fpc_wsvd_scores <- function(data_matrix, efunctions, mean, weights) {
+  w_mat <- matrix(weights, ncol = length(weights), nrow = nrow(data_matrix),
+                  byrow = TRUE)
+  w_mat[is.na(data_matrix)] <- 0
+  data_matrix[is.na(data_matrix)] <- 0
+  data_wc <- t((t(data_matrix) - mean) * sqrt(t(w_mat)))
   t(qr.coef(qr(efunctions), t(data_wc) / sqrt(weights)))
 }
 

diff --git a/R/tfb-fpc.R b/R/tfb-fpc.R
@@ -14,7 +14,7 @@ new_tfb_fpc <- function(data, domain = NULL,
     stop("Can't specify both method *and* basis_from for new_tfb_fpc",
          call. = FALSE)
   }
-  arg <- sort(unique(data$arg))
+  arg <- mgcv::uniquecombs(data$arg, ordered = TRUE) |> unlist()
 
   domain <- domain %||% range(arg)
   if (!isTRUE(all.equal(domain, range(arg)))) {
@@ -74,20 +74,24 @@ new_tfb_fpc <- function(data, domain = NULL,
 #'
 #' These functions perform a (functional) principal component analysis (FPCA) of
 #' the input data and return an `tfb_fpc` `tf`-object that uses the empirical
-#' eigenfunctions as basis functions for representing the data. By default, a
-#' simple, not smoothed, truncated weighted SVD of the functions is used to
-#' compute those ("`method = fpc_wsvd`"). Note that this is suitable only for
-#' regular data all observed on the same (not necessarily equidistant) grid. See
-#' Details / Example for possible alternatives and extensions. \cr
+#' eigenfunctions as basis functions for representing the data. The default
+#' ("`method = fpc_wsvd`") uses a (truncated) weighted SVD for complete
+#' data on a common grid and a nuclear-norm regularized (truncated) weighted SVD
+#' for partially missing data on a common grid, see [fpc_wsvd()].
+#' The latter is likely to break down for high PVE and/or high amounts of
+#' missingness.\cr
 #'
-#' Any "factorization" method that accepts a `data.frame` with
+#' For the FPC basis, any factorization method that accepts a `data.frame` with
 #' columns `id`, `arg`, `value` containing the functional data and returns a
-#' list structured like the return object
+#' list with eigenfunctions and FPC scores structured like the return object
 #' of [fpc_wsvd()] can be used for the `method`` argument, see example below.
+#' Note that the mean function, with a fixed "score" of 1 for all functions,
+#' is used as the first basis function for all FPC bases.
 #'
 #' @export
 #' @param method the function to use that computes eigenfunctions and scores.
-#'   Defaults to [fpc_wsvd()], which gives unsmoothed eigenfunctions.
+#'   Defaults to [fpc_wsvd()], which is quick and easy but returns completely
+#'   unsmoothed eigenfunctions unlikely to be suited for noisy data.
 #' @param ... arguments to the `method` which computes the
 #'  (regularized/smoothed) FPCA - see e.g. [fpc_wsvd()].
 #'  Unless set by the user, uses proportion of variance explained
@@ -107,19 +111,44 @@ tfb_fpc <- function(data, ...) UseMethod("tfb_fpc")
 #' @export
 #' @inheritParams tfd.data.frame
 #' @examples
-#' # Apply FPCA for sparse data using refund::fpca.sc:
+#' set.seed(13121)
+#' x <- tf_rgp(25, nugget = .02)
+#' x_pc <- tfb_fpc(x, pve = .9)
+#' x_pc
+#' plot(x, lwd = 3)
+#' lines(x_pc, col = 2, lty = 2)
+#' x_pc_full <- tfb_fpc(x, pve = .995)
+#' x_pc_full
+#' lines(x_pc_full, col = 3, lty = 2)
+#' # partially missing data on common grid:
+#' x_mis <- x |> tf_sparsify(dropout = .05)
+#' x_pc_mis <- tfb_fpc(x_mis, pve = .9)
+#' x_pc_mis
+#' plot(x_mis, lwd = 3)
+#' lines(x_pc_mis, col = 4, lty = 2)
+#' # extract FPC basis --
+#' # first "eigenvector" in black is (always) the mean function
+#' x_pc |> tf_basis(as_tfd = TRUE) |> plot(col = 1:5)
+#' \donttest{
+#' # Apply FPCA for sparse, irregular data using refund::fpca.sc:
 #' set.seed(99290)
-#' # create sparse data:
-#' data <- tf_rgp(15) |>
-#'   tf_sparsify() |>
+#' # create sparse, irregular data:
+#' x_irreg <- x |>
+#'   tf_jiggle() |> tf_sparsify(dropout = 0.3)
+#' plot(x_irreg)
+#' x_df <- x_irreg |>
 #'   as.data.frame(unnest = TRUE)
 #' # wrap refund::fpca_sc for use as FPCA method in tfb_fpc:
 #' fpca_sc_wrapper <- function(data, arg, pve = 0.995, ...) {
 #'   data_mat <- tf:::df_2_mat(data)
 #'   fpca <- refund::fpca.sc(
 #'     Y = data_mat, argvals = attr(data_mat, "arg"), pve = pve, ...
 #'   )
-#'   fpca[c("mu", "efunctions", "scores", "npc")]
+#'   c(fpca[c("mu", "efunctions", "scores", "npc")],
+#'     scoring_function = tf:::.fpc_wsvd_scores)
+#' }
+#' x_pc <- tfb_fpc(x_df, method = fpca_sc_wrapper)
+#' lines(x_pc, col = 2, lty = 2)
 #' }
 tfb_fpc.data.frame <- function(data, id = 1, arg = 2, value = 3,
                                domain = NULL, method = fpc_wsvd, ...) {
@@ -146,11 +175,6 @@ tfb_fpc.numeric <- function(data, arg = NULL, domain = NULL,
   tfb_fpc(data = data, arg = arg, method = method, domain = domain, ...)
 }
 
-# #' @rdname tfb_fpc
-# #' @export
-# tfb_fpc.list <- function(data, arg = NULL, domain = NULL, method = fpc_wsvd,
-# TODO
-
 #' @rdname tfb_fpc
 #' @export
 tfb_fpc.tf <- function(data, arg = NULL, method = fpc_wsvd, ...) {