diff --git a/DESCRIPTION b/DESCRIPTION
index 8aa655e3..fa5b6d0b 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -62,6 +62,7 @@ Collate:
     'rebase.R'
     'rng.R'
     'smooth.R'
+    'soft-impute-svd.R'
     'summarize.R'
     'tf-package.R'
     'tfb-fpc.R'
diff --git a/R/bibentries.R b/R/bibentries.R
index 6f9e4983..a30e10c9 100644
--- a/R/bibentries.R
+++ b/R/bibentries.R
@@ -19,6 +19,27 @@ bibentries <- c(
     year = "2009",
     publisher = "Taylor & Francis"
   ),
+  mazumder2010 = bibentry("article",
+    title = "Spectral regularization algorithms for learning large incomplete matrices",
+    author = "Mazumder, Rahul and Hastie, Trevor and Tibshirani, Robert",
+    journal = "The Journal of Machine Learning Research",
+    volume = "11",
+    pages = "2287-2322",
+    year = "2010"
+  ),
+  softimpute = bibentry("manual",
+    title = "\\code{softImpute}: Matrix Completion via Iterative Soft-Thresholded SVD",
+    author = "Trevor Hastie and Rahul Mazumder",
+    year = "2021",
+    note = "R package version 1.4-1",
+    url = "https://CRAN.R-project.org/package=softImpute"
+  ),
+  meng2023mogsa = bibentry("manual",
+    title = "\\code{mogsa}: Multiple omics data integrative clustering and gene set analysis",
+    author = "Chen Meng",
+    year = "2023",
+    url = "https://bioconductor.org/packages/mogsa"
+  ),
   swihart2010lasagna = bibentry("article",
     title = "Lasagna plots: a saucy alternative to spaghetti plots",
     author = "Swihart, Bruce J and Caffo, Brian and James, Bryan D and Strand, Matthew and Schwartz, Brian S and Punjabi, Naresh M",
diff --git a/R/rebase.R b/R/rebase.R
index 192b2a93..2c579a70 100644
--- a/R/rebase.R
+++ b/R/rebase.R
@@ -19,6 +19,8 @@ tf_rebase <- function(object, basis_from, arg = tf_arg(basis_from), ...) {
   all.equal(tf_domain(object), tf_domain(basis_from)) |>
     assert_true()
+  assert_arg(arg, object, check_unique = FALSE)
+  assert_arg(arg, basis_from, check_unique = FALSE)
   UseMethod("tf_rebase", object)
 }
@@ -59,6 +61,9 @@ tf_rebase.tfd.tfb_spline <- function(object, basis_from, arg = tf_arg(basis_fro
 }
 #'@export
 tf_rebase.tfd.tfb_fpc <- function(object, basis_from, arg = tf_arg(basis_from), ...) {
+  if (is_irreg(object)) {
+    warning("interpolating irregular functions to the arg-grid of 'basis_from'.", call. = FALSE)
+  }
   data <- tf_interpolate(object, arg = arg) |>
     as.data.frame(unnest = TRUE)
   new_tfb_fpc(data = data, basis_from = basis_from, domain = tf_domain(basis_from),
diff --git a/R/smooth.R b/R/smooth.R
index 9cb06403..995ff43a 100644
--- a/R/smooth.R
+++ b/R/smooth.R
@@ -20,6 +20,7 @@
 #' @param x a `tf` object containing functional data
 #' @param method one of "lowess" (see [stats::lowess()]), "rollmean",
 #'   "rollmedian" (see [zoo::rollmean()]) or "savgol" (see [pracma::savgol()])
+#' @param verbose give lots of diagnostic messages? Defaults to TRUE
 #' @param ... arguments for the respective `method`. See Details.
 #' @returns a smoothed version of the input. For some methods/options, the
 #'   smoothed functions may be shorter than the original ones (at both ends).
@@ -31,8 +32,8 @@ tf_smooth <- function(x, ...) {

 #' @export
 #' @rdname tf_smooth
-tf_smooth.tfb <- function(x, ...) {
-  warning(
+tf_smooth.tfb <- function(x, verbose = TRUE, ...) {
+  if (verbose) warning(
     "you called tf_smooth on a tfb object, not on a tfd object -- ",
     "just use a smaller basis or stronger penalization.\n",
     "Returning unchanged tfb object.",
@@ -70,12 +71,12 @@ tf_smooth.tfb <- function(x, ...) {
 #' lines(f_sg, col = "red")
 tf_smooth.tfd <- function(x,
                           method = c("lowess", "rollmean", "rollmedian", "savgol"),
-                          ...) {
+                          verbose = TRUE, ...) {
   method <- match.arg(method)
   smoother <- get(method, mode = "function")
   dots <- list(...)
   if (method %in% c("savgol", "rollmean", "rollmedian")) {
-    if (!is_equidist(x)) {
+    if (verbose && !is_equidist(x)) {
       warning(
         "non-equidistant arg-values in ", sQuote(deparse(substitute(x))),
         " ignored by ", method, ".",
@@ -86,10 +87,10 @@ tf_smooth.tfd <- function(x,
     if (is.null(dots$k)) {
       dots$k <- ceiling(0.05 * min(tf_count(x)))
       dots$k <- dots$k + !(dots$k %% 2) # make uneven
-      message("using k = ", dots$k, " observations for rolling data window.")
+      if (verbose) message("using k = ", dots$k, " observations for rolling data window.")
     }
     if (is.null(dots$fill)) {
-      message("setting fill = 'extend' for start/end values.")
+      if (verbose) message("setting fill = 'extend' for start/end values.")
       dots$fill <- "extend"
     }
   }
@@ -97,7 +98,7 @@ tf_smooth.tfd <- function(x,
     if (is.null(dots$fl)) {
       dots$fl <- ceiling(0.15 * min(tf_count(x)))
       dots$fl <- dots$fl + !(dots$fl %% 2) # make uneven
-      message("using fl = ", dots$fl, " observations for rolling data window.")
+      if (verbose) message("using fl = ", dots$fl, " observations for rolling data window.")
     }
   }
@@ -109,7 +110,7 @@ tf_smooth.tfd <- function(x,
   if (method == "lowess") {
     if (is.null(dots$f)) {
       dots$f <- 0.15
-      message("using f = ", dots$f, " as smoother span for lowess")
+      if (verbose) message("using f = ", dots$f, " as smoother span for lowess")
     }
     smoothed <- map(
       tf_evaluations(x), \(x) do.call(smoother, append(list(x), dots))$y
diff --git a/R/soft-impute-svd.R b/R/soft-impute-svd.R
new file mode 100644
index 00000000..4a6bc487
--- /dev/null
+++ b/R/soft-impute-svd.R
@@ -0,0 +1,59 @@
+# copied from softImpute::Frob (v 1.4-1) under GPL-2
+# original authors: Trevor Hastie and Rahul Mazumder
+frob <- function(Uold, Dsqold, Vold, U, Dsq, V) {
+  denom = sum(Dsqold^2)
+  utu = Dsq * (t(U) %*% Uold)
+  vtv = Dsqold * (t(Vold) %*% V)
+  uvprod = sum(diag(utu %*% vtv))
+  num = denom + sum(Dsq^2) - 2 * uvprod
+  num/max(denom, 1e-09)
+}
+
+# code slightly adapted and shortened from softImpute::simpute.svd.R (v 1.4-1) under GPL-2
+# original authors: Trevor Hastie and Rahul Mazumder
+# adaptations:
+# - no warm starts, no returned call, no attributes
+# inputs: x matrix of weighted (w/ integration weights) & centered (!) function evaluations, with NAs
+# output: (regularized) SVD of x
+simpute_svd <- function(x, J = min(dim(x)) - 1, thresh = 1e-05, lambda = 0, maxit = 100,
+                        trace.it = FALSE, ...) {
+  n <- dim(x)
+  m <- n[2]
+  n <- n[1]
+  xnas <- is.na(x)
+
+  nz <- m * n - sum(xnas)
+  xfill <- x
+  xfill[xnas] <- 0
+
+  svd.xfill <- svd(xfill)
+  ratio <- 1
+  iter <- 0
+  while ((ratio > thresh) & (iter < maxit)) {
+    iter <- iter + 1
+    svd.old <- svd.xfill
+    d <- svd.xfill$d
+    d <- pmax(d - lambda, 0)
+    xhat <- svd.xfill$u[, seq(J)] %*% (d[seq(J)] * t(svd.xfill$v[, seq(J)]))
+    xfill[xnas] <- xhat[xnas]
+    svd.xfill <- svd(xfill)
+    ratio <- frob(
+      svd.old$u[, seq(J)], d[seq(J)], svd.old$v[, seq(J)],
+      svd.xfill$u[, seq(J)], pmax(svd.xfill$d - lambda, 0)[seq(J)], svd.xfill$v[, seq(J)]
+    )
+    if (trace.it) {
+      obj <- (.5 * sum((xfill - xhat)[!xnas]^2) + lambda * sum(d)) / nz
+      cat(iter, ":", "obj", format(round(obj, 5)), "ratio", ratio, "\n")
+    }
+  }
+  d <- pmax(svd.xfill$d[seq(J)] - lambda, 0)
+  J <- min(sum(d > 0) + 1, J)
+  svd.xfill <- list(u = svd.xfill$u[, seq(J)], d = d[seq(J)], v = svd.xfill$v[, seq(J)])
+  if (iter == maxit) {
+    warning(paste("Incomplete-data-SVD convergence not achieved by", maxit,
+                  "iterations"), call. = FALSE)
+  }
+  svd.xfill
+}
+
+
diff --git a/R/tfb-fpc-utils.R b/R/tfb-fpc-utils.R
index 08d25fa8..9c2f085a 100644
--- a/R/tfb-fpc-utils.R
+++ b/R/tfb-fpc-utils.R
@@ -1,19 +1,43 @@
-#' Eigenfunctions via weighted SVD
+#' Eigenfunctions via weighted, regularized SVD
 #'
 #' Compute (truncated) orthonormal eigenfunctions and scores
-#' for data potentially on a non-equidistant grid.
+#' for (partially missing) data on a common (potentially non-equidistant) grid.
+#'
+#' Performs a weighted SVD with trapezoidal quadrature weights such that the
+#' returned vectors represent (evaluations of)
+#' orthonormal eigen*functions* \eqn{\phi_j(t)}, not eigen*vectors*
+#' \eqn{\phi_j = (\phi_j(t_1), \dots, \phi_j(t_n))}, specifically:\cr
+#' \eqn{\int_T \phi_j(t)^2 dt \approx \sum_i \Delta_i \phi_j(t_i)^2 = 1}
+#' given quadrature weights \eqn{\Delta_i}, not
+#' \eqn{\phi_j'\phi_j = \sum_i \phi_j(t_i)^2 = 1};\cr
+#' \eqn{\int_T \phi_j(t) \phi_k(t) dt = 0} not
+#' \eqn{\phi_j'\phi_k = \sum_i \phi_j(t_i)\phi_k(t_i) = 0},
+#' cf. `mogsa::wsvd()`.\cr
+#' For incomplete data, this uses an adaptation of `softImpute::softImpute()`,
+#' see references. Note that this will not work well for data on a common grid
+#' if more than a few percent of data points are missing, and it breaks down
+#' completely for truly irregular data with no/few common timepoints, even if
+#' observed very densely. For such data, either re-evaluate on a common grid
+#' first or use more advanced FPCA approaches like `refund::fpca.sc()`;
+#' see the last example for [tfb_fpc()].
+#'
 #' @param data numeric matrix of function evaluations
 #'   (each row is one curve, no NAs)
 #' @param arg numeric vector of argument values
 #' @param pve percentage of variance explained
 #' @returns a list with entries
-#' - `mu`` estimated mean function (numeric vector)
-#' - `efunctions`` estimated FPCs (numeric matrix, columns represent FPCs)
+#' - `mu` estimated mean function (numeric vector)
+#' - `efunctions` estimated FPCs (numeric matrix, columns represent FPCs)
 #' - `scores` estimated FPC scores (one row per observed curve)
 #' - `npc` how many FPCs were returned for the given `pve` (integer)
-#' @references code adapted from / inspired by `wsvd()` function of Bioconductor
-#'   package `mogsa` by Cheng Meng.
-#' @author Cheng Meng, Fabian Scheipl
+#' - `scoring_function` a function that returns FPC scores for new data
+#'   and given eigenfunctions, see `tf:::.fpc_wsvd_scores` for an example.
+#' @author Trevor Hastie, Rahul Mazumder, Chen Meng, Fabian Scheipl
+#' @references code adapted from / inspired by `mogsa::wsvd()` by Chen Meng
+#' and `softImpute::softImpute()` by Trevor Hastie and Rahul Mazumder.\cr
+#' `r format_bib("meng2023mogsa")`\cr
+#' `r format_bib("mazumder2010")`\cr
+#' `r format_bib("softimpute")`
 #' @family tfb-class
 #' @family tfb_fpc-class
 fpc_wsvd <- function(data, arg, pve = 0.995) {
@@ -22,24 +46,42 @@ fpc_wsvd <- function(data, arg, pve = 0.995) {

 #' @rdname fpc_wsvd
 #' @importFrom utils head tail
+#' @importFrom stats lowess
 #' @export
 fpc_wsvd.matrix <- function(data, arg, pve = 0.995) {
-  assert_matrix(data, mode = "numeric", any.missing = FALSE,
-    min.cols = 2, min.rows = 1)
+  assert_matrix(data, mode = "numeric", min.cols = 2, min.rows = 1)
   assert_numeric(arg, any.missing = FALSE, sorted = TRUE, len = ncol(data))
   assert_number(pve, lower = 0, upper = 1)

   delta <- c(0, diff(arg))
   # trapezoid integration weights:
   weights <- 0.5 * c(delta[-1] + head(delta, -1), tail(delta, 1))
-  mean <- colMeans(data)
+  mean <- colMeans(data, na.rm = TRUE)
+
   data_wc <- t((t(data) - mean) * sqrt(weights))
-  pc <- svd(data_wc, nu = 0, nv = min(dim(data)))
+  nas <- is.na(data)
+  pc <- if (!any(nas)) {
+    svd(data_wc, nu = 0, nv = min(dim(data)))
+  } else {
+    message("Using softImpute SVD on ", round(mean(nas) * 100, 1), "% missing data")
+    if (pve + mean(nas) > 1) {
+      warning("High pve with many missing values is likely to yield bad FPC estimates.",
+        call. = FALSE)
+    }
+    simpute_svd(data_wc)
+  }
+
   pve_observed <- cumsum(pc$d^2) / sum(pc$d^2)
   use <- min(which(pve_observed >= pve))
   efunctions <- pc$v[, 1:use] / sqrt(weights)
+
+  if (any(nas)) {
+    # slightly smooth efunctions from incomplete data to reduce artefacts
+    efunctions <- apply(efunctions, 2,
+      \(ef) stats::lowess(x = arg, y = ef, f = .15)$y)
+  }
   evalues <- (pc$d[1:use])^2
   scores <- .fpc_wsvd_scores(data, efunctions, mean, weights) #!!
@@ -51,9 +93,14 @@ fpc_wsvd.matrix <- function(data, arg, pve = 0.995) {
   )
 }

-#extract for reuse in tf_rebase
-.fpc_wsvd_scores <- function(data_matrix, efunctions, mean, weights) {
-  data_wc <- t((t(data_matrix) - mean) * sqrt(weights))
+# extract scoring function for reuse in tf_rebase
+# performs simple (weighted) LS fit of data onto the eigenfunctions.
+.fpc_wsvd_scores <- function(data_matrix, efunctions, mean, weights) {
+  w_mat <- matrix(weights, ncol = length(weights), nrow = nrow(data_matrix),
+    byrow = TRUE)
+  w_mat[is.na(data_matrix)] <- 0
+  data_matrix[is.na(data_matrix)] <- 0
+  data_wc <- t((t(data_matrix) - mean) * sqrt(t(w_mat)))
   t(qr.coef(qr(efunctions), t(data_wc) / sqrt(weights)))
 }
diff --git a/R/tfb-fpc.R b/R/tfb-fpc.R
index dcfa25dd..29bf12a2 100644
--- a/R/tfb-fpc.R
+++ b/R/tfb-fpc.R
@@ -14,7 +14,7 @@ new_tfb_fpc <- function(data, domain = NULL,
     stop("Can't specify both method *and* basis_from for new_tfb_fpc",
       call. = FALSE)
   }
-  arg <- sort(unique(data$arg))
+  arg <- mgcv::uniquecombs(data$arg, ordered = TRUE) |> unlist()
   domain <- domain %||% range(arg)
   if (!isTRUE(all.equal(domain, range(arg)))) {
@@ -74,20 +74,24 @@ new_tfb_fpc <- function(data, domain = NULL,
 #'
 #' These functions perform a (functional) principal component analysis (FPCA) of
 #' the input data and return an `tfb_fpc` `tf`-object that uses the empirical
-#' eigenfunctions as basis functions for representing the data. By default, a
-#' simple, not smoothed, truncated weighted SVD of the functions is used to
-#' compute those ("`method = fpc_wsvd`"). Note that this is suitable only for
-#' regular data all observed on the same (not necessarily equidistant) grid. See
-#' Details / Example for possible alternatives and extensions. \cr
+#' eigenfunctions as basis functions for representing the data. The default
+#' ("`method = fpc_wsvd`") uses a (truncated) weighted SVD for complete
+#' data on a common grid and a nuclear-norm regularized (truncated) weighted SVD
+#' for partially missing data on a common grid, see [fpc_wsvd()].
+#' The latter is likely to break down for high PVE and/or high amounts of
+#' missingness.\cr
 #'
-#' Any "factorization" method that accepts a `data.frame` with
+#' For the FPC basis, any factorization method that accepts a `data.frame` with
 #' columns `id`, `arg`, `value` containing the functional data and returns a
-#' list structured like the return object
+#' list with eigenfunctions and FPC scores structured like the return object
 #' of [fpc_wsvd()] can be used for the `method`` argument, see example below.
+#' Note that the mean function, with a fixed "score" of 1 for all functions,
+#' is used as the first basis function for all FPC bases.
 #'
 #' @export
 #' @param method the function to use that computes eigenfunctions and scores.
-#'   Defaults to [fpc_wsvd()], which gives unsmoothed eigenfunctions.
+#'   Defaults to [fpc_wsvd()], which is quick and easy but returns completely
+#'   unsmoothed eigenfunctions unlikely to be suited for noisy data.
 #' @param ... arguments to the `method` which computes the
 #'   (regularized/smoothed) FPCA - see e.g. [fpc_wsvd()].
 #'   Unless set by the user, uses proportion of variance explained
@@ -107,11 +111,32 @@ tfb_fpc <- function(data, ...) UseMethod("tfb_fpc")
 #' @export
 #' @inheritParams tfd.data.frame
 #' @examples
-#' # Apply FPCA for sparse data using refund::fpca.sc:
+#' set.seed(13121)
+#' x <- tf_rgp(25, nugget = .02)
+#' x_pc <- tfb_fpc(x, pve = .9)
+#' x_pc
+#' plot(x, lwd = 3)
+#' lines(x_pc, col = 2, lty = 2)
+#' x_pc_full <- tfb_fpc(x, pve = .995)
+#' x_pc_full
+#' lines(x_pc_full, col = 3, lty = 2)
+#' # partially missing data on common grid:
+#' x_mis <- x |> tf_sparsify(dropout = .05)
+#' x_pc_mis <- tfb_fpc(x_mis, pve = .9)
+#' x_pc_mis
+#' plot(x_mis, lwd = 3)
+#' lines(x_pc_mis, col = 4, lty = 2)
+#' # extract FPC basis --
+#' # first "eigenvector" in black is (always) the mean function
+#' x_pc |> tf_basis(as_tfd = TRUE) |> plot(col = 1:5)
+#' \donttest{
+#' # Apply FPCA for sparse, irregular data using refund::fpca.sc:
 #' set.seed(99290)
-#' # create sparse data:
-#' data <- tf_rgp(15) |>
-#'   tf_sparsify() |>
+#' # create sparse, irregular data:
+#' x_irreg <- x |>
+#'   tf_jiggle() |> tf_sparsify(dropout = 0.3)
+#' plot(x_irreg)
+#' x_df <- x_irreg |>
 #'   as.data.frame(unnest = TRUE)
 #' # wrap refund::fpca_sc for use as FPCA method in tfb_fpc:
 #' fpca_sc_wrapper <- function(data, arg, pve = 0.995, ...) {
@@ -119,7 +144,11 @@ tfb_fpc <- function(data, ...) UseMethod("tfb_fpc")
 #'   fpca <- refund::fpca.sc(
 #'     Y = data_mat, argvals = attr(data_mat, "arg"), pve = pve, ...
 #'   )
-#'   fpca[c("mu", "efunctions", "scores", "npc")]
+#'   c(fpca[c("mu", "efunctions", "scores", "npc")],
+#'     scoring_function = tf:::.fpc_wsvd_scores)
+#' }
+#' x_pc <- tfb_fpc(x_df, method = fpca_sc_wrapper)
+#' lines(x_pc, col = 2, lty = 2)
 #' }
 tfb_fpc.data.frame <- function(data, id = 1, arg = 2, value = 3, domain = NULL,
                                method = fpc_wsvd, ...) {
@@ -146,11 +175,6 @@ tfb_fpc.numeric <- function(data, arg = NULL, domain = NULL,
   tfb_fpc(data = data, arg = arg, method = method, domain = domain, ...)
 }

-# #' @rdname tfb_fpc
-# #' @export
-# tfb_fpc.list <- function(data, arg = NULL, domain = NULL, method = fpc_wsvd,
-# TODO
-
 #' @rdname tfb_fpc
 #' @export
 tfb_fpc.tf <- function(data, arg = NULL, method = fpc_wsvd, ...) {
diff --git a/attic/dev-wsvd-na.R b/attic/dev-wsvd-na.R
new file mode 100644
index 00000000..a941b73e
--- /dev/null
+++ b/attic/dev-wsvd-na.R
@@ -0,0 +1,103 @@
+
+x <- tf_rgp(150) |> tfb_fpc()
+
+y <- tf_rgp(15, arg = 21L) |> tf_sparsify()
+
+y_xbase <- tf_rebase(y, x)
+
+layout(t(1:2))
+plot(y, lwd = 2)
+plot(y_xbase, lty = 2, col = 2)
+
+all.equal(tf_basis(y_xbase, as_tfd = TRUE),
+          tf_basis(x, as_tfd = TRUE))
+
+
+#------------------------------------------------------------------------------
+
+pve <- .85
+y <- tf_rgp(50, arg = 51L)
+y_pc <- tfb_fpc(y, pve = pve)
+y_mis <- y |> tf_sparsify(dropout = .2)
+y_pc_sparse <- tfb_fpc(y_mis, pve = pve)
+y_pc_sparse_impute <- y_mis |>
+  tf_interpolate(arg = tf_arg(y), evaluator = tf_approx_fill_extend) |>
+  tfb_fpc(pve = pve)
+y_pc_rebase <- tf_rebase(y_mis, y_pc)
+
+
+layout(t(1:4))
+plot(y[1:10], main = "full")
+lines(y_pc[1:10], col = 2, lty = 2)
+plot(y[1:10], main = "sparse")
+lines(y_pc_sparse[1:10], col = 2, lty = 2)
+plot(y[1:10], main = "interpolated")
+lines(y_pc_sparse_impute[1:10], col = 2, lty = 2)
+plot(y[1:10], main = "sparse scores")
+lines(y_pc_rebase[1:10], col = 2, lty = 2)
+
+
+#-------------------------------------------------------------------------------
+
+y <- tf_rgp(50, arg = 51L)
+y_pc <- tfb_fpc(y, pve = 1)
+y_pc_sparse <- tfb_fpc(y |> tf_jiggle(), pve = 1)
+y_pc_sparse_impute <- y |> tf_jiggle(dropout = .3) |>
+  tf_interpolate(arg = tf_arg(y), evaluator = tf_approx_fill_extend) |>
+  tfb_fpc(pve = 1)
+y_pc_rebase <- tf_rebase(y |> tf_jiggle() |> tfd(domain = tf_domain(y)), y_pc)
+
+
+layout(t(1:4))
+plot(y[1:10], main = "full")
+lines(y_pc[1:10], col = 2)
+plot(y[1:10], main = "sparse")
+lines(y_pc_sparse[1:10], col = 2)
+plot(y[1:10], main = "interpolated")
+lines(y_pc_sparse_impute[1:10], col = 2)
+plot(y[1:10], main = "sparse scores")
+lines(y_pc_rebase[1:10], col = 2)
+
+
+g <- 201
+n <- 50
+
+arg <- seq(0, 1, l = g)
+basis <- cbind(sin(2 * pi * arg), cos(2 * pi * arg),
+               sin(4 * pi * arg), cos(4 * pi * arg))
+norms <- tfd(t(basis))^2 |> tf_integrate() |> sqrt()
+basis <- t(t(basis) / norms) # normalize column-wise
+
+true_scores <- t(mvtnorm::rmvnorm(n, mean = rep(0, 4), sigma = diag((4:1)^2)))
+
+data <- t(basis %*% true_scores)
+data[rbinom(n * g, 1, .3) == 1] <- NA
+
+
+
+delta <- c(0, diff(arg))
+# trapezoid integration weights:
+weights <- 0.5 * c(delta[-1] + head(delta, -1), tail(delta, 1))
+mean <- colMeans(data, na.rm = TRUE)
+w_mat <- matrix(weights, ncol = length(arg), nrow = nrow(data), byrow = TRUE)
+w_mat[is.na(data)] <- 0
+data[is.na(data)] <- 0
+data_wc <- t((t(data) - mean) * sqrt(t(w_mat)))
+
+
+
+pc <- svd(data_wc, nu = 0, nv = min(dim(data)))
+pve_observed <- cumsum(pc$d^2) / sum(pc$d^2)
+use <- min(which(pve_observed >= .99))
+
+efunctions <- pc$v[, 1:use] / sqrt(weights)
+
+t(efunctions) |> tfd() |> plot(col = 2)
+t(basis) |> tfd() |> lines()
+
+evalues <- (pc$d[1:use])^2
+scores <- .fpc_wsvd_scores(data, efunctions, mean, weights)
+
+plot(as.vector(t(scores)), as.vector(true_scores))
+# check results are still orthogonal functions
+#
diff --git a/man/fpc_wsvd.Rd b/man/fpc_wsvd.Rd
index 14ad5bd4..e5e0813f 100644
--- a/man/fpc_wsvd.Rd
+++ b/man/fpc_wsvd.Rd
@@ -4,7 +4,7 @@
 \alias{fpc_wsvd}
 \alias{fpc_wsvd.matrix}
 \alias{fpc_wsvd.data.frame}
-\title{Eigenfunctions via weighted SVD}
+\title{Eigenfunctions via weighted, regularized SVD}
 \usage{
 fpc_wsvd(data, arg, pve = 0.995)
@@ -23,19 +23,49 @@ fpc_wsvd(data, arg, pve = 0.995)
 \value{
 a list with entries
 \itemize{
-\item `mu`` estimated mean function (numeric vector)
-\item `efunctions`` estimated FPCs (numeric matrix, columns represent FPCs)
+\item \code{mu} estimated mean function (numeric vector)
+\item \code{efunctions} estimated FPCs (numeric matrix, columns represent FPCs)
 \item \code{scores} estimated FPC scores (one row per observed curve)
 \item \code{npc} how many FPCs were returned for the given \code{pve} (integer)
+\item \code{scoring_function} a function that returns FPC scores for new data
+and given eigenfunctions, see \code{tf:::.fpc_wsvd_scores} for an example.
 }
 }
 \description{
 Compute (truncated) orthonormal eigenfunctions and scores
-for data potentially on a non-equidistant grid.
+for (partially missing) data on a common (potentially non-equidistant) grid.
+}
+\details{
+Performs a weighted SVD with trapezoidal quadrature weights such that the
+returned vectors represent (evaluations of)
+orthonormal eigen\emph{functions} \eqn{\phi_j(t)}, not eigen\emph{vectors}
+\eqn{\phi_j = (\phi_j(t_1), \dots, \phi_j(t_n))}, specifically:\cr
+\eqn{\int_T \phi_j(t)^2 dt \approx \sum_i \Delta_i \phi_j(t_i)^2 = 1}
+given quadrature weights \eqn{\Delta_i}, not
+\eqn{\phi_j'\phi_j = \sum_i \phi_j(t_i)^2 = 1};\cr
+\eqn{\int_T \phi_j(t) \phi_k(t) dt = 0} not
+\eqn{\phi_j'\phi_k = \sum_i \phi_j(t_i)\phi_k(t_i) = 0},
+cf. \code{mogsa::wsvd()}.\cr
+For incomplete data, this uses an adaptation of \code{softImpute::softImpute()},
+see references. Note that this will not work well for data on a common grid
+if more than a few percent of data points are missing, and it breaks down
+completely for truly irregular data with no/few common timepoints, even if
+observed very densely. For such data, either re-evaluate on a common grid
+first or use more advanced FPCA approaches like \code{refund::fpca.sc()};
+see the last example for \code{\link[=tfb_fpc]{tfb_fpc()}}.
 }
 \references{
-code adapted from / inspired by \code{wsvd()} function of Bioconductor
-package \code{mogsa} by Cheng Meng.
+code adapted from / inspired by \code{mogsa::wsvd()} by Chen Meng
+and \code{softImpute::softImpute()} by Trevor Hastie and Rahul Mazumder.\cr
+Meng C (2023).
+\emph{\code{mogsa}: Multiple omics data integrative clustering and gene set analysis}.
+\url{https://bioconductor.org/packages/mogsa}.\cr
+Mazumder, Rahul, Hastie, Trevor, Tibshirani, Robert (2010).
+\dQuote{Spectral regularization algorithms for learning large incomplete matrices.}
+\emph{The Journal of Machine Learning Research}, \bold{11}, 2287-2322.\cr
+Hastie T, Mazumder R (2021).
+\emph{\code{softImpute}: Matrix Completion via Iterative Soft-Thresholded SVD}.
+R package version 1.4-1, \url{https://CRAN.R-project.org/package=softImpute}.
 }
 \seealso{
 Other tfb-class:
@@ -47,7 +77,7 @@ Other tfb_fpc-class:
 \code{\link{tfb_fpc}()}
 }
 \author{
-Cheng Meng, Fabian Scheipl
+Trevor Hastie, Rahul Mazumder, Chen Meng, Fabian Scheipl
 }
 \concept{tfb-class}
 \concept{tfb_fpc-class}
diff --git a/man/tf_smooth.Rd b/man/tf_smooth.Rd
index 06a51cf3..efcc89a3 100644
--- a/man/tf_smooth.Rd
+++ b/man/tf_smooth.Rd
@@ -8,15 +8,22 @@
 \usage{
 tf_smooth(x, ...)

-\method{tf_smooth}{tfb}(x, ...)
+\method{tf_smooth}{tfb}(x, verbose = TRUE, ...)
-\method{tf_smooth}{tfd}(x, method = c("lowess", "rollmean", "rollmedian", "savgol"), ...)
+\method{tf_smooth}{tfd}(
+  x,
+  method = c("lowess", "rollmean", "rollmedian", "savgol"),
+  verbose = TRUE,
+  ...
+)
 }
 \arguments{
 \item{x}{a \code{tf} object containing functional data}

 \item{...}{arguments for the respective \code{method}. See Details.}

+\item{verbose}{give lots of diagnostic messages? Defaults to TRUE}
+
 \item{method}{one of "lowess" (see \code{\link[stats:lowess]{stats::lowess()}}), "rollmean",
 "rollmedian" (see \code{\link[zoo:rollmean]{zoo::rollmean()}}) or "savgol" (see \code{\link[pracma:savgol]{pracma::savgol()}})}
 }
diff --git a/man/tfb_fpc.Rd b/man/tfb_fpc.Rd
index 5af4c775..fd405087 100644
--- a/man/tfb_fpc.Rd
+++ b/man/tfb_fpc.Rd
@@ -54,7 +54,8 @@ evaluations.}
 \item{domain}{range of the \code{arg}.}

 \item{method}{the function to use that computes eigenfunctions and scores.
-Defaults to \code{\link[=fpc_wsvd]{fpc_wsvd()}}, which gives unsmoothed eigenfunctions.}
+Defaults to \code{\link[=fpc_wsvd]{fpc_wsvd()}}, which is quick and easy but returns completely
+unsmoothed eigenfunctions unlikely to be suited for noisy data.}
 }
 \value{
 an object of class \code{tfb_fpc}, inheriting from \code{tfb}.
@@ -64,17 +65,20 @@ mean and eigenfunctions.
 \description{
 These functions perform a (functional) principal component analysis (FPCA) of
 the input data and return an \code{tfb_fpc} \code{tf}-object that uses the empirical
-eigenfunctions as basis functions for representing the data. By default, a
-simple, not smoothed, truncated weighted SVD of the functions is used to
-compute those ("\code{method = fpc_wsvd}"). Note that this is suitable only for
-regular data all observed on the same (not necessarily equidistant) grid. See
-Details / Example for possible alternatives and extensions. \cr
+eigenfunctions as basis functions for representing the data. The default
+("\code{method = fpc_wsvd}") uses a (truncated) weighted SVD for complete
+data on a common grid and a nuclear-norm regularized (truncated) weighted SVD
+for partially missing data on a common grid, see \code{\link[=fpc_wsvd]{fpc_wsvd()}}.
+The latter is likely to break down for high PVE and/or high amounts of
+missingness.\cr
 }
 \details{
-Any "factorization" method that accepts a \code{data.frame} with
+For the FPC basis, any factorization method that accepts a \code{data.frame} with
 columns \code{id}, \code{arg}, \code{value} containing the functional data and returns a
-list structured like the return object
+list with eigenfunctions and FPC scores structured like the return object
 of \code{\link[=fpc_wsvd]{fpc_wsvd()}} can be used for the `method`` argument, see example below.
+Note that the mean function, with a fixed "score" of 1 for all functions,
+is used as the first basis function for all FPC bases.
 }
 \section{Methods (by class)}{
 \itemize{
@@ -83,11 +87,32 @@ data is NULL
 }}
 \examples{
-# Apply FPCA for sparse data using refund::fpca.sc:
+set.seed(13121)
+x <- tf_rgp(25, nugget = .02)
+x_pc <- tfb_fpc(x, pve = .9)
+x_pc
+plot(x, lwd = 3)
+lines(x_pc, col = 2, lty = 2)
+x_pc_full <- tfb_fpc(x, pve = .995)
+x_pc_full
+lines(x_pc_full, col = 3, lty = 2)
+# partially missing data on common grid:
+x_mis <- x |> tf_sparsify(dropout = .05)
+x_pc_mis <- tfb_fpc(x_mis, pve = .9)
+x_pc_mis
+plot(x_mis, lwd = 3)
+lines(x_pc_mis, col = 4, lty = 2)
+# extract FPC basis --
+# first "eigenvector" in black is (always) the mean function
+x_pc |> tf_basis(as_tfd = TRUE) |> plot(col = 1:5)
+\donttest{
+# Apply FPCA for sparse, irregular data using refund::fpca.sc:
 set.seed(99290)
-# create sparse data:
-data <- tf_rgp(15) |>
-  tf_sparsify() |>
+# create sparse, irregular data:
+x_irreg <- x |>
+  tf_jiggle() |> tf_sparsify(dropout = 0.3)
+plot(x_irreg)
+x_df <- x_irreg |>
   as.data.frame(unnest = TRUE)
 # wrap refund::fpca_sc for use as FPCA method in tfb_fpc:
 fpca_sc_wrapper <- function(data, arg, pve = 0.995, ...) {
@@ -95,7 +120,11 @@ fpca_sc_wrapper <- function(data, arg, pve = 0.995, ...) {
   fpca <- refund::fpca.sc(
     Y = data_mat, argvals = attr(data_mat, "arg"), pve = pve, ...
   )
-  fpca[c("mu", "efunctions", "scores", "npc")]
+  c(fpca[c("mu", "efunctions", "scores", "npc")],
+    scoring_function = tf:::.fpc_wsvd_scores)
+}
+x_pc <- tfb_fpc(x_df, method = fpca_sc_wrapper)
+lines(x_pc, col = 2, lty = 2)
 }
 }
 \seealso{
diff --git a/tests/testthat/test-rebase.R b/tests/testthat/test-rebase.R
index 079d7ab1..915fd431 100644
--- a/tests/testthat/test-rebase.R
+++ b/tests/testthat/test-rebase.R
@@ -5,14 +5,15 @@ test_that("tf_rebase.tfd preserves args & evals and transfers attributes", {
   names(x) <- letters[1:5]

   l <- list(
+    x_2 = tfd(as.matrix(x), resolution = tf_resolution(x) * 2),
     x_sp = tf_sparsify(x, dropout = .1),
-    x_ir = tf_sparsify(x, dropout = .1) |> tf_jiggle(amount = .2),
+    x_ir = tf_sparsify(x, dropout = .1) |> tf_jiggle(amount = .05),
     b = tfb(x, k = 45, verbose = FALSE),
     b2 = tfb(x, k = 15, bs = "tp", sp= .1, verbose = FALSE),
     bu = tfb(x, k = 15, penalized = FALSE, verbose = FALSE),
     bg = tfb(x, k = 5, global = TRUE, verbose = FALSE),
-    fp = tfb_fpc(x, pve = 1),
-    fp_low = tfb_fpc(x, pve = .95)
+    fpc = tfb_fpc(x, pve = 1),
+    fpc_low = tfb_fpc(x, pve = .95)
   )
   for (i in seq_along(l)) {
     # cat(i)
@@ -30,7 +31,7 @@ test_that("tf_rebase.tfd preserves args & evals and transfers attributes", {
       compare_tf_attribs(x_rebase, l[[i]], check_attrib = FALSE) |> all()
     )
     expect_equal(names(x_rebase), names(x))
-    # c(x_rebase, l[[i]]) !! see #77
+    # try(c(x_rebase, l[[i]])) #!! see #77
   }
 })

@@ -48,8 +49,8 @@ test_that("tf_rebase.tfb_spline preserves args & evals and transfers attributes"
     b2 = tfb(x, k = 15, bs = "tp", sp = .1, verbose = FALSE),
     bu = tfb(x, k = 15, penalized = FALSE, verbose = FALSE),
     bg = tfb(x, k = 5, global = TRUE, verbose = FALSE),
-    fp = tfb_fpc(x, pve = 1),
-    fp_low = tfb_fpc(x, pve = .95)
+    fpc = tfb_fpc(x, pve = 1),
+    fpc_low = tfb_fpc(x, pve = .95)
   )
   for (i in seq_along(l)) {
     # cat(i)
@@ -67,7 +68,7 @@ test_that("tf_rebase.tfb_spline preserves args & evals and transfers attributes"
       compare_tf_attribs(x_rebase, l[[i]], check_attrib = FALSE) |> all()
     )
     expect_equal(names(x_rebase), names(x))
-    # c(x_rebase, l[[i]]) !! see #77
+    # c(x_rebase, l[[i]]) # !! see #77
   }
 })

@@ -76,7 +77,7 @@ test_that("tf_rebase.tfb_fpc preserves args & evals and transfers attributes", {
   x <- tf_rgp(5, arg = 301L) |> tf_smooth() |> tfd(evaluator = tf_approx_fill_extend) |>
     suppressMessages()
   names(x) <- letters[1:5]
-  fp <- tfb_fpc(x, pve = 1)
+  fpc <- tfb_fpc(x, pve = 1)

   l <- list(
     x = x,
@@ -86,11 +87,11 @@ test_that("tf_rebase.tfb_fpc preserves args & evals and transfers attributes", {
     b2 = tfb(x, k = 15, bs = "tp", sp= .1, verbose = FALSE),
     bu = tfb(x, k = 15, penalized = FALSE, verbose = FALSE),
     bg = tfb(x, k = 5, global = TRUE, verbose = FALSE),
-    fp_low = tfb_fpc(x, pve = .95)
+    fpc_low = tfb_fpc(x, pve = .95)
   )
   for (i in seq_along(l)) {
-    # cat(i)
-    x_rebase <- tf_rebase(x, l[[i]], verbose = FALSE)
+
+    x_rebase <- tf_rebase(fpc, l[[i]], verbose = FALSE)
     expect_equal(
       x_rebase |> tf_evaluations(),
       l[[i]] |> tf_evaluations(),
@@ -104,6 +105,6 @@ test_that("tf_rebase.tfb_fpc preserves args & evals and transfers attributes", {
       compare_tf_attribs(x_rebase, l[[i]], check_attrib = FALSE) |> all()
     )
     expect_equal(names(x_rebase), names(x))
-    # c(x_rebase, l[[i]]) !! see #77
+    # c(x_rebase, l[[i]]) # !! see #77
   }
 })
diff --git a/tests/testthat/test-tfb-fpc.R b/tests/testthat/test-tfb-fpc.R
index 1bd0002c..a437356f 100644
--- a/tests/testthat/test-tfb-fpc.R
+++ b/tests/testthat/test-tfb-fpc.R
@@ -41,6 +41,21 @@ test_that("fpc_wsvd works for smooth non-equidistant data", {
   )
 })

+test_that("fpc_wsvd works for partially missing data", {
+  expect_s3_class(tfb_fpc(sparse), "tfb_fpc") |> suppressWarnings() |>
+    suppressMessages()
+  expect_warning(tfb_fpc(sparse), "High ") |> suppressMessages()
+  expect_message(tfb_fpc(sparse), "Using softImpute") |> suppressWarnings()
+  set.seed(1312)
+  x <- tf_rgp(50)
+  x_sp_pc <- x |> tf_sparsify(.02) |> tfb_fpc(pve = .98) |>
+    suppressMessages() |> suppressWarnings()
+  expect_equal(
+    as.matrix(x_sp_pc, arg = tf_arg(x)),
+    as.matrix(x, arg = tf_arg(x)),
+    tolerance = .1)
+})
+
 test_that("tfb_fpc defaults work for all kinds of regular input", {
   expect_s3_class(tfb_fpc(smoo), "tfb_fpc")
   expect_length(tfb_fpc(smoo), length(smoo))
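
A minimal sketch of what the soft-impute iteration in R/soft-impute-svd.R does, run on a toy low-rank matrix. This assumes the branch is loaded (e.g. via devtools::load_all()); simpute_svd() is internal, hence the tf::: — an illustration only, not part of the patch:

# rank-2, 20 x 5 matrix with ~15% of entries missing completely at random
set.seed(2)
A <- tcrossprod(matrix(rnorm(40), 20, 2), matrix(rnorm(10), 5, 2))
A_mis <- A
A_mis[sample(length(A), 15)] <- NA
# lambda = 0: no soft-thresholding, pure EM-style iterative SVD imputation
sv <- tf:::simpute_svd(A_mis, J = 3, lambda = 0)
A_hat <- sv$u %*% (sv$d * t(sv$v))
# reconstruction error at the withheld entries should be small for this
# easy low-rank example:
max(abs(A_hat - A)[is.na(A_mis)])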
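
And a quick numerical check of the orthonormality convention that the fpc_wsvd() details above promise: eigenfunctions have unit integral of phi_j(t)^2 under the trapezoidal quadrature weights, rather than unit Euclidean norm as plain eigenvectors would (same assumption: the patched package is loaded; fpc_wsvd() is exported):

set.seed(1)
x <- tf_rgp(20, arg = 51L)
fpc <- fpc_wsvd(as.matrix(x), arg = tf_arg(x), pve = 0.95)
# trapezoidal quadrature weights, as computed in fpc_wsvd.matrix():
delta <- c(0, diff(tf_arg(x)))
weights <- 0.5 * c(delta[-1] + head(delta, -1), tail(delta, 1))
colSums(weights * fpc$efunctions^2) # integral of phi_j(t)^2: all ~1
colSums(fpc$efunctions^2)           # plain eigenvector norms: not 1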