Skip to content

Commit

Permalink
Merge branch 'wsvd_na' into dev
Browse files Browse the repository at this point in the history
# Conflicts:
#	R/tfb-fpc.R
#	man/tfb_fpc.Rd
#	tests/testthat/test-rebase.R
  • Loading branch information
fabian-s committed Mar 11, 2024
2 parents 2f22ec9 + 3fde9b2 commit 188b9a9
Show file tree
Hide file tree
Showing 13 changed files with 418 additions and 75 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ Collate:
'rebase.R'
'rng.R'
'smooth.R'
'soft-impute-svd.R'
'summarize.R'
'tf-package.R'
'tfb-fpc.R'
Expand Down
21 changes: 21 additions & 0 deletions R/bibentries.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,27 @@ bibentries <- c(
year = "2009",
publisher = "Taylor & Francis"
),
mazumder2010 = bibentry("article",
title= "Spectral regularization algorithms for learning large incomplete matrices",
author = "Mazumder, Rahul and Hastie, Trevor and Tibshirani, Robert",
journal = "The Journal of Machine Learning Research",
volume = "11",
pages = "2287-2322",
year = "2010"
),
softimpute = bibentry("manual",
title = "\\code{softImpute}: Matrix Completion via Iterative Soft-Thresholded SVD",
author = "Trevor Hastie and Rahul Mazumder",
year = "2021",
note = "R package version 1.4-1",
url = "https://CRAN.R-project.org/package=softImpute"
),
meng2023mogsa = bibentry("manual",
title = "\\code{mogsa}: Multiple omics data integrative clustering and gene set analysis",
author = "Chen Meng",
year = "2023",
url = "https://bioconductor.org/packages/mogsa"
),
swihart2010lasagna = bibentry("article",
title = "Lasagna plots: a saucy alternative to spaghetti plots",
author = "Swihart, Bruce J and Caffo, Brian and James, Bryan D and Strand, Matthew and Schwartz, Brian S and Punjabi, Naresh M",
Expand Down
5 changes: 5 additions & 0 deletions R/rebase.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
tf_rebase <- function(object, basis_from, arg = tf_arg(basis_from), ...) {
all.equal(tf_domain(object), tf_domain(basis_from)) |>
assert_true()
assert_arg(arg, object, check_unique = FALSE)
assert_arg(arg, basis_from, check_unique = FALSE)
UseMethod("tf_rebase", object)
}

Expand Down Expand Up @@ -59,6 +61,9 @@ tf_rebase.tfd.tfb_spline <- function(object, basis_from, arg = tf_arg(basis_fro
}
#'@export
tf_rebase.tfd.tfb_fpc <- function(object, basis_from, arg = tf_arg(basis_from), ...) {
if (is_irreg(object)) {
warning("")
}
data <- tf_interpolate(object, arg = arg) |> as.data.frame(unnest = TRUE)
new_tfb_fpc(data = data, basis_from = basis_from,
domain = tf_domain(basis_from),
Expand Down
17 changes: 9 additions & 8 deletions R/smooth.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#' @param x a `tf` object containing functional data
#' @param method one of "lowess" (see [stats::lowess()]), "rollmean",
#' "rollmedian" (see [zoo::rollmean()]) or "savgol" (see [pracma::savgol()])
#' @param verbose give lots of diagnostic messages? Defaults to TRUE
#' @param ... arguments for the respective `method`. See Details.
#' @returns a smoothed version of the input. For some methods/options, the
#' smoothed functions may be shorter than the original ones (at both ends).
Expand All @@ -31,8 +32,8 @@ tf_smooth <- function(x, ...) {

#' @export
#' @rdname tf_smooth
tf_smooth.tfb <- function(x, ...) {
warning(
tf_smooth.tfb <- function(x, verbose = TRUE, ...) {
if (verbose) warning(
"you called tf_smooth on a tfb object, not on a tfd object -- ",
"just use a smaller basis or stronger penalization.\n",
"Returning unchanged tfb object.",
Expand Down Expand Up @@ -70,12 +71,12 @@ tf_smooth.tfb <- function(x, ...) {
#' lines(f_sg, col = "red")
tf_smooth.tfd <- function(x,
method = c("lowess", "rollmean", "rollmedian", "savgol"),
...) {
verbose = TRUE, ...) {
method <- match.arg(method)
smoother <- get(method, mode = "function")
dots <- list(...)
if (method %in% c("savgol", "rollmean", "rollmedian")) {
if (!is_equidist(x)) {
if (verbose & !is_equidist(x)) {
warning(
"non-equidistant arg-values in ", sQuote(deparse(substitute(x))),
" ignored by ", method, ".",
Expand All @@ -86,18 +87,18 @@ tf_smooth.tfd <- function(x,
if (is.null(dots$k)) {
dots$k <- ceiling(0.05 * min(tf_count(x)))
dots$k <- dots$k + !(dots$k %% 2) # make uneven
message("using k = ", dots$k, " observations for rolling data window.")
if (verbose) message("using k = ", dots$k, " observations for rolling data window.")
}
if (is.null(dots$fill)) {
message("setting fill = 'extend' for start/end values.")
if (verbose) message("setting fill = 'extend' for start/end values.")
dots$fill <- "extend"
}
}
if (method == "savgol") {
if (is.null(dots$fl)) {
dots$fl <- ceiling(0.15 * min(tf_count(x)))
dots$fl <- dots$fl + !(dots$fl %% 2) # make uneven
message("using fl = ", dots$fl, " observations for rolling data window.")
if (verbose) message("using fl = ", dots$fl, " observations for rolling data window.")
}
}

Expand All @@ -109,7 +110,7 @@ tf_smooth.tfd <- function(x,
if (method == "lowess") {
if (is.null(dots$f)) {
dots$f <- 0.15
message("using f = ", dots$f, " as smoother span for lowess")
if (verbose) message("using f = ", dots$f, " as smoother span for lowess")
}
smoothed <- map(
tf_evaluations(x), \(x) do.call(smoother, append(list(x), dots))$y
Expand Down
59 changes: 59 additions & 0 deletions R/soft-impute-svd.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# copied from softImpute::Frob (v 1.4-1) under GPL-2
# original authors: Trevor Hastie <hastie@stanford.edu> and Rahul Mazumder <rahul.mazumder@gmail.com>
frob <- function(Uold, Dsqold, Vold, U, Dsq, V) {
denom = sum(Dsqold^2)
utu = Dsq * (t(U) %*% Uold)
vtv = Dsqold * (t(Vold) %*% V)
uvprod = sum(diag(utu %*% vtv))
num = denom + sum(Dsq^2) - 2 * uvprod
num/max(denom, 1e-09)
}

# code slightly adapted and shortened from softImpute::simpute.svd.R (v 1.4-1) under GPL-2
# original authors: Trevor Hastie <hastie@stanford.edu> and Rahul Mazumder <rahul.mazumder@gmail.com>
# adaptations:
# - no warm starts, no returned call, no attributes
# inputs: x matrix of weighted (w/ integration weights) & centered (!) function evaluations, with NAs
# output: (regularized) SVD of x
simpute_svd <- function(x, J = min(dim(x)) - 1, thresh = 1e-05, lambda = 0, maxit = 100,
trace.it = FALSE, ...) {
n <- dim(x)
m <- n[2]
n <- n[1]
xnas <- is.na(x)

nz <- m * n - sum(xnas)
xfill <- x
xfill[xnas] <- 0

svd.xfill <- svd(xfill)
ratio <- 1
iter <- 0
while ((ratio > thresh) & (iter < maxit)) {
iter <- iter + 1
svd.old <- svd.xfill
d <- svd.xfill$d
d <- pmax(d - lambda, 0)
xhat <- svd.xfill$u[, seq(J)] %*% (d[seq(J)] * t(svd.xfill$v[, seq(J)]))
xfill[xnas] <- xhat[xnas]
svd.xfill <- svd(xfill)
ratio <- frob(
svd.old$u[, seq(J)], d[seq(J)], svd.old$v[, seq(J)],
svd.xfill$u[, seq(J)], pmax(svd.xfill$d - lambda, 0)[seq(J)], svd.xfill$v[, seq(J)]
)
if (trace.it) {
obj <- (.5 * sum((xfill - xhat)[!xnas]^2) + lambda * sum(d)) / nz
cat(iter, ":", "obj", format(round(obj, 5)), "ratio", ratio, "\n")
}
}
d <- pmax(svd.xfill$d[seq(J)] - lambda, 0)
J <- min(sum(d > 0) + 1, J)
svd.xfill <- list(u = svd.xfill$u[, seq(J)], d = d[seq(J)], v = svd.xfill$v[, seq(J)])
if (iter == maxit) {
warning(paste("Incomplete-data-SVD convergence not achieved by", maxit,
"iterations"), call. = FALSE)
}
svd.xfill
}


75 changes: 61 additions & 14 deletions R/tfb-fpc-utils.R
Original file line number Diff line number Diff line change
@@ -1,19 +1,43 @@
#' Eigenfunctions via weighted SVD
#' Eigenfunctions via weighted, regularized SVD
#'
#' Compute (truncated) orthonormal eigenfunctions and scores
#' for data potentially on a non-equidistant grid.
#' for (partially missing) data on a common (potentially non-equidistant) grid.
#'
#' Performs a weighted SVD with trapezoidal quadrature weights s.t. returned
#' vectors represent (evaluations of)
#' orthonormal eigen*functions* \eqn{\phi_j(t)}, not eigen*vectors*
#' \eqn{\phi_j = (\phi_j(t_1), \dots, \phi_j(t_n))}, specifically:\cr
#' \eqn{\int_T \phi_j(t)^2 dt \approx \sum_i \Delta_i \phi_j(t_i)^2 = 1}
#' given quadrature weights \eqn{\Delta_i}, not
#' \eqn{\phi_j'\phi_j = \sum_i \phi_j(t_i)^2 = 1};\cr
#' \eqn{\int_T \phi_j(t) \phi_k(t) dt = 0} not
#' \eqn{\phi_j'\phi_k = \sum_i \phi_j(t_i)\phi_k(t_i) = 0},
#' c.f. `mogsa::wsvd()`.\cr
#' For incomplete data, this uses an adaptation of `softImpute::softImpute()`,
#' see references. Note that will not work well for data on a common grid if more
#' than a few percent of data points are missing, and it breaks down completely
#' for truly irregular data with no/few common timepoints, even if observed very
#' densely. For such data, either re-evaluate on a common grid first or use more
#' advanced FPCA approaches like `refund::fpca_sc()`,
#' see last example for [tfb_fpc()]
#'
#' @param data numeric matrix of function evaluations
#' (each row is one curve, no NAs)
#' @param arg numeric vector of argument values
#' @param pve percentage of variance explained
#' @returns a list with entries
#' - `mu`` estimated mean function (numeric vector)
#' - `efunctions`` estimated FPCs (numeric matrix, columns represent FPCs)
#' - `mu` estimated mean function (numeric vector)
#' - `efunctions` estimated FPCs (numeric matrix, columns represent FPCs)
#' - `scores` estimated FPC scores (one row per observed curve)
#' - `npc` how many FPCs were returned for the given `pve` (integer)
#' @references code adapted from / inspired by `wsvd()` function of Bioconductor
#' package `mogsa` by Cheng Meng.
#' @author Cheng Meng, Fabian Scheipl
#' - `scoring_function` a function that returns FPC scores for new data
#' and given eigenfunctions, see `tf:::.fpc_wsvd_scores` for an example.
#' @author Trevor Hastie, Rahul Mazumder, Cheng Meng, Fabian Scheipl
#' @references code adapted from / inspired by `mogsa::wsvd()` by Cheng Meng
#' and `softImpute::softImpute()` by Trevor Hastie and Rahul Mazumder.\cr
#' `r format_bib("meng2023mogsa")`\cr
#' `r format_bib("mazumder2010")`\cr
#' `r format_bib("softimpute")`
#' @family tfb-class
#' @family tfb_fpc-class
fpc_wsvd <- function(data, arg, pve = 0.995) {
Expand All @@ -22,24 +46,42 @@ fpc_wsvd <- function(data, arg, pve = 0.995) {

#' @rdname fpc_wsvd
#' @importFrom utils head tail
#' @importFrom stats lowess
#' @export
fpc_wsvd.matrix <- function(data, arg, pve = 0.995) {
assert_matrix(data, mode = "numeric", any.missing = FALSE,
min.cols = 2, min.rows = 1)
assert_matrix(data, mode = "numeric", min.cols = 2, min.rows = 1)
assert_numeric(arg, any.missing = FALSE, sorted = TRUE, len = ncol(data))
assert_number(pve, lower = 0, upper = 1)

delta <- c(0, diff(arg))
# trapezoid integration weights:
weights <- 0.5 * c(delta[-1] + head(delta, -1), tail(delta, 1))
mean <- colMeans(data)
mean <- colMeans(data, na.rm = TRUE)

data_wc <- t((t(data) - mean) * sqrt(weights))

pc <- svd(data_wc, nu = 0, nv = min(dim(data)))
nas <- is.na(data)
pc <- if (!any(nas)) {
svd(data_wc, nu = 0, nv = min(dim(data)))
} else {
message("Using softImpute SVD on ", round(mean(nas)*100, 1), "% missing data")
if (pve + mean(nas) > 1) {
warning("High <pve> with many missings likely to yield bad FPC estimates.",
call. = FALSE)
}
simpute_svd(data_wc)
}

pve_observed <- cumsum(pc$d^2) / sum(pc$d^2)
use <- min(which(pve_observed >= pve))

efunctions <- pc$v[, 1:use] / sqrt(weights)

if (any(nas)) {
# slightly smooth efunctions from incomplete data to reduce artefacts
efunctions <- apply(efunctions, 2,
\(ef) stats::lowess(x = arg, y = ef, f = .15)$y)
}
evalues <- (pc$d[1:use])^2
scores <- .fpc_wsvd_scores(data, efunctions, mean, weights) #!!

Expand All @@ -51,9 +93,14 @@ fpc_wsvd.matrix <- function(data, arg, pve = 0.995) {
)
}

#extract for reuse in tf_rebase
.fpc_wsvd_scores <- function(data_matrix, efunctions, mean, weights) {
data_wc <- t((t(data_matrix) - mean) * sqrt(weights))
# extract scoring function for reuse in tf_rebase
# performs simple (weighted) LS fit of data onto the eigenfunctions.
.fpc_wsvd_scores <- function(data_matrix, efunctions, mean, weights) {
w_mat <- matrix(weights, ncol = length(weights), nrow = nrow(data_matrix),
byrow = TRUE)
w_mat[is.na(data_matrix)] <- 0
data_matrix[is.na(data_matrix)] <- 0
data_wc <- t((t(data_matrix) - mean) * sqrt(t(w_mat)))
t(qr.coef(qr(efunctions), t(data_wc) / sqrt(weights)))
}

Expand Down
62 changes: 43 additions & 19 deletions R/tfb-fpc.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ new_tfb_fpc <- function(data, domain = NULL,
stop("Can't specify both method *and* basis_from for new_tfb_fpc",
call. = FALSE)
}
arg <- sort(unique(data$arg))
arg <- mgcv::uniquecombs(data$arg, ordered = TRUE) |> unlist()

domain <- domain %||% range(arg)
if (!isTRUE(all.equal(domain, range(arg)))) {
Expand Down Expand Up @@ -74,20 +74,24 @@ new_tfb_fpc <- function(data, domain = NULL,
#'
#' These functions perform a (functional) principal component analysis (FPCA) of
#' the input data and return an `tfb_fpc` `tf`-object that uses the empirical
#' eigenfunctions as basis functions for representing the data. By default, a
#' simple, not smoothed, truncated weighted SVD of the functions is used to
#' compute those ("`method = fpc_wsvd`"). Note that this is suitable only for
#' regular data all observed on the same (not necessarily equidistant) grid. See
#' Details / Example for possible alternatives and extensions. \cr
#' eigenfunctions as basis functions for representing the data. The default
#' ("`method = fpc_wsvd`") uses a (truncated) weighted SVD for complete
#' data on a common grid and a nuclear-norm regularized (truncated) weighted SVD
#' for partially missing data on a common grid, see [fpc_wsvd()].
#' The latter is likely to break down for high PVE and/or high amounts of
#' missingness.\cr
#'
#' Any "factorization" method that accepts a `data.frame` with
#' For the FPC basis, any factorization method that accepts a `data.frame` with
#' columns `id`, `arg`, `value` containing the functional data and returns a
#' list structured like the return object
#' list with eigenfunctions and FPC scores structured like the return object
#' of [fpc_wsvd()] can be used for the `method`` argument, see example below.
#' Note that the mean function, with a fixed "score" of 1 for all functions,
#' is used as the first basis function for all FPC bases.
#'
#' @export
#' @param method the function to use that computes eigenfunctions and scores.
#' Defaults to [fpc_wsvd()], which gives unsmoothed eigenfunctions.
#' Defaults to [fpc_wsvd()], which is quick and easy but returns completely
#' unsmoothed eigenfunctions unlikely to be suited for noisy data.
#' @param ... arguments to the `method` which computes the
#' (regularized/smoothed) FPCA - see e.g. [fpc_wsvd()].
#' Unless set by the user, uses proportion of variance explained
Expand All @@ -107,19 +111,44 @@ tfb_fpc <- function(data, ...) UseMethod("tfb_fpc")
#' @export
#' @inheritParams tfd.data.frame
#' @examples
#' # Apply FPCA for sparse data using refund::fpca.sc:
#' set.seed(13121)
#' x <- tf_rgp(25, nugget = .02)
#' x_pc <- tfb_fpc(x, pve = .9)
#' x_pc
#' plot(x, lwd = 3)
#' lines(x_pc, col = 2, lty = 2)
#' x_pc_full <- tfb_fpc(x, pve = .995)
#' x_pc_full
#' lines(x_pc_full, col = 3, lty = 2)
#' # partially missing data on common grid:
#' x_mis <- x |> tf_sparsify(dropout = .05)
#' x_pc_mis <- tfb_fpc(x_mis, pve = .9)
#' x_pc_mis
#' plot(x_mis, lwd = 3)
#' lines(x_pc_mis, col = 4, lty = 2)
#' # extract FPC basis --
#' # first "eigenvector" in black is (always) the mean function
#' x_pc |> tf_basis(as_tfd = TRUE) |> plot(col = 1:5)
#' \donttest{
#' # Apply FPCA for sparse, irregular data using refund::fpca.sc:
#' set.seed(99290)
#' # create sparse data:
#' data <- tf_rgp(15) |>
#' tf_sparsify() |>
#' # create sparse, irregular data:
#' x_irreg <- x |>
#' tf_jiggle() |> tf_sparsify(dropout = 0.3)
#' plot(x_irreg)
#' x_df <- x_irreg |>
#' as.data.frame(unnest = TRUE)
#' # wrap refund::fpca_sc for use as FPCA method in tfb_fpc:
#' fpca_sc_wrapper <- function(data, arg, pve = 0.995, ...) {
#' data_mat <- tf:::df_2_mat(data)
#' fpca <- refund::fpca.sc(
#' Y = data_mat, argvals = attr(data_mat, "arg"), pve = pve, ...
#' )
#' fpca[c("mu", "efunctions", "scores", "npc")]
#' c(fpca[c("mu", "efunctions", "scores", "npc")],
#' scoring_function = tf:::.fpc_wsvd_scores)
#' }
#' x_pc <- tfb_fpc(x_df, method = fpca_sc_wrapper)
#' lines(x_pc, col = 2, lty = 2)
#' }
tfb_fpc.data.frame <- function(data, id = 1, arg = 2, value = 3,
domain = NULL, method = fpc_wsvd, ...) {
Expand All @@ -146,11 +175,6 @@ tfb_fpc.numeric <- function(data, arg = NULL, domain = NULL,
tfb_fpc(data = data, arg = arg, method = method, domain = domain, ...)
}

# #' @rdname tfb_fpc
# #' @export
# tfb_fpc.list <- function(data, arg = NULL, domain = NULL, method = fpc_wsvd,
# TODO

#' @rdname tfb_fpc
#' @export
tfb_fpc.tf <- function(data, arg = NULL, method = fpc_wsvd, ...) {
Expand Down
Loading

0 comments on commit 188b9a9

Please sign in to comment.