From ef406c711cefb5fa0d92f0444f1f65218180c201 Mon Sep 17 00:00:00 2001 From: TuomasBorman Date: Wed, 8 Jan 2025 15:02:49 +0200 Subject: [PATCH] Utilize vegan::rrarefy --- NAMESPACE | 1 + R/rarefyAssay.R | 162 +++++++++++++++++++------------ man/mergeSEs.Rd | 3 - man/rarefyAssay.Rd | 44 +++++---- tests/testthat/test-8subsample.R | 28 +++--- 5 files changed, 145 insertions(+), 93 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 771585489..0366bc059 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -420,5 +420,6 @@ importFrom(vegan,dbrda) importFrom(vegan,eigenvals) importFrom(vegan,monoMDS) importFrom(vegan,permutest) +importFrom(vegan,rrarefy) importFrom(vegan,scores) importFrom(vegan,vegdist) diff --git a/R/rarefyAssay.R b/R/rarefyAssay.R index 1c5a152a9..c36fbcda1 100644 --- a/R/rarefyAssay.R +++ b/R/rarefyAssay.R @@ -18,7 +18,12 @@ #' #' To maintain the reproducibility, please define the seed using set.seed() #' before implement this function. -#' +#' +#' When \code{replace = FALSE}, the function uses internally +#' \code{vegan::rarefy} while with replacement enabled the function utilizes +#' own implementation, inspired by \code{phyloseq::rarefy_even_depth}. +#' +#' # #' @inheritParams transformAssay #' @inheritParams getDominant #' @@ -28,15 +33,15 @@ #' #' @param min_size Deprecated. Use \code{sample} instead. #' -#' @param replace \code{Logical scalar}. The default is with -#' replacement (\code{replace=TRUE}). -#' See \code{\link[phyloseq:rarefy_even_depth]{phyloseq::rarefy_even_depth}} -#' for details on implications of this parameter. (Default: \code{TRUE}) -#' -#' @param verbose \code{Logical scalar}. Choose whether to show messages. -#' (Default: \code{TRUE}) +#' @param replace \code{Logical scalar}. Whether to åperform subsampling with +#' replacement. Ths works similarly to \code{sample(..., replace = TRUE)}. +#' (Default: \code{FALSE}) #' -#' @param ... additional arguments not used +#' @param ... optional arguments: +#' \itemize{ +#' \item \code{verbose}: \code{Logical scalar}. Choose whether to show +#' messages. (Default: \code{TRUE}) +#' } #' #' @references #' McMurdie PJ, Holmes S. Waste not, want not: why rarefying microbiome data @@ -67,14 +72,20 @@ #' dim(tse) #' dim(assay(tse_subsampled, "subsampled")) #' +#' @seealso +#' \itemize{ +#' \item \code{\link[vegan:rrarefy]{vegan::rrarefy}} +#' \item \code{\link[phyloseq:rarefy_even_depth]{phyloseq::rarefy_even_depth}} +#' } +#' NULL #' @rdname rarefyAssay #' @export setGeneric("rarefyAssay", signature = c("x"), function( x, assay.type = assay_name, assay_name = "counts", - sample = min_size, min_size = min(colSums2(assay(x))), - replace = TRUE, name = "subsampled", verbose = TRUE, ...) + sample = min_size, min_size = min(colSums2(assay(x, assay.type))), + replace = FALSE, name = "subsampled", ...) standardGeneric("rarefyAssay")) #' @importFrom SummarizedExperiment assay assay<- @@ -84,22 +95,20 @@ setGeneric("rarefyAssay", signature = c("x"), function( setMethod("rarefyAssay", signature = c(x = "SummarizedExperiment"), function(x, assay.type = assay_name, assay_name = "counts", sample = min_size, min_size = min(colSums2(assay(x, assay.type))), - replace = TRUE, name = "subsampled", verbose = TRUE, ...){ + replace = FALSE, name = "subsampled", ...){ # Input check # Check that assay name is correct and that assay is counts table. .check_assay_present(assay.type, x) - if( any(assay(x, assay.type) %% 1 != 0) ){ - warning("assay contains non-integer values. Only counts table ", - "is applicable...", call. = FALSE) + if( any(assay(x, assay.type) %% 1 != 0 & + !is.na(assay(x, assay.type)) ) ){ + stop("assay contains non-integer values. Only counts table ", + "is applicable.", call. = FALSE) } - if(any(assay(x, assay.type) < 0)){ + if( any(assay(x, assay.type) < 0 & !is.na(assay(x, assay.type))) ){ stop("assay contains strictly-negative values. Only counts ", "table is applicable...", call. = FALSE) } - # Check that verbose and replace are boolean values - if( !.is_a_bool(verbose) ){ - stop("'verbose' must be TRUE or FALSE.", call. = FALSE) - } + # Check that replace is boolean values if( !.is_a_bool(replace) ){ stop("`replace` must be TRUE or FALSE.", call. = FALSE) } @@ -109,61 +118,90 @@ setMethod("rarefyAssay", signature = c(x = "SummarizedExperiment"), "different from 'assay.type'.", call. = FALSE) } # Check sample. It must be single positive integer value. - if(!is.numeric(sample) || length(sample) != 1 || + if( is.na(sample) || !is.numeric(sample) || length(sample) != 1 || sample %% 1 != 0 && sample <= 0 ){ stop("'sample' needs to be a positive integer value.", call. = FALSE) } # Input check end - - # 'sample' determines the number of reads subsampled from samples. - # This means that every samples should have at least 'sample' of reads. - # If they do not have, drop those samples at this point. - # Get those sample names that we are going to remove due to too - # small number of reads. - rm_samples <- colSums2(assay(x, assay.type)) < sample - if( any(rm_samples) ){ - # Remove sample(s) from TreeSE (or keep rest of the samples) - x <- x[ , !rm_samples, drop = FALSE] - # Return NULL, if no samples were found after subsampling - if( ncol(x) == 0 ){ - stop("No samples were found after subsampling. Consider ", - "lower 'sample'.", call. = FALSE) - } - # Give message which samples were removed - if( verbose ){ - message( - sum(rm_samples), " samples removed because they contained ", - "fewer reads than `sample`.") - } - } - # Subsample specified assay. - mat <- apply( - assay(x, assay.type), 2, - .subsample_assay, sample = sample, replace = replace) - # Add rownames to new assay. The returned value from .subsample_assay - # is a vector that do not have feature names. - rownames(mat) <- rownames(x) - # remove features not present in any samples after subsampling - feat_inc <- rowSums2(mat, na.rm = TRUE) > 0 - mat <- mat[feat_inc, ] - # Give message if some features were dropped - if( verbose && any(!feat_inc) ){ - message( - sum(!feat_inc), " features removed because they are not ", - "present in all samples after subsampling." - ) - } + # Remove samples that do not have enoguh counts + x <- .remove_samples_below_counts_th(x, assay.type, sample, ...) + # Subsample specified assay + mat <- .get_subsamples_matrix(x, assay.type, sample, replace, ...) # Subset the TreeSE based on new feature-set x <- x[rownames(mat), ] # Add new assay to TreeSE assay(x, name, withDimnames = FALSE) <- mat - # Add info on sample to metadata - x <- .add_values_to_metadata(x, "rarefyAssay_sample", min_size) return(x) } ) +# 'sample' determines the number of reads subsampled from samples. +# This means that every samples should have at least 'sample' of reads. +# If they do not have, drop those samples at this point. +# Get those sample names that we are going to remove due to too +# small number of reads. +.remove_samples_below_counts_th <- function( + x, assay.type, sample, verbose = TRUE, ...){ + # + if( !.is_a_bool(verbose) ){ + stop("'verbose' must be TRUE or FALSE.", call. = FALSE) + } + # + rm_samples <- colSums2(assay(x, assay.type), na.rm = TRUE) < sample + if( any(rm_samples) ){ + # Remove sample(s) from TreeSE (or keep rest of the samples) + x <- x[ , !rm_samples, drop = FALSE] + # Return NULL, if no samples were found after subsampling + if( ncol(x) == 0 ){ + stop("No samples were found after subsampling. Consider ", + "lower 'sample'.", call. = FALSE) + } + # Give message which samples were removed + if( verbose ){ + message( + sum(rm_samples), " samples removed because they contained ", + "fewer reads than `sample`.") + } + } + return(x) +} + +# This function gets abundance table as input and returns subsampled matrix. +#' @importFrom vegan rrarefy +.get_subsamples_matrix <- function( + x, assay.type, sample, replace, verbose = TRUE, ...){ + # + if( !.is_a_bool(verbose) ){ + stop("'verbose' must be TRUE or FALSE.", call. = FALSE) + } + # + # We use vegan::rrarefy by default. However, as it does not support + # replace=TRUE, we use our own implementation in this case. + mat <- assay(x, assay.type) + if( replace ){ + # Loop throguh samples and subsample samples one-by-one + mat <- apply( + mat, 2, .subsample_assay, sample = sample, replace = replace) + rownames(mat) <- rownames(x) + } else{ + mat <- t( vegan::rrarefy(t(mat), sample) ) + } + # remove features not present in any samples after subsampling + feat_inc <- rowSums2(mat, na.rm = TRUE) > 0 + mat <- mat[feat_inc, ] + # Give message if some features were dropped + if( verbose && any(!feat_inc) ){ + message( + sum(!feat_inc), " features removed because they are not ", + "present in any of the samples after subsampling." + ) + } + # Add info on sample to attributes + attr(mat, "subsample") <- sample + return(mat) +} + ## Modified Sub sampling function from phyloseq internals .subsample_assay <- function(x, sample, replace){ # Create replacement species vector diff --git a/man/mergeSEs.Rd b/man/mergeSEs.Rd index 5b76bd0b6..8a98ea721 100644 --- a/man/mergeSEs.Rd +++ b/man/mergeSEs.Rd @@ -63,9 +63,6 @@ taxonomy information along with OTU number matches. \item{collapse_features}{Deprecated. Use \code{collapse.rows} instead.} -\item{verbose}{\code{Logical scalar}. Choose whether to show messages. -(Default: \code{TRUE})} - \item{y}{a \code{\link{SummarizedExperiment}} object when \code{x} is a \code{\link{SummarizedExperiment}} object. Disabled when \code{x} is a list.} } diff --git a/man/rarefyAssay.Rd b/man/rarefyAssay.Rd index 5ffb2a3bc..997f193c6 100644 --- a/man/rarefyAssay.Rd +++ b/man/rarefyAssay.Rd @@ -10,10 +10,9 @@ rarefyAssay( assay.type = assay_name, assay_name = "counts", sample = min_size, - min_size = min(colSums2(assay(x))), - replace = TRUE, + min_size = min(colSums2(assay(x, assay.type))), + replace = FALSE, name = "subsampled", - verbose = TRUE, ... ) @@ -23,9 +22,8 @@ rarefyAssay( assay_name = "counts", sample = min_size, min_size = min(colSums2(assay(x, assay.type))), - replace = TRUE, + replace = FALSE, name = "subsampled", - verbose = TRUE, ... ) } @@ -43,18 +41,18 @@ counts found in a sample or a user specified number.} \item{min_size}{Deprecated. Use \code{sample} instead.} -\item{replace}{\code{Logical scalar}. The default is with -replacement (\code{replace=TRUE}). -See \code{\link[phyloseq:rarefy_even_depth]{phyloseq::rarefy_even_depth}} -for details on implications of this parameter. (Default: \code{TRUE})} +\item{replace}{\code{Logical scalar}. Whether to åperform subsampling with +replacement. Ths works similarly to \code{sample(..., replace = TRUE)}. +(Default: \code{FALSE})} \item{name}{\code{Character scalar}. A name for the column of the \code{colData} where results will be stored. (Default: \code{"method"})} -\item{verbose}{\code{Logical scalar}. Choose whether to show messages. -(Default: \code{TRUE})} - -\item{...}{additional arguments not used} +\item{...}{optional arguments: +\itemize{ +\item \code{verbose}: \code{Logical scalar}. Choose whether to show +messages. (Default: \code{TRUE}) +}} } \value{ \code{rarefyAssay} return \code{x} with subsampled data. @@ -66,18 +64,24 @@ for details on implications of this parameter. (Default: \code{TRUE})} subsampled assay. } \details{ -Although the subsampling approach is highly debated in microbiome research, -we include the \code{rarefyAssay} function because there may be some +Although the subsampling approach is highly debated in microbiome research, +we include the \code{rarefyAssay} function because there may be some instances where it can be useful. -Note that the output of \code{rarefyAssay} is not the equivalent as the +Note that the output of \code{rarefyAssay} is not the equivalent as the input and any result have to be verified with the original dataset. Subsampling/Rarefying may undermine downstream analyses and have unintended consequences. Therefore, make sure this normalization is appropriate for your data. -To maintain the reproducibility, please define the seed using set.seed() +To maintain the reproducibility, please define the seed using set.seed() before implement this function. + +When \code{replace = FALSE}, the function uses internally +\code{vegan::rarefy} while with replacement enabled the function utilizes +own implementation, inspired by \code{phyloseq::rarefy_even_depth}. + +# } \examples{ # When samples in TreeSE are less than specified sample, they will be removed. @@ -105,3 +109,9 @@ Zaneveld JR, Vázquez-Baeza Y, Birmingham A, Hyde ER. Normalization and microbial differential abundance strategies depend upon data characteristics. Microbiome. 2017 Dec;5(1):1-8. } +\seealso{ +\itemize{ +\item \code{\link[vegan:rrarefy]{vegan::rrarefy}} +\item \code{\link[phyloseq:rarefy_even_depth]{phyloseq::rarefy_even_depth}} +} +} diff --git a/tests/testthat/test-8subsample.R b/tests/testthat/test-8subsample.R index f600b4d2a..dadabedb6 100644 --- a/tests/testthat/test-8subsample.R +++ b/tests/testthat/test-8subsample.R @@ -36,28 +36,34 @@ test_that("rarefyAssay", { expect_equal(unname(colSums2(assay(tse.subsampled, "subsampled"))), expColSums) # When replace = FALSE + sample <- 60000 seed = 1938 set.seed(seed) tse.subsampled.rp <- rarefyAssay( GlobalPatterns, - sample = 60000, + sample = sample, name = "subsampled", replace = FALSE) # check number of features removed is correct - expnFeaturesRemovedRp <- 6731 - obsnFeaturesRemovedRp <- nrow(GlobalPatterns) - nrow(tse.subsampled.rp) - expect_equal(obsnFeaturesRemovedRp, expnFeaturesRemovedRp) + set.seed(seed) + mat <- assay(GlobalPatterns, "counts") + # When replace = FALSE, the function utilize vegan::rrarefy + suppressWarnings( + sub_mat <- t( vegan::rrarefy(t(mat), sample = sample) ) + ) + # The function removes those samples whose library size is lower than + # 'sample' + sub_mat <- sub_mat[, colSums(mat) > sample] + # Moreover, it removes those features that are not present in any of the + # samples + sub_mat <- sub_mat[rowSums(sub_mat) > 0, ] + expect_equal(assay(tse.subsampled.rp, "subsampled"), sub_mat, check.attributes = FALSE) # check if all samples subsampled to even depth - expColSumsRp <- rep(60000, 25) + expColSumsRp <- rep(sample, 25) expect_equal(unname(colSums2(assay(tse.subsampled.rp, "subsampled"))), expColSumsRp) # check if same Features removed - obsFeaturesRemovedRp <- rownames(GlobalPatterns)[!rownames(GlobalPatterns) %in% rownames(tse.subsampled.rp)] - - expFeaturesRemovedRP <- c("522457","951","586076","244960","215972", - "31759","30678","138353","406058","1126") - - expect_equal(obsFeaturesRemovedRp[1:10], expFeaturesRemovedRP) + expect_true( all(rownames(tse.subsampled.rp) == rownames(sub_mat)) ) })