Skip to content

Commit

Permalink
Utilize vegan::rrarefy
Browse files Browse the repository at this point in the history
  • Loading branch information
TuomasBorman committed Jan 8, 2025
1 parent 2dc3430 commit ef406c7
Show file tree
Hide file tree
Showing 5 changed files with 145 additions and 93 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -420,5 +420,6 @@ importFrom(vegan,dbrda)
importFrom(vegan,eigenvals)
importFrom(vegan,monoMDS)
importFrom(vegan,permutest)
importFrom(vegan,rrarefy)
importFrom(vegan,scores)
importFrom(vegan,vegdist)
162 changes: 100 additions & 62 deletions R/rarefyAssay.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,12 @@
#'
#' To maintain the reproducibility, please define the seed using set.seed()
#' before implement this function.
#'
#'
#' When \code{replace = FALSE}, the function uses internally
#' \code{vegan::rarefy} while with replacement enabled the function utilizes
#' own implementation, inspired by \code{phyloseq::rarefy_even_depth}.
#'
#' #
#' @inheritParams transformAssay
#' @inheritParams getDominant
#'
Expand All @@ -28,15 +33,15 @@
#'
#' @param min_size Deprecated. Use \code{sample} instead.
#'
#' @param replace \code{Logical scalar}. The default is with
#' replacement (\code{replace=TRUE}).
#' See \code{\link[phyloseq:rarefy_even_depth]{phyloseq::rarefy_even_depth}}
#' for details on implications of this parameter. (Default: \code{TRUE})
#'
#' @param verbose \code{Logical scalar}. Choose whether to show messages.
#' (Default: \code{TRUE})
#' @param replace \code{Logical scalar}. Whether to åperform subsampling with
#' replacement. Ths works similarly to \code{sample(..., replace = TRUE)}.
#' (Default: \code{FALSE})
#'
#' @param ... additional arguments not used
#' @param ... optional arguments:
#' \itemize{
#' \item \code{verbose}: \code{Logical scalar}. Choose whether to show
#' messages. (Default: \code{TRUE})
#' }
#'
#' @references
#' McMurdie PJ, Holmes S. Waste not, want not: why rarefying microbiome data
Expand Down Expand Up @@ -67,14 +72,20 @@
#' dim(tse)
#' dim(assay(tse_subsampled, "subsampled"))
#'
#' @seealso
#' \itemize{
#' \item \code{\link[vegan:rrarefy]{vegan::rrarefy}}
#' \item \code{\link[phyloseq:rarefy_even_depth]{phyloseq::rarefy_even_depth}}
#' }
#'
NULL

#' @rdname rarefyAssay
#' @export
setGeneric("rarefyAssay", signature = c("x"), function(
x, assay.type = assay_name, assay_name = "counts",
sample = min_size, min_size = min(colSums2(assay(x))),
replace = TRUE, name = "subsampled", verbose = TRUE, ...)
sample = min_size, min_size = min(colSums2(assay(x, assay.type))),
replace = FALSE, name = "subsampled", ...)
standardGeneric("rarefyAssay"))

#' @importFrom SummarizedExperiment assay assay<-
Expand All @@ -84,22 +95,20 @@ setGeneric("rarefyAssay", signature = c("x"), function(
setMethod("rarefyAssay", signature = c(x = "SummarizedExperiment"),
function(x, assay.type = assay_name, assay_name = "counts",
sample = min_size, min_size = min(colSums2(assay(x, assay.type))),
replace = TRUE, name = "subsampled", verbose = TRUE, ...){
replace = FALSE, name = "subsampled", ...){
# Input check
# Check that assay name is correct and that assay is counts table.
.check_assay_present(assay.type, x)
if( any(assay(x, assay.type) %% 1 != 0) ){
warning("assay contains non-integer values. Only counts table ",
"is applicable...", call. = FALSE)
if( any(assay(x, assay.type) %% 1 != 0 &
!is.na(assay(x, assay.type)) ) ){
stop("assay contains non-integer values. Only counts table ",
"is applicable.", call. = FALSE)
}
if(any(assay(x, assay.type) < 0)){
if( any(assay(x, assay.type) < 0 & !is.na(assay(x, assay.type))) ){
stop("assay contains strictly-negative values. Only counts ",
"table is applicable...", call. = FALSE)
}
# Check that verbose and replace are boolean values
if( !.is_a_bool(verbose) ){
stop("'verbose' must be TRUE or FALSE.", call. = FALSE)
}
# Check that replace is boolean values
if( !.is_a_bool(replace) ){
stop("`replace` must be TRUE or FALSE.", call. = FALSE)
}
Expand All @@ -109,61 +118,90 @@ setMethod("rarefyAssay", signature = c(x = "SummarizedExperiment"),
"different from 'assay.type'.", call. = FALSE)
}
# Check sample. It must be single positive integer value.
if(!is.numeric(sample) || length(sample) != 1 ||
if( is.na(sample) || !is.numeric(sample) || length(sample) != 1 ||
sample %% 1 != 0 && sample <= 0 ){
stop("'sample' needs to be a positive integer value.",
call. = FALSE)
}
# Input check end

# 'sample' determines the number of reads subsampled from samples.
# This means that every samples should have at least 'sample' of reads.
# If they do not have, drop those samples at this point.
# Get those sample names that we are going to remove due to too
# small number of reads.
rm_samples <- colSums2(assay(x, assay.type)) < sample
if( any(rm_samples) ){
# Remove sample(s) from TreeSE (or keep rest of the samples)
x <- x[ , !rm_samples, drop = FALSE]
# Return NULL, if no samples were found after subsampling
if( ncol(x) == 0 ){
stop("No samples were found after subsampling. Consider ",
"lower 'sample'.", call. = FALSE)
}
# Give message which samples were removed
if( verbose ){
message(
sum(rm_samples), " samples removed because they contained ",
"fewer reads than `sample`.")
}
}
# Subsample specified assay.
mat <- apply(
assay(x, assay.type), 2,
.subsample_assay, sample = sample, replace = replace)
# Add rownames to new assay. The returned value from .subsample_assay
# is a vector that do not have feature names.
rownames(mat) <- rownames(x)
# remove features not present in any samples after subsampling
feat_inc <- rowSums2(mat, na.rm = TRUE) > 0
mat <- mat[feat_inc, ]
# Give message if some features were dropped
if( verbose && any(!feat_inc) ){
message(
sum(!feat_inc), " features removed because they are not ",
"present in all samples after subsampling."
)
}
# Remove samples that do not have enoguh counts
x <- .remove_samples_below_counts_th(x, assay.type, sample, ...)
# Subsample specified assay
mat <- .get_subsamples_matrix(x, assay.type, sample, replace, ...)
# Subset the TreeSE based on new feature-set
x <- x[rownames(mat), ]
# Add new assay to TreeSE
assay(x, name, withDimnames = FALSE) <- mat
# Add info on sample to metadata
x <- .add_values_to_metadata(x, "rarefyAssay_sample", min_size)
return(x)
}
)

# 'sample' determines the number of reads subsampled from samples.
# This means that every samples should have at least 'sample' of reads.
# If they do not have, drop those samples at this point.
# Get those sample names that we are going to remove due to too
# small number of reads.
.remove_samples_below_counts_th <- function(
x, assay.type, sample, verbose = TRUE, ...){
#
if( !.is_a_bool(verbose) ){
stop("'verbose' must be TRUE or FALSE.", call. = FALSE)
}
#
rm_samples <- colSums2(assay(x, assay.type), na.rm = TRUE) < sample
if( any(rm_samples) ){
# Remove sample(s) from TreeSE (or keep rest of the samples)
x <- x[ , !rm_samples, drop = FALSE]
# Return NULL, if no samples were found after subsampling
if( ncol(x) == 0 ){
stop("No samples were found after subsampling. Consider ",
"lower 'sample'.", call. = FALSE)
}
# Give message which samples were removed
if( verbose ){
message(
sum(rm_samples), " samples removed because they contained ",
"fewer reads than `sample`.")
}
}
return(x)
}

# This function gets abundance table as input and returns subsampled matrix.
#' @importFrom vegan rrarefy
.get_subsamples_matrix <- function(
x, assay.type, sample, replace, verbose = TRUE, ...){
#
if( !.is_a_bool(verbose) ){
stop("'verbose' must be TRUE or FALSE.", call. = FALSE)
}
#
# We use vegan::rrarefy by default. However, as it does not support
# replace=TRUE, we use our own implementation in this case.
mat <- assay(x, assay.type)
if( replace ){
# Loop throguh samples and subsample samples one-by-one
mat <- apply(
mat, 2, .subsample_assay, sample = sample, replace = replace)
rownames(mat) <- rownames(x)
} else{
mat <- t( vegan::rrarefy(t(mat), sample) )
}
# remove features not present in any samples after subsampling
feat_inc <- rowSums2(mat, na.rm = TRUE) > 0
mat <- mat[feat_inc, ]
# Give message if some features were dropped
if( verbose && any(!feat_inc) ){
message(
sum(!feat_inc), " features removed because they are not ",
"present in any of the samples after subsampling."
)
}
# Add info on sample to attributes
attr(mat, "subsample") <- sample
return(mat)
}

## Modified Sub sampling function from phyloseq internals
.subsample_assay <- function(x, sample, replace){
# Create replacement species vector
Expand Down
3 changes: 0 additions & 3 deletions man/mergeSEs.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

44 changes: 27 additions & 17 deletions man/rarefyAssay.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 17 additions & 11 deletions tests/testthat/test-8subsample.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,28 +36,34 @@ test_that("rarefyAssay", {
expect_equal(unname(colSums2(assay(tse.subsampled, "subsampled"))), expColSums)

# When replace = FALSE
sample <- 60000
seed = 1938
set.seed(seed)
tse.subsampled.rp <- rarefyAssay(
GlobalPatterns,
sample = 60000,
sample = sample,
name = "subsampled",
replace = FALSE)

# check number of features removed is correct
expnFeaturesRemovedRp <- 6731
obsnFeaturesRemovedRp <- nrow(GlobalPatterns) - nrow(tse.subsampled.rp)
expect_equal(obsnFeaturesRemovedRp, expnFeaturesRemovedRp)
set.seed(seed)
mat <- assay(GlobalPatterns, "counts")
# When replace = FALSE, the function utilize vegan::rrarefy
suppressWarnings(
sub_mat <- t( vegan::rrarefy(t(mat), sample = sample) )
)
# The function removes those samples whose library size is lower than
# 'sample'
sub_mat <- sub_mat[, colSums(mat) > sample]
# Moreover, it removes those features that are not present in any of the
# samples
sub_mat <- sub_mat[rowSums(sub_mat) > 0, ]
expect_equal(assay(tse.subsampled.rp, "subsampled"), sub_mat, check.attributes = FALSE)

# check if all samples subsampled to even depth
expColSumsRp <- rep(60000, 25)
expColSumsRp <- rep(sample, 25)
expect_equal(unname(colSums2(assay(tse.subsampled.rp, "subsampled"))), expColSumsRp)

# check if same Features removed
obsFeaturesRemovedRp <- rownames(GlobalPatterns)[!rownames(GlobalPatterns) %in% rownames(tse.subsampled.rp)]

expFeaturesRemovedRP <- c("522457","951","586076","244960","215972",
"31759","30678","138353","406058","1126")

expect_equal(obsFeaturesRemovedRp[1:10], expFeaturesRemovedRP)
expect_true( all(rownames(tse.subsampled.rp) == rownames(sub_mat)) )
})

0 comments on commit ef406c7

Please sign in to comment.