Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Utilize vegan::rrarefy #670

Open
wants to merge 2 commits into
base: devel
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -420,5 +420,6 @@ importFrom(vegan,dbrda)
importFrom(vegan,eigenvals)
importFrom(vegan,monoMDS)
importFrom(vegan,permutest)
importFrom(vegan,rrarefy)
importFrom(vegan,scores)
importFrom(vegan,vegdist)
161 changes: 99 additions & 62 deletions R/rarefyAssay.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,11 @@
#'
#' To maintain the reproducibility, please define the seed using set.seed()
#' before implement this function.
#'
#'
#' When \code{replace = FALSE}, the function uses internally
#' \code{vegan::rarefy} while with replacement enabled the function utilizes
#' own implementation, inspired by \code{phyloseq::rarefy_even_depth}.
#'
#' @inheritParams transformAssay
#' @inheritParams getDominant
#'
Expand All @@ -28,15 +32,15 @@
#'
#' @param min_size Deprecated. Use \code{sample} instead.
#'
#' @param replace \code{Logical scalar}. The default is with
#' replacement (\code{replace=TRUE}).
#' See \code{\link[phyloseq:rarefy_even_depth]{phyloseq::rarefy_even_depth}}
#' for details on implications of this parameter. (Default: \code{TRUE})
#'
#' @param verbose \code{Logical scalar}. Choose whether to show messages.
#' (Default: \code{TRUE})
#' @param replace \code{Logical scalar}. Whether to åperform subsampling with
#' replacement. Ths works similarly to \code{sample(..., replace = TRUE)}.
#' (Default: \code{FALSE})
#'
#' @param ... additional arguments not used
#' @param ... optional arguments:
#' \itemize{
#' \item \code{verbose}: \code{Logical scalar}. Choose whether to show
#' messages. (Default: \code{TRUE})
#' }
#'
#' @references
#' McMurdie PJ, Holmes S. Waste not, want not: why rarefying microbiome data
Expand Down Expand Up @@ -67,14 +71,20 @@
#' dim(tse)
#' dim(assay(tse_subsampled, "subsampled"))
#'
#' @seealso
#' \itemize{
#' \item \code{\link[vegan:rrarefy]{vegan::rrarefy}}
#' \item \code{\link[phyloseq:rarefy_even_depth]{phyloseq::rarefy_even_depth}}
#' }
#'
NULL

#' @rdname rarefyAssay
#' @export
setGeneric("rarefyAssay", signature = c("x"), function(
x, assay.type = assay_name, assay_name = "counts",
sample = min_size, min_size = min(colSums2(assay(x))),
replace = TRUE, name = "subsampled", verbose = TRUE, ...)
sample = min_size, min_size = min(colSums2(assay(x, assay.type))),
replace = FALSE, name = "subsampled", ...)
standardGeneric("rarefyAssay"))

#' @importFrom SummarizedExperiment assay assay<-
Expand All @@ -84,22 +94,20 @@ setGeneric("rarefyAssay", signature = c("x"), function(
setMethod("rarefyAssay", signature = c(x = "SummarizedExperiment"),
function(x, assay.type = assay_name, assay_name = "counts",
sample = min_size, min_size = min(colSums2(assay(x, assay.type))),
replace = TRUE, name = "subsampled", verbose = TRUE, ...){
replace = FALSE, name = "subsampled", ...){
# Input check
# Check that assay name is correct and that assay is counts table.
.check_assay_present(assay.type, x)
if( any(assay(x, assay.type) %% 1 != 0) ){
warning("assay contains non-integer values. Only counts table ",
"is applicable...", call. = FALSE)
if( any(assay(x, assay.type) %% 1 != 0 &
!is.na(assay(x, assay.type)) ) ){
stop("assay contains non-integer values. Only counts table ",
"is applicable.", call. = FALSE)
}
if(any(assay(x, assay.type) < 0)){
if( any(assay(x, assay.type) < 0 & !is.na(assay(x, assay.type))) ){
stop("assay contains strictly-negative values. Only counts ",
"table is applicable...", call. = FALSE)
}
# Check that verbose and replace are boolean values
if( !.is_a_bool(verbose) ){
stop("'verbose' must be TRUE or FALSE.", call. = FALSE)
}
# Check that replace is boolean values
if( !.is_a_bool(replace) ){
stop("`replace` must be TRUE or FALSE.", call. = FALSE)
}
Expand All @@ -109,61 +117,90 @@ setMethod("rarefyAssay", signature = c(x = "SummarizedExperiment"),
"different from 'assay.type'.", call. = FALSE)
}
# Check sample. It must be single positive integer value.
if(!is.numeric(sample) || length(sample) != 1 ||
if( is.na(sample) || !is.numeric(sample) || length(sample) != 1 ||
sample %% 1 != 0 && sample <= 0 ){
stop("'sample' needs to be a positive integer value.",
call. = FALSE)
}
# Input check end

# 'sample' determines the number of reads subsampled from samples.
# This means that every samples should have at least 'sample' of reads.
# If they do not have, drop those samples at this point.
# Get those sample names that we are going to remove due to too
# small number of reads.
rm_samples <- colSums2(assay(x, assay.type)) < sample
if( any(rm_samples) ){
# Remove sample(s) from TreeSE (or keep rest of the samples)
x <- x[ , !rm_samples, drop = FALSE]
# Return NULL, if no samples were found after subsampling
if( ncol(x) == 0 ){
stop("No samples were found after subsampling. Consider ",
"lower 'sample'.", call. = FALSE)
}
# Give message which samples were removed
if( verbose ){
message(
sum(rm_samples), " samples removed because they contained ",
"fewer reads than `sample`.")
}
}
# Subsample specified assay.
mat <- apply(
assay(x, assay.type), 2,
.subsample_assay, sample = sample, replace = replace)
# Add rownames to new assay. The returned value from .subsample_assay
# is a vector that do not have feature names.
rownames(mat) <- rownames(x)
# remove features not present in any samples after subsampling
feat_inc <- rowSums2(mat, na.rm = TRUE) > 0
mat <- mat[feat_inc, ]
# Give message if some features were dropped
if( verbose && any(!feat_inc) ){
message(
sum(!feat_inc), " features removed because they are not ",
"present in all samples after subsampling."
)
}
# Remove samples that do not have enoguh counts
x <- .remove_samples_below_counts_th(x, assay.type, sample, ...)
# Subsample specified assay
mat <- .get_subsamples_matrix(x, assay.type, sample, replace, ...)
# Subset the TreeSE based on new feature-set
x <- x[rownames(mat), ]
# Add new assay to TreeSE
assay(x, name, withDimnames = FALSE) <- mat
# Add info on sample to metadata
x <- .add_values_to_metadata(x, "rarefyAssay_sample", min_size)
return(x)
}
)

# 'sample' determines the number of reads subsampled from samples.
# This means that every samples should have at least 'sample' of reads.
# If they do not have, drop those samples at this point.
# Get those sample names that we are going to remove due to too
# small number of reads.
.remove_samples_below_counts_th <- function(
x, assay.type, sample, verbose = TRUE, ...){
#
if( !.is_a_bool(verbose) ){
stop("'verbose' must be TRUE or FALSE.", call. = FALSE)
}
#
rm_samples <- colSums2(assay(x, assay.type), na.rm = TRUE) < sample
if( any(rm_samples) ){
# Remove sample(s) from TreeSE (or keep rest of the samples)
x <- x[ , !rm_samples, drop = FALSE]
# Return NULL, if no samples were found after subsampling
if( ncol(x) == 0 ){
stop("No samples were found after subsampling. Consider ",
"lower 'sample'.", call. = FALSE)
}
# Give message which samples were removed
if( verbose ){
message(
sum(rm_samples), " samples removed because they contained ",
"fewer reads than `sample`.")
}
}
return(x)
}

# This function gets abundance table as input and returns subsampled matrix.
#' @importFrom vegan rrarefy
.get_subsamples_matrix <- function(
x, assay.type, sample, replace, verbose = TRUE, ...){
#
if( !.is_a_bool(verbose) ){
stop("'verbose' must be TRUE or FALSE.", call. = FALSE)
}
#
# We use vegan::rrarefy by default. However, as it does not support
# replace=TRUE, we use our own implementation in this case.
mat <- assay(x, assay.type)
if( replace ){
# Loop throguh samples and subsample samples one-by-one
mat <- apply(
mat, 2, .subsample_assay, sample = sample, replace = replace)
rownames(mat) <- rownames(x)
} else{
mat <- t( vegan::rrarefy(t(mat), sample) )
}
# remove features not present in any samples after subsampling
feat_inc <- rowSums2(mat, na.rm = TRUE) > 0
mat <- mat[feat_inc, ]
# Give message if some features were dropped
if( verbose && any(!feat_inc) ){
message(
sum(!feat_inc), " features removed because they are not ",
"present in any of the samples after subsampling."
)
}
# Add info on sample to attributes
attr(mat, "subsample") <- sample
return(mat)
}

## Modified Sub sampling function from phyloseq internals
.subsample_assay <- function(x, sample, replace){
# Create replacement species vector
Expand Down
3 changes: 0 additions & 3 deletions man/mergeSEs.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

34 changes: 21 additions & 13 deletions man/rarefyAssay.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 17 additions & 11 deletions tests/testthat/test-8subsample.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,28 +36,34 @@ test_that("rarefyAssay", {
expect_equal(unname(colSums2(assay(tse.subsampled, "subsampled"))), expColSums)

# When replace = FALSE
sample <- 60000
seed = 1938
set.seed(seed)
tse.subsampled.rp <- rarefyAssay(
GlobalPatterns,
sample = 60000,
sample = sample,
name = "subsampled",
replace = FALSE)

# check number of features removed is correct
expnFeaturesRemovedRp <- 6731
obsnFeaturesRemovedRp <- nrow(GlobalPatterns) - nrow(tse.subsampled.rp)
expect_equal(obsnFeaturesRemovedRp, expnFeaturesRemovedRp)
set.seed(seed)
mat <- assay(GlobalPatterns, "counts")
# When replace = FALSE, the function utilize vegan::rrarefy
suppressWarnings(
sub_mat <- t( vegan::rrarefy(t(mat), sample = sample) )
)
# The function removes those samples whose library size is lower than
# 'sample'
sub_mat <- sub_mat[, colSums(mat) > sample]
# Moreover, it removes those features that are not present in any of the
# samples
sub_mat <- sub_mat[rowSums(sub_mat) > 0, ]
expect_equal(assay(tse.subsampled.rp, "subsampled"), sub_mat, check.attributes = FALSE)

# check if all samples subsampled to even depth
expColSumsRp <- rep(60000, 25)
expColSumsRp <- rep(sample, 25)
expect_equal(unname(colSums2(assay(tse.subsampled.rp, "subsampled"))), expColSumsRp)

# check if same Features removed
obsFeaturesRemovedRp <- rownames(GlobalPatterns)[!rownames(GlobalPatterns) %in% rownames(tse.subsampled.rp)]

expFeaturesRemovedRP <- c("522457","951","586076","244960","215972",
"31759","30678","138353","406058","1126")

expect_equal(obsFeaturesRemovedRp[1:10], expFeaturesRemovedRP)
expect_true( all(rownames(tse.subsampled.rp) == rownames(sub_mat)) )
})
Loading