From ef406c711cefb5fa0d92f0444f1f65218180c201 Mon Sep 17 00:00:00 2001
From: TuomasBorman <tvborm@utu.fi>
Date: Wed, 8 Jan 2025 15:02:49 +0200
Subject: [PATCH 1/6] Utilize vegan::rrarefy

---
 NAMESPACE                        |   1 +
 R/rarefyAssay.R                  | 162 +++++++++++++++++++------------
 man/mergeSEs.Rd                  |   3 -
 man/rarefyAssay.Rd               |  44 +++++----
 tests/testthat/test-8subsample.R |  28 +++---
 5 files changed, 145 insertions(+), 93 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 771585489..0366bc059 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -420,5 +420,6 @@ importFrom(vegan,dbrda)
 importFrom(vegan,eigenvals)
 importFrom(vegan,monoMDS)
 importFrom(vegan,permutest)
+importFrom(vegan,rrarefy)
 importFrom(vegan,scores)
 importFrom(vegan,vegdist)
diff --git a/R/rarefyAssay.R b/R/rarefyAssay.R
index 1c5a152a9..c36fbcda1 100644
--- a/R/rarefyAssay.R
+++ b/R/rarefyAssay.R
@@ -18,7 +18,12 @@
 #'
 #' To maintain the reproducibility, please define the seed using set.seed() 
 #' before implement this function.
-#'
+#' 
+#' When \code{replace = FALSE}, the function uses internally
+#' \code{vegan::rarefy} while with replacement enabled the function utilizes
+#' own implementation, inspired by \code{phyloseq::rarefy_even_depth}.
+#' 
+#' #
 #' @inheritParams transformAssay
 #' @inheritParams getDominant
 #' 
@@ -28,15 +33,15 @@
 #' 
 #' @param min_size Deprecated. Use \code{sample} instead. 
 #'   
-#' @param replace \code{Logical scalar}. The default is with 
-#'   replacement (\code{replace=TRUE}). 
-#'   See \code{\link[phyloseq:rarefy_even_depth]{phyloseq::rarefy_even_depth}}
-#'   for details on implications of this parameter. (Default: \code{TRUE})   
-#' 
-#' @param verbose \code{Logical scalar}. Choose whether to show messages. 
-#' (Default: \code{TRUE})
+#' @param replace \code{Logical scalar}. Whether to åperform subsampling with
+#' replacement. Ths works similarly to \code{sample(..., replace = TRUE)}.
+#' (Default: \code{FALSE})
 #' 
-#' @param ... additional arguments not used
+#' @param ... optional arguments:
+#' \itemize{
+#'   \item \code{verbose}: \code{Logical scalar}. Choose whether to show
+#'   messages. (Default: \code{TRUE})
+#' }
 #' 
 #' @references
 #' McMurdie PJ, Holmes S. Waste not, want not: why rarefying microbiome data 
@@ -67,14 +72,20 @@
 #' dim(tse)
 #' dim(assay(tse_subsampled, "subsampled"))
 #' 
+#' @seealso
+#' \itemize{
+#'   \item \code{\link[vegan:rrarefy]{vegan::rrarefy}}
+#'   \item \code{\link[phyloseq:rarefy_even_depth]{phyloseq::rarefy_even_depth}}
+#' }
+#' 
 NULL
 
 #' @rdname rarefyAssay
 #' @export
 setGeneric("rarefyAssay", signature = c("x"), function(
         x, assay.type = assay_name, assay_name = "counts", 
-        sample = min_size, min_size = min(colSums2(assay(x))),
-        replace = TRUE, name = "subsampled", verbose = TRUE, ...)
+        sample = min_size, min_size = min(colSums2(assay(x, assay.type))),
+        replace = FALSE, name = "subsampled", ...)
             standardGeneric("rarefyAssay"))
 
 #' @importFrom SummarizedExperiment assay assay<-
@@ -84,22 +95,20 @@ setGeneric("rarefyAssay", signature = c("x"), function(
 setMethod("rarefyAssay", signature = c(x = "SummarizedExperiment"),
     function(x, assay.type = assay_name, assay_name = "counts", 
             sample = min_size, min_size = min(colSums2(assay(x, assay.type))),
-            replace = TRUE, name = "subsampled", verbose = TRUE, ...){
+            replace = FALSE, name = "subsampled", ...){
         # Input check
         # Check that assay name is correct and that assay is counts table.
         .check_assay_present(assay.type, x)
-        if( any(assay(x, assay.type) %% 1 != 0) ){
-            warning("assay contains non-integer values. Only counts table ",
-                    "is applicable...", call. = FALSE)
+        if( any(assay(x, assay.type) %% 1 != 0 &
+                !is.na(assay(x, assay.type)) ) ){
+            stop("assay contains non-integer values. Only counts table ",
+                "is applicable.", call. = FALSE)
         }
-        if(any(assay(x, assay.type) < 0)){
+        if( any(assay(x, assay.type) < 0 & !is.na(assay(x, assay.type))) ){
             stop("assay contains strictly-negative values. Only counts ",
                 "table is applicable...", call. = FALSE)
         }
-        # Check that verbose and replace are boolean values
-        if( !.is_a_bool(verbose) ){
-            stop("'verbose' must be TRUE or FALSE.", call. = FALSE)
-        }
+        # Check that replace is boolean values
         if( !.is_a_bool(replace) ){
             stop("`replace` must be TRUE or FALSE.", call. = FALSE)
         } 
@@ -109,61 +118,90 @@ setMethod("rarefyAssay", signature = c(x = "SummarizedExperiment"),
                 "different from 'assay.type'.", call. = FALSE)
         }
         # Check sample. It must be single positive integer value.
-        if(!is.numeric(sample) || length(sample) != 1 ||
+        if( is.na(sample) || !is.numeric(sample) || length(sample) != 1 ||
                 sample %% 1 != 0 && sample <= 0  ){
             stop("'sample' needs to be a positive integer value.",
                 call. = FALSE)
         }
         # Input check end
-        
-        # 'sample' determines the number of reads subsampled from samples.
-        # This means that every samples should have at least 'sample' of reads.
-        # If they do not have, drop those samples at this point.
-        # Get those sample names that we are going to remove due to too
-        # small number of reads.
-        rm_samples <- colSums2(assay(x, assay.type)) < sample
-        if( any(rm_samples) ){
-            # Remove sample(s) from TreeSE (or keep rest of the samples)
-            x <- x[ , !rm_samples, drop = FALSE]
-            # Return NULL, if no samples were found after subsampling
-            if( ncol(x) == 0 ){
-                stop("No samples were found after subsampling. Consider ",
-                    "lower 'sample'.", call. = FALSE)
-            }
-            # Give message which samples were removed
-            if( verbose ){
-                message(
-                    sum(rm_samples), " samples removed because they contained ",
-                    "fewer reads than `sample`.")
-            }
-        }
-        # Subsample specified assay.
-        mat <- apply(
-            assay(x, assay.type), 2,
-            .subsample_assay, sample = sample, replace = replace)
-        # Add rownames to new assay. The returned value from .subsample_assay
-        # is a vector that do not have feature names.
-        rownames(mat) <- rownames(x)
-        # remove features not present in any samples after subsampling
-        feat_inc <- rowSums2(mat, na.rm = TRUE) > 0
-        mat <- mat[feat_inc, ]
-        # Give message if some features were dropped
-        if( verbose && any(!feat_inc) ){
-            message(
-                sum(!feat_inc), " features removed because they are not ",
-                "present in all samples after subsampling."
-                )
-        }
+        # Remove samples that do not have enoguh counts
+        x <- .remove_samples_below_counts_th(x, assay.type, sample, ...)
+        # Subsample specified assay
+        mat <- .get_subsamples_matrix(x, assay.type, sample, replace, ...)
         # Subset the TreeSE based on new feature-set
         x <- x[rownames(mat), ]
         # Add new assay to TreeSE
         assay(x, name, withDimnames = FALSE) <- mat
-        # Add info on sample to metadata
-        x <- .add_values_to_metadata(x, "rarefyAssay_sample", min_size)
         return(x)
     }
 )
 
+# 'sample' determines the number of reads subsampled from samples.
+# This means that every samples should have at least 'sample' of reads.
+# If they do not have, drop those samples at this point.
+# Get those sample names that we are going to remove due to too
+# small number of reads.
+.remove_samples_below_counts_th <- function(
+        x, assay.type, sample, verbose = TRUE, ...){
+    #
+    if( !.is_a_bool(verbose) ){
+        stop("'verbose' must be TRUE or FALSE.", call. = FALSE)
+    }
+    #
+    rm_samples <- colSums2(assay(x, assay.type), na.rm = TRUE) < sample
+    if( any(rm_samples) ){
+        # Remove sample(s) from TreeSE (or keep rest of the samples)
+        x <- x[ , !rm_samples, drop = FALSE]
+        # Return NULL, if no samples were found after subsampling
+        if( ncol(x) == 0 ){
+            stop("No samples were found after subsampling. Consider ",
+                 "lower 'sample'.", call. = FALSE)
+        }
+        # Give message which samples were removed
+        if( verbose ){
+            message(
+                sum(rm_samples), " samples removed because they contained ",
+                "fewer reads than `sample`.")
+        }
+    }
+    return(x)
+}
+
+# This function gets abundance table as input and returns subsampled matrix.
+#' @importFrom vegan rrarefy
+.get_subsamples_matrix <- function(
+        x, assay.type, sample, replace, verbose = TRUE, ...){
+    #
+    if( !.is_a_bool(verbose) ){
+        stop("'verbose' must be TRUE or FALSE.", call. = FALSE)
+    }
+    #
+    # We use vegan::rrarefy by default. However, as it does not support
+    # replace=TRUE, we use our own implementation in this case.
+    mat <- assay(x, assay.type)
+    if( replace ){
+        # Loop throguh samples and subsample samples one-by-one
+        mat <- apply(
+            mat, 2, .subsample_assay, sample = sample, replace = replace)
+        rownames(mat) <- rownames(x)
+    } else{
+        mat <- t( vegan::rrarefy(t(mat), sample) )
+    }
+    # remove features not present in any samples after subsampling
+    feat_inc <- rowSums2(mat, na.rm = TRUE) > 0
+    mat <- mat[feat_inc, ]
+    # Give message if some features were dropped
+    if( verbose && any(!feat_inc) ){
+        message(
+            sum(!feat_inc), " features removed because they are not ",
+            "present in any of the samples after subsampling."
+        )
+    }
+    # Add info on sample to attributes
+    attr(mat, "subsample") <- sample
+    return(mat)
+}
+
 ## Modified Sub sampling function from phyloseq internals
 .subsample_assay <- function(x, sample, replace){
     # Create replacement species vector
diff --git a/man/mergeSEs.Rd b/man/mergeSEs.Rd
index 5b76bd0b6..8a98ea721 100644
--- a/man/mergeSEs.Rd
+++ b/man/mergeSEs.Rd
@@ -63,9 +63,6 @@ taxonomy information along with OTU number matches.
 
 \item{collapse_features}{Deprecated. Use \code{collapse.rows} instead.}
 
-\item{verbose}{\code{Logical scalar}. Choose whether to show messages.
-(Default: \code{TRUE})}
-
 \item{y}{a \code{\link{SummarizedExperiment}} object when \code{x} is a
 \code{\link{SummarizedExperiment}} object. Disabled when \code{x} is a list.}
 }
diff --git a/man/rarefyAssay.Rd b/man/rarefyAssay.Rd
index 5ffb2a3bc..997f193c6 100644
--- a/man/rarefyAssay.Rd
+++ b/man/rarefyAssay.Rd
@@ -10,10 +10,9 @@ rarefyAssay(
   assay.type = assay_name,
   assay_name = "counts",
   sample = min_size,
-  min_size = min(colSums2(assay(x))),
-  replace = TRUE,
+  min_size = min(colSums2(assay(x, assay.type))),
+  replace = FALSE,
   name = "subsampled",
-  verbose = TRUE,
   ...
 )
 
@@ -23,9 +22,8 @@ rarefyAssay(
   assay_name = "counts",
   sample = min_size,
   min_size = min(colSums2(assay(x, assay.type))),
-  replace = TRUE,
+  replace = FALSE,
   name = "subsampled",
-  verbose = TRUE,
   ...
 )
 }
@@ -43,18 +41,18 @@ counts found in a sample or a user specified number.}
 
 \item{min_size}{Deprecated. Use \code{sample} instead.}
 
-\item{replace}{\code{Logical scalar}. The default is with
-replacement (\code{replace=TRUE}).
-See \code{\link[phyloseq:rarefy_even_depth]{phyloseq::rarefy_even_depth}}
-for details on implications of this parameter. (Default: \code{TRUE})}
+\item{replace}{\code{Logical scalar}. Whether to åperform subsampling with
+replacement. Ths works similarly to \code{sample(..., replace = TRUE)}.
+(Default: \code{FALSE})}
 
 \item{name}{\code{Character scalar}. A name for the column of the
 \code{colData} where results will be stored. (Default: \code{"method"})}
 
-\item{verbose}{\code{Logical scalar}. Choose whether to show messages.
-(Default: \code{TRUE})}
-
-\item{...}{additional arguments not used}
+\item{...}{optional arguments:
+\itemize{
+\item \code{verbose}: \code{Logical scalar}. Choose whether to show
+messages. (Default: \code{TRUE})
+}}
 }
 \value{
 \code{rarefyAssay} return \code{x} with subsampled data.
@@ -66,18 +64,24 @@ for details on implications of this parameter. (Default: \code{TRUE})}
 subsampled assay.
 }
 \details{
-Although the subsampling approach is highly debated in microbiome research,
-we include the \code{rarefyAssay} function because there may be some
+Although the subsampling approach is highly debated in microbiome research, 
+we include the \code{rarefyAssay} function because there may be some 
 instances where it can be useful.
-Note that the output of \code{rarefyAssay} is not the equivalent as the
+Note that the output of \code{rarefyAssay} is not the equivalent as the 
 input and any result have to be verified with the original dataset.
 
 Subsampling/Rarefying may undermine downstream analyses and have unintended
 consequences. Therefore, make sure this normalization is appropriate for
 your data.
 
-To maintain the reproducibility, please define the seed using set.seed()
+To maintain the reproducibility, please define the seed using set.seed() 
 before implement this function.
+
+When \code{replace = FALSE}, the function uses internally
+\code{vegan::rarefy} while with replacement enabled the function utilizes
+own implementation, inspired by \code{phyloseq::rarefy_even_depth}.
+
+#
 }
 \examples{
 # When samples in TreeSE are less than specified sample, they will be removed.
@@ -105,3 +109,9 @@ Zaneveld JR, Vázquez-Baeza Y, Birmingham A, Hyde ER. Normalization and
 microbial differential abundance strategies depend upon data characteristics.
 Microbiome. 2017 Dec;5(1):1-8.
 }
+\seealso{
+\itemize{
+\item \code{\link[vegan:rrarefy]{vegan::rrarefy}}
+\item \code{\link[phyloseq:rarefy_even_depth]{phyloseq::rarefy_even_depth}}
+}
+}
diff --git a/tests/testthat/test-8subsample.R b/tests/testthat/test-8subsample.R
index f600b4d2a..dadabedb6 100644
--- a/tests/testthat/test-8subsample.R
+++ b/tests/testthat/test-8subsample.R
@@ -36,28 +36,34 @@ test_that("rarefyAssay", {
     expect_equal(unname(colSums2(assay(tse.subsampled, "subsampled"))), expColSums)
     
     # When replace = FALSE
+    sample <- 60000
     seed = 1938
     set.seed(seed)
     tse.subsampled.rp <- rarefyAssay(
         GlobalPatterns, 
-        sample = 60000, 
+        sample = sample, 
         name = "subsampled",
         replace = FALSE)
     
     # check number of features removed is correct
-    expnFeaturesRemovedRp <- 6731
-    obsnFeaturesRemovedRp <- nrow(GlobalPatterns) - nrow(tse.subsampled.rp)
-    expect_equal(obsnFeaturesRemovedRp, expnFeaturesRemovedRp)
+    set.seed(seed)
+    mat <- assay(GlobalPatterns, "counts")
+    # When replace = FALSE, the function utilize vegan::rrarefy
+    suppressWarnings(
+    sub_mat <- t( vegan::rrarefy(t(mat), sample = sample) )
+    )
+    # The function removes those samples whose library size is lower than
+    # 'sample'
+    sub_mat <- sub_mat[, colSums(mat) > sample]
+    # Moreover, it removes those features that are not present in any of the
+    # samples
+    sub_mat <- sub_mat[rowSums(sub_mat) > 0, ]
+    expect_equal(assay(tse.subsampled.rp, "subsampled"), sub_mat, check.attributes = FALSE)
     
     # check if all samples subsampled to even depth
-    expColSumsRp <- rep(60000, 25)
+    expColSumsRp <- rep(sample, 25)
     expect_equal(unname(colSums2(assay(tse.subsampled.rp, "subsampled"))), expColSumsRp)
     
     # check if same Features removed
-    obsFeaturesRemovedRp <- rownames(GlobalPatterns)[!rownames(GlobalPatterns) %in% rownames(tse.subsampled.rp)]
-    
-    expFeaturesRemovedRP <- c("522457","951","586076","244960","215972",
-                              "31759","30678","138353","406058","1126")
-    
-    expect_equal(obsFeaturesRemovedRp[1:10], expFeaturesRemovedRP)
+    expect_true( all(rownames(tse.subsampled.rp) == rownames(sub_mat)) )
 })

From 1d23fca0465eec8f2f66cfa5b05be6d7015aba8e Mon Sep 17 00:00:00 2001
From: TuomasBorman <tvborm@utu.fi>
Date: Wed, 8 Jan 2025 15:06:09 +0200
Subject: [PATCH 2/6] up

---
 R/rarefyAssay.R    |  1 -
 man/rarefyAssay.Rd | 10 ++++------
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/R/rarefyAssay.R b/R/rarefyAssay.R
index c36fbcda1..8a9e34d6e 100644
--- a/R/rarefyAssay.R
+++ b/R/rarefyAssay.R
@@ -23,7 +23,6 @@
 #' \code{vegan::rarefy} while with replacement enabled the function utilizes
 #' own implementation, inspired by \code{phyloseq::rarefy_even_depth}.
 #' 
-#' #
 #' @inheritParams transformAssay
 #' @inheritParams getDominant
 #' 
diff --git a/man/rarefyAssay.Rd b/man/rarefyAssay.Rd
index 997f193c6..ce7ef94b8 100644
--- a/man/rarefyAssay.Rd
+++ b/man/rarefyAssay.Rd
@@ -64,24 +64,22 @@ messages. (Default: \code{TRUE})
 subsampled assay.
 }
 \details{
-Although the subsampling approach is highly debated in microbiome research, 
-we include the \code{rarefyAssay} function because there may be some 
+Although the subsampling approach is highly debated in microbiome research,
+we include the \code{rarefyAssay} function because there may be some
 instances where it can be useful.
-Note that the output of \code{rarefyAssay} is not the equivalent as the 
+Note that the output of \code{rarefyAssay} is not the equivalent as the
 input and any result have to be verified with the original dataset.
 
 Subsampling/Rarefying may undermine downstream analyses and have unintended
 consequences. Therefore, make sure this normalization is appropriate for
 your data.
 
-To maintain the reproducibility, please define the seed using set.seed() 
+To maintain the reproducibility, please define the seed using set.seed()
 before implement this function.
 
 When \code{replace = FALSE}, the function uses internally
 \code{vegan::rarefy} while with replacement enabled the function utilizes
 own implementation, inspired by \code{phyloseq::rarefy_even_depth}.
-
-#
 }
 \examples{
 # When samples in TreeSE are less than specified sample, they will be removed.

From 5006e83115bf96f6aee751c55294ae1f4bbb5b80 Mon Sep 17 00:00:00 2001
From: TuomasBorman <tvborm@utu.fi>
Date: Fri, 17 Jan 2025 22:59:11 +0200
Subject: [PATCH 3/6] up

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 3aaff563a..f054f76b2 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: mia
 Type: Package
-Version: 1.15.12
+Version: 1.15.13
 Authors@R:
     c(person(given = "Tuomas", family = "Borman", role = c("aut", "cre"),
              email = "tuomas.v.borman@utu.fi",

From b92c3d03ecb1758668585a21b2c063feceb012dc Mon Sep 17 00:00:00 2001
From: TuomasBorman <tvborm@utu.fi>
Date: Sat, 18 Jan 2025 09:36:22 +0200
Subject: [PATCH 4/6] up

---
 man/getCrossAssociation.Rd | 43 +++++++++++++++++++++++++++-----------
 man/rarefyAssay.Rd         | 10 +++++----
 2 files changed, 37 insertions(+), 16 deletions(-)

diff --git a/man/getCrossAssociation.Rd b/man/getCrossAssociation.Rd
index 4578240ac..5e47ecf80 100644
--- a/man/getCrossAssociation.Rd
+++ b/man/getCrossAssociation.Rd
@@ -66,6 +66,16 @@ corresponding variable-pair. This decreases the number of calculations in
 and give numeric values as an output. Adjust \code{method} and other
 parameters correspondingly. Supported functions are, for example,
 \code{stats::dist} and \code{vegan::vegdist}.
+
+\item \code{dimred1} \code{Character scalar} or \code{numeric scalar}.
+Specifies reduced dimensionality from the \code{reducedDim} of experiment
+\enumerate{
+\item (Default: \code{NULL})
+}
+
+\item \code{dimred2} \code{Character scalar} or \code{numeric scalar}.
+Specifies reduced dimensionality from the \code{reducedDim} of experiment
+2. (Default: \code{NULL})
 }}
 
 \item{experiment1}{\code{Character scalar} or \code{numeric scalar}.
@@ -91,12 +101,14 @@ assay in experiment 2 to be transformed.. (Default: \code{"counts"})}
 \item{assay_name2}{Deprecated. Use \code{assay.type2} instead.}
 
 \item{altexp1}{\code{Character scalar} or \code{numeric scalar}. Specifies
-alternative experiment from the altExp of experiment 1. If NULL, then the
-experiment is itself and altExp option is disabled. (Default: \code{NULL})}
+alternative experiment from the \code{altExp} of experiment 1. If NULL, then
+the experiment is itself and altExp option is disabled.
+(Default: \code{NULL})}
 
 \item{altexp2}{\code{Character scalar} or \code{numeric scalar}. Specifies
-alternative experiment from the altExp of experiment 2. If NULL, then the
-experiment is itself and altExp option is disabled. (Default: \code{NULL})}
+alternative experiment from the \code{altExp} of experiment 2. If NULL, then
+the experiment is itself and altExp option is disabled.
+(Default: \code{NULL})}
 
 \item{col.var1}{\code{Character scalar}. Specifies column(s) from
 \code{colData} of experiment 1. If col.var1 is used, assay.type1 is disabled.
@@ -224,7 +236,7 @@ result <- getCrossAssociation(
 # Show first 5 entries
 head(result, 5)
 
-# If test.signif = TRUE, then getCrossAssociation additionally returns 
+# If test.signif = TRUE, then getCrossAssociation additionally returns
 # significances
 # filter.self.cor = TRUE filters self correlations
 # p.adj.threshold can be used to filter those features that do not
@@ -234,7 +246,7 @@ result <- getCrossAssociation(
     filter.self.cor = TRUE, p.adj.threshold = 0.05, test.signif = TRUE)
 # Show first 5 entries
 head(result, 5)
-                                        
+
 # Returned value is a list of matrices
 names(result)
 
@@ -243,19 +255,19 @@ names(result)
 result <- getCrossAssociation(
     mae[[1]], mae[[1]], by = 2, paired = FALSE,
     association.fun = getDissimilarity, method = "bray")
-                                        
+
 # If experiments are equal and measure is symmetric
 # (e.g., taxa1 vs taxa2 == taxa2 vs taxa1),
 # it is possible to speed-up calculations by calculating association only
 # for unique variable-pairs. Use "symmetric" to choose whether to measure
 # association for only other half of of variable-pairs.
 result <- getCrossAssociation(
-    mae, experiment1 = "microbiota", experiment2 = "microbiota", 
+    mae, experiment1 = "microbiota", experiment2 = "microbiota",
     assay.type1 = "counts", assay.type2 = "counts", symmetric = TRUE)
 
 # For big data sets, the calculations might take a long time.
-# To speed them up, you can take a random sample from the data. 
-# When dealing with complex biological problems, random samples can be 
+# To speed them up, you can take a random sample from the data.
+# When dealing with complex biological problems, random samples can be
 # enough to describe the data. Here, our random sample is 30 \% of whole data.
 sample_size <- 0.3
 tse <- mae[[1]]
@@ -270,7 +282,7 @@ mae[[1]] <- addAlpha(mae[[1]])
 # an assay named assay.type from assay slot, it fetches a column named
 # col.var from colData.
 result <- getCrossAssociation(
-    mae[[1]], assay.type1 = "counts", 
+    mae[[1]], assay.type1 = "counts",
     col.var2 = c("shannon_diversity", "coverage_diversity"),
     test.signif = TRUE)
 
@@ -288,5 +300,12 @@ res <- getCrossAssociation(
     assay.type2 = "nmr"
 )
 
-                                        
+# To calculate correlation of features to principal coordinates, you have to
+# first calculate PCoA...
+tse <- runMDS(
+    tse, assay.type = "rclr", FUN = getDissimilarity, method = "euclidean")
+# ...then calculate the correlation.
+res <- getCrossAssociation(tse, assay.type1 = "rclr", dimred2 = "MDS")
+head(res)
+
 }
diff --git a/man/rarefyAssay.Rd b/man/rarefyAssay.Rd
index 6eeb8f958..5a5245ffc 100644
--- a/man/rarefyAssay.Rd
+++ b/man/rarefyAssay.Rd
@@ -10,9 +10,10 @@ rarefyAssay(
   assay.type = assay_name,
   assay_name = "counts",
   sample = min_size,
-  min_size = min(colSums2(assay(x, assay.type))),
-  replace = FALSE,
+  min_size = min(colSums2(assay(x))),
+  replace = TRUE,
   name = "subsampled",
+  verbose = TRUE,
   ...
 )
 
@@ -21,9 +22,10 @@ rarefyAssay(
   assay.type = assay_name,
   assay_name = "counts",
   sample = min_size,
-  min_size = min(colSums2(assay(x, assay.type))),
-  replace = FALSE,
+  min_size = min(colSums2(assay(x))),
+  replace = TRUE,
   name = "subsampled",
+  verbose = TRUE,
   ...
 )
 }

From 65b6d19909ad15fd9da76b0396e6fcc7801fc4c6 Mon Sep 17 00:00:00 2001
From: TuomasBorman <tvborm@utu.fi>
Date: Sat, 18 Jan 2025 10:06:19 +0200
Subject: [PATCH 5/6] up

---
 R/AllGenerics.R            |   6 +-
 R/getCrossAssociation.R    |   1 +
 R/mergeSEs.R               | 159 +++++++++++++++++++------------------
 man/getCrossAssociation.Rd |   1 +
 man/mergeSEs.Rd            |   5 +-
 man/rarefyAssay.Rd         |  29 +++----
 6 files changed, 97 insertions(+), 104 deletions(-)

diff --git a/R/AllGenerics.R b/R/AllGenerics.R
index 20668220f..a2c9ead77 100644
--- a/R/AllGenerics.R
+++ b/R/AllGenerics.R
@@ -200,11 +200,7 @@ setGeneric("mergeSEs", signature = c("x"), function(x, ... )
 
 #' @rdname rarefyAssay
 #' @export
-setGeneric("rarefyAssay", signature = c("x"),
-    function(
-        x, assay.type = assay_name, assay_name = "counts",
-        sample = min_size, min_size = min(colSums2(assay(x))),
-        replace = TRUE, name = "subsampled", verbose = TRUE, ...)
+setGeneric("rarefyAssay", signature = c("x"), function(x, ...)
     standardGeneric("rarefyAssay"))
 
 #' @rdname runCCA
diff --git a/R/getCrossAssociation.R b/R/getCrossAssociation.R
index 8f969e691..edf3630bc 100644
--- a/R/getCrossAssociation.R
+++ b/R/getCrossAssociation.R
@@ -253,6 +253,7 @@
 #'
 #' # To calculate correlation of features to principal coordinates, you have to
 #' # first calculate PCoA...
+#' library(scater)
 #' tse <- runMDS(
 #'     tse, assay.type = "rclr", FUN = getDissimilarity, method = "euclidean")
 #' # ...then calculate the correlation.
diff --git a/R/mergeSEs.R b/R/mergeSEs.R
index 7c1b6eca5..a31a32ad9 100644
--- a/R/mergeSEs.R
+++ b/R/mergeSEs.R
@@ -1,26 +1,26 @@
 #' Merge SE objects into single SE object.
-#' 
+#'
 #' @inheritParams rarefyAssay
 #' @inheritParams getDominant
-#' 
+#'
 #' @param y a \code{\link{SummarizedExperiment}} object when \code{x} is a
 #' \code{\link{SummarizedExperiment}} object. Disabled when \code{x} is a list.
-#' 
+#'
 #' @param join \code{Character scalar}. A value for selecting the joining
 #' method. Must be 'full', 'inner', 'left', or 'right'. 'left' and 'right' are
 #' disabled when more than two objects are being merged.
 #' (Default: \code{"full"})
-#' 
+#'
 #' @param missing.values \code{NA}, \code{0} or \code{Character scalar}.
 #' Specifies the notation of missing values. (By default: \code{NA})
-#' 
+#'
 #' @param missing_values Deprecated. Use \code{missing.values} instead.
-#' 
+#'
 #' @param collapse.cols \code{Logical scalar}. Determines whether to collapse
 #' identically named samples to one. (Default: \code{FALSE})
-#' 
+#'
 #' @param collapse_samples Deprecated. Use \code{collapse.cols} instead.
-#' 
+#'
 #' @param collapse.rows \code{Logical scalar}. Selects whether to collapse
 #' identically named features to one. Since all taxonomy information is
 #' taken into account,
@@ -30,9 +30,12 @@
 #' option, it is possible to specify whether these strains are combined if their
 #' taxonomy information along with OTU number matches.
 #' (Default: \code{TRUE})
-#' 
+#'
 #' @param collapse_features Deprecated. Use \code{collapse.rows} instead.
 #'
+#' @param verbose \code{Logical scalar}. Choose whether to show
+#' messages. (Default: \code{TRUE})
+#'
 #' @param ... optional arguments (not used).
 #'
 #' @return A single \code{SummarizedExperiment} object.
@@ -45,14 +48,14 @@
 #' \code{rownames} and
 #' \code{colnames}. \code{rowTree} and \code{colTree} are preserved if linkage
 #' between rows/cols and the tree is found.
-#' 
+#'
 #' Equally named rows are interpreted as equal. Further
-#' matching based on \code{rowData} is not done. For samples, collapsing 
-#' is disabled by default meaning that equally named samples that are stored 
-#' in different objects are interpreted as unique. Collapsing can be enabled 
+#' matching based on \code{rowData} is not done. For samples, collapsing
+#' is disabled by default meaning that equally named samples that are stored
+#' in different objects are interpreted as unique. Collapsing can be enabled
 #' with \code{collapse.cols = TRUE} when equally named samples describe the same
-#' sample. 
-#' 
+#' sample.
+#'
 #' If, for example, all rows are not shared with
 #' individual objects, there are missing values in \code{assays}.
 #' The notation of missing
@@ -60,31 +63,31 @@
 #' of
 #' \code{TreeSummarizedExperiment} objects, also \code{rowTree}, \code{colTree},
 #' and
-#' \code{referenceSeq} are preserved if possible. The data is preserved if 
+#' \code{referenceSeq} are preserved if possible. The data is preserved if
 #' all the rows or columns can be found from it.
-#' 
-#' Compared to \code{cbind} and \code{rbind} \code{mergeSEs} 
-#' allows more freely merging since \code{cbind} and \code{rbind} expect 
+#'
+#' Compared to \code{cbind} and \code{rbind} \code{mergeSEs}
+#' allows more freely merging since \code{cbind} and \code{rbind} expect
 #' that rows and columns are matching, respectively.
-#' 
+#'
 #' You can choose joining methods from \code{'full'}, \code{'inner'},
-#'  \code{'left'}, and  \code{'right'}. In all the methods, all the samples are 
+#'  \code{'left'}, and  \code{'right'}. In all the methods, all the samples are
 #'  included in the result object. However, with different methods, it is
-#'  possible 
+#'  possible
 #'  to choose which rows are included.
-#' 
+#'
 #' \itemize{
 #'   \item{\code{full} -- all unique features}
 #'   \item{\code{inner} -- all shared features}
 #'   \item{\code{left} -- all the features of the first object}
 #'   \item{\code{right} -- all the features of the second object}
 #' }
-#' 
+#'
 #' The output depends on the input. If the input contains
 #' \code{SummarizedExperiment}
 #' object, then the output will be \code{SummarizedExperiment}. When all the
 #' input
-#' objects belong to \code{TreeSummarizedExperiment}, the output will be 
+#' objects belong to \code{TreeSummarizedExperiment}, the output will be
 #' \code{TreeSummarizedExperiment}.
 #'
 #' @seealso
@@ -99,42 +102,42 @@
 #'
 #' @name mergeSEs
 #' @export
-#' 
+#'
 #' @examples
 #' data(GlobalPatterns)
 #' data(esophagus)
 #' data(enterotype)
-#' 
+#'
 #' # Take only subsets so that it wont take so long
 #' tse1 <- GlobalPatterns[1:100, ]
 #' tse2 <- esophagus
 #' tse3 <- enterotype[1:100, ]
-#' 
+#'
 #' # Merge two TreeSEs
 #' tse <- mergeSEs(tse1, tse2)
-#' 
+#'
 #' # Merge a list of TreeSEs
 #' list <- SimpleList(tse1, tse2, tse3)
 #' tse <- mergeSEs(list, assay.type = "counts", missing.values = 0)
 #' tse
-#' 
+#'
 #' # With 'join', it is possible to specify the merging method. Subsets are used
 #' # here just to show the functionality
 #' tse_temp <- mergeSEs(tse[1:10, 1:10], tse[5:100, 11:20], join = "left")
 #' tse_temp
-#' 
+#'
 #' # If your objects contain samples that describe one and same sample,
 #' # you can collapse equally named samples to one by specifying 'collapse.cols'
-#' tse_temp <- mergeSEs(list(tse[1:10, 1], tse[1:20, 1], tse[1:5, 1]), 
+#' tse_temp <- mergeSEs(list(tse[1:10, 1], tse[1:20, 1], tse[1:5, 1]),
 #'                        collapse.cols = TRUE,
 #'                        join = "inner")
 #' tse_temp
-#' 
+#'
 #' # Merge all available assays
 #' tse <- transformAssay(tse, method="relabundance")
 #' ts1 <- transformAssay(tse1, method="relabundance")
 #' tse_temp <- mergeSEs(tse, tse1, assay.type = assayNames(tse))
-#' 
+#'
 NULL
 
 ###################### Function for SimpleList of TreeSEs ######################
@@ -143,12 +146,12 @@ NULL
 #' @export
 setMethod("mergeSEs", signature = c(x = "SimpleList"),
         function(x, assay.type="counts", assay_name = NULL, join = "full",
-                missing.values = missing_values, missing_values = NA, 
-                collapse.cols = collapse_samples, collapse_samples = FALSE, 
+                missing.values = missing_values, missing_values = NA,
+                collapse.cols = collapse_samples, collapse_samples = FALSE,
                 collapse.rows = collapse_features, collapse_features = TRUE,
                 verbose = TRUE, ... ){
             ################## Input check ##################
-            # Check the objects 
+            # Check the objects
             class <- .check_objects_and_give_class(x)
             if (!is.null(assay_name) & is.null(assay.type)) {
                 .Deprecated(new="assay.type", old="assay_name", msg=paste0(
@@ -227,7 +230,7 @@ setMethod("mergeSEs", signature = c(x = "SummarizedExperiment"),
             if( !(is(y, "SummarizedExperiment")) ){
                 stop("'y' must be a 'SummarizedExperiment' object.",
                     call. = FALSE)
-            } 
+            }
             ################ Input check end ################
             # Create a list based on TreeSEs
             list <- SimpleList(x, y)
@@ -300,7 +303,7 @@ setMethod("mergeSEs", signature = c(x = "list"),
             if( verbose ){
                 message("\r", i+1, "/", length(x)+1, appendLF = FALSE)
             }
-            
+
             # Get the ith object
             temp <- x[[i]]
             # Add rownames to rowData so that full matches are found
@@ -326,7 +329,7 @@ setMethod("mergeSEs", signature = c(x = "list"),
             if( class == "TreeSummarizedExperiment" ){
                 tse_args <- .get_TreeSE_args(temp, tse_args)
             }
-            
+
             # Create an object
             tse <- do.call(FUN_constructor, args = args)
         }
@@ -411,7 +414,7 @@ setMethod("mergeSEs", signature = c(x = "list"),
     if( verbose ){
         message("Adding referenceSeqs...")
     }
-    
+
     # Get the rownames that are included in reference sequences
     rows_that_have_seqs <- lapply(refSeqs, FUN = function(x){
         names(x[[1]])
@@ -427,7 +430,7 @@ setMethod("mergeSEs", signature = c(x = "list"),
     # Get the maximum number of DNA sets that individual TreeSE had / max
     # number of sets that individual rownames set had.
     max_numrow <- max(lengths(refSeqs))
-    
+
     # Initialize a list
     result_list <- list()
     # Loop from 1 to max number of DNA sets
@@ -487,7 +490,7 @@ setMethod("mergeSEs", signature = c(x = "list"),
     # All rownames/colnames should be included in trees/links
     if( !all(names %in% links[["names"]]) ||
         is.null(names) || length(names) == 0 ){
-        warning(MARGIN, "Tree(s) does not match with the data so it ", 
+        warning(MARGIN, "Tree(s) does not match with the data so it ",
                 "is discarded.", call. = FALSE)
         return(tse)
     }
@@ -568,12 +571,12 @@ setMethod("mergeSEs", signature = c(x = "list"),
     if( !is.null(tse@rowTree) ){
         # Get trees that will be added
         trees_add <- tse@rowTree
-        # Get rowLinks, convert them to basic DataFrame, 
+        # Get rowLinks, convert them to basic DataFrame,
         # so that additional column can be added
         links <- DataFrame(rowLinks(tse))
         # Add rownames as one of the columns
         links$names <- rownames(tse)
-        
+
         # If there is no data yet / if rowTree arguments are NULL
         if( is.null(tse_args$rowTrees) ){
             # Get the tree data as a list. Tree is as a list, and links as DF
@@ -588,7 +591,7 @@ setMethod("mergeSEs", signature = c(x = "list"),
             # How many trees there already are
             tree_num_before <- length(tse_args$rowTrees$tree)
             # Get unique names
-            unique_names <- make.unique( 
+            unique_names <- make.unique(
                 names( c(tse_args$rowTrees$tree, trees_add) )
             )
             # Update the names of current data
@@ -598,12 +601,12 @@ setMethod("mergeSEs", signature = c(x = "list"),
             # Get corresponding current names
             names_add <- names(trees_add)
             # Update tree names from links
-            links[ , "whichTree" ] <- 
+            links[ , "whichTree" ] <-
                 unique_names_add[ match( links[ , "whichTree" ], names_add ) ]
             # Update tree names
             names(trees_add) <- unique_names_add
             # Add data to a list
-            tse_args$rowTrees <- list( 
+            tse_args$rowTrees <- list(
                 trees = c(tse_args$rowTrees$trees, trees_add),
                 links = rbind(tse_args$rowTrees$links, links)
                 )
@@ -613,12 +616,12 @@ setMethod("mergeSEs", signature = c(x = "list"),
     if( !is.null(tse@colTree) ){
         # Get trees that will be added
         trees_add <- tse@rowTree
-        # Get colLinks, convert them to basic DataFrame, 
+        # Get colLinks, convert them to basic DataFrame,
         # so that additional column can be added
         links <- DataFrame(colLinks(tse))
         # Add colnames as one of the columns
         links$names <- colnames(tse)
-        
+
         # If there is no data yet / if colTree arguments are NULL
         if( is.null(tse_args$colTrees) ){
             # Get the tree data as a list. Tree is as a list, and links as DF
@@ -633,7 +636,7 @@ setMethod("mergeSEs", signature = c(x = "list"),
             # How many trees there already are
             tree_num_before <- length(tse_args$colTrees$tree)
             # Get unique names
-            unique_names <- make.unique( 
+            unique_names <- make.unique(
                 names( c(tse_args$colTrees$tree, trees_add) )
             )
             # Update the names of current data
@@ -643,12 +646,12 @@ setMethod("mergeSEs", signature = c(x = "list"),
             # Get corresponding current names
             names_add <- names(trees_add)
             # Update tree names from links
-            links[ , "whichTree" ] <- 
+            links[ , "whichTree" ] <-
                 unique_names_add[ match( links[ , "whichTree" ], names_add ) ]
             # Update tree names
             names(trees_add) <- unique_names_add
             # Add data to a list
-            tse_args$rowTrees <- list( 
+            tse_args$rowTrees <- list(
                 trees = c(tse_args$colTrees$trees, trees_add),
                 links = rbind(tse_args$colTrees$links, links)
             )
@@ -673,14 +676,14 @@ setMethod("mergeSEs", signature = c(x = "list"),
             tse_args$refSeqs <- refSeqs
         } else{
             # otherwise add data to a list
-            tse_args$refSeqs <- c( tse_args$refSeqs, refSeqs ) 
+            tse_args$refSeqs <- c( tse_args$refSeqs, refSeqs )
         }
     }
     return(tse_args)
 }
 
 ######################## .get_SummarizedExperiment_data ########################
-# This function gets the desired data from one SE object and creates a list of 
+# This function gets the desired data from one SE object and creates a list of
 # arguments containing the data
 # Arguments of SCE and TreeSE are also fetched with this function.
 # TreeSE-specific slots are collected with different function so that they are
@@ -701,7 +704,7 @@ setMethod("mergeSEs", signature = c(x = "list"),
                 metadata = metadata
     )
     return(args)
-    
+
 }
 
 ######################## .check_objects_and_give_class #########################
@@ -713,7 +716,7 @@ setMethod("mergeSEs", signature = c(x = "list"),
     # Allowed classes
     allowed_classes <- c("TreeSummarizedExperiment", "SingleCellExperiment",
         "SummarizedExperiment")
-    
+
     # Get the class based on hierarchy TreeSE --> SCE --> SE
     # and check that objects are in correct format
     classes <- lapply(x, .check_object_for_merge)
@@ -726,7 +729,7 @@ setMethod("mergeSEs", signature = c(x = "list"),
     } else {
         class <- allowed_classes[3]
     }
-    
+
     # If there are multiple classes, give a warning
     if( length(unique( classes )) > 1 ){
         warning("The Input consist of multiple classes. ",
@@ -766,7 +769,7 @@ setMethod("mergeSEs", signature = c(x = "list"),
     # Get class
     class <- class(x)
     return(class)
-    
+
 }
 ########################### .assays_cannot_be_found ############################
 # This function checks that the assay(s) can be found from TreeSE objects of
@@ -816,7 +819,7 @@ setMethod("mergeSEs", signature = c(x = "list"),
         {
             .check_assay_present(assay.type, tse)
             return(TRUE)
-            
+
         },
         error = function(cond) {
             return(FALSE)
@@ -882,7 +885,7 @@ setMethod("mergeSEs", signature = c(x = "list"),
     names(assays) <- assay.type
     # Combine metadata
     metadata <- c( metadata(tse1), metadata(tse2) )
-    
+
     # Create a list of data
     args <- list(assays = assays,
                 rowData = rowdata,
@@ -902,16 +905,16 @@ setMethod("mergeSEs", signature = c(x = "list"),
     # Take assays
     assay1 <- assay(tse1, assay.type)
     assay2 <- assay(tse2, assay.type)
-    
+
     # Merge two assays into one
     assay <- .join_two_tables(assay1, assay2, join)
-    
+
     # Convert into matrix
     assay <- as.matrix(assay)
-    
+
     # Fill missing values
     assay[ is.na(assay) ] <- missing.values
-    
+
     # Order the assay based on rowData and colData
     assay <- assay[ match(rownames(rd), rownames(assay)), , drop = FALSE ]
     assay <- assay[ , match(rownames(cd), colnames(assay)), drop = FALSE]
@@ -927,7 +930,7 @@ setMethod("mergeSEs", signature = c(x = "list"),
     # Take rowDatas
     rd1 <- rowData(tse1)
     rd2 <- rowData(tse2)
-    
+
     # Convert column names to lower
     if( length(colnames(rd1)) > 0 ){
         colnames(rd1) <- tolower(colnames(rd1))
@@ -935,10 +938,10 @@ setMethod("mergeSEs", signature = c(x = "list"),
     if( length(colnames(rd2)) > 0 ){
         colnames(rd2) <- tolower(colnames(rd2))
     }
-    
+
     # Merge rowdata
     rd <- .join_two_tables(rd1, rd2, join)
-    
+
     # There might be duplicated rownames. This might occur when there are
     # features with equal taxonomy data but merged datasets have some
     # additional info that do not match with each other. --> collapse
@@ -991,9 +994,9 @@ setMethod("mergeSEs", signature = c(x = "list"),
             rownames(rd) <- rd[["rownames_merge_ID"]]
             rd[["rownames_merge_ID"]] <- NULL
         }
-        
+
     }
-    
+
     # Get column indices that match with taxonomy ranks
     ranks_ind <- match( TAXONOMY_RANKS, colnames(rd) )
     # Remove NAs
@@ -1012,7 +1015,7 @@ setMethod("mergeSEs", signature = c(x = "list"),
             substr(rank_names, 2, nchar(rank_names)), sep = "")
         # Add new names to colnames of rd_rank
         colnames(rd_rank) <- new_rank_names
-        
+
         # Combine columns
         rd <- cbind(rd_rank, rd_other)
     }
@@ -1028,12 +1031,12 @@ setMethod("mergeSEs", signature = c(x = "list"),
     # Take colDatas
     cd1 <- colData(tse1)
     cd2 <- colData(tse2)
-    
+
     # Merge coldata
     cd <- .join_two_tables(cd1, cd2, join = "full")
     # Convert into DataFrame
     cd <- DataFrame(cd)
-    
+
     return(cd)
 }
 
@@ -1058,11 +1061,11 @@ setMethod("mergeSEs", signature = c(x = "list"),
                     left = FALSE,
                     right = TRUE
     )
-    
+
     # Ensure that the data is always data.frame
     df1 <- as.data.frame(df1)
     df2 <- as.data.frame(df2)
-    
+
     # STEP 1: Check whether variables can be merged; if their classes are equal.
     # Adjust colnames if their classes differ
     if( ncol(df1) > 0 && ncol(df2) > 0 ){
@@ -1091,7 +1094,7 @@ setMethod("mergeSEs", signature = c(x = "list"),
         # Take into account if other df has only NA values in column --> class
         # of that variable can be wrong --> use known class
         classes$no_match <- FALSE
-        classes[classes$found_both, "no_match"] <- 
+        classes[classes$found_both, "no_match"] <-
             classes[classes$found_both, "class.x"] != classes[
                 classes$found_both, "class.y"] & classes$not_na.x[
                     classes$found_both] & classes$not_na.y[classes$found_both]
@@ -1131,11 +1134,11 @@ setMethod("mergeSEs", signature = c(x = "list"),
     # Add rownames to one of the columns
     df1$rownames_merge_ID <- rownames(df1)
     df2$rownames_merge_ID <- rownames(df2)
-    
+
     # STEP 2: merge
     # Finally merge data frames into one data frame
     df <- merge(df1, df2, all.x = all.x, all.y = all.y)
-    
+
     # STEP 3: polish
     # Get column names because they can be changed when class is changed
     colnames <- colnames(df)
diff --git a/man/getCrossAssociation.Rd b/man/getCrossAssociation.Rd
index 5e47ecf80..886cbb6b2 100644
--- a/man/getCrossAssociation.Rd
+++ b/man/getCrossAssociation.Rd
@@ -302,6 +302,7 @@ res <- getCrossAssociation(
 
 # To calculate correlation of features to principal coordinates, you have to
 # first calculate PCoA...
+library(scater)
 tse <- runMDS(
     tse, assay.type = "rclr", FUN = getDissimilarity, method = "euclidean")
 # ...then calculate the correlation.
diff --git a/man/mergeSEs.Rd b/man/mergeSEs.Rd
index f19931aa7..12811a042 100644
--- a/man/mergeSEs.Rd
+++ b/man/mergeSEs.Rd
@@ -65,6 +65,9 @@ taxonomy information along with OTU number matches.
 
 \item{collapse_features}{Deprecated. Use \code{collapse.rows} instead.}
 
+\item{verbose}{\code{Logical scalar}. Choose whether to show
+messages. (Default: \code{TRUE})}
+
 \item{y}{a \code{\link{SummarizedExperiment}} object when \code{x} is a
 \code{\link{SummarizedExperiment}} object. Disabled when \code{x} is a list.}
 }
@@ -149,7 +152,7 @@ tse_temp
 
 # If your objects contain samples that describe one and same sample,
 # you can collapse equally named samples to one by specifying 'collapse.cols'
-tse_temp <- mergeSEs(list(tse[1:10, 1], tse[1:20, 1], tse[1:5, 1]), 
+tse_temp <- mergeSEs(list(tse[1:10, 1], tse[1:20, 1], tse[1:5, 1]),
                        collapse.cols = TRUE,
                        join = "inner")
 tse_temp
diff --git a/man/rarefyAssay.Rd b/man/rarefyAssay.Rd
index 5a5245ffc..242e5bb32 100644
--- a/man/rarefyAssay.Rd
+++ b/man/rarefyAssay.Rd
@@ -5,33 +5,28 @@
 \alias{rarefyAssay,SummarizedExperiment-method}
 \title{Subsample Counts}
 \usage{
-rarefyAssay(
-  x,
-  assay.type = assay_name,
-  assay_name = "counts",
-  sample = min_size,
-  min_size = min(colSums2(assay(x))),
-  replace = TRUE,
-  name = "subsampled",
-  verbose = TRUE,
-  ...
-)
+rarefyAssay(x, ...)
 
 \S4method{rarefyAssay}{SummarizedExperiment}(
   x,
   assay.type = assay_name,
   assay_name = "counts",
   sample = min_size,
-  min_size = min(colSums2(assay(x))),
-  replace = TRUE,
+  min_size = min(colSums2(assay(x, assay.type))),
+  replace = FALSE,
   name = "subsampled",
-  verbose = TRUE,
   ...
 )
 }
 \arguments{
 \item{x}{\code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}.}
 
+\item{...}{optional arguments:
+\itemize{
+\item \code{verbose}: \code{Logical scalar}. Choose whether to show
+messages. (Default: \code{TRUE})
+}}
+
 \item{assay.type}{\code{Character scalar}. Specifies which assay to use for
 calculation. (Default: \code{"counts"})}
 
@@ -49,12 +44,6 @@ replacement. Ths works similarly to \code{sample(..., replace = TRUE)}.
 
 \item{name}{\code{Character scalar}. The name for the transformed assay to
 be stored. (Default: \code{method})}
-
-\item{...}{optional arguments:
-\itemize{
-\item \code{verbose}: \code{Logical scalar}. Choose whether to show
-messages. (Default: \code{TRUE})
-}}
 }
 \value{
 \code{rarefyAssay} return \code{x} with subsampled data.

From d01f785b087d171ce0381ce62e31c0fea94399bb Mon Sep 17 00:00:00 2001
From: TuomasBorman <tvborm@utu.fi>
Date: Sat, 18 Jan 2025 10:50:02 +0200
Subject: [PATCH 6/6] up

---
 tests/testthat/test-5Unifrac.R | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/tests/testthat/test-5Unifrac.R b/tests/testthat/test-5Unifrac.R
index 212dd834e..0c1036d2e 100644
--- a/tests/testthat/test-5Unifrac.R
+++ b/tests/testthat/test-5Unifrac.R
@@ -3,36 +3,36 @@ test_that("Unifrac beta diversity", {
     data(esophagus, package="mia")
     tse <- esophagus
     tse <- transformAssay(tse, assay.type="counts", method="relabundance")
-    
+
     expect_error(
-        getDissimilarity(tse, method = "unifrac",assay.type = "test", 
+        getDissimilarity(tse, method = "unifrac",assay.type = "test",
                           tree.name = "phylo", weighted = FALSE)
     )
     expect_error(
-        getDissimilarity(tse, method = "unifrac", assay.type = 2, 
+        getDissimilarity(tse, method = "unifrac", assay.type = 2,
                           tree.name = "phylo", weighted = FALSE)
     )
     expect_error(
-        getDissimilarity(tse, method = "unifrac", assay.type = TRUE, 
+        getDissimilarity(tse, method = "unifrac", assay.type = TRUE,
                           tree.name = "phylo", weighted = FALSE)
     )
     expect_error(
-      getDissimilarity(tse, method = "unifrac", assay.type = "counts", 
+      getDissimilarity(tse, method = "unifrac", assay.type = "counts",
                        tree.name = "test", weighted = FALSE)
     )
     expect_error(
-      getDissimilarity(tse, method = "unifrac", assay.type = "counts", 
+      getDissimilarity(tse, method = "unifrac", assay.type = "counts",
                        tree.name = 1, weighted = FALSE)
     )
     expect_error(
-      getDissimilarity(tse, method = "unifrac", assay.type = "counts", 
+      getDissimilarity(tse, method = "unifrac", assay.type = "counts",
                        tree.name = TRUE, weighted = FALSE)
     )
     expect_error(
-      getDissimilarity(tse, method = "unifrac", assay.type = "counts", 
+      getDissimilarity(tse, method = "unifrac", assay.type = "counts",
                        tree.name = "phylo", weighted = 1)
     )
-    
+
     data(GlobalPatterns, package="mia")
     tse <- GlobalPatterns
     # Calculate unweighted unifrac
@@ -44,12 +44,12 @@ test_that("Unifrac beta diversity", {
     # Calculate weighted unifrac. Allow tolerance since weighted unifrac
     # calculation in rbiom has some stochasticity. That is most likely due
     # multithreading and complex structure of tree (loops).
-    unifrac_mia <- as.matrix(getDissimilarity(tse, method = "unifrac", 
+    unifrac_mia <- as.matrix(getDissimilarity(tse, method = "unifrac",
                                               weighted = TRUE))
     unifrac_rbiom <- as.matrix(rbiom::unifrac(assay(tse), weighted = TRUE,
                                               rowTree(tse)))
-    expect_equal(unifrac_mia, unifrac_rbiom, tolerance = 1e-3)
-    
+    expect_equal(unifrac_mia, unifrac_rbiom, tolerance = 5e-3)
+
     # Test that the function works correctly when there are multiple trees.
     # The function should subset the data based on tree.
     tse <- GlobalPatterns
@@ -78,9 +78,9 @@ test_that("Unifrac beta diversity", {
     unifrac_rbiom <- as.matrix(rbiom::unifrac(assay(tse_ref), weighted = TRUE,
                                               rowTree(tse_ref)))
     expect_equal(unifrac_mia, unifrac_rbiom, tolerance = 1e-3)
-    
-    # Test the function with agglomerated data. .get_unifrac renames 
-    # rownames based on tips and links to them. Then it also prunes the tree so 
+
+    # Test the function with agglomerated data. .get_unifrac renames
+    # rownames based on tips and links to them. Then it also prunes the tree so
     # that rows are in tips.
     tse <- GlobalPatterns
     tse <- agglomerateByRank(tse, rank = "Species")