v0.5.3: minor updates to make the package cran-compliant.

zellerlab · Aug 23, 2024 · f6a0708 · f6a0708
1 parent 4f540ce
commit f6a0708
Show file tree

Hide file tree

Showing 27 changed files with 330 additions and 3,345 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,34 +1,72 @@
-Package: SIMBA
+Package: simbaR
 Type: Package
-Title: Simulation of Microbiome data with Biological Accuracy
-Version: 0.5.1
+Title: Simulation of Microbiome Data with Biological Accuracy
+Version: 0.5.3
 Authors@R: c(person("Jakob", "Wirbel", role = c("aut", "cre"),
-                email = "jakob.wirbel@embl.de",
+                email = "jakob.wirbel@gmail.com",
                 comment = c(ORCID = "0000-0002-4073-3562")),
              person("Morgan", "Essex",
-                email = "morgan.essex@embl.de", role = c("aut")),
+                email = "essexmorgan@gmail.com", role = c("aut")),
              person("Sofia", "Forslund",
                 email = "Sofia.Forslund@mdc-berlin.de", role = c("aut")),
              person("Georg", "Zeller",
-                email = "zeller@embl.de", role = c("aut")))
-Maintainer: Jakob Wirbel <jakob.wirbel@embl.de>
-Description: Based on real data, the package simulates new metagenomic data 
-    by re-sampling real samples and then implants differentially abundant 
-    features. Additionally, the package can simulate confounding factors based 
-    on metadata variables in the real data. The simulations are stored in an 
-    `.h5` file, which is then the basis for downstream benchmarking, 
-    involving i) reality assessment of the  simulations, ii) testing for 
-    differential abundance, and iii) evaluation of the output from 
-    differential abundance testing methods.
+                email = "georg.zeller@gmail.com", role = c("aut")))
+Maintainer: Jakob Wirbel <jakob.wirbel@gmail.com>
+Description: Based on real microbiome data, the package simulates new metagenomic data by re-sampling real samples and then implants differentially abundant features. Additionally, the package can simulate confounding factors based on metadata variables in the real data. The simulations are stored in an `.h5` file, which is then the basis for downstream benchmarking, involving i) reality assessment of the simulations, ii) testing for differential abundance, and iii) evaluation of the output from differential abundance testing methods. See Wirbel, Essex, et al. for more information on benchmarking differential abundance testing methods <doi:10.1101/2022.05.09.491139>.
 Depends:
     R (>= 4.0.0),
     rhdf5
+Imports:
+    ineq,
+    labdsv,
+    vegan,
+    progress,
+    pROC,
+    matrixStats,
+    phyloseq,
+    nlme,
+    ggplot2,
+    tmvtnorm
 Suggests:
     knitr,
     rmarkdown,
     tidyverse,
-    patchwork
+    dplyr,
+    stringr,
+    purrr,
+    patchwork,
+    BiocStyle,
+    MASS,
+    MAST,
+    edgeR,
+    exactRankTests,
+    limma,
+    coin,
+    scde,
+    lmerTest,
+    fastANCOM,
+    SingleCellExperiment,
+    distinct,
+    corncob,
+    ANCOMBC,
+    metagenomeSeq,
+    LDM,
+    SIAMCAT,
+    DESeq2,
+    LinDA,
+    ALDEx2,
+    ZINQ,
+    ZIBSeq,
+    sparseDOSSA,
+    mvtnorm,
+    SpiecEasi,
+    TailRank,
+    GUniFrac,
+    car,
+    mixOmics,
+    lmtest
+VignetteBuilder: knitr
 License: GPL-3
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 7.2.1
+RoxygenNote: 7.2.3
diff --git a/NAMESPACE b/NAMESPACE
@@ -15,5 +15,30 @@ importFrom(matrixStats,rowQuantiles)
 importFrom(matrixStats,rowVars)
 importFrom(pROC,roc)
 importFrom(progress,progress_bar)
+importFrom(stats,anova)
+importFrom(stats,as.formula)
+importFrom(stats,coefficients)
+importFrom(stats,cor)
+importFrom(stats,cov2cor)
+importFrom(stats,dnorm)
+importFrom(stats,fisher.test)
+importFrom(stats,formula)
+importFrom(stats,kruskal.test)
+importFrom(stats,ks.test)
+importFrom(stats,lm)
+importFrom(stats,model.matrix)
+importFrom(stats,na.omit)
+importFrom(stats,p.adjust)
+importFrom(stats,pnorm)
+importFrom(stats,qnbinom)
+importFrom(stats,quantile)
+importFrom(stats,residuals)
+importFrom(stats,rgamma)
+importFrom(stats,rmultinom)
+importFrom(stats,rnbinom)
+importFrom(stats,sd)
+importFrom(stats,t.test)
+importFrom(stats,var)
+importFrom(stats,wilcox.test)
 importFrom(vegan,adonis)
 importFrom(vegan,vegdist)
diff --git a/R/data_docs.R b/R/data_docs.R
@@ -1,21 +1,38 @@
-#' @title Toy example
+#' @title SIMBA example features
 #'
-#' @description Feature and metadata table from the dataset from Zeevi et al
-#' (see \url{https://doi.org/10.1016/j.cell.2015.11.001}), containing microbial
-#' features profiled with mOTUs2
-#' (see \url{http://dx.doi.org/10.1038/s41467-019-08844-4}). For some
-#' patients, more than one same has been collected.
+#' @description Taxonomic profiles from the study by Zeevi et al. Cell 2014
+#' (see \doi{10.1016/j.cell.2015.11.001}), 
+#' containing 1952 features from 1181 samples, created by the motus profiler.
 #'
-#' Mainly used for running the examples in the function documentation and in
-#' the package vignette.
+#' Mainly used for running the examples in the function documentation and
+#' in the vignette.
 #'
 #' @encoding UTF-8
 #'
-#' @name toy_example
+#' @name toy.feat
 #'
-#' @source \url{https://doi.org/10.1016/j.cell.2015.11.001}
+#' @source \doi{10.1016/j.cell.2015.11.001}
 #'
 #' @docType data
 #'
 #' @keywords data
 NULL
+
+#' @title SIMBA example metadata
+#'
+#' @description Metadata from the study by Zeevi et al. Cell 2014
+#' (see \doi{10.1016/j.cell.2015.11.001}).
+#'
+#' Mainly used for running the examples in the function documentation and
+#' in the vignette.
+#'
+#' @encoding UTF-8
+#'
+#' @name toy.meta
+#'
+#' @source \doi{10.1016/j.cell.2015.11.001}
+#'
+#' @docType data
+#'
+#' @keywords data
+NULL
diff --git a/R/eval_test.R b/R/eval_test.R
@@ -9,7 +9,7 @@
 #' @description This function takes the results of the \link{apply.test}
 #' function and calculates different evaluation measures (see Details).
 #'
-#' @usage eval.test(sim.location, group, res.mat, alpha = 0.05)
+#' @usage eval.test(sim.location, group, res.mat, adjust='BH', alpha = 0.05)
 #'
 #' @param sim.location file name for the .h5 file containing the simulations
 #'
@@ -18,6 +18,9 @@
 #'
 #' @param res.mat matrix, output from the \link{apply.test} function
 #'
+#' @param adjust character, indicate the multiple hypothesis testing to be
+#' performed on the P-values, defaults to "BH"
+#' 
 #' @param alpha numeric, significance threshold at which the test will be
 #' evaluated, defaults to \code{0.05}
 #'
@@ -105,9 +108,9 @@ check.eval.parameters <- function(sim.location, group,
   if (mode(adjust)!='character' | length(adjust)!=1){
     stop("Parameter 'adjust' should be a character and of length 1!")
   }
-  if (!adjust %in% p.adjust.methods){
+  if (!adjust %in% stats::p.adjust.methods){
     stop("Parameter 'adjust' must be one of these: c('", 
-         paste(p.adjust.methods, collapse = "', '"), "')")
+         paste(stats::p.adjust.methods, collapse = "', '"), "')")
   }
   # get marker features
   this <- h5read(file = sim.location, name = group)

diff --git a/R/helper_ancom_2.R b/R/helper_ancom_2.R
@@ -148,7 +148,7 @@ ANCOM = function(feature_table, meta_data, struc_zero = NULL, main_var, p_adj_me
   if (is.null(rand_formula) & is.null(adj_formula)) {
     # Basic model
     # Whether the main variable of interest has two levels or more?
-    if (length(unique(meta_data%>%pull(main_var))) == 2) {
+    if (length(unique(dplyr::pull(meta_data, main_var))) == 2) {
       # Two levels: Wilcoxon rank-sum test
       tfun = stats::wilcox.test
     } else{
@@ -237,7 +237,7 @@ ANCOM = function(feature_table, meta_data, struc_zero = NULL, main_var, p_adj_me
   out_comp = data.frame(taxa_id, W, row.names = NULL, check.names = FALSE)
   # Declare a taxon to be differentially abundant based on the quantile of W statistic.
   # We perform (n_taxa - 1) hypothesis testings on each taxon, so the maximum number of rejections is (n_taxa - 1).
-  out_comp = out_comp%>%mutate(detected_0.9 = ifelse(W > 0.9 * (n_taxa -1), TRUE, FALSE),
+  out_comp = dplyr::mutate(out_comp, detected_0.9 = ifelse(W > 0.9 * (n_taxa -1), TRUE, FALSE),
                                detected_0.8 = ifelse(W > 0.8 * (n_taxa -1), TRUE, FALSE),
                                detected_0.7 = ifelse(W > 0.7 * (n_taxa -1), TRUE, FALSE),
                                detected_0.6 = ifelse(W > 0.6 * (n_taxa -1), TRUE, FALSE))

diff --git a/R/helper_clean.R b/R/helper_clean.R
@@ -273,7 +273,7 @@ validate.original.data <- function(d.list, sim.method){
   # features
   feat.all <- do.call(cbind, feat.final)
   # metadata
-  colnames.final <- reduce(colnames.meta, intersect)
+  colnames.final <- purrr::reduce(colnames.meta, intersect)
   for (j in seq_along(meta.final)){
     meta.final[[j]] <- meta.final[[j]][,colnames.final]
   }
@@ -442,9 +442,11 @@ confounder.check <- function(feat, meta){
   }
 
   # create plot
-  ggplot(df.plot.all, aes(x=effect.size, y=-log10(p.val))) +
-    geom_point() +
-    facet_grid(~variable)
+  ggplot2::ggplot(df.plot.all, 
+                  ggplot2::aes(x=df.plot.all$effect.size, 
+                               y=-log10(df.plot.all$p.val))) +
+    ggplot2::geom_point() +
+    ggplot2::facet_grid(~variable)
 }
 
 # check simulation parameters

diff --git a/R/helper_hawinkel.R b/R/helper_hawinkel.R
@@ -87,8 +87,8 @@ simulate.betabin <- function(feat, meta, sim.out, sim.params){
     message("++ Only ", n.cores, " cores detected. It would be better to",
             " have more cores available!")
   }
-  res <- spiec.easi(t(feat), verbose=TRUE, pulsar.params=list(ncores=n.cores))
-  correlation <- getOptCov(res)
+  res <- SpiecEasi::spiec.easi(t(feat), verbose=TRUE, pulsar.params=list(ncores=n.cores))
+  correlation <- SpiecEasi::getOptCov(res)
 
   # calculate rhos/phis
   message("++ Starting to estimate parameters ",
@@ -149,8 +149,8 @@ simulate.negbin <- function(feat, meta, sim.out, sim.params){
       message("++ Only ", n.cores, " cores detected. It would be better to",
               " have more cores available!")
     }
-    res <- spiec.easi(t(feat), verbose=TRUE, pulsar.params=list(ncores=n.cores))
-    correlation <- getOptCov(res)
+    res <- SpiecEasi::spiec.easi(t(feat), verbose=TRUE, pulsar.params=list(ncores=n.cores))
+    correlation <- SpiecEasi::getOptCov(res)
   } else {
     correlation <- NULL
   }
@@ -323,7 +323,7 @@ simulate.markers.betabin <- function(pis, theta,
   for(nRun in seq_along(sampleSizes)){
     indSel <- (samSizeSeq[nRun] + 1L):samSizeSeq[nRun + 1L]
     dmData[indSel, ] <- rmvbetabin(n=sampleSizes[nRun], pi=pis.all[, nRun],
-                                   libSize=libSizesOrig[indSel],
+                                   libSizes=libSizesOrig[indSel],
                                    Sigma=correlation, theta = theta)
   }
 
@@ -417,10 +417,11 @@ rmvnegbin <- function(n, mu, Sigma, ks, ...) {
   if (dim(mu)[2] != dim(Sigma)[2])
     stop("Sigma and mu dimensions don't match")
   if (missing(ks)) {
-    ks <- unlist(lapply(1:length(SDs), function(i) .negbin_getK(mu[i], SDs[i])))
+    # ks <- unlist(lapply(1:length(SDs), function(i) .negbin_getK(mu[i], SDs[i])))
+    stop("Problem with this dataset!")
   }
   d <- dim(mu)[2]
-  normd <- rmvnorm(n, rep(0, d), Sigma = Cor)  #The normal-to-anything framework
+  normd <- mvtnorm::rmvnorm(n, rep(0, d), Sigma = Cor)  #The normal-to-anything framework
   unif <- pnorm(normd)
   data <- t(qnbinom(t(unif), mu = t(mu), size = ks, ...))
   data <- fixInf(data)
@@ -431,7 +432,7 @@ rmvnegbin <- function(n, mu, Sigma, ks, ...) {
 # on the Tailrank package
 #' @keywords internal
 qbetabin = function(p, N, u, v) {
-  pp <- cumsum(dbb(0:N, N, u, v))
+  pp <- cumsum(TailRank::dbb(0:N, N, u, v))
   sapply(p, function(x) sum(pp < x, na.rm = TRUE))
 }
 
@@ -450,7 +451,7 @@ rmvbetabin = function(n, pi, Sigma, theta, libSizes, ...) {
     stop("No overdispersion parameter supplied")
   }
   d <- length(pi)
-  normd <- rmvnorm(n, rep(0, d), Sigma = Cor)  #The normal-to-anything framework
+  normd <- mvtnorm::rmvnorm(n, rep(0, d), Sigma = Cor)  #The normal-to-anything framework
   unif <- pnorm(normd)
   data <- mapply(unif, rep(pi, each = nrow(unif)), libSizes,
                  FUN = function(u, p, l) {