diff --git a/DESCRIPTION b/DESCRIPTION index d3bd9814..74659a01 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: splatter Type: Package Title: Simple Simulation of Single-cell RNA Sequencing Data -Version: 1.13.0 -Date: 2020-04-29 +Version: 1.13.1 +Date: 2020-10-22 Authors@R: c(person("Luke", "Zappia", role = c("aut", "cre"), email = "luke@lazappi.id.au", @@ -10,6 +10,9 @@ Authors@R: person("Belinda", "Phipson", role = c("aut"), email = "belinda.phipson@petermac.org", comment = c(ORCID = "0000-0002-1711-7454")), + person("Christina", "Azodi", role = c("ctb"), + email = "cazodi@svi.edu.au", + comment = c(ORCID = "0000-0002-6097-606X")), person("Alicia", "Oshlack", role = c("aut"), email = "alicia.oshlack@petermac.org", comment = c(ORCID = "0000-0001-9788-5690"))) @@ -45,12 +48,14 @@ Suggests: BiocStyle, covr, cowplot, + magick, knitr, limSolve, lme4, progress, pscl, testthat, + preprocessCore, rmarkdown, scDD, scran, @@ -63,12 +68,17 @@ Suggests: spelling, igraph, DropletUtils, - BiocSingular + BiocSingular, + VariantAnnotation, + Biostrings, + GenomeInfoDb, + GenomicRanges, + IRanges biocViews: SingleCell, RNASeq, Transcriptomics, GeneExpression, Sequencing, Software, ImmunoOncology URL: https://github.com/Oshlack/splatter BugReports: https://github.com/Oshlack/splatter/issues -RoxygenNote: 7.1.0 +RoxygenNote: 7.1.1 Encoding: UTF-8 VignetteBuilder: knitr Language: en-GB diff --git a/NAMESPACE b/NAMESPACE index b598fdb6..09d64700 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -44,6 +44,10 @@ export(makeDiffPanel) export(makeOverallPanel) export(mfaEstimate) export(mfaSimulate) +export(mockBulkMatrix) +export(mockBulkeQTL) +export(mockGFF) +export(mockVCF) export(newBASiCSParams) export(newKersplatParams) export(newLun2Params) @@ -54,6 +58,7 @@ export(newSCDDParams) export(newSimpleParams) export(newSparseDCParams) export(newSplatParams) +export(newSplatPopParams) export(newZINBParams) export(phenoEstimate) export(phenoSimulate) @@ -66,6 +71,11 @@ export(simpleSimulate) export(sparseDCEstimate) export(sparseDCSimulate) export(splatEstimate) +export(splatPopEstimate) +export(splatPopQuantNorm) +export(splatPopSimulate) +export(splatPopSimulateMeans) +export(splatPopSimulateSC) export(splatSimulate) export(splatSimulateGroups) export(splatSimulatePaths) @@ -83,6 +93,7 @@ exportClasses(SCDDParams) exportClasses(SimpleParams) exportClasses(SparseDCParams) exportClasses(SplatParams) +exportClasses(SplatPopParams) exportClasses(ZINBParams) importFrom(BiocParallel,SerialParam) importFrom(BiocParallel,bplapply) @@ -90,6 +101,7 @@ importFrom(S4Vectors,"metadata<-") importFrom(S4Vectors,metadata) importFrom(SingleCellExperiment,"cpm<-") importFrom(SingleCellExperiment,SingleCellExperiment) +importFrom(SingleCellExperiment,cbind) importFrom(SingleCellExperiment,cpm) importFrom(SummarizedExperiment,"assays<-") importFrom(SummarizedExperiment,"colData<-") @@ -129,7 +141,9 @@ importFrom(ggplot2,theme) importFrom(ggplot2,theme_minimal) importFrom(ggplot2,xlab) importFrom(ggplot2,ylab) +importFrom(grDevices,boxplot.stats) importFrom(locfit,locfit) +importFrom(matrixStats,rowMedians) importFrom(methods,"slot<-") importFrom(methods,as) importFrom(methods,callNextMethod) @@ -141,6 +155,7 @@ importFrom(methods,slotNames) importFrom(methods,validObject) importFrom(stats,aggregate) importFrom(stats,approxfun) +importFrom(stats,complete.cases) importFrom(stats,cor) importFrom(stats,dbeta) importFrom(stats,density) @@ -148,7 +163,9 @@ importFrom(stats,dnbinom) importFrom(stats,ks.test) importFrom(stats,median) importFrom(stats,model.matrix) +importFrom(stats,na.omit) importFrom(stats,nls) +importFrom(stats,quantile) importFrom(stats,rbinom) importFrom(stats,rchisq) importFrom(stats,rgamma) @@ -157,5 +174,8 @@ importFrom(stats,rnbinom) importFrom(stats,rnorm) importFrom(stats,rpois) importFrom(stats,runif) +importFrom(stats,sd) +importFrom(stats,setNames) importFrom(stats,shapiro.test) +importFrom(utils,data) importFrom(utils,head) diff --git a/NEWS.md b/NEWS.md index 5c469027..cee24790 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # DEVELOPMENT VERSION +## Version 1.13.1 (2020-10-22) + +* Add the splatPop simulation (PR #106) + ## Version 1.13.0 (2020-04-29) * Bioconductor 3.12 devel diff --git a/R/AllClasses.R b/R/AllClasses.R index bc1ce616..cc9dc41d 100644 --- a/R/AllClasses.R +++ b/R/AllClasses.R @@ -25,6 +25,7 @@ setClass("Params", prototype = prototype(nGenes = 10000, nCells = 100, seed = sample(seq_len(1e6), 1))) + #' The SimpleParams class #' #' S4 class that holds parameters for the simple simulation. @@ -417,6 +418,103 @@ setClass("KersplatParams", ambient.scale = 0.05, ambient.nEmpty = 0)) +#' The SplatPopParams class +#' +#' S4 class that holds parameters for the splatPop simulation. +#' +#' @section Parameters: +#' +#' In addition to the \code{\link{SplatParams}} parameters, splatPop simulation +#' requires the following parameters: +#' +#' \describe{ +#' \item{\code{[similarity.scale]}}{Scaling factor for pop.cv.param.rate, +#' where values larger than 1 increase the similarity between individuals in +#' the population and values less than one make the individuals less +#' similar.} +#' \item{\code{[eqtl.n]}}{The number (>1) or percent (<=1) of genes to +#' assign eQTL effects.} +#' \item{\code{[eqtl.dist]}}{Maximum distance between eSNP and eGene} +#' \item{\code{[eqtl.maf.min]}}{Minimum Minor Allele Frequency of eSNPs.} +#' \item{\code{[eqtl.maf.max]}}{Maximum Minor Allele Frequency of eSNPs.} +#' \item{\code{[eqtl.group.specific]}}{Percent of eQTL effects to simulate +#' as group specific.} +#' \item{\emph{eQTL Effect size distribution parameters. Defaults estimated +#' from GTEx eQTL mapping results, see vignette for more information.}}{ +#' \describe{ +#' \item{\code{eqtl.ES.shape}}{Shape parameter for the effect size +#' gamma distribution.} +#' \item{\code{eqtl.ES.rate}}{Rate parameter for the effect size +#' gamma distribution.} +#' } +#' } +#' \item{\emph{Bulk Mean Expression distribution parameters. Defaults +#' estimated from GTEx data, see vignette for more information.}}{ +#' \describe{ +#' \item{\code{pop.mean.shape}}{Shape parameter for the mean (i.e. +#' bulk) expression gamma distribution} +#' \item{\code{pop.mean.rate}}{Rate parameter for the mean (i.e. +#' bulk) expression gamma distribution} +#' } +#' } +#' \item{\emph{Bulk Expression Coefficient of Variation distribution +#' parameters binned. Defaults estimated from GTEx data, see vignette for +#' more information.}}{ +#' \describe{ +#' \item{\code{pop.cv.param}}{Dataframe containing gene +#' mean bin range, and the CV shape, and CV rate parameters for +#' each of those bins.} +#' } +#' } +#'} +#' The parameters not shown in brackets can be estimated from real data using +#' \code{\link{splatPopEstimate}}. For details of the eQTL simulation +#' see \code{\link{splatPopSimulate}}. +#' +#' @name SplatPopParams +#' @rdname SplatPopParams +#' @aliases SplatPopParams-class +#' @exportClass SplatPopParams +setClass("SplatPopParams", + contains = "SplatParams", + slots = c(similarity.scale = "numeric", + pop.mean.shape = "numeric", + pop.mean.rate = "numeric", + pop.cv.bins = "numeric", + pop.cv.param = "data.frame", + eqtl.n = "numeric", + eqtl.dist = "numeric", + eqtl.maf.min = "numeric", + eqtl.maf.max = "numeric", + eqtl.ES.shape = "numeric", + eqtl.ES.rate = "numeric", + eqtl.group.specific = "numeric"), + prototype = prototype(similarity.scale = 1.0, + pop.mean.shape = 0.3395709, + pop.mean.rate = 0.008309486, + pop.cv.bins = 10, + pop.cv.param = + data.frame( + start = c(0, 0.476, 0.955, 1.86, 3.49, + 6.33, 10.4, 16.3, 26.5,49.9), + end = c(0.476 ,0.955, 1.86, 3.49, 6.33, + 10.4, 16.3, 26.5, 49.9, 1e+10), + shape = c(11.636709, 5.084263, 3.161149, + 2.603407, 2.174618, 2.472718, + 2.911565, 3.754947, 3.623545, + 2.540001), + rate = c(8.229737, 3.236401, 1.901426, + 1.615142, 1.467896, 2.141105, + 3.005807, 4.440894, 4.458207, + 2.702462)), + eqtl.n = 1, + eqtl.dist = 1000000, + eqtl.maf.min = 0.05, + eqtl.maf.max = 0.5, + eqtl.ES.shape = 2.538049, + eqtl.ES.rate = 5.962323, + eqtl.group.specific = 0.2)) + #' The LunParams class #' #' S4 class that holds parameters for the Lun simulation. diff --git a/R/SplatPopParams-methods.R b/R/SplatPopParams-methods.R new file mode 100644 index 00000000..71a96c79 --- /dev/null +++ b/R/SplatPopParams-methods.R @@ -0,0 +1,94 @@ +#' @rdname newParams +#' @importFrom methods new +#' @export +newSplatPopParams <- function(...) { + + for (pkg in c("VariantAnnotation", "preprocessCore")) { + if (!requireNamespace(pkg, quietly = TRUE)) { + stop("The splatPop simulation requires the ", pkg, " package.") + } + } + + params <- new("SplatPopParams") + params <- setParams(params, ...) + + return(params) +} + + +#' @importFrom checkmate checkInt checkIntegerish checkNumber checkNumeric +#' checkFlag +setValidity("SplatPopParams", function(object) { + + v <- getParams(object, c(slotNames(object))) + + checks <- c(eqtl.n = checkNumber(v$eqtl.n, lower = 0), + eqtl.dist = checkInt(v$eqtl.dist, lower = 1), + eqtl.maf.min = checkNumber(v$eqtl.maf.min, lower = 0, + upper = 0.5), + eqtl.maf.max = checkNumber(v$eqtl.maf.max, lower = 0, + upper = 0.5), + eqtl.ES.shape = checkNumber(v$eqtl.ES.shape, lower = 0), + eqtl.ES.rate = checkNumber(v$eqtl.ES.rate, lower = 0), + eqtl.group.specific = checkNumber(v$eqtl.group.specific, + lower = 0, upper = 1), + pop.mean.shape = checkNumber(v$pop.mean.shape, lower = 0), + pop.mean.rate = checkNumber(v$pop.mean.rate, lower = 0), + pop.cv.bins = checkInt(v$pop.cv.bins, lower = 1), + pop.cv.param = checkDataFrame(v$pop.cv.param), + similarity.scale = checkNumber(v$similarity.scale, lower = 0)) + + if (all(checks == TRUE)) { + valid <- TRUE + } else { + valid <- checks[checks != TRUE] + valid <- paste(names(valid), valid, sep = ": ") + } + + return(valid) +}) + + +#' @importFrom methods callNextMethod +setMethod("show", "SplatPopParams", function(object) { + + pp <- list("Population params:" = c("(mean.shape)" = "pop.mean.shape", + "(mean.rate)" = "pop.mean.rate", + "[similarity.scale]" = "similarity.scale", + "[cv.bins]" = "pop.cv.bins", + "(cv.params)" = "pop.cv.param"), + "eQTL params:" = c("[eqtl.n]" = "eqtl.n", + "[eqtl.dist]" = "eqtl.dist", + "[eqtl.maf.min]" = "eqtl.maf.min", + "[eqtl.maf.max]" = "eqtl.maf.max", + "[eqtl.group.specific]" = "eqtl.group.specific", + "(eqtl.ES.shape)" = "eqtl.ES.shape", + "(eqtl.ES.rate)" = "eqtl.ES.rate")) + + callNextMethod() + showPP(object, pp) +}) + + +#' @rdname setParam +setMethod("setParam", "SplatPopParams", function(object, name, value) { + checkmate::assertString(name) + + # splatPopParam checks + if (name == "pop.cv.param") { + if (getParam(object, "pop.cv.bins") != nrow(value)) { + stop("Need to set pop.cv.bins to length of pop.cv.param") + } + } + + if (name == "eqtl.maf.min") { + if (getParam(object, "eqtl.maf.min") >= getParam(object, "eqtl.maf.max")) { + stop("Range of acceptable Minor Allele Frequencies is too small... + Be sure eqtl.maf.min < eqtl.maf.max.") + } + } + + object <- callNextMethod() + + return(object) +}) diff --git a/R/listSims.R b/R/listSims.R index 891bb408..48f67a0c 100644 --- a/R/listSims.R +++ b/R/listSims.R @@ -36,6 +36,12 @@ listSims <- function(print = TRUE) { "The Kersplat simulation extends the Splat model by adding a gene network, more complex cell structure, doublets and empty cells (Experimental)."), + c("splatPop", "splatPop", "", + "Oshlack/splatter", + "The splatPop simulation enables splat simulations to be + generated for multiple individuals in a population, + accounting for correlation structure by simulating + expression quantitative trait loci (eQTL)."), c("Simple", "simple", "10.1186/s13059-017-1305-0", "Oshlack/splatter", "A simple simulation with gamma means and negative binomial diff --git a/R/mock-data.R b/R/mock-data.R new file mode 100644 index 00000000..48899cc6 --- /dev/null +++ b/R/mock-data.R @@ -0,0 +1,156 @@ +#' Generate mock gff +#' +#' Quick function to generate a mock gff. +#' +#' @param n.genes Number of genes in mock gff file +#' @param chromosome Chromosome name +#' +#' @return data.frame containing mock gff data. +#' +#' @export +#' +mockGFF <- function(n.genes = 500, chromosome = 22){ + + mock.gff <- data.frame(list(V1 = chromosome, + V2 = "source", + V3 = "gene", + V4 = sort(sample(1e4:2e8, n.genes)))) + mock.gff$V5 <- mock.gff$V4 + floor(rnorm(n.genes, 1500, 1000)) + mock.gff[, c("V6", "V7", "V8", "V9")] <- "." + + return(mock.gff) +} + + +#' Generate mock vcf +#' +#' Quick function to generate mock vcf file. Note this data has unrealistic +#' population structure. +#' +#' @param n.snps Number of SNPs in mock vcf file. +#' @param n.samples Number of samples in mock bulk data. +#' @param chromosome Chromosome name +#' +#' @return data.frame containing mock gff data. +#' +#' @importFrom stats setNames +#' +#' @export +#' +mockVCF <- function(n.snps = 1e4, n.samples = 10, chromosome = 22){ + + + if (!requireNamespace("VariantAnnotation", quietly = TRUE)) { + stop("Creating a mock VCF requires the 'VariantAnnotation' package.") + } + + sample_names <- paste0("sample_", formatC(seq_len(n.samples), + width = nchar(n.samples), + format = "d", + flag = "0")) + snp_names <- paste0("snp_", formatC(seq_len(n.snps), + width = nchar(n.snps), + format = "d", flag = "0")) + # rowRanges + vcf.rowRanges <- GenomicRanges::GRanges( + seqnames = S4Vectors::Rle(rep(chromosome, n.snps)), + ranges = IRanges::IRanges(sample(1:2e8, n.snps, replace = FALSE), + names = snp_names), + strand = S4Vectors::Rle(BiocGenerics::strand(rep("*", n.snps))), + paramRangeID = S4Vectors::Rle(rep(NA, n.snps)) + ) + + # Geno + genotypes <- c(rep("0|0", 100), rep("1|0", 10), rep("1|1", 10)) + geno <- setNames(data.frame(matrix(ncol = n.samples, nrow = n.snps)), + sample_names) + geno[, sample_names] <- sample(genotypes, n.samples * n.snps, + replace = TRUE) + row.names(geno) <- snp_names + geno <- as.matrix(geno) + + # colData, info, and fixed + vcf.col.data <- S4Vectors::DataFrame(list(Samples = seq_len(n.samples))) + row.names(vcf.col.data) <- sample_names + + vcf.info <- S4Vectors::DataFrame(list(VT = rep("SNP", n.snps))) + row.names(vcf.info) <- snp_names + + vcf.fixed <- S4Vectors::DataFrame( + REF = Biostrings::DNAStringSet(rep("A", n.snps)), + ALT = rep(Biostrings::DNAStringSetList("T"), n.snps), + QUAL = rep(100, n.snps), + FILTER = rep("PASS", n.snps) + ) + + # Merge into mock VCF object + mock.vcf <- VariantAnnotation::VCF(rowRanges = vcf.rowRanges, + colData = vcf.col.data, + info = vcf.info, + fixed = vcf.fixed, + geno = list(GT=geno), + collapsed = TRUE) + + return(mock.vcf) +} + + +#' Generate mock bulk population scale expression data +#' +#' Quick function to generate mock bulk expression data for a population, with +#' parameters estimated using real thyroid tissue data from GTEx. +#' +#' @param n.genes Number of genes in mock bulk data. +#' @param n.samples Number of samples in mock bulk data. +#' +#' @return matrix containing mock bulk expression data. +#' +#' @export + +mockBulkMatrix <- function(n.genes = 1000, n.samples = 100){ + + tmp.params <- newSplatPopParams() + mean.shape <- getParam(tmp.params, "pop.mean.shape") + mean.rate <- getParam(tmp.params, "pop.mean.rate") + cv.df <- getParam(tmp.params, "pop.cv.param") + cv.shape <- cv.df[5, "shape"] + cv.rate <- cv.df[5, "rate"] + + key <- data.frame(list(id = seq_len(n.genes), + mean = rgamma(n.genes, mean.shape, mean.rate), + cv = rgamma(n.genes, cv.shape, cv.rate))) + + mock.means <- lapply(key$id, function(g) rnorm(n.samples, + mean = key[key$id == g,]$mean, + sd = key[key$id == g,]$mean * + key[key$id == g,]$cv)) + + mock.means <- do.call(rbind, mock.means) + mock.means[mock.means < 0] <- 0 + + return(mock.means) +} + +#' Generate mock eQTL mapping results +#' +#' Quick function to generate mock eQTL mapping results, with parameters +#' estimated using real eQTL mapping results from GTEX using thyroid tissue. +#' +#' @param n.genes Number of genes in mock eQTL data. +#' +#' @return data.frame containing mock bulk eQTL mapping results. +#' +#' @export +#' +mockBulkeQTL <- function(n.genes = 1000){ + + tmp.params <- newSplatPopParams() + eqtl.shape <- getParam(tmp.params, "eqtl.ES.shape") + eqtl.rate <- getParam(tmp.params, "eqtl.ES.rate") + + mock.eq <- data.frame(list(gene_id = seq_len(n.genes), + pval_nominal = 0.01, + slope = rgamma(n.genes, eqtl.shape, eqtl.rate))) + + return(mock.eq) +} diff --git a/R/splat-estimate.R b/R/splat-estimate.R index d2fe3d50..6d69eae2 100644 --- a/R/splat-estimate.R +++ b/R/splat-estimate.R @@ -13,7 +13,7 @@ #' \code{\link{splatEstOutlier}}, \code{\link{splatEstBCV}}, #' \code{\link{splatEstDropout}} #' -#' @return SplatParams object containing the estimated parameters. +#' @return SplatParams object with estimated values. #' #' @examples #' # Load example data @@ -78,7 +78,7 @@ splatEstimate.matrix <- function(counts, params = newSplatParams()) { #' winsorized by setting the top and bottom 10 percent of values to the 10th #' and 90th percentiles. #' -#' @return SplatParams object with estimated values. +#' @return SplatParams object containing the estimated parameters. splatEstMean <- function(norm.counts, params) { means <- rowMeans(norm.counts) diff --git a/R/splat-simulate.R b/R/splat-simulate.R index 927e0c69..471016e0 100644 --- a/R/splat-simulate.R +++ b/R/splat-simulate.R @@ -7,7 +7,7 @@ #' See \code{\link{SplatParams}} for details. #' @param method which simulation method to use. Options are "single" which #' produces a single population, "groups" which produces distinct groups -#' (eg. cell types) or "paths" which selects cells from continuous +#' (eg. cell types), or "paths" which selects cells from continuous #' trajectories (eg. differentiation processes). #' @param verbose logical. Whether to print progress messages. #' @param ... any additional parameter settings to override what is provided in @@ -156,8 +156,8 @@ splatSimulate <- function(params = newSplatParams(), method <- "single" } - if (verbose) {message("Creating simulation object...")} # Set up name vectors + if (verbose) {message("Creating simulation object...")} cell.names <- paste0("Cell", seq_len(nCells)) gene.names <- paste0("Gene", seq_len(nGenes)) batch.names <- paste0("Batch", seq_len(nBatches)) @@ -219,6 +219,7 @@ splatSimulate <- function(params = newSplatParams(), if (verbose) {message("Done!")} return(sim) + } #' @rdname splatSimulate @@ -295,6 +296,10 @@ splatSimLibSizes <- function(sim, params) { #' @importFrom stats rgamma median splatSimGeneMeans <- function(sim, params) { + # Note: splatPopSimGeneMeans in splatPop-simulate.R mirrors this function. + # If changes are made to the "add expression outliers" method here, please + # make the same changes in splatPopSimGeneMeans. + nGenes <- getParam(params, "nGenes") mean.shape <- getParam(params, "mean.shape") mean.rate <- getParam(params, "mean.rate") diff --git a/R/splatPop-estimate.R b/R/splatPop-estimate.R new file mode 100644 index 00000000..57232fc1 --- /dev/null +++ b/R/splatPop-estimate.R @@ -0,0 +1,172 @@ +#' Estimate population/eQTL simulation parameters +#' +#' Estimate simulation parameters for the eQTL population simulation from +#' real data. See the individual estimation functions for more details on +#' how this is done. +#' +#' @param params SplatPopParams object containing parameters for the +#' simulation of the mean expression levels for the population. +#' See \code{\link{SplatPopParams}} for details. +#' @param counts either a counts matrix or a SingleCellExperiment object +#' containing count data to estimate parameters from. +#' @param means Matrix of real gene means across a population, where +#' each row is a gene and each column is an individual in the population. +#' @param eqtl data.frame with all or top eQTL pairs from a real eQTL analysis. +#' Must include columns: 'gene_id', 'pval_nominal', and 'slope'. +#' +#' @seealso +#' \code{\link{splatPopEstimateEffectSize}}, +#' \code{\link{splatPopEstimateMeanCV}} +#' +#' @return SplatPopParams object containing the estimated parameters. +#' +#' @export +splatPopEstimate <- function(params = newSplatPopParams(), counts = NULL, + means = NULL, eqtl = NULL) { + + checkmate::assertClass(params, "SplatPopParams") + + # Estimate single-cell parameters using base splatEstimate function + if (!is.null(counts)) { + params <- splatEstimate(counts, params) + } + + # Get parameters for eQTL Effect Size distribution + if (!is.null(eqtl)) { + params <- splatPopEstimateEffectSize(params, eqtl) + } + + # Get parameters for population wide gene mean and variance distributions + if (!is.null(means)) { + params <- splatPopEstimateMeanCV(params, means) + } + + return(params) + +} + +#' Estimate eQTL Effect Size parameters +#' +#' Estimate rate and shape parameters for the gamma distribution used to +#' simulate eQTL (eSNP-eGene) effect sizes. +#' +#' @param eqtl data.frame with all or top eQTL pairs from a real eQTL analysis. +#' Must include columns: gene_id, pval_nominal, and slope. +#' @param params SplatPopParams object containing parameters for the +#' simulation of the mean expression levels for the population. +#' See \code{\link{SplatPopParams}} for details. +#' +#' @details +#' Parameters for the gamma distribution are estimated by fitting the top eSNP- +#' eGene pair effect sizes using \code{\link[fitdistrplus]{fitdist}}. The +#' maximum goodness-of-fit estimation method is used to minimise the +#' Cramer-von Mises distance. This can fail in some situations, in which case +#' the method of moments estimation method is used instead. +#' +#' @return params object with estimated values. +#' +splatPopEstimateEffectSize <- function(params, eqtl) { + + # Test input eSNP-eGene pairs + if (!("gene_id" %in% names(eqtl) & + "pval_nominal" %in% names(eqtl) & + "slope" %in% names(eqtl))) { + stop("Incorrect format for eqtl data.")} + + # Select top eSNP for each gene (i.e. lowest p.value) + eqtl.top <- eqtl[order(eqtl$gene_id, eqtl$pval_nominal), ] + eqtl.top <- eqtl.top[!duplicated(eqtl.top$gene_id), ] + + # Fit absolute value of effect sizes to gamma distribution + e.sizes <- abs(eqtl.top$slope) + fit <- fitdistrplus::fitdist(e.sizes, "gamma", method = "mge", gof = "CvM") + if (fit$convergence > 0) { + warning("Fitting effect sizes using the Goodness of Fit method failed,", + " using the Method of Moments instead") + fit <- fitdistrplus::fitdist(e.sizes, "gamma", method = "mme") + } + + params <- setParams(params, eqtl.ES.shape = unname(fit$estimate["shape"]), + eqtl.ES.rate = unname(fit$estimate["rate"])) + + return(params) +} + +#' Estimate gene mean and gene mean variance parameters +#' +#' @param params SplatPopParams object containing parameters for the +#' simulation of the mean expression levels for the population. +#' See \code{\link{SplatPopParams}} for details. +#' @param means data.frame of real gene means across a population, where +#' each row is a gene and each column is an individual in the population. +#' +#' @details +#' Parameters for the mean gamma distribution are estimated by fitting the mean +#' (across the population) expression of genes that meet the criteria (<50% of +#' samples have exp <0.1) and parameters for the cv gamma distribution are +#' estimated for each bin of mean expression using the cv of expression across +#' the population for genes in that bin. Both are fit using +#' \code{\link[fitdistrplus]{fitdist}}. The "Nelder-Mead" method is used to fit +#' the mean gamma distribution and the maximum goodness-of-fit estimation +#' method is used to minimise the Cramer-von Mises distance for the CV +#' distribution. +#' +#' @return params object with estimated values. +#' @importFrom stats quantile +#' @importFrom grDevices boxplot.stats +#' @importFrom matrixStats rowMedians +#' +splatPopEstimateMeanCV <- function(params, means) { + + # Test input gene means + if ((anyNA(means) | !(validObject(rowSums(means))))) { + stop("Incorrect format or NAs present in gene.means. See example data.") + } + + # Remove genes with low variance/low means + abv.thr <- data.frame(perc = (rowSums(means >= 0.1)/ncol(means))) + + means.use <- means[abv.thr$perc > 0.5, ] + + # Calculate mean expression parameters + row.means <- rowMedians(means.use) + names(row.means) <- row.names(means.use) + mfit <- fitdistrplus::fitdist(row.means, "gamma", + optim.method = "Nelder-Mead") + + # Calculate CV parameters for genes based on 10 mean expression bins + nbins <- getParam(params, "pop.cv.bins") + bins <- split(row.means, cut(row.means, quantile(row.means,(0:nbins)/nbins), + include.lowest = TRUE)) + cvparams <- data.frame(start = character(), end = character(), + shape = character(), rate = character(), + stringsAsFactors = FALSE) + + for(b in names(bins)){ + re.brack.paren <- "\\[|\\]|\\)|\\(" + min.max <- strsplit(gsub(re.brack.paren, "", unlist(b)), split = ",") + + b.gene.means <- means.use[row.means > as.numeric(min.max[[1]][1]) & + row.means < as.numeric(min.max[[1]][2]), ] + + cv <- apply(b.gene.means, 1, co.var) + cv[is.na(cv)] <- 0 + cv <- cv[!cv %in% boxplot.stats(cv)$out] + cvfit <- fitdistrplus::fitdist(cv, "gamma", method = "mge", gof = "CvM") + cvparams <- rbind(cvparams, + list(start = as.numeric(as.numeric(min.max[[1]][1])), + end = as.numeric(as.numeric(min.max[[1]][2])), + shape = cvfit$estimate["shape"], + rate = cvfit$estimate["rate"]), + stringsAsFactors = FALSE) + } + + cvparams[1, "start"] <- 0 + cvparams[nrow(cvparams), "end"] <- 1e100 + params <- setParams(params, + pop.mean.shape = unname(mfit$estimate["shape"]), + pop.mean.rate = unname(mfit$estimate["rate"]), + pop.cv.param = cvparams) + + return(params) +} diff --git a/R/splatPop-simulate.R b/R/splatPop-simulate.R new file mode 100644 index 00000000..c2172802 --- /dev/null +++ b/R/splatPop-simulate.R @@ -0,0 +1,843 @@ +#' splatPop simulation +#' +#' Simulate scRNA-seq count data using the splat model for a population of +#' individuals with correlation structure. +#' +#' @param params SplatPopParams object containing parameters for population +#' scale simulations. See \code{\link{SplatPopParams}} for details. +#' @param vcf VariantAnnotation object containing genotypes of samples. +#' @param method which simulation method to use. Options are "single" which +#' produces a single population, "groups" which produces distinct groups +#' (eg. cell types), "paths" which selects cells from continuous +#' trajectories (eg. differentiation processes). +#' @param gff Either NULL or a data.frame object containing a GFF/GTF file. +#' @param key Either NULL or a data.frame object containing a full or partial +#' splatPop key. +#' @param counts.only logical. Whether to save only counts in sce object. +#' @param verbose logical. Whether to print progress messages. +#' @param ... any additional parameter settings to override what is provided in +#' \code{params}. +#' +#' @details +#' +#' This functions is for simulating data in a single step. It consists of a +#' call to \code{\link{splatPopSimulateMeans}}, which simulates a mean +#' expression level per gene per sample, followed by a call to +#' \code{\link{splatPopSimulateSC}}, which uses the splat model to simulate +#' single-cell counts per individual. Please see the documentation for those +#' functions for more details. +#' +#' @seealso +#' \code{\link{splatPopSimulateMeans}}, \code{\link{splatPopSimulateSC}} +#' +#' @return SingleCellExperiment object containing simulated counts, +#' intermediate values like the gene means simulated in `splatPopSimulateMeans`, +#' and information about the differential expression and eQTL effects assigned +#' to each gene. +#' +#' @export +splatPopSimulate <- function(params = newSplatPopParams(nGenes = 1000), + vcf = mockVCF(), + method = c("single", "groups", "paths"), + gff = NULL, + key = NULL, + counts.only = FALSE, + verbose = TRUE, ...) { + + if (verbose) {message("Getting parameters...")} + params <- setParams(params, ...) + params <- expandParams(params) + validObject(params) + + + sim.means <- splatPopSimulateMeans(vcf = vcf, + params = params, + gff = gff, + key = key, + verbose = verbose) + + sim.sc <- splatPopSimulateSC(sim.means = sim.means$means, + params = params, + key = sim.means$key, + method = method, + counts.only = counts.only, + verbose = verbose) + + return(sim.sc) +} + + +#' splatPopSimulateMeans +#' +#' Simulate mean expression levels for all genes for all samples, with between +#' sample correlation structure simulated with eQTL effects and with the option +#' to simulate multiple groups (i.e. cell-types). +#' +#' @param vcf VariantAnnotation object containing genotypes of samples. +#' @param params SplatPopParams object containing parameters for population +#' scale simulations. See \code{\link{SplatPopParams}} for details. +#' @param gff Either NULL or a data.frame object containing a GFF/GTF file. +#' @param key Either FALSE or a data.frame object containing a full or partial +#' splatPop key. +#' @param verbose logical. Whether to print progress messages. +#' @param ... any additional parameter settings to override what is provided in +#' \code{params}. +#' +#' @details SplatPopParams can be set in a variety of ways. 1. If +#' not provided, default parameters are used. 2. Default parameters can be +#' overridden by supplying desired parameters using \code{\link{setParams}}. +#' 3. Parameters can be estimated from real data of your choice using +#' \code{\link{splatPopEstimate}}. +#' +#' `splatPopSimulateMeans` involves the following steps: +#' \enumerate{ +#' \item Load population key or generate random or GFF/GTF based key. +#' \item Format and subset genotype data from the VCF file. +#' \item If not in key, assign expression mean and variance to each gene. +#' \item If not in key, assign eGenes-eSNPs pairs and effect sizes. +#' \item If not in key and groups >1, assign subset of eQTL associations as +#' group-specific and assign DEG group effects. +#' \item Simulate mean gene expression matrix without eQTL effects +#' \item Quantile normalize by sample to fit single-cell expression +#' distribution as defined in `splatEstimate`. +#' \item Add quantile normalized gene mean and cv info the eQTL key. +#' \item Add eQTL effects to means matrix. +#' } +#' +#' @return A list containing: `means` a matrix (or list of matrices if +#' n.groups > 1) with the simulated mean gene expression value for each gene +#' (row) and each sample (column) and `key` a data.frame with population +#' information including eQTL and group effects. +#' +#' @seealso +#' \code{\link{splatPopParseVCF}}, \code{\link{splatPopParseGenes}}, +#' \code{\link{splatPopAssignMeans}}, +#' \code{\link{splatPopQuantNorm}}, \code{\link{splatPopQuantNormKey}} +#' \code{\link{splatPopeQTLEffects}}, \code{\link{splatPopGroupEffects}}, +#' \code{\link{splatPopSimMeans}}, \code{\link{splatPopSimEffects}}, +#' +#' @export +splatPopSimulateMeans <- function(vcf = mockVCF(), + params = newSplatPopParams(nGenes = 1000), + verbose = TRUE, key = NULL, gff = NULL, ...){ + + set.seed(getParam(params, "seed")) + + nGroups <- getParam(params, "nGroups") + + vcf <- splatPopParseVCF(vcf, params) + group.names <- paste0("Group", seq_len(nGroups)) + + # Genes from key or gff or mock (in that order) + if (is.null(key)) { + key <- splatPopParseGenes(params, gff) + } + + if (!all(c("meanSampled", "cvSampled") %in% names(key))) { + key <- splatPopAssignMeans(params, key) + } + + if (!all(c("eQTL.type", "eSNP.ID", "eQTL.EffectSize") %in% names(key))) { + key <- splatPopeQTLEffects(params, key, vcf) + + if(length(group.names) > 1){ + key <- splatPopGroupEffects(params, key, group.names) + } + } + + if (verbose) {message("Simulating gene means for population...")} + + means.pop <- splatPopSimMeans(vcf, key) + + means.pop <- splatPopQuantNorm(params, means.pop) + key <- splatPopQuantNormKey(key, means.pop) + + eMeansPop <- splatPopSimEffects("global", key, vcf, means.pop) + + if (length(group.names) > 1) { + eMeansPopq.groups <- list() + + for (id in group.names) { + eMeansPop.g <- splatPopSimEffects(id, key, vcf, eMeansPop) + eMeansPop.g[eMeansPop.g <= 0] <- 1e-5 + eMeansPopq.groups[[id]] <- eMeansPop.g + } + + return(list(means = eMeansPopq.groups, key = key)) + + } else { + eMeansPop[eMeansPop <= 0] <- 1e-5 + return(list(means = eMeansPop, key = key)) + } +} + + +#' splatPopSimulateSC +#' +#' Simulate count data for a population from a fictional single-cell +#' RNA-seq experiment using the Splat method. +#' +#' @param sim.means Matrix or list of matrices of gene means for the population. +#' Output from `splatPopSimulateMeans()`. +#' @param params SplatPopParams object containing parameters for population +#' scale simulations. See \code{\link{SplatPopParams}} for details. +#' @param key data.frame object containing a full or partial splatPop key. +#' Output from `splatPopSimulateMeans()`. +#' @param method which simulation method to use. Options are "single" which +#' produces a single cell population for each sample, "groups" which +#' produces distinct groups (eg. cell types) for each sample (note, this +#' creates separate groups from those created in `popSimulate` with only +#' DE effects), and "paths" which selects cells from continuous +#' trajectories (eg. differentiation processes). +#' @param counts.only logical. Whether to return only the counts. +#' @param verbose logical. Whether to print progress messages. +#' @param ... any additional parameter settings to override what is provided in +#' \code{params}. +#' +#' @return SingleCellExperiment object containing simulated counts, +#' intermediate values like the gene means simulated in `splatPopSimulateMeans`, +#' and information about the differential expression and eQTL effects assigned +#' to each gene. +#' +#' @importFrom SingleCellExperiment SingleCellExperiment cbind +#' @importFrom SummarizedExperiment rowData rowData<- +#' @export +splatPopSimulateSC <- function(sim.means, + params, + key, + method = c("single", "groups", "paths"), + counts.only = FALSE, + verbose = TRUE, ...){ + + set.seed(getParam(params, "seed")) + method <- match.arg(method) + + params <- setParams(params, ...) + params <- expandParams(params) + validObject(params) + + seed <- getParam(params, "seed") + set.seed(seed) + + nGroups <- getParam(params, "nGroups") + group.names <- paste0("Group", seq_len(nGroups)) + group.prob <- getParam(params, "group.prob") + batchCells <- getParam(params, "batchCells") + + # Simulate single-cell counts with group-specific effects + if (is.list(sim.means)){ + if (length(group.prob) != length(sim.means)) { + group.prob <- rep(1 / length(sim.means), length(sim.means)) + } + + group.n <- lapply(group.prob, function(x) {ceiling(x * batchCells)}) + names(group.n) <- group.names + samples <- colnames((sim.means[[1]])) + group.sims <- list() + + for (g in group.names) { + if (verbose) {message(paste0("Simulating sc counts for ", g, "..."))} + paramsG <- setParams(params, batchCells = unlist(group.n[g])) + + sims <- lapply(samples, function(x) { + splatPopSimulateSample(params = paramsG, + method = method, + sample.means = sim.means[[g]][, x], + counts.only = counts.only, + verbose = verbose) + }) + + for (i in seq(1, length(sims))) { + s <- samples[i] + sims[i][[1]]$Sample <- s + sims[i][[1]]$Group <- g + names(rowData(sims[i][[1]])) <- paste(s, g, + names(rowData(sims[i][[1]])), + sep = "_")} + + group.sims[[g]] <- do.call(SingleCellExperiment::cbind, sims) + } + + sim.all <- do.call(SingleCellExperiment::cbind, group.sims) + + }else{ + if (verbose) {message("Simulating population single cell counts...")} + samples <- colnames(sim.means) + sims <- lapply(samples, function(x) { + splatPopSimulateSample(params = params, + method = method, + sample.means = sim.means[, x], + counts.only = counts.only, + verbose = verbose) + }) + + for (i in seq(1, length(sims))) { + s <- samples[i] + sims[i][[1]]$Sample <- s + names(rowData(sims[i][[1]])) <- paste(s, + names(rowData(sims[i][[1]])), + sep = "_") + } + + sim.all <- do.call(SingleCellExperiment::cbind, sims) + } + + sim.all <- splatPopCleanSCE(sim.all) + + metadata(sim.all)$Simulated_Means <- sim.means + rowData(sim.all) <- merge(rowData(sim.all), key, + by.x = "row.names", by.y = "geneID") + + if (verbose) {message("Done!")} + return (sim.all) +} + +#' splatPopSimulateSample simulation +#' +#' Simulate count data for one sample from a fictional single-cell RNA-seq +#' experiment using the Splat method. +#' +#' @param params SplatPopParams object containing parameters for population +#' scale simulations. See \code{\link{SplatPopParams}} for details. +#' @param method which simulation method to use. Options are "single" which +#' produces a single population, "groups" which produces distinct groups +#' (eg. cell types), "paths" which selects cells from continuous +#' trajectories (eg. differentiation processes). +#' @param sample.means Gene means to use if running splatSimulatePop(). +#' @param counts.only logical. Whether to return only the counts. +#' @param verbose logical. Whether to print progress messages. +#' @param ... any additional parameter settings to override what is provided in +#' \code{params}. +#' +#' @details +#' This function closely mirrors \code{\link{splatSimulate}}. The main +#' difference is that it takes the means simulated by splatPopSimulateMeans +#' instead of randomly sampling a mean for each gene. For details about this +#' function see the documentation for \code{\link{splatSimulate}}. +#' +#' @return SingleCellExperiment object containing the simulated counts and +#' intermediate values for one sample. +#' +#' @seealso +#' \code{\link{splatSimLibSizes}}, \code{\link{splatPopSimGeneMeans}}, +#' \code{\link{splatSimBatchEffects}}, \code{\link{splatSimBatchCellMeans}}, +#' \code{\link{splatSimDE}}, \code{\link{splatSimCellMeans}}, +#' \code{\link{splatSimBCVMeans}}, \code{\link{splatSimTrueCounts}}, +#' \code{\link{splatSimDropout}}, \code{\link{splatPopSimulateSC}} +#' +#' @importFrom SummarizedExperiment rowData colData colData<- assays +#' @importFrom SingleCellExperiment SingleCellExperiment +splatPopSimulateSample <- function(params = newSplatPopParams(), + method = c("single", "groups", "paths"), + counts.only = FALSE, + verbose = TRUE, + sample.means, ...) { + + method <- match.arg(method) + set.seed(getParam(params, "seed")) + + # Get the parameters we are going to use + nCells <- getParam(params, "nCells") + nGenes <- length(sample.means) + params <- setParams(params, nGenes = nGenes) + nBatches <- getParam(params, "nBatches") + batch.cells <- getParam(params, "batchCells") + nGroups <- getParam(params, "nGroups") + group.prob <- getParam(params, "group.prob") + + # Run sanity checks + if (nGroups == 1 && method == "groups") { + warning("nGroups is 1, switching to single mode") + method <- "single" + } + + # Set up name vectors + cell.names <- paste0("Cell", seq_len(nCells)) + gene.names <- names(sample.means) + batch.names <- paste0("Batch", seq_len(nBatches)) + if (method == "groups") { + group.names <- paste0("Group", seq_len(nGroups)) + } else if (method == "paths") { + group.names <- paste0("Path", seq_len(nGroups)) + } + + # Create SingleCellExperiment to store simulation + cells <- data.frame(Cell = cell.names) + rownames(cells) <- cell.names + features <- data.frame(Gene = gene.names) + rownames(features) <- gene.names + + sim <- SingleCellExperiment(rowData = features, colData = cells, + metadata = list(Params = params)) + + # Make batches vector which is the index of param$batchCells repeated + # params$batchCells[index] times + batches <- lapply(seq_len(nBatches), function(i, b) {rep(i, b[i])}, + b = batch.cells) + batches <- unlist(batches) + colData(sim)$Batch <- batch.names[batches] + + if (method != "single") { + groups <- sample(seq_len(nGroups), nCells, prob = group.prob, + replace = TRUE) + colData(sim)$Group <- factor(group.names[groups], levels = group.names) + } + + sim <- splatSimLibSizes(sim, params) + sim <- splatPopSimGeneMeans(sim, params, base.means.gene = sample.means) + + if (nBatches > 1) { + sim <- splatSimBatchEffects(sim, params) + } + sim <- splatSimBatchCellMeans(sim, params) + + if (method == "single") { + sim <- splatSimSingleCellMeans(sim, params) + } else if (method == "groups") { + sim <- splatSimGroupDE(sim, params) + sim <- splatSimGroupCellMeans(sim, params) + } else { + sim <- splatSimPathDE(sim, params) + sim <- splatSimPathCellMeans(sim, params) + } + + sim <- splatSimBCVMeans(sim, params) + sim <- splatSimTrueCounts(sim, params) + sim <- splatSimDropout(sim, params) + + if (counts.only) { + assays(sim)[!grepl('counts', names(assays(sim)))] <- NULL + } + + return(sim) + +} + + +#' Format and subset genotype data from a VCF file. +#' +#' Extract numeric alleles from vcf object and filter out SNPs missing genotype +#' data or outside the Minor Allele Frequency range in `SplatPopParams`. +#' +#' @param vcf VariantAnnotation object containing genotypes of samples. +#' @param params SplatPopParams object containing parameters for population +#' scale simulations. See \code{\link{SplatPopParams}} for details. +#' +#' @return Genotype data.frame +#' +#' @importFrom stats complete.cases na.omit +#' @importFrom utils data +splatPopParseVCF <- function(vcf, params){ + + # Filter SNPs with NAs and outside MAF range + eqtl.maf.min <- getParam(params, "eqtl.maf.min") + eqtl.maf.max <- getParam(params, "eqtl.maf.max") + + SummarizedExperiment::rowRanges(vcf)$MAF <- + VariantAnnotation::snpSummary(vcf)$a1Freq + + vcf <- vcf[SummarizedExperiment::rowRanges(vcf)$MAF >= eqtl.maf.min & + SummarizedExperiment::rowRanges(vcf)$MAF <= eqtl.maf.max] + + return(vcf) +} + +#' Generate population key matrix from random or gff provided gene information +#' +#' @param params SplatPopParams object containing parameters for population +#' scale simulations. See \code{\link{SplatPopParams}} for details. +#' @param gff Either NULL or a data.frame object containing a GFF/GTF file. +#' +#' @return The Partial splatPop key data.frame. +splatPopParseGenes <- function(params, gff){ + + nGenes <- getParam(params, "nGenes") + + if(is.null(gff)){ + gff <- mockGFF(nGenes) + }else{ + gff <- as.data.frame(gff) + if ((length(names(gff)) < 8 | nrow(gff[gff[,3] == "gene",]) < 1)) { + stop("GFF file did not contain gene features or other issue with + file format. See example data.")} + nGenes <- nrow(gff) + } + + key <- gff[gff[,3] %in% c("gene", "Gene"),] + key$geneID <- paste0("gene_", formatC(seq_len(nGenes), + width = nchar(nrow(key)), + format = "d", flag = "0")) + + key[['chromosome']] <- key[, 1] + key[['geneStart']] <- key[, 4] + key[['geneEnd']] <- key[, 5] + key[['geneMiddle']] <- floor(abs((key$geneStart - key$geneEnd)/2)) + + key$geneStart + key <- key[,c("geneID", "chromosome", "geneStart", "geneEnd", "geneMiddle")] + + return(key) +} + +#' Sample expression mean and variance for each gene +#' +#' A mean and coefficient of variation is assigned to each gene by sampling from +#' gamma distributions parameterized from real data in `splatPopEstimate`. +#' The cv gamma distributions are binned by gene mean because the distribution +#' of variance in real data is not independent from the mean. The degree of +#' similarity between individuals can be further tuned using the +#' similarity.scale parameter in `SplatPopParams`. +#' +#' @param params SplatPopParams object containing parameters for population +#' scale simulations. See \code{\link{SplatPopParams}} for details. +#' @param key Partial splatPop key data.frame. +#' +#' @return The key updated with assigned means and variances. +splatPopAssignMeans <- function(params, key){ + + mean.shape <- getParam(params, "pop.mean.shape") + mean.rate <- getParam(params, "pop.mean.rate") + cv.param <- getParam(params, "pop.cv.param") + similarity.scale <- getParam(params, "similarity.scale") + + # Scale the CV rate parameters + cv.param$rate <- cv.param$rate * similarity.scale + + # Sample gene means + key[["meanSampled"]] <- rgamma(nrow(key), shape = mean.shape, rate = mean.rate) + key[["cvSampled"]] <- NULL + + # Sample coefficient of variation for each gene + for (g in seq_len(nrow(key))) { + exp.mean <- key[g, "meanSampled"] + bin <- cv.param[(cv.param$start < exp.mean) & + (cv.param$end >= exp.mean), ] + key[g,"cvSampled"] <- rgamma(1, shape = bin$shape, rate = bin$rate) + } + + return(key) +} + + +#' Assign eGenes-eSNPs pairs and effect sizes. +#' +#' Randomly pairs N genes (eGene) a SNP (eSNP) within the window size +#' (eqtl.dist) and assigns each pair an effect size sampled from a gamma +#' distribution parameterized using the effect sizes from a real eQTL study. +#' +#' @param params SplatPopParams object containing parameters for population +#' scale simulations. See \code{\link{SplatPopParams}} for details. +#' @param key Partial splatPop key data.frame. +#' @param vcf VariantAnnotation object containing genotypes of samples. +#' +#' @return The key updated with assigned eQTL effects. +splatPopeQTLEffects <- function(params, key, vcf){ + + eqtl.n <- getParam(params, "eqtl.n") + if (eqtl.n > nrow(key)){eqtl.n <- nrow(key)} # Can't be greater than nGenes + if (eqtl.n <= 1){eqtl.n <- nrow(key) * eqtl.n} # If <= 1 it is a percent + + eqtl.dist <- getParam(params, "eqtl.dist") + eqtlES.shape <- getParam(params, "eqtl.ES.shape") + eqtlES.rate <- getParam(params, "eqtl.ES.rate") + + # Set up data.frame to save info about selected eSNP-eGENE pairs + snps.list <- row.names(vcf) + key2 <- key + key[["eQTL.type"]] <- NA + key[["eSNP.ID"]] <- NA + key[["eSNP.chromosome"]] <- NA + key[["eSNP.loc"]] <- NA + key[["eSNP.MAF"]] <- NA + key[["eQTL.EffectSize"]] <- 0 + + for(i in seq_len(eqtl.n)){ + again <- TRUE + while (again == TRUE){ + if(length(snps.list) == 0) { + stop("Not enough SNPs in MAF range.") + } + s <- sample(snps.list, 1) + snps.list <- snps.list[!snps.list == s] + s.chr <- as.character(GenomeInfoDb::seqnames(vcf[s])@values) + s.loc <- BiocGenerics::start(vcf[s]) + + matches <- subset(key2, (as.character(key2$chromosome) == s.chr & + key2$geneMiddle > s.loc - eqtl.dist & + key2$geneMiddle < s.loc + eqtl.dist)) + if(nrow(matches) > 0){ + match <- sample(matches$geneID, 1) + again <- FALSE + } + } + + key2 <- key2[!(key2$geneID == match),] + ES <- rgamma(1, shape = eqtlES.shape, rate = eqtlES.rate) + + key[key$geneID == match, ]$eSNP.ID <- s + key[key$geneID == match, ]$eSNP.chromosome <- + as.character(GenomeInfoDb::seqnames(vcf[s])) + key[key$geneID == match, ]$eSNP.loc <- BiocGenerics::start(vcf[s]) + key[key$geneID == match, ]$eQTL.EffectSize <- ES + key[key$geneID == match, ]$eSNP.MAF <- + SummarizedExperiment::rowRanges(vcf[s])$MAF + key[key$geneID == match, ]$eQTL.type <- "global" + + # Randomly make some effects negative + key$eQTL.EffectSize <- key$eQTL.EffectSize * sample(c(1, -1), + length(key$eQTL.EffectSize), + replace = TRUE) + } + + return(key) +} + + +#' Assign group-specific eQTL and DEGs. +#' +#' If groups > 1, n eSNP-eGene pairs (n = 'eqtl.group.specific') are randomly +#' assigned as group specific. +#' +#' @param params SplatPopParams object containing parameters for population +#' scale simulations. See \code{\link{SplatPopParams}} for details. +#' @param key Partial splatPop key data.frame. +#' @param groups array of group names +#' +#' @return The key updated with group eQTL and non-eQTL effects. +splatPopGroupEffects <- function(params, key, groups){ + + # Assign group-specific eQTL + eqtl.n <- getParam(params, "eqtl.n") + if (eqtl.n > nrow(key)){eqtl.n <- nrow(key)} # Can't be greater than nGenes + if (eqtl.n <= 1){eqtl.n <- nrow(key) * eqtl.n} # If <= 1 it is a percent + + g.specific.perc <- getParam(params, "eqtl.group.specific") + n.groups <- length(groups) + n.specific.each <- ceiling(eqtl.n * g.specific.perc / n.groups) + + for(g in groups){ + glob.genes <- subset(key, key$eQTL.type == "global")$geneID + g.specific <- sample(glob.genes, size = n.specific.each) + key[["eQTL.type"]][key$geneID %in% g.specific] <- g + } + + # Assign group-specific effects (differential expression, not eQTL) + nGenes <- nrow(key) + de.prob <- getParam(params, "de.prob") + de.downProb <- getParam(params, "de.downProb") + de.facLoc <- getParam(params, "de.facLoc") + de.facScale <- getParam(params, "de.facScale") + + for (idx in seq_len(n.groups)) { + de.facs <- getLNormFactors(nGenes, de.prob, de.downProb, + de.facLoc, de.facScale) + key[, paste0(groups[idx], ".GroupEffect")] <- de.facs + } + return(key) +} + +#' Simulate mean gene expression matrix without eQTL effects +#' +#' Gene mean expression levels are assigned to each gene for each pair randomly +#' from a normal distribution parameterized using the mean and cv assigned to +#' each gene in the key. +#' +#' @param vcf VariantAnnotation object containing genotypes of samples. +#' @param key Partial splatPop key data.frame. +#' +#' @return matrix of gene mean expression levels WITHOUT eQTL effects. +#' +#' @importFrom stats rnorm +splatPopSimMeans <- function(vcf, key){ + + means <- matrix(rnorm(ncol(vcf) * nrow(key), mean = key$meanSampled, + sd = key$meanSampled * key$cvSampled), + nrow = nrow(key), ncol = ncol(vcf)) + + rownames(means) <- key$geneID + colnames(means) <- colnames(VariantAnnotation::geno(vcf)$GT) + + return(means) +} + + +#' Add eQTL effects to means matrix +#' +#' Add eQTL effects and non-eQTL group effects to simulated means matrix. +#' The eQTL effects are incorporated using the following equation: +#' \deqn{Ygs = (ESg x Mgs x Gs) + Mgs } +#' Where Ygs is the mean for gene g and sample s, ESg is the effect size +#' assigned to g, Mgs is the mean expression assigned to g for s, and Gs +#' is the genotype (number of minor alleles) for s. Non-eQTL group effects are +#' incorporated as: +#' \deqn{Ygs = Mgs x GEg} +#' Where GEg is the group effect (i.e. differential expression) assigned to g. +#' To simulate multiple gene mean matrices with different group effects, this +#' function can be run with `id` designating the group id. +#' +#' @param id The group ID (e.g. "global" or "g1") +#' @param key Partial splatPop key data.frame. +#' @param vcf VariantAnnotation object containing genotypes of samples. +#' @param means.pop Population mean gene expression matrix +#' +#' @return data.frame of gene mean expression levels WITH eQTL effects. +#' +splatPopSimEffects <- function(id, key, vcf, means.pop){ + + # Add group-specific eQTL effects + genes.use <- subset(key, key$eQTL.type == id)$geneID + samples <- colnames(means.pop) + + for (g in genes.use) { + without.eqtl <- as.numeric(means.pop[g, ]) + ES <- key[key$geneID == g, "eQTL.EffectSize"] + eSNPsample <- key[key$geneID == g, "eSNP.ID"] + genotype_code <- as.array(VariantAnnotation::geno( + vcf[eSNPsample, samples])$GT + ) + genotype <- lengths(regmatches(genotype_code, + gregexpr("1", genotype_code))) + + means.pop[g, ] <- without.eqtl + (ES * genotype * means.pop[g, ]) + } + + # Add group-specific non-eQTL effects + if (id != "global") { + means.pop <- means.pop * key[, paste0(id, ".GroupEffect")] + } + + means.pop[means.pop < 0] <- 0 + + return(means.pop) +} + + +#' Quantile normalize by sample to fit sc expression distribution. +#' +#' For each sample, expression values are quantile normalized (qgamma) +#' using the gamma distribution parameterized from splatEstimate(). This ensures +#' the simulated gene means reflect the distribution expected from a sc dataset +#' and not a bulk dataset. +#' +#' @param params SplatPopParams object containing parameters for population +#' scale simulations. See \code{\link{SplatPopParams}} for details. +#' @param means Mean gene expression matrix with eQTL effects. +#' +#' @return matrix of quantile normalized gene mean expression levels. +#' +#' @export + +splatPopQuantNorm <- function(params, means){ + + # Generate sample target distribution from sc parameters + mean.shape <- getParam(params, "mean.shape") + mean.rate <- getParam(params, "mean.rate") + target <- rgamma(10000, shape = mean.shape, rate = mean.rate) + + means.norm <- preprocessCore::normalize.quantiles.use.target(means, target) + means.norm[means.norm < 0] <- 0 + + rownames(means.norm) <- rownames(means) + colnames(means.norm) <- colnames(means) + + return(means.norm) +} + +#' Add quantile normalized gene mean and cv info the eQTL key. +#' +#' @param key Partial splatPop key data.frame. +#' @param means matrix or list of matrices containing means from +#' `splatPopQuantNorm` +#' +#' @return Final eQTL key. +splatPopQuantNormKey <- function(key, means){ + + if (is.list(means)){ + qn.means <- list() + qn.cvs <- list() + + for(group in names(means)){ + qn.mean.gr <- rowMeans(means[[group]]) + qn.cv.gr <- apply(means[[group]], 1, FUN = co.var) + qn.means[[group]] <- qn.mean.gr + qn.cvs[[group]] <- qn.cv.gr + } + qn.mean <- rowMeans(as.data.frame(qn.means)) + qn.cv <- rowMeans(as.data.frame(qn.cvs)) + + }else{ + qn.mean <- rowMeans(means) + qn.cv <- apply(means, 1, FUN = co.var) + } + + qn.df <- data.frame(list(meanQuantileNorm = qn.mean, + cvQuantileNorm = qn.cv)) + qn.df$geneID <- row.names(qn.df) + key <- merge(key, qn.df, by = "geneID") + + return(key) +} + + +#' Simulate gene means for splatPop +#' +#' Simulate outlier expression factors for splatPop. Genes with an outlier +#' factor not equal to 1 are replaced with the median mean expression +#' multiplied by the outlier factor. +#' +#' @param sim SingleCellExperiment to add gene means to. +#' @param params SplatParams object with simulation parameters. +#' @param base.means.gene List of gene means for sample from matrix +#' generated by `splatPopSimulateMeans` and with the sample specified +#' in `splatPopSimulateSC`. +#' +#' @return SingleCellExperiment with simulated gene means. +#' +#' @importFrom SummarizedExperiment rowData rowData<- +#' @importFrom stats rgamma median +splatPopSimGeneMeans <- function(sim, params, base.means.gene) { + + # Note: This function is similar to splatSimGeneMeans, except it uses the + # simulated gene mean instead of sampling one randomly. If changes are made + # to the outlier method for splat, they should also be made here. + + nGenes <- getParam(params, "nGenes") + out.prob <- getParam(params, "out.prob") + out.facLoc <- getParam(params, "out.facLoc") + out.facScale <- getParam(params, "out.facScale") + + # Add expression outliers + outlier.facs <- getLNormFactors(nGenes, out.prob, 0, out.facLoc, + out.facScale) + median.means.gene <- median(base.means.gene) + outlier.means <- median.means.gene * outlier.facs + is.outlier <- outlier.facs != 1 + means.gene <- base.means.gene + means.gene[is.outlier] <- outlier.means[is.outlier] + + rowData(sim)$BaseGeneMean <- base.means.gene + rowData(sim)$OutlierFactor <- outlier.facs + rowData(sim)$GeneMean <- means.gene + + return(sim) +} + +#' Clean up the population-scale SCE to remove redundant information +#' +#' @param sim.all SingleCellExperiment object with counts for all samples +#' +#' @return SingleCellExperiment with simulated sc counts. +#' +splatPopCleanSCE <- function(sim.all){ + + # Remove redundant sce info + keep.id <- gsub("_Gene", "", names(rowData(sim.all))[1]) + + shared.out.factor <- rowData(sim.all)[[paste0(keep.id, "_OutlierFactor")]] + rowData(sim.all)[grepl("_OutlierFactor", names(rowData(sim.all)))] <- NULL + rowData(sim.all)$Shared_OutlierFactor <- shared.out.factor + + rowData(sim.all)[grepl("_Gene", names(rowData(sim.all)))] <- NULL + metadata(sim.all)[2:length(names(metadata(sim.all)))] <- NULL + + return(sim.all) +} diff --git a/R/utils.R b/R/utils.R index 96fec570..983b1ad1 100644 --- a/R/utils.R +++ b/R/utils.R @@ -80,3 +80,15 @@ winsorize <- function(x, q) { return(x) } + +#' Calculate coefficient of variation +#' +#' Implementation of the coefficient of variation +#' +#' @param x vector of values. +#' +#' @return Value of coefficient of variation for vector +#' @importFrom stats sd +co.var <- function(x) { + sd(x) / mean(x) +} diff --git a/inst/NEWS.Rd b/inst/NEWS.Rd index 5e2ff91e..19ad0862 100644 --- a/inst/NEWS.Rd +++ b/inst/NEWS.Rd @@ -1,6 +1,15 @@ \name{NEWS} \title{News for Package \pkg{splatter}} +\section{Version 1.14.0, Bioconductor 3.12 Release (2020-10-28)}{ + \itemize{ + \item{Add the splatPop simulation. This is a extension to the splat + simulation contributed by Christina Azodi and Davis McCarthy that adds + population effects. It allows you to specify relatedness between individuals + and generate cell-type specific eQTL effects.} + } +} + \section{Version 1.12.0, Bioconductor 3.11 Release (2020-04-20)}{ \itemize{ \item{Add checks for cycles in the Splat path.from parameter.} diff --git a/inst/WORDLIST b/inst/WORDLIST index e1b228cf..b6a06265 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -12,30 +12,56 @@ Bioc Biocondutor biocViews bioRxiv +bp BPPARAM +center Chu CMD compareSCEs compareSCESets cpm cutoff +cv +cvSampled +DEGs df diffSCEs diffSCESets +dist DM doi +downProb DP Dudoit EE +EffectSize eg +eGene +eGenes +eqtl +eQTL +ESg +eSNP +eSNPs etc +facLoc +facScale fpkm FPKM gam +GEg +geneID +geneMiddle getCounts getLNormFactors +gff +GFF Github Gribkova +Gs +GTEx +GTEX +GTF ImmunoOncology ingroup JC @@ -54,18 +80,23 @@ LunParams lunSimulate MADs MAEs +MAF Marioni meanlog +meanSampled MeanZeros mfa MFA MFAParams mfaSimulate +Mgs mockSCE modeling +Nelder nGenes nGroups NOTEs +param params Params Parra @@ -73,11 +104,17 @@ Perraudeau PhenoParams PhenoPath PLoS +plotPCA poisson Poisson +popSimulate +prob PROSSTT pseudotime +pval +qgamma Quickstart +rd RG Risso RMSEs @@ -113,8 +150,16 @@ splatEstimate splatEstLib splatParams SplatParams +splatPop +splatPopEstimate +splatPopParams +splatPopQuantNorm +splatPopSimulateMeans +splatPopSimulateSample +splatPopSimulateSC splatSimulate splatSimulatePaths +splatSimulatePop Splatter's splotchEstimate SplotchParams @@ -128,6 +173,9 @@ TPM UMAP UpperCamelCase Vallejos +VariantAnnotation +vcf +VCF WaVE Wellcome Wilks @@ -137,6 +185,7 @@ winsorized Winsorized WithSpikes Yau +Ygs ZINB ZINBParams zinbwave diff --git a/man/co.var.Rd b/man/co.var.Rd new file mode 100644 index 00000000..3349e074 --- /dev/null +++ b/man/co.var.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{co.var} +\alias{co.var} +\title{Calculate coefficient of variation} +\usage{ +co.var(x) +} +\arguments{ +\item{x}{vector of values.} +} +\value{ +Value of coefficient of variation for vector +} +\description{ +Implementation of the coefficient of variation +} diff --git a/man/mockBulkMatrix.Rd b/man/mockBulkMatrix.Rd new file mode 100644 index 00000000..3e697292 --- /dev/null +++ b/man/mockBulkMatrix.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mock-data.R +\name{mockBulkMatrix} +\alias{mockBulkMatrix} +\title{Generate mock bulk population scale expression data} +\usage{ +mockBulkMatrix(n.genes = 1000, n.samples = 100) +} +\arguments{ +\item{n.genes}{Number of genes in mock bulk data.} + +\item{n.samples}{Number of samples in mock bulk data.} +} +\value{ +matrix containing mock bulk expression data. +} +\description{ +Quick function to generate mock bulk expression data for a population, with +parameters estimated using real thyroid tissue data from GTEx. +} diff --git a/man/mockBulkeQTL.Rd b/man/mockBulkeQTL.Rd new file mode 100644 index 00000000..5abb8dcb --- /dev/null +++ b/man/mockBulkeQTL.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mock-data.R +\name{mockBulkeQTL} +\alias{mockBulkeQTL} +\title{Generate mock eQTL mapping results} +\usage{ +mockBulkeQTL(n.genes = 1000) +} +\arguments{ +\item{n.genes}{Number of genes in mock eQTL data.} +} +\value{ +data.frame containing mock bulk eQTL mapping results. +} +\description{ +Quick function to generate mock eQTL mapping results, with parameters +estimated using real eQTL mapping results from GTEX using thyroid tissue. +} diff --git a/man/mockGFF.Rd b/man/mockGFF.Rd new file mode 100644 index 00000000..ed64223e --- /dev/null +++ b/man/mockGFF.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mock-data.R +\name{mockGFF} +\alias{mockGFF} +\title{Generate mock gff} +\usage{ +mockGFF(n.genes = 500, chromosome = 22) +} +\arguments{ +\item{n.genes}{Number of genes in mock gff file} + +\item{chromosome}{Chromosome name} +} +\value{ +data.frame containing mock gff data. +} +\description{ +Quick function to generate a mock gff. +} diff --git a/man/mockVCF.Rd b/man/mockVCF.Rd new file mode 100644 index 00000000..8889d664 --- /dev/null +++ b/man/mockVCF.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mock-data.R +\name{mockVCF} +\alias{mockVCF} +\title{Generate mock vcf} +\usage{ +mockVCF(n.snps = 10000, n.samples = 10, chromosome = 22) +} +\arguments{ +\item{n.snps}{Number of SNPs in mock vcf file.} + +\item{n.samples}{Number of samples in mock bulk data.} + +\item{chromosome}{Chromosome name} +} +\value{ +data.frame containing mock gff data. +} +\description{ +Quick function to generate mock vcf file. Note this data has unrealistic +population structure. +} diff --git a/man/newParams.Rd b/man/newParams.Rd index 71d00e6b..795b6ab8 100644 --- a/man/newParams.Rd +++ b/man/newParams.Rd @@ -3,7 +3,7 @@ % R/KersplatParams-methods.R, R/Lun2Params-methods.R, R/LunParams-methods.R, % R/MFAParams-methods.R, R/PhenoParams-methods.R, R/SCDDParams-methods.R, % R/SimpleParams-methods.R, R/SparseDCParams-methods.R, -% R/SplatParams-methods.R, R/ZINBParams-methods.R +% R/SplatParams-methods.R, R/SplatPopParams-methods.R, R/ZINBParams-methods.R \name{newParams} \alias{newParams} \alias{newBASiCSParams} @@ -16,6 +16,7 @@ \alias{newSimpleParams} \alias{newSparseDCParams} \alias{newSplatParams} +\alias{newSplatPopParams} \alias{newZINBParams} \title{New Params} \usage{ @@ -39,6 +40,8 @@ newSparseDCParams(...) newSplatParams(...) +newSplatPopParams(...) + newZINBParams(...) } \arguments{ diff --git a/man/setParam.Rd b/man/setParam.Rd index e7dca4cc..5d8d28ec 100644 --- a/man/setParam.Rd +++ b/man/setParam.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/AllGenerics.R, R/BASiCSParams-methods.R, % R/KersplatParams-methods.R, R/Lun2Params-methods.R, R/LunParams-methods.R, % R/Params-methods.R, R/PhenoParams-methods.R, R/SCDDParams-methods.R, -% R/SplatParams-methods.R, R/ZINBParams-methods.R +% R/SplatParams-methods.R, R/SplatPopParams-methods.R, R/ZINBParams-methods.R \name{setParam} \alias{setParam} \alias{setParam,BASiCSParams-method} @@ -13,6 +13,7 @@ \alias{setParam,PhenoParams-method} \alias{setParam,SCDDParams-method} \alias{setParam,SplatParams-method} +\alias{setParam,SplatPopParams-method} \alias{setParam,ZINBParams-method} \title{Set a parameter} \usage{ @@ -34,6 +35,8 @@ setParam(object, name, value) \S4method{setParam}{SplatParams}(object, name, value) +\S4method{setParam}{SplatPopParams}(object, name, value) + \S4method{setParam}{ZINBParams}(object, name, value) } \arguments{ diff --git a/man/splatEstMean.Rd b/man/splatEstMean.Rd index a9192dff..01b46bae 100644 --- a/man/splatEstMean.Rd +++ b/man/splatEstMean.Rd @@ -12,7 +12,7 @@ splatEstMean(norm.counts, params) \item{params}{SplatParams object to store estimated values in.} } \value{ -SplatParams object with estimated values. +SplatParams object containing the estimated parameters. } \description{ Estimate rate and shape parameters for the gamma distribution used to diff --git a/man/splatEstimate.Rd b/man/splatEstimate.Rd index 7139933e..9b1edc91 100644 --- a/man/splatEstimate.Rd +++ b/man/splatEstimate.Rd @@ -19,7 +19,7 @@ containing count data to estimate parameters from.} \item{params}{SplatParams object to store estimated values in.} } \value{ -SplatParams object containing the estimated parameters. +SplatParams object with estimated values. } \description{ Estimate simulation parameters for the Splat simulation from a real diff --git a/man/splatPopAssignMeans.Rd b/man/splatPopAssignMeans.Rd new file mode 100644 index 00000000..0a327fd0 --- /dev/null +++ b/man/splatPopAssignMeans.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splatPop-simulate.R +\name{splatPopAssignMeans} +\alias{splatPopAssignMeans} +\title{Sample expression mean and variance for each gene} +\usage{ +splatPopAssignMeans(params, key) +} +\arguments{ +\item{params}{SplatPopParams object containing parameters for population +scale simulations. See \code{\link{SplatPopParams}} for details.} + +\item{key}{Partial splatPop key data.frame.} +} +\value{ +The key updated with assigned means and variances. +} +\description{ +A mean and coefficient of variation is assigned to each gene by sampling from +gamma distributions parameterized from real data in `splatPopEstimate`. +The cv gamma distributions are binned by gene mean because the distribution +of variance in real data is not independent from the mean. The degree of +similarity between individuals can be further tuned using the +similarity.scale parameter in `SplatPopParams`. +} diff --git a/man/splatPopCleanSCE.Rd b/man/splatPopCleanSCE.Rd new file mode 100644 index 00000000..afdb359d --- /dev/null +++ b/man/splatPopCleanSCE.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splatPop-simulate.R +\name{splatPopCleanSCE} +\alias{splatPopCleanSCE} +\title{Clean up the population-scale SCE to remove redundant information} +\usage{ +splatPopCleanSCE(sim.all) +} +\arguments{ +\item{sim.all}{SingleCellExperiment object with counts for all samples} +} +\value{ +SingleCellExperiment with simulated sc counts. +} +\description{ +Clean up the population-scale SCE to remove redundant information +} diff --git a/man/splatPopEstimate.Rd b/man/splatPopEstimate.Rd new file mode 100644 index 00000000..849b4a7a --- /dev/null +++ b/man/splatPopEstimate.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splatPop-estimate.R +\name{splatPopEstimate} +\alias{splatPopEstimate} +\title{Estimate population/eQTL simulation parameters} +\usage{ +splatPopEstimate( + params = newSplatPopParams(), + counts = NULL, + means = NULL, + eqtl = NULL +) +} +\arguments{ +\item{params}{SplatPopParams object containing parameters for the +simulation of the mean expression levels for the population. +See \code{\link{SplatPopParams}} for details.} + +\item{counts}{either a counts matrix or a SingleCellExperiment object +containing count data to estimate parameters from.} + +\item{means}{Matrix of real gene means across a population, where +each row is a gene and each column is an individual in the population.} + +\item{eqtl}{data.frame with all or top eQTL pairs from a real eQTL analysis. +Must include columns: 'gene_id', 'pval_nominal', and 'slope'.} +} +\value{ +SplatPopParams object containing the estimated parameters. +} +\description{ +Estimate simulation parameters for the eQTL population simulation from +real data. See the individual estimation functions for more details on +how this is done. +} +\seealso{ +\code{\link{splatPopEstimateEffectSize}}, +\code{\link{splatPopEstimateMeanCV}} +} diff --git a/man/splatPopEstimateEffectSize.Rd b/man/splatPopEstimateEffectSize.Rd new file mode 100644 index 00000000..9824402b --- /dev/null +++ b/man/splatPopEstimateEffectSize.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splatPop-estimate.R +\name{splatPopEstimateEffectSize} +\alias{splatPopEstimateEffectSize} +\title{Estimate eQTL Effect Size parameters} +\usage{ +splatPopEstimateEffectSize(params, eqtl) +} +\arguments{ +\item{params}{SplatPopParams object containing parameters for the +simulation of the mean expression levels for the population. +See \code{\link{SplatPopParams}} for details.} + +\item{eqtl}{data.frame with all or top eQTL pairs from a real eQTL analysis. +Must include columns: gene_id, pval_nominal, and slope.} +} +\value{ +params object with estimated values. +} +\description{ +Estimate rate and shape parameters for the gamma distribution used to +simulate eQTL (eSNP-eGene) effect sizes. +} +\details{ +Parameters for the gamma distribution are estimated by fitting the top eSNP- +eGene pair effect sizes using \code{\link[fitdistrplus]{fitdist}}. The +maximum goodness-of-fit estimation method is used to minimise the +Cramer-von Mises distance. This can fail in some situations, in which case +the method of moments estimation method is used instead. +} diff --git a/man/splatPopEstimateMeanCV.Rd b/man/splatPopEstimateMeanCV.Rd new file mode 100644 index 00000000..abe8b957 --- /dev/null +++ b/man/splatPopEstimateMeanCV.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splatPop-estimate.R +\name{splatPopEstimateMeanCV} +\alias{splatPopEstimateMeanCV} +\title{Estimate gene mean and gene mean variance parameters} +\usage{ +splatPopEstimateMeanCV(params, means) +} +\arguments{ +\item{params}{SplatPopParams object containing parameters for the +simulation of the mean expression levels for the population. +See \code{\link{SplatPopParams}} for details.} + +\item{means}{data.frame of real gene means across a population, where +each row is a gene and each column is an individual in the population.} +} +\value{ +params object with estimated values. +} +\description{ +Estimate gene mean and gene mean variance parameters +} +\details{ +Parameters for the mean gamma distribution are estimated by fitting the mean +(across the population) expression of genes that meet the criteria (<50% of +samples have exp <0.1) and parameters for the cv gamma distribution are +estimated for each bin of mean expression using the cv of expression across +the population for genes in that bin. Both are fit using +\code{\link[fitdistrplus]{fitdist}}. The "Nelder-Mead" method is used to fit +the mean gamma distribution and the maximum goodness-of-fit estimation +method is used to minimise the Cramer-von Mises distance for the CV +distribution. +} diff --git a/man/splatPopGroupEffects.Rd b/man/splatPopGroupEffects.Rd new file mode 100644 index 00000000..f9039a8c --- /dev/null +++ b/man/splatPopGroupEffects.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splatPop-simulate.R +\name{splatPopGroupEffects} +\alias{splatPopGroupEffects} +\title{Assign group-specific eQTL and DEGs.} +\usage{ +splatPopGroupEffects(params, key, groups) +} +\arguments{ +\item{params}{SplatPopParams object containing parameters for population +scale simulations. See \code{\link{SplatPopParams}} for details.} + +\item{key}{Partial splatPop key data.frame.} + +\item{groups}{array of group names} +} +\value{ +The key updated with group eQTL and non-eQTL effects. +} +\description{ +If groups > 1, n eSNP-eGene pairs (n = 'eqtl.group.specific') are randomly +assigned as group specific. +} diff --git a/man/splatPopParams.Rd b/man/splatPopParams.Rd new file mode 100644 index 00000000..a2291786 --- /dev/null +++ b/man/splatPopParams.Rd @@ -0,0 +1,61 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/AllClasses.R +\docType{class} +\name{SplatPopParams} +\alias{SplatPopParams} +\alias{SplatPopParams-class} +\title{The SplatPopParams class} +\description{ +S4 class that holds parameters for the splatPop simulation. +} +\section{Parameters}{ + + +In addition to the \code{\link{SplatParams}} parameters, splatPop simulation +requires the following parameters: + +\describe{ + \item{\code{[similarity.scale]}}{Scaling factor for pop.cv.param.rate, + where values larger than 1 increase the similarity between individuals in + the population and values less than one make the individuals less + similar.} + \item{\code{[eqtl.n]}}{The number (>1) or percent (<=1) of genes to + assign eQTL effects.} + \item{\code{[eqtl.dist]}}{Maximum distance between eSNP and eGene} + \item{\code{[eqtl.maf.min]}}{Minimum Minor Allele Frequency of eSNPs.} + \item{\code{[eqtl.maf.max]}}{Maximum Minor Allele Frequency of eSNPs.} + \item{\code{[eqtl.group.specific]}}{Percent of eQTL effects to simulate + as group specific.} + \item{\emph{eQTL Effect size distribution parameters. Defaults estimated + from GTEx eQTL mapping results, see vignette for more information.}}{ + \describe{ + \item{\code{eqtl.ES.shape}}{Shape parameter for the effect size + gamma distribution.} + \item{\code{eqtl.ES.rate}}{Rate parameter for the effect size + gamma distribution.} + } + } + \item{\emph{Bulk Mean Expression distribution parameters. Defaults + estimated from GTEx data, see vignette for more information.}}{ + \describe{ + \item{\code{pop.mean.shape}}{Shape parameter for the mean (i.e. + bulk) expression gamma distribution} + \item{\code{pop.mean.rate}}{Rate parameter for the mean (i.e. + bulk) expression gamma distribution} + } + } + \item{\emph{Bulk Expression Coefficient of Variation distribution + parameters binned. Defaults estimated from GTEx data, see vignette for + more information.}}{ + \describe{ + \item{\code{pop.cv.param}}{Dataframe containing gene + mean bin range, and the CV shape, and CV rate parameters for + each of those bins.} + } + } +} +The parameters not shown in brackets can be estimated from real data using +\code{\link{splatPopEstimate}}. For details of the eQTL simulation +see \code{\link{splatPopSimulate}}. +} + diff --git a/man/splatPopParseGenes.Rd b/man/splatPopParseGenes.Rd new file mode 100644 index 00000000..0a15c1d4 --- /dev/null +++ b/man/splatPopParseGenes.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splatPop-simulate.R +\name{splatPopParseGenes} +\alias{splatPopParseGenes} +\title{Generate population key matrix from random or gff provided gene information} +\usage{ +splatPopParseGenes(params, gff) +} +\arguments{ +\item{params}{SplatPopParams object containing parameters for population +scale simulations. See \code{\link{SplatPopParams}} for details.} + +\item{gff}{Either NULL or a data.frame object containing a GFF/GTF file.} +} +\value{ +The Partial splatPop key data.frame. +} +\description{ +Generate population key matrix from random or gff provided gene information +} diff --git a/man/splatPopParseVCF.Rd b/man/splatPopParseVCF.Rd new file mode 100644 index 00000000..0ed80f0e --- /dev/null +++ b/man/splatPopParseVCF.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splatPop-simulate.R +\name{splatPopParseVCF} +\alias{splatPopParseVCF} +\title{Format and subset genotype data from a VCF file.} +\usage{ +splatPopParseVCF(vcf, params) +} +\arguments{ +\item{vcf}{VariantAnnotation object containing genotypes of samples.} + +\item{params}{SplatPopParams object containing parameters for population +scale simulations. See \code{\link{SplatPopParams}} for details.} +} +\value{ +Genotype data.frame +} +\description{ +Extract numeric alleles from vcf object and filter out SNPs missing genotype +data or outside the Minor Allele Frequency range in `SplatPopParams`. +} diff --git a/man/splatPopQuantNorm.Rd b/man/splatPopQuantNorm.Rd new file mode 100644 index 00000000..e25c81ea --- /dev/null +++ b/man/splatPopQuantNorm.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splatPop-simulate.R +\name{splatPopQuantNorm} +\alias{splatPopQuantNorm} +\title{Quantile normalize by sample to fit sc expression distribution.} +\usage{ +splatPopQuantNorm(params, means) +} +\arguments{ +\item{params}{SplatPopParams object containing parameters for population +scale simulations. See \code{\link{SplatPopParams}} for details.} + +\item{means}{Mean gene expression matrix with eQTL effects.} +} +\value{ +matrix of quantile normalized gene mean expression levels. +} +\description{ +For each sample, expression values are quantile normalized (qgamma) +using the gamma distribution parameterized from splatEstimate(). This ensures +the simulated gene means reflect the distribution expected from a sc dataset +and not a bulk dataset. +} diff --git a/man/splatPopQuantNormKey.Rd b/man/splatPopQuantNormKey.Rd new file mode 100644 index 00000000..63bf6aaf --- /dev/null +++ b/man/splatPopQuantNormKey.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splatPop-simulate.R +\name{splatPopQuantNormKey} +\alias{splatPopQuantNormKey} +\title{Add quantile normalized gene mean and cv info the eQTL key.} +\usage{ +splatPopQuantNormKey(key, means) +} +\arguments{ +\item{key}{Partial splatPop key data.frame.} + +\item{means}{matrix or list of matrices containing means from +`splatPopQuantNorm`} +} +\value{ +Final eQTL key. +} +\description{ +Add quantile normalized gene mean and cv info the eQTL key. +} diff --git a/man/splatPopSimEffects.Rd b/man/splatPopSimEffects.Rd new file mode 100644 index 00000000..f3b19926 --- /dev/null +++ b/man/splatPopSimEffects.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splatPop-simulate.R +\name{splatPopSimEffects} +\alias{splatPopSimEffects} +\title{Add eQTL effects to means matrix} +\usage{ +splatPopSimEffects(id, key, vcf, means.pop) +} +\arguments{ +\item{id}{The group ID (e.g. "global" or "g1")} + +\item{key}{Partial splatPop key data.frame.} + +\item{vcf}{VariantAnnotation object containing genotypes of samples.} + +\item{means.pop}{Population mean gene expression matrix} +} +\value{ +data.frame of gene mean expression levels WITH eQTL effects. +} +\description{ +Add eQTL effects and non-eQTL group effects to simulated means matrix. +The eQTL effects are incorporated using the following equation: +\deqn{Ygs = (ESg x Mgs x Gs) + Mgs } +Where Ygs is the mean for gene g and sample s, ESg is the effect size +assigned to g, Mgs is the mean expression assigned to g for s, and Gs +is the genotype (number of minor alleles) for s. Non-eQTL group effects are +incorporated as: +\deqn{Ygs = Mgs x GEg} +Where GEg is the group effect (i.e. differential expression) assigned to g. +To simulate multiple gene mean matrices with different group effects, this +function can be run with `id` designating the group id. +} diff --git a/man/splatPopSimGeneMeans.Rd b/man/splatPopSimGeneMeans.Rd new file mode 100644 index 00000000..5325662c --- /dev/null +++ b/man/splatPopSimGeneMeans.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splatPop-simulate.R +\name{splatPopSimGeneMeans} +\alias{splatPopSimGeneMeans} +\title{Simulate gene means for splatPop} +\usage{ +splatPopSimGeneMeans(sim, params, base.means.gene) +} +\arguments{ +\item{sim}{SingleCellExperiment to add gene means to.} + +\item{params}{SplatParams object with simulation parameters.} + +\item{base.means.gene}{List of gene means for sample from matrix +generated by `splatPopSimulateMeans` and with the sample specified +in `splatPopSimulateSC`.} +} +\value{ +SingleCellExperiment with simulated gene means. +} +\description{ +Simulate outlier expression factors for splatPop. Genes with an outlier +factor not equal to 1 are replaced with the median mean expression +multiplied by the outlier factor. +} diff --git a/man/splatPopSimMeans.Rd b/man/splatPopSimMeans.Rd new file mode 100644 index 00000000..7d8971a6 --- /dev/null +++ b/man/splatPopSimMeans.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splatPop-simulate.R +\name{splatPopSimMeans} +\alias{splatPopSimMeans} +\title{Simulate mean gene expression matrix without eQTL effects} +\usage{ +splatPopSimMeans(vcf, key) +} +\arguments{ +\item{vcf}{VariantAnnotation object containing genotypes of samples.} + +\item{key}{Partial splatPop key data.frame.} +} +\value{ +matrix of gene mean expression levels WITHOUT eQTL effects. +} +\description{ +Gene mean expression levels are assigned to each gene for each pair randomly +from a normal distribution parameterized using the mean and cv assigned to +each gene in the key. +} diff --git a/man/splatPopSimulate.Rd b/man/splatPopSimulate.Rd new file mode 100644 index 00000000..eb5a7d4b --- /dev/null +++ b/man/splatPopSimulate.Rd @@ -0,0 +1,61 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splatPop-simulate.R +\name{splatPopSimulate} +\alias{splatPopSimulate} +\title{splatPop simulation} +\usage{ +splatPopSimulate( + params = newSplatPopParams(nGenes = 1000), + vcf = mockVCF(), + method = c("single", "groups", "paths"), + gff = NULL, + key = NULL, + counts.only = FALSE, + verbose = TRUE, + ... +) +} +\arguments{ +\item{params}{SplatPopParams object containing parameters for population +scale simulations. See \code{\link{SplatPopParams}} for details.} + +\item{vcf}{VariantAnnotation object containing genotypes of samples.} + +\item{method}{which simulation method to use. Options are "single" which +produces a single population, "groups" which produces distinct groups +(eg. cell types), "paths" which selects cells from continuous +trajectories (eg. differentiation processes).} + +\item{gff}{Either NULL or a data.frame object containing a GFF/GTF file.} + +\item{key}{Either NULL or a data.frame object containing a full or partial +splatPop key.} + +\item{counts.only}{logical. Whether to save only counts in sce object.} + +\item{verbose}{logical. Whether to print progress messages.} + +\item{...}{any additional parameter settings to override what is provided in +\code{params}.} +} +\value{ +SingleCellExperiment object containing simulated counts, +intermediate values like the gene means simulated in `splatPopSimulateMeans`, +and information about the differential expression and eQTL effects assigned +to each gene. +} +\description{ +Simulate scRNA-seq count data using the splat model for a population of +individuals with correlation structure. +} +\details{ +This functions is for simulating data in a single step. It consists of a +call to \code{\link{splatPopSimulateMeans}}, which simulates a mean +expression level per gene per sample, followed by a call to +\code{\link{splatPopSimulateSC}}, which uses the splat model to simulate +single-cell counts per individual. Please see the documentation for those +functions for more details. +} +\seealso{ +\code{\link{splatPopSimulateMeans}}, \code{\link{splatPopSimulateSC}} +} diff --git a/man/splatPopSimulateMeans.Rd b/man/splatPopSimulateMeans.Rd new file mode 100644 index 00000000..d584bace --- /dev/null +++ b/man/splatPopSimulateMeans.Rd @@ -0,0 +1,71 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splatPop-simulate.R +\name{splatPopSimulateMeans} +\alias{splatPopSimulateMeans} +\title{splatPopSimulateMeans} +\usage{ +splatPopSimulateMeans( + vcf = mockVCF(), + params = newSplatPopParams(nGenes = 1000), + verbose = TRUE, + key = NULL, + gff = NULL, + ... +) +} +\arguments{ +\item{vcf}{VariantAnnotation object containing genotypes of samples.} + +\item{params}{SplatPopParams object containing parameters for population +scale simulations. See \code{\link{SplatPopParams}} for details.} + +\item{verbose}{logical. Whether to print progress messages.} + +\item{key}{Either FALSE or a data.frame object containing a full or partial +splatPop key.} + +\item{gff}{Either NULL or a data.frame object containing a GFF/GTF file.} + +\item{...}{any additional parameter settings to override what is provided in +\code{params}.} +} +\value{ +A list containing: `means` a matrix (or list of matrices if +n.groups > 1) with the simulated mean gene expression value for each gene +(row) and each sample (column) and `key` a data.frame with population +information including eQTL and group effects. +} +\description{ +Simulate mean expression levels for all genes for all samples, with between +sample correlation structure simulated with eQTL effects and with the option +to simulate multiple groups (i.e. cell-types). +} +\details{ +SplatPopParams can be set in a variety of ways. 1. If +not provided, default parameters are used. 2. Default parameters can be +overridden by supplying desired parameters using \code{\link{setParams}}. +3. Parameters can be estimated from real data of your choice using +\code{\link{splatPopEstimate}}. + +`splatPopSimulateMeans` involves the following steps: +\enumerate{ + \item Load population key or generate random or GFF/GTF based key. + \item Format and subset genotype data from the VCF file. + \item If not in key, assign expression mean and variance to each gene. + \item If not in key, assign eGenes-eSNPs pairs and effect sizes. + \item If not in key and groups >1, assign subset of eQTL associations as + group-specific and assign DEG group effects. + \item Simulate mean gene expression matrix without eQTL effects + \item Quantile normalize by sample to fit single-cell expression + distribution as defined in `splatEstimate`. + \item Add quantile normalized gene mean and cv info the eQTL key. + \item Add eQTL effects to means matrix. +} +} +\seealso{ +\code{\link{splatPopParseVCF}}, \code{\link{splatPopParseGenes}}, +\code{\link{splatPopAssignMeans}}, +\code{\link{splatPopQuantNorm}}, \code{\link{splatPopQuantNormKey}} +\code{\link{splatPopeQTLEffects}}, \code{\link{splatPopGroupEffects}}, +\code{\link{splatPopSimMeans}}, \code{\link{splatPopSimEffects}}, +} diff --git a/man/splatPopSimulateSC.Rd b/man/splatPopSimulateSC.Rd new file mode 100644 index 00000000..1c706793 --- /dev/null +++ b/man/splatPopSimulateSC.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splatPop-simulate.R +\name{splatPopSimulateSC} +\alias{splatPopSimulateSC} +\title{splatPopSimulateSC} +\usage{ +splatPopSimulateSC( + sim.means, + params, + key, + method = c("single", "groups", "paths"), + counts.only = FALSE, + verbose = TRUE, + ... +) +} +\arguments{ +\item{sim.means}{Matrix or list of matrices of gene means for the population. +Output from `splatPopSimulateMeans()`.} + +\item{params}{SplatPopParams object containing parameters for population +scale simulations. See \code{\link{SplatPopParams}} for details.} + +\item{key}{data.frame object containing a full or partial splatPop key. +Output from `splatPopSimulateMeans()`.} + +\item{method}{which simulation method to use. Options are "single" which +produces a single cell population for each sample, "groups" which +produces distinct groups (eg. cell types) for each sample (note, this +creates separate groups from those created in `popSimulate` with only +DE effects), and "paths" which selects cells from continuous +trajectories (eg. differentiation processes).} + +\item{counts.only}{logical. Whether to return only the counts.} + +\item{verbose}{logical. Whether to print progress messages.} + +\item{...}{any additional parameter settings to override what is provided in +\code{params}.} +} +\value{ +SingleCellExperiment object containing simulated counts, +intermediate values like the gene means simulated in `splatPopSimulateMeans`, +and information about the differential expression and eQTL effects assigned +to each gene. +} +\description{ +Simulate count data for a population from a fictional single-cell +RNA-seq experiment using the Splat method. +} diff --git a/man/splatPopSimulateSample.Rd b/man/splatPopSimulateSample.Rd new file mode 100644 index 00000000..ce1d64bc --- /dev/null +++ b/man/splatPopSimulateSample.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splatPop-simulate.R +\name{splatPopSimulateSample} +\alias{splatPopSimulateSample} +\title{splatPopSimulateSample simulation} +\usage{ +splatPopSimulateSample( + params = newSplatPopParams(), + method = c("single", "groups", "paths"), + counts.only = FALSE, + verbose = TRUE, + sample.means, + ... +) +} +\arguments{ +\item{params}{SplatPopParams object containing parameters for population +scale simulations. See \code{\link{SplatPopParams}} for details.} + +\item{method}{which simulation method to use. Options are "single" which +produces a single population, "groups" which produces distinct groups +(eg. cell types), "paths" which selects cells from continuous +trajectories (eg. differentiation processes).} + +\item{counts.only}{logical. Whether to return only the counts.} + +\item{verbose}{logical. Whether to print progress messages.} + +\item{sample.means}{Gene means to use if running splatSimulatePop().} + +\item{...}{any additional parameter settings to override what is provided in +\code{params}.} +} +\value{ +SingleCellExperiment object containing the simulated counts and +intermediate values for one sample. +} +\description{ +Simulate count data for one sample from a fictional single-cell RNA-seq +experiment using the Splat method. +} +\details{ +This function closely mirrors \code{\link{splatSimulate}}. The main +difference is that it takes the means simulated by splatPopSimulateMeans +instead of randomly sampling a mean for each gene. For details about this +function see the documentation for \code{\link{splatSimulate}}. +} +\seealso{ +\code{\link{splatSimLibSizes}}, \code{\link{splatPopSimGeneMeans}}, +\code{\link{splatSimBatchEffects}}, \code{\link{splatSimBatchCellMeans}}, +\code{\link{splatSimDE}}, \code{\link{splatSimCellMeans}}, +\code{\link{splatSimBCVMeans}}, \code{\link{splatSimTrueCounts}}, +\code{\link{splatSimDropout}}, \code{\link{splatPopSimulateSC}} +} diff --git a/man/splatPopeQTLEffects.Rd b/man/splatPopeQTLEffects.Rd new file mode 100644 index 00000000..bab995b5 --- /dev/null +++ b/man/splatPopeQTLEffects.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/splatPop-simulate.R +\name{splatPopeQTLEffects} +\alias{splatPopeQTLEffects} +\title{Assign eGenes-eSNPs pairs and effect sizes.} +\usage{ +splatPopeQTLEffects(params, key, vcf) +} +\arguments{ +\item{params}{SplatPopParams object containing parameters for population +scale simulations. See \code{\link{SplatPopParams}} for details.} + +\item{key}{Partial splatPop key data.frame.} + +\item{vcf}{VariantAnnotation object containing genotypes of samples.} +} +\value{ +The key updated with assigned eQTL effects. +} +\description{ +Randomly pairs N genes (eGene) a SNP (eSNP) within the window size +(eqtl.dist) and assigns each pair an effect size sampled from a gamma +distribution parameterized using the effect sizes from a real eQTL study. +} diff --git a/man/splatSimulate.Rd b/man/splatSimulate.Rd index e74df3d2..e0c465e6 100644 --- a/man/splatSimulate.Rd +++ b/man/splatSimulate.Rd @@ -26,7 +26,7 @@ See \code{\link{SplatParams}} for details.} \item{method}{which simulation method to use. Options are "single" which produces a single population, "groups" which produces distinct groups -(eg. cell types) or "paths" which selects cells from continuous +(eg. cell types), or "paths" which selects cells from continuous trajectories (eg. differentiation processes).} \item{verbose}{logical. Whether to print progress messages.} diff --git a/tests/testthat/test-SplatParams.R b/tests/testthat/test-SplatParams.R index 371cb113..45ca9f0a 100644 --- a/tests/testthat/test-SplatParams.R +++ b/tests/testthat/test-SplatParams.R @@ -18,6 +18,8 @@ test_that("nGroups checks work", { "nGroups cannot be set directly, set group.prob instead") }) + +### These tests are also run in test-SplatPopParams.R, please update both test_that("path.from checks work", { pp <- setParams(params, group.prob = c(0.5, 0.5)) pp <- setParamUnchecked(pp, "path.from", c(0, 1)) @@ -39,6 +41,7 @@ test_that("path.from checks work", { expect_error(validObject(pp), "path.from cannot contain cycles") }) +### These tests are also run in test-SplatPopParams.R, please update both test_that("dropout.type checks work", { expect_error(setParam(params, "dropout.type", "cell"), "dropout.type cannot be set to 'cell'") diff --git a/tests/testthat/test-SplatPopParams.R b/tests/testthat/test-SplatPopParams.R new file mode 100644 index 00000000..f28cb510 --- /dev/null +++ b/tests/testthat/test-SplatPopParams.R @@ -0,0 +1,35 @@ +context("splatPopParams") + +if (requireNamespace("VariantAnnotation", quietly = TRUE) && + requireNamespace("preprocessCore", quietly = TRUE)) { + params <- newSplatPopParams() +} + +test_that("printing works", { + skip_if_not_installed("VariantAnnotation") + skip_if_not_installed("preprocessCore") + expect_output(show(params), "A Params object of class SplatPopParams") +}) + +test_that("nCells checks work", { + skip_if_not_installed("VariantAnnotation") + skip_if_not_installed("preprocessCore") + expect_error(setParam(params, "nCells", 1), + "nCells cannot be set directly, set batchCells instead") +}) + +test_that("CV params checks work", { + skip_if_not_installed("VariantAnnotation") + skip_if_not_installed("preprocessCore") + expect_error(setParam(params, "pop.cv.param", data.frame( + start = c(0, 10), shape = c(10, 2), rate = c(7, 3))), + "Need to set pop.cv.bins to length of pop.cv.param") + +}) + +test_that("setParams order doesn't matter", { + skip_if_not_installed("VariantAnnotation") + skip_if_not_installed("preprocessCore") + expect_silent(setParams(params, eqtl.n = 10, eqtl.maf.max = 0.4)) + expect_silent(setParams(params, eqtl.maf.max = 0.4, eqtl.n = 10)) +}) diff --git a/tests/testthat/test-splatPop-simulate.R b/tests/testthat/test-splatPop-simulate.R new file mode 100644 index 00000000..17ea16b4 --- /dev/null +++ b/tests/testthat/test-splatPop-simulate.R @@ -0,0 +1,45 @@ +context("splatPopSimulate") + +if (requireNamespace("VariantAnnotation", quietly = TRUE) && + requireNamespace("preprocessCore", quietly = TRUE)) { + + set.seed(1) + + n.samples <- 6 + n.genes <- 10 + vcf <- mockVCF(n.samples = n.samples, n.snps = 1000) + + params <- setParams(newSplatPopParams(), eqtl.n = 10, nGenes = n.genes) +} + +test_that("splatPopSimulate output is valid and works", { + skip_if_not_installed("VariantAnnotation") + skip_if_not_installed("preprocessCore") + expect_true(validObject(splatPopSimulate(params = params, vcf = vcf))) + pop <- splatPopSimulateMeans(params = params, vcf = vcf) + expect_false(any(is.na(pop$means))) + expect_false(any(sapply(pop$means, is.infinite))) + expect_equal(ncol(pop$means), n.samples) + expect_length(pop$key, 15) # Number of columns expected in splatPop key + expect_true(validObject(splatPopSimulateSC(sim.means = pop$means, + key = pop$key, + params = params))) +}) + + +test_that("splatPopSimulate on multiple groups output is valid and works", { + skip_if_not_installed("VariantAnnotation") + skip_if_not_installed("preprocessCore") + params.g2 <- setParams(params, group.prob = c(0.5, 0.5), nGenes = n.genes) + eqtl <- splatPopSimulate(params = params.g2, vcf = vcf) + expect_true(validObject(eqtl)) + +}) + +test_that("splatPopSimulate can read genes from gff data.frame", { + skip_if_not_installed("VariantAnnotation") + skip_if_not_installed("preprocessCore") + gff <- mockGFF(n.genes = n.genes) + sim <- splatPopSimulateMeans(vcf = vcf, params = params, gff = gff) + expect_true(validObject(sim)) +}) diff --git a/tests/testthat/test-splatPopEstimate.R b/tests/testthat/test-splatPopEstimate.R new file mode 100644 index 00000000..93f46bb4 --- /dev/null +++ b/tests/testthat/test-splatPopEstimate.R @@ -0,0 +1,47 @@ +context("splatPopEstimate") + +if (requireNamespace("VariantAnnotation", quietly = TRUE) && + requireNamespace("preprocessCore", quietly = TRUE)) { + + set.seed(1) + + # Mock data + bulk.means <- mockBulkMatrix(n.genes=500, n.samples=100) + bulk.eqtl <- mockBulkeQTL(n.genes=500) + + counts <- scater::mockSCE() +} + +test_that("splatPopEstimate works", { + skip_if_not_installed("VariantAnnotation") + skip_if_not_installed("preprocessCore") + # Check separate functions + params <- splatPopEstimateMeanCV(newSplatPopParams(), bulk.means) + expect_true(validObject(params)) + params <- splatPopEstimateEffectSize(newSplatPopParams(), bulk.eqtl) + expect_true(validObject(params)) + + # Check full function + params <- splatPopEstimate(means=bulk.means, + eqtl=bulk.eqtl, + counts=counts) + expect_true(validObject(params)) +}) + +test_that("splatPopEstimate checks on input data", { + skip_if_not_installed("VariantAnnotation") + skip_if_not_installed("preprocessCore") + bulk.eqtl.bad <- bulk.eqtl + bulk.eqtl.bad$gene_id <- NULL + expect_error(splatPopEstimate(means=bulk.means, + eqtl=bulk.eqtl.bad, + counts=counts), + "Incorrect format for eqtl data.") + + bulk.means.bad <- bulk.means + bulk.means.bad[, 1] <- NA + expect_error(splatPopEstimate(means=bulk.means.bad, + eqtl=bulk.eqtl, + counts=counts), + "Incorrect format or NAs present in gene.means.") +}) diff --git a/vignettes/splatPop-logo-small.png b/vignettes/splatPop-logo-small.png new file mode 100644 index 00000000..fb2bb404 Binary files /dev/null and b/vignettes/splatPop-logo-small.png differ diff --git a/vignettes/splatPop-model.png b/vignettes/splatPop-model.png new file mode 100644 index 00000000..4358117c Binary files /dev/null and b/vignettes/splatPop-model.png differ diff --git a/vignettes/splatPop.Rmd b/vignettes/splatPop.Rmd new file mode 100644 index 00000000..04e6c493 --- /dev/null +++ b/vignettes/splatPop.Rmd @@ -0,0 +1,416 @@ +--- +title: "splatPop: simulating single-cell data for populations" +author: "Christina Azodi" +package: splatter +date: "Last updated: 21 October 2020" +output: + BiocStyle::html_document: + toc: true + toc_float: true +vignette: > + %\VignetteIndexEntry{splatPop simulation} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r knitr-options, include = FALSE} +knitr::opts_chunk$set(collapse = TRUE, comment = "#>") +``` + +```{r setup} +suppressPackageStartupMessages({ + library("splatter") + library("scater") + library("VariantAnnotation") + library("ggplot2") +}) +``` + +![splatPop logo](splatPop-logo-small.png) + +# Introduction + +splatPop is an extension of the splat model that allows you to simulate single +cell count data for an entire population of individuals. Like with splat, these +simulations resemble real single-cell data because they use parameters estimated +from empirical data. Provided with genotype information (VCF) for a population +as input, splatPop simulates gene counts for multiple cells for all individuals +in the population. Realistic population structure (the pattern of genetic +relatedness between individuals in the population) in the simulations is achieved +by modelling expression Quantitative Trait Loci (eQTL) effects, where the +expression of a gene is associated with the genotype of the individual at a +specific loci. Finally, splatPop allows for the simulation of complex datasets +with cells from multiple groups (e.g. cell types), cells along differentiation +trajectories, and cells from different batches. + +# The splatPop model + +The primary simulation function is `splatPopSimulate`, which runs through the +two main phases: + +1. `splatPopSimulateMeans`: the simulation of means for all genes for all +individuals in the population. +2. `splatPopSimulateSC`: the simulation of single-cell counts for all cells for +all genes for all individuals. + +The second phase is essentially a wrapper around the original `splatSimulate()` +function, which is described in detail [here](splatter.html). The figure below +describes the first phase. Input parameters that can be estimated from real data +have double borders and are shaded by the type of data used (blue = single-cell +counts, yellow = population scale bulk/sc-aggregated RNA-seq data, and green = +eQTL mapping results). The final output (red) is a matrix of means for each +gene and each individual that is used as input to the second phase. + +![The splatPop model for estimating gene means.](splatPop-model.png) + +To get started with splatPop, you need genotype information for the population +you want to simulate (i.e. a VCF). Genotype information should be provided as a +[VariantAnnotation object](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html). +A mock VariantAnnotation object can be produced using the `mockVCF()` function. +Here we simulate single-cell RNA-sequencing counts for 100 random genes for 6 +random samples: + +```{r quick-start} +vcf <- mockVCF(n.samples = 6) + +sim <- splatPopSimulate(vcf = vcf, "nGenes" = 100) + +sim <- logNormCounts(sim) +sim <- runPCA(sim, ncomponents = 10) +plotPCA(sim, colour_by = "Sample") +``` + +# Detailed look into splatPop + +## Step 1: Parameter Estimation + +The parameters used in splatPop have sensible default values, but can also be +estimated from real data provided by the user. For example, gene mean and +variance levels are sampled from gamma distributions derived from real +population scale RNA-seq data and eQTL effect sizes from a gamma distribution +derived from real eQTL mapping results. The default parameters were derived +from GTEx data (v7, thyroid tissue). However, they can +also be estimated from user provided data using `splatPopEstimate()`. You can +also provide `splatPopEstimate()` with real single-cell RNA-sequencing data to +estimate single-cell parameters as in `splatEstimate()`. + +All parameters needed for splatPop simulations are stored in a +`SplatPopParams` object. In addition to the compartments in `SplatParams` +described in detail in the [Splat parameters vignette](splat_params.html) and +the parameters that are set manually (described below), +`SplatPopParams` also contains the following parameters that can be estimated +from real data: + +* **Population parameters** + * `pop.mean.shape` - Shape parameter for mean expression from population + scale data. + * `pop.mean.rate` - Rate parameter for mean expression from population + scale data. + * `pop.cv.param` - Shape and rate parameters for the coefficient of + variation (cv) across individuals from the population scale data, binned by + mean expression. +* **eQTL effect size parameters** + * `eqtl.ES.shape` - Shape parameter for eQTL effect sizes. + * `eqtl.ES.rate` - Rate parameter for eQTL effect sizes. + +Let's take a look at the default parameters... + +```{r default-SplatPopParams} +params <- newSplatPopParams() +params +``` + +This tells us we have "a `Params` object of class `SplatPopParams`" and shows +the values of these parameters. As with `SplatParams`, the parameters that can +be estimated by `splatPopEstimate` are in parentheses, those that can't be +estimated are in brackets, and those that have been changed from their default +are in ALL CAPS. + +For example, we can estimate new parameter values from user provided data... + +```{r eqtlEstimate} +bulk.means <- mockBulkMatrix(n.genes=100, n.samples=100) +bulk.eqtl <- mockBulkeQTL(n.genes=100) +counts <- mockSCE() + +params.est <- splatPopEstimate(means = bulk.means, + eqtl = bulk.eqtl, + counts = counts) +params.est +``` + +Note that `splatPopEstimate()` will only estimate new parameters if the data +required is provided. For example, if you want to simulate data using default +gene means and eQTL parameters, but from single-cell parameters estimated from +your own real single-cell counts data, you could run `splatPopEstimate()` with +only the `counts` argument provided. + +## Step 2: Simulate gene means + +The `splatPopSimulate()` function runs both phases of splatPop, however we can +run these two phases separately to highlight their unique functions. The +first phase is run using `splatPopSimulateMeans()`. + +### Input data + +This function requires two pieces of input data: genotypes and genes. Mock +genotype and gene data can be provided using `mockVCF()` and `mockGFF()`, +respectively. These mock functions generate random SNP and gene annotation data +for chromosome 22. To simulate populations with realistic population structure, +the user should provide real (or simulated) genotypes as a VCF file read in as a +`VariantAnnotation` object. + +splatPop takes in information about what genes to simulate in three ways: + +1. **GFF/GTF (-gff data.frame):** Provide a GFF/GTF file as a `data.frame` + object. splatPop will filter out all non-gene features (3rd column != gene). + This method uses real gene names and locations, but will randomly assign + expression values and eQTL effects to these genes. +2. **Key (-key data.frame):** Provide a `data.frame` object including + information about genes you want to simulate. This object must include the + gene's name (*geneID*), chromosome (*chromosome*), and location + (*geneMiddle*). With just those columns, splatPop will function the same as + if a GFF was provided. However, you can also use this object to specify + other information. For example, if you provide a desired mean (*meanSampled*) + and variance (*cvSampled*) for each gene, splatPop will use these instead of + randomly sampled values. Finally, if you provide the type (*eQTL.type*, e.g. + NA or global), SNP identifier (*eSNP.ID*), and effect size + (*eQTL.EffectSize*), splatPop will simulate gene means with these eQTL + associations instead of generating eQTL associations randomly. +3. **Randomly (-gff NULL -key NULL):** This option will call `mockGFF()` to + generate a random GFF file for a specified chromosome. This is the default + option if neither `gff` or `key` is provided. + +### Control parameters + +In addition to the parameters estimated from real data, the `SplatPopParams` +object also includes control parameters that must be set by the user. The +following `SplatPopParams` control parameters can be changed using +`setParams()`: + +* **Population parameters** + * `similarity.scale` - Scaling factor for the population variance (cv) rate + parameter. Increasing this scaling factor increases the similarity between + individuals. +* **eQTL Parameters** + * `eqtl.n` - Number (>1) or percent (<=1) of genes to assign with eQTL + effects. + * `eqtl.dist` - Maximum distance (bp) between the center of a gene and + possible eSNPs for that gene. + * `eqtl.maf.min` - Minimum Minor Allele Frequency (MAF) of eSNPs. + * `eqtl.maf.max` - Maximum MAF of eSNPs. + * `eqtl.group.specific` - Percent of eQTL effects to make group specific. + The number of groups is specified using the "group.prob" parameter. + * **Group specific parameters** + * `nGroups` - Number of groups to simulate for each individual. + * `group.prob` - Array of the proportion of cells that should be simulated + in each group. + +In addition to the group specific eQTL effects, each group will have group +specific differential expression effects, which are not associated with a +genetic variant). These parameters are estimated from real single-cell data as +described in [splatter](splatter.html). + +### Output + +The output of `splatPopSimulateMeans()` is a list containing: + +* `means` - a data.frame (or list of data.frames if `nGroups` > 1) with + simulated mean gene expression value for each gene (row) and each sample + (column). +* `key` - a data.frame listing for all simulated genes: the assigned mean + and variance (before and after quantile normalization), the assigned eSNP + and its effect size and type (global/group specific), and other group effects. + +Note that when `splatPopSimulate()` is run, these to objects are contained in +the output SingleCellExperiment object (details below). Let's look at a +snapshot of some simulated means and the corresponding key... + +```{r splatPopSimulateMeans} +vcf <- mockVCF(n.samples = 6) +gff <- mockGFF(n.genes = 100) + +sim.means <- splatPopSimulateMeans(vcf = vcf, gff = gff, + params = newSplatPopParams()) + +round(sim.means$means[1:5, 1:6], digits = 2) + +print(sim.means$key[1:5, ], digits = 2) +``` + +### Other examples + +**Replicate a simulation by providing a gene key** + +As described above, information about genes can also be provided in a data.frame +using the `key` argument. If you provide `splatPopSimulateMeans()` with the key +output from a previous run, it will generate a new population with the same +properties, essentially creating a replicate. Here is a snapshot of such a +replicate using the key simulated above: + +```{r splatPopSimulateMeans-from-key} +sim.means.rep2 <- splatPopSimulateMeans(vcf = vcf, key=sim.means$key, + params = newSplatPopParams()) + +round(sim.means.rep2$means[1:5, 1:6], digits = 2) +``` + +**Use real population-scale bulk expression data** + +An important step of `splatPopSimulate()` is the quantile normalization of +simulated gene means for each sample to match a gamma distribution +estimated from real single-cell RNA-seq data using `splatEstimate()` or +`splatPopEstimate()`. This step ensures that even if bulk sequencing data are +used to estimate population parameters, the means output from +`splatPopSimulateMeans()` will be distributed like a single-cell dataset. + +If you already have bulk expression data for a population, you can use this +quantile normalization function directly on that data and use the output as +input to `splatPopSimulateSC()`. Note that this will not simulate eQTL or group +effects, just simulate single-cell counts using the bulk means provided. + +```{r quant-normalize-population-data} +bulk.qnorm <- splatPopQuantNorm(newSplatPopParams(), bulk.means) +round(bulk.qnorm[1:5, 1:5], 3) +``` + +## Step 3: Simulate single cell counts + +Finally, single cell level data is simulated using `splatPopSimulateSC()`. +Running this function on its own requires the `SplatPopParams` object, and the +two outputs from `splatPopSimulateMeans()`: the key and the simulated means +matrix (or list of matrices if nGroups > 1). The user can also provide +additional parameters for the single-cell simulation, for example how many +cells to simulate. + +Looking at the output of `splatPopSimulateSC()` we see that it is a single +`SingleCellExperiment` object with a row for each feature (gene) and +a column for each cell. The simulated counts are accessed using `counts`. +although it can also hold other expression measures such as FPKM or TPM. +Information about each cell (e.g. sample, group, batch) is held in the +`colData` and information about each gene (e.g. location, eQTL effects, and +other data from the splatPop key) is held in the `rowData`. + +```{r eqtl-splatPopSimulateSC-simple-object} +sim.sc <- splatPopSimulateSC(params=params, + key = sim.means$key, + sim.means=sim.means$means, + batchCells=50) +sim.sc +``` + +We can visualize these simulations using plotting functions from **scater** like +plotPCA... + +```{r eqtl-splatPopSimulateSC-simple-plots} +sim.sc <- logNormCounts(sim.sc) +sim.sc <- runPCA(sim.sc, ncomponents = 10) +plotPCA(sim.sc, colour_by = "Sample") +``` + +## splatPop with group, batch, and path effects + +Using the same methods as splat, splatPop allows you to simulate single-cell +counts for a population with group (e.g. cell-types), batch, and path (e.g. +developmental series) effects. Group effects are simulated by +`splatPopSimulateMeans()` and applied to the single cell simulations in +`splatPopSimulateSC()`. Path and batch effects are simulated by +`splatPopSimulateSC()`. + +### Simulating population scale single-cell data with group effects + +The population simulated above is an example of a dataset with a single cell +type across many samples. However, splatPop also allows you to simulate +population-scale data for a mixture of cell-types (i.e. groups). + +Two types of group effects are included: group-eQTL and group-differential +expression (DE) effects. The number of groups to simulate is set using the +*group.prob* parameter in `SplatPopParams`. The DE effects are implemented as +in the `splat` simulation, with the user able to control `splatPopParam` +parameters including *de.prob*, *de.downProb*, *de.facLoc*, and *de.facScale*. +For group-specific eQTL, the proportion of eQTL to designate as group-specific +eQTL is set using *eqtl.group.specific*. + +When used to simulate single-cell data with group-specific effects, +`splatSimulatePop` also outputs: + +* **Cell information (`colData`)** + * `Group` - The group ID for each cell. + +```{r group-specific-eQTL-simulations} +params.group <- newSplatPopParams(nGenes = 50, + batchCells = 40, + group.prob = c(0.5, 0.5)) + +sim.sc.gr2 <- splatPopSimulate(vcf = vcf, params = params.group) + +sim.sc.gr2 <- logNormCounts(sim.sc.gr2) +sim.sc.gr2 <- runPCA(sim.sc.gr2, ncomponents = 10) +plotPCA(sim.sc.gr2, colour_by = "Group", shape_by = "Sample") +``` + +From the PCA plot above you can see that in this simulation the sample effect +outweighs the group effect. But we can tune these parameters to change the +relative weight of these effects. First we can decrease the sample effect by +increasing the similarity.scale parameter. And second we can increase the group +effect by adjusting the *eqtl.group.specific* and *de* parameters: + +```{r group-specific-eQTL-simulations-bigger} +params.group <- newSplatPopParams(batchCells = 40, + nGenes = 50, + similarity.scale = 6, + eqtl.group.specific = 0.6, + de.prob = 0.5, + de.facLoc = 0.5, + de.facScale = 0.4, + group.prob = c(0.5, 0.5)) + +sim.sc.gr2 <- splatPopSimulate(vcf = vcf, params = params.group) + +sim.sc.gr2 <- logNormCounts(sim.sc.gr2) +sim.sc.gr2 <- runPCA(sim.sc.gr2, ncomponents = 10) +plotPCA(sim.sc.gr2, colour_by = "Group", shape_by = "Sample") +``` + +### Simulate SC data for population with path and batch effects + +Like splat, splatPop also allows you to simulate single-cell data with path or +batch effects using the `method` tag in `splatSimulatePop`. Note that you can +also set *method = group*, but this is done automatically by setting the +*group.prob* parameter. For more information about these settings, see the +[Splat parameters vignette](splat_params.html). + +#### Batch effects + +```{r simulate-population-with-batch-effects} +params.batches <- newSplatPopParams(batchCells = c(20, 20), + nGenes = 50, + similarity.scale = 5, + batch.facLoc = 0.3, + batch.facScale = 0.3) + +sim.pop.batches <- splatPopSimulate(vcf = vcf, params = params.batches) +sim.pop.batches <- logNormCounts(sim.pop.batches) +sim.pop.batches <- runPCA(sim.pop.batches, ncomponents = 10) +plotPCA(sim.pop.batches, colour_by = "Batch", shape_by = "Sample", + ncomponents = 5:6) + +``` + +#### Path effects + +```{r simulate-population-with-path-effects} +params.paths <- newSplatPopParams(batchCells = 40, + nGenes = 50, + similarity.scale = 6, + de.facLoc = 0.5, + de.facScale = 0.5, + de.prob = 0.5) + +sim.pop.paths <- splatPopSimulate(vcf = vcf, params = params.paths, + method = "paths") +sim.pop.paths <- logNormCounts(sim.pop.paths) +sim.pop.paths <- runPCA(sim.pop.paths, ncomponents = 10) +plotPCA(sim.pop.paths, colour_by = "Step", shape_by = "Sample", + ncomponents = 5:6) +``` diff --git a/vignettes/splat_params.Rmd b/vignettes/splat_params.Rmd index 90641fad..28c63699 100644 --- a/vignettes/splat_params.Rmd +++ b/vignettes/splat_params.Rmd @@ -1,6 +1,8 @@ --- title: "Splat simulation parameters" -date: "Last updated: 18 April 2019" +author: "Luke Zappia" +package: splatter +date: "Last updated: 19 October 2020" output: BiocStyle::html_document: toc: true @@ -11,11 +13,8 @@ vignette: > %\VignetteEncoding{UTF-8} --- -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) +```{r knitr-options, include = FALSE} +knitr::opts_chunk$set(collapse = TRUE, comment = "#>") ``` This vignette describes the Splat simulation model and the parameters it uses diff --git a/vignettes/splatter.Rmd b/vignettes/splatter.Rmd index 260eafd6..5c5e37e4 100644 --- a/vignettes/splatter.Rmd +++ b/vignettes/splatter.Rmd @@ -1,10 +1,15 @@ --- title: "Introduction to Splatter" -author: "Luke Zappia" -date: "`r Sys.Date()`" +author: + - "Luke Zappia" + - "Belinda Phipson" + - "Alicia Oshlack" +package: splatter +date: "Last updated: 19 October 2020" output: BiocStyle::html_document: toc: true + toc_float: true vignette: > %\VignetteIndexEntry{An introduction to the Splatter package} %\VignetteEngine{knitr::rmarkdown} @@ -12,10 +17,7 @@ vignette: > --- ```{r knitr-options, echo = FALSE, message = FALSE, warning = FALSE} -# To render an HTML version that works nicely with github and web pages, do: -# rmarkdown::render("vignettes/splatter.Rmd", "all") -knitr::opts_chunk$set(fig.align = 'center', fig.width = 6, fig.height = 5, - dev = 'png') +knitr::opts_chunk$set(collapse = TRUE, comment = "#>") # Use exact BSPARAM to avoid warnings options(BiocSingularParam.default = BiocSingular::ExactParam()) @@ -31,9 +33,7 @@ introduction to Splatter's functionality. Splatter can be installed from Bioconductor: -```{r install, eval = FALSE} -if (!requireNamespace("BiocManager", quietly=TRUE)) - install.packages("BiocManager") +```{r install-bioc, eval = FALSE} BiocManager::install("splatter") ``` @@ -52,10 +52,12 @@ Splatter. Here is an example a mock dataset generated with the `scater` package: ```{r quickstart} # Load package -library(splatter) +suppressPackageStartupMessages({ + library(splatter) + library(scater) +}) # Create mock data -library(scater) set.seed(1) sce <- mockSCE() @@ -238,9 +240,9 @@ plotPCA(sim) (**NOTE:** Your values and plots may look different as the simulation is random and produces different results each time it is run.) -For more details about the `SingleCellExperiment` object refer to the [vignette] -[SCE-vignette]. For information about what you can do with `scater` refer to the -`scater` documentation and [vignette][scater-vignette]. +For more details about the `SingleCellExperiment` object refer to the +[vignette][SCE-vignette]. For information about what you can do with `scater` +refer to the `scater` documentation and [vignette][scater-vignette]. The `splatSimulate` function outputs the following additional information about the simulation: @@ -323,6 +325,8 @@ to the final, differentiated, cell type. By setting additional parameters it is possible to simulate more complex process (for example multiple mature cell types from a single progenitor). + + ## Batch effects Another factor that is important in the analysis of any sequencing experiment @@ -365,6 +369,17 @@ To simulate a single population use `splatSimulateSingle()` (equivalent to simulate paths use `splatSimulatePaths()` (equivalent to `splatSimulate(method = "paths")`). +## splatPop: Simulating populations + +splatPop uses the splat model to simulate single cell count data across a +population with relationship structure including expression quantitative loci +(eQTL) effects. The major addition in splatPop is the `splatPopSimulateMeans` +function, which simulates gene means for each gene for each individual using +parameters estimated from real data. These simulated means are then used as +input to`splatPopSimulateSC`, which is essentially a wrapper around the base +`splatSimulate`. For more information on generating population scale single-cell +count data, see the [splatPop vignette](splatPop.html). + # Other simulations As well as it's own Splat simulation method the Splatter package contains