From b53bf4fd09cee8aefbd9b61b9f086197d47fdcac Mon Sep 17 00:00:00 2001 From: npcooley Date: Wed, 28 Feb 2024 16:10:12 -0500 Subject: [PATCH] new functions, dependency updates --- DESCRIPTION | 4 +- NAMESPACE | 3 + NEWS.md | 3 + R/PrepareSeqs.R | 268 +++++++++++++++++++++++++++++++++++++++++++++ man/PrepareSeqs.Rd | 55 ++++++++++ 5 files changed, 331 insertions(+), 2 deletions(-) create mode 100644 R/PrepareSeqs.R create mode 100644 man/PrepareSeqs.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 2d3c454..0ad2d87 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: SynExtend Type: Package Title: Tools for Working With Synteny Objects -Version: 1.15.1 +Version: 1.15.2 Authors@R: c( person("Nicholas", "Cooley", email = "npc19@pitt.edu", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-6029-304X")), person("Aidan", "Lakshman", email = "ahl27@pitt.edu", role = c("aut", "ctb"), comment = c(ORCID = "0000-0002-9465-6785")), @@ -10,7 +10,7 @@ Authors@R: c( biocViews: Genetics, Clustering, ComparativeGenomics, DataImport Description: Shared order between genomic sequences provide a great deal of information. Synteny objects produced by the R package DECIPHER provides quantitative information about that shared order. SynExtend provides tools for extracting information from Synteny objects. Depends: R (>= 4.3.0), DECIPHER (>= 2.28.0) -Imports: methods, Biostrings, S4Vectors, IRanges, utils, stats, parallel, graphics, grDevices +Imports: methods, Biostrings, S4Vectors, IRanges, utils, stats, parallel, graphics, grDevices, RSQLite, DBI Suggests: BiocStyle, knitr, igraph, markdown, rmarkdown License: GPL-3 ByteCompile: true diff --git a/NAMESPACE b/NAMESPACE index 6331819..df1f787 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,6 +5,8 @@ import(IRanges) import(Biostrings) import(DECIPHER) import(S4Vectors) +import(DBI) +import(RSQLite) importFrom("utils", "setTxtProgressBar", "txtProgressBar", "data", "object.size", "getS3method", "read.table") importFrom("stats", "predict", "runif", "setNames", "rnorm", "optim", "kmeans", "density", "nls", "ppois", "pt", "pnorm", "as.dist", "fisher.test", "is.leaf", "pchisq") @@ -33,6 +35,7 @@ export(MoransI) export(dendrapply) export(FastQFromSRR) export(FitchParsimony) +export(PrepareSeqs) S3method(subset, 'dendrogram') diff --git a/NEWS.md b/NEWS.md index 1a5fe71..c57fc74 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,6 @@ +# SynExtend 1.15.2 +* Addition of `PrepareSeqs` function, beginning the process of deprecating `PairSummaries` in favor of more cohesive and user friendly functions. + # SynExtend 1.15.1 * Fixes bug in JRF distance causing scores to be higher than expected. diff --git a/R/PrepareSeqs.R b/R/PrepareSeqs.R new file mode 100644 index 0000000..26a26e8 --- /dev/null +++ b/R/PrepareSeqs.R @@ -0,0 +1,268 @@ +###### -- extract a bunch of sequences from some genecalls -------------------- +# author: nicholas cooley +# maintainer: nicholas cooley +# contact: npc19@pitt.edu + +# given an object that has a GeneCalls attribute + +PrepareSeqs <- function(SynExtendObject, + DataBase, + DefaultTranslationTable = "11", + Storage = 1, + Verbose = FALSE) { + + if (Verbose) { + TimeStart <- Sys.time() + pBar <- txtProgressBar(style = 1) + } + + # check object type -- currently only PairSummaries and LinkedPairs + if (!is(object = SynExtendObject, + class2 = "PairSummaries") & + !is(object = SynExtendObject, + class2 = "LinkedPairs")) { + stop("'SynExtendObject' must be an object of class 'PairSummaries' or 'LinkedPairs'.") + } + # initialize database + if (is.character(DataBase)) { + if (!requireNamespace(package = "RSQLite", + quietly = TRUE)) { + stop("Package 'RSQLite' must be installed.") + } + dbConn <- dbConnect(dbDriver("SQLite"), DataBase) + on.exit(dbDisconnect(dbConn)) + } else { + dbConn <- DataBase + if (!dbIsValid(dbConn)) { + stop("The connection has expired.") + } + } + # check storage + if (Storage < 0) { + stop("Storage must be greater than zero.") + } else { + Storage <- Storage * 1e9 # conversion to gigabytes + } + + # get the genecalls! + GeneCalls <- attr(x = SynExtendObject, + which = "GeneCalls") + + ###### -- subset gene calls based on the names of the links object ---------- + if (is(SynExtendObject, + class2 = "LinkedPairs")) { + if (length(GeneCalls) != nrow(SynExtendObject)) { + GeneCalls <- GeneCalls[match(x = dimnames(SynExtendObject)[[1]], + table = names(GeneCalls))] + } + } # else genecalls from PairSummaries objects will be guaranteed to be correctly shaped? + + # load in genomes and ALL extracted features at the top until storage limit + # is hit + if (Verbose) { + cat("Preparing overhead data.\n") + } + # load in structure matrices once for PredictHEC + # these hardcoded values come from Erik ... AlignProfiles maybe? + MAT1 <- get(data("HEC_MI1", + package = "DECIPHER", + envir = environment())) + MAT2 <- get(data("HEC_MI2", + package = "DECIPHER", + envir = environment())) + structureMatrix <- matrix(c(0.187, -0.8, -0.873, + -0.8, 0.561, -0.979, + -0.873, -0.979, 0.221), + 3, + 3, + dimnames=list(c("H", "E", "C"), + c("H", "E", "C"))) + substitutionMatrix <- matrix(c(1.5, -2.134, -0.739, -1.298, + -2.134, 1.832, -2.462, 0.2, + -0.739, -2.462, 1.522, -2.062, + -1.298, 0.2, -2.062, 1.275), + nrow = 4, + dimnames = list(DNA_BASES, DNA_BASES)) + Features01 <- Features02 <- AAStruct <- vector("list", + length = length(GeneCalls)) + L <- length(GeneCalls) + + Count <- 1L + while (object.size(Features01) < Storage & + Count <= L) { + # print(object.size(Features01)) + # print(Count) + Genome <- SearchDB(dbFile = dbConn, + identifier = names(GeneCalls[Count]), + nameBy = "description", + type = "DNAStringSet", + verbose = FALSE) + PresentIndices <- unique(GeneCalls[[Count]]$Index) + # Reset any coding & non-translation table features to the default + # move this somewhere else eventually... + if (any(is.na(GeneCalls[[Count]]$Translation_Table))) { + w <- which(is.na(GeneCalls[[Count]]$Translation_Table) & + GeneCalls[[Count]]$Coding) + if (length(w) > 0) { + GeneCalls[[Count]]$Translation_Table[w] <- DefaultTranslationTable + } + } + if (length(PresentIndices) > 1L) { + # many indices, loop through present indices and extract + # slam together at the end + Features01[[Count]] <- vector(mode = "list", + length = length(PresentIndices)) + for (m3 in seq_along(PresentIndices)) { + ph <- GeneCalls[[Count]]$Index == PresentIndices[m3] + # implementation 3 - faster so far + # set up the succinct extraction + # build an index of where stringset positions need to be collapsed + z1 <- unname(GeneCalls[[Count]]$Range[ph]) + z2 <- lengths(z1) + # convert IRangesList to IRanges object for simple extractAt + z1 <- unlist(z1, + recursive = FALSE) + Features01[[Count]][[m3]] <- extractAt(x = Genome[[PresentIndices[m3]]], + at = z1) + CollapseCount <- 0L + w <- which(z2 > 1L) + # if no collapsing needs to occur, do not enter loop + if (length(w) > 0L) { + # if collapsing must take place build a placeholder of positions to remove + # once collapsing correct positions has occurred + remove <- vector(mode = "integer", + length = sum(z2[w]) - length(w)) + for (m4 in w) { + Features01[[Count]][[m3]][[m4 + CollapseCount]] <- unlist(Features01[[Count]][[m3]][m4:(m4 + z2[m4] - 1L) + CollapseCount]) + remove[(CollapseCount + 1L):(CollapseCount + z2[m4] - 1L)] <- (m4 + 1L):(m4 + z2[m4] - 1L) + CollapseCount + CollapseCount <- CollapseCount + z2[m4] - 1L + } + Features01[[Count]][[m3]][remove] <- NULL + } + FlipMe <- GeneCalls[[Count]]$Strand[ph] == 1L + if (any(FlipMe)) { + Features01[[Count]][[m3]][FlipMe] <- reverseComplement(Features01[[Count]][[m3]][FlipMe]) + } + } + Features01[[Count]] <- do.call(c, + Features01[[Count]]) + + } else { + # implementation 3 - shortest possible collapse loops and fewest copies - so far + z1 <- unname(GeneCalls[[Count]]$Range) + z2 <- lengths(z1) + z1 <- unlist(z1, + recursive = FALSE) + Features01[[Count]] <- extractAt(x = Genome[[PresentIndices]], + at = z1) + CollapseCount <- 0L + w <- which(z2 > 1) + if (length(w) > 0) { + remove <- vector(mode = "integer", + length = sum(z2[w]) - length(w)) + for (m4 in w) { + Features01[[Count]][[m4 + CollapseCount]] <- unlist(Features01[[Count]][m4:(m4 + z2[m4] - 1L) + CollapseCount]) + remove[(CollapseCount + 1L):(CollapseCount + z2[m4] - 1L)] <- (m4 + 1L):(m4 + z2[m4] - 1L) + CollapseCount + CollapseCount <- CollapseCount + z2[m4] - 1L + } + Features01[[Count]][remove] <- NULL + } + + FlipMe <- GeneCalls[[Count]]$Strand == 1L + if (any(FlipMe)) { + Features01[[Count]][FlipMe] <- reverseComplement(Features01[[Count]][FlipMe]) + } + + } + names(Features01[[Count]]) <- paste(rep(names(GeneCalls)[Count], length(Features01[[Count]])), + GeneCalls[[Count]]$Index, + seq(length(Features01[[Count]])), + sep = "_") + + # translate all translatable features with as few calls as possible + ph <- unique(GeneCalls[[Count]]$Translation_Table) + + ph <- ph[!is.na(ph)] + if (length(ph) < 1L) { + ph <- DefaultTranslationTable + phkey <- which(GeneCalls[[Count]]$Coding & + GeneCalls[[Count]]$Type == "gene") + CurrentGeneticCode <- getGeneticCode(id_or_name2 = ph, + full.search = FALSE, + as.data.frame = FALSE) + Features02[[Count]] <- translate(x = Features01[[Count]][phkey], + genetic.code = CurrentGeneticCode, + if.fuzzy.codon = "solve") + Features02[[Count]] <- Features02[[Count]][order(phkey)] + # print(length(Features02[[Count]])) + } else { + Features02[[Count]] <- vector(mode = "list", + length = length(ph)) + phkey <- vector(mode = "list", + length = length(ph)) + + for (m4 in seq_along(ph)) { + matchph <- which(GeneCalls[[Count]]$Translation_Table == ph[m4] & + GeneCalls[[Count]]$Coding & + GeneCalls[[Count]]$Type == "gene") + phkey[[m4]] <- matchph + CurrentGeneticCode <- getGeneticCode(id_or_name2 = ph[m4], + full.search = FALSE, + as.data.frame = FALSE) + Features02[[Count]][[m4]] <- translate(x = Features01[[Count]][matchph], + genetic.code = CurrentGeneticCode, + if.fuzzy.codon = "solve") + } + Features02[[Count]] <- do.call(c, + Features02[[Count]]) + phkey <- unlist(phkey) + Features02[[Count]] <- Features02[[Count]][order(phkey)] + + } + # rewrite ph to provide the correct names for the features + ph <- GeneCalls[[Count]]$Coding & GeneCalls[[Count]]$Type == "gene" + + names(Features02[[Count]]) <- paste(rep(names(GeneCalls)[Count], length(Features02[[Count]])), + GeneCalls[[Count]]$Index[ph], + seq(length(Features01[[Count]]))[ph], + sep = "_") + + # generate structures for aa alignments if there are + if (!is.null(Features02[[Count]])) { + AAStruct[[Count]] <- PredictHEC(myAAStringSet = Features02[[Count]], + type = "probabilities", + HEC_MI1 = MAT1, + HEC_MI2 = MAT2) + } + + if (Verbose) { + setTxtProgressBar(pb = pBar, + value = Count / L) + } + + Count <- Count + 1L + # will extract till storage is exceeded + # will not cap at storage + } # end while loop + + if (Verbose) { + close(pBar) + if (Count < L) { + cat("Overhead is larger than storage request.\nSearch loops will require database lookups.\n") + # RemoveWhenAble <- TRUE + } else { + cat("Complete!\n") + # RemoveWhenAble <- FALSE + } + TimeEnd <- Sys.time() + print(TimeEnd - TimeStart) + } + + res <- list("DNA" = Features01, + "AA" = Features02, + "Struct" = AAStruct) + class(res) <- c("list", "SynExtendSeqs") + return(res) +} + + diff --git a/man/PrepareSeqs.Rd b/man/PrepareSeqs.Rd new file mode 100644 index 0000000..939ddb5 --- /dev/null +++ b/man/PrepareSeqs.Rd @@ -0,0 +1,55 @@ +\name{PrepareSeqs} +\alias{PrepareSeqs} +\title{ +Return gene sequences. +} +\description{ +Given a \code{SynExtend} object with a \code{GeneCalls} attribute, and a \code{DECIPHER} database, return all gene sequences and their translations. +} +\usage{ +PrepareSeqs(SynExtendObject, + DataBase, + DefaultTranslationTable = "11", + Storage = 1, + Verbose = FALSE) +} +\arguments{ + \item{SynExtendObject}{ +An object of class \code{PairSummaries} or of \code{LinkedPairs}. Object must have a \code{GeneCalls} attribute. +} + \item{DataBase}{ +A character string pointing to a SQLite database, or a connection to a \code{DECIPHER} database. +} + \item{DefaultTranslationTable}{ +A character vector of length 1 identifying the translation table to use if one is not supplied in the \code{GeneCalls} attribute. +} + \item{Storage}{ +A soft memory limit for how much space to allow when building the resulting object. Translated to Gb. +} + \item{Verbose}{ +Logical indicating whether or not to display a progress bar and print the time difference upon completion. +} +} +\details{ +\code{PrepareSeqs} returns the sequences of genes and their translations where appropriate. +} +\value{ +An object of class \code{SynExtendSeqs}. +} +\author{ +Nicholas Cooley \email{npc19@pitt.edu} +} + +\seealso{ +\code{\link{PairSummaries}}, \code{\link{NucleotideOverlap}}, \code{\link{FindSynteny}} +} +\examples{ +DBPATH <- system.file("extdata", + "Endosymbionts.sqlite", + package = "SynExtend") + +data("Endosymbionts_Pairs01", package = "SynExtend") +CurrentSeqs <- PrepareSeqs(SynExtendObject = Endosymbionts_Pairs01, + DataBase = DBPATH, + Verbose = TRUE) +}