new functions, dependency updates

npcooley · Feb 28, 2024 · b53bf4f · b53bf4f
1 parent af77e88
commit b53bf4f
Show file tree

Hide file tree

Showing 5 changed files with 331 additions and 2 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: SynExtend
 Type: Package
 Title: Tools for Working With Synteny Objects
-Version: 1.15.1
+Version: 1.15.2
 Authors@R: c(
 	person("Nicholas", "Cooley", email = "npc19@pitt.edu", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-6029-304X")),
 	person("Aidan", "Lakshman", email = "ahl27@pitt.edu", role = c("aut", "ctb"), comment = c(ORCID = "0000-0002-9465-6785")),
@@ -10,7 +10,7 @@ Authors@R: c(
 biocViews: Genetics, Clustering, ComparativeGenomics, DataImport
 Description: Shared order between genomic sequences provide a great deal of information. Synteny objects produced by the R package DECIPHER provides quantitative information about that shared order. SynExtend provides tools for extracting information from Synteny objects.
 Depends: R (>= 4.3.0), DECIPHER (>= 2.28.0)
-Imports: methods, Biostrings, S4Vectors, IRanges, utils, stats, parallel, graphics, grDevices
+Imports: methods, Biostrings, S4Vectors, IRanges, utils, stats, parallel, graphics, grDevices, RSQLite, DBI
 Suggests: BiocStyle, knitr, igraph, markdown, rmarkdown
 License: GPL-3
 ByteCompile: true

diff --git a/NAMESPACE b/NAMESPACE
@@ -5,6 +5,8 @@ import(IRanges)
 import(Biostrings)
 import(DECIPHER)
 import(S4Vectors)
+import(DBI)
+import(RSQLite)
 
 importFrom("utils", "setTxtProgressBar", "txtProgressBar", "data", "object.size", "getS3method", "read.table")
 importFrom("stats", "predict", "runif", "setNames", "rnorm", "optim", "kmeans", "density", "nls", "ppois", "pt", "pnorm", "as.dist", "fisher.test", "is.leaf", "pchisq")
@@ -33,6 +35,7 @@ export(MoransI)
 export(dendrapply)
 export(FastQFromSRR)
 export(FitchParsimony)
+export(PrepareSeqs)
 
 S3method(subset, 'dendrogram')
 

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,6 @@
+# SynExtend 1.15.2
+* Addition of `PrepareSeqs` function, beginning the process of deprecating `PairSummaries` in favor of more cohesive and user friendly functions.
+
 # SynExtend 1.15.1
 * Fixes bug in JRF distance causing scores to be higher than expected.
 

diff --git a/R/PrepareSeqs.R b/R/PrepareSeqs.R
@@ -0,0 +1,268 @@
+###### -- extract a bunch of sequences from some genecalls --------------------
+# author: nicholas cooley
+# maintainer: nicholas cooley
+# contact: npc19@pitt.edu
+
+# given an object that has a GeneCalls attribute
+
+PrepareSeqs <- function(SynExtendObject,
+                        DataBase,
+                        DefaultTranslationTable = "11",
+                        Storage = 1,
+                        Verbose = FALSE) {
+
+  if (Verbose) {
+    TimeStart <- Sys.time()
+    pBar <- txtProgressBar(style = 1)
+  }
+
+  # check object type -- currently only PairSummaries and LinkedPairs
+  if (!is(object = SynExtendObject,
+          class2 = "PairSummaries") &
+      !is(object = SynExtendObject,
+          class2 = "LinkedPairs")) {
+    stop("'SynExtendObject' must be an object of class 'PairSummaries' or 'LinkedPairs'.")
+  }
+  # initialize database
+  if (is.character(DataBase)) {
+    if (!requireNamespace(package = "RSQLite",
+                          quietly = TRUE)) {
+      stop("Package 'RSQLite' must be installed.")
+    }
+    dbConn <- dbConnect(dbDriver("SQLite"), DataBase)
+    on.exit(dbDisconnect(dbConn))
+  } else {
+    dbConn <- DataBase
+    if (!dbIsValid(dbConn)) {
+      stop("The connection has expired.")
+    }
+  }
+  # check storage
+  if (Storage < 0) {
+    stop("Storage must be greater than zero.")
+  } else {
+    Storage <- Storage * 1e9 # conversion to gigabytes
+  }
+
+  # get the genecalls!
+  GeneCalls <- attr(x = SynExtendObject,
+                    which = "GeneCalls")
+
+  ###### -- subset gene calls based on the names of the links object ----------
+  if (is(SynExtendObject,
+         class2 = "LinkedPairs")) {
+    if (length(GeneCalls) != nrow(SynExtendObject)) {
+      GeneCalls <- GeneCalls[match(x = dimnames(SynExtendObject)[[1]],
+                                   table = names(GeneCalls))]
+    }
+  } # else genecalls from PairSummaries objects will be guaranteed to be correctly shaped?
+
+  # load in genomes and ALL extracted features at the top until storage limit
+  # is hit
+  if (Verbose) {
+    cat("Preparing overhead data.\n")
+  }
+  # load in structure matrices once for PredictHEC
+  # these hardcoded values come from Erik ... AlignProfiles maybe?
+  MAT1 <- get(data("HEC_MI1",
+                   package = "DECIPHER",
+                   envir = environment()))
+  MAT2 <- get(data("HEC_MI2",
+                   package = "DECIPHER",
+                   envir = environment()))
+  structureMatrix <- matrix(c(0.187, -0.8, -0.873,
+                              -0.8, 0.561, -0.979,
+                              -0.873, -0.979, 0.221),
+                            3,
+                            3,
+                            dimnames=list(c("H", "E", "C"),
+                                          c("H", "E", "C")))
+  substitutionMatrix <- matrix(c(1.5, -2.134, -0.739, -1.298,
+                                 -2.134, 1.832, -2.462, 0.2,
+                                 -0.739, -2.462, 1.522, -2.062,
+                                 -1.298, 0.2, -2.062, 1.275),
+                               nrow = 4,
+                               dimnames = list(DNA_BASES, DNA_BASES))
+  Features01 <- Features02 <- AAStruct <- vector("list",
+                                                 length = length(GeneCalls))
+  L <- length(GeneCalls)
+
+  Count <- 1L
+  while (object.size(Features01) < Storage &
+         Count <= L) {
+    # print(object.size(Features01))
+    # print(Count)
+    Genome <- SearchDB(dbFile = dbConn,
+                       identifier = names(GeneCalls[Count]),
+                       nameBy = "description",
+                       type = "DNAStringSet",
+                       verbose = FALSE)
+    PresentIndices <- unique(GeneCalls[[Count]]$Index)
+    # Reset any coding & non-translation table features to the default
+    # move this somewhere else eventually...
+    if (any(is.na(GeneCalls[[Count]]$Translation_Table))) {
+      w <- which(is.na(GeneCalls[[Count]]$Translation_Table) &
+                   GeneCalls[[Count]]$Coding)
+      if (length(w) > 0) {
+        GeneCalls[[Count]]$Translation_Table[w] <- DefaultTranslationTable
+      }
+    }
+    if (length(PresentIndices) > 1L) {
+      # many indices, loop through present indices and extract
+      # slam together at the end
+      Features01[[Count]] <- vector(mode = "list",
+                                    length = length(PresentIndices))
+      for (m3 in seq_along(PresentIndices)) {
+        ph <- GeneCalls[[Count]]$Index == PresentIndices[m3]
+        # implementation 3 - faster so far
+        # set up the succinct extraction
+        # build an index of where stringset positions need to be collapsed
+        z1 <- unname(GeneCalls[[Count]]$Range[ph])
+        z2 <- lengths(z1)
+        # convert IRangesList to IRanges object for simple extractAt
+        z1 <- unlist(z1,
+                     recursive = FALSE)
+        Features01[[Count]][[m3]] <- extractAt(x = Genome[[PresentIndices[m3]]],
+                                               at = z1)
+        CollapseCount <- 0L
+        w <- which(z2 > 1L)
+        # if no collapsing needs to occur, do not enter loop
+        if (length(w) > 0L) {
+          # if collapsing must take place build a placeholder of positions to remove
+          # once collapsing correct positions has occurred
+          remove <- vector(mode = "integer",
+                           length = sum(z2[w]) - length(w))
+          for (m4 in w) {
+            Features01[[Count]][[m3]][[m4 + CollapseCount]] <- unlist(Features01[[Count]][[m3]][m4:(m4 + z2[m4] - 1L) + CollapseCount])
+            remove[(CollapseCount + 1L):(CollapseCount + z2[m4] - 1L)] <- (m4 + 1L):(m4 + z2[m4] - 1L) + CollapseCount
+            CollapseCount <- CollapseCount + z2[m4] - 1L
+          }
+          Features01[[Count]][[m3]][remove] <- NULL
+        }
+        FlipMe <- GeneCalls[[Count]]$Strand[ph] == 1L
+        if (any(FlipMe)) {
+          Features01[[Count]][[m3]][FlipMe] <- reverseComplement(Features01[[Count]][[m3]][FlipMe])
+        }
+      }
+      Features01[[Count]] <- do.call(c,
+                                     Features01[[Count]])
+
+    } else {
+      # implementation 3 - shortest possible collapse loops and fewest copies - so far
+      z1 <- unname(GeneCalls[[Count]]$Range)
+      z2 <- lengths(z1)
+      z1 <- unlist(z1,
+                   recursive = FALSE)
+      Features01[[Count]] <- extractAt(x = Genome[[PresentIndices]],
+                                       at = z1)
+      CollapseCount <- 0L
+      w <- which(z2 > 1)
+      if (length(w) > 0) {
+        remove <- vector(mode = "integer",
+                         length = sum(z2[w]) - length(w))
+        for (m4 in w) {
+          Features01[[Count]][[m4 + CollapseCount]] <- unlist(Features01[[Count]][m4:(m4 + z2[m4] - 1L) + CollapseCount])
+          remove[(CollapseCount + 1L):(CollapseCount + z2[m4] - 1L)] <- (m4 + 1L):(m4 + z2[m4] - 1L) + CollapseCount
+          CollapseCount <- CollapseCount + z2[m4] - 1L
+        }
+        Features01[[Count]][remove] <- NULL
+      }
+
+      FlipMe <- GeneCalls[[Count]]$Strand == 1L
+      if (any(FlipMe)) {
+        Features01[[Count]][FlipMe] <- reverseComplement(Features01[[Count]][FlipMe])
+      }
+
+    }
+    names(Features01[[Count]]) <- paste(rep(names(GeneCalls)[Count], length(Features01[[Count]])),
+                                        GeneCalls[[Count]]$Index,
+                                        seq(length(Features01[[Count]])),
+                                        sep = "_")
+
+    # translate all translatable features with as few calls as possible
+    ph <- unique(GeneCalls[[Count]]$Translation_Table)
+
+    ph <- ph[!is.na(ph)]
+    if (length(ph) < 1L) {
+      ph <- DefaultTranslationTable
+      phkey <- which(GeneCalls[[Count]]$Coding &
+                       GeneCalls[[Count]]$Type == "gene")
+      CurrentGeneticCode <- getGeneticCode(id_or_name2 = ph,
+                                           full.search = FALSE,
+                                           as.data.frame = FALSE)
+      Features02[[Count]] <- translate(x = Features01[[Count]][phkey],
+                                       genetic.code = CurrentGeneticCode,
+                                       if.fuzzy.codon = "solve")
+      Features02[[Count]] <- Features02[[Count]][order(phkey)]
+      # print(length(Features02[[Count]]))
+    } else {
+      Features02[[Count]] <- vector(mode = "list",
+                                    length = length(ph))
+      phkey <- vector(mode = "list",
+                      length = length(ph))
+
+      for (m4 in seq_along(ph)) {
+        matchph <- which(GeneCalls[[Count]]$Translation_Table == ph[m4] &
+                           GeneCalls[[Count]]$Coding &
+                           GeneCalls[[Count]]$Type == "gene")
+        phkey[[m4]] <- matchph
+        CurrentGeneticCode <- getGeneticCode(id_or_name2 = ph[m4],
+                                             full.search = FALSE,
+                                             as.data.frame = FALSE)
+        Features02[[Count]][[m4]] <- translate(x = Features01[[Count]][matchph],
+                                               genetic.code = CurrentGeneticCode,
+                                               if.fuzzy.codon = "solve")
+      }
+      Features02[[Count]] <- do.call(c,
+                                     Features02[[Count]])
+      phkey <- unlist(phkey)
+      Features02[[Count]] <- Features02[[Count]][order(phkey)]
+
+    }
+    # rewrite ph to provide the correct names for the features
+    ph <- GeneCalls[[Count]]$Coding & GeneCalls[[Count]]$Type == "gene"
+
+    names(Features02[[Count]]) <- paste(rep(names(GeneCalls)[Count], length(Features02[[Count]])),
+                                        GeneCalls[[Count]]$Index[ph],
+                                        seq(length(Features01[[Count]]))[ph],
+                                        sep = "_")
+
+    # generate structures for aa alignments if there are 
+    if (!is.null(Features02[[Count]])) {
+      AAStruct[[Count]] <- PredictHEC(myAAStringSet = Features02[[Count]],
+                                      type = "probabilities",
+                                      HEC_MI1 = MAT1,
+                                      HEC_MI2 = MAT2)
+    }
+
+    if (Verbose) {
+      setTxtProgressBar(pb = pBar,
+                        value = Count / L)
+    }
+
+    Count <- Count + 1L
+    # will extract till storage is exceeded
+    # will not cap at storage
+  } # end while loop
+
+  if (Verbose) {
+    close(pBar)
+    if (Count < L) {
+      cat("Overhead is larger than storage request.\nSearch loops will require database lookups.\n")
+      # RemoveWhenAble <- TRUE
+    } else {
+      cat("Complete!\n")
+      # RemoveWhenAble <- FALSE
+    }
+    TimeEnd <- Sys.time()
+    print(TimeEnd - TimeStart)
+  }
+
+  res <- list("DNA" = Features01,
+              "AA" = Features02,
+              "Struct" = AAStruct)
+  class(res) <- c("list", "SynExtendSeqs")
+  return(res)
+}
+
+
diff --git a/man/PrepareSeqs.Rd b/man/PrepareSeqs.Rd
@@ -0,0 +1,55 @@
+\name{PrepareSeqs}
+\alias{PrepareSeqs}
+\title{
+Return gene sequences.
+}
+\description{
+Given a \code{SynExtend} object with a \code{GeneCalls} attribute, and a \code{DECIPHER} database, return all gene sequences and their translations.
+}
+\usage{
+PrepareSeqs(SynExtendObject,
+            DataBase,
+            DefaultTranslationTable = "11",
+            Storage = 1,
+            Verbose = FALSE)
+}
+\arguments{
+  \item{SynExtendObject}{
+An object of class \code{PairSummaries} or of \code{LinkedPairs}. Object must have a \code{GeneCalls} attribute.
+}
+  \item{DataBase}{
+A character string pointing to a SQLite database, or a connection to a \code{DECIPHER} database.
+}
+  \item{DefaultTranslationTable}{
+A character vector of length 1 identifying the translation table to use if one is not supplied in the \code{GeneCalls} attribute.
+}
+  \item{Storage}{
+A soft memory limit for how much space to allow when building the resulting object. Translated to Gb.
+}
+  \item{Verbose}{
+Logical indicating whether or not to display a progress bar and print the time difference upon completion.
+}
+}
+\details{
+\code{PrepareSeqs} returns the sequences of genes and their translations where appropriate.
+}
+\value{
+An object of class \code{SynExtendSeqs}.
+}
+\author{
+Nicholas Cooley \email{npc19@pitt.edu}
+}
+
+\seealso{
+\code{\link{PairSummaries}}, \code{\link{NucleotideOverlap}}, \code{\link{FindSynteny}}
+}
+\examples{
+DBPATH <- system.file("extdata",
+                      "Endosymbionts.sqlite",
+                      package = "SynExtend")
+
+data("Endosymbionts_Pairs01", package = "SynExtend")
+CurrentSeqs <- PrepareSeqs(SynExtendObject = Endosymbionts_Pairs01,
+                           DataBase = DBPATH,
+                           Verbose = TRUE)
+}