Skip to content

Commit

Permalink
new functions, dependency updates
Browse files Browse the repository at this point in the history
  • Loading branch information
npcooley committed Feb 28, 2024
1 parent af77e88 commit b53bf4f
Show file tree
Hide file tree
Showing 5 changed files with 331 additions and 2 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: SynExtend
Type: Package
Title: Tools for Working With Synteny Objects
Version: 1.15.1
Version: 1.15.2
Authors@R: c(
person("Nicholas", "Cooley", email = "npc19@pitt.edu", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-6029-304X")),
person("Aidan", "Lakshman", email = "ahl27@pitt.edu", role = c("aut", "ctb"), comment = c(ORCID = "0000-0002-9465-6785")),
Expand All @@ -10,7 +10,7 @@ Authors@R: c(
biocViews: Genetics, Clustering, ComparativeGenomics, DataImport
Description: Shared order between genomic sequences provide a great deal of information. Synteny objects produced by the R package DECIPHER provides quantitative information about that shared order. SynExtend provides tools for extracting information from Synteny objects.
Depends: R (>= 4.3.0), DECIPHER (>= 2.28.0)
Imports: methods, Biostrings, S4Vectors, IRanges, utils, stats, parallel, graphics, grDevices
Imports: methods, Biostrings, S4Vectors, IRanges, utils, stats, parallel, graphics, grDevices, RSQLite, DBI
Suggests: BiocStyle, knitr, igraph, markdown, rmarkdown
License: GPL-3
ByteCompile: true
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import(IRanges)
import(Biostrings)
import(DECIPHER)
import(S4Vectors)
import(DBI)
import(RSQLite)

importFrom("utils", "setTxtProgressBar", "txtProgressBar", "data", "object.size", "getS3method", "read.table")
importFrom("stats", "predict", "runif", "setNames", "rnorm", "optim", "kmeans", "density", "nls", "ppois", "pt", "pnorm", "as.dist", "fisher.test", "is.leaf", "pchisq")
Expand Down Expand Up @@ -33,6 +35,7 @@ export(MoransI)
export(dendrapply)
export(FastQFromSRR)
export(FitchParsimony)
export(PrepareSeqs)

S3method(subset, 'dendrogram')

Expand Down
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# SynExtend 1.15.2
* Addition of `PrepareSeqs` function, beginning the process of deprecating `PairSummaries` in favor of more cohesive and user friendly functions.

# SynExtend 1.15.1
* Fixes bug in JRF distance causing scores to be higher than expected.

Expand Down
268 changes: 268 additions & 0 deletions R/PrepareSeqs.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
###### -- extract a bunch of sequences from some genecalls --------------------
# author: nicholas cooley
# maintainer: nicholas cooley
# contact: npc19@pitt.edu

# given an object that has a GeneCalls attribute

PrepareSeqs <- function(SynExtendObject,
DataBase,
DefaultTranslationTable = "11",
Storage = 1,
Verbose = FALSE) {

if (Verbose) {
TimeStart <- Sys.time()
pBar <- txtProgressBar(style = 1)
}

# check object type -- currently only PairSummaries and LinkedPairs
if (!is(object = SynExtendObject,
class2 = "PairSummaries") &
!is(object = SynExtendObject,
class2 = "LinkedPairs")) {
stop("'SynExtendObject' must be an object of class 'PairSummaries' or 'LinkedPairs'.")
}
# initialize database
if (is.character(DataBase)) {
if (!requireNamespace(package = "RSQLite",
quietly = TRUE)) {
stop("Package 'RSQLite' must be installed.")
}
dbConn <- dbConnect(dbDriver("SQLite"), DataBase)
on.exit(dbDisconnect(dbConn))
} else {
dbConn <- DataBase
if (!dbIsValid(dbConn)) {
stop("The connection has expired.")
}
}
# check storage
if (Storage < 0) {
stop("Storage must be greater than zero.")
} else {
Storage <- Storage * 1e9 # conversion to gigabytes
}

# get the genecalls!
GeneCalls <- attr(x = SynExtendObject,
which = "GeneCalls")

###### -- subset gene calls based on the names of the links object ----------
if (is(SynExtendObject,
class2 = "LinkedPairs")) {
if (length(GeneCalls) != nrow(SynExtendObject)) {
GeneCalls <- GeneCalls[match(x = dimnames(SynExtendObject)[[1]],
table = names(GeneCalls))]
}
} # else genecalls from PairSummaries objects will be guaranteed to be correctly shaped?

# load in genomes and ALL extracted features at the top until storage limit
# is hit
if (Verbose) {
cat("Preparing overhead data.\n")
}
# load in structure matrices once for PredictHEC
# these hardcoded values come from Erik ... AlignProfiles maybe?
MAT1 <- get(data("HEC_MI1",
package = "DECIPHER",
envir = environment()))
MAT2 <- get(data("HEC_MI2",
package = "DECIPHER",
envir = environment()))
structureMatrix <- matrix(c(0.187, -0.8, -0.873,
-0.8, 0.561, -0.979,
-0.873, -0.979, 0.221),
3,
3,
dimnames=list(c("H", "E", "C"),
c("H", "E", "C")))
substitutionMatrix <- matrix(c(1.5, -2.134, -0.739, -1.298,
-2.134, 1.832, -2.462, 0.2,
-0.739, -2.462, 1.522, -2.062,
-1.298, 0.2, -2.062, 1.275),
nrow = 4,
dimnames = list(DNA_BASES, DNA_BASES))
Features01 <- Features02 <- AAStruct <- vector("list",
length = length(GeneCalls))
L <- length(GeneCalls)

Count <- 1L
while (object.size(Features01) < Storage &
Count <= L) {
# print(object.size(Features01))
# print(Count)
Genome <- SearchDB(dbFile = dbConn,
identifier = names(GeneCalls[Count]),
nameBy = "description",
type = "DNAStringSet",
verbose = FALSE)
PresentIndices <- unique(GeneCalls[[Count]]$Index)
# Reset any coding & non-translation table features to the default
# move this somewhere else eventually...
if (any(is.na(GeneCalls[[Count]]$Translation_Table))) {
w <- which(is.na(GeneCalls[[Count]]$Translation_Table) &
GeneCalls[[Count]]$Coding)
if (length(w) > 0) {
GeneCalls[[Count]]$Translation_Table[w] <- DefaultTranslationTable
}
}
if (length(PresentIndices) > 1L) {
# many indices, loop through present indices and extract
# slam together at the end
Features01[[Count]] <- vector(mode = "list",
length = length(PresentIndices))
for (m3 in seq_along(PresentIndices)) {
ph <- GeneCalls[[Count]]$Index == PresentIndices[m3]
# implementation 3 - faster so far
# set up the succinct extraction
# build an index of where stringset positions need to be collapsed
z1 <- unname(GeneCalls[[Count]]$Range[ph])
z2 <- lengths(z1)
# convert IRangesList to IRanges object for simple extractAt
z1 <- unlist(z1,
recursive = FALSE)
Features01[[Count]][[m3]] <- extractAt(x = Genome[[PresentIndices[m3]]],
at = z1)
CollapseCount <- 0L
w <- which(z2 > 1L)
# if no collapsing needs to occur, do not enter loop
if (length(w) > 0L) {
# if collapsing must take place build a placeholder of positions to remove
# once collapsing correct positions has occurred
remove <- vector(mode = "integer",
length = sum(z2[w]) - length(w))
for (m4 in w) {
Features01[[Count]][[m3]][[m4 + CollapseCount]] <- unlist(Features01[[Count]][[m3]][m4:(m4 + z2[m4] - 1L) + CollapseCount])
remove[(CollapseCount + 1L):(CollapseCount + z2[m4] - 1L)] <- (m4 + 1L):(m4 + z2[m4] - 1L) + CollapseCount
CollapseCount <- CollapseCount + z2[m4] - 1L
}
Features01[[Count]][[m3]][remove] <- NULL
}
FlipMe <- GeneCalls[[Count]]$Strand[ph] == 1L
if (any(FlipMe)) {
Features01[[Count]][[m3]][FlipMe] <- reverseComplement(Features01[[Count]][[m3]][FlipMe])
}
}
Features01[[Count]] <- do.call(c,
Features01[[Count]])

} else {
# implementation 3 - shortest possible collapse loops and fewest copies - so far
z1 <- unname(GeneCalls[[Count]]$Range)
z2 <- lengths(z1)
z1 <- unlist(z1,
recursive = FALSE)
Features01[[Count]] <- extractAt(x = Genome[[PresentIndices]],
at = z1)
CollapseCount <- 0L
w <- which(z2 > 1)
if (length(w) > 0) {
remove <- vector(mode = "integer",
length = sum(z2[w]) - length(w))
for (m4 in w) {
Features01[[Count]][[m4 + CollapseCount]] <- unlist(Features01[[Count]][m4:(m4 + z2[m4] - 1L) + CollapseCount])
remove[(CollapseCount + 1L):(CollapseCount + z2[m4] - 1L)] <- (m4 + 1L):(m4 + z2[m4] - 1L) + CollapseCount
CollapseCount <- CollapseCount + z2[m4] - 1L
}
Features01[[Count]][remove] <- NULL
}

FlipMe <- GeneCalls[[Count]]$Strand == 1L
if (any(FlipMe)) {
Features01[[Count]][FlipMe] <- reverseComplement(Features01[[Count]][FlipMe])
}

}
names(Features01[[Count]]) <- paste(rep(names(GeneCalls)[Count], length(Features01[[Count]])),
GeneCalls[[Count]]$Index,
seq(length(Features01[[Count]])),
sep = "_")

# translate all translatable features with as few calls as possible
ph <- unique(GeneCalls[[Count]]$Translation_Table)

ph <- ph[!is.na(ph)]
if (length(ph) < 1L) {
ph <- DefaultTranslationTable
phkey <- which(GeneCalls[[Count]]$Coding &
GeneCalls[[Count]]$Type == "gene")
CurrentGeneticCode <- getGeneticCode(id_or_name2 = ph,
full.search = FALSE,
as.data.frame = FALSE)
Features02[[Count]] <- translate(x = Features01[[Count]][phkey],
genetic.code = CurrentGeneticCode,
if.fuzzy.codon = "solve")
Features02[[Count]] <- Features02[[Count]][order(phkey)]
# print(length(Features02[[Count]]))
} else {
Features02[[Count]] <- vector(mode = "list",
length = length(ph))
phkey <- vector(mode = "list",
length = length(ph))

for (m4 in seq_along(ph)) {
matchph <- which(GeneCalls[[Count]]$Translation_Table == ph[m4] &
GeneCalls[[Count]]$Coding &
GeneCalls[[Count]]$Type == "gene")
phkey[[m4]] <- matchph
CurrentGeneticCode <- getGeneticCode(id_or_name2 = ph[m4],
full.search = FALSE,
as.data.frame = FALSE)
Features02[[Count]][[m4]] <- translate(x = Features01[[Count]][matchph],
genetic.code = CurrentGeneticCode,
if.fuzzy.codon = "solve")
}
Features02[[Count]] <- do.call(c,
Features02[[Count]])
phkey <- unlist(phkey)
Features02[[Count]] <- Features02[[Count]][order(phkey)]

}
# rewrite ph to provide the correct names for the features
ph <- GeneCalls[[Count]]$Coding & GeneCalls[[Count]]$Type == "gene"

names(Features02[[Count]]) <- paste(rep(names(GeneCalls)[Count], length(Features02[[Count]])),
GeneCalls[[Count]]$Index[ph],
seq(length(Features01[[Count]]))[ph],
sep = "_")

# generate structures for aa alignments if there are
if (!is.null(Features02[[Count]])) {
AAStruct[[Count]] <- PredictHEC(myAAStringSet = Features02[[Count]],
type = "probabilities",
HEC_MI1 = MAT1,
HEC_MI2 = MAT2)
}

if (Verbose) {
setTxtProgressBar(pb = pBar,
value = Count / L)
}

Count <- Count + 1L
# will extract till storage is exceeded
# will not cap at storage
} # end while loop

if (Verbose) {
close(pBar)
if (Count < L) {
cat("Overhead is larger than storage request.\nSearch loops will require database lookups.\n")
# RemoveWhenAble <- TRUE
} else {
cat("Complete!\n")
# RemoveWhenAble <- FALSE
}
TimeEnd <- Sys.time()
print(TimeEnd - TimeStart)
}

res <- list("DNA" = Features01,
"AA" = Features02,
"Struct" = AAStruct)
class(res) <- c("list", "SynExtendSeqs")
return(res)
}


55 changes: 55 additions & 0 deletions man/PrepareSeqs.Rd
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
\name{PrepareSeqs}
\alias{PrepareSeqs}
\title{
Return gene sequences.
}
\description{
Given a \code{SynExtend} object with a \code{GeneCalls} attribute, and a \code{DECIPHER} database, return all gene sequences and their translations.
}
\usage{
PrepareSeqs(SynExtendObject,
DataBase,
DefaultTranslationTable = "11",
Storage = 1,
Verbose = FALSE)
}
\arguments{
\item{SynExtendObject}{
An object of class \code{PairSummaries} or of \code{LinkedPairs}. Object must have a \code{GeneCalls} attribute.
}
\item{DataBase}{
A character string pointing to a SQLite database, or a connection to a \code{DECIPHER} database.
}
\item{DefaultTranslationTable}{
A character vector of length 1 identifying the translation table to use if one is not supplied in the \code{GeneCalls} attribute.
}
\item{Storage}{
A soft memory limit for how much space to allow when building the resulting object. Translated to Gb.
}
\item{Verbose}{
Logical indicating whether or not to display a progress bar and print the time difference upon completion.
}
}
\details{
\code{PrepareSeqs} returns the sequences of genes and their translations where appropriate.
}
\value{
An object of class \code{SynExtendSeqs}.
}
\author{
Nicholas Cooley \email{npc19@pitt.edu}
}

\seealso{
\code{\link{PairSummaries}}, \code{\link{NucleotideOverlap}}, \code{\link{FindSynteny}}
}
\examples{
DBPATH <- system.file("extdata",
"Endosymbionts.sqlite",
package = "SynExtend")

data("Endosymbionts_Pairs01", package = "SynExtend")
CurrentSeqs <- PrepareSeqs(SynExtendObject = Endosymbionts_Pairs01,
DataBase = DBPATH,
Verbose = TRUE)
}

0 comments on commit b53bf4f

Please sign in to comment.