-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
331 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,268 @@ | ||
###### -- extract a bunch of sequences from some genecalls -------------------- | ||
# author: nicholas cooley | ||
# maintainer: nicholas cooley | ||
# contact: npc19@pitt.edu | ||
|
||
# given an object that has a GeneCalls attribute | ||
|
||
PrepareSeqs <- function(SynExtendObject, | ||
DataBase, | ||
DefaultTranslationTable = "11", | ||
Storage = 1, | ||
Verbose = FALSE) { | ||
|
||
if (Verbose) { | ||
TimeStart <- Sys.time() | ||
pBar <- txtProgressBar(style = 1) | ||
} | ||
|
||
# check object type -- currently only PairSummaries and LinkedPairs | ||
if (!is(object = SynExtendObject, | ||
class2 = "PairSummaries") & | ||
!is(object = SynExtendObject, | ||
class2 = "LinkedPairs")) { | ||
stop("'SynExtendObject' must be an object of class 'PairSummaries' or 'LinkedPairs'.") | ||
} | ||
# initialize database | ||
if (is.character(DataBase)) { | ||
if (!requireNamespace(package = "RSQLite", | ||
quietly = TRUE)) { | ||
stop("Package 'RSQLite' must be installed.") | ||
} | ||
dbConn <- dbConnect(dbDriver("SQLite"), DataBase) | ||
on.exit(dbDisconnect(dbConn)) | ||
} else { | ||
dbConn <- DataBase | ||
if (!dbIsValid(dbConn)) { | ||
stop("The connection has expired.") | ||
} | ||
} | ||
# check storage | ||
if (Storage < 0) { | ||
stop("Storage must be greater than zero.") | ||
} else { | ||
Storage <- Storage * 1e9 # conversion to gigabytes | ||
} | ||
|
||
# get the genecalls! | ||
GeneCalls <- attr(x = SynExtendObject, | ||
which = "GeneCalls") | ||
|
||
###### -- subset gene calls based on the names of the links object ---------- | ||
if (is(SynExtendObject, | ||
class2 = "LinkedPairs")) { | ||
if (length(GeneCalls) != nrow(SynExtendObject)) { | ||
GeneCalls <- GeneCalls[match(x = dimnames(SynExtendObject)[[1]], | ||
table = names(GeneCalls))] | ||
} | ||
} # else genecalls from PairSummaries objects will be guaranteed to be correctly shaped? | ||
|
||
# load in genomes and ALL extracted features at the top until storage limit | ||
# is hit | ||
if (Verbose) { | ||
cat("Preparing overhead data.\n") | ||
} | ||
# load in structure matrices once for PredictHEC | ||
# these hardcoded values come from Erik ... AlignProfiles maybe? | ||
MAT1 <- get(data("HEC_MI1", | ||
package = "DECIPHER", | ||
envir = environment())) | ||
MAT2 <- get(data("HEC_MI2", | ||
package = "DECIPHER", | ||
envir = environment())) | ||
structureMatrix <- matrix(c(0.187, -0.8, -0.873, | ||
-0.8, 0.561, -0.979, | ||
-0.873, -0.979, 0.221), | ||
3, | ||
3, | ||
dimnames=list(c("H", "E", "C"), | ||
c("H", "E", "C"))) | ||
substitutionMatrix <- matrix(c(1.5, -2.134, -0.739, -1.298, | ||
-2.134, 1.832, -2.462, 0.2, | ||
-0.739, -2.462, 1.522, -2.062, | ||
-1.298, 0.2, -2.062, 1.275), | ||
nrow = 4, | ||
dimnames = list(DNA_BASES, DNA_BASES)) | ||
Features01 <- Features02 <- AAStruct <- vector("list", | ||
length = length(GeneCalls)) | ||
L <- length(GeneCalls) | ||
|
||
Count <- 1L | ||
while (object.size(Features01) < Storage & | ||
Count <= L) { | ||
# print(object.size(Features01)) | ||
# print(Count) | ||
Genome <- SearchDB(dbFile = dbConn, | ||
identifier = names(GeneCalls[Count]), | ||
nameBy = "description", | ||
type = "DNAStringSet", | ||
verbose = FALSE) | ||
PresentIndices <- unique(GeneCalls[[Count]]$Index) | ||
# Reset any coding & non-translation table features to the default | ||
# move this somewhere else eventually... | ||
if (any(is.na(GeneCalls[[Count]]$Translation_Table))) { | ||
w <- which(is.na(GeneCalls[[Count]]$Translation_Table) & | ||
GeneCalls[[Count]]$Coding) | ||
if (length(w) > 0) { | ||
GeneCalls[[Count]]$Translation_Table[w] <- DefaultTranslationTable | ||
} | ||
} | ||
if (length(PresentIndices) > 1L) { | ||
# many indices, loop through present indices and extract | ||
# slam together at the end | ||
Features01[[Count]] <- vector(mode = "list", | ||
length = length(PresentIndices)) | ||
for (m3 in seq_along(PresentIndices)) { | ||
ph <- GeneCalls[[Count]]$Index == PresentIndices[m3] | ||
# implementation 3 - faster so far | ||
# set up the succinct extraction | ||
# build an index of where stringset positions need to be collapsed | ||
z1 <- unname(GeneCalls[[Count]]$Range[ph]) | ||
z2 <- lengths(z1) | ||
# convert IRangesList to IRanges object for simple extractAt | ||
z1 <- unlist(z1, | ||
recursive = FALSE) | ||
Features01[[Count]][[m3]] <- extractAt(x = Genome[[PresentIndices[m3]]], | ||
at = z1) | ||
CollapseCount <- 0L | ||
w <- which(z2 > 1L) | ||
# if no collapsing needs to occur, do not enter loop | ||
if (length(w) > 0L) { | ||
# if collapsing must take place build a placeholder of positions to remove | ||
# once collapsing correct positions has occurred | ||
remove <- vector(mode = "integer", | ||
length = sum(z2[w]) - length(w)) | ||
for (m4 in w) { | ||
Features01[[Count]][[m3]][[m4 + CollapseCount]] <- unlist(Features01[[Count]][[m3]][m4:(m4 + z2[m4] - 1L) + CollapseCount]) | ||
remove[(CollapseCount + 1L):(CollapseCount + z2[m4] - 1L)] <- (m4 + 1L):(m4 + z2[m4] - 1L) + CollapseCount | ||
CollapseCount <- CollapseCount + z2[m4] - 1L | ||
} | ||
Features01[[Count]][[m3]][remove] <- NULL | ||
} | ||
FlipMe <- GeneCalls[[Count]]$Strand[ph] == 1L | ||
if (any(FlipMe)) { | ||
Features01[[Count]][[m3]][FlipMe] <- reverseComplement(Features01[[Count]][[m3]][FlipMe]) | ||
} | ||
} | ||
Features01[[Count]] <- do.call(c, | ||
Features01[[Count]]) | ||
|
||
} else { | ||
# implementation 3 - shortest possible collapse loops and fewest copies - so far | ||
z1 <- unname(GeneCalls[[Count]]$Range) | ||
z2 <- lengths(z1) | ||
z1 <- unlist(z1, | ||
recursive = FALSE) | ||
Features01[[Count]] <- extractAt(x = Genome[[PresentIndices]], | ||
at = z1) | ||
CollapseCount <- 0L | ||
w <- which(z2 > 1) | ||
if (length(w) > 0) { | ||
remove <- vector(mode = "integer", | ||
length = sum(z2[w]) - length(w)) | ||
for (m4 in w) { | ||
Features01[[Count]][[m4 + CollapseCount]] <- unlist(Features01[[Count]][m4:(m4 + z2[m4] - 1L) + CollapseCount]) | ||
remove[(CollapseCount + 1L):(CollapseCount + z2[m4] - 1L)] <- (m4 + 1L):(m4 + z2[m4] - 1L) + CollapseCount | ||
CollapseCount <- CollapseCount + z2[m4] - 1L | ||
} | ||
Features01[[Count]][remove] <- NULL | ||
} | ||
|
||
FlipMe <- GeneCalls[[Count]]$Strand == 1L | ||
if (any(FlipMe)) { | ||
Features01[[Count]][FlipMe] <- reverseComplement(Features01[[Count]][FlipMe]) | ||
} | ||
|
||
} | ||
names(Features01[[Count]]) <- paste(rep(names(GeneCalls)[Count], length(Features01[[Count]])), | ||
GeneCalls[[Count]]$Index, | ||
seq(length(Features01[[Count]])), | ||
sep = "_") | ||
|
||
# translate all translatable features with as few calls as possible | ||
ph <- unique(GeneCalls[[Count]]$Translation_Table) | ||
|
||
ph <- ph[!is.na(ph)] | ||
if (length(ph) < 1L) { | ||
ph <- DefaultTranslationTable | ||
phkey <- which(GeneCalls[[Count]]$Coding & | ||
GeneCalls[[Count]]$Type == "gene") | ||
CurrentGeneticCode <- getGeneticCode(id_or_name2 = ph, | ||
full.search = FALSE, | ||
as.data.frame = FALSE) | ||
Features02[[Count]] <- translate(x = Features01[[Count]][phkey], | ||
genetic.code = CurrentGeneticCode, | ||
if.fuzzy.codon = "solve") | ||
Features02[[Count]] <- Features02[[Count]][order(phkey)] | ||
# print(length(Features02[[Count]])) | ||
} else { | ||
Features02[[Count]] <- vector(mode = "list", | ||
length = length(ph)) | ||
phkey <- vector(mode = "list", | ||
length = length(ph)) | ||
|
||
for (m4 in seq_along(ph)) { | ||
matchph <- which(GeneCalls[[Count]]$Translation_Table == ph[m4] & | ||
GeneCalls[[Count]]$Coding & | ||
GeneCalls[[Count]]$Type == "gene") | ||
phkey[[m4]] <- matchph | ||
CurrentGeneticCode <- getGeneticCode(id_or_name2 = ph[m4], | ||
full.search = FALSE, | ||
as.data.frame = FALSE) | ||
Features02[[Count]][[m4]] <- translate(x = Features01[[Count]][matchph], | ||
genetic.code = CurrentGeneticCode, | ||
if.fuzzy.codon = "solve") | ||
} | ||
Features02[[Count]] <- do.call(c, | ||
Features02[[Count]]) | ||
phkey <- unlist(phkey) | ||
Features02[[Count]] <- Features02[[Count]][order(phkey)] | ||
|
||
} | ||
# rewrite ph to provide the correct names for the features | ||
ph <- GeneCalls[[Count]]$Coding & GeneCalls[[Count]]$Type == "gene" | ||
|
||
names(Features02[[Count]]) <- paste(rep(names(GeneCalls)[Count], length(Features02[[Count]])), | ||
GeneCalls[[Count]]$Index[ph], | ||
seq(length(Features01[[Count]]))[ph], | ||
sep = "_") | ||
|
||
# generate structures for aa alignments if there are | ||
if (!is.null(Features02[[Count]])) { | ||
AAStruct[[Count]] <- PredictHEC(myAAStringSet = Features02[[Count]], | ||
type = "probabilities", | ||
HEC_MI1 = MAT1, | ||
HEC_MI2 = MAT2) | ||
} | ||
|
||
if (Verbose) { | ||
setTxtProgressBar(pb = pBar, | ||
value = Count / L) | ||
} | ||
|
||
Count <- Count + 1L | ||
# will extract till storage is exceeded | ||
# will not cap at storage | ||
} # end while loop | ||
|
||
if (Verbose) { | ||
close(pBar) | ||
if (Count < L) { | ||
cat("Overhead is larger than storage request.\nSearch loops will require database lookups.\n") | ||
# RemoveWhenAble <- TRUE | ||
} else { | ||
cat("Complete!\n") | ||
# RemoveWhenAble <- FALSE | ||
} | ||
TimeEnd <- Sys.time() | ||
print(TimeEnd - TimeStart) | ||
} | ||
|
||
res <- list("DNA" = Features01, | ||
"AA" = Features02, | ||
"Struct" = AAStruct) | ||
class(res) <- c("list", "SynExtendSeqs") | ||
return(res) | ||
} | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
\name{PrepareSeqs} | ||
\alias{PrepareSeqs} | ||
\title{ | ||
Return gene sequences. | ||
} | ||
\description{ | ||
Given a \code{SynExtend} object with a \code{GeneCalls} attribute, and a \code{DECIPHER} database, return all gene sequences and their translations. | ||
} | ||
\usage{ | ||
PrepareSeqs(SynExtendObject, | ||
DataBase, | ||
DefaultTranslationTable = "11", | ||
Storage = 1, | ||
Verbose = FALSE) | ||
} | ||
\arguments{ | ||
\item{SynExtendObject}{ | ||
An object of class \code{PairSummaries} or of \code{LinkedPairs}. Object must have a \code{GeneCalls} attribute. | ||
} | ||
\item{DataBase}{ | ||
A character string pointing to a SQLite database, or a connection to a \code{DECIPHER} database. | ||
} | ||
\item{DefaultTranslationTable}{ | ||
A character vector of length 1 identifying the translation table to use if one is not supplied in the \code{GeneCalls} attribute. | ||
} | ||
\item{Storage}{ | ||
A soft memory limit for how much space to allow when building the resulting object. Translated to Gb. | ||
} | ||
\item{Verbose}{ | ||
Logical indicating whether or not to display a progress bar and print the time difference upon completion. | ||
} | ||
} | ||
\details{ | ||
\code{PrepareSeqs} returns the sequences of genes and their translations where appropriate. | ||
} | ||
\value{ | ||
An object of class \code{SynExtendSeqs}. | ||
} | ||
\author{ | ||
Nicholas Cooley \email{npc19@pitt.edu} | ||
} | ||
|
||
\seealso{ | ||
\code{\link{PairSummaries}}, \code{\link{NucleotideOverlap}}, \code{\link{FindSynteny}} | ||
} | ||
\examples{ | ||
DBPATH <- system.file("extdata", | ||
"Endosymbionts.sqlite", | ||
package = "SynExtend") | ||
|
||
data("Endosymbionts_Pairs01", package = "SynExtend") | ||
CurrentSeqs <- PrepareSeqs(SynExtendObject = Endosymbionts_Pairs01, | ||
DataBase = DBPATH, | ||
Verbose = TRUE) | ||
} |