Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: OOM label propagation #45

Merged
merged 18 commits into from
Apr 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: SynExtend
Type: Package
Title: Tools for Working With Synteny Objects
Version: 1.15.2
Version: 1.15.3
Authors@R: c(
person("Nicholas", "Cooley", email = "npc19@pitt.edu", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-6029-304X")),
person("Aidan", "Lakshman", email = "ahl27@pitt.edu", role = c("aut", "ctb"), comment = c(ORCID = "0000-0002-9465-6785")),
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ export(MoransI)
export(dendrapply)
export(FastQFromSRR)
export(FitchParsimony)
export(ShoalFinder)
export(EstimateShoal)
export(PrepareSeqs)

S3method(subset, 'dendrogram')
Expand Down
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# SynExtend 1.15.3
* Addition of `FastLabelOOM` function to find communities in graphs/networks on disk space.

# SynExtend 1.15.2
* Addition of `PrepareSeqs` function, beginning the process of deprecating `PairSummaries` in favor of more cohesive and user friendly functions.

Expand Down
1 change: 1 addition & 0 deletions R/EvoWeaver-PPPreds.R
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,7 @@ PAPV.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
pap[] <- as.integer(pap)
ARGS <- list(nr=nr)
FXN <- function(v1, v2, ARGS, ii, jj) {
if(all(v1==v1[1]) || all(v2==v2[1])) return(0)
return(1-fisher.test(v1, v2, simulate.p.value=TRUE)$p.value)
}
pairscores <- BuildSimMatInternal(pap, uvals, evalmap, l, n, FXN, ARGS, Verbose)
Expand Down
2 changes: 1 addition & 1 deletion R/EvoWeaver-PSPreds.R
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ ContextTree.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE, precalcProfs=NU

TreeDistance.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
precalcSubset=NULL,
TreeMethods="CI", JRFk=4, ...){
TreeMethods="RF", JRFk=4, ...){
if (!is.null(precalcSubset))
subs <- precalcSubset
else
Expand Down
85 changes: 44 additions & 41 deletions R/EvoWeaver-SLPreds.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,24 +13,24 @@ NVDT <- function(ew, ...) UseMethod('NVDT')
Ancestral <- function(ew, ...) UseMethod('Ancestral')
################################

ResidueMI.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
precalcSubset=NULL, gapCutoff=0.5,
ResidueMI.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
precalcSubset=NULL, gapCutoff=0.5,
useDNA=FALSE, Processors=1L, useWeights=TRUE, ...){
useResidue <- attr(ew, 'useResidue')
useMT <- attr(ew, 'useMT')
useColoc <- attr(ew, 'useColoc')
Processors <- NormArgProcessors(Processors)

stopifnot('EvoWeaver object must be initialized with dendrograms to run Residue methods'=
useMT)
stopifnot('EvoWeaver dendrograms must have ancestral states to run Residue methods'=
useResidue)

if (!is.null(precalcSubset))
subs <- precalcSubset
else
subs <- ProcessSubset(ew, Subset)

uvals <- subs$uvals
evalmap <- subs$evalmap
l <- length(uvals)
Expand All @@ -40,15 +40,15 @@ ResidueMI.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
rownames(mat) <- colnames(mat) <- n
return(mat)
}

if(useDNA)
lookup <- 'ATGC'
else
lookup <- "ARNDCQEGHILKMFPSTWYV"
lookup <- strsplit(lookup, '')[[1]]
lookupMap <- seq_along(lookup)
names(lookupMap) <- lookup

ResidueSets <- vector('list', length(uvals))
if(Verbose){
cat(' Preproccessing sequence sets\n')
Expand All @@ -61,18 +61,18 @@ ResidueMI.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
stopifnot('Sequences are not aligned!'=all(ncharSeq==ncharSeq[1]))
seqs <- toupper(seqs)
if(useDNA){
maskPos <- which(!MaskAlignment(DNAStringSet(seqs),
maskPos <- which(!MaskAlignment(DNAStringSet(seqs),
type='values', includeTerminalGaps = TRUE)[,3])
} else {
maskPos <- which(!MaskAlignment(AAStringSet(seqs), threshold=0.3,
type='values', includeTerminalGaps = TRUE)[,3])
}

if(length(maskPos)==0){
seqs <- matrix(NA_integer_, nrow=length(seqs), ncol=0L)
} else {
seqs <- vapply(seqs,
\(s) lookupMap[(strsplit(s, '')[[1]])[maskPos]],
seqs <- vapply(seqs,
\(s) lookupMap[(strsplit(s, '')[[1]])[maskPos]],
integer(length(maskPos)), USE.NAMES = FALSE)
seqs[is.na(seqs)] <- 0L
seqs <- t(seqs)
Expand All @@ -85,10 +85,10 @@ ResidueMI.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
ResidueSets[[i]] <- seqs
if(Verbose) setTxtProgressBar(pb, i)
}

pairscores <- rep(NA_real_, l*(l-1)/2)
ctr <- 0
if (Verbose){
if (Verbose){
cat('\nDone.\n')
pb <- txtProgressBar(max=(l*(l-1) / 2), style=3)
}
Expand All @@ -104,7 +104,7 @@ ResidueMI.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
#tree2 <- ew[[uval2]]
seqs2 <- ResidueSets[[j]]
#pairscores[ctr+1] <- ResidueMIDend(tree1, tree2, useColoc=useColoc, ...)
score <- ResidueMISeqs(seqs1, seqs2, lookup=lookup,
score <- ResidueMISeqs(seqs1, seqs2, lookup=lookup,
useColoc=useColoc, Processors=Processors, ...)
pairscores[ctr+1] <- ifelse(is.na(score), 0L, score)
}
Expand All @@ -116,28 +116,28 @@ ResidueMI.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
pairscores <- as.simMat(pairscores, NAMES=n, DIAG=FALSE)
Diag(pairscores) <- 1
if (Verbose) cat('\n')

return(pairscores)
}

NVDT.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
precalcSubset=NULL, extended=TRUE,
NVDT.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
precalcSubset=NULL, extended=TRUE,
DNAseqs=TRUE, centerObservations=FALSE,
sqrtCorrelation=TRUE, ...){
#source('/Users/aidan/Nextcloud/RStudioSync/comps/NVDT/calcNVDT.R')
useResidue <- attr(ew, 'useResidue')
useMT <- attr(ew, 'useMT')

stopifnot('EvoWeaver object must be initialized with dendrograms to run Residue methods'=
useMT)
stopifnot('EvoWeaver dendrograms must have ancestral states to run Residue methods'=
useResidue)

if (!is.null(precalcSubset))
subs <- precalcSubset
else
subs <- ProcessSubset(ew, Subset)

uvals <- subs$uvals
evalmap <- subs$evalmap
l <- length(uvals)
Expand Down Expand Up @@ -172,7 +172,7 @@ NVDT.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
#seqs <- gsub('.', '', seqs, fixed=TRUE, useBytes=TRUE)
outv <- numeric(outl)
for (j in seq_along(seqs)){
nvdtvec <- .Call("StringToNVDT",
nvdtvec <- .Call("StringToNVDT",
charToRaw(seqs[j]),
FALSE, # RemoveGaps
extended, # Use di/tri freqs
Expand All @@ -182,7 +182,7 @@ NVDT.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
if(DNAseqs){
divval <- c(rep(seqlen, 8L), rep(1L, 4L))
if(extended){
#correcting 2-mers and 3-mers
#correcting 2-mers and 3-mers
divval <- c(divval, rep(seqlen-1L, 16L), rep(seqlen-2L, 64L))
}
} else {
Expand All @@ -192,10 +192,13 @@ NVDT.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
outv <- outv + nvdtvec
}
if(centerObservations && !DNAseqs){
outv[1:20] <- (outv[1:20] - mean(outv[1:20])) / sd(outv[1:20])
s <- seq_len(20L)
outv[s] <- (outv[s] - mean(outv[s])) / sd(outv[s])
# Center should always be 0.5 for mean
outv[21:40] <- (outv[21:40] - 0.5) / sd(outv[21:40])
outv[41:60] <- (outv[41:60] - mean(outv[41:60])) / sd(outv[41:60])
s <- s + 20L
outv[s] <- (outv[s] - 0.5) / sd(outv[s])
s <- s + 20L
outv[s] <- (outv[s] - mean(outv[s])) / sd(outv[s])
} else {
outv <- outv / sqrt(sum(outv**2))
}
Expand Down Expand Up @@ -230,13 +233,13 @@ NVDT.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
# cval <- .Call('fastPearsonC', v1, v2)
# pval <- pt(cval[2], cval[3]-2)
# pval <- ifelse(pval < 0.5, 2*pval, 2*(pval-0.5))
cval <- suppressWarnings(cor(v1, v2,
use='pairwise.complete.obs',

cval <- suppressWarnings(cor(v1, v2,
use='pairwise.complete.obs',
method='spearman'))
# should be absolute value, negative correlation is same effect
cval <- abs(cval[1])

num_obs <- length(v1)
pval <- 1 - exp(pt(cval, num_obs-2, lower.tail=FALSE, log.p=TRUE))
#pairscores[ctr+1] <- cor(v1,v2)
Expand All @@ -251,38 +254,38 @@ NVDT.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
pairscores <- as.simMat(pairscores, NAMES=n, DIAG=FALSE)
Diag(pairscores) <- 1
if (Verbose) cat('\n')

return(pairscores)

}

Ancestral.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
Ancestral.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
precalcSubset=NULL, ...){
useResidue <- attr(ew, 'useResidue')
useMT <- attr(ew, 'useMT')
useColoc <- attr(ew, 'useColoc')

stopifnot('EvoWeaver object must be initialized with dendrograms to run Residue methods'=
useMT)
stopifnot('EvoWeaver dendrograms must have ancestral states to run Residue methods'=
useResidue)

if (!is.null(precalcSubset))
subs <- precalcSubset
else
subs <- ProcessSubset(ew, Subset)

uvals <- subs$uvals
evalmap <- subs$evalmap
l <- length(uvals)
n <- names(ew)

if ( l == 1 ){
mat <- matrix(1, nrow=1, ncol=1)
rownames(mat) <- colnames(mat) <- n
return(mat)
}

pmlst <- vector('list', length=l)
if (Verbose){
cat("Pre-processing dendrograms...\n")
Expand All @@ -293,7 +296,7 @@ Ancestral.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
pmlst[[i]] <- find_dists_pos(ew[[uv]], useColoc)
if(Verbose) setTxtProgressBar(pb, i)
}

if(Verbose) cat('\nDone.\n')
pairscores <- rep(NA_real_, l*(l-1)/2)
if (Verbose) pb <- txtProgressBar(max=(l*(l-1) / 2), style=3)
Expand All @@ -308,7 +311,7 @@ Ancestral.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
if (i!=j && (is.null(evalmap) || entry %in% evalmap[[accessor]])){
v2 <- pmlst[[j]]
res <- pair_residues(v1,v2)
if(is.na(res[1]) || is.nan(res$R))
if(is.na(res[1]) || is.nan(res$R))
pairscores[ctr+1] <- NA
else
pairscores[ctr+1] <- abs(res$R) * (1-res$P)
Expand All @@ -322,6 +325,6 @@ Ancestral.EvoWeaver <- function(ew, Subset=NULL, Verbose=TRUE,
pairscores <- as.simMat(pairscores, NAMES=n, DIAG=FALSE)
Diag(pairscores) <- 1
if (Verbose) cat('\n')

return(pairscores)
}
}
Loading