Skip to content

Commit

Permalink
# last-1595
Browse files Browse the repository at this point in the history
  • Loading branch information
kullrich committed Nov 6, 2024
1 parent a207607 commit 75a9ea7
Show file tree
Hide file tree
Showing 83 changed files with 1,842 additions and 373 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: CRBHits
Title: Conditional reciprocal best hits (CRBHits) in R
Version: 0.0.6
Version: 0.0.7
Authors@R:
person(given = "Kristian K",
family = "Ullrich",
Expand All @@ -14,7 +14,7 @@ License: MIT + file LICENSE
Encoding: UTF-8
LazyData: false
Depends:
R (>= 4.2.0)
R (>= 4.4.0)
Imports:
Biostrings,
MSA2dist,
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ export(CRBHitsColors)
export(aa2rbh)
export(aadir2orthofinder)
export(aafile2rbh)
export(aafile2rbhplus)
export(cds2genepos)
export(cds2rbh)
export(cdsdir2orthofinder)
Expand Down Expand Up @@ -40,6 +41,7 @@ importFrom(Biostrings,writeXStringSet)
importFrom(MSA2dist,cds2aa)
importFrom(MSA2dist,cds2codonaln)
importFrom(MSA2dist,dnastring2kaks)
importFrom(MSA2dist,indices2kaks)
importFrom(curl,curl_download)
importFrom(doParallel,registerDoParallel)
importFrom(dplyr,arrange)
Expand Down
26 changes: 20 additions & 6 deletions R/aa2rbh.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,12 @@
#' If one specifies aa1 and aa2 as the same input a selfblast is conducted.
#' @param aa1 aa1 sequences as \code{AAStringSet} [mandatory]
#' @param aa2 aa2 sequences as \code{AAStringSet} [mandatory]
#' @param dbfile1 aa1 db file [optional]
#' @param dbfile2 aa2 db file [optional]
#' @param searchtool specify sequence search algorithm last, mmseqs2 or diamond
#' [default: last]
#' @param lastpath specify the PATH to the last binaries
#' [default: /extdata/last-1542/bin/]
#' [default: /extdata/last-1595/bin/]
#' @param lastD last option D: query letters per random alignment
#' [default: 1e6]
#' @param mmseqs2path specify the PATH to the mmseqs2 binaries
Expand Down Expand Up @@ -53,6 +55,7 @@
#' @param fit.min specify minimum neighborhood alignment length [default: 5]
#' @param threads number of parallel threads [default: 1]
#' @param remove specify if last result files should be removed [default: TRUE]
#' @param remove.db specify if last db files should be removed [default: TRUE]
#' @return List of three (crbh=FALSE)\cr
#' 1: $crbh.pairs\cr
#' 2: $crbh1 matrix; query > target\cr
Expand Down Expand Up @@ -80,7 +83,7 @@
#' @references Rost B. (1999). Twilight zone of protein sequence alignments.
#' \emph{Protein Engineering}, \bold{12(2)}, 85-94.
#' @examples
#' ## compile last-1542 within CRBHits
#' ## compile last-1595 within CRBHits
#' CRBHits::make_last()
#' ## load example sequence data
#' data("ath", package="CRBHits")
Expand Down Expand Up @@ -109,9 +112,11 @@
#' @author Kristian K Ullrich

aa2rbh <- function(aa1, aa2,
dbfile1=NULL,
dbfile2=NULL,
searchtool="last",
lastpath=paste0(find.package("CRBHits"),
"/extdata/last-1542/bin/"),
"/extdata/last-1595/bin/"),
lastD=1e6,
mmseqs2path=NULL,
mmseqs2sensitivity=5.7,
Expand All @@ -133,7 +138,8 @@ aa2rbh <- function(aa1, aa2,
fit.varweight=0.1,
fit.min=5,
threads=1,
remove=TRUE
remove=TRUE,
remove.db=TRUE
){
#internal function to fit evalue by length
fitSpline <- function(alnlength, evalue, fit.type, fit.varweight,
Expand Down Expand Up @@ -224,6 +230,12 @@ aa2rbh <- function(aa1, aa2,
aa2file <- tempfile("aa2_", outpath)
aa1dbfile <- tempfile("aa1db_", outpath)
aa2dbfile <- tempfile("aa2db_", outpath)
if(!is.null(dbfile1)){
aa1dbfile <- dbfile1
}
if(!is.null(dbfile2)){
aa2dbfile <- dbfile2
}
if(searchtool=="last"){
aa2_aa1_lastout <- tempfile("aa2_aa1_lastout_", outpath)
aa1_aa2_lastout <- tempfile("aa1_aa2_lastout_", outpath)
Expand Down Expand Up @@ -303,11 +315,13 @@ aa2rbh <- function(aa1, aa2,
if(remove){
system2(command="rm", args = aa1file)
system2(command="rm", args = aa2file)
system2(command="rm", args = paste0(aa1dbfile, "*"))
system2(command="rm", args = paste0(aa2dbfile, "*"))
system2(command="rm", args = aa2_aa1_lastout)
system2(command="rm", args = aa1_aa2_lastout)
}
if(remove.db){
system2(command="rm", args = paste0(aa1dbfile, "*"))
system2(command="rm", args = paste0(aa2dbfile, "*"))
}
#selfblast
if(selfblast){
aa1_aa2 <- aa1_aa2[aa1_aa2[, 1]!=aa1_aa2[, 2], , drop=FALSE]
Expand Down
6 changes: 3 additions & 3 deletions R/aadir2orthofinder.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
#' @param searchtool specify sequence search algorithm last, mmseqs2 or diamond
#' [default: last]
#' @param lastpath specify the PATH to the last binaries
#' [default: /extdata/last-1542/bin/]
#' [default: /extdata/last-1595/bin/]
#' @param lastD last option D: query letters per random alignment
#' [default: 1e6]
#' @param mmseqs2path specify the PATH to the mmseqs2 binaries
Expand Down Expand Up @@ -66,7 +66,7 @@
#' @references Rost B. (1999). Twilight zone of protein sequence alignments.
#' \emph{Protein Engineering}, \bold{12(2)}, 85-94.
#' @examples
#' ## compile last-1542 within CRBHits
#' ## compile last-1595 within CRBHits
#' CRBHits::make_last()
#' @export aadir2orthofinder
#' @author Kristian K Ullrich
Expand All @@ -75,7 +75,7 @@ aadir2orthofinder <- function(dir,
file_ending="*",
searchtool="last",
lastpath=paste0(find.package("CRBHits"),
"/extdata/last-1542/bin/"),
"/extdata/last-1595/bin/"),
lastD=1e6,
mmseqs2path=NULL,
mmseqs2sensitivity=5.7,
Expand Down
84 changes: 58 additions & 26 deletions R/aafile2rbh.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@
#' conducted.
#' @param aafile1 aa1 fasta file [mandatory]
#' @param aafile2 aa2 fasta file [mandatory]
#' @param dbfile1 aa1 db file [optional]
#' @param dbfile2 aa2 db file [optional]
#' @param searchtool specify sequence search algorithm last, mmseqs2 or diamond
#' [default: last]
#' @param lastpath specify the PATH to the last binaries
#' [default: /extdata/last-1542/bin/]
#' [default: /extdata/last-1595/bin/]
#' @param lastD last option D: query letters per random alignment
#' [default: 1e6]
#' @param mmseqs2path specify the PATH to the mmseqs2 binaries
Expand Down Expand Up @@ -53,7 +55,11 @@
#' [default: 0.1]
#' @param fit.min specify minimum neighborhood alignment length [default: 5]
#' @param threads number of parallel threads [default: 1]
#' @param aafile2tmp specify if aa input files sequenceIDs should be reduced
#' to the first word and be written to a temporary aa file
#' [default: TRUE]
#' @param remove specify if last result files should be removed [default: TRUE]
#' @param remove.db specify if last db files should be removed [default: TRUE]
#' @return List of three (crbh=FALSE)\cr
#' 1: $crbh.pairs\cr
#' 2: $crbh1 matrix; query > target\cr
Expand Down Expand Up @@ -81,7 +87,7 @@
#' @references Rost B. (1999). Twilight zone of protein sequence alignments.
#' \emph{Protein Engineering}, \bold{12(2)}, 85-94.
#' @examples
#' ## compile last-1542 within CRBHits
#' ## compile last-1595 within CRBHits
#' CRBHits::make_last()
#' ## load example sequence data
#' athfile <- system.file("fasta", "ath.aa.fasta.gz", package="CRBHits")
Expand All @@ -108,9 +114,11 @@
#' @author Kristian K Ullrich

aafile2rbh <- function(aafile1, aafile2,
dbfile1=NULL,
dbfile2=NULL,
searchtool="last",
lastpath=paste0(find.package("CRBHits"),
"/extdata/last-1542/bin/"),
"/extdata/last-1595/bin/"),
lastD=1e6,
mmseqs2path=NULL,
mmseqs2sensitivity=5.7,
Expand All @@ -132,7 +140,9 @@ aafile2rbh <- function(aafile1, aafile2,
fit.varweight=0.1,
fit.min=5,
threads=1,
remove=TRUE
aafile2tmp=TRUE,
remove=TRUE,
remove.db=TRUE
){
#internal function to fit evalue by length
fitSpline <- function(alnlength, evalue, fit.type, fit.varweight, fit.min){
Expand Down Expand Up @@ -218,10 +228,26 @@ aafile2rbh <- function(aafile1, aafile2,
if(aafile1==aafile2){
selfblast <- TRUE
}
aa1file <- tempfile("aa1_", outpath)
aa2file <- tempfile("aa2_", outpath)
aa1file <- aafile1
aa2file <- aafile2
if(aafile2tmp){
aa1file <- tempfile("aa1_", outpath)
aa2file <- tempfile("aa2_", outpath)
aa1 <- Biostrings::readAAStringSet(aafile1)
aa2 <- Biostrings::readAAStringSet(aafile2)
names(aa1) <- stringr::str_split_fixed(names(aa1), " ", 2)[, 1]
names(aa2) <- stringr::str_split_fixed(names(aa2), " ", 2)[, 1]
Biostrings::writeXStringSet(aa1, file=aa1file)
Biostrings::writeXStringSet(aa2, file=aa2file)
}
aa1dbfile <- tempfile("aa1db_", outpath)
aa2dbfile <- tempfile("aa2db_", outpath)
if(!is.null(dbfile1)){
aa1dbfile <- dbfile1
}
if(!is.null(dbfile2)){
aa2dbfile <- dbfile2
}
if(searchtool=="last"){
aa2_aa1_lastout <- tempfile("aa2_aa1_lastout_", outpath)
aa1_aa2_lastout <- tempfile("aa1_aa2_lastout_", outpath)
Expand All @@ -234,17 +260,15 @@ aafile2rbh <- function(aafile1, aafile2,
aa2_aa1_lastout <- tempfile("aa2_aa1_diamond_", outpath)
aa1_aa2_lastout <- tempfile("aa1_aa2_diamond_", outpath)
}
aa1 <- Biostrings::readAAStringSet(aafile1)
aa2 <- Biostrings::readAAStringSet(aafile2)
names(aa1) <- stringr::str_split_fixed(names(aa1), " ", 2)[, 1]
names(aa2) <- stringr::str_split_fixed(names(aa2), " ", 2)[, 1]
Biostrings::writeXStringSet(aa1, file=aa1file)
Biostrings::writeXStringSet(aa2, file=aa2file)
if(searchtool=="last"){
system2(command=paste0(lastpath, "lastdb"),
args = c("-p", "-cR01", "-P", threads, aa1dbfile, aa1file))
system2(command=paste0(lastpath, "lastdb"),
args = c("-p", "-cR01", "-P", threads, aa2dbfile, aa2file))
if(!file.exists(paste0(aa1dbfile, ".suf"))){
system2(command=paste0(lastpath, "lastdb"),
args = c("-p", "-cR01", "-P", threads, aa1dbfile, aa1file))
}
if(!file.exists(paste0(aa2dbfile, ".suf"))){
system2(command=paste0(lastpath, "lastdb"),
args = c("-p", "-cR01", "-P", threads, aa2dbfile, aa2file))
}
system2(command=paste0(lastpath, "lastal"),
args = c("-f", "BlastTab+", "-P", threads, "-D", lastD, aa1dbfile,
aa2file, ">", aa2_aa1_lastout))
Expand All @@ -267,12 +291,16 @@ aafile2rbh <- function(aafile1, aafile2,
"tlen,raw")))
}
if(searchtool=="diamond"){
system2(command=paste0(diamondpath, "diamond"),
args = c("makedb", "--in", aa1file,
"-d", aa1dbfile))
system2(command=paste0(diamondpath, "diamond"),
args = c("makedb", "--in", aa2file,
"-d", aa2dbfile))
if(!file.exists(paste0(aa1dbfile, ".dmnd"))){
system2(command=paste0(diamondpath, "diamond"),
args = c("makedb", "--in", aa1file,
"-d", aa1dbfile))
}
if(!file.exists(paste0(aa2dbfile, ".dmnd"))){
system2(command=paste0(diamondpath, "diamond"),
args = c("makedb", "--in", aa2file,
"-d", aa2dbfile))
}
system2(command=paste0(diamondpath, "diamond"),
args = c("blastp", "--ignore-warnings", "-d", aa2dbfile,
"-q", aa1file, "-o", aa1_aa2_lastout, diamondsensitivity,
Expand Down Expand Up @@ -301,13 +329,17 @@ aafile2rbh <- function(aafile1, aafile2,
aa2_aa1[, "perc_identity"] <- aa2_aa1[, "perc_identity"] * 100
}
if(remove){
system2(command="rm", args = aa1file)
system2(command="rm", args = aa2file)
system2(command="rm", args = paste0(aa1dbfile, "*"))
system2(command="rm", args = paste0(aa2dbfile, "*"))
if(aafile2tmp){
system2(command="rm", args = aa1file)
system2(command="rm", args = aa2file)
}
system2(command="rm", args = aa2_aa1_lastout)
system2(command="rm", args = aa1_aa2_lastout)
}
if(remove.db){
system2(command="rm", args = paste0(aa1dbfile, "*"))
system2(command="rm", args = paste0(aa2dbfile, "*"))
}
#selfblast
if(selfblast){
aa1_aa2 <- aa1_aa2[aa1_aa2[,1]!=aa1_aa2[,2], , drop=FALSE]
Expand Down
Loading

0 comments on commit 75a9ea7

Please sign in to comment.