From ead8b13d58b6b02e931296c795e82f715f4bdc10 Mon Sep 17 00:00:00 2001
From: npcooley <npcooley@gmail.com>
Date: Fri, 9 Aug 2024 11:07:01 -0400
Subject: [PATCH] small edits and man page fixes

---
 R/ExpandDiagonal.R    | 10 +++++-----
 R/SummarizePairs.R    |  6 ++++--
 man/ExpandDiagonal.Rd | 41 +++++++++++++++++++++++------------------
 man/PrepareSeqs.Rd    | 23 ++++++++++-------------
 man/SummarizePairs.Rd | 36 ++++++++++++++++++++++--------------
 man/SuperTree.Rd      |  2 +-
 6 files changed, 65 insertions(+), 53 deletions(-)

diff --git a/R/ExpandDiagonal.R b/R/ExpandDiagonal.R
index 8aa6afe..fcbc068 100644
--- a/R/ExpandDiagonal.R
+++ b/R/ExpandDiagonal.R
@@ -3,8 +3,8 @@
 # contact: npc19@pitt.edu / npcooley@gmail.com
 
 ExpandDiagonal <- function(SynExtendObject,
-                           DataBase,
-                           InheritConfidence = TRUE,
+                           DataBase01,
+                           InheritConfidence = FALSE,
                            GapTolerance = 100L,
                            DropSingletons = FALSE,
                            UserConfidence = list("PID" = 0.3),
@@ -20,7 +20,7 @@ ExpandDiagonal <- function(SynExtendObject,
     stop ("SynExtendObject must be an object of class 'PairSummaries'.")
   }
   # check DBPATH first
-  if (is.character(DataBase)) {
+  if (is.character(DataBase01)) {
     if (!requireNamespace(package = "RSQLite",
                           quietly = TRUE)) {
       stop("Package 'RSQLite' must be installed.")
@@ -29,10 +29,10 @@ ExpandDiagonal <- function(SynExtendObject,
       print("Eventually character vector access to DECIPHER DBs will be deprecated.")
       require(RSQLite, quietly = TRUE)
     }
-    dbConn <- dbConnect(dbDriver("SQLite"), DataBase)
+    dbConn <- dbConnect(dbDriver("SQLite"), DataBase01)
     on.exit(dbDisconnect(dbConn))
   } else {
-    dbConn <- DataBase
+    dbConn <- DataBase01
     if (!dbIsValid(dbConn)) {
       stop("The connection has expired.")
     }
diff --git a/R/SummarizePairs.R b/R/SummarizePairs.R
index 48fb7ef..abae534 100644
--- a/R/SummarizePairs.R
+++ b/R/SummarizePairs.R
@@ -421,6 +421,7 @@ SummarizePairs <- function(SynExtendObject,
           DataPool[[m1]]$len <- width(DataPool[[m1]]$DNA)
           DataPool[[m1]]$mod <- DataPool[[m1]]$len %% 3L == 0
           DataPool[[m1]]$code <- GeneCalls[[m1]]$Coding
+          DataPool[[m1]]$cds <- lengths(GeneCalls[[m1]]$Range)
           # DBQUERY <- paste("select len, mod, code, cds from NTs where identifier is",
           #                  ObjectIDs[m1])
           # DBOUT <- dbGetQuery(conn = dbConn,
@@ -459,6 +460,7 @@ SummarizePairs <- function(SynExtendObject,
           DataPool[[m2]]$len <- width(DataPool[[m2]]$DNA)
           DataPool[[m2]]$mod <- DataPool[[m2]]$len %% 3L == 0
           DataPool[[m2]]$code <- GeneCalls[[m2]]$Coding
+          DataPool[[m2]]$cds <- lengths(GeneCalls[[m2]]$Range)
           # DBQUERY <- paste("select len, mod, code, cds from NTs where identifier is",
           #                  ObjectIDs[m2])
           # DBOUT <- dbGetQuery(conn = dbConn,
@@ -486,7 +488,7 @@ SummarizePairs <- function(SynExtendObject,
           QNTCount <- DataPool[[m1]]$len
           QMod <- DataPool[[m1]]$mod
           QCode <- DataPool[[m1]]$code
-          # QCDSCount <- DataPool[[m1]]$cds
+          QCDSCount <- DataPool[[m1]]$cds
           QueryStruct <- DataPool[[m1]]$struct
         } else {
           # do something else?
@@ -497,7 +499,7 @@ SummarizePairs <- function(SynExtendObject,
         SNTCount <- DataPool[[m2]]$len
         SMod <- DataPool[[m2]]$mod
         SCode <- DataPool[[m2]]$code
-        # SCDSCount <- DataPool[[m2]]$cds
+        SCDSCount <- DataPool[[m2]]$cds
         SubjectStruct <- DataPool[[m2]]$struct
         
         # align everyone as AAs who can be, i.e. modulo of 3, is coding, etc
diff --git a/man/ExpandDiagonal.Rd b/man/ExpandDiagonal.Rd
index ce6afef..2ccf4d5 100644
--- a/man/ExpandDiagonal.Rd
+++ b/man/ExpandDiagonal.Rd
@@ -8,9 +8,8 @@ Attempt to expand blocks of paired features in a \code{PairSummaries} object.
 }
 \usage{
 ExpandDiagonal(SynExtendObject,
-               FeatureSeqs,
-               DataBase,
-               InheritConfidence = TRUE,
+               DataBase01,
+               InheritConfidence = FALSE,
                GapTolerance = 100L,
                DropSingletons = FALSE,
                UserConfidence = list("PID" = 0.3),
@@ -20,10 +19,7 @@ ExpandDiagonal(SynExtendObject,
   \item{SynExtendObject}{
 An object of class \code{PairSummaries}.
 }
-  \item{FeatureSeqs}{
-An object of class \code{FeatureSeqs}.
-}
-  \item{DataBase}{
+  \item{DataBase01}{
 A character string pointing to a SQLite database, or a connection to a \code{DECIPHER} database.
 }
   \item{InheritConfidence}{
@@ -36,7 +32,7 @@ Integer value indicating the \code{diff} between feature IDs that can be tolerat
 Ignore solo pairs when planning expansion routes. Set to \code{FALSE} by default.
 }
   \item{UserConfidence}{
-A named list of length 1 where the name identifies a column of the \code{PairSummaries} object, and the value identifies a user confidence. Every k-means cluster with a center value of the column value selected greater than the confidence is retained.
+A named list of length 1 where the name identifies a column of the \code{PairSummaries} object, and the value identifies a user confidence. To be retained, a pair evaluated for expansion must be above all user specified confidences.
 }
   \item{Verbose}{
 Logical indicating whether or not to display a progress bar and print the time difference upon completion.
@@ -56,18 +52,27 @@ Nicholas Cooley \email{npc19@pitt.edu}
 \code{\link{PairSummaries}}, \code{\link{NucleotideOverlap}}, \code{link{SubSetPairs}}, \code{\link{FindSynteny}}
 }
 \examples{
+library(RSQLite)
 DBPATH <- system.file("extdata",
                       "Endosymbionts_v02.sqlite",
                       package = "SynExtend")
-
-data("Endosymbionts_LinkedFeatures", package = "SynExtend")
-Endosymbiont_Seqs <- PrepareSeqs(SynExtendObject = Endosymbionts_LinkedFeatures,
-                                 DataBase = DBPATH,
-                                 Verbose = TRUE)
+tmp <- tempfile()
+file.copy(from = DBPATH,
+          to = tmp,
+          overwrite = TRUE)
+DBCONN <- dbConnect(SQLite(), tmp)
                       
-data("Endosymbionts_Pairs02", package = "SynExtend")
-Pairs03 <- ExpandDiagonal(SynExtendObject = Endosymbionts_Pairs02,
-                          DataBase = DBPATH,
-                          FeatureSeqs = Endosymbiont_Seqs,
-                          Verbose = TRUE)
+data("Endosymbionts_LinkedFeatures", package = "SynExtend")
+PrepareSeqs(SynExtendObject = Endosymbionts_LinkedFeatures,
+            DataBase01 = DBCONN,
+            Verbose = TRUE)
+SummarizedPairs <- SummarizePairs(SynExtendObject = Endosymbionts_LinkedFeatures,
+                                  DataBase01 = DBCONN,
+                                  Verbose = TRUE)
+ExpandedPairs <- ExpandDiagonal(SynExtendObject = SummarizedPairs,
+                                DataBase01 = DBCONN,
+                                Verbose = TRUE)
+dbDisconnect(DBCONN)
+unlink(tmp)
+
 }
diff --git a/man/PrepareSeqs.Rd b/man/PrepareSeqs.Rd
index 1eb3f24..70dc853 100644
--- a/man/PrepareSeqs.Rd
+++ b/man/PrepareSeqs.Rd
@@ -1,24 +1,23 @@
 \name{PrepareSeqs}
 \alias{PrepareSeqs}
 \title{
-Return gene sequences.
+Add feature sequences to DECIPHER databases.
 }
 \description{
-Given a \code{SynExtend} object with a \code{GeneCalls} attribute, and a \code{DECIPHER} database, return all gene sequences and their translations.
+Given a \code{SynExtend} object with a \code{GeneCalls} attribute, and a \code{DECIPHER} database, add sequence tables named 'AAs' and 'NTs' to the database. The new tables contain all translatable sequences indicated by the genecalls, and all nucleotide feature sequences.
 }
 \usage{
 PrepareSeqs(SynExtendObject,
-            DataBase,
+            DataBase01,
             DefaultTranslationTable = "11",
             Identifiers = NULL,
-            Storage = 1,
             Verbose = FALSE)
 }
 \arguments{
   \item{SynExtendObject}{
 An object of class \code{PairSummaries} or of \code{LinkedPairs}. Object must have a \code{GeneCalls} attribute.
 }
-  \item{DataBase}{
+  \item{DataBase01}{
 A character string pointing to a SQLite database, or a connection to a \code{DECIPHER} database.
 }
   \item{DefaultTranslationTable}{
@@ -26,19 +25,16 @@ A character vector of length 1 identifying the translation table to use if one i
 }
   \item{Identifiers}{
 By default \code{NULL}, but can be used to supply a vector of character identifiers for returning a subset of prepared sequences.
-}
-  \item{Storage}{
-A soft memory limit for how much space to allow when building the resulting object. Translated to Gb.
 }
   \item{Verbose}{
 Logical indicating whether or not to display a progress bar and print the time difference upon completion.
 }
 }
 \details{
-\code{PrepareSeqs} returns the sequences of genes and their translations where appropriate.
+\code{PrepareSeqs} adds two tables to a DECIPHER database: one named 'AAs' that contains all translatable features, i.e. features that are designated as coding and have a coding length divisible by 3, and another named 'NTs' that contains all features.
 }
 \value{
-An object of class \code{FeatureSeqs}.
+An integer count of the number of feature sets added to the DECIPHER database.
 }
 \author{
 Nicholas Cooley \email{npc19@pitt.edu}
@@ -53,7 +49,8 @@ DBPATH <- system.file("extdata",
                       package = "SynExtend")
                       
 data("Endosymbionts_LinkedFeatures", package = "SynExtend")
-CurrentSeqs <- PrepareSeqs(SynExtendObject = Endosymbionts_LinkedFeatures,
-                           DataBase = DBPATH,
-                           Verbose = TRUE)
+# this will add seqs to the DB
+# PrepareSeqs(SynExtendObject = Endosymbionts_LinkedFeatures,
+#             DataBase01 = DBPATH,
+#             Verbose = TRUE)
 }
diff --git a/man/SummarizePairs.Rd b/man/SummarizePairs.Rd
index 94017de..49cc684 100644
--- a/man/SummarizePairs.Rd
+++ b/man/SummarizePairs.Rd
@@ -4,12 +4,11 @@
 Provide summaries of hypothetical orthologs.
 }
 \description{
-Given the correct set of \code{SynExtend} objects and a \code{DECIPHER} database, return a data.frame of summarized genomic feature pairs. \code{SummarizePairs} will collect all the linked genomic features in the supplied \code{\link{LinkedPairs-class}} object and return descriptions of the alignments of those features.
+Given a \code{LinkedPairs} object and a \code{DECIPHER} database, return a data.frame of summarized genomic feature pairs. \code{SummarizePairs} will collect all the linked genomic features in the supplied \code{\link{LinkedPairs-class}} object and return descriptions of the alignments of those features.
 }
 \usage{
 SummarizePairs(SynExtendObject,
-               FeatureSeqs,
-               DataBase,
+               DataBase01,
                AlignmentFun = "AlignProfiles",
                RetainAnchors = FALSE,
                DefaultTranslationTable = "11",
@@ -18,16 +17,14 @@ SummarizePairs(SynExtendObject,
                Verbose = FALSE,
                ShowPlot = FALSE,
                Processors = 1,
+               Storage = 2,
                ...)
 }
 \arguments{
   \item{SynExtendObject}{
 An object of class \code{LinkedPairs-class}.
 }
-  \item{FeatureSeqs}{
-An object of class \code{FeatureSeqs}.
-}
-  \item{DataBase}{
+  \item{DataBase01}{
 A character string pointing to a SQLite database, or a connection to a \code{DECIPHER} database.
 }
   \item{AlignmentFun}{
@@ -43,16 +40,19 @@ A character vector of length 1 identifying the translation table to use if one i
 An integer specifying what Kmer size to collect Kmer distance between sequences at.
 }
   \item{IgnoreDefaultStringSet}{
-A soft memory limit for how much space to allow when building the resulting object. Translated to Gb.
+Translate all sequences in nucleotide space.
 }
   \item{Verbose}{
 Logical indicating whether or not to display a progress bar and print the time difference upon completion.
 }
   \item{ShowPlot}{
-Logical indicating whether or not to provide a plot of features collected by the function.
+Logical indicating whether or not to provide a plot of features collected by the function. Currently not implemented.
 }
   \item{Processors}{
 An integer value indicating how many processors to supply to \code{\link{AlignPairs}}.
+}
+  \item{Storage}{
+A soft memory limit for how much sequence data from the database to retain in memory while running. In Gb.
 }
   \item{...}{
 Additional arguments to pass to interior functions. Currently not implemented.
@@ -72,16 +72,24 @@ Nicholas Cooley \email{npc19@pitt.edu}
 \code{\link{PrepareSeqs}}, \code{\link{NucleotideOverlap}}, \code{\link{FindSynteny}}, \code{\link{LinkedPairs-class}}
 }
 \examples{
+library(RSQLite)
 DBPATH <- system.file("extdata",
                       "Endosymbionts_v02.sqlite",
                       package = "SynExtend")
+tmp <- tempfile()
+file.copy(from = DBPATH,
+          to = tmp,
+          overwrite = TRUE)
+DBCONN <- dbConnect(SQLite(), tmp)
                       
 data("Endosymbionts_LinkedFeatures", package = "SynExtend")
-Endosymbiont_Seqs <- PrepareSeqs(SynExtendObject = Endosymbionts_LinkedFeatures,
-                                 DataBase = DBPATH,
-                                 Verbose = TRUE)
+PrepareSeqs(SynExtendObject = Endosymbionts_LinkedFeatures,
+            DataBase01 = DBCONN,
+            Verbose = TRUE)
 SummarizedPairs <- SummarizePairs(SynExtendObject = Endosymbionts_LinkedFeatures,
-                                  FeatureSeqs = Endosymbiont_Seqs,
-                                  DataBase = DBPATH)
+                                  DataBase01 = DBCONN,
+                                  Verbose = TRUE)
+dbDisconnect(DBCONN)
+unlink(tmp)
                            
 }
diff --git a/man/SuperTree.Rd b/man/SuperTree.Rd
index 0e7998d..db14c44 100644
--- a/man/SuperTree.Rd
+++ b/man/SuperTree.Rd
@@ -70,7 +70,7 @@ data("SuperTreeEx", package="SynExtend")
 # Notice that the labels of the tree are in #_#_# format
 # See the man page for SuperTreeEx for more info
 labs <- labels(exData[[1]])
-if(interative()) print(labs)
+if(interactive()) print(labs)
 
 # The first number corresponds to the species,
 # so we need to trim the rest in each leaf label