Merge pull request #4 from josuechinchilla/auto_tests

Auto tests
josuechinchilla · Aug 7, 2024 · 4df1ddc · 4df1ddc
2 parents 3fc1906 + 945f354
commit 4df1ddc
Show file tree

Hide file tree

Showing 15 changed files with 451 additions and 181 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,9 @@
-.DS_store
-.Rproj.user
-BIGf90.Rproj
+.DS_store
+.Rproj.user
+.Rbuildignore
+BIGf90.Rproj
+inst/exec_mac
+inst/exec_windows/
+inst/data
+tests/
+.Rhistory
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: BIGf90
 Title: R face front for running K-fold crossvalidation, estimating ebvs and variance component estimation with Blupf90 modules
-Version: 0.3.0
+Version: 0.3.1
 Authors@R: c(person("Josue", "Chinchilla-Vargas", role = c("aut", "cre"), email = "jc3635@cornell.edu"), 
              person("Alexander", "Sandercock", role = "aut"),
              person("Breeding Insight Team", role = "ctb"))
@@ -15,6 +15,8 @@ License: Apache License (== 2.0)
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.3.2
+Suggests:
+    testthat
 Imports: 
     base (>= 4.3.1),
     dplyr (>= 1.1.4),

diff --git a/NEWS.md b/NEWS.md
@@ -0,0 +1,11 @@
+# BIGf90 0.3.1
+
+* Users now can have executable files, input files, and output files in different locations
+* Warnings were added in case of run_gibbs and run_postgibbs functions that requires inputs to be together with renum results
+* Code changed to always use global path even if it is referring to working directory
+* Verbose argument added to give option to print or not information on screen while running
+* Default values added
+* Output directories need to exist. The function will stop if they don´t
+* Automatic installation checks included
+* NEWS file included
+* GitHub example
diff --git a/R/bf90_cv.R b/R/bf90_cv.R
@@ -6,7 +6,6 @@
 #' The function run_renumf90 needs to be used beforehand to process a .par file created by the user.
 #' This function calculates 2 accuracy estimates: correlation between raw phenotypes and ebvs divided by the square-root of narrow sense heritabilty and correlation between corrected phenotypes and ebvs along with bias estimations calculated as the regression of the phenotypes on the ebvs.
 #'
-#' @param path_2_execs path to a folder that holds all blupf90 executables that ill be used (blupf90+,predictf90). This field should be in quotes "".
 #' @param missing_value_code code used in the .par file after OPTION MISSING to indicate missing phenotype, if this option is no use, this value must be 0.
 #' @param random_effect_col Column where random effect is located, found under RANDOM_GROUP in the renf90.par file.
 #' @param h2 estimate of narrow-sense heritabilty.This value is use to calculate accuracy of ebvs
@@ -15,6 +14,12 @@
 #' @param output_table_name Name of the final tab-separated out-up file. This field should be in quotes "".
 #' @param renf90_ped_name Name of pedigree file generated by renumf90. This field should be in quotes "".
 #' @param snp_file_name the name of the genotype file to be used. If use, this field should be in quotes"". Default for the function is no genotype file.
+#' @param path_2_execs path to a folder that holds all blupf90 executables that ill be used (blupf90+,predictf90). This field should be in quotes "".
+#' @param input_files_dir directory containing files renf90.par renf90.fields renf90.inb renf90.tables renf90.dat
+#' @param output_files_dir path to directory to store the output files
+#' @param seed set seed for the stochastic process
+#' @param verbose logical defining if information will be printed on the console
+#'
 #' @return a tab-separated file that includes accuracy and bias estimates of ebvs.
 #' @import dplyr
 #' @examples
@@ -31,45 +36,89 @@
 #' #     renf90_ped_name = "renadd03.ped",
 #' #     snp_file_name = "my_genos.geno" )
 #'
-
+#'
 #' @export
-bf90_cv <- function(path_2_execs, missing_value_code, random_effect_col, h2, num_runs, num_folds, output_table_name, renf90_ped_name, snp_file_name = NULL) {
-  # set seed for reproducibility
-  base::set.seed(101919)
+bf90_cv <- function(missing_value_code = NULL,
+                    random_effect_col = NULL,
+                    h2 = NULL,
+                    num_runs = NULL,
+                    num_folds = NULL,
+                    path_2_execs = ".",
+                    input_files_dir = ".",
+                    output_files_dir = ".",
+                    output_table_name = NULL,
+                    renf90_ped_name= NULL,
+                    snp_file_name = NULL,
+                    seed = 101919,
+                    verbose = TRUE) {
 
-  # define working directory
-  wd_path <- base::getwd()
-  # set base_path for results folder
-  base_path <- base::paste0(base::getwd(), "/bf90_cv_results/")
-
-  # Function to run commands on the terminal and log output
-  execute_command <- function(command, logfile) {
-    if (.Platform$OS.type == "unix") {
-      output <- system(paste(command, "2>&1 | tee -a", logfile), intern = TRUE)
-    } else if (.Platform$OS.type == "windows") {
-      output <- system(paste("cmd /c", command, ">", logfile, "2>&1"), intern = TRUE)
-    } else {
-      stop("Unsupported OS type")
-    }
-    return(output)
-  }
 
   #check id pedigree and genotype file are present
+  if(is.null(renf90_ped_name)) stop("Specify renf90_ped_name")
 
   if (!file.exists(renf90_ped_name)) {
     stop("Parameter file not found at: ", renf90_ped_name)
   }
+  renf90_ped_name <- normalizePath(renf90_ped_name)
 
-  if (!file.exists("renf90.par")) {
-    stop("Parameter file not found at: ", "renf90.par")
-  }
+  input_files_dir <- normalizePath(input_files_dir)
+
+  if(is.null(missing_value_code)) stop("Specify missing_value_code")
+  if(is.null(random_effect_col)) stop("Specify random_effect_col")
+  if(is.null(h2)) stop("Specify h2")
+  if(is.null(num_runs)) stop("Specify num_runs")
+  if(is.null(num_folds)) stop("Specify num_folds")
+
+  if (!file.exists(file.path(input_files_dir,"renf90.par"))) stop("File 'renf90.par' not found at: ", input_files_dir)
+  if (!file.exists(file.path(input_files_dir,"renf90.inb"))) stop("File 'renf90.inb' not found at: ", input_files_dir)
+  if (!file.exists(file.path(input_files_dir,"renf90.fields"))) stop("File 'renf90.fields' not found at: ", input_files_dir)
+  if (!file.exists(file.path(input_files_dir,"renf90.tables"))) stop("File 'renf90.tables' not found at: ", input_files_dir)
+  if (!file.exists(file.path(input_files_dir,"renf90.dat"))) stop("File 'renf90.dat' not found at: ", input_files_dir)
 
   if (!is.null(snp_file_name)) {
     if (!file.exists(snp_file_name)) {
       stop("Parameter file not found at:", snp_file_name)
+      snp_file_name <- normalizePath(snp_file_name)
     }
   }
 
+  if(is.null(output_table_name)) stop("Define output table name.")
+
+  if (!file.exists(renf90_ped_name)) {
+    stop("Parameter file not found at: ", renf90_ped_name)
+  }
+
+  # set seed for reproducibility
+  base::set.seed(seed)
+
+  # Checks
+  output_files_dir <- normalizePath(output_files_dir)
+  if(file.exists(output_files_dir)){
+    check_files <- list.files(output_files_dir)
+    if(length(check_files) > 0) warning(paste("Directory", output_files_dir, "is not empty. Some files may be replaced."))
+  } else {
+    stop(paste("Directory", output_files_dir, "does not exist. Create it before running the function."))
+  }
+
+  path_2_execs <- normalizePath(path_2_execs)
+
+  # Inform parameters and directories set
+  if(verbose){
+    cat("Parameters set:\n",
+        "  missing_value_code = ", missing_value_code,"\n",
+        "  random_effect_col = ", random_effect_col,"\n",
+        "  h2 = ",h2 ,"\n",
+        "  num_runs = ", num_runs,"\n",
+        "  num_folds =", num_folds,"\n",
+        "Directories:\n",
+        "  input files:", input_files_dir, "\n",
+        "  output files:", output_files_dir, "\n",
+        "  executable files:", path_2_execs)
+  }
+
+  # define working directory
+  wd_path <- base::getwd()
+
   #Assign .exes or not based on OS
   if (.Platform$OS.type == "unix") {
     predict = "predictf90"
@@ -80,78 +129,39 @@ bf90_cv <- function(path_2_execs, missing_value_code, random_effect_col, h2, num
   }
 
   # Run BF90 programs for the whole dataset
-  output <- execute_command(command = paste0(path_2_execs, blup, " renf90.par"), logfile = "run_blup.log")
-
-  command_predict <- paste0(path_2_execs, predict, " renf90.par")
+  setwd(input_files_dir)
+  output <- execute_command(command = paste0(file.path(path_2_execs, blup)," ", "renf90.par"), logfile = "run_blup.log")
+  command_predict <- paste0(file.path(path_2_execs, predict), " ", paste0("renf90.par"))
   output <- execute_command(command = command_predict, logfile = "run_predict.log")
 
+  files_res <- c("run_blup.log", "bvs.dat", "bvs2.dat", "yhat_residual", "solutions")
+  for(i in 1:length(files_res)) file.rename(from = files_res[i], to = file.path(output_files_dir,files_res[i]))
 
   # Prepare files for each BLUP run
-  renf90 <- base::readLines("renf90.par")
-  ped_file <- renf90_ped_name
-  fields_file <- base::readLines("renf90.fields")
-  inb_file <- base::readLines("renf90.inb")
-  tables_file <- base::readLines("renf90.tables")
+  renf90 <- base::readLines(paste0("renf90.par"))
+  #ped_file <- dirname(renf90_ped_name)
+  fields_file <- base::readLines(paste0("renf90.fields"))
+  inb_file <- base::readLines(paste0("renf90.inb"))
+  tables_file <- base::readLines(paste0("renf90.tables"))
 
   # Read and preprocess the phenotype data
-  bf90_phenos <- utils::read.table("renf90.dat", sep = " ", header = FALSE) %>%
+  bf90_phenos <- utils::read.table(paste0("renf90.dat"), sep = " ", header = FALSE) %>%
     dplyr::select(-1)
 
   # Shuffle the data and create folds
   data_shuffled <- base::lapply(1:num_runs, function(x) bf90_phenos[base::sample(base::nrow(bf90_phenos)), ] %>%
                                   dplyr::select((random_effect_col + 1)))
-  create_folds <- function(data) {
-    n <- base::nrow(data)
-    fold_size <- n %/% num_folds
-    folds <- base::vector("list", num_folds)
-    for (i in 1:num_folds) {
-      start_index <- (i - 1) * fold_size + 1
-      end_index <- if (i == num_folds) n else i * fold_size
-      folds[[i]] <- data[start_index:end_index, ]
-    }
-    folds
-  }
-  folds <- base::lapply(data_shuffled, create_folds)
-
-  # Function to mutate phenotypes to missing value for given folds
-  mutate_folds <- function(phenos, folds) {
-    base::lapply(1:num_folds, function(i) {
-      phenos %>%
-        dplyr::mutate(V2 = base::ifelse(V5 %in% folds[[i]], missing_value_code, V2))
-    })
-  }
-  mutated_data <- base::lapply(folds, function(f) mutate_folds(bf90_phenos, f))
-
-  # Create cross-validation datasets
-  create_cv_datasets <- function(run, fold, data_frame, dir_path) {
-    file_name <- base::sprintf("renf90_run%d_fold%d.dat", run, fold)
-    utils::write.table(data_frame, file = file_name, sep = " ", row.names = FALSE, col.names = FALSE, quote = FALSE)
-
-    if (!base::file.exists(dir_path)) {
-      base::dir.create(dir_path, recursive = TRUE)
-    }
-    base::file.rename(file_name, base::file.path(dir_path, file_name))
-
-    modified_content <- base::gsub("renf90.dat", base::sprintf("renf90_run%d_fold%d.dat", run, fold), renf90)
-    base::writeLines(modified_content, base::file.path(dir_path, base::sprintf("renf90_run%d_fold%d.par", run, fold)))
 
-    # Create symbolic links instead of copying files
-    file.symlink(base::file.path(wd_path, renf90_ped_name), base::file.path(dir_path, renf90_ped_name))
-    file.symlink(base::file.path(wd_path, "renf90.fields"), base::file.path(dir_path, "renf90.fields"))
-    file.symlink(base::file.path(wd_path, "renf90.inb"), base::file.path(dir_path, "renf90.inb"))
-    file.symlink(base::file.path(wd_path, "renf90.tables"), base::file.path(dir_path, "renf90.tables"))
-
-    if (!is.null(snp_file_name)) {
-      Xref_file <- paste0(snp_file_name, "_XrefID")
-      file.symlink(base::file.path(wd_path, snp_file_name), base::file.path(dir_path, snp_file_name))
-      file.symlink(base::file.path(wd_path, Xref_file), base::file.path(dir_path, Xref_file))
-    }
-  }
+  folds <- base::lapply(data_shuffled, function(x) create_folds(x, num_folds))
+  mutated_data <- base::lapply(folds, function(f) mutate_folds(bf90_phenos, f, num_folds,missing_value_code))
 
+  setwd(output_files_dir)
   for (run in 1:num_runs) {
     for (fold in 1:num_folds) {
-      dir_path <- base::sprintf("bf90_cv_results/run%d/fold%d", run, fold)
-      create_cv_datasets(run, fold, mutated_data[[run]][[fold]], dir_path)
+      dir_path <- base::sprintf("run%d/fold%d", run, fold)
+      create_cv_datasets(run, fold, mutated_data[[run]][[fold]],
+                         dir_path, renf90, renf90_ped_name,
+                         input_files_dir, snp_file_name)
     }
   }
 
@@ -160,8 +170,8 @@ bf90_cv <- function(path_2_execs, missing_value_code, random_effect_col, h2, num
   for (run in 1:num_runs) {
     ebvs_for_cv_list <- base::list()
     for (fold in 1:num_folds) {
-      base::setwd(base::file.path(base_path, base::sprintf("run%d/fold%d", run, fold)))
-      command <- base::paste0(path_2_execs, blup, base::sprintf(" renf90_run%d_fold%d.par", run, fold))
+      base::setwd(base::file.path(output_files_dir, base::sprintf("run%d/fold%d", run, fold)))
+      command <- base::paste0(file.path(path_2_execs, blup), base::sprintf(" renf90_run%d_fold%d.par", run, fold))
       logfile <- base::sprintf("blup_fold%d_run%d.log", fold, run)
       execute_command(command = command, logfile = logfile)
 
@@ -177,14 +187,15 @@ bf90_cv <- function(path_2_execs, missing_value_code, random_effect_col, h2, num
 
       ebvs_for_cv_list[[fold]] <- ebvs_for_cv
       utils::write.table(ebvs_for_cv, file = base::sprintf("ebvs_for_cv_run%d_fold%d.dat", run, fold), row.names = FALSE, col.names = TRUE, quote = FALSE)
-      base::setwd(wd_path)
+
     }
     ebvs_for_cv_runs[[base::sprintf("ebvs_for_cv_run%d", run)]] <- base::do.call(base::rbind, ebvs_for_cv_list)
   }
+  base::setwd(output_files_dir)
 
   # Calculate correlations and bias
-  corrected_phenos <- utils::read.table("yhat_residual", header = FALSE) %>% dplyr::select(1, 2)
-  raw_phenos <- utils::read.table("renf90.dat") %>% dplyr::select(4, 1)
+  corrected_phenos <- utils::read.table(paste0(output_files_dir,"/yhat_residual"), header = FALSE) %>% dplyr::select(1, 2)
+  raw_phenos <- utils::read.table(paste0(input_files_dir,"/renf90.dat")) %>% dplyr::select(4, 1)
 
   ystar_correlations <- base::numeric(num_runs)
   yraw_correlations <- base::numeric(num_runs)
@@ -215,9 +226,10 @@ bf90_cv <- function(path_2_execs, missing_value_code, random_effect_col, h2, num
     Run = base::c(base::paste("run", 1:num_runs), base::paste("run", 1:num_runs), base::paste("run", 1:num_runs), "", "", "", ""),
     Value = base::round(base::c(yraw_correlations, ystar_correlations, bias_list, yraw_average_corr, ystar_accuracy, yraw_accuracy, average_bias), 3)
   )
-  base::print (base::paste0("****Accuracy and bias table****" ))
+  if(verbose) base::print (base::paste0("****Accuracy and bias table****" ))
 
   utils::write.table(data, file = output_table_name, row.names = FALSE, quote = FALSE)
 
-  base::print(data)
+  if(verbose) base::print(data)
+  setwd(wd_path)
 }