diff --git a/.gitignore b/.gitignore index 3394485..3b06184 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,9 @@ -.DS_store -.Rproj.user -BIGf90.Rproj \ No newline at end of file +.DS_store +.Rproj.user +.Rbuildignore +BIGf90.Rproj +inst/exec_mac +inst/exec_windows/ +inst/data +tests/ +.Rhistory \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index f870c69..0b08e35 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BIGf90 Title: R face front for running K-fold crossvalidation, estimating ebvs and variance component estimation with Blupf90 modules -Version: 0.3.0 +Version: 0.3.1 Authors@R: c(person("Josue", "Chinchilla-Vargas", role = c("aut", "cre"), email = "jc3635@cornell.edu"), person("Alexander", "Sandercock", role = "aut"), person("Breeding Insight Team", role = "ctb")) @@ -15,6 +15,8 @@ License: Apache License (== 2.0) Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.2 +Suggests: + testthat Imports: base (>= 4.3.1), dplyr (>= 1.1.4), diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000..7b14e6b --- /dev/null +++ b/NEWS.md @@ -0,0 +1,11 @@ +# BIGf90 0.3.1 + +* Users now can have executable files, input files, and output files in different locations +* Warnings were added in case of run_gibbs and run_postgibbs functions that requires inputs to be together with renum results +* Code changed to always use global path even if it is referring to working directory +* Verbose argument added to give option to print or not information on screen while running +* Default values added +* Output directories need to exist. The function will stop if they donĀ“t +* Automatic installation checks included +* NEWS file included +* GitHub example diff --git a/R/bf90_cv.R b/R/bf90_cv.R index 6bdf5b8..8fc16af 100644 --- a/R/bf90_cv.R +++ b/R/bf90_cv.R @@ -6,7 +6,6 @@ #' The function run_renumf90 needs to be used beforehand to process a .par file created by the user. #' This function calculates 2 accuracy estimates: correlation between raw phenotypes and ebvs divided by the square-root of narrow sense heritabilty and correlation between corrected phenotypes and ebvs along with bias estimations calculated as the regression of the phenotypes on the ebvs. #' -#' @param path_2_execs path to a folder that holds all blupf90 executables that ill be used (blupf90+,predictf90). This field should be in quotes "". #' @param missing_value_code code used in the .par file after OPTION MISSING to indicate missing phenotype, if this option is no use, this value must be 0. #' @param random_effect_col Column where random effect is located, found under RANDOM_GROUP in the renf90.par file. #' @param h2 estimate of narrow-sense heritabilty.This value is use to calculate accuracy of ebvs @@ -15,6 +14,12 @@ #' @param output_table_name Name of the final tab-separated out-up file. This field should be in quotes "". #' @param renf90_ped_name Name of pedigree file generated by renumf90. This field should be in quotes "". #' @param snp_file_name the name of the genotype file to be used. If use, this field should be in quotes"". Default for the function is no genotype file. +#' @param path_2_execs path to a folder that holds all blupf90 executables that ill be used (blupf90+,predictf90). This field should be in quotes "". +#' @param input_files_dir directory containing files renf90.par renf90.fields renf90.inb renf90.tables renf90.dat +#' @param output_files_dir path to directory to store the output files +#' @param seed set seed for the stochastic process +#' @param verbose logical defining if information will be printed on the console +#' #' @return a tab-separated file that includes accuracy and bias estimates of ebvs. #' @import dplyr #' @examples @@ -31,45 +36,89 @@ #' # renf90_ped_name = "renadd03.ped", #' # snp_file_name = "my_genos.geno" ) #' - +#' #' @export -bf90_cv <- function(path_2_execs, missing_value_code, random_effect_col, h2, num_runs, num_folds, output_table_name, renf90_ped_name, snp_file_name = NULL) { - # set seed for reproducibility - base::set.seed(101919) +bf90_cv <- function(missing_value_code = NULL, + random_effect_col = NULL, + h2 = NULL, + num_runs = NULL, + num_folds = NULL, + path_2_execs = ".", + input_files_dir = ".", + output_files_dir = ".", + output_table_name = NULL, + renf90_ped_name= NULL, + snp_file_name = NULL, + seed = 101919, + verbose = TRUE) { - # define working directory - wd_path <- base::getwd() - # set base_path for results folder - base_path <- base::paste0(base::getwd(), "/bf90_cv_results/") - - # Function to run commands on the terminal and log output - execute_command <- function(command, logfile) { - if (.Platform$OS.type == "unix") { - output <- system(paste(command, "2>&1 | tee -a", logfile), intern = TRUE) - } else if (.Platform$OS.type == "windows") { - output <- system(paste("cmd /c", command, ">", logfile, "2>&1"), intern = TRUE) - } else { - stop("Unsupported OS type") - } - return(output) - } #check id pedigree and genotype file are present + if(is.null(renf90_ped_name)) stop("Specify renf90_ped_name") if (!file.exists(renf90_ped_name)) { stop("Parameter file not found at: ", renf90_ped_name) } + renf90_ped_name <- normalizePath(renf90_ped_name) - if (!file.exists("renf90.par")) { - stop("Parameter file not found at: ", "renf90.par") - } + input_files_dir <- normalizePath(input_files_dir) + + if(is.null(missing_value_code)) stop("Specify missing_value_code") + if(is.null(random_effect_col)) stop("Specify random_effect_col") + if(is.null(h2)) stop("Specify h2") + if(is.null(num_runs)) stop("Specify num_runs") + if(is.null(num_folds)) stop("Specify num_folds") + + if (!file.exists(file.path(input_files_dir,"renf90.par"))) stop("File 'renf90.par' not found at: ", input_files_dir) + if (!file.exists(file.path(input_files_dir,"renf90.inb"))) stop("File 'renf90.inb' not found at: ", input_files_dir) + if (!file.exists(file.path(input_files_dir,"renf90.fields"))) stop("File 'renf90.fields' not found at: ", input_files_dir) + if (!file.exists(file.path(input_files_dir,"renf90.tables"))) stop("File 'renf90.tables' not found at: ", input_files_dir) + if (!file.exists(file.path(input_files_dir,"renf90.dat"))) stop("File 'renf90.dat' not found at: ", input_files_dir) if (!is.null(snp_file_name)) { if (!file.exists(snp_file_name)) { stop("Parameter file not found at:", snp_file_name) + snp_file_name <- normalizePath(snp_file_name) } } + if(is.null(output_table_name)) stop("Define output table name.") + + if (!file.exists(renf90_ped_name)) { + stop("Parameter file not found at: ", renf90_ped_name) + } + + # set seed for reproducibility + base::set.seed(seed) + + # Checks + output_files_dir <- normalizePath(output_files_dir) + if(file.exists(output_files_dir)){ + check_files <- list.files(output_files_dir) + if(length(check_files) > 0) warning(paste("Directory", output_files_dir, "is not empty. Some files may be replaced.")) + } else { + stop(paste("Directory", output_files_dir, "does not exist. Create it before running the function.")) + } + + path_2_execs <- normalizePath(path_2_execs) + + # Inform parameters and directories set + if(verbose){ + cat("Parameters set:\n", + " missing_value_code = ", missing_value_code,"\n", + " random_effect_col = ", random_effect_col,"\n", + " h2 = ",h2 ,"\n", + " num_runs = ", num_runs,"\n", + " num_folds =", num_folds,"\n", + "Directories:\n", + " input files:", input_files_dir, "\n", + " output files:", output_files_dir, "\n", + " executable files:", path_2_execs) + } + + # define working directory + wd_path <- base::getwd() + #Assign .exes or not based on OS if (.Platform$OS.type == "unix") { predict = "predictf90" @@ -80,78 +129,39 @@ bf90_cv <- function(path_2_execs, missing_value_code, random_effect_col, h2, num } # Run BF90 programs for the whole dataset - output <- execute_command(command = paste0(path_2_execs, blup, " renf90.par"), logfile = "run_blup.log") - - command_predict <- paste0(path_2_execs, predict, " renf90.par") + setwd(input_files_dir) + output <- execute_command(command = paste0(file.path(path_2_execs, blup)," ", "renf90.par"), logfile = "run_blup.log") + command_predict <- paste0(file.path(path_2_execs, predict), " ", paste0("renf90.par")) output <- execute_command(command = command_predict, logfile = "run_predict.log") + files_res <- c("run_blup.log", "bvs.dat", "bvs2.dat", "yhat_residual", "solutions") + for(i in 1:length(files_res)) file.rename(from = files_res[i], to = file.path(output_files_dir,files_res[i])) # Prepare files for each BLUP run - renf90 <- base::readLines("renf90.par") - ped_file <- renf90_ped_name - fields_file <- base::readLines("renf90.fields") - inb_file <- base::readLines("renf90.inb") - tables_file <- base::readLines("renf90.tables") + renf90 <- base::readLines(paste0("renf90.par")) + #ped_file <- dirname(renf90_ped_name) + fields_file <- base::readLines(paste0("renf90.fields")) + inb_file <- base::readLines(paste0("renf90.inb")) + tables_file <- base::readLines(paste0("renf90.tables")) # Read and preprocess the phenotype data - bf90_phenos <- utils::read.table("renf90.dat", sep = " ", header = FALSE) %>% + bf90_phenos <- utils::read.table(paste0("renf90.dat"), sep = " ", header = FALSE) %>% dplyr::select(-1) # Shuffle the data and create folds data_shuffled <- base::lapply(1:num_runs, function(x) bf90_phenos[base::sample(base::nrow(bf90_phenos)), ] %>% dplyr::select((random_effect_col + 1))) - create_folds <- function(data) { - n <- base::nrow(data) - fold_size <- n %/% num_folds - folds <- base::vector("list", num_folds) - for (i in 1:num_folds) { - start_index <- (i - 1) * fold_size + 1 - end_index <- if (i == num_folds) n else i * fold_size - folds[[i]] <- data[start_index:end_index, ] - } - folds - } - folds <- base::lapply(data_shuffled, create_folds) - - # Function to mutate phenotypes to missing value for given folds - mutate_folds <- function(phenos, folds) { - base::lapply(1:num_folds, function(i) { - phenos %>% - dplyr::mutate(V2 = base::ifelse(V5 %in% folds[[i]], missing_value_code, V2)) - }) - } - mutated_data <- base::lapply(folds, function(f) mutate_folds(bf90_phenos, f)) - - # Create cross-validation datasets - create_cv_datasets <- function(run, fold, data_frame, dir_path) { - file_name <- base::sprintf("renf90_run%d_fold%d.dat", run, fold) - utils::write.table(data_frame, file = file_name, sep = " ", row.names = FALSE, col.names = FALSE, quote = FALSE) - - if (!base::file.exists(dir_path)) { - base::dir.create(dir_path, recursive = TRUE) - } - base::file.rename(file_name, base::file.path(dir_path, file_name)) - - modified_content <- base::gsub("renf90.dat", base::sprintf("renf90_run%d_fold%d.dat", run, fold), renf90) - base::writeLines(modified_content, base::file.path(dir_path, base::sprintf("renf90_run%d_fold%d.par", run, fold))) - # Create symbolic links instead of copying files - file.symlink(base::file.path(wd_path, renf90_ped_name), base::file.path(dir_path, renf90_ped_name)) - file.symlink(base::file.path(wd_path, "renf90.fields"), base::file.path(dir_path, "renf90.fields")) - file.symlink(base::file.path(wd_path, "renf90.inb"), base::file.path(dir_path, "renf90.inb")) - file.symlink(base::file.path(wd_path, "renf90.tables"), base::file.path(dir_path, "renf90.tables")) - - if (!is.null(snp_file_name)) { - Xref_file <- paste0(snp_file_name, "_XrefID") - file.symlink(base::file.path(wd_path, snp_file_name), base::file.path(dir_path, snp_file_name)) - file.symlink(base::file.path(wd_path, Xref_file), base::file.path(dir_path, Xref_file)) - } - } + folds <- base::lapply(data_shuffled, function(x) create_folds(x, num_folds)) + mutated_data <- base::lapply(folds, function(f) mutate_folds(bf90_phenos, f, num_folds,missing_value_code)) + setwd(output_files_dir) for (run in 1:num_runs) { for (fold in 1:num_folds) { - dir_path <- base::sprintf("bf90_cv_results/run%d/fold%d", run, fold) - create_cv_datasets(run, fold, mutated_data[[run]][[fold]], dir_path) + dir_path <- base::sprintf("run%d/fold%d", run, fold) + create_cv_datasets(run, fold, mutated_data[[run]][[fold]], + dir_path, renf90, renf90_ped_name, + input_files_dir, snp_file_name) } } @@ -160,8 +170,8 @@ bf90_cv <- function(path_2_execs, missing_value_code, random_effect_col, h2, num for (run in 1:num_runs) { ebvs_for_cv_list <- base::list() for (fold in 1:num_folds) { - base::setwd(base::file.path(base_path, base::sprintf("run%d/fold%d", run, fold))) - command <- base::paste0(path_2_execs, blup, base::sprintf(" renf90_run%d_fold%d.par", run, fold)) + base::setwd(base::file.path(output_files_dir, base::sprintf("run%d/fold%d", run, fold))) + command <- base::paste0(file.path(path_2_execs, blup), base::sprintf(" renf90_run%d_fold%d.par", run, fold)) logfile <- base::sprintf("blup_fold%d_run%d.log", fold, run) execute_command(command = command, logfile = logfile) @@ -177,14 +187,15 @@ bf90_cv <- function(path_2_execs, missing_value_code, random_effect_col, h2, num ebvs_for_cv_list[[fold]] <- ebvs_for_cv utils::write.table(ebvs_for_cv, file = base::sprintf("ebvs_for_cv_run%d_fold%d.dat", run, fold), row.names = FALSE, col.names = TRUE, quote = FALSE) - base::setwd(wd_path) + } ebvs_for_cv_runs[[base::sprintf("ebvs_for_cv_run%d", run)]] <- base::do.call(base::rbind, ebvs_for_cv_list) } + base::setwd(output_files_dir) # Calculate correlations and bias - corrected_phenos <- utils::read.table("yhat_residual", header = FALSE) %>% dplyr::select(1, 2) - raw_phenos <- utils::read.table("renf90.dat") %>% dplyr::select(4, 1) + corrected_phenos <- utils::read.table(paste0(output_files_dir,"/yhat_residual"), header = FALSE) %>% dplyr::select(1, 2) + raw_phenos <- utils::read.table(paste0(input_files_dir,"/renf90.dat")) %>% dplyr::select(4, 1) ystar_correlations <- base::numeric(num_runs) yraw_correlations <- base::numeric(num_runs) @@ -215,9 +226,10 @@ bf90_cv <- function(path_2_execs, missing_value_code, random_effect_col, h2, num Run = base::c(base::paste("run", 1:num_runs), base::paste("run", 1:num_runs), base::paste("run", 1:num_runs), "", "", "", ""), Value = base::round(base::c(yraw_correlations, ystar_correlations, bias_list, yraw_average_corr, ystar_accuracy, yraw_accuracy, average_bias), 3) ) - base::print (base::paste0("****Accuracy and bias table****" )) + if(verbose) base::print (base::paste0("****Accuracy and bias table****" )) utils::write.table(data, file = output_table_name, row.names = FALSE, quote = FALSE) - base::print(data) + if(verbose) base::print(data) + setwd(wd_path) } diff --git a/R/run_gibbs.R b/R/run_gibbs.R index ab54906..28ab849 100644 --- a/R/run_gibbs.R +++ b/R/run_gibbs.R @@ -7,9 +7,13 @@ #' The user will have to enter the number of samples in the MCMC chain, the number of samples to burn and the number used to thin samples. A log file called run_gibbs.log is also produced. #' #' @param path_2_execs path to a folder that holds the renumf90 executable. This field should be in quotes "". +#' @param input_files_dir path to renf90.par file generated by run_renum function +#' @param output_files_dir path to store the result files #' @param gibbs_iter number of samples in the Gibbs sampler. #' @param gibbs_burn number of samples to be discarded at the begining of the Gibbs sampler #' @param gibbs_keep the interval to save samples (thinning). Entering a 1 means all samples are kept. +#' @param verbose logical if TRUE prints log information +#' #' @examples #' ## Example #' @@ -19,38 +23,49 @@ #' # gibbs_keep = 1) #' #' @export -run_gibbs <- function(path_2_execs, gibbs_iter, gibbs_burn, gibbs_keep) { - # Function to run commands on the terminal and log output - execute_command <- function(command, logfile) { - if (.Platform$OS.type == "unix") { - output <- system(paste(command, "2>&1 | tee -a", logfile), intern = TRUE) - } else if (.Platform$OS.type == "windows") { - output <- system(paste("cmd /c", command, ">", logfile, "2>&1"), intern = TRUE) - } else { - stop("Unsupported OS type") - } - return(output) +run_gibbs <- function(path_2_execs, + input_files_dir = ".", + output_files_dir = input_files_dir, + gibbs_iter = 250000, + gibbs_burn = 20000, + gibbs_keep = 1, + verbose = TRUE) { + + if(input_files_dir != output_files_dir) warning("Next steps will required renum and gibbs results stored in same directory.") + + # Checks + if(file.exists(output_files_dir)){ + check_files <- list.files(output_files_dir) + output_files_dir <- normalizePath(output_files_dir) + } else { + stop(paste("Directory", output_files_dir, "does not exist. Create it before running the function.")) } + path_2_execs <- normalizePath(path_2_execs) + cur_dir <- getwd() + #Assign .exe or not based on OS if (.Platform$OS.type == "unix") { gibbs = "gibbsf90+" } else if (.Platform$OS.type == "windows") { gibbs = "gibbsf90+.exe" } - gibbs_exec <- paste0(path_2_execs, gibbs) + gibbs_exec <- file.path(path_2_execs, gibbs) # Check if executable exists if (!file.exists(gibbs_exec)) { stop("Executable not found at: ", gibbs_exec) } - if (!file.exists("renf90.par")) { + + if (!file.exists(file.path(input_files_dir, "renf90.par"))) { stop("Parameter file not found: renf90.par") } + input_files_dir <- normalizePath(input_files_dir) + # Create a temporary file with the inputs temp_input_file <- tempfile() - writeLines(c("renf90.par", gibbs_iter, gibbs_burn, gibbs_keep), temp_input_file) + writeLines(c(file.path(input_files_dir,"renf90.par"), gibbs_iter, gibbs_burn, gibbs_keep), temp_input_file) # Construct the command to use the input file if (.Platform$OS.type == "unix") { @@ -60,22 +75,30 @@ run_gibbs <- function(path_2_execs, gibbs_iter, gibbs_burn, gibbs_keep) { } # Debugging: Print the constructed command - cat("Running command:", command_gibbs, "\n") + if(verbose) cat("Running command:", command_gibbs, "\n") # Run the command and log the output + setwd(input_files_dir) output <- execute_command(command = command_gibbs, logfile = "run_gibbs.log") + if(output_files_dir != input_files_dir){ + files_res <- c("last_solutions", "binary_final_solutions", "fort.99", "gibbs_samples", "run_gibbs.log") + for(i in 1:length(files_res)) file.rename(from = files_res[i], to = file.path(output_files_dir,files_res[i])) + } # Remove the temporary file unlink(temp_input_file) # Debugging: Print the output - cat("Command output:", output, "\n") + if(verbose) cat("Command output:", output, "\n") # Capture and print the log file content - if (file.exists("run_gibbs.log")) { - cat("Log file content:\n") - cat(readLines("run_gibbs.log"), sep = "\n") - } else { - cat("Log file not created.\n") + if(verbose){ + if (file.exists(file.path(output_files_dir,"run_gibbs.log"))) { + cat("Log file content:\n") + cat(readLines(file.path(output_files_dir,"run_gibbs.log")), sep = "\n") + } else { + cat("Log file not created.\n") + } } + setwd(cur_dir) } diff --git a/R/run_postgibbs.R b/R/run_postgibbs.R index c0fde4a..d29f576 100644 --- a/R/run_postgibbs.R +++ b/R/run_postgibbs.R @@ -7,8 +7,12 @@ #' The user will have to enter the number of samples to burn and the number used to thin samples. A log file called run_postgibbs.log is also produced. #' #' @param path_2_execs path to a folder that holds the renumf90 executable. This field should be in quotes "". +#' @param input_files_dir path to renf90.par file generated by run_renum function +#' @param output_files_dir path to store the result files #' @param postgibbs_burn number of samples to be discarded at the begining of the Gibbs sampler #' @param postgibbs_keep the interval to save samples (thinning). Entering a 1 means all samples are kept.#' +#' @param verbose logical if TRUE prints log information in the console +#' #' @examples #' ## Example #' @@ -17,39 +21,46 @@ #' # postgibbs_keep = 100) #' #' @export -run_postgibbs <- function(path_2_execs, postgibbs_burn, postgibbs_keep) { - # Function to run commands on the terminal and log output - execute_command <- function(command, logfile) { - if (.Platform$OS.type == "unix") { - output <- system(paste(command, "2>&1 | tee -a", logfile), intern = TRUE) - } else if (.Platform$OS.type == "windows") { - output <- system(paste("cmd /c", command, ">", logfile, "2>&1"), intern = TRUE) - } else { - stop("Unsupported OS type") - } - return(output) - } +run_postgibbs <- function(path_2_execs, + input_files_dir = ".", + output_files_dir = input_files_dir, + postgibbs_burn, + postgibbs_keep, + verbose = TRUE) { + if(input_files_dir != output_files_dir) warning("Next steps will required renum, gibbs, and postgibbs results stored in same directory.") + + # Checks + if(file.exists(output_files_dir)){ + check_files <- list.files(output_files_dir) + output_files_dir <- normalizePath(output_files_dir) + } else { + stop(paste("Directory", output_files_dir, "does not exist. Create it before running the function.")) + } + + output_files_dir <- normalizePath(output_files_dir) + input_files_dir <- normalizePath(input_files_dir) + cur_dir <- getwd() + #Assign .exe or not based on OS if (.Platform$OS.type == "unix") { postgibbs = "postgibbsf90" } else if (.Platform$OS.type == "windows") { postgibbs = "postgibbsf90.exe" } - postgibbs_exec <- paste0(path_2_execs, postgibbs) + postgibbs_exec <- file.path(path_2_execs, postgibbs) # Check if executable exists - if (!file.exists(postgibbs_exec)) { stop("Executable not found at: ", postgibbs_exec) } - if (!file.exists("renf90.par")) { + if (!file.exists(file.path(input_files_dir, "renf90.par"))) { stop("Parameter file not found: renf90.par") } # Create a temporary file with the inputs temp_input_file <- tempfile() - writeLines(c("renf90.par", postgibbs_burn, postgibbs_keep, "0"), temp_input_file) + writeLines(c(file.path(input_files_dir, "renf90.par"), postgibbs_burn, postgibbs_keep, "0"), temp_input_file) # Construct the command to use the input file if (.Platform$OS.type == "unix") { @@ -59,22 +70,26 @@ run_postgibbs <- function(path_2_execs, postgibbs_burn, postgibbs_keep) { } # Debugging: Print the constructed command - cat("Running command:", command_postgibbs, "\n") + if(verbose) cat("Running command:", command_postgibbs, "\n") # Run the command and log the output + setwd(input_files_dir) output <- execute_command(command = command_postgibbs, logfile = "run_postgibbs.log") # Remove the temporary file unlink(temp_input_file) # Debugging: Print the output - cat("Command output:", output, "\n") + if(verbose) cat("Command output:", output, "\n") # Capture and print the log file content - if (file.exists("run_postgibbs.log")) { - cat("Log file content:\n") - cat(readLines("run_postgibbs.log"), sep = "\n") - } else { - cat("Log file not created.\n") + if(verbose){ + if (file.exists("run_postgibbs.log")) { + cat("Log file content:\n") + cat(readLines("run_postgibbs.log"), sep = "\n") + } else { + cat("Log file not created.\n") + } } + setwd(cur_dir) } diff --git a/R/run_renum.R b/R/run_renum.R index 041b0ab..faba8f7 100644 --- a/R/run_renum.R +++ b/R/run_renum.R @@ -7,27 +7,41 @@ #' #' @param path_2_execs path to a folder that holds the renumf90 executable. This field should be in quotes "". #' @param raw_par_file name of the .par file that will be processed. This field should be in quotes "". +#' @param output_files_dir path to the folder to store the output files renadd03.ped renf90.dat renf90.fields renf90.inb renf90.par renf90.tables run_renum.log +#' @param verbose logical if TRUE prints log information #' @examples -#' +#' #' \donttest{ -#' #run_renum(path_2_execs = "path/bf90_execs/", -#' #raw_par_file = "weight_2022_no_cov_cv.par") +#' #run_renum(path_2_execs = "path/bf90_execs/", +#' #input_files_dir = "weight_2022_no_cov_cv.par") #' } -#' -#' +#' +#' #' @export -run_renum <- function(path_2_execs, raw_par_file) { - - # Function to run commands on the terminal and log output - execute_command <- function(command, logfile) { - if (.Platform$OS.type == "unix") { - output <- system(paste(command, "2>&1 | tee -a", logfile), intern = TRUE) - } else if (.Platform$OS.type == "windows") { - output <- system(paste("cmd /c", command, ">", logfile, "2>&1"), intern = TRUE) - } else { - stop("Unsupported OS type") - } - return(output) +run_renum <- function(path_2_execs = ".", + raw_par_file = NULL, + output_files_dir = NULL, + verbose = TRUE) { + + # Checks + if(is.null(output_files_dir)) stop("Define a output directory in output_files_dir") + + if(file.exists(output_files_dir)){ + check_files <- list.files(output_files_dir) + if(length(check_files) > 0) warning(paste("Directory", output_files_dir, "is not empty. Some files may be replaced.")) + output_files_dir <- normalizePath(output_files_dir) + } else { + stop(paste("Directory '", output_files_dir, "' does not exist. Create it before running the function.")) + } + + path_2_execs <- normalizePath(path_2_execs) + + if(is.null(raw_par_file)) stop("Define raw_par_file.") + raw_file <- normalizePath(raw_par_file) + input_files_dir <- dirname(raw_file) + + if (!file.exists(raw_file)) { + stop("Parameter file not found at: ", raw_file) } #Assign .exe or not based on OS @@ -37,26 +51,39 @@ run_renum <- function(path_2_execs, raw_par_file) { renum = "renumf90.exe" } - raw_file <- paste(base::getwd(), raw_par_file, sep = "/") + cur_dir <- getwd() # save working directory location + # Construct the command - command_renum <- paste0(path_2_execs, renum," ",raw_par_file) + command_renum <- paste0(file.path(path_2_execs, renum)," ", paste0("'",raw_file, "'")) # Check if executable and parameter files exist - if (!file.exists(paste0(path_2_execs, renum))) { + if (!file.exists(file.path(path_2_execs, renum))) { stop("Executable not found at: ", paste0(path_2_execs, renum)) } - if (!file.exists(raw_file)) { - stop("Parameter file not found at: ", raw_file) - } # Run the command and log the output + setwd(input_files_dir) output <- execute_command(command = command_renum, logfile = "run_renum.log") # Capture and print the log file content - if (file.exists("run_renum.log")) { - cat("Log file content:\n") - cat(readLines("run_renum.log"), sep = "\n") - } else { - cat("Log file not created.\n") + if(verbose){ + if (file.exists("run_renum.log")) { + cat("Log file content:\n") + cat(readLines("run_renum.log"), sep = "\n") + } else { + cat("Log file not created.\n") + } } + + # Move generated files to working directory + files_res <- c("renadd03.ped", "renf90.dat", "renf90.fields", "renf90.inb", "renf90.par" ,"renf90.tables", "run_renum.log") + for(i in 1:length(files_res)) file.rename(from = files_res[i], to = file.path(output_files_dir,files_res[i])) + + # Return to past working directory + setwd(cur_dir) } + + + + + diff --git a/R/utils.R b/R/utils.R new file mode 100644 index 0000000..f55150f --- /dev/null +++ b/R/utils.R @@ -0,0 +1,62 @@ +#' Function to run commands on the terminal and log output +#' +#' @param command comment line used to run executable file +#' @param logfile logfile name +#' +execute_command <- function(command, logfile) { + if (.Platform$OS.type == "unix") { + output <- system(paste(command, "2>&1 | tee -a", logfile), intern = TRUE) + } else if (.Platform$OS.type == "windows") { + output <- system(paste("cmd /c", command, ">", logfile, "2>&1"), intern = TRUE) + } else { + stop("Unsupported OS type") + } + return(output) +} + +#' +create_folds <- function(data, num_folds) { + n <- base::nrow(data) + fold_size <- n %/% num_folds + folds <- base::vector("list", num_folds) + for (i in 1:num_folds) { + start_index <- (i - 1) * fold_size + 1 + end_index <- if (i == num_folds) n else i * fold_size + folds[[i]] <- data[start_index:end_index, ] + } + return(folds) +} + +# Function to mutate phenotypes to missing value for given folds +mutate_folds <- function(phenos, folds, num_folds, missing_value_code) { + base::lapply(1:num_folds, function(i) { + phenos %>% + dplyr::mutate(V2 = base::ifelse(V5 %in% folds[[i]], missing_value_code, V2)) + }) +} + +# Create cross-validation datasets +create_cv_datasets <- function(run, fold, data_frame, dir_path, renf90, renf90_ped_name, input_files_dir, snp_file_name) { + file_name <- base::sprintf("renf90_run%d_fold%d.dat", run, fold) + utils::write.table(data_frame, file = file_name, sep = " ", row.names = FALSE, col.names = FALSE, quote = FALSE) + + if (!base::file.exists(dir_path)) { + base::dir.create(dir_path, recursive = TRUE) + } + base::file.rename(file_name, base::file.path(dir_path, file_name)) + + modified_content <- base::gsub("renf90.dat", base::sprintf("renf90_run%d_fold%d.dat", run, fold), renf90) + base::writeLines(modified_content, base::file.path(dir_path, base::sprintf("renf90_run%d_fold%d.par", run, fold))) + + # Create symbolic links instead of copying files + file.symlink(base::file.path(renf90_ped_name), base::file.path(dir_path, basename(renf90_ped_name))) + file.symlink(base::file.path(input_files_dir, "renf90.fields"), base::file.path(dir_path, "renf90.fields")) + file.symlink(base::file.path(input_files_dir, "renf90.inb"), base::file.path(dir_path, "renf90.inb")) + file.symlink(base::file.path(input_files_dir, "renf90.tables"), base::file.path(dir_path, "renf90.tables")) + + if (!is.null(snp_file_name)) { + Xref_file <- paste0(snp_file_name, "_XrefID") + file.symlink(base::file.path(snp_file_name), base::file.path(dir_path, basename(snp_file_name))) + file.symlink(base::file.path(Xref_file), base::file.path(dir_path, basename(Xref_file))) + } +} diff --git a/inst/CITATION b/inst/CITATION index 92c94de..46014c7 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -9,4 +9,4 @@ bibentry( year = "2024", note = "R package version 0.2.1", url = "https://github.com/Breeding-Insight/BIGr" -) +) \ No newline at end of file diff --git a/inst/no_cov_script.R b/inst/no_cov_script.R new file mode 100644 index 0000000..1c97d98 --- /dev/null +++ b/inst/no_cov_script.R @@ -0,0 +1,47 @@ +#Set seed, working directory and load packages needed +set.seed(101919) +library("BIGf90") +setwd("~/Desktop/weight_length_2024/weight_2022phenos_2022ped/no_cov") +#set path to executables +#path_2_execs = system.file("exec_windows",package = "BIGf90") +path_2_execs = system.file("exec_mac",package = "BIGf90") + +#### 1) Process raw parameter file with renumf90 #### +run_renum(path_2_execs, + output_files_dir = "results", + raw_par_file = "weight22_ped22.par") + +#### 2) Estimate variance components with gibbsf90+ and postgibbsf90 #### + +run_gibbs(path_2_execs, + input_files_dir = "results", + gibbs_iter = 250000, + gibbs_burn = 50000, + gibbs_keep = 1) + +run_postgibbs(path_2_execs, input_files_dir = "results", + postgibbs_burn = 1, + postgibbs_keep = 100) + +#### 3) Run a cross-validation analysis (CVA) to estimate the accuracy of your EBV predictions #### + +bf90_cv( + path_2_execs, + missing_value_code = -999, + random_effect_col = 3, + h2 = 0.95, + num_runs = 10, + num_folds = 5, + renf90_ped_name = "results/renadd03.ped", + input_files_dir = "results", + output_files_dir = "bf90_cv_results/", + output_table_name = "CVA_results_weight_2022phenos_2022ped_no_cov.txt" + ) + +# Check in your version if the files generated have the same results below (with some tolerance) +yhat <- read.table("bf90_cv_results/yhat_residual") +dim(yhat) +sum(yhat$V2) # 89.84 + +cv_results <- read.table("bf90_cv_results/CVA_results_weight_2022phenos_2022ped_no_cov.txt", skip = 1, nrows = 30) +sum(cv_results$V4) # 25.746 diff --git a/man/bf90_cv.Rd b/man/bf90_cv.Rd index cd4a366..cf70c6c 100644 --- a/man/bf90_cv.Rd +++ b/man/bf90_cv.Rd @@ -5,20 +5,22 @@ \title{Run K-fold cross-validation analysis (CVA)} \usage{ bf90_cv( - path_2_execs, - missing_value_code, - random_effect_col, - h2, - num_runs, - num_folds, - output_table_name, - renf90_ped_name, - snp_file_name = NULL + missing_value_code = NULL, + random_effect_col = NULL, + h2 = NULL, + num_runs = NULL, + num_folds = NULL, + path_2_execs = ".", + input_files_dir = ".", + output_files_dir = ".", + output_table_name = NULL, + renf90_ped_name = NULL, + snp_file_name = NULL, + seed = 101919, + verbose = TRUE ) } \arguments{ -\item{path_2_execs}{path to a folder that holds all blupf90 executables that ill be used (blupf90+,predictf90). This field should be in quotes "".} - \item{missing_value_code}{code used in the .par file after OPTION MISSING to indicate missing phenotype, if this option is no use, this value must be 0.} \item{random_effect_col}{Column where random effect is located, found under RANDOM_GROUP in the renf90.par file.} @@ -29,11 +31,21 @@ bf90_cv( \item{num_folds}{Number of folds to be generated within each independent run.} +\item{path_2_execs}{path to a folder that holds all blupf90 executables that ill be used (blupf90+,predictf90). This field should be in quotes "".} + +\item{input_files_dir}{directory containing files renf90.par renf90.fields renf90.inb renf90.tables renf90.dat} + +\item{output_files_dir}{path to directory to store the output files} + \item{output_table_name}{Name of the final tab-separated out-up file. This field should be in quotes "".} \item{renf90_ped_name}{Name of pedigree file generated by renumf90. This field should be in quotes "".} \item{snp_file_name}{the name of the genotype file to be used. If use, this field should be in quotes"". Default for the function is no genotype file.} + +\item{seed}{set seed for the stochastic process} + +\item{verbose}{logical defining if information will be printed on the console} } \value{ a tab-separated file that includes accuracy and bias estimates of ebvs. @@ -60,4 +72,5 @@ This function calculates 2 accuracy estimates: correlation between raw phenotype # renf90_ped_name = "renadd03.ped", # snp_file_name = "my_genos.geno" ) + } diff --git a/man/execute_command.Rd b/man/execute_command.Rd new file mode 100644 index 0000000..b79f891 --- /dev/null +++ b/man/execute_command.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{execute_command} +\alias{execute_command} +\title{Function to run commands on the terminal and log output} +\usage{ +execute_command(command, logfile) +} +\arguments{ +\item{command}{comment line used to run executable file} + +\item{logfile}{logfile name} +} +\description{ +Function to run commands on the terminal and log output +} diff --git a/man/run_gibbs.Rd b/man/run_gibbs.Rd index 438823f..136b3a9 100644 --- a/man/run_gibbs.Rd +++ b/man/run_gibbs.Rd @@ -4,16 +4,30 @@ \alias{run_gibbs} \title{Run gibbsf90+} \usage{ -run_gibbs(path_2_execs, gibbs_iter, gibbs_burn, gibbs_keep) +run_gibbs( + path_2_execs, + input_files_dir = ".", + output_files_dir = input_files_dir, + gibbs_iter = 250000, + gibbs_burn = 20000, + gibbs_keep = 1, + verbose = TRUE +) } \arguments{ \item{path_2_execs}{path to a folder that holds the renumf90 executable. This field should be in quotes "".} +\item{input_files_dir}{path to renf90.par file generated by run_renum function} + +\item{output_files_dir}{path to store the result files} + \item{gibbs_iter}{number of samples in the Gibbs sampler.} \item{gibbs_burn}{number of samples to be discarded at the begining of the Gibbs sampler} \item{gibbs_keep}{the interval to save samples (thinning). Entering a 1 means all samples are kept.} + +\item{verbose}{logical if TRUE prints log information} } \description{ This function runs gibbsf90+. diff --git a/man/run_postgibbs.Rd b/man/run_postgibbs.Rd index 37d49fa..37ae6fe 100644 --- a/man/run_postgibbs.Rd +++ b/man/run_postgibbs.Rd @@ -4,14 +4,27 @@ \alias{run_postgibbs} \title{Run postgibbsf90} \usage{ -run_postgibbs(path_2_execs, postgibbs_burn, postgibbs_keep) +run_postgibbs( + path_2_execs, + input_files_dir = ".", + output_files_dir = input_files_dir, + postgibbs_burn, + postgibbs_keep, + verbose = TRUE +) } \arguments{ \item{path_2_execs}{path to a folder that holds the renumf90 executable. This field should be in quotes "".} +\item{input_files_dir}{path to renf90.par file generated by run_renum function} + +\item{output_files_dir}{path to store the result files} + \item{postgibbs_burn}{number of samples to be discarded at the begining of the Gibbs sampler} \item{postgibbs_keep}{the interval to save samples (thinning). Entering a 1 means all samples are kept.#'} + +\item{verbose}{logical if TRUE prints log information in the console} } \description{ This function runs postgibbsf90 using the renf90.par file used to run gibbsf90+ diff --git a/man/run_renum.Rd b/man/run_renum.Rd index 551d7d1..15af7f1 100644 --- a/man/run_renum.Rd +++ b/man/run_renum.Rd @@ -4,12 +4,21 @@ \alias{run_renum} \title{Run renumf90} \usage{ -run_renum(path_2_execs, raw_par_file) +run_renum( + path_2_execs = ".", + raw_par_file = NULL, + output_files_dir = NULL, + verbose = TRUE +) } \arguments{ \item{path_2_execs}{path to a folder that holds the renumf90 executable. This field should be in quotes "".} \item{raw_par_file}{name of the .par file that will be processed. This field should be in quotes "".} + +\item{output_files_dir}{path to the folder to store the output files renadd03.ped renf90.dat renf90.fields renf90.inb renf90.par renf90.tables run_renum.log} + +\item{verbose}{logical if TRUE prints log information} } \description{ This function runs renumf90. @@ -21,8 +30,8 @@ The outputs will be the standard output files produced by renumf90. A log file c \examples{ \donttest{ - #run_renum(path_2_execs = "path/bf90_execs/", - #raw_par_file = "weight_2022_no_cov_cv.par") + #run_renum(path_2_execs = "path/bf90_execs/", + #input_files_dir = "weight_2022_no_cov_cv.par") }