Skip to content

Commit

Permalink
Merge pull request #4 from josuechinchilla/auto_tests
Browse files Browse the repository at this point in the history
Auto tests
  • Loading branch information
josuechinchilla authored Aug 7, 2024
2 parents 3fc1906 + 945f354 commit 4df1ddc
Show file tree
Hide file tree
Showing 15 changed files with 451 additions and 181 deletions.
12 changes: 9 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
.DS_store
.Rproj.user
BIGf90.Rproj
.DS_store
.Rproj.user
.Rbuildignore
BIGf90.Rproj
inst/exec_mac
inst/exec_windows/
inst/data
tests/
.Rhistory
4 changes: 3 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: BIGf90
Title: R face front for running K-fold crossvalidation, estimating ebvs and variance component estimation with Blupf90 modules
Version: 0.3.0
Version: 0.3.1
Authors@R: c(person("Josue", "Chinchilla-Vargas", role = c("aut", "cre"), email = "jc3635@cornell.edu"),
person("Alexander", "Sandercock", role = "aut"),
person("Breeding Insight Team", role = "ctb"))
Expand All @@ -15,6 +15,8 @@ License: Apache License (== 2.0)
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.2
Suggests:
testthat
Imports:
base (>= 4.3.1),
dplyr (>= 1.1.4),
Expand Down
11 changes: 11 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# BIGf90 0.3.1

* Users now can have executable files, input files, and output files in different locations
* Warnings were added in case of run_gibbs and run_postgibbs functions that requires inputs to be together with renum results
* Code changed to always use global path even if it is referring to working directory
* Verbose argument added to give option to print or not information on screen while running
* Default values added
* Output directories need to exist. The function will stop if they don´t
* Automatic installation checks included
* NEWS file included
* GitHub example
188 changes: 100 additions & 88 deletions R/bf90_cv.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
#' The function run_renumf90 needs to be used beforehand to process a .par file created by the user.
#' This function calculates 2 accuracy estimates: correlation between raw phenotypes and ebvs divided by the square-root of narrow sense heritabilty and correlation between corrected phenotypes and ebvs along with bias estimations calculated as the regression of the phenotypes on the ebvs.
#'
#' @param path_2_execs path to a folder that holds all blupf90 executables that ill be used (blupf90+,predictf90). This field should be in quotes "".
#' @param missing_value_code code used in the .par file after OPTION MISSING to indicate missing phenotype, if this option is no use, this value must be 0.
#' @param random_effect_col Column where random effect is located, found under RANDOM_GROUP in the renf90.par file.
#' @param h2 estimate of narrow-sense heritabilty.This value is use to calculate accuracy of ebvs
Expand All @@ -15,6 +14,12 @@
#' @param output_table_name Name of the final tab-separated out-up file. This field should be in quotes "".
#' @param renf90_ped_name Name of pedigree file generated by renumf90. This field should be in quotes "".
#' @param snp_file_name the name of the genotype file to be used. If use, this field should be in quotes"". Default for the function is no genotype file.
#' @param path_2_execs path to a folder that holds all blupf90 executables that ill be used (blupf90+,predictf90). This field should be in quotes "".
#' @param input_files_dir directory containing files renf90.par renf90.fields renf90.inb renf90.tables renf90.dat
#' @param output_files_dir path to directory to store the output files
#' @param seed set seed for the stochastic process
#' @param verbose logical defining if information will be printed on the console
#'
#' @return a tab-separated file that includes accuracy and bias estimates of ebvs.
#' @import dplyr
#' @examples
Expand All @@ -31,45 +36,89 @@
#' # renf90_ped_name = "renadd03.ped",
#' # snp_file_name = "my_genos.geno" )
#'

#'
#' @export
bf90_cv <- function(path_2_execs, missing_value_code, random_effect_col, h2, num_runs, num_folds, output_table_name, renf90_ped_name, snp_file_name = NULL) {
# set seed for reproducibility
base::set.seed(101919)
bf90_cv <- function(missing_value_code = NULL,
random_effect_col = NULL,
h2 = NULL,
num_runs = NULL,
num_folds = NULL,
path_2_execs = ".",
input_files_dir = ".",
output_files_dir = ".",
output_table_name = NULL,
renf90_ped_name= NULL,
snp_file_name = NULL,
seed = 101919,
verbose = TRUE) {

# define working directory
wd_path <- base::getwd()
# set base_path for results folder
base_path <- base::paste0(base::getwd(), "/bf90_cv_results/")

# Function to run commands on the terminal and log output
execute_command <- function(command, logfile) {
if (.Platform$OS.type == "unix") {
output <- system(paste(command, "2>&1 | tee -a", logfile), intern = TRUE)
} else if (.Platform$OS.type == "windows") {
output <- system(paste("cmd /c", command, ">", logfile, "2>&1"), intern = TRUE)
} else {
stop("Unsupported OS type")
}
return(output)
}

#check id pedigree and genotype file are present
if(is.null(renf90_ped_name)) stop("Specify renf90_ped_name")

if (!file.exists(renf90_ped_name)) {
stop("Parameter file not found at: ", renf90_ped_name)
}
renf90_ped_name <- normalizePath(renf90_ped_name)

if (!file.exists("renf90.par")) {
stop("Parameter file not found at: ", "renf90.par")
}
input_files_dir <- normalizePath(input_files_dir)

if(is.null(missing_value_code)) stop("Specify missing_value_code")
if(is.null(random_effect_col)) stop("Specify random_effect_col")
if(is.null(h2)) stop("Specify h2")
if(is.null(num_runs)) stop("Specify num_runs")
if(is.null(num_folds)) stop("Specify num_folds")

if (!file.exists(file.path(input_files_dir,"renf90.par"))) stop("File 'renf90.par' not found at: ", input_files_dir)
if (!file.exists(file.path(input_files_dir,"renf90.inb"))) stop("File 'renf90.inb' not found at: ", input_files_dir)
if (!file.exists(file.path(input_files_dir,"renf90.fields"))) stop("File 'renf90.fields' not found at: ", input_files_dir)
if (!file.exists(file.path(input_files_dir,"renf90.tables"))) stop("File 'renf90.tables' not found at: ", input_files_dir)
if (!file.exists(file.path(input_files_dir,"renf90.dat"))) stop("File 'renf90.dat' not found at: ", input_files_dir)

if (!is.null(snp_file_name)) {
if (!file.exists(snp_file_name)) {
stop("Parameter file not found at:", snp_file_name)
snp_file_name <- normalizePath(snp_file_name)
}
}

if(is.null(output_table_name)) stop("Define output table name.")

if (!file.exists(renf90_ped_name)) {
stop("Parameter file not found at: ", renf90_ped_name)
}

# set seed for reproducibility
base::set.seed(seed)

# Checks
output_files_dir <- normalizePath(output_files_dir)
if(file.exists(output_files_dir)){
check_files <- list.files(output_files_dir)
if(length(check_files) > 0) warning(paste("Directory", output_files_dir, "is not empty. Some files may be replaced."))
} else {
stop(paste("Directory", output_files_dir, "does not exist. Create it before running the function."))
}

path_2_execs <- normalizePath(path_2_execs)

# Inform parameters and directories set
if(verbose){
cat("Parameters set:\n",
" missing_value_code = ", missing_value_code,"\n",
" random_effect_col = ", random_effect_col,"\n",
" h2 = ",h2 ,"\n",
" num_runs = ", num_runs,"\n",
" num_folds =", num_folds,"\n",
"Directories:\n",
" input files:", input_files_dir, "\n",
" output files:", output_files_dir, "\n",
" executable files:", path_2_execs)
}

# define working directory
wd_path <- base::getwd()

#Assign .exes or not based on OS
if (.Platform$OS.type == "unix") {
predict = "predictf90"
Expand All @@ -80,78 +129,39 @@ bf90_cv <- function(path_2_execs, missing_value_code, random_effect_col, h2, num
}

# Run BF90 programs for the whole dataset
output <- execute_command(command = paste0(path_2_execs, blup, " renf90.par"), logfile = "run_blup.log")

command_predict <- paste0(path_2_execs, predict, " renf90.par")
setwd(input_files_dir)
output <- execute_command(command = paste0(file.path(path_2_execs, blup)," ", "renf90.par"), logfile = "run_blup.log")
command_predict <- paste0(file.path(path_2_execs, predict), " ", paste0("renf90.par"))
output <- execute_command(command = command_predict, logfile = "run_predict.log")

files_res <- c("run_blup.log", "bvs.dat", "bvs2.dat", "yhat_residual", "solutions")
for(i in 1:length(files_res)) file.rename(from = files_res[i], to = file.path(output_files_dir,files_res[i]))

# Prepare files for each BLUP run
renf90 <- base::readLines("renf90.par")
ped_file <- renf90_ped_name
fields_file <- base::readLines("renf90.fields")
inb_file <- base::readLines("renf90.inb")
tables_file <- base::readLines("renf90.tables")
renf90 <- base::readLines(paste0("renf90.par"))
#ped_file <- dirname(renf90_ped_name)
fields_file <- base::readLines(paste0("renf90.fields"))
inb_file <- base::readLines(paste0("renf90.inb"))
tables_file <- base::readLines(paste0("renf90.tables"))

# Read and preprocess the phenotype data
bf90_phenos <- utils::read.table("renf90.dat", sep = " ", header = FALSE) %>%
bf90_phenos <- utils::read.table(paste0("renf90.dat"), sep = " ", header = FALSE) %>%
dplyr::select(-1)

# Shuffle the data and create folds
data_shuffled <- base::lapply(1:num_runs, function(x) bf90_phenos[base::sample(base::nrow(bf90_phenos)), ] %>%
dplyr::select((random_effect_col + 1)))
create_folds <- function(data) {
n <- base::nrow(data)
fold_size <- n %/% num_folds
folds <- base::vector("list", num_folds)
for (i in 1:num_folds) {
start_index <- (i - 1) * fold_size + 1
end_index <- if (i == num_folds) n else i * fold_size
folds[[i]] <- data[start_index:end_index, ]
}
folds
}
folds <- base::lapply(data_shuffled, create_folds)

# Function to mutate phenotypes to missing value for given folds
mutate_folds <- function(phenos, folds) {
base::lapply(1:num_folds, function(i) {
phenos %>%
dplyr::mutate(V2 = base::ifelse(V5 %in% folds[[i]], missing_value_code, V2))
})
}
mutated_data <- base::lapply(folds, function(f) mutate_folds(bf90_phenos, f))

# Create cross-validation datasets
create_cv_datasets <- function(run, fold, data_frame, dir_path) {
file_name <- base::sprintf("renf90_run%d_fold%d.dat", run, fold)
utils::write.table(data_frame, file = file_name, sep = " ", row.names = FALSE, col.names = FALSE, quote = FALSE)

if (!base::file.exists(dir_path)) {
base::dir.create(dir_path, recursive = TRUE)
}
base::file.rename(file_name, base::file.path(dir_path, file_name))

modified_content <- base::gsub("renf90.dat", base::sprintf("renf90_run%d_fold%d.dat", run, fold), renf90)
base::writeLines(modified_content, base::file.path(dir_path, base::sprintf("renf90_run%d_fold%d.par", run, fold)))

# Create symbolic links instead of copying files
file.symlink(base::file.path(wd_path, renf90_ped_name), base::file.path(dir_path, renf90_ped_name))
file.symlink(base::file.path(wd_path, "renf90.fields"), base::file.path(dir_path, "renf90.fields"))
file.symlink(base::file.path(wd_path, "renf90.inb"), base::file.path(dir_path, "renf90.inb"))
file.symlink(base::file.path(wd_path, "renf90.tables"), base::file.path(dir_path, "renf90.tables"))

if (!is.null(snp_file_name)) {
Xref_file <- paste0(snp_file_name, "_XrefID")
file.symlink(base::file.path(wd_path, snp_file_name), base::file.path(dir_path, snp_file_name))
file.symlink(base::file.path(wd_path, Xref_file), base::file.path(dir_path, Xref_file))
}
}
folds <- base::lapply(data_shuffled, function(x) create_folds(x, num_folds))
mutated_data <- base::lapply(folds, function(f) mutate_folds(bf90_phenos, f, num_folds,missing_value_code))

setwd(output_files_dir)
for (run in 1:num_runs) {
for (fold in 1:num_folds) {
dir_path <- base::sprintf("bf90_cv_results/run%d/fold%d", run, fold)
create_cv_datasets(run, fold, mutated_data[[run]][[fold]], dir_path)
dir_path <- base::sprintf("run%d/fold%d", run, fold)
create_cv_datasets(run, fold, mutated_data[[run]][[fold]],
dir_path, renf90, renf90_ped_name,
input_files_dir, snp_file_name)
}
}

Expand All @@ -160,8 +170,8 @@ bf90_cv <- function(path_2_execs, missing_value_code, random_effect_col, h2, num
for (run in 1:num_runs) {
ebvs_for_cv_list <- base::list()
for (fold in 1:num_folds) {
base::setwd(base::file.path(base_path, base::sprintf("run%d/fold%d", run, fold)))
command <- base::paste0(path_2_execs, blup, base::sprintf(" renf90_run%d_fold%d.par", run, fold))
base::setwd(base::file.path(output_files_dir, base::sprintf("run%d/fold%d", run, fold)))
command <- base::paste0(file.path(path_2_execs, blup), base::sprintf(" renf90_run%d_fold%d.par", run, fold))
logfile <- base::sprintf("blup_fold%d_run%d.log", fold, run)
execute_command(command = command, logfile = logfile)

Expand All @@ -177,14 +187,15 @@ bf90_cv <- function(path_2_execs, missing_value_code, random_effect_col, h2, num

ebvs_for_cv_list[[fold]] <- ebvs_for_cv
utils::write.table(ebvs_for_cv, file = base::sprintf("ebvs_for_cv_run%d_fold%d.dat", run, fold), row.names = FALSE, col.names = TRUE, quote = FALSE)
base::setwd(wd_path)

}
ebvs_for_cv_runs[[base::sprintf("ebvs_for_cv_run%d", run)]] <- base::do.call(base::rbind, ebvs_for_cv_list)
}
base::setwd(output_files_dir)

# Calculate correlations and bias
corrected_phenos <- utils::read.table("yhat_residual", header = FALSE) %>% dplyr::select(1, 2)
raw_phenos <- utils::read.table("renf90.dat") %>% dplyr::select(4, 1)
corrected_phenos <- utils::read.table(paste0(output_files_dir,"/yhat_residual"), header = FALSE) %>% dplyr::select(1, 2)
raw_phenos <- utils::read.table(paste0(input_files_dir,"/renf90.dat")) %>% dplyr::select(4, 1)

ystar_correlations <- base::numeric(num_runs)
yraw_correlations <- base::numeric(num_runs)
Expand Down Expand Up @@ -215,9 +226,10 @@ bf90_cv <- function(path_2_execs, missing_value_code, random_effect_col, h2, num
Run = base::c(base::paste("run", 1:num_runs), base::paste("run", 1:num_runs), base::paste("run", 1:num_runs), "", "", "", ""),
Value = base::round(base::c(yraw_correlations, ystar_correlations, bias_list, yraw_average_corr, ystar_accuracy, yraw_accuracy, average_bias), 3)
)
base::print (base::paste0("****Accuracy and bias table****" ))
if(verbose) base::print (base::paste0("****Accuracy and bias table****" ))

utils::write.table(data, file = output_table_name, row.names = FALSE, quote = FALSE)

base::print(data)
if(verbose) base::print(data)
setwd(wd_path)
}
Loading

0 comments on commit 4df1ddc

Please sign in to comment.