-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path04_junction_pairing.R
110 lines (80 loc) · 4.45 KB
/
04_junction_pairing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#' Title
#' Function to pair novel junctions with annotated introns across the samples of each sample cluster
#' (i.e. case/control, tissue, etc)
#' @param recount3.project.IDs array with the recount3 projects to download
#' e.g. "SRP009615"
#'
#' @return
#' @export
#'
#' @examples
JunctionPairing <- function(recount3.project.IDs,
results.folder,
num.cores,
replace) {
if (replace) {
logger::log_info("Starting 'JunctionPairing' script...")
doParallel::registerDoParallel(num.cores)
foreach(i = seq(length(recount3.project.IDs)), .combine = "rbind") %dopar%{
# i <- 3
project_id <- recount3.project.IDs[i]
folder_root <- file.path(results.folder, project_id)
folder_base_data <- file.path(folder_root, "base_data")
logger::log_info("Getting data from '", project_id, "' project...")
## Load sample clusters for the current project
if (file.exists(paste0(folder_base_data, "/", project_id, "_clusters_used.rds"))) {
clusters_ID <- readRDS(file = paste0(folder_base_data, "/", project_id, "_clusters_used.rds"))
for (cluster_id in clusters_ID) {
# cluster_id <- clusters_ID[2]
logger::log_info(paste0(Sys.time(), " - loading '", cluster_id, "' source data ..."))
############################################
## LOAD DATA FOR THE CURRENT PROJECT
############################################
if ( file.exists(paste0(folder_base_data, "/", project_id, "_", cluster_id, "_samples_used.rds")) &&
file.exists(paste0(folder_base_data, "/", project_id, "_", cluster_id, "_all_split_reads.rds")) &&
file.exists(paste0(folder_base_data, "/", project_id, "_", cluster_id, "_split_read_counts.rds")) ) {
## Load samples
samples_used <- readRDS(file = paste0(folder_base_data, "/", project_id, "_", cluster_id, "_samples_used.rds")) %>% unique()
## Load split read data
all_split_reads_details <- readRDS(file = paste0(folder_base_data, "/", project_id, "_", cluster_id, "_all_split_reads.rds")) %>% as_tibble()
## Load split read counts
split_read_counts <- readRDS(file = paste0(folder_base_data, "/", project_id, "_", cluster_id, "_split_read_counts.rds")) %>% as_tibble()
if (!identical((split_read_counts %>% names())[-1] %>% sort(), samples_used %>% sort())) {
stop("ERROR! different number of samples used!");
}
if (!identical(all_split_reads_details$junID, split_read_counts$junID)) {
stop("ERROR! The number of junctions considered is not correct.");
}
############################################
## DISTANCES SUITE OF FUNCTIONS
############################################
folder_pairing_results <- file.path(folder_root, "junction_pairing", cluster_id)
dir.create(file.path(folder_pairing_results), recursive = TRUE, showWarnings = T)
GetDistances(project.id = project_id,
cluster = cluster_id,
samples = samples_used,
split.read.counts = split_read_counts,
all.split.reads.details = all_split_reads_details,
folder.name = folder_pairing_results,
replace = replace)
gc()
ExtractDistances(cluster = cluster_id,
samples = samples_used,
folder.name = folder_pairing_results,
replace = replace)
gc()
GetNeverMisspliced(cluster = cluster_id,
samples = samples_used,
split.read.counts = split_read_counts,
all.split.reads.details = all_split_reads_details,
folder.name = folder_pairing_results,
replace = replace)
rm(all_split_reads_details)
rm(split_read_counts)
gc()
}
}
}
}
}
}