-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathreformat_multiome_e2g_predictions.R
129 lines (104 loc) · 4.36 KB
/
reformat_multiome_e2g_predictions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
## Reformat pillar project predictors
library(optparse)
# Process input arguments --------------------------------------------------------------------------
# create arguments list
option_list = list(
make_option(c("-i", "--input_file"), type = "character", default = NULL,
help = "Path to transcripts input file", metavar = "character"),
make_option(c("-o", "--output_file"), type = "character", default = NULL,
help = "Path to output file", metavar = "character"),
make_option(c("-g", "--genes_file"), type = "character", default = NULL,
help = "Path to file containing gene information", metavar = "character"),
make_option(c("-c", "--cell_type"), type = "character", default = NULL,
help = "Cell type", metavar = "character"),
make_option(c("-m", "--method"), type = "character", default = NULL,
help = "E2G method that produced the predictions", metavar = "character"),
make_option(c("-v", "--version"), type = "character", default = NULL,
help = "E2G method version", metavar = "character"),
make_option(c("-t", "--threshold"), type = "character", default = NULL,
help = "Used score threshold if applicable", metavar = "character")
)
# parse arguments
opt_parser = OptionParser(option_list = option_list)
opt = parse_args(opt_parser)
# function to check for required arguments
check_required_args <- function(arg, opt, opt_parser) {
if (is.null(opt[[arg]])) {
print_help(opt_parser)
stop(arg, " argument is required!", call. = FALSE)
}
}
# check that all required parameters are provided
required_args <- c("input_file", "output_file", "method", "version")
for (i in required_args) {
check_required_args(i, opt = opt, opt_parser = opt_parser)
}
# Process file -------------------------------------------------------------------------------------
# required packages
suppressPackageStartupMessages({
library(data.table)
library(dplyr)
})
# load genes file
message("Loading predictions...")
genes <- fread(opt$genes_file, select = c("GeneSymbol", "GeneEnsemblID", "GeneTSS"))
# randomly pick one gene id per symbol. THIS NEEDS TO BE FIXED BY USING THE APPROPRIATE GENE FILE
genes <- genes %>%
group_by(GeneSymbol) %>%
slice_sample(n = 1)
# load ENCODE gene TSS file
# genes <- fread(opt$genes_file)
#
# # extract required columns and convert TSS coordinates to 1bp resolution
# genes <- genes %>%
# mutate(GeneTSS = (start + end) / 2) %>%
# select(GeneSymbol = name, GeneEnsemblID = Ensembl_ID, GeneTSS)
# load input file
pred <- fread(opt$input_file)
# get all score columns (all columns except EG-pair defining columns)
message("Reformatting predictions...")
score_cols <- setdiff(colnames(pred), c("chr", "start", "end", "TargetGene", "CellType"))
# set cell type
if (!is.null(opt$cell_type)) {
cell_type <- opt$cell_type
} else {
cell_type <- unique(pred$CellType)
}
# create header lines
header <- c(
paste("# Source:", opt$method),
paste("# Version:", opt$version),
"# GenomeBuild: GRCh38",
"# URL: [add url]",
"# Assays: 10x multiome",
"# BiosampleAgnostic: False",
paste("# BiosampleString:", cell_type)
)
# add threshold if applicable
if (!is.null(opt$threshold)) {
header <- c(header, paste("# Threshold:", opt$threshold))
}
# add Ensembl id and gene TSS columns
pred <- left_join(pred, genes, by = c("TargetGene" = "GeneSymbol"))
# add additional columns and extract output columns
pred <- pred %>%
mutate(ElementChr = paste0("chr", chr), name = paste0(ElementChr, ":", start, "-", end),
ElementClass = NA_character_, ElementStrand = ".") %>%
select(ElementChr, ElementStart = start, ElementEnd = end, ElementName = name, ElementClass,
ElementStrand, GeneSymbol = TargetGene, GeneEnsemblID, GeneTSS, all_of(score_cols))
# save to output file
message("Writing to output file...")
if (tools::file_ext(opt$output_file) == "gz") {
# save to gzip compressed file
tmp_file <- tools::file_path_sans_ext(opt$output_file)
writeLines(header, con = tmp_file)
fwrite(pred, file = tmp_file, sep = "\t", quote = FALSE, na = "NA", append = TRUE,
col.names = TRUE)
system2("gzip", args = c("-f", tmp_file))
} else {
# save to uncompressed file
writeLines(header, con = opt$output_file)
fwrite(pred, file = opt$output_file, sep = "\t", quote = FALSE, na = "NA", append = TRUE,
col.names = TRUE)
}
message("Done!")