-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy path01a_run-scdblfinder.R
executable file
·167 lines (149 loc) · 5.57 KB
/
01a_run-scdblfinder.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#!/usr/bin/env Rscript
# This script runs scDblFinder on an SCE file and exports a TSV file of results.
# If the SCE has fewer than 10 droplets, results will not be calculated; the exported TSV will contain only NA values.
# Load libraries ------
suppressPackageStartupMessages({
library(SingleCellExperiment)
library(optparse)
})
# Functions -----------
#' Run scDblFinder on an SCE object
#'
#' @param sce SCE object to run scDblFinder on
#' @param cores Number of cores to specify to BiocParallel::MulticoreParam()
#' @param sample_var Required for SCE object with multiple samples, the SCE colData variable that indicates sample of origin.
#' This option should _not_ be used for a multiplexed library which has not been demultiplexed, but is suitable for merged objects.
#' @param random_seed Required for SCE object with multiple samples, a random seed to ensure reproducibility when running on multiple samples.
#' @param columns_to_keep Vector of columns in the final scDblFinder table to keep in the returned table. No checking is done on these column names.
#' @param ... Additional arguments to pass to scDblFinder::scDblFinder
#'
#' @return Data frame object with scDblFinder inferences
run_scdblfinder <- function(sce,
cores = 4,
sample_var = NULL,
random_seed = NULL,
columns_to_keep = c("barcodes", "score", "class"),
...) {
# first check multiple samples, which requires a random seed
if (!is.null(sample_var) && is.null(random_seed)) {
if (!sample_var %in% colnames(colData(sce))) {
stop(
glue::glue("The provided sample variable {sample_var} is not present in the input SCE's colData slot.")
)
}
if (is.null(random_seed)) {
# See section 1.5.6: https://bioconductor.org/packages/3.19/bioc/vignettes/scDblFinder/inst/doc/scDblFinder.html#usage
stop("If multiple samples are present in the input SCE object (e.g., the object contains multiple merged libraries), a random seed must be provided for reproducibility.")
}
}
# set up cores
if (cores == 1) {
bp <- BiocParallel::SerialParam(RNGseed = random_seed)
} else {
bp <- BiocParallel::MulticoreParam(cores, RNGseed = random_seed)
}
# Run doublet finder
result_df <- scDblFinder::scDblFinder(
sce,
samples = sample_var, # Default is NULL so use whatever was provided by the user or default
BPPARAM = bp,
returnType = "table", # return df, not sce
verbose = FALSE,
...
)
result_df <- result_df |>
as.data.frame() |>
tibble::rownames_to_column("barcodes") |>
# remove artifical doublets
dplyr::filter(!stringr::str_starts(barcodes, "rDbl")) |>
# keep only columns of interest
dplyr::select(dplyr::all_of(columns_to_keep))
return(result_df)
}
# Parse options --------
option_list <- list(
make_option(
"--input_sce_file",
type = "character",
help = "Path to input SCE file in RDS format."
),
make_option(
"--output_file",
type = "character",
default = "scdblfinder_results.tsv",
help = "Path to output TSV file with doublet inferences."
),
make_option(
"--cores",
type = "integer",
default = 4,
help = "Number of cores to use during scDblFinder inference."
),
make_option(
"--benchmark",
action = "store_true",
default = FALSE,
help = "Whether the script is being run as part of the doublet benchmarking analysis. If this flag is invoked, the `cxds_score` column will be included in the output TSV along with the `barcodes`, `score`, and `class` columns that are always included."
),
make_option(
"--sample_var",
type = "character",
help = "If multiple samples are present in the SCE, the colData column name with sample ids. This option should be used for merged objects."
),
make_option(
"--random_seed",
type = "integer",
default = 2024,
help = "Random seed. Default is 2024."
)
)
opts <- parse_args(OptionParser(option_list = option_list))
# Check input arguments and set up files, directories ------
if (!file.exists(opts$input_sce_file)) {
glue::glue("Could not find input SCE file, expected at: `{opts$input_sce_file}`.")
}
if (!stringr::str_ends(opts$output_file, ".tsv")) {
stop("The output TSV file must end in .tsv.")
}
fs::dir_create(dirname(opts$output_file)) # create output directory as needed
set.seed(opts$random_seed)
# Detect doublets and export TSV file with inferences -----
# Check the number of cells to see if scDblFinder can be run
cell_threshold <- 10
sce <- readRDS(opts$input_sce_file)
ncells <- ncol(sce)
if (ncells < cell_threshold) {
warning(
glue::glue(
"The provided SingleCellExperiment object only has {ncells} droplets, which is fewer than {cell_threshold} so `scDblFinder` will not be run.
An output TSV file will still be produced, but it will be populated with `NA` values."
)
)
na_df <- data.frame(
barcodes = colnames(sce),
score = NA,
class = NA
)
if (opts$benchmark) {
na_df$cxds_score <- NA
}
readr::write_tsv(na_df, opts$output_file)
} else {
keep_columns <- c(
"barcodes",
"score",
"class"
)
if (opts$benchmark) {
keep_columns <- c(keep_columns, "cxds_score")
}
run_scdblfinder(
sce,
cores = opts$cores,
# only used if there are multiple samples, e.g. if this is processing a merged object
sample_var = opts$sample_var, # will be NULL if not provided as input argument
random_seed = opts$random_seed,
columns_to_keep = keep_columns
) |>
readr::write_tsv(opts$output_file)
}