-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy pathSTITCH.R
executable file
·553 lines (551 loc) · 24.3 KB
/
STITCH.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
#!/usr/bin/env Rscript
if (!suppressPackageStartupMessages(require("optparse")))
install.packages("optparse", repos="http://cran.rstudio.com/")
option_list <- list(
make_option(
"--chr",
type = "character",
help = "What chromosome to run. Should match BAM headers"
),
make_option(
"--posfile",
type = "character",
help = "Where to find file with positions to run. File is tab seperated with no header, one row per SNP, with col 1 = chromosome, col 2 = physical position (sorted from smallest to largest), col 3 = reference base, col 4 = alternate base. Bases are capitalized. Example first row: 1<tab>1000<tab>A<tab>G<tab>"
),
make_option(
"--K",
type = "integer",
help = "How many founder / mosaic haplotypes to use"
),
make_option(
"--S",
type = "integer",
help = "How many sets of founder / mosaic haplotypes to use [default 1] ",
default = 1
),
make_option(
"--nGen",
type = "double",
help = "Number of generations since founding or mixing. Note that the algorithm is relatively robust to this. Use nGen = 4 * Ne / K if unsure"
),
make_option(
"--outputdir",
type = "character",
help = "What output directory to use"
),
make_option(
"--tempdir",
type = "character",
help = "What directory to use as temporary directory. If set to NA, use default R tempdir. If possible, use ramdisk, like /dev/shm/ [default NA] ",
default = NA
),
make_option(
"--bamlist",
type = "character",
help = "Path to file with bam file locations. File is one row per entry, path to bam files. Bam index files should exist in same directory as for each bam, suffixed either .bam.bai or .bai [default \"\"] ",
default = ""
),
make_option(
"--cramlist",
type = "character",
help = "Path to file with cram file locations. File is one row per entry, path to cram files. cram files are converted to bam files on the fly for parsing into STITCH [default \"\"] ",
default = ""
),
make_option(
"--sampleNames_file",
type = "character",
help = "Optional, if not specified, sampleNames are taken from the SM tag in the header of the BAM / CRAM file. This argument is the path to file with sampleNames for samples. It is used directly to name samples in the order they appear in the bamlist / cramlist [default \"\"] ",
default = ""
),
make_option(
"--reference",
type = "character",
help = "Path to reference fasta used for making cram files. Only required if cramlist is defined [default \"\"] ",
default = ""
),
make_option(
"--genfile",
type = "character",
help = "Path to gen file with high coverage results. Empty for no genfile. File has a header row with a name for each sample, matching what is found in the bam file. Each subject is then a tab seperated column, with 0 = hom ref, 1 = het, 2 = hom alt and NA indicating missing genotype, with rows corresponding to rows of the posfile. Note therefore this file has one more row than posfile which has no header [default \"\"] ",
default = ""
),
make_option(
"--method",
type = "character",
help = "How to run imputation - either diploid, pseudoHaploid, or diploid-inbred. Please see main README for more information. All methods assume diploid samples. diploid is the most accurate but slowest, while pseudoHaploid may be advantageous for large sample sizes and K. diploid-inbred assumes all samples are inbred and invokes an internal haploid mathematical model but outputs diploid genotypes and probabilities [default diploid] ",
default = "diploid"
),
make_option(
"--output_format",
type = "character",
help = "one of bgvcf (i.e. bgziped VCF) or bgen (Layout = 2, CompressedSNPBlocks = 1) [default bgvcf] ",
default = "bgvcf"
),
make_option(
"--B_bit_prob",
type = "integer",
help = "when using bgen, how many bits to use to store each double. Optiosn are 8, 16, 24 or 32 [default 16] ",
default = 16
),
make_option(
"--outputInputInVCFFormat",
type = "logical",
help = "Whether to output the input in vcf format [default FALSE] ",
default = FALSE
),
make_option(
"--downsampleToCov",
type = "double",
help = "What coverage to downsample individual sites to. This ensures no floating point errors at sites with really high coverage [default 50] ",
default = 50
),
make_option(
"--downsampleFraction",
type = "double",
help = "Downsample BAMs by choosing a fraction of reads to retain. Must be value 0<downsampleFraction<1 [default 1] ",
default = 1
),
make_option(
"--readAware",
type = "logical",
help = "Whether to run the algorithm is read aware mode. If false, then reads are split into new reads, one per SNP per read [default TRUE] ",
default = TRUE
),
make_option(
"--chrStart",
type = "integer",
help = "When loading from BAM, some start position, before SNPs occur. Default NA will infer this from either regionStart, regionEnd and buffer, or posfile [default NA] ",
default = NA
),
make_option(
"--chrEnd",
type = "integer",
help = "When loading from BAM, some end position, after SNPs occur. Default NA will infer this from either regionStart, regionEnd and buffer, or posfile [default NA] ",
default = NA
),
make_option(
"--regionStart",
type = "integer",
help = "When running imputation, where to start from. The 1-based position x is kept if regionStart <= x <= regionEnd [default NA] ",
default = NA
),
make_option(
"--regionEnd",
type = "integer",
help = "When running imputation, where to stop. [default NA] ",
default = NA
),
make_option(
"--buffer",
type = "integer",
help = "Buffer of region to perform imputation over. So imputation is run form regionStart-buffer to regionEnd+buffer, and reported for regionStart to regionEnd, including the bases of regionStart and regionEnd [default NA] ",
default = NA
),
make_option(
"--maxDifferenceBetweenReads",
type = "double",
help = "How much of a difference to allow the reads to make in the forward backward probability calculation. For example, if P(read | state 1)=1 and P(read | state 2)=1e-6, re-scale so that their ratio is this value. This helps prevent any individual read as having too much of an influence on state changes, helping prevent against influence by false positive SNPs [default 1000] ",
default = 1000
),
make_option(
"--maxEmissionMatrixDifference",
type = "double",
help = "Similar to maxDifferenceBetweenReads, specifies ratio of how much larger the most probable state can be than the least probable state, but across all reads rather than for a single read. This helps to limit overflow in C++ calculations [default 1e10] ",
default = 1e10
),
make_option(
"--alphaMatThreshold",
type = "double",
help = "Minimum (maximum is 1 minus this) state switching into probabilities [default 1e-4] ",
default = 1e-4
),
make_option(
"--emissionThreshold",
type = "double",
help = "Emission probability bounds. emissionThreshold < P(alt read | state k) < (1-emissionThreshold) [default 1e-4] ",
default = 1e-4
),
make_option(
"--iSizeUpperLimit",
type = "double",
help = "Do not use reads with an insert size of more than this value [default as.integer(600)] ",
default = as.integer(600)
),
make_option(
"--bqFilter",
type = "double",
help = "Minimum BQ for a SNP in a read. Also, the algorithm uses bq<=mq, so if mapping quality is less than this, the read isnt used [default as.integer(17)] ",
default = as.integer(17)
),
make_option(
"--niterations",
type = "integer",
help = "Number of EM iterations. [default 40] ",
default = 40
),
make_option(
"--shuffleHaplotypeIterations",
type = "character",
help = "Iterations on which to perform heuristic attempt to shuffle founder haplotypes for better fit. To disable set to NA. [default c(4, 8, 12, 16)] ",
default = "c(4, 8, 12, 16)"
),
make_option(
"--splitReadIterations",
type = "character",
help = "Iterations to try and split reads which may span recombination breakpoints for a better fit [default 25] ",
default = 25
),
make_option(
"--nCores",
type = "integer",
help = "How many cores to use [default 1] ",
default = 1
),
make_option(
"--expRate",
type = "double",
help = "Expected recombination rate in cM/Mb [default 0.5] ",
default = 0.5
),
make_option(
"--maxRate",
type = "double",
help = "Maximum recomb rate cM/Mb [default 100] ",
default = 100
),
make_option(
"--minRate",
type = "double",
help = "Minimum recomb rate cM/Mb [default 0.1] ",
default = 0.1
),
make_option(
"--Jmax",
type = "integer",
help = "Maximum number of SNPs on a read [default 1000] ",
default = 1000
),
make_option(
"--regenerateInput",
type = "logical",
help = "Whether to regenerate input files. If this is FALSE, please using the same regionStart, regionEnd, buffer and posfile as you used to generate the input. Setting any of those to different values can cause the previous input data to be improperly interpreted. Please also see originalRegionName and regenerateInputWithDefaultValues [default TRUE] ",
default = TRUE
),
make_option(
"--originalRegionName",
type = "character",
help = "If regenerateInput is FALSE (i.e. using existing data), this is the name of the original region name (chr.regionStart.regionEnd). This is necessary to load past variables [default NA] ",
default = NA
),
make_option(
"--keepInterimFiles",
type = "logical",
help = "Whether to keep interim parameter estimates [default FALSE] ",
default = FALSE
),
make_option(
"--keepTempDir",
type = "logical",
help = "Whether to keep files in temporary directory [default FALSE] ",
default = FALSE
),
make_option(
"--switchModelIteration",
type = "integer",
help = "Whether to switch from pseudoHaploid to diploid and at what iteration (NA for no switching) [default NA] ",
default = NA
),
make_option(
"--generateInputOnly",
type = "logical",
help = "Whether to just generate input data then quit [default FALSE] ",
default = FALSE
),
make_option(
"--restartIterations",
type = "character",
help = "In pseudoHaploid method, which iterations to look for collapsed haplotype prnobabilities to resolve [default NA] ",
default = NA
),
make_option(
"--refillIterations",
type = "character",
help = "When to try and refill some of the less frequently used haplotypes [default c(6, 10, 14, 18)] ",
default = "c(6, 10, 14, 18)"
),
make_option(
"--downsampleSamples",
type = "double",
help = "What fraction of samples to retain. Useful for checking effect of N on imputation. Not meant for general use [default 1] ",
default = 1
),
make_option(
"--downsampleSamplesKeepList",
type = "character",
help = "When downsampling samples, specify a numeric list of samples to keep [default NA] ",
default = NA
),
make_option(
"--subsetSNPsfile",
type = "character",
help = "If input data has already been made for a region, then subset down to a new set of SNPs, as given by this file. Not meant for general use [default NA] ",
default = NA
),
make_option(
"--useSoftClippedBases",
type = "logical",
help = "Whether to use (TRUE) or not use (FALSE) bases in soft clipped portions of reads [default FALSE] ",
default = FALSE
),
make_option(
"--outputBlockSize",
type = "integer",
help = "How many samples to write out to disk at the same time when making temporary VCFs that are later pasted together at the end to make the final VCF. Smaller means lower RAM footprint, larger means faster write. [default 1000] ",
default = 1000
),
make_option(
"--outputSNPBlockSize",
type = "integer",
help = "How many SNPs to write to disk at one time to reduce RAM usage when making VCFs [default 10000] ",
default = 10000
),
make_option(
"--inputBundleBlockSize",
type = "integer",
help = "If NA, disable bundling of input files. If not NA, bundle together input files in sets of <= inputBundleBlockSize together [default NA] ",
default = NA
),
make_option(
"--genetic_map_file",
type = "character",
help = "Path to file with genetic map information, a file with 3 white-space delimited entries giving position (1-based), genetic rate map in cM/Mbp, and genetic map in cM [default \"\"] ",
default = ""
),
make_option(
"--reference_haplotype_file",
type = "character",
help = "Path to reference haplotype file in IMPUTE format (file with no header and no rownames, one row per SNP, one column per reference haplotype, space separated, values must be 0 or 1) [default \"\"] ",
default = ""
),
make_option(
"--reference_legend_file",
type = "character",
help = "Path to reference haplotype legend file in IMPUTE format (file with one row per SNP, and a header including position for the physical position in 1 based coordinates, a0 for the reference allele, and a1 for the alternate allele) [default \"\"] ",
default = ""
),
make_option(
"--reference_sample_file",
type = "character",
help = "Path to reference sample file (file with header, one must be POP, corresponding to populations that can be specified using reference_populations) [default \"\"] ",
default = ""
),
make_option(
"--reference_populations",
type = "character",
help = "Vector with character populations to include from reference_sample_file e.g. CHB, CHS [default NA] ",
default = NA
),
make_option(
"--reference_phred",
type = "integer",
help = "Phred scaled likelihood or an error of reference haplotype. Higher means more confidence in reference haplotype genotypes, lower means less confidence [default 20] ",
default = 20
),
make_option(
"--reference_iterations",
type = "integer",
help = "When using reference haplotypes, how many iterations to use to train the starting data [default 40] ",
default = 40
),
make_option(
"--reference_shuffleHaplotypeIterations",
type = "character",
help = "When using reference haplotypes, how much shuffling to do to lead to better global fit [default c(4, 8, 12, 16)] ",
default = "c(4, 8, 12, 16)"
),
make_option(
"--output_filename",
type = "character",
help = "Override the default bgzip-VCF / bgen output name with this given file name. Please note that this does not change the names of inputs or outputs (e.g. RData, plots), so if outputdir is unchanged and if multiple STITCH runs are processing on the same region then they may over-write each others inputs and outputs [default NULL] ",
default = NULL
),
make_option(
"--initial_min_hapProb",
type = "double",
help = "Initial lower bound for probability read comes from haplotype. Double bounded between 0 and 1 [default 0.2] ",
default = 0.2
),
make_option(
"--initial_max_hapProb",
type = "double",
help = "Initial upper bound for probability read comes from haplotype. Double bounded between 0 and 1 [default 0.8] ",
default = 0.8
),
make_option(
"--regenerateInputWithDefaultValues",
type = "logical",
help = "If regenerateInput is FALSE and the original input data was made using regionStart, regionEnd and buffer as default values, set this equal to TRUE [default FALSE] ",
default = FALSE
),
make_option(
"--plotHapSumDuringIterations",
type = "logical",
help = "Boolean TRUE/FALSE about whether to make a plot that shows the relative number of individuals using each ancestral haplotype in each iteration [default FALSE] ",
default = FALSE
),
make_option(
"--plot_shuffle_haplotype_attempts",
type = "logical",
help = "Boolean TRUE/FALSE about whether to make a plot that tries to show the selection of ancestral haplotypes to check for shuffling / flipping [default FALSE] ",
default = FALSE
),
make_option(
"--plotAfterImputation",
type = "logical",
help = "Boolean TRUE/FALSE about whether to make plots after imputation has run (can be set to FALSE if this throws errors on systems without x11) [default TRUE] ",
default = TRUE
),
make_option(
"--plotReferenceAlleleCount",
type = "logical",
help = "Generate plot of reference allele counts, set to FALSE for systmems without x11 [default TRUE]",
default = TRUE
),
make_option(
"--save_sampleReadsInfo",
type = "logical",
help = "Experimental. Boolean TRUE/FALSE about whether to save additional information about the reads that were extracted [default FALSE] ",
default = FALSE
),
make_option(
"--gridWindowSize",
type = "integer",
help = "Whether to work on a grid where reads are binned into windows of this size (1 based, i.e. first bin is bases 1-gridWindowSize). This is particularly appropriate for very low coverage data (e.g. less than 0.2X) and can substantially speed up analyses [default NA] ",
default = NA
),
make_option(
"--shuffle_bin_nSNPs",
type = "integer",
help = "Parameter that controls how to detect ancestral haplotypes that are shuffled during EM for possible re-setting. If set (not NULL), then break per-SNP (or per-grid) every this many SNPs / grids, and compare each to detect whether haplotypes either 1) are more likely to stay where they are or 2) switch from one haplotype to another. Note that only one of shuffle_bin_nSNPs or shuffle_bin_radius should be non-NULL [default NULL] ",
default = NULL
),
make_option(
"--shuffle_bin_radius",
type = "integer",
help = "Parameter that controls how to detect ancestral haplotypes that are shuffled during EM for possible re-setting. If set (not NULL), then recombination rate is calculated around pairs of SNPs in window of twice this value, and those that exceed what should be the maximum (defined by nGen and maxRate) are checked for whether they are shuffled [default 5000] ",
default = 5000
),
make_option(
"--keepSampleReadsInRAM",
type = "logical",
help = "Whether to (generally) keep sampleReads in RAM or store them in the temporary directory. STITCH is substantially faster if this is FALSE at the expense of RAM [default FALSE] ",
default = FALSE
),
make_option(
"--useTempdirWhileWriting",
type = "logical",
help = "Whether to use temporary directory while writing output file (TRUE), or to keep result in RAM (FALSE). Using temporary directory is slower but uses less RAM [default FALSE] ",
default = FALSE
),
make_option(
"--output_haplotype_dosages",
type = "logical",
help = "Whether to output ancestral haplotype dosages, i.e. the expected number of ancestral haplotypes carried by that sample at that locus [default FALSE] ",
default = FALSE
),
make_option(
"--use_bx_tag",
type = "logical",
help = "Whether to try and use BX tag in same to indicate that reads come from the same underlying molecule [default TRUE] ",
default = TRUE
),
make_option(
"--bxTagUpperLimit",
type = "integer",
help = "When using BX tag, at what distance between reads to consider reads with the same BX tag to come from different molecules [default 50000] ",
default = 50000
)
)
opt <- suppressWarnings(parse_args(OptionParser(option_list = option_list)))
suppressPackageStartupMessages(library(STITCH))
Sys.setenv(PATH = paste0(Sys.getenv("PATH"), ":", getwd()))
STITCH(
chr = opt$chr,
posfile = opt$posfile,
K = opt$K,
S = opt$S,
nGen = opt$nGen,
outputdir = opt$outputdir,
tempdir = opt$tempdir,
bamlist = opt$bamlist,
cramlist = opt$cramlist,
sampleNames_file = opt$sampleNames_file,
reference = opt$reference,
genfile = opt$genfile,
method = opt$method,
output_format = opt$output_format,
B_bit_prob = opt$B_bit_prob,
outputInputInVCFFormat = opt$outputInputInVCFFormat,
downsampleToCov = opt$downsampleToCov,
downsampleFraction = opt$downsampleFraction,
readAware = opt$readAware,
chrStart = opt$chrStart,
chrEnd = opt$chrEnd,
regionStart = opt$regionStart,
regionEnd = opt$regionEnd,
buffer = opt$buffer,
maxDifferenceBetweenReads = opt$maxDifferenceBetweenReads,
maxEmissionMatrixDifference = opt$maxEmissionMatrixDifference,
alphaMatThreshold = opt$alphaMatThreshold,
emissionThreshold = opt$emissionThreshold,
iSizeUpperLimit = opt$iSizeUpperLimit,
bqFilter = opt$bqFilter,
niterations = opt$niterations,
shuffleHaplotypeIterations = eval(parse(text=opt$shuffleHaplotypeIterations)),
splitReadIterations = eval(parse(text=opt$splitReadIterations)),
nCores = opt$nCores,
expRate = opt$expRate,
maxRate = opt$maxRate,
minRate = opt$minRate,
Jmax = opt$Jmax,
regenerateInput = opt$regenerateInput,
originalRegionName = opt$originalRegionName,
keepInterimFiles = opt$keepInterimFiles,
keepTempDir = opt$keepTempDir,
switchModelIteration = opt$switchModelIteration,
generateInputOnly = opt$generateInputOnly,
restartIterations = opt$restartIterations,
refillIterations = eval(parse(text=opt$refillIterations)),
downsampleSamples = opt$downsampleSamples,
downsampleSamplesKeepList = opt$downsampleSamplesKeepList,
subsetSNPsfile = opt$subsetSNPsfile,
useSoftClippedBases = opt$useSoftClippedBases,
outputBlockSize = opt$outputBlockSize,
outputSNPBlockSize = opt$outputSNPBlockSize,
inputBundleBlockSize = opt$inputBundleBlockSize,
genetic_map_file = opt$genetic_map_file,
reference_haplotype_file = opt$reference_haplotype_file,
reference_legend_file = opt$reference_legend_file,
reference_sample_file = opt$reference_sample_file,
reference_populations = eval(parse(text=opt$reference_populations)),
reference_phred = opt$reference_phred,
reference_iterations = opt$reference_iterations,
reference_shuffleHaplotypeIterations = eval(parse(text=opt$reference_shuffleHaplotypeIterations)),
output_filename = opt$output_filename,
initial_min_hapProb = opt$initial_min_hapProb,
initial_max_hapProb = opt$initial_max_hapProb,
regenerateInputWithDefaultValues = opt$regenerateInputWithDefaultValues,
plotHapSumDuringIterations = opt$plotHapSumDuringIterations,
plot_shuffle_haplotype_attempts = opt$plot_shuffle_haplotype_attempts,
plotAfterImputation = opt$plotAfterImputation,
plotReferenceAlleleCount = opt$plotReferenceAlleleCount,
save_sampleReadsInfo = opt$save_sampleReadsInfo,
gridWindowSize = opt$gridWindowSize,
shuffle_bin_nSNPs = opt$shuffle_bin_nSNPs,
shuffle_bin_radius = opt$shuffle_bin_radius,
keepSampleReadsInRAM = opt$keepSampleReadsInRAM,
useTempdirWhileWriting = opt$useTempdirWhileWriting,
output_haplotype_dosages = opt$output_haplotype_dosages,
use_bx_tag = opt$use_bx_tag,
bxTagUpperLimit = opt$bxTagUpperLimit
)