-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathPIPELINE.mk
executable file
·331 lines (264 loc) · 14.3 KB
/
PIPELINE.mk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
#######################
### PIPELINE PARAMS ###
#######################
### system / executables ##
PL := ~/bin/AlleleSeq2
NTHR := 1 # multithread, works for mapping, sorting, fastqc
SAMTOOLS := ~/bin/samtools-1.3.1/samtools
PICARD := ~/bin/picard-tools-2.1.1/picard.jar
JAVA := java
JAVA_MEM := 80g
STAR := ~/bin/STAR/STAR-2.6.0c/bin/Linux_x86_64/STAR
FASTQC := fastqc
CUTADAPT := ~/bin/cutadapt
### input files / paths ##
READS_R1 :=
READS_R2 :=
PGENOME_DIR := NULL
VCF_SAMPLE_ID := NULL
### params ##
ALIGNMENT_MODE := NULL # can be 'ASE', 'ASB', 'custom', 'ASCA' -- currently, for with known adapters (if present) only
RM_DUPLICATE_READS := on # with 'on' duplicate reads will be removed using picard
PERFORM_FASTQC := on
# needed for all: ASE, ASB, custom, or ASCA:
GenomeIdx_STAR_diploid := $(PGENOME_DIR)/STAR_idx_diploid
STAR_outFilterMismatchNoverReadLmax := 0.03
STAR_outFilterMatchNminOverLread := 0.95
STAR_readFilesCommand := zcat # zcat, cat, etc
STAR_limitSjdbInsertNsj := 1500000 # star default is 1000000
# needed if ASE
REFGENOME_VERSION := GRCh37 #GRCh38 or CRCh37
Annotation_diploid := $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_diploid.gencode.v19.annotation.gtf
ifeq ($(REFGENOME_VERSION), GRCh38)
Annotation_diploid := $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_diploid.gencode.v24.annotation.gtf
endif
STAR_sjdbOverhang := 100 #STAR default will work almost as well as with the ideal (readlength -1) value according to the manual
# if custom alignment mode
STAR_parameters_file := $(PL)/STAR_custom_parameters_sample_file
# needed if ASCA, atac-seq:
R1_ADAPTER_SEQ := CTGTCTCTTATA
R2_ADAPTER_SEQ := CTGTCTCTTATA
# for ASCA, Nextera and transposase adapter sequences trimming, similar to what ENCODE prototype peak calling pipeline finds in ENTEx samples
# stats / counts params:
FDR_SIMS := 500
FDR_CUTOFF := 0.05
Cntthresh_tot := 6
Cntthresh_min := 0
#AMB_MODE := adjust # 'adjust' or allelic ratio diff threshold for filtering
# only 'adjust' mode only for now: add the 'weaker allele' base counts from multi-mapping reads, if any:
# all of them or until balanced with the stonger to make sure the imbalance is not caused by the multimapping reads
KEEP_CHR := # empty or e.g. 'X'
######################
### PIPELINE STEPS ###
######################
# todo: a better way than using these 'tmp's?
empty_string:=
ifeq ($(READS_R2),$(empty_string))
tmp1 = $(READS_R1:.gz=)
tmp2 = $(tmp1:.fastq=)
PREFIX = $(tmp2:.fq=)
FASTQC_out = $(PREFIX)_fastqc.html
else
tmp11 = $(READS_R1:.gz=)
tmp12 = $(tmp11:.fastq=)
FASTQC_out = $(tmp12:.fq=)_fastqc.html
tmp21 = $(READS_R2:.gz=)
tmp22 = $(tmp21:.fastq=)
PREFIX = $(tmp12:.fq=)_$(tmp22:.fq=)
endif
ifeq ($(RM_DUPLICATE_READS),on)
DEDUP_SUFFIX = rmdup.
endif
ifeq ($(PERFORM_FASTQC),off)
FASTQC_out = $(empty_string)
endif
FINAL_ALIGNMENT_FILENAME = $(PREFIX)_$(ALIGNMENT_MODE)-params.Aligned.sortedByCoord.out.$(DEDUP_SUFFIX)bam
HetSNV_UNIQALNS_FILENAME = $(PREFIX)_$(ALIGNMENT_MODE)-params_crdsorted_uniqreads_over_hetSNVs.bam
HetSNV_MMAPALNS_FILENAME = $(PREFIX)_$(ALIGNMENT_MODE)-params_crdsorted_mmapreads_over_hetSNVs.bam
$(info PGENOME_DIR: $(PGENOME_DIR))
$(info READS_R1: $(READS_R1))
$(info READS_R2: $(READS_R2))
$(info PERFORM_FASTQC: $(PERFORM_FASTQC))
$(info PREFIX: $(PREFIX))
$(info ALIGNMENT_MODE: $(ALIGNMENT_MODE))
$(info RM_DUPLICATE_READS: $(RM_DUPLICATE_READS))
$(info FINAL_ALIGNMENT_FILENAME: $(FINAL_ALIGNMENT_FILENAME))
$(info HetSNV_UNIQALNS_FILENAME: $(HetSNV_UNIQALNS_FILENAME))
$(info HetSNV_MMAPALNS_FILENAME: $(HetSNV_MMAPALNS_FILENAME))
$(info $(empty_string))
$(info $(empty_string))
$(info $(empty_string))
$(info $(empty_string))
######################
### PIPELINE START ###
######################
all: $(FASTQC_out) $(PREFIX)_ref_allele_ratios.raw_counts.pdf $(PREFIX)_ref_allele_ratios.filtered_counts.pdf $(PREFIX)_ref_allele_ratios.filtered_counts.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min.pdf $(PREFIX)_interestingHets.FDR-$(FDR_CUTOFF).binom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv $(PREFIX)_interestingHets.FDR-$(FDR_CUTOFF).betabinom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv
#currently, keeping alleleDB betabinomial scripts with as few modifications as possible
$(PREFIX)_interestingHets.FDR-$(FDR_CUTOFF).betabinom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv: $(PREFIX)_filtered_counts.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv
Rscript $(PL)/alleledb_calcOverdispersion.R \
$< \
$(PREFIX)_FDR-$(FDR_CUTOFF).betabinomial.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min
Rscript $(PL)/alleledb_alleleseqBetabinomial.R \
$< \
$(PREFIX)_FDR-$(FDR_CUTOFF).betabinomial.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min \
$(PREFIX)_counts.FDR-$(FDR_CUTOFF).betabinom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv \
$@ \
$(PREFIX)_FDR-$(FDR_CUTOFF).betabinom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.txt \
$(FDR_CUTOFF)
$(PREFIX)_interestingHets.FDR-$(FDR_CUTOFF).binom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv: $(PREFIX)_filtered_counts.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv
python $(PL)/FalsePos.py $< $(FDR_SIMS) $(FDR_CUTOFF) > $(PREFIX)_FDR-$(FDR_CUTOFF).binom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.txt
cat $< | python $(PL)/filter_by_pval.py $(PREFIX)_FDR-$(FDR_CUTOFF).binom.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.txt > $@
# allelic ratio distrs
$(PREFIX)_ref_allele_ratios.filtered_counts.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min.pdf: $(PREFIX)_filtered_counts.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv
Rscript $(PL)/plot_AllelicRatio_distribution.R $< $(PREFIX) filtered_counts.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min
# filter based on total counts and min per allele count
# and in non-autosomal chr, optionally keeping X;
$(PREFIX)_filtered_counts.chrs1-22$(KEEP_CHR).$(Cntthresh_tot)-tot_$(Cntthresh_min)-min_cnt.tsv: $(PREFIX)_filtered_counts.tsv
cat $< | \
python $(PL)/filter_non-autosomal_chr.py $(KEEP_CHR) | \
python $(PL)/filter_by_counts.py $(Cntthresh_tot) $(Cntthresh_min) > $@
# allelic ratio distrs
$(PREFIX)_ref_allele_ratios.filtered_counts.pdf: $(PREFIX)_filtered_counts.tsv
Rscript $(PL)/plot_AllelicRatio_distribution.R $< $(PREFIX) filtered_counts
# filter out sites in potential cnv regions
# and sites with seemingly misphased/miscalled nearby variants
# filter/adjust sites imbalanced likely due to unaccounted multi-mapping reads
# will use 'adjust' only for now
$(PREFIX)_filtered_counts.tsv: $(PREFIX)_raw_counts.tsv $(PREFIX)_hap1_mmapreads.mpileup $(PREFIX)_hap2_mmapreads.mpileup
cat $< | \
python $(PL)/filter_cnv_sites.py $(PREFIX)_discarded_HetSNVs_potential-CNV.log $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_rd.tab | \
python $(PL)/filter_phase_warnings.py $(PREFIX)_discarded_HetSNVs_warn-haplotype.log | \
#python $(PL)/filter_sites_w_mmaps.py $(AMB_MODE) $(PREFIX)_discarded_HetSNVs_warn-mmaps.log $(PREFIX)_mmap_reads_over_hetSNVs.log \
python $(PL)/filter_sites_w_mmaps.py adjust $(PREFIX)_discarded_HetSNVs_warn-mmaps.log $(PREFIX)_mmap_reads_over_hetSNVs.log \
$(PREFIX)_hap1_mmapreads.mpileup $(PREFIX)_hap2_mmapreads.mpileup > $@
# allelic ratio distrs
$(PREFIX)_ref_allele_ratios.raw_counts.pdf: $(PREFIX)_raw_counts.tsv
Rscript $(PL)/plot_AllelicRatio_distribution.R $< $(PREFIX) raw_counts
# counts
$(PREFIX)_raw_counts.tsv: $(PREFIX)_hap1_uniqreads.mpileup $(PREFIX)_hap2_uniqreads.mpileup
python $(PL)/pileup2counts.py 1 $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_ref.bed \
$(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_hap1.bed $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_hap2.bed \
$(PREFIX)_discarded_HetSNVs_from-pileups.log \
$(PREFIX)_hap1_uniqreads.mpileup $(PREFIX)_hap2_uniqreads.mpileup > $@
# pileups
$(PREFIX)_hap1_mmapreads.mpileup: $(HetSNV_MMAPALNS_FILENAME)
$(SAMTOOLS) mpileup -BQ0 --max-depth 999999 --ff UNMAP -f $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hap1.fa $< \
--positions $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_hap1.bed --output $@
$(PREFIX)_hap2_mmapreads.mpileup: $(HetSNV_MMAPALNS_FILENAME)
$(SAMTOOLS) mpileup -BQ0 --max-depth 999999 --ff UNMAP -f $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hap2.fa $< \
--positions $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_hap2.bed --output $@
$(PREFIX)_hap1_uniqreads.mpileup: $(HetSNV_UNIQALNS_FILENAME)
$(SAMTOOLS) mpileup -BQ0 --max-depth 999999 --ff UNMAP -f $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hap1.fa $< \
--positions $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_hap1.bed --output $@
$(PREFIX)_hap2_uniqreads.mpileup: $(HetSNV_UNIQALNS_FILENAME)
$(SAMTOOLS) mpileup -BQ0 --max-depth 999999 --ff UNMAP -f $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hap2.fa $< \
--positions $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_hap2.bed --output $@
# by default --ff was also filtering-out some other - secondary? reads: [UNMAP,SECONDARY,QCFAIL,DUP], leaving only UNMAP for now
# non-uniq alns over hetSNVs:
$(PREFIX)_$(ALIGNMENT_MODE)-params_crdsorted_mmapreads_over_hetSNVs.bam: $(FINAL_ALIGNMENT_FILENAME)
cat $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_hap1.bed $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_hap2.bed | \
$(SAMTOOLS) view -h -L - $< | awk '$$5!="255" {print $$0}' | \
$(SAMTOOLS) view -b - > $@
$(SAMTOOLS) index $@
$(SAMTOOLS) flagstat $@ > $@.stat
# uniq alns over hetSNVs:
$(PREFIX)_$(ALIGNMENT_MODE)-params_crdsorted_uniqreads_over_hetSNVs.bam: $(FINAL_ALIGNMENT_FILENAME)
cat $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_hap1.bed $(PGENOME_DIR)/$(VCF_SAMPLE_ID)_hetSNVs_hap2.bed | \
$(SAMTOOLS) view -h -q 255 -L - $< | \
$(SAMTOOLS) view -b - > $@
$(SAMTOOLS) index $@
$(SAMTOOLS) flagstat $@ > $@.stat
# if removing duplicate reads
$(PREFIX)_$(ALIGNMENT_MODE)-params.Aligned.sortedByCoord.out.rmdup.bam: $(PREFIX)_$(ALIGNMENT_MODE)-params.Aligned.sortedByCoord.out.bam
$(JAVA) -Xmx$(JAVA_MEM) -jar $(PICARD) MarkDuplicates \
INPUT=$< OUTPUT=$@ METRICS_FILE=$(@:.bam=.metrics) \
REMOVE_DUPLICATES=true \
DUPLICATE_SCORING_STRATEGY=SUM_OF_BASE_QUALITIES
$(SAMTOOLS) index $@
$(SAMTOOLS) flagstat $@ > $@.stat
## specific additional params will be read from $(STAR_parameters_file)
$(PREFIX)_custom-params.Aligned.sortedByCoord.out.bam: $(READS_R1)
$(STAR) \
--runThreadN $(NTHR) \
--genomeDir $(GenomeIdx_STAR_diploid) \
--readFilesIn $< $(READS_R2) \
--readFilesCommand $(STAR_readFilesCommand) \
--outFileNamePrefix $(@:Aligned.sortedByCoord.out.bam=) \
--outSAMattributes All \
--outFilterMultimapNmax 999999 \
--scoreGenomicLengthLog2scale 0.0 \
--sjdbOverhang $(STAR_sjdbOverhang) \
--sjdbGTFfile $(Annotation_diploid) \
--parametersFiles $(STAR_parameters_file) \
--limitSjdbInsertNsj $(STAR_limitSjdbInsertNsj) \
--outSAMtype BAM SortedByCoordinate
$(SAMTOOLS) flagstat $@ > $@.stat
$(SAMTOOLS) index $@
## params for RNA-seq; will use as default for ASE
$(PREFIX)_ASE-params.Aligned.sortedByCoord.out.bam: $(READS_R1)
$(STAR) \
--runThreadN $(NTHR) \
--genomeDir $(GenomeIdx_STAR_diploid) \
--twopassMode Basic \
--readFilesIn $< $(READS_R2) \
--readFilesCommand $(STAR_readFilesCommand) \
--outFileNamePrefix $(@:Aligned.sortedByCoord.out.bam=) \
--outSAMattributes All \
--outFilterMismatchNoverReadLmax $(STAR_outFilterMismatchNoverReadLmax) \
--outFilterMatchNminOverLread $(STAR_outFilterMatchNminOverLread) \
--outFilterMultimapNmax 999999 \
--scoreGenomicLengthLog2scale 0.0 \
--sjdbOverhang $(STAR_sjdbOverhang) \
--sjdbGTFfile $(Annotation_diploid) \
--limitSjdbInsertNsj $(STAR_limitSjdbInsertNsj) \
--outSAMtype BAM SortedByCoordinate
$(SAMTOOLS) flagstat $@ > $@.stat
$(SAMTOOLS) index $@
## opts similar to AlleleSeq v1.2a bowtie1 -v 2 -m 1 mode; but with small gaps allowed, no splicing; will use as default for ASB
$(PREFIX)_ASB-params.Aligned.sortedByCoord.out.bam: $(READS_R1)
$(STAR) \
--runThreadN $(NTHR) \
--genomeDir $(GenomeIdx_STAR_diploid) \
--readFilesIn $< $(READS_R2) \
--readFilesCommand $(STAR_readFilesCommand) \
--outFileNamePrefix $(@:Aligned.sortedByCoord.out.bam=) \
--outSAMattributes All \
--outFilterMismatchNoverReadLmax $(STAR_outFilterMismatchNoverReadLmax) \
--outFilterMatchNminOverLread $(STAR_outFilterMatchNminOverLread) \
--outFilterMultimapNmax 999999 \
--scoreGap -100 \
--scoreGenomicLengthLog2scale 0.0 \
--sjdbScore 0 \
--limitSjdbInsertNsj $(STAR_limitSjdbInsertNsj) \
--outSAMtype BAM SortedByCoordinate
$(SAMTOOLS) flagstat $@ > $@.stat
$(SAMTOOLS) index $@
## opts for ASCA, atac-seq, similar to ASB, but will require adapter-trimmed reads
$(PREFIX)_ASCA-params.Aligned.sortedByCoord.out.bam: $(READS_R1).trimmed.fastq.gz
$(STAR) \
--runThreadN $(NTHR) \
--genomeDir $(GenomeIdx_STAR_diploid) \
--readFilesIn $< $(READS_R2).trimmed.fastq.gz \
--readFilesCommand $(STAR_readFilesCommand) \
--outFileNamePrefix $(@:Aligned.sortedByCoord.out.bam=) \
--outSAMattributes All \
--outFilterMismatchNoverReadLmax $(STAR_outFilterMismatchNoverReadLmax) \
--outFilterMatchNminOverLread $(STAR_outFilterMatchNminOverLread) \
--outFilterMultimapNmax 999999 \
--scoreGap -100 \
--scoreGenomicLengthLog2scale 0.0 \
--sjdbScore 0 \
--limitSjdbInsertNsj $(STAR_limitSjdbInsertNsj) \
--outSAMtype BAM SortedByCoordinate
$(SAMTOOLS) flagstat $@ > $@.stat
$(SAMTOOLS) index $@
$(READS_R1).trimmed.fastq.gz $(READS_R2).trimmed.fastq.gz: $(READS_R1) $(READS_R2)
$(CUTADAPT) -m 5 \
-a $(R1_ADAPTER_SEQ) \
-A $(R2_ADAPTER_SEQ) \
-o $(READS_R1).trimmed.fastq.gz \
-p $(READS_R2).trimmed.fastq.gz \
$(READS_R1) $(READS_R2)
$(FASTQC) --threads $(NTHR) $(READS_R1).trimmed.fastq.gz $(READS_R2).trimmed.fastq.gz
$(FASTQC_out): $(READS_R1)
$(FASTQC) --threads $(NTHR) $(READS_R1) $(READS_R2)